zenrows 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.standard.yml +8 -0
- data/.yardopts +10 -0
- data/CHANGELOG.md +28 -0
- data/CLAUDE.md +63 -0
- data/LICENSE.txt +21 -0
- data/README.md +174 -0
- data/Rakefile +15 -0
- data/lib/zenrows/backends/base.rb +95 -0
- data/lib/zenrows/backends/http_rb.rb +59 -0
- data/lib/zenrows/client.rb +159 -0
- data/lib/zenrows/configuration.rb +136 -0
- data/lib/zenrows/errors.rb +74 -0
- data/lib/zenrows/js_instructions.rb +267 -0
- data/lib/zenrows/proxy.rb +226 -0
- data/lib/zenrows/railtie.rb +25 -0
- data/lib/zenrows/version.rb +5 -0
- data/lib/zenrows.rb +67 -0
- data/plan.md +430 -0
- data/sig/zenrows.rbs +4 -0
- data/test/test_helper.rb +7 -0
- data/test/zenrows/client_test.rb +83 -0
- data/test/zenrows/js_instructions_test.rb +140 -0
- data/test/zenrows/proxy_test.rb +114 -0
- data/test/zenrows_test.rb +43 -0
- metadata +99 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Zenrows
|
|
4
|
+
# Rails integration for Zenrows
|
|
5
|
+
#
|
|
6
|
+
# Provides Rails-specific features:
|
|
7
|
+
# - ActiveSupport::Duration support for wait times
|
|
8
|
+
# - Rails logger integration
|
|
9
|
+
#
|
|
10
|
+
# @author Ernest Bursa
|
|
11
|
+
# @since 0.1.0
|
|
12
|
+
# Railtie that hooks Zenrows into a Rails application's boot process.
#
# It registers one initializer, which falls back to the Rails logger when
# no Zenrows logger has been configured, and a generators hook that loads
# the install generator when that file is present.
class Railtie < Rails::Railtie
  initializer "zenrows.configure_rails_initialization" do
    # Only fill in the logger when the application has not set one itself.
    Zenrows.configure do |config|
      config.logger ||= Rails.logger
    end
  end

  # Load the initializer generator, if it is available.
  #
  # The generator file is optional. An unconditional require raises
  # LoadError whenever Rails loads generators and the file is not shipped
  # with the gem, so the require is skipped when the file is missing.
  generators do
    # Same location require_relative "generators/zenrows/install_generator"
    # resolves to: relative to this file's directory.
    install_generator = File.expand_path("generators/zenrows/install_generator.rb", __dir__)

    if defined?(Rails::Generators) && File.exist?(install_generator)
      require install_generator
    end
  end
end
|
|
25
|
+
end
|
data/lib/zenrows.rb
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "zenrows/version"
|
|
4
|
+
require_relative "zenrows/errors"
|
|
5
|
+
require_relative "zenrows/configuration"
|
|
6
|
+
require_relative "zenrows/proxy"
|
|
7
|
+
require_relative "zenrows/js_instructions"
|
|
8
|
+
require_relative "zenrows/backends/base"
|
|
9
|
+
require_relative "zenrows/backends/http_rb"
|
|
10
|
+
require_relative "zenrows/client"
|
|
11
|
+
|
|
12
|
+
# ZenRows Ruby client for web scraping proxy
|
|
13
|
+
#
|
|
14
|
+
# @example Basic configuration and usage
|
|
15
|
+
# Zenrows.configure do |config|
|
|
16
|
+
# config.api_key = 'YOUR_API_KEY'
|
|
17
|
+
# end
|
|
18
|
+
#
|
|
19
|
+
# client = Zenrows::Client.new
|
|
20
|
+
# http = client.http(js_render: true, premium_proxy: true)
|
|
21
|
+
# response = http.get('https://example.com', ssl_context: client.ssl_context)
|
|
22
|
+
#
|
|
23
|
+
# @example With JavaScript instructions
|
|
24
|
+
# instructions = Zenrows::JsInstructions.build do
|
|
25
|
+
# click '.load-more'
|
|
26
|
+
# wait 2000
|
|
27
|
+
# scroll_to :bottom
|
|
28
|
+
# end
|
|
29
|
+
#
|
|
30
|
+
# http = client.http(js_render: true, js_instructions: instructions)
|
|
31
|
+
#
|
|
32
|
+
# @author Ernest Bursa
|
|
33
|
+
# @since 0.1.0
|
|
34
|
+
module Zenrows
  # Global configuration, created lazily on first access and then reused.
  #
  # @return [Configuration] the shared configuration instance
  def self.configuration
    @configuration ||= Configuration.new
  end

  # Yield the global configuration for modification.
  #
  # Calling this without a block simply returns the current configuration.
  #
  # @example
  #   Zenrows.configure do |config|
  #     config.api_key = 'YOUR_API_KEY'
  #     config.host = 'superproxy.zenrows.com'
  #     config.port = 1337
  #   end
  #
  # @yield [Configuration] the global configuration instance
  # @return [Configuration] the global configuration instance
  def self.configure
    return configuration unless block_given?

    yield configuration
    configuration
  end

  # Discard the current configuration and replace it with a fresh one.
  #
  # @return [Configuration] the newly created configuration instance
  def self.reset_configuration!
    @configuration = Configuration.new
  end
end
|
|
65
|
+
|
|
66
|
+
# Optional Rails integration
|
|
67
|
+
require_relative "zenrows/railtie" if defined?(Rails::Railtie)
|
data/plan.md
ADDED
|
@@ -0,0 +1,430 @@
|
|
|
1
|
+
# ZenRows Ruby Gem Extraction - Research & Plan
|
|
2
|
+
|
|
3
|
+
## Executive Summary
|
|
4
|
+
|
|
5
|
+
Extract ZenRows HTTP client from Marmot into standalone Ruby gem with multi-backend support (http.rb primary), comprehensive YARD docs, and feature parity with ZenRows API.
|
|
6
|
+
|
|
7
|
+
## Decisions
|
|
8
|
+
|
|
9
|
+
| Decision | Choice |
|
|
10
|
+
|----------|--------|
|
|
11
|
+
| Gem name | `zenrows` |
|
|
12
|
+
| Repository | Separate repo (new standalone) |
|
|
13
|
+
| Primary mode | Proxy mode (current impl) |
|
|
14
|
+
| Credits API | Dashboard only (no public API exists) |
|
|
15
|
+
| Rails support | Optional (ActiveSupport::Duration) |
|
|
16
|
+
| License | MIT |
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Current Implementation Analysis
|
|
21
|
+
|
|
22
|
+
### Source Files (Marmot)
|
|
23
|
+
| File | Lines | Purpose |
|
|
24
|
+
|------|-------|---------|
|
|
25
|
+
| `app/helpers/zenrows_helper.rb` | 286 | HTTP client factory, proxy config, SSL, DNS caching |
|
|
26
|
+
| `lib/cli/zenrows_cli.rb` | 1632 | CLI tool (not part of gem) |
|
|
27
|
+
| `lib/cli/dom_simplifier.rb` | 1484 | HTML processing (not part of gem) |
|
|
28
|
+
| `lib/cli/file_cache.rb` | 121 | Caching (not part of gem) |
|
|
29
|
+
|
|
30
|
+
### Current Features in `ZenrowsHelper`
|
|
31
|
+
```ruby
|
|
32
|
+
zenrows_http_client(opts = {})
|
|
33
|
+
# Connection
|
|
34
|
+
:connect_timeout # default 5s
|
|
35
|
+
:read_timeout # default 180s (auto-calc with js_render)
|
|
36
|
+
|
|
37
|
+
# JavaScript Rendering
|
|
38
|
+
:js_render # boolean - headless browser
|
|
39
|
+
:wait # int/duration - wait time (ms or ActiveSupport::Duration)
|
|
40
|
+
:js_instructions # JSON array - browser automation
|
|
41
|
+
:json_response # boolean - get JSON instead of HTML
|
|
42
|
+
:screenshot # boolean - take screenshot
|
|
43
|
+
:screenshot_fullpage # boolean
|
|
44
|
+
:screenshot_selector # CSS selector
|
|
45
|
+
|
|
46
|
+
# Proxy
|
|
47
|
+
:premium_proxy # boolean - residential IPs
|
|
48
|
+
:proxy_country # string - country code (enables premium)
|
|
49
|
+
:session_id # bool/string - session persistence
|
|
50
|
+
|
|
51
|
+
# Browser
|
|
52
|
+
:window_height # int
|
|
53
|
+
:window_width # int
|
|
54
|
+
|
|
55
|
+
# Headers
|
|
56
|
+
:headers # Hash
|
|
57
|
+
:custom_headers # boolean - enable custom headers
|
|
58
|
+
:original_status # boolean - return original HTTP status
|
|
59
|
+
|
|
60
|
+
# Helper methods
|
|
61
|
+
zenrows_ssl_context # SSL context (verify_mode: NONE)
|
|
62
|
+
http_zenrows_proxy_url(opts) # Build proxy URL
|
|
63
|
+
http_zenrows_proxy_array(opts) # [host, port, user, pass]
|
|
64
|
+
wss_zenrows_proxy_url(opts) # WebSocket proxy URL
|
|
65
|
+
ensure_dns_resolved(url, opts) # DNS pre-resolution with caching
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## ZenRows API Feature Comparison
|
|
71
|
+
|
|
72
|
+
### API Parameters (from ZenRows docs)
|
|
73
|
+
| Parameter | Current | Notes |
|
|
74
|
+
|-----------|---------|-------|
|
|
75
|
+
| `js_render` | ✅ | Headless browser |
|
|
76
|
+
| `premium_proxy` | ✅ | Residential IPs |
|
|
77
|
+
| `proxy_country` | ✅ | Geolocation |
|
|
78
|
+
| `wait` | ✅ | Wait after load (ms) |
|
|
79
|
+
| `wait_for` | ❌ MISSING | Wait for CSS selector |
|
|
80
|
+
| `js_instructions` | ✅ | Browser automation |
|
|
81
|
+
| `json_response` | ✅ | JSON output |
|
|
82
|
+
| `screenshot` | ✅ | Page screenshot |
|
|
83
|
+
| `screenshot_fullpage` | ✅ | Full page screenshot |
|
|
84
|
+
| `screenshot_selector` | ✅ | Element screenshot |
|
|
85
|
+
| `custom_headers` | ✅ | Custom HTTP headers |
|
|
86
|
+
| `session_id` | ✅ | Session persistence |
|
|
87
|
+
| `block_resources` | ❌ MISSING | Block CSS/images/fonts |
|
|
88
|
+
| `original_status` | ✅ | Original HTTP status |
|
|
89
|
+
| `window_width/height` | ✅ | Browser dimensions |
|
|
90
|
+
| `autoparse` | ❌ MISSING | Auto-extract data |
|
|
91
|
+
| `css_extractor` | ❌ MISSING | CSS-based extraction |
|
|
92
|
+
| `markdown_response` | ❌ MISSING | Markdown output |
|
|
93
|
+
|
|
94
|
+
### JavaScript Instructions (all supported in current impl)
|
|
95
|
+
- `wait` - Wait duration
|
|
96
|
+
- `wait_for` - Wait for selector
|
|
97
|
+
- `wait_event` - networkidle/load/domcontentloaded
|
|
98
|
+
- `click` - Click element
|
|
99
|
+
- `fill` - Fill input
|
|
100
|
+
- `check`/`uncheck` - Checkboxes
|
|
101
|
+
- `select_option` - Dropdowns
|
|
102
|
+
- `scroll_y`/`scroll_x` - Scrolling
|
|
103
|
+
- `scroll_to` - bottom/top
|
|
104
|
+
- `evaluate` - Custom JS
|
|
105
|
+
- `frame_*` - Iframe interactions
|
|
106
|
+
- `solve_captcha` - reCAPTCHA solving
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## Gem Architecture Design
|
|
111
|
+
|
|
112
|
+
### Directory Structure
|
|
113
|
+
```
|
|
114
|
+
zenrows-rb/
|
|
115
|
+
├── lib/
|
|
116
|
+
│ ├── zenrows.rb # Main entry, configuration
|
|
117
|
+
│ ├── zenrows/
|
|
118
|
+
│ │ ├── version.rb
|
|
119
|
+
│ │ ├── configuration.rb # Global config (api_key, defaults)
|
|
120
|
+
│ │ ├── client.rb # Main client class
|
|
121
|
+
│ │ ├── request.rb # Request builder
|
|
122
|
+
│ │ ├── response.rb # Response wrapper
|
|
123
|
+
│ │ ├── errors.rb # Custom exceptions
|
|
124
|
+
│ │ ├── proxy.rb # Proxy URL builder
|
|
125
|
+
│ │ ├── js_instructions.rb # JS instruction builder (DSL)
|
|
126
|
+
│ │ └── backends/
|
|
127
|
+
│ │ ├── base.rb # Backend interface
|
|
128
|
+
│ │ ├── http_rb.rb # http.rb adapter (primary)
|
|
129
|
+
│ │ ├── faraday.rb # Faraday adapter (future)
|
|
130
|
+
│ │ └── net_http.rb # Net::HTTP adapter (future)
|
|
131
|
+
├── sig/ # RBS type signatures
|
|
132
|
+
├── spec/ # RSpec tests
|
|
133
|
+
├── .yardopts
|
|
134
|
+
├── zenrows.gemspec
|
|
135
|
+
├── Gemfile
|
|
136
|
+
├── README.md
|
|
137
|
+
├── CHANGELOG.md
|
|
138
|
+
└── LICENSE
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
### Multi-Backend Architecture
|
|
142
|
+
```ruby
|
|
143
|
+
module Zenrows
|
|
144
|
+
module Backends
|
|
145
|
+
class Base
|
|
146
|
+
def get(url, options = {})
|
|
147
|
+
raise NotImplementedError
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
def post(url, body, options = {})
|
|
151
|
+
raise NotImplementedError
|
|
152
|
+
end
|
|
153
|
+
end
|
|
154
|
+
|
|
155
|
+
class HttpRb < Base
|
|
156
|
+
# Primary backend using http.rb gem
|
|
157
|
+
# Supports: proxy, ssl_context, timeouts, headers
|
|
158
|
+
end
|
|
159
|
+
end
|
|
160
|
+
end
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
### Public API Design
|
|
164
|
+
```ruby
|
|
165
|
+
# Configuration
|
|
166
|
+
Zenrows.configure do |config|
|
|
167
|
+
config.api_key = 'YOUR_API_KEY'
|
|
168
|
+
config.host = 'superproxy.zenrows.com' # from credentials
|
|
169
|
+
config.port = 1337
|
|
170
|
+
config.default_timeout = 180
|
|
171
|
+
config.backend = :http_rb # :faraday, :net_http (future)
|
|
172
|
+
end
|
|
173
|
+
|
|
174
|
+
# Get pre-configured HTTP client (proxy mode)
|
|
175
|
+
client = Zenrows::Client.new
|
|
176
|
+
|
|
177
|
+
# Simple proxy client
|
|
178
|
+
http = client.http(js_render: true, premium_proxy: true)
|
|
179
|
+
response = http.get('https://example.com')
|
|
180
|
+
|
|
181
|
+
# With full options
|
|
182
|
+
http = client.http(
|
|
183
|
+
js_render: true,
|
|
184
|
+
premium_proxy: true,
|
|
185
|
+
proxy_country: 'us',
|
|
186
|
+
wait: 5000, # ms or ActiveSupport::Duration
|
|
187
|
+
wait_for: '.content', # CSS selector
|
|
188
|
+
session_id: true # sticky session
|
|
189
|
+
)
|
|
190
|
+
response = http.get(url)
|
|
191
|
+
|
|
192
|
+
# Response handling
|
|
193
|
+
response.body # HTML string
|
|
194
|
+
response.status # HTTP status
|
|
195
|
+
response.headers # Response headers (includes Zr-* headers)
|
|
196
|
+
response.final_url # From Zr-Final-Url header
|
|
197
|
+
response.cookies # From Zr-Cookies header
|
|
198
|
+
|
|
199
|
+
# JS Instructions DSL (optional)
|
|
200
|
+
instructions = Zenrows::JsInstructions.build do
|
|
201
|
+
click '.load-more'
|
|
202
|
+
wait 2000
|
|
203
|
+
fill 'input#email', 'test@example.com'
|
|
204
|
+
scroll_to :bottom
|
|
205
|
+
wait_for '.results'
|
|
206
|
+
end
|
|
207
|
+
http = client.http(js_render: true, js_instructions: instructions)
|
|
208
|
+
|
|
209
|
+
# SSL context (auto-configured, verify_mode: NONE for proxy)
|
|
210
|
+
# Timeouts auto-calculated based on js_render + wait options
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
---
|
|
214
|
+
|
|
215
|
+
## http.rb Backend Implementation
|
|
216
|
+
|
|
217
|
+
### Key Patterns from http.rb docs
|
|
218
|
+
```ruby
|
|
219
|
+
# Proxy with auth
|
|
220
|
+
HTTP.via("proxy-hostname", 8080, "username", "password")
|
|
221
|
+
.get("http://example.com")
|
|
222
|
+
|
|
223
|
+
# Custom proxy headers
|
|
224
|
+
HTTP.via("proxy-hostname", 8080, {"Proxy-Authorization" => "..."})
|
|
225
|
+
.get("http://example.com")
|
|
226
|
+
|
|
227
|
+
# Timeouts (per-operation)
|
|
228
|
+
HTTP.timeout(connect: 5, write: 2, read: 10).get(url)
|
|
229
|
+
|
|
230
|
+
# Global timeout
|
|
231
|
+
HTTP.timeout(30).get(url)
|
|
232
|
+
|
|
233
|
+
# SSL context
|
|
234
|
+
ssl_context = OpenSSL::SSL::SSLContext.new
|
|
235
|
+
ssl_context.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
|
236
|
+
HTTP.get(url, ssl_context: ssl_context)
|
|
237
|
+
|
|
238
|
+
# Headers
|
|
239
|
+
HTTP.headers("User-Agent" => "...", "Accept" => "...").get(url)
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
---
|
|
243
|
+
|
|
244
|
+
## YARD Documentation Standards
|
|
245
|
+
|
|
246
|
+
### Required Tags
|
|
247
|
+
```ruby
|
|
248
|
+
# @author Ernest Bursa
|
|
249
|
+
# @since 1.0.0
|
|
250
|
+
# @api public
|
|
251
|
+
|
|
252
|
+
# @param url [String] Target URL to scrape
|
|
253
|
+
# @param options [Hash] Request options
|
|
254
|
+
# @option options [Boolean] :js_render Enable JavaScript rendering
|
|
255
|
+
# @option options [Boolean] :premium_proxy Use residential proxies
|
|
256
|
+
# @option options [String] :proxy_country ISO country code (us, gb, de)
|
|
257
|
+
# @option options [Integer] :wait Wait time in milliseconds
|
|
258
|
+
# @option options [String] :wait_for CSS selector to wait for
|
|
259
|
+
# @return [Zenrows::Response] Response object
|
|
260
|
+
# @raise [Zenrows::Error] Base error class
|
|
261
|
+
# @raise [Zenrows::RateLimitError] When rate limited (429)
|
|
262
|
+
# @raise [Zenrows::AuthenticationError] Invalid API key
|
|
263
|
+
# @example Basic request
|
|
264
|
+
# client.get('https://example.com')
|
|
265
|
+
# @example With JavaScript rendering
|
|
266
|
+
# client.get('https://example.com', js_render: true, wait: 5000)
|
|
267
|
+
def get(url, options = {})
|
|
268
|
+
end
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
---
|
|
272
|
+
|
|
273
|
+
## Implementation Plan
|
|
274
|
+
|
|
275
|
+
### Phase 1: Gem Skeleton
|
|
276
|
+
```bash
|
|
277
|
+
bundle gem zenrows --test=rspec --ci=github --linter=rubocop
|
|
278
|
+
```
|
|
279
|
+
- Configure gemspec (http dependency, MIT license)
|
|
280
|
+
- Set up YARD (.yardopts)
|
|
281
|
+
- Configure RuboCop with relaxed rules
|
|
282
|
+
|
|
283
|
+
### Phase 2: Configuration Module
|
|
284
|
+
```ruby
|
|
285
|
+
# lib/zenrows.rb
|
|
286
|
+
# lib/zenrows/configuration.rb
|
|
287
|
+
```
|
|
288
|
+
- Global config: api_key, host, port, default_timeout, backend
|
|
289
|
+
- Thread-safe configuration
|
|
290
|
+
- Optional Rails initializer generator
|
|
291
|
+
|
|
292
|
+
### Phase 3: Backend Interface + http.rb Adapter
|
|
293
|
+
```ruby
|
|
294
|
+
# lib/zenrows/backends/base.rb
|
|
295
|
+
# lib/zenrows/backends/http_rb.rb
|
|
296
|
+
```
|
|
297
|
+
- Abstract base with `#build_client(options)` method
|
|
298
|
+
- http.rb implementation with proxy, SSL, timeouts
|
|
299
|
+
- Port logic from `zenrows_helper.rb:30-200`
|
|
300
|
+
|
|
301
|
+
### Phase 4: Proxy URL Builder
|
|
302
|
+
```ruby
|
|
303
|
+
# lib/zenrows/proxy.rb
|
|
304
|
+
```
|
|
305
|
+
- Build proxy URL with the API key as the username and the options encoded in the password
|
|
306
|
+
- Format: `http://API_KEY:opt1=val1&opt2=val2@host:port`
|
|
307
|
+
- Support all ZenRows proxy options
|
|
308
|
+
|
|
309
|
+
### Phase 5: Client Class
|
|
310
|
+
```ruby
|
|
311
|
+
# lib/zenrows/client.rb
|
|
312
|
+
```
|
|
313
|
+
- `#http(options)` returns configured HTTP client
|
|
314
|
+
- Timeout auto-calculation (base + js_render + wait)
|
|
315
|
+
- SSL context configuration
|
|
316
|
+
|
|
317
|
+
### Phase 6: JS Instructions DSL
|
|
318
|
+
```ruby
|
|
319
|
+
# lib/zenrows/js_instructions.rb
|
|
320
|
+
```
|
|
321
|
+
- Builder pattern with block syntax
|
|
322
|
+
- All instructions: click, fill, wait, scroll, evaluate, frame_*
|
|
323
|
+
- JSON serialization for proxy header
|
|
324
|
+
|
|
325
|
+
### Phase 7: Response Enhancement (optional)
|
|
326
|
+
```ruby
|
|
327
|
+
# lib/zenrows/response.rb # Only if wrapping needed
|
|
328
|
+
```
|
|
329
|
+
- Helper methods for Zr-* headers
|
|
330
|
+
- Cookie extraction
|
|
331
|
+
- Final URL extraction
|
|
332
|
+
|
|
333
|
+
### Phase 8: Error Classes
|
|
334
|
+
```ruby
|
|
335
|
+
# lib/zenrows/errors.rb
|
|
336
|
+
```
|
|
337
|
+
- `Zenrows::Error` base class
|
|
338
|
+
- Specific errors with helpful messages
|
|
339
|
+
- Retry suggestions
|
|
340
|
+
|
|
341
|
+
### Phase 9: Rails Integration (optional)
|
|
342
|
+
```ruby
|
|
343
|
+
# lib/zenrows/railtie.rb
|
|
344
|
+
# lib/generators/zenrows/install_generator.rb
|
|
345
|
+
```
|
|
346
|
+
- ActiveSupport::Duration support in wait option
|
|
347
|
+
- Rails config generator (optional)
|
|
348
|
+
|
|
349
|
+
### Phase 10: Documentation & Testing
|
|
350
|
+
- YARD docs for all public methods
|
|
351
|
+
- RSpec unit tests with WebMock
|
|
352
|
+
- VCR for integration tests
|
|
353
|
+
- README with comprehensive examples
|
|
354
|
+
- CHANGELOG
|
|
355
|
+
|
|
356
|
+
---
|
|
357
|
+
|
|
358
|
+
## Missing Features to Add
|
|
359
|
+
|
|
360
|
+
| Feature | Priority | Notes |
|
|
361
|
+
|---------|----------|-------|
|
|
362
|
+
| `wait_for` | HIGH | Wait for CSS selector |
|
|
363
|
+
| `block_resources` | MEDIUM | Block images/fonts/CSS |
|
|
364
|
+
| `autoparse` | LOW | Auto-extraction |
|
|
365
|
+
| `css_extractor` | LOW | CSS-based extraction |
|
|
366
|
+
| `markdown_response` | LOW | Markdown output |
|
|
367
|
+
|
|
368
|
+
---
|
|
369
|
+
|
|
370
|
+
## Files to Extract/Reference
|
|
371
|
+
|
|
372
|
+
### From Marmot (copy logic, rewrite for gem)
|
|
373
|
+
- `app/helpers/zenrows_helper.rb:30-100` - Proxy URL building
|
|
374
|
+
- `app/helpers/zenrows_helper.rb:100-200` - HTTP client configuration
|
|
375
|
+
- `app/helpers/zenrows_helper.rb:200-286` - DNS resolution, SSL context
|
|
376
|
+
|
|
377
|
+
### Dependencies
|
|
378
|
+
```ruby
|
|
379
|
+
# zenrows.gemspec
|
|
380
|
+
spec.add_dependency 'http', '~> 5.0' # Primary HTTP backend
|
|
381
|
+
spec.add_dependency 'addressable', '~> 2.8' # URL handling
|
|
382
|
+
|
|
383
|
+
spec.add_development_dependency 'rspec', '~> 3.12'
|
|
384
|
+
spec.add_development_dependency 'vcr', '~> 6.0'
|
|
385
|
+
spec.add_development_dependency 'webmock', '~> 3.0'
|
|
386
|
+
spec.add_development_dependency 'yard', '~> 0.9'
|
|
387
|
+
spec.add_development_dependency 'rubocop', '~> 1.0'
|
|
388
|
+
```
|
|
389
|
+
|
|
390
|
+
---
|
|
391
|
+
|
|
392
|
+
## Final File Structure
|
|
393
|
+
|
|
394
|
+
```
|
|
395
|
+
zenrows/
|
|
396
|
+
├── lib/
|
|
397
|
+
│ ├── zenrows.rb # Main entry point
|
|
398
|
+
│ └── zenrows/
|
|
399
|
+
│ ├── version.rb
|
|
400
|
+
│ ├── configuration.rb
|
|
401
|
+
│ ├── client.rb
|
|
402
|
+
│ ├── proxy.rb
|
|
403
|
+
│ ├── js_instructions.rb
|
|
404
|
+
│ ├── errors.rb
|
|
405
|
+
│ ├── railtie.rb # Optional Rails
|
|
406
|
+
│ └── backends/
|
|
407
|
+
│ ├── base.rb
|
|
408
|
+
│ └── http_rb.rb # Primary backend
|
|
409
|
+
├── sig/ # RBS types (future)
|
|
410
|
+
├── spec/
|
|
411
|
+
│ ├── zenrows/
|
|
412
|
+
│ │ ├── client_spec.rb
|
|
413
|
+
│ │ ├── proxy_spec.rb
|
|
414
|
+
│ │ └── js_instructions_spec.rb
|
|
415
|
+
│ ├── spec_helper.rb
|
|
416
|
+
│ └── fixtures/vcr_cassettes/
|
|
417
|
+
├── .yardopts
|
|
418
|
+
├── .rubocop.yml
|
|
419
|
+
├── zenrows.gemspec
|
|
420
|
+
├── Gemfile
|
|
421
|
+
├── Rakefile
|
|
422
|
+
├── README.md
|
|
423
|
+
├── CHANGELOG.md
|
|
424
|
+
└── LICENSE.txt
|
|
425
|
+
```
|
|
426
|
+
|
|
427
|
+
## Source Reference
|
|
428
|
+
|
|
429
|
+
Extract core logic from Marmot file:
|
|
430
|
+
- **`app/helpers/zenrows_helper.rb`** - All proxy/client building logic
|
data/sig/zenrows.rbs
ADDED
data/test/test_helper.rb
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "test_helper"
|
|
4
|
+
|
|
5
|
+
# Tests for Zenrows::Client: construction from global or per-client
# settings, and the HTTP client, SSL context and proxy helpers it exposes.
class ClientTest < Minitest::Test
  # Start every test from a freshly reset configuration with a dummy API key.
  def setup
    Zenrows.reset_configuration!
    Zenrows.configure do |config|
      config.api_key = "test_api_key"
    end
  end

  # Restore the global configuration after each test so that state set here
  # (for example the :unsupported backend) cannot leak into other test classes.
  def teardown
    Zenrows.reset_configuration!
  end

  # With no arguments the client picks up the global configuration.
  def test_initialize_with_global_config
    client = Zenrows::Client.new

    assert_equal "test_api_key", client.config.api_key
    assert_equal "superproxy.zenrows.com", client.config.host
    assert_equal 1337, client.config.port
  end

  # Keyword arguments take precedence over the global configuration.
  def test_initialize_with_overrides
    client = Zenrows::Client.new(
      api_key: "override_key",
      host: "custom.proxy.com",
      port: 8080
    )

    assert_equal "override_key", client.config.api_key
    assert_equal "custom.proxy.com", client.config.host
    assert_equal 8080, client.config.port
  end

  # A client cannot be built when no API key is configured.
  def test_initialize_raises_without_api_key
    # Drop the API key that setup configured.
    Zenrows.reset_configuration!

    assert_raises Zenrows::ConfigurationError do
      Zenrows::Client.new
    end
  end

  # #http hands back an http.rb client object.
  def test_http_returns_http_client
    client = Zenrows::Client.new
    http = client.http(js_render: true)

    assert_kind_of HTTP::Client, http
  end

  # #ssl_context returns an OpenSSL context with peer verification disabled.
  def test_ssl_context
    client = Zenrows::Client.new
    ssl = client.ssl_context

    assert_kind_of OpenSSL::SSL::SSLContext, ssl
    assert_equal OpenSSL::SSL::VERIFY_NONE, ssl.verify_mode
  end

  # #proxy_config puts the API key in :username and the request options,
  # as key=value pairs, in :password.
  def test_proxy_config
    client = Zenrows::Client.new
    config = client.proxy_config(js_render: true, premium_proxy: true)

    assert_equal "superproxy.zenrows.com", config[:host]
    assert_equal 1337, config[:port]
    assert_equal "test_api_key", config[:username]
    assert_includes config[:password], "js_render=true"
    assert_includes config[:password], "premium_proxy=true"
  end

  # #proxy_url embeds the API key, proxy host and options in a single string.
  def test_proxy_url
    client = Zenrows::Client.new
    url = client.proxy_url(js_render: true)

    assert_includes url, "test_api_key"
    assert_includes url, "superproxy.zenrows.com"
    assert_includes url, "js_render=true"
  end

  # Selecting an unknown backend is rejected when the client is built.
  def test_unsupported_backend_raises
    Zenrows.configure { |c| c.backend = :unsupported }

    assert_raises Zenrows::ConfigurationError do
      Zenrows::Client.new
    end
  end
end
|