zenrows 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Zenrows
4
+ # Rails integration for Zenrows.
5
+ #
6
+ # This Railtie does two things:
7
+ # - registers an initializer that makes Rails.logger the Zenrows logger unless one is already configured
8
+ # - requires the install generator from the generators hook when Rails::Generators is defined
9
+ # NOTE(review): ActiveSupport::Duration support for wait times is advertised for the Rails integration but is not implemented in this class - presumably handled elsewhere; confirm.
10
+ # @author Ernest Bursa
11
+ # @since 0.1.0
12
+ class Railtie < Rails::Railtie
13
+ initializer "zenrows.configure_rails_initialization" do
14
+ # Rails.logger is only a fallback: `||=` leaves an already-configured logger untouched
15
+ Zenrows.configure do |config|
16
+ config.logger ||= Rails.logger
17
+ end
18
+ end
19
+
20
+ # Generator for creating initializer. NOTE(review): require_relative resolves from this file's own directory - confirm the generator ships under its generators/zenrows/ subdirectory.
21
+ generators do
22
+ require_relative "generators/zenrows/install_generator" if defined?(Rails::Generators)
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Zenrows
4
+ VERSION = "0.1.0" # version string of the zenrows gem
5
+ end
data/lib/zenrows.rb ADDED
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "zenrows/version"
4
+ require_relative "zenrows/errors"
5
+ require_relative "zenrows/configuration"
6
+ require_relative "zenrows/proxy"
7
+ require_relative "zenrows/js_instructions"
8
+ require_relative "zenrows/backends/base"
9
+ require_relative "zenrows/backends/http_rb"
10
+ require_relative "zenrows/client"
11
+
12
+ # ZenRows Ruby client for web scraping proxy
13
+ #
14
+ # @example Basic configuration and usage
15
+ # Zenrows.configure do |config|
16
+ # config.api_key = 'YOUR_API_KEY'
17
+ # end
18
+ #
19
+ # client = Zenrows::Client.new
20
+ # http = client.http(js_render: true, premium_proxy: true)
21
+ # response = http.get('https://example.com', ssl_context: client.ssl_context)
22
+ #
23
+ # @example With JavaScript instructions
24
+ # instructions = Zenrows::JsInstructions.build do
25
+ # click '.load-more'
26
+ # wait 2000
27
+ # scroll_to :bottom
28
+ # end
29
+ #
30
+ # http = client.http(js_render: true, js_instructions: instructions)
31
+ #
32
+ # @author Ernest Bursa
33
+ # @since 0.1.0
34
+ module Zenrows
35
+ class << self
36
+ # @return [Configuration] Global configuration instance, built on first access and memoized
37
+ def configuration
38
+ @configuration ||= Configuration.new
39
+ end
40
+
41
+ # Configure Zenrows; the block is optional and receives the global configuration
42
+ #
43
+ # @example
44
+ # Zenrows.configure do |config|
45
+ # config.api_key = 'YOUR_API_KEY'
46
+ # config.host = 'superproxy.zenrows.com'
47
+ # config.port = 1337
48
+ # end
49
+ #
50
+ # @yieldparam config [Configuration] global configuration instance to modify
51
+ # @return [Configuration] the same global configuration instance
52
+ def configure
53
+ yield(configuration) if block_given?
54
+ configuration
55
+ end
56
+
57
+ # Reset configuration to defaults by replacing the global instance with a new Configuration
58
+ #
59
+ # @return [void]
60
+ def reset_configuration!
61
+ @configuration = Configuration.new
62
+ end
63
+ end
64
+ end
65
+
66
+ # Optional Rails integration: the Railtie is loaded only when Rails::Railtie is already defined
67
+ require_relative "zenrows/railtie" if defined?(Rails::Railtie)
data/plan.md ADDED
@@ -0,0 +1,430 @@
1
+ # ZenRows Ruby Gem Extraction - Research & Plan
2
+
3
+ ## Executive Summary
4
+
5
+ Extract the ZenRows HTTP client from Marmot into a standalone Ruby gem with multi-backend support (http.rb as the primary backend), comprehensive YARD docs, and feature parity with the ZenRows API.
6
+
7
+ ## Decisions
8
+
9
+ | Decision | Choice |
10
+ |----------|--------|
11
+ | Gem name | `zenrows` |
12
+ | Repository | Separate repo (new standalone) |
13
+ | Primary mode | Proxy mode (current impl) |
14
+ | Credits API | Dashboard only (no public API exists) |
15
+ | Rails support | Optional (ActiveSupport::Duration) |
16
+ | License | MIT |
17
+
18
+ ---
19
+
20
+ ## Current Implementation Analysis
21
+
22
+ ### Source Files (Marmot)
23
+ | File | Lines | Purpose |
24
+ |------|-------|---------|
25
+ | `app/helpers/zenrows_helper.rb` | 286 | HTTP client factory, proxy config, SSL, DNS caching |
26
+ | `lib/cli/zenrows_cli.rb` | 1632 | CLI tool (not part of gem) |
27
+ | `lib/cli/dom_simplifier.rb` | 1484 | HTML processing (not part of gem) |
28
+ | `lib/cli/file_cache.rb` | 121 | Caching (not part of gem) |
29
+
30
+ ### Current Features in `ZenrowsHelper`
31
+ ```ruby
32
+ zenrows_http_client(opts = {})
33
+ # Connection
34
+ :connect_timeout # default 5s
35
+ :read_timeout # default 180s (auto-calc with js_render)
36
+
37
+ # JavaScript Rendering
38
+ :js_render # boolean - headless browser
39
+ :wait # int/duration - wait time (ms or ActiveSupport::Duration)
40
+ :js_instructions # JSON array - browser automation
41
+ :json_response # boolean - get JSON instead of HTML
42
+ :screenshot # boolean - take screenshot
43
+ :screenshot_fullpage # boolean
44
+ :screenshot_selector # CSS selector
45
+
46
+ # Proxy
47
+ :premium_proxy # boolean - residential IPs
48
+ :proxy_country # string - country code (enables premium)
49
+ :session_id # bool/string - session persistence
50
+
51
+ # Browser
52
+ :window_height # int
53
+ :window_width # int
54
+
55
+ # Headers
56
+ :headers # Hash
57
+ :custom_headers # boolean - enable custom headers
58
+ :original_status # boolean - return original HTTP status
59
+
60
+ # Helper methods
61
+ zenrows_ssl_context # SSL context (verify_mode: NONE)
62
+ http_zenrows_proxy_url(opts) # Build proxy URL
63
+ http_zenrows_proxy_array(opts) # [host, port, user, pass]
64
+ wss_zenrows_proxy_url(opts) # WebSocket proxy URL
65
+ ensure_dns_resolved(url, opts) # DNS pre-resolution with caching
66
+ ```
67
+
68
+ ---
69
+
70
+ ## ZenRows API Feature Comparison
71
+
72
+ ### API Parameters (from ZenRows docs)
73
+ | Parameter | Current | Notes |
74
+ |-----------|---------|-------|
75
+ | `js_render` | ✅ | Headless browser |
76
+ | `premium_proxy` | ✅ | Residential IPs |
77
+ | `proxy_country` | ✅ | Geolocation |
78
+ | `wait` | ✅ | Wait after load (ms) |
79
+ | `wait_for` | ❌ MISSING | Wait for CSS selector |
80
+ | `js_instructions` | ✅ | Browser automation |
81
+ | `json_response` | ✅ | JSON output |
82
+ | `screenshot` | ✅ | Page screenshot |
83
+ | `screenshot_fullpage` | ✅ | Full page screenshot |
84
+ | `screenshot_selector` | ✅ | Element screenshot |
85
+ | `custom_headers` | ✅ | Custom HTTP headers |
86
+ | `session_id` | ✅ | Session persistence |
87
+ | `block_resources` | ❌ MISSING | Block CSS/images/fonts |
88
+ | `original_status` | ✅ | Original HTTP status |
89
+ | `window_width/height` | ✅ | Browser dimensions |
90
+ | `autoparse` | ❌ MISSING | Auto-extract data |
91
+ | `css_extractor` | ❌ MISSING | CSS-based extraction |
92
+ | `markdown_response` | ❌ MISSING | Markdown output |
93
+
94
+ ### JavaScript Instructions (all supported in current impl)
95
+ - `wait` - Wait duration
96
+ - `wait_for` - Wait for selector
97
+ - `wait_event` - networkidle/load/domcontentloaded
98
+ - `click` - Click element
99
+ - `fill` - Fill input
100
+ - `check`/`uncheck` - Checkboxes
101
+ - `select_option` - Dropdowns
102
+ - `scroll_y`/`scroll_x` - Scrolling
103
+ - `scroll_to` - bottom/top
104
+ - `evaluate` - Custom JS
105
+ - `frame_*` - Iframe interactions
106
+ - `solve_captcha` - reCAPTCHA solving
107
+
108
+ ---
109
+
110
+ ## Gem Architecture Design
111
+
112
+ ### Directory Structure
113
+ ```
114
+ zenrows-rb/
115
+ ├── lib/
116
+ │ ├── zenrows.rb # Main entry, configuration
117
+ │ ├── zenrows/
118
+ │ │ ├── version.rb
119
+ │ │ ├── configuration.rb # Global config (api_key, defaults)
120
+ │ │ ├── client.rb # Main client class
121
+ │ │ ├── request.rb # Request builder
122
+ │ │ ├── response.rb # Response wrapper
123
+ │ │ ├── errors.rb # Custom exceptions
124
+ │ │ ├── proxy.rb # Proxy URL builder
125
+ │ │ ├── js_instructions.rb # JS instruction builder (DSL)
126
+ │ │ └── backends/
127
+ │ │   ├── base.rb # Backend interface
128
+ │ │   ├── http_rb.rb # http.rb adapter (primary)
129
+ │ │   ├── faraday.rb # Faraday adapter (future)
130
+ │ │   └── net_http.rb # Net::HTTP adapter (future)
131
+ ├── sig/ # RBS type signatures
132
+ ├── spec/ # RSpec tests
133
+ ├── .yardopts
134
+ ├── zenrows.gemspec
135
+ ├── Gemfile
136
+ ├── README.md
137
+ ├── CHANGELOG.md
138
+ └── LICENSE
139
+ ```
140
+
141
+ ### Multi-Backend Architecture
142
+ ```ruby
143
+ module Zenrows
144
+ module Backends
145
+ class Base
146
+ def get(url, options = {})
147
+ raise NotImplementedError
148
+ end
149
+
150
+ def post(url, body, options = {})
151
+ raise NotImplementedError
152
+ end
153
+ end
154
+
155
+ class HttpRb < Base
156
+ # Primary backend using http.rb gem
157
+ # Supports: proxy, ssl_context, timeouts, headers
158
+ end
159
+ end
160
+ end
161
+ ```
162
+
163
+ ### Public API Design
164
+ ```ruby
165
+ # Configuration
166
+ Zenrows.configure do |config|
167
+ config.api_key = 'YOUR_API_KEY'
168
+ config.host = 'superproxy.zenrows.com' # from credentials
169
+ config.port = 1337
170
+ config.default_timeout = 180
171
+ config.backend = :http_rb # :faraday, :net_http (future)
172
+ end
173
+
174
+ # Get pre-configured HTTP client (proxy mode)
175
+ client = Zenrows::Client.new
176
+
177
+ # Simple proxy client
178
+ http = client.http(js_render: true, premium_proxy: true)
179
+ response = http.get('https://example.com')
180
+
181
+ # With full options
182
+ http = client.http(
183
+ js_render: true,
184
+ premium_proxy: true,
185
+ proxy_country: 'us',
186
+ wait: 5000, # ms or ActiveSupport::Duration
187
+ wait_for: '.content', # CSS selector
188
+ session_id: true # sticky session
189
+ )
190
+ response = http.get(url)
191
+
192
+ # Response handling
193
+ response.body # HTML string
194
+ response.status # HTTP status
195
+ response.headers # Response headers (includes Zr-* headers)
196
+ response.final_url # From Zr-Final-Url header
197
+ response.cookies # From Zr-Cookies header
198
+
199
+ # JS Instructions DSL (optional)
200
+ instructions = Zenrows::JsInstructions.build do
201
+ click '.load-more'
202
+ wait 2000
203
+ fill 'input#email', 'test@example.com'
204
+ scroll_to :bottom
205
+ wait_for '.results'
206
+ end
207
+ http = client.http(js_render: true, js_instructions: instructions)
208
+
209
+ # SSL context (auto-configured, verify_mode: NONE for proxy)
210
+ # Timeouts auto-calculated based on js_render + wait options
211
+ ```
212
+
213
+ ---
214
+
215
+ ## http.rb Backend Implementation
216
+
217
+ ### Key Patterns from http.rb docs
218
+ ```ruby
219
+ # Proxy with auth
220
+ HTTP.via("proxy-hostname", 8080, "username", "password")
221
+ .get("http://example.com")
222
+
223
+ # Custom proxy headers
224
+ HTTP.via("proxy-hostname", 8080, {"Proxy-Authorization" => "..."})
225
+ .get("http://example.com")
226
+
227
+ # Timeouts (per-operation)
228
+ HTTP.timeout(connect: 5, write: 2, read: 10).get(url)
229
+
230
+ # Global timeout
231
+ HTTP.timeout(30).get(url)
232
+
233
+ # SSL context
234
+ ssl_context = OpenSSL::SSL::SSLContext.new
235
+ ssl_context.verify_mode = OpenSSL::SSL::VERIFY_NONE
236
+ HTTP.get(url, ssl_context: ssl_context)
237
+
238
+ # Headers
239
+ HTTP.headers("User-Agent" => "...", "Accept" => "...").get(url)
240
+ ```
241
+
242
+ ---
243
+
244
+ ## YARD Documentation Standards
245
+
246
+ ### Required Tags
247
+ ```ruby
248
+ # @author Ernest Bursa
249
+ # @since 0.1.0
250
+ # @api public
251
+
252
+ # @param url [String] Target URL to scrape
253
+ # @param options [Hash] Request options
254
+ # @option options [Boolean] :js_render Enable JavaScript rendering
255
+ # @option options [Boolean] :premium_proxy Use residential proxies
256
+ # @option options [String] :proxy_country ISO country code (us, gb, de)
257
+ # @option options [Integer] :wait Wait time in milliseconds
258
+ # @option options [String] :wait_for CSS selector to wait for
259
+ # @return [Zenrows::Response] Response object
260
+ # @raise [Zenrows::Error] Base error class
261
+ # @raise [Zenrows::RateLimitError] When rate limited (429)
262
+ # @raise [Zenrows::AuthenticationError] Invalid API key
263
+ # @example Basic request
264
+ # client.get('https://example.com')
265
+ # @example With JavaScript rendering
266
+ # client.get('https://example.com', js_render: true, wait: 5000)
267
+ def get(url, options = {})
268
+ end
269
+ ```
270
+
271
+ ---
272
+
273
+ ## Implementation Plan
274
+
275
+ ### Phase 1: Gem Skeleton
276
+ ```bash
277
+ bundle gem zenrows --test=rspec --ci=github --linter=rubocop
278
+ ```
279
+ - Configure gemspec (http dependency, MIT license)
280
+ - Set up YARD (.yardopts)
281
+ - Configure RuboCop with relaxed rules
282
+
283
+ ### Phase 2: Configuration Module
284
+ ```ruby
285
+ # lib/zenrows.rb
286
+ # lib/zenrows/configuration.rb
287
+ ```
288
+ - Global config: api_key, host, port, default_timeout, backend
289
+ - Thread-safe configuration
290
+ - Optional Rails initializer generator
291
+
292
+ ### Phase 3: Backend Interface + http.rb Adapter
293
+ ```ruby
294
+ # lib/zenrows/backends/base.rb
295
+ # lib/zenrows/backends/http_rb.rb
296
+ ```
297
+ - Abstract base with `#build_client(options)` method
298
+ - http.rb implementation with proxy, SSL, timeouts
299
+ - Port logic from `zenrows_helper.rb:30-200`
300
+
301
+ ### Phase 4: Proxy URL Builder
302
+ ```ruby
303
+ # lib/zenrows/proxy.rb
304
+ ```
305
+ - Build proxy URL with the API key as the username and the options encoded in the password
306
+ - Format: `http://API_KEY:opt1=val1&opt2=val2@host:port`
307
+ - Support all ZenRows proxy options
308
+
309
+ ### Phase 5: Client Class
310
+ ```ruby
311
+ # lib/zenrows/client.rb
312
+ ```
313
+ - `#http(options)` returns configured HTTP client
314
+ - Timeout auto-calculation (base + js_render + wait)
315
+ - SSL context configuration
316
+
317
+ ### Phase 6: JS Instructions DSL
318
+ ```ruby
319
+ # lib/zenrows/js_instructions.rb
320
+ ```
321
+ - Builder pattern with block syntax
322
+ - All instructions: click, fill, wait, scroll, evaluate, frame_*
323
+ - JSON serialization for proxy header
324
+
325
+ ### Phase 7: Response Enhancement (optional)
326
+ ```ruby
327
+ # lib/zenrows/response.rb # Only if wrapping needed
328
+ ```
329
+ - Helper methods for Zr-* headers
330
+ - Cookie extraction
331
+ - Final URL extraction
332
+
333
+ ### Phase 8: Error Classes
334
+ ```ruby
335
+ # lib/zenrows/errors.rb
336
+ ```
337
+ - `Zenrows::Error` base class
338
+ - Specific errors with helpful messages
339
+ - Retry suggestions
340
+
341
+ ### Phase 9: Rails Integration (optional)
342
+ ```ruby
343
+ # lib/zenrows/railtie.rb
344
+ # lib/generators/zenrows/install_generator.rb
345
+ ```
346
+ - ActiveSupport::Duration support in wait option
347
+ - Rails config generator (optional)
348
+
349
+ ### Phase 10: Documentation & Testing
350
+ - YARD docs for all public methods
351
+ - RSpec unit tests with WebMock
352
+ - VCR for integration tests
353
+ - README with comprehensive examples
354
+ - CHANGELOG
355
+
356
+ ---
357
+
358
+ ## Missing Features to Add
359
+
360
+ | Feature | Priority | Notes |
361
+ |---------|----------|-------|
362
+ | `wait_for` | HIGH | Wait for CSS selector |
363
+ | `block_resources` | MEDIUM | Block images/fonts/CSS |
364
+ | `autoparse` | LOW | Auto-extraction |
365
+ | `css_extractor` | LOW | CSS-based extraction |
366
+ | `markdown_response` | LOW | Markdown output |
367
+
368
+ ---
369
+
370
+ ## Files to Extract/Reference
371
+
372
+ ### From Marmot (copy logic, rewrite for gem)
373
+ - `app/helpers/zenrows_helper.rb:30-100` - Proxy URL building
374
+ - `app/helpers/zenrows_helper.rb:100-200` - HTTP client configuration
375
+ - `app/helpers/zenrows_helper.rb:200-286` - DNS resolution, SSL context
376
+
377
+ ### Dependencies
378
+ ```ruby
379
+ # zenrows.gemspec
380
+ spec.add_dependency 'http', '~> 5.0' # Primary HTTP backend
381
+ spec.add_dependency 'addressable', '~> 2.8' # URL handling
382
+
383
+ spec.add_development_dependency 'rspec', '~> 3.12'
384
+ spec.add_development_dependency 'vcr', '~> 6.0'
385
+ spec.add_development_dependency 'webmock', '~> 3.0'
386
+ spec.add_development_dependency 'yard', '~> 0.9'
387
+ spec.add_development_dependency 'rubocop', '~> 1.0'
388
+ ```
389
+
390
+ ---
391
+
392
+ ## Final File Structure
393
+
394
+ ```
395
+ zenrows/
396
+ ├── lib/
397
+ │ ├── zenrows.rb # Main entry point
398
+ │ └── zenrows/
399
+ │   ├── version.rb
400
+ │   ├── configuration.rb
401
+ │   ├── client.rb
402
+ │   ├── proxy.rb
403
+ │   ├── js_instructions.rb
404
+ │   ├── errors.rb
405
+ │   ├── railtie.rb # Optional Rails
406
+ │   └── backends/
407
+ │     ├── base.rb
408
+ │     └── http_rb.rb # Primary backend
409
+ ├── sig/ # RBS types (future)
410
+ ├── spec/
411
+ │ ├── zenrows/
412
+ │ │ ├── client_spec.rb
413
+ │ │ ├── proxy_spec.rb
414
+ │ │ └── js_instructions_spec.rb
415
+ │ ├── spec_helper.rb
416
+ │ └── fixtures/vcr_cassettes/
417
+ ├── .yardopts
418
+ ├── .rubocop.yml
419
+ ├── zenrows.gemspec
420
+ ├── Gemfile
421
+ ├── Rakefile
422
+ ├── README.md
423
+ ├── CHANGELOG.md
424
+ └── LICENSE.txt
425
+ ```
426
+
427
+ ## Source Reference
428
+
429
+ Extract core logic from Marmot file:
430
+ - **`app/helpers/zenrows_helper.rb`** - All proxy/client building logic
data/sig/zenrows.rbs ADDED
@@ -0,0 +1,4 @@
1
+ module Zenrows
2
+ VERSION: String
3
+ # Only VERSION is declared in this signature; see the writing guide of rbs: https://github.com/ruby/rbs#guides
4
+ end
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+ # Put the lib directory that sits beside this file's directory at the front of the load path before requiring zenrows
3
+ $LOAD_PATH.unshift File.expand_path("../lib", __dir__)
4
+ require "zenrows"
5
+
6
+ require "minitest/autorun"
7
+ require "webmock/minitest"
@@ -0,0 +1,83 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "test_helper"
4
+ # Tests for Zenrows::Client: construction from global config and from overrides, the HTTP client, SSL context, proxy settings and backend validation.
5
+ class ClientTest < Minitest::Test
6
+ def setup
7
+ Zenrows.reset_configuration! # every test starts from a fresh global configuration
8
+ Zenrows.configure do |config|
9
+ config.api_key = "test_api_key" # the only setting configured here
10
+ end
11
+ end
12
+
13
+ def test_initialize_with_global_config
14
+ client = Zenrows::Client.new
15
+
16
+ assert_equal "test_api_key", client.config.api_key
17
+ assert_equal "superproxy.zenrows.com", client.config.host # never set in setup, so this is the default host
18
+ assert_equal 1337, client.config.port # likewise the default port
19
+ end
20
+
21
+ def test_initialize_with_overrides
22
+ client = Zenrows::Client.new(
23
+ api_key: "override_key",
24
+ host: "custom.proxy.com",
25
+ port: 8080
26
+ )
27
+
28
+ assert_equal "override_key", client.config.api_key
29
+ assert_equal "custom.proxy.com", client.config.host
30
+ assert_equal 8080, client.config.port
31
+ end
32
+
33
+ def test_initialize_raises_without_api_key
34
+ Zenrows.reset_configuration! # discards the api_key that setup configured
35
+
36
+ assert_raises Zenrows::ConfigurationError do
37
+ Zenrows::Client.new
38
+ end
39
+ end
40
+
41
+ def test_http_returns_http_client
42
+ client = Zenrows::Client.new
43
+ http = client.http(js_render: true)
44
+
45
+ assert_kind_of HTTP::Client, http
46
+ end
47
+
48
+ def test_ssl_context
49
+ client = Zenrows::Client.new
50
+ ssl = client.ssl_context
51
+
52
+ assert_kind_of OpenSSL::SSL::SSLContext, ssl
53
+ assert_equal OpenSSL::SSL::VERIFY_NONE, ssl.verify_mode # certificate verification is expected to be disabled
54
+ end
55
+
56
+ def test_proxy_config
57
+ client = Zenrows::Client.new
58
+ config = client.proxy_config(js_render: true, premium_proxy: true)
59
+
60
+ assert_equal "superproxy.zenrows.com", config[:host]
61
+ assert_equal 1337, config[:port]
62
+ assert_equal "test_api_key", config[:username] # the API key is expected as the proxy username
63
+ assert_includes config[:password], "js_render=true" # request options are expected in the password as key=value pairs
64
+ assert_includes config[:password], "premium_proxy=true"
65
+ end
66
+
67
+ def test_proxy_url
68
+ client = Zenrows::Client.new
69
+ url = client.proxy_url(js_render: true)
70
+
71
+ assert_includes url, "test_api_key"
72
+ assert_includes url, "superproxy.zenrows.com"
73
+ assert_includes url, "js_render=true"
74
+ end
75
+
76
+ def test_unsupported_backend_raises
77
+ Zenrows.configure { |c| c.backend = :unsupported } # a backend symbol the client is expected to reject
78
+
79
+ assert_raises Zenrows::ConfigurationError do
80
+ Zenrows::Client.new
81
+ end
82
+ end
83
+ end