zenrows 0.1.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.mcp.json +10 -0
- data/.tool-versions +1 -0
- data/CHANGELOG.md +45 -0
- data/CLAUDE.md +1 -1
- data/Makefile +19 -0
- data/README.md +140 -19
- data/lib/zenrows/api_client.rb +243 -0
- data/lib/zenrows/api_response.rb +185 -0
- data/lib/zenrows/backends/base.rb +31 -1
- data/lib/zenrows/backends/http_rb.rb +17 -10
- data/lib/zenrows/backends/net_http.rb +149 -0
- data/lib/zenrows/client.rb +120 -11
- data/lib/zenrows/configuration.rb +117 -0
- data/lib/zenrows/css_extractor.rb +111 -0
- data/lib/zenrows/hooks/context.rb +142 -0
- data/lib/zenrows/hooks/log_subscriber.rb +124 -0
- data/lib/zenrows/hooks.rb +213 -0
- data/lib/zenrows/instrumented_client.rb +187 -0
- data/lib/zenrows/proxy.rb +19 -0
- data/lib/zenrows/version.rb +1 -1
- data/lib/zenrows.rb +14 -2
- data/sig/manifest.yaml +5 -0
- data/sig/zenrows/api_client.rbs +18 -0
- data/sig/zenrows/api_response.rbs +28 -0
- data/sig/zenrows/backends/base.rbs +12 -0
- data/sig/zenrows/backends/http_rb.rbs +3 -0
- data/sig/zenrows/backends/net_http.rbs +28 -0
- data/sig/zenrows/backends.rbs +2 -0
- data/sig/zenrows/client.rbs +12 -0
- data/sig/zenrows/configuration.rbs +29 -0
- data/sig/zenrows/css_extractor.rbs +14 -0
- data/sig/zenrows/errors.rbs +27 -0
- data/sig/zenrows/hook_configurator.rbs +9 -0
- data/sig/zenrows/hooks/context.rbs +6 -0
- data/sig/zenrows/hooks/log_subscriber.rbs +15 -0
- data/sig/zenrows/hooks.rbs +23 -0
- data/sig/zenrows/instrumented_client.rbs +22 -0
- data/sig/zenrows/js_instructions.rbs +28 -0
- data/sig/zenrows/proxy.rbs +14 -0
- data/sig/zenrows.rbs +4 -1
- data/test/test_helper.rb +42 -0
- data/test/zenrows/api_client_test.rb +161 -0
- data/test/zenrows/api_response_test.rb +142 -0
- data/test/zenrows/client_hooks_test.rb +105 -0
- data/test/zenrows/configuration_hooks_test.rb +101 -0
- data/test/zenrows/css_extractor_test.rb +84 -0
- data/test/zenrows/hooks/context_test.rb +150 -0
- data/test/zenrows/hooks/log_subscriber_test.rb +105 -0
- data/test/zenrows/hooks_test.rb +215 -0
- data/test/zenrows/instrumented_client_test.rb +153 -0
- data/test/zenrows/js_instructions_test.rb +2 -1
- data/test/zenrows/proxy_test.rb +39 -0
- metadata +42 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e684c8840821205883d52bd782aa3d22c8c21404bc9e3a4ab068f9069389e575
|
|
4
|
+
data.tar.gz: eed9546964086c061082a55aa4fa9218c5c24dfa02b70f20297b04b55fa77891
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 31a6e6f8d95e0431cd7ce0bf545e813c64954021914eb67cf8312155b0c9d7cdea2ab1ede5378972757c1b148be0cd7be13e7ddda5ed2fbdb83cb30aad9fc18a
|
|
7
|
+
data.tar.gz: 74584d555a2915b1e0f8ac2fd0fb46ca78b21a80430c0e20f252237497a27087b4027e244c450222ae322b2f1ae2f0a02b59652daddc2069ac48783525d5cc69
|
data/.mcp.json
ADDED
data/.tool-versions
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ruby 3.4.2
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,51 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.3.0] - 2025-12-30
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- Hooks/callbacks system for request lifecycle events
|
|
15
|
+
- Five hook types: `before_request`, `after_request`, `on_response`, `on_error`, `around_request`
|
|
16
|
+
- Global and per-client hook registration
|
|
17
|
+
- `Zenrows::Hooks::LogSubscriber` built-in logging subscriber
|
|
18
|
+
- `InstrumentedClient` wrapper for HTTP client instrumentation
|
|
19
|
+
- Context object with ZenRows header parsing (request_cost, concurrency_remaining, request_id, final_url)
|
|
20
|
+
- RBS type signatures for all hooks classes
|
|
21
|
+
- Monotonic clock for accurate request duration timing
|
|
22
|
+
|
|
23
|
+
### Changed
|
|
24
|
+
|
|
25
|
+
- `after_request` hook now always runs (via `ensure`), even on errors
|
|
26
|
+
|
|
27
|
+
## [0.2.1] - 2025-12-25
|
|
28
|
+
|
|
29
|
+
### Added
|
|
30
|
+
|
|
31
|
+
- Configurable `api_endpoint` for ApiClient (global config or per-instance)
|
|
32
|
+
- `net_http` backend as fallback when http.rb unavailable
|
|
33
|
+
- GitHub Pages for YARD documentation
|
|
34
|
+
|
|
35
|
+
### Changed
|
|
36
|
+
|
|
37
|
+
- SSL context now auto-configured in proxy client (no need to pass `ssl_context:` on every request)
|
|
38
|
+
|
|
39
|
+
## [0.2.0] - 2025-12-25
|
|
40
|
+
|
|
41
|
+
### Added
|
|
42
|
+
|
|
43
|
+
- `ApiClient` for REST API mode (autoparse, css_extractor, markdown output)
|
|
44
|
+
- `CssExtractor` DSL for building extraction rules
|
|
45
|
+
- `ApiResponse` wrapper with typed accessors
|
|
46
|
+
- Proxy options: `device`, `antibot`, `session_ttl`
|
|
47
|
+
- RBS type signatures
|
|
48
|
+
- rubocop-minitest linting
|
|
49
|
+
- Dependabot configuration
|
|
50
|
+
|
|
51
|
+
### Changed
|
|
52
|
+
|
|
53
|
+
- Require Ruby >= 3.2.0
|
|
54
|
+
|
|
10
55
|
## [0.1.0] - 2025-12-25
|
|
11
56
|
|
|
12
57
|
### Added
|
data/CLAUDE.md
CHANGED
|
@@ -46,7 +46,7 @@ end
|
|
|
46
46
|
|
|
47
47
|
client = Zenrows::Client.new
|
|
48
48
|
http = client.http(js_render: true, premium_proxy: true)
|
|
49
|
-
response = http.get('https://example.com'
|
|
49
|
+
response = http.get('https://example.com') # SSL verification disabled for proxy
|
|
50
50
|
```
|
|
51
51
|
|
|
52
52
|
## Key Options
|
data/Makefile
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
.PHONY: install test lint rbs docs build
|
|
2
|
+
|
|
3
|
+
install:
|
|
4
|
+
bundle install
|
|
5
|
+
|
|
6
|
+
test:
|
|
7
|
+
bundle exec rake test
|
|
8
|
+
|
|
9
|
+
lint:
|
|
10
|
+
bundle exec rubocop
|
|
11
|
+
|
|
12
|
+
rbs:
|
|
13
|
+
rbs -I sig -r monitor -r openssl -r logger validate
|
|
14
|
+
|
|
15
|
+
docs:
|
|
16
|
+
bundle exec yard doc
|
|
17
|
+
|
|
18
|
+
build:
|
|
19
|
+
bundle exec rake build
|
data/README.md
CHANGED
|
@@ -35,12 +35,14 @@ end
|
|
|
35
35
|
```ruby
|
|
36
36
|
client = Zenrows::Client.new
|
|
37
37
|
http = client.http(js_render: true, premium_proxy: true)
|
|
38
|
-
response = http.get('https://example.com'
|
|
38
|
+
response = http.get('https://example.com')
|
|
39
39
|
|
|
40
40
|
puts response.body
|
|
41
41
|
puts response.status
|
|
42
42
|
```
|
|
43
43
|
|
|
44
|
+
> **Note:** SSL verification is disabled automatically for proxy connections (required by ZenRows).
|
|
45
|
+
|
|
44
46
|
### With Options
|
|
45
47
|
|
|
46
48
|
```ruby
|
|
@@ -70,10 +72,11 @@ instructions = Zenrows::JsInstructions.build do
|
|
|
70
72
|
end
|
|
71
73
|
|
|
72
74
|
http = client.http(js_render: true, js_instructions: instructions)
|
|
73
|
-
response = http.get(url
|
|
75
|
+
response = http.get(url)
|
|
74
76
|
```
|
|
75
77
|
|
|
76
78
|
Available instructions:
|
|
79
|
+
|
|
77
80
|
- `click(selector)` - Click element
|
|
78
81
|
- `wait(ms)` - Wait duration
|
|
79
82
|
- `wait_for(selector)` - Wait for element
|
|
@@ -108,25 +111,142 @@ http = client.http(
|
|
|
108
111
|
)
|
|
109
112
|
```
|
|
110
113
|
|
|
114
|
+
### Device & Antibot
|
|
115
|
+
|
|
116
|
+
```ruby
|
|
117
|
+
http = client.http(
|
|
118
|
+
js_render: true,
|
|
119
|
+
device: 'mobile', # mobile/desktop emulation
|
|
120
|
+
antibot: true # enhanced antibot bypass
|
|
121
|
+
)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## API Client (v0.2.0+)
|
|
125
|
+
|
|
126
|
+
For advanced extraction features, use the REST API client:
|
|
127
|
+
|
|
128
|
+
### Autoparse
|
|
129
|
+
|
|
130
|
+
Extract structured data from known sites (Amazon, etc.):
|
|
131
|
+
|
|
132
|
+
```ruby
|
|
133
|
+
api = Zenrows::ApiClient.new
|
|
134
|
+
response = api.get('https://amazon.com/dp/B01LD5GO7I', autoparse: true)
|
|
135
|
+
|
|
136
|
+
response.parsed # => { "title" => "...", "price" => "$29.99", ... }
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### CSS Extraction
|
|
140
|
+
|
|
141
|
+
Extract data using CSS selectors:
|
|
142
|
+
|
|
143
|
+
```ruby
|
|
144
|
+
# Hash syntax
|
|
145
|
+
response = api.get(url, css_extractor: {
|
|
146
|
+
title: 'h1',
|
|
147
|
+
links: 'a @href',
|
|
148
|
+
prices: '.price'
|
|
149
|
+
})
|
|
150
|
+
response.extracted # => { "title" => "...", "links" => [...], "prices" => [...] }
|
|
151
|
+
|
|
152
|
+
# DSL syntax
|
|
153
|
+
extractor = Zenrows::CssExtractor.build do
|
|
154
|
+
extract :title, 'h1'
|
|
155
|
+
links :urls, 'a.product'
|
|
156
|
+
images :photos, 'img.gallery'
|
|
157
|
+
end
|
|
158
|
+
response = api.get(url, css_extractor: extractor)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### Markdown Output
|
|
162
|
+
|
|
163
|
+
```ruby
|
|
164
|
+
response = api.get(url, response_type: 'markdown')
|
|
165
|
+
response.markdown # => "# Page Title\n\nContent..."
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### Response Metadata
|
|
169
|
+
|
|
170
|
+
```ruby
|
|
171
|
+
response = api.get(url)
|
|
172
|
+
response.status # => 200
|
|
173
|
+
response.success? # => true
|
|
174
|
+
response.final_url # => "https://example.com/redirected"
|
|
175
|
+
response.request_cost # => 0.001
|
|
176
|
+
response.concurrency_remaining # => 199
|
|
177
|
+
```
|
|
178
|
+
|
|
111
179
|
## Options Reference
|
|
112
180
|
|
|
113
|
-
| Option
|
|
114
|
-
|
|
115
|
-
| `js_render`
|
|
116
|
-
| `premium_proxy`
|
|
117
|
-
| `proxy_country`
|
|
118
|
-
| `
|
|
119
|
-
| `
|
|
120
|
-
| `
|
|
121
|
-
| `
|
|
122
|
-
| `
|
|
123
|
-
| `
|
|
124
|
-
| `
|
|
125
|
-
| `
|
|
126
|
-
| `
|
|
127
|
-
| `
|
|
128
|
-
| `
|
|
129
|
-
| `
|
|
181
|
+
| Option | Type | Description |
|
|
182
|
+
| --------------------- | --------------- | ---------------------------------- |
|
|
183
|
+
| `js_render` | Boolean | Enable JavaScript rendering |
|
|
184
|
+
| `premium_proxy` | Boolean | Use residential proxies |
|
|
185
|
+
| `proxy_country` | String | Country code (us, gb, de, etc.) |
|
|
186
|
+
| `device` | String | Device emulation (mobile/desktop) |
|
|
187
|
+
| `antibot` | Boolean | Enhanced antibot bypass |
|
|
188
|
+
| `wait` | Integer/Boolean | Wait time in ms (true = 15000) |
|
|
189
|
+
| `wait_for` | String | CSS selector to wait for |
|
|
190
|
+
| `session_id` | Boolean/String | Session persistence |
|
|
191
|
+
| `session_ttl` | String | Session duration (1m, 10m, 30m) |
|
|
192
|
+
| `window_height` | Integer | Browser window height |
|
|
193
|
+
| `window_width` | Integer | Browser window width |
|
|
194
|
+
| `js_instructions` | Array/String | Browser automation |
|
|
195
|
+
| `json_response` | Boolean | Return JSON instead of HTML |
|
|
196
|
+
| `screenshot` | Boolean | Take screenshot |
|
|
197
|
+
| `screenshot_fullpage` | Boolean | Full page screenshot |
|
|
198
|
+
| `screenshot_selector` | String | Screenshot specific element |
|
|
199
|
+
| `block_resources` | String | Block resources (image,media,font) |
|
|
200
|
+
| `headers` | Hash | Custom HTTP headers |
|
|
201
|
+
|
|
202
|
+
### API Client Options
|
|
203
|
+
|
|
204
|
+
| Option | Type | Description |
|
|
205
|
+
| ---------------- | ------------ | ------------------------------------ |
|
|
206
|
+
| `autoparse` | Boolean | Auto-extract structured data |
|
|
207
|
+
| `css_extractor` | Hash/Object | CSS selectors for extraction |
|
|
208
|
+
| `response_type` | String | Output format ('markdown') |
|
|
209
|
+
| `outputs` | String | Extract specific data (headings,links) |
|
|
210
|
+
|
|
211
|
+
## Hooks
|
|
212
|
+
|
|
213
|
+
Register callbacks for request lifecycle events:
|
|
214
|
+
|
|
215
|
+
```ruby
|
|
216
|
+
Zenrows.configure do |c|
|
|
217
|
+
c.api_key = 'YOUR_KEY'
|
|
218
|
+
|
|
219
|
+
# Log responses
|
|
220
|
+
c.on_response { |resp, ctx| puts "#{ctx[:host]} -> #{resp.status}" }
|
|
221
|
+
|
|
222
|
+
# Track errors
|
|
223
|
+
c.on_error { |err, ctx| Sentry.capture_exception(err) }
|
|
224
|
+
|
|
225
|
+
# Monitor costs
|
|
226
|
+
c.on_response do |resp, ctx|
|
|
227
|
+
cost = ctx[:zenrows_headers][:request_cost]
|
|
228
|
+
StatsD.increment('zenrows.cost', cost) if cost
|
|
229
|
+
end
|
|
230
|
+
end
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
Per-client hooks:
|
|
234
|
+
|
|
235
|
+
```ruby
|
|
236
|
+
client = Zenrows::Client.new do |c|
|
|
237
|
+
c.on_response { |resp, ctx| log_specific(resp) }
|
|
238
|
+
end
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
Built-in logger:
|
|
242
|
+
|
|
243
|
+
```ruby
|
|
244
|
+
c.add_subscriber(Zenrows::Hooks::LogSubscriber.new)
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
Available hooks: `before_request`, `after_request`, `on_response`, `on_error`, `around_request`
|
|
248
|
+
|
|
249
|
+
Context includes: `method`, `url`, `host`, `duration`, `zenrows_headers` (request_cost, concurrency_remaining, request_id, final_url)
|
|
130
250
|
|
|
131
251
|
## Error Handling
|
|
132
252
|
|
|
@@ -152,6 +272,7 @@ end
|
|
|
152
272
|
## Rails Integration
|
|
153
273
|
|
|
154
274
|
The gem automatically integrates with Rails when detected:
|
|
275
|
+
|
|
155
276
|
- Uses Rails.logger by default
|
|
156
277
|
- Supports ActiveSupport::Duration for wait times
|
|
157
278
|
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "http"
|
|
4
|
+
require "json"
|
|
5
|
+
require "cgi"
|
|
6
|
+
|
|
7
|
+
module Zenrows
|
|
8
|
+
# REST API client for ZenRows Universal Scraper API
|
|
9
|
+
#
|
|
10
|
+
# Unlike the proxy-based Client, ApiClient calls the ZenRows API directly.
|
|
11
|
+
# This enables features not available in proxy mode: autoparse, css_extractor,
|
|
12
|
+
# response_type (markdown), and outputs.
|
|
13
|
+
#
|
|
14
|
+
# @example Basic usage
|
|
15
|
+
# api = Zenrows::ApiClient.new
|
|
16
|
+
# response = api.get('https://example.com')
|
|
17
|
+
#
|
|
18
|
+
# @example With autoparse
|
|
19
|
+
# response = api.get('https://amazon.com/dp/B01LD5GO7I', autoparse: true)
|
|
20
|
+
# puts response.data # Structured product data
|
|
21
|
+
#
|
|
22
|
+
# @example With CSS extraction
|
|
23
|
+
# response = api.get(url, css_extractor: { title: 'h1', links: 'a @href' })
|
|
24
|
+
#
|
|
25
|
+
# @example With markdown output
|
|
26
|
+
# response = api.get(url, response_type: 'markdown')
|
|
27
|
+
#
|
|
28
|
+
# @example With per-client hooks
|
|
29
|
+
# api = Zenrows::ApiClient.new do |c|
|
|
30
|
+
# c.on_response { |resp, ctx| puts "#{ctx[:host]} -> #{resp.status}" }
|
|
31
|
+
# end
|
|
32
|
+
#
|
|
33
|
+
# @author Ernest Bursa
|
|
34
|
+
# @since 0.2.0
|
|
35
|
+
# @api public
|
|
36
|
+
class ApiClient
  # Wait time (ms) sent when +wait: true+ is given without an explicit duration.
  DEFAULT_WAIT_MS = 15_000

  # Range a numeric session id is drawn from when +session_id: true+ is given.
  AUTO_SESSION_IDS = (1..99_999)

  # Options sent as the literal string "true" whenever they are truthy.
  BOOLEAN_PARAMS = %i[
    autoparse js_render premium_proxy json_response original_status
    screenshot screenshot_fullpage antibot
  ].freeze

  # Options whose value is forwarded unchanged whenever it is truthy.
  VALUE_PARAMS = %i[
    response_type outputs proxy_country wait_for window_height window_width
    screenshot_selector block_resources device
  ].freeze

  private_constant :DEFAULT_WAIT_MS, :AUTO_SESSION_IDS, :BOOLEAN_PARAMS, :VALUE_PARAMS

  # @return [String] ZenRows API key
  attr_reader :api_key

  # @return [String] API endpoint URL
  attr_reader :api_endpoint

  # @return [Configuration] Configuration instance
  attr_reader :config

  # @return [Hooks] Hook registry for this client
  attr_reader :hooks

  # Initialize API client
  #
  # @param api_key [String, nil] Override API key (uses global config if nil)
  # @param api_endpoint [String, nil] Override API endpoint (uses global config if nil)
  # @yield [config] Optional block for per-client configuration (hooks)
  # @yieldparam config [HookConfigurator] Hook configuration DSL
  def initialize(api_key: nil, api_endpoint: nil, &block)
    @config = Zenrows.configuration
    @api_key = api_key || @config.api_key
    @api_endpoint = api_endpoint || @config.api_endpoint
    # The global configuration is only validated when no explicit key was passed.
    @config.validate! unless api_key

    # Build hooks: start with a copy of the global registry, allow per-client additions
    @hooks = block ? build_hooks(&block) : Zenrows.configuration.hooks.dup
  end

  # Make GET request through ZenRows API
  #
  # @param url [String] Target URL to scrape
  # @param options [Hash] Request options
  # @option options [Boolean] :autoparse Auto-extract structured data
  # @option options [Hash, CssExtractor, String] :css_extractor CSS selectors for
  #   extraction; a String is sent as-is, anything else is serialized with +to_json+
  # @option options [String] :response_type Response format ('markdown')
  # @option options [String] :outputs Extract specific data ('headings,links,menus')
  # @option options [Boolean] :js_render Enable JavaScript rendering
  # @option options [Boolean] :premium_proxy Use residential proxies
  # @option options [String] :proxy_country Country code
  # @option options [Integer, Boolean] :wait Wait time in ms (true = 15000)
  # @option options [String] :wait_for CSS selector to wait for
  # @option options [Boolean, String] :session_id Session persistence
  #   (true picks a random numeric id)
  # @option options [Integer] :window_height Browser window height
  # @option options [Integer] :window_width Browser window width
  # @option options [Array, String] :js_instructions Browser automation
  # @option options [Boolean] :json_response Return JSON with XHR data
  # @option options [Boolean] :original_status Sent as +original_status=true+
  # @option options [Boolean] :screenshot Take screenshot
  # @option options [Boolean] :screenshot_fullpage Full page screenshot
  # @option options [String] :screenshot_selector Screenshot element
  # @option options [String] :block_resources Block resources
  # @option options [String] :device Device emulation
  # @option options [Boolean] :antibot Enhanced antibot
  # @option options [Hash] :headers Custom headers, sent as +custom_headers[name]+ params
  # @return [ApiResponse] Response wrapper
  # @raise [ConfigurationError] if API key not configured
  # @raise [AuthenticationError] if API key invalid
  # @raise [RateLimitError] if rate limited
  # @raise [BotDetectedError] if the API answers 403
  # @raise [Error] for any other non-2xx status
  def get(url, **options)
    instrument(:get, url, options) do
      params = build_params(url, options)
      http_response = build_http_client.get(api_endpoint, params: params)
      handle_response(http_response, options)
    end
  end

  # Make POST request through ZenRows API
  #
  # @param url [String] Target URL
  # @param body [String, Hash, nil] Request body; a Hash is sent form-encoded,
  #   anything else is passed through as the raw body
  # @param options [Hash] Request options (same as #get)
  # @return [ApiResponse] Response wrapper
  # @raise (see #get)
  def post(url, body: nil, **options)
    instrument(:post, url, options) do
      params = build_params(url, options)
      http_response = build_http_client.post(api_endpoint, params: params, **request_body_options(body))
      handle_response(http_response, options)
    end
  end

  private

  # Build hooks registry for this client
  #
  # @yield [config] Block for registering per-client hooks
  # @return [Hooks] Combined hooks registry
  def build_hooks
    # Work on a copy so per-client hooks are not added to the global registry object.
    client_hooks = Zenrows.configuration.hooks.dup
    hook_config = HookConfigurator.new(client_hooks)
    yield(hook_config)
    client_hooks
  end

  # Instrument a request with hooks
  #
  # Runs +before_request+, wraps the request in the around hooks, runs
  # +on_response+ with the result, +on_error+ if anything raises (the error is
  # re-raised), and always +after_request+.
  #
  # @param method [Symbol] HTTP method
  # @param url [String] Target URL
  # @param options [Hash] Request options
  # @yield Block that executes the actual request
  # @return [Object] Response from block
  def instrument(method, url, options)
    # Fast path: no hooks registered, so no context is built.
    return yield if hooks.empty?

    context = Hooks::Context.for_request(
      method: method,
      url: url,
      options: options,
      backend: :api
    )

    hooks.run(:before_request, context)

    response = hooks.run_around(context) do
      result = yield
      Hooks::Context.enrich_with_response(context, result)
      hooks.run(:on_response, result, context)
      result
    end

    response
  rescue => e
    # `context` is nil when the fast path above was taken (or building it failed),
    # in which case there is nothing to report to.
    context[:error] = e if context
    hooks.run(:on_error, e, context) if context
    raise
  ensure
    hooks.run(:after_request, context) if context
  end

  # Build the http.rb client with the configured timeouts
  #
  # @return [HTTP::Client] Client with connect/read timeouts applied
  def build_http_client
    HTTP
      .timeout(connect: config.connect_timeout, read: config.read_timeout)
  end

  # Choose the http.rb request option used to send a POST body
  #
  # http.rb's +body:+ option expects raw content and does not serialize a Hash,
  # so hashes are sent with +form:+ (URL-encoded) instead.
  #
  # @param body [String, Hash, nil] Body given to #post
  # @return [Hash] Either +{form: body}+ or +{body: body}+
  def request_body_options(body)
    body.is_a?(Hash) ? {form: body} : {body: body}
  end

  # Translate request options into ZenRows API query parameters
  #
  # @param url [String] Target URL
  # @param options [Hash] Request options (see #get)
  # @return [Hash] Query parameters, always including +:apikey+ and +:url+
  def build_params(url, options)
    params = {apikey: api_key, url: url}

    BOOLEAN_PARAMS.each { |key| params[key] = "true" if options[key] }
    VALUE_PARAMS.each { |key| params[key] = options[key] if options[key] }

    # A String is assumed to be serialized rules already; encoding it again
    # would wrap it in quotes and send a JSON string instead of a JSON object.
    if (extractor = options[:css_extractor])
      params[:css_extractor] = extractor.is_a?(String) ? extractor : extractor.to_json
    end

    if (wait = options[:wait])
      params[:wait] = (wait == true) ? DEFAULT_WAIT_MS : wait
    end

    if (session_id = options[:session_id])
      params[:session_id] = (session_id == true) ? rand(AUTO_SESSION_IDS) : session_id
    end

    # Array-like instructions are serialized; a String is passed through untouched.
    if (instructions = options[:js_instructions])
      instructions = instructions.to_json if instructions.respond_to?(:to_a)
      params[:js_instructions] = instructions
    end

    # Custom headers
    options[:headers]&.each do |key, value|
      params["custom_headers[#{key}]"] = value
    end

    params
  end

  # Wrap a successful response or raise the matching error
  #
  # @param http_response [HTTP::Response] Raw response from http.rb
  # @param options [Hash] Request options, handed to ApiResponse
  # @return [ApiResponse] Wrapper for any 2xx response
  # @raise [AuthenticationError] on 401
  # @raise [BotDetectedError] on 403
  # @raise [RateLimitError] on 429, carrying the Retry-After header as an Integer when present
  # @raise [Error] on any other status, with the first 200 characters of the body
  def handle_response(http_response, options)
    case http_response.status.code
    when 200..299
      ApiResponse.new(http_response, options)
    when 401
      raise AuthenticationError, "Invalid API key"
    when 429
      retry_after = http_response.headers["Retry-After"]&.to_i
      raise RateLimitError.new("Rate limited", retry_after: retry_after)
    when 403
      raise BotDetectedError.new("Bot detected", suggestion: "Try premium_proxy or antibot options")
    else
      body = http_response.body.to_s
      raise Error, "API error (#{http_response.status}): #{body[0, 200]}"
    end
  end
end
|
|
243
|
+
end
|