zenrows 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.mcp.json +10 -0
- data/.tool-versions +1 -0
- data/CHANGELOG.md +28 -0
- data/CLAUDE.md +1 -1
- data/Makefile +19 -0
- data/README.md +100 -19
- data/lib/zenrows/api_client.rb +180 -0
- data/lib/zenrows/api_response.rb +185 -0
- data/lib/zenrows/backends/http_rb.rb +10 -11
- data/lib/zenrows/backends/net_http.rb +141 -0
- data/lib/zenrows/client.rb +35 -9
- data/lib/zenrows/configuration.rb +6 -0
- data/lib/zenrows/css_extractor.rb +111 -0
- data/lib/zenrows/proxy.rb +19 -0
- data/lib/zenrows/version.rb +1 -1
- data/lib/zenrows.rb +10 -2
- data/sig/manifest.yaml +5 -0
- data/sig/zenrows/api_client.rbs +15 -0
- data/sig/zenrows/api_response.rbs +28 -0
- data/sig/zenrows/backends/base.rbs +9 -0
- data/sig/zenrows/backends/http_rb.rbs +3 -0
- data/sig/zenrows/backends/net_http.rbs +28 -0
- data/sig/zenrows/backends.rbs +2 -0
- data/sig/zenrows/client.rbs +11 -0
- data/sig/zenrows/configuration.rbs +20 -0
- data/sig/zenrows/css_extractor.rbs +14 -0
- data/sig/zenrows/errors.rbs +27 -0
- data/sig/zenrows/js_instructions.rbs +28 -0
- data/sig/zenrows/proxy.rbs +14 -0
- data/sig/zenrows.rbs +4 -1
- data/test/zenrows/api_client_test.rb +161 -0
- data/test/zenrows/api_response_test.rb +142 -0
- data/test/zenrows/css_extractor_test.rb +84 -0
- data/test/zenrows/js_instructions_test.rb +2 -1
- data/test/zenrows/proxy_test.rb +39 -0
- metadata +25 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 5c04285fb07c401d81848da8a4ba763b3e0cd0a550c5e34fc1716101f229fbb7
|
|
4
|
+
data.tar.gz: 04d35bac2bafd413d8969d5d61b54f414c54390db0deb014dd721e2874944cc2
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 12becd343181666b4dc8b8edccc5fffcb410ecaf7ba74b9b7ed8a6859693d20f7b466e27f5e8a245b9841c56d7f2c4b1d2b3fa7a84b241b3725140acc546ad2c
|
|
7
|
+
data.tar.gz: '0368ed029138f05de5e2041fb4136caacc04d3ae9b172e7b5983f1204eefa0d12afc2ab70724f124922358d201195aa070e60db2248cee7b9031bb9d64e0c6d2'
|
data/.mcp.json
ADDED
data/.tool-versions
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
ruby 3.4.2
|
data/CHANGELOG.md
CHANGED
|
@@ -7,6 +7,34 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
|
|
|
7
7
|
|
|
8
8
|
## [Unreleased]
|
|
9
9
|
|
|
10
|
+
## [0.2.1] - 2025-12-25
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- Configurable `api_endpoint` for ApiClient (global config or per-instance)
|
|
15
|
+
- `net_http` backend as a fallback when http.rb is unavailable
|
|
16
|
+
- GitHub Pages for YARD documentation
|
|
17
|
+
|
|
18
|
+
### Changed
|
|
19
|
+
|
|
20
|
+
- SSL context now auto-configured in proxy client (no need to pass `ssl_context:` on every request)
|
|
21
|
+
|
|
22
|
+
## [0.2.0] - 2025-12-25
|
|
23
|
+
|
|
24
|
+
### Added
|
|
25
|
+
|
|
26
|
+
- `ApiClient` for REST API mode (autoparse, css_extractor, markdown output)
|
|
27
|
+
- `CssExtractor` DSL for building extraction rules
|
|
28
|
+
- `ApiResponse` wrapper with typed accessors
|
|
29
|
+
- Proxy options: `device`, `antibot`, `session_ttl`
|
|
30
|
+
- RBS type signatures
|
|
31
|
+
- rubocop-minitest linting
|
|
32
|
+
- Dependabot configuration
|
|
33
|
+
|
|
34
|
+
### Changed
|
|
35
|
+
|
|
36
|
+
- Require Ruby >= 3.2.0
|
|
37
|
+
|
|
10
38
|
## [0.1.0] - 2025-12-25
|
|
11
39
|
|
|
12
40
|
### Added
|
data/CLAUDE.md
CHANGED
|
@@ -46,7 +46,7 @@ end
|
|
|
46
46
|
|
|
47
47
|
client = Zenrows::Client.new
|
|
48
48
|
http = client.http(js_render: true, premium_proxy: true)
|
|
49
|
-
response = http.get('https://example.com'
|
|
49
|
+
response = http.get('https://example.com') # SSL verification disabled for proxy
|
|
50
50
|
```
|
|
51
51
|
|
|
52
52
|
## Key Options
|
data/Makefile
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
.PHONY: install test lint rbs docs build
|
|
2
|
+
|
|
3
|
+
install:
|
|
4
|
+
bundle install
|
|
5
|
+
|
|
6
|
+
test:
|
|
7
|
+
bundle exec rake test
|
|
8
|
+
|
|
9
|
+
lint:
|
|
10
|
+
bundle exec rubocop
|
|
11
|
+
|
|
12
|
+
rbs:
|
|
13
|
+
rbs -I sig -r monitor -r openssl -r logger validate
|
|
14
|
+
|
|
15
|
+
docs:
|
|
16
|
+
bundle exec yard doc
|
|
17
|
+
|
|
18
|
+
build:
|
|
19
|
+
bundle exec rake build
|
data/README.md
CHANGED
|
@@ -35,12 +35,14 @@ end
|
|
|
35
35
|
```ruby
|
|
36
36
|
client = Zenrows::Client.new
|
|
37
37
|
http = client.http(js_render: true, premium_proxy: true)
|
|
38
|
-
response = http.get('https://example.com'
|
|
38
|
+
response = http.get('https://example.com')
|
|
39
39
|
|
|
40
40
|
puts response.body
|
|
41
41
|
puts response.status
|
|
42
42
|
```
|
|
43
43
|
|
|
44
|
+
> **Note:** SSL verification is disabled automatically for proxy connections (required by ZenRows).
|
|
45
|
+
|
|
44
46
|
### With Options
|
|
45
47
|
|
|
46
48
|
```ruby
|
|
@@ -70,10 +72,11 @@ instructions = Zenrows::JsInstructions.build do
|
|
|
70
72
|
end
|
|
71
73
|
|
|
72
74
|
http = client.http(js_render: true, js_instructions: instructions)
|
|
73
|
-
response = http.get(url
|
|
75
|
+
response = http.get(url)
|
|
74
76
|
```
|
|
75
77
|
|
|
76
78
|
Available instructions:
|
|
79
|
+
|
|
77
80
|
- `click(selector)` - Click element
|
|
78
81
|
- `wait(ms)` - Wait duration
|
|
79
82
|
- `wait_for(selector)` - Wait for element
|
|
@@ -108,25 +111,102 @@ http = client.http(
|
|
|
108
111
|
)
|
|
109
112
|
```
|
|
110
113
|
|
|
114
|
+
### Device & Antibot
|
|
115
|
+
|
|
116
|
+
```ruby
|
|
117
|
+
http = client.http(
|
|
118
|
+
js_render: true,
|
|
119
|
+
device: 'mobile', # mobile/desktop emulation
|
|
120
|
+
antibot: true # enhanced antibot bypass
|
|
121
|
+
)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## API Client (v0.2.0+)
|
|
125
|
+
|
|
126
|
+
For advanced extraction features, use the REST API client:
|
|
127
|
+
|
|
128
|
+
### Autoparse
|
|
129
|
+
|
|
130
|
+
Extract structured data from known sites (Amazon, etc.):
|
|
131
|
+
|
|
132
|
+
```ruby
|
|
133
|
+
api = Zenrows::ApiClient.new
|
|
134
|
+
response = api.get('https://amazon.com/dp/B01LD5GO7I', autoparse: true)
|
|
135
|
+
|
|
136
|
+
response.parsed # => { "title" => "...", "price" => "$29.99", ... }
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### CSS Extraction
|
|
140
|
+
|
|
141
|
+
Extract data using CSS selectors:
|
|
142
|
+
|
|
143
|
+
```ruby
|
|
144
|
+
# Hash syntax
|
|
145
|
+
response = api.get(url, css_extractor: {
|
|
146
|
+
title: 'h1',
|
|
147
|
+
links: 'a @href',
|
|
148
|
+
prices: '.price'
|
|
149
|
+
})
|
|
150
|
+
response.extracted # => { "title" => "...", "links" => [...], "prices" => [...] }
|
|
151
|
+
|
|
152
|
+
# DSL syntax
|
|
153
|
+
extractor = Zenrows::CssExtractor.build do
|
|
154
|
+
extract :title, 'h1'
|
|
155
|
+
links :urls, 'a.product'
|
|
156
|
+
images :photos, 'img.gallery'
|
|
157
|
+
end
|
|
158
|
+
response = api.get(url, css_extractor: extractor)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### Markdown Output
|
|
162
|
+
|
|
163
|
+
```ruby
|
|
164
|
+
response = api.get(url, response_type: 'markdown')
|
|
165
|
+
response.markdown # => "# Page Title\n\nContent..."
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### Response Metadata
|
|
169
|
+
|
|
170
|
+
```ruby
|
|
171
|
+
response = api.get(url)
|
|
172
|
+
response.status # => 200
|
|
173
|
+
response.success? # => true
|
|
174
|
+
response.final_url # => "https://example.com/redirected"
|
|
175
|
+
response.request_cost # => 0.001
|
|
176
|
+
response.concurrency_remaining # => 199
|
|
177
|
+
```
|
|
178
|
+
|
|
111
179
|
## Options Reference
|
|
112
180
|
|
|
113
|
-
| Option
|
|
114
|
-
|
|
115
|
-
| `js_render`
|
|
116
|
-
| `premium_proxy`
|
|
117
|
-
| `proxy_country`
|
|
118
|
-
| `
|
|
119
|
-
| `
|
|
120
|
-
| `
|
|
121
|
-
| `
|
|
122
|
-
| `
|
|
123
|
-
| `
|
|
124
|
-
| `
|
|
125
|
-
| `
|
|
126
|
-
| `
|
|
127
|
-
| `
|
|
128
|
-
| `
|
|
129
|
-
| `
|
|
181
|
+
| Option | Type | Description |
|
|
182
|
+
| --------------------- | --------------- | ---------------------------------- |
|
|
183
|
+
| `js_render` | Boolean | Enable JavaScript rendering |
|
|
184
|
+
| `premium_proxy` | Boolean | Use residential proxies |
|
|
185
|
+
| `proxy_country` | String | Country code (us, gb, de, etc.) |
|
|
186
|
+
| `device` | String | Device emulation (mobile/desktop) |
|
|
187
|
+
| `antibot` | Boolean | Enhanced antibot bypass |
|
|
188
|
+
| `wait` | Integer/Boolean | Wait time in ms (true = 15000) |
|
|
189
|
+
| `wait_for` | String | CSS selector to wait for |
|
|
190
|
+
| `session_id` | Boolean/String | Session persistence |
|
|
191
|
+
| `session_ttl` | String | Session duration (1m, 10m, 30m) |
|
|
192
|
+
| `window_height` | Integer | Browser window height |
|
|
193
|
+
| `window_width` | Integer | Browser window width |
|
|
194
|
+
| `js_instructions` | Array/String | Browser automation |
|
|
195
|
+
| `json_response` | Boolean | Return JSON instead of HTML |
|
|
196
|
+
| `screenshot` | Boolean | Take screenshot |
|
|
197
|
+
| `screenshot_fullpage` | Boolean | Full page screenshot |
|
|
198
|
+
| `screenshot_selector` | String | Screenshot specific element |
|
|
199
|
+
| `block_resources` | String | Block resources (image,media,font) |
|
|
200
|
+
| `headers` | Hash | Custom HTTP headers |
|
|
201
|
+
|
|
202
|
+
### API Client Options
|
|
203
|
+
|
|
204
|
+
| Option | Type | Description |
|
|
205
|
+
| ---------------- | ------------ | ------------------------------------ |
|
|
206
|
+
| `autoparse` | Boolean | Auto-extract structured data |
|
|
207
|
+
| `css_extractor` | Hash/Object | CSS selectors for extraction |
|
|
208
|
+
| `response_type` | String | Output format ('markdown') |
|
|
209
|
+
| `outputs` | String | Extract specific data (headings,links) |
|
|
130
210
|
|
|
131
211
|
## Error Handling
|
|
132
212
|
|
|
@@ -152,6 +232,7 @@ end
|
|
|
152
232
|
## Rails Integration
|
|
153
233
|
|
|
154
234
|
The gem automatically integrates with Rails when detected:
|
|
235
|
+
|
|
155
236
|
- Uses Rails.logger by default
|
|
156
237
|
- Supports ActiveSupport::Duration for wait times
|
|
157
238
|
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "http"
|
|
4
|
+
require "json"
|
|
5
|
+
require "cgi"
|
|
6
|
+
|
|
7
|
+
module Zenrows
  # REST API client for the ZenRows Universal Scraper API.
  #
  # Unlike the proxy-based Client, ApiClient calls the ZenRows API endpoint
  # directly. This enables features not available in proxy mode: autoparse,
  # css_extractor, response_type (markdown), and outputs.
  #
  # @example Basic usage
  #   api = Zenrows::ApiClient.new
  #   response = api.get('https://example.com')
  #
  # @example With autoparse
  #   response = api.get('https://amazon.com/dp/B01LD5GO7I', autoparse: true)
  #   puts response.data  # Structured product data
  #
  # @example With CSS extraction
  #   response = api.get(url, css_extractor: { title: 'h1', links: 'a @href' })
  #
  # @example With markdown output
  #   response = api.get(url, response_type: 'markdown')
  #
  # @author Ernest Bursa
  # @since 0.2.0
  # @api public
  class ApiClient
    # Value sent for the +wait+ parameter when the caller passes +wait: true+.
    DEFAULT_WAIT_MS = 15_000

    # Range a random session id is drawn from when the caller passes
    # +session_id: true+.
    RANDOM_SESSION_IDS = (1..99_999)

    # @return [String] ZenRows API key
    attr_reader :api_key

    # @return [String] API endpoint URL
    attr_reader :api_endpoint

    # @return [Configuration] Configuration instance
    attr_reader :config

    # Initialize API client
    #
    # The global configuration is only validated when no explicit +api_key+
    # is given, so a per-instance key works without global setup.
    #
    # @param api_key [String, nil] Override API key (uses global config if nil)
    # @param api_endpoint [String, nil] Override API endpoint (uses global config if nil)
    def initialize(api_key: nil, api_endpoint: nil)
      @config = Zenrows.configuration
      @api_key = api_key || @config.api_key
      @api_endpoint = api_endpoint || @config.api_endpoint
      @config.validate! unless api_key
    end

    # Make GET request through ZenRows API
    #
    # @param url [String] Target URL to scrape
    # @param options [Hash] Request options
    # @option options [Boolean] :autoparse Auto-extract structured data
    # @option options [Hash, CssExtractor, String] :css_extractor CSS selectors for
    #   extraction (a String is sent as-is and must already be JSON)
    # @option options [String] :response_type Response format ('markdown')
    # @option options [String] :outputs Extract specific data ('headings,links,menus')
    # @option options [Boolean] :js_render Enable JavaScript rendering
    # @option options [Boolean] :premium_proxy Use residential proxies
    # @option options [String] :proxy_country Country code
    # @option options [Boolean] :original_status Sent as original_status=true when truthy
    # @option options [Integer, Boolean] :wait Wait time in ms (true = 15000)
    # @option options [String] :wait_for CSS selector to wait for
    # @option options [Boolean, String] :session_id Session persistence
    #   (true = random id between 1 and 99999)
    # @option options [Integer] :window_height Browser window height
    # @option options [Integer] :window_width Browser window width
    # @option options [Array, String] :js_instructions Browser automation
    # @option options [Boolean] :json_response Return JSON with XHR data
    # @option options [Boolean] :screenshot Take screenshot
    # @option options [Boolean] :screenshot_fullpage Full page screenshot
    # @option options [String] :screenshot_selector Screenshot element
    # @option options [String] :block_resources Block resources
    # @option options [String] :device Device emulation
    # @option options [Boolean] :antibot Enhanced antibot
    # @option options [Hash] :headers Custom headers, sent as custom_headers[NAME] params
    # @return [ApiResponse] Response wrapper
    # @raise [AuthenticationError] on HTTP 401
    # @raise [BotDetectedError] on HTTP 403
    # @raise [RateLimitError] on HTTP 429
    # @raise [Error] on any other non-2xx status
    def get(url, **options)
      params = build_params(url, options)
      http_response = build_http_client.get(api_endpoint, params: params)
      handle_response(http_response, options)
    end

    # Make POST request through ZenRows API
    #
    # @param url [String] Target URL
    # @param body [String, Hash, nil] Request body. A Hash is sent form-encoded;
    #   anything else is handed to the HTTP client unchanged.
    # @param options [Hash] Request options (same as #get)
    # @return [ApiResponse] Response wrapper
    # @raise (see #get)
    def post(url, body: nil, **options)
      params = build_params(url, options)
      http_response = build_http_client.post(api_endpoint, params: params, **post_payload(body))
      handle_response(http_response, options)
    end

    private

    # Builds an http.rb client carrying the configured connect/read timeouts.
    def build_http_client
      HTTP
        .timeout(connect: config.connect_timeout, read: config.read_timeout)
    end

    # Chooses the http.rb request option for a POST body.
    #
    # http.rb's +body:+ option cannot serialize a Hash, so a Hash is routed
    # through +form:+ instead.
    #
    # @param body [String, Hash, nil]
    # @return [Hash] Either +{form: body}+ or +{body: body}+
    def post_payload(body)
      body.is_a?(Hash) ? {form: body} : {body: body}
    end

    # Translates request options into ZenRows query parameters.
    #
    # Only options with a truthy value are emitted. Boolean switches are sent
    # as the string "true".
    #
    # @param url [String] Target URL
    # @param options [Hash] Request options (see #get)
    # @return [Hash] Query parameters including +apikey+ and +url+
    def build_params(url, options)
      params = {apikey: api_key, url: url}

      # API-mode only features
      set_flags(params, options, :autoparse)
      set_values(params, options, :response_type, :outputs)
      set_css_extractor(params, options[:css_extractor])

      # Common options (also available in proxy mode)
      set_flags(params, options, :js_render, :premium_proxy)
      set_values(params, options, :proxy_country)
      set_flags(params, options, :json_response, :original_status)

      set_wait(params, options)
      set_session(params, options[:session_id])
      set_values(params, options, :window_height, :window_width)

      # Screenshots
      set_flags(params, options, :screenshot, :screenshot_fullpage)
      set_values(params, options, :screenshot_selector)

      set_js_instructions(params, options[:js_instructions])

      # Other options
      set_values(params, options, :block_resources, :device)
      set_flags(params, options, :antibot)

      set_custom_headers(params, options[:headers])
      params
    end

    # Sets each truthy option in +keys+ to the string "true".
    def set_flags(params, options, *keys)
      keys.each { |key| params[key] = "true" if options[key] }
    end

    # Copies each truthy option in +keys+ into +params+ unchanged.
    def set_values(params, options, *keys)
      keys.each { |key| params[key] = options[key] if options[key] }
    end

    # Serializes the extractor to JSON. A String is assumed to be JSON
    # already and is passed through, so it is not encoded twice.
    def set_css_extractor(params, extractor)
      return unless extractor

      params[:css_extractor] = extractor.is_a?(String) ? extractor : extractor.to_json
    end

    # +wait: true+ means the default wait time; any other truthy value is
    # forwarded as given. +wait_for+ is copied unchanged.
    def set_wait(params, options)
      wait = options[:wait]
      params[:wait] = (wait == true) ? DEFAULT_WAIT_MS : wait if wait
      set_values(params, options, :wait_for)
    end

    # +session_id: true+ picks a random id; any other truthy value is used
    # as the session id itself.
    def set_session(params, session_id)
      return unless session_id

      params[:session_id] = (session_id == true) ? rand(RANDOM_SESSION_IDS) : session_id
    end

    # Array-like instructions (anything responding to +to_a+) are serialized
    # to JSON; other values, such as a String, are sent as given.
    def set_js_instructions(params, instructions)
      return unless instructions

      instructions = instructions.to_json if instructions.respond_to?(:to_a)
      params[:js_instructions] = instructions
    end

    # Adds one +custom_headers[NAME]+ query parameter per header.
    #
    # NOTE(review): confirm against the ZenRows API reference that this
    # bracketed form is accepted by the endpoint.
    def set_custom_headers(params, headers)
      headers&.each do |key, value|
        params["custom_headers[#{key}]"] = value
      end
    end

    # Wraps a 2xx response in ApiResponse and maps error statuses to the
    # gem's exception classes.
    #
    # @param http_response [HTTP::Response] Raw HTTP response
    # @param options [Hash] Request options, passed on to ApiResponse
    # @return [ApiResponse]
    def handle_response(http_response, options)
      case http_response.status.code
      when 200..299
        ApiResponse.new(http_response, options)
      when 401
        raise AuthenticationError, "Invalid API key"
      when 429
        retry_after = http_response.headers["Retry-After"]&.to_i
        raise RateLimitError.new("Rate limited", retry_after: retry_after)
      when 403
        raise BotDetectedError.new("Bot detected", suggestion: "Try premium_proxy or antibot options")
      else
        body = http_response.body.to_s
        # Only the first 200 characters of the body go into the message.
        raise Error, "API error (#{http_response.status}): #{body[0, 200]}"
      end
    end
  end
end
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module Zenrows
  # Response wrapper for ZenRows API responses
  #
  # Provides convenient accessors for different response types based on
  # the options used in the request.
  #
  # @example HTML response
  #   response = api.get(url)
  #   response.html  # => "<html>..."
  #
  # @example JSON response with XHR data
  #   response = api.get(url, json_response: true)
  #   response.data  # => { "html" => "...", "xhr" => [...] }
  #   response.html  # => "<html>..."
  #   response.xhr   # => [...]
  #
  # @example Autoparse response
  #   response = api.get(url, autoparse: true)
  #   response.parsed  # => { "title" => "...", "price" => "..." }
  #
  # @example CSS extraction
  #   response = api.get(url, css_extractor: { title: 'h1' })
  #   response.extracted  # => { "title" => "Page Title" }
  #
  # @example Markdown response
  #   response = api.get(url, response_type: 'markdown')
  #   response.markdown  # => "# Page Title\n\n..."
  #
  # @author Ernest Bursa
  # @since 0.2.0
  # @api public
  class ApiResponse
    # Request options that make the API answer with a JSON document.
    JSON_OPTIONS = %i[json_response autoparse css_extractor outputs].freeze

    # @return [HTTP::Response] Raw HTTP response
    attr_reader :raw

    # @return [Integer] HTTP status code
    attr_reader :status

    # @return [Hash] Request options used
    attr_reader :options

    # @return [String] Raw response body
    attr_reader :body

    # Initialize response wrapper
    #
    # The body is read eagerly into a String; JSON parsing is deferred
    # until #data is first called.
    #
    # @param http_response [HTTP::Response] Raw HTTP response
    # @param options [Hash] Request options
    def initialize(http_response, options = {})
      @raw = http_response
      @status = http_response.status.code
      @options = options
      @body = http_response.body.to_s
      @parsed_json = nil
      @parsed = false
    end

    # Parsed data (for JSON responses)
    #
    # The body is parsed at most once. An explicit flag is used instead of
    # `||=` so that a body parsing to nil or false is not parsed again on
    # every call.
    #
    # @return [Hash, Array, String, nil] Parsed JSON, or the raw body when it
    #   is not JSON or fails to parse
    def data
      return @parsed_json if @parsed

      @parsed = true
      @parsed_json = parse_body
    end

    # HTML content
    #
    # For JSON-producing requests this is the "html" member of the parsed
    # document (or the parsed value itself when it is not a Hash);
    # otherwise it is the raw body.
    #
    # @return [String] HTML content
    def html
      return @body unless json_response?

      data.is_a?(Hash) ? data["html"] : data
    end

    # Markdown content (when response_type: 'markdown')
    #
    # @return [String] The raw body
    def markdown
      @body
    end

    # Parsed/extracted data (for autoparse or css_extractor)
    #
    # @return [Hash] Structured data (same value as #data)
    def parsed
      data
    end

    # Alias for parsed data when using css_extractor
    #
    # @return [Hash] Extracted data (same value as #data)
    def extracted
      data
    end

    # XHR/fetch request data (when json_response: true)
    #
    # @return [Array, nil] XHR request data
    def xhr
      json_member("xhr")
    end

    # JS instructions execution report (when json_response: true)
    #
    # @return [Hash, nil] Instructions report
    def js_instructions_report
      json_member("js_instructions_report")
    end

    # Screenshot data (when screenshot options used with json_response)
    #
    # @return [String, nil] Base64 encoded screenshot
    def screenshot
      json_member("screenshot")
    end

    # Response headers
    #
    # @return [Hash] Response headers (a fresh Hash on every call)
    def headers
      @raw.headers.to_h
    end

    # Concurrency limit from headers
    #
    # @return [Integer, nil] Max concurrent requests
    def concurrency_limit
      header_value("Concurrency-Limit")&.to_i
    end

    # Remaining concurrency from headers
    #
    # @return [Integer, nil] Available concurrent request slots
    def concurrency_remaining
      header_value("Concurrency-Remaining")&.to_i
    end

    # Request cost from headers
    #
    # @return [Float, nil] Credit cost of request
    def request_cost
      header_value("X-Request-Cost")&.to_f
    end

    # Final URL after redirects
    #
    # @return [String, nil] Final URL
    def final_url
      header_value("Zr-Final-Url")
    end

    # Check if response is successful
    #
    # @return [Boolean] true for a 2xx status
    def success?
      status.between?(200, 299)
    end

    private

    # Reads a top-level member of the parsed document, or nil when the
    # parsed data is not a Hash.
    def json_member(key)
      data.is_a?(Hash) ? data[key] : nil
    end

    # Looks a header up case-insensitively. When the header holds several
    # values (an Array), the first one is returned so numeric conversion in
    # the callers does not raise.
    #
    # @param name [String] Header name
    # @return [String, nil]
    def header_value(name)
      wanted = name.downcase
      pair = headers.find { |key, _value| key.to_s.downcase == wanted }
      return nil unless pair

      value = pair.last
      value.is_a?(Array) ? value.first : value
    end

    # @return [Boolean] whether the request used an option that yields JSON
    def json_response?
      JSON_OPTIONS.any? { |key| options[key] }
    end

    # Parses the body as JSON when JSON is expected or the body looks like
    # JSON; falls back to the raw body on a parse error.
    def parse_body
      return @body unless json_response? || looks_like_json?

      JSON.parse(@body)
    rescue JSON::ParserError
      @body
    end

    # Leading whitespace is ignored so an indented or newline-prefixed JSON
    # document is still recognized.
    def looks_like_json?
      @body.lstrip.start_with?("{", "[")
    end
  end
end
|
|
@@ -12,7 +12,7 @@ module Zenrows
|
|
|
12
12
|
# @example Basic usage
|
|
13
13
|
# backend = Zenrows::Backends::HttpRb.new(proxy: proxy, config: config)
|
|
14
14
|
# http = backend.build_client(js_render: true)
|
|
15
|
-
# response = http.get(url
|
|
15
|
+
# response = http.get(url) # SSL context is auto-configured
|
|
16
16
|
#
|
|
17
17
|
# @author Ernest Bursa
|
|
18
18
|
# @since 0.1.0
|
|
@@ -41,18 +41,17 @@ module Zenrows
|
|
|
41
41
|
# Calculate timeouts
|
|
42
42
|
timeouts = calculate_timeouts(opts)
|
|
43
43
|
|
|
44
|
-
# Build HTTP client
|
|
45
|
-
|
|
44
|
+
# Build HTTP client with SSL context and proxy
|
|
45
|
+
HTTP
|
|
46
46
|
.timeout(connect: timeouts[:connect], read: timeouts[:read])
|
|
47
47
|
.headers(headers)
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
)
|
|
48
|
+
.via(
|
|
49
|
+
proxy_config[:host],
|
|
50
|
+
proxy_config[:port],
|
|
51
|
+
proxy_config[:username],
|
|
52
|
+
proxy_config[:password],
|
|
53
|
+
ssl_context: ssl_context
|
|
54
|
+
)
|
|
56
55
|
end
|
|
57
56
|
end
|
|
58
57
|
end
|