zenrows 0.1.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/.mcp.json +10 -0
  3. data/.tool-versions +1 -0
  4. data/CHANGELOG.md +45 -0
  5. data/CLAUDE.md +1 -1
  6. data/Makefile +19 -0
  7. data/README.md +140 -19
  8. data/lib/zenrows/api_client.rb +243 -0
  9. data/lib/zenrows/api_response.rb +185 -0
  10. data/lib/zenrows/backends/base.rb +31 -1
  11. data/lib/zenrows/backends/http_rb.rb +17 -10
  12. data/lib/zenrows/backends/net_http.rb +149 -0
  13. data/lib/zenrows/client.rb +120 -11
  14. data/lib/zenrows/configuration.rb +117 -0
  15. data/lib/zenrows/css_extractor.rb +111 -0
  16. data/lib/zenrows/hooks/context.rb +142 -0
  17. data/lib/zenrows/hooks/log_subscriber.rb +124 -0
  18. data/lib/zenrows/hooks.rb +213 -0
  19. data/lib/zenrows/instrumented_client.rb +187 -0
  20. data/lib/zenrows/proxy.rb +19 -0
  21. data/lib/zenrows/version.rb +1 -1
  22. data/lib/zenrows.rb +14 -2
  23. data/sig/manifest.yaml +5 -0
  24. data/sig/zenrows/api_client.rbs +18 -0
  25. data/sig/zenrows/api_response.rbs +28 -0
  26. data/sig/zenrows/backends/base.rbs +12 -0
  27. data/sig/zenrows/backends/http_rb.rbs +3 -0
  28. data/sig/zenrows/backends/net_http.rbs +28 -0
  29. data/sig/zenrows/backends.rbs +2 -0
  30. data/sig/zenrows/client.rbs +12 -0
  31. data/sig/zenrows/configuration.rbs +29 -0
  32. data/sig/zenrows/css_extractor.rbs +14 -0
  33. data/sig/zenrows/errors.rbs +27 -0
  34. data/sig/zenrows/hook_configurator.rbs +9 -0
  35. data/sig/zenrows/hooks/context.rbs +6 -0
  36. data/sig/zenrows/hooks/log_subscriber.rbs +15 -0
  37. data/sig/zenrows/hooks.rbs +23 -0
  38. data/sig/zenrows/instrumented_client.rbs +22 -0
  39. data/sig/zenrows/js_instructions.rbs +28 -0
  40. data/sig/zenrows/proxy.rbs +14 -0
  41. data/sig/zenrows.rbs +4 -1
  42. data/test/test_helper.rb +42 -0
  43. data/test/zenrows/api_client_test.rb +161 -0
  44. data/test/zenrows/api_response_test.rb +142 -0
  45. data/test/zenrows/client_hooks_test.rb +105 -0
  46. data/test/zenrows/configuration_hooks_test.rb +101 -0
  47. data/test/zenrows/css_extractor_test.rb +84 -0
  48. data/test/zenrows/hooks/context_test.rb +150 -0
  49. data/test/zenrows/hooks/log_subscriber_test.rb +105 -0
  50. data/test/zenrows/hooks_test.rb +215 -0
  51. data/test/zenrows/instrumented_client_test.rb +153 -0
  52. data/test/zenrows/js_instructions_test.rb +2 -1
  53. data/test/zenrows/proxy_test.rb +39 -0
  54. metadata +42 -4
@@ -0,0 +1,185 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+
5
+ module Zenrows
6
+ # Response wrapper for ZenRows API responses
7
+ #
8
+ # Provides convenient accessors for different response types based on
9
+ # the options used in the request.
10
+ #
11
+ # @example HTML response
12
+ # response = api.get(url)
13
+ # response.html # => "<html>..."
14
+ #
15
+ # @example JSON response with XHR data
16
+ # response = api.get(url, json_response: true)
17
+ # response.data # => { "html" => "...", "xhr" => [...] }
18
+ # response.html # => "<html>..."
19
+ # response.xhr # => [...]
20
+ #
21
+ # @example Autoparse response
22
+ # response = api.get(url, autoparse: true)
23
+ # response.parsed # => { "title" => "...", "price" => "..." }
24
+ #
25
+ # @example CSS extraction
26
+ # response = api.get(url, css_extractor: { title: 'h1' })
27
+ # response.extracted # => { "title" => "Page Title" }
28
+ #
29
+ # @example Markdown response
30
+ # response = api.get(url, response_type: 'markdown')
31
+ # response.markdown # => "# Page Title\n\n..."
32
+ #
33
+ # @author Ernest Bursa
34
+ # @since 0.2.0
35
+ # @api public
36
class ApiResponse
  # Request options that cause the body to be treated as JSON.
  JSON_OPTION_KEYS = %i[json_response autoparse css_extractor outputs].freeze
  private_constant :JSON_OPTION_KEYS

  # @return [Object] Raw response object given to the constructor
  attr_reader :raw

  # @return [Integer] HTTP status code
  attr_reader :status

  # @return [Hash] Request options used
  attr_reader :options

  # @return [String] Raw response body
  attr_reader :body

  # Initialize response wrapper
  #
  # The response only needs to respond to +status+, +body+ and +headers+.
  # +status+ may be an object exposing +code+ or a plain Integer.
  #
  # @param http_response [#status, #body, #headers] Raw HTTP response
  # @param options [Hash, nil] Request options (nil is treated as empty)
  def initialize(http_response, options = {})
    @raw = http_response
    @status = status_code_of(http_response.status)
    @options = options || {}
    @body = http_response.body.to_s
    @parsed_json = nil
    @data_parsed = false
  end

  # Parsed data (for JSON responses)
  #
  # The body is parsed at most once, even when the parsed value is
  # +nil+ or +false+ (JSON +null+ / +false+).
  #
  # @return [Hash, Array, String, nil] Parsed JSON, or the raw body when it
  #   is not JSON or cannot be parsed
  def data
    return @parsed_json if @data_parsed

    @parsed_json = parse_body
    @data_parsed = true
    @parsed_json
  end

  # HTML content
  #
  # When a JSON-producing option was requested, returns the "html" entry of
  # the parsed Hash (or the parsed value itself when it is not a Hash);
  # otherwise returns the raw body.
  #
  # @return [String, Object] HTML content
  def html
    return @body unless json_response?

    data.is_a?(Hash) ? data["html"] : data
  end

  # Markdown content (when response_type: 'markdown')
  #
  # @return [String] Raw body, unmodified
  def markdown
    @body
  end

  # Parsed/extracted data (for autoparse or css_extractor)
  #
  # @return [Hash, Array, String, nil] Same value as {#data}
  def parsed
    data
  end

  # Alias for parsed data when using css_extractor
  #
  # @return [Hash, Array, String, nil] Same value as {#data}
  def extracted
    data
  end

  # XHR/fetch request data (when json_response: true)
  #
  # @return [Array, nil] "xhr" entry of the parsed Hash, nil otherwise
  def xhr
    data.is_a?(Hash) ? data["xhr"] : nil
  end

  # JS instructions execution report (when json_response: true)
  #
  # @return [Hash, nil] "js_instructions_report" entry, nil otherwise
  def js_instructions_report
    data.is_a?(Hash) ? data["js_instructions_report"] : nil
  end

  # Screenshot data (when screenshot options used with json_response)
  #
  # @return [String, nil] "screenshot" entry, nil otherwise
  def screenshot
    data.is_a?(Hash) ? data["screenshot"] : nil
  end

  # Response headers
  #
  # @return [Hash] Headers of the raw response converted with +to_h+
  def headers
    @raw.headers.to_h
  end

  # Concurrency limit from headers
  #
  # @return [Integer, nil] Max concurrent requests
  def concurrency_limit
    header_value("Concurrency-Limit")&.to_i
  end

  # Remaining concurrency from headers
  #
  # @return [Integer, nil] Available concurrent request slots
  def concurrency_remaining
    header_value("Concurrency-Remaining")&.to_i
  end

  # Request cost from headers
  #
  # @return [Float, nil] Credit cost of request
  def request_cost
    header_value("X-Request-Cost")&.to_f
  end

  # Final URL after redirects
  #
  # @return [String, nil] Final URL
  def final_url
    header_value("Zr-Final-Url")
  end

  # Check if response is successful
  #
  # @return [Boolean] true for status codes 200..299
  def success?
    status >= 200 && status < 300
  end

  private

  # Accept both status objects exposing +code+ and plain integers.
  def status_code_of(status)
    status.respond_to?(:code) ? status.code : status.to_i
  end

  # Look a header up ignoring the case of its name; backends differ in how
  # they capitalise keys. A multi-valued header yields its first value.
  def header_value(name)
    wanted = name.downcase
    _key, value = headers.find { |key, _| key.to_s.downcase == wanted }
    value.is_a?(Array) ? value.first : value
  end

  def json_response?
    JSON_OPTION_KEYS.any? { |key| options[key] }
  end

  # Parse the body as JSON when JSON was requested or the body looks like
  # JSON; fall back to the raw body on parse errors.
  def parse_body
    return @body unless json_response? || looks_like_json?

    JSON.parse(@body)
  rescue JSON::ParserError
    @body
  end

  # Leading whitespace is ignored so indented or newline-prefixed JSON is
  # still detected.
  def looks_like_json?
    @body.lstrip.start_with?("{", "[")
  end
end
185
+ end
@@ -18,11 +18,16 @@ module Zenrows
18
18
  # @return [Zenrows::Configuration] Configuration instance
19
19
  attr_reader :config
20
20
 
21
+ # @return [Zenrows::Hooks] Hook registry for this backend
22
+ attr_reader :hooks
23
+
21
24
  # @param proxy [Zenrows::Proxy] Proxy configuration builder
22
25
  # @param config [Zenrows::Configuration] Configuration instance
23
- def initialize(proxy:, config:)
26
+ # @param hooks [Zenrows::Hooks, nil] Optional hook registry (defaults to a copy of config.hooks, or a new empty registry when config has none)
27
+ def initialize(proxy:, config:, hooks: nil)
24
28
  @proxy = proxy
25
29
  @config = config
30
+ @hooks = hooks || config.hooks&.dup || Hooks.new
26
31
  end
27
32
 
28
33
  # Build a configured HTTP client
@@ -74,6 +79,31 @@ module Zenrows
74
79
  {connect: connect, read: read}
75
80
  end
76
81
 
82
# Decorate an HTTP client with hook instrumentation
#
# When the hook registry is empty the client is handed back untouched;
# otherwise it is wrapped in an InstrumentedClient that receives the
# registry plus a base context holding the request options and this
# backend's name.
#
# @param client [Object] The underlying HTTP client
# @param options [Hash] Request options used for this client
# @return [Object] Instrumented client or original if no hooks
def wrap_client(client, options)
  if hooks.empty?
    client
  else
    base_context = {options: options, backend: backend_name}
    InstrumentedClient.new(client, hooks: hooks, context_base: base_context)
  end
end
99
+
100
+ # Name of this backend, stored under :backend in the hook context built by wrap_client
101
+ #
102
+ # @return [Symbol] Backend identifier (:base here; HttpRb and NetHttp define their own)
103
+ def backend_name
104
+ :base
105
+ end
106
+
77
107
  private
78
108
 
79
109
  # Normalize wait value to seconds
@@ -12,7 +12,7 @@ module Zenrows
12
12
  # @example Basic usage
13
13
  # backend = Zenrows::Backends::HttpRb.new(proxy: proxy, config: config)
14
14
  # http = backend.build_client(js_render: true)
15
- # response = http.get(url, ssl_context: backend.ssl_context)
15
+ # response = http.get(url) # SSL context is auto-configured
16
16
  #
17
17
  # @author Ernest Bursa
18
18
  # @since 0.1.0
@@ -27,7 +27,7 @@ module Zenrows
27
27
  # @option options [Boolean, Integer] :wait Wait time
28
28
  # @option options [String] :wait_for CSS selector to wait for
29
29
  # @option options [Hash] :headers Custom HTTP headers
30
- # @return [HTTP::Client] Configured HTTP client
30
+ # @return [HTTP::Client, InstrumentedClient] Configured HTTP client (instrumented if hooks registered)
31
31
  def build_client(options = {})
32
32
  opts = options.dup
33
33
  headers = opts.delete(:headers) || {}
@@ -41,18 +41,25 @@ module Zenrows
41
41
  # Calculate timeouts
42
42
  timeouts = calculate_timeouts(opts)
43
43
 
44
- # Build HTTP client
44
+ # Build HTTP client with SSL context and proxy
45
45
  client = HTTP
46
46
  .timeout(connect: timeouts[:connect], read: timeouts[:read])
47
47
  .headers(headers)
48
+ .via(
49
+ proxy_config[:host],
50
+ proxy_config[:port],
51
+ proxy_config[:username],
52
+ proxy_config[:password],
53
+ ssl_context: ssl_context
54
+ )
48
55
 
49
- # Configure proxy
50
- client.via(
51
- proxy_config[:host],
52
- proxy_config[:port],
53
- proxy_config[:username],
54
- proxy_config[:password]
55
- )
56
+ # Wrap with instrumentation if hooks registered
57
+ wrap_client(client, opts)
58
+ end
59
+
60
+ # @return [Symbol] Backend identifier (:http_rb), reported in the hook context built by wrap_client
61
+ def backend_name
62
+ :http_rb
56
63
  end
57
64
  end
58
65
  end
@@ -0,0 +1,149 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "net/http"
4
+ require "uri"
5
+ require "openssl"
6
+
7
+ module Zenrows
8
+ module Backends
9
+ # Net::HTTP backend adapter (stdlib fallback)
10
+ #
11
+ # Uses Ruby's built-in Net::HTTP when http.rb is not available.
12
+ # Provides basic proxy support with SSL verification disabled.
13
+ #
14
+ # @example Basic usage
15
+ # backend = Zenrows::Backends::NetHttp.new(proxy: proxy, config: config)
16
+ # http = backend.build_client(js_render: true)
17
+ # response = http.get(url)
18
+ #
19
+ # @author Ernest Bursa
20
+ # @since 0.2.1
21
+ # @api public
22
class NetHttp < Base
  # Assemble a Net::HTTP-based client for the given request options
  #
  # A +:headers+ entry is taken out of the options and used as the
  # client's default headers; when it is non-empty, +custom_headers: true+
  # is set on the options passed to the proxy builder.
  #
  # @param options [Hash] Request options (not mutated; a copy is used)
  # @return [NetHttpClient, InstrumentedClient] Configured client wrapper (instrumented if hooks registered)
  def build_client(options = {})
    request_opts = options.dup
    default_headers = request_opts.delete(:headers) || {}
    request_opts[:custom_headers] = true if default_headers.any?

    wrap_client(
      NetHttpClient.new(
        proxy_config: proxy.build(request_opts),
        timeouts: calculate_timeouts(request_opts),
        headers: default_headers,
        ssl_context: ssl_context
      ),
      request_opts
    )
  end

  # @return [Symbol] Backend identifier
  def backend_name
    :net_http
  end
end
51
+
52
# Wrapper around Net::HTTP that mimics http.rb interface
#
# Each request opens its own Net::HTTP connection through the configured
# proxy and returns a NetHttpResponse.
#
# @api private
class NetHttpClient
  # @param proxy_config [Hash] Proxy configuration (:host, :port, :username, :password)
  # @param headers [Hash] Default headers sent with every request
  # @param timeouts [Hash] Timeout configuration (:connect, :read)
  # @param ssl_context [OpenSSL::SSL::SSLContext, nil] SSL context whose verify_mode is applied
  def initialize(proxy_config:, headers:, timeouts:, ssl_context:)
    @proxy_config = proxy_config
    @headers = headers
    @timeouts = timeouts
    @ssl_context = ssl_context
  end

  # Make GET request
  #
  # @param url [String, URI] Target URL
  # @param options [Hash] Request options
  # @option options [Hash] :headers Extra headers for this request only
  # @option options [OpenSSL::SSL::SSLContext] :ssl_context Overrides the default SSL context
  # @return [NetHttpResponse] Response wrapper
  def get(url, **options)
    uri = URI(url)
    request(uri, Net::HTTP::Get.new(uri), options)
  end

  # Make POST request
  #
  # @param url [String, URI] Target URL
  # @param body [String, nil] Request body
  # @param options [Hash] Request options (same keys as {#get})
  # @return [NetHttpResponse] Response wrapper
  def post(url, body: nil, **options)
    uri = URI(url)
    req = Net::HTTP::Post.new(uri)
    req.body = body if body
    request(uri, req, options)
  end

  private

  # Send +req+ to +uri+ and wrap the Net::HTTP response.
  def request(uri, req, options)
    apply_headers(req, options[:headers])
    NetHttpResponse.new(build_connection(uri, options).request(req))
  end

  # Set the default headers first, then per-request ones so they win.
  def apply_headers(req, overrides)
    [@headers, overrides].compact.each do |header_set|
      header_set.each { |name, value| req[name] = value }
    end
  end

  # Build a Net::HTTP object for +uri+ routed through the proxy, with
  # TLS, timeouts and verify mode configured. No connection is opened here.
  def build_connection(uri, options)
    http = Net::HTTP.new(
      uri.host,
      uri.port,
      @proxy_config[:host],
      @proxy_config[:port],
      @proxy_config[:username],
      @proxy_config[:password]
    )

    http.use_ssl = uri.scheme == "https"
    http.open_timeout = @timeouts[:connect]
    http.read_timeout = @timeouts[:read]

    # Only verify_mode is taken from the SSL context
    ctx = options[:ssl_context] || @ssl_context
    http.verify_mode = ctx.verify_mode if ctx

    http
  end
end
116
+
117
# Response wrapper that mimics http.rb response interface
#
# @api private
class NetHttpResponse
  # @return [Net::HTTPResponse] Raw response
  attr_reader :raw

  # @param response [Net::HTTPResponse] Response to wrap
  def initialize(response)
    @raw = response
  end

  # @return [String, nil] Response body exactly as reported by the raw response
  def body
    @raw.body
  end

  # @return [Integer] HTTP status code
  def status
    @raw.code.to_i
  end

  # Headers as a flat Hash. Only the first value of a repeated header is
  # kept; keys are whatever the raw response's +to_hash+ yields.
  #
  # @return [Hash] Response headers
  def headers
    @raw.to_hash.transform_values(&:first)
  end

  # Body as a String (http.rb compatibility). Always returns a String so
  # interpolation works even when the raw response has no body.
  #
  # @return [String]
  def to_s
    body.to_s
  end
end
148
+ end
149
+ end
@@ -13,12 +13,17 @@ module Zenrows
13
13
  #
14
14
  # client = Zenrows::Client.new
15
15
  # http = client.http(js_render: true)
16
- # response = http.get('https://example.com', ssl_context: client.ssl_context)
16
+ # response = http.get('https://example.com')
17
17
  #
18
18
  # @example With custom configuration
19
19
  # client = Zenrows::Client.new(api_key: 'KEY', host: 'proxy.zenrows.com')
20
20
  # http = client.http(premium_proxy: true, proxy_country: 'us')
21
21
  #
22
+ # @example With per-client hooks
23
+ # client = Zenrows::Client.new do |c|
24
+ # c.on_response { |resp, ctx| puts "#{ctx[:host]} -> #{resp.status}" }
25
+ # end
26
+ #
22
27
  # @author Ernest Bursa
23
28
  # @since 0.1.0
24
29
  # @api public
@@ -32,17 +37,30 @@ module Zenrows
32
37
  # @return [Backends::Base] HTTP backend instance
33
38
  attr_reader :backend
34
39
 
40
+ # @return [Hooks] Hook registry for this client
41
+ attr_reader :hooks
42
+
35
43
  # Initialize a new client
36
44
  #
37
45
  # @param api_key [String, nil] Override API key from global config
38
46
  # @param host [String, nil] Override proxy host
39
47
  # @param port [Integer, nil] Override proxy port
40
48
  # @param backend [Symbol] Backend to use (:http_rb or :net_http)
49
+ # @yield [config] Optional block for per-client configuration (hooks, etc.)
50
+ # @yieldparam config [Configuration] Client configuration for hook registration
41
51
  # @raise [ConfigurationError] if api_key is not configured
42
- def initialize(api_key: nil, host: nil, port: nil, backend: nil)
52
+ #
53
+ # @example With per-client hooks
54
+ # client = Zenrows::Client.new do |c|
55
+ # c.on_response { |resp, ctx| puts resp.status }
56
+ # end
57
+ def initialize(api_key: nil, host: nil, port: nil, backend: nil, &block)
43
58
  @config = build_config(api_key: api_key, host: host, port: port, backend: backend)
44
59
  @config.validate!
45
60
 
61
+ # Build hooks: start with global, allow per-client additions
62
+ @hooks = block ? build_hooks(&block) : Zenrows.configuration.hooks.dup
63
+
46
64
  @proxy = Proxy.new(
47
65
  api_key: @config.api_key,
48
66
  host: @config.host,
@@ -74,11 +92,11 @@ module Zenrows
74
92
  #
75
93
  # @example Basic request
76
94
  # http = client.http(js_render: true)
77
- # response = http.get(url, ssl_context: client.ssl_context)
95
+ # response = http.get(url)
78
96
  #
79
97
  # @example With premium proxy and country
80
98
  # http = client.http(premium_proxy: true, proxy_country: 'us')
81
- # response = http.get(url, ssl_context: client.ssl_context)
99
+ # response = http.get(url)
82
100
  def http(options = {})
83
101
  backend.build_client(options)
84
102
  end
@@ -86,12 +104,10 @@ module Zenrows
86
104
  # Get SSL context for proxy connections
87
105
  #
88
106
  # ZenRows proxy requires SSL verification to be disabled.
107
+ # This is automatically applied when using #http, but exposed
108
+ # for advanced use cases.
89
109
  #
90
110
  # @return [OpenSSL::SSL::SSLContext] SSL context
91
- #
92
- # @example
93
- # http = client.http(js_render: true)
94
- # response = http.get(url, ssl_context: client.ssl_context)
95
111
  def ssl_context
96
112
  backend.ssl_context
97
113
  end
@@ -148,12 +164,105 @@ module Zenrows
148
164
  # @return [Backends::Base] Backend instance
149
165
  # @raise [ConfigurationError] if backend is not supported
150
166
  def build_backend
151
- case config.backend
167
+ backend_name = resolve_backend
168
+ case backend_name
152
169
  when :http_rb
153
- Backends::HttpRb.new(proxy: proxy, config: config)
170
+ Backends::HttpRb.new(proxy: proxy, config: config, hooks: hooks)
171
+ when :net_http
172
+ Backends::NetHttp.new(proxy: proxy, config: config, hooks: hooks)
154
173
  else
155
- raise ConfigurationError, "Unsupported backend: #{config.backend}. Use :http_rb"
174
+ raise ConfigurationError, "Unsupported backend: #{backend_name}. Use :http_rb or :net_http"
175
+ end
176
+ end
177
+
178
# Assemble the hook registry used by this client
#
# A duplicate of the global registry is made, then the caller's block is
# given a HookConfigurator bound to that duplicate so it can register
# additional callbacks.
#
# @yield [config] Block for registering per-client hooks
# @return [Hooks] Combined hooks registry
def build_hooks
  Zenrows.configuration.hooks.dup.tap do |registry|
    yield(HookConfigurator.new(registry))
  end
end
194
+
195
# Decide which backend to instantiate
#
# Any configured value other than :http_rb is returned as is. For :http_rb
# the http.rb gem is probed and :net_http is returned when it cannot be
# loaded.
#
# @return [Symbol] Backend name
def resolve_backend
  requested = config.backend
  return requested unless requested == :http_rb

  http_rb_available? ? :http_rb : :net_http
end
209
+
210
# Probe for the http.rb gem by requiring it
#
# The require is done lazily here on purpose so the gem stays optional.
#
# @return [Boolean] true when "http" loads, false on LoadError
def http_rb_available?
  begin
    require "http"
  rescue LoadError
    return false
  end

  true
end
219
+ end
220
+
221
# Registration facade handed to the block given to Client.new
#
# Exposes one method per hook event plus add_subscriber; every method
# forwards to the wrapped registry and returns self so calls can be chained.
#
# @api private
class HookConfigurator
  # @param hooks [Hooks] Hook registry to configure
  def initialize(hooks)
    @hooks = hooks
  end

  # Register a before_request callback
  def before_request(callable = nil, &block)
    attach(:before_request, callable, &block)
  end

  # Register an after_request callback
  def after_request(callable = nil, &block)
    attach(:after_request, callable, &block)
  end

  # Register an on_response callback
  def on_response(callable = nil, &block)
    attach(:on_response, callable, &block)
  end

  # Register an on_error callback
  def on_error(callable = nil, &block)
    attach(:on_error, callable, &block)
  end

  # Register an around_request callback
  def around_request(callable = nil, &block)
    attach(:around_request, callable, &block)
  end

  # Add a subscriber object
  def add_subscriber(subscriber)
    @hooks.add_subscriber(subscriber)
    self
  end

  private

  # Forward a registration to the registry and return self for chaining.
  def attach(event, callable, &block)
    @hooks.register(event, callable, &block)
    self
  end
end
159
268
  end