scraper-central-ruby 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 344375e9baae6533bc98bb11d109dbe8771592d0d7029a83f3ee27176981938c
4
- data.tar.gz: 1d02478a129bc7641c74733e3e4de6495112a2151ba6b2802fa1cf0a44f71c04
3
+ metadata.gz: 5c81c9909df9cbc05523c066e027df4aff98182c1b59f6e554e73ffed5bfbed1
4
+ data.tar.gz: 65254df4076335ac2df67f95a0f68f779d8f9eac6ee2d97232cd8b6dfb69556b
5
5
  SHA512:
6
- metadata.gz: 0f6bcada267645f395aaf66aa6e39062beec893b2c3734be037b10cc7fda6664a51e37ab2de65c35cfc3a9786154b85955402f5c82f4934af844d99c202276e1
7
- data.tar.gz: 60a335b4d9e62b53d98cde2436e522f0d93598251a64527439f7dd85576c612afeac5490ec81429f271c6eabd73f20872e299076abcef83606bae0c027ffe91c
6
+ metadata.gz: ed50acebd0f138ed65d8d354d3f8b71c0558d6720d1bae845831f712d8abf0d4b6f4e0e770720fb235aa469a37703cc47b27596060b9eb5fab23b82c0c3eadfe
7
+ data.tar.gz: f040bec9965ae7c6bd6c20126cf76a20ad9aa3c84481df20f598c0f46f1fac0eba9f1b35cd4570c014a3527e645eec481b9ecf8c01d39e6eaa19b6d0889349af
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- scraper-central-ruby (2.1.0)
4
+ scraper-central-ruby (2.2.0)
5
5
  activesupport (~> 7.0)
6
6
  brotli (~> 0.5.0)
7
7
  net-http-persistent (~> 4.0)
@@ -43,6 +43,7 @@ GEM
43
43
 
44
44
  PLATFORMS
45
45
  arm64-darwin-23
46
+ arm64-darwin-25
46
47
 
47
48
  DEPENDENCIES
48
49
  brotli (~> 0.5.0)
data/README.md CHANGED
@@ -7,7 +7,7 @@ Ruby library to scrape and cache the data
7
7
  Add gem `scraper-central-ruby` into Gemfile:
8
8
 
9
9
  ```bash
10
- gem 'scraper-central-ruby', git: 'git@github.com:patterninc/scraper-central-ruby.git', tag: 'v2.1.0'
10
+ gem 'scraper-central-ruby', git: 'git@github.com:patterninc/scraper-central-ruby.git', tag: 'v2.2.0'
11
11
  ```
12
12
 
13
13
  ```bash
@@ -143,6 +143,15 @@ puts "Status Code: ", response.code
143
143
  puts "Headers: ", response.headers
144
144
  ```
145
145
 
146
+ ### Single-roundtrip get-dom
147
+
148
+ Use `enable_get_dom = true` to call `/v1/get-dom` on the scraper-central server. On cache miss the server scrapes and returns HTML in one request (no client-side proxy or put-cache). Reuses `SERVER_URL_PUT_CACHE` when set, otherwise `SERVER_URL_GET_CACHE`. `cache_duration` is required (`age` in the request payload).
149
+
150
+ ```ruby
151
+ scraper_central.enable_get_dom = true
152
+ response = scraper_central.fetch("https://www.example-brand.com/product")
153
+ ```
154
+
146
155
  ## Documentation
147
156
 
148
157
  ### Configuration Functions
@@ -153,6 +162,7 @@ puts "Headers: ", response.headers
153
162
  - `scraper_central.timeout=`: Sets the request timeout in seconds.
154
163
  - `scraper_central.tls_verify=`: Configures TLS verification.
155
164
  - `scraper_central.enable_image_cache=` Enable/Disable image caching by setting true/false
165
+ - `scraper_central.enable_get_dom=`: Fetches via `/v1/get-dom` for synchronous cache-or-scrape in one roundtrip.
156
166
  ### Proxy Methods
157
167
 
158
168
  - `scraper_central.query_params=`: Sets query parameters to be appended to each request URL.
data/lib/cache_server.rb CHANGED
@@ -9,6 +9,10 @@ require 'auth'
9
9
  class CacheServer
10
10
  AUTH0_BEARER_HEADER = 'Authorization'.freeze
11
11
  CONTENT_TYPE_HEADER = 'Content-Type'.freeze
12
+ GET_CACHE_ENDPOINT = '/v1/get-cache'.freeze
13
+ GET_DOM_ENDPOINT = '/v1/get-dom'.freeze
14
+ PUT_CACHE_ENDPOINT = '/v1/put-cache'.freeze
15
+ HTTP_OK = 200
12
16
 
13
17
  # Persistent HTTP clients shared across all instances — reuses TCP connections
14
18
  # instead of opening a new connection on every request.
@@ -26,35 +30,64 @@ class CacheServer
26
30
  @s3_key = args[:s3_key]
27
31
  @enable_image_cache = args[:enable_image_cache]
28
32
  @auth_config = args[:auth_config]
33
+ @use_get_dom = args[:use_get_dom]
29
34
  @logger = Logger.new($stdout)
30
35
  end
31
36
 
32
37
  def get_cache(url)
38
+ base_url = ENV['SERVER_URL_GET_CACHE']
39
+ endpoint = GET_CACHE_ENDPOINT
40
+ if @use_get_dom
41
+ put_cache = ENV['SERVER_URL_PUT_CACHE']
42
+ base_url = put_cache.to_s != '' ? put_cache : ENV['SERVER_URL_GET_CACHE']
43
+ endpoint = GET_DOM_ENDPOINT
44
+ if base_url.nil? || base_url.empty?
45
+ raise StandardError, 'SERVER_URL_PUT_CACHE or SERVER_URL_GET_CACHE must be set for get-dom'
46
+ end
47
+ end
48
+
33
49
  payload = prepare_get_cache_payload(url)
34
50
 
35
- uri = URI.parse("#{ENV['SERVER_URL_GET_CACHE']}/v1/get-cache")
51
+ uri = URI.parse("#{base_url}#{endpoint}")
36
52
  request = Net::HTTP::Get.new(uri.request_uri, auth_headers)
37
53
  request.body = payload.to_json
38
54
 
39
55
  begin
40
56
  response = GET_HTTP.request(uri, request)
41
57
 
58
+ response_code = response.code&.to_i
59
+ if @use_get_dom && response_code != HTTP_OK
60
+ raise StandardError, "get-dom returned status #{response.code}: #{response.body}"
61
+ end
62
+
42
63
  if response.content_type.include?('application/json')
43
64
  response_body = JSON.parse(response.body)
44
- return '', nil, proxy_from_server(response_body) if response_body.key?('proxyUrl')
65
+ return '', nil, proxy_from_server(response_body), 200 if response_body.key?('proxyUrl')
45
66
  page_from_server = response_body['body']
46
67
  if @enable_image_cache
47
68
  decoded_data = Base64.decode64(page_from_server)
48
69
  page_from_server = StringIO.new(decoded_data)
49
70
  end
50
- return page_from_server, headers_from_server(response_body), nil
71
+ status_code = response_body['statusCode'] || 200
72
+ return page_from_server, headers_from_server(response_body), nil, status_code
73
+ elsif @use_get_dom
74
+ raise StandardError, "get-dom returned unexpected content type #{response.content_type.inspect}"
51
75
  else
52
76
  @logger.error "Unexpected response type: #{response.content_type}, body: #{response.body}, code: #{response.code}"
53
77
  end
78
+ rescue JSON::ParserError => e
79
+ if @use_get_dom
80
+ raise StandardError, "error unmarshaling get-dom response: #{e.message}"
81
+ end
82
+ @logger.error "Error sending request to server: #{e.message}"
54
83
  rescue StandardError => e
84
+ if @use_get_dom
85
+ @logger.error "Error sending request to server: #{e.message}"
86
+ raise
87
+ end
55
88
  @logger.error "Error sending request to server: #{e.message}"
56
89
  end
57
- ['', nil, nil]
90
+ ['', nil, nil, 200]
58
91
  end
59
92
 
60
93
  def put_cache(cache_key, page, headers, cookies, enable_image_cache)
@@ -66,13 +99,13 @@ class CacheServer
66
99
  enableImageCache: enable_image_cache
67
100
  }
68
101
 
69
- uri = URI.parse("#{ENV['SERVER_URL_PUT_CACHE']}/v1/put-cache")
102
+ uri = URI.parse("#{ENV['SERVER_URL_PUT_CACHE']}#{PUT_CACHE_ENDPOINT}")
70
103
  request = Net::HTTP::Post.new(uri.request_uri, auth_headers)
71
104
  request.body = payload.to_json
72
105
 
73
106
  begin
74
107
  response = PUT_HTTP.request(uri, request)
75
- if response.code.to_i != 200
108
+ if response.code&.to_i != HTTP_OK
76
109
  error_message = "Server returned bad status: #{response.code}"
77
110
  @logger.error error_message
78
111
  raise StandardError, error_message
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  class ScraperCentral
4
- VERSION = '2.1.0'
4
+ VERSION = '2.2.0'
5
5
  end
@@ -9,7 +9,7 @@ require 'proxy/scraper_api'
9
9
 
10
10
  class ScraperCentral
11
11
  attr_accessor :cache_duration, :proxy_name, :s3_key, :enable_js, :tls_verify, :headers, :query_params, :cookies,
12
- :timeout, :retry_attr, :enable_image_cache, :auth_config
12
+ :timeout, :retry_attr, :enable_image_cache, :auth_config, :enable_get_dom
13
13
 
14
14
  def initialize
15
15
  @lock = Mutex.new
@@ -19,10 +19,11 @@ class ScraperCentral
19
19
  def fetch(url)
20
20
  @lock.synchronize do
21
21
  @url = url
22
- page_from_server, headers_from_server, proxy_from_server = cache_server.get_cache(@url)
22
+ page_from_server, headers_from_server, proxy_from_server, status_code = cache_server.get_cache(@url)
23
23
  if proxy_from_server.nil?
24
24
  print_proxy_values
25
- return Response.new(code: 200, body: page_from_server, headers: headers_from_server)
25
+ response_code = enable_get_dom ? status_code : 200
26
+ return Response.new(code: response_code, body: page_from_server, headers: headers_from_server)
26
27
  else
27
28
  proxy_response = nil
28
29
 
@@ -53,12 +54,14 @@ class ScraperCentral
53
54
  return Response.new(code: status_code)
54
55
  end
55
56
 
56
- Thread.new do
57
- cache_server.put_cache(proxy_from_server['cacheKey'], proxy_response.body, proxy_response.headers,
58
- proxy_response.cookies, enable_image_cache)
59
- @logger.info("Cache successfully sent to server key: #{proxy_from_server['cacheKey']}")
60
- rescue StandardError => e
61
- @logger.error("Error uploading cache to server key: #{proxy_from_server['cacheKey']}, error: #{e.message}")
57
+ unless enable_get_dom
58
+ Thread.new do
59
+ cache_server.put_cache(proxy_from_server['cacheKey'], proxy_response.body, proxy_response.headers,
60
+ proxy_response.cookies, enable_image_cache)
61
+ @logger.info("Cache successfully sent to server key: #{proxy_from_server['cacheKey']}")
62
+ rescue StandardError => e
63
+ @logger.error("Error uploading cache to server key: #{proxy_from_server['cacheKey']}, error: #{e.message}")
64
+ end
62
65
  end
63
66
 
64
67
  print_proxy_values
@@ -98,7 +101,8 @@ class ScraperCentral
98
101
  cache_duration: cache_duration,
99
102
  s3_key: s3_key,
100
103
  enable_image_cache: enable_image_cache,
101
- auth_config: auth_config
104
+ auth_config: auth_config,
105
+ use_get_dom: enable_get_dom
102
106
  }
103
107
  CacheServer.new(args)
104
108
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: scraper-central-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.0
4
+ version: 2.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Patterninc
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-04-23 00:00:00.000000000 Z
11
+ date: 2026-06-10 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: brotli