scraper-central-ruby 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -1
- data/README.md +11 -1
- data/lib/cache_server.rb +39 -6
- data/lib/scraper_central/version.rb +1 -1
- data/lib/scraper_central.rb +14 -10
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 5c81c9909df9cbc05523c066e027df4aff98182c1b59f6e554e73ffed5bfbed1
|
|
4
|
+
data.tar.gz: 65254df4076335ac2df67f95a0f68f779d8f9eac6ee2d97232cd8b6dfb69556b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ed50acebd0f138ed65d8d354d3f8b71c0558d6720d1bae845831f712d8abf0d4b6f4e0e770720fb235aa469a37703cc47b27596060b9eb5fab23b82c0c3eadfe
|
|
7
|
+
data.tar.gz: f040bec9965ae7c6bd6c20126cf76a20ad9aa3c84481df20f598c0f46f1fac0eba9f1b35cd4570c014a3527e645eec481b9ecf8c01d39e6eaa19b6d0889349af
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
scraper-central-ruby (2.1.0)
|
|
4
|
+
scraper-central-ruby (2.2.0)
|
|
5
5
|
activesupport (~> 7.0)
|
|
6
6
|
brotli (~> 0.5.0)
|
|
7
7
|
net-http-persistent (~> 4.0)
|
|
@@ -43,6 +43,7 @@ GEM
|
|
|
43
43
|
|
|
44
44
|
PLATFORMS
|
|
45
45
|
arm64-darwin-23
|
|
46
|
+
arm64-darwin-25
|
|
46
47
|
|
|
47
48
|
DEPENDENCIES
|
|
48
49
|
brotli (~> 0.5.0)
|
data/README.md
CHANGED
|
@@ -7,7 +7,7 @@ Ruby library to scrape and cache the data
|
|
|
7
7
|
Add gem `scraper-central-ruby` into Gemfile:
|
|
8
8
|
|
|
9
9
|
```bash
|
|
10
|
-
gem 'scraper-central-ruby', git: 'git@github.com:patterninc/scraper-central-ruby.git', tag: 'v2.1.0'
|
|
10
|
+
gem 'scraper-central-ruby', git: 'git@github.com:patterninc/scraper-central-ruby.git', tag: 'v2.2.0'
|
|
11
11
|
```
|
|
12
12
|
|
|
13
13
|
```bash
|
|
@@ -143,6 +143,15 @@ puts "Status Code: ", response.code
|
|
|
143
143
|
puts "Headers: ", response.headers
|
|
144
144
|
```
|
|
145
145
|
|
|
146
|
+
### Single-roundtrip get-dom
|
|
147
|
+
|
|
148
|
+
Use `enable_get_dom = true` to call `/v1/get-dom` on the scraper-central server. On cache miss the server scrapes and returns HTML in one request (no client-side proxy or put-cache). Reuses `SERVER_URL_PUT_CACHE` when set, otherwise `SERVER_URL_GET_CACHE`. `cache_duration` is required (`age` in the request payload).
|
|
149
|
+
|
|
150
|
+
```ruby
|
|
151
|
+
scraper_central.enable_get_dom = true
|
|
152
|
+
response = scraper_central.fetch("https://www.example-brand.com/product")
|
|
153
|
+
```
|
|
154
|
+
|
|
146
155
|
## Documentation
|
|
147
156
|
|
|
148
157
|
### Configuration Functions
|
|
@@ -153,6 +162,7 @@ puts "Headers: ", response.headers
|
|
|
153
162
|
- `scraper_central.timeout=`: Sets the request timeout in seconds.
|
|
154
163
|
- `scraper_central.tls_verify=`: Configures TLS verification.
|
|
155
164
|
- `scraper_central.enable_image_cache=` Enable/Disable image caching by setting true/false
|
|
165
|
+
- `scraper_central.enable_get_dom=`: Fetches via `/v1/get-dom` for synchronous cache-or-scrape in one roundtrip.
|
|
156
166
|
### Proxy Methods
|
|
157
167
|
|
|
158
168
|
- `scraper_central.query_params=`: Sets query parameters to be appended to each request URL.
|
data/lib/cache_server.rb
CHANGED
|
@@ -9,6 +9,10 @@ require 'auth'
|
|
|
9
9
|
class CacheServer
|
|
10
10
|
AUTH0_BEARER_HEADER = 'Authorization'.freeze
|
|
11
11
|
CONTENT_TYPE_HEADER = 'Content-Type'.freeze
|
|
12
|
+
GET_CACHE_ENDPOINT = '/v1/get-cache'.freeze
|
|
13
|
+
GET_DOM_ENDPOINT = '/v1/get-dom'.freeze
|
|
14
|
+
PUT_CACHE_ENDPOINT = '/v1/put-cache'.freeze
|
|
15
|
+
HTTP_OK = 200
|
|
12
16
|
|
|
13
17
|
# Persistent HTTP clients shared across all instances — reuses TCP connections
|
|
14
18
|
# instead of opening a new connection on every request.
|
|
@@ -26,35 +30,64 @@ class CacheServer
|
|
|
26
30
|
@s3_key = args[:s3_key]
|
|
27
31
|
@enable_image_cache = args[:enable_image_cache]
|
|
28
32
|
@auth_config = args[:auth_config]
|
|
33
|
+
@use_get_dom = args[:use_get_dom]
|
|
29
34
|
@logger = Logger.new($stdout)
|
|
30
35
|
end
|
|
31
36
|
|
|
32
37
|
def get_cache(url)
|
|
38
|
+
base_url = ENV['SERVER_URL_GET_CACHE']
|
|
39
|
+
endpoint = GET_CACHE_ENDPOINT
|
|
40
|
+
if @use_get_dom
|
|
41
|
+
put_cache = ENV['SERVER_URL_PUT_CACHE']
|
|
42
|
+
base_url = put_cache.to_s != '' ? put_cache : ENV['SERVER_URL_GET_CACHE']
|
|
43
|
+
endpoint = GET_DOM_ENDPOINT
|
|
44
|
+
if base_url.nil? || base_url.empty?
|
|
45
|
+
raise StandardError, 'SERVER_URL_PUT_CACHE or SERVER_URL_GET_CACHE must be set for get-dom'
|
|
46
|
+
end
|
|
47
|
+
end
|
|
48
|
+
|
|
33
49
|
payload = prepare_get_cache_payload(url)
|
|
34
50
|
|
|
35
|
-
uri = URI.parse("#{ENV['SERVER_URL_GET_CACHE']}/v1/get-cache")
|
|
51
|
+
uri = URI.parse("#{base_url}#{endpoint}")
|
|
36
52
|
request = Net::HTTP::Get.new(uri.request_uri, auth_headers)
|
|
37
53
|
request.body = payload.to_json
|
|
38
54
|
|
|
39
55
|
begin
|
|
40
56
|
response = GET_HTTP.request(uri, request)
|
|
41
57
|
|
|
58
|
+
response_code = response.code&.to_i
|
|
59
|
+
if @use_get_dom && response_code != HTTP_OK
|
|
60
|
+
raise StandardError, "get-dom returned status #{response.code}: #{response.body}"
|
|
61
|
+
end
|
|
62
|
+
|
|
42
63
|
if response.content_type.include?('application/json')
|
|
43
64
|
response_body = JSON.parse(response.body)
|
|
44
|
-
return '', nil, proxy_from_server(response_body) if response_body.key?('proxyUrl')
|
|
65
|
+
return '', nil, proxy_from_server(response_body), 200 if response_body.key?('proxyUrl')
|
|
45
66
|
page_from_server = response_body['body']
|
|
46
67
|
if @enable_image_cache
|
|
47
68
|
decoded_data = Base64.decode64(page_from_server)
|
|
48
69
|
page_from_server = StringIO.new(decoded_data)
|
|
49
70
|
end
|
|
50
|
-
|
|
71
|
+
status_code = response_body['statusCode'] || 200
|
|
72
|
+
return page_from_server, headers_from_server(response_body), nil, status_code
|
|
73
|
+
elsif @use_get_dom
|
|
74
|
+
raise StandardError, "get-dom returned unexpected content type #{response.content_type.inspect}"
|
|
51
75
|
else
|
|
52
76
|
@logger.error "Unexpected response type: #{response.content_type}, body: #{response.body}, code: #{response.code}"
|
|
53
77
|
end
|
|
78
|
+
rescue JSON::ParserError => e
|
|
79
|
+
if @use_get_dom
|
|
80
|
+
raise StandardError, "error unmarshaling get-dom response: #{e.message}"
|
|
81
|
+
end
|
|
82
|
+
@logger.error "Error sending request to server: #{e.message}"
|
|
54
83
|
rescue StandardError => e
|
|
84
|
+
if @use_get_dom
|
|
85
|
+
@logger.error "Error sending request to server: #{e.message}"
|
|
86
|
+
raise
|
|
87
|
+
end
|
|
55
88
|
@logger.error "Error sending request to server: #{e.message}"
|
|
56
89
|
end
|
|
57
|
-
['', nil, nil]
|
|
90
|
+
['', nil, nil, 200]
|
|
58
91
|
end
|
|
59
92
|
|
|
60
93
|
def put_cache(cache_key, page, headers, cookies, enable_image_cache)
|
|
@@ -66,13 +99,13 @@ class CacheServer
|
|
|
66
99
|
enableImageCache: enable_image_cache
|
|
67
100
|
}
|
|
68
101
|
|
|
69
|
-
uri = URI.parse("#{ENV['SERVER_URL_PUT_CACHE']}/v1/put-cache")
|
|
102
|
+
uri = URI.parse("#{ENV['SERVER_URL_PUT_CACHE']}#{PUT_CACHE_ENDPOINT}")
|
|
70
103
|
request = Net::HTTP::Post.new(uri.request_uri, auth_headers)
|
|
71
104
|
request.body = payload.to_json
|
|
72
105
|
|
|
73
106
|
begin
|
|
74
107
|
response = PUT_HTTP.request(uri, request)
|
|
75
|
-
if response.code
|
|
108
|
+
if response.code&.to_i != HTTP_OK
|
|
76
109
|
error_message = "Server returned bad status: #{response.code}"
|
|
77
110
|
@logger.error error_message
|
|
78
111
|
raise StandardError, error_message
|
data/lib/scraper_central.rb
CHANGED
|
@@ -9,7 +9,7 @@ require 'proxy/scraper_api'
|
|
|
9
9
|
|
|
10
10
|
class ScraperCentral
|
|
11
11
|
attr_accessor :cache_duration, :proxy_name, :s3_key, :enable_js, :tls_verify, :headers, :query_params, :cookies,
|
|
12
|
-
:timeout, :retry_attr, :enable_image_cache, :auth_config
|
|
12
|
+
:timeout, :retry_attr, :enable_image_cache, :auth_config, :enable_get_dom
|
|
13
13
|
|
|
14
14
|
def initialize
|
|
15
15
|
@lock = Mutex.new
|
|
@@ -19,10 +19,11 @@ class ScraperCentral
|
|
|
19
19
|
def fetch(url)
|
|
20
20
|
@lock.synchronize do
|
|
21
21
|
@url = url
|
|
22
|
-
page_from_server, headers_from_server, proxy_from_server = cache_server.get_cache(@url)
|
|
22
|
+
page_from_server, headers_from_server, proxy_from_server, status_code = cache_server.get_cache(@url)
|
|
23
23
|
if proxy_from_server.nil?
|
|
24
24
|
print_proxy_values
|
|
25
|
-
|
|
25
|
+
response_code = enable_get_dom ? status_code : 200
|
|
26
|
+
return Response.new(code: response_code, body: page_from_server, headers: headers_from_server)
|
|
26
27
|
else
|
|
27
28
|
proxy_response = nil
|
|
28
29
|
|
|
@@ -53,12 +54,14 @@ class ScraperCentral
|
|
|
53
54
|
return Response.new(code: status_code)
|
|
54
55
|
end
|
|
55
56
|
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
57
|
+
unless enable_get_dom
|
|
58
|
+
Thread.new do
|
|
59
|
+
cache_server.put_cache(proxy_from_server['cacheKey'], proxy_response.body, proxy_response.headers,
|
|
60
|
+
proxy_response.cookies, enable_image_cache)
|
|
61
|
+
@logger.info("Cache successfully sent to server key: #{proxy_from_server['cacheKey']}")
|
|
62
|
+
rescue StandardError => e
|
|
63
|
+
@logger.error("Error uploading cache to server key: #{proxy_from_server['cacheKey']}, error: #{e.message}")
|
|
64
|
+
end
|
|
62
65
|
end
|
|
63
66
|
|
|
64
67
|
print_proxy_values
|
|
@@ -98,7 +101,8 @@ class ScraperCentral
|
|
|
98
101
|
cache_duration: cache_duration,
|
|
99
102
|
s3_key: s3_key,
|
|
100
103
|
enable_image_cache: enable_image_cache,
|
|
101
|
-
auth_config: auth_config
|
|
104
|
+
auth_config: auth_config,
|
|
105
|
+
use_get_dom: enable_get_dom
|
|
102
106
|
}
|
|
103
107
|
CacheServer.new(args)
|
|
104
108
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: scraper-central-ruby
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.1.0
|
|
4
|
+
version: 2.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Patterninc
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-06-10 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: brotli
|