scraper-central-ruby 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +20 -0
- data/README.md +153 -0
- data/lib/cache_server.rb +108 -0
- data/lib/proxy/base.rb +62 -0
- data/lib/proxy/bright_data.rb +29 -0
- data/lib/proxy/crawl_base.rb +32 -0
- data/lib/proxy/scraper_api.rb +34 -0
- data/lib/response.rb +12 -0
- data/lib/scraper_central/version.rb +5 -0
- data/lib/scraper_central.rb +106 -0
- data/scraper-central-ruby.gemspec +32 -0
- metadata +83 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA256:
  metadata.gz: 3a80d1ba8b02b21d51a048f7b153f04e23176ffcd5fc258f004ac903776fe831
  data.tar.gz: 9982839e9a725452a3f26022ccbd46eabea0aa8ac3b94fd1e4bdcda172ae58cc
SHA512:
  metadata.gz: 20f9efb238092af60d758971d8a84752a1cf24b9f927795352be2d66fc47784b3050b629300e0c7ed9b8baa04521970600851ce44d3a2b592f556a489a35ae5f
  data.tar.gz: c464c0c5a20b58366f2e738aee1cabb1b66555c3d9217b277fe9d9ab3433968f596be12dad4717738017b725b9044e3db0e24797aeec146d93d3abe1527119ce
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,20 @@
PATH
  remote: .
  specs:
    scraper-central-ruby (1.0.0)

GEM
  remote: https://rubygems.org/
  specs:
    rake (13.2.1)

PLATFORMS
  arm64-darwin-23

DEPENDENCIES
  bundler (~> 2.4.22)
  rake (~> 13.0)
  scraper-central-ruby!

BUNDLED WITH
   2.4.22
data/README.md
ADDED
@@ -0,0 +1,153 @@
# scraper-central-ruby
Ruby library to scrape and cache data.

## Usage

Add the gem `scraper-central-ruby` to your Gemfile:

```bash
gem 'scraper-central-ruby', git: 'git@github.com:patterninc/scraper-central-ruby.git', tag: 'v1.0.0'
```

```bash
bundle install
```

Add the following configuration (ENV variables) to the service where this gem is used:

```bash
SERVER_URL_GET_CACHE=server base url for get cache
SERVER_URL_PUT_CACHE=server base url for put cache
```
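
These variables are read via `ENV['SERVER_URL_GET_CACHE']` and `ENV['SERVER_URL_PUT_CACHE']` in `lib/cache_server.rb` when the `/get-cache` and `/put-cache` endpoints are called. A minimal sketch of setting them from Ruby (the URLs below are placeholders, not real endpoints):

```ruby
# Hypothetical values -- point these at your own cache server deployment.
ENV['SERVER_URL_GET_CACHE'] ||= 'https://cache.example.internal'
ENV['SERVER_URL_PUT_CACHE'] ||= 'https://cache.example.internal'
```
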
### New Scraper central object

Use `ScraperCentral.new` to create a new instance:

```ruby
scraper_central = ScraperCentral.new
```

### Options

Customize the ScraperCentral instance using the provided options:

```ruby
scraper_central = ScraperCentral.new
scraper_central.timeout = 45
scraper_central.tls_verify = false
scraper_central.enable_js = true
scraper_central.proxy_name = "ProxyName"
scraper_central.retry_attr = {
  count: 5,
  wait_time: 5,
  max_wait_time: 5
}
```

### Customizing Headers and Query Parameters

To add custom headers and query parameters to your requests:

```ruby
scraper_central = ScraperCentral.new
scraper_central.query_params = {
  "query": "value",
  "page_wait": "500"
}

scraper_central.headers = {
  "X-Custom-Header": "value",
  "mime-type": "text/html",
  "Encoding": "gzip",
  "ContentType": "application/json"
}
```

### Cookies

Customizing cookies (passed as a JSON string):

```ruby
scraper_central = ScraperCentral.new
scraper_central.cookies = '[
  {
    "Name":"amazon-cookie",
    "Value":"Some session value",
    "Path":"/",
    "Domain":"amazon.com",
    "MaxAge":36000,
    "HttpOnly":true,
    "Secure":false
  },
  {
    "Name":"walmart-cookie",
    "Value":"Some session value",
    "Path":"/",
    "Domain":"walmart.com",
    "MaxAge":72000,
    "HttpOnly":false,
    "Secure":true
  }
]'
```
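
Because `cookies=` takes a JSON string (see Proxy Methods below), the list can also be built as Ruby hashes and serialized. A minimal sketch:

```ruby
require 'json'

scraper_central = ScraperCentral.new

# Build the cookie list in Ruby and serialize it to the JSON string that
# `cookies=` expects. Field names follow the example above.
cookie_list = [
  { Name: 'amazon-cookie', Value: 'Some session value', Path: '/',
    Domain: 'amazon.com', MaxAge: 36_000, HttpOnly: true, Secure: false }
]
scraper_central.cookies = JSON.generate(cookie_list)
```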

### S3 Key Structure

To generate the S3 key, set the following attributes:

```ruby
scraper_central = ScraperCentral.new

scraper_central.s3_key = {
  country: "US",
  marketplace: "Amazon",
  page_type: "detail-page",
  identifier: "B0BQZBPS4G",
  page_number: 1
}
scraper_central.cache_duration = 360
```

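Together with `proxy_name` and `enable_js`, these values form the payload sent to the cache server's `/get-cache` endpoint (see `CacheServer#prepare_get_cache_payload` in `lib/cache_server.rb`). Roughly:

```ruby
# Approximately the payload built for the example above; the URL is a
# hypothetical argument passed to fetch.
payload = {
  url: "https://www.amazon.com/dp/B0BQZBPS4G",
  proxyName: "ProxyName",
  country: "US",
  enableJs: true,
  age: 360,
  marketplace: "Amazon",
  pageType: "detail-page",
  identifier: "B0BQZBPS4G",
  pageNumber: 1
}
```
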
### Fetch Content and Headers

Fetches the URL through the proxy:

```ruby
scraper_central = ScraperCentral.new

response = scraper_central.fetch("https://example.com")

...

puts "Response: ", response.body
puts "Status Code: ", response.code
puts "Headers: ", response.headers
```

## Documentation

### Configuration Functions

- `scraper_central.proxy_name=`: Sets the proxy service name, e.g. `CrawlBase`, `BrightData`, `ScraperApi`.
- `scraper_central.enable_js=`: Enables or disables JavaScript execution for proxies.
- `scraper_central.retry_attr=`: Configures retry logic, including the number of attempts and the wait time between attempts.
- `scraper_central.timeout=`: Sets the request timeout in seconds.
- `scraper_central.tls_verify=`: Configures TLS verification.

### Proxy Methods

- `scraper_central.query_params=`: Sets query parameters to be appended to each request URL.
- `scraper_central.headers=`: Adds custom headers to requests (e.g. Accept, Accept-Encoding or Content-Type).
- `scraper_central.cookies=`: Parses a JSON string of cookies and sets them for subsequent requests.

### Proxy Methods For S3 Key

- `scraper_central.s3_key=`: Sets the S3 key structure: country, marketplace, a predefined page-type name (e.g. `detail-page`), the page identifier (e.g. ASIN or Product ID), and the page number if the page is paginated.
- `scraper_central.cache_duration=`: Age of the object stored in the S3 bucket.

### Get Content

- `response = fetch(url)`: Makes a configured HTTP request to the specified URL and returns the response object.
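
Putting it together, a minimal end-to-end sketch (modeled on the commented example at the bottom of `lib/scraper_central.rb`; the URL and S3 key values are illustrative):

```ruby
require 'scraper_central'

scraper_central = ScraperCentral.new
scraper_central.proxy_name = 'BrightData'
scraper_central.cache_duration = 60
scraper_central.s3_key = {
  country: 'us',
  marketplace: 'Amazon',
  page_type: 'product-question',
  identifier: 'B082YK5C1T'
}

response = scraper_central.fetch('https://www.amazon.com/ask/questions/asin/B082YK5C1T')
puts response.code
puts response.body
```
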
data/lib/cache_server.rb
ADDED
@@ -0,0 +1,108 @@
# frozen_string_literal: true

require 'net/http'
require 'uri'
require 'json'
require 'logger'

class CacheServer
  def initialize(proxy_name, enable_js, cache_duration, s3_key)
    @proxy_name = proxy_name
    @enable_js = enable_js
    @cache_duration = cache_duration
    @s3_key = s3_key
    @logger = Logger.new($stdout)
  end

  def get_cache(url)
    payload = prepare_get_cache_payload(url)

    uri = URI.parse("#{ENV['SERVER_URL_GET_CACHE']}/get-cache")
    http = Net::HTTP.new(uri.host, uri.port)
    if uri.scheme == 'https'
      http.use_ssl = true
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end
    request = Net::HTTP::Post.new(uri.request_uri, 'Content-Type' => 'application/json')
    request.body = payload.to_json

    begin
      response = http.request(request)
      if response.content_type.include?('application/json')
        response_body = JSON.parse(response.body)
        return '', nil, proxy_from_server(response_body) if response_body.key?('proxyUrl')

        return response_body['body'], headers_from_server(response_body), nil
      else
        @logger.error "Unexpected response type: #{response.content_type}, body: #{response.body}, code: #{response.code}"
      end
    rescue StandardError => e
      @logger.error "Error sending request to server: #{e.message}"
    end
    ['', nil, nil]
  end

  def put_cache(cache_key, page, headers, cookies)
    payload = {
      cacheKey: cache_key,
      page: page,
      headers: headers,
      cookies: cookies
    }

    uri = URI.parse("#{ENV['SERVER_URL_PUT_CACHE']}/put-cache")
    http = Net::HTTP.new(uri.host, uri.port)
    if uri.scheme == 'https'
      http.use_ssl = true
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end

    request = Net::HTTP::Post.new(uri.request_uri, 'Content-Type' => 'application/json')
    request.body = payload.to_json

    begin
      response = http.request(request)
      if response.code.to_i != 200
        error_message = "Server returned bad status: #{response.code}"
        @logger.error error_message
        raise StandardError, error_message
      end
    rescue StandardError => e
      @logger.error "Error sending cache to server: #{e.message}"
      raise e
    end
  end

  private

  def headers_from_server(response_body)
    headers = {}
    if response_body['headers'].is_a?(Hash)
      response_body['headers'].each do |key, value|
        headers[key] = value if value.is_a?(String)
      end
    end
    headers
  end

  def proxy_from_server(response_body)
    proxy = {}
    response_body.each do |key, value|
      proxy[key] = value if value.is_a?(String)
    end
    proxy
  end

  def prepare_get_cache_payload(url)
    {
      url: url,
      proxyName: @proxy_name,
      country: @s3_key[:country],
      enableJs: @enable_js,
      age: @cache_duration,
      marketplace: @s3_key[:marketplace],
      pageType: @s3_key[:page_type],
      identifier: @s3_key[:identifier],
      pageNumber: @s3_key[:page_number]
    }
  end
end
data/lib/proxy/base.rb
ADDED
@@ -0,0 +1,62 @@
# frozen_string_literal: true

require 'net/http'
require 'uri'
require 'json'
require 'openssl'
require 'response'
require 'logger'

module Proxy
  class Base
    def initialize(params = {})
      @country = params[:country]
      @headers = params[:headers] || {}
      @query_params = params[:query_params] || {}
      @cookies = params[:cookies] || []
      @timeout = params[:timeout] || 60
      @tls_verify = params.fetch(:tls_verify, true)
      @retry = params[:retry_attr] || {}
      @enable_js = params.fetch(:enable_js, false)
      @logger = Logger.new($stdout)
    end

    def with_retry
      attempts = 0
      begin
        attempts += 1
        yield
      rescue StandardError => e
        raise e unless attempts <= @retry[:count].to_i

        sleep(@retry[:wait_time] || 5)
        retry
      end
    end

    def format_response(response)
      Response.new(
        code: response.code.to_i,
        body: response.body,
        headers: response.to_hash,
        cookies: response.get_fields('set-cookie')
      )
    end

    def prepare_request(uri, proxy_uri = nil)
      http = if proxy_uri.nil?
               Net::HTTP.new(uri.host, uri.port)
             else
               Net::HTTP.new(uri.host, uri.port, proxy_uri.host, proxy_uri.port, proxy_uri.user, proxy_uri.password)
             end
      if uri.scheme == 'https'
        http.use_ssl = true
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end

      http.read_timeout = @timeout
      http.open_timeout = @timeout
      http
    end
  end
end
data/lib/proxy/bright_data.rb
ADDED
@@ -0,0 +1,29 @@
# frozen_string_literal: true

require 'proxy/base'

module Proxy
  class BrightData < Proxy::Base
    def fetch(url, proxy_from_server)
      uri = URI.parse(url)
      proxy_uri = URI.parse(proxy_from_server['proxyUrl'])
      http = prepare_request(uri, proxy_uri)

      uri.query = URI.encode_www_form(@query_params) unless @query_params.empty?

      request = Net::HTTP::Get.new(uri, @headers)
      @cookies.each do |cookie|
        request.add_field('Cookie', "#{cookie[:name]}=#{cookie[:value]}")
      end

      response = with_retry do
        http.request(request)
      end

      format_response(response)
    rescue StandardError => e
      @logger.error("Request failed error: #{e.message}")
      nil
    end
  end
end
data/lib/proxy/crawl_base.rb
ADDED
@@ -0,0 +1,32 @@
# frozen_string_literal: true

require 'proxy/base'

module Proxy
  class CrawlBase < Proxy::Base
    def fetch(url, proxy_from_server)
      uri = URI.parse(proxy_from_server['proxyHost'])
      uri.query = URI.encode_www_form({
        'token' => proxy_from_server['proxyToken'],
        proxy_from_server['proxyCountryKey'] => @country,
        'url' => url
      }.merge(@query_params))

      request = Net::HTTP::Get.new(uri, @headers)
      http = prepare_request(uri)

      @cookies.each do |cookie|
        request.add_field('Cookie', "#{cookie[:name]}=#{cookie[:value]}")
      end

      response = with_retry do
        http.request(request)
      end

      format_response(response)
    rescue StandardError => e
      @logger.error("Request failed error: #{e.message}")
      nil
    end
  end
end
data/lib/proxy/scraper_api.rb
ADDED
@@ -0,0 +1,34 @@
# frozen_string_literal: true

require 'proxy/base'

module Proxy
  class ScraperApi < Proxy::Base
    def fetch(url, proxy_from_server)
      uri = URI.parse(proxy_from_server['proxyHost'])

      uri.query = URI.encode_www_form({
        'api_key' => proxy_from_server['proxyToken'],
        'render' => @enable_js.to_s,
        proxy_from_server['proxyCountryKey'] => @country,
        'url' => url
      }.merge(@query_params))

      request = Net::HTTP::Get.new(uri, @headers)
      http = prepare_request(uri)

      @cookies.each do |cookie|
        request.add_field('Cookie', "#{cookie[:name]}=#{cookie[:value]}")
      end

      response = with_retry do
        http.request(request)
      end

      format_response(response)
    rescue StandardError => e
      @logger.error("Request failed error: #{e.message}")
      nil
    end
  end
end
data/lib/scraper_central.rb
ADDED
@@ -0,0 +1,106 @@
# frozen_string_literal: true

require 'logger'
require 'cache_server'
require 'response'
require 'proxy/bright_data'
require 'proxy/crawl_base'
require 'proxy/scraper_api'

class ScraperCentral
  attr_accessor :cache_duration, :proxy_name, :s3_key, :enable_js, :tls_verify, :headers, :query_params, :cookies,
                :timeout, :retry_attr

  def initialize
    @lock = Mutex.new
    @logger = Logger.new($stdout)
  end

  def fetch(url)
    @lock.synchronize do
      @url = url
      page_from_server, headers_from_server, proxy_from_server = cache_server.get_cache(@url)
      if proxy_from_server.nil?
        print_proxy_values
        return Response.new(code: 200, body: page_from_server, headers: headers_from_server)
      else
        proxy_response = nil
        params = {
          country: s3_key[:country],
          headers: headers,
          query_params: query_params,
          cookies: cookies,
          timeout: timeout,
          tls_verify: tls_verify,
          retry_attr: retry_attr,
          enable_js: enable_js
        }

        case proxy_from_server['proxyName']
        when 'BrightData'
          proxy_response = Proxy::BrightData.new(params).fetch(@url, proxy_from_server)
        when 'CrawlBase'
          proxy_response = Proxy::CrawlBase.new(params).fetch(@url, proxy_from_server)
        when 'ScraperApi'
          proxy_response = Proxy::ScraperApi.new(params).fetch(@url, proxy_from_server)
        end

        if proxy_response.nil?
          @logger.error("Error fetching content from proxy: #{proxy_from_server['proxyName']}")
          return Response.new(code: 500,
                              body: StandardError.new("Error fetching content from proxy: #{proxy_from_server['proxyName']}"))
        end

        Thread.new do
          cache_server.put_cache(proxy_from_server['cacheKey'], proxy_response.body, proxy_response.headers,
                                 proxy_response.cookies)
          @logger.info('Cache successfully sent to server')
        rescue StandardError => e
          @logger.error("Error uploading cache to server: #{e.message}")
        end

        print_proxy_values

        proxy_response
      end
    end
  end

  def print_proxy_values
    @logger.info("url: #{@url}")
    unless s3_key.empty?
      @logger.info("marketplace: #{s3_key[:marketplace]}")
      @logger.info("country: #{s3_key[:country]}")
      @logger.info("identifier: #{s3_key[:identifier]}")
      @logger.info("page_type: #{s3_key[:page_type]}")
      @logger.info("page_number: #{s3_key[:page_number]}")
    end

    @logger.info("cache_duration: #{cache_duration}")
    @logger.info("proxy_name: #{proxy_name}")
    @logger.info("enable_js: #{enable_js}")
    @logger.info("tls_verify: #{tls_verify}") if tls_verify

    @logger.info("headers: #{headers}") if headers
    @logger.info("query_params: #{query_params}") if query_params
    @logger.info("cookies: #{cookies}") if cookies
    @logger.info("timeout: #{timeout}") if timeout
    @logger.info("retry_attr: #{retry_attr}") if retry_attr
  end

  def cache_server
    CacheServer.new(proxy_name, enable_js, cache_duration, s3_key)
  end
end

# url = 'https://www.amazon.com/ask/questions/asin/B082YK5C1T'
# scraper_central = ScraperCentral.new()
# scraper_central.cache_duration = 60
# scraper_central.proxy_name = 'BrightData'
# scraper_central.s3_key = {
#   country: 'us',
#   marketplace: 'Amazon',
#   identifier: 'B082YK5C1T',
#   page_type: 'product-question'
# }
# scraper_central.fetch(url)
data/scraper-central-ruby.gemspec
ADDED
@@ -0,0 +1,32 @@
# frozen_string_literal: true

lib = File.expand_path('lib', __dir__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'scraper_central'
require 'scraper_central/version'

Gem::Specification.new do |spec|
  spec.name = 'scraper-central-ruby'
  spec.version = ScraperCentral::VERSION
  spec.authors = ['Patterninc']
  spec.summary = 'Scraper central ruby library'
  spec.email = ['amol.udage@pattern.com']
  spec.homepage = 'https://github.com/patterninc/scraper-central-ruby'
  spec.license = 'MIT'

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir = 'exe'
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']
  spec.required_ruby_version = '>= 2.7.0'

  # gem install bundler:2.4.22
  # bundle _2.4.22_
  spec.add_development_dependency 'bundler', '~> 2.4.22'
  spec.add_development_dependency 'rake', '~> 13.0'
  # ... (other development dependencies)
end
metadata
ADDED
@@ -0,0 +1,83 @@
--- !ruby/object:Gem::Specification
name: scraper-central-ruby
version: !ruby/object:Gem::Version
  version: 1.0.0
platform: ruby
authors:
- Patterninc
autorequire:
bindir: exe
cert_chain: []
date: 2024-07-11 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: bundler
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: 2.4.22
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: 2.4.22
- !ruby/object:Gem::Dependency
  name: rake
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '13.0'
  type: :development
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '13.0'
description:
email:
- amol.udage@pattern.com
executables: []
extensions: []
extra_rdoc_files: []
files:
- Gemfile
- Gemfile.lock
- README.md
- lib/cache_server.rb
- lib/proxy/base.rb
- lib/proxy/bright_data.rb
- lib/proxy/crawl_base.rb
- lib/proxy/scraper_api.rb
- lib/response.rb
- lib/scraper_central.rb
- lib/scraper_central/version.rb
- scraper-central-ruby.gemspec
homepage: https://github.com/patterninc/scraper-central-ruby
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: 2.7.0
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubygems_version: 3.2.3
signing_key:
specification_version: 4
summary: Scraper central ruby library
test_files: []