relaton-cie 2.1.1 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/relaton/cie/data_fetcher.rb +76 -15
- data/lib/relaton/cie/version.rb +1 -1
- data/relaton-cie.gemspec +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 905831793252b7a1a462a6cc7e4462d0c90c3a4089894cfd3f991fffa87cbe37
|
|
4
|
+
data.tar.gz: 4740b925366997bd689ca50a690e86653e15afe78ea190e5a17fcc7c43b4b726
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7ef9caf1b5421a8e0c7a31238f1ccb4767987827df554fc6e0fc24f56be74183e78e5a4f3093d90c0e6fa4060997288b217d15f7e91ad7bb7e2da31cc318587e
|
|
7
|
+
data.tar.gz: d68a2916f072bfb7113b154c38ba25e96e616fd33fb9094f7d145bdb7aa4bff27c27c02bc5f1b21e8eb21545e9b6e5ab4a89b72f56c877c5663b862281a6a4ef
|
|
@@ -2,7 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
require "English"
|
|
4
4
|
require "fileutils"
|
|
5
|
-
require "mechanize"
|
|
5
|
+
require "ferrum"
|
|
6
|
+
require "nokogiri"
|
|
6
7
|
require "relaton/index"
|
|
7
8
|
require "relaton/bib"
|
|
8
9
|
require "relaton/core/data_fetcher"
|
|
@@ -10,22 +11,71 @@ require_relative "../cie"
|
|
|
10
11
|
|
|
11
12
|
module Relaton
|
|
12
13
|
module Cie
|
|
14
|
+
# Thin Ferrum-backed HTTP agent that mimics the Mechanize#get interface
|
|
15
|
+
# used elsewhere in DataFetcher. Drives headless Chrome with stealth
|
|
16
|
+
# tweaks so the CIE catalogue host (Cloudflare-protected accuristech)
|
|
17
|
+
# serves real HTML instead of a "Just a moment..." challenge.
|
|
18
|
+
class BrowserAgent
|
|
19
|
+
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " \
|
|
20
|
+
"(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
|
|
21
|
+
CHALLENGE_MARKERS = ["Just a moment", "challenge-platform"].freeze
|
|
22
|
+
MAX_CHALLENGE_WAIT = 30
|
|
23
|
+
|
|
24
|
+
def initialize
|
|
25
|
+
@browser = Ferrum::Browser.new(
|
|
26
|
+
headless: true,
|
|
27
|
+
timeout: 90,
|
|
28
|
+
process_timeout: 90,
|
|
29
|
+
window_size: [1366, 768],
|
|
30
|
+
browser_options: {
|
|
31
|
+
"disable-blink-features" => "AutomationControlled",
|
|
32
|
+
"disable-quic" => nil,
|
|
33
|
+
"no-sandbox" => nil
|
|
34
|
+
}
|
|
35
|
+
)
|
|
36
|
+
@browser.headers.set(
|
|
37
|
+
"Accept-Language" => "en-US,en;q=0.9",
|
|
38
|
+
"Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
|
|
39
|
+
"image/webp,*/*;q=0.8",
|
|
40
|
+
"User-Agent" => UA
|
|
41
|
+
)
|
|
42
|
+
# Pre-mask the most common headless-Chrome tells before any page JS runs.
|
|
43
|
+
@browser.evaluate_on_new_document(<<~JS)
|
|
44
|
+
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
|
|
45
|
+
Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
|
|
46
|
+
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5] });
|
|
47
|
+
window.chrome = { runtime: {} };
|
|
48
|
+
JS
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def get(url)
|
|
52
|
+
@browser.go_to(url)
|
|
53
|
+
wait_for_challenge
|
|
54
|
+
Nokogiri::HTML(@browser.body)
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def quit
|
|
58
|
+
@browser&.quit
|
|
59
|
+
ensure
|
|
60
|
+
@browser = nil
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
private
|
|
64
|
+
|
|
65
|
+
def wait_for_challenge
|
|
66
|
+
MAX_CHALLENGE_WAIT.times do
|
|
67
|
+
return unless CHALLENGE_MARKERS.any? { |m| @browser.body.include?(m) }
|
|
68
|
+
|
|
69
|
+
sleep 1
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
|
|
13
74
|
class DataFetcher < Relaton::Core::DataFetcher
|
|
14
75
|
URL = "https://www.techstreet.com/cie/searches/31156444?page=1&per_page=100"
|
|
15
76
|
|
|
16
77
|
def agent
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
@agent = Mechanize.new
|
|
20
|
-
@agent.request_headers = {
|
|
21
|
-
"Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
|
22
|
-
"Accept-Language" => "en-US,en;q=0.5",
|
|
23
|
-
"Connection" => "keep-alive",
|
|
24
|
-
"sec-ch-ua" => '"Chromium";v="91", "Google Chrome";v="91", ";Not A Brand";v="99"',
|
|
25
|
-
"Sec-Fetch-Dest" => "document"
|
|
26
|
-
}
|
|
27
|
-
@agent.user_agent_alias = "Linux Firefox"
|
|
28
|
-
@agent
|
|
78
|
+
@agent ||= BrowserAgent.new
|
|
29
79
|
end
|
|
30
80
|
|
|
31
81
|
def index
|
|
@@ -263,6 +313,8 @@ module Relaton
|
|
|
263
313
|
def fetch(_source = nil)
|
|
264
314
|
fetch_doc
|
|
265
315
|
report_errors
|
|
316
|
+
ensure
|
|
317
|
+
@agent&.quit
|
|
266
318
|
end
|
|
267
319
|
|
|
268
320
|
def fetch_doc(url = URL)
|
|
@@ -270,19 +322,28 @@ module Relaton
|
|
|
270
322
|
result.xpath("//li[@data-product]").each { |hit| parse_page hit }
|
|
271
323
|
np = result.at '//a[@class="next_page"]'
|
|
272
324
|
if np
|
|
273
|
-
|
|
325
|
+
next_href = np[:href]
|
|
326
|
+
next_url = next_href.start_with?("http") ? next_href : "https://www.techstreet.com#{next_href}"
|
|
327
|
+
fetch_doc next_url
|
|
274
328
|
else
|
|
275
329
|
index.save
|
|
276
330
|
end
|
|
277
331
|
end
|
|
278
332
|
|
|
333
|
+
RETRIABLE_ERRORS = [
|
|
334
|
+
SocketError,
|
|
335
|
+
Ferrum::TimeoutError,
|
|
336
|
+
Ferrum::PendingConnectionsError,
|
|
337
|
+
Ferrum::StatusError
|
|
338
|
+
].freeze
|
|
339
|
+
|
|
279
340
|
def time_req
|
|
280
341
|
tries = 0
|
|
281
342
|
begin
|
|
282
343
|
tries += 1
|
|
283
344
|
sleep [4 - (Time.now - @last_request_time).to_i, 0].max if @last_request_time
|
|
284
345
|
yield
|
|
285
|
-
rescue
|
|
346
|
+
rescue *RETRIABLE_ERRORS => e
|
|
286
347
|
retry if tries < 4
|
|
287
348
|
raise e
|
|
288
349
|
ensure
|
data/lib/relaton/cie/version.rb
CHANGED
data/relaton-cie.gemspec
CHANGED
|
@@ -29,6 +29,7 @@ Gem::Specification.new do |spec|
|
|
|
29
29
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
|
30
30
|
spec.require_paths = ["lib"]
|
|
31
31
|
|
|
32
|
+
spec.add_dependency "ferrum", "~> 0.17"
|
|
32
33
|
spec.add_dependency "mechanize", "~> 2.10"
|
|
33
34
|
spec.add_dependency "parslet", "~> 2.0.0"
|
|
34
35
|
spec.add_dependency "relaton-bib", "~> 2.1.0"
|
metadata
CHANGED
|
@@ -1,15 +1,29 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: relaton-cie
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.1.1
|
|
4
|
+
version: 2.1.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-05-
|
|
11
|
+
date: 2026-05-14 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: ferrum
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - "~>"
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '0.17'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - "~>"
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '0.17'
|
|
13
27
|
- !ruby/object:Gem::Dependency
|
|
14
28
|
name: mechanize
|
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|