relaton-cie 2.1.1 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4cd9e78a6899ffa70d902a2b00eccf5dcce1d2dc15fffe8925dc9b5dc84650af
4
- data.tar.gz: 31cb613d144a7aec4c7f100c116dcdcab4d23e3773c362a4e5129cb35f3dbdef
3
+ metadata.gz: 905831793252b7a1a462a6cc7e4462d0c90c3a4089894cfd3f991fffa87cbe37
4
+ data.tar.gz: 4740b925366997bd689ca50a690e86653e15afe78ea190e5a17fcc7c43b4b726
5
5
  SHA512:
6
- metadata.gz: 59e51836382654d2f186f8f6aaf68a7e74f222d0cf4f1e20894070da6822e0a59766e237754412bce1383530f4a6a730da519fd8a73a47c4a985ba7a8d0704a1
7
- data.tar.gz: 72460dcabce2101301e1b55347ca762f88b003c66d4fbd246adba213030fec121def7550f08e132891790885aa21fbddcba1c7caa5f2efbe0c1769c0b1600630
6
+ metadata.gz: 7ef9caf1b5421a8e0c7a31238f1ccb4767987827df554fc6e0fc24f56be74183e78e5a4f3093d90c0e6fa4060997288b217d15f7e91ad7bb7e2da31cc318587e
7
+ data.tar.gz: d68a2916f072bfb7113b154c38ba25e96e616fd33fb9094f7d145bdb7aa4bff27c27c02bc5f1b21e8eb21545e9b6e5ab4a89b72f56c877c5663b862281a6a4ef
@@ -2,7 +2,8 @@
2
2
 
3
3
  require "English"
4
4
  require "fileutils"
5
- require "mechanize"
5
+ require "ferrum"
6
+ require "nokogiri"
6
7
  require "relaton/index"
7
8
  require "relaton/bib"
8
9
  require "relaton/core/data_fetcher"
@@ -10,22 +11,71 @@ require_relative "../cie"
10
11
 
11
12
  module Relaton
12
13
  module Cie
14
+ # Thin Ferrum-backed HTTP agent that mimics the Mechanize#get interface
15
+ # used elsewhere in DataFetcher. Drives headless Chrome with stealth
16
+ # tweaks so the CIE catalogue host (Cloudflare-protected accuristech)
17
+ # serves real HTML instead of a "Just a moment..." challenge.
18
+ class BrowserAgent
19
+ UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " \
20
+ "(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
21
+ CHALLENGE_MARKERS = ["Just a moment", "challenge-platform"].freeze
22
+ MAX_CHALLENGE_WAIT = 30
23
+
24
+ def initialize
25
+ @browser = Ferrum::Browser.new(
26
+ headless: true,
27
+ timeout: 90,
28
+ process_timeout: 90,
29
+ window_size: [1366, 768],
30
+ browser_options: {
31
+ "disable-blink-features" => "AutomationControlled",
32
+ "disable-quic" => nil,
33
+ "no-sandbox" => nil
34
+ }
35
+ )
36
+ @browser.headers.set(
37
+ "Accept-Language" => "en-US,en;q=0.9",
38
+ "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
39
+ "image/webp,*/*;q=0.8",
40
+ "User-Agent" => UA
41
+ )
42
+ # Pre-mask the most common headless-Chrome tells before any page JS runs.
43
+ @browser.evaluate_on_new_document(<<~JS)
44
+ Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
45
+ Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
46
+ Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5] });
47
+ window.chrome = { runtime: {} };
48
+ JS
49
+ end
50
+
51
+ def get(url)
52
+ @browser.go_to(url)
53
+ wait_for_challenge
54
+ Nokogiri::HTML(@browser.body)
55
+ end
56
+
57
+ def quit
58
+ @browser&.quit
59
+ ensure
60
+ @browser = nil
61
+ end
62
+
63
+ private
64
+
65
+ def wait_for_challenge
66
+ MAX_CHALLENGE_WAIT.times do
67
+ return unless CHALLENGE_MARKERS.any? { |m| @browser.body.include?(m) }
68
+
69
+ sleep 1
70
+ end
71
+ end
72
+ end
73
+
13
74
  class DataFetcher < Relaton::Core::DataFetcher
14
75
  URL = "https://www.techstreet.com/cie/searches/31156444?page=1&per_page=100"
15
76
 
16
77
  def agent
17
- return @agent if @agent
18
-
19
- @agent = Mechanize.new
20
- @agent.request_headers = {
21
- "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
22
- "Accept-Language" => "en-US,en;q=0.5",
23
- "Connection" => "keep-alive",
24
- "sec-ch-ua" => '"Chromium";v="91", "Google Chrome";v="91", ";Not A Brand";v="99"',
25
- "Sec-Fetch-Dest" => "document"
26
- }
27
- @agent.user_agent_alias = "Linux Firefox"
28
- @agent
78
+ @agent ||= BrowserAgent.new
29
79
  end
30
80
 
31
81
  def index
@@ -263,6 +313,8 @@ module Relaton
263
313
  def fetch(_source = nil)
264
314
  fetch_doc
265
315
  report_errors
316
+ ensure
317
+ @agent&.quit
266
318
  end
267
319
 
268
320
  def fetch_doc(url = URL)
@@ -270,19 +322,28 @@ module Relaton
270
322
  result.xpath("//li[@data-product]").each { |hit| parse_page hit }
271
323
  np = result.at '//a[@class="next_page"]'
272
324
  if np
273
- fetch_doc "https://www.techstreet.com#{np[:href]}"
325
+ next_href = np[:href]
326
+ next_url = next_href.start_with?("http") ? next_href : "https://www.techstreet.com#{next_href}"
327
+ fetch_doc next_url
274
328
  else
275
329
  index.save
276
330
  end
277
331
  end
278
332
 
333
+ RETRIABLE_ERRORS = [
334
+ SocketError,
335
+ Ferrum::TimeoutError,
336
+ Ferrum::PendingConnectionsError,
337
+ Ferrum::StatusError
338
+ ].freeze
339
+
279
340
  def time_req
280
341
  tries = 0
281
342
  begin
282
343
  tries += 1
283
344
  sleep [4 - (Time.now - @last_request_time).to_i, 0].max if @last_request_time
284
345
  yield
285
- rescue SocketError => e
346
+ rescue *RETRIABLE_ERRORS => e
286
347
  retry if tries < 4
287
348
  raise e
288
349
  ensure
@@ -1,5 +1,5 @@
1
1
  module Relaton
2
2
  module Cie
3
- VERSION = "2.1.1".freeze
3
+ VERSION = "2.1.2".freeze
4
4
  end
5
5
  end
data/relaton-cie.gemspec CHANGED
@@ -29,6 +29,7 @@ Gem::Specification.new do |spec|
29
29
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
30
30
  spec.require_paths = ["lib"]
31
31
 
32
+ spec.add_dependency "ferrum", "~> 0.17"
32
33
  spec.add_dependency "mechanize", "~> 2.10"
33
34
  spec.add_dependency "parslet", "~> 2.0.0"
34
35
  spec.add_dependency "relaton-bib", "~> 2.1.0"
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-cie
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.1
4
+ version: 2.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-05-12 00:00:00.000000000 Z
11
+ date: 2026-05-14 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: ferrum
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.17'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.17'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: mechanize
15
29
  requirement: !ruby/object:Gem::Requirement