relaton-cie 2.1.1 → 2.2.0.pre.alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4cd9e78a6899ffa70d902a2b00eccf5dcce1d2dc15fffe8925dc9b5dc84650af
4
- data.tar.gz: 31cb613d144a7aec4c7f100c116dcdcab4d23e3773c362a4e5129cb35f3dbdef
3
+ metadata.gz: 23dae19e6d6a0c33a0cc45fa13849bb2d1cb7c2df5edc6c0f4e63d9ef7cdc019
4
+ data.tar.gz: 6a69081acd7bc15619eee05e1c272b9ca7a4cd40e81ead45621e6e4d38af4dcc
5
5
  SHA512:
6
- metadata.gz: 59e51836382654d2f186f8f6aaf68a7e74f222d0cf4f1e20894070da6822e0a59766e237754412bce1383530f4a6a730da519fd8a73a47c4a985ba7a8d0704a1
7
- data.tar.gz: 72460dcabce2101301e1b55347ca762f88b003c66d4fbd246adba213030fec121def7550f08e132891790885aa21fbddcba1c7caa5f2efbe0c1769c0b1600630
6
+ metadata.gz: 2d5345f1337ab5cdb9e7b47c9de21fd698dc3bbe9825a463367f8e05ca3dcf9cdd08615939154db886e62aec85145660362c018ed1ba4ce5ec72bdf47b986795
7
+ data.tar.gz: 84029b0877a63e2408e053d361740b3c2ba7cab9f0a24a2ef67cbb8d14e4c08e88dbbcc1177d5a19805862255b0ec815318c7bbbef1ee8608e1bb42723d72902
data/Gemfile CHANGED
@@ -3,6 +3,14 @@ source "https://rubygems.org"
3
3
  # Specify your gem's dependencies in relaton_cie.gemspec
4
4
  gemspec
5
5
 
6
+ # Use local monorepo sibling gems where available.
7
+ Dir["../*/"].each do |dir|
8
+ name = File.basename(dir)
9
+ next if name == File.basename(__dir__)
10
+ next unless File.exist?(File.join(dir, "#{name}.gemspec"))
11
+ gem name, path: dir
12
+ end
13
+
6
14
 
7
15
  gem "equivalent-xml", "~> 0.6"
8
16
  gem "pry-byebug"
@@ -2,7 +2,8 @@
2
2
 
3
3
  require "English"
4
4
  require "fileutils"
5
- require "mechanize"
5
+ require "ferrum"
6
+ require "nokogiri"
6
7
  require "relaton/index"
7
8
  require "relaton/bib"
8
9
  require "relaton/core/data_fetcher"
@@ -10,22 +11,71 @@ require_relative "../cie"
10
11
 
11
12
  module Relaton
12
13
  module Cie
14
+ # Thin Ferrum-backed HTTP agent that mimics the Mechanize#get interface
15
+ # used elsewhere in DataFetcher. Drives headless Chrome with stealth
16
+ # tweaks so the CIE catalogue host (Cloudflare-protected accuristech)
17
+ # serves real HTML instead of a "Just a moment..." challenge.
18
+ class BrowserAgent
19
+ UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " \
20
+ "(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
21
+ CHALLENGE_MARKERS = ["Just a moment", "challenge-platform"].freeze
22
+ MAX_CHALLENGE_WAIT = 30
23
+
24
+ def initialize
25
+ @browser = Ferrum::Browser.new(
26
+ headless: true,
27
+ timeout: 90,
28
+ process_timeout: 90,
29
+ window_size: [1366, 768],
30
+ browser_options: {
31
+ "disable-blink-features" => "AutomationControlled",
32
+ "disable-quic" => nil,
33
+ "no-sandbox" => nil
34
+ }
35
+ )
36
+ @browser.headers.set(
37
+ "Accept-Language" => "en-US,en;q=0.9",
38
+ "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
39
+ "image/webp,*/*;q=0.8",
40
+ "User-Agent" => UA
41
+ )
42
+ # Pre-mask the most common headless-Chrome tells before any page JS runs.
43
+ @browser.evaluate_on_new_document(<<~JS)
44
+ Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
45
+ Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
46
+ Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5] });
47
+ window.chrome = { runtime: {} };
48
+ JS
49
+ end
50
+
51
+ def get(url)
52
+ @browser.go_to(url)
53
+ wait_for_challenge
54
+ Nokogiri::HTML(@browser.body)
55
+ end
56
+
57
+ def quit
58
+ @browser&.quit
59
+ ensure
60
+ @browser = nil
61
+ end
62
+
63
+ private
64
+
65
+ def wait_for_challenge
66
+ MAX_CHALLENGE_WAIT.times do
67
+ return unless CHALLENGE_MARKERS.any? { |m| @browser.body.include?(m) }
68
+
69
+ sleep 1
70
+ end
71
+ end
72
+ end
73
+
13
74
  class DataFetcher < Relaton::Core::DataFetcher
14
75
  URL = "https://www.techstreet.com/cie/searches/31156444?page=1&per_page=100"
15
76
 
16
77
  def agent
17
- return @agent if @agent
18
-
19
- @agent = Mechanize.new
20
- @agent.request_headers = {
21
- "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
22
- "Accept-Language" => "en-US,en;q=0.5",
23
- "Connection" => "keep-alive",
24
- "sec-ch-ua" => '"Chromium";v="91", "Google Chrome";v="91", ";Not A Brand";v="99"',
25
- "Sec-Fetch-Dest" => "document"
26
- }
27
- @agent.user_agent_alias = "Linux Firefox"
28
- @agent
78
+ @agent ||= BrowserAgent.new
29
79
  end
30
80
 
31
81
  def index
@@ -263,6 +313,8 @@ module Relaton
263
313
  def fetch(_source = nil)
264
314
  fetch_doc
265
315
  report_errors
316
+ ensure
317
+ @agent&.quit
266
318
  end
267
319
 
268
320
  def fetch_doc(url = URL)
@@ -270,19 +322,28 @@ module Relaton
270
322
  result.xpath("//li[@data-product]").each { |hit| parse_page hit }
271
323
  np = result.at '//a[@class="next_page"]'
272
324
  if np
273
- fetch_doc "https://www.techstreet.com#{np[:href]}"
325
+ next_href = np[:href]
326
+ next_url = next_href.start_with?("http") ? next_href : "https://www.techstreet.com#{next_href}"
327
+ fetch_doc next_url
274
328
  else
275
329
  index.save
276
330
  end
277
331
  end
278
332
 
333
+ RETRIABLE_ERRORS = [
334
+ SocketError,
335
+ Ferrum::TimeoutError,
336
+ Ferrum::PendingConnectionsError,
337
+ Ferrum::StatusError
338
+ ].freeze
339
+
279
340
  def time_req
280
341
  tries = 0
281
342
  begin
282
343
  tries += 1
283
344
  sleep [4 - (Time.now - @last_request_time).to_i, 0].max if @last_request_time
284
345
  yield
285
- rescue SocketError => e
346
+ rescue *RETRIABLE_ERRORS => e
286
347
  retry if tries < 4
287
348
  raise e
288
349
  ensure
@@ -1,5 +1,5 @@
1
1
  module Relaton
2
2
  module Cie
3
- VERSION = "2.1.1".freeze
3
+ VERSION = "2.2.0.pre.alpha.1".freeze
4
4
  end
5
5
  end
data/relaton-cie.gemspec CHANGED
@@ -16,7 +16,7 @@ Gem::Specification.new do |spec|
16
16
  DESCRIPTION
17
17
  spec.homepage = "https://github.com/metanorma/relaton-cie"
18
18
  spec.license = "BSD-2-Clause"
19
- spec.required_ruby_version = Gem::Requirement.new(">= 3.2.0")
19
+ spec.required_ruby_version = Gem::Requirement.new(">= 3.3.0")
20
20
 
21
21
  spec.metadata["homepage_uri"] = spec.homepage
22
22
 
@@ -29,9 +29,9 @@ Gem::Specification.new do |spec|
29
29
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
30
30
  spec.require_paths = ["lib"]
31
31
 
32
+ spec.add_dependency "ferrum", "~> 0.17"
32
33
  spec.add_dependency "mechanize", "~> 2.10"
33
- spec.add_dependency "parslet", "~> 2.0.0"
34
- spec.add_dependency "relaton-bib", "~> 2.1.0"
35
- spec.add_dependency "relaton-core", "~> 0.0.12"
36
- spec.add_dependency "relaton-index", "~> 0.2.0"
34
+ spec.add_dependency "relaton-bib", "~> 2.2.0.pre.alpha.1"
35
+ spec.add_dependency "relaton-core", "~> 2.2.0.pre.alpha.1"
36
+ spec.add_dependency "relaton-index", "~> 2.2.0.pre.alpha.1"
37
37
  end
metadata CHANGED
@@ -1,85 +1,85 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-cie
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.1
4
+ version: 2.2.0.pre.alpha.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-05-12 00:00:00.000000000 Z
11
+ date: 2026-06-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: mechanize
14
+ name: ferrum
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '2.10'
19
+ version: '0.17'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '2.10'
26
+ version: '0.17'
27
27
  - !ruby/object:Gem::Dependency
28
- name: parslet
28
+ name: mechanize
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: 2.0.0
33
+ version: '2.10'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: 2.0.0
40
+ version: '2.10'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: relaton-bib
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: 2.1.0
47
+ version: 2.2.0.pre.alpha.1
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: 2.1.0
54
+ version: 2.2.0.pre.alpha.1
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: relaton-core
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: 0.0.12
61
+ version: 2.2.0.pre.alpha.1
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: 0.0.12
68
+ version: 2.2.0.pre.alpha.1
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: relaton-index
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
73
  - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: 0.2.0
75
+ version: 2.2.0.pre.alpha.1
76
76
  type: :runtime
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: 0.2.0
82
+ version: 2.2.0.pre.alpha.1
83
83
  description: |
84
84
  Relaton::Cie: retrieve CIE Standards for bibliographic use
85
85
  using the BibliographicItem model.
@@ -93,7 +93,6 @@ files:
93
93
  - ".github/workflows/release.yml"
94
94
  - ".gitignore"
95
95
  - ".rspec"
96
- - ".rubocop.yml"
97
96
  - CLAUDE.md
98
97
  - Gemfile
99
98
  - LICENSE.txt
@@ -102,11 +101,6 @@ files:
102
101
  - bin/console
103
102
  - bin/rspec
104
103
  - bin/setup
105
- - grammars/basicdoc.rng
106
- - grammars/biblio-standoc.rng
107
- - grammars/biblio.rng
108
- - grammars/relaton-cie-compile.rng
109
- - grammars/relaton-cie.rng
110
104
  - lib/relaton/cie.rb
111
105
  - lib/relaton/cie/bibdata.rb
112
106
  - lib/relaton/cie/bibitem.rb
@@ -133,7 +127,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
133
127
  requirements:
134
128
  - - ">="
135
129
  - !ruby/object:Gem::Version
136
- version: 3.2.0
130
+ version: 3.3.0
137
131
  required_rubygems_version: !ruby/object:Gem::Requirement
138
132
  requirements:
139
133
  - - ">="
data/.rubocop.yml DELETED
@@ -1,12 +0,0 @@
1
- # This project follows the Ribose OSS style guide.
2
- # https://github.com/riboseinc/oss-guides
3
- # All project-specific additions and overrides should be specified in this file.
4
-
5
- require: rubocop-rails
6
-
7
- inherit_from:
8
- - https://raw.githubusercontent.com/riboseinc/oss-guides/master/ci/rubocop.yml
9
- AllCops:
10
- TargetRubyVersion: 3.2
11
- Rails:
12
- Enabled: false