relaton-cie 2.1.1 → 2.2.0.pre.alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +8 -0
- data/lib/relaton/cie/data_fetcher.rb +76 -15
- data/lib/relaton/cie/version.rb +1 -1
- data/relaton-cie.gemspec +5 -5
- metadata +15 -21
- data/.rubocop.yml +0 -12
- data/grammars/basicdoc.rng +0 -2140
- data/grammars/biblio-standoc.rng +0 -268
- data/grammars/biblio.rng +0 -2125
- data/grammars/relaton-cie-compile.rng +0 -11
- data/grammars/relaton-cie.rng +0 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 23dae19e6d6a0c33a0cc45fa13849bb2d1cb7c2df5edc6c0f4e63d9ef7cdc019
|
|
4
|
+
data.tar.gz: 6a69081acd7bc15619eee05e1c272b9ca7a4cd40e81ead45621e6e4d38af4dcc
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 2d5345f1337ab5cdb9e7b47c9de21fd698dc3bbe9825a463367f8e05ca3dcf9cdd08615939154db886e62aec85145660362c018ed1ba4ce5ec72bdf47b986795
|
|
7
|
+
data.tar.gz: 84029b0877a63e2408e053d361740b3c2ba7cab9f0a24a2ef67cbb8d14e4c08e88dbbcc1177d5a19805862255b0ec815318c7bbbef1ee8608e1bb42723d72902
|
data/Gemfile
CHANGED
|
@@ -3,6 +3,14 @@ source "https://rubygems.org"
|
|
|
3
3
|
# Specify your gem's dependencies in relaton_cie.gemspec
|
|
4
4
|
gemspec
|
|
5
5
|
|
|
6
|
+
# Use local monorepo sibling gems where available.
|
|
7
|
+
Dir["../*/"].each do |dir|
|
|
8
|
+
name = File.basename(dir)
|
|
9
|
+
next if name == File.basename(__dir__)
|
|
10
|
+
next unless File.exist?(File.join(dir, "#{name}.gemspec"))
|
|
11
|
+
gem name, path: dir
|
|
12
|
+
end
|
|
13
|
+
|
|
6
14
|
|
|
7
15
|
gem "equivalent-xml", "~> 0.6"
|
|
8
16
|
gem "pry-byebug"
|
|
data/lib/relaton/cie/data_fetcher.rb
CHANGED
|
@@ -2,7 +2,8 @@
|
|
|
2
2
|
|
|
3
3
|
require "English"
|
|
4
4
|
require "fileutils"
|
|
5
|
-
require "mechanize"
|
|
5
|
+
require "ferrum"
|
|
6
|
+
require "nokogiri"
|
|
6
7
|
require "relaton/index"
|
|
7
8
|
require "relaton/bib"
|
|
8
9
|
require "relaton/core/data_fetcher"
|
|
@@ -10,22 +11,71 @@ require_relative "../cie"
|
|
|
10
11
|
|
|
11
12
|
module Relaton
|
|
12
13
|
module Cie
|
|
14
|
+
# Thin Ferrum-backed HTTP agent that mimics the Mechanize#get interface
|
|
15
|
+
# used elsewhere in DataFetcher. Drives headless Chrome with stealth
|
|
16
|
+
# tweaks so the CIE catalogue host (Cloudflare-protected accuristech)
|
|
17
|
+
# serves real HTML instead of a "Just a moment..." challenge.
|
|
18
|
+
class BrowserAgent
|
|
19
|
+
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " \
|
|
20
|
+
"(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
|
|
21
|
+
CHALLENGE_MARKERS = ["Just a moment", "challenge-platform"].freeze
|
|
22
|
+
MAX_CHALLENGE_WAIT = 30
|
|
23
|
+
|
|
24
|
+
def initialize
|
|
25
|
+
@browser = Ferrum::Browser.new(
|
|
26
|
+
headless: true,
|
|
27
|
+
timeout: 90,
|
|
28
|
+
process_timeout: 90,
|
|
29
|
+
window_size: [1366, 768],
|
|
30
|
+
browser_options: {
|
|
31
|
+
"disable-blink-features" => "AutomationControlled",
|
|
32
|
+
"disable-quic" => nil,
|
|
33
|
+
"no-sandbox" => nil
|
|
34
|
+
}
|
|
35
|
+
)
|
|
36
|
+
@browser.headers.set(
|
|
37
|
+
"Accept-Language" => "en-US,en;q=0.9",
|
|
38
|
+
"Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
|
|
39
|
+
"image/webp,*/*;q=0.8",
|
|
40
|
+
"User-Agent" => UA
|
|
41
|
+
)
|
|
42
|
+
# Pre-mask the most common headless-Chrome tells before any page JS runs.
|
|
43
|
+
@browser.evaluate_on_new_document(<<~JS)
|
|
44
|
+
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
|
|
45
|
+
Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
|
|
46
|
+
Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5] });
|
|
47
|
+
window.chrome = { runtime: {} };
|
|
48
|
+
JS
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
def get(url)
|
|
52
|
+
@browser.go_to(url)
|
|
53
|
+
wait_for_challenge
|
|
54
|
+
Nokogiri::HTML(@browser.body)
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def quit
|
|
58
|
+
@browser&.quit
|
|
59
|
+
ensure
|
|
60
|
+
@browser = nil
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
private
|
|
64
|
+
|
|
65
|
+
def wait_for_challenge
|
|
66
|
+
MAX_CHALLENGE_WAIT.times do
|
|
67
|
+
return unless CHALLENGE_MARKERS.any? { |m| @browser.body.include?(m) }
|
|
68
|
+
|
|
69
|
+
sleep 1
|
|
70
|
+
end
|
|
71
|
+
end
|
|
72
|
+
end
|
|
73
|
+
|
|
13
74
|
class DataFetcher < Relaton::Core::DataFetcher
|
|
14
75
|
URL = "https://www.techstreet.com/cie/searches/31156444?page=1&per_page=100"
|
|
15
76
|
|
|
16
77
|
def agent
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
@agent = Mechanize.new
|
|
20
|
-
@agent.request_headers = {
|
|
21
|
-
"Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
|
|
22
|
-
"Accept-Language" => "en-US,en;q=0.5",
|
|
23
|
-
"Connection" => "keep-alive",
|
|
24
|
-
"sec-ch-ua" => '"Chromium";v="91", "Google Chrome";v="91", ";Not A Brand";v="99"',
|
|
25
|
-
"Sec-Fetch-Dest" => "document"
|
|
26
|
-
}
|
|
27
|
-
@agent.user_agent_alias = "Linux Firefox"
|
|
28
|
-
@agent
|
|
78
|
+
@agent ||= BrowserAgent.new
|
|
29
79
|
end
|
|
30
80
|
|
|
31
81
|
def index
|
|
@@ -263,6 +313,8 @@ module Relaton
|
|
|
263
313
|
def fetch(_source = nil)
|
|
264
314
|
fetch_doc
|
|
265
315
|
report_errors
|
|
316
|
+
ensure
|
|
317
|
+
@agent&.quit
|
|
266
318
|
end
|
|
267
319
|
|
|
268
320
|
def fetch_doc(url = URL)
|
|
@@ -270,19 +322,28 @@ module Relaton
|
|
|
270
322
|
result.xpath("//li[@data-product]").each { |hit| parse_page hit }
|
|
271
323
|
np = result.at '//a[@class="next_page"]'
|
|
272
324
|
if np
|
|
273
|
-
|
|
325
|
+
next_href = np[:href]
|
|
326
|
+
next_url = next_href.start_with?("http") ? next_href : "https://www.techstreet.com#{next_href}"
|
|
327
|
+
fetch_doc next_url
|
|
274
328
|
else
|
|
275
329
|
index.save
|
|
276
330
|
end
|
|
277
331
|
end
|
|
278
332
|
|
|
333
|
+
RETRIABLE_ERRORS = [
|
|
334
|
+
SocketError,
|
|
335
|
+
Ferrum::TimeoutError,
|
|
336
|
+
Ferrum::PendingConnectionsError,
|
|
337
|
+
Ferrum::StatusError
|
|
338
|
+
].freeze
|
|
339
|
+
|
|
279
340
|
def time_req
|
|
280
341
|
tries = 0
|
|
281
342
|
begin
|
|
282
343
|
tries += 1
|
|
283
344
|
sleep [4 - (Time.now - @last_request_time).to_i, 0].max if @last_request_time
|
|
284
345
|
yield
|
|
285
|
-
rescue
|
|
346
|
+
rescue *RETRIABLE_ERRORS => e
|
|
286
347
|
retry if tries < 4
|
|
287
348
|
raise e
|
|
288
349
|
ensure
|
data/lib/relaton/cie/version.rb
CHANGED
data/relaton-cie.gemspec
CHANGED
|
@@ -16,7 +16,7 @@ Gem::Specification.new do |spec|
|
|
|
16
16
|
DESCRIPTION
|
|
17
17
|
spec.homepage = "https://github.com/metanorma/relaton-cie"
|
|
18
18
|
spec.license = "BSD-2-Clause"
|
|
19
|
-
spec.required_ruby_version = Gem::Requirement.new(">= 3.
|
|
19
|
+
spec.required_ruby_version = Gem::Requirement.new(">= 3.3.0")
|
|
20
20
|
|
|
21
21
|
spec.metadata["homepage_uri"] = spec.homepage
|
|
22
22
|
|
|
@@ -29,9 +29,9 @@ Gem::Specification.new do |spec|
|
|
|
29
29
|
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
|
30
30
|
spec.require_paths = ["lib"]
|
|
31
31
|
|
|
32
|
+
spec.add_dependency "ferrum", "~> 0.17"
|
|
32
33
|
spec.add_dependency "mechanize", "~> 2.10"
|
|
33
|
-
spec.add_dependency "
|
|
34
|
-
spec.add_dependency "relaton-
|
|
35
|
-
spec.add_dependency "relaton-
|
|
36
|
-
spec.add_dependency "relaton-index", "~> 0.2.0"
|
|
34
|
+
spec.add_dependency "relaton-bib", "~> 2.2.0.pre.alpha.1"
|
|
35
|
+
spec.add_dependency "relaton-core", "~> 2.2.0.pre.alpha.1"
|
|
36
|
+
spec.add_dependency "relaton-index", "~> 2.2.0.pre.alpha.1"
|
|
37
37
|
end
|
metadata
CHANGED
|
@@ -1,85 +1,85 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: relaton-cie
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.1.1
|
|
4
|
+
version: 2.2.0.pre.alpha.1
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-06-26 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
|
-
name:
|
|
14
|
+
name: ferrum
|
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
|
16
16
|
requirements:
|
|
17
17
|
- - "~>"
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
|
-
version: '
|
|
19
|
+
version: '0.17'
|
|
20
20
|
type: :runtime
|
|
21
21
|
prerelease: false
|
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
24
|
- - "~>"
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
|
-
version: '
|
|
26
|
+
version: '0.17'
|
|
27
27
|
- !ruby/object:Gem::Dependency
|
|
28
|
-
name:
|
|
28
|
+
name: mechanize
|
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
|
30
30
|
requirements:
|
|
31
31
|
- - "~>"
|
|
32
32
|
- !ruby/object:Gem::Version
|
|
33
|
-
version: 2.
|
|
33
|
+
version: '2.10'
|
|
34
34
|
type: :runtime
|
|
35
35
|
prerelease: false
|
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
|
37
37
|
requirements:
|
|
38
38
|
- - "~>"
|
|
39
39
|
- !ruby/object:Gem::Version
|
|
40
|
-
version: 2.
|
|
40
|
+
version: '2.10'
|
|
41
41
|
- !ruby/object:Gem::Dependency
|
|
42
42
|
name: relaton-bib
|
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
|
44
44
|
requirements:
|
|
45
45
|
- - "~>"
|
|
46
46
|
- !ruby/object:Gem::Version
|
|
47
|
-
version: 2.
|
|
47
|
+
version: 2.2.0.pre.alpha.1
|
|
48
48
|
type: :runtime
|
|
49
49
|
prerelease: false
|
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
|
51
51
|
requirements:
|
|
52
52
|
- - "~>"
|
|
53
53
|
- !ruby/object:Gem::Version
|
|
54
|
-
version: 2.
|
|
54
|
+
version: 2.2.0.pre.alpha.1
|
|
55
55
|
- !ruby/object:Gem::Dependency
|
|
56
56
|
name: relaton-core
|
|
57
57
|
requirement: !ruby/object:Gem::Requirement
|
|
58
58
|
requirements:
|
|
59
59
|
- - "~>"
|
|
60
60
|
- !ruby/object:Gem::Version
|
|
61
|
-
version:
|
|
61
|
+
version: 2.2.0.pre.alpha.1
|
|
62
62
|
type: :runtime
|
|
63
63
|
prerelease: false
|
|
64
64
|
version_requirements: !ruby/object:Gem::Requirement
|
|
65
65
|
requirements:
|
|
66
66
|
- - "~>"
|
|
67
67
|
- !ruby/object:Gem::Version
|
|
68
|
-
version:
|
|
68
|
+
version: 2.2.0.pre.alpha.1
|
|
69
69
|
- !ruby/object:Gem::Dependency
|
|
70
70
|
name: relaton-index
|
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
|
72
72
|
requirements:
|
|
73
73
|
- - "~>"
|
|
74
74
|
- !ruby/object:Gem::Version
|
|
75
|
-
version: 0.2.0
|
|
75
|
+
version: 2.2.0.pre.alpha.1
|
|
76
76
|
type: :runtime
|
|
77
77
|
prerelease: false
|
|
78
78
|
version_requirements: !ruby/object:Gem::Requirement
|
|
79
79
|
requirements:
|
|
80
80
|
- - "~>"
|
|
81
81
|
- !ruby/object:Gem::Version
|
|
82
|
-
version: 0.2.0
|
|
82
|
+
version: 2.2.0.pre.alpha.1
|
|
83
83
|
description: |
|
|
84
84
|
Relaton::Cie: retrieve CIE Standards for bibliographic use
|
|
85
85
|
using the BibliographicItem model.
|
|
@@ -93,7 +93,6 @@ files:
|
|
|
93
93
|
- ".github/workflows/release.yml"
|
|
94
94
|
- ".gitignore"
|
|
95
95
|
- ".rspec"
|
|
96
|
-
- ".rubocop.yml"
|
|
97
96
|
- CLAUDE.md
|
|
98
97
|
- Gemfile
|
|
99
98
|
- LICENSE.txt
|
|
@@ -102,11 +101,6 @@ files:
|
|
|
102
101
|
- bin/console
|
|
103
102
|
- bin/rspec
|
|
104
103
|
- bin/setup
|
|
105
|
-
- grammars/basicdoc.rng
|
|
106
|
-
- grammars/biblio-standoc.rng
|
|
107
|
-
- grammars/biblio.rng
|
|
108
|
-
- grammars/relaton-cie-compile.rng
|
|
109
|
-
- grammars/relaton-cie.rng
|
|
110
104
|
- lib/relaton/cie.rb
|
|
111
105
|
- lib/relaton/cie/bibdata.rb
|
|
112
106
|
- lib/relaton/cie/bibitem.rb
|
|
@@ -133,7 +127,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
133
127
|
requirements:
|
|
134
128
|
- - ">="
|
|
135
129
|
- !ruby/object:Gem::Version
|
|
136
|
-
version: 3.
|
|
130
|
+
version: 3.3.0
|
|
137
131
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
138
132
|
requirements:
|
|
139
133
|
- - ">="
|
data/.rubocop.yml
DELETED
|
@@ -1,12 +0,0 @@
|
|
|
1
|
-
# This project follows the Ribose OSS style guide.
|
|
2
|
-
# https://github.com/riboseinc/oss-guides
|
|
3
|
-
# All project-specific additions and overrides should be specified in this file.
|
|
4
|
-
|
|
5
|
-
require: rubocop-rails
|
|
6
|
-
|
|
7
|
-
inherit_from:
|
|
8
|
-
- https://raw.githubusercontent.com/riboseinc/oss-guides/master/ci/rubocop.yml
|
|
9
|
-
AllCops:
|
|
10
|
-
TargetRubyVersion: 3.2
|
|
11
|
-
Rails:
|
|
12
|
-
Enabled: false
|