relaton-oasis 2.1.1 → 2.2.0.pre.alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c3434e03a8bfad2eb1ae188cbc3e27908a530661267335656356e530889a3f21
4
- data.tar.gz: 2a81653e2ce01d964ec217b0d2ec0d1fd18bcc38fa4065f1ffb1029340d9a615
3
+ metadata.gz: ac8025bbbca70581bc6047bcbef3b1d81b9f110b5d91a235d43dd00201052864
4
+ data.tar.gz: 04db6b5abd8c83acfcdb46916880a47e210a98bf2c78d24c1e9809e3c7a2fd0f
5
5
  SHA512:
6
- metadata.gz: 43cec70fb75e28fcfaa9d5203c95cc2536a9050a952427192c8cf8d133b86742a98aac9b73e11884133b19f035134249af407fe91ec820b1b0aa1769121e9699
7
- data.tar.gz: 8256710e4b5699c8cefec19f0cf8919dc4d703c05d461c71a896a7ff0e1497923d7dfc0b78db59b882e07b983bcff8799aafae830b93da4c93ff0240f699e93f
6
+ metadata.gz: cee78c32f3033e7d6fd69c0697598225147b6e58329dce7fd9015b323f3b5aaec201080cb5427d80e1ab86126b48714a0f4ac78d2ff2e9bfe151af13117cc1ff
7
+ data.tar.gz: cc409df22b3c451c748d84cfe4cdf4b41f67ba338071b147e5d0b5766101306cc86b95f2ac9add1c6e3d00ce64eaa06779998de897f5cdf7aa1d4e8bedb8facc
data/Gemfile CHANGED
@@ -5,6 +5,14 @@ source "https://rubygems.org"
5
5
  # Specify your gem's dependencies in relaton_oasis.gemspec
6
6
  gemspec
7
7
 
8
+ # Use local monorepo sibling gems where available.
9
+ Dir["../*/"].each do |dir|
10
+ name = File.basename(dir)
11
+ next if name == File.basename(__dir__)
12
+ next unless File.exist?(File.join(dir, "#{name}.gemspec"))
13
+ gem name, path: dir
14
+ end
15
+
8
16
 
9
17
  gem "rake", "~> 13.0"
10
18
  gem "rspec", "~> 3.0"
@@ -0,0 +1,71 @@
1
+ require "ferrum"
2
+ require "nokogiri"
3
+
4
+ module Relaton
5
+ module Oasis
6
+ # Thin Ferrum-backed agent that drives headless Chrome with stealth tweaks
7
+ # so the Cloudflare-protected oasis-open.org host serves real HTML instead
8
+ # of a "Just a moment..." challenge. Mirrors the pattern used by
9
+ # Relaton::Cie::BrowserAgent.
10
+ class BrowserAgent
11
+ UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " \
12
+ "(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
13
+ CHALLENGE_MARKERS = ["Just a moment", "challenge-platform"].freeze
14
+ MAX_CHALLENGE_WAIT = 30
15
+
16
+ def initialize
17
+ @browser = Ferrum::Browser.new(
18
+ headless: true,
19
+ timeout: 90,
20
+ process_timeout: 90,
21
+ window_size: [1366, 768],
22
+ browser_options: {
23
+ "disable-blink-features" => "AutomationControlled",
24
+ "disable-quic" => nil,
25
+ "no-sandbox" => nil,
26
+ },
27
+ )
28
+ @browser.headers.set(
29
+ "Accept-Language" => "en-US,en;q=0.9",
30
+ "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
31
+ "image/webp,*/*;q=0.8",
32
+ "User-Agent" => UA,
33
+ )
34
+ @browser.evaluate_on_new_document(<<~JS)
35
+ Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
36
+ Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
37
+ Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5] });
38
+ window.chrome = { runtime: {} };
39
+ JS
40
+ end
41
+
42
+ def get(url)
43
+ @browser.go_to(url)
44
+ wait_for_challenge
45
+ Nokogiri::HTML(@browser.body)
46
+ end
47
+
48
+ # HTTP status code of the most recent navigation's main resource,
49
+ # or nil if no navigation has happened yet.
50
+ def last_status
51
+ @browser&.network&.status
52
+ end
53
+
54
+ def quit
55
+ @browser&.quit
56
+ ensure
57
+ @browser = nil
58
+ end
59
+
60
+ private
61
+
62
+ def wait_for_challenge
63
+ MAX_CHALLENGE_WAIT.times do
64
+ return unless CHALLENGE_MARKERS.any? { |m| @browser.body.include?(m) }
65
+
66
+ sleep 1
67
+ end
68
+ end
69
+ end
70
+ end
71
+ end
@@ -1,5 +1,5 @@
1
- require "mechanize"
2
1
  require_relative "../oasis"
2
+ require_relative "browser_agent"
3
3
  require_relative "data_parser_utils"
4
4
  require_relative "data_parser"
5
5
  require_relative "data_part_parser"
@@ -7,24 +7,47 @@ require_relative "data_part_parser"
7
7
  module Relaton
8
8
  module Oasis
9
9
  class DataFetcher < Core::DataFetcher
10
+ STANDARDS_URL = "https://www.oasis-open.org/standards/".freeze
11
+ RETRIABLE_ERRORS = [
12
+ SocketError,
13
+ Ferrum::TimeoutError,
14
+ Ferrum::PendingConnectionsError,
15
+ Ferrum::StatusError,
16
+ ].freeze
17
+
10
18
  def log_error(msg)
11
19
  Util.error msg
12
20
  end
13
21
 
14
22
  def fetch(_source = nil)
15
- agent = Mechanize.new
16
- resp = agent.get "https://www.oasis-open.org/standards/"
17
- doc = Nokogiri::HTML resp.body
23
+ doc = with_retry { agent.get(STANDARDS_URL) }
18
24
  doc.xpath("//details").map do |item|
19
- save_doc DataParser.new(item, @errors).parse
25
+ save_doc DataParser.new(item, @errors, agent: agent).parse
20
26
  fetch_parts item
21
27
  end
22
28
  index.save
23
29
  report_errors
30
+ ensure
31
+ @agent&.quit
24
32
  end
25
33
 
26
34
  private
27
35
 
36
+ def agent
37
+ @agent ||= BrowserAgent.new
38
+ end
39
+
40
+ def with_retry
41
+ tries = 0
42
+ begin
43
+ tries += 1
44
+ yield
45
+ rescue *RETRIABLE_ERRORS => e
46
+ retry if tries < 4
47
+ raise e
48
+ end
49
+ end
50
+
28
51
  def index
29
52
  @index ||= Relaton::Index.find_or_create(
30
53
  :oasis, file: "#{INDEXFILE}.yaml"
@@ -38,7 +61,9 @@ module Relaton
38
61
  parts = item.xpath(xpath)
39
62
  return unless parts.size > 1
40
63
 
41
- parts.each { |part| save_doc DataPartParser.new(part, @errors).parse }
64
+ parts.each do |part|
65
+ save_doc DataPartParser.new(part, @errors, agent: agent).parse
66
+ end
42
67
  end
43
68
 
44
69
  def save_doc(doc) # rubocop:disable Metrics/AbcSize
@@ -9,9 +9,10 @@ module Relaton
9
9
  #
10
10
  # @param [Nokogiri::HTML::Element] node document node
11
11
  #
12
- def initialize(node, errors = {})
12
+ def initialize(node, errors = {}, agent: nil)
13
13
  @node = node
14
14
  @errors = errors
15
+ @agent = agent
15
16
  end
16
17
 
17
18
  def title
@@ -158,7 +159,7 @@ module Relaton
158
159
  rels = @node.xpath(xpath)
159
160
  result = if rels.size > 1
160
161
  rels.map do |r|
161
- docid = DataPartParser.new(r).parse_docid
162
+ docid = DataPartParser.new(r, @errors, agent: @agent).parse_docid
162
163
  bib = ItemData.new(formattedref: Bib::Formattedref.new(content: docid[0].content))
163
164
  Bib::Relation.new(type: "hasPart", bibitem: bib)
164
165
  end
@@ -1,7 +1,17 @@
1
+ require "mechanize"
2
+ require_relative "browser_agent"
3
+
1
4
  module Relaton
2
5
  module Oasis
3
6
  # Common methods for document and part parsers.
4
7
  module DataParserUtils
8
+ RETRIABLE_PAGE_ERRORS = [
9
+ Errno::ETIMEDOUT,
10
+ Net::OpenTimeout,
11
+ Ferrum::TimeoutError,
12
+ Ferrum::PendingConnectionsError,
13
+ Ferrum::StatusError,
14
+ ].freeze
5
15
  #
6
16
  # Parse contributor.
7
17
  #
@@ -50,9 +60,17 @@ module Relaton
50
60
  def page
51
61
  return @page if defined? @page
52
62
 
53
- if link_node && link_node[:href].match?(/\.html$/)
63
+ @page = nil
64
+ return @page unless link_node && link_node[:href].match?(/\.html$/)
65
+
66
+ if @agent
67
+ doc = retry_page(link_node[:href], @agent)
68
+ @page = doc if doc && @agent.last_status == 200
69
+ else
70
+ # No injected agent (e.g. unit tests with VCR cassettes): fall back
71
+ # to a Mechanize request — VCR can intercept it.
54
72
  agent = Mechanize.new
55
- agent.agent.allowed_error_codes = [404]
73
+ agent.agent.allowed_error_codes = [403, 404, 503]
56
74
  resp = retry_page(link_node[:href], agent)
57
75
  @page = resp if resp && resp.code == "200"
58
76
  end
@@ -62,15 +80,15 @@ module Relaton
62
80
  # Retry to get page.
63
81
  #
64
82
  # @param [String] url page URL
65
- # @param [Mechanize] agent HTTP client
83
+ # @param [#get] agent HTTP client responding to #get(url)
66
84
  # @param [Integer] retries number of retries
67
85
  #
68
- # @return [Mechanize::Page, nil] page or nil
86
+ # @return [Nokogiri::HTML::Document, Mechanize::Page, nil] page or nil
69
87
  #
70
88
  def retry_page(url, agent, retries = 3)
71
89
  sleep 1 # to avoid 429 error
72
90
  agent.get url
73
- rescue Errno::ETIMEDOUT, Net::OpenTimeout => e
91
+ rescue *RETRIABLE_PAGE_ERRORS => e
74
92
  retry if (retries -= 1).positive?
75
93
  Util.error "Failed to get page `#{url}`\n#{e.message}"
76
94
  nil
@@ -84,7 +102,7 @@ module Relaton
84
102
  "[starts-with(., 'Editor')]]"
85
103
  page.xpath(xpath).map do |p|
86
104
  create_contribution_info(p, "editor", ["Chair"])
87
- end
105
+ end.compact
88
106
  else
89
107
  []
90
108
  end
@@ -101,7 +119,7 @@ module Relaton
101
119
  "[contains(@class, 'Title')]]"
102
120
  page.xpath(xpath).map do |p|
103
121
  create_contribution_info(p, "editor")
104
- end
122
+ end.compact
105
123
  else
106
124
  parse_editors_from_text
107
125
  end
@@ -111,6 +129,9 @@ module Relaton
111
129
 
112
130
  def create_contribution_info(person_node, type, description = [])
113
131
  name = person_node.text.match(/^[^(]+/).to_s.strip
132
+ return nil if name.empty? || !name.match?(/\A\p{L}/) ||
133
+ name.match?(%r{\A(?:https?://|urn:)})
134
+
114
135
  email, org = person_node.xpath ".//a[@href]"
115
136
  entity = create_person name, email, org
116
137
  desc = description.map { |d| Bib::LocalizedMarkedUpString.new(content: d) }
@@ -137,7 +158,15 @@ module Relaton
137
158
  [href.split(":")[1]]
138
159
  elsif (cf_email = email.at(".//span[@data-cfemail]"))
139
160
  decoded = decode_cf_email(cf_email["data-cfemail"])
140
- decoded.empty? ? [] : [decoded]
161
+ return [] if decoded.empty?
162
+
163
+ # Cloudflare obfuscates ASCII email characters in the data-cfemail
164
+ # span but leaves non-ASCII characters (e.g. the Latin "fl" ligature
165
+ # U+FB02) as plain text outside the span. Concatenate any sibling
166
+ # text and NFKC-normalize so ligatures become their ASCII equivalent.
167
+ prefix = cf_email.xpath("./preceding-sibling::node()").map(&:text).join
168
+ suffix = cf_email.xpath("./following-sibling::node()").map(&:text).join
169
+ [(prefix + decoded + suffix).unicode_normalize(:nfkc)]
141
170
  else
142
171
  []
143
172
  end
@@ -9,9 +9,10 @@ module Relaton
9
9
  #
10
10
  # @param [Nokogiri::HTML::Element] node document node
11
11
  #
12
- def initialize(node, errors = {})
12
+ def initialize(node, errors = {}, agent: nil)
13
13
  @node = node
14
14
  @errors = errors
15
+ @agent = agent
15
16
  end
16
17
 
17
18
  def text
@@ -34,11 +35,11 @@ module Relaton
34
35
  xpath = "./span[@class='citationTitle' " \
35
36
  "or @class='citeTitle']|./em|./i"
36
37
  t = @node.at(xpath)
37
- @title = if t then t.text
38
+ @title = if t
39
+ t.text
38
40
  else
39
- text.match(
40
- /(?<content>.+)\s(?:Edited|\d{2}\s\w+\d{4})/,
41
- )[:content]
41
+ m = text.match(/(?<content>.+)\s(?:Edited|\d{2}\s\w+\d{4})/)
42
+ m ? m[:content] : text
42
43
  end.strip
43
44
  end
44
45
 
@@ -82,7 +83,7 @@ module Relaton
82
83
  # @return [String] document number
83
84
  #
84
85
  def parse_docnumber
85
- ref = @node.at("./span[@class='citationLabel']/strong|./strong|b/span")
86
+ ref = @node.at("./span/strong|./strong|./b/span")
86
87
  num = ref.text.match(/[^\[\]]+/).to_s
87
88
  id = parse_errata(num)
88
89
  # some part refs need "Pt" to distinguish from root doc
@@ -110,8 +111,8 @@ module Relaton
110
111
  # @return [Array<Bib::Date>] bibliographic dates
111
112
  #
112
113
  def parse_date
113
- /(?<on>\d{1,2}\s\w+\s\d{4})/ =~ text
114
- result = [Bib::Date.new(at: Date.parse(on).to_s, type: "issued")]
114
+ match = text.match(/(?<on>\d{1,2}\s\w+\s\d{4})/)
115
+ result = match ? [Bib::Date.new(at: Date.parse(match[:on]).to_s, type: "issued")] : []
115
116
  @errors[:part_date] &&= result.empty?
116
117
  result
117
118
  end
@@ -182,7 +183,7 @@ module Relaton
182
183
  # @return [Array<Bib::Relation>] document relations
183
184
  #
184
185
  def parse_relation
185
- parser = DataParser.new @node.at("./ancestor::details")
186
+ parser = DataParser.new(@node.at("./ancestor::details"), @errors, agent: @agent)
186
187
  fref = parser.parse_docid[0].content
187
188
  bib = ItemData.new(formattedref: Bib::Formattedref.new(content: fref))
188
189
  result = [Bib::Relation.new(type: "partOf", bibitem: bib)]
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Relaton
4
4
  module Oasis
5
- VERSION = "2.1.1"
5
+ VERSION = "2.2.0.pre.alpha.1"
6
6
  end
7
7
  end
@@ -14,7 +14,7 @@ Gem::Specification.new do |spec|
14
14
  "bibliographic use using the BibliographicItem model"
15
15
  spec.homepage = "https://github.com/metanorma/relaton-oasis"
16
16
  spec.license = "BSD-2-Clause"
17
- spec.required_ruby_version = Gem::Requirement.new(">= 3.2.0")
17
+ spec.required_ruby_version = Gem::Requirement.new(">= 3.3.0")
18
18
 
19
19
  # spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"
20
20
 
@@ -31,11 +31,11 @@ Gem::Specification.new do |spec|
31
31
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
32
32
  spec.require_paths = ["lib"]
33
33
 
34
+ spec.add_dependency "ferrum", "~> 0.17"
34
35
  spec.add_dependency "mechanize", "~> 2.10"
35
- spec.add_dependency "multi_json", "~> 1.15.0"
36
- spec.add_dependency "relaton-bib", "~> 2.1.0"
37
- spec.add_dependency "relaton-core", "~> 0.0.13"
38
- spec.add_dependency "relaton-index", "~> 0.2.0"
36
+ spec.add_dependency "relaton-bib", "~> 2.2.0.pre.alpha.1"
37
+ spec.add_dependency "relaton-core", "~> 2.2.0.pre.alpha.1"
38
+ spec.add_dependency "relaton-index", "~> 2.2.0.pre.alpha.1"
39
39
 
40
40
  # For more information and examples about making a new gem, checkout our
41
41
  # guide at: https://bundler.io/guides/creating_gem.html
metadata CHANGED
@@ -1,85 +1,85 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-oasis
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.1
4
+ version: 2.2.0.pre.alpha.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-05-12 00:00:00.000000000 Z
11
+ date: 2026-06-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
- name: mechanize
14
+ name: ferrum
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '2.10'
19
+ version: '0.17'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '2.10'
26
+ version: '0.17'
27
27
  - !ruby/object:Gem::Dependency
28
- name: multi_json
28
+ name: mechanize
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
31
  - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: 1.15.0
33
+ version: '2.10'
34
34
  type: :runtime
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: 1.15.0
40
+ version: '2.10'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: relaton-bib
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: 2.1.0
47
+ version: 2.2.0.pre.alpha.1
48
48
  type: :runtime
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: 2.1.0
54
+ version: 2.2.0.pre.alpha.1
55
55
  - !ruby/object:Gem::Dependency
56
56
  name: relaton-core
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: 0.0.13
61
+ version: 2.2.0.pre.alpha.1
62
62
  type: :runtime
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: 0.0.13
68
+ version: 2.2.0.pre.alpha.1
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: relaton-index
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
73
  - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: 0.2.0
75
+ version: 2.2.0.pre.alpha.1
76
76
  type: :runtime
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: 0.2.0
82
+ version: 2.2.0.pre.alpha.1
83
83
  description: 'Relaton::Oasis: retrieve OASIS Standards for bibliographic use using
84
84
  the BibliographicItem model'
85
85
  email:
@@ -92,7 +92,6 @@ files:
92
92
  - ".github/workflows/release.yml"
93
93
  - ".gitignore"
94
94
  - ".rspec"
95
- - ".rubocop.yml"
96
95
  - CLAUDE.md
97
96
  - Gemfile
98
97
  - LICENSE.txt
@@ -101,15 +100,11 @@ files:
101
100
  - bin/console
102
101
  - bin/rspec
103
102
  - bin/setup
104
- - grammars/basicdoc.rng
105
- - grammars/biblio-standoc.rng
106
- - grammars/biblio.rng
107
- - grammars/relaton-oasis-compile.rng
108
- - grammars/relaton-oasis.rng
109
103
  - lib/relaton/oasis.rb
110
104
  - lib/relaton/oasis/bibdata.rb
111
105
  - lib/relaton/oasis/bibitem.rb
112
106
  - lib/relaton/oasis/bibliography.rb
107
+ - lib/relaton/oasis/browser_agent.rb
113
108
  - lib/relaton/oasis/data_fetcher.rb
114
109
  - lib/relaton/oasis/data_parser.rb
115
110
  - lib/relaton/oasis/data_parser_utils.rb
@@ -136,7 +131,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
136
131
  requirements:
137
132
  - - ">="
138
133
  - !ruby/object:Gem::Version
139
- version: 3.2.0
134
+ version: 3.3.0
140
135
  required_rubygems_version: !ruby/object:Gem::Requirement
141
136
  requirements:
142
137
  - - ">="
data/.rubocop.yml DELETED
@@ -1,12 +0,0 @@
1
- # This project follows the Ribose OSS style guide.
2
- # https://github.com/riboseinc/oss-guides
3
- # All project-specific additions and overrides should be specified in this file.
4
-
5
- require: rubocop-rails
6
-
7
- inherit_from:
8
- - https://raw.githubusercontent.com/riboseinc/oss-guides/master/ci/rubocop.yml
9
- AllCops:
10
- TargetRubyVersion: 3.2
11
- Rails:
12
- Enabled: false