relaton-oasis 2.1.1 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c3434e03a8bfad2eb1ae188cbc3e27908a530661267335656356e530889a3f21
4
- data.tar.gz: 2a81653e2ce01d964ec217b0d2ec0d1fd18bcc38fa4065f1ffb1029340d9a615
3
+ metadata.gz: b09904cb15aa82500946dfe0d7c3a3d3edd08c391244ef56208e077c8b322d9e
4
+ data.tar.gz: f16f7b98dcde027268ea89622f595c57290f6dc9845f075d240a50b9ff7996d6
5
5
  SHA512:
6
- metadata.gz: 43cec70fb75e28fcfaa9d5203c95cc2536a9050a952427192c8cf8d133b86742a98aac9b73e11884133b19f035134249af407fe91ec820b1b0aa1769121e9699
7
- data.tar.gz: 8256710e4b5699c8cefec19f0cf8919dc4d703c05d461c71a896a7ff0e1497923d7dfc0b78db59b882e07b983bcff8799aafae830b93da4c93ff0240f699e93f
6
+ metadata.gz: edbc0cb1b32818941d92dd5c5d01854205374fd4dfd23b84457bc03de3dacbd5f87c9a831a0bcf7f07ff534bae5d6a32e652fb343f48508f426ce2d1749384a7
7
+ data.tar.gz: 3abbf12c779c34649b0974e331ed529bd7ff65a16d2f9f7f566d96067137920c99f08c5779fcd2bfd8aecd2f07b7e7969fe4f3e7237e17ff62d29783189a6c17
@@ -0,0 +1,71 @@
1
+ require "ferrum"
2
+ require "nokogiri"
3
+
4
+ module Relaton
5
+ module Oasis
6
+ # Thin Ferrum-backed agent that drives headless Chrome with stealth tweaks
7
+ # so the Cloudflare-protected oasis-open.org host serves real HTML instead
8
+ # of a "Just a moment..." challenge. Mirrors the pattern used by
9
+ # Relaton::Cie::BrowserAgent.
10
+ class BrowserAgent
11
+ UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " \
12
+ "(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
13
+ CHALLENGE_MARKERS = ["Just a moment", "challenge-platform"].freeze
14
+ MAX_CHALLENGE_WAIT = 30
15
+
16
+ def initialize
17
+ @browser = Ferrum::Browser.new(
18
+ headless: true,
19
+ timeout: 90,
20
+ process_timeout: 90,
21
+ window_size: [1366, 768],
22
+ browser_options: {
23
+ "disable-blink-features" => "AutomationControlled",
24
+ "disable-quic" => nil,
25
+ "no-sandbox" => nil,
26
+ },
27
+ )
28
+ @browser.headers.set(
29
+ "Accept-Language" => "en-US,en;q=0.9",
30
+ "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
31
+ "image/webp,*/*;q=0.8",
32
+ "User-Agent" => UA,
33
+ )
34
+ @browser.evaluate_on_new_document(<<~JS)
35
+ Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
36
+ Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
37
+ Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5] });
38
+ window.chrome = { runtime: {} };
39
+ JS
40
+ end
41
+
42
+ def get(url)
43
+ @browser.go_to(url)
44
+ wait_for_challenge
45
+ Nokogiri::HTML(@browser.body)
46
+ end
47
+
48
+ # HTTP status code of the most recent navigation's main resource,
49
+ # or nil if no navigation has happened yet.
50
+ def last_status
51
+ @browser&.network&.status
52
+ end
53
+
54
+ def quit
55
+ @browser&.quit
56
+ ensure
57
+ @browser = nil
58
+ end
59
+
60
+ private
61
+
62
+ def wait_for_challenge
63
+ MAX_CHALLENGE_WAIT.times do
64
+ return unless CHALLENGE_MARKERS.any? { |m| @browser.body.include?(m) }
65
+
66
+ sleep 1
67
+ end
68
+ end
69
+ end
70
+ end
71
+ end
@@ -1,5 +1,5 @@
1
- require "mechanize"
2
1
  require_relative "../oasis"
2
+ require_relative "browser_agent"
3
3
  require_relative "data_parser_utils"
4
4
  require_relative "data_parser"
5
5
  require_relative "data_part_parser"
@@ -7,24 +7,47 @@ require_relative "data_part_parser"
7
7
  module Relaton
8
8
  module Oasis
9
9
  class DataFetcher < Core::DataFetcher
10
+ STANDARDS_URL = "https://www.oasis-open.org/standards/".freeze
11
+ RETRIABLE_ERRORS = [
12
+ SocketError,
13
+ Ferrum::TimeoutError,
14
+ Ferrum::PendingConnectionsError,
15
+ Ferrum::StatusError,
16
+ ].freeze
17
+
10
18
  def log_error(msg)
11
19
  Util.error msg
12
20
  end
13
21
 
14
22
  def fetch(_source = nil)
15
- agent = Mechanize.new
16
- resp = agent.get "https://www.oasis-open.org/standards/"
17
- doc = Nokogiri::HTML resp.body
23
+ doc = with_retry { agent.get(STANDARDS_URL) }
18
24
  doc.xpath("//details").map do |item|
19
- save_doc DataParser.new(item, @errors).parse
25
+ save_doc DataParser.new(item, @errors, agent: agent).parse
20
26
  fetch_parts item
21
27
  end
22
28
  index.save
23
29
  report_errors
30
+ ensure
31
+ @agent&.quit
24
32
  end
25
33
 
26
34
  private
27
35
 
36
+ def agent
37
+ @agent ||= BrowserAgent.new
38
+ end
39
+
40
+ def with_retry
41
+ tries = 0
42
+ begin
43
+ tries += 1
44
+ yield
45
+ rescue *RETRIABLE_ERRORS => e
46
+ retry if tries < 4
47
+ raise e
48
+ end
49
+ end
50
+
28
51
  def index
29
52
  @index ||= Relaton::Index.find_or_create(
30
53
  :oasis, file: "#{INDEXFILE}.yaml"
@@ -38,7 +61,9 @@ module Relaton
38
61
  parts = item.xpath(xpath)
39
62
  return unless parts.size > 1
40
63
 
41
- parts.each { |part| save_doc DataPartParser.new(part, @errors).parse }
64
+ parts.each do |part|
65
+ save_doc DataPartParser.new(part, @errors, agent: agent).parse
66
+ end
42
67
  end
43
68
 
44
69
  def save_doc(doc) # rubocop:disable Metrics/AbcSize
@@ -9,9 +9,10 @@ module Relaton
9
9
  #
10
10
  # @param [Nokogiri::HTML::Element] node document node
11
11
  #
12
- def initialize(node, errors = {})
12
+ def initialize(node, errors = {}, agent: nil)
13
13
  @node = node
14
14
  @errors = errors
15
+ @agent = agent
15
16
  end
16
17
 
17
18
  def title
@@ -158,7 +159,7 @@ module Relaton
158
159
  rels = @node.xpath(xpath)
159
160
  result = if rels.size > 1
160
161
  rels.map do |r|
161
- docid = DataPartParser.new(r).parse_docid
162
+ docid = DataPartParser.new(r, @errors, agent: @agent).parse_docid
162
163
  bib = ItemData.new(formattedref: Bib::Formattedref.new(content: docid[0].content))
163
164
  Bib::Relation.new(type: "hasPart", bibitem: bib)
164
165
  end
@@ -1,7 +1,17 @@
1
+ require "mechanize"
2
+ require_relative "browser_agent"
3
+
1
4
  module Relaton
2
5
  module Oasis
3
6
  # Common methods for document and part parsers.
4
7
  module DataParserUtils
8
+ RETRIABLE_PAGE_ERRORS = [
9
+ Errno::ETIMEDOUT,
10
+ Net::OpenTimeout,
11
+ Ferrum::TimeoutError,
12
+ Ferrum::PendingConnectionsError,
13
+ Ferrum::StatusError,
14
+ ].freeze
5
15
  #
6
16
  # Parse contributor.
7
17
  #
@@ -50,9 +60,17 @@ module Relaton
50
60
  def page
51
61
  return @page if defined? @page
52
62
 
53
- if link_node && link_node[:href].match?(/\.html$/)
63
+ @page = nil
64
+ return @page unless link_node && link_node[:href].match?(/\.html$/)
65
+
66
+ if @agent
67
+ doc = retry_page(link_node[:href], @agent)
68
+ @page = doc if doc && @agent.last_status == 200
69
+ else
70
+ # No injected agent (e.g. unit tests with VCR cassettes): fall back
71
+ # to a Mechanize request — VCR can intercept it.
54
72
  agent = Mechanize.new
55
- agent.agent.allowed_error_codes = [404]
73
+ agent.agent.allowed_error_codes = [403, 404, 503]
56
74
  resp = retry_page(link_node[:href], agent)
57
75
  @page = resp if resp && resp.code == "200"
58
76
  end
@@ -62,15 +80,15 @@ module Relaton
62
80
  # Retry to get page.
63
81
  #
64
82
  # @param [String] url page URL
65
- # @param [Mechanize] agent HTTP client
83
+ # @param [#get] agent HTTP client responding to #get(url)
66
84
  # @param [Integer] retries number of retries
67
85
  #
68
- # @return [Mechanize::Page, nil] page or nil
86
+ # @return [Nokogiri::HTML::Document, Mechanize::Page, nil] page or nil
69
87
  #
70
88
  def retry_page(url, agent, retries = 3)
71
89
  sleep 1 # to avoid 429 error
72
90
  agent.get url
73
- rescue Errno::ETIMEDOUT, Net::OpenTimeout => e
91
+ rescue *RETRIABLE_PAGE_ERRORS => e
74
92
  retry if (retries -= 1).positive?
75
93
  Util.error "Failed to get page `#{url}`\n#{e.message}"
76
94
  nil
@@ -84,7 +102,7 @@ module Relaton
84
102
  "[starts-with(., 'Editor')]]"
85
103
  page.xpath(xpath).map do |p|
86
104
  create_contribution_info(p, "editor", ["Chair"])
87
- end
105
+ end.compact
88
106
  else
89
107
  []
90
108
  end
@@ -101,7 +119,7 @@ module Relaton
101
119
  "[contains(@class, 'Title')]]"
102
120
  page.xpath(xpath).map do |p|
103
121
  create_contribution_info(p, "editor")
104
- end
122
+ end.compact
105
123
  else
106
124
  parse_editors_from_text
107
125
  end
@@ -111,6 +129,9 @@ module Relaton
111
129
 
112
130
  def create_contribution_info(person_node, type, description = [])
113
131
  name = person_node.text.match(/^[^(]+/).to_s.strip
132
+ return nil if name.empty? || !name.match?(/\A\p{L}/) ||
133
+ name.match?(%r{\A(?:https?://|urn:)})
134
+
114
135
  email, org = person_node.xpath ".//a[@href]"
115
136
  entity = create_person name, email, org
116
137
  desc = description.map { |d| Bib::LocalizedMarkedUpString.new(content: d) }
@@ -137,7 +158,15 @@ module Relaton
137
158
  [href.split(":")[1]]
138
159
  elsif (cf_email = email.at(".//span[@data-cfemail]"))
139
160
  decoded = decode_cf_email(cf_email["data-cfemail"])
140
- decoded.empty? ? [] : [decoded]
161
+ return [] if decoded.empty?
162
+
163
+ # Cloudflare obfuscates ASCII email characters in the data-cfemail
164
+ # span but leaves non-ASCII characters (e.g. the Latin "fl" ligature
165
+ # U+FB02) as plain text outside the span. Concatenate any sibling
166
+ # text and NFKC-normalize so ligatures become their ASCII equivalent.
167
+ prefix = cf_email.xpath("./preceding-sibling::node()").map(&:text).join
168
+ suffix = cf_email.xpath("./following-sibling::node()").map(&:text).join
169
+ [(prefix + decoded + suffix).unicode_normalize(:nfkc)]
141
170
  else
142
171
  []
143
172
  end
@@ -9,9 +9,10 @@ module Relaton
9
9
  #
10
10
  # @param [Nokogiri::HTML::Element] node document node
11
11
  #
12
- def initialize(node, errors = {})
12
+ def initialize(node, errors = {}, agent: nil)
13
13
  @node = node
14
14
  @errors = errors
15
+ @agent = agent
15
16
  end
16
17
 
17
18
  def text
@@ -34,11 +35,11 @@ module Relaton
34
35
  xpath = "./span[@class='citationTitle' " \
35
36
  "or @class='citeTitle']|./em|./i"
36
37
  t = @node.at(xpath)
37
- @title = if t then t.text
38
+ @title = if t
39
+ t.text
38
40
  else
39
- text.match(
40
- /(?<content>.+)\s(?:Edited|\d{2}\s\w+\d{4})/,
41
- )[:content]
41
+ m = text.match(/(?<content>.+)\s(?:Edited|\d{2}\s\w+\d{4})/)
42
+ m ? m[:content] : text
42
43
  end.strip
43
44
  end
44
45
 
@@ -82,7 +83,7 @@ module Relaton
82
83
  # @return [String] document number
83
84
  #
84
85
  def parse_docnumber
85
- ref = @node.at("./span[@class='citationLabel']/strong|./strong|b/span")
86
+ ref = @node.at("./span/strong|./strong|./b/span")
86
87
  num = ref.text.match(/[^\[\]]+/).to_s
87
88
  id = parse_errata(num)
88
89
  # some part refs need "Pt" to distinguish from root doc
@@ -110,8 +111,8 @@ module Relaton
110
111
  # @return [Array<Bib::Date>] bibliographic dates
111
112
  #
112
113
  def parse_date
113
- /(?<on>\d{1,2}\s\w+\s\d{4})/ =~ text
114
- result = [Bib::Date.new(at: Date.parse(on).to_s, type: "issued")]
114
+ match = text.match(/(?<on>\d{1,2}\s\w+\s\d{4})/)
115
+ result = match ? [Bib::Date.new(at: Date.parse(match[:on]).to_s, type: "issued")] : []
115
116
  @errors[:part_date] &&= result.empty?
116
117
  result
117
118
  end
@@ -182,7 +183,7 @@ module Relaton
182
183
  # @return [Array<Bib::Relation>] document relations
183
184
  #
184
185
  def parse_relation
185
- parser = DataParser.new @node.at("./ancestor::details")
186
+ parser = DataParser.new(@node.at("./ancestor::details"), @errors, agent: @agent)
186
187
  fref = parser.parse_docid[0].content
187
188
  bib = ItemData.new(formattedref: Bib::Formattedref.new(content: fref))
188
189
  result = [Bib::Relation.new(type: "partOf", bibitem: bib)]
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Relaton
4
4
  module Oasis
5
- VERSION = "2.1.1"
5
+ VERSION = "2.1.2"
6
6
  end
7
7
  end
@@ -31,6 +31,7 @@ Gem::Specification.new do |spec|
31
31
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
32
32
  spec.require_paths = ["lib"]
33
33
 
34
+ spec.add_dependency "ferrum", "~> 0.17"
34
35
  spec.add_dependency "mechanize", "~> 2.10"
35
36
  spec.add_dependency "multi_json", "~> 1.15.0"
36
37
  spec.add_dependency "relaton-bib", "~> 2.1.0"
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-oasis
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.1
4
+ version: 2.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-05-12 00:00:00.000000000 Z
11
+ date: 2026-05-14 00:00:00.000000000 Z
12
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: ferrum
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '0.17'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '0.17'
13
27
  - !ruby/object:Gem::Dependency
14
28
  name: mechanize
15
29
  requirement: !ruby/object:Gem::Requirement
@@ -110,6 +124,7 @@ files:
110
124
  - lib/relaton/oasis/bibdata.rb
111
125
  - lib/relaton/oasis/bibitem.rb
112
126
  - lib/relaton/oasis/bibliography.rb
127
+ - lib/relaton/oasis/browser_agent.rb
113
128
  - lib/relaton/oasis/data_fetcher.rb
114
129
  - lib/relaton/oasis/data_parser.rb
115
130
  - lib/relaton/oasis/data_parser_utils.rb