relaton-oasis 2.1.1 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/relaton/oasis/browser_agent.rb +71 -0
- data/lib/relaton/oasis/data_fetcher.rb +31 -6
- data/lib/relaton/oasis/data_parser.rb +3 -2
- data/lib/relaton/oasis/data_parser_utils.rb +37 -8
- data/lib/relaton/oasis/data_part_parser.rb +10 -9
- data/lib/relaton/oasis/version.rb +1 -1
- data/relaton-oasis.gemspec +1 -0
- metadata +17 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: b09904cb15aa82500946dfe0d7c3a3d3edd08c391244ef56208e077c8b322d9e
|
|
4
|
+
data.tar.gz: f16f7b98dcde027268ea89622f595c57290f6dc9845f075d240a50b9ff7996d6
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: edbc0cb1b32818941d92dd5c5d01854205374fd4dfd23b84457bc03de3dacbd5f87c9a831a0bcf7f07ff534bae5d6a32e652fb343f48508f426ce2d1749384a7
|
|
7
|
+
data.tar.gz: 3abbf12c779c34649b0974e331ed529bd7ff65a16d2f9f7f566d96067137920c99f08c5779fcd2bfd8aecd2f07b7e7969fe4f3e7237e17ff62d29783189a6c17
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
require "ferrum"
|
|
2
|
+
require "nokogiri"
|
|
3
|
+
|
|
4
|
+
module Relaton
  module Oasis
    # Thin Ferrum-backed agent that drives headless Chrome with stealth tweaks
    # so the Cloudflare-protected oasis-open.org host serves real HTML instead
    # of a "Just a moment..." challenge. Mirrors the pattern used by
    # Relaton::Cie::BrowserAgent.
    class BrowserAgent
      # User-Agent header sent with every request.
      UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " \
           "(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36".freeze
      # Substrings whose presence in the page body means the challenge
      # interstitial is still being shown.
      CHALLENGE_MARKERS = ["Just a moment", "challenge-platform"].freeze
      # Maximum number of polls for the challenge to clear; each unsuccessful
      # poll is followed by a one-second sleep.
      MAX_CHALLENGE_WAIT = 30

      #
      # Start a headless browser and install the request headers and the
      # JavaScript overrides that are evaluated on every new document.
      #
      def initialize
        @browser = Ferrum::Browser.new(
          headless: true,
          timeout: 90,
          process_timeout: 90,
          window_size: [1366, 768],
          browser_options: {
            "disable-blink-features" => "AutomationControlled",
            "disable-quic" => nil,
            "no-sandbox" => nil,
          },
        )
        @browser.headers.set(
          "Accept-Language" => "en-US,en;q=0.9",
          "Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9," \
                      "image/webp,*/*;q=0.8",
          "User-Agent" => UA,
        )
        @browser.evaluate_on_new_document(<<~JS)
          Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
          Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
          Object.defineProperty(navigator, 'plugins', { get: () => [1,2,3,4,5] });
          window.chrome = { runtime: {} };
        JS
      end

      #
      # Navigate to a URL, wait for a challenge page (if any) to clear and
      # parse the resulting body.
      #
      # If the challenge has not cleared after MAX_CHALLENGE_WAIT polls the
      # body is parsed anyway, so callers that care should also check
      # #last_status.
      #
      # @param [String] url page URL
      #
      # @return [Nokogiri::HTML::Document] parsed page
      #
      def get(url)
        @browser.go_to(url)
        wait_for_challenge
        Nokogiri::HTML(@browser.body)
      end

      # HTTP status code of the most recent navigation's main resource,
      # or nil if no navigation has happened yet (or the agent has been quit).
      def last_status
        @browser&.network&.status
      end

      #
      # Shut the browser down. The reference is cleared even when the
      # underlying quit raises, so calling this more than once is harmless.
      #
      def quit
        @browser&.quit
      ensure
        @browser = nil
      end

      private

      #
      # Poll the page until none of CHALLENGE_MARKERS is present or the poll
      # budget is exhausted.
      #
      def wait_for_challenge
        MAX_CHALLENGE_WAIT.times do
          # Read the body once per poll: every marker is then tested against
          # the same snapshot and a clean page costs a single read.
          html = @browser.body
          return unless CHALLENGE_MARKERS.any? { |marker| html.include?(marker) }

          sleep 1
        end
      end
    end
  end
end
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
require "mechanize"
|
|
2
1
|
require_relative "../oasis"
|
|
2
|
+
require_relative "browser_agent"
|
|
3
3
|
require_relative "data_parser_utils"
|
|
4
4
|
require_relative "data_parser"
|
|
5
5
|
require_relative "data_part_parser"
|
|
@@ -7,24 +7,47 @@ require_relative "data_part_parser"
|
|
|
7
7
|
module Relaton
|
|
8
8
|
module Oasis
|
|
9
9
|
class DataFetcher < Core::DataFetcher
|
|
10
|
+
STANDARDS_URL = "https://www.oasis-open.org/standards/".freeze
|
|
11
|
+
RETRIABLE_ERRORS = [
|
|
12
|
+
SocketError,
|
|
13
|
+
Ferrum::TimeoutError,
|
|
14
|
+
Ferrum::PendingConnectionsError,
|
|
15
|
+
Ferrum::StatusError,
|
|
16
|
+
].freeze
|
|
17
|
+
|
|
10
18
|
# Forward an error message to the gem's Util logger.
#
# @param [String] msg message to log
def log_error(msg)
  Util.error(msg)
end
|
|
13
21
|
|
|
14
22
|
#
# Load the OASIS standards listing, store every listed document together
# with its parts, then save the index and report collected errors.
#
# @param [Object, nil] _source not used by this fetcher
#
# @return the value returned by report_errors
#
def fetch(_source = nil)
  listing = with_retry { agent.get(STANDARDS_URL) }
  listing.xpath("//details").each do |item|
    save_doc DataParser.new(item, @errors, agent: agent).parse
    fetch_parts item
  end
  index.save
  report_errors
ensure
  # Shut the browser down and drop the memoized agent, so that a later
  # fetch builds a fresh one instead of reusing an agent that was quit.
  @agent&.quit
  @agent = nil
end
|
|
25
33
|
|
|
26
34
|
private
|
|
27
35
|
|
|
36
|
+
# Lazily built browser agent, created on first use and then reused.
#
# @return [BrowserAgent]
def agent
  return @agent if @agent

  @agent = BrowserAgent.new
end
|
|
39
|
+
|
|
40
|
+
#
# Run the block, re-running it when it fails with a retriable error.
#
# @param [Integer] max_tries total number of attempts, including the first
# @param [Array<Class>] errors exception classes that trigger another attempt
#
# @return the block's value from the first successful attempt
#
# @raise the last retriable error once the attempts are exhausted; any
#   other error propagates immediately
#
def with_retry(max_tries: 4, errors: RETRIABLE_ERRORS)
  tries = 0
  begin
    tries += 1
    yield
  rescue *errors
    retry if tries < max_tries
    raise
  end
end
|
|
50
|
+
|
|
28
51
|
def index
|
|
29
52
|
@index ||= Relaton::Index.find_or_create(
|
|
30
53
|
:oasis, file: "#{INDEXFILE}.yaml"
|
|
@@ -38,7 +61,9 @@ module Relaton
|
|
|
38
61
|
parts = item.xpath(xpath)
|
|
39
62
|
return unless parts.size > 1
|
|
40
63
|
|
|
41
|
-
parts.each
|
|
64
|
+
parts.each do |part|
|
|
65
|
+
save_doc DataPartParser.new(part, @errors, agent: agent).parse
|
|
66
|
+
end
|
|
42
67
|
end
|
|
43
68
|
|
|
44
69
|
def save_doc(doc) # rubocop:disable Metrics/AbcSize
|
|
@@ -9,9 +9,10 @@ module Relaton
|
|
|
9
9
|
#
|
|
10
10
|
# @param [Nokogiri::HTML::Element] node document node
|
|
11
11
|
#
|
|
12
|
-
def initialize(node, errors = {})
|
|
12
|
+
def initialize(node, errors = {}, agent: nil)
  # Optional page-fetching agent, handed on to the part parsers built here.
  @agent = agent
  # Error flags collected while parsing.
  @errors = errors
  # Node this parser reads from.
  @node = node
end
|
|
16
17
|
|
|
17
18
|
def title
|
|
@@ -158,7 +159,7 @@ module Relaton
|
|
|
158
159
|
rels = @node.xpath(xpath)
|
|
159
160
|
result = if rels.size > 1
|
|
160
161
|
rels.map do |r|
|
|
161
|
-
docid = DataPartParser.new(r).parse_docid
|
|
162
|
+
docid = DataPartParser.new(r, @errors, agent: @agent).parse_docid
|
|
162
163
|
bib = ItemData.new(formattedref: Bib::Formattedref.new(content: docid[0].content))
|
|
163
164
|
Bib::Relation.new(type: "hasPart", bibitem: bib)
|
|
164
165
|
end
|
|
@@ -1,7 +1,17 @@
|
|
|
1
|
+
require "mechanize"
|
|
2
|
+
require_relative "browser_agent"
|
|
3
|
+
|
|
1
4
|
module Relaton
|
|
2
5
|
module Oasis
|
|
3
6
|
# Common methods for document and part parsers.
|
|
4
7
|
module DataParserUtils
|
|
8
|
+
RETRIABLE_PAGE_ERRORS = [
|
|
9
|
+
Errno::ETIMEDOUT,
|
|
10
|
+
Net::OpenTimeout,
|
|
11
|
+
Ferrum::TimeoutError,
|
|
12
|
+
Ferrum::PendingConnectionsError,
|
|
13
|
+
Ferrum::StatusError,
|
|
14
|
+
].freeze
|
|
5
15
|
#
|
|
6
16
|
# Parse contributor.
|
|
7
17
|
#
|
|
@@ -50,9 +60,17 @@ module Relaton
|
|
|
50
60
|
def page
|
|
51
61
|
return @page if defined? @page
|
|
52
62
|
|
|
53
|
-
|
|
63
|
+
@page = nil
|
|
64
|
+
return @page unless link_node && link_node[:href].match?(/\.html$/)
|
|
65
|
+
|
|
66
|
+
if @agent
|
|
67
|
+
doc = retry_page(link_node[:href], @agent)
|
|
68
|
+
@page = doc if doc && @agent.last_status == 200
|
|
69
|
+
else
|
|
70
|
+
# No injected agent (e.g. unit tests with VCR cassettes): fall back
|
|
71
|
+
# to a Mechanize request — VCR can intercept it.
|
|
54
72
|
agent = Mechanize.new
|
|
55
|
-
agent.agent.allowed_error_codes = [404]
|
|
73
|
+
agent.agent.allowed_error_codes = [403, 404, 503]
|
|
56
74
|
resp = retry_page(link_node[:href], agent)
|
|
57
75
|
@page = resp if resp && resp.code == "200"
|
|
58
76
|
end
|
|
@@ -62,15 +80,15 @@ module Relaton
|
|
|
62
80
|
# Retry to get page.
|
|
63
81
|
#
|
|
64
82
|
# @param [String] url page URL
|
|
65
|
-
# @param [
|
|
83
|
+
# @param [#get] agent HTTP client responding to #get(url)
|
|
66
84
|
# @param [Integer] retries number of retries
|
|
67
85
|
#
|
|
68
|
-
# @return [Mechanize::Page, nil] page or nil
|
|
86
|
+
# @return [Nokogiri::HTML::Document, Mechanize::Page, nil] page or nil
|
|
69
87
|
#
|
|
70
88
|
def retry_page(url, agent, retries = 3)
|
|
71
89
|
sleep 1 # to avoid 429 error
|
|
72
90
|
agent.get url
|
|
73
|
-
rescue
|
|
91
|
+
rescue *RETRIABLE_PAGE_ERRORS => e
|
|
74
92
|
retry if (retries -= 1).positive?
|
|
75
93
|
Util.error "Failed to get page `#{url}`\n#{e.message}"
|
|
76
94
|
nil
|
|
@@ -84,7 +102,7 @@ module Relaton
|
|
|
84
102
|
"[starts-with(., 'Editor')]]"
|
|
85
103
|
page.xpath(xpath).map do |p|
|
|
86
104
|
create_contribution_info(p, "editor", ["Chair"])
|
|
87
|
-
end
|
|
105
|
+
end.compact
|
|
88
106
|
else
|
|
89
107
|
[]
|
|
90
108
|
end
|
|
@@ -101,7 +119,7 @@ module Relaton
|
|
|
101
119
|
"[contains(@class, 'Title')]]"
|
|
102
120
|
page.xpath(xpath).map do |p|
|
|
103
121
|
create_contribution_info(p, "editor")
|
|
104
|
-
end
|
|
122
|
+
end.compact
|
|
105
123
|
else
|
|
106
124
|
parse_editors_from_text
|
|
107
125
|
end
|
|
@@ -111,6 +129,9 @@ module Relaton
|
|
|
111
129
|
|
|
112
130
|
def create_contribution_info(person_node, type, description = [])
|
|
113
131
|
name = person_node.text.match(/^[^(]+/).to_s.strip
|
|
132
|
+
return nil if name.empty? || !name.match?(/\A\p{L}/) ||
|
|
133
|
+
name.match?(%r{\A(?:https?://|urn:)})
|
|
134
|
+
|
|
114
135
|
email, org = person_node.xpath ".//a[@href]"
|
|
115
136
|
entity = create_person name, email, org
|
|
116
137
|
desc = description.map { |d| Bib::LocalizedMarkedUpString.new(content: d) }
|
|
@@ -137,7 +158,15 @@ module Relaton
|
|
|
137
158
|
[href.split(":")[1]]
|
|
138
159
|
elsif (cf_email = email.at(".//span[@data-cfemail]"))
|
|
139
160
|
decoded = decode_cf_email(cf_email["data-cfemail"])
|
|
140
|
-
|
|
161
|
+
return [] if decoded.empty?
|
|
162
|
+
|
|
163
|
+
# Cloudflare obfuscates ASCII email characters in the data-cfemail
|
|
164
|
+
# span but leaves non-ASCII characters (e.g. the Latin "fl" ligature
|
|
165
|
+
# U+FB02) as plain text outside the span. Concatenate any sibling
|
|
166
|
+
# text and NFKC-normalize so ligatures become their ASCII equivalent.
|
|
167
|
+
prefix = cf_email.xpath("./preceding-sibling::node()").map(&:text).join
|
|
168
|
+
suffix = cf_email.xpath("./following-sibling::node()").map(&:text).join
|
|
169
|
+
[(prefix + decoded + suffix).unicode_normalize(:nfkc)]
|
|
141
170
|
else
|
|
142
171
|
[]
|
|
143
172
|
end
|
|
@@ -9,9 +9,10 @@ module Relaton
|
|
|
9
9
|
#
|
|
10
10
|
# @param [Nokogiri::HTML::Element] node document node
|
|
11
11
|
#
|
|
12
|
-
def initialize(node, errors = {})
|
|
12
|
+
def initialize(node, errors = {}, agent: nil)
  # Optional page-fetching agent, handed on to the document parser built here.
  @agent = agent
  # Error flags collected while parsing.
  @errors = errors
  # Node this parser reads from.
  @node = node
end
|
|
16
17
|
|
|
17
18
|
def text
|
|
@@ -34,11 +35,11 @@ module Relaton
|
|
|
34
35
|
xpath = "./span[@class='citationTitle' " \
|
|
35
36
|
"or @class='citeTitle']|./em|./i"
|
|
36
37
|
t = @node.at(xpath)
|
|
37
|
-
@title = if t
|
|
38
|
+
@title = if t
|
|
39
|
+
t.text
|
|
38
40
|
else
|
|
39
|
-
text.match(
|
|
40
|
-
|
|
41
|
-
)[:content]
|
|
41
|
+
m = text.match(/(?<content>.+)\s(?:Edited|\d{2}\s\w+\d{4})/)
|
|
42
|
+
m ? m[:content] : text
|
|
42
43
|
end.strip
|
|
43
44
|
end
|
|
44
45
|
|
|
@@ -82,7 +83,7 @@ module Relaton
|
|
|
82
83
|
# @return [String] document number
|
|
83
84
|
#
|
|
84
85
|
def parse_docnumber
|
|
85
|
-
ref = @node.at("./span
|
|
86
|
+
ref = @node.at("./span/strong|./strong|./b/span")
|
|
86
87
|
num = ref.text.match(/[^\[\]]+/).to_s
|
|
87
88
|
id = parse_errata(num)
|
|
88
89
|
# some part refs need "Pt" to distinguish from root doc
|
|
@@ -110,8 +111,8 @@ module Relaton
|
|
|
110
111
|
# @return [Array<Bib::Date>] bibliographic dates
|
|
111
112
|
#
|
|
112
113
|
def parse_date
  found = text.match(/(?<on>\d{1,2}\s\w+\s\d{4})/)
  # Text can match the pattern without being a real date (e.g. "31 Feb 2019");
  # Date.parse then raises. Treat that like a missing date rather than
  # aborting. Date::Error is an ArgumentError.
  issued_on = begin
    Date.parse(found[:on]).to_s if found
  rescue ArgumentError
    nil
  end
  result = issued_on ? [Bib::Date.new(at: issued_on, type: "issued")] : []
  # The flag stays truthy only while every part parsed so far had no date.
  @errors[:part_date] &&= result.empty?
  result
end
|
|
@@ -182,7 +183,7 @@ module Relaton
|
|
|
182
183
|
# @return [Array<Bib::Relation>] document relations
|
|
183
184
|
#
|
|
184
185
|
def parse_relation
|
|
185
|
-
parser = DataParser.new
|
|
186
|
+
parser = DataParser.new(@node.at("./ancestor::details"), @errors, agent: @agent)
|
|
186
187
|
fref = parser.parse_docid[0].content
|
|
187
188
|
bib = ItemData.new(formattedref: Bib::Formattedref.new(content: fref))
|
|
188
189
|
result = [Bib::Relation.new(type: "partOf", bibitem: bib)]
|
data/relaton-oasis.gemspec
CHANGED
|
@@ -31,6 +31,7 @@ Gem::Specification.new do |spec|
|
|
|
31
31
|
spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
|
|
32
32
|
spec.require_paths = ["lib"]
|
|
33
33
|
|
|
34
|
+
spec.add_dependency "ferrum", "~> 0.17"
|
|
34
35
|
spec.add_dependency "mechanize", "~> 2.10"
|
|
35
36
|
spec.add_dependency "multi_json", "~> 1.15.0"
|
|
36
37
|
spec.add_dependency "relaton-bib", "~> 2.1.0"
|
metadata
CHANGED
|
@@ -1,15 +1,29 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: relaton-oasis
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 2.1.1
|
|
4
|
+
version: 2.1.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-05-
|
|
11
|
+
date: 2026-05-14 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
|
+
- !ruby/object:Gem::Dependency
|
|
14
|
+
name: ferrum
|
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
|
16
|
+
requirements:
|
|
17
|
+
- - "~>"
|
|
18
|
+
- !ruby/object:Gem::Version
|
|
19
|
+
version: '0.17'
|
|
20
|
+
type: :runtime
|
|
21
|
+
prerelease: false
|
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
23
|
+
requirements:
|
|
24
|
+
- - "~>"
|
|
25
|
+
- !ruby/object:Gem::Version
|
|
26
|
+
version: '0.17'
|
|
13
27
|
- !ruby/object:Gem::Dependency
|
|
14
28
|
name: mechanize
|
|
15
29
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -110,6 +124,7 @@ files:
|
|
|
110
124
|
- lib/relaton/oasis/bibdata.rb
|
|
111
125
|
- lib/relaton/oasis/bibitem.rb
|
|
112
126
|
- lib/relaton/oasis/bibliography.rb
|
|
127
|
+
- lib/relaton/oasis/browser_agent.rb
|
|
113
128
|
- lib/relaton/oasis/data_fetcher.rb
|
|
114
129
|
- lib/relaton/oasis/data_parser.rb
|
|
115
130
|
- lib/relaton/oasis/data_parser_utils.rb
|