relaton-iso 1.18.1 → 1.18.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/relaton_iso/data_fetcher.rb +200 -0
- data/lib/relaton_iso/document_identifier.rb +20 -1
- data/lib/relaton_iso/hash_converter.rb +15 -0
- data/lib/relaton_iso/hit.rb +29 -21
- data/lib/relaton_iso/hit_collection.rb +74 -59
- data/lib/relaton_iso/index.rb +132 -0
- data/lib/relaton_iso/iso_bibliography.rb +172 -180
- data/lib/relaton_iso/processor.rb +22 -2
- data/lib/relaton_iso/queue.rb +61 -0
- data/lib/relaton_iso/scrapper.rb +118 -70
- data/lib/relaton_iso/version.rb +1 -1
- data/lib/relaton_iso.rb +5 -0
- data/relaton_iso.gemspec +1 -0
- metadata +20 -2
data/lib/relaton_iso/scrapper.rb
CHANGED
@@ -7,7 +7,9 @@ module RelatonIso
|
|
7
7
|
|
8
8
|
TYPES = {
|
9
9
|
"TS" => "technical-specification",
|
10
|
+
"DTS" => "technical-specification",
|
10
11
|
"TR" => "technical-report",
|
12
|
+
"DTR" => "technical-report",
|
11
13
|
"PAS" => "publicly-available-specification",
|
12
14
|
# "AWI" => "approvedWorkItem",
|
13
15
|
# "CD" => "committeeDraft",
|
@@ -18,6 +20,7 @@ module RelatonIso
|
|
18
20
|
# "R" => "recommendation",
|
19
21
|
"Guide" => "guide",
|
20
22
|
"ISO" => "international-standard",
|
23
|
+
"IEC" => "international-standard",
|
21
24
|
"IWA" => "international-workshop-agreement",
|
22
25
|
}.freeze
|
23
26
|
|
@@ -48,43 +51,38 @@ module RelatonIso
|
|
48
51
|
extend self
|
49
52
|
|
50
53
|
# Parse page.
|
51
|
-
# @param
|
52
|
-
# @param lang [String,
|
54
|
+
# @param path [String]
|
55
|
+
# @param lang [String, nil]
|
53
56
|
# @return [RelatonIsoBib::IsoBibliographicItem]
|
54
|
-
def parse_page(
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
path = hit.hit[:path].sub("/sites/isoorg", "")
|
59
|
-
doc, url = get_page "#{path}.html"
|
60
|
-
|
57
|
+
def parse_page(path, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
58
|
+
doc, url = get_page path
|
59
|
+
id = doc.at("//nav[contains(@class,'heading-condensed')]/h1").text.split(" | ").first
|
60
|
+
pubid = Pubid::Iso::Identifier.parse(id)
|
61
61
|
# Fetch edition.
|
62
|
-
edition = doc.at("//div[div[.='Edition']]/text()[last()]")
|
63
|
-
|
64
|
-
hit.pubid.base.edition ||= edition if hit.pubid.base
|
62
|
+
edition = doc.at("//div[div[.='Edition']]/text()[last()]")&.text&.match(/\d+$/)&.to_s
|
63
|
+
pubid.root.edition ||= edition if pubid.base
|
65
64
|
|
66
65
|
titles, abstract, langs = fetch_titles_abstract(doc, lang)
|
67
66
|
|
68
67
|
RelatonIsoBib::IsoBibliographicItem.new(
|
69
|
-
|
70
|
-
|
71
|
-
docnumber: fetch_docnumber(hit.pubid),
|
68
|
+
docid: fetch_relaton_docids(doc, pubid),
|
69
|
+
docnumber: fetch_docnumber(pubid),
|
72
70
|
edition: edition,
|
73
71
|
language: langs.map { |l| l[:lang] },
|
74
72
|
script: langs.map { |l| script(l[:lang]) }.uniq,
|
75
73
|
title: titles,
|
76
|
-
doctype: fetch_type(
|
74
|
+
doctype: fetch_type(id),
|
77
75
|
docstatus: fetch_status(doc),
|
78
76
|
ics: fetch_ics(doc),
|
79
|
-
date: fetch_dates(doc,
|
80
|
-
contributor: fetch_contributors(
|
77
|
+
date: fetch_dates(doc, id),
|
78
|
+
contributor: fetch_contributors(id),
|
81
79
|
editorialgroup: fetch_workgroup(doc),
|
82
80
|
abstract: abstract,
|
83
81
|
copyright: fetch_copyright(doc),
|
84
82
|
link: fetch_link(doc, url),
|
85
83
|
relation: fetch_relations(doc),
|
86
84
|
place: ["Geneva"],
|
87
|
-
structuredidentifier: fetch_structuredidentifier(
|
85
|
+
structuredidentifier: fetch_structuredidentifier(pubid),
|
88
86
|
)
|
89
87
|
end
|
90
88
|
|
@@ -99,9 +97,9 @@ module RelatonIso
|
|
99
97
|
def fetch_relaton_docids(doc, pubid)
|
100
98
|
pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code(doc))
|
101
99
|
[
|
102
|
-
|
100
|
+
DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
|
103
101
|
RelatonBib::DocumentIdentifier.new(id: isoref(pubid), type: "iso-reference"),
|
104
|
-
|
102
|
+
DocumentIdentifier.new(id: pubid, type: "URN"),
|
105
103
|
]
|
106
104
|
end
|
107
105
|
|
@@ -121,49 +119,45 @@ module RelatonIso
|
|
121
119
|
|
122
120
|
# Fetch titles and abstracts.
|
123
121
|
# @param doc [Nokogiri::HTML::Document]
|
124
|
-
# @param lang [String,
|
122
|
+
# @param lang [String, nil]
|
125
123
|
# @return [Array<Array>]
|
126
|
-
def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
124
|
+
def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
127
125
|
titles = RelatonBib::TypedTitleStringCollection.new
|
128
126
|
abstract = []
|
129
|
-
langs = languages(doc, lang).
|
127
|
+
langs = languages(doc, lang).each_with_object([]) do |l, s|
|
130
128
|
# Don't need to get page for en. We already have it.
|
131
129
|
d = l[:path] ? get_page(l[:path])[0] : doc
|
132
|
-
unless d.at("//h5[@class='help-block']"
|
133
|
-
"[.='недоступно на русском языке']")
|
130
|
+
unless d.at("//h5[@class='help-block'][.='недоступно на русском языке']")
|
134
131
|
s << l
|
135
132
|
titles += fetch_title(d, l[:lang])
|
136
133
|
|
137
|
-
|
138
|
-
|
139
|
-
"//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
|
140
|
-
).map do |a|
|
141
|
-
a.name == "li" ? "- #{a.text}" : a.text
|
142
|
-
end.reject(&:empty?).join("\n")
|
143
|
-
unless abstract_content.empty?
|
144
|
-
abstract << {
|
145
|
-
content: abstract_content,
|
146
|
-
language: l[:lang],
|
147
|
-
script: script(l[:lang]),
|
148
|
-
format: "text/plain",
|
149
|
-
}
|
150
|
-
end
|
134
|
+
abstr = parse_abstract(d, l)
|
135
|
+
abstract << abstr if abstr
|
151
136
|
end
|
152
|
-
s
|
153
137
|
end
|
154
138
|
[titles, abstract, langs]
|
155
139
|
end
|
156
140
|
|
141
|
+
def parse_abstract(doc, lang)
|
142
|
+
abstract_content = doc.xpath(
|
143
|
+
"//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
|
144
|
+
).map { |a| a.name == "li" ? "- #{a.text}" : a.text }.reject(&:empty?).join("\n")
|
145
|
+
return if abstract_content.empty?
|
146
|
+
|
147
|
+
{ content: abstract_content, language: lang[:lang],
|
148
|
+
script: script(lang[:lang]), format: "text/plain" }
|
149
|
+
end
|
150
|
+
|
157
151
|
# Returns available languages.
|
158
152
|
# @param doc [Nokogiri::HTML::Document]
|
159
|
-
# @
|
153
|
+
# @param lang [String, nil]
|
160
154
|
# @return [Array<Hash>]
|
161
155
|
def languages(doc, lang)
|
162
156
|
lgs = [{ lang: "en" }]
|
163
157
|
doc.css("li#lang-switcher ul li a").each do |lang_link|
|
164
158
|
lang_path = lang_link.attr("href")
|
165
159
|
l = lang_path.match(%r{^/(fr)/})
|
166
|
-
lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1]
|
160
|
+
lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] != lang)
|
167
161
|
end
|
168
162
|
lgs
|
169
163
|
end
|
@@ -171,14 +165,21 @@ module RelatonIso
|
|
171
165
|
# Get page.
|
172
166
|
# @param path [String] page's path
|
173
167
|
# @return [Array<Nokogiri::HTML::Document, String>]
|
174
|
-
def get_page(path)
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
168
|
+
def get_page(path) # rubocop:disable Metrics/MethodLength
|
169
|
+
try = 0
|
170
|
+
begin
|
171
|
+
resp, uri = get_redirection path
|
172
|
+
doc = try_if_fail resp, uri
|
173
|
+
[doc, uri.to_s]
|
174
|
+
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
175
|
+
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
176
|
+
Net::ProtocolError, Errno::ETIMEDOUT
|
177
|
+
try += 1
|
178
|
+
raise RelatonBib::RequestError, "Could not access #{DOMAIN}#{path}" if try > 3
|
179
|
+
|
180
|
+
sleep 1
|
181
|
+
retry
|
182
|
+
end
|
182
183
|
end
|
183
184
|
|
184
185
|
#
|
@@ -190,13 +191,37 @@ module RelatonIso
|
|
190
191
|
# @return [Array<Net::HTTPOK, URI>] HTTP response and URI
|
191
192
|
# @raise [RelatonBib::RequestError] if the page is not found
|
192
193
|
#
|
193
|
-
def get_redirection(path)
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
194
|
+
def get_redirection(path) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
195
|
+
uri = URI(DOMAIN + path)
|
196
|
+
try = 0
|
197
|
+
begin
|
198
|
+
get_response uri
|
199
|
+
rescue Errno::EPIPE => e
|
200
|
+
try += 1
|
201
|
+
retry if check_try try, uri
|
202
|
+
raise e
|
203
|
+
end
|
204
|
+
end
|
198
205
|
|
199
|
-
|
206
|
+
def check_try(try, uri)
|
207
|
+
if try < 3
|
208
|
+
warn "Timeout fetching #{uri}, retrying..."
|
209
|
+
sleep 1
|
210
|
+
true
|
211
|
+
end
|
212
|
+
end
|
213
|
+
|
214
|
+
def get_response(uri, try = 0)
|
215
|
+
raise RelatonBib::RequestError, "#{uri} not found." if try > 3
|
216
|
+
|
217
|
+
resp = Net::HTTP.get_response(uri)
|
218
|
+
case resp.code
|
219
|
+
when "200" then [resp, uri]
|
220
|
+
when "301" then get_redirection(resp["location"])
|
221
|
+
else
|
222
|
+
sleep 1
|
223
|
+
get_response uri, try + 1
|
224
|
+
end
|
200
225
|
end
|
201
226
|
|
202
227
|
#
|
@@ -240,12 +265,19 @@ module RelatonIso
|
|
240
265
|
#
|
241
266
|
def fetch_structuredidentifier(pubid) # rubocop:disable Metrics/MethodLength
|
242
267
|
RelatonIsoBib::StructuredIdentifier.new(
|
243
|
-
project_number: "#{pubid.publisher} #{pubid.number}",
|
244
|
-
part: pubid.part&.to_s, # &.sub(/^-/, ""),
|
245
|
-
type: pubid.publisher,
|
268
|
+
project_number: "#{pubid.root.publisher} #{pubid.root.number}",
|
269
|
+
part: pubid.root.part&.to_s, # &.sub(/^-/, ""),
|
270
|
+
type: pubid.root.publisher,
|
246
271
|
)
|
247
272
|
end
|
248
273
|
|
274
|
+
#
|
275
|
+
# Parse ID from the document.
|
276
|
+
#
|
277
|
+
# @param [Nokogiri::HTML::Document] doc document to parse
|
278
|
+
#
|
279
|
+
# @return [String, nil] ID
|
280
|
+
#
|
249
281
|
def item_ref(doc)
|
250
282
|
doc.at("//main//section/div/div/div//h1")&.text
|
251
283
|
end
|
@@ -271,7 +303,7 @@ module RelatonIso
|
|
271
303
|
|
272
304
|
# Fetch workgroup.
|
273
305
|
# @param doc [Nokogiri::HTML::Document]
|
274
|
-
# @return [
|
306
|
+
# @return [RelatonIsoBib::EditorialGroup, nil]
|
275
307
|
def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
|
276
308
|
wg = doc.at("////div[contains(., 'Technical Committe')]/following-sibling::span/a")
|
277
309
|
return unless wg
|
@@ -286,7 +318,7 @@ module RelatonIso
|
|
286
318
|
tc_numb = workgroup[1]&.match(/\d+/)&.to_s&.to_i
|
287
319
|
tc_name = wg[:title]
|
288
320
|
tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg.text,
|
289
|
-
|
321
|
+
type: type, number: tc_numb)
|
290
322
|
RelatonIsoBib::EditorialGroup.new(technical_committee: [tc])
|
291
323
|
end
|
292
324
|
|
@@ -303,6 +335,14 @@ module RelatonIso
|
|
303
335
|
end
|
304
336
|
end
|
305
337
|
|
338
|
+
#
|
339
|
+
# Parse relation type and dates.
|
340
|
+
#
|
341
|
+
# @param [String] type parsed type
|
342
|
+
# @param [Nokogiri::HTML::Document] doc document to parse
|
343
|
+
#
|
344
|
+
# @return [Array<String,Array>] type and dates
|
345
|
+
#
|
306
346
|
def relation_type(type, doc)
|
307
347
|
date = []
|
308
348
|
t = case type.strip
|
@@ -316,9 +356,20 @@ module RelatonIso
|
|
316
356
|
[t, date]
|
317
357
|
end
|
318
358
|
|
359
|
+
#
|
360
|
+
# Create relations.
|
361
|
+
#
|
362
|
+
# @param [Nokogiri::HTML::Element] rel relation element
|
363
|
+
# @param [String] type relation type
|
364
|
+
# @param [Hash{Symbol=>String}] date relation document date
|
365
|
+
# @option date [String] :type date type
|
366
|
+
# @option date [String] :on date
|
367
|
+
#
|
368
|
+
# @return [Array<Hash>] Relations
|
369
|
+
#
|
319
370
|
def create_relations(rel, type, date)
|
320
371
|
rel.css("a").map do |id|
|
321
|
-
docid =
|
372
|
+
docid = DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
|
322
373
|
fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
|
323
374
|
bibitem = RelatonIsoBib::IsoBibliographicItem.new(
|
324
375
|
docid: [docid], formattedref: fref, date: date,
|
@@ -333,14 +384,11 @@ module RelatonIso
|
|
333
384
|
def fetch_type(ref)
|
334
385
|
%r{
|
335
386
|
^(?<prefix>ISO|IWA|IEC)
|
336
|
-
(?:(?:/IEC|/IEEE|/PRF|/NP|/DGuide)*\s|/)
|
337
|
-
(?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))
|
387
|
+
(?:(?:/IEC|/IEEE|/PRF|/NP|/SAE|/HL7|/DGuide)*\s|/)
|
388
|
+
(?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|DTS|DTR|ISP|PWI|Guide|(?=\d+))
|
338
389
|
}x =~ ref
|
339
|
-
|
340
|
-
type = TYPES[type] || TYPES[prefix]
|
390
|
+
type = TYPES[type] || TYPES[prefix] || "international-standard"
|
341
391
|
RelatonIsoBib::DocumentType.new(type: type)
|
342
|
-
# rescue => _e
|
343
|
-
# puts 'Unknown document type: ' + title
|
344
392
|
end
|
345
393
|
|
346
394
|
# Fetch titles.
|
@@ -445,7 +493,7 @@ module RelatonIso
|
|
445
493
|
links << { type: "obp", content: obp[:href] } if obp
|
446
494
|
rss = doc.at("//a[contains(@href, 'rss')]")
|
447
495
|
links << { type: "rss", content: DOMAIN + rss[:href] } if rss
|
448
|
-
pub = doc.at
|
496
|
+
pub = doc.at "//p[contains(., 'publicly available')]/a",
|
449
497
|
"//p[contains(., 'can be downloaded from the')]/a"
|
450
498
|
links << { type: "pub", content: pub[:href] } if pub
|
451
499
|
links
|
data/lib/relaton_iso/version.rb
CHANGED
data/lib/relaton_iso.rb
CHANGED
@@ -4,10 +4,15 @@ require "nokogiri"
|
|
4
4
|
require "net/http"
|
5
5
|
require "logger"
|
6
6
|
require "pubid-iso"
|
7
|
+
require "relaton/index"
|
7
8
|
require "relaton_iso_bib"
|
8
9
|
require "relaton_iso/version"
|
9
10
|
require "relaton_iso/config"
|
10
11
|
require "relaton_iso/util"
|
12
|
+
require "relaton_iso/hash_converter"
|
11
13
|
require "relaton_iso/hit"
|
12
14
|
require "relaton_iso/iso_bibliography"
|
13
15
|
require "relaton_iso/document_identifier"
|
16
|
+
# require "relaton_iso/index"
|
17
|
+
require "relaton_iso/queue"
|
18
|
+
require "relaton_iso/data_fetcher"
|
data/relaton_iso.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-iso
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.18.1
|
4
|
+
version: 1.18.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-02-
|
11
|
+
date: 2024-02-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: algolia
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 0.1.1
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: relaton-index
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 0.2.12
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 0.2.12
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: relaton-iso-bib
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -94,11 +108,15 @@ files:
|
|
94
108
|
- bin/thor
|
95
109
|
- lib/relaton_iso.rb
|
96
110
|
- lib/relaton_iso/config.rb
|
111
|
+
- lib/relaton_iso/data_fetcher.rb
|
97
112
|
- lib/relaton_iso/document_identifier.rb
|
113
|
+
- lib/relaton_iso/hash_converter.rb
|
98
114
|
- lib/relaton_iso/hit.rb
|
99
115
|
- lib/relaton_iso/hit_collection.rb
|
116
|
+
- lib/relaton_iso/index.rb
|
100
117
|
- lib/relaton_iso/iso_bibliography.rb
|
101
118
|
- lib/relaton_iso/processor.rb
|
119
|
+
- lib/relaton_iso/queue.rb
|
102
120
|
- lib/relaton_iso/scrapper.rb
|
103
121
|
- lib/relaton_iso/util.rb
|
104
122
|
- lib/relaton_iso/version.rb
|