relaton-iso 1.18.1 → 1.18.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/relaton_iso/data_fetcher.rb +200 -0
- data/lib/relaton_iso/document_identifier.rb +20 -1
- data/lib/relaton_iso/hash_converter.rb +15 -0
- data/lib/relaton_iso/hit.rb +29 -21
- data/lib/relaton_iso/hit_collection.rb +74 -59
- data/lib/relaton_iso/index.rb +132 -0
- data/lib/relaton_iso/iso_bibliography.rb +172 -180
- data/lib/relaton_iso/processor.rb +22 -2
- data/lib/relaton_iso/queue.rb +61 -0
- data/lib/relaton_iso/scrapper.rb +118 -70
- data/lib/relaton_iso/version.rb +1 -1
- data/lib/relaton_iso.rb +5 -0
- data/relaton_iso.gemspec +1 -0
- metadata +20 -2
data/lib/relaton_iso/scrapper.rb
CHANGED
@@ -7,7 +7,9 @@ module RelatonIso
|
|
7
7
|
|
8
8
|
TYPES = {
|
9
9
|
"TS" => "technical-specification",
|
10
|
+
"DTS" => "technical-specification",
|
10
11
|
"TR" => "technical-report",
|
12
|
+
"DTR" => "technical-report",
|
11
13
|
"PAS" => "publicly-available-specification",
|
12
14
|
# "AWI" => "approvedWorkItem",
|
13
15
|
# "CD" => "committeeDraft",
|
@@ -18,6 +20,7 @@ module RelatonIso
|
|
18
20
|
# "R" => "recommendation",
|
19
21
|
"Guide" => "guide",
|
20
22
|
"ISO" => "international-standard",
|
23
|
+
"IEC" => "international-standard",
|
21
24
|
"IWA" => "international-workshop-agreement",
|
22
25
|
}.freeze
|
23
26
|
|
@@ -48,43 +51,38 @@ module RelatonIso
|
|
48
51
|
extend self
|
49
52
|
|
50
53
|
# Parse page.
|
51
|
-
# @param
|
52
|
-
# @param lang [String,
|
54
|
+
# @param path [String]
|
55
|
+
# @param lang [String, nil]
|
53
56
|
# @return [RelatonIsoBib::IsoBibliographicItem]
|
54
|
-
def parse_page(
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
path = hit.hit[:path].sub("/sites/isoorg", "")
|
59
|
-
doc, url = get_page "#{path}.html"
|
60
|
-
|
57
|
+
def parse_page(path, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
58
|
+
doc, url = get_page path
|
59
|
+
id = doc.at("//nav[contains(@class,'heading-condensed')]/h1").text.split(" | ").first
|
60
|
+
pubid = Pubid::Iso::Identifier.parse(id)
|
61
61
|
# Fetch edition.
|
62
|
-
edition = doc.at("//div[div[.='Edition']]/text()[last()]")
|
63
|
-
|
64
|
-
hit.pubid.base.edition ||= edition if hit.pubid.base
|
62
|
+
edition = doc.at("//div[div[.='Edition']]/text()[last()]")&.text&.match(/\d+$/)&.to_s
|
63
|
+
pubid.root.edition ||= edition if pubid.base
|
65
64
|
|
66
65
|
titles, abstract, langs = fetch_titles_abstract(doc, lang)
|
67
66
|
|
68
67
|
RelatonIsoBib::IsoBibliographicItem.new(
|
69
|
-
|
70
|
-
|
71
|
-
docnumber: fetch_docnumber(hit.pubid),
|
68
|
+
docid: fetch_relaton_docids(doc, pubid),
|
69
|
+
docnumber: fetch_docnumber(pubid),
|
72
70
|
edition: edition,
|
73
71
|
language: langs.map { |l| l[:lang] },
|
74
72
|
script: langs.map { |l| script(l[:lang]) }.uniq,
|
75
73
|
title: titles,
|
76
|
-
doctype: fetch_type(
|
74
|
+
doctype: fetch_type(id),
|
77
75
|
docstatus: fetch_status(doc),
|
78
76
|
ics: fetch_ics(doc),
|
79
|
-
date: fetch_dates(doc,
|
80
|
-
contributor: fetch_contributors(
|
77
|
+
date: fetch_dates(doc, id),
|
78
|
+
contributor: fetch_contributors(id),
|
81
79
|
editorialgroup: fetch_workgroup(doc),
|
82
80
|
abstract: abstract,
|
83
81
|
copyright: fetch_copyright(doc),
|
84
82
|
link: fetch_link(doc, url),
|
85
83
|
relation: fetch_relations(doc),
|
86
84
|
place: ["Geneva"],
|
87
|
-
structuredidentifier: fetch_structuredidentifier(
|
85
|
+
structuredidentifier: fetch_structuredidentifier(pubid),
|
88
86
|
)
|
89
87
|
end
|
90
88
|
|
@@ -99,9 +97,9 @@ module RelatonIso
|
|
99
97
|
def fetch_relaton_docids(doc, pubid)
|
100
98
|
pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code(doc))
|
101
99
|
[
|
102
|
-
|
100
|
+
DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
|
103
101
|
RelatonBib::DocumentIdentifier.new(id: isoref(pubid), type: "iso-reference"),
|
104
|
-
|
102
|
+
DocumentIdentifier.new(id: pubid, type: "URN"),
|
105
103
|
]
|
106
104
|
end
|
107
105
|
|
@@ -121,49 +119,45 @@ module RelatonIso
|
|
121
119
|
|
122
120
|
# Fetch titles and abstracts.
|
123
121
|
# @param doc [Nokigiri::HTML::Document]
|
124
|
-
# @param lang [String,
|
122
|
+
# @param lang [String, nil]
|
125
123
|
# @return [Array<Array>]
|
126
|
-
def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
124
|
+
def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
127
125
|
titles = RelatonBib::TypedTitleStringCollection.new
|
128
126
|
abstract = []
|
129
|
-
langs = languages(doc, lang).
|
127
|
+
langs = languages(doc, lang).each_with_object([]) do |l, s|
|
130
128
|
# Don't need to get page for en. We already have it.
|
131
129
|
d = l[:path] ? get_page(l[:path])[0] : doc
|
132
|
-
unless d.at("//h5[@class='help-block']"
|
133
|
-
"[.='недоступно на русском языке']")
|
130
|
+
unless d.at("//h5[@class='help-block'][.='недоступно на русском языке']")
|
134
131
|
s << l
|
135
132
|
titles += fetch_title(d, l[:lang])
|
136
133
|
|
137
|
-
|
138
|
-
|
139
|
-
"//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
|
140
|
-
).map do |a|
|
141
|
-
a.name == "li" ? "- #{a.text}" : a.text
|
142
|
-
end.reject(&:empty?).join("\n")
|
143
|
-
unless abstract_content.empty?
|
144
|
-
abstract << {
|
145
|
-
content: abstract_content,
|
146
|
-
language: l[:lang],
|
147
|
-
script: script(l[:lang]),
|
148
|
-
format: "text/plain",
|
149
|
-
}
|
150
|
-
end
|
134
|
+
abstr = parse_abstract(d, l)
|
135
|
+
abstract << abstr if abstr
|
151
136
|
end
|
152
|
-
s
|
153
137
|
end
|
154
138
|
[titles, abstract, langs]
|
155
139
|
end
|
156
140
|
|
141
|
+
def parse_abstract(doc, lang)
|
142
|
+
abstract_content = doc.xpath(
|
143
|
+
"//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
|
144
|
+
).map { |a| a.name == "li" ? "- #{a.text}" : a.text }.reject(&:empty?).join("\n")
|
145
|
+
return if abstract_content.empty?
|
146
|
+
|
147
|
+
{ content: abstract_content, language: lang[:lang],
|
148
|
+
script: script(lang[:lang]), format: "text/plain" }
|
149
|
+
end
|
150
|
+
|
157
151
|
# Returns available languages.
|
158
152
|
# @param doc [Nokogiri::HTML::Document]
|
159
|
-
# @
|
153
|
+
# @param lang [String, nil]
|
160
154
|
# @return [Array<Hash>]
|
161
155
|
def languages(doc, lang)
|
162
156
|
lgs = [{ lang: "en" }]
|
163
157
|
doc.css("li#lang-switcher ul li a").each do |lang_link|
|
164
158
|
lang_path = lang_link.attr("href")
|
165
159
|
l = lang_path.match(%r{^/(fr)/})
|
166
|
-
lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1]
|
160
|
+
lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] != lang)
|
167
161
|
end
|
168
162
|
lgs
|
169
163
|
end
|
@@ -171,14 +165,21 @@ module RelatonIso
|
|
171
165
|
# Get page.
|
172
166
|
# @param path [String] page's path
|
173
167
|
# @return [Array<Nokogiri::HTML::Document, String>]
|
174
|
-
def get_page(path)
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
168
|
+
def get_page(path) # rubocop:disable Metrics/MethodLength
|
169
|
+
try = 0
|
170
|
+
begin
|
171
|
+
resp, uri = get_redirection path
|
172
|
+
doc = try_if_fail resp, uri
|
173
|
+
[doc, uri.to_s]
|
174
|
+
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
175
|
+
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
176
|
+
Net::ProtocolError, Errno::ETIMEDOUT
|
177
|
+
try += 1
|
178
|
+
raise RelatonBib::RequestError, "Could not access #{DOMAIN}#{path}" if try > 3
|
179
|
+
|
180
|
+
sleep 1
|
181
|
+
retry
|
182
|
+
end
|
182
183
|
end
|
183
184
|
|
184
185
|
#
|
@@ -190,13 +191,37 @@ module RelatonIso
|
|
190
191
|
# @return [Array<Net::HTTPOK, URI>] HTTP response and URI
|
191
192
|
# @raise [RelatonBib::RequestError] if the page is not found
|
192
193
|
#
|
193
|
-
def get_redirection(path)
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
194
|
+
def get_redirection(path) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
195
|
+
uri = URI(DOMAIN + path)
|
196
|
+
try = 0
|
197
|
+
begin
|
198
|
+
get_response uri
|
199
|
+
rescue Errno::EPIPE => e
|
200
|
+
try += 1
|
201
|
+
retry if check_try try, uri
|
202
|
+
raise e
|
203
|
+
end
|
204
|
+
end
|
198
205
|
|
199
|
-
|
206
|
+
def check_try(try, uri)
|
207
|
+
if try < 3
|
208
|
+
warn "Timeout fetching #{uri}, retrying..."
|
209
|
+
sleep 1
|
210
|
+
true
|
211
|
+
end
|
212
|
+
end
|
213
|
+
|
214
|
+
def get_response(uri, try = 0)
|
215
|
+
raise RelatonBib::RequestError, "#{uri} not found." if try > 3
|
216
|
+
|
217
|
+
resp = Net::HTTP.get_response(uri)
|
218
|
+
case resp.code
|
219
|
+
when "200" then [resp, uri]
|
220
|
+
when "301" then get_redirection(resp["location"])
|
221
|
+
else
|
222
|
+
sleep 1
|
223
|
+
get_response uri, try + 1
|
224
|
+
end
|
200
225
|
end
|
201
226
|
|
202
227
|
#
|
@@ -240,12 +265,19 @@ module RelatonIso
|
|
240
265
|
#
|
241
266
|
def fetch_structuredidentifier(pubid) # rubocop:disable Metrics/MethodLength
|
242
267
|
RelatonIsoBib::StructuredIdentifier.new(
|
243
|
-
project_number: "#{pubid.publisher} #{pubid.number}",
|
244
|
-
part: pubid.part&.to_s, # &.sub(/^-/, ""),
|
245
|
-
type: pubid.publisher,
|
268
|
+
project_number: "#{pubid.root.publisher} #{pubid.root.number}",
|
269
|
+
part: pubid.root.part&.to_s, # &.sub(/^-/, ""),
|
270
|
+
type: pubid.root.publisher,
|
246
271
|
)
|
247
272
|
end
|
248
273
|
|
274
|
+
#
|
275
|
+
# Parse ID from the document.
|
276
|
+
#
|
277
|
+
# @param [Nokogiri::HTML::Document] doc document to parse
|
278
|
+
#
|
279
|
+
# @return [String, nil] ID
|
280
|
+
#
|
249
281
|
def item_ref(doc)
|
250
282
|
doc.at("//main//section/div/div/div//h1")&.text
|
251
283
|
end
|
@@ -271,7 +303,7 @@ module RelatonIso
|
|
271
303
|
|
272
304
|
# Fetch workgroup.
|
273
305
|
# @param doc [Nokogiri::HTML::Document]
|
274
|
-
# @return [
|
306
|
+
# @return [RelatonIsoBib::EditorialGroup, nil]
|
275
307
|
def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
|
276
308
|
wg = doc.at("////div[contains(., 'Technical Committe')]/following-sibling::span/a")
|
277
309
|
return unless wg
|
@@ -286,7 +318,7 @@ module RelatonIso
|
|
286
318
|
tc_numb = workgroup[1]&.match(/\d+/)&.to_s&.to_i
|
287
319
|
tc_name = wg[:title]
|
288
320
|
tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg.text,
|
289
|
-
|
321
|
+
type: type, number: tc_numb)
|
290
322
|
RelatonIsoBib::EditorialGroup.new(technical_committee: [tc])
|
291
323
|
end
|
292
324
|
|
@@ -303,6 +335,14 @@ module RelatonIso
|
|
303
335
|
end
|
304
336
|
end
|
305
337
|
|
338
|
+
#
|
339
|
+
# Parse relation type and dates.
|
340
|
+
#
|
341
|
+
# @param [String] type parsed type
|
342
|
+
# @param [Nokogiri::HTML::Document] doc document to parse
|
343
|
+
#
|
344
|
+
# @return [Array<String,Array>] type and dates
|
345
|
+
#
|
306
346
|
def relation_type(type, doc)
|
307
347
|
date = []
|
308
348
|
t = case type.strip
|
@@ -316,9 +356,20 @@ module RelatonIso
|
|
316
356
|
[t, date]
|
317
357
|
end
|
318
358
|
|
359
|
+
#
|
360
|
+
# Create relations.
|
361
|
+
#
|
362
|
+
# @param [Nokogiri::HTML::Element] rel relation element
|
363
|
+
# @param [String] type relation type
|
364
|
+
# @param [Hash{Symbol=>String}] date relation document date
|
365
|
+
# @option date [String] :type date type
|
366
|
+
# @option date [String] :on date
|
367
|
+
#
|
368
|
+
# @return [Array<Hash>] Relations
|
369
|
+
#
|
319
370
|
def create_relations(rel, type, date)
|
320
371
|
rel.css("a").map do |id|
|
321
|
-
docid =
|
372
|
+
docid = DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
|
322
373
|
fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
|
323
374
|
bibitem = RelatonIsoBib::IsoBibliographicItem.new(
|
324
375
|
docid: [docid], formattedref: fref, date: date,
|
@@ -333,14 +384,11 @@ module RelatonIso
|
|
333
384
|
def fetch_type(ref)
|
334
385
|
%r{
|
335
386
|
^(?<prefix>ISO|IWA|IEC)
|
336
|
-
(?:(?:/IEC|/IEEE|/PRF|/NP|/DGuide)*\s|/)
|
337
|
-
(?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))
|
387
|
+
(?:(?:/IEC|/IEEE|/PRF|/NP|/SAE|/HL7|/DGuide)*\s|/)
|
388
|
+
(?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|DTS|DTR|ISP|PWI|Guide|(?=\d+))
|
338
389
|
}x =~ ref
|
339
|
-
|
340
|
-
type = TYPES[type] || TYPES[prefix]
|
390
|
+
type = TYPES[type] || TYPES[prefix] || "international-standard"
|
341
391
|
RelatonIsoBib::DocumentType.new(type: type)
|
342
|
-
# rescue => _e
|
343
|
-
# puts 'Unknown document type: ' + title
|
344
392
|
end
|
345
393
|
|
346
394
|
# Fetch titles.
|
@@ -445,7 +493,7 @@ module RelatonIso
|
|
445
493
|
links << { type: "obp", content: obp[:href] } if obp
|
446
494
|
rss = doc.at("//a[contains(@href, 'rss')]")
|
447
495
|
links << { type: "rss", content: DOMAIN + rss[:href] } if rss
|
448
|
-
pub = doc.at
|
496
|
+
pub = doc.at "//p[contains(., 'publicly available')]/a",
|
449
497
|
"//p[contains(., 'can be downloaded from the')]/a"
|
450
498
|
links << { type: "pub", content: pub[:href] } if pub
|
451
499
|
links
|
data/lib/relaton_iso/version.rb
CHANGED
data/lib/relaton_iso.rb
CHANGED
@@ -4,10 +4,15 @@ require "nokogiri"
|
|
4
4
|
require "net/http"
|
5
5
|
require "logger"
|
6
6
|
require "pubid-iso"
|
7
|
+
require "relaton/index"
|
7
8
|
require "relaton_iso_bib"
|
8
9
|
require "relaton_iso/version"
|
9
10
|
require "relaton_iso/config"
|
10
11
|
require "relaton_iso/util"
|
12
|
+
require "relaton_iso/hash_converter"
|
11
13
|
require "relaton_iso/hit"
|
12
14
|
require "relaton_iso/iso_bibliography"
|
13
15
|
require "relaton_iso/document_identifier"
|
16
|
+
# require "relaton_iso/index"
|
17
|
+
require "relaton_iso/queue"
|
18
|
+
require "relaton_iso/data_fetcher"
|
data/relaton_iso.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-iso
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.18.
|
4
|
+
version: 1.18.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-02-
|
11
|
+
date: 2024-02-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: algolia
|
@@ -38,6 +38,20 @@ dependencies:
|
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: 0.1.1
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: relaton-index
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 0.2.12
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 0.2.12
|
41
55
|
- !ruby/object:Gem::Dependency
|
42
56
|
name: relaton-iso-bib
|
43
57
|
requirement: !ruby/object:Gem::Requirement
|
@@ -94,11 +108,15 @@ files:
|
|
94
108
|
- bin/thor
|
95
109
|
- lib/relaton_iso.rb
|
96
110
|
- lib/relaton_iso/config.rb
|
111
|
+
- lib/relaton_iso/data_fetcher.rb
|
97
112
|
- lib/relaton_iso/document_identifier.rb
|
113
|
+
- lib/relaton_iso/hash_converter.rb
|
98
114
|
- lib/relaton_iso/hit.rb
|
99
115
|
- lib/relaton_iso/hit_collection.rb
|
116
|
+
- lib/relaton_iso/index.rb
|
100
117
|
- lib/relaton_iso/iso_bibliography.rb
|
101
118
|
- lib/relaton_iso/processor.rb
|
119
|
+
- lib/relaton_iso/queue.rb
|
102
120
|
- lib/relaton_iso/scrapper.rb
|
103
121
|
- lib/relaton_iso/util.rb
|
104
122
|
- lib/relaton_iso/version.rb
|