relaton-iso 1.19.2 → 2.0.0.pre.alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +1 -1
- data/Gemfile +1 -0
- data/README.adoc +134 -131
- data/bin/console +1 -1
- data/grammars/basicdoc.rng +2110 -0
- data/grammars/biblio-standoc.rng +287 -0
- data/grammars/biblio.rng +2097 -0
- data/grammars/relaton-iso-compile.rng +11 -0
- data/grammars/relaton-iso.rng +214 -0
- data/lib/relaton/iso/bibliography.rb +206 -0
- data/lib/relaton/iso/data_fetcher.rb +227 -0
- data/lib/relaton/iso/hash_parser_v1.rb +121 -0
- data/lib/relaton/iso/hit.rb +62 -0
- data/lib/relaton/iso/hit_collection.rb +117 -0
- data/lib/relaton/iso/item_data.rb +49 -0
- data/lib/relaton/iso/model/bibdata.rb +9 -0
- data/lib/relaton/iso/model/bibitem.rb +7 -0
- data/lib/relaton/iso/model/contributor.rb +7 -0
- data/lib/relaton/iso/model/contributor_info.rb +9 -0
- data/lib/relaton/iso/model/docidentifier.rb +128 -0
- data/lib/relaton/iso/model/doctype.rb +13 -0
- data/lib/relaton/iso/model/ext.rb +47 -0
- data/lib/relaton/iso/model/iso_project_group.rb +21 -0
- data/lib/relaton/iso/model/item.rb +17 -0
- data/lib/relaton/iso/model/item_base.rb +19 -0
- data/lib/relaton/iso/model/organization.rb +9 -0
- data/lib/relaton/iso/model/project_number.rb +22 -0
- data/lib/relaton/iso/model/relation.rb +9 -0
- data/lib/relaton/iso/model/stagename.rb +14 -0
- data/lib/relaton/iso/model/structured_identifier.rb +31 -0
- data/lib/relaton/iso/processor.rb +78 -0
- data/lib/relaton/iso/queue.rb +63 -0
- data/lib/relaton/iso/scraper.rb +591 -0
- data/lib/relaton/iso/util.rb +8 -0
- data/lib/relaton/iso/version.rb +7 -0
- data/lib/relaton/iso.rb +17 -0
- data/relaton_iso.gemspec +9 -7
- metadata +79 -49
- data/bin/bundle +0 -109
- data/bin/byebug +0 -27
- data/bin/coderay +0 -27
- data/bin/gdb_wrapper +0 -29
- data/bin/htmldiff +0 -27
- data/bin/httpclient +0 -29
- data/bin/ldiff +0 -27
- data/bin/nokogiri +0 -27
- data/bin/pry +0 -27
- data/bin/pubid-nist +0 -27
- data/bin/racc +0 -27
- data/bin/rackup +0 -29
- data/bin/rake +0 -27
- data/bin/rubocop +0 -27
- data/bin/ruby-parse +0 -27
- data/bin/ruby-rewrite +0 -27
- data/bin/safe_yaml +0 -29
- data/bin/thor +0 -27
- data/lib/relaton_iso/data_fetcher.rb +0 -246
- data/lib/relaton_iso/document_identifier.rb +0 -46
- data/lib/relaton_iso/hash_converter.rb +0 -15
- data/lib/relaton_iso/hit.rb +0 -59
- data/lib/relaton_iso/hit_collection.rb +0 -100
- data/lib/relaton_iso/iso_bibliography.rb +0 -202
- data/lib/relaton_iso/processor.rb +0 -67
- data/lib/relaton_iso/queue.rb +0 -61
- data/lib/relaton_iso/scrapper.rb +0 -553
- data/lib/relaton_iso/util.rb +0 -6
- data/lib/relaton_iso/version.rb +0 -5
- data/lib/relaton_iso.rb +0 -17
data/lib/relaton_iso/scrapper.rb
DELETED
@@ -1,553 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
module RelatonIso
|
4
|
-
# Scrapper.
|
5
|
-
class Scrapper # rubocop:disable Metrics/ModuleLength
|
6
|
-
DOMAIN = "https://www.iso.org"
|
7
|
-
|
8
|
-
# Relaton document type for each identifier type token or publisher prefix.
# Stage abbreviations (AWI, CD, FDIS, NP, DIS, WD, R) have no entry here.
TYPES = {
  "TS"    => "technical-specification",
  "DTS"   => "technical-specification",
  "TR"    => "technical-report",
  "DTR"   => "technical-report",
  "PAS"   => "publicly-available-specification",
  "Guide" => "guide",
  "ISO"   => "international-standard",
  "IEC"   => "international-standard",
  "IWA"   => "international-workshop-agreement",
}.freeze
|
26
|
-
|
27
|
-
# Stage code => stage abbreviation. Stage "60" is keyed by substage instead
# of mapping straight to an abbreviation.
STGABBR = {
  "00" => "NWIP",
  "10" => "AWI",
  "20" => "WD",
  "30" => "CD",
  "40" => "DIS",
  "50" => "FDIS",
  "60" => {
    "00" => "PRF",
    "60" => "FINAL",
  },
}.freeze
|
36
|
-
|
37
|
-
# Known publisher abbreviations with their full name and web address.
# Only the outer hash is frozen; the per-publisher hashes are left unfrozen.
PUBLISHERS = {
  "IEC" => { name: "International Electrotechnical Commission",
             url: "www.iec.ch" },
  "ISO" => { name: "International Organization for Standardization",
             url: "www.iso.org" },
  "IEEE" => { name: "Institute of Electrical and Electronics Engineers",
              url: "www.ieee.org" },
  "SAE" => { name: "SAE International", url: "www.sae.org" },
  # The name used to start with a stray space, which ended up in the
  # organization name of every CIE co-published item.
  "CIE" => { name: "International Commission on Illumination",
             url: "cie.co.at" },
  "ASME" => { name: "American Society of Mechanical Engineers",
              url: "www.asme.org" },
}.freeze
|
50
|
-
|
51
|
-
# extend self
|
52
|
-
|
53
|
-
# @param lang [String, nil] language; read by #languages when collecting
#   the language versions of a page
# @param errors [Hash] collection of parsing errors, updated while scraping
def initialize(lang, errors)
  @lang, @errors = lang, errors
end
|
57
|
-
|
58
|
-
# Build a scraper and parse the page at the given path.
# @param path [String] page path
# @param lang [String, nil] language
# @param errors [Hash] collection of parsing errors
# @return [RelatonIsoBib::IsoBibliographicItem]
def self.parse_page(path, lang: nil, errors: {})
  scraper = new(lang, errors)
  scraper.parse(path)
end
|
66
|
-
|
67
|
-
# Fetch the page at +path+ and assemble a bibliographic item from the values
# scraped out of it.
#
# @param path [String] page path, handed to #get_page
# @return [RelatonIsoBib::IsoBibliographicItem]
def parse(path) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  @doc, url = get_page path
  titles, abstract, langs = fetch_titles_abstract
  lang_codes = langs.map { |entry| entry[:lang] }
  scripts = langs.map { |entry| script(entry[:lang]) }.uniq

  # The fetchers run in the order the keys are listed.
  attrs = {
    docid: fetch_relaton_docids,
    docnumber: fetch_docnumber,
    edition: edition,
    language: lang_codes,
    script: scripts,
    title: titles,
    doctype: fetch_type,
    docstatus: fetch_status,
    ics: fetch_ics,
    date: fetch_dates,
    contributor: fetch_contributors,
    editorialgroup: fetch_workgroup,
    abstract: abstract,
    copyright: fetch_copyright,
    link: fetch_link(url),
    relation: fetch_relations,
    place: ["Geneva"],
    structuredidentifier: fetch_structuredidentifier,
  }
  RelatonIsoBib::IsoBibliographicItem.new(**attrs)
end
|
92
|
-
|
93
|
-
# Document identifier text taken from the first span of the page's h1: the
# part before " | ", stripped. Memoized, including a nil result.
#
# @return [String, nil] nil when the page has no such span
def id
  return @id if defined?(@id)

  heading = @doc.at("//h1/span[1]")
  @errors[:id] &&= heading.nil?
  @id = heading ? heading.text.split(" | ").first.strip : heading
end
|
100
|
-
|
101
|
-
# Parsed form of #id, memoized once parsing succeeds.
#
# @return [Pubid::Iso::Identifier] the parsed identifier; when anything
#   raises, the error is logged and the result of Util.error is returned
def pubid
  unless @pubid
    @pubid = Pubid::Iso::Identifier.parse(id)
    # When the identifier has a base, give its root the scraped edition
    # unless one is already set.
    @pubid.root.edition ||= edition if @pubid.base
  end
  @pubid
rescue StandardError => e
  Util.error "Failed to parse pubid from #{id}: #{e.message}"
end
|
110
|
-
|
111
|
-
# Edition taken from the trailing digits of the "Edition" block on the page.
# Memoized, including a nil result.
#
# @return [String, nil] digits ("" when the text does not end in digits),
#   or nil when the block is missing
def edition
  return @edition if defined?(@edition)

  node = @doc.at("//div[div[.='Edition']]/text()[last()]")
  @errors[:edition] &&= node.nil?
  @edition = node ? node.text[/\d+$/].to_s : node
end
|
118
|
-
|
119
|
-
#
# Build the document identifiers: primary ISO id, iso-reference and URN.
# The stage parsed from the page's stage code is applied to the pubid first
# if it has none.
#
# @return [Array<RelatonBib::DocumentIdentifier>]
#
def fetch_relaton_docids
  pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code)

  primary = DocumentIdentifier.new(id: pubid, type: "ISO", primary: true)
  reference = RelatonBib::DocumentIdentifier.new(id: isoref, type: "iso-reference")
  urn = DocumentIdentifier.new(id: pubid, type: "URN")
  [primary, reference, urn]
end
|
132
|
-
|
133
|
-
#
# Create ISO reference identifier with English language.
# The identifier is rebuilt from the pubid's attributes, leaving out
# :typed_stage.
#
# @return [String] English reference identifier
#
def isoref
  attrs = pubid.to_h.reject { |key, _value| key == :typed_stage }
  english = Pubid::Iso::Identifier.create(language: "en", **attrs)
  english.to_s(format: :ref_num_short)
end
|
142
|
-
|
143
|
-
private
|
144
|
-
|
145
|
-
# Collect titles and abstracts for every available language version.
# @return [Array<Array>] titles, abstracts, and the language entries whose
#   page was used
def fetch_titles_abstract # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  titles = RelatonBib::TypedTitleStringCollection.new
  abstracts = []
  available = []
  languages.each do |entry|
    # Entries without a :path (English) reuse the page already loaded.
    page = entry[:path] ? get_page(entry[:path])[0] : @doc
    # Skip pages carrying the "not available in Russian" notice.
    next if page.at("//h5[@class='help-block'][.='недоступно на русском языке']")

    available << entry
    titles += fetch_title(page, entry[:lang])

    abstr = parse_abstract(page, entry[:lang])
    abstracts << abstr if abstr
  end
  [titles, abstracts, available]
end
|
163
|
-
|
164
|
-
# Extract the abstract from the description block of a page. Paragraphs are
# taken verbatim, list items are prefixed with "- ", and empty entries are
# dropped before joining with newlines.
#
# @param doc [Nokogiri::HTML::Document]
# @param lang [String]
# @return [Hash, nil] abstract attributes, or nil when there is no text
def parse_abstract(doc, lang)
  nodes = doc.xpath("//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li")
  lines = nodes.map { |node| node.name == "li" ? "- #{node.text}" : node.text }
  text = lines.reject(&:empty?).join("\n")
  @errors[:abstract] &&= text.empty?
  return if text.empty?

  { content: text, language: lang, script: script(lang), format: "text/plain" }
end
|
173
|
-
|
174
|
-
# Returns available languages: English always, plus French when the page's
# language switcher links to a "/fr/" path and "fr" is not the value of @lang.
# @return [Array<Hash>] entries with :lang and, for non-English, :path
def languages
  found = [{ lang: "en" }]
  @doc.css("li#lang-switcher ul li a").each do |anchor|
    href = anchor.attr("href")
    code = href[%r{^/(fr)/}, 1]
    next if code.nil? || (@lang && code == @lang)

    found << { lang: code, path: href }
  end
  @errors[:language] &&= found.size == 1
  found
end
|
186
|
-
|
187
|
-
# Get page. Network-level failures are retried after a one second pause; the
# fourth such failure raises.
# @param path [String] page's path
# @return [Array<Nokogiri::HTML::Document, String>] document and final URL
# @raise [RelatonBib::RequestError] when the page stays unreachable
def get_page(path) # rubocop:disable Metrics/MethodLength
  failures = 0
  begin
    resp, uri = get_redirection path
    [try_if_fail(resp, uri), uri.to_s]
  rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
         EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
         Net::ProtocolError, Errno::ETIMEDOUT
    failures += 1
    raise RelatonBib::RequestError, "Could not access #{DOMAIN}#{path}" unless failures <= 3

    sleep 1
    retry
  end
end
|
206
|
-
|
207
|
-
#
# Get the page from the given path. If the page is redirected, get the
# page from the new path (#get_response calls back into this method with
# the Location header value).
#
# +path+ is resolved against DOMAIN, so it may be a site-relative path
# ("/standard/1.html") or an absolute URL; a Location header may carry
# either form, and plain string concatenation would corrupt the latter.
#
# @param [String] path path to the page, or an absolute URL
#
# @return [Array<Net::HTTPOK, URI>] HTTP response and URI
# @raise [RelatonBib::RequestError] if the page is not found
# @raise [Errno::EPIPE] when the error persists after the retries
#
def get_redirection(path) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  uri = URI.join(DOMAIN, path)
  try = 0
  begin
    get_response uri
  rescue Errno::EPIPE => e
    try += 1
    # check_try warns and pauses before allowing another attempt.
    retry if check_try try, uri
    raise e
  end
end
|
227
|
-
|
228
|
-
# Decide whether a failed request may be attempted again. While fewer than
# three attempts were made, warn, pause for a second and allow the retry.
#
# @param try [Integer] number of attempts made so far
# @param uri [URI] address being fetched, used in the warning
# @return [true, nil] true when the caller should retry
def check_try(try, uri)
  return unless try < 3

  warn "Timeout fetching #{uri}, retrying..."
  sleep 1
  true
end
|
235
|
-
|
236
|
-
# Request +uri+ and act on the status code:
# * 200 - return the response together with the URI;
# * 301, 302, 303, 307, 308 - follow the Location header through
#   #get_redirection (previously only 301 was followed; the others were
#   retried as if they were failures and finally reported as "not found");
# * 404 - raise;
# * anything else - wait 2**try seconds and request again.
#
# @param uri [URI] address to request
# @param try [Integer] number of retries made so far
# @return [Array<Net::HTTPOK, URI>] HTTP response and URI
# @raise [RelatonBib::RequestError] on 404, or once more than 3 retries
#   were needed
def get_response(uri, try = 0)
  raise RelatonBib::RequestError, "#{uri} not found." if try > 3

  resp = Net::HTTP.get_response(uri)
  case resp.code
  when "200" then [resp, uri]
  when "301", "302", "303", "307", "308" then get_redirection(resp["location"])
  when "404" then raise RelatonBib::RequestError, "#{uri} not found."
  else
    sleep(2**try)
    get_response uri, try + 1
  end
end
|
249
|
-
|
250
|
-
#
# The iso.org site fails to respond sometimes. This method parses the
# response and, while the parsed page has no document reference, requests
# the page again - up to ten parse attempts in total.
#
# @param [Net::HTTPOK] resp HTTP response
# @param [URI::HTTPS] uri URI of the page
#
# @return [Nokogiri::HTML4::Document] document
# @raise [RelatonBib::RequestError] if the page could not be parsed
#
def try_if_fail(resp, uri)
  remaining = 10
  while remaining.positive?
    page = Nokogiri::HTML(resp.body)
    # A page with a document reference is considered complete.
    return page if item_ref(page)

    resp = Net::HTTP.get_response(uri)
    remaining -= 1
  end
  raise RelatonBib::RequestError, "Could not parse the page #{uri}"
end
|
270
|
-
|
271
|
-
#
# Generate docnumber: the first run of digits in the pubid's string form.
#
# @return [String, nil] docnumber, nil when the string has no digits
#
def fetch_docnumber
  pubid.to_s[/\d+/]
end
|
279
|
-
|
280
|
-
#
# Parse structuredidentifier from the root of the pubid.
#
# @return [RelatonBib::StructuredIdentifier] structured identifier
#
def fetch_structuredidentifier # rubocop:disable Metrics/MethodLength
  root = pubid.root
  publisher = root.publisher
  RelatonIsoBib::StructuredIdentifier.new(
    project_number: "#{publisher} #{root.number}",
    part: root.part&.to_s,
    type: root.publisher,
  )
end
|
292
|
-
|
293
|
-
#
# Parse ID from the document: the stripped text of the first span inside
# the main section's h1.
#
# @param [Nokogiri::HTML::Document] doc document to parse
#
# @return [String, nil] ID
#
def item_ref(doc)
  heading = doc.at("//main//section/div/div/div//h1/span[1]")
  @errors[:reference] &&= heading.nil?
  return if heading.nil?

  heading.text&.strip
end
|
305
|
-
|
306
|
-
# Fetch status from the page's stage code ("<stage>.<substage>").
# @return [RelatonBib::DocumentStatus, nil] nil when the page has no stage
#   code (#stage_code returns nil in that case, and splitting nil used to
#   raise NoMethodError)
def fetch_status
  code = stage_code
  return unless code

  stg, substg = code.split "."
  RelatonBib::DocumentStatus.new(stage: stg, substage: substg)
end
|
312
|
-
|
313
|
-
# Stage code text of the active entry in the page's stage dropdown.
# Memoized, including a nil result.
#
# @return [String, nil] nil when the page has no such entry
def stage_code
  return @stage_code if defined?(@stage_code)

  node = @doc.at("//ul[@class='dropdown-menu']/li[@class='active']/a/span[@class='stage-code']")
  @errors[:stage] &&= node.nil?
  @stage_code = node.nil? ? nil : node.text
end
|
320
|
-
|
321
|
-
# def stage(stg, substg)
|
322
|
-
# abbr = STGABBR[stg].is_a?(Hash) ? STGABBR[stg][substg] : STGABBR[stg]
|
323
|
-
# RelatonBib::DocumentStatus::Stage.new value: stg, abbreviation: abbr
|
324
|
-
# end
|
325
|
-
|
326
|
-
# Fetch workgroup from the committee link on the page. The committee type
# and number are read from the second "/"-separated segment of the link
# text; the type falls back to "TC".
# @return [RelatonIsoBib::EditorialGroup, nil] nil when the link is missing
def fetch_workgroup # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
  link = @doc.at("//div[contains(., 'Technical Committe')]/following-sibling::span/a")
  @errors[:workgroup] &&= link.nil?
  return if link.nil?

  segment = link.text.split("/")[1]
  kind = (segment[/^[A-Z]+/] if segment) || "TC"
  digits = segment[/\d+/] if segment
  number = digits.to_i if digits

  committee = RelatonBib::WorkGroup.new(
    name: link[:title], identifier: link.text, type: kind, number: number,
  )
  RelatonIsoBib::EditorialGroup.new(technical_committee: [committee])
end
|
347
|
-
|
348
|
-
# Fetch relations from the life-cycle steps of the page. Steps whose
# resolved type is "Now" or "Now under review" are skipped.
# @return [Array<Hash>]
def fetch_relations
  skipped = ["Now", "Now under review"]
  steps = @doc.xpath(
    "//ul[@class='steps']/li", "//div[contains(@class, 'sub-step')]"
  )
  rels = steps.flat_map do |step|
    type, date = relation_type(step.at("h4", "h5").text.strip)
    skipped.include?(type) ? [] : create_relations(step, type, date)
  end
  @errors[:relation] &&= rels.empty?
  rels
end
|
363
|
-
|
364
|
-
#
# Parse relation type and dates. Update-like headings also pick up the last
# dashed stage date on the page as a "circulated" date. Any other heading
# is returned unchanged.
#
# @param [String] type parsed type
#
# @return [Array<String,Array>] type and dates
#
def relation_type(type)
  date = []
  label = type.strip
  kind =
    if ["Previously", "Will be replaced by"].include?(label)
      "obsoletes"
    elsif label =~ /Corrigenda|Amendments|Revised by|Now confirmed|replaced by/
      on = @doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
      date << { type: "circulated", on: on.text } if on
      "updates"
    else
      type
    end
  [kind, date]
end
|
383
|
-
|
384
|
-
#
# Create relations: one per link found inside the relation element.
#
# @param [Nokogiri::HTML::Element] rel relation element
# @param [String] type relation type
# @param [Array<Hash>] date dates passed on to each related item
#
# @return [Array<Hash>] Relations
#
def create_relations(rel, type, date)
  rel.css("a").map do |link|
    label = link.text
    docid = DocumentIdentifier.new(type: "ISO", id: label, primary: true)
    fref = RelatonBib::FormattedRef.new(content: link.text, format: "text/plain")
    related = RelatonIsoBib::IsoBibliographicItem.new(
      docid: [docid], formattedref: fref, date: date,
    )
    { type: type, bibitem: related }
  end
end
|
405
|
-
|
406
|
-
# Fetch type. The type token of the identifier is looked up in TYPES first,
# then the publisher prefix; "international-standard" is the fallback.
# @return [RelatonIsoBib::DocumentType]
def fetch_type
  parts = %r{
    ^(?<prefix>ISO|IWA|IEC)
    (?:(?:/CIE|/IEC|/IEEE|/PRF|/NP|/SAE|/HL7|/DGuide)*\s|/)
    (?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|DTS|DTR|ISP|PWI|Guide|(?=\d+))
  }x.match(id)
  token = parts && parts[:type]
  prefix = parts && parts[:prefix]
  doctype = TYPES[token] || TYPES[prefix] || "international-standard"
  RelatonIsoBib::DocumentType.new(type: doctype)
end
|
417
|
-
|
418
|
-
# Fetch titles: one typed title per non-nil part returned by #parse_titles,
# plus a "main" title joining the collected parts with " - ".
# @param doc [Nokogiri::HTML::Document]
# @param lang [String]
# @return [Array<RelatonBib::TypedTitleString>]
def fetch_title(doc, lang) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
  kinds = %w[title-intro title-main title-part]
  parts = parse_titles(doc)
  collection = RelatonBib::TypedTitleStringCollection.new
  parts.each_with_index do |content, idx|
    next unless content

    collection << RelatonBib::TypedTitleString.new(
      type: kinds[idx], content: content, language: lang, script: script(lang),
    )
  end
  joined = collection.map { |t| t.title.content }.join " - "
  collection << RelatonBib::TypedTitleString.new(type: "main", content: joined, language: lang, script: script(lang))
end
|
436
|
-
|
437
|
-
# Split the page title into its intro / main / part components.
#
# The title spans after the first one are read from the h1; the first of
# them is further split on " - " or an em dash surrounded by whitespace.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<String, nil>] an empty array when the page has no title
#   spans, `[nil, title, nil]` for a single component, otherwise the result
#   of RelatonBib::TypedTitleString.intro_or_part
def parse_titles(doc)
  ttls = doc.xpath("//h1[@class='stdTitle']/span[position()>1]").map(&:text)
  @errors[:title] &&= ttls.empty?
  # Bail out on an empty list whether or not title errors are tracked.
  # Before, an untracked :title (e.g. the default `errors: {}`) fell through
  # to `ttls[0].split` and raised NoMethodError on nil.
  return ttls if ttls.empty?

  ttls[0, 1] = ttls[0].split(/\s(?:-|\u2014)\s/)
  case ttls.size
  when 0, 1 then [nil, ttls.first, nil]
  else RelatonBib::TypedTitleString.intro_or_part ttls
  end
end
|
448
|
-
|
449
|
-
# Return ISO script code.
# @param lang [String]
# @return [String, nil] "Latn" for "en" and "fr", nil for anything else
def script(lang)
  %w[en fr].include?(lang) ? "Latn" : nil
end
|
458
|
-
|
459
|
-
# Fetch dates: the publication date (reconciled with the year in the
# identifier when there is one) and, if present, the correction date.
# @return [Array<Hash>]
def fetch_dates # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  dates = []
  found = %r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})}.match(id)
  ref_year = found && found[:ref_date_str]
  released = @doc.at("//span[@itemprop='releaseDate']")
  @errors[:date_pub] &&= released.nil?
  if ref_year
    dates += parse_date_from_id ref_year, released
  elsif released
    dates << { type: "published", on: released.text }
  end

  modified = @doc.at "//span[@itemprop='dateModified']"
  @errors[:date_corr] &&= modified.nil?
  dates << { type: "corrected", on: modified.text } if modified
  dates
end
|
476
|
-
|
477
|
-
# Reconcile the year in the identifier with the release date on the page.
# If the page's release year is later than the identifier's year, the item
# is "published" in the identifier year and "updated" on the release date;
# otherwise the release date is the publication date.
#
# @param ref_date_str [String] four digit year taken from the identifier
# @param pub_date_str [#text, nil] release date node, if the page has one
# @return [Array<Hash>]
def parse_date_from_id(ref_date_str, pub_date_str) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
  ref_year = Date.strptime(ref_date_str, "%Y").year
  return [{ type: "published", on: ref_date_str }] if pub_date_str.nil?

  pub_year = Date.strptime(pub_date_str.text, "%Y").year
  return [{ type: "published", on: pub_date_str.text }] unless pub_year > ref_year

  [
    { type: "published", on: ref_date_str },
    { type: "updated", on: pub_date_str.text },
  ]
end
|
493
|
-
|
494
|
-
# Build publisher contributors from the organization abbreviations in front
# of the first whitespace of the identifier (e.g. "ISO/IEC"). Abbreviations
# that are not in PUBLISHERS are skipped.
#
# Each entity is a copy of the PUBLISHERS entry with :abbreviation added.
# The entry itself is no longer modified: PUBLISHERS is shared by every
# scraper instance, and writing into it both leaked state between items and
# would raise FrozenError if the entries were ever frozen.
#
# @return [Array<Hash>] contributors with :entity and :role
def fetch_contributors
  id.sub(/\s.*/, "").split("/").each_with_object([]) do |abbrev, contributors|
    publisher = PUBLISHERS[abbrev]
    next unless publisher

    entity = publisher.merge(abbreviation: abbrev)
    contributors << { entity: entity, role: [type: "publisher"] }
  end
end
|
503
|
-
|
504
|
-
# Fetch ICS codes from the links following the "ICS" label; every code is
# split on "." into field, group and subgroup.
# @return [Array<Hash>]
def fetch_ics
  links = @doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a")
  ics = links.map do |link|
    field, group, subgroup = link.text[/[\d.]+/].to_s.split(".")
    { field: field, group: group, subgroup: subgroup }
  end
  @errors[:ics] &&= ics.empty?
  ics
end
|
514
|
-
|
515
|
-
#
# Fetch links: the source URL plus the sample ("obp"), RSS and public
# download ("pub") links when the page has them.
#
# @param url [String] document url
#
# @return [Array<Hash>]
#
def fetch_link(url) # rubocop:disable Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/MethodLength
  obp = @doc.at("//a[.='Read sample']")
  @errors[:link_obp] &&= obp.nil?
  rss = @doc.at("//a[contains(@href, 'rss')]")
  @errors[:link_rss] &&= rss.nil?
  pub = @doc.at "//p[contains(., 'publicly available')]/a",
                "//p[contains(., 'can be downloaded from the')]/a"
  @errors[:link_pub] &&= pub.nil?

  [
    { type: "src", content: url },
    (obp && { type: "obp", content: obp[:href] }),
    # The RSS href is appended to DOMAIN.
    (rss && { type: "rss", content: DOMAIN + rss[:href] }),
    (pub && { type: "pub", content: pub[:href] }),
  ].compact
end
|
536
|
-
|
537
|
-
# Fetch copyright. The owner is the part of the document reference before
# its first whitespace. The year is the four digits following ":" in the
# reference; when the reference has none, it is taken from the release date
# or, failing that, from the date of the active stage.
#
# When the page has neither date, :from is an empty string (this case used
# to raise NoMethodError on nil).
#
# @return [Array<Hash>] one entry with :owner and :from
def fetch_copyright # rubocop:disable Metrics/MethodLength
  ref = item_ref @doc
  owner_name = ref.match(/.*?(?=\s)/).to_s
  from = ref.match(/(?<=:)\d{4}/).to_s
  if from.empty?
    date = @doc.at(
      "//span[@itemprop='releaseDate']",
      "//ul[@id='stages']/li[contains(@class,'active')]/ul/li[@class='active']/a/span[@class='stage-date']",
    )
    from = date ? date.text.match(/\d{4}/).to_s : ""
  end
  [{ owner: [{ name: owner_name }], from: from }]
end
|
552
|
-
end
|
553
|
-
end
|
data/lib/relaton_iso/util.rb
DELETED
data/lib/relaton_iso/version.rb
DELETED
data/lib/relaton_iso.rb
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require "nokogiri"
|
4
|
-
require "net/http"
|
5
|
-
require "logger"
|
6
|
-
require "pubid-iso"
|
7
|
-
require "relaton/index"
|
8
|
-
require "relaton_iso_bib"
|
9
|
-
require "relaton_iso/version"
|
10
|
-
require "relaton_iso/util"
|
11
|
-
require "relaton_iso/hash_converter"
|
12
|
-
require "relaton_iso/hit"
|
13
|
-
require "relaton_iso/iso_bibliography"
|
14
|
-
require "relaton_iso/document_identifier"
|
15
|
-
# require "relaton_iso/index"
|
16
|
-
require "relaton_iso/queue"
|
17
|
-
require "relaton_iso/data_fetcher"
|