relaton-iso 1.16.0 → 1.16.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +18 -18
- data/lib/relaton_iso/hit.rb +3 -2
- data/lib/relaton_iso/iso_bibliography.rb +7 -5
- data/lib/relaton_iso/scrapper.rb +56 -45
- data/lib/relaton_iso/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 479a728a58c56799448fd6d468e0d19fe245b731119f8dcd9ae6f19a7b624e07
|
|
4
|
+
data.tar.gz: ac89507180ca01978bfe98b68fbe02450f2c33015bd38d788752f3bf933911ad
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 71cc49dc2afa8690f02f7035ec5cc13981eb620e2b8c3792456401c152a4ca8192b2ffbd7445c6c982886e61f679427a2d5afbf26e13c6ebcfffcc8d54f7e5c9
|
|
7
|
+
data.tar.gz: 853da0772a998533c5f461ff297bef978c75e1f58b2df1fec5eff0fea6d306807420453a6ba37b1348d44e58c3a71dcf743228c7703b73fd9b7d72c9d4309598
|
data/README.adoc
CHANGED
|
@@ -72,14 +72,14 @@ item.docidentifier.detect { |di| di.type == "URN" }.id
|
|
|
72
72
|
[source,ruby]
|
|
73
73
|
----
|
|
74
74
|
item = RelatonIso::IsoBibliography.get "ISO 19115:2003"
|
|
75
|
-
[relaton-iso] (
|
|
76
|
-
[relaton-iso] (
|
|
75
|
+
[relaton-iso] (ISO 19115:2003) Fetching from iso.org ...
|
|
76
|
+
[relaton-iso] (ISO 19115:2003) Found: `ISO 19115:2003`
|
|
77
77
|
=> #<RelatonIsoBib::IsoBibliographicItem:0x00007f8c83429e30
|
|
78
78
|
...
|
|
79
79
|
|
|
80
80
|
item = RelatonIso::IsoBibliography.get "ISO 19115", "2003"
|
|
81
|
-
[relaton-iso] (
|
|
82
|
-
[relaton-iso] (
|
|
81
|
+
[relaton-iso] (ISO 19115:2003) Fetching from iso.org ...
|
|
82
|
+
[relaton-iso] (ISO 19115:2003) Found: `ISO 19115:2003`
|
|
83
83
|
=> #<RelatonIsoBib::IsoBibliographicItem:0x0000000112c9ca80
|
|
84
84
|
...
|
|
85
85
|
|
|
@@ -92,8 +92,8 @@ item.docidentifier[0].id
|
|
|
92
92
|
[source,ruby]
|
|
93
93
|
----
|
|
94
94
|
item = RelatonIso::IsoBibliography.get "ISO 19115"
|
|
95
|
-
[relaton-iso] (
|
|
96
|
-
[relaton-iso] (
|
|
95
|
+
[relaton-iso] (ISO 19115) Fetching from iso.org ...
|
|
96
|
+
[relaton-iso] (ISO 19115) Found: `ISO 19115:2003`
|
|
97
97
|
=> #<RelatonIsoBib::IsoBibliographicItem:0x00007f8c830275a8
|
|
98
98
|
...
|
|
99
99
|
|
|
@@ -106,8 +106,8 @@ item.docidentifier[0].id
|
|
|
106
106
|
[source,ruby]
|
|
107
107
|
----
|
|
108
108
|
item = RelatonIso::IsoBibliography.get "ISO 19115-1"
|
|
109
|
-
[relaton-iso] (
|
|
110
|
-
[relaton-iso] (
|
|
109
|
+
[relaton-iso] (ISO 19115-1) Fetching from iso.org ...
|
|
110
|
+
[relaton-iso] (ISO 19115-1) Found: `ISO 19115-1:2014`
|
|
111
111
|
=> #<RelatonIsoBib::IsoBibliographicItem:0x00007f8c83408af0
|
|
112
112
|
...
|
|
113
113
|
|
|
@@ -120,14 +120,14 @@ item.docidentifier[0].id
|
|
|
120
120
|
[source,ruby]
|
|
121
121
|
----
|
|
122
122
|
item = RelatonIso::IsoBibliography.get "ISO 19115 (all parts)"
|
|
123
|
-
[relaton-iso] (
|
|
124
|
-
[relaton-iso] (
|
|
123
|
+
[relaton-iso] (ISO 19115) Fetching from iso.org ...
|
|
124
|
+
[relaton-iso] (ISO 19115) Found: `ISO 19115`
|
|
125
125
|
=> #<RelatonIsoBib::IsoBibliographicItem:0x00007f8ca216e118
|
|
126
126
|
...
|
|
127
127
|
|
|
128
128
|
item = RelatonIso::IsoBibliography.get "ISO 19115", nil, all_parts: true
|
|
129
|
-
[relaton-iso] (
|
|
130
|
-
[relaton-iso] (
|
|
129
|
+
[relaton-iso] (ISO 19115) Fetching from iso.org ...
|
|
130
|
+
[relaton-iso] (ISO 19115) Found: `ISO 19115`
|
|
131
131
|
=> #<RelatonIsoBib::IsoBibliographicItem:0x00007f8c830f3d38
|
|
132
132
|
...
|
|
133
133
|
|
|
@@ -135,13 +135,13 @@ item.docidentifier[0].id
|
|
|
135
135
|
=> "ISO 19115 (all parts)"
|
|
136
136
|
|
|
137
137
|
item = RelatonIso::IsoBibliography.get "ISO 19115-1 (all parts)"
|
|
138
|
-
[relaton-iso] (
|
|
139
|
-
[relaton-iso] (
|
|
138
|
+
[relaton-iso] (ISO 19115) Fetching from iso.org ...
|
|
139
|
+
[relaton-iso] (ISO 19115) Found: `ISO 19115`
|
|
140
140
|
=> #<RelatonIsoBib::IsoBibliographicItem:0x00007f8c8290e5a0
|
|
141
141
|
|
|
142
142
|
item = RelatonIso::IsoBibliography.get "ISO 19115-1", nil, all_parts: true
|
|
143
|
-
[relaton-iso] (
|
|
144
|
-
[relaton-iso] (
|
|
143
|
+
[relaton-iso] (ISO 19115) Fetching from iso.org ...
|
|
144
|
+
[relaton-iso] (ISO 19115) Found: `ISO 19115`
|
|
145
145
|
=> #<RelatonIsoBib::IsoBibliographicItem:0x00007f8c925355b8
|
|
146
146
|
...
|
|
147
147
|
|
|
@@ -234,8 +234,8 @@ item.title lang: 'fr'
|
|
|
234
234
|
@type="main">]>
|
|
235
235
|
|
|
236
236
|
item = RelatonIso::IsoBibliography.get "ISO 19115:2003"
|
|
237
|
-
[relaton-iso] (
|
|
238
|
-
[relaton-iso] (
|
|
237
|
+
[relaton-iso] (ISO 19115:2003) Fetching from iso.org ...
|
|
238
|
+
[relaton-iso] (ISO 19115:2003) Found: `ISO 19115:2003`
|
|
239
239
|
=> #<RelatonIsoBib::IsoBibliographicItem:0x00007fa8870b69e0
|
|
240
240
|
|
|
241
241
|
item.abstract lang: 'en'
|
data/lib/relaton_iso/hit.rb
CHANGED
|
@@ -42,8 +42,9 @@ module RelatonIso
|
|
|
42
42
|
# @return [Pubid::Iso::Identifier]
|
|
43
43
|
def pubid
|
|
44
44
|
@pubid ||= Pubid::Iso::Identifier.parse_from_title(hit[:title])
|
|
45
|
-
rescue Pubid::Iso::Errors::WrongTypeError,
|
|
46
|
-
|
|
45
|
+
rescue Pubid::Iso::Errors::WrongTypeError,
|
|
46
|
+
Pubid::Iso::Errors::ParseError => e
|
|
47
|
+
Util.warn "Unable to find an identifier in: `#{hit[:title]}`."
|
|
47
48
|
Util.warn e.message
|
|
48
49
|
end
|
|
49
50
|
end
|
data/lib/relaton_iso/iso_bibliography.rb
CHANGED
|
@@ -38,7 +38,7 @@ module RelatonIso
|
|
|
38
38
|
query_pubid = Pubid::Iso::Identifier.parse(code)
|
|
39
39
|
query_pubid.year = year if year
|
|
40
40
|
query_pubid.part = nil if opts[:all_parts]
|
|
41
|
-
Util.warn "(#{query_pubid}) Fetching from
|
|
41
|
+
Util.warn "(#{query_pubid}) Fetching from iso.org ..."
|
|
42
42
|
|
|
43
43
|
hits, missed_year_ids = isobib_search_filter(query_pubid, opts)
|
|
44
44
|
tip_ids = look_up_with_any_types_stages(hits, ref, opts)
|
|
@@ -54,7 +54,7 @@ module RelatonIso
|
|
|
54
54
|
response_docid = ret.docidentifier.first.id.sub(" (all parts)", "")
|
|
55
55
|
response_pubid = Pubid::Iso::Identifier.parse(response_docid)
|
|
56
56
|
|
|
57
|
-
Util.warn "(#{query_pubid}) Found `#{response_pubid}
|
|
57
|
+
Util.warn "(#{query_pubid}) Found: `#{response_pubid}`"
|
|
58
58
|
|
|
59
59
|
get_all = (
|
|
60
60
|
(query_pubid.year && opts[:keep_year].nil?) ||
|
|
@@ -65,7 +65,7 @@ module RelatonIso
|
|
|
65
65
|
|
|
66
66
|
ret.to_most_recent_reference
|
|
67
67
|
rescue Pubid::Core::Errors::ParseError
|
|
68
|
-
Util.warn "(#{code})
|
|
68
|
+
Util.warn "(#{code}) Is not recognized as a standards identifier."
|
|
69
69
|
nil
|
|
70
70
|
end
|
|
71
71
|
|
|
@@ -179,7 +179,8 @@ module RelatonIso
|
|
|
179
179
|
hit_collection = search(query_pubid_without_year.to_s)
|
|
180
180
|
|
|
181
181
|
# filter only matching hits
|
|
182
|
-
filter_hits hit_collection, query_pubid, opts[:all_parts],
|
|
182
|
+
filter_hits hit_collection, query_pubid, opts[:all_parts],
|
|
183
|
+
any_types_stages
|
|
183
184
|
end
|
|
184
185
|
|
|
185
186
|
#
|
|
@@ -196,7 +197,8 @@ module RelatonIso
|
|
|
196
197
|
# filter out
|
|
197
198
|
result = hit_collection.select do |i|
|
|
198
199
|
hit_pubid = i.pubid
|
|
199
|
-
matches_base?(query_pubid, hit_pubid,
|
|
200
|
+
matches_base?(query_pubid, hit_pubid,
|
|
201
|
+
any_types_stages: any_stypes_tages) &&
|
|
200
202
|
matches_parts?(query_pubid, hit_pubid, all_parts: all_parts) &&
|
|
201
203
|
query_pubid.corrigendums == hit_pubid.corrigendums &&
|
|
202
204
|
query_pubid.amendments == hit_pubid.amendments
|
data/lib/relaton_iso/scrapper.rb
CHANGED
|
@@ -270,11 +270,10 @@ module RelatonIso
|
|
|
270
270
|
# @param doc [Nokogiri::HTML::Document]
|
|
271
271
|
# @return [Hash]
|
|
272
272
|
def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
|
|
273
|
-
wg = doc.at("
|
|
274
|
-
|
|
275
|
-
return unless wg_link
|
|
273
|
+
wg = doc.at("////div[contains(., 'Technical Committe')]/following-sibling::span/a")
|
|
274
|
+
return unless wg
|
|
276
275
|
|
|
277
|
-
workgroup =
|
|
276
|
+
workgroup = wg.text.split "/"
|
|
278
277
|
type = workgroup[1]&.match(/^[A-Z]+/)&.to_s || "TC"
|
|
279
278
|
# {
|
|
280
279
|
# name: "International Organization for Standardization",
|
|
@@ -282,44 +281,48 @@ module RelatonIso
|
|
|
282
281
|
# url: "www.iso.org",
|
|
283
282
|
# }
|
|
284
283
|
tc_numb = workgroup[1]&.match(/\d+/)&.to_s&.to_i
|
|
285
|
-
tc_name = wg
|
|
286
|
-
tc = RelatonBib::WorkGroup.new(name: tc_name, identifier:
|
|
284
|
+
tc_name = wg[:title]
|
|
285
|
+
tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg.text,
|
|
287
286
|
type: type, number: tc_numb)
|
|
288
287
|
RelatonIsoBib::EditorialGroup.new(technical_committee: [tc])
|
|
289
288
|
end
|
|
290
289
|
|
|
291
|
-
# rubocop:disable Metrics/MethodLength
|
|
292
|
-
|
|
293
290
|
# Fetch relations.
|
|
294
291
|
# @param doc [Nokogiri::HTML::Document]
|
|
295
292
|
# @return [Array<Hash>]
|
|
296
|
-
def fetch_relations(doc)
|
|
293
|
+
def fetch_relations(doc)
|
|
297
294
|
types = ["Now", "Now under review"]
|
|
298
295
|
doc.xpath("//ul[@class='steps']/li", "//div[@class='sub-step']").reduce([]) do |a, r|
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
docid: [docid], formattedref: fref, date: date,
|
|
316
|
-
)
|
|
317
|
-
{ type: type, bibitem: bibitem }
|
|
296
|
+
type, date = relation_type(r.at("h4", "h5").text.strip, doc)
|
|
297
|
+
next a if types.include?(type)
|
|
298
|
+
|
|
299
|
+
a + create_relations(r, type, date)
|
|
300
|
+
end
|
|
301
|
+
end
|
|
302
|
+
|
|
303
|
+
def relation_type(type, doc)
|
|
304
|
+
date = []
|
|
305
|
+
t = case type.strip
|
|
306
|
+
when "Previously", "Will be replaced by" then "obsoletes"
|
|
307
|
+
when "Corrigenda / Amendments", "Revised by", "Now confirmed"
|
|
308
|
+
on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
|
|
309
|
+
date << { type: "circulated", on: on.text } if on
|
|
310
|
+
"updates"
|
|
311
|
+
else type
|
|
318
312
|
end
|
|
319
|
-
|
|
313
|
+
[t, date]
|
|
314
|
+
end
|
|
315
|
+
|
|
316
|
+
def create_relations(rel, type, date)
|
|
317
|
+
rel.css("a").map do |id|
|
|
318
|
+
docid = RelatonBib::DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
|
|
319
|
+
fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
|
|
320
|
+
bibitem = RelatonIsoBib::IsoBibliographicItem.new(
|
|
321
|
+
docid: [docid], formattedref: fref, date: date,
|
|
322
|
+
)
|
|
323
|
+
{ type: type, bibitem: bibitem }
|
|
320
324
|
end
|
|
321
325
|
end
|
|
322
|
-
# rubocop:enable Metrics/MethodLength
|
|
323
326
|
|
|
324
327
|
# Fetch type.
|
|
325
328
|
# @param ref [String]
|
|
@@ -343,14 +346,20 @@ module RelatonIso
|
|
|
343
346
|
# @param doc [Nokogiri::HTML::Document]
|
|
344
347
|
# @param lang [String]
|
|
345
348
|
# @return [Array<RelatonBib::TypedTitleString>]
|
|
346
|
-
def fetch_title(doc, lang)
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
349
|
+
def fetch_title(doc, lang) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
|
|
350
|
+
head = doc.at "//nav[contains(@class,'heading-condensed')]"
|
|
351
|
+
types = { "h2" => "title-intro", "h3" => "title-main", "h4" => "title-part" }
|
|
352
|
+
title_types = head.xpath("h2 | h3 | h4").each_with_object({}) do |t, h|
|
|
353
|
+
h[types[t.name]] = t.text
|
|
354
|
+
end
|
|
355
|
+
title = RelatonBib::TypedTitleStringCollection.new
|
|
356
|
+
title_types.each do |type, content|
|
|
357
|
+
title << RelatonBib::TypedTitleString.new(
|
|
358
|
+
type: type, content: content, language: lang, script: script(lang),
|
|
359
|
+
)
|
|
360
|
+
end
|
|
361
|
+
main = title.map { |t| t.title.content }.join " - "
|
|
362
|
+
title << RelatonBib::TypedTitleString.new(type: "main", content: main, language: lang, script: script(lang))
|
|
354
363
|
end
|
|
355
364
|
|
|
356
365
|
# Return ISO script code.
|
|
@@ -363,12 +372,11 @@ module RelatonIso
|
|
|
363
372
|
end
|
|
364
373
|
end
|
|
365
374
|
|
|
366
|
-
# rubocop:disable Metrics/MethodLength
|
|
367
375
|
# Fetch dates
|
|
368
376
|
# @param doc [Nokogiri::HTML::Document]
|
|
369
377
|
# @param ref [String]
|
|
370
378
|
# @return [Array<Hash>]
|
|
371
|
-
def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity
|
|
379
|
+
def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity, Metrics/MethodLength
|
|
372
380
|
dates = []
|
|
373
381
|
%r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ ref
|
|
374
382
|
pub_date_str = doc.xpath("//span[@itemprop='releaseDate']").text
|
|
@@ -400,25 +408,28 @@ module RelatonIso
|
|
|
400
408
|
mem << { entity: publisher, role: [type: "publisher"] }
|
|
401
409
|
end
|
|
402
410
|
end
|
|
403
|
-
# rubocop:enable Metrics/MethodLength
|
|
404
411
|
|
|
405
412
|
# Fetch ICS.
|
|
406
413
|
# @param doc [Nokogiri::HTML::Document]
|
|
407
414
|
# @return [Array<Hash>]
|
|
408
415
|
def fetch_ics(doc)
|
|
409
|
-
doc.xpath("//
|
|
416
|
+
doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
|
|
410
417
|
code = i.text.match(/[\d.]+/).to_s.split "."
|
|
411
418
|
{ field: code[0], group: code[1], subgroup: code[2] }
|
|
412
419
|
end
|
|
413
420
|
end
|
|
414
421
|
|
|
422
|
+
#
|
|
415
423
|
# Fetch links.
|
|
416
|
-
#
|
|
417
|
-
# @param
|
|
424
|
+
#
|
|
425
|
+
# @param doc [Nokogiri::HTML::Document] document to parse
|
|
426
|
+
# @param url [String] document url
|
|
427
|
+
#
|
|
418
428
|
# @return [Array<Hash>]
|
|
429
|
+
#
|
|
419
430
|
def fetch_link(doc, url)
|
|
420
431
|
links = [{ type: "src", content: url }]
|
|
421
|
-
obp = doc.
|
|
432
|
+
obp = doc.at("//h4[contains(@class, 'h5')]/a")
|
|
422
433
|
links << { type: "obp", content: obp[:href] } if obp
|
|
423
434
|
rss = doc.at("//a[contains(@href, 'rss')]")
|
|
424
435
|
links << { type: "rss", content: DOMAIN + rss[:href] } if rss
|
data/lib/relaton_iso/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: relaton-iso
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.16.0
|
|
4
|
+
version: 1.16.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2023-
|
|
11
|
+
date: 2023-10-20 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: algolia
|