relaton-iso 1.16.1 → 1.16.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/relaton_iso/scrapper.rb +56 -45
- data/lib/relaton_iso/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 479a728a58c56799448fd6d468e0d19fe245b731119f8dcd9ae6f19a7b624e07
|
4
|
+
data.tar.gz: ac89507180ca01978bfe98b68fbe02450f2c33015bd38d788752f3bf933911ad
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 71cc49dc2afa8690f02f7035ec5cc13981eb620e2b8c3792456401c152a4ca8192b2ffbd7445c6c982886e61f679427a2d5afbf26e13c6ebcfffcc8d54f7e5c9
|
7
|
+
data.tar.gz: 853da0772a998533c5f461ff297bef978c75e1f58b2df1fec5eff0fea6d306807420453a6ba37b1348d44e58c3a71dcf743228c7703b73fd9b7d72c9d4309598
|
data/lib/relaton_iso/scrapper.rb
CHANGED
@@ -270,11 +270,10 @@ module RelatonIso
|
|
270
270
|
# @param doc [Nokogiri::HTML::Document]
|
271
271
|
# @return [Hash]
|
272
272
|
def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
|
273
|
-
wg = doc.at("
|
274
|
-
|
275
|
-
return unless wg_link
|
273
|
+
wg = doc.at("////div[contains(., 'Technical Committe')]/following-sibling::span/a")
|
274
|
+
return unless wg
|
276
275
|
|
277
|
-
workgroup =
|
276
|
+
workgroup = wg.text.split "/"
|
278
277
|
type = workgroup[1]&.match(/^[A-Z]+/)&.to_s || "TC"
|
279
278
|
# {
|
280
279
|
# name: "International Organization for Standardization",
|
@@ -282,44 +281,48 @@ module RelatonIso
|
|
282
281
|
# url: "www.iso.org",
|
283
282
|
# }
|
284
283
|
tc_numb = workgroup[1]&.match(/\d+/)&.to_s&.to_i
|
285
|
-
tc_name = wg
|
286
|
-
tc = RelatonBib::WorkGroup.new(name: tc_name, identifier:
|
284
|
+
tc_name = wg[:title]
|
285
|
+
tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg.text,
|
287
286
|
type: type, number: tc_numb)
|
288
287
|
RelatonIsoBib::EditorialGroup.new(technical_committee: [tc])
|
289
288
|
end
|
290
289
|
|
291
|
-
# rubocop:disable Metrics/MethodLength
|
292
|
-
|
293
290
|
# Fetch relations.
|
294
291
|
# @param doc [Nokogiri::HTML::Document]
|
295
292
|
# @return [Array<Hash>]
|
296
|
-
def fetch_relations(doc)
|
293
|
+
def fetch_relations(doc)
|
297
294
|
types = ["Now", "Now under review"]
|
298
295
|
doc.xpath("//ul[@class='steps']/li", "//div[@class='sub-step']").reduce([]) do |a, r|
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
docid: [docid], formattedref: fref, date: date,
|
316
|
-
)
|
317
|
-
{ type: type, bibitem: bibitem }
|
296
|
+
type, date = relation_type(r.at("h4", "h5").text.strip, doc)
|
297
|
+
next a if types.include?(type)
|
298
|
+
|
299
|
+
a + create_relations(r, type, date)
|
300
|
+
end
|
301
|
+
end
|
302
|
+
|
303
|
+
def relation_type(type, doc)
|
304
|
+
date = []
|
305
|
+
t = case type.strip
|
306
|
+
when "Previously", "Will be replaced by" then "obsoletes"
|
307
|
+
when "Corrigenda / Amendments", "Revised by", "Now confirmed"
|
308
|
+
on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
|
309
|
+
date << { type: "circulated", on: on.text } if on
|
310
|
+
"updates"
|
311
|
+
else type
|
318
312
|
end
|
319
|
-
|
313
|
+
[t, date]
|
314
|
+
end
|
315
|
+
|
316
|
+
def create_relations(rel, type, date)
|
317
|
+
rel.css("a").map do |id|
|
318
|
+
docid = RelatonBib::DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
|
319
|
+
fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
|
320
|
+
bibitem = RelatonIsoBib::IsoBibliographicItem.new(
|
321
|
+
docid: [docid], formattedref: fref, date: date,
|
322
|
+
)
|
323
|
+
{ type: type, bibitem: bibitem }
|
320
324
|
end
|
321
325
|
end
|
322
|
-
# rubocop:enable Metrics/MethodLength
|
323
326
|
|
324
327
|
# Fetch type.
|
325
328
|
# @param ref [String]
|
@@ -343,14 +346,20 @@ module RelatonIso
|
|
343
346
|
# @param doc [Nokogiri::HTML::Document]
|
344
347
|
# @param lang [String]
|
345
348
|
# @return [Array<RelatonBib::TypedTitleString>]
|
346
|
-
def fetch_title(doc, lang)
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
349
|
+
def fetch_title(doc, lang) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
|
350
|
+
head = doc.at "//nav[contains(@class,'heading-condensed')]"
|
351
|
+
types = { "h2" => "title-intro", "h3" => "title-main", "h4" => "title-part" }
|
352
|
+
title_types = head.xpath("h2 | h3 | h4").each_with_object({}) do |t, h|
|
353
|
+
h[types[t.name]] = t.text
|
354
|
+
end
|
355
|
+
title = RelatonBib::TypedTitleStringCollection.new
|
356
|
+
title_types.each do |type, content|
|
357
|
+
title << RelatonBib::TypedTitleString.new(
|
358
|
+
type: type, content: content, language: lang, script: script(lang),
|
359
|
+
)
|
360
|
+
end
|
361
|
+
main = title.map { |t| t.title.content }.join " - "
|
362
|
+
title << RelatonBib::TypedTitleString.new(type: "main", content: main, language: lang, script: script(lang))
|
354
363
|
end
|
355
364
|
|
356
365
|
# Return ISO script code.
|
@@ -363,12 +372,11 @@ module RelatonIso
|
|
363
372
|
end
|
364
373
|
end
|
365
374
|
|
366
|
-
# rubocop:disable Metrics/MethodLength
|
367
375
|
# Fetch dates
|
368
376
|
# @param doc [Nokogiri::HTML::Document]
|
369
377
|
# @param ref [String]
|
370
378
|
# @return [Array<Hash>]
|
371
|
-
def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity
|
379
|
+
def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity, Metrics/MethodLength
|
372
380
|
dates = []
|
373
381
|
%r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ ref
|
374
382
|
pub_date_str = doc.xpath("//span[@itemprop='releaseDate']").text
|
@@ -400,25 +408,28 @@ module RelatonIso
|
|
400
408
|
mem << { entity: publisher, role: [type: "publisher"] }
|
401
409
|
end
|
402
410
|
end
|
403
|
-
# rubocop:enable Metrics/MethodLength
|
404
411
|
|
405
412
|
# Fetch ICS.
|
406
413
|
# @param doc [Nokogiri::HTML::Document]
|
407
414
|
# @return [Array<Hash>]
|
408
415
|
def fetch_ics(doc)
|
409
|
-
doc.xpath("//
|
416
|
+
doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
|
410
417
|
code = i.text.match(/[\d.]+/).to_s.split "."
|
411
418
|
{ field: code[0], group: code[1], subgroup: code[2] }
|
412
419
|
end
|
413
420
|
end
|
414
421
|
|
422
|
+
#
|
415
423
|
# Fetch links.
|
416
|
-
#
|
417
|
-
# @param
|
424
|
+
#
|
425
|
+
# @param doc [Nokogiri::HTML::Document] document to parse
|
426
|
+
# @param url [String] document url
|
427
|
+
#
|
418
428
|
# @return [Array<Hash>]
|
429
|
+
#
|
419
430
|
def fetch_link(doc, url)
|
420
431
|
links = [{ type: "src", content: url }]
|
421
|
-
obp = doc.
|
432
|
+
obp = doc.at("//h4[contains(@class, 'h5')]/a")
|
422
433
|
links << { type: "obp", content: obp[:href] } if obp
|
423
434
|
rss = doc.at("//a[contains(@href, 'rss')]")
|
424
435
|
links << { type: "rss", content: DOMAIN + rss[:href] } if rss
|
data/lib/relaton_iso/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-iso
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.16.
|
4
|
+
version: 1.16.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-10-
|
11
|
+
date: 2023-10-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: algolia
|