relaton-iso 1.16.1 → 1.16.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/relaton_iso/scrapper.rb +56 -45
- data/lib/relaton_iso/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 479a728a58c56799448fd6d468e0d19fe245b731119f8dcd9ae6f19a7b624e07
|
4
|
+
data.tar.gz: ac89507180ca01978bfe98b68fbe02450f2c33015bd38d788752f3bf933911ad
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 71cc49dc2afa8690f02f7035ec5cc13981eb620e2b8c3792456401c152a4ca8192b2ffbd7445c6c982886e61f679427a2d5afbf26e13c6ebcfffcc8d54f7e5c9
|
7
|
+
data.tar.gz: 853da0772a998533c5f461ff297bef978c75e1f58b2df1fec5eff0fea6d306807420453a6ba37b1348d44e58c3a71dcf743228c7703b73fd9b7d72c9d4309598
|
data/lib/relaton_iso/scrapper.rb
CHANGED
@@ -270,11 +270,10 @@ module RelatonIso
|
|
270
270
|
# @param doc [Nokogiri::HTML::Document]
|
271
271
|
# @return [Hash]
|
272
272
|
def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
|
273
|
-
wg = doc.at("
|
274
|
-
|
275
|
-
return unless wg_link
|
273
|
+
wg = doc.at("////div[contains(., 'Technical Committe')]/following-sibling::span/a")
|
274
|
+
return unless wg
|
276
275
|
|
277
|
-
workgroup =
|
276
|
+
workgroup = wg.text.split "/"
|
278
277
|
type = workgroup[1]&.match(/^[A-Z]+/)&.to_s || "TC"
|
279
278
|
# {
|
280
279
|
# name: "International Organization for Standardization",
|
@@ -282,44 +281,48 @@ module RelatonIso
|
|
282
281
|
# url: "www.iso.org",
|
283
282
|
# }
|
284
283
|
tc_numb = workgroup[1]&.match(/\d+/)&.to_s&.to_i
|
285
|
-
tc_name = wg
|
286
|
-
tc = RelatonBib::WorkGroup.new(name: tc_name, identifier:
|
284
|
+
tc_name = wg[:title]
|
285
|
+
tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg.text,
|
287
286
|
type: type, number: tc_numb)
|
288
287
|
RelatonIsoBib::EditorialGroup.new(technical_committee: [tc])
|
289
288
|
end
|
290
289
|
|
291
|
-
# rubocop:disable Metrics/MethodLength
|
292
|
-
|
293
290
|
# Fetch relations.
|
294
291
|
# @param doc [Nokogiri::HTML::Document]
|
295
292
|
# @return [Array<Hash>]
|
296
|
-
def fetch_relations(doc)
|
293
|
+
def fetch_relations(doc)
|
297
294
|
types = ["Now", "Now under review"]
|
298
295
|
doc.xpath("//ul[@class='steps']/li", "//div[@class='sub-step']").reduce([]) do |a, r|
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
docid: [docid], formattedref: fref, date: date,
|
316
|
-
)
|
317
|
-
{ type: type, bibitem: bibitem }
|
296
|
+
type, date = relation_type(r.at("h4", "h5").text.strip, doc)
|
297
|
+
next a if types.include?(type)
|
298
|
+
|
299
|
+
a + create_relations(r, type, date)
|
300
|
+
end
|
301
|
+
end
|
302
|
+
|
303
|
+
def relation_type(type, doc)
|
304
|
+
date = []
|
305
|
+
t = case type.strip
|
306
|
+
when "Previously", "Will be replaced by" then "obsoletes"
|
307
|
+
when "Corrigenda / Amendments", "Revised by", "Now confirmed"
|
308
|
+
on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
|
309
|
+
date << { type: "circulated", on: on.text } if on
|
310
|
+
"updates"
|
311
|
+
else type
|
318
312
|
end
|
319
|
-
|
313
|
+
[t, date]
|
314
|
+
end
|
315
|
+
|
316
|
+
def create_relations(rel, type, date)
|
317
|
+
rel.css("a").map do |id|
|
318
|
+
docid = RelatonBib::DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
|
319
|
+
fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
|
320
|
+
bibitem = RelatonIsoBib::IsoBibliographicItem.new(
|
321
|
+
docid: [docid], formattedref: fref, date: date,
|
322
|
+
)
|
323
|
+
{ type: type, bibitem: bibitem }
|
320
324
|
end
|
321
325
|
end
|
322
|
-
# rubocop:enable Metrics/MethodLength
|
323
326
|
|
324
327
|
# Fetch type.
|
325
328
|
# @param ref [String]
|
@@ -343,14 +346,20 @@ module RelatonIso
|
|
343
346
|
# @param doc [Nokogiri::HTML::Document]
|
344
347
|
# @param lang [String]
|
345
348
|
# @return [Array<RelatonBib::TypedTitleString>]
|
346
|
-
def fetch_title(doc, lang)
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
349
|
+
def fetch_title(doc, lang) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
|
350
|
+
head = doc.at "//nav[contains(@class,'heading-condensed')]"
|
351
|
+
types = { "h2" => "title-intro", "h3" => "title-main", "h4" => "title-part" }
|
352
|
+
title_types = head.xpath("h2 | h3 | h4").each_with_object({}) do |t, h|
|
353
|
+
h[types[t.name]] = t.text
|
354
|
+
end
|
355
|
+
title = RelatonBib::TypedTitleStringCollection.new
|
356
|
+
title_types.each do |type, content|
|
357
|
+
title << RelatonBib::TypedTitleString.new(
|
358
|
+
type: type, content: content, language: lang, script: script(lang),
|
359
|
+
)
|
360
|
+
end
|
361
|
+
main = title.map { |t| t.title.content }.join " - "
|
362
|
+
title << RelatonBib::TypedTitleString.new(type: "main", content: main, language: lang, script: script(lang))
|
354
363
|
end
|
355
364
|
|
356
365
|
# Return ISO script code.
|
@@ -363,12 +372,11 @@ module RelatonIso
|
|
363
372
|
end
|
364
373
|
end
|
365
374
|
|
366
|
-
# rubocop:disable Metrics/MethodLength
|
367
375
|
# Fetch dates
|
368
376
|
# @param doc [Nokogiri::HTML::Document]
|
369
377
|
# @param ref [String]
|
370
378
|
# @return [Array<Hash>]
|
371
|
-
def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity
|
379
|
+
def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity, Metrics/MethodLength
|
372
380
|
dates = []
|
373
381
|
%r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ ref
|
374
382
|
pub_date_str = doc.xpath("//span[@itemprop='releaseDate']").text
|
@@ -400,25 +408,28 @@ module RelatonIso
|
|
400
408
|
mem << { entity: publisher, role: [type: "publisher"] }
|
401
409
|
end
|
402
410
|
end
|
403
|
-
# rubocop:enable Metrics/MethodLength
|
404
411
|
|
405
412
|
# Fetch ICS.
|
406
413
|
# @param doc [Nokogiri::HTML::Document]
|
407
414
|
# @return [Array<Hash>]
|
408
415
|
def fetch_ics(doc)
|
409
|
-
doc.xpath("//
|
416
|
+
doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
|
410
417
|
code = i.text.match(/[\d.]+/).to_s.split "."
|
411
418
|
{ field: code[0], group: code[1], subgroup: code[2] }
|
412
419
|
end
|
413
420
|
end
|
414
421
|
|
422
|
+
#
|
415
423
|
# Fetch links.
|
416
|
-
#
|
417
|
-
# @param
|
424
|
+
#
|
425
|
+
# @param doc [Nokogiri::HTML::Document] document to parse
|
426
|
+
# @param url [String] document url
|
427
|
+
#
|
418
428
|
# @return [Array<Hash>]
|
429
|
+
#
|
419
430
|
def fetch_link(doc, url)
|
420
431
|
links = [{ type: "src", content: url }]
|
421
|
-
obp = doc.
|
432
|
+
obp = doc.at("//h4[contains(@class, 'h5')]/a")
|
422
433
|
links << { type: "obp", content: obp[:href] } if obp
|
423
434
|
rss = doc.at("//a[contains(@href, 'rss')]")
|
424
435
|
links << { type: "rss", content: DOMAIN + rss[:href] } if rss
|
data/lib/relaton_iso/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-iso
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.16.1
|
4
|
+
version: 1.16.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-10-
|
11
|
+
date: 2023-10-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: algolia
|