relaton-iso 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/.hound.yml +3 -0
- data/.rspec +3 -0
- data/.rubocop.yml +10 -0
- data/.travis.yml +17 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +101 -0
- data/LICENSE.txt +25 -0
- data/README.adoc +202 -0
- data/Rakefile +6 -0
- data/appveyor.yml +35 -0
- data/bin/bundle +105 -0
- data/bin/byebug +29 -0
- data/bin/coderay +29 -0
- data/bin/console +14 -0
- data/bin/gdb_wrapper +29 -0
- data/bin/htmldiff +29 -0
- data/bin/httpclient +29 -0
- data/bin/ldiff +29 -0
- data/bin/nokogiri +29 -0
- data/bin/pry +29 -0
- data/bin/rake +29 -0
- data/bin/rdebug-ide +29 -0
- data/bin/rspec +29 -0
- data/bin/safe_yaml +29 -0
- data/bin/setup +8 -0
- data/lib/relaton/processor.rb +22 -0
- data/lib/relaton_iso.rb +8 -0
- data/lib/relaton_iso/hit.rb +55 -0
- data/lib/relaton_iso/hit_collection.rb +42 -0
- data/lib/relaton_iso/hit_pages.rb +96 -0
- data/lib/relaton_iso/iso_bibliography.rb +132 -0
- data/lib/relaton_iso/scrapper.rb +421 -0
- data/lib/relaton_iso/version.rb +5 -0
- data/relaton_iso.gemspec +44 -0
- metadata +278 -0
data/bin/rspec
ADDED
@@ -0,0 +1,29 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

#
# This file was generated by Bundler.
#
# The application 'rspec' is installed as part of a gem, and
# this file is here to facilitate running it.
#

require "pathname"
ENV["BUNDLE_GEMFILE"] ||= File.expand_path("../../Gemfile",
  Pathname.new(__FILE__).realpath)

bundle_binstub = File.expand_path("../bundle", __FILE__)

if File.file?(bundle_binstub)
  if File.read(bundle_binstub, 300) =~ /This file was generated by Bundler/
    load(bundle_binstub)
  else
    abort("Your `bin/bundle` was not generated by Bundler, so this binstub cannot run.
Replace `bin/bundle` by running `bundle binstubs bundler --force`, then run this command again.")
  end
end

require "rubygems"
require "bundler/setup"

load Gem.bin_path("rspec-core", "rspec")
data/bin/safe_yaml
ADDED
@@ -0,0 +1,29 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

#
# This file was generated by Bundler.
#
# The application 'safe_yaml' is installed as part of a gem, and
# this file is here to facilitate running it.
#

require "pathname"
ENV["BUNDLE_GEMFILE"] ||= File.expand_path("../../Gemfile",
  Pathname.new(__FILE__).realpath)

bundle_binstub = File.expand_path("../bundle", __FILE__)

if File.file?(bundle_binstub)
  if File.read(bundle_binstub, 300) =~ /This file was generated by Bundler/
    load(bundle_binstub)
  else
    abort("Your `bin/bundle` was not generated by Bundler, so this binstub cannot run.
Replace `bin/bundle` by running `bundle binstubs bundler --force`, then run this command again.")
  end
end

require "rubygems"
require "bundler/setup"

load Gem.bin_path("safe_yaml", "safe_yaml")
data/lib/relaton/processor.rb
ADDED
@@ -0,0 +1,22 @@
require "relaton/processor"

module Relaton
  module RelatonIso
    class Processor < Relaton::Processor
      def initialize
        @short = :relaton_iso
        @prefix = "ISO"
        @defaultprefix = %r{^(ISO)[ /]}
        @idtype = "ISO"
      end

      def get(code, date, opts)
        ::RelatonIso::IsoBibliography.get(code, date, opts)
      end

      def from_xml(xml)
        RelatonIsoBib::XMLParser.from_xml xml
      end
    end
  end
end
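A minimal usage sketch of the processor added above, assuming the top-level require "relaton_iso" loads IsoBibliography; the reference code is illustrative, and the relaton registry normally instantiates the processor itself:

require "relaton_iso"

# Direct call to the method the processor delegates to.
item = RelatonIso::IsoBibliography.get("ISO 19115-1", nil, {})
# Equivalent call through the processor, once the relaton gem has loaded it.
item = Relaton::RelatonIso::Processor.new.get("ISO 19115-1", nil, {})
puts item.to_xml if item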
data/lib/relaton_iso/hit.rb
ADDED
@@ -0,0 +1,55 @@
# frozen_string_literal: true

module RelatonIso
  # Hit.
  class Hit
    # @return [RelatonIso::HitCollection]
    attr_reader :hit_collection

    # @return [Array<Hash>]
    attr_reader :hit

    # @param hit [Hash]
    # @param hit_collection [RelatonIso:HitCollection]
    def initialize(hit, hit_collection = nil)
      @hit = hit
      @hit_collection = hit_collection
    end

    # Parse page.
    # @return [RelatonIso::IsoBibliographicItem]
    def fetch
      @fetch ||= Scrapper.parse_page @hit
    end

    # @return [String]
    def to_s
      inspect
    end

    # @return [String]
    def inspect
      matched_words = @hit["_highlightResult"].
        reduce([]) { |a, (_k, v)| a + v["matchedWords"] }.uniq

      "<#{self.class}:#{format('%#.14x', object_id << 1)} "\
        "@text=\"#{@hit_collection&.hit_pages&.text}\" "\
        "@fullIdentifier=\"#{@fetch&.shortref}\" "\
        "@matchedWords=#{matched_words} "\
        "@category=\"#{@hit['category']}\" "\
        "@title=\"#{@hit['title']}\">"
    end

    # @param builder [Nokogiri::XML::Builder]
    def to_xml(builder = nil, **opts)
      if builder
        fetch.to_xml builder, **opts
      else
        builder = Nokogiri::XML::Builder.new(encoding: "UTF-8") do |xml|
          fetch.to_xml xml, **opts
        end
        builder.doc.root.to_xml
      end
    end
  end
end
data/lib/relaton_iso/hit_collection.rb
ADDED
@@ -0,0 +1,42 @@
# frozen_string_literal: true

require "relaton_iso/hit"

module RelatonIso
  # Page of hit collection.
  class HitCollection < Array
    # @return [TrueClass, FalseClass]
    attr_reader :fetched

    # @return [RelatonIso::HitPages]
    attr_reader :hit_pages

    # @param hits [Array<Hash>]
    def initialize(hits, hit_pages = nil)
      concat(hits.map { |h| Hit.new(h, self) })
      @fetched = false
      @hit_pages = hit_pages
    end

    # @return [RelatonIso::HitCollection]
    def fetch
      workers = RelatonBib::WorkersPool.new 4
      workers.worker(&:fetch)
      each do |hit|
        workers << hit
      end
      workers.end
      workers.result
      @fetched = true
      self
    end

    def to_s
      inspect
    end

    def inspect
      "<#{self.class}:#{format('%#.14x', object_id << 1)} @fetched=#{@fetched}>"
    end
  end
end
data/lib/relaton_iso/hit_pages.rb
ADDED
@@ -0,0 +1,96 @@
# frozen_string_literal: true

require "algoliasearch"
require "relaton_iso/hit_collection"

module RelatonIso
  # Pages of hits.
  class HitPages < Array
    Algolia.init application_id: "JCL49WV5AR",
                 api_key: "dd1b9e1ab383f4d4817d29cd5e96d3f0"

    # @return [String]
    attr_reader :text

    # @param text [String]
    def initialize(text)
      @text = text
      @index = Algolia::Index.new "all_en"
      resp = @index.search(text, facetFilters: ["category:standard"])
      @nb_pages = resp["nbPages"]
      self << HitCollection.new(resp["hits"], self)
    end

    # @return [RelatonIso::HitCollection]
    def last
      collection(@nb_pages - 1)
    end

    # @param i [Integer]
    # @return [RelatonIso::HitCollection]
    def [](idx)
      # collection i
      return if idx + 1 > @nb_pages

      collection idx
      super
    end

    # @return [Array]
    def map(&block)
      m = []
      @nb_pages.times do |n|
        m << yield(self[n]) if block
      end
      m
    end

    def each(&block)
      @nb_pages.times do |n|
        yield self[n] if block
      end
    end

    def to_s
      inspect
    end

    def inspect
      "<#{self.class}:#{format('%#.14x', object_id << 1)} @text=#{@text} "\
        "@pages=#{@nb_pages}>"
    end

    # @return [Integer]
    def size
      @nb_pages
    end

    def to_xml(**opts)
      builder = Nokogiri::XML::Builder.new(encoding: "UTF-8") do |xml|
        xml.documents do
          each do |page|
            page.fetch
            page.each { |hit| hit.to_xml xml, **opts }
          end
        end
      end
      builder.to_xml
    end

    private

    # @param i [Integer]
    # @return [RelatonIso::HitCollection]
    def collection(idx)
      return if idx + 1 > @nb_pages

      while Array.instance_method(:size).bind(self).call < idx + 1
        resp = @index.search(@text,
                             facetFilters: ["category:standard"],
                             page: idx)
        self << HitCollection.new(resp["hits"], self)
      end
      Array.instance_method(:[]).bind(self).call idx
    end
  end
end
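A minimal sketch of the search flow implemented by HitPages, HitCollection and Hit above, assuming the top-level require pulls in these classes; the query string is illustrative and each page of hits is fetched lazily from the Algolia index:

require "relaton_iso"

pages = RelatonIso::HitPages.new "ISO 19115" # queries the "all_en" index
puts pages.size                              # number of result pages (nbPages)
collection = pages[0]                        # RelatonIso::HitCollection for the first page
hit = collection.first                       # RelatonIso::Hit wrapping the raw Algolia hash
item = hit.fetch                             # scrapes iso.org via Scrapper.parse_page
collection.fetch                             # resolves every hit on the page concurrently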
data/lib/relaton_iso/iso_bibliography.rb
ADDED
@@ -0,0 +1,132 @@
# frozen_string_literal: true

# require 'relaton_iso/iso_bibliographic_item'
require "relaton_iso/scrapper"
require "relaton_iso/hit_pages"
require "relaton_iec"

module RelatonIso
  # Class methods for search ISO standards.
  class IsoBibliography
    class << self
      # @param text [String]
      # @return [RelatonIso::HitPages]
      def search(text)
        HitPages.new text
      rescue Algolia::AlgoliaProtocolError
        warn "Could not access http://www.iso.org"
        []
      end

      # @param text [String]
      # @return [Array<RelatonIso::IsoBibliographicItem>]
      # def search_and_fetch(text)
      #   Scrapper.get(text)
      # end

      # @param code [String] the ISO standard Code to look up (e..g "ISO 9000")
      # @param year [String] the year the standard was published (optional)
      # @param opts [Hash] options; restricted to :all_parts if all-parts reference is required,
      #   :keep_year if undated reference should return actual reference with year
      # @return [String] Relaton XML serialisation of reference
      def get(code, year, opts)
        if year.nil?
          /^(?<code1>[^:]+):(?<year1>[^:]+)$/ =~ code
          unless code1.nil?
            code = code1
            year = year1
          end
        end
        code += "-1" if opts[:all_parts]
        return Iecbib::IecBibliography.get(code, year, opts) if %r[^ISO/IEC DIR].match code

        ret = isobib_get1(code, year, opts)
        if ret.nil? && code =~ %r[^ISO\s]
          c = code.gsub "ISO", "ISO/IEC"
          warn "Attempting ISO/IEC retrieval"
          ret = isobib_get1(c, year, opts)
        end
        return nil if ret.nil?

        ret.to_most_recent_reference unless year || opts[:keep_year]
        ret.to_all_parts if opts[:all_parts]
        ret
      end

      private

      def fetch_ref_err(code, year, missed_years)
        id = year ? "#{code}:#{year}" : code
        warn "WARNING: no match found online for #{id}. "\
          "The code must be exactly like it is on the standards website."
        warn "(There was no match for #{year}, though there were matches "\
          "found for #{missed_years.join(', ')}.)" unless missed_years.empty?
        if /\d-\d/ =~ code
          warn "The provided document part may not exist, or the document "\
            "may no longer be published in parts."
        else
          warn "If you wanted to cite all document parts for the reference, "\
            "use \"#{code} (all parts)\".\nIf the document is not a standard, "\
            "use its document type abbreviation (TS, TR, PAS, Guide)."
        end
        nil
      end

      def fetch_pages(s, n)
        workers = RelatonBib::WorkersPool.new n
        workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
        s.each_with_index { |hit, i| workers << { i: i, hit: hit } }
        workers.end
        workers.result.sort { |x, y| x[:i] <=> y[:i] }.map { |x| x[:hit] }
      end

      def isobib_search_filter(code)
        docidrx = %r{^(ISO|IEC)[^0-9]*\s[0-9-]+}
        corrigrx = %r{^(ISO|IEC)[^0-9]*\s[0-9-]+:[0-9]+/}
        warn "fetching #{code}..."
        result = search(code)
        result.each do |page|
          ret = page.select do |i|
            i.hit["title"] &&
              i.hit["title"].match(docidrx).to_s == code &&
              corrigrx !~ i.hit["title"]
          end
          return ret unless ret.empty?
        end
        []
      end

      # Sort through the results from RelatonIso, fetching them three at a time,
      # and return the first result that matches the code,
      # matches the year (if provided), and which # has a title (amendments do not).
      # Only expects the first page of results to be populated.
      # Does not match corrigenda etc (e.g. ISO 3166-1:2006/Cor 1:2007)
      # If no match, returns any years which caused mismatch, for error reporting
      def isobib_results_filter(result, year)
        missed_years = []
        result.each_slice(3) do |s| # ISO website only allows 3 connections
          fetch_pages(s, 3).each_with_index do |r, _i|
            next if r.nil?
            return { ret: r } if !year

            r.dates.select { |d| d.type == "published" }.each do |d|
              return { ret: r } if year.to_i == d.on.year

              missed_years << d.on.year
            end
          end
        end
        { years: missed_years }
      end

      def isobib_get1(code, year, _opts)
        # return iev(code) if /^IEC 60050-/.match code
        result = isobib_search_filter(code) || return
        ret = isobib_results_filter(result, year)
        return ret[:ret] if ret[:ret]

        fetch_ref_err(code, year, ret[:years])
      end
    end
  end
end
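A minimal sketch of the public entry point added above; the reference codes and year are illustrative:

require "relaton_iso"

# Dated reference: returns the matching edition, or nil (with a warning) when nothing matches.
item = RelatonIso::IsoBibliography.get("ISO 19115-1", "2014", {})
puts item.to_xml if item

# Undated reference: converted to the most recent reference unless opts[:keep_year] is set;
# :all_parts appends "-1" to the lookup and converts the result with to_all_parts.
all_parts = RelatonIso::IsoBibliography.get("ISO 19115", nil, all_parts: true)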
data/lib/relaton_iso/scrapper.rb
ADDED
@@ -0,0 +1,421 @@
# frozen_string_literal: true

require "algoliasearch"
require "relaton_iso_bib"
require "relaton_iso/hit"
require "nokogiri"
require "net/http"

Algolia.init application_id: "JCL49WV5AR",
             api_key: "dd1b9e1ab383f4d4817d29cd5e96d3f0"

module RelatonIso
  # Scrapper.
  # rubocop:disable Metrics/ModuleLength
  module Scrapper
    DOMAIN = "https://www.iso.org"

    TYPES = {
      "TS" => "technical-specification",
      "TR" => "technical-report",
      "PAS" => "publicly-available-specification",
      # "AWI" => "approvedWorkItem",
      # "CD" => "committeeDraft",
      # "FDIS" => "finalDraftInternationalStandard",
      # "NP" => "newProposal",
      # "DIS" => "draftInternationalStandard",
      # "WD" => "workingDraft",
      # "R" => "recommendation",
      "Guide" => "guide",
    }.freeze

    class << self
      # @param text [String]
      # @return [Array<Hash>]
      # def get(text)
      #   iso_workers = RelatonBib::WorkersPool.new 4
      #   iso_workers.worker { |hit| iso_worker(hit, iso_workers) }
      #   algolia_workers = start_algolia_search(text, iso_workers)
      #   iso_docs = iso_workers.result
      #   algolia_workers.end
      #   algolia_workers.result
      #   iso_docs
      # rescue
      #   warn "Could not connect to http://www.iso.org"
      #   []
      # end

      # Parse page.
      # @param hit [Hash]
      # @return [Hash]
      # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
      def parse_page(hit_data)
        return unless hit_data["path"] =~ /\d+$/

        doc, url = get_page "/standard/#{hit_data['path'].match(/\d+$/)}.html"

        # Fetch edition.
        edition = doc&.xpath("//strong[contains(text(), 'Edition')]/..")&.
          children&.last&.text&.match(/\d+/)&.to_s

        titles, abstract = fetch_titles_abstract(doc)

        RelatonIsoBib::IsoBibliographicItem.new(
          fetched: Date.today.to_s,
          docid: fetch_docid(doc),
          edition: edition,
          language: langs(doc).map { |l| l[:lang] },
          script: langs(doc).map { |l| script(l[:lang]) }.uniq,
          titles: titles,
          type: fetch_type(hit_data["title"]),
          docstatus: fetch_status(doc, hit_data["status"]),
          ics: fetch_ics(doc),
          dates: fetch_dates(doc),
          contributors: fetch_contributors(hit_data["title"]),
          editorialgroup: fetch_workgroup(doc),
          abstract: abstract,
          copyright: fetch_copyright(hit_data["title"], doc),
          link: fetch_link(doc, url),
          relations: fetch_relations(doc),
          structuredidentifier: fetch_structuredidentifier(doc),
        )
      end
      # rubocop:enable Metrics/AbcSize, Metrics/MethodLength

      private

      # Start algolia search workers.
      # @param text[String]
      # @param iso_workers [RelatonBib::WorkersPool]
      # @reaturn [RelatonBib::WorkersPool]
      # def start_algolia_search(text, iso_workers)
      #   index = Algolia::Index.new "all_en"
      #   algolia_workers = RelatonBib::WorkersPool.new
      #   algolia_workers.worker do |page|
      #     algolia_worker(index, text, page, algolia_workers, iso_workers)
      #   end

      #   # Add first page so algolia worker will start.
      #   algolia_workers << 0
      # end

      # Fetch ISO documents.
      # @param hit [Hash]
      # @param isiso_workers [RelatonIso::WorkersPool]
      # def iso_worker(hit, iso_workers)
      #   print "Parse #{iso_workers.size} of #{iso_workers.nb_hits} \r"
      #   parse_page hit
      # end

      # Fetch hits from algolia search service.
      # @param index[Algolia::Index]
      # @param text [String]
      # @param page [Integer]
      # @param algolia_workers [RelatonBib::WorkersPool]
      # @param isiso_workers [RelatonBib::WorkersPool]
      # def algolia_worker(index, text, page, algolia_workers, iso_workers)
      #   res = index.search text, facetFilters: ["category:standard"], page: page
      #   next_page = res["page"] + 1
      #   algolia_workers << next_page if next_page < res["nbPages"]
      #   res["hits"].each do |hit|
      #     iso_workers.nb_hits = res["nbHits"]
      #     iso_workers << hit
      #   end
      #   iso_workers.end unless next_page < res["nbPages"]
      # end

      # Fetch titles and abstracts.
      # @param doc [Nokigiri::HTML::Document]
      # @return [Array<Array>]
      # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
      def fetch_titles_abstract(doc)
        titles = []
        abstract = []
        langs(doc).each do |lang|
          # Don't need to get page for en. We already have it.
          d = lang[:path] ? get_page(lang[:path])[0] : doc

          # Check if unavailable for the lang.
          next if d.css("h5.help-block").any?

          titles << fetch_title(d, lang[:lang])

          # Fetch abstracts.
          abstract_content = d.css("div[itemprop='description'] p").text
          next if abstract_content.empty?

          abstract << {
            content: abstract_content,
            language: lang[:lang],
            script: script(lang[:lang]),
            format: "text/plain",
          }
        end
        [titles, abstract]
      end
      # rubocop:enable Metrics/AbcSize, Metrics/MethodLength

      # Get langs.
      # @param doc [Nokogiri::HTML::Document]
      # @return [Array<Hash>]
      def langs(doc)
        lgs = [{ lang: "en" }]
        doc.css("ul#lang-switcher ul li a").each do |lang_link|
          lang_path = lang_link.attr("href")
          lang = lang_path.match(%r{^\/(fr)\/})
          lgs << { lang: lang[1], path: lang_path } if lang
        end
        lgs
      end

      # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
      # Get page.
      # @param path [String] page's path
      # @return [Array<Nokogiri::HTML::Document, String>]
      def get_page(path)
        url = DOMAIN + path
        uri = URI url
        resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
        if resp.code == "301"
          path = resp["location"]
          url = DOMAIN + path
          uri = URI url
          resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
        end
        n = 0
        while resp.body !~ /<strong/ && n < 10
          resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
          n += 1
        end
        [Nokogiri::HTML(resp.body), url]
      end
      # rubocop:enable Metrics/AbcSize, Metrics/MethodLength

      # Fetch docid.
      # @param doc [Nokogiri::HTML::Document]
      # @return [Array<RelatonBib::DocumentIdentifier>]
      def fetch_docid(doc)
        item_ref = doc.at("//strong[@id='itemReference']")
        return [] unless item_ref

        [RelatonBib::DocumentIdentifier.new(id: item_ref.text, type: "ISO")]
      end

      # @param doc [Nokogiri::HTML::Document]
      def fetch_structuredidentifier(doc)
        item_ref = doc.at("//strong[@id='itemReference']")
        unless item_ref
          return RelatonIsoBib::StructuredIdentifier.new(
            project_number: "?", part_number: "", prefix: nil, id: "?",
          )
        end

        m = item_ref.text.match(/^(.*?\d+)-?((?<=-)\d+|)/)
        RelatonIsoBib::StructuredIdentifier.new(
          project_number: m[1], part_number: m[2], prefix: nil,
          id: item_ref.text, type: "ISO"
        )
      end

      # Fetch status.
      # @param doc [Nokogiri::HTML::Document]
      # @param status [String]
      # @return [Hash]
      def fetch_status(doc, _status)
        stage, substage = doc.css("li.dropdown.active span.stage-code > strong").text.split "."
        RelatonBib::DocumentStatus.new(stage: stage, substage: substage)
      end

      # Fetch workgroup.
      # @param doc [Nokogiri::HTML::Document]
      # @return [Hash]
      def fetch_workgroup(doc)
        wg_link = doc.css("div.entry-name.entry-block a")[0]
        # wg_url = DOMAIN + wg_link['href']
        workgroup = wg_link.text.split "/"
        {
          name: "International Organization for Standardization",
          abbreviation: "ISO",
          url: "www.iso.org",
          technical_committee: [{
            name: wg_link.text + doc.css("div.entry-title")[0].text,
            type: "TC",
            number: workgroup[1]&.match(/\d+/)&.to_s&.to_i,
          }],
        }
      end

      # rubocop:disable Metrics/MethodLength

      # Fetch relations.
      # @param doc [Nokogiri::HTML::Document]
      # @return [Array<Hash>]
      def fetch_relations(doc)
        doc.css("ul.steps li").reduce([]) do |a, r|
          r_type = r.css("strong").text
          type = case r_type
                 when "Previously", "Will be replaced by" then "obsoletes"
                 when "Corrigenda/Amendments", "Revised by", "Now confirmed"
                   "updates"
                 else r_type
                 end
          if ["Now", "Now under review"].include? type
            a
          else
            a + r.css("a").map do |id|
              fref = RelatonBib::FormattedRef.new(
                content: id.text, format: "text/plain",
              )
              bibitem = RelatonIsoBib::IsoBibliographicItem.new(
                formattedref: fref,
              )
              { type: type, bibitem: bibitem }
            end
          end
        end
      end
      # rubocop:enable Metrics/MethodLength

      # Fetch type.
      # @param title [String]
      # @return [String]
      def fetch_type(title)
        type_match = title.match(%r{^(ISO|IWA|IEC)(?:(/IEC|/IEEE|/PRF|
          /NP)*\s|/)(TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))}x)
        # return "international-standard" if type_match.nil?
        if TYPES[type_match[3]]
          TYPES[type_match[3]]
        elsif type_match[1] == "ISO"
          "international-standard"
        elsif type_match[1] == "IWA"
          "international-workshop-agreement"
        end
        # rescue => _e
        #   puts 'Unknown document type: ' + title
      end

      # Fetch titles.
      # @param doc [Nokogiri::HTML::Document]
      # @param lang [String]
      # @return [Hash]
      def fetch_title(doc, lang)
        titles = doc.at("//h3[@itemprop='description'] | //h2[@itemprop='description']").
          text.split " -- "
        case titles.size
        when 0
          intro, main, part = nil, "", nil
        when 1
          intro, main, part = nil, titles[0], nil
        when 2
          if /^(Part|Partie) \d+:/ =~ titles[1]
            intro, main, part = nil, titles[0], titles[1]
          else
            intro, main, part = titles[0], titles[1], nil
          end
        when 3
          intro, main, part = titles[0], titles[1], titles[2]
        else
          intro, main, part = titles[0], titles[1], titles[2..-1]&.join(" -- ")
        end
        {
          title_intro: intro,
          title_main: main,
          title_part: part,
          language: lang,
          script: script(lang),
        }
      end

      # Return ISO script code.
      # @param lang [String]
      # @return [String]
      def script(lang)
        case lang
        when "en", "fr" then "Latn"
        end
      end

      # Fetch dates
      # @param doc [Nokogiri::HTML::Document]
      # @return [Array<Hash>]
      def fetch_dates(doc)
        dates = []
        publish_date = doc.xpath("//span[@itemprop='releaseDate']").text
        unless publish_date.empty?
          dates << { type: "published", on: publish_date }
        end
        dates
      end

      # rubocop:disable Metrics/MethodLength
      def fetch_contributors(title)
        title.sub(/\s.*/, "").split("/").map do |abbrev|
          case abbrev
          when "IEC"
            name = "International Electrotechnical Commission"
            url = "www.iec.ch"
          else
            name = "International Organization for Standardization"
            url = "www.iso.org"
          end
          { entity: { name: name, url: url, abbreviation: abbrev },
            roles: ["publisher"] }
        end
      end
      # rubocop:enable Metrics/MethodLength

      # Fetch ICS.
      # @param doc [Nokogiri::HTML::Document]
      # @return [Array<Hash>]
      def fetch_ics(doc)
        doc.xpath("//strong[contains(text(), "\
                  "'ICS')]/../following-sibling::dd/div/a").map do |i|
          code = i.text.match(/[\d\.]+/).to_s.split "."
          { field: code[0], group: code[1], subgroup: code[2] }
        end
      end

      # Fetch links.
      # @param doc [Nokogiri::HTML::Document]
      # @param url [String]
      # @return [Array<Hash>]
      def fetch_link(doc, url)
        obp_elms = doc.xpath("//a[contains(@href, '/obp/ui/')]")
        obp = obp_elms.attr("href").value if obp_elms.any?
        rss = DOMAIN + doc.xpath("//a[contains(@href, 'rss')]").attr("href").value
        [
          { type: "src", content: url },
          { type: "obp", content: obp },
          { type: "rss", content: rss },
        ]
      end

      # Fetch copyright.
      # @param title [String]
      # @return [Hash]
      def fetch_copyright(title, doc)
        owner_name = title.match(/.*?(?=\s)/).to_s
        from = title.match(/(?<=:)\d{4}/).to_s
        if from.empty?
          from = doc.xpath("//span[@itemprop='releaseDate']").text.match(/\d{4}/).to_s
        end
        { owner: { name: owner_name }, from: from }
      end
    end

    # private
    #
    # def next_hits_page(next_page)
    #   page = @index.search @text, facetFilters: ['category:standard'],
    #                        page: next_page
    #   page.each do |key, value|
    #     if key == 'hits'
    #       @docs[key] += value
    #     else
    #       @docs[key] = value
    #     end
    #   end
    # end
  end
  # rubocop:enable Metrics/ModuleLength
end
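Scrapper.parse_page above is normally reached through Hit#fetch; a minimal sketch of the hit hash it expects, with illustrative "path", "title" and "status" values (the call makes live requests to iso.org):

require "relaton_iso/scrapper"

# "path" must end in the numeric iso.org standard identifier; "title" feeds fetch_type,
# fetch_contributors and fetch_copyright; the "status" argument is currently unused.
hit_data = {
  "path"   => "/standard/53798",
  "title"  => "ISO 19115-1:2014",
  "status" => "Published",
}
item = RelatonIso::Scrapper.parse_page(hit_data) # => RelatonIsoBib::IsoBibliographicItem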