relaton-iso 0.5.0
- checksums.yaml +7 -0
- data/.gitignore +16 -0
- data/.hound.yml +3 -0
- data/.rspec +3 -0
- data/.rubocop.yml +10 -0
- data/.travis.yml +17 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +101 -0
- data/LICENSE.txt +25 -0
- data/README.adoc +202 -0
- data/Rakefile +6 -0
- data/appveyor.yml +35 -0
- data/bin/bundle +105 -0
- data/bin/byebug +29 -0
- data/bin/coderay +29 -0
- data/bin/console +14 -0
- data/bin/gdb_wrapper +29 -0
- data/bin/htmldiff +29 -0
- data/bin/httpclient +29 -0
- data/bin/ldiff +29 -0
- data/bin/nokogiri +29 -0
- data/bin/pry +29 -0
- data/bin/rake +29 -0
- data/bin/rdebug-ide +29 -0
- data/bin/rspec +29 -0
- data/bin/safe_yaml +29 -0
- data/bin/setup +8 -0
- data/lib/relaton/processor.rb +22 -0
- data/lib/relaton_iso.rb +8 -0
- data/lib/relaton_iso/hit.rb +55 -0
- data/lib/relaton_iso/hit_collection.rb +42 -0
- data/lib/relaton_iso/hit_pages.rb +96 -0
- data/lib/relaton_iso/iso_bibliography.rb +132 -0
- data/lib/relaton_iso/scrapper.rb +421 -0
- data/lib/relaton_iso/version.rb +5 -0
- data/relaton_iso.gemspec +44 -0
- metadata +278 -0
data/bin/rspec
ADDED
@@ -0,0 +1,29 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+#
+# This file was generated by Bundler.
+#
+# The application 'rspec' is installed as part of a gem, and
+# this file is here to facilitate running it.
+#
+
+require "pathname"
+ENV["BUNDLE_GEMFILE"] ||= File.expand_path("../../Gemfile",
+  Pathname.new(__FILE__).realpath)
+
+bundle_binstub = File.expand_path("../bundle", __FILE__)
+
+if File.file?(bundle_binstub)
+  if File.read(bundle_binstub, 300) =~ /This file was generated by Bundler/
+    load(bundle_binstub)
+  else
+    abort("Your `bin/bundle` was not generated by Bundler, so this binstub cannot run.
+Replace `bin/bundle` by running `bundle binstubs bundler --force`, then run this command again.")
+  end
+end
+
+require "rubygems"
+require "bundler/setup"
+
+load Gem.bin_path("rspec-core", "rspec")
data/bin/safe_yaml
ADDED
@@ -0,0 +1,29 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+#
+# This file was generated by Bundler.
+#
+# The application 'safe_yaml' is installed as part of a gem, and
+# this file is here to facilitate running it.
+#
+
+require "pathname"
+ENV["BUNDLE_GEMFILE"] ||= File.expand_path("../../Gemfile",
+  Pathname.new(__FILE__).realpath)
+
+bundle_binstub = File.expand_path("../bundle", __FILE__)
+
+if File.file?(bundle_binstub)
+  if File.read(bundle_binstub, 300) =~ /This file was generated by Bundler/
+    load(bundle_binstub)
+  else
+    abort("Your `bin/bundle` was not generated by Bundler, so this binstub cannot run.
+Replace `bin/bundle` by running `bundle binstubs bundler --force`, then run this command again.")
+  end
+end
+
+require "rubygems"
+require "bundler/setup"
+
+load Gem.bin_path("safe_yaml", "safe_yaml")
data/lib/relaton/processor.rb
ADDED
@@ -0,0 +1,22 @@
+require "relaton/processor"
+
+module Relaton
+  module RelatonIso
+    class Processor < Relaton::Processor
+      def initialize
+        @short = :relaton_iso
+        @prefix = "ISO"
+        @defaultprefix = %r{^(ISO)[ /]}
+        @idtype = "ISO"
+      end
+
+      def get(code, date, opts)
+        ::RelatonIso::IsoBibliography.get(code, date, opts)
+      end
+
+      def from_xml(xml)
+        RelatonIsoBib::XMLParser.from_xml xml
+      end
+    end
+  end
+end
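The processor above wires relaton_iso into the wider Relaton framework: it declares the "ISO" prefix and default-prefix regexp, and forwards lookups to RelatonIso::IsoBibliography. A rough sketch of that delegation follows; how the relaton registry actually loads and selects processors is outside this diff, so the require lines are assumptions.

    require "relaton_iso"            # assumption: loads IsoBibliography and its dependencies
    require "relaton/processor"      # assumption: the Relaton::Processor base class is loadable

    processor = Relaton::RelatonIso::Processor.new
    # "ISO 9000" is an illustrative reference; get forwards to RelatonIso::IsoBibliography.get
    item = processor.get("ISO 9000", nil, {})
    puts item.to_xml if item         # to_xml is used on these items elsewhere in this diff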
data/lib/relaton_iso/hit.rb
ADDED
@@ -0,0 +1,55 @@
+# frozen_string_literal: true
+
+module RelatonIso
+  # Hit.
+  class Hit
+    # @return [RelatonIso::HitCollection]
+    attr_reader :hit_collection
+
+    # @return [Array<Hash>]
+    attr_reader :hit
+
+    # @param hit [Hash]
+    # @param hit_collection [RelatonIso::HitCollection]
+    def initialize(hit, hit_collection = nil)
+      @hit = hit
+      @hit_collection = hit_collection
+    end
+
+    # Parse page.
+    # @return [RelatonIso::IsoBibliographicItem]
+    def fetch
+      @fetch ||= Scrapper.parse_page @hit
+    end
+
+    # @return [String]
+    def to_s
+      inspect
+    end
+
+    # @return [String]
+    def inspect
+      matched_words = @hit["_highlightResult"].
+        reduce([]) { |a, (_k, v)| a + v["matchedWords"] }.uniq
+
+      "<#{self.class}:#{format('%#.14x', object_id << 1)} "\
+      "@text=\"#{@hit_collection&.hit_pages&.text}\" "\
+      "@fullIdentifier=\"#{@fetch&.shortref}\" "\
+      "@matchedWords=#{matched_words} "\
+      "@category=\"#{@hit['category']}\" "\
+      "@title=\"#{@hit['title']}\">"
+    end
+
+    # @param builder [Nokogiri::XML::Builder]
+    def to_xml(builder = nil, **opts)
+      if builder
+        fetch.to_xml builder, **opts
+      else
+        builder = Nokogiri::XML::Builder.new(encoding: "UTF-8") do |xml|
+          fetch.to_xml xml, **opts
+        end
+        builder.doc.root.to_xml
+      end
+    end
+  end
+end
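A Hit wraps a single Algolia search hit. Its fetch method scrapes the matching iso.org page exactly once and memoizes the result, and to_xml either writes into a supplied Nokogiri builder or produces a standalone fragment. A minimal sketch, assuming the illustrative hit hash below (real hashes come from the Algolia index used by HitPages) and noting that fetch performs a live HTTP request:

    # illustrative hit hash, not real search output
    hit = RelatonIso::Hit.new("title" => "ISO 9000:2015", "path" => "/standard/45481")
    item = hit.fetch     # Scrapper.parse_page runs once; the result is memoized
    puts hit.to_xml      # no builder given, so a standalone XML fragment is returned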
data/lib/relaton_iso/hit_collection.rb
ADDED
@@ -0,0 +1,42 @@
+# frozen_string_literal: true
+
+require "relaton_iso/hit"
+
+module RelatonIso
+  # Page of hit collection.
+  class HitCollection < Array
+    # @return [TrueClass, FalseClass]
+    attr_reader :fetched
+
+    # @return [RelatonIso::HitPages]
+    attr_reader :hit_pages
+
+    # @param hits [Array<Hash>]
+    def initialize(hits, hit_pages = nil)
+      concat(hits.map { |h| Hit.new(h, self) })
+      @fetched = false
+      @hit_pages = hit_pages
+    end
+
+    # @return [RelatonIso::HitCollection]
+    def fetch
+      workers = RelatonBib::WorkersPool.new 4
+      workers.worker(&:fetch)
+      each do |hit|
+        workers << hit
+      end
+      workers.end
+      workers.result
+      @fetched = true
+      self
+    end
+
+    def to_s
+      inspect
+    end
+
+    def inspect
+      "<#{self.class}:#{format('%#.14x', object_id << 1)} @fetched=#{@fetched}>"
+    end
+  end
+end
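A HitCollection holds the hits of one results page and can resolve them in parallel: fetch pushes every hit through a four-worker RelatonBib::WorkersPool and then marks the collection as fetched. A short sketch, assuming the search (defined later in this diff) succeeds and returns at least one page:

    pages = RelatonIso::IsoBibliography.search("ISO 9000")  # illustrative query; returns HitPages
    page = pages[0]                                         # first HitCollection
    page.fetch                                              # scrape all hits on this page in parallel
    puts page.fetched                                       # => true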
data/lib/relaton_iso/hit_pages.rb
ADDED
@@ -0,0 +1,96 @@
+# frozen_string_literal: true
+
+require "algoliasearch"
+require "relaton_iso/hit_collection"
+
+module RelatonIso
+  # Pages of hits.
+  class HitPages < Array
+    Algolia.init application_id: "JCL49WV5AR",
+                 api_key: "dd1b9e1ab383f4d4817d29cd5e96d3f0"
+
+    # @return [String]
+    attr_reader :text
+
+    # @param text [String]
+    def initialize(text)
+      @text = text
+      @index = Algolia::Index.new "all_en"
+      resp = @index.search(text, facetFilters: ["category:standard"])
+      @nb_pages = resp["nbPages"]
+      self << HitCollection.new(resp["hits"], self)
+    end
+
+    # @return [RelatonIso::HitCollection]
+    def last
+      collection(@nb_pages - 1)
+    end
+
+    # @param i [Integer]
+    # @return [RelatonIso::HitCollection]
+    def [](idx)
+      # collection i
+      return if idx + 1 > @nb_pages
+
+      collection idx
+      super
+    end
+
+    # @return [Array]
+    def map(&block)
+      m = []
+      @nb_pages.times do |n|
+        m << yield(self[n]) if block
+      end
+      m
+    end
+
+    def each(&block)
+      @nb_pages.times do |n|
+        yield self[n] if block
+      end
+    end
+
+    def to_s
+      inspect
+    end
+
+    def inspect
+      "<#{self.class}:#{format('%#.14x', object_id << 1)} @text=#{@text} "\
+      "@pages=#{@nb_pages}>"
+    end
+
+    # @return [Integer]
+    def size
+      @nb_pages
+    end
+
+    def to_xml(**opts)
+      builder = Nokogiri::XML::Builder.new(encoding: "UTF-8") do |xml|
+        xml.documents do
+          each do |page|
+            page.fetch
+            page.each { |hit| hit.to_xml xml, **opts }
+          end
+        end
+      end
+      builder.to_xml
+    end
+
+    private
+
+    # @param i [Integer]
+    # @return [RelatonIso::HitCollection]
+    def collection(idx)
+      return if idx + 1 > @nb_pages
+
+      while Array.instance_method(:size).bind(self).call < idx + 1
+        resp = @index.search(@text,
+                             facetFilters: ["category:standard"],
+                             page: idx)
+        self << HitCollection.new(resp["hits"], self)
+      end
+      Array.instance_method(:[]).bind(self).call idx
+    end
+  end
+end
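HitPages runs the initial Algolia query eagerly and loads further result pages lazily: indexing into the object triggers an extra page-scoped search only when that page has not been loaded yet, while size reports the total page count Algolia returned. A minimal sketch with an illustrative query string:

    pages = RelatonIso::HitPages.new "ISO 9000"  # performs the first search (page 0)
    puts pages.size                              # total number of result pages (nbPages)
    first  = pages[0]                            # already cached by the constructor
    second = pages[1]                            # fetched on demand; nil if out of range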
data/lib/relaton_iso/iso_bibliography.rb
ADDED
@@ -0,0 +1,132 @@
+# frozen_string_literal: true
+
+# require 'relaton_iso/iso_bibliographic_item'
+require "relaton_iso/scrapper"
+require "relaton_iso/hit_pages"
+require "relaton_iec"
+
+module RelatonIso
+  # Class methods for search ISO standards.
+  class IsoBibliography
+    class << self
+      # @param text [String]
+      # @return [RelatonIso::HitPages]
+      def search(text)
+        HitPages.new text
+      rescue Algolia::AlgoliaProtocolError
+        warn "Could not access http://www.iso.org"
+        []
+      end
+
+      # @param text [String]
+      # @return [Array<RelatonIso::IsoBibliographicItem>]
+      # def search_and_fetch(text)
+      #   Scrapper.get(text)
+      # end
+
+      # @param code [String] the ISO standard Code to look up (e.g. "ISO 9000")
+      # @param year [String] the year the standard was published (optional)
+      # @param opts [Hash] options; restricted to :all_parts if all-parts reference is required,
+      #   :keep_year if undated reference should return actual reference with year
+      # @return [String] Relaton XML serialisation of reference
+      def get(code, year, opts)
+        if year.nil?
+          /^(?<code1>[^:]+):(?<year1>[^:]+)$/ =~ code
+          unless code1.nil?
+            code = code1
+            year = year1
+          end
+        end
+        code += "-1" if opts[:all_parts]
+        return Iecbib::IecBibliography.get(code, year, opts) if %r[^ISO/IEC DIR].match code
+
+        ret = isobib_get1(code, year, opts)
+        if ret.nil? && code =~ %r[^ISO\s]
+          c = code.gsub "ISO", "ISO/IEC"
+          warn "Attempting ISO/IEC retrieval"
+          ret = isobib_get1(c, year, opts)
+        end
+        return nil if ret.nil?
+
+        ret.to_most_recent_reference unless year || opts[:keep_year]
+        ret.to_all_parts if opts[:all_parts]
+        ret
+      end
+
+      private
+
+      def fetch_ref_err(code, year, missed_years)
+        id = year ? "#{code}:#{year}" : code
+        warn "WARNING: no match found online for #{id}. "\
+          "The code must be exactly like it is on the standards website."
+        warn "(There was no match for #{year}, though there were matches "\
+          "found for #{missed_years.join(', ')}.)" unless missed_years.empty?
+        if /\d-\d/ =~ code
+          warn "The provided document part may not exist, or the document "\
+            "may no longer be published in parts."
+        else
+          warn "If you wanted to cite all document parts for the reference, "\
+            "use \"#{code} (all parts)\".\nIf the document is not a standard, "\
+            "use its document type abbreviation (TS, TR, PAS, Guide)."
+        end
+        nil
+      end
+
+      def fetch_pages(s, n)
+        workers = RelatonBib::WorkersPool.new n
+        workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
+        s.each_with_index { |hit, i| workers << { i: i, hit: hit } }
+        workers.end
+        workers.result.sort { |x, y| x[:i] <=> y[:i] }.map { |x| x[:hit] }
+      end
+
+      def isobib_search_filter(code)
+        docidrx = %r{^(ISO|IEC)[^0-9]*\s[0-9-]+}
+        corrigrx = %r{^(ISO|IEC)[^0-9]*\s[0-9-]+:[0-9]+/}
+        warn "fetching #{code}..."
+        result = search(code)
+        result.each do |page|
+          ret = page.select do |i|
+            i.hit["title"] &&
+              i.hit["title"].match(docidrx).to_s == code &&
+              corrigrx !~ i.hit["title"]
+          end
+          return ret unless ret.empty?
+        end
+        []
+      end
+
+      # Sort through the results from RelatonIso, fetching them three at a time,
+      # and return the first result that matches the code,
+      # matches the year (if provided), and which has a title (amendments do not).
+      # Only expects the first page of results to be populated.
+      # Does not match corrigenda etc (e.g. ISO 3166-1:2006/Cor 1:2007)
+      # If no match, returns any years which caused mismatch, for error reporting
+      def isobib_results_filter(result, year)
+        missed_years = []
+        result.each_slice(3) do |s| # ISO website only allows 3 connections
+          fetch_pages(s, 3).each_with_index do |r, _i|
+            next if r.nil?
+            return { ret: r } if !year
+
+            r.dates.select { |d| d.type == "published" }.each do |d|
+              return { ret: r } if year.to_i == d.on.year
+
+              missed_years << d.on.year
+            end
+          end
+        end
+        { years: missed_years }
+      end
+
+      def isobib_get1(code, year, _opts)
+        # return iev(code) if /^IEC 60050-/.match code
+        result = isobib_search_filter(code) || return
+        ret = isobib_results_filter(result, year)
+        return ret[:ret] if ret[:ret]
+
+        fetch_ref_err(code, year, ret[:years])
+      end
+    end
+  end
+end
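IsoBibliography is the public entry point of this release: search returns lazily paginated HitPages, and get accepts either an explicit year or a "CODE:YEAR" reference, honours :all_parts and :keep_year, and retries as ISO/IEC when a plain ISO code finds nothing. A short sketch of both calls (the reference codes and year are illustrative):

    hit_pages = RelatonIso::IsoBibliography.search("ISO 9000")               # HitPages, or [] on error
    item = RelatonIso::IsoBibliography.get("ISO 9000", "2015", {})           # dated lookup
    undated = RelatonIso::IsoBibliography.get("ISO 9000", nil, keep_year: true)
    puts item.to_xml if item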
data/lib/relaton_iso/scrapper.rb
ADDED
@@ -0,0 +1,421 @@
+# frozen_string_literal: true
+
+require "algoliasearch"
+require "relaton_iso_bib"
+require "relaton_iso/hit"
+require "nokogiri"
+require "net/http"
+
+Algolia.init application_id: "JCL49WV5AR",
+             api_key: "dd1b9e1ab383f4d4817d29cd5e96d3f0"
+
+module RelatonIso
+  # Scrapper.
+  # rubocop:disable Metrics/ModuleLength
+  module Scrapper
+    DOMAIN = "https://www.iso.org"
+
+    TYPES = {
+      "TS" => "technical-specification",
+      "TR" => "technical-report",
+      "PAS" => "publicly-available-specification",
+      # "AWI" => "approvedWorkItem",
+      # "CD" => "committeeDraft",
+      # "FDIS" => "finalDraftInternationalStandard",
+      # "NP" => "newProposal",
+      # "DIS" => "draftInternationalStandard",
+      # "WD" => "workingDraft",
+      # "R" => "recommendation",
+      "Guide" => "guide",
+    }.freeze
+
+    class << self
+      # @param text [String]
+      # @return [Array<Hash>]
+      # def get(text)
+      #   iso_workers = RelatonBib::WorkersPool.new 4
+      #   iso_workers.worker { |hit| iso_worker(hit, iso_workers) }
+      #   algolia_workers = start_algolia_search(text, iso_workers)
+      #   iso_docs = iso_workers.result
+      #   algolia_workers.end
+      #   algolia_workers.result
+      #   iso_docs
+      # rescue
+      #   warn "Could not connect to http://www.iso.org"
+      #   []
+      # end
+
+      # Parse page.
+      # @param hit [Hash]
+      # @return [Hash]
+      # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+      def parse_page(hit_data)
+        return unless hit_data["path"] =~ /\d+$/
+
+        doc, url = get_page "/standard/#{hit_data['path'].match(/\d+$/)}.html"
+
+        # Fetch edition.
+        edition = doc&.xpath("//strong[contains(text(), 'Edition')]/..")&.
+          children&.last&.text&.match(/\d+/)&.to_s
+
+        titles, abstract = fetch_titles_abstract(doc)
+
+        RelatonIsoBib::IsoBibliographicItem.new(
+          fetched: Date.today.to_s,
+          docid: fetch_docid(doc),
+          edition: edition,
+          language: langs(doc).map { |l| l[:lang] },
+          script: langs(doc).map { |l| script(l[:lang]) }.uniq,
+          titles: titles,
+          type: fetch_type(hit_data["title"]),
+          docstatus: fetch_status(doc, hit_data["status"]),
+          ics: fetch_ics(doc),
+          dates: fetch_dates(doc),
+          contributors: fetch_contributors(hit_data["title"]),
+          editorialgroup: fetch_workgroup(doc),
+          abstract: abstract,
+          copyright: fetch_copyright(hit_data["title"], doc),
+          link: fetch_link(doc, url),
+          relations: fetch_relations(doc),
+          structuredidentifier: fetch_structuredidentifier(doc),
+        )
+      end
+      # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
+
+      private
+
+      # Start algolia search workers.
+      # @param text [String]
+      # @param iso_workers [RelatonBib::WorkersPool]
+      # @return [RelatonBib::WorkersPool]
+      # def start_algolia_search(text, iso_workers)
+      #   index = Algolia::Index.new "all_en"
+      #   algolia_workers = RelatonBib::WorkersPool.new
+      #   algolia_workers.worker do |page|
+      #     algolia_worker(index, text, page, algolia_workers, iso_workers)
+      #   end
+
+      #   # Add first page so algolia worker will start.
+      #   algolia_workers << 0
+      # end
+
+      # Fetch ISO documents.
+      # @param hit [Hash]
+      # @param iso_workers [RelatonIso::WorkersPool]
+      # def iso_worker(hit, iso_workers)
+      #   print "Parse #{iso_workers.size} of #{iso_workers.nb_hits} \r"
+      #   parse_page hit
+      # end
+
+      # Fetch hits from algolia search service.
+      # @param index [Algolia::Index]
+      # @param text [String]
+      # @param page [Integer]
+      # @param algolia_workers [RelatonBib::WorkersPool]
+      # @param iso_workers [RelatonBib::WorkersPool]
+      # def algolia_worker(index, text, page, algolia_workers, iso_workers)
+      #   res = index.search text, facetFilters: ["category:standard"], page: page
+      #   next_page = res["page"] + 1
+      #   algolia_workers << next_page if next_page < res["nbPages"]
+      #   res["hits"].each do |hit|
+      #     iso_workers.nb_hits = res["nbHits"]
+      #     iso_workers << hit
+      #   end
+      #   iso_workers.end unless next_page < res["nbPages"]
+      # end
+
+      # Fetch titles and abstracts.
+      # @param doc [Nokogiri::HTML::Document]
+      # @return [Array<Array>]
+      # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+      def fetch_titles_abstract(doc)
+        titles = []
+        abstract = []
+        langs(doc).each do |lang|
+          # Don't need to get page for en. We already have it.
+          d = lang[:path] ? get_page(lang[:path])[0] : doc
+
+          # Check if unavailable for the lang.
+          next if d.css("h5.help-block").any?
+
+          titles << fetch_title(d, lang[:lang])
+
+          # Fetch abstracts.
+          abstract_content = d.css("div[itemprop='description'] p").text
+          next if abstract_content.empty?
+
+          abstract << {
+            content: abstract_content,
+            language: lang[:lang],
+            script: script(lang[:lang]),
+            format: "text/plain",
+          }
+        end
+        [titles, abstract]
+      end
+      # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
+
+      # Get langs.
+      # @param doc [Nokogiri::HTML::Document]
+      # @return [Array<Hash>]
+      def langs(doc)
+        lgs = [{ lang: "en" }]
+        doc.css("ul#lang-switcher ul li a").each do |lang_link|
+          lang_path = lang_link.attr("href")
+          lang = lang_path.match(%r{^\/(fr)\/})
+          lgs << { lang: lang[1], path: lang_path } if lang
+        end
+        lgs
+      end
+
+      # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
+      # Get page.
+      # @param path [String] page's path
+      # @return [Array<Nokogiri::HTML::Document, String>]
+      def get_page(path)
+        url = DOMAIN + path
+        uri = URI url
+        resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
+        if resp.code == "301"
+          path = resp["location"]
+          url = DOMAIN + path
+          uri = URI url
+          resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
+        end
+        n = 0
+        while resp.body !~ /<strong/ && n < 10
+          resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
+          n += 1
+        end
+        [Nokogiri::HTML(resp.body), url]
+      end
+      # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
+
+      # Fetch docid.
+      # @param doc [Nokogiri::HTML::Document]
+      # @return [Array<RelatonBib::DocumentIdentifier>]
+      def fetch_docid(doc)
+        item_ref = doc.at("//strong[@id='itemReference']")
+        return [] unless item_ref
+
+        [RelatonBib::DocumentIdentifier.new(id: item_ref.text, type: "ISO")]
+      end
+
+      # @param doc [Nokogiri::HTML::Document]
+      def fetch_structuredidentifier(doc)
+        item_ref = doc.at("//strong[@id='itemReference']")
+        unless item_ref
+          return RelatonIsoBib::StructuredIdentifier.new(
+            project_number: "?", part_number: "", prefix: nil, id: "?",
+          )
+        end
+
+        m = item_ref.text.match(/^(.*?\d+)-?((?<=-)\d+|)/)
+        RelatonIsoBib::StructuredIdentifier.new(
+          project_number: m[1], part_number: m[2], prefix: nil,
+          id: item_ref.text, type: "ISO"
+        )
+      end
+
+      # Fetch status.
+      # @param doc [Nokogiri::HTML::Document]
+      # @param status [String]
+      # @return [Hash]
+      def fetch_status(doc, _status)
+        stage, substage = doc.css("li.dropdown.active span.stage-code > strong").text.split "."
+        RelatonBib::DocumentStatus.new(stage: stage, substage: substage)
+      end
+
+      # Fetch workgroup.
+      # @param doc [Nokogiri::HTML::Document]
+      # @return [Hash]
+      def fetch_workgroup(doc)
+        wg_link = doc.css("div.entry-name.entry-block a")[0]
+        # wg_url = DOMAIN + wg_link['href']
+        workgroup = wg_link.text.split "/"
+        {
+          name: "International Organization for Standardization",
+          abbreviation: "ISO",
+          url: "www.iso.org",
+          technical_committee: [{
+            name: wg_link.text + doc.css("div.entry-title")[0].text,
+            type: "TC",
+            number: workgroup[1]&.match(/\d+/)&.to_s&.to_i,
+          }],
+        }
+      end
+
+      # rubocop:disable Metrics/MethodLength
+
+      # Fetch relations.
+      # @param doc [Nokogiri::HTML::Document]
+      # @return [Array<Hash>]
+      def fetch_relations(doc)
+        doc.css("ul.steps li").reduce([]) do |a, r|
+          r_type = r.css("strong").text
+          type = case r_type
+                 when "Previously", "Will be replaced by" then "obsoletes"
+                 when "Corrigenda/Amendments", "Revised by", "Now confirmed"
+                   "updates"
+                 else r_type
+                 end
+          if ["Now", "Now under review"].include? type
+            a
+          else
+            a + r.css("a").map do |id|
+              fref = RelatonBib::FormattedRef.new(
+                content: id.text, format: "text/plain",
+              )
+              bibitem = RelatonIsoBib::IsoBibliographicItem.new(
+                formattedref: fref,
+              )
+              { type: type, bibitem: bibitem }
+            end
+          end
+        end
+      end
+      # rubocop:enable Metrics/MethodLength
+
+      # Fetch type.
+      # @param title [String]
+      # @return [String]
+      def fetch_type(title)
+        type_match = title.match(%r{^(ISO|IWA|IEC)(?:(/IEC|/IEEE|/PRF|
+          /NP)*\s|/)(TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))}x)
+        # return "international-standard" if type_match.nil?
+        if TYPES[type_match[3]]
+          TYPES[type_match[3]]
+        elsif type_match[1] == "ISO"
+          "international-standard"
+        elsif type_match[1] == "IWA"
+          "international-workshop-agreement"
+        end
+        # rescue => _e
+        #   puts 'Unknown document type: ' + title
+      end
+
+      # Fetch titles.
+      # @param doc [Nokogiri::HTML::Document]
+      # @param lang [String]
+      # @return [Hash]
+      def fetch_title(doc, lang)
+        titles = doc.at("//h3[@itemprop='description'] | //h2[@itemprop='description']").
+          text.split " -- "
+        case titles.size
+        when 0
+          intro, main, part = nil, "", nil
+        when 1
+          intro, main, part = nil, titles[0], nil
+        when 2
+          if /^(Part|Partie) \d+:/ =~ titles[1]
+            intro, main, part = nil, titles[0], titles[1]
+          else
+            intro, main, part = titles[0], titles[1], nil
+          end
+        when 3
+          intro, main, part = titles[0], titles[1], titles[2]
+        else
+          intro, main, part = titles[0], titles[1], titles[2..-1]&.join(" -- ")
+        end
+        {
+          title_intro: intro,
+          title_main: main,
+          title_part: part,
+          language: lang,
+          script: script(lang),
+        }
+      end
+
+      # Return ISO script code.
+      # @param lang [String]
+      # @return [String]
+      def script(lang)
+        case lang
+        when "en", "fr" then "Latn"
+        end
+      end
+
+      # Fetch dates
+      # @param doc [Nokogiri::HTML::Document]
+      # @return [Array<Hash>]
+      def fetch_dates(doc)
+        dates = []
+        publish_date = doc.xpath("//span[@itemprop='releaseDate']").text
+        unless publish_date.empty?
+          dates << { type: "published", on: publish_date }
+        end
+        dates
+      end
+
+      # rubocop:disable Metrics/MethodLength
+      def fetch_contributors(title)
+        title.sub(/\s.*/, "").split("/").map do |abbrev|
+          case abbrev
+          when "IEC"
+            name = "International Electrotechnical Commission"
+            url = "www.iec.ch"
+          else
+            name = "International Organization for Standardization"
+            url = "www.iso.org"
+          end
+          { entity: { name: name, url: url, abbreviation: abbrev },
+            roles: ["publisher"] }
+        end
+      end
+      # rubocop:enable Metrics/MethodLength
+
+      # Fetch ICS.
+      # @param doc [Nokogiri::HTML::Document]
+      # @return [Array<Hash>]
+      def fetch_ics(doc)
+        doc.xpath("//strong[contains(text(), "\
+                  "'ICS')]/../following-sibling::dd/div/a").map do |i|
+          code = i.text.match(/[\d\.]+/).to_s.split "."
+          { field: code[0], group: code[1], subgroup: code[2] }
+        end
+      end
+
+      # Fetch links.
+      # @param doc [Nokogiri::HTML::Document]
+      # @param url [String]
+      # @return [Array<Hash>]
+      def fetch_link(doc, url)
+        obp_elms = doc.xpath("//a[contains(@href, '/obp/ui/')]")
+        obp = obp_elms.attr("href").value if obp_elms.any?
+        rss = DOMAIN + doc.xpath("//a[contains(@href, 'rss')]").attr("href").value
+        [
+          { type: "src", content: url },
+          { type: "obp", content: obp },
+          { type: "rss", content: rss },
+        ]
+      end
+
+      # Fetch copyright.
+      # @param title [String]
+      # @return [Hash]
+      def fetch_copyright(title, doc)
+        owner_name = title.match(/.*?(?=\s)/).to_s
+        from = title.match(/(?<=:)\d{4}/).to_s
+        if from.empty?
+          from = doc.xpath("//span[@itemprop='releaseDate']").text.match(/\d{4}/).to_s
+        end
+        { owner: { name: owner_name }, from: from }
+      end
+    end
+
+    # private
+    #
+    # def next_hits_page(next_page)
+    #   page = @index.search @text, facetFilters: ['category:standard'],
+    #                        page: next_page
+    #   page.each do |key, value|
+    #     if key == 'hits'
+    #       @docs[key] += value
+    #     else
+    #       @docs[key] = value
+    #     end
+    #   end
+    # end
+  end
+  # rubocop:enable Metrics/ModuleLength
+end
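Scrapper.parse_page is what every Hit ultimately calls: given an Algolia hit whose "path" ends with the numeric standard id, it downloads the iso.org page (following one redirect and retrying up to ten times) and assembles a RelatonIsoBib::IsoBibliographicItem. A minimal sketch with an illustrative hit hash; it performs a live request to www.iso.org:

    hit = {
      "path"   => "/standard/45481",   # illustrative; must end with the numeric id
      "title"  => "ISO 9000:2015",
      "status" => "Published",
    }
    item = RelatonIso::Scrapper.parse_page(hit)
    puts item.to_xml if item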