relaton-iso 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
data/bin/rspec ADDED
@@ -0,0 +1,29 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

#
# Bundler-generated binstub for the `rspec` executable.
#
# It prefers delegating to bin/bundle when that binstub was itself
# produced by Bundler; otherwise it activates the bundle directly and
# hands execution off to the rspec-core gem's `rspec` executable.
#

require "pathname"

ENV["BUNDLE_GEMFILE"] ||=
  File.expand_path("../../Gemfile", Pathname.new(__FILE__).realpath)

bundler_stub = File.expand_path("../bundle", __FILE__)

if File.file?(bundler_stub)
  # Only trust bin/bundle if its header shows it was generated by Bundler.
  if File.read(bundler_stub, 300).match?(/This file was generated by Bundler/)
    load(bundler_stub)
  else
    abort("Your `bin/bundle` was not generated by Bundler, so this binstub cannot run.
Replace `bin/bundle` by running `bundle binstubs bundler --force`, then run this command again.")
  end
end

require "rubygems"
require "bundler/setup"

load Gem.bin_path("rspec-core", "rspec")
data/bin/safe_yaml ADDED
@@ -0,0 +1,29 @@
1
#!/usr/bin/env ruby
# frozen_string_literal: true

#
# Bundler-generated binstub for the `safe_yaml` executable.
#
# It prefers delegating to bin/bundle when that binstub was itself
# produced by Bundler; otherwise it activates the bundle directly and
# hands execution off to the safe_yaml gem's executable.
#

require "pathname"

ENV["BUNDLE_GEMFILE"] ||=
  File.expand_path("../../Gemfile", Pathname.new(__FILE__).realpath)

bundler_stub = File.expand_path("../bundle", __FILE__)

if File.file?(bundler_stub)
  # Only trust bin/bundle if its header shows it was generated by Bundler.
  if File.read(bundler_stub, 300).match?(/This file was generated by Bundler/)
    load(bundler_stub)
  else
    abort("Your `bin/bundle` was not generated by Bundler, so this binstub cannot run.
Replace `bin/bundle` by running `bundle binstubs bundler --force`, then run this command again.")
  end
end

require "rubygems"
require "bundler/setup"

load Gem.bin_path("safe_yaml", "safe_yaml")
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
#!/usr/bin/env bash
# Project bootstrap script: installs the gem bundle for development.
set -euo pipefail # exit on error, on unset variables, and on pipe failures
IFS=$'\n\t'       # split words only on newline/tab (safer than default IFS)
set -vx           # echo each line and expanded command for debuggability

bundle install

# Do any other automated setup that you need to do here
@@ -0,0 +1,22 @@
1
require "relaton/processor"

module Relaton
  module RelatonIso
    # Adapter registering the ISO backend with the Relaton umbrella gem.
    class Processor < Relaton::Processor
      # Set the identification data the Relaton registry reads.
      def initialize
        @short = :relaton_iso
        @idtype = "ISO"
        @prefix = "ISO"
        @defaultprefix = %r{^(ISO)[ /]}
      end

      # Delegate reference retrieval to the ISO bibliography backend.
      # @param code [String] document code, e.g. "ISO 19115"
      # @param date [String, nil] publication year
      # @param opts [Hash] retrieval options
      def get(code, date, opts)
        ::RelatonIso::IsoBibliography.get(code, date, opts)
      end

      # Deserialise a bibliographic item from Relaton XML.
      # @param xml [String]
      def from_xml(xml)
        RelatonIsoBib::XMLParser.from_xml(xml)
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
# frozen_string_literal: true

require "relaton_iso/version"
require "relaton_iso/iso_bibliography"
# Register this gem's processor with the Relaton registry, but only when
# the umbrella `relaton` gem is already loaded — the gem is also usable
# standalone, in which case no registration happens.
if defined? Relaton
  require_relative "relaton/processor"
  Relaton::Registry.instance.register(Relaton::RelatonIso::Processor)
end
@@ -0,0 +1,55 @@
1
# frozen_string_literal: true

module RelatonIso
  # A single search hit returned by the Algolia index.
  class Hit
    # @return [RelatonIso::HitCollection, nil] collection this hit belongs to
    attr_reader :hit_collection

    # @return [Hash] raw hit data as delivered by Algolia
    attr_reader :hit

    # @param hit [Hash] raw hit data
    # @param hit_collection [RelatonIso::HitCollection, nil]
    def initialize(hit, hit_collection = nil)
      @hit = hit
      @hit_collection = hit_collection
    end

    # Scrape the standard's page and memoise the parsed item.
    # @return [RelatonIso::IsoBibliographicItem]
    def fetch
      @fetch ||= Scrapper.parse_page @hit
    end

    # @return [String]
    def to_s
      inspect
    end

    # Debug representation listing matched words, category and title.
    # @return [String]
    def inspect
      words = @hit["_highlightResult"]
              .flat_map { |_field, highlight| highlight["matchedWords"] }
              .uniq

      "<#{self.class}:#{format('%#.14x', object_id << 1)} "\
      "@text=\"#{@hit_collection&.hit_pages&.text}\" "\
      "@fullIdentifier=\"#{@fetch&.shortref}\" "\
      "@matchedWords=#{words} "\
      "@category=\"#{@hit['category']}\" "\
      "@title=\"#{@hit['title']}\">"
    end

    # Serialise the fetched item to XML, either into an existing builder
    # or as a standalone XML string.
    # @param builder [Nokogiri::XML::Builder, nil]
    def to_xml(builder = nil, **opts)
      return fetch.to_xml(builder, **opts) if builder

      document = Nokogiri::XML::Builder.new(encoding: "UTF-8") do |xml|
        fetch.to_xml xml, **opts
      end
      document.doc.root.to_xml
    end
  end
end
@@ -0,0 +1,42 @@
1
# frozen_string_literal: true

require "relaton_iso/hit"

module RelatonIso
  # One page of search results, behaving like an Array of Hit objects.
  class HitCollection < Array
    # @return [TrueClass, FalseClass] whether #fetch has already been run
    attr_reader :fetched

    # @return [RelatonIso::HitPages, nil] the pages object that owns this page
    attr_reader :hit_pages

    # @param hits [Array<Hash>] raw Algolia hit hashes
    # @param hit_pages [RelatonIso::HitPages, nil]
    def initialize(hits, hit_pages = nil)
      concat(hits.map { |raw| Hit.new(raw, self) })
      @fetched = false
      @hit_pages = hit_pages
    end

    # Fetch the bibliographic item of every hit, four in parallel.
    # @return [RelatonIso::HitCollection] self
    def fetch
      pool = RelatonBib::WorkersPool.new 4
      pool.worker(&:fetch)
      each { |hit| pool << hit }
      pool.end
      pool.result
      @fetched = true
      self
    end

    def to_s
      inspect
    end

    def inspect
      "<#{self.class}:#{format('%#.14x', object_id << 1)} @fetched=#{@fetched}>"
    end
  end
end
@@ -0,0 +1,96 @@
1
# frozen_string_literal: true

require "algoliasearch"
require "relaton_iso/hit_collection"

module RelatonIso
  # Pages of hits.
  # Lazily-loaded list of result pages; each element is a HitCollection.
  # Pages beyond the first are fetched from Algolia on demand via #[].
  class HitPages < Array
    # NOTE(review): Algolia credentials are hard-coded here and duplicated
    # in scrapper.rb — consider centralising them in one place.
    Algolia.init application_id: "JCL49WV5AR",
                 api_key: "dd1b9e1ab383f4d4817d29cd5e96d3f0"

    # @return [String] the query text this result set was built from
    attr_reader :text

    # @param text [String]
    def initialize(text)
      @text = text
      @index = Algolia::Index.new "all_en"
      # The first search both reports the total page count and yields page 0.
      resp = @index.search(text, facetFilters: ["category:standard"])
      @nb_pages = resp["nbPages"]
      self << HitCollection.new(resp["hits"], self)
    end

    # @return [RelatonIso::HitCollection] the last page (loading it if needed)
    def last
      collection(@nb_pages - 1)
    end

    # @param i [Integer]
    # @return [RelatonIso::HitCollection, nil] nil when idx is out of range
    def [](idx)
      # collection i
      return if idx + 1 > @nb_pages

      # `collection` loads (and caches) the page; `super` then reads the
      # now-populated underlying Array slot.
      collection idx
      super
    end

    # Map over all pages (triggers lazy loading of each page).
    # @return [Array]
    def map(&block)
      m = []
      @nb_pages.times do |n|
        m << yield(self[n]) if block
      end
      m
    end

    # Iterate over all pages (triggers lazy loading of each page).
    def each(&block)
      @nb_pages.times do |n|
        yield self[n] if block
      end
    end

    def to_s
      inspect
    end

    def inspect
      "<#{self.class}:#{format('%#.14x', object_id << 1)} @text=#{@text} "\
      "@pages=#{@nb_pages}>"
    end

    # @return [Integer] total number of result pages, NOT pages loaded so far
    def size
      @nb_pages
    end

    # Serialise every hit on every page into one <documents> XML string.
    def to_xml(**opts)
      builder = Nokogiri::XML::Builder.new(encoding: "UTF-8") do |xml|
        xml.documents do
          each do |page|
            page.fetch
            page.each { |hit| hit.to_xml xml, **opts }
          end
        end
      end
      builder.to_xml
    end

    private

    # Load page `idx` from Algolia unless cached, then return it.
    # Uses the real Array#size / Array#[] via bound UnboundMethods because
    # this class overrides both to expose the *total* page count.
    # NOTE(review): when idx skips ahead of the loaded pages, every loop
    # iteration requests the same `page: idx`, so intermediate slots get
    # filled with copies of page idx rather than the pages they stand for
    # — confirm whether out-of-order access is expected here.
    # @param i [Integer]
    # @return [RelatonIso::HitCollection, nil]
    def collection(idx)
      return if idx + 1 > @nb_pages

      while Array.instance_method(:size).bind(self).call < idx + 1
        resp = @index.search(@text,
                             facetFilters: ["category:standard"],
                             page: idx)
        self << HitCollection.new(resp["hits"], self)
      end
      Array.instance_method(:[]).bind(self).call idx
    end
  end
end
@@ -0,0 +1,132 @@
1
# frozen_string_literal: true

# require 'relaton_iso/iso_bibliographic_item'
require "relaton_iso/scrapper"
require "relaton_iso/hit_pages"
require "relaton_iec"

module RelatonIso
  # Class methods for search ISO standards.
  class IsoBibliography
    class << self
      # Query the ISO website (Algolia index) with free text.
      # @param text [String]
      # @return [RelatonIso::HitPages, Array] empty Array when iso.org is
      #   unreachable
      def search(text)
        HitPages.new text
      rescue Algolia::AlgoliaProtocolError
        warn "Could not access http://www.iso.org"
        []
      end

      # @param code [String] the ISO standard Code to look up (e.g. "ISO 9000")
      # @param year [String] the year the standard was published (optional)
      # @param opts [Hash] options; restricted to :all_parts if all-parts
      #   reference is required, :keep_year if undated reference should
      #   return actual reference with year
      # @return [RelatonIsoBib::IsoBibliographicItem, nil] the matched item,
      #   or nil when nothing matched
      def get(code, year, opts)
        # Accept "CODE:YEAR" in place of an explicit year argument.
        if year.nil?
          /^(?<code1>[^:]+):(?<year1>[^:]+)$/ =~ code
          unless code1.nil?
            code = code1
            year = year1
          end
        end
        code += "-1" if opts[:all_parts]
        # ISO/IEC Directives documents are served by the IEC gem.
        # FIX: `require "relaton_iec"` defines RelatonIec, not Iecbib —
        # the previous `Iecbib::IecBibliography` would raise NameError.
        return RelatonIec::IecBibliography.get(code, year, opts) if %r[^ISO/IEC DIR].match code

        ret = isobib_get1(code, year, opts)
        # Retry as a joint ISO/IEC publication when a plain ISO code fails.
        if ret.nil? && code =~ %r[^ISO\s]
          c = code.gsub "ISO", "ISO/IEC"
          warn "Attempting ISO/IEC retrieval"
          ret = isobib_get1(c, year, opts)
        end
        return nil if ret.nil?

        ret.to_most_recent_reference unless year || opts[:keep_year]
        ret.to_all_parts if opts[:all_parts]
        ret
      end

      private

      # Emit user-facing diagnostics for a failed lookup.
      # @param code [String]
      # @param year [String, nil]
      # @param missed_years [Array] years found online that did not match
      # @return [nil]
      def fetch_ref_err(code, year, missed_years)
        id = year ? "#{code}:#{year}" : code
        warn "WARNING: no match found online for #{id}. "\
          "The code must be exactly like it is on the standards website."
        unless missed_years.empty?
          warn "(There was no match for #{year}, though there were matches "\
            "found for #{missed_years.join(', ')}.)"
        end
        if /\d-\d/ =~ code
          warn "The provided document part may not exist, or the document "\
            "may no longer be published in parts."
        else
          warn "If you wanted to cite all document parts for the reference, "\
            "use \"#{code} (all parts)\".\nIf the document is not a standard, "\
            "use its document type abbreviation (TS, TR, PAS, Guide)."
        end
        nil
      end

      # Fetch full records for a slice of hits with `n` parallel workers,
      # preserving the slice's original order.
      def fetch_pages(s, n)
        workers = RelatonBib::WorkersPool.new n
        workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
        s.each_with_index { |hit, i| workers << { i: i, hit: hit } }
        workers.end
        workers.result.sort { |x, y| x[:i] <=> y[:i] }.map { |x| x[:hit] }
      end

      # Return the first result page containing hits whose title matches
      # `code` exactly, skipping corrigendum/amendment entries.
      def isobib_search_filter(code)
        docidrx = %r{^(ISO|IEC)[^0-9]*\s[0-9-]+}
        corrigrx = %r{^(ISO|IEC)[^0-9]*\s[0-9-]+:[0-9]+/}
        warn "fetching #{code}..."
        result = search(code)
        result.each do |page|
          ret = page.select do |i|
            i.hit["title"] &&
              i.hit["title"].match(docidrx).to_s == code &&
              corrigrx !~ i.hit["title"]
          end
          return ret unless ret.empty?
        end
        []
      end

      # Sort through the results from RelatonIso, fetching them three at a time,
      # and return the first result that matches the code,
      # matches the year (if provided), and which has a title (amendments do not).
      # Only expects the first page of results to be populated.
      # Does not match corrigenda etc (e.g. ISO 3166-1:2006/Cor 1:2007)
      # If no match, returns any years which caused mismatch, for error reporting
      def isobib_results_filter(result, year)
        missed_years = []
        result.each_slice(3) do |s| # ISO website only allows 3 connections
          fetch_pages(s, 3).each_with_index do |r, _i|
            next if r.nil?
            return { ret: r } if !year

            r.dates.select { |d| d.type == "published" }.each do |d|
              return { ret: r } if year.to_i == d.on.year

              missed_years << d.on.year
            end
          end
        end
        { years: missed_years }
      end

      # One lookup attempt for one code; warns and returns nil on failure.
      def isobib_get1(code, year, _opts)
        # return iev(code) if /^IEC 60050-/.match code
        result = isobib_search_filter(code) || return
        ret = isobib_results_filter(result, year)
        return ret[:ret] if ret[:ret]

        fetch_ref_err(code, year, ret[:years])
      end
    end
  end
end
@@ -0,0 +1,421 @@
1
# frozen_string_literal: true

require "algoliasearch"
require "relaton_iso_bib"
require "relaton_iso/hit"
require "nokogiri"
require "net/http"

# NOTE(review): these Algolia credentials are hard-coded and duplicated in
# hit_pages.rb — consider extracting them to a single shared constant.
Algolia.init application_id: "JCL49WV5AR",
             api_key: "dd1b9e1ab383f4d4817d29cd5e96d3f0"

module RelatonIso
  # Scrapper.
  # Scrapes bibliographic metadata from iso.org standard pages.
  # rubocop:disable Metrics/ModuleLength
  module Scrapper
    DOMAIN = "https://www.iso.org"

    # Maps type abbreviations appearing in titles to Relaton doctype names.
    TYPES = {
      "TS" => "technical-specification",
      "TR" => "technical-report",
      "PAS" => "publicly-available-specification",
      # "AWI" => "approvedWorkItem",
      # "CD" => "committeeDraft",
      # "FDIS" => "finalDraftInternationalStandard",
      # "NP" => "newProposal",
      # "DIS" => "draftInternationalStandard",
      # "WD" => "workingDraft",
      # "R" => "recommendation",
      "Guide" => "guide",
    }.freeze

    class << self
      # @param text [String]
      # @return [Array<Hash>]
      # def get(text)
      #   iso_workers = RelatonBib::WorkersPool.new 4
      #   iso_workers.worker { |hit| iso_worker(hit, iso_workers) }
      #   algolia_workers = start_algolia_search(text, iso_workers)
      #   iso_docs = iso_workers.result
      #   algolia_workers.end
      #   algolia_workers.result
      #   iso_docs
      # rescue
      #   warn "Could not connect to http://www.iso.org"
      #   []
      # end

      # Parse page.
      # Fetches the standard's detail page and builds a bibliographic item.
      # @param hit_data [Hash] raw Algolia hit (keys: "path", "title", "status")
      # @return [RelatonIsoBib::IsoBibliographicItem, nil] nil when the hit's
      #   path does not end in a numeric id
      # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
      def parse_page(hit_data)
        return unless hit_data["path"] =~ /\d+$/

        doc, url = get_page "/standard/#{hit_data['path'].match(/\d+$/)}.html"

        # Fetch edition.
        edition = doc&.xpath("//strong[contains(text(), 'Edition')]/..")&.
          children&.last&.text&.match(/\d+/)&.to_s

        titles, abstract = fetch_titles_abstract(doc)

        RelatonIsoBib::IsoBibliographicItem.new(
          fetched: Date.today.to_s,
          docid: fetch_docid(doc),
          edition: edition,
          language: langs(doc).map { |l| l[:lang] },
          script: langs(doc).map { |l| script(l[:lang]) }.uniq,
          titles: titles,
          type: fetch_type(hit_data["title"]),
          docstatus: fetch_status(doc, hit_data["status"]),
          ics: fetch_ics(doc),
          dates: fetch_dates(doc),
          contributors: fetch_contributors(hit_data["title"]),
          editorialgroup: fetch_workgroup(doc),
          abstract: abstract,
          copyright: fetch_copyright(hit_data["title"], doc),
          link: fetch_link(doc, url),
          relations: fetch_relations(doc),
          structuredidentifier: fetch_structuredidentifier(doc),
        )
      end
      # rubocop:enable Metrics/AbcSize, Metrics/MethodLength

      private

      # Start algolia search workers.
      # @param text[String]
      # @param iso_workers [RelatonBib::WorkersPool]
      # @reaturn [RelatonBib::WorkersPool]
      # def start_algolia_search(text, iso_workers)
      #   index = Algolia::Index.new "all_en"
      #   algolia_workers = RelatonBib::WorkersPool.new
      #   algolia_workers.worker do |page|
      #     algolia_worker(index, text, page, algolia_workers, iso_workers)
      #   end

      #   # Add first page so algolia worker will start.
      #   algolia_workers << 0
      # end

      # Fetch ISO documents.
      # @param hit [Hash]
      # @param isiso_workers [RelatonIso::WorkersPool]
      # def iso_worker(hit, iso_workers)
      #   print "Parse #{iso_workers.size} of #{iso_workers.nb_hits} \r"
      #   parse_page hit
      # end

      # Fetch hits from algolia search service.
      # @param index[Algolia::Index]
      # @param text [String]
      # @param page [Integer]
      # @param algolia_workers [RelatonBib::WorkersPool]
      # @param isiso_workers [RelatonBib::WorkersPool]
      # def algolia_worker(index, text, page, algolia_workers, iso_workers)
      #   res = index.search text, facetFilters: ["category:standard"], page: page
      #   next_page = res["page"] + 1
      #   algolia_workers << next_page if next_page < res["nbPages"]
      #   res["hits"].each do |hit|
      #     iso_workers.nb_hits = res["nbHits"]
      #     iso_workers << hit
      #   end
      #   iso_workers.end unless next_page < res["nbPages"]
      # end

      # Fetch titles and abstracts.
      # Downloads each language variant of the page (en is `doc` itself)
      # and collects its title and abstract when present.
      # @param doc [Nokigiri::HTML::Document]
      # @return [Array<Array>] [titles, abstracts]
      # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
      def fetch_titles_abstract(doc)
        titles = []
        abstract = []
        langs(doc).each do |lang|
          # Don't need to get page for en. We already have it.
          d = lang[:path] ? get_page(lang[:path])[0] : doc

          # Check if unavailable for the lang.
          next if d.css("h5.help-block").any?

          titles << fetch_title(d, lang[:lang])

          # Fetch abstracts.
          abstract_content = d.css("div[itemprop='description'] p").text
          next if abstract_content.empty?

          abstract << {
            content: abstract_content,
            language: lang[:lang],
            script: script(lang[:lang]),
            format: "text/plain",
          }
        end
        [titles, abstract]
      end
      # rubocop:enable Metrics/AbcSize, Metrics/MethodLength

      # Get langs.
      # English is always assumed; French is added when the language
      # switcher links to a /fr/ path.
      # @param doc [Nokogiri::HTML::Document]
      # @return [Array<Hash>] e.g. [{ lang: "en" }, { lang: "fr", path: "…" }]
      def langs(doc)
        lgs = [{ lang: "en" }]
        doc.css("ul#lang-switcher ul li a").each do |lang_link|
          lang_path = lang_link.attr("href")
          lang = lang_path.match(%r{^\/(fr)\/})
          lgs << { lang: lang[1], path: lang_path } if lang
        end
        lgs
      end

      # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
      # Get page.
      # NOTE(review): only a "301" status is followed (single hop); other
      # redirect codes (302/303/307) are not handled — confirm iso.org
      # only issues 301s here. The retry loop re-requests up to 10 times
      # until the body contains a <strong tag (guards against partial pages).
      # @param path [String] page's path
      # @return [Array<Nokogiri::HTML::Document, String>]
      def get_page(path)
        url = DOMAIN + path
        uri = URI url
        resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
        if resp.code == "301"
          path = resp["location"]
          url = DOMAIN + path
          uri = URI url
          resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
        end
        n = 0
        while resp.body !~ /<strong/ && n < 10
          resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
          n += 1
        end
        [Nokogiri::HTML(resp.body), url]
      end
      # rubocop:enable Metrics/AbcSize, Metrics/MethodLength

      # Fetch docid.
      # @param doc [Nokogiri::HTML::Document]
      # @return [Array<RelatonBib::DocumentIdentifier>] empty when the page
      #   carries no item reference
      def fetch_docid(doc)
        item_ref = doc.at("//strong[@id='itemReference']")
        return [] unless item_ref

        [RelatonBib::DocumentIdentifier.new(id: item_ref.text, type: "ISO")]
      end

      # Build a structured identifier (project/part numbers) from the
      # page's item reference; falls back to "?" placeholders without one.
      # @param doc [Nokogiri::HTML::Document]
      def fetch_structuredidentifier(doc)
        item_ref = doc.at("//strong[@id='itemReference']")
        unless item_ref
          return RelatonIsoBib::StructuredIdentifier.new(
            project_number: "?", part_number: "", prefix: nil, id: "?",
          )
        end

        m = item_ref.text.match(/^(.*?\d+)-?((?<=-)\d+|)/)
        RelatonIsoBib::StructuredIdentifier.new(
          project_number: m[1], part_number: m[2], prefix: nil,
          id: item_ref.text, type: "ISO"
        )
      end

      # Fetch status.
      # Reads the "NN.NN" stage code from the page and splits it into
      # stage and substage. The `_status` parameter is accepted but unused.
      # @param doc [Nokogiri::HTML::Document]
      # @param status [String]
      # @return [Hash]
      def fetch_status(doc, _status)
        stage, substage = doc.css("li.dropdown.active span.stage-code > strong").text.split "."
        RelatonBib::DocumentStatus.new(stage: stage, substage: substage)
      end

      # Fetch workgroup.
      # NOTE(review): assumes the first div.entry-name.entry-block link
      # exists; a page without it would raise NoMethodError — confirm.
      # @param doc [Nokogiri::HTML::Document]
      # @return [Hash]
      def fetch_workgroup(doc)
        wg_link = doc.css("div.entry-name.entry-block a")[0]
        # wg_url = DOMAIN + wg_link['href']
        workgroup = wg_link.text.split "/"
        {
          name: "International Organization for Standardization",
          abbreviation: "ISO",
          url: "www.iso.org",
          technical_committee: [{
            name: wg_link.text + doc.css("div.entry-title")[0].text,
            type: "TC",
            number: workgroup[1]&.match(/\d+/)&.to_s&.to_i,
          }],
        }
      end

      # rubocop:disable Metrics/MethodLength

      # Fetch relations.
      # Maps the "life cycle" list entries to obsoletes/updates relations;
      # "Now"/"Now under review" entries are dropped.
      # @param doc [Nokogiri::HTML::Document]
      # @return [Array<Hash>]
      def fetch_relations(doc)
        doc.css("ul.steps li").reduce([]) do |a, r|
          r_type = r.css("strong").text
          type = case r_type
                 when "Previously", "Will be replaced by" then "obsoletes"
                 when "Corrigenda/Amendments", "Revised by", "Now confirmed"
                   "updates"
                 else r_type
                 end
          if ["Now", "Now under review"].include? type
            a
          else
            a + r.css("a").map do |id|
              fref = RelatonBib::FormattedRef.new(
                content: id.text, format: "text/plain",
              )
              bibitem = RelatonIsoBib::IsoBibliographicItem.new(
                formattedref: fref,
              )
              { type: type, bibitem: bibitem }
            end
          end
        end
      end
      # rubocop:enable Metrics/MethodLength

      # Fetch type.
      # NOTE(review): `title.match` may return nil for titles that don't
      # start with ISO/IWA/IEC, in which case `type_match[3]` raises
      # NoMethodError — the commented-out rescue below suggests this is a
      # known gap; confirm all callers pass conforming titles.
      # @param title [String]
      # @return [String, nil]
      def fetch_type(title)
        type_match = title.match(%r{^(ISO|IWA|IEC)(?:(/IEC|/IEEE|/PRF|
        /NP)*\s|/)(TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))}x)
        # return "international-standard" if type_match.nil?
        if TYPES[type_match[3]]
          TYPES[type_match[3]]
        elsif type_match[1] == "ISO"
          "international-standard"
        elsif type_match[1] == "IWA"
          "international-workshop-agreement"
        end
        # rescue => _e
        #   puts 'Unknown document type: ' + title
      end

      # Fetch titles.
      # Splits the page title on " -- " into intro/main/part components.
      # @param doc [Nokogiri::HTML::Document]
      # @param lang [String]
      # @return [Hash]
      def fetch_title(doc, lang)
        titles = doc.at("//h3[@itemprop='description'] | //h2[@itemprop='description']").
          text.split " -- "
        case titles.size
        when 0
          intro, main, part = nil, "", nil
        when 1
          intro, main, part = nil, titles[0], nil
        when 2
          if /^(Part|Partie) \d+:/ =~ titles[1]
            intro, main, part = nil, titles[0], titles[1]
          else
            intro, main, part = titles[0], titles[1], nil
          end
        when 3
          intro, main, part = titles[0], titles[1], titles[2]
        else
          intro, main, part = titles[0], titles[1], titles[2..-1]&.join(" -- ")
        end
        {
          title_intro: intro,
          title_main: main,
          title_part: part,
          language: lang,
          script: script(lang),
        }
      end

      # Return ISO script code.
      # Only "en" and "fr" are mapped (both Latin script); other values
      # yield nil.
      # @param lang [String]
      # @return [String, nil]
      def script(lang)
        case lang
        when "en", "fr" then "Latn"
        end
      end

      # Fetch dates
      # @param doc [Nokogiri::HTML::Document]
      # @return [Array<Hash>] at most one "published" date
      def fetch_dates(doc)
        dates = []
        publish_date = doc.xpath("//span[@itemprop='releaseDate']").text
        unless publish_date.empty?
          dates << { type: "published", on: publish_date }
        end
        dates
      end

      # rubocop:disable Metrics/MethodLength
      # Derive publisher contributors from the title prefix, e.g.
      # "ISO/IEC 123" yields both ISO and IEC entries.
      # @param title [String]
      # @return [Array<Hash>]
      def fetch_contributors(title)
        title.sub(/\s.*/, "").split("/").map do |abbrev|
          case abbrev
          when "IEC"
            name = "International Electrotechnical Commission"
            url = "www.iec.ch"
          else
            name = "International Organization for Standardization"
            url = "www.iso.org"
          end
          { entity: { name: name, url: url, abbreviation: abbrev },
            roles: ["publisher"] }
        end
      end
      # rubocop:enable Metrics/MethodLength

      # Fetch ICS.
      # Splits each dotted ICS code into field/group/subgroup.
      # @param doc [Nokogiri::HTML::Document]
      # @return [Array<Hash>]
      def fetch_ics(doc)
        doc.xpath("//strong[contains(text(), "\
                  "'ICS')]/../following-sibling::dd/div/a").map do |i|
          code = i.text.match(/[\d\.]+/).to_s.split "."
          { field: code[0], group: code[1], subgroup: code[2] }
        end
      end

      # Fetch links.
      # NOTE(review): assumes an RSS link is always present; `.attr("href")`
      # on an empty node set would raise — confirm.
      # @param doc [Nokogiri::HTML::Document]
      # @param url [String]
      # @return [Array<Hash>] src/obp/rss link hashes
      def fetch_link(doc, url)
        obp_elms = doc.xpath("//a[contains(@href, '/obp/ui/')]")
        obp = obp_elms.attr("href").value if obp_elms.any?
        rss = DOMAIN + doc.xpath("//a[contains(@href, 'rss')]").attr("href").value
        [
          { type: "src", content: url },
          { type: "obp", content: obp },
          { type: "rss", content: rss },
        ]
      end

      # Fetch copyright.
      # Owner is the first word of the title; the from-year comes from the
      # ":YYYY" title suffix, falling back to the page's release date.
      # @param title [String]
      # @return [Hash]
      def fetch_copyright(title, doc)
        owner_name = title.match(/.*?(?=\s)/).to_s
        from = title.match(/(?<=:)\d{4}/).to_s
        if from.empty?
          from = doc.xpath("//span[@itemprop='releaseDate']").text.match(/\d{4}/).to_s
        end
        { owner: { name: owner_name }, from: from }
      end
    end

    # private
    #
    # def next_hits_page(next_page)
    #   page = @index.search @text, facetFilters: ['category:standard'],
    #                               page: next_page
    #   page.each do |key, value|
    #     if key == 'hits'
    #       @docs[key] += value
    #     else
    #       @docs[key] = value
    #     end
    #   end
    # end
  end
  # rubocop:enable Metrics/ModuleLength
end