relaton-iso 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/rspec ADDED
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ #
5
+ # This file was generated by Bundler.
6
+ #
7
+ # The application 'rspec' is installed as part of a gem, and
8
+ # this file is here to facilitate running it.
9
+ #
10
+
11
+ require "pathname"
12
+ ENV["BUNDLE_GEMFILE"] ||= File.expand_path("../../Gemfile",
13
+ Pathname.new(__FILE__).realpath)
14
+
15
+ bundle_binstub = File.expand_path("../bundle", __FILE__)
16
+
17
+ if File.file?(bundle_binstub)
18
+ if File.read(bundle_binstub, 300) =~ /This file was generated by Bundler/
19
+ load(bundle_binstub)
20
+ else
21
+ abort("Your `bin/bundle` was not generated by Bundler, so this binstub cannot run.
22
+ Replace `bin/bundle` by running `bundle binstubs bundler --force`, then run this command again.")
23
+ end
24
+ end
25
+
26
+ require "rubygems"
27
+ require "bundler/setup"
28
+
29
+ load Gem.bin_path("rspec-core", "rspec")
data/bin/safe_yaml ADDED
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ #
5
+ # This file was generated by Bundler.
6
+ #
7
+ # The application 'safe_yaml' is installed as part of a gem, and
8
+ # this file is here to facilitate running it.
9
+ #
10
+
11
+ require "pathname"
12
+ ENV["BUNDLE_GEMFILE"] ||= File.expand_path("../../Gemfile",
13
+ Pathname.new(__FILE__).realpath)
14
+
15
+ bundle_binstub = File.expand_path("../bundle", __FILE__)
16
+
17
+ if File.file?(bundle_binstub)
18
+ if File.read(bundle_binstub, 300) =~ /This file was generated by Bundler/
19
+ load(bundle_binstub)
20
+ else
21
+ abort("Your `bin/bundle` was not generated by Bundler, so this binstub cannot run.
22
+ Replace `bin/bundle` by running `bundle binstubs bundler --force`, then run this command again.")
23
+ end
24
+ end
25
+
26
+ require "rubygems"
27
+ require "bundler/setup"
28
+
29
+ load Gem.bin_path("safe_yaml", "safe_yaml")
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,22 @@
1
+ require "relaton/processor"
2
+
3
+ module Relaton
4
+ module RelatonIso
5
+ class Processor < Relaton::Processor
6
+ def initialize
7
+ @short = :relaton_iso
8
+ @prefix = "ISO"
9
+ @defaultprefix = %r{^(ISO)[ /]}
10
+ @idtype = "ISO"
11
+ end
12
+
13
+ def get(code, date, opts)
14
+ ::RelatonIso::IsoBibliography.get(code, date, opts)
15
+ end
16
+
17
+ def from_xml(xml)
18
+ RelatonIsoBib::XMLParser.from_xml xml
19
+ end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "relaton_iso/version"
4
+ require "relaton_iso/iso_bibliography"
5
+ if defined? Relaton
6
+ require_relative "relaton/processor"
7
+ Relaton::Registry.instance.register(Relaton::RelatonIso::Processor)
8
+ end
@@ -0,0 +1,55 @@
1
+ # frozen_string_literal: true
2
+
3
+ module RelatonIso
4
+ # Hit.
5
+ class Hit
6
+ # @return [RelatonIso::HitCollection]
7
+ attr_reader :hit_collection
8
+
9
+ # @return [Array<Hash>]
10
+ attr_reader :hit
11
+
12
+ # @param hit [Hash]
13
+ # @param hit_collection [RelatonIso::HitCollection]
14
+ def initialize(hit, hit_collection = nil)
15
+ @hit = hit
16
+ @hit_collection = hit_collection
17
+ end
18
+
19
+ # Parse page.
20
+ # @return [RelatonIso::IsoBibliographicItem]
21
+ def fetch
22
+ @fetch ||= Scrapper.parse_page @hit
23
+ end
24
+
25
+ # @return [String]
26
+ def to_s
27
+ inspect
28
+ end
29
+
30
+ # @return [String]
31
+ def inspect
32
+ matched_words = @hit["_highlightResult"].
33
+ reduce([]) { |a, (_k, v)| a + v["matchedWords"] }.uniq
34
+
35
+ "<#{self.class}:#{format('%#.14x', object_id << 1)} "\
36
+ "@text=\"#{@hit_collection&.hit_pages&.text}\" "\
37
+ "@fullIdentifier=\"#{@fetch&.shortref}\" "\
38
+ "@matchedWords=#{matched_words} "\
39
+ "@category=\"#{@hit['category']}\" "\
40
+ "@title=\"#{@hit['title']}\">"
41
+ end
42
+
43
+ # @param builder [Nokogiri::XML::Builder]
44
+ def to_xml(builder = nil, **opts)
45
+ if builder
46
+ fetch.to_xml builder, **opts
47
+ else
48
+ builder = Nokogiri::XML::Builder.new(encoding: "UTF-8") do |xml|
49
+ fetch.to_xml xml, **opts
50
+ end
51
+ builder.doc.root.to_xml
52
+ end
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "relaton_iso/hit"
4
+
5
+ module RelatonIso
6
+ # Page of hit collection.
7
+ class HitCollection < Array
8
+ # @return [TrueClass, FalseClass]
9
+ attr_reader :fetched
10
+
11
+ # @return [RelatonIso::HitPages]
12
+ attr_reader :hit_pages
13
+
14
+ # @param hits [Array<Hash>]
15
+ def initialize(hits, hit_pages = nil)
16
+ concat(hits.map { |h| Hit.new(h, self) })
17
+ @fetched = false
18
+ @hit_pages = hit_pages
19
+ end
20
+
21
+ # @return [RelatonIso::HitCollection]
22
+ def fetch
23
+ workers = RelatonBib::WorkersPool.new 4
24
+ workers.worker(&:fetch)
25
+ each do |hit|
26
+ workers << hit
27
+ end
28
+ workers.end
29
+ workers.result
30
+ @fetched = true
31
+ self
32
+ end
33
+
34
+ def to_s
35
+ inspect
36
+ end
37
+
38
+ def inspect
39
+ "<#{self.class}:#{format('%#.14x', object_id << 1)} @fetched=#{@fetched}>"
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,96 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "algoliasearch"
4
+ require "relaton_iso/hit_collection"
5
+
6
+ module RelatonIso
7
+ # Pages of hits.
8
+ class HitPages < Array
9
+ Algolia.init application_id: "JCL49WV5AR",
10
+ api_key: "dd1b9e1ab383f4d4817d29cd5e96d3f0"
11
+
12
+ # @return [String]
13
+ attr_reader :text
14
+
15
+ # @param text [String]
16
+ def initialize(text)
17
+ @text = text
18
+ @index = Algolia::Index.new "all_en"
19
+ resp = @index.search(text, facetFilters: ["category:standard"])
20
+ @nb_pages = resp["nbPages"]
21
+ self << HitCollection.new(resp["hits"], self)
22
+ end
23
+
24
+ # @return [RelatonIso::HitCollection]
25
+ def last
26
+ collection(@nb_pages - 1)
27
+ end
28
+
29
+ # @param i [Integer]
30
+ # @return [RelatonIso::HitCollection]
31
+ def [](idx)
32
+ # collection i
33
+ return if idx + 1 > @nb_pages
34
+
35
+ collection idx
36
+ super
37
+ end
38
+
39
+ # @return [Array]
40
+ def map(&block)
41
+ m = []
42
+ @nb_pages.times do |n|
43
+ m << yield(self[n]) if block
44
+ end
45
+ m
46
+ end
47
+
48
+ def each(&block)
49
+ @nb_pages.times do |n|
50
+ yield self[n] if block
51
+ end
52
+ end
53
+
54
+ def to_s
55
+ inspect
56
+ end
57
+
58
+ def inspect
59
+ "<#{self.class}:#{format('%#.14x', object_id << 1)} @text=#{@text} "\
60
+ "@pages=#{@nb_pages}>"
61
+ end
62
+
63
+ # @return [Integer]
64
+ def size
65
+ @nb_pages
66
+ end
67
+
68
+ def to_xml(**opts)
69
+ builder = Nokogiri::XML::Builder.new(encoding: "UTF-8") do |xml|
70
+ xml.documents do
71
+ each do |page|
72
+ page.fetch
73
+ page.each { |hit| hit.to_xml xml, **opts }
74
+ end
75
+ end
76
+ end
77
+ builder.to_xml
78
+ end
79
+
80
+ private
81
+
82
+ # @param i [Integer]
83
+ # @return [RelatonIso::HitCollection]
84
+ def collection(idx)
85
+ return if idx + 1 > @nb_pages
86
+
87
+ while Array.instance_method(:size).bind(self).call < idx + 1
88
+ resp = @index.search(@text,
89
+ facetFilters: ["category:standard"],
90
+ page: idx)
91
+ self << HitCollection.new(resp["hits"], self)
92
+ end
93
+ Array.instance_method(:[]).bind(self).call idx
94
+ end
95
+ end
96
+ end
@@ -0,0 +1,132 @@
1
+ # frozen_string_literal: true
2
+
3
+ # require 'relaton_iso/iso_bibliographic_item'
4
+ require "relaton_iso/scrapper"
5
+ require "relaton_iso/hit_pages"
6
+ require "relaton_iec"
7
+
8
+ module RelatonIso
9
+ # Class methods for search ISO standards.
10
+ class IsoBibliography
11
+ class << self
12
+ # @param text [String]
13
+ # @return [RelatonIso::HitPages]
14
+ def search(text)
15
+ HitPages.new text
16
+ rescue Algolia::AlgoliaProtocolError
17
+ warn "Could not access http://www.iso.org"
18
+ []
19
+ end
20
+
21
+ # @param text [String]
22
+ # @return [Array<RelatonIso::IsoBibliographicItem>]
23
+ # def search_and_fetch(text)
24
+ # Scrapper.get(text)
25
+ # end
26
+
27
+ # @param code [String] the ISO standard Code to look up (e.g. "ISO 9000")
28
+ # @param year [String] the year the standard was published (optional)
29
+ # @param opts [Hash] options; restricted to :all_parts if all-parts reference is required,
30
+ # :keep_year if undated reference should return actual reference with year
31
+ # @return [String] Relaton XML serialisation of reference
32
+ def get(code, year, opts)
33
+ if year.nil?
34
+ /^(?<code1>[^:]+):(?<year1>[^:]+)$/ =~ code
35
+ unless code1.nil?
36
+ code = code1
37
+ year = year1
38
+ end
39
+ end
40
+ code += "-1" if opts[:all_parts]
41
+ return Iecbib::IecBibliography.get(code, year, opts) if %r[^ISO/IEC DIR].match code
42
+
43
+ ret = isobib_get1(code, year, opts)
44
+ if ret.nil? && code =~ %r[^ISO\s]
45
+ c = code.gsub "ISO", "ISO/IEC"
46
+ warn "Attempting ISO/IEC retrieval"
47
+ ret = isobib_get1(c, year, opts)
48
+ end
49
+ return nil if ret.nil?
50
+
51
+ ret.to_most_recent_reference unless year || opts[:keep_year]
52
+ ret.to_all_parts if opts[:all_parts]
53
+ ret
54
+ end
55
+
56
+ private
57
+
58
+ def fetch_ref_err(code, year, missed_years)
59
+ id = year ? "#{code}:#{year}" : code
60
+ warn "WARNING: no match found online for #{id}. "\
61
+ "The code must be exactly like it is on the standards website."
62
+ warn "(There was no match for #{year}, though there were matches "\
63
+ "found for #{missed_years.join(', ')}.)" unless missed_years.empty?
64
+ if /\d-\d/ =~ code
65
+ warn "The provided document part may not exist, or the document "\
66
+ "may no longer be published in parts."
67
+ else
68
+ warn "If you wanted to cite all document parts for the reference, "\
69
+ "use \"#{code} (all parts)\".\nIf the document is not a standard, "\
70
+ "use its document type abbreviation (TS, TR, PAS, Guide)."
71
+ end
72
+ nil
73
+ end
74
+
75
+ def fetch_pages(s, n)
76
+ workers = RelatonBib::WorkersPool.new n
77
+ workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
78
+ s.each_with_index { |hit, i| workers << { i: i, hit: hit } }
79
+ workers.end
80
+ workers.result.sort { |x, y| x[:i] <=> y[:i] }.map { |x| x[:hit] }
81
+ end
82
+
83
+ def isobib_search_filter(code)
84
+ docidrx = %r{^(ISO|IEC)[^0-9]*\s[0-9-]+}
85
+ corrigrx = %r{^(ISO|IEC)[^0-9]*\s[0-9-]+:[0-9]+/}
86
+ warn "fetching #{code}..."
87
+ result = search(code)
88
+ result.each do |page|
89
+ ret = page.select do |i|
90
+ i.hit["title"] &&
91
+ i.hit["title"].match(docidrx).to_s == code &&
92
+ corrigrx !~ i.hit["title"]
93
+ end
94
+ return ret unless ret.empty?
95
+ end
96
+ []
97
+ end
98
+
99
+ # Sort through the results from RelatonIso, fetching them three at a time,
100
+ # and return the first result that matches the code,
101
+ # matches the year (if provided), and which has a title (amendments do not).
102
+ # Only expects the first page of results to be populated.
103
+ # Does not match corrigenda etc (e.g. ISO 3166-1:2006/Cor 1:2007)
104
+ # If no match, returns any years which caused mismatch, for error reporting
105
+ def isobib_results_filter(result, year)
106
+ missed_years = []
107
+ result.each_slice(3) do |s| # ISO website only allows 3 connections
108
+ fetch_pages(s, 3).each_with_index do |r, _i|
109
+ next if r.nil?
110
+ return { ret: r } if !year
111
+
112
+ r.dates.select { |d| d.type == "published" }.each do |d|
113
+ return { ret: r } if year.to_i == d.on.year
114
+
115
+ missed_years << d.on.year
116
+ end
117
+ end
118
+ end
119
+ { years: missed_years }
120
+ end
121
+
122
+ def isobib_get1(code, year, _opts)
123
+ # return iev(code) if /^IEC 60050-/.match code
124
+ result = isobib_search_filter(code) || return
125
+ ret = isobib_results_filter(result, year)
126
+ return ret[:ret] if ret[:ret]
127
+
128
+ fetch_ref_err(code, year, ret[:years])
129
+ end
130
+ end
131
+ end
132
+ end
@@ -0,0 +1,421 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "algoliasearch"
4
+ require "relaton_iso_bib"
5
+ require "relaton_iso/hit"
6
+ require "nokogiri"
7
+ require "net/http"
8
+
9
+ Algolia.init application_id: "JCL49WV5AR",
10
+ api_key: "dd1b9e1ab383f4d4817d29cd5e96d3f0"
11
+
12
+ module RelatonIso
13
+ # Scrapper.
14
+ # rubocop:disable Metrics/ModuleLength
15
+ module Scrapper
16
+ DOMAIN = "https://www.iso.org"
17
+
18
+ TYPES = {
19
+ "TS" => "technical-specification",
20
+ "TR" => "technical-report",
21
+ "PAS" => "publicly-available-specification",
22
+ # "AWI" => "approvedWorkItem",
23
+ # "CD" => "committeeDraft",
24
+ # "FDIS" => "finalDraftInternationalStandard",
25
+ # "NP" => "newProposal",
26
+ # "DIS" => "draftInternationalStandard",
27
+ # "WD" => "workingDraft",
28
+ # "R" => "recommendation",
29
+ "Guide" => "guide",
30
+ }.freeze
31
+
32
+ class << self
33
+ # @param text [String]
34
+ # @return [Array<Hash>]
35
+ # def get(text)
36
+ # iso_workers = RelatonBib::WorkersPool.new 4
37
+ # iso_workers.worker { |hit| iso_worker(hit, iso_workers) }
38
+ # algolia_workers = start_algolia_search(text, iso_workers)
39
+ # iso_docs = iso_workers.result
40
+ # algolia_workers.end
41
+ # algolia_workers.result
42
+ # iso_docs
43
+ # rescue
44
+ # warn "Could not connect to http://www.iso.org"
45
+ # []
46
+ # end
47
+
48
+ # Parse page.
49
+ # @param hit_data [Hash]
50
+ # @return [RelatonIsoBib::IsoBibliographicItem]
51
+ # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
52
+ def parse_page(hit_data)
53
+ return unless hit_data["path"] =~ /\d+$/
54
+
55
+ doc, url = get_page "/standard/#{hit_data['path'].match(/\d+$/)}.html"
56
+
57
+ # Fetch edition.
58
+ edition = doc&.xpath("//strong[contains(text(), 'Edition')]/..")&.
59
+ children&.last&.text&.match(/\d+/)&.to_s
60
+
61
+ titles, abstract = fetch_titles_abstract(doc)
62
+
63
+ RelatonIsoBib::IsoBibliographicItem.new(
64
+ fetched: Date.today.to_s,
65
+ docid: fetch_docid(doc),
66
+ edition: edition,
67
+ language: langs(doc).map { |l| l[:lang] },
68
+ script: langs(doc).map { |l| script(l[:lang]) }.uniq,
69
+ titles: titles,
70
+ type: fetch_type(hit_data["title"]),
71
+ docstatus: fetch_status(doc, hit_data["status"]),
72
+ ics: fetch_ics(doc),
73
+ dates: fetch_dates(doc),
74
+ contributors: fetch_contributors(hit_data["title"]),
75
+ editorialgroup: fetch_workgroup(doc),
76
+ abstract: abstract,
77
+ copyright: fetch_copyright(hit_data["title"], doc),
78
+ link: fetch_link(doc, url),
79
+ relations: fetch_relations(doc),
80
+ structuredidentifier: fetch_structuredidentifier(doc),
81
+ )
82
+ end
83
+ # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
84
+
85
+ private
86
+
87
+ # Start algolia search workers.
88
+ # @param text [String]
89
+ # @param iso_workers [RelatonBib::WorkersPool]
90
+ # @return [RelatonBib::WorkersPool]
91
+ # def start_algolia_search(text, iso_workers)
92
+ # index = Algolia::Index.new "all_en"
93
+ # algolia_workers = RelatonBib::WorkersPool.new
94
+ # algolia_workers.worker do |page|
95
+ # algolia_worker(index, text, page, algolia_workers, iso_workers)
96
+ # end
97
+
98
+ # # Add first page so algolia worker will start.
99
+ # algolia_workers << 0
100
+ # end
101
+
102
+ # Fetch ISO documents.
103
+ # @param hit [Hash]
104
+ # @param iso_workers [RelatonIso::WorkersPool]
105
+ # def iso_worker(hit, iso_workers)
106
+ # print "Parse #{iso_workers.size} of #{iso_workers.nb_hits} \r"
107
+ # parse_page hit
108
+ # end
109
+
110
+ # Fetch hits from algolia search service.
111
+ # @param index[Algolia::Index]
112
+ # @param text [String]
113
+ # @param page [Integer]
114
+ # @param algolia_workers [RelatonBib::WorkersPool]
115
+ # @param iso_workers [RelatonBib::WorkersPool]
116
+ # def algolia_worker(index, text, page, algolia_workers, iso_workers)
117
+ # res = index.search text, facetFilters: ["category:standard"], page: page
118
+ # next_page = res["page"] + 1
119
+ # algolia_workers << next_page if next_page < res["nbPages"]
120
+ # res["hits"].each do |hit|
121
+ # iso_workers.nb_hits = res["nbHits"]
122
+ # iso_workers << hit
123
+ # end
124
+ # iso_workers.end unless next_page < res["nbPages"]
125
+ # end
126
+
127
+ # Fetch titles and abstracts.
128
+ # @param doc [Nokogiri::HTML::Document]
129
+ # @return [Array<Array>]
130
+ # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
131
+ def fetch_titles_abstract(doc)
132
+ titles = []
133
+ abstract = []
134
+ langs(doc).each do |lang|
135
+ # Don't need to get page for en. We already have it.
136
+ d = lang[:path] ? get_page(lang[:path])[0] : doc
137
+
138
+ # Check if unavailable for the lang.
139
+ next if d.css("h5.help-block").any?
140
+
141
+ titles << fetch_title(d, lang[:lang])
142
+
143
+ # Fetch abstracts.
144
+ abstract_content = d.css("div[itemprop='description'] p").text
145
+ next if abstract_content.empty?
146
+
147
+ abstract << {
148
+ content: abstract_content,
149
+ language: lang[:lang],
150
+ script: script(lang[:lang]),
151
+ format: "text/plain",
152
+ }
153
+ end
154
+ [titles, abstract]
155
+ end
156
+ # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
157
+
158
+ # Get langs.
159
+ # @param doc [Nokogiri::HTML::Document]
160
+ # @return [Array<Hash>]
161
+ def langs(doc)
162
+ lgs = [{ lang: "en" }]
163
+ doc.css("ul#lang-switcher ul li a").each do |lang_link|
164
+ lang_path = lang_link.attr("href")
165
+ lang = lang_path.match(%r{^\/(fr)\/})
166
+ lgs << { lang: lang[1], path: lang_path } if lang
167
+ end
168
+ lgs
169
+ end
170
+
171
+ # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
172
+ # Get page.
173
+ # @param path [String] page's path
174
+ # @return [Array<Nokogiri::HTML::Document, String>]
175
+ def get_page(path)
176
+ url = DOMAIN + path
177
+ uri = URI url
178
+ resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
179
+ if resp.code == "301"
180
+ path = resp["location"]
181
+ url = DOMAIN + path
182
+ uri = URI url
183
+ resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
184
+ end
185
+ n = 0
186
+ while resp.body !~ /<strong/ && n < 10
187
+ resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
188
+ n += 1
189
+ end
190
+ [Nokogiri::HTML(resp.body), url]
191
+ end
192
+ # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
193
+
194
+ # Fetch docid.
195
+ # @param doc [Nokogiri::HTML::Document]
196
+ # @return [Array<RelatonBib::DocumentIdentifier>]
197
+ def fetch_docid(doc)
198
+ item_ref = doc.at("//strong[@id='itemReference']")
199
+ return [] unless item_ref
200
+
201
+ [RelatonBib::DocumentIdentifier.new(id: item_ref.text, type: "ISO")]
202
+ end
203
+
204
+ # @param doc [Nokogiri::HTML::Document]
205
+ def fetch_structuredidentifier(doc)
206
+ item_ref = doc.at("//strong[@id='itemReference']")
207
+ unless item_ref
208
+ return RelatonIsoBib::StructuredIdentifier.new(
209
+ project_number: "?", part_number: "", prefix: nil, id: "?",
210
+ )
211
+ end
212
+
213
+ m = item_ref.text.match(/^(.*?\d+)-?((?<=-)\d+|)/)
214
+ RelatonIsoBib::StructuredIdentifier.new(
215
+ project_number: m[1], part_number: m[2], prefix: nil,
216
+ id: item_ref.text, type: "ISO"
217
+ )
218
+ end
219
+
220
+ # Fetch status.
221
+ # @param doc [Nokogiri::HTML::Document]
222
+ # @param status [String]
223
+ # @return [Hash]
224
+ def fetch_status(doc, _status)
225
+ stage, substage = doc.css("li.dropdown.active span.stage-code > strong").text.split "."
226
+ RelatonBib::DocumentStatus.new(stage: stage, substage: substage)
227
+ end
228
+
229
+ # Fetch workgroup.
230
+ # @param doc [Nokogiri::HTML::Document]
231
+ # @return [Hash]
232
+ def fetch_workgroup(doc)
233
+ wg_link = doc.css("div.entry-name.entry-block a")[0]
234
+ # wg_url = DOMAIN + wg_link['href']
235
+ workgroup = wg_link.text.split "/"
236
+ {
237
+ name: "International Organization for Standardization",
238
+ abbreviation: "ISO",
239
+ url: "www.iso.org",
240
+ technical_committee: [{
241
+ name: wg_link.text + doc.css("div.entry-title")[0].text,
242
+ type: "TC",
243
+ number: workgroup[1]&.match(/\d+/)&.to_s&.to_i,
244
+ }],
245
+ }
246
+ end
247
+
248
+ # rubocop:disable Metrics/MethodLength
249
+
250
+ # Fetch relations.
251
+ # @param doc [Nokogiri::HTML::Document]
252
+ # @return [Array<Hash>]
253
+ def fetch_relations(doc)
254
+ doc.css("ul.steps li").reduce([]) do |a, r|
255
+ r_type = r.css("strong").text
256
+ type = case r_type
257
+ when "Previously", "Will be replaced by" then "obsoletes"
258
+ when "Corrigenda/Amendments", "Revised by", "Now confirmed"
259
+ "updates"
260
+ else r_type
261
+ end
262
+ if ["Now", "Now under review"].include? type
263
+ a
264
+ else
265
+ a + r.css("a").map do |id|
266
+ fref = RelatonBib::FormattedRef.new(
267
+ content: id.text, format: "text/plain",
268
+ )
269
+ bibitem = RelatonIsoBib::IsoBibliographicItem.new(
270
+ formattedref: fref,
271
+ )
272
+ { type: type, bibitem: bibitem }
273
+ end
274
+ end
275
+ end
276
+ end
277
+ # rubocop:enable Metrics/MethodLength
278
+
279
+ # Fetch type.
280
+ # @param title [String]
281
+ # @return [String]
282
+ def fetch_type(title)
283
+ type_match = title.match(%r{^(ISO|IWA|IEC)(?:(/IEC|/IEEE|/PRF|
284
+ /NP)*\s|/)(TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))}x)
285
+ # return "international-standard" if type_match.nil?
286
+ if TYPES[type_match[3]]
287
+ TYPES[type_match[3]]
288
+ elsif type_match[1] == "ISO"
289
+ "international-standard"
290
+ elsif type_match[1] == "IWA"
291
+ "international-workshop-agreement"
292
+ end
293
+ # rescue => _e
294
+ # puts 'Unknown document type: ' + title
295
+ end
296
+
297
+ # Fetch titles.
298
+ # @param doc [Nokogiri::HTML::Document]
299
+ # @param lang [String]
300
+ # @return [Hash]
301
+ def fetch_title(doc, lang)
302
+ titles = doc.at("//h3[@itemprop='description'] | //h2[@itemprop='description']").
303
+ text.split " -- "
304
+ case titles.size
305
+ when 0
306
+ intro, main, part = nil, "", nil
307
+ when 1
308
+ intro, main, part = nil, titles[0], nil
309
+ when 2
310
+ if /^(Part|Partie) \d+:/ =~ titles[1]
311
+ intro, main, part = nil, titles[0], titles[1]
312
+ else
313
+ intro, main, part = titles[0], titles[1], nil
314
+ end
315
+ when 3
316
+ intro, main, part = titles[0], titles[1], titles[2]
317
+ else
318
+ intro, main, part = titles[0], titles[1], titles[2..-1]&.join(" -- ")
319
+ end
320
+ {
321
+ title_intro: intro,
322
+ title_main: main,
323
+ title_part: part,
324
+ language: lang,
325
+ script: script(lang),
326
+ }
327
+ end
328
+
329
+ # Return ISO script code.
330
+ # @param lang [String]
331
+ # @return [String]
332
+ def script(lang)
333
+ case lang
334
+ when "en", "fr" then "Latn"
335
+ end
336
+ end
337
+
338
+ # Fetch dates
339
+ # @param doc [Nokogiri::HTML::Document]
340
+ # @return [Array<Hash>]
341
+ def fetch_dates(doc)
342
+ dates = []
343
+ publish_date = doc.xpath("//span[@itemprop='releaseDate']").text
344
+ unless publish_date.empty?
345
+ dates << { type: "published", on: publish_date }
346
+ end
347
+ dates
348
+ end
349
+
350
+ # rubocop:disable Metrics/MethodLength
351
+ def fetch_contributors(title)
352
+ title.sub(/\s.*/, "").split("/").map do |abbrev|
353
+ case abbrev
354
+ when "IEC"
355
+ name = "International Electrotechnical Commission"
356
+ url = "www.iec.ch"
357
+ else
358
+ name = "International Organization for Standardization"
359
+ url = "www.iso.org"
360
+ end
361
+ { entity: { name: name, url: url, abbreviation: abbrev },
362
+ roles: ["publisher"] }
363
+ end
364
+ end
365
+ # rubocop:enable Metrics/MethodLength
366
+
367
+ # Fetch ICS.
368
+ # @param doc [Nokogiri::HTML::Document]
369
+ # @return [Array<Hash>]
370
+ def fetch_ics(doc)
371
+ doc.xpath("//strong[contains(text(), "\
372
+ "'ICS')]/../following-sibling::dd/div/a").map do |i|
373
+ code = i.text.match(/[\d\.]+/).to_s.split "."
374
+ { field: code[0], group: code[1], subgroup: code[2] }
375
+ end
376
+ end
377
+
378
+ # Fetch links.
379
+ # @param doc [Nokogiri::HTML::Document]
380
+ # @param url [String]
381
+ # @return [Array<Hash>]
382
+ def fetch_link(doc, url)
383
+ obp_elms = doc.xpath("//a[contains(@href, '/obp/ui/')]")
384
+ obp = obp_elms.attr("href").value if obp_elms.any?
385
+ rss = DOMAIN + doc.xpath("//a[contains(@href, 'rss')]").attr("href").value
386
+ [
387
+ { type: "src", content: url },
388
+ { type: "obp", content: obp },
389
+ { type: "rss", content: rss },
390
+ ]
391
+ end
392
+
393
+ # Fetch copyright.
394
+ # @param title [String]
395
+ # @return [Hash]
396
+ def fetch_copyright(title, doc)
397
+ owner_name = title.match(/.*?(?=\s)/).to_s
398
+ from = title.match(/(?<=:)\d{4}/).to_s
399
+ if from.empty?
400
+ from = doc.xpath("//span[@itemprop='releaseDate']").text.match(/\d{4}/).to_s
401
+ end
402
+ { owner: { name: owner_name }, from: from }
403
+ end
404
+ end
405
+
406
+ # private
407
+ #
408
+ # def next_hits_page(next_page)
409
+ # page = @index.search @text, facetFilters: ['category:standard'],
410
+ # page: next_page
411
+ # page.each do |key, value|
412
+ # if key == 'hits'
413
+ # @docs[key] += value
414
+ # else
415
+ # @docs[key] = value
416
+ # end
417
+ # end
418
+ # end
419
+ end
420
+ # rubocop:enable Metrics/ModuleLength
421
+ end