relaton-iec 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.rspec +3 -0
- data/.rubocop.yml +10 -0
- data/.travis.yml +16 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +92 -0
- data/LICENSE.txt +21 -0
- data/README.adoc +52 -0
- data/Rakefile +6 -0
- data/appveyor.yml +30 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/relaton/processor.rb +23 -0
- data/lib/relaton_iec.rb +10 -0
- data/lib/relaton_iec/hit.rb +51 -0
- data/lib/relaton_iec/hit_collection.rb +67 -0
- data/lib/relaton_iec/iec_bibliography.rb +156 -0
- data/lib/relaton_iec/scrapper.rb +413 -0
- data/lib/relaton_iec/statuses.yml +132 -0
- data/lib/relaton_iec/version.rb +3 -0
- data/relaton_iec.gemspec +38 -0
- metadata +235 -0
@@ -0,0 +1,156 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# require 'isobib/iso_bibliographic_item'
|
4
|
+
require "relaton_iec/scrapper"
|
5
|
+
require "relaton_iec/hit_collection"
|
6
|
+
require "date"
|
7
|
+
|
8
|
+
module RelatonIec
|
9
|
+
# Class methods for searching IEC standards.
|
10
|
+
class IecBibliography
|
11
|
+
class << self
|
12
|
+
# Build a hit collection for the given reference text.
#
# @param text [String] reference to search for
# @param year [String, nil] optional year, handed to HitCollection unchanged
# @return [RelatonIec::HitCollection, Array] the hit collection, or an empty
#   array when a SocketError or OpenURI::HTTPError is raised while building it
def search(text, year = nil)
  HitCollection.new(text, year)
rescue SocketError, OpenURI::HTTPError
  # Network trouble is reported on stderr and treated as "no hits".
  warn "Could not access http://www.iec.ch"
  []
end
|
20
|
+
|
21
|
+
# @param text [String]
|
22
|
+
# @return [Array<IsoBibliographicItem>]
|
23
|
+
# def search_and_fetch(text, year = nil)
|
24
|
+
# Scrapper.get(text, year)
|
25
|
+
# end
|
26
|
+
|
27
|
+
# Look up one document and return the item found for it.
#
# @param code [String] the IEC standard code to look up (e.g. "IEC 60050");
#   when no year is given, a "CODE:YEAR" string is split into code and year
# @param year [String] the year the standard was published (optional)
# @param opts [Hash] options; :all_parts appends "-1" to the code and calls
#   #to_all_parts on the result, :keep_year skips #to_most_recent_reference
# @return [Object, nil] whatever #iev (for the code "IEV") or #iecbib_get1
#   returns; nil when nothing was found
def get(code, year = nil, opts = {})
  if year.nil?
    embedded = /^(?<code1>[^:]+):(?<year1>[^:]+)$/.match(code)
    code, year = embedded[:code1], embedded[:year1] if embedded
  end

  return iev if code.casecmp("IEV").zero?

  all_parts = opts[:all_parts]
  code += "-1" if all_parts
  item = iecbib_get1(code, year, opts)
  return nil if item.nil?

  # Both conversions are invoked for their effect on the item itself;
  # their return values are not used.
  item.to_most_recent_reference unless year || opts[:keep_year]
  item.to_all_parts if all_parts
  item
end
|
50
|
+
|
51
|
+
private
|
52
|
+
|
53
|
+
# Write warnings to stderr explaining that a reference could not be resolved.
#
# @param code [String] the document code that was looked up
# @param year [String, nil] the requested year, if any
# @param missed_years [Array] years of hits that matched the code but not
#   the requested year
# @return [nil] always nil
def fetch_ref_err(code, year, missed_years)
  id = year ? "#{code}:#{year}" : code
  warn "WARNING: no match found online for #{id}. "\
    "The code must be exactly like it is on the standards website."
  unless missed_years.empty?
    # Several rejected hits can share a year; mention each year only once.
    warn "(There was no match for #{year}, though there were matches "\
      "found for #{missed_years.uniq.join(', ')}.)"
  end
  if /\d-\d/ =~ code
    # The code already names a part ("digit-digit").
    warn "The provided document part may not exist, or the document "\
      "may no longer be published in parts."
  else
    warn "If you wanted to cite all document parts for the reference, "\
      "use \"#{code} (all parts)\".\nIf the document is not a standard, "\
      "use its document type abbreviation (TS, TR, PAS, Guide)."
  end
  nil
end
|
69
|
+
|
70
|
+
# Call #fetch on every hit through a RelatonBib::WorkersPool and return the
# fetched values in the same order as the hits were given.
#
# @param s [Enumerable] hits responding to #fetch
# @param n [Integer] argument for RelatonBib::WorkersPool.new
#   (NOTE(review): presumably the number of workers - confirm)
# @return [Array] fetched values, ordered like s
def fetch_pages(s, n)
  pool = RelatonBib::WorkersPool.new n
  pool.worker { |job| { i: job[:i], hit: job[:hit].fetch } }
  # Tag each hit with its position so the original order can be restored.
  s.each_with_index { |hit, position| pool << { i: position, hit: hit } }
  pool.end
  pool.result.sort_by { |entry| entry[:i] }.map { |entry| entry[:hit] }
end
|
77
|
+
|
78
|
+
# Search for a code and keep only the hits whose identifier prefix equals it.
#
# A hit is kept when its :code is present, the part of it matched by the
# document-id pattern is exactly `code`, and it does not look like a
# corrigendum/amendment reference ("<id>:<year>/...").
#
# @param code [String] the document code to search for
# @return [Array] the hits from #search that passed the filter
def isobib_search_filter(code)
  docidrx = %r{^(ISO|IEC)[^0-9]*\s[0-9-]+}
  corrigrx = %r{^(ISO|IEC)[^0-9]*\s[0-9-]+:[0-9]+/}
  warn "fetching #{code}..."
  search(code).select do |candidate|
    reference = candidate.hit[:code]
    next false unless reference

    reference.match(docidrx).to_s == code && corrigrx !~ reference
  end
end
|
89
|
+
|
90
|
+
# Build the fixed bibliographic entry for the International Electrotechnical
# Vocabulary by parsing an inline XML template.
#
# @param code [String] identifier placed (with ":2011" appended) into the
#   docidentifier element
# @return [Object] whatever RelatonIsoBib::XMLParser.from_xml returns
def iev(code = "IEC 60050")
  # The fetched date is today's date; everything else is static text.
  template = <<~"END"
    <bibitem>
    <fetched>#{Date.today}</fetched>
    <title format="text/plain" language="en" script="Latn">International Electrotechnical Vocabulary</title>
    <link type="src">http://www.electropedia.org</link>
    <docidentifier>#{code}:2011</docidentifier>
    <date type="published"><on>2011</on></date>
    <contributor>
    <role type="publisher"/>
    <organization>
    <name>International Electrotechnical Commission</name>
    <abbreviation>IEC</abbreviation>
    <uri>www.iec.ch</uri>
    </organization>
    </contributor>
    <language>en</language> <language>fr</language>
    <script>Latn</script>
    <status> <stage>60</stage> </status>
    <copyright>
    <from>2018</from>
    <owner>
    <organization>
    <name>International Electrotechnical Commission</name>
    <abbreviation>IEC</abbreviation>
    <uri>www.iec.ch</uri>
    </organization>
    </owner>
    </copyright>
    </bibitem>
  END
  RelatonIsoBib::XMLParser.from_xml(template)
end
|
122
|
+
|
123
|
+
# Walk the search results three at a time, fetching each batch, and return
# the first fetched item that is acceptable.
#
# Without a year the very first fetched item is returned. With a year, an
# item is returned as soon as one of its "published" dates falls in that
# year; the years of the other published dates seen are collected for error
# reporting.
#
# @param result [Enumerable] hits to fetch
# @param year [String, nil] required publication year, if any
# @return [Hash] { ret: item } on success, otherwise { years: [...] } with
#   the publication years that did not match
def isobib_results_filter(result, year)
  missed_years = []
  result.each_slice(3) do |batch| # ISO website only allows 3 connections
    fetch_pages(batch, 3).each do |item|
      return { ret: item } unless year

      item.dates.each do |date|
        next unless date.type == "published"
        return { ret: item } if year.to_i == date.on.year

        missed_years << date.on.year
      end
    end
  end
  { years: missed_years }
end
|
144
|
+
|
145
|
+
# Resolve one code (and optional year) to a fetched item.
#
# @param code [String] document code; "IEV" (any case) short-circuits to #iev
# @param year [String, nil] required publication year, if any
# @param _opts [Hash] accepted but not used here
# @return [Object, nil] the matching item, or nil after warnings were
#   emitted by #fetch_ref_err
def iecbib_get1(code, year, _opts)
  return iev if code.casecmp("IEV").zero?

  hits = isobib_search_filter(code)
  return unless hits

  outcome = isobib_results_filter(hits, year)
  # No acceptable item: report the failure (which yields nil).
  outcome[:ret] || fetch_ref_err(code, year, outcome[:years])
end
|
154
|
+
end
|
155
|
+
end
|
156
|
+
end
|
@@ -0,0 +1,413 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "relaton_iso_bib"
|
4
|
+
require "relaton_iec/hit"
|
5
|
+
require "nokogiri"
|
6
|
+
require "net/http"
|
7
|
+
|
8
|
+
# Capybara.request_driver :poltergeist do |app|
|
9
|
+
# Capybara::Poltergeist::Driver.new app, js_errors: false
|
10
|
+
# end
|
11
|
+
# Capybara.default_driver = :poltergeist
|
12
|
+
|
13
|
+
module RelatonIec
|
14
|
+
# Scrapper.
|
15
|
+
# rubocop:disable Metrics/ModuleLength
|
16
|
+
module Scrapper
|
17
|
+
# Base URL of the IEC webstore; page and AJAX URLs are built from it.
DOMAIN = "https://webstore.iec.ch"

# Document-type abbreviations mapped to type names.
# NOTE(review): only commented-out code in this file references this table.
TYPES = {
  "ISO" => "international-standard",
  "TS" => "technicalSpecification",
  "TR" => "technicalReport",
  "PAS" => "publiclyAvailableSpecification",
  "AWI" => "approvedWorkItem", # was misspelled "appruvedWorkItem"
  "CD" => "committeeDraft",
  "FDIS" => "finalDraftInternationalStandard",
  "NP" => "newProposal",
  "DIS" => "draftInternationalStandard",
  "WD" => "workingDraft",
  "R" => "recommendation",
  "Guide" => "guide",
}.freeze
|
33
|
+
|
34
|
+
class << self
|
35
|
+
# @param text [String]
|
36
|
+
# @return [Array<Hash>]
|
37
|
+
# def get(text)
|
38
|
+
# iso_workers = WorkersPool.new 4
|
39
|
+
# iso_workers.worker { |hit| iso_worker(hit, iso_workers) }
|
40
|
+
# algolia_workers = start_algolia_search(text, iso_workers)
|
41
|
+
# iso_docs = iso_workers.result
|
42
|
+
# algolia_workers.end
|
43
|
+
# algolia_workers.result
|
44
|
+
# iso_docs
|
45
|
+
# end
|
46
|
+
|
47
|
+
# Download a hit's webstore page and assemble a bibliographic item from it.
#
# @param hit_data [Hash] hit attributes; :url and :code are read here and
#   the whole hash is passed to #fetch_titles
# @return [RelatonIsoBib::IsoBibliographicItem]
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
def parse_page(hit_data)
  page_url = hit_data[:url]
  reference = hit_data[:code]
  doc = get_page page_url

  # Edition is the span in the table cell next to the "Edition" header.
  edition = doc.at("//th[contains(., 'Edition')]/following-sibling::td/span").text

  # Status and relations come from a separate request.
  status, relations = fetch_status_relations page_url

  RelatonIsoBib::IsoBibliographicItem.new(
    docid: [RelatonBib::DocumentIdentifier.new(id: reference, type: "IEC")],
    structuredidentifier: fetch_structuredidentifier(doc),
    edition: edition,
    language: ["en"],
    script: ["Latn"],
    titles: fetch_titles(hit_data),
    type: fetch_type(doc),
    docstatus: status,
    ics: fetch_ics(doc),
    dates: fetch_dates(doc),
    contributors: fetch_contributors(reference),
    editorialgroup: fetch_workgroup(doc),
    abstract: fetch_abstract(doc),
    copyright: fetch_copyright(reference, doc),
    link: fetch_link(doc, page_url),
    relations: relations,
  )
end
|
78
|
+
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
79
|
+
|
80
|
+
private
|
81
|
+
|
82
|
+
# Start search workers.
|
83
|
+
# @param text[String]
|
84
|
+
# @param iec_workers [Isobib::WorkersPool]
|
85
|
+
# @return [Isobib::WorkersPool]
|
86
|
+
# def start_algolia_search(text, iec_workers)
|
87
|
+
# index = Algolia::Index.new 'all_en'
|
88
|
+
# workers = WorkersPool.new
|
89
|
+
# workers.worker do |page|
|
90
|
+
# algolia_worker(index, text, page, workers, iec_workers)
|
91
|
+
# end
|
92
|
+
|
93
|
+
# # Add first page so search worker will start.
|
94
|
+
# workers << 0
|
95
|
+
# end
|
96
|
+
|
97
|
+
# Fetch ISO documents.
|
98
|
+
# @param hit [Hash]
|
99
|
+
# @param isiso_workers [Isobib::WorkersPool]
|
100
|
+
# def iso_worker(hit, iso_workers)
|
101
|
+
# print "Parse #{iso_workers.size} of #{iso_workers.nb_hits} \r"
|
102
|
+
# parse_page hit
|
103
|
+
# end
|
104
|
+
|
105
|
+
# Fetch hits from algolia search service.
|
106
|
+
# @param index[Algolia::Index]
|
107
|
+
# @param text [String]
|
108
|
+
# @param page [Integer]
|
109
|
+
# @param algolia_workers [Isobib::WorkersPool]
|
110
|
+
# @param isiso_workers [Isobib::WorkersPool]
|
111
|
+
# def algolia_worker(index, text, page, algolia_workers, iso_workers)
|
112
|
+
# res = index.search text, facetFilters: ['category:standard'], page: page
|
113
|
+
# next_page = res['page'] + 1
|
114
|
+
# algolia_workers << next_page if next_page < res['nbPages']
|
115
|
+
# res['hits'].each do |hit|
|
116
|
+
# iso_workers.nb_hits = res['nbHits']
|
117
|
+
# iso_workers << hit
|
118
|
+
# end
|
119
|
+
# iso_workers.end unless next_page < res['nbPages']
|
120
|
+
# end
|
121
|
+
|
122
|
+
# Fetch the abstract from the element marked itemprop="description".
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] one English plain-text abstract entry, or an empty
#   array when the page has no description element
def fetch_abstract(doc)
  description = doc.at('//div[@itemprop="description"]')
  # A page without a description simply has no abstract; do not fail on nil.
  return [] unless description

  [{
    content: description.text,
    language: "en",
    script: "Latn",
    format: "text/plain",
  }]
end
|
134
|
+
|
135
|
+
# Get langs.
|
136
|
+
# @param doc [Nokogiri::HTML::Document]
|
137
|
+
# @return [Array<Hash>]
|
138
|
+
# def langs(doc)
|
139
|
+
# lgs = [{ lang: 'en' }]
|
140
|
+
# doc.css('ul#lang-switcher ul li a').each do |lang_link|
|
141
|
+
# lang_path = lang_link.attr('href')
|
142
|
+
# lang = lang_path.match(%r{^\/(fr)\/})
|
143
|
+
# lgs << { lang: lang[1], path: lang_path } if lang
|
144
|
+
# end
|
145
|
+
# lgs
|
146
|
+
# end
|
147
|
+
|
148
|
+
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
149
|
+
|
150
|
+
# Download a page and parse it as HTML, following one HTTP 301 redirect.
#
# @param url [String] address of the page
# @return [Nokogiri::HTML::Document] the parsed response body
def get_page(url)
  uri = URI url
  resp = Net::HTTP.get_response(uri)
  if resp.code == "301"
    # Resolve the Location header against DOMAIN so that absolute URLs and
    # paths without a leading slash both yield a valid address.
    uri = URI.join(DOMAIN, resp["location"])
    resp = Net::HTTP.get_response(uri)
  end
  Nokogiri::HTML(resp.body)
end
|
169
|
+
# rubocop:enable Metrics/AbcSize
|
170
|
+
|
171
|
+
# Build the structured identifier from the span marked itemprop="productID".
#
# The project, part and subpart numbers are taken from the first digit run
# that follows whitespace; part and subpart are empty strings when absent.
# When the span is missing, a placeholder identifier ("?") is returned.
#
# @param doc [Nokogiri::HTML::Document]
# @return [RelatonIsoBib::StructuredIdentifier]
def fetch_structuredidentifier(doc)
  ref_node = doc.at("//span[@itemprop='productID']")
  if ref_node
    numbers = /(?<=\s)(?<project>\d+)-?(?<part>(?<=-)\d+|)-?(?<subpart>(?<=-)\d+|)/.match(ref_node.text)
    RelatonIsoBib::StructuredIdentifier.new(
      project_number: numbers[:project],
      part_number: numbers[:part],
      subpart_number: numbers[:subpart],
      prefix: nil,
      type: "IEC",
      id: ref_node.text,
    )
  else
    RelatonIsoBib::StructuredIdentifier.new(
      project_number: "?", part_number: "", prefix: nil, id: "?",
    )
  end
end
|
194
|
+
|
195
|
+
# Determine the document status from the status XML.
#
# If a ROW whose STATUS is "PREPARING" exists, its STAGE text is looked up
# in statuses.yml and the "stage" entry ("<stage>.<substage>") is split;
# otherwise the document is treated as stage 60, substage 60.
#
# @param doc [Nokogiri::XML::Document]
# @return [RelatonBib::DocumentStatus]
def fetch_status(doc)
  wip = doc.at('//ROW[STATUS[.="PREPARING"]]')
  if wip
    # Resolve the data file next to this source file, so the lookup works
    # regardless of the process working directory.
    statuses = YAML.load_file File.join(__dir__, "statuses.yml")
    stage_key = wip.at("STAGE").text
    stage, substage = statuses[stage_key]["stage"].split "."
  else
    stage = "60"
    substage = "60"
  end
  RelatonBib::DocumentStatus.new(stage: stage, substage: substage)
end
|
213
|
+
# rubocop:enable Metrics/MethodLength
|
214
|
+
|
215
|
+
# Describe the editorial group using the link next to the "TC" table header.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Hash] IEC organisation details plus one technical committee whose
#   number is the first digit run in the committee name (nil if it has none)
def fetch_workgroup(doc)
  committee = doc.at('//th/abbr[.="TC"]/../following-sibling::td/a').text
  technical_committee = {
    name: committee,
    type: "technicalCommittee",
    number: committee[/\d+/]&.to_i,
  }
  {
    name: "International Electrotechnical Commission",
    abbreviation: "IEC",
    url: "webstore.iec.ch",
    technical_committee: [technical_committee],
  }
end
|
231
|
+
|
232
|
+
# Build relations from every ROW whose STATUS is neither "PREPARING" nor
# "PUBLISHED".
#
# The lower-cased STATUS becomes the relation type, except that "revised"
# and "replaced" map to "updates" and "withdrawn" maps to "obsoletes". The
# related item carries only a formatted reference taken from FULL_NAME.
#
# @param doc [Nokogiri::XML::Document]
# @return [Array<Hash>] hashes with :type and :bibitem
# rubocop:disable Metrics/MethodLength
def fetch_relations(doc)
  relation_types = {
    "revised" => "updates",
    "replaced" => "updates",
    "withdrawn" => "obsoletes",
  }
  rows = doc.xpath('//ROW[STATUS[.!="PREPARING"]][STATUS[.!="PUBLISHED"]]')
  rows.map do |row|
    status = row.at("STATUS").text.downcase
    fref = RelatonBib::FormattedRef.new(
      content: row.at("FULL_NAME").text, format: "text/plain",
    )
    {
      type: relation_types.fetch(status, status),
      bibitem: RelatonIsoBib::IsoBibliographicItem.new(formattedref: fref),
    }
  end
end
|
255
|
+
|
256
|
+
# Request the status XML for a publication and derive status and relations.
#
# The publication id is the trailing digit run of the page URL.
#
# @param url [String] webstore page URL
# @return [Array] two elements: the result of #fetch_status and the result
#   of #fetch_relations for the downloaded XML
def fetch_status_relations(url)
  publication_id = url[/\d+$/].to_s
  endpoint = "#{DOMAIN}/webstore/webstore.nsf/AjaxRequestXML?"\
    "Openagent&url=http://www.iec.ch/dyn/www/f?"\
    "p=103:390:::::P390_PUBLICATION_ID:#{publication_id}"
  xml = Nokogiri::XML(Net::HTTP.get_response(URI(endpoint)).body)
  [fetch_status(xml), fetch_relations(xml)]
  # doc.css('ul.steps li').inject([]) do |a, r|
  #   r_type = r.css('strong').text
  #   type = case r_type
  #          when 'Previously', 'Will be replaced by' then 'obsoletes'
  #          when 'Corrigenda/Amendments', 'Revised by', 'Now confirmed'
  #            'updates'
  #          else r_type
  #          end
  #   if ['Now', 'Now under review'].include? type
  #     a
  #   else
  #     a + r.css('a').map do |id|
  #       { type: type, identifier: id.text, url: id['href'] }
  #     end
  #   end
  # end
end
|
283
|
+
# rubocop:enable Metrics/MethodLength
|
284
|
+
|
285
|
+
# Read the publication type from the cell next to the "Publication type"
# header, lower-cased with spaces replaced by hyphens.
#
# @param doc [Nokogiri::HTML::Document]
# @return [String]
def fetch_type(doc)
  label = doc.at('//th[contains(., "Publication type")]/following-sibling::td/span').text
  label.downcase.tr(" ", "-")
  # type_match = title.match(%r{^(ISO|IWA|IEC)(?:(/IEC|/IEEE|/PRF|
  #   /NP)*\s|/)(TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))}x)
  # #return "international-standard" if type_match.nil?
  # if TYPES[type_match[2]]
  #   TYPES[type_match[2]]
  # elsif type_match[1]
  # elsif type_match[1] == 'ISO'
  #   'international-standard'
  # elsif type_match[1] == 'IWA'
  #   'international-workshop-agreement'
  # end
  # # rescue => _e
  # #   puts 'Unknown document type: ' + title
end
|
305
|
+
|
306
|
+
# Split a hit's title on " - " into introduction, main and part titles.
#
# One segment is the main title. Two segments are main + part when the
# second starts with "Part N:"/"Partie N:", otherwise intro + main. With
# three or more segments the first two are intro and main and the remainder
# is joined with " -- " as the part title.
#
# @param hit_data [Hash] hit attributes; :title is read
# @return [Array<Hash>] a single English title hash
def fetch_titles(hit_data)
  segments = hit_data[:title].split " - "
  intro, main, part =
    case segments.size
    when 0 then [nil, "", nil]
    when 1 then [nil, segments[0], nil]
    when 2
      if /^(Part|Partie) \d+:/ =~ segments[1]
        [nil, segments[0], segments[1]]
      else
        [segments[0], segments[1], nil]
      end
    else
      [segments[0], segments[1], segments[2..-1].join(" -- ")]
    end
  [{
    title_intro: intro,
    title_main: main,
    title_part: part,
    language: "en",
    script: "Latn",
  }]
end
|
335
|
+
|
336
|
+
# Return ISO script code.
|
337
|
+
# @param lang [String]
|
338
|
+
# @return [String]
|
339
|
+
# def script(lang)
|
340
|
+
# case lang
|
341
|
+
# when 'en', 'fr' then 'Latn'
|
342
|
+
# end
|
343
|
+
# end
|
344
|
+
|
345
|
+
# Fetch the publication date from the span marked itemprop="releaseDate".
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] one { type: "published", on: <text> } entry, or an
#   empty array when the span is missing or empty
def fetch_dates(doc)
  date_node = doc.at("//span[@itemprop='releaseDate']")
  # A page without a release date yields no dates instead of failing on nil.
  publish_date = date_node ? date_node.text : ""
  return [] if publish_date.empty?

  [{ type: "published", on: publish_date }]
end
|
356
|
+
|
357
|
+
# List the publishers named in the prefix of a document code.
#
# Everything from the first whitespace on is dropped and the remainder is
# split on "/"; name and url are filled in for "ISO" and "IEC" and are nil
# for any other abbreviation.
#
# @param code [String] document code, e.g. "ISO/IEC 1234"
# @return [Array<Hash>] one publisher entry per abbreviation
def fetch_contributors(code)
  publishers = {
    "ISO" => ["International Organization for Standardization", "www.iso.org"],
    "IEC" => ["International Electrotechnical Commission", "www.iec.ch"],
  }
  code.sub(/\s.*/, "").split("/").map do |abbrev|
    name, url = publishers[abbrev]
    { entity: { name: name, url: url, abbreviation: abbrev },
      roles: ["publisher"] }
  end
end
|
371
|
+
|
372
|
+
# Collect ICS codes from the links next to the "ICS" table header.
#
# The first run of digits and dots in each link text is split on "." into
# field, group and subgroup; parts that are absent are nil.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Array<Hash>] hashes with :field, :group and :subgroup
def fetch_ics(doc)
  doc.xpath('//th[contains(text(), "ICS")]/following-sibling::td/a').map do |anchor|
    field, group, subgroup = anchor.text[/[\d\.]+/].to_s.split(".")
    { field: field, group: group, subgroup: subgroup }
  end
end
|
381
|
+
|
382
|
+
# Build the link list: the source URL, plus the preview link when the page
# has an anchor matching "p.btn-preview a".
#
# @param doc [Nokogiri::HTML::Document]
# @param url [String] the page URL, recorded as the "src" link
# @return [Array<Hash>] hashes with :type and :content
def fetch_link(doc, url)
  source = { type: "src", content: url }
  preview = doc.at_css("p.btn-preview a")
  return [source] unless preview

  [source, { type: "obp", content: preview[:href] }]
end
|
392
|
+
|
393
|
+
# Build the copyright entry for a document.
#
# The owner abbreviation is the code text before its first whitespace (name
# and url are only known for "IEC"). The year is the four digits following
# a ":" in the code; when the code has none, the first four-digit run in
# the page's releaseDate span is used.
#
# @param code [String] document code, e.g. "IEC 61360-4:2005"
# @param doc [Nokogiri::HTML::Document]
# @return [Hash] { owner: { name:, abbreviation:, url: }, from: }
def fetch_copyright(code, doc)
  abbreviation = code[/.*?(?=\s)/].to_s
  name, url = ["International Electrotechnical Commission", "www.iec.ch"] if abbreviation == "IEC"

  from = code[/(?<=:)\d{4}/].to_s
  # Only consult the page when the code itself carries no year.
  from = doc.xpath("//span[@itemprop='releaseDate']").text[/\d{4}/].to_s if from.empty?

  { owner: { name: name, abbreviation: abbreviation, url: url }, from: from }
end
|
410
|
+
end
|
411
|
+
end
|
412
|
+
# rubocop:enable Metrics/ModuleLength
|
413
|
+
end
|