relaton-iec 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,156 @@
1
+ # frozen_string_literal: true
2
+
3
+ # require 'isobib/iso_bibliographic_item'
4
+ require "relaton_iec/scrapper"
5
+ require "relaton_iec/hit_collection"
6
+ require "date"
7
+
8
+ module RelatonIec
9
+ # Class methods for searching IEC standards.
10
+ class IecBibliography
11
+ class << self
12
+ # @param text [String]
13
+ # @return [RelatonIec::HitCollection]
14
# Build a hit collection for the given reference text.
#
# @param text [String] reference text handed to HitCollection
# @param year [String, nil] optional year handed to HitCollection
# @return [HitCollection, Array] the collection, or an empty array when
#   a SocketError or OpenURI::HTTPError was raised while building it
def search(text, year = nil)
  begin
    hits = HitCollection.new(text, year)
  rescue SocketError, OpenURI::HTTPError
    # Network problems are reported on stderr and degrade to "no hits".
    warn "Could not access http://www.iec.ch"
    hits = []
  end
  hits
end
20
+
21
+ # @param text [String]
22
+ # @return [Array<IsoBibliographicItem>]
23
+ # def search_and_fetch(text, year = nil)
24
+ # Scrapper.get(text, year)
25
+ # end
26
+
27
+ # @param code [String] the ISO standard Code to look up (e.g. "ISO 9000")
28
+ # @param year [String] the year the standard was published (optional)
29
+ # @param opts [Hash] options; restricted to :all_parts if all-parts reference is required
30
+ # @return [String] Relaton XML serialisation of reference
31
# Look up a single reference.
#
# When no year is given and the code has the form "CODE:YEAR", the two
# parts are split apart. The code "IEV" (any case) short-circuits to #iev.
#
# @param code [String] document code, optionally suffixed with ":YEAR"
# @param year [String, nil] publication year
# @param opts [Hash] recognised keys: :all_parts, :keep_year
# @return [Object, nil] whatever iecbib_get1 returns, or nil on no match
def get(code, year = nil, opts = {})
  if year.nil?
    split = /^(?<code1>[^:]+):(?<year1>[^:]+)$/.match(code)
    code, year = split[:code1], split[:year1] if split
  end

  return iev if code.casecmp("IEV").zero?

  # An all-parts request is resolved through part 1 of the document.
  lookup = opts[:all_parts] ? code + "-1" : code
  item = iecbib_get1(lookup, year, opts)
  return nil if item.nil?

  item.to_most_recent_reference unless year || opts[:keep_year]
  item.to_all_parts if opts[:all_parts]
  item
end
50
+
51
+ private
52
+
53
# Report on stderr that no matching document was found.
#
# @param code [String] the code that was searched for
# @param year [String, nil] the requested year, if any
# @param missed_years [Array] years that were found but did not match
# @return [nil]
def fetch_ref_err(code, year, missed_years)
  ref = year ? "#{code}:#{year}" : code
  notes = []
  notes << "WARNING: no match found online for #{ref}. "\
           "The code must be exactly like it is on the standards website."
  unless missed_years.empty?
    notes << "(There was no match for #{year}, though there were matches "\
             "found for #{missed_years.join(', ')}.)"
  end
  # A digit-hyphen-digit sequence means the caller asked for a specific part.
  notes << if /\d-\d/ =~ code
             "The provided document part may not exist, or the document "\
             "may no longer be published in parts."
           else
             "If you wanted to cite all document parts for the reference, "\
             "use \"#{code} (all parts)\".\nIf the document is not a standard, "\
             "use its document type abbreviation (TS, TR, PAS, Guide)."
           end
  warn(*notes)
  nil
end
69
+
70
# Fetch every hit of a batch through a RelatonBib::WorkersPool and return
# the fetched results in the order of the input batch.
#
# @param s [Array] hits responding to #fetch
# @param n [Integer] size handed to the workers pool
# @return [Array] fetched results, ordered like +s+
def fetch_pages(s, n)
  pool = RelatonBib::WorkersPool.new n
  pool.worker { |job| { i: job[:i], hit: job[:hit].fetch } }
  s.each_with_index { |hit, position| pool << { i: position, hit: hit } }
  pool.end
  # Restore input order using the position recorded with each job.
  ordered = pool.result.sort_by { |entry| entry[:i] }
  ordered.map { |entry| entry[:hit] }
end
77
+
78
# Search for +code+ and keep only hits whose identifier prefix equals it.
#
# Hits without a code are dropped, as are hits whose code continues with
# ":<digits>/" after the identifier.
#
# @param code [String] the document code to search for
# @return [Array] the hits from #search that passed the filter
def isobib_search_filter(code)
  docidrx = %r{^(ISO|IEC)[^0-9]*\s[0-9-]+}
  corrigrx = %r{^(ISO|IEC)[^0-9]*\s[0-9-]+:[0-9]+/}
  warn "fetching #{code}..."
  search(code).select do |candidate|
    hit_code = candidate.hit[:code]
    next false unless hit_code

    hit_code.match(docidrx).to_s == code && corrigrx !~ hit_code
  end
end
89
+
90
# Build the fixed bibliographic item for the International
# Electrotechnical Vocabulary from an inline XML template.
#
# @param code [String] identifier placed before ":2011" in docidentifier
# @return [Object] the result of RelatonIsoBib::XMLParser.from_xml
def iev(code = "IEC 60050")
  fetched_on = Date.today
  xml = <<~"END"
    <bibitem>
    <fetched>#{fetched_on}</fetched>
    <title format="text/plain" language="en" script="Latn">International Electrotechnical Vocabulary</title>
    <link type="src">http://www.electropedia.org</link>
    <docidentifier>#{code}:2011</docidentifier>
    <date type="published"><on>2011</on></date>
    <contributor>
    <role type="publisher"/>
    <organization>
    <name>International Electrotechnical Commission</name>
    <abbreviation>IEC</abbreviation>
    <uri>www.iec.ch</uri>
    </organization>
    </contributor>
    <language>en</language> <language>fr</language>
    <script>Latn</script>
    <status> <stage>60</stage> </status>
    <copyright>
    <from>2018</from>
    <owner>
    <organization>
    <name>International Electrotechnical Commission</name>
    <abbreviation>IEC</abbreviation>
    <uri>www.iec.ch</uri>
    </organization>
    </owner>
    </copyright>
    </bibitem>
  END
  RelatonIsoBib::XMLParser.from_xml(xml)
end
122
+
123
+ # Sort through the results from Isobib, fetching them three at a time,
124
+ # and return the first result that matches the code,
125
+ # matches the year (if provided), and which has a title (amendments do not).
126
+ # Only expects the first page of results to be populated.
127
+ # Does not match corrigenda etc (e.g. ISO 3166-1:2006/Cor 1:2007)
128
+ # If no match, returns any years which caused mismatch, for error reporting
129
# Fetch the hits three at a time and return the first acceptable one.
#
# Without a year the first fetched item wins. With a year, the first item
# that has a "published" date in that year wins; the years of all other
# "published" dates seen are collected for error reporting.
#
# @param result [Enumerable] hits to fetch
# @param year [String, nil] required publication year
# @return [Hash] +{ ret: item }+ on success, +{ years: [...] }+ otherwise
def isobib_results_filter(result, year)
  missed_years = []
  result.each_slice(3) do |batch| # ISO website only allows 3 connections
    fetch_pages(batch, 3).each do |item|
      return { ret: item } unless year

      item.dates.each do |date|
        next unless date.type == "published"
        return { ret: item } if year.to_i == date.on.year

        missed_years << date.on.year
      end
    end
  end
  { years: missed_years }
end
144
+
145
# Resolve one code/year pair to a fetched item.
#
# @param code [String] document code; "IEV" (any case) returns #iev
# @param year [String, nil] required publication year
# @param _opts [Hash] unused
# @return [Object, nil] the matching item, or nil after reporting the miss
def iecbib_get1(code, year, _opts)
  return iev if code.casecmp("IEV").zero?

  hits = isobib_search_filter(code)
  return unless hits

  outcome = isobib_results_filter(hits, year)
  outcome[:ret] || fetch_ref_err(code, year, outcome[:years])
end
154
+ end
155
+ end
156
+ end
@@ -0,0 +1,413 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "relaton_iso_bib"
4
+ require "relaton_iec/hit"
5
+ require "nokogiri"
6
+ require "net/http"
7
+
8
+ # Capybara.request_driver :poltergeist do |app|
9
+ # Capybara::Poltergeist::Driver.new app, js_errors: false
10
+ # end
11
+ # Capybara.default_driver = :poltergeist
12
+
13
+ module RelatonIec
14
+ # Scrapper.
15
+ # rubocop:disable Metrics/ModuleLength
16
+ module Scrapper
17
# Base URL of the IEC webstore; prepended to relative redirect targets
# and to the status/relations request URL.
DOMAIN = "https://webstore.iec.ch"

# Document type abbreviation => document type name.
TYPES = {
  "ISO" => "international-standard",
  "TS" => "technicalSpecification",
  "TR" => "technicalReport",
  "PAS" => "publiclyAvailableSpecification",
  "AWI" => "approvedWorkItem", # was misspelled "appruvedWorkItem"
  "CD" => "committeeDraft",
  "FDIS" => "finalDraftInternationalStandard",
  "NP" => "newProposal",
  "DIS" => "draftInternationalStandard",
  "WD" => "workingDraft",
  "R" => "recommendation",
  "Guide" => "guide",
}.freeze
33
+
34
+ class << self
35
+ # @param text [String]
36
+ # @return [Array<Hash>]
37
+ # def get(text)
38
+ # iso_workers = WorkersPool.new 4
39
+ # iso_workers.worker { |hit| iso_worker(hit, iso_workers) }
40
+ # algolia_workers = start_algolia_search(text, iso_workers)
41
+ # iso_docs = iso_workers.result
42
+ # algolia_workers.end
43
+ # algolia_workers.result
44
+ # iso_docs
45
+ # end
46
+
47
+ # Parse page.
48
+ # @param hit [Hash]
49
+ # @return [Hash]
50
+ # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
51
# Download the page for a hit and assemble a bibliographic item from it.
#
# @param hit_data [Hash] reads the :url, :code and :title entries
# @return [RelatonIsoBib::IsoBibliographicItem]
def parse_page(hit_data)
  page_url = hit_data[:url]
  doc_code = hit_data[:code]
  doc = get_page page_url

  # Edition is read from the table cell next to the "Edition" header.
  edition_cell = doc.at("//th[contains(., 'Edition')]/following-sibling::td/span")
  edition = edition_cell.text

  # Status and relations come from a separate XML request.
  status, relations = fetch_status_relations page_url

  identifier = RelatonBib::DocumentIdentifier.new(id: doc_code, type: "IEC")
  RelatonIsoBib::IsoBibliographicItem.new(
    docid: [identifier],
    structuredidentifier: fetch_structuredidentifier(doc),
    edition: edition,
    language: ["en"],
    script: ["Latn"],
    titles: fetch_titles(hit_data),
    type: fetch_type(doc),
    docstatus: status,
    ics: fetch_ics(doc),
    dates: fetch_dates(doc),
    contributors: fetch_contributors(doc_code),
    editorialgroup: fetch_workgroup(doc),
    abstract: fetch_abstract(doc),
    copyright: fetch_copyright(doc_code, doc),
    link: fetch_link(doc, page_url),
    relations: relations,
  )
end
78
+ # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
79
+
80
+ private
81
+
82
+ # Start search workers.
83
+ # @param text[String]
84
+ # @param iec_workers [Isobib::WorkersPool]
85
+ # @return [Isobib::WorkersPool]
86
+ # def start_algolia_search(text, iec_workers)
87
+ # index = Algolia::Index.new 'all_en'
88
+ # workers = WorkersPool.new
89
+ # workers.worker do |page|
90
+ # algolia_worker(index, text, page, workers, iec_workers)
91
+ # end
92
+
93
+ # # Add first page so search worker will start.
94
+ # workers << 0
95
+ # end
96
+
97
+ # Fetch ISO documents.
98
+ # @param hit [Hash]
99
+ # @param isiso_workers [Isobib::WorkersPool]
100
+ # def iso_worker(hit, iso_workers)
101
+ # print "Parse #{iso_workers.size} of #{iso_workers.nb_hits} \r"
102
+ # parse_page hit
103
+ # end
104
+
105
+ # Fetch hits from algolia search service.
106
+ # @param index[Algolia::Index]
107
+ # @param text [String]
108
+ # @param page [Integer]
109
+ # @param algolia_workers [Isobib::WorkersPool]
110
+ # @param isiso_workers [Isobib::WorkersPool]
111
+ # def algolia_worker(index, text, page, algolia_workers, iso_workers)
112
+ # res = index.search text, facetFilters: ['category:standard'], page: page
113
+ # next_page = res['page'] + 1
114
+ # algolia_workers << next_page if next_page < res['nbPages']
115
+ # res['hits'].each do |hit|
116
+ # iso_workers.nb_hits = res['nbHits']
117
+ # iso_workers << hit
118
+ # end
119
+ # iso_workers.end unless next_page < res['nbPages']
120
+ # end
121
+
122
+ # Fetch abstracts.
123
+ # @param doc [Nokogiri::HTML::Document]
124
+ # @return [Array<Hash>]
125
# Extract the abstract from the element marked itemprop="description".
#
# @param doc [#at] parsed page
# @return [Array<Hash>] one English plain-text abstract, or an empty
#   array when the page has no description element (previously this
#   case raised NoMethodError on nil)
def fetch_abstract(doc)
  description = doc.at('//div[@itemprop="description"]')
  return [] unless description

  [{
    content: description.text,
    language: "en",
    script: "Latn",
    format: "text/plain",
  }]
end
134
+
135
+ # Get langs.
136
+ # @param doc [Nokogiri::HTML::Document]
137
+ # @return [Array<Hash>]
138
+ # def langs(doc)
139
+ # lgs = [{ lang: 'en' }]
140
+ # doc.css('ul#lang-switcher ul li a').each do |lang_link|
141
+ # lang_path = lang_link.attr('href')
142
+ # lang = lang_path.match(%r{^\/(fr)\/})
143
+ # lgs << { lang: lang[1], path: lang_path } if lang
144
+ # end
145
+ # lgs
146
+ # end
147
+
148
+ # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
149
+
150
+ # Get page.
151
+ # @param path [String] page's path
152
+ # @return [Nokogiri::HTML::Document]
153
# Download a page and parse it as HTML.
#
# Redirect responses that carry a Location header are followed, up to
# five hops. A relative Location is resolved against DOMAIN; an absolute
# http(s) Location is used as is. (Previously only a single 301 was
# followed and the Location was always assumed to be a relative path.)
#
# @param url [String] absolute URL of the page
# @return [Nokogiri::HTML::Document]
def get_page(url)
  resp = Net::HTTP.get_response(URI(url))
  5.times do
    location = resp["location"]
    break unless resp.is_a?(Net::HTTPRedirection) && location

    target = location.match?(%r{\Ahttps?://}i) ? location : DOMAIN + location
    resp = Net::HTTP.get_response(URI(target))
  end
  Nokogiri::HTML(resp.body)
end
169
+ # rubocop:enable Metrics/AbcSize
170
+
171
+ # Fetch structuredidentifier.
172
+ # @param doc [Nokogiri::HTML::Document]
173
+ # @return [RelatonIsoBib::StructuredIdentifier]
174
# Build the structured identifier from the element marked
# itemprop='productID'.
#
# The project, part and subpart numbers are taken from the first
# whitespace-preceded "N", "N-N" or "N-N-N" sequence in the element text.
# When the element is missing, or its text contains no such sequence
# (previously a NoMethodError on nil), a placeholder identifier is
# returned.
#
# @param doc [#at] parsed page
# @return [RelatonIsoBib::StructuredIdentifier]
def fetch_structuredidentifier(doc)
  item_ref = doc.at("//span[@itemprop='productID']")
  m = item_ref&.text&.match(
    /(?<=\s)(?<project>\d+)-?(?<part>(?<=-)\d+|)-?(?<subpart>(?<=-)\d+|)/,
  )
  unless m
    return RelatonIsoBib::StructuredIdentifier.new(
      project_number: "?", part_number: "", prefix: nil, id: "?",
    )
  end

  RelatonIsoBib::StructuredIdentifier.new(
    project_number: m[:project],
    part_number: m[:part],
    subpart_number: m[:subpart],
    prefix: nil,
    type: "IEC",
    id: item_ref.text,
  )
end
194
+
195
+ # Fetch status.
196
+ # @param doc [Nokogiri::HTML::Document]
197
+ # @param status [String]
198
+ # @return [Hash]
199
# Determine the document stage/substage from the status XML.
#
# When a ROW with STATUS "PREPARING" exists, its STAGE text is looked up
# in statuses.yml and the "stage" entry ("NN.NN") is split into stage and
# substage. Otherwise the document is treated as stage 60, substage 60.
#
# statuses.yml is resolved relative to this source file instead of the
# process working directory, so the lookup no longer depends on where the
# program was started from.
# NOTE(review): assumes statuses.yml sits next to this file (the old path
# was "lib/relaton_iec/statuses.yml") — confirm against the gem layout.
# NOTE(review): this file does not require "yaml" itself; YAML is
# presumably loaded by a dependency — confirm.
#
# @param doc [#at] parsed status XML
# @return [RelatonBib::DocumentStatus]
def fetch_status(doc)
  wip = doc.at('//ROW[STATUS[.="PREPARING"]]')
  if wip
    statuses = YAML.load_file File.join(__dir__, "statuses.yml")
    s = wip.at("STAGE").text
    stage, substage = statuses[s]["stage"].split "."
  else
    stage = "60"
    substage = "60"
  end
  RelatonBib::DocumentStatus.new(stage: stage, substage: substage)
end
213
+ # rubocop:enable Metrics/MethodLength
214
+
215
+ # Fetch workgroup.
216
+ # @param doc [Nokogiri::HTML::Document]
217
+ # @return [Hash]
218
# Describe the editorial group using the link text found next to the
# "TC" table header.
#
# @param doc [#at] parsed page
# @return [Hash] organisation details plus one technical committee whose
#   number is the first run of digits in the link text (nil if none)
def fetch_workgroup(doc)
  committee = doc.at('//th/abbr[.="TC"]/../following-sibling::td/a').text
  digits = committee[/\d+/]
  tc = {
    name: committee,
    type: "technicalCommittee",
    number: digits && digits.to_i,
  }
  {
    name: "International Electrotechnical Commission",
    abbreviation: "IEC",
    url: "webstore.iec.ch",
    technical_committee: [tc],
  }
end
231
+
232
+ # Fetch relations.
233
+ # @param doc [Nokogiri::HTML::Document]
234
+ # @return [Array<Hash>]
235
+ # rubocop:disable Metrics/MethodLength
236
# Turn every ROW whose STATUS is neither "PREPARING" nor "PUBLISHED"
# into a relation entry.
#
# The lower-cased STATUS decides the relation type: "revised" and
# "replaced" become "updates", "withdrawn" becomes "obsoletes", anything
# else is used unchanged.
#
# @param doc [#xpath] parsed status XML
# @return [Array<Hash>] hashes with :type and :bibitem
def fetch_relations(doc)
  type_for = {
    "revised" => "updates",
    "replaced" => "updates",
    "withdrawn" => "obsoletes",
  }
  rows = doc.xpath('//ROW[STATUS[.!="PREPARING"]][STATUS[.!="PUBLISHED"]]')
  rows.map do |row|
    status = row.at("STATUS").text.downcase
    relation_type = type_for.fetch(status, status)
    reference = RelatonBib::FormattedRef.new(
      content: row.at("FULL_NAME").text, format: "text/plain",
    )
    item = RelatonIsoBib::IsoBibliographicItem.new(formattedref: reference)
    { type: relation_type, bibitem: item }
  end
end
255
+
256
# Request the status XML for a publication and derive its status and
# relations from it.
#
# The publication id is the run of digits at the end of +url+.
#
# @param url [String] publication page URL
# @return [Array] two elements: the result of fetch_status and the
#   result of fetch_relations
def fetch_status_relations(url)
  pubid = url[/\d+$/].to_s
  endpoint = "#{DOMAIN}/webstore/webstore.nsf/AjaxRequestXML?"\
             "Openagent&url=http://www.iec.ch/dyn/www/f?"\
             "p=103:390:::::P390_PUBLICATION_ID:#{pubid}"
  response = Net::HTTP.get_response(URI(endpoint))
  xml = Nokogiri::XML(response.body)
  status = fetch_status(xml)
  [status, fetch_relations(xml)]
end
283
+ # rubocop:enable Metrics/MethodLength
284
+
285
+ # Fetch type.
286
+ # @param doc [Nokogiri::HTML::Document]
287
+ # @return [String]
288
# Read the document type from the cell next to the "Publication type"
# header, lower-cased and with spaces replaced by hyphens.
#
# @param doc [#at] parsed page
# @return [String]
def fetch_type(doc)
  type_cell = doc.at('//th[contains(., "Publication type")]/following-sibling::td/span')
  label = type_cell.text
  label.downcase.tr(" ", "-")
end
305
+
306
+ # Fetch titles.
307
+ # @param hit_data [Hash]
308
+ # @return [Array<Hash>]
309
# Split the hit title on " - " into intro, main and part titles.
#
# One piece is the main title. Two pieces are main + part when the second
# starts with "Part N:" / "Partie N:", otherwise intro + main. Three
# pieces map to intro, main, part; any further pieces are joined into the
# part title with " -- ".
#
# @param hit_data [Hash] reads the :title entry
# @return [Array<Hash>] a single English title hash
def fetch_titles(hit_data)
  pieces = hit_data[:title].split " - "
  intro = nil
  main = ""
  part = nil

  if pieces.size == 1
    main = pieces[0]
  elsif pieces.size == 2
    if /^(Part|Partie) \d+:/ =~ pieces[1]
      main, part = pieces
    else
      intro, main = pieces
    end
  elsif pieces.size == 3
    intro, main, part = pieces
  elsif pieces.size > 3
    intro = pieces[0]
    main = pieces[1]
    part = pieces[2..-1]&.join(" -- ")
  end

  [{
    title_intro: intro,
    title_main: main,
    title_part: part,
    language: "en",
    script: "Latn"
  }]
end
335
+
336
+ # Return ISO script code.
337
+ # @param lang [String]
338
+ # @return [String]
339
+ # def script(lang)
340
+ # case lang
341
+ # when 'en', 'fr' then 'Latn'
342
+ # end
343
+ # end
344
+
345
+ # Fetch dates
346
+ # @param doc [Nokogiri::HTML::Document]
347
+ # @return [Array<Hash>]
348
# Read the publication date from the element marked
# itemprop='releaseDate'.
#
# @param doc [#at] parsed page
# @return [Array<Hash>] one "published" entry, or an empty array when the
#   element is missing (previously a NoMethodError on nil) or its text is
#   empty
def fetch_dates(doc)
  publish_date = doc.at("//span[@itemprop='releaseDate']")&.text.to_s
  return [] if publish_date.empty?

  [{ type: "published", on: publish_date }]
end
356
+
357
# Build one publisher entry per organisation abbreviation found in the
# part of +code+ before the first whitespace, split on "/".
#
# Name and URL are filled in for "ISO" and "IEC" and left nil for any
# other abbreviation.
#
# @param code [String] document code, e.g. "ISO/IEC 27001"
# @return [Array<Hash>] hashes with :entity and :roles
def fetch_contributors(code)
  known = {
    "ISO" => ["International Organization for Standardization", "www.iso.org"],
    "IEC" => ["International Electrotechnical Commission", "www.iec.ch"],
  }
  prefix = code.sub(/\s.*/, "")
  prefix.split("/").map do |abbrev|
    org_name, org_url = known[abbrev]
    {
      entity: { name: org_name, url: org_url, abbreviation: abbrev },
      roles: ["publisher"],
    }
  end
end
371
+
372
+ # Fetch ICS.
373
+ # @param doc [Nokogiri::HTML::Document]
374
+ # @return [Array<Hash>]
375
# Collect ICS codes from the links next to the "ICS" table header.
#
# For each link, the first run of digits and dots in its text is split on
# "." into field, group and subgroup; absent components are nil.
#
# @param doc [#xpath] parsed page
# @return [Array<Hash>] hashes with :field, :group and :subgroup
def fetch_ics(doc)
  links = doc.xpath('//th[contains(text(), "ICS")]/following-sibling::td/a')
  links.map do |link|
    field, group, subgroup = link.text[/[\d\.]+/].to_s.split(".")
    { field: field, group: group, subgroup: subgroup }
  end
end
381
+
382
+ # Fetch links.
383
+ # @param doc [Nokogiri::HTML::Document]
384
+ # @param url [String]
385
+ # @return [Array<Hash>]
386
# List the links for a document: always the source URL, plus an "obp"
# link when the page has an anchor matching "p.btn-preview a".
#
# @param doc [#at_css] parsed page
# @param url [String] source URL of the page
# @return [Array<Hash>] hashes with :type and :content
def fetch_link(doc, url)
  source = { type: "src", content: url }
  preview = doc.at_css("p.btn-preview a")
  return [source] unless preview

  [source, { type: "obp", content: preview[:href] }]
end
392
+
393
+ # Fetch copyright.
394
+ # @param title [String]
395
+ # @return [Hash]
396
# Build the copyright hash for a document.
#
# The owner abbreviation is the text of +code+ before its first
# whitespace; name and URL are only filled in for "IEC". The year is the
# four digits following ":" in +code+, falling back to the first four
# digits of the releaseDate element text.
#
# @param code [String] document code, e.g. "IEC 60050-101:1998"
# @param doc [#xpath] parsed page, consulted only when +code+ has no year
# @return [Hash] with :owner and :from
def fetch_copyright(code, doc)
  abbreviation = code[/.*?(?=\s)/].to_s
  name, url = ["International Electrotechnical Commission", "www.iec.ch"] if abbreviation == "IEC"

  from = code[/(?<=:)\d{4}/].to_s
  if from.empty?
    release_text = doc.xpath("//span[@itemprop='releaseDate']").text
    from = release_text[/\d{4}/].to_s
  end

  owner = { name: name, abbreviation: abbreviation, url: url }
  { owner: owner, from: from }
end
410
+ end
411
+ end
412
+ # rubocop:enable Metrics/ModuleLength
413
+ end