relaton-iec 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,156 @@
1
+ # frozen_string_literal: true
2
+
3
+ # require 'isobib/iso_bibliographic_item'
4
+ require "relaton_iec/scrapper"
5
+ require "relaton_iec/hit_collection"
6
+ require "date"
7
+
8
module RelatonIec
  # Class methods for searching and fetching IEC standards.
  class IecBibliography
    # A reference that carries its year after a single colon,
    # e.g. "IEC 60050:2011". Anchored to the whole string.
    CODE_WITH_YEAR = /\A(?<code>[^:]+):(?<year>[^:]+)\z/.freeze

    # Leading document identifier of a hit code, e.g. "IEC 61058-2-4".
    DOCID_RX = %r{^(ISO|IEC)[^0-9]*\s[0-9-]+}.freeze

    # Hit codes of corrigenda/amendments, e.g. "IEC 123:2010/COR1:2011".
    CORRIG_RX = %r{^(ISO|IEC)[^0-9]*\s[0-9-]+:[0-9]+/}.freeze

    class << self
      # Search the IEC webstore.
      # @param text [String] reference to search for
      # @param year [String, nil] publication year passed to the collection
      # @return [RelatonIec::HitCollection, Array] an empty Array when the
      #   site cannot be reached (a warning is printed in that case)
      def search(text, year = nil)
        HitCollection.new text, year
      rescue SocketError, OpenURI::HTTPError
        warn "Could not access http://www.iec.ch"
        []
      end

      # @param text [String]
      # @return [Array<IsoBibliographicItem>]
      # def search_and_fetch(text, year = nil)
      #   Scrapper.get(text, year)
      # end

      # Look up a single standard.
      # @param code [String] the IEC standard code to look up
      #   (e.g. "IEC 60050"); may carry the year as "CODE:YEAR"
      # @param year [String] the year the standard was published (optional)
      # @param opts [Hash] options; :all_parts if an all-parts reference is
      #   required, :keep_year to keep the year of an undated lookup
      # @return [Object, nil] the fetched bibliographic item (whatever the
      #   hit's #fetch returns), or nil when nothing matched
      def get(code, year = nil, opts = {})
        # Split "CODE:YEAR" only when the caller gave no explicit year.
        if year.nil? && (dated = CODE_WITH_YEAR.match(code))
          code = dated[:code]
          year = dated[:year]
        end

        return iev if code.casecmp("IEV").zero?

        code += "-1" if opts[:all_parts]
        ret = iecbib_get1(code, year, opts)
        return nil if ret.nil?

        ret.to_most_recent_reference unless year || opts[:keep_year]
        ret.to_all_parts if opts[:all_parts]
        ret
      end

      private

      # Print a diagnostic for a reference that could not be resolved.
      # @param code [String]
      # @param year [String, nil]
      # @param missed_years [Array<Integer>] years of hits that matched the
      #   code but not the requested year
      # @return [nil]
      def fetch_ref_err(code, year, missed_years)
        id = year ? "#{code}:#{year}" : code
        warn "WARNING: no match found online for #{id}. "\
          "The code must be exactly like it is on the standards website."
        unless missed_years.empty?
          warn "(There was no match for #{year}, though there were matches "\
            "found for #{missed_years.join(', ')}.)"
        end
        if /\d-\d/ =~ code
          warn "The provided document part may not exist, or the document "\
            "may no longer be published in parts."
        else
          warn "If you wanted to cite all document parts for the reference, "\
            "use \"#{code} (all parts)\".\nIf the document is not a standard, "\
            "use its document type abbreviation (TS, TR, PAS, Guide)."
        end
        nil
      end

      # Fetch the given hits through a pool of n workers.
      # @param s [Array] hits responding to #fetch
      # @param n [Integer] pool size
      # @return [Array] fetched items, in the order of the given hits
      def fetch_pages(s, n)
        workers = RelatonBib::WorkersPool.new n
        workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
        s.each_with_index { |hit, i| workers << { i: i, hit: hit } }
        workers.end
        # Restore the input order using the index attached to each job.
        workers.result.sort_by { |x| x[:i] }.map { |x| x[:hit] }
      end

      # Search for the code and keep only the hits whose identifier equals
      # it exactly, dropping corrigenda and amendments.
      # @param code [String]
      # @return [Array]
      def isobib_search_filter(code)
        warn "fetching #{code}..."
        search(code).select do |i|
          hit_code = i.hit[:code]
          hit_code &&
            hit_code.match(DOCID_RX).to_s == code &&
            CORRIG_RX !~ hit_code
        end
      end

      # Bibliographic item for the International Electrotechnical Vocabulary.
      # @param code [String]
      # @return [Object] result of RelatonIsoBib::XMLParser.from_xml
      def iev(code = "IEC 60050")
        RelatonIsoBib::XMLParser.from_xml(<<~"END")
          <bibitem>
            <fetched>#{Date.today}</fetched>
            <title format="text/plain" language="en" script="Latn">International Electrotechnical Vocabulary</title>
            <link type="src">http://www.electropedia.org</link>
            <docidentifier>#{code}:2011</docidentifier>
            <date type="published"><on>2011</on></date>
            <contributor>
              <role type="publisher"/>
              <organization>
                <name>International Electrotechnical Commission</name>
                <abbreviation>IEC</abbreviation>
                <uri>www.iec.ch</uri>
              </organization>
            </contributor>
            <language>en</language> <language>fr</language>
            <script>Latn</script>
            <status> <stage>60</stage> </status>
            <copyright>
              <from>2018</from>
              <owner>
                <organization>
                  <name>International Electrotechnical Commission</name>
                  <abbreviation>IEC</abbreviation>
                  <uri>www.iec.ch</uri>
                </organization>
              </owner>
            </copyright>
          </bibitem>
        END
      end

      # Sort through the search results, fetching them three at a time,
      # and return the first result that matches the year (if provided).
      # Without a year the first fetched result is returned.
      # If no match, returns the distinct years which caused the mismatch,
      # for error reporting.
      # @param result [Array] filtered hits
      # @param year [String, nil]
      # @return [Hash] { ret: item } on success, { years: [...] } otherwise
      def isobib_results_filter(result, year)
        missed_years = []
        result.each_slice(3) do |s| # fetch at most 3 pages concurrently
          fetch_pages(s, 3).each do |r|
            return { ret: r } unless year

            r.dates.select { |d| d.type == "published" }.each do |d|
              return { ret: r } if year.to_i == d.on.year

              missed_years << d.on.year
            end
          end
        end
        # Several hits may share a year; report each year once.
        { years: missed_years.uniq }
      end

      # Resolve one code (and optional year) to a bibliographic item.
      # @param code [String]
      # @param year [String, nil]
      # @param _opts [Hash] unused
      # @return [Object, nil] nil (after printing a diagnostic) on no match
      def iecbib_get1(code, year, _opts)
        return iev if code.casecmp("IEV").zero?

        ret = isobib_results_filter(isobib_search_filter(code), year)
        return ret[:ret] if ret[:ret]

        fetch_ref_err(code, year, ret[:years])
      end
    end
  end
end
@@ -0,0 +1,413 @@
1
+ # frozen_string_literal: true
2
+
3
require "relaton_iso_bib"
require "relaton_iec/hit"
require "nokogiri"
require "net/http"
require "yaml"
7
+
8
+ # Capybara.request_driver :poltergeist do |app|
9
+ # Capybara::Poltergeist::Driver.new app, js_errors: false
10
+ # end
11
+ # Capybara.default_driver = :poltergeist
12
+
13
+ module RelatonIec
14
+ # Scrapper.
15
+ # rubocop:disable Metrics/ModuleLength
16
+ module Scrapper
17
+ DOMAIN = "https://webstore.iec.ch"
18
+
19
+ TYPES = {
20
+ "ISO" => "international-standard",
21
+ "TS" => "technicalSpecification",
22
+ "TR" => "technicalReport",
23
+ "PAS" => "publiclyAvailableSpecification",
24
+ "AWI" => "appruvedWorkItem",
25
+ "CD" => "committeeDraft",
26
+ "FDIS" => "finalDraftInternationalStandard",
27
+ "NP" => "newProposal",
28
+ "DIS" => "draftInternationalStandard",
29
+ "WD" => "workingDraft",
30
+ "R" => "recommendation",
31
+ "Guide" => "guide",
32
+ }.freeze
33
+
34
+ class << self
35
+ # @param text [String]
36
+ # @return [Array<Hash>]
37
+ # def get(text)
38
+ # iso_workers = WorkersPool.new 4
39
+ # iso_workers.worker { |hit| iso_worker(hit, iso_workers) }
40
+ # algolia_workers = start_algolia_search(text, iso_workers)
41
+ # iso_docs = iso_workers.result
42
+ # algolia_workers.end
43
+ # algolia_workers.result
44
+ # iso_docs
45
+ # end
46
+
47
+ # Parse page.
48
+ # @param hit [Hash]
49
+ # @return [Hash]
50
+ # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
51
+ def parse_page(hit_data)
52
+ doc = get_page hit_data[:url]
53
+
54
+ # Fetch edition.
55
+ edition = doc.at("//th[contains(., 'Edition')]/following-sibling::td/span").text
56
+
57
+ status, relations = fetch_status_relations hit_data[:url]
58
+
59
+ RelatonIsoBib::IsoBibliographicItem.new(
60
+ docid: [RelatonBib::DocumentIdentifier.new(id: hit_data[:code], type: "IEC")],
61
+ structuredidentifier: fetch_structuredidentifier(doc),
62
+ edition: edition,
63
+ language: ["en"],
64
+ script: ["Latn"],
65
+ titles: fetch_titles(hit_data),
66
+ type: fetch_type(doc),
67
+ docstatus: status,
68
+ ics: fetch_ics(doc),
69
+ dates: fetch_dates(doc),
70
+ contributors: fetch_contributors(hit_data[:code]),
71
+ editorialgroup: fetch_workgroup(doc),
72
+ abstract: fetch_abstract(doc),
73
+ copyright: fetch_copyright(hit_data[:code], doc),
74
+ link: fetch_link(doc, hit_data[:url]),
75
+ relations: relations,
76
+ )
77
+ end
78
+ # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
79
+
80
+ private
81
+
82
+ # Start search workers.
83
+ # @param text[String]
84
+ # @param iec_workers [Isobib::WorkersPool]
85
+ # @reaturn [Isobib::WorkersPool]
86
+ # def start_algolia_search(text, iec_workers)
87
+ # index = Algolia::Index.new 'all_en'
88
+ # workers = WorkersPool.new
89
+ # workers.worker do |page|
90
+ # algolia_worker(index, text, page, workers, iec_workers)
91
+ # end
92
+
93
+ # # Add first page so search worker will start.
94
+ # workers << 0
95
+ # end
96
+
97
+ # Fetch ISO documents.
98
+ # @param hit [Hash]
99
+ # @param isiso_workers [Isobib::WorkersPool]
100
+ # def iso_worker(hit, iso_workers)
101
+ # print "Parse #{iso_workers.size} of #{iso_workers.nb_hits} \r"
102
+ # parse_page hit
103
+ # end
104
+
105
+ # Fetch hits from algolia search service.
106
+ # @param index[Algolia::Index]
107
+ # @param text [String]
108
+ # @param page [Integer]
109
+ # @param algolia_workers [Isobib::WorkersPool]
110
+ # @param isiso_workers [Isobib::WorkersPool]
111
+ # def algolia_worker(index, text, page, algolia_workers, iso_workers)
112
+ # res = index.search text, facetFilters: ['category:standard'], page: page
113
+ # next_page = res['page'] + 1
114
+ # algolia_workers << next_page if next_page < res['nbPages']
115
+ # res['hits'].each do |hit|
116
+ # iso_workers.nb_hits = res['nbHits']
117
+ # iso_workers << hit
118
+ # end
119
+ # iso_workers.end unless next_page < res['nbPages']
120
+ # end
121
+
122
+ # Fetch abstracts.
123
+ # @param doc [Nokigiri::HTML::Document]
124
+ # @return [Array<Array>]
125
+ def fetch_abstract(doc)
126
+ abstract_content = doc.at('//div[@itemprop="description"]').text
127
+ [{
128
+ content: abstract_content,
129
+ language: "en",
130
+ script: "Latn",
131
+ format: "text/plain",
132
+ }]
133
+ end
134
+
135
+ # Get langs.
136
+ # @param doc [Nokogiri::HTML::Document]
137
+ # @return [Array<Hash>]
138
+ # def langs(doc)
139
+ # lgs = [{ lang: 'en' }]
140
+ # doc.css('ul#lang-switcher ul li a').each do |lang_link|
141
+ # lang_path = lang_link.attr('href')
142
+ # lang = lang_path.match(%r{^\/(fr)\/})
143
+ # lgs << { lang: lang[1], path: lang_path } if lang
144
+ # end
145
+ # lgs
146
+ # end
147
+
148
+ # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
149
+
150
+ # Get page.
151
+ # @param path [String] page's path
152
+ # @return [Array<Nokogiri::HTML::Document, String>]
153
+ def get_page(url)
154
+ uri = URI url
155
+ resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
156
+ if resp.code == "301"
157
+ path = resp["location"]
158
+ url = DOMAIN + path
159
+ uri = URI url
160
+ resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
161
+ end
162
+ # n = 0
163
+ # while resp.body !~ /<strong/ && n < 10
164
+ # resp = Net::HTTP.get_response(uri)#.encode("UTF-8")
165
+ # n += 1
166
+ # end
167
+ Nokogiri::HTML(resp.body)
168
+ end
169
+ # rubocop:enable Metrics/AbcSize
170
+
171
+ # Fetch structuredidentifier.
172
+ # @param doc [Nokogiri::HTML::Document]
173
+ # @return [RelatonIsoBib::StructuredIdentifier]
174
+ def fetch_structuredidentifier(doc)
175
+ item_ref = doc.at("//span[@itemprop='productID']")
176
+ unless item_ref
177
+ return RelatonIsoBib::StructuredIdentifier.new(
178
+ project_number: "?", part_number: "", prefix: nil, id: "?",
179
+ )
180
+ end
181
+
182
+ m = item_ref.text.match(
183
+ /(?<=\s)(?<project>\d+)-?(?<part>(?<=-)\d+|)-?(?<subpart>(?<=-)\d+|)/,
184
+ )
185
+ RelatonIsoBib::StructuredIdentifier.new(
186
+ project_number: m[:project],
187
+ part_number: m[:part],
188
+ subpart_number: m[:subpart],
189
+ prefix: nil,
190
+ type: "IEC",
191
+ id: item_ref.text,
192
+ )
193
+ end
194
+
195
+ # Fetch status.
196
+ # @param doc [Nokogiri::HTML::Document]
197
+ # @param status [String]
198
+ # @return [Hash]
199
+ def fetch_status(doc)
200
+ wip = doc.at('//ROW[STATUS[.="PREPARING"]]')
201
+ if wip
202
+ statuses = YAML.load_file "lib/relaton_iec/statuses.yml"
203
+ s = wip.at("STAGE").text
204
+ stage, substage = statuses[s]["stage"].split "."
205
+ # status = statuses[s]["status"]
206
+ else
207
+ # status = "Published"
208
+ stage = "60"
209
+ substage = "60"
210
+ end
211
+ RelatonBib::DocumentStatus.new(stage: stage, substage: substage)
212
+ end
213
+ # rubocop:enable Metrics/MethodLength
214
+
215
+ # Fetch workgroup.
216
+ # @param doc [Nokogiri::HTML::Document]
217
+ # @return [Hash]
218
+ def fetch_workgroup(doc)
219
+ wg = doc.at('//th/abbr[.="TC"]/../following-sibling::td/a').text
220
+ {
221
+ name: "International Electrotechnical Commission",
222
+ abbreviation: "IEC",
223
+ url: "webstore.iec.ch",
224
+ technical_committee: [{
225
+ name: wg,
226
+ type: "technicalCommittee",
227
+ number: wg.match(/\d+/)&.to_s&.to_i,
228
+ }],
229
+ }
230
+ end
231
+
232
+ # Fetch relations.
233
+ # @param doc [Nokogiri::HTML::Document]
234
+ # @return [Array<Hash>]
235
+ # rubocop:disable Metrics/MethodLength
236
+ def fetch_relations(doc)
237
+ doc.xpath('//ROW[STATUS[.!="PREPARING"]][STATUS[.!="PUBLISHED"]]').map do |r|
238
+ r_type = r.at("STATUS").text.downcase
239
+ type = case r_type
240
+ # when 'published' then 'obsoletes' # Valid
241
+ when "revised", "replaced" then "updates"
242
+ when "withdrawn" then "obsoletes"
243
+ else r_type
244
+ end
245
+ # url = DOMAIN + "/publication/" + r.at("PUB_ID").text
246
+ fref = RelatonBib::FormattedRef.new(
247
+ content: r.at("FULL_NAME").text, format: "text/plain",
248
+ )
249
+ bibitem = RelatonIsoBib::IsoBibliographicItem.new(
250
+ formattedref: fref,
251
+ )
252
+ { type: type, bibitem: bibitem }
253
+ end
254
+ end
255
+
256
+ def fetch_status_relations(url)
257
+ pubid = url.match(/\d+$/).to_s
258
+ uri = URI DOMAIN + "/webstore/webstore.nsf/AjaxRequestXML?"\
259
+ "Openagent&url=http://www.iec.ch/dyn/www/f?"\
260
+ "p=103:390:::::P390_PUBLICATION_ID:" + pubid
261
+ resp = Net::HTTP.get_response uri
262
+ doc = Nokogiri::XML resp.body
263
+ status = fetch_status doc
264
+ relations = fetch_relations doc
265
+ [status, relations]
266
+ # doc.css('ul.steps li').inject([]) do |a, r|
267
+ # r_type = r.css('strong').text
268
+ # type = case r_type
269
+ # when 'Previously', 'Will be replaced by' then 'obsoletes'
270
+ # when 'Corrigenda/Amendments', 'Revised by', 'Now confirmed'
271
+ # 'updates'
272
+ # else r_type
273
+ # end
274
+ # if ['Now', 'Now under review'].include? type
275
+ # a
276
+ # else
277
+ # a + r.css('a').map do |id|
278
+ # { type: type, identifier: id.text, url: id['href'] }
279
+ # end
280
+ # end
281
+ # end
282
+ end
283
+ # rubocop:enable Metrics/MethodLength
284
+
285
+ # Fetch type.
286
+ # @param doc [Nokogiri::HTML::Document]
287
+ # @return [String]
288
+ def fetch_type(doc)
289
+ doc.at('//th[contains(., "Publication type")]/following-sibling::td/span')
290
+ .text.downcase.tr " ", "-"
291
+ # type_match = title.match(%r{^(ISO|IWA|IEC)(?:(/IEC|/IEEE|/PRF|
292
+ # /NP)*\s|/)(TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))}x)
293
+ # #return "international-standard" if type_match.nil?
294
+ # if TYPES[type_match[2]]
295
+ # TYPES[type_match[2]]
296
+ # elsif type_match[1]
297
+ # elsif type_match[1] == 'ISO'
298
+ # 'international-standard'
299
+ # elsif type_match[1] == 'IWA'
300
+ # 'international-workshop-agreement'
301
+ # end
302
+ # # rescue => _e
303
+ # # puts 'Unknown document type: ' + title
304
+ end
305
+
306
+ # Fetch titles.
307
+ # @param hit_data [Hash]
308
+ # @return [Array<Hash>]
309
+ def fetch_titles(hit_data)
310
+ titles = hit_data[:title].split " - "
311
+ case titles.size
312
+ when 0
313
+ intro, main, part = nil, "", nil
314
+ when 1
315
+ intro, main, part = nil, titles[0], nil
316
+ when 2
317
+ if /^(Part|Partie) \d+:/ =~ titles[1]
318
+ intro, main, part = nil, titles[0], titles[1]
319
+ else
320
+ intro, main, part = titles[0], titles[1], nil
321
+ end
322
+ when 3
323
+ intro, main, part = titles[0], titles[1], titles[2]
324
+ else
325
+ intro, main, part = titles[0], titles[1], titles[2..-1]&.join(" -- ")
326
+ end
327
+ [{
328
+ title_intro: intro,
329
+ title_main: main,
330
+ title_part: part,
331
+ language: "en",
332
+ script: "Latn"
333
+ }]
334
+ end
335
+
336
+ # Return ISO script code.
337
+ # @param lang [String]
338
+ # @return [String]
339
+ # def script(lang)
340
+ # case lang
341
+ # when 'en', 'fr' then 'Latn'
342
+ # end
343
+ # end
344
+
345
+ # Fetch dates
346
+ # @param doc [Nokogiri::HTML::Document]
347
+ # @return [Array<Hash>]
348
+ def fetch_dates(doc)
349
+ dates = []
350
+ publish_date = doc.at("//span[@itemprop='releaseDate']").text
351
+ unless publish_date.empty?
352
+ dates << { type: "published", on: publish_date }
353
+ end
354
+ dates
355
+ end
356
+
357
+ def fetch_contributors(code)
358
+ code.sub(/\s.*/, "").split("/").map do |abbrev|
359
+ case abbrev
360
+ when "ISO"
361
+ name = "International Organization for Standardization"
362
+ url = "www.iso.org"
363
+ when "IEC"
364
+ name = "International Electrotechnical Commission"
365
+ url = "www.iec.ch"
366
+ end
367
+ { entity: { name: name, url: url, abbreviation: abbrev },
368
+ roles: ["publisher"] }
369
+ end
370
+ end
371
+
372
+ # Fetch ICS.
373
+ # @param doc [Nokogiri::HTML::Document]
374
+ # @return [Array<Hash>]
375
+ def fetch_ics(doc)
376
+ doc.xpath('//th[contains(text(), "ICS")]/following-sibling::td/a').map do |i|
377
+ code = i.text.match(/[\d\.]+/).to_s.split "."
378
+ { field: code[0], group: code[1], subgroup: code[2] }
379
+ end
380
+ end
381
+
382
+ # Fetch links.
383
+ # @param doc [Nokogiri::HTML::Document]
384
+ # @param url [String]
385
+ # @return [Array<Hash>]
386
+ def fetch_link(doc, url)
387
+ links = [{ type: "src", content: url }]
388
+ obp_elms = doc.at_css("p.btn-preview a")
389
+ links << { type: "obp", content: obp_elms[:href] } if obp_elms
390
+ links
391
+ end
392
+
393
+ # Fetch copyright.
394
+ # @param title [String]
395
+ # @return [Hash]
396
+ def fetch_copyright(code, doc)
397
+ abbreviation = code.match(/.*?(?=\s)/).to_s
398
+ case abbreviation
399
+ when "IEC"
400
+ name = "International Electrotechnical Commission"
401
+ url = "www.iec.ch"
402
+ end
403
+ from = code.match(/(?<=:)\d{4}/).to_s
404
+ if from.empty?
405
+ from = doc.xpath("//span[@itemprop='releaseDate']").text
406
+ .match(/\d{4}/).to_s
407
+ end
408
+ { owner: { name: name, abbreviation: abbreviation, url: url }, from: from }
409
+ end
410
+ end
411
+ end
412
+ # rubocop:enable Metrics/ModuleLength
413
+ end