relaton-iec 1.14.1 → 1.14.2

Sign up to get free protection for your applications and to get access to all the features.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-iec
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.14.1
4
+ version: 1.14.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-12-21 00:00:00.000000000 Z
11
+ date: 2023-03-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: equivalent-xml
@@ -150,6 +150,20 @@ dependencies:
150
150
  - - "~>"
151
151
  - !ruby/object:Gem::Version
152
152
  version: 1.14.0
153
+ - !ruby/object:Gem::Dependency
154
+ name: rubyzip
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ type: :runtime
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: '0'
153
167
  description: 'RelatonIec: retrieve IEC Standards for bibliographic use using the IecBibliographicItem
154
168
  model'
155
169
  email:
@@ -193,13 +207,15 @@ files:
193
207
  - lib/relaton_iec/basic_block/stem.rb
194
208
  - lib/relaton_iec/basic_block/table.rb
195
209
  - lib/relaton_iec/basic_block/text_element.rb
210
+ - lib/relaton_iec/data_fetcher.rb
211
+ - lib/relaton_iec/data_parser.rb
196
212
  - lib/relaton_iec/hash_converter.rb
197
213
  - lib/relaton_iec/hit.rb
198
214
  - lib/relaton_iec/hit_collection.rb
199
215
  - lib/relaton_iec/iec_bibliographic_item.rb
200
216
  - lib/relaton_iec/iec_bibliography.rb
217
+ - lib/relaton_iec/index.rb
201
218
  - lib/relaton_iec/processor.rb
202
- - lib/relaton_iec/scrapper.rb
203
219
  - lib/relaton_iec/statuses.yml
204
220
  - lib/relaton_iec/tc_sc_officers_note.rb
205
221
  - lib/relaton_iec/version.rb
@@ -1,308 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- # Capybara.request_driver :poltergeist do |app|
4
- # Capybara::Poltergeist::Driver.new app, js_errors: false
5
- # end
6
- # Capybara.default_driver = :poltergeist
7
-
8
- module RelatonIec
9
- # Scrapper.
10
- module Scrapper
11
- DOMAIN = "https://webstore.iec.ch"
12
- ABBREVS = {
13
- "ISO" => ["International Organization for Standardization", "www.iso.org"],
14
- "IEC" => ["International Electrotechnical Commission", "www.iec.ch"],
15
- "CISPR" => ["International special committee on radio interference", "www.iec.ch"],
16
- }.freeze
17
-
18
- TYPES = {
19
- "ISO" => "international-standard",
20
- "TS" => "technical-specification",
21
- "TR" => "technical-report",
22
- "PAS" => "publicly-available-specification",
23
- "AWI" => "appruved-work-item",
24
- "CD" => "committee-draft",
25
- "FDIS" => "final-draft-international-standard",
26
- "NP" => "new-proposal",
27
- "DIS" => "draft-international-standard",
28
- "WD" => "working-draft",
29
- "R" => "recommendation",
30
- "Guide" => "guide",
31
- "SRD" => "system-reference-deliverable",
32
- }.freeze
33
-
34
- class << self
35
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
36
-
37
- # Parse page.
38
- # @param hit_data [Hash]
39
- # @return [Hash]
40
- def parse_page(hit_data)
41
- doc = get_page hit_data[:url]
42
-
43
- # Fetch edition.
44
- edition = doc.at(
45
- "//th[contains(., 'Edition')]/following-sibling::td/span",
46
- ).text
47
-
48
- status, relations = fetch_status_relations hit_data[:url]
49
-
50
- IecBibliographicItem.new(
51
- fetched: Date.today.to_s,
52
- docid: fetch_docid(hit_data),
53
- structuredidentifier: fetch_structuredidentifier(doc),
54
- edition: edition,
55
- language: ["en"],
56
- script: ["Latn"],
57
- title: fetch_titles(hit_data),
58
- doctype: fetch_type(doc),
59
- docstatus: status,
60
- ics: fetch_ics(doc),
61
- date: fetch_dates(doc),
62
- contributor: fetch_contributors(hit_data[:code]),
63
- editorialgroup: fetch_workgroup(doc),
64
- abstract: fetch_abstract(doc),
65
- copyright: fetch_copyright(hit_data[:code], doc),
66
- link: fetch_link(doc, hit_data[:url]),
67
- relation: relations,
68
- place: ["Geneva"],
69
- )
70
- end
71
- # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
72
-
73
- private
74
-
75
- # @param hit [Hash]
76
- # @return [Array<RelatonBib::DocumentIdentifier>]
77
- def fetch_docid(hit)
78
- urn = RelatonIec.code_to_urn hit[:code], "en"
79
- [
80
- RelatonBib::DocumentIdentifier.new(id: hit[:code], type: "IEC", primary: true),
81
- RelatonBib::DocumentIdentifier.new(id: urn, type: "URN"),
82
- ]
83
- end
84
-
85
- # Fetch abstracts.
86
- # @param doc [Nokogiri::HTML::Document]
87
- # @return [Array<Hash>]
88
- def fetch_abstract(doc)
89
- abstract_content = doc.at('//div[@itemprop="description"]').text
90
- [{
91
- content: abstract_content,
92
- language: "en",
93
- script: "Latn",
94
- format: "text/plain",
95
- }]
96
- end
97
-
98
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
99
-
100
- # Get page.
101
- # @param url [String] page's URL
102
- # @return [Nokogiri::HTML::Document]
103
- def get_page(url)
104
- uri = URI url
105
- resp = Net::HTTP.get_response(uri)
106
- case resp.code
107
- when "301"
108
- path = resp["location"]
109
- url = DOMAIN + path
110
- uri = URI url
111
- resp = Net::HTTP.get_response(uri)
112
- when "404"
113
- raise RelatonBib::RequestError, "Page not found #{url}"
114
- end
115
- Nokogiri::HTML(resp.body)
116
- rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
117
- EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
118
- Net::ProtocolError, OpenSSL::SSL::SSLError
119
- raise RelatonBib::RequestError, "Could not access #{url}"
120
- end
121
- # rubocop:enable Metrics/AbcSize
122
-
123
- # Fetch structuredidentifier.
124
- # @param doc [Nokogiri::HTML::Document]
125
- # @return [RelatonIsoBib::StructuredIdentifier]
126
- def fetch_structuredidentifier(doc)
127
- item_ref = doc.at("//span[@itemprop='productID']")
128
- unless item_ref
129
- return RelatonIsoBib::StructuredIdentifier.new(
130
- project_number: "?", part_number: "", prefix: nil, id: "?",
131
- )
132
- end
133
-
134
- m = item_ref.text.match(
135
- /(?<=\s)(?<project>\d+)-?(?<part>(?<=-)\d+|)-?(?<subpart>(?<=-)\d+|)/,
136
- )
137
- RelatonIsoBib::StructuredIdentifier.new(
138
- project_number: m[:project],
139
- part_number: m[:part],
140
- subpart_number: m[:subpart],
141
- prefix: nil,
142
- type: "IEC",
143
- id: item_ref.text,
144
- )
145
- end
146
-
147
- # Fetch status.
148
- # @param doc [Nokogiri::HTML::Document]
149
- # @param status [String]
150
- # @return [RelatonBib::DocumentStatus, nil]
151
- def fetch_status(doc)
152
- wip = doc.at('//ROW[STATUS[.="PREPARING"]]')
153
- if wip
154
- statuses = YAML.load_file File.join __dir__, "statuses.yml"
155
- s = wip.at("STAGE").text
156
- return unless statuses[s]
157
-
158
- stage, substage = statuses[s]["stage"].split "."
159
- else
160
- stage = "60"
161
- substage = "60"
162
- end
163
- RelatonBib::DocumentStatus.new(stage: stage, substage: substage)
164
- end
165
-
166
- # Fetch workgroup.
167
- # @param doc [Nokogiri::HTML::Document]
168
- # @return [Hash]
169
- def fetch_workgroup(doc)
170
- wg = doc.at('//th/abbr[.="TC"]/../following-sibling::td/a').text
171
- {
172
- name: "International Electrotechnical Commission",
173
- abbreviation: "IEC",
174
- url: "webstore.iec.ch",
175
- technical_committee: [{
176
- name: wg,
177
- type: "technicalCommittee",
178
- number: wg.match(/\d+/)&.to_s&.to_i,
179
- }],
180
- }
181
- end
182
- # rubocop:enable Metrics/MethodLength
183
-
184
- # Fetch relations.
185
- # @param doc [Nokogiri::HTML::Document]
186
- # @return [Array<Hash>]
187
- # rubocop:disable Metrics/MethodLength
188
- def fetch_relations(doc)
189
- doc.xpath('//ROW[STATUS[.!="PREPARING"]][STATUS[.!="PUBLISHED"]]')
190
- .map do |r|
191
- r_type = r.at("STATUS").text.downcase
192
- type = case r_type
193
- # when 'published' then 'obsoletes' # Valid
194
- when "revised", "replaced" then "updates"
195
- when "withdrawn" then "obsoletes"
196
- else r_type
197
- end
198
- ref = r.at("FULL_NAME").text
199
- fref = RelatonBib::FormattedRef.new content: ref, format: "text/plain"
200
- bibitem = IecBibliographicItem.new(
201
- formattedref: fref,
202
- docid: [RelatonBib::DocumentIdentifier.new(id: ref, type: "IEC", primary: true)],
203
- )
204
- { type: type, bibitem: bibitem }
205
- end
206
- end
207
-
208
- def fetch_status_relations(url)
209
- pubid = url.match(/\d+$/).to_s
210
- uri = URI "#{DOMAIN}/webstore/webstore.nsf/AjaxRequestXML?"\
211
- "Openagent&url=#{pubid}"
212
- resp = Net::HTTP.get_response uri
213
- doc = Nokogiri::XML resp.body
214
- status = fetch_status doc
215
- relations = fetch_relations doc
216
- [status, relations]
217
- end
218
- # rubocop:enable Metrics/MethodLength
219
-
220
- # Fetch type.
221
- # @param doc [Nokogiri::HTML::Document]
222
- # @return [String]
223
- def fetch_type(doc)
224
- type = doc.at(
225
- '//th[contains(., "Publication type")]/following-sibling::td/span',
226
- ).text
227
- TYPES[type] || type.downcase.tr(" ", "-")
228
- end
229
-
230
- # Fetch titles.
231
- # @param hit_data [Hash]
232
- # @return [Array<Hash>]
233
- def fetch_titles(hit_data)
234
- RelatonBib::TypedTitleString.from_string hit_data[:title], "en", "Latn"
235
- end
236
-
237
- # Fetch dates
238
- # @param doc [Nokogiri::HTML::Document]
239
- # @return [Array<Hash>]
240
- def fetch_dates(doc)
241
- dates = []
242
- publish_date = doc.at("//span[@itemprop='releaseDate']").text
243
- unless publish_date.empty?
244
- dates << { type: "published", on: publish_date }
245
- end
246
- dates
247
- end
248
-
249
- # rubocop:disable Metrics/MethodLength
250
-
251
- def fetch_contributors(code)
252
- code.sub(/\s.*/, "").split("/").map do |abbrev|
253
- name, url = name_url abbrev
254
- { entity: { name: name, url: url, abbreviation: abbrev },
255
- role: [type: "publisher"] }
256
- end
257
- end
258
- # rubocop:enable Metrics/MethodLength
259
-
260
- # Fetch ICS.
261
- # @param doc [Nokogiri::HTML::Document]
262
- # @return [Array<Hash>]
263
- def fetch_ics(doc)
264
- doc.xpath(
265
- '//th[contains(text(), "ICS")]/following-sibling::td/a',
266
- ).map do |i|
267
- code = i.text.match(/[\d.]+/).to_s.split "."
268
- { field: code[0], group: code[1], subgroup: code[2] }
269
- end
270
- end
271
-
272
- # Fetch links.
273
- # @param doc [Nokogiri::HTML::Document]
274
- # @param url [String]
275
- # @return [Array<Hash>]
276
- def fetch_link(doc, url)
277
- links = [{ type: "src", content: url }]
278
- obp_elms = doc.at_css("p.btn-preview a")
279
- links << { type: "obp", content: obp_elms[:href] } if obp_elms
280
- links
281
- end
282
-
283
- # rubocop:disable Metrics/MethodLength
284
-
285
- # Fetch copyright.
286
- # @param code [String]
287
- # @return [Array<Hash>]
288
- def fetch_copyright(code, doc)
289
- abbreviation = code.match(/.*?(?=\s)/).to_s
290
- name, url = name_url abbreviation
291
- from = code.match(/(?<=:)\d{4}/).to_s
292
- if from.empty?
293
- from = doc.xpath("//span[@itemprop='releaseDate']").text
294
- .match(/\d{4}/).to_s
295
- end
296
- [{
297
- owner: [{ name: name, abbreviation: abbreviation, url: url }],
298
- from: from,
299
- }]
300
- end
301
- # rubocop:enable Metrics/MethodLength
302
-
303
- def name_url(abbrev)
304
- ABBREVS[abbrev]
305
- end
306
- end
307
- end
308
- end