relaton-iec 1.14.1 → 1.14.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-iec
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.14.1
4
+ version: 1.14.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2022-12-21 00:00:00.000000000 Z
11
+ date: 2023-03-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: equivalent-xml
@@ -150,6 +150,20 @@ dependencies:
150
150
  - - "~>"
151
151
  - !ruby/object:Gem::Version
152
152
  version: 1.14.0
153
+ - !ruby/object:Gem::Dependency
154
+ name: rubyzip
155
+ requirement: !ruby/object:Gem::Requirement
156
+ requirements:
157
+ - - ">="
158
+ - !ruby/object:Gem::Version
159
+ version: '0'
160
+ type: :runtime
161
+ prerelease: false
162
+ version_requirements: !ruby/object:Gem::Requirement
163
+ requirements:
164
+ - - ">="
165
+ - !ruby/object:Gem::Version
166
+ version: '0'
153
167
  description: 'RelatonIec: retrieve IEC Standards for bibliographic use using the IecBibliographicItem
154
168
  model'
155
169
  email:
@@ -193,13 +207,15 @@ files:
193
207
  - lib/relaton_iec/basic_block/stem.rb
194
208
  - lib/relaton_iec/basic_block/table.rb
195
209
  - lib/relaton_iec/basic_block/text_element.rb
210
+ - lib/relaton_iec/data_fetcher.rb
211
+ - lib/relaton_iec/data_parser.rb
196
212
  - lib/relaton_iec/hash_converter.rb
197
213
  - lib/relaton_iec/hit.rb
198
214
  - lib/relaton_iec/hit_collection.rb
199
215
  - lib/relaton_iec/iec_bibliographic_item.rb
200
216
  - lib/relaton_iec/iec_bibliography.rb
217
+ - lib/relaton_iec/index.rb
201
218
  - lib/relaton_iec/processor.rb
202
- - lib/relaton_iec/scrapper.rb
203
219
  - lib/relaton_iec/statuses.yml
204
220
  - lib/relaton_iec/tc_sc_officers_note.rb
205
221
  - lib/relaton_iec/version.rb
@@ -1,308 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- # Capybara.request_driver :poltergeist do |app|
4
- # Capybara::Poltergeist::Driver.new app, js_errors: false
5
- # end
6
- # Capybara.default_driver = :poltergeist
7
-
8
- module RelatonIec
9
- # Scrapper.
10
- module Scrapper
11
- DOMAIN = "https://webstore.iec.ch"
12
- ABBREVS = {
13
- "ISO" => ["International Organization for Standardization", "www.iso.org"],
14
- "IEC" => ["International Electrotechnical Commission", "www.iec.ch"],
15
- "CISPR" => ["International special committee on radio interference", "www.iec.ch"],
16
- }.freeze
17
-
18
- TYPES = {
19
- "ISO" => "international-standard",
20
- "TS" => "technical-specification",
21
- "TR" => "technical-report",
22
- "PAS" => "publicly-available-specification",
23
- "AWI" => "appruved-work-item",
24
- "CD" => "committee-draft",
25
- "FDIS" => "final-draft-international-standard",
26
- "NP" => "new-proposal",
27
- "DIS" => "draft-international-standard",
28
- "WD" => "working-draft",
29
- "R" => "recommendation",
30
- "Guide" => "guide",
31
- "SRD" => "system-reference-deliverable",
32
- }.freeze
33
-
34
- class << self
35
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
36
-
37
- # Parse page.
38
- # @param hit_data [Hash]
39
- # @return [Hash]
40
- def parse_page(hit_data)
41
- doc = get_page hit_data[:url]
42
-
43
- # Fetch edition.
44
- edition = doc.at(
45
- "//th[contains(., 'Edition')]/following-sibling::td/span",
46
- ).text
47
-
48
- status, relations = fetch_status_relations hit_data[:url]
49
-
50
- IecBibliographicItem.new(
51
- fetched: Date.today.to_s,
52
- docid: fetch_docid(hit_data),
53
- structuredidentifier: fetch_structuredidentifier(doc),
54
- edition: edition,
55
- language: ["en"],
56
- script: ["Latn"],
57
- title: fetch_titles(hit_data),
58
- doctype: fetch_type(doc),
59
- docstatus: status,
60
- ics: fetch_ics(doc),
61
- date: fetch_dates(doc),
62
- contributor: fetch_contributors(hit_data[:code]),
63
- editorialgroup: fetch_workgroup(doc),
64
- abstract: fetch_abstract(doc),
65
- copyright: fetch_copyright(hit_data[:code], doc),
66
- link: fetch_link(doc, hit_data[:url]),
67
- relation: relations,
68
- place: ["Geneva"],
69
- )
70
- end
71
- # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
72
-
73
- private
74
-
75
- # @param hit [Hash]
76
- # @return [Array<RelatonBib::DocumentIdentifier>]
77
- def fetch_docid(hit)
78
- urn = RelatonIec.code_to_urn hit[:code], "en"
79
- [
80
- RelatonBib::DocumentIdentifier.new(id: hit[:code], type: "IEC", primary: true),
81
- RelatonBib::DocumentIdentifier.new(id: urn, type: "URN"),
82
- ]
83
- end
84
-
85
- # Fetch abstracts.
86
- # @param doc [Nokigiri::HTML::Document]
87
- # @return [Array<Array>]
88
- def fetch_abstract(doc)
89
- abstract_content = doc.at('//div[@itemprop="description"]').text
90
- [{
91
- content: abstract_content,
92
- language: "en",
93
- script: "Latn",
94
- format: "text/plain",
95
- }]
96
- end
97
-
98
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
99
-
100
- # Get page.
101
- # @param path [String] page's path
102
- # @return [Array<Nokogiri::HTML::Document, String>]
103
- def get_page(url)
104
- uri = URI url
105
- resp = Net::HTTP.get_response(uri)
106
- case resp.code
107
- when "301"
108
- path = resp["location"]
109
- url = DOMAIN + path
110
- uri = URI url
111
- resp = Net::HTTP.get_response(uri)
112
- when "404"
113
- raise RelatonBib::RequestError, "Page not found #{url}"
114
- end
115
- Nokogiri::HTML(resp.body)
116
- rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
117
- EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
118
- Net::ProtocolError, OpenSSL::SSL::SSLError
119
- raise RelatonBib::RequestError, "Could not access #{url}"
120
- end
121
- # rubocop:enable Metrics/AbcSize
122
-
123
- # Fetch structuredidentifier.
124
- # @param doc [Nokogiri::HTML::Document]
125
- # @return [RelatonIsoBib::StructuredIdentifier]
126
- def fetch_structuredidentifier(doc)
127
- item_ref = doc.at("//span[@itemprop='productID']")
128
- unless item_ref
129
- return RelatonIsoBib::StructuredIdentifier.new(
130
- project_number: "?", part_number: "", prefix: nil, id: "?",
131
- )
132
- end
133
-
134
- m = item_ref.text.match(
135
- /(?<=\s)(?<project>\d+)-?(?<part>(?<=-)\d+|)-?(?<subpart>(?<=-)\d+|)/,
136
- )
137
- RelatonIsoBib::StructuredIdentifier.new(
138
- project_number: m[:project],
139
- part_number: m[:part],
140
- subpart_number: m[:subpart],
141
- prefix: nil,
142
- type: "IEC",
143
- id: item_ref.text,
144
- )
145
- end
146
-
147
- # Fetch status.
148
- # @param doc [Nokogiri::HTML::Document]
149
- # @param status [String]
150
- # @return [Hash]
151
- def fetch_status(doc)
152
- wip = doc.at('//ROW[STATUS[.="PREPARING"]]')
153
- if wip
154
- statuses = YAML.load_file File.join __dir__, "statuses.yml"
155
- s = wip.at("STAGE").text
156
- return unless statuses[s]
157
-
158
- stage, substage = statuses[s]["stage"].split "."
159
- else
160
- stage = "60"
161
- substage = "60"
162
- end
163
- RelatonBib::DocumentStatus.new(stage: stage, substage: substage)
164
- end
165
-
166
- # Fetch workgroup.
167
- # @param doc [Nokogiri::HTML::Document]
168
- # @return [Hash]
169
- def fetch_workgroup(doc)
170
- wg = doc.at('//th/abbr[.="TC"]/../following-sibling::td/a').text
171
- {
172
- name: "International Electrotechnical Commission",
173
- abbreviation: "IEC",
174
- url: "webstore.iec.ch",
175
- technical_committee: [{
176
- name: wg,
177
- type: "technicalCommittee",
178
- number: wg.match(/\d+/)&.to_s&.to_i,
179
- }],
180
- }
181
- end
182
- # rubocop:enable Metrics/MethodLength
183
-
184
- # Fetch relations.
185
- # @param doc [Nokogiri::HTML::Document]
186
- # @return [Array<Hash>]
187
- # rubocop:disable Metrics/MethodLength
188
- def fetch_relations(doc)
189
- doc.xpath('//ROW[STATUS[.!="PREPARING"]][STATUS[.!="PUBLISHED"]]')
190
- .map do |r|
191
- r_type = r.at("STATUS").text.downcase
192
- type = case r_type
193
- # when 'published' then 'obsoletes' # Valid
194
- when "revised", "replaced" then "updates"
195
- when "withdrawn" then "obsoletes"
196
- else r_type
197
- end
198
- ref = r.at("FULL_NAME").text
199
- fref = RelatonBib::FormattedRef.new content: ref, format: "text/plain"
200
- bibitem = IecBibliographicItem.new(
201
- formattedref: fref,
202
- docid: [RelatonBib::DocumentIdentifier.new(id: ref, type: "IEC", primary: true)],
203
- )
204
- { type: type, bibitem: bibitem }
205
- end
206
- end
207
-
208
- def fetch_status_relations(url)
209
- pubid = url.match(/\d+$/).to_s
210
- uri = URI "#{DOMAIN}/webstore/webstore.nsf/AjaxRequestXML?"\
211
- "Openagent&url=#{pubid}"
212
- resp = Net::HTTP.get_response uri
213
- doc = Nokogiri::XML resp.body
214
- status = fetch_status doc
215
- relations = fetch_relations doc
216
- [status, relations]
217
- end
218
- # rubocop:enable Metrics/MethodLength
219
-
220
- # Fetch type.
221
- # @param doc [Nokogiri::HTML::Document]
222
- # @return [String]
223
- def fetch_type(doc)
224
- type = doc.at(
225
- '//th[contains(., "Publication type")]/following-sibling::td/span',
226
- ).text
227
- TYPES[type] || type.downcase.tr(" ", "-")
228
- end
229
-
230
- # Fetch titles.
231
- # @param hit_data [Hash]
232
- # @return [Array<Hash>]
233
- def fetch_titles(hit_data)
234
- RelatonBib::TypedTitleString.from_string hit_data[:title], "en", "Latn"
235
- end
236
-
237
- # Fetch dates
238
- # @param doc [Nokogiri::HTML::Document]
239
- # @return [Array<Hash>]
240
- def fetch_dates(doc)
241
- dates = []
242
- publish_date = doc.at("//span[@itemprop='releaseDate']").text
243
- unless publish_date.empty?
244
- dates << { type: "published", on: publish_date }
245
- end
246
- dates
247
- end
248
-
249
- # rubocop:disable Metrics/MethodLength
250
-
251
- def fetch_contributors(code)
252
- code.sub(/\s.*/, "").split("/").map do |abbrev|
253
- name, url = name_url abbrev
254
- { entity: { name: name, url: url, abbreviation: abbrev },
255
- role: [type: "publisher"] }
256
- end
257
- end
258
- # rubocop:enable Metrics/MethodLength
259
-
260
- # Fetch ICS.
261
- # @param doc [Nokogiri::HTML::Document]
262
- # @return [Array<Hash>]
263
- def fetch_ics(doc)
264
- doc.xpath(
265
- '//th[contains(text(), "ICS")]/following-sibling::td/a',
266
- ).map do |i|
267
- code = i.text.match(/[\d.]+/).to_s.split "."
268
- { field: code[0], group: code[1], subgroup: code[2] }
269
- end
270
- end
271
-
272
- # Fetch links.
273
- # @param doc [Nokogiri::HTML::Document]
274
- # @param url [String]
275
- # @return [Array<Hash>]
276
- def fetch_link(doc, url)
277
- links = [{ type: "src", content: url }]
278
- obp_elms = doc.at_css("p.btn-preview a")
279
- links << { type: "obp", content: obp_elms[:href] } if obp_elms
280
- links
281
- end
282
-
283
- # rubocop:disable Metrics/MethodLength
284
-
285
- # Fetch copyright.
286
- # @param title [String]
287
- # @return [Array<Hash>]
288
- def fetch_copyright(code, doc)
289
- abbreviation = code.match(/.*?(?=\s)/).to_s
290
- name, url = name_url abbreviation
291
- from = code.match(/(?<=:)\d{4}/).to_s
292
- if from.empty?
293
- from = doc.xpath("//span[@itemprop='releaseDate']").text
294
- .match(/\d{4}/).to_s
295
- end
296
- [{
297
- owner: [{ name: name, abbreviation: abbreviation, url: url }],
298
- from: from,
299
- }]
300
- end
301
- # rubocop:enable Metrics/MethodLength
302
-
303
- def name_url(abbrev)
304
- ABBREVS[abbrev]
305
- end
306
- end
307
- end
308
- end