relaton-iec 1.14.0 → 1.14.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +31 -26
- data/lib/relaton_iec/data_fetcher.rb +166 -0
- data/lib/relaton_iec/data_parser.rb +287 -0
- data/lib/relaton_iec/hit.rb +9 -1
- data/lib/relaton_iec/hit_collection.rb +15 -79
- data/lib/relaton_iec/iec_bibliographic_item.rb +20 -5
- data/lib/relaton_iec/iec_bibliography.rb +83 -111
- data/lib/relaton_iec/index.rb +133 -0
- data/lib/relaton_iec/processor.rb +13 -0
- data/lib/relaton_iec/version.rb +1 -1
- data/lib/relaton_iec.rb +9 -6
- data/relaton_iec.gemspec +4 -3
- metadata +23 -7
- data/lib/relaton_iec/scrapper.rb +0 -308
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-iec
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.14.
|
4
|
+
version: 1.14.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-03-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: equivalent-xml
|
@@ -150,6 +150,20 @@ dependencies:
|
|
150
150
|
- - "~>"
|
151
151
|
- !ruby/object:Gem::Version
|
152
152
|
version: 1.14.0
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: rubyzip
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - ">="
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '0'
|
160
|
+
type: :runtime
|
161
|
+
prerelease: false
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
163
|
+
requirements:
|
164
|
+
- - ">="
|
165
|
+
- !ruby/object:Gem::Version
|
166
|
+
version: '0'
|
153
167
|
description: 'RelatonIec: retrieve IEC Standards for bibliographic use using the IecBibliographicItem
|
154
168
|
model'
|
155
169
|
email:
|
@@ -193,13 +207,15 @@ files:
|
|
193
207
|
- lib/relaton_iec/basic_block/stem.rb
|
194
208
|
- lib/relaton_iec/basic_block/table.rb
|
195
209
|
- lib/relaton_iec/basic_block/text_element.rb
|
210
|
+
- lib/relaton_iec/data_fetcher.rb
|
211
|
+
- lib/relaton_iec/data_parser.rb
|
196
212
|
- lib/relaton_iec/hash_converter.rb
|
197
213
|
- lib/relaton_iec/hit.rb
|
198
214
|
- lib/relaton_iec/hit_collection.rb
|
199
215
|
- lib/relaton_iec/iec_bibliographic_item.rb
|
200
216
|
- lib/relaton_iec/iec_bibliography.rb
|
217
|
+
- lib/relaton_iec/index.rb
|
201
218
|
- lib/relaton_iec/processor.rb
|
202
|
-
- lib/relaton_iec/scrapper.rb
|
203
219
|
- lib/relaton_iec/statuses.yml
|
204
220
|
- lib/relaton_iec/tc_sc_officers_note.rb
|
205
221
|
- lib/relaton_iec/version.rb
|
@@ -209,7 +225,7 @@ homepage: https://github.com/metanorma/relaton-iec
|
|
209
225
|
licenses:
|
210
226
|
- MIT
|
211
227
|
metadata: {}
|
212
|
-
post_install_message:
|
228
|
+
post_install_message:
|
213
229
|
rdoc_options: []
|
214
230
|
require_paths:
|
215
231
|
- lib
|
@@ -224,8 +240,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
224
240
|
- !ruby/object:Gem::Version
|
225
241
|
version: '0'
|
226
242
|
requirements: []
|
227
|
-
rubygems_version: 3.
|
228
|
-
signing_key:
|
243
|
+
rubygems_version: 3.1.6
|
244
|
+
signing_key:
|
229
245
|
specification_version: 4
|
230
246
|
summary: 'RelatonIec: retrieve IEC Standards for bibliographic use using the IecBibliographicItem
|
231
247
|
model'
|
data/lib/relaton_iec/scrapper.rb
DELETED
@@ -1,308 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
# Capybara.request_driver :poltergeist do |app|
|
4
|
-
# Capybara::Poltergeist::Driver.new app, js_errors: false
|
5
|
-
# end
|
6
|
-
# Capybara.default_driver = :poltergeist
|
7
|
-
|
8
|
-
module RelatonIec
|
9
|
-
# Scrapper.
|
10
|
-
module Scrapper
|
11
|
-
DOMAIN = "https://webstore.iec.ch"
|
12
|
-
ABBREVS = {
|
13
|
-
"ISO" => ["International Organization for Standardization", "www.iso.org"],
|
14
|
-
"IEC" => ["International Electrotechnical Commission", "www.iec.ch"],
|
15
|
-
"CISPR" => ["International special committee on radio interference", "www.iec.ch"],
|
16
|
-
}.freeze
|
17
|
-
|
18
|
-
TYPES = {
|
19
|
-
"ISO" => "international-standard",
|
20
|
-
"TS" => "technical-specification",
|
21
|
-
"TR" => "technical-report",
|
22
|
-
"PAS" => "publicly-available-specification",
|
23
|
-
"AWI" => "appruved-work-item",
|
24
|
-
"CD" => "committee-draft",
|
25
|
-
"FDIS" => "final-draft-international-standard",
|
26
|
-
"NP" => "new-proposal",
|
27
|
-
"DIS" => "draft-international-standard",
|
28
|
-
"WD" => "working-draft",
|
29
|
-
"R" => "recommendation",
|
30
|
-
"Guide" => "guide",
|
31
|
-
"SRD" => "system-reference-delivrabble",
|
32
|
-
}.freeze
|
33
|
-
|
34
|
-
class << self
|
35
|
-
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
36
|
-
|
37
|
-
# Parse page.
|
38
|
-
# @param hit_data [Hash]
|
39
|
-
# @return [Hash]
|
40
|
-
def parse_page(hit_data)
|
41
|
-
doc = get_page hit_data[:url]
|
42
|
-
|
43
|
-
# Fetch edition.
|
44
|
-
edition = doc.at(
|
45
|
-
"//th[contains(., 'Edition')]/following-sibling::td/span",
|
46
|
-
).text
|
47
|
-
|
48
|
-
status, relations = fetch_status_relations hit_data[:url]
|
49
|
-
|
50
|
-
IecBibliographicItem.new(
|
51
|
-
fetched: Date.today.to_s,
|
52
|
-
docid: fetch_docid(hit_data),
|
53
|
-
structuredidentifier: fetch_structuredidentifier(doc),
|
54
|
-
edition: edition,
|
55
|
-
language: ["en"],
|
56
|
-
script: ["Latn"],
|
57
|
-
title: fetch_titles(hit_data),
|
58
|
-
doctype: fetch_type(doc),
|
59
|
-
docstatus: status,
|
60
|
-
ics: fetch_ics(doc),
|
61
|
-
date: fetch_dates(doc),
|
62
|
-
contributor: fetch_contributors(hit_data[:code]),
|
63
|
-
editorialgroup: fetch_workgroup(doc),
|
64
|
-
abstract: fetch_abstract(doc),
|
65
|
-
copyright: fetch_copyright(hit_data[:code], doc),
|
66
|
-
link: fetch_link(doc, hit_data[:url]),
|
67
|
-
relation: relations,
|
68
|
-
place: ["Geneva"],
|
69
|
-
)
|
70
|
-
end
|
71
|
-
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
72
|
-
|
73
|
-
private
|
74
|
-
|
75
|
-
# @param hit [Hash]
|
76
|
-
# @return [Array<RelatonBib::DocumentIdentifier>]
|
77
|
-
def fetch_docid(hit)
|
78
|
-
urn = RelatonIec.code_to_urn hit[:code], "en"
|
79
|
-
[
|
80
|
-
RelatonBib::DocumentIdentifier.new(id: hit[:code], type: "IEC", primary: true),
|
81
|
-
RelatonBib::DocumentIdentifier.new(id: urn, type: "URN"),
|
82
|
-
]
|
83
|
-
end
|
84
|
-
|
85
|
-
# Fetch abstracts.
|
86
|
-
# @param doc [Nokigiri::HTML::Document]
|
87
|
-
# @return [Array<Array>]
|
88
|
-
def fetch_abstract(doc)
|
89
|
-
abstract_content = doc.at('//div[@itemprop="description"]').text
|
90
|
-
[{
|
91
|
-
content: abstract_content,
|
92
|
-
language: "en",
|
93
|
-
script: "Latn",
|
94
|
-
format: "text/plain",
|
95
|
-
}]
|
96
|
-
end
|
97
|
-
|
98
|
-
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
99
|
-
|
100
|
-
# Get page.
|
101
|
-
# @param path [String] page's path
|
102
|
-
# @return [Array<Nokogiri::HTML::Document, String>]
|
103
|
-
def get_page(url)
|
104
|
-
uri = URI url
|
105
|
-
resp = Net::HTTP.get_response(uri)
|
106
|
-
case resp.code
|
107
|
-
when "301"
|
108
|
-
path = resp["location"]
|
109
|
-
url = DOMAIN + path
|
110
|
-
uri = URI url
|
111
|
-
resp = Net::HTTP.get_response(uri)
|
112
|
-
when "404"
|
113
|
-
raise RelatonBib::RequestError, "Page not found #{url}"
|
114
|
-
end
|
115
|
-
Nokogiri::HTML(resp.body)
|
116
|
-
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
117
|
-
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
118
|
-
Net::ProtocolError, OpenSSL::SSL::SSLError
|
119
|
-
raise RelatonBib::RequestError, "Could not access #{url}"
|
120
|
-
end
|
121
|
-
# rubocop:enable Metrics/AbcSize
|
122
|
-
|
123
|
-
# Fetch structuredidentifier.
|
124
|
-
# @param doc [Nokogiri::HTML::Document]
|
125
|
-
# @return [RelatonIsoBib::StructuredIdentifier]
|
126
|
-
def fetch_structuredidentifier(doc)
|
127
|
-
item_ref = doc.at("//span[@itemprop='productID']")
|
128
|
-
unless item_ref
|
129
|
-
return RelatonIsoBib::StructuredIdentifier.new(
|
130
|
-
project_number: "?", part_number: "", prefix: nil, id: "?",
|
131
|
-
)
|
132
|
-
end
|
133
|
-
|
134
|
-
m = item_ref.text.match(
|
135
|
-
/(?<=\s)(?<project>\d+)-?(?<part>(?<=-)\d+|)-?(?<subpart>(?<=-)\d+|)/,
|
136
|
-
)
|
137
|
-
RelatonIsoBib::StructuredIdentifier.new(
|
138
|
-
project_number: m[:project],
|
139
|
-
part_number: m[:part],
|
140
|
-
subpart_number: m[:subpart],
|
141
|
-
prefix: nil,
|
142
|
-
type: "IEC",
|
143
|
-
id: item_ref.text,
|
144
|
-
)
|
145
|
-
end
|
146
|
-
|
147
|
-
# Fetch status.
|
148
|
-
# @param doc [Nokogiri::HTML::Document]
|
149
|
-
# @param status [String]
|
150
|
-
# @return [Hash]
|
151
|
-
def fetch_status(doc)
|
152
|
-
wip = doc.at('//ROW[STATUS[.="PREPARING"]]')
|
153
|
-
if wip
|
154
|
-
statuses = YAML.load_file File.join __dir__, "statuses.yml"
|
155
|
-
s = wip.at("STAGE").text
|
156
|
-
return unless statuses[s]
|
157
|
-
|
158
|
-
stage, substage = statuses[s]["stage"].split "."
|
159
|
-
else
|
160
|
-
stage = "60"
|
161
|
-
substage = "60"
|
162
|
-
end
|
163
|
-
RelatonBib::DocumentStatus.new(stage: stage, substage: substage)
|
164
|
-
end
|
165
|
-
|
166
|
-
# Fetch workgroup.
|
167
|
-
# @param doc [Nokogiri::HTML::Document]
|
168
|
-
# @return [Hash]
|
169
|
-
def fetch_workgroup(doc)
|
170
|
-
wg = doc.at('//th/abbr[.="TC"]/../following-sibling::td/a').text
|
171
|
-
{
|
172
|
-
name: "International Electrotechnical Commission",
|
173
|
-
abbreviation: "IEC",
|
174
|
-
url: "webstore.iec.ch",
|
175
|
-
technical_committee: [{
|
176
|
-
name: wg,
|
177
|
-
type: "technicalCommittee",
|
178
|
-
number: wg.match(/\d+/)&.to_s&.to_i,
|
179
|
-
}],
|
180
|
-
}
|
181
|
-
end
|
182
|
-
# rubocop:enable Metrics/MethodLength
|
183
|
-
|
184
|
-
# Fetch relations.
|
185
|
-
# @param doc [Nokogiri::HTML::Document]
|
186
|
-
# @return [Array<Hash>]
|
187
|
-
# rubocop:disable Metrics/MethodLength
|
188
|
-
def fetch_relations(doc)
|
189
|
-
doc.xpath('//ROW[STATUS[.!="PREPARING"]][STATUS[.!="PUBLISHED"]]')
|
190
|
-
.map do |r|
|
191
|
-
r_type = r.at("STATUS").text.downcase
|
192
|
-
type = case r_type
|
193
|
-
# when 'published' then 'obsoletes' # Valid
|
194
|
-
when "revised", "replaced" then "updates"
|
195
|
-
when "withdrawn" then "obsoletes"
|
196
|
-
else r_type
|
197
|
-
end
|
198
|
-
ref = r.at("FULL_NAME").text
|
199
|
-
fref = RelatonBib::FormattedRef.new content: ref, format: "text/plain"
|
200
|
-
bibitem = IecBibliographicItem.new(
|
201
|
-
formattedref: fref,
|
202
|
-
docid: [RelatonBib::DocumentIdentifier.new(id: ref, type: "IEC", primary: true)],
|
203
|
-
)
|
204
|
-
{ type: type, bibitem: bibitem }
|
205
|
-
end
|
206
|
-
end
|
207
|
-
|
208
|
-
def fetch_status_relations(url)
|
209
|
-
pubid = url.match(/\d+$/).to_s
|
210
|
-
uri = URI "#{DOMAIN}/webstore/webstore.nsf/AjaxRequestXML?"\
|
211
|
-
"Openagent&url=#{pubid}"
|
212
|
-
resp = Net::HTTP.get_response uri
|
213
|
-
doc = Nokogiri::XML resp.body
|
214
|
-
status = fetch_status doc
|
215
|
-
relations = fetch_relations doc
|
216
|
-
[status, relations]
|
217
|
-
end
|
218
|
-
# rubocop:enable Metrics/MethodLength
|
219
|
-
|
220
|
-
# Fetch type.
|
221
|
-
# @param doc [Nokogiri::HTML::Document]
|
222
|
-
# @return [String]
|
223
|
-
def fetch_type(doc)
|
224
|
-
type = doc.at(
|
225
|
-
'//th[contains(., "Publication type")]/following-sibling::td/span',
|
226
|
-
).text
|
227
|
-
TYPES[type] || type.downcase.tr(" ", "-")
|
228
|
-
end
|
229
|
-
|
230
|
-
# Fetch titles.
|
231
|
-
# @param hit_data [Hash]
|
232
|
-
# @return [Array<Hash>]
|
233
|
-
def fetch_titles(hit_data)
|
234
|
-
RelatonBib::TypedTitleString.from_string hit_data[:title], "en", "Latn"
|
235
|
-
end
|
236
|
-
|
237
|
-
# Fetch dates
|
238
|
-
# @param doc [Nokogiri::HTML::Document]
|
239
|
-
# @return [Array<Hash>]
|
240
|
-
def fetch_dates(doc)
|
241
|
-
dates = []
|
242
|
-
publish_date = doc.at("//span[@itemprop='releaseDate']").text
|
243
|
-
unless publish_date.empty?
|
244
|
-
dates << { type: "published", on: publish_date }
|
245
|
-
end
|
246
|
-
dates
|
247
|
-
end
|
248
|
-
|
249
|
-
# rubocop:disable Metrics/MethodLength
|
250
|
-
|
251
|
-
def fetch_contributors(code)
|
252
|
-
code.sub(/\s.*/, "").split("/").map do |abbrev|
|
253
|
-
name, url = name_url abbrev
|
254
|
-
{ entity: { name: name, url: url, abbreviation: abbrev },
|
255
|
-
role: [type: "publisher"] }
|
256
|
-
end
|
257
|
-
end
|
258
|
-
# rubocop:enable Metrics/MethodLength
|
259
|
-
|
260
|
-
# Fetch ICS.
|
261
|
-
# @param doc [Nokogiri::HTML::Document]
|
262
|
-
# @return [Array<Hash>]
|
263
|
-
def fetch_ics(doc)
|
264
|
-
doc.xpath(
|
265
|
-
'//th[contains(text(), "ICS")]/following-sibling::td/a',
|
266
|
-
).map do |i|
|
267
|
-
code = i.text.match(/[\d.]+/).to_s.split "."
|
268
|
-
{ field: code[0], group: code[1], subgroup: code[2] }
|
269
|
-
end
|
270
|
-
end
|
271
|
-
|
272
|
-
# Fetch links.
|
273
|
-
# @param doc [Nokogiri::HTML::Document]
|
274
|
-
# @param url [String]
|
275
|
-
# @return [Array<Hash>]
|
276
|
-
def fetch_link(doc, url)
|
277
|
-
links = [{ type: "src", content: url }]
|
278
|
-
obp_elms = doc.at_css("p.btn-preview a")
|
279
|
-
links << { type: "obp", content: obp_elms[:href] } if obp_elms
|
280
|
-
links
|
281
|
-
end
|
282
|
-
|
283
|
-
# rubocop:disable Metrics/MethodLength
|
284
|
-
|
285
|
-
# Fetch copyright.
|
286
|
-
# @param title [String]
|
287
|
-
# @return [Array<Hash>]
|
288
|
-
def fetch_copyright(code, doc)
|
289
|
-
abbreviation = code.match(/.*?(?=\s)/).to_s
|
290
|
-
name, url = name_url abbreviation
|
291
|
-
from = code.match(/(?<=:)\d{4}/).to_s
|
292
|
-
if from.empty?
|
293
|
-
from = doc.xpath("//span[@itemprop='releaseDate']").text
|
294
|
-
.match(/\d{4}/).to_s
|
295
|
-
end
|
296
|
-
[{
|
297
|
-
owner: [{ name: name, abbreviation: abbreviation, url: url }],
|
298
|
-
from: from,
|
299
|
-
}]
|
300
|
-
end
|
301
|
-
# rubocop:enable Metrics/MethodLength
|
302
|
-
|
303
|
-
def name_url(abbrev)
|
304
|
-
ABBREVS[abbrev]
|
305
|
-
end
|
306
|
-
end
|
307
|
-
end
|
308
|
-
end
|