relaton-iec 1.14.1 → 1.14.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.adoc +31 -26
- data/lib/relaton_iec/data_fetcher.rb +166 -0
- data/lib/relaton_iec/data_parser.rb +287 -0
- data/lib/relaton_iec/hit.rb +9 -1
- data/lib/relaton_iec/hit_collection.rb +15 -79
- data/lib/relaton_iec/iec_bibliographic_item.rb +3 -1
- data/lib/relaton_iec/iec_bibliography.rb +83 -111
- data/lib/relaton_iec/index.rb +133 -0
- data/lib/relaton_iec/processor.rb +13 -0
- data/lib/relaton_iec/version.rb +1 -1
- data/lib/relaton_iec.rb +9 -6
- data/relaton_iec.gemspec +4 -3
- metadata +19 -3
- data/lib/relaton_iec/scrapper.rb +0 -308
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-iec
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.14.1
|
4
|
+
version: 1.14.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-03-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: equivalent-xml
|
@@ -150,6 +150,20 @@ dependencies:
|
|
150
150
|
- - "~>"
|
151
151
|
- !ruby/object:Gem::Version
|
152
152
|
version: 1.14.0
|
153
|
+
- !ruby/object:Gem::Dependency
|
154
|
+
name: rubyzip
|
155
|
+
requirement: !ruby/object:Gem::Requirement
|
156
|
+
requirements:
|
157
|
+
- - ">="
|
158
|
+
- !ruby/object:Gem::Version
|
159
|
+
version: '0'
|
160
|
+
type: :runtime
|
161
|
+
prerelease: false
|
162
|
+
version_requirements: !ruby/object:Gem::Requirement
|
163
|
+
requirements:
|
164
|
+
- - ">="
|
165
|
+
- !ruby/object:Gem::Version
|
166
|
+
version: '0'
|
153
167
|
description: 'RelatonIec: retrieve IEC Standards for bibliographic use using the IecBibliographicItem
|
154
168
|
model'
|
155
169
|
email:
|
@@ -193,13 +207,15 @@ files:
|
|
193
207
|
- lib/relaton_iec/basic_block/stem.rb
|
194
208
|
- lib/relaton_iec/basic_block/table.rb
|
195
209
|
- lib/relaton_iec/basic_block/text_element.rb
|
210
|
+
- lib/relaton_iec/data_fetcher.rb
|
211
|
+
- lib/relaton_iec/data_parser.rb
|
196
212
|
- lib/relaton_iec/hash_converter.rb
|
197
213
|
- lib/relaton_iec/hit.rb
|
198
214
|
- lib/relaton_iec/hit_collection.rb
|
199
215
|
- lib/relaton_iec/iec_bibliographic_item.rb
|
200
216
|
- lib/relaton_iec/iec_bibliography.rb
|
217
|
+
- lib/relaton_iec/index.rb
|
201
218
|
- lib/relaton_iec/processor.rb
|
202
|
-
- lib/relaton_iec/scrapper.rb
|
203
219
|
- lib/relaton_iec/statuses.yml
|
204
220
|
- lib/relaton_iec/tc_sc_officers_note.rb
|
205
221
|
- lib/relaton_iec/version.rb
|
data/lib/relaton_iec/scrapper.rb
DELETED
@@ -1,308 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
# Capybara.request_driver :poltergeist do |app|
|
4
|
-
# Capybara::Poltergeist::Driver.new app, js_errors: false
|
5
|
-
# end
|
6
|
-
# Capybara.default_driver = :poltergeist
|
7
|
-
|
8
|
-
module RelatonIec
|
9
|
-
# Scrapper.
|
10
|
-
module Scrapper
|
11
|
-
DOMAIN = "https://webstore.iec.ch"
|
12
|
-
ABBREVS = {
|
13
|
-
"ISO" => ["International Organization for Standardization", "www.iso.org"],
|
14
|
-
"IEC" => ["International Electrotechnical Commission", "www.iec.ch"],
|
15
|
-
"CISPR" => ["International special committee on radio interference", "www.iec.ch"],
|
16
|
-
}.freeze
|
17
|
-
|
18
|
-
TYPES = {
|
19
|
-
"ISO" => "international-standard",
|
20
|
-
"TS" => "technical-specification",
|
21
|
-
"TR" => "technical-report",
|
22
|
-
"PAS" => "publicly-available-specification",
|
23
|
-
"AWI" => "appruved-work-item",
|
24
|
-
"CD" => "committee-draft",
|
25
|
-
"FDIS" => "final-draft-international-standard",
|
26
|
-
"NP" => "new-proposal",
|
27
|
-
"DIS" => "draft-international-standard",
|
28
|
-
"WD" => "working-draft",
|
29
|
-
"R" => "recommendation",
|
30
|
-
"Guide" => "guide",
|
31
|
-
"SRD" => "system-reference-deliverable",
|
32
|
-
}.freeze
|
33
|
-
|
34
|
-
class << self
|
35
|
-
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
36
|
-
|
37
|
-
# Parse page.
|
38
|
-
# @param hit_data [Hash]
|
39
|
-
# @return [Hash]
|
40
|
-
def parse_page(hit_data)
|
41
|
-
doc = get_page hit_data[:url]
|
42
|
-
|
43
|
-
# Fetch edition.
|
44
|
-
edition = doc.at(
|
45
|
-
"//th[contains(., 'Edition')]/following-sibling::td/span",
|
46
|
-
).text
|
47
|
-
|
48
|
-
status, relations = fetch_status_relations hit_data[:url]
|
49
|
-
|
50
|
-
IecBibliographicItem.new(
|
51
|
-
fetched: Date.today.to_s,
|
52
|
-
docid: fetch_docid(hit_data),
|
53
|
-
structuredidentifier: fetch_structuredidentifier(doc),
|
54
|
-
edition: edition,
|
55
|
-
language: ["en"],
|
56
|
-
script: ["Latn"],
|
57
|
-
title: fetch_titles(hit_data),
|
58
|
-
doctype: fetch_type(doc),
|
59
|
-
docstatus: status,
|
60
|
-
ics: fetch_ics(doc),
|
61
|
-
date: fetch_dates(doc),
|
62
|
-
contributor: fetch_contributors(hit_data[:code]),
|
63
|
-
editorialgroup: fetch_workgroup(doc),
|
64
|
-
abstract: fetch_abstract(doc),
|
65
|
-
copyright: fetch_copyright(hit_data[:code], doc),
|
66
|
-
link: fetch_link(doc, hit_data[:url]),
|
67
|
-
relation: relations,
|
68
|
-
place: ["Geneva"],
|
69
|
-
)
|
70
|
-
end
|
71
|
-
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
72
|
-
|
73
|
-
private
|
74
|
-
|
75
|
-
# @param hit [Hash]
|
76
|
-
# @return [Array<RelatonBib::DocumentIdentifier>]
|
77
|
-
def fetch_docid(hit)
|
78
|
-
urn = RelatonIec.code_to_urn hit[:code], "en"
|
79
|
-
[
|
80
|
-
RelatonBib::DocumentIdentifier.new(id: hit[:code], type: "IEC", primary: true),
|
81
|
-
RelatonBib::DocumentIdentifier.new(id: urn, type: "URN"),
|
82
|
-
]
|
83
|
-
end
|
84
|
-
|
85
|
-
# Fetch abstracts.
|
86
|
-
# @param doc [Nokigiri::HTML::Document]
|
87
|
-
# @return [Array<Array>]
|
88
|
-
def fetch_abstract(doc)
|
89
|
-
abstract_content = doc.at('//div[@itemprop="description"]').text
|
90
|
-
[{
|
91
|
-
content: abstract_content,
|
92
|
-
language: "en",
|
93
|
-
script: "Latn",
|
94
|
-
format: "text/plain",
|
95
|
-
}]
|
96
|
-
end
|
97
|
-
|
98
|
-
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
99
|
-
|
100
|
-
# Get page.
|
101
|
-
# @param path [String] page's path
|
102
|
-
# @return [Array<Nokogiri::HTML::Document, String>]
|
103
|
-
def get_page(url)
|
104
|
-
uri = URI url
|
105
|
-
resp = Net::HTTP.get_response(uri)
|
106
|
-
case resp.code
|
107
|
-
when "301"
|
108
|
-
path = resp["location"]
|
109
|
-
url = DOMAIN + path
|
110
|
-
uri = URI url
|
111
|
-
resp = Net::HTTP.get_response(uri)
|
112
|
-
when "404"
|
113
|
-
raise RelatonBib::RequestError, "Page not found #{url}"
|
114
|
-
end
|
115
|
-
Nokogiri::HTML(resp.body)
|
116
|
-
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
117
|
-
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
118
|
-
Net::ProtocolError, OpenSSL::SSL::SSLError
|
119
|
-
raise RelatonBib::RequestError, "Could not access #{url}"
|
120
|
-
end
|
121
|
-
# rubocop:enable Metrics/AbcSize
|
122
|
-
|
123
|
-
# Fetch structuredidentifier.
|
124
|
-
# @param doc [Nokogiri::HTML::Document]
|
125
|
-
# @return [RelatonIsoBib::StructuredIdentifier]
|
126
|
-
def fetch_structuredidentifier(doc)
|
127
|
-
item_ref = doc.at("//span[@itemprop='productID']")
|
128
|
-
unless item_ref
|
129
|
-
return RelatonIsoBib::StructuredIdentifier.new(
|
130
|
-
project_number: "?", part_number: "", prefix: nil, id: "?",
|
131
|
-
)
|
132
|
-
end
|
133
|
-
|
134
|
-
m = item_ref.text.match(
|
135
|
-
/(?<=\s)(?<project>\d+)-?(?<part>(?<=-)\d+|)-?(?<subpart>(?<=-)\d+|)/,
|
136
|
-
)
|
137
|
-
RelatonIsoBib::StructuredIdentifier.new(
|
138
|
-
project_number: m[:project],
|
139
|
-
part_number: m[:part],
|
140
|
-
subpart_number: m[:subpart],
|
141
|
-
prefix: nil,
|
142
|
-
type: "IEC",
|
143
|
-
id: item_ref.text,
|
144
|
-
)
|
145
|
-
end
|
146
|
-
|
147
|
-
# Fetch status.
|
148
|
-
# @param doc [Nokogiri::HTML::Document]
|
149
|
-
# @param status [String]
|
150
|
-
# @return [Hash]
|
151
|
-
def fetch_status(doc)
|
152
|
-
wip = doc.at('//ROW[STATUS[.="PREPARING"]]')
|
153
|
-
if wip
|
154
|
-
statuses = YAML.load_file File.join __dir__, "statuses.yml"
|
155
|
-
s = wip.at("STAGE").text
|
156
|
-
return unless statuses[s]
|
157
|
-
|
158
|
-
stage, substage = statuses[s]["stage"].split "."
|
159
|
-
else
|
160
|
-
stage = "60"
|
161
|
-
substage = "60"
|
162
|
-
end
|
163
|
-
RelatonBib::DocumentStatus.new(stage: stage, substage: substage)
|
164
|
-
end
|
165
|
-
|
166
|
-
# Fetch workgroup.
|
167
|
-
# @param doc [Nokogiri::HTML::Document]
|
168
|
-
# @return [Hash]
|
169
|
-
def fetch_workgroup(doc)
|
170
|
-
wg = doc.at('//th/abbr[.="TC"]/../following-sibling::td/a').text
|
171
|
-
{
|
172
|
-
name: "International Electrotechnical Commission",
|
173
|
-
abbreviation: "IEC",
|
174
|
-
url: "webstore.iec.ch",
|
175
|
-
technical_committee: [{
|
176
|
-
name: wg,
|
177
|
-
type: "technicalCommittee",
|
178
|
-
number: wg.match(/\d+/)&.to_s&.to_i,
|
179
|
-
}],
|
180
|
-
}
|
181
|
-
end
|
182
|
-
# rubocop:enable Metrics/MethodLength
|
183
|
-
|
184
|
-
# Fetch relations.
|
185
|
-
# @param doc [Nokogiri::HTML::Document]
|
186
|
-
# @return [Array<Hash>]
|
187
|
-
# rubocop:disable Metrics/MethodLength
|
188
|
-
def fetch_relations(doc)
|
189
|
-
doc.xpath('//ROW[STATUS[.!="PREPARING"]][STATUS[.!="PUBLISHED"]]')
|
190
|
-
.map do |r|
|
191
|
-
r_type = r.at("STATUS").text.downcase
|
192
|
-
type = case r_type
|
193
|
-
# when 'published' then 'obsoletes' # Valid
|
194
|
-
when "revised", "replaced" then "updates"
|
195
|
-
when "withdrawn" then "obsoletes"
|
196
|
-
else r_type
|
197
|
-
end
|
198
|
-
ref = r.at("FULL_NAME").text
|
199
|
-
fref = RelatonBib::FormattedRef.new content: ref, format: "text/plain"
|
200
|
-
bibitem = IecBibliographicItem.new(
|
201
|
-
formattedref: fref,
|
202
|
-
docid: [RelatonBib::DocumentIdentifier.new(id: ref, type: "IEC", primary: true)],
|
203
|
-
)
|
204
|
-
{ type: type, bibitem: bibitem }
|
205
|
-
end
|
206
|
-
end
|
207
|
-
|
208
|
-
def fetch_status_relations(url)
|
209
|
-
pubid = url.match(/\d+$/).to_s
|
210
|
-
uri = URI "#{DOMAIN}/webstore/webstore.nsf/AjaxRequestXML?"\
|
211
|
-
"Openagent&url=#{pubid}"
|
212
|
-
resp = Net::HTTP.get_response uri
|
213
|
-
doc = Nokogiri::XML resp.body
|
214
|
-
status = fetch_status doc
|
215
|
-
relations = fetch_relations doc
|
216
|
-
[status, relations]
|
217
|
-
end
|
218
|
-
# rubocop:enable Metrics/MethodLength
|
219
|
-
|
220
|
-
# Fetch type.
|
221
|
-
# @param doc [Nokogiri::HTML::Document]
|
222
|
-
# @return [String]
|
223
|
-
def fetch_type(doc)
|
224
|
-
type = doc.at(
|
225
|
-
'//th[contains(., "Publication type")]/following-sibling::td/span',
|
226
|
-
).text
|
227
|
-
TYPES[type] || type.downcase.tr(" ", "-")
|
228
|
-
end
|
229
|
-
|
230
|
-
# Fetch titles.
|
231
|
-
# @param hit_data [Hash]
|
232
|
-
# @return [Array<Hash>]
|
233
|
-
def fetch_titles(hit_data)
|
234
|
-
RelatonBib::TypedTitleString.from_string hit_data[:title], "en", "Latn"
|
235
|
-
end
|
236
|
-
|
237
|
-
# Fetch dates
|
238
|
-
# @param doc [Nokogiri::HTML::Document]
|
239
|
-
# @return [Array<Hash>]
|
240
|
-
def fetch_dates(doc)
|
241
|
-
dates = []
|
242
|
-
publish_date = doc.at("//span[@itemprop='releaseDate']").text
|
243
|
-
unless publish_date.empty?
|
244
|
-
dates << { type: "published", on: publish_date }
|
245
|
-
end
|
246
|
-
dates
|
247
|
-
end
|
248
|
-
|
249
|
-
# rubocop:disable Metrics/MethodLength
|
250
|
-
|
251
|
-
def fetch_contributors(code)
|
252
|
-
code.sub(/\s.*/, "").split("/").map do |abbrev|
|
253
|
-
name, url = name_url abbrev
|
254
|
-
{ entity: { name: name, url: url, abbreviation: abbrev },
|
255
|
-
role: [type: "publisher"] }
|
256
|
-
end
|
257
|
-
end
|
258
|
-
# rubocop:enable Metrics/MethodLength
|
259
|
-
|
260
|
-
# Fetch ICS.
|
261
|
-
# @param doc [Nokogiri::HTML::Document]
|
262
|
-
# @return [Array<Hash>]
|
263
|
-
def fetch_ics(doc)
|
264
|
-
doc.xpath(
|
265
|
-
'//th[contains(text(), "ICS")]/following-sibling::td/a',
|
266
|
-
).map do |i|
|
267
|
-
code = i.text.match(/[\d.]+/).to_s.split "."
|
268
|
-
{ field: code[0], group: code[1], subgroup: code[2] }
|
269
|
-
end
|
270
|
-
end
|
271
|
-
|
272
|
-
# Fetch links.
|
273
|
-
# @param doc [Nokogiri::HTML::Document]
|
274
|
-
# @param url [String]
|
275
|
-
# @return [Array<Hash>]
|
276
|
-
def fetch_link(doc, url)
|
277
|
-
links = [{ type: "src", content: url }]
|
278
|
-
obp_elms = doc.at_css("p.btn-preview a")
|
279
|
-
links << { type: "obp", content: obp_elms[:href] } if obp_elms
|
280
|
-
links
|
281
|
-
end
|
282
|
-
|
283
|
-
# rubocop:disable Metrics/MethodLength
|
284
|
-
|
285
|
-
# Fetch copyright.
|
286
|
-
# @param title [String]
|
287
|
-
# @return [Array<Hash>]
|
288
|
-
def fetch_copyright(code, doc)
|
289
|
-
abbreviation = code.match(/.*?(?=\s)/).to_s
|
290
|
-
name, url = name_url abbreviation
|
291
|
-
from = code.match(/(?<=:)\d{4}/).to_s
|
292
|
-
if from.empty?
|
293
|
-
from = doc.xpath("//span[@itemprop='releaseDate']").text
|
294
|
-
.match(/\d{4}/).to_s
|
295
|
-
end
|
296
|
-
[{
|
297
|
-
owner: [{ name: name, abbreviation: abbreviation, url: url }],
|
298
|
-
from: from,
|
299
|
-
}]
|
300
|
-
end
|
301
|
-
# rubocop:enable Metrics/MethodLength
|
302
|
-
|
303
|
-
def name_url(abbrev)
|
304
|
-
ABBREVS[abbrev]
|
305
|
-
end
|
306
|
-
end
|
307
|
-
end
|
308
|
-
end
|