relaton-iec 1.14.1 → 1.14.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +6 -0
- data/README.adoc +53 -26
- data/lib/relaton_iec/data_fetcher.rb +190 -0
- data/lib/relaton_iec/data_parser.rb +289 -0
- data/lib/relaton_iec/hit.rb +17 -1
- data/lib/relaton_iec/hit_collection.rb +17 -79
- data/lib/relaton_iec/iec_bibliographic_item.rb +3 -1
- data/lib/relaton_iec/iec_bibliography.rb +83 -111
- data/lib/relaton_iec/index.rb +133 -0
- data/lib/relaton_iec/processor.rb +13 -0
- data/lib/relaton_iec/version.rb +1 -1
- data/lib/relaton_iec.rb +10 -6
- data/relaton_iec.gemspec +5 -8
- metadata +24 -64
- data/lib/relaton_iec/scrapper.rb +0 -308
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-iec
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.14.1
|
4
|
+
version: 1.14.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-05-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: equivalent-xml
|
@@ -24,20 +24,6 @@ dependencies:
|
|
24
24
|
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0.6'
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: pry-byebug
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - ">="
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '0'
|
34
|
-
type: :development
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - ">="
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '0'
|
41
27
|
- !ruby/object:Gem::Dependency
|
42
28
|
name: rake
|
43
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -67,27 +53,13 @@ dependencies:
|
|
67
53
|
- !ruby/object:Gem::Version
|
68
54
|
version: '3.0'
|
69
55
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - ">="
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: '0'
|
76
|
-
type: :development
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - ">="
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: '0'
|
83
|
-
- !ruby/object:Gem::Dependency
|
84
|
-
name: simplecov
|
56
|
+
name: addressable
|
85
57
|
requirement: !ruby/object:Gem::Requirement
|
86
58
|
requirements:
|
87
59
|
- - ">="
|
88
60
|
- !ruby/object:Gem::Version
|
89
61
|
version: '0'
|
90
|
-
type: :development
|
62
|
+
type: :runtime
|
91
63
|
prerelease: false
|
92
64
|
version_requirements: !ruby/object:Gem::Requirement
|
93
65
|
requirements:
|
@@ -95,35 +67,35 @@ dependencies:
|
|
95
67
|
- !ruby/object:Gem::Version
|
96
68
|
version: '0'
|
97
69
|
- !ruby/object:Gem::Dependency
|
98
|
-
name:
|
70
|
+
name: relaton-index
|
99
71
|
requirement: !ruby/object:Gem::Requirement
|
100
72
|
requirements:
|
101
|
-
- - "
|
73
|
+
- - "~>"
|
102
74
|
- !ruby/object:Gem::Version
|
103
|
-
version:
|
104
|
-
type: :development
|
75
|
+
version: 0.1.6
|
76
|
+
type: :runtime
|
105
77
|
prerelease: false
|
106
78
|
version_requirements: !ruby/object:Gem::Requirement
|
107
79
|
requirements:
|
108
|
-
- - "
|
80
|
+
- - "~>"
|
109
81
|
- !ruby/object:Gem::Version
|
110
|
-
version:
|
82
|
+
version: 0.1.6
|
111
83
|
- !ruby/object:Gem::Dependency
|
112
|
-
name:
|
84
|
+
name: relaton-iso-bib
|
113
85
|
requirement: !ruby/object:Gem::Requirement
|
114
86
|
requirements:
|
115
|
-
- - "
|
87
|
+
- - "~>"
|
116
88
|
- !ruby/object:Gem::Version
|
117
|
-
version:
|
118
|
-
type: :development
|
89
|
+
version: 1.14.0
|
90
|
+
type: :runtime
|
119
91
|
prerelease: false
|
120
92
|
version_requirements: !ruby/object:Gem::Requirement
|
121
93
|
requirements:
|
122
|
-
- - "
|
94
|
+
- - "~>"
|
123
95
|
- !ruby/object:Gem::Version
|
124
|
-
version:
|
96
|
+
version: 1.14.0
|
125
97
|
- !ruby/object:Gem::Dependency
|
126
|
-
name:
|
98
|
+
name: rubyzip
|
127
99
|
requirement: !ruby/object:Gem::Requirement
|
128
100
|
requirements:
|
129
101
|
- - ">="
|
@@ -136,20 +108,6 @@ dependencies:
|
|
136
108
|
- - ">="
|
137
109
|
- !ruby/object:Gem::Version
|
138
110
|
version: '0'
|
139
|
-
- !ruby/object:Gem::Dependency
|
140
|
-
name: relaton-iso-bib
|
141
|
-
requirement: !ruby/object:Gem::Requirement
|
142
|
-
requirements:
|
143
|
-
- - "~>"
|
144
|
-
- !ruby/object:Gem::Version
|
145
|
-
version: 1.14.0
|
146
|
-
type: :runtime
|
147
|
-
prerelease: false
|
148
|
-
version_requirements: !ruby/object:Gem::Requirement
|
149
|
-
requirements:
|
150
|
-
- - "~>"
|
151
|
-
- !ruby/object:Gem::Version
|
152
|
-
version: 1.14.0
|
153
111
|
description: 'RelatonIec: retrieve IEC Standards for bibliographic use using the IecBibliographicItem
|
154
112
|
model'
|
155
113
|
email:
|
@@ -193,13 +151,15 @@ files:
|
|
193
151
|
- lib/relaton_iec/basic_block/stem.rb
|
194
152
|
- lib/relaton_iec/basic_block/table.rb
|
195
153
|
- lib/relaton_iec/basic_block/text_element.rb
|
154
|
+
- lib/relaton_iec/data_fetcher.rb
|
155
|
+
- lib/relaton_iec/data_parser.rb
|
196
156
|
- lib/relaton_iec/hash_converter.rb
|
197
157
|
- lib/relaton_iec/hit.rb
|
198
158
|
- lib/relaton_iec/hit_collection.rb
|
199
159
|
- lib/relaton_iec/iec_bibliographic_item.rb
|
200
160
|
- lib/relaton_iec/iec_bibliography.rb
|
161
|
+
- lib/relaton_iec/index.rb
|
201
162
|
- lib/relaton_iec/processor.rb
|
202
|
-
- lib/relaton_iec/scrapper.rb
|
203
163
|
- lib/relaton_iec/statuses.yml
|
204
164
|
- lib/relaton_iec/tc_sc_officers_note.rb
|
205
165
|
- lib/relaton_iec/version.rb
|
@@ -209,7 +169,7 @@ homepage: https://github.com/metanorma/relaton-iec
|
|
209
169
|
licenses:
|
210
170
|
- MIT
|
211
171
|
metadata: {}
|
212
|
-
post_install_message:
|
172
|
+
post_install_message:
|
213
173
|
rdoc_options: []
|
214
174
|
require_paths:
|
215
175
|
- lib
|
@@ -224,8 +184,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
224
184
|
- !ruby/object:Gem::Version
|
225
185
|
version: '0'
|
226
186
|
requirements: []
|
227
|
-
rubygems_version: 3.
|
228
|
-
signing_key:
|
187
|
+
rubygems_version: 3.4.9
|
188
|
+
signing_key:
|
229
189
|
specification_version: 4
|
230
190
|
summary: 'RelatonIec: retrieve IEC Standards for bibliographic use using the IecBibliographicItem
|
231
191
|
model'
|
data/lib/relaton_iec/scrapper.rb
DELETED
@@ -1,308 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
# Capybara.request_driver :poltergeist do |app|
|
4
|
-
# Capybara::Poltergeist::Driver.new app, js_errors: false
|
5
|
-
# end
|
6
|
-
# Capybara.default_driver = :poltergeist
|
7
|
-
|
8
|
-
module RelatonIec
|
9
|
-
# Scrapper.
|
10
|
-
module Scrapper
|
11
|
-
DOMAIN = "https://webstore.iec.ch"
|
12
|
-
ABBREVS = {
|
13
|
-
"ISO" => ["International Organization for Standardization", "www.iso.org"],
|
14
|
-
"IEC" => ["International Electrotechnical Commission", "www.iec.ch"],
|
15
|
-
"CISPR" => ["International special committee on radio interference", "www.iec.ch"],
|
16
|
-
}.freeze
|
17
|
-
|
18
|
-
TYPES = {
|
19
|
-
"ISO" => "international-standard",
|
20
|
-
"TS" => "technical-specification",
|
21
|
-
"TR" => "technical-report",
|
22
|
-
"PAS" => "publicly-available-specification",
|
23
|
-
"AWI" => "appruved-work-item",
|
24
|
-
"CD" => "committee-draft",
|
25
|
-
"FDIS" => "final-draft-international-standard",
|
26
|
-
"NP" => "new-proposal",
|
27
|
-
"DIS" => "draft-international-standard",
|
28
|
-
"WD" => "working-draft",
|
29
|
-
"R" => "recommendation",
|
30
|
-
"Guide" => "guide",
|
31
|
-
"SRD" => "system-reference-deliverable",
|
32
|
-
}.freeze
|
33
|
-
|
34
|
-
class << self
|
35
|
-
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
36
|
-
|
37
|
-
# Parse page.
|
38
|
-
# @param hit_data [Hash]
|
39
|
-
# @return [Hash]
|
40
|
-
def parse_page(hit_data)
|
41
|
-
doc = get_page hit_data[:url]
|
42
|
-
|
43
|
-
# Fetch edition.
|
44
|
-
edition = doc.at(
|
45
|
-
"//th[contains(., 'Edition')]/following-sibling::td/span",
|
46
|
-
).text
|
47
|
-
|
48
|
-
status, relations = fetch_status_relations hit_data[:url]
|
49
|
-
|
50
|
-
IecBibliographicItem.new(
|
51
|
-
fetched: Date.today.to_s,
|
52
|
-
docid: fetch_docid(hit_data),
|
53
|
-
structuredidentifier: fetch_structuredidentifier(doc),
|
54
|
-
edition: edition,
|
55
|
-
language: ["en"],
|
56
|
-
script: ["Latn"],
|
57
|
-
title: fetch_titles(hit_data),
|
58
|
-
doctype: fetch_type(doc),
|
59
|
-
docstatus: status,
|
60
|
-
ics: fetch_ics(doc),
|
61
|
-
date: fetch_dates(doc),
|
62
|
-
contributor: fetch_contributors(hit_data[:code]),
|
63
|
-
editorialgroup: fetch_workgroup(doc),
|
64
|
-
abstract: fetch_abstract(doc),
|
65
|
-
copyright: fetch_copyright(hit_data[:code], doc),
|
66
|
-
link: fetch_link(doc, hit_data[:url]),
|
67
|
-
relation: relations,
|
68
|
-
place: ["Geneva"],
|
69
|
-
)
|
70
|
-
end
|
71
|
-
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
72
|
-
|
73
|
-
private
|
74
|
-
|
75
|
-
# @param hit [Hash]
|
76
|
-
# @return [Array<RelatonBib::DocumentIdentifier>]
|
77
|
-
def fetch_docid(hit)
|
78
|
-
urn = RelatonIec.code_to_urn hit[:code], "en"
|
79
|
-
[
|
80
|
-
RelatonBib::DocumentIdentifier.new(id: hit[:code], type: "IEC", primary: true),
|
81
|
-
RelatonBib::DocumentIdentifier.new(id: urn, type: "URN"),
|
82
|
-
]
|
83
|
-
end
|
84
|
-
|
85
|
-
# Fetch abstracts.
|
86
|
-
# @param doc [Nokigiri::HTML::Document]
|
87
|
-
# @return [Array<Array>]
|
88
|
-
def fetch_abstract(doc)
|
89
|
-
abstract_content = doc.at('//div[@itemprop="description"]').text
|
90
|
-
[{
|
91
|
-
content: abstract_content,
|
92
|
-
language: "en",
|
93
|
-
script: "Latn",
|
94
|
-
format: "text/plain",
|
95
|
-
}]
|
96
|
-
end
|
97
|
-
|
98
|
-
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
99
|
-
|
100
|
-
# Get page.
|
101
|
-
# @param path [String] page's path
|
102
|
-
# @return [Array<Nokogiri::HTML::Document, String>]
|
103
|
-
def get_page(url)
|
104
|
-
uri = URI url
|
105
|
-
resp = Net::HTTP.get_response(uri)
|
106
|
-
case resp.code
|
107
|
-
when "301"
|
108
|
-
path = resp["location"]
|
109
|
-
url = DOMAIN + path
|
110
|
-
uri = URI url
|
111
|
-
resp = Net::HTTP.get_response(uri)
|
112
|
-
when "404"
|
113
|
-
raise RelatonBib::RequestError, "Page not found #{url}"
|
114
|
-
end
|
115
|
-
Nokogiri::HTML(resp.body)
|
116
|
-
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
117
|
-
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
118
|
-
Net::ProtocolError, OpenSSL::SSL::SSLError
|
119
|
-
raise RelatonBib::RequestError, "Could not access #{url}"
|
120
|
-
end
|
121
|
-
# rubocop:enable Metrics/AbcSize
|
122
|
-
|
123
|
-
# Fetch structuredidentifier.
|
124
|
-
# @param doc [Nokogiri::HTML::Document]
|
125
|
-
# @return [RelatonIsoBib::StructuredIdentifier]
|
126
|
-
def fetch_structuredidentifier(doc)
|
127
|
-
item_ref = doc.at("//span[@itemprop='productID']")
|
128
|
-
unless item_ref
|
129
|
-
return RelatonIsoBib::StructuredIdentifier.new(
|
130
|
-
project_number: "?", part_number: "", prefix: nil, id: "?",
|
131
|
-
)
|
132
|
-
end
|
133
|
-
|
134
|
-
m = item_ref.text.match(
|
135
|
-
/(?<=\s)(?<project>\d+)-?(?<part>(?<=-)\d+|)-?(?<subpart>(?<=-)\d+|)/,
|
136
|
-
)
|
137
|
-
RelatonIsoBib::StructuredIdentifier.new(
|
138
|
-
project_number: m[:project],
|
139
|
-
part_number: m[:part],
|
140
|
-
subpart_number: m[:subpart],
|
141
|
-
prefix: nil,
|
142
|
-
type: "IEC",
|
143
|
-
id: item_ref.text,
|
144
|
-
)
|
145
|
-
end
|
146
|
-
|
147
|
-
# Fetch status.
|
148
|
-
# @param doc [Nokogiri::HTML::Document]
|
149
|
-
# @param status [String]
|
150
|
-
# @return [Hash]
|
151
|
-
def fetch_status(doc)
|
152
|
-
wip = doc.at('//ROW[STATUS[.="PREPARING"]]')
|
153
|
-
if wip
|
154
|
-
statuses = YAML.load_file File.join __dir__, "statuses.yml"
|
155
|
-
s = wip.at("STAGE").text
|
156
|
-
return unless statuses[s]
|
157
|
-
|
158
|
-
stage, substage = statuses[s]["stage"].split "."
|
159
|
-
else
|
160
|
-
stage = "60"
|
161
|
-
substage = "60"
|
162
|
-
end
|
163
|
-
RelatonBib::DocumentStatus.new(stage: stage, substage: substage)
|
164
|
-
end
|
165
|
-
|
166
|
-
# Fetch workgroup.
|
167
|
-
# @param doc [Nokogiri::HTML::Document]
|
168
|
-
# @return [Hash]
|
169
|
-
def fetch_workgroup(doc)
|
170
|
-
wg = doc.at('//th/abbr[.="TC"]/../following-sibling::td/a').text
|
171
|
-
{
|
172
|
-
name: "International Electrotechnical Commission",
|
173
|
-
abbreviation: "IEC",
|
174
|
-
url: "webstore.iec.ch",
|
175
|
-
technical_committee: [{
|
176
|
-
name: wg,
|
177
|
-
type: "technicalCommittee",
|
178
|
-
number: wg.match(/\d+/)&.to_s&.to_i,
|
179
|
-
}],
|
180
|
-
}
|
181
|
-
end
|
182
|
-
# rubocop:enable Metrics/MethodLength
|
183
|
-
|
184
|
-
# Fetch relations.
|
185
|
-
# @param doc [Nokogiri::HTML::Document]
|
186
|
-
# @return [Array<Hash>]
|
187
|
-
# rubocop:disable Metrics/MethodLength
|
188
|
-
def fetch_relations(doc)
|
189
|
-
doc.xpath('//ROW[STATUS[.!="PREPARING"]][STATUS[.!="PUBLISHED"]]')
|
190
|
-
.map do |r|
|
191
|
-
r_type = r.at("STATUS").text.downcase
|
192
|
-
type = case r_type
|
193
|
-
# when 'published' then 'obsoletes' # Valid
|
194
|
-
when "revised", "replaced" then "updates"
|
195
|
-
when "withdrawn" then "obsoletes"
|
196
|
-
else r_type
|
197
|
-
end
|
198
|
-
ref = r.at("FULL_NAME").text
|
199
|
-
fref = RelatonBib::FormattedRef.new content: ref, format: "text/plain"
|
200
|
-
bibitem = IecBibliographicItem.new(
|
201
|
-
formattedref: fref,
|
202
|
-
docid: [RelatonBib::DocumentIdentifier.new(id: ref, type: "IEC", primary: true)],
|
203
|
-
)
|
204
|
-
{ type: type, bibitem: bibitem }
|
205
|
-
end
|
206
|
-
end
|
207
|
-
|
208
|
-
def fetch_status_relations(url)
|
209
|
-
pubid = url.match(/\d+$/).to_s
|
210
|
-
uri = URI "#{DOMAIN}/webstore/webstore.nsf/AjaxRequestXML?"\
|
211
|
-
"Openagent&url=#{pubid}"
|
212
|
-
resp = Net::HTTP.get_response uri
|
213
|
-
doc = Nokogiri::XML resp.body
|
214
|
-
status = fetch_status doc
|
215
|
-
relations = fetch_relations doc
|
216
|
-
[status, relations]
|
217
|
-
end
|
218
|
-
# rubocop:enable Metrics/MethodLength
|
219
|
-
|
220
|
-
# Fetch type.
|
221
|
-
# @param doc [Nokogiri::HTML::Document]
|
222
|
-
# @return [String]
|
223
|
-
def fetch_type(doc)
|
224
|
-
type = doc.at(
|
225
|
-
'//th[contains(., "Publication type")]/following-sibling::td/span',
|
226
|
-
).text
|
227
|
-
TYPES[type] || type.downcase.tr(" ", "-")
|
228
|
-
end
|
229
|
-
|
230
|
-
# Fetch titles.
|
231
|
-
# @param hit_data [Hash]
|
232
|
-
# @return [Array<Hash>]
|
233
|
-
def fetch_titles(hit_data)
|
234
|
-
RelatonBib::TypedTitleString.from_string hit_data[:title], "en", "Latn"
|
235
|
-
end
|
236
|
-
|
237
|
-
# Fetch dates
|
238
|
-
# @param doc [Nokogiri::HTML::Document]
|
239
|
-
# @return [Array<Hash>]
|
240
|
-
def fetch_dates(doc)
|
241
|
-
dates = []
|
242
|
-
publish_date = doc.at("//span[@itemprop='releaseDate']").text
|
243
|
-
unless publish_date.empty?
|
244
|
-
dates << { type: "published", on: publish_date }
|
245
|
-
end
|
246
|
-
dates
|
247
|
-
end
|
248
|
-
|
249
|
-
# rubocop:disable Metrics/MethodLength
|
250
|
-
|
251
|
-
def fetch_contributors(code)
|
252
|
-
code.sub(/\s.*/, "").split("/").map do |abbrev|
|
253
|
-
name, url = name_url abbrev
|
254
|
-
{ entity: { name: name, url: url, abbreviation: abbrev },
|
255
|
-
role: [type: "publisher"] }
|
256
|
-
end
|
257
|
-
end
|
258
|
-
# rubocop:enable Metrics/MethodLength
|
259
|
-
|
260
|
-
# Fetch ICS.
|
261
|
-
# @param doc [Nokogiri::HTML::Document]
|
262
|
-
# @return [Array<Hash>]
|
263
|
-
def fetch_ics(doc)
|
264
|
-
doc.xpath(
|
265
|
-
'//th[contains(text(), "ICS")]/following-sibling::td/a',
|
266
|
-
).map do |i|
|
267
|
-
code = i.text.match(/[\d.]+/).to_s.split "."
|
268
|
-
{ field: code[0], group: code[1], subgroup: code[2] }
|
269
|
-
end
|
270
|
-
end
|
271
|
-
|
272
|
-
# Fetch links.
|
273
|
-
# @param doc [Nokogiri::HTML::Document]
|
274
|
-
# @param url [String]
|
275
|
-
# @return [Array<Hash>]
|
276
|
-
def fetch_link(doc, url)
|
277
|
-
links = [{ type: "src", content: url }]
|
278
|
-
obp_elms = doc.at_css("p.btn-preview a")
|
279
|
-
links << { type: "obp", content: obp_elms[:href] } if obp_elms
|
280
|
-
links
|
281
|
-
end
|
282
|
-
|
283
|
-
# rubocop:disable Metrics/MethodLength
|
284
|
-
|
285
|
-
# Fetch copyright.
|
286
|
-
# @param title [String]
|
287
|
-
# @return [Array<Hash>]
|
288
|
-
def fetch_copyright(code, doc)
|
289
|
-
abbreviation = code.match(/.*?(?=\s)/).to_s
|
290
|
-
name, url = name_url abbreviation
|
291
|
-
from = code.match(/(?<=:)\d{4}/).to_s
|
292
|
-
if from.empty?
|
293
|
-
from = doc.xpath("//span[@itemprop='releaseDate']").text
|
294
|
-
.match(/\d{4}/).to_s
|
295
|
-
end
|
296
|
-
[{
|
297
|
-
owner: [{ name: name, abbreviation: abbreviation, url: url }],
|
298
|
-
from: from,
|
299
|
-
}]
|
300
|
-
end
|
301
|
-
# rubocop:enable Metrics/MethodLength
|
302
|
-
|
303
|
-
def name_url(abbrev)
|
304
|
-
ABBREVS[abbrev]
|
305
|
-
end
|
306
|
-
end
|
307
|
-
end
|
308
|
-
end
|