relaton-iec 0.8.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +1 -1
- data/grammars/basicdoc.rng +986 -0
- data/grammars/biblio.rng +1237 -0
- data/grammars/iec.rng +43 -0
- data/grammars/isodoc.rng +1077 -0
- data/{grammars → grammars/isostandard.rng} +145 -472
- data/grammars/reqt.rng +165 -0
- data/lib/relaton_iec.rb +12 -9
- data/lib/relaton_iec/hash_converter.rb +14 -0
- data/lib/relaton_iec/hit.rb +2 -41
- data/lib/relaton_iec/hit_collection.rb +3 -37
- data/lib/relaton_iec/iec_bibliographic_item.rb +9 -0
- data/lib/relaton_iec/iec_bibliography.rb +16 -18
- data/lib/relaton_iec/processor.rb +7 -7
- data/lib/relaton_iec/scrapper.rb +41 -135
- data/lib/relaton_iec/version.rb +1 -1
- data/lib/relaton_iec/xml_parser.rb +14 -0
- data/relaton_iec.gemspec +6 -5
- metadata +29 -7
data/lib/relaton_iec/scrapper.rb
CHANGED
@@ -32,31 +32,22 @@ module RelatonIec
|
|
32
32
|
}.freeze
|
33
33
|
|
34
34
|
class << self
|
35
|
-
#
|
36
|
-
# @return [Array<Hash>]
|
37
|
-
# def get(text)
|
38
|
-
# iso_workers = WorkersPool.new 4
|
39
|
-
# iso_workers.worker { |hit| iso_worker(hit, iso_workers) }
|
40
|
-
# algolia_workers = start_algolia_search(text, iso_workers)
|
41
|
-
# iso_docs = iso_workers.result
|
42
|
-
# algolia_workers.end
|
43
|
-
# algolia_workers.result
|
44
|
-
# iso_docs
|
45
|
-
# end
|
35
|
+
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
46
36
|
|
47
37
|
# Parse page.
|
48
38
|
# @param hit [Hash]
|
49
39
|
# @return [Hash]
|
50
|
-
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
51
40
|
def parse_page(hit_data)
|
52
41
|
doc = get_page hit_data[:url]
|
53
42
|
|
54
43
|
# Fetch edition.
|
55
|
-
edition = doc.at(
|
44
|
+
edition = doc.at(
|
45
|
+
"//th[contains(., 'Edition')]/following-sibling::td/span",
|
46
|
+
).text
|
56
47
|
|
57
48
|
status, relations = fetch_status_relations hit_data[:url]
|
58
49
|
|
59
|
-
|
50
|
+
IecBibliographicItem.new(
|
60
51
|
fetched: Date.today.to_s,
|
61
52
|
docid: [RelatonBib::DocumentIdentifier.new(id: hit_data[:code], type: "IEC")],
|
62
53
|
structuredidentifier: fetch_structuredidentifier(doc),
|
@@ -81,46 +72,6 @@ module RelatonIec
|
|
81
72
|
|
82
73
|
private
|
83
74
|
|
84
|
-
# Start search workers.
|
85
|
-
# @param text[String]
|
86
|
-
# @param iec_workers [Isobib::WorkersPool]
|
87
|
-
# @reaturn [Isobib::WorkersPool]
|
88
|
-
# def start_algolia_search(text, iec_workers)
|
89
|
-
# index = Algolia::Index.new 'all_en'
|
90
|
-
# workers = WorkersPool.new
|
91
|
-
# workers.worker do |page|
|
92
|
-
# algolia_worker(index, text, page, workers, iec_workers)
|
93
|
-
# end
|
94
|
-
|
95
|
-
# # Add first page so search worker will start.
|
96
|
-
# workers << 0
|
97
|
-
# end
|
98
|
-
|
99
|
-
# Fetch ISO documents.
|
100
|
-
# @param hit [Hash]
|
101
|
-
# @param isiso_workers [Isobib::WorkersPool]
|
102
|
-
# def iso_worker(hit, iso_workers)
|
103
|
-
# print "Parse #{iso_workers.size} of #{iso_workers.nb_hits} \r"
|
104
|
-
# parse_page hit
|
105
|
-
# end
|
106
|
-
|
107
|
-
# Fetch hits from algolia search service.
|
108
|
-
# @param index[Algolia::Index]
|
109
|
-
# @param text [String]
|
110
|
-
# @param page [Integer]
|
111
|
-
# @param algolia_workers [Isobib::WorkersPool]
|
112
|
-
# @param isiso_workers [Isobib::WorkersPool]
|
113
|
-
# def algolia_worker(index, text, page, algolia_workers, iso_workers)
|
114
|
-
# res = index.search text, facetFilters: ['category:standard'], page: page
|
115
|
-
# next_page = res['page'] + 1
|
116
|
-
# algolia_workers << next_page if next_page < res['nbPages']
|
117
|
-
# res['hits'].each do |hit|
|
118
|
-
# iso_workers.nb_hits = res['nbHits']
|
119
|
-
# iso_workers << hit
|
120
|
-
# end
|
121
|
-
# iso_workers.end unless next_page < res['nbPages']
|
122
|
-
# end
|
123
|
-
|
124
75
|
# Fetch abstracts.
|
125
76
|
# @param doc [Nokigiri::HTML::Document]
|
126
77
|
# @return [Array<Array>]
|
@@ -134,19 +85,6 @@ module RelatonIec
|
|
134
85
|
}]
|
135
86
|
end
|
136
87
|
|
137
|
-
# Get langs.
|
138
|
-
# @param doc [Nokogiri::HTML::Document]
|
139
|
-
# @return [Array<Hash>]
|
140
|
-
# def langs(doc)
|
141
|
-
# lgs = [{ lang: 'en' }]
|
142
|
-
# doc.css('ul#lang-switcher ul li a').each do |lang_link|
|
143
|
-
# lang_path = lang_link.attr('href')
|
144
|
-
# lang = lang_path.match(%r{^\/(fr)\/})
|
145
|
-
# lgs << { lang: lang[1], path: lang_path } if lang
|
146
|
-
# end
|
147
|
-
# lgs
|
148
|
-
# end
|
149
|
-
|
150
88
|
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
151
89
|
|
152
90
|
# Get page.
|
@@ -154,25 +92,20 @@ module RelatonIec
|
|
154
92
|
# @return [Array<Nokogiri::HTML::Document, String>]
|
155
93
|
def get_page(url)
|
156
94
|
uri = URI url
|
157
|
-
resp = Net::HTTP.get_response(uri)
|
95
|
+
resp = Net::HTTP.get_response(uri)
|
158
96
|
case resp.code
|
159
97
|
when "301"
|
160
98
|
path = resp["location"]
|
161
99
|
url = DOMAIN + path
|
162
100
|
uri = URI url
|
163
|
-
resp = Net::HTTP.get_response(uri)
|
101
|
+
resp = Net::HTTP.get_response(uri)
|
164
102
|
when "404"
|
165
103
|
raise RelatonBib::RequestError, "Page not found #{url}"
|
166
104
|
end
|
167
|
-
# n = 0
|
168
|
-
# while resp.body !~ /<strong/ && n < 10
|
169
|
-
# resp = Net::HTTP.get_response(uri)#.encode("UTF-8")
|
170
|
-
# n += 1
|
171
|
-
# end
|
172
105
|
Nokogiri::HTML(resp.body)
|
173
|
-
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
174
|
-
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
175
|
-
OpenSSL::SSL::SSLError
|
106
|
+
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
107
|
+
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
108
|
+
Net::ProtocolError, OpenSSL::SSL::SSLError
|
176
109
|
raise RelatonBib::RequestError, "Could not access #{url}"
|
177
110
|
end
|
178
111
|
# rubocop:enable Metrics/AbcSize
|
@@ -211,15 +144,12 @@ module RelatonIec
|
|
211
144
|
statuses = YAML.load_file "lib/relaton_iec/statuses.yml"
|
212
145
|
s = wip.at("STAGE").text
|
213
146
|
stage, substage = statuses[s]["stage"].split "."
|
214
|
-
# status = statuses[s]["status"]
|
215
147
|
else
|
216
|
-
# status = "Published"
|
217
148
|
stage = "60"
|
218
149
|
substage = "60"
|
219
150
|
end
|
220
151
|
RelatonBib::DocumentStatus.new(stage: stage, substage: substage)
|
221
152
|
end
|
222
|
-
# rubocop:enable Metrics/MethodLength
|
223
153
|
|
224
154
|
# Fetch workgroup.
|
225
155
|
# @param doc [Nokogiri::HTML::Document]
|
@@ -237,27 +167,26 @@ module RelatonIec
|
|
237
167
|
}],
|
238
168
|
}
|
239
169
|
end
|
170
|
+
# rubocop:enable Metrics/MethodLength
|
240
171
|
|
241
172
|
# Fetch relations.
|
242
173
|
# @param doc [Nokogiri::HTML::Document]
|
243
174
|
# @return [Array<Hash>]
|
244
175
|
# rubocop:disable Metrics/MethodLength
|
245
176
|
def fetch_relations(doc)
|
246
|
-
doc.xpath('//ROW[STATUS[.!="PREPARING"]][STATUS[.!="PUBLISHED"]]').
|
177
|
+
doc.xpath('//ROW[STATUS[.!="PREPARING"]][STATUS[.!="PUBLISHED"]]').
|
178
|
+
map do |r|
|
247
179
|
r_type = r.at("STATUS").text.downcase
|
248
180
|
type = case r_type
|
249
|
-
|
181
|
+
# when 'published' then 'obsoletes' # Valid
|
250
182
|
when "revised", "replaced" then "updates"
|
251
183
|
when "withdrawn" then "obsoletes"
|
252
184
|
else r_type
|
253
185
|
end
|
254
|
-
# url = DOMAIN + "/publication/" + r.at("PUB_ID").text
|
255
186
|
fref = RelatonBib::FormattedRef.new(
|
256
187
|
content: r.at("FULL_NAME").text, format: "text/plain",
|
257
188
|
)
|
258
|
-
bibitem =
|
259
|
-
formattedref: fref,
|
260
|
-
)
|
189
|
+
bibitem = IecBibliographicItem.new(formattedref: fref)
|
261
190
|
{ type: type, bibitem: bibitem }
|
262
191
|
end
|
263
192
|
end
|
@@ -272,22 +201,6 @@ module RelatonIec
|
|
272
201
|
status = fetch_status doc
|
273
202
|
relations = fetch_relations doc
|
274
203
|
[status, relations]
|
275
|
-
# doc.css('ul.steps li').inject([]) do |a, r|
|
276
|
-
# r_type = r.css('strong').text
|
277
|
-
# type = case r_type
|
278
|
-
# when 'Previously', 'Will be replaced by' then 'obsoletes'
|
279
|
-
# when 'Corrigenda/Amendments', 'Revised by', 'Now confirmed'
|
280
|
-
# 'updates'
|
281
|
-
# else r_type
|
282
|
-
# end
|
283
|
-
# if ['Now', 'Now under review'].include? type
|
284
|
-
# a
|
285
|
-
# else
|
286
|
-
# a + r.css('a').map do |id|
|
287
|
-
# { type: type, identifier: id.text, url: id['href'] }
|
288
|
-
# end
|
289
|
-
# end
|
290
|
-
# end
|
291
204
|
end
|
292
205
|
# rubocop:enable Metrics/MethodLength
|
293
206
|
|
@@ -295,23 +208,13 @@ module RelatonIec
|
|
295
208
|
# @param doc [Nokogiri::HTML::Document]
|
296
209
|
# @return [String]
|
297
210
|
def fetch_type(doc)
|
298
|
-
doc.at(
|
299
|
-
|
300
|
-
|
301
|
-
# /NP)*\s|/)(TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))}x)
|
302
|
-
# #return "international-standard" if type_match.nil?
|
303
|
-
# if TYPES[type_match[2]]
|
304
|
-
# TYPES[type_match[2]]
|
305
|
-
# elsif type_match[1]
|
306
|
-
# elsif type_match[1] == 'ISO'
|
307
|
-
# 'international-standard'
|
308
|
-
# elsif type_match[1] == 'IWA'
|
309
|
-
# 'international-workshop-agreement'
|
310
|
-
# end
|
311
|
-
# # rescue => _e
|
312
|
-
# # puts 'Unknown document type: ' + title
|
211
|
+
doc.at(
|
212
|
+
'//th[contains(., "Publication type")]/following-sibling::td/span',
|
213
|
+
).text.downcase.tr " ", "-"
|
313
214
|
end
|
314
215
|
|
216
|
+
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength
|
217
|
+
|
315
218
|
# Fetch titles.
|
316
219
|
# @param hit_data [Hash]
|
317
220
|
# @return [Array<Hash>]
|
@@ -339,21 +242,13 @@ module RelatonIec
|
|
339
242
|
end
|
340
243
|
[{
|
341
244
|
title_intro: intro,
|
342
|
-
title_main:
|
343
|
-
title_part:
|
344
|
-
language:
|
345
|
-
script:
|
245
|
+
title_main: main,
|
246
|
+
title_part: part,
|
247
|
+
language: "en",
|
248
|
+
script: "Latn",
|
346
249
|
}]
|
347
250
|
end
|
348
|
-
|
349
|
-
# Return ISO script code.
|
350
|
-
# @param lang [String]
|
351
|
-
# @return [String]
|
352
|
-
# def script(lang)
|
353
|
-
# case lang
|
354
|
-
# when 'en', 'fr' then 'Latn'
|
355
|
-
# end
|
356
|
-
# end
|
251
|
+
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength
|
357
252
|
|
358
253
|
# Fetch dates
|
359
254
|
# @param doc [Nokogiri::HTML::Document]
|
@@ -367,6 +262,8 @@ module RelatonIec
|
|
367
262
|
dates
|
368
263
|
end
|
369
264
|
|
265
|
+
# rubocop:disable Metrics/MethodLength
|
266
|
+
|
370
267
|
def fetch_contributors(code)
|
371
268
|
code.sub(/\s.*/, "").split("/").map do |abbrev|
|
372
269
|
case abbrev
|
@@ -381,12 +278,15 @@ module RelatonIec
|
|
381
278
|
role: [type: "publisher"] }
|
382
279
|
end
|
383
280
|
end
|
281
|
+
# rubocop:enable Metrics/MethodLength
|
384
282
|
|
385
283
|
# Fetch ICS.
|
386
284
|
# @param doc [Nokogiri::HTML::Document]
|
387
285
|
# @return [Array<Hash>]
|
388
286
|
def fetch_ics(doc)
|
389
|
-
doc.xpath(
|
287
|
+
doc.xpath(
|
288
|
+
'//th[contains(text(), "ICS")]/following-sibling::td/a',
|
289
|
+
).map do |i|
|
390
290
|
code = i.text.match(/[\d\.]+/).to_s.split "."
|
391
291
|
{ field: code[0], group: code[1], subgroup: code[2] }
|
392
292
|
end
|
@@ -403,9 +303,11 @@ module RelatonIec
|
|
403
303
|
links
|
404
304
|
end
|
405
305
|
|
306
|
+
# rubocop:disable Metrics/MethodLength
|
307
|
+
|
406
308
|
# Fetch copyright.
|
407
309
|
# @param title [String]
|
408
|
-
# @return [Hash]
|
310
|
+
# @return [Array<Hash>]
|
409
311
|
def fetch_copyright(code, doc)
|
410
312
|
abbreviation = code.match(/.*?(?=\s)/).to_s
|
411
313
|
case abbreviation
|
@@ -415,11 +317,15 @@ module RelatonIec
|
|
415
317
|
end
|
416
318
|
from = code.match(/(?<=:)\d{4}/).to_s
|
417
319
|
if from.empty?
|
418
|
-
from = doc.xpath("//span[@itemprop='releaseDate']").text
|
419
|
-
|
320
|
+
from = doc.xpath("//span[@itemprop='releaseDate']").text.
|
321
|
+
match(/\d{4}/).to_s
|
420
322
|
end
|
421
|
-
{
|
323
|
+
[{
|
324
|
+
owner: [{ name: name, abbreviation: abbreviation, url: url }],
|
325
|
+
from: from,
|
326
|
+
}]
|
422
327
|
end
|
328
|
+
# rubocop:enable Metrics/MethodLength
|
423
329
|
end
|
424
330
|
end
|
425
331
|
# rubocop:enable Metrics/ModuleLength
|
data/lib/relaton_iec/version.rb
CHANGED
[hunk missing from this extract]
data/lib/relaton_iec/xml_parser.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
module RelatonIec
|
2
|
+
class XMLParser < RelatonIsoBib::XMLParser
|
3
|
+
class << self
|
4
|
+
private
|
5
|
+
|
6
|
+
# override RelatonIsoBib::IsoBibliographicItem.bib_item method
|
7
|
+
# @param item_hash [Hash]
|
8
|
+
# @return [RelatonIec::IecBibliographicItem]
|
9
|
+
def bib_item(item_hash)
|
10
|
+
IecBibliographicItem.new item_hash
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
data/relaton_iec.gemspec
CHANGED
@@ -8,10 +8,10 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.authors = ["Ribose Inc."]
|
9
9
|
spec.email = ["open.source@ribose.com"]
|
10
10
|
|
11
|
-
spec.summary = "RelatonIec: retrieve IEC Standards for bibliographic
|
12
|
-
"using the
|
13
|
-
spec.description = "RelatonIec: retrieve IEC Standards for bibliographic
|
14
|
-
"using the
|
11
|
+
spec.summary = "RelatonIec: retrieve IEC Standards for bibliographic "\
|
12
|
+
"use using the IecBibliographicItem model"
|
13
|
+
spec.description = "RelatonIec: retrieve IEC Standards for bibliographic "\
|
14
|
+
"use using the IecBibliographicItem model"
|
15
15
|
spec.homepage = "https://github.com/metanorma/relaton-iec"
|
16
16
|
spec.license = "MIT"
|
17
17
|
|
@@ -29,10 +29,11 @@ Gem::Specification.new do |spec|
|
|
29
29
|
spec.add_development_dependency "rake", "~> 10.0"
|
30
30
|
spec.add_development_dependency "rspec", "~> 3.0"
|
31
31
|
spec.add_development_dependency "ruby-debug-ide"
|
32
|
+
spec.add_development_dependency "ruby-jing"
|
32
33
|
spec.add_development_dependency "simplecov"
|
33
34
|
spec.add_development_dependency "vcr"
|
34
35
|
spec.add_development_dependency "webmock"
|
35
36
|
|
36
37
|
spec.add_dependency "addressable"
|
37
|
-
spec.add_dependency "relaton-iso-bib", "~>
|
38
|
+
spec.add_dependency "relaton-iso-bib", "~> 1.1.0"
|
38
39
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-iec
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-06-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: debase
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: ruby-jing
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: simplecov
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -156,15 +170,15 @@ dependencies:
|
|
156
170
|
requirements:
|
157
171
|
- - "~>"
|
158
172
|
- !ruby/object:Gem::Version
|
159
|
-
version:
|
173
|
+
version: 1.1.0
|
160
174
|
type: :runtime
|
161
175
|
prerelease: false
|
162
176
|
version_requirements: !ruby/object:Gem::Requirement
|
163
177
|
requirements:
|
164
178
|
- - "~>"
|
165
179
|
- !ruby/object:Gem::Version
|
166
|
-
version:
|
167
|
-
description: 'RelatonIec: retrieve IEC Standards for bibliographic use using the
|
180
|
+
version: 1.1.0
|
181
|
+
description: 'RelatonIec: retrieve IEC Standards for bibliographic use using the IecBibliographicItem
|
168
182
|
model'
|
169
183
|
email:
|
170
184
|
- open.source@ribose.com
|
@@ -184,15 +198,23 @@ files:
|
|
184
198
|
- Rakefile
|
185
199
|
- bin/console
|
186
200
|
- bin/setup
|
187
|
-
- grammars
|
201
|
+
- grammars/basicdoc.rng
|
202
|
+
- grammars/biblio.rng
|
203
|
+
- grammars/iec.rng
|
204
|
+
- grammars/isodoc.rng
|
205
|
+
- grammars/isostandard.rng
|
206
|
+
- grammars/reqt.rng
|
188
207
|
- lib/relaton_iec.rb
|
208
|
+
- lib/relaton_iec/hash_converter.rb
|
189
209
|
- lib/relaton_iec/hit.rb
|
190
210
|
- lib/relaton_iec/hit_collection.rb
|
211
|
+
- lib/relaton_iec/iec_bibliographic_item.rb
|
191
212
|
- lib/relaton_iec/iec_bibliography.rb
|
192
213
|
- lib/relaton_iec/processor.rb
|
193
214
|
- lib/relaton_iec/scrapper.rb
|
194
215
|
- lib/relaton_iec/statuses.yml
|
195
216
|
- lib/relaton_iec/version.rb
|
217
|
+
- lib/relaton_iec/xml_parser.rb
|
196
218
|
- relaton_iec.gemspec
|
197
219
|
homepage: https://github.com/metanorma/relaton-iec
|
198
220
|
licenses:
|
@@ -216,6 +238,6 @@ requirements: []
|
|
216
238
|
rubygems_version: 3.0.6
|
217
239
|
signing_key:
|
218
240
|
specification_version: 4
|
219
|
-
summary: 'RelatonIec: retrieve IEC Standards for bibliographic use using the
|
241
|
+
summary: 'RelatonIec: retrieve IEC Standards for bibliographic use using the IecBibliographicItem
|
220
242
|
model'
|
221
243
|
test_files: []
|