relaton-iec 0.8.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.adoc +1 -1
- data/grammars/basicdoc.rng +986 -0
- data/grammars/biblio.rng +1237 -0
- data/grammars/iec.rng +43 -0
- data/grammars/isodoc.rng +1077 -0
- data/{grammars → grammars/isostandard.rng} +145 -472
- data/grammars/reqt.rng +165 -0
- data/lib/relaton_iec.rb +12 -9
- data/lib/relaton_iec/hash_converter.rb +14 -0
- data/lib/relaton_iec/hit.rb +2 -41
- data/lib/relaton_iec/hit_collection.rb +3 -37
- data/lib/relaton_iec/iec_bibliographic_item.rb +9 -0
- data/lib/relaton_iec/iec_bibliography.rb +16 -18
- data/lib/relaton_iec/processor.rb +7 -7
- data/lib/relaton_iec/scrapper.rb +41 -135
- data/lib/relaton_iec/version.rb +1 -1
- data/lib/relaton_iec/xml_parser.rb +14 -0
- data/relaton_iec.gemspec +6 -5
- metadata +29 -7
data/lib/relaton_iec/scrapper.rb
CHANGED
@@ -32,31 +32,22 @@ module RelatonIec
|
|
32
32
|
}.freeze
|
33
33
|
|
34
34
|
class << self
|
35
|
-
#
|
36
|
-
# @return [Array<Hash>]
|
37
|
-
# def get(text)
|
38
|
-
# iso_workers = WorkersPool.new 4
|
39
|
-
# iso_workers.worker { |hit| iso_worker(hit, iso_workers) }
|
40
|
-
# algolia_workers = start_algolia_search(text, iso_workers)
|
41
|
-
# iso_docs = iso_workers.result
|
42
|
-
# algolia_workers.end
|
43
|
-
# algolia_workers.result
|
44
|
-
# iso_docs
|
45
|
-
# end
|
35
|
+
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
46
36
|
|
47
37
|
# Parse page.
|
48
38
|
# @param hit [Hash]
|
49
39
|
# @return [Hash]
|
50
|
-
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
51
40
|
def parse_page(hit_data)
|
52
41
|
doc = get_page hit_data[:url]
|
53
42
|
|
54
43
|
# Fetch edition.
|
55
|
-
edition = doc.at(
|
44
|
+
edition = doc.at(
|
45
|
+
"//th[contains(., 'Edition')]/following-sibling::td/span",
|
46
|
+
).text
|
56
47
|
|
57
48
|
status, relations = fetch_status_relations hit_data[:url]
|
58
49
|
|
59
|
-
|
50
|
+
IecBibliographicItem.new(
|
60
51
|
fetched: Date.today.to_s,
|
61
52
|
docid: [RelatonBib::DocumentIdentifier.new(id: hit_data[:code], type: "IEC")],
|
62
53
|
structuredidentifier: fetch_structuredidentifier(doc),
|
@@ -81,46 +72,6 @@ module RelatonIec
|
|
81
72
|
|
82
73
|
private
|
83
74
|
|
84
|
-
# Start search workers.
|
85
|
-
# @param text[String]
|
86
|
-
# @param iec_workers [Isobib::WorkersPool]
|
87
|
-
# @reaturn [Isobib::WorkersPool]
|
88
|
-
# def start_algolia_search(text, iec_workers)
|
89
|
-
# index = Algolia::Index.new 'all_en'
|
90
|
-
# workers = WorkersPool.new
|
91
|
-
# workers.worker do |page|
|
92
|
-
# algolia_worker(index, text, page, workers, iec_workers)
|
93
|
-
# end
|
94
|
-
|
95
|
-
# # Add first page so search worker will start.
|
96
|
-
# workers << 0
|
97
|
-
# end
|
98
|
-
|
99
|
-
# Fetch ISO documents.
|
100
|
-
# @param hit [Hash]
|
101
|
-
# @param isiso_workers [Isobib::WorkersPool]
|
102
|
-
# def iso_worker(hit, iso_workers)
|
103
|
-
# print "Parse #{iso_workers.size} of #{iso_workers.nb_hits} \r"
|
104
|
-
# parse_page hit
|
105
|
-
# end
|
106
|
-
|
107
|
-
# Fetch hits from algolia search service.
|
108
|
-
# @param index[Algolia::Index]
|
109
|
-
# @param text [String]
|
110
|
-
# @param page [Integer]
|
111
|
-
# @param algolia_workers [Isobib::WorkersPool]
|
112
|
-
# @param isiso_workers [Isobib::WorkersPool]
|
113
|
-
# def algolia_worker(index, text, page, algolia_workers, iso_workers)
|
114
|
-
# res = index.search text, facetFilters: ['category:standard'], page: page
|
115
|
-
# next_page = res['page'] + 1
|
116
|
-
# algolia_workers << next_page if next_page < res['nbPages']
|
117
|
-
# res['hits'].each do |hit|
|
118
|
-
# iso_workers.nb_hits = res['nbHits']
|
119
|
-
# iso_workers << hit
|
120
|
-
# end
|
121
|
-
# iso_workers.end unless next_page < res['nbPages']
|
122
|
-
# end
|
123
|
-
|
124
75
|
# Fetch abstracts.
|
125
76
|
# @param doc [Nokogiri::HTML::Document]
|
126
77
|
# @return [Array<Array>]
|
@@ -134,19 +85,6 @@ module RelatonIec
|
|
134
85
|
}]
|
135
86
|
end
|
136
87
|
|
137
|
-
# Get langs.
|
138
|
-
# @param doc [Nokogiri::HTML::Document]
|
139
|
-
# @return [Array<Hash>]
|
140
|
-
# def langs(doc)
|
141
|
-
# lgs = [{ lang: 'en' }]
|
142
|
-
# doc.css('ul#lang-switcher ul li a').each do |lang_link|
|
143
|
-
# lang_path = lang_link.attr('href')
|
144
|
-
# lang = lang_path.match(%r{^\/(fr)\/})
|
145
|
-
# lgs << { lang: lang[1], path: lang_path } if lang
|
146
|
-
# end
|
147
|
-
# lgs
|
148
|
-
# end
|
149
|
-
|
150
88
|
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
151
89
|
|
152
90
|
# Get page.
|
@@ -154,25 +92,20 @@ module RelatonIec
|
|
154
92
|
# @return [Array<Nokogiri::HTML::Document, String>]
|
155
93
|
def get_page(url)
|
156
94
|
uri = URI url
|
157
|
-
resp = Net::HTTP.get_response(uri)
|
95
|
+
resp = Net::HTTP.get_response(uri)
|
158
96
|
case resp.code
|
159
97
|
when "301"
|
160
98
|
path = resp["location"]
|
161
99
|
url = DOMAIN + path
|
162
100
|
uri = URI url
|
163
|
-
resp = Net::HTTP.get_response(uri)
|
101
|
+
resp = Net::HTTP.get_response(uri)
|
164
102
|
when "404"
|
165
103
|
raise RelatonBib::RequestError, "Page not found #{url}"
|
166
104
|
end
|
167
|
-
# n = 0
|
168
|
-
# while resp.body !~ /<strong/ && n < 10
|
169
|
-
# resp = Net::HTTP.get_response(uri)#.encode("UTF-8")
|
170
|
-
# n += 1
|
171
|
-
# end
|
172
105
|
Nokogiri::HTML(resp.body)
|
173
|
-
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
174
|
-
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
175
|
-
OpenSSL::SSL::SSLError
|
106
|
+
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
107
|
+
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
108
|
+
Net::ProtocolError, OpenSSL::SSL::SSLError
|
176
109
|
raise RelatonBib::RequestError, "Could not access #{url}"
|
177
110
|
end
|
178
111
|
# rubocop:enable Metrics/AbcSize
|
@@ -211,15 +144,12 @@ module RelatonIec
|
|
211
144
|
statuses = YAML.load_file "lib/relaton_iec/statuses.yml"
|
212
145
|
s = wip.at("STAGE").text
|
213
146
|
stage, substage = statuses[s]["stage"].split "."
|
214
|
-
# status = statuses[s]["status"]
|
215
147
|
else
|
216
|
-
# status = "Published"
|
217
148
|
stage = "60"
|
218
149
|
substage = "60"
|
219
150
|
end
|
220
151
|
RelatonBib::DocumentStatus.new(stage: stage, substage: substage)
|
221
152
|
end
|
222
|
-
# rubocop:enable Metrics/MethodLength
|
223
153
|
|
224
154
|
# Fetch workgroup.
|
225
155
|
# @param doc [Nokogiri::HTML::Document]
|
@@ -237,27 +167,26 @@ module RelatonIec
|
|
237
167
|
}],
|
238
168
|
}
|
239
169
|
end
|
170
|
+
# rubocop:enable Metrics/MethodLength
|
240
171
|
|
241
172
|
# Fetch relations.
|
242
173
|
# @param doc [Nokogiri::HTML::Document]
|
243
174
|
# @return [Array<Hash>]
|
244
175
|
# rubocop:disable Metrics/MethodLength
|
245
176
|
def fetch_relations(doc)
|
246
|
-
doc.xpath('//ROW[STATUS[.!="PREPARING"]][STATUS[.!="PUBLISHED"]]').
|
177
|
+
doc.xpath('//ROW[STATUS[.!="PREPARING"]][STATUS[.!="PUBLISHED"]]').
|
178
|
+
map do |r|
|
247
179
|
r_type = r.at("STATUS").text.downcase
|
248
180
|
type = case r_type
|
249
|
-
|
181
|
+
# when 'published' then 'obsoletes' # Valid
|
250
182
|
when "revised", "replaced" then "updates"
|
251
183
|
when "withdrawn" then "obsoletes"
|
252
184
|
else r_type
|
253
185
|
end
|
254
|
-
# url = DOMAIN + "/publication/" + r.at("PUB_ID").text
|
255
186
|
fref = RelatonBib::FormattedRef.new(
|
256
187
|
content: r.at("FULL_NAME").text, format: "text/plain",
|
257
188
|
)
|
258
|
-
bibitem =
|
259
|
-
formattedref: fref,
|
260
|
-
)
|
189
|
+
bibitem = IecBibliographicItem.new(formattedref: fref)
|
261
190
|
{ type: type, bibitem: bibitem }
|
262
191
|
end
|
263
192
|
end
|
@@ -272,22 +201,6 @@ module RelatonIec
|
|
272
201
|
status = fetch_status doc
|
273
202
|
relations = fetch_relations doc
|
274
203
|
[status, relations]
|
275
|
-
# doc.css('ul.steps li').inject([]) do |a, r|
|
276
|
-
# r_type = r.css('strong').text
|
277
|
-
# type = case r_type
|
278
|
-
# when 'Previously', 'Will be replaced by' then 'obsoletes'
|
279
|
-
# when 'Corrigenda/Amendments', 'Revised by', 'Now confirmed'
|
280
|
-
# 'updates'
|
281
|
-
# else r_type
|
282
|
-
# end
|
283
|
-
# if ['Now', 'Now under review'].include? type
|
284
|
-
# a
|
285
|
-
# else
|
286
|
-
# a + r.css('a').map do |id|
|
287
|
-
# { type: type, identifier: id.text, url: id['href'] }
|
288
|
-
# end
|
289
|
-
# end
|
290
|
-
# end
|
291
204
|
end
|
292
205
|
# rubocop:enable Metrics/MethodLength
|
293
206
|
|
@@ -295,23 +208,13 @@ module RelatonIec
|
|
295
208
|
# @param doc [Nokogiri::HTML::Document]
|
296
209
|
# @return [String]
|
297
210
|
def fetch_type(doc)
|
298
|
-
doc.at(
|
299
|
-
|
300
|
-
|
301
|
-
# /NP)*\s|/)(TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))}x)
|
302
|
-
# #return "international-standard" if type_match.nil?
|
303
|
-
# if TYPES[type_match[2]]
|
304
|
-
# TYPES[type_match[2]]
|
305
|
-
# elsif type_match[1]
|
306
|
-
# elsif type_match[1] == 'ISO'
|
307
|
-
# 'international-standard'
|
308
|
-
# elsif type_match[1] == 'IWA'
|
309
|
-
# 'international-workshop-agreement'
|
310
|
-
# end
|
311
|
-
# # rescue => _e
|
312
|
-
# # puts 'Unknown document type: ' + title
|
211
|
+
doc.at(
|
212
|
+
'//th[contains(., "Publication type")]/following-sibling::td/span',
|
213
|
+
).text.downcase.tr " ", "-"
|
313
214
|
end
|
314
215
|
|
216
|
+
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength
|
217
|
+
|
315
218
|
# Fetch titles.
|
316
219
|
# @param hit_data [Hash]
|
317
220
|
# @return [Array<Hash>]
|
@@ -339,21 +242,13 @@ module RelatonIec
|
|
339
242
|
end
|
340
243
|
[{
|
341
244
|
title_intro: intro,
|
342
|
-
title_main:
|
343
|
-
title_part:
|
344
|
-
language:
|
345
|
-
script:
|
245
|
+
title_main: main,
|
246
|
+
title_part: part,
|
247
|
+
language: "en",
|
248
|
+
script: "Latn",
|
346
249
|
}]
|
347
250
|
end
|
348
|
-
|
349
|
-
# Return ISO script code.
|
350
|
-
# @param lang [String]
|
351
|
-
# @return [String]
|
352
|
-
# def script(lang)
|
353
|
-
# case lang
|
354
|
-
# when 'en', 'fr' then 'Latn'
|
355
|
-
# end
|
356
|
-
# end
|
251
|
+
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/MethodLength
|
357
252
|
|
358
253
|
# Fetch dates
|
359
254
|
# @param doc [Nokogiri::HTML::Document]
|
@@ -367,6 +262,8 @@ module RelatonIec
|
|
367
262
|
dates
|
368
263
|
end
|
369
264
|
|
265
|
+
# rubocop:disable Metrics/MethodLength
|
266
|
+
|
370
267
|
def fetch_contributors(code)
|
371
268
|
code.sub(/\s.*/, "").split("/").map do |abbrev|
|
372
269
|
case abbrev
|
@@ -381,12 +278,15 @@ module RelatonIec
|
|
381
278
|
role: [type: "publisher"] }
|
382
279
|
end
|
383
280
|
end
|
281
|
+
# rubocop:enable Metrics/MethodLength
|
384
282
|
|
385
283
|
# Fetch ICS.
|
386
284
|
# @param doc [Nokogiri::HTML::Document]
|
387
285
|
# @return [Array<Hash>]
|
388
286
|
def fetch_ics(doc)
|
389
|
-
doc.xpath(
|
287
|
+
doc.xpath(
|
288
|
+
'//th[contains(text(), "ICS")]/following-sibling::td/a',
|
289
|
+
).map do |i|
|
390
290
|
code = i.text.match(/[\d\.]+/).to_s.split "."
|
391
291
|
{ field: code[0], group: code[1], subgroup: code[2] }
|
392
292
|
end
|
@@ -403,9 +303,11 @@ module RelatonIec
|
|
403
303
|
links
|
404
304
|
end
|
405
305
|
|
306
|
+
# rubocop:disable Metrics/MethodLength
|
307
|
+
|
406
308
|
# Fetch copyright.
|
407
309
|
# @param title [String]
|
408
|
-
# @return [Hash]
|
310
|
+
# @return [Array<Hash>]
|
409
311
|
def fetch_copyright(code, doc)
|
410
312
|
abbreviation = code.match(/.*?(?=\s)/).to_s
|
411
313
|
case abbreviation
|
@@ -415,11 +317,15 @@ module RelatonIec
|
|
415
317
|
end
|
416
318
|
from = code.match(/(?<=:)\d{4}/).to_s
|
417
319
|
if from.empty?
|
418
|
-
from = doc.xpath("//span[@itemprop='releaseDate']").text
|
419
|
-
|
320
|
+
from = doc.xpath("//span[@itemprop='releaseDate']").text.
|
321
|
+
match(/\d{4}/).to_s
|
420
322
|
end
|
421
|
-
{
|
323
|
+
[{
|
324
|
+
owner: [{ name: name, abbreviation: abbreviation, url: url }],
|
325
|
+
from: from,
|
326
|
+
}]
|
422
327
|
end
|
328
|
+
# rubocop:enable Metrics/MethodLength
|
423
329
|
end
|
424
330
|
end
|
425
331
|
# rubocop:enable Metrics/ModuleLength
|
data/lib/relaton_iec/version.rb
CHANGED
data/lib/relaton_iec/xml_parser.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
module RelatonIec
|
2
|
+
class XMLParser < RelatonIsoBib::XMLParser
|
3
|
+
class << self
|
4
|
+
private
|
5
|
+
|
6
|
+
# override RelatonIsoBib::XMLParser.bib_item method
|
7
|
+
# @param item_hash [Hash]
|
8
|
+
# @return [RelatonIec::IecBibliographicItem]
|
9
|
+
def bib_item(item_hash)
|
10
|
+
IecBibliographicItem.new item_hash
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
data/relaton_iec.gemspec
CHANGED
@@ -8,10 +8,10 @@ Gem::Specification.new do |spec|
|
|
8
8
|
spec.authors = ["Ribose Inc."]
|
9
9
|
spec.email = ["open.source@ribose.com"]
|
10
10
|
|
11
|
-
spec.summary = "RelatonIec: retrieve IEC Standards for bibliographic
|
12
|
-
"using the
|
13
|
-
spec.description = "RelatonIec: retrieve IEC Standards for bibliographic
|
14
|
-
"using the
|
11
|
+
spec.summary = "RelatonIec: retrieve IEC Standards for bibliographic "\
|
12
|
+
"use using the IecBibliographicItem model"
|
13
|
+
spec.description = "RelatonIec: retrieve IEC Standards for bibliographic "\
|
14
|
+
"use using the IecBibliographicItem model"
|
15
15
|
spec.homepage = "https://github.com/metanorma/relaton-iec"
|
16
16
|
spec.license = "MIT"
|
17
17
|
|
@@ -29,10 +29,11 @@ Gem::Specification.new do |spec|
|
|
29
29
|
spec.add_development_dependency "rake", "~> 10.0"
|
30
30
|
spec.add_development_dependency "rspec", "~> 3.0"
|
31
31
|
spec.add_development_dependency "ruby-debug-ide"
|
32
|
+
spec.add_development_dependency "ruby-jing"
|
32
33
|
spec.add_development_dependency "simplecov"
|
33
34
|
spec.add_development_dependency "vcr"
|
34
35
|
spec.add_development_dependency "webmock"
|
35
36
|
|
36
37
|
spec.add_dependency "addressable"
|
37
|
-
spec.add_dependency "relaton-iso-bib", "~>
|
38
|
+
spec.add_dependency "relaton-iso-bib", "~> 1.1.0"
|
38
39
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-iec
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-06-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: debase
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: ruby-jing
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
97
111
|
- !ruby/object:Gem::Dependency
|
98
112
|
name: simplecov
|
99
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -156,15 +170,15 @@ dependencies:
|
|
156
170
|
requirements:
|
157
171
|
- - "~>"
|
158
172
|
- !ruby/object:Gem::Version
|
159
|
-
version:
|
173
|
+
version: 1.1.0
|
160
174
|
type: :runtime
|
161
175
|
prerelease: false
|
162
176
|
version_requirements: !ruby/object:Gem::Requirement
|
163
177
|
requirements:
|
164
178
|
- - "~>"
|
165
179
|
- !ruby/object:Gem::Version
|
166
|
-
version:
|
167
|
-
description: 'RelatonIec: retrieve IEC Standards for bibliographic use using the
|
180
|
+
version: 1.1.0
|
181
|
+
description: 'RelatonIec: retrieve IEC Standards for bibliographic use using the IecBibliographicItem
|
168
182
|
model'
|
169
183
|
email:
|
170
184
|
- open.source@ribose.com
|
@@ -184,15 +198,23 @@ files:
|
|
184
198
|
- Rakefile
|
185
199
|
- bin/console
|
186
200
|
- bin/setup
|
187
|
-
- grammars
|
201
|
+
- grammars/basicdoc.rng
|
202
|
+
- grammars/biblio.rng
|
203
|
+
- grammars/iec.rng
|
204
|
+
- grammars/isodoc.rng
|
205
|
+
- grammars/isostandard.rng
|
206
|
+
- grammars/reqt.rng
|
188
207
|
- lib/relaton_iec.rb
|
208
|
+
- lib/relaton_iec/hash_converter.rb
|
189
209
|
- lib/relaton_iec/hit.rb
|
190
210
|
- lib/relaton_iec/hit_collection.rb
|
211
|
+
- lib/relaton_iec/iec_bibliographic_item.rb
|
191
212
|
- lib/relaton_iec/iec_bibliography.rb
|
192
213
|
- lib/relaton_iec/processor.rb
|
193
214
|
- lib/relaton_iec/scrapper.rb
|
194
215
|
- lib/relaton_iec/statuses.yml
|
195
216
|
- lib/relaton_iec/version.rb
|
217
|
+
- lib/relaton_iec/xml_parser.rb
|
196
218
|
- relaton_iec.gemspec
|
197
219
|
homepage: https://github.com/metanorma/relaton-iec
|
198
220
|
licenses:
|
@@ -216,6 +238,6 @@ requirements: []
|
|
216
238
|
rubygems_version: 3.0.6
|
217
239
|
signing_key:
|
218
240
|
specification_version: 4
|
219
|
-
summary: 'RelatonIec: retrieve IEC Standards for bibliographic use using the
|
241
|
+
summary: 'RelatonIec: retrieve IEC Standards for bibliographic use using the IecBibliographicItem
|
220
242
|
model'
|
221
243
|
test_files: []
|