relaton-itu 1.7.7 → 1.7.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/relaton_itu.rb +1 -0
- data/lib/relaton_itu/hash_converter.rb +9 -0
- data/lib/relaton_itu/hit.rb +1 -1
- data/lib/relaton_itu/hit_collection.rb +9 -4
- data/lib/relaton_itu/itu_bibliography.rb +10 -8
- data/lib/relaton_itu/scrapper.rb +49 -55
- data/lib/relaton_itu/version.rb +1 -1
- data/relaton-itu.gemspec +1 -0
- metadata +16 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 249fe896ec8a77979ca15d6a42da98ad2ac2620cfe8dc0f468cd14277c5a35b0
|
|
4
|
+
data.tar.gz: 62415ed835abc49cf00d3048b52556b3f718a4ad0dc531ec2c20572b95305210
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 65a5bcf91f851cc4ec3139fad83b0c83f143b1bafefe8c9638e34072f7d82f76b77517fedf153fcdf7b63ee903fcd372c64e378edaeb231835f477f004f7e94a
|
|
7
|
+
data.tar.gz: 61cdc7df34b24f5d3f3e56b967e9e2b34337bc164b32691747853660cbb6c9e337a6eee058162f97e218da356f4028d7c107b4be4ed7dc10f99253e6985fccd9
|
data/lib/relaton_itu.rb
CHANGED
data/lib/relaton_itu/hash_converter.rb
CHANGED
|
@@ -3,6 +3,15 @@ module RelatonItu
|
|
|
3
3
|
class << self
|
|
4
4
|
private
|
|
5
5
|
|
|
6
|
+
#
|
|
7
|
+
# Overrides the superclass's method
|
|
8
|
+
#
|
|
9
|
+
# @param item [Hash]
|
|
10
|
+
# @return [RelatonItu::ItuBibliographicItem]
|
|
11
|
+
def bib_item(item)
|
|
12
|
+
ItuBibliographicItem.new(**item)
|
|
13
|
+
end
|
|
14
|
+
|
|
6
15
|
def editorialgroup_hash_to_bib(ret)
|
|
7
16
|
eg = ret[:editorialgroup]
|
|
8
17
|
return unless eg
|
data/lib/relaton_itu/hit.rb
CHANGED
data/lib/relaton_itu/hit_collection.rb
CHANGED
|
@@ -12,17 +12,21 @@ module RelatonItu
|
|
|
12
12
|
# @return [TrueClass, FalseClass]
|
|
13
13
|
attr_reader :gi_imp
|
|
14
14
|
|
|
15
|
+
# @return [Mechanize]
|
|
16
|
+
attr_reader :agent
|
|
17
|
+
|
|
15
18
|
# @param ref [String]
|
|
16
19
|
# @param year [String]
|
|
17
20
|
def initialize(ref, year = nil) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
|
18
21
|
text = ref.sub /(?<=\.)Imp\s?(?=\d)/, ""
|
|
19
22
|
super text, year
|
|
23
|
+
@agent = Mechanize.new
|
|
24
|
+
agent.user_agent_alias = "Mac Safari"
|
|
20
25
|
@gi_imp = /\.Imp\d/.match?(ref)
|
|
21
26
|
if ref.match? /^(ITU-T|ITU-R\sRR)/
|
|
22
|
-
|
|
27
|
+
url = "#{DOMAIN}/net4/ITU-T/search/GlobalSearch/Search"
|
|
23
28
|
data = { json: params.to_json }
|
|
24
|
-
resp =
|
|
25
|
-
"Content-Type" => "application/json")
|
|
29
|
+
resp = agent.post url, data.to_json, "Content-Type" => "application/json"
|
|
26
30
|
@array = hits JSON.parse(resp.body)
|
|
27
31
|
elsif ref.match? /^ITU-R/
|
|
28
32
|
rf = ref.sub(/^ITU-R\s/, "").upcase
|
|
@@ -47,7 +51,8 @@ module RelatonItu
|
|
|
47
51
|
# @return [String]
|
|
48
52
|
def group
|
|
49
53
|
@group ||= case text
|
|
50
|
-
when %r{OB|Operational Bulletin}, %r{^ITU-R\sRR}
|
|
54
|
+
when %r{OB|Operational Bulletin}, %r{^ITU-R\sRR}
|
|
55
|
+
"Publications"
|
|
51
56
|
when %r{^ITU-T} then "Recommendations"
|
|
52
57
|
end
|
|
53
58
|
end
|
|
data/lib/relaton_itu/itu_bibliography.rb
CHANGED
|
@@ -28,13 +28,14 @@ module RelatonItu
|
|
|
28
28
|
HitCollection.new text, year
|
|
29
29
|
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
|
30
30
|
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
|
31
|
-
Net::ProtocolError,
|
|
32
|
-
raise RelatonBib::RequestError,
|
|
31
|
+
Net::ProtocolError, URI::InvalidURIError => e
|
|
32
|
+
raise RelatonBib::RequestError, e.message
|
|
33
33
|
end
|
|
34
34
|
|
|
35
35
|
# @param code [String] the ISO standard Code to look up (e.g. "ISO 9000")
|
|
36
36
|
# @param year [String] the year the standard was published (optional)
|
|
37
|
-
# @param opts [Hash] options; restricted to :all_parts if all-parts
|
|
37
|
+
# @param opts [Hash] options; restricted to :all_parts if all-parts
|
|
38
|
+
# reference is required
|
|
38
39
|
# @return [String] Relaton XML serialisation of reference
|
|
39
40
|
def get(code, year = nil, opts = {}) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/MethodLength,Metrics/PerceivedComplexity
|
|
40
41
|
if year.nil?
|
|
@@ -61,12 +62,12 @@ module RelatonItu
|
|
|
61
62
|
warn "[relaton-itu] WARNING: no match found online for #{id}. "\
|
|
62
63
|
"The code must be exactly like it is on the standards website."
|
|
63
64
|
unless missed_years.empty?
|
|
64
|
-
warn "[relaton-itu] (There was no match for #{year}, though there
|
|
65
|
-
"found for #{missed_years.join(', ')}.)"
|
|
65
|
+
warn "[relaton-itu] (There was no match for #{year}, though there "\
|
|
66
|
+
"were matches found for #{missed_years.join(', ')}.)"
|
|
66
67
|
end
|
|
67
68
|
if /\d-\d/.match? code
|
|
68
|
-
warn "[relaton-itu] The provided document part may not exist, or
|
|
69
|
-
"may no longer be published in parts."
|
|
69
|
+
warn "[relaton-itu] The provided document part may not exist, or "\
|
|
70
|
+
"the document may no longer be published in parts."
|
|
70
71
|
else
|
|
71
72
|
warn "[relaton-itu] If you wanted to cite all document parts for the reference, "\
|
|
72
73
|
"use \"#{code} (all parts)\".\nIf the document is not a standard, "\
|
|
@@ -123,7 +124,8 @@ module RelatonItu
|
|
|
123
124
|
def isobib_results_filter(result, year)
|
|
124
125
|
missed_years = []
|
|
125
126
|
result.each do |r|
|
|
126
|
-
|
|
127
|
+
/\((\d{2}\/)?(?<pyear>\d{4})\)/ =~ r.hit[:code]
|
|
128
|
+
if !year || year == pyear
|
|
127
129
|
ret = r.fetch
|
|
128
130
|
return { ret: ret } if ret
|
|
129
131
|
end
|
data/lib/relaton_itu/scrapper.rb
CHANGED
|
@@ -24,20 +24,18 @@ module RelatonItu
|
|
|
24
24
|
}.freeze
|
|
25
25
|
|
|
26
26
|
class << self
|
|
27
|
-
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
|
28
|
-
|
|
29
27
|
# Parse page.
|
|
30
|
-
# @param
|
|
28
|
+
# @param hit [RelatonItu::Hit]
|
|
31
29
|
# @return [Hash]
|
|
32
|
-
def parse_page(
|
|
33
|
-
|
|
34
|
-
return unless doc
|
|
30
|
+
def parse_page(hit, imp = false) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
|
31
|
+
doc = get_page hit
|
|
32
|
+
return unless doc.code == "200"
|
|
35
33
|
|
|
36
34
|
if imp
|
|
37
35
|
a = doc.at "//span[contains(@id, 'tab_ig_uc_rec')]/a"
|
|
38
36
|
return unless a
|
|
39
37
|
|
|
40
|
-
|
|
38
|
+
doc = get_page hit, a[:href].to_s
|
|
41
39
|
end
|
|
42
40
|
|
|
43
41
|
# Fetch edition.
|
|
@@ -46,36 +44,37 @@ module RelatonItu
|
|
|
46
44
|
ItuBibliographicItem.new(
|
|
47
45
|
fetched: Date.today.to_s,
|
|
48
46
|
type: "standard",
|
|
49
|
-
docid: fetch_docid(doc,
|
|
47
|
+
docid: fetch_docid(doc, hit.hit[:title]),
|
|
50
48
|
edition: edition,
|
|
51
49
|
language: ["en"],
|
|
52
50
|
script: ["Latn"],
|
|
53
51
|
title: fetch_titles(doc),
|
|
54
|
-
doctype:
|
|
52
|
+
doctype: hit.hit[:type],
|
|
55
53
|
docstatus: fetch_status(doc),
|
|
56
54
|
ics: [], # fetch_ics(doc),
|
|
57
55
|
date: fetch_dates(doc),
|
|
58
|
-
contributor: fetch_contributors(
|
|
59
|
-
editorialgroup: fetch_workgroup(
|
|
60
|
-
abstract: fetch_abstract(doc),
|
|
61
|
-
copyright: fetch_copyright(
|
|
62
|
-
link: fetch_link(doc
|
|
56
|
+
contributor: fetch_contributors(hit.hit[:code]),
|
|
57
|
+
editorialgroup: fetch_workgroup(hit.hit[:code], doc),
|
|
58
|
+
abstract: fetch_abstract(doc, hit),
|
|
59
|
+
copyright: fetch_copyright(hit.hit[:code], doc),
|
|
60
|
+
link: fetch_link(doc),
|
|
63
61
|
relation: fetch_relations(doc),
|
|
64
62
|
place: ["Geneva"]
|
|
65
63
|
)
|
|
66
64
|
end
|
|
67
|
-
# rubocop:enable Metrics/AbcSize
|
|
68
65
|
|
|
69
66
|
private
|
|
70
67
|
|
|
71
68
|
# Fetch abstracts.
|
|
72
|
-
# @param doc [
|
|
73
|
-
# @
|
|
74
|
-
|
|
75
|
-
|
|
69
|
+
# @param doc [Mechanize::Page]
|
|
70
|
+
# @param hit [RelatonItu::Hit]
|
|
71
|
+
# @return [Array<Hash>]
|
|
72
|
+
def fetch_abstract(doc, hit) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
|
73
|
+
abstract_url = doc.at '//table/tr/td/span[contains(@id, "lbl_dms")]/div'
|
|
76
74
|
content = if abstract_url
|
|
77
75
|
url = abstract_url[:onclick].match(/https?[^']+/).to_s
|
|
78
|
-
|
|
76
|
+
rsp = hit.hit_collection.agent.get url
|
|
77
|
+
d = Nokogiri::HTML rsp.body.encode(undef: :replace, replace: "")
|
|
79
78
|
d.css("p.MsoNormal").text.gsub(/\r\n/, "").squeeze(" ").gsub(/\u00a0/, "")
|
|
80
79
|
elsif a = doc.at('//table/tr/td/span[contains(@class, "observation")]/text()')
|
|
81
80
|
a.text.strip
|
|
@@ -90,27 +89,20 @@ module RelatonItu
|
|
|
90
89
|
end
|
|
91
90
|
|
|
92
91
|
# Get page.
|
|
93
|
-
# @param
|
|
92
|
+
# @param hit [RelatonItu::Hit]
|
|
93
|
+
# @param url [String, nil]
|
|
94
94
|
# @return [Mechanize::Page]
|
|
95
|
-
def get_page(url)
|
|
96
|
-
uri =
|
|
97
|
-
|
|
98
|
-
until resp.code == "200"
|
|
99
|
-
return if resp["location"] == "/en/publications/pages/notfound.aspx"
|
|
100
|
-
|
|
101
|
-
uri = URI resp["location"] if resp.code.match? /^30/
|
|
102
|
-
resp = Net::HTTP.get_response(uri)
|
|
103
|
-
end
|
|
104
|
-
[uri.to_s, Nokogiri::HTML(resp.body)]
|
|
95
|
+
def get_page(hit, url = nil)
|
|
96
|
+
uri = url || hit.hit[:url]
|
|
97
|
+
hit.hit_collection.agent.get uri
|
|
105
98
|
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
|
106
99
|
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
|
107
100
|
Net::ProtocolError, OpenSSL::SSL::SSLError
|
|
108
|
-
raise RelatonBib::RequestError, "Could not access #{
|
|
101
|
+
raise RelatonBib::RequestError, "Could not access #{uri}"
|
|
109
102
|
end
|
|
110
|
-
# rubocop:enable Metrics/MethodLength
|
|
111
103
|
|
|
112
104
|
# Fetch docid.
|
|
113
|
-
# @param doc [
|
|
105
|
+
# @param doc [Mechanize::Page]
|
|
114
106
|
# @param title [String]
|
|
115
107
|
# @return [Hash]
|
|
116
108
|
def fetch_docid(doc, title)
|
|
@@ -123,6 +115,8 @@ module RelatonItu
|
|
|
123
115
|
docids
|
|
124
116
|
end
|
|
125
117
|
|
|
118
|
+
# @param text [String]
|
|
119
|
+
# @return [RelatonBib::DocumentIdentifier]
|
|
126
120
|
def createdocid(text) # rubocop:disable Metrics/MethodLength
|
|
127
121
|
%r{
|
|
128
122
|
^(?<code>((ITU-\w|ISO\/IEC)\s)?[^\(:]+)
|
|
@@ -140,7 +134,7 @@ module RelatonItu
|
|
|
140
134
|
end
|
|
141
135
|
|
|
142
136
|
# Fetch status.
|
|
143
|
-
# @param doc [
|
|
137
|
+
# @param doc [Mechanize::Page]
|
|
144
138
|
# @return [RelatonBib::DocumentStatus, NilClass]
|
|
145
139
|
def fetch_status(doc)
|
|
146
140
|
s = doc.at("//table/tr/td/span[contains(@id, 'Label7')]",
|
|
@@ -153,7 +147,7 @@ module RelatonItu
|
|
|
153
147
|
|
|
154
148
|
# Fetch workgroup.
|
|
155
149
|
# @param code [String]
|
|
156
|
-
# @param doc [
|
|
150
|
+
# @param doc [Mechanize::Page]
|
|
157
151
|
# @return [RelatonItu::EditorialGroup, NilClass]
|
|
158
152
|
def fetch_workgroup(code, doc)
|
|
159
153
|
wg = doc.at('//table/tr/td/span[contains(@id, "Label8")]/a')
|
|
@@ -161,8 +155,7 @@ module RelatonItu
|
|
|
161
155
|
|
|
162
156
|
group = wg && itugroup(wg.text)
|
|
163
157
|
EditorialGroup.new(
|
|
164
|
-
bureau: code.match(/(?<=-)./).to_s,
|
|
165
|
-
group: group
|
|
158
|
+
bureau: code.match(/(?<=-)./).to_s, group: group
|
|
166
159
|
)
|
|
167
160
|
end
|
|
168
161
|
|
|
@@ -182,24 +175,24 @@ module RelatonItu
|
|
|
182
175
|
ItuGroup.new name: name, type: type, acronym: acronym
|
|
183
176
|
end
|
|
184
177
|
|
|
185
|
-
# rubocop:disable Metrics/MethodLength
|
|
186
|
-
|
|
187
178
|
# Fetch relations.
|
|
188
|
-
# @param doc [
|
|
179
|
+
# @param doc [Mechanize::Page]
|
|
189
180
|
# @return [Array<Hash>]
|
|
190
181
|
def fetch_relations(doc)
|
|
191
|
-
doc.xpath('//div[contains(@id, "tab_sup")]//table/tr[position()>2]')
|
|
182
|
+
doc.xpath('//div[contains(@id, "tab_sup")]//table/tr[position()>2]')
|
|
183
|
+
.map do |r|
|
|
192
184
|
ref = r.at('./td/span[contains(@id, "title_e")]/nobr/a')
|
|
193
|
-
fref = RelatonBib::FormattedRef.new(content: ref.text, language: "en",
|
|
194
|
-
|
|
185
|
+
fref = RelatonBib::FormattedRef.new(content: ref.text, language: "en",
|
|
186
|
+
script: "Latn")
|
|
187
|
+
bibitem = ItuBibliographicItem.new(formattedref: fref,
|
|
188
|
+
type: "standard")
|
|
195
189
|
{ type: "complements", bibitem: bibitem }
|
|
196
190
|
end
|
|
197
191
|
end
|
|
198
|
-
# rubocop:enable Metrics/MethodLength
|
|
199
192
|
|
|
200
193
|
# Fetch titles.
|
|
201
|
-
# @param doc [
|
|
202
|
-
# @return [
|
|
194
|
+
# @param doc [Mechanize::Page]
|
|
195
|
+
# @return [RelatonBib::TypedTitleStringCollection]
|
|
203
196
|
def fetch_titles(doc)
|
|
204
197
|
t = doc.at("//td[@class='title']|//div/table[1]/tr[4]/td/strong")
|
|
205
198
|
return [] unless t
|
|
@@ -208,7 +201,7 @@ module RelatonItu
|
|
|
208
201
|
end
|
|
209
202
|
|
|
210
203
|
# Fetch dates
|
|
211
|
-
# @param doc [
|
|
204
|
+
# @param doc [Mechanize::Page]
|
|
212
205
|
# @return [Array<Hash>]
|
|
213
206
|
def fetch_dates(doc) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
|
|
214
207
|
dates = []
|
|
@@ -224,7 +217,7 @@ module RelatonItu
|
|
|
224
217
|
end
|
|
225
218
|
|
|
226
219
|
# Scrape Operational Bulletin date.
|
|
227
|
-
# @param doc [
|
|
220
|
+
# @param doc [Mechanize::Page]
|
|
228
221
|
# @return [String]
|
|
229
222
|
def ob_date(doc)
|
|
230
223
|
pdate = doc.at('//table/tbody/tr/td[contains(text(), "Year:")]')
|
|
@@ -246,7 +239,7 @@ module RelatonItu
|
|
|
246
239
|
end
|
|
247
240
|
|
|
248
241
|
# Fetch contributors
|
|
249
|
-
# @param doc [
|
|
242
|
+
# @param code [String]
|
|
250
243
|
# @return [Array<Hash>]
|
|
251
244
|
def fetch_contributors(code)
|
|
252
245
|
return [] unless code
|
|
@@ -262,11 +255,10 @@ module RelatonItu
|
|
|
262
255
|
end
|
|
263
256
|
|
|
264
257
|
# Fetch links.
|
|
265
|
-
# @param doc [
|
|
266
|
-
# @param url [String]
|
|
258
|
+
# @param doc [Mechanize::Page]
|
|
267
259
|
# @return [Array<Hash>]
|
|
268
|
-
def fetch_link(doc
|
|
269
|
-
links = [{ type: "src", content:
|
|
260
|
+
def fetch_link(doc)
|
|
261
|
+
links = [{ type: "src", content: doc.uri.to_s }]
|
|
270
262
|
obp_elm = doc.at(
|
|
271
263
|
'//a[@title="Persistent link to download the PDF file"]',
|
|
272
264
|
"//font[contains(.,'PDF')]/../.."
|
|
@@ -277,6 +269,8 @@ module RelatonItu
|
|
|
277
269
|
links
|
|
278
270
|
end
|
|
279
271
|
|
|
272
|
+
# @param type [String]
|
|
273
|
+
# @param elm [Nokogiri::XML::Element]
|
|
280
274
|
def typed_link(type, elm)
|
|
281
275
|
{
|
|
282
276
|
type: type,
|
|
@@ -286,7 +280,7 @@ module RelatonItu
|
|
|
286
280
|
|
|
287
281
|
# Fetch copyright.
|
|
288
282
|
# @param code [String]
|
|
289
|
-
# @param doc [
|
|
283
|
+
# @param doc [Mechanize::Page]
|
|
290
284
|
# @return [Array<Hash>]
|
|
291
285
|
def fetch_copyright(code, doc)
|
|
292
286
|
abbreviation = code.match(/^[^-]+/).to_s
|
data/lib/relaton_itu/version.rb
CHANGED
data/relaton-itu.gemspec
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: relaton-itu
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.7.
|
|
4
|
+
version: 1.7.8
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2021-04-
|
|
11
|
+
date: 2021-04-22 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: equivalent-xml
|
|
@@ -122,6 +122,20 @@ dependencies:
|
|
|
122
122
|
- - ">="
|
|
123
123
|
- !ruby/object:Gem::Version
|
|
124
124
|
version: '0'
|
|
125
|
+
- !ruby/object:Gem::Dependency
|
|
126
|
+
name: mechanize
|
|
127
|
+
requirement: !ruby/object:Gem::Requirement
|
|
128
|
+
requirements:
|
|
129
|
+
- - ">="
|
|
130
|
+
- !ruby/object:Gem::Version
|
|
131
|
+
version: '0'
|
|
132
|
+
type: :runtime
|
|
133
|
+
prerelease: false
|
|
134
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
135
|
+
requirements:
|
|
136
|
+
- - ">="
|
|
137
|
+
- !ruby/object:Gem::Version
|
|
138
|
+
version: '0'
|
|
125
139
|
- !ruby/object:Gem::Dependency
|
|
126
140
|
name: relaton-bib
|
|
127
141
|
requirement: !ruby/object:Gem::Requirement
|