relaton-itu 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/relaton_itu.rb +0 -5
- data/lib/relaton_itu/hit.rb +2 -2
- data/lib/relaton_itu/hit_collection.rb +38 -12
- data/lib/relaton_itu/itu_bibliography.rb +22 -20
- data/lib/relaton_itu/scrapper.rb +49 -77
- data/lib/relaton_itu/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 169f68ca9de0a9e01f2130919807393d53507a2fd92f98a7043337ee9c037a18
|
4
|
+
data.tar.gz: a5fc6f91b1d6c6af3b25919c77f2a3f1cc57bbc527983207d5163088ba01ea6c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3243b1b99b363cb8bb773320ad5c70f0e48de661b4db92c1e3a96991de163ff40dd6179f974ebd80507ad2b37aca4ad0e20ca549c880787752128517a4dd34a9
|
7
|
+
data.tar.gz: d40ffddd2ef5a4ec569f92ae6253148b85ae434b38fa31ec2eb27681bf3e948096137b0acf86535fade15e4ccc79e538668a6371dbb575416a8a842739cdcf97
|
data/lib/relaton_itu.rb
CHANGED
@@ -2,11 +2,6 @@ require "relaton_itu/version"
|
|
2
2
|
require "relaton_itu/itu_bibliography"
|
3
3
|
require "digest/md5"
|
4
4
|
|
5
|
-
# if defined? Relaton
|
6
|
-
# require_relative "relaton/processor"
|
7
|
-
# Relaton::Registry.instance.register(Relaton::RelatonItu::Processor)
|
8
|
-
# end
|
9
|
-
|
10
5
|
module RelatonItu
|
11
6
|
class Error < StandardError; end
|
12
7
|
|
data/lib/relaton_itu/hit.rb
CHANGED
@@ -4,9 +4,9 @@ module RelatonItu
|
|
4
4
|
# Hit.
|
5
5
|
class Hit < RelatonBib::Hit
|
6
6
|
# Parse page.
|
7
|
-
# @return [
|
7
|
+
# @return [RelatonItu::ItuBibliographicItem]
|
8
8
|
def fetch
|
9
|
-
@fetch ||= Scrapper.parse_page
|
9
|
+
@fetch ||= Scrapper.parse_page hit, hit_collection.gi_imp
|
10
10
|
end
|
11
11
|
end
|
12
12
|
end
|
@@ -7,16 +7,39 @@ require "net/http"
|
|
7
7
|
module RelatonItu
|
8
8
|
# Page of hit collection.
|
9
9
|
class HitCollection < RelatonBib::HitCollection
|
10
|
-
DOMAIN = "https://www.itu.int"
|
10
|
+
DOMAIN = "https://www.itu.int"
|
11
11
|
|
12
|
-
# @
|
12
|
+
# @return [TrueClass, FalseClass]
|
13
|
+
attr_reader :gi_imp
|
14
|
+
|
15
|
+
# @param ref [String]
|
13
16
|
# @param year [String]
|
14
|
-
def initialize(
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
17
|
+
def initialize(ref, year = nil)
|
18
|
+
text = ref.sub /(?<=\.)Imp\s?(?=\d)/, ""
|
19
|
+
super text, year
|
20
|
+
@gi_imp = /\.Imp\d/.match?(ref)
|
21
|
+
uri = URI "#{DOMAIN}/net4/ITU-T/search/GlobalSearch/Search"
|
22
|
+
data = { json: params.to_json }
|
23
|
+
resp = Net::HTTP.post(uri, data.to_json,
|
24
|
+
"Content-Type" => "application/json")
|
25
|
+
@array = hits JSON.parse(resp.body)
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
# @return [String]
|
31
|
+
def group
|
32
|
+
@group ||= if %r{(OB|Operational Bulletin) No} =~ text then "Publications"
|
33
|
+
else "Recommendations"
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
# rubocop:disable Metrics/MethodLength
|
38
|
+
|
39
|
+
# @return [Hash]
|
40
|
+
def params
|
41
|
+
{
|
42
|
+
"Input" => text,
|
20
43
|
"Start" => 0,
|
21
44
|
"Rows" => 10,
|
22
45
|
"SortBy" => "RELEVANCE",
|
@@ -61,10 +84,13 @@ module RelatonItu
|
|
61
84
|
"IP" => "",
|
62
85
|
"SearchType" => "All",
|
63
86
|
}
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
87
|
+
end
|
88
|
+
# rubocop:enable Metrics/MethodLength
|
89
|
+
|
90
|
+
# @param data [Hash]
|
91
|
+
# @return [Array<RelatonItu::Hit>]
|
92
|
+
def hits(data)
|
93
|
+
data["results"].map do |h|
|
68
94
|
code = h["Media"]["Name"]
|
69
95
|
title = h["Title"]
|
70
96
|
url = h["Redirection"]
|
@@ -19,9 +19,9 @@ module RelatonItu
|
|
19
19
|
# @return [RelatonItu::HitCollection]
|
20
20
|
def search(text, year = nil)
|
21
21
|
HitCollection.new text, year
|
22
|
-
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
23
|
-
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
24
|
-
OpenSSL::SSL::SSLError
|
22
|
+
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
23
|
+
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
24
|
+
Net::ProtocolError, OpenSSL::SSL::SSLError
|
25
25
|
raise RelatonBib::RequestError, "Could not access http://www.itu.int"
|
26
26
|
end
|
27
27
|
|
@@ -66,17 +66,17 @@ module RelatonItu
|
|
66
66
|
nil
|
67
67
|
end
|
68
68
|
|
69
|
-
def fetch_pages(hits, threads)
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
end
|
69
|
+
# def fetch_pages(hits, threads)
|
70
|
+
# workers = RelatonBib::WorkersPool.new threads
|
71
|
+
# workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
|
72
|
+
# hits.each_with_index { |hit, i| workers << { i: i, hit: hit } }
|
73
|
+
# workers.end
|
74
|
+
# workers.result.sort_by { |a| a[:i] }.map { |x| x[:hit] }
|
75
|
+
# end
|
76
76
|
|
77
77
|
def search_filter(code)
|
78
|
-
docidrx = %r{\w
|
79
|
-
c = code.match(docidrx).to_s
|
78
|
+
docidrx = %r{\w+\.\d+|\w\sSuppl\.\s\d+} # %r{^ITU-T\s[^\s]+}
|
79
|
+
c = code.sub(/Imp\s?/, "").match(docidrx).to_s
|
80
80
|
warn "[relaton-itu] (\"#{code}\") fetching..."
|
81
81
|
result = search(code)
|
82
82
|
result.select do |i|
|
@@ -93,16 +93,18 @@ module RelatonItu
|
|
93
93
|
# If no match, returns any years which caused mismatch, for error reporting
|
94
94
|
def isobib_results_filter(result, year)
|
95
95
|
missed_years = []
|
96
|
-
result.each_slice(3) do |s| # ISO website only allows 3 connections
|
97
|
-
|
98
|
-
|
96
|
+
# result.each_slice(3) do |s| # ISO website only allows 3 connections
|
97
|
+
# fetch_pages(s, 3).each do |r|
|
98
|
+
result.each do |r|
|
99
|
+
return { ret: r.fetch } if !year
|
99
100
|
|
100
|
-
|
101
|
-
|
101
|
+
/\(\d{2}\/(?<pyear>\d{4})\)/ =~ r.hit[:code]
|
102
|
+
# r.date.select { |d| d.type == "published" }.each do |d|
|
103
|
+
return { ret: r.fetch } if year == pyear
|
102
104
|
|
103
|
-
|
104
|
-
|
105
|
-
end
|
105
|
+
missed_years << pyear
|
106
|
+
# end
|
107
|
+
# end
|
106
108
|
end
|
107
109
|
{ years: missed_years }
|
108
110
|
end
|
data/lib/relaton_itu/scrapper.rb
CHANGED
@@ -3,16 +3,9 @@
|
|
3
3
|
require "nokogiri"
|
4
4
|
require "net/http"
|
5
5
|
|
6
|
-
# Capybara.request_driver :poltergeist do |app|
|
7
|
-
# Capybara::Poltergeist::Driver.new app, js_errors: false
|
8
|
-
# end
|
9
|
-
# Capybara.default_driver = :poltergeist
|
10
|
-
|
11
6
|
module RelatonItu
|
12
7
|
# Scrapper.
|
13
|
-
# rubocop:disable Metrics/ModuleLength
|
14
8
|
module Scrapper
|
15
|
-
DOMAIN = "https://www.itu.int"
|
16
9
|
ROMAN_MONTHS = %w[I II III IV V VI VII VIII IX X XI XII].freeze
|
17
10
|
|
18
11
|
TYPES = {
|
@@ -31,24 +24,19 @@ module RelatonItu
|
|
31
24
|
}.freeze
|
32
25
|
|
33
26
|
class << self
|
34
|
-
#
|
35
|
-
# @return [Array<Hash>]
|
36
|
-
# def get(text)
|
37
|
-
# iso_workers = WorkersPool.new 4
|
38
|
-
# iso_workers.worker { |hit| iso_worker(hit, iso_workers) }
|
39
|
-
# algolia_workers = start_algolia_search(text, iso_workers)
|
40
|
-
# iso_docs = iso_workers.result
|
41
|
-
# algolia_workers.end
|
42
|
-
# algolia_workers.result
|
43
|
-
# iso_docs
|
44
|
-
# end
|
27
|
+
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
45
28
|
|
46
29
|
# Parse page.
|
47
|
-
# @param
|
30
|
+
# @param hit_data [Hash]
|
48
31
|
# @return [Hash]
|
49
|
-
|
50
|
-
def parse_page(hit_data)
|
32
|
+
def parse_page(hit_data, imp = false)
|
51
33
|
url, doc = get_page hit_data[:url]
|
34
|
+
if imp
|
35
|
+
a = doc.at "//span[contains(@id, 'tab_ig_uc_rec')]/a"
|
36
|
+
return unless a
|
37
|
+
|
38
|
+
url, doc = get_page URI.join(url, a[:href]).to_s
|
39
|
+
end
|
52
40
|
|
53
41
|
# Fetch edition.
|
54
42
|
edition = doc.at("//table/tr/td/span[contains(@id, 'Label8')]/b")&.text
|
@@ -73,7 +61,7 @@ module RelatonItu
|
|
73
61
|
place: ["Geneva"],
|
74
62
|
)
|
75
63
|
end
|
76
|
-
# rubocop:enable Metrics/AbcSize
|
64
|
+
# rubocop:enable Metrics/AbcSize
|
77
65
|
|
78
66
|
private
|
79
67
|
|
@@ -96,37 +84,23 @@ module RelatonItu
|
|
96
84
|
}]
|
97
85
|
end
|
98
86
|
|
99
|
-
# Get langs.
|
100
|
-
# @param doc [Nokogiri::HTML::Document]
|
101
|
-
# @return [Array<Hash>]
|
102
|
-
# def langs(doc)
|
103
|
-
# lgs = [{ lang: 'en' }]
|
104
|
-
# doc.css('ul#lang-switcher ul li a').each do |lang_link|
|
105
|
-
# lang_path = lang_link.attr('href')
|
106
|
-
# lang = lang_path.match(%r{^\/(fr)\/})
|
107
|
-
# lgs << { lang: lang[1], path: lang_path } if lang
|
108
|
-
# end
|
109
|
-
# lgs
|
110
|
-
# end
|
111
|
-
|
112
|
-
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
113
87
|
# Get page.
|
114
88
|
# @param path [String] page's path
|
115
|
-
# @return [Array<Nokogiri::HTML::Document
|
89
|
+
# @return [Array<String, Nokogiri::HTML::Document>]
|
116
90
|
def get_page(url)
|
117
91
|
uri = URI url
|
118
|
-
resp = Net::HTTP.get_response(uri)
|
92
|
+
resp = Net::HTTP.get_response(uri)
|
119
93
|
until resp.code == "200"
|
120
94
|
uri = URI resp["location"] if resp.code =~ /^30/
|
121
|
-
resp = Net::HTTP.get_response(uri)
|
95
|
+
resp = Net::HTTP.get_response(uri)
|
122
96
|
end
|
123
97
|
[uri.to_s, Nokogiri::HTML(resp.body)]
|
124
|
-
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
125
|
-
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
126
|
-
OpenSSL::SSL::SSLError
|
98
|
+
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
99
|
+
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
100
|
+
Net::ProtocolError, OpenSSL::SSL::SSLError
|
127
101
|
raise RelatonBib::RequestError, "Could not access #{url}"
|
128
102
|
end
|
129
|
-
# rubocop:enable Metrics/
|
103
|
+
# rubocop:enable Metrics/MethodLength
|
130
104
|
|
131
105
|
# Fetch docid.
|
132
106
|
# @param doc [Nokogiri::HTML::Document]
|
@@ -135,9 +109,11 @@ module RelatonItu
|
|
135
109
|
doc.xpath(
|
136
110
|
"//span[@id='ctl00_content_main_uc_rec_main_info1_rpt_main_ctl00_lbl_rec']",
|
137
111
|
"//td[.='Identical standard:']/following-sibling::td",
|
112
|
+
"//div/table[1]/tr[4]/td/strong",
|
138
113
|
).map do |code|
|
139
|
-
id = code.text.match(%r{^.*?(?= \()}).to_s.squeeze(" ")
|
114
|
+
id = code.text.match(%r{^.*?(?= \()|\w\.Imp\s?\d+}).to_s.squeeze(" ")
|
140
115
|
type = id.match(%r{^\w+}).to_s
|
116
|
+
type = "ITU" if type == "G"
|
141
117
|
RelatonBib::DocumentIdentifier.new(type: type, id: id)
|
142
118
|
end
|
143
119
|
end
|
@@ -146,10 +122,11 @@ module RelatonItu
|
|
146
122
|
# @param doc [Nokogiri::HTML::Document]
|
147
123
|
# @return [RelatonBib::DocumentStatus, NilClass]
|
148
124
|
def fetch_status(doc)
|
149
|
-
s = doc.at("//table/tr/td/span[contains(@id, 'Label7')]"
|
125
|
+
s = doc.at("//table/tr/td/span[contains(@id, 'Label7')]",
|
126
|
+
"//p[contains(.,'Status :')]")
|
150
127
|
return unless s
|
151
128
|
|
152
|
-
status = s.text
|
129
|
+
status = s.text.include?("In force") ? "Published" : "Withdrawal"
|
153
130
|
RelatonBib::DocumentStatus.new(stage: status)
|
154
131
|
end
|
155
132
|
|
@@ -191,9 +168,7 @@ module RelatonItu
|
|
191
168
|
# @return [Array<Hash>]
|
192
169
|
def fetch_relations(doc)
|
193
170
|
doc.xpath('//div[contains(@id, "tab_sup")]//table/tr[position()>2]').map do |r|
|
194
|
-
# r_type = r.at('./td/span[contains(@id, "Label4")]/nobr').text.downcase
|
195
171
|
ref = r.at('./td/span[contains(@id, "title_e")]/nobr/a')
|
196
|
-
# url = DOMAIN + ref[:href].sub(/^\./, "/ITU-T/recommendations")
|
197
172
|
fref = RelatonBib::FormattedRef.new(content: ref.text, language: "en", script: "Latn")
|
198
173
|
bibitem = RelatonIsoBib::IsoBibliographicItem.new(formattedref: fref)
|
199
174
|
{ type: "complements", bibitem: bibitem }
|
@@ -201,22 +176,14 @@ module RelatonItu
|
|
201
176
|
end
|
202
177
|
# rubocop:enable Metrics/MethodLength
|
203
178
|
|
204
|
-
# Fetch type.
|
205
|
-
# @param doc [Nokogiri::HTML::Document]
|
206
|
-
# @return [String]
|
207
|
-
# def fetch_type(_doc)
|
208
|
-
# "recommendation"
|
209
|
-
# end
|
210
|
-
|
211
179
|
# Fetch titles.
|
212
180
|
# @param doc [Nokogiri::HTML::Document]
|
213
181
|
# @return [Array<Hash>]
|
214
182
|
def fetch_titles(doc)
|
215
|
-
|
216
|
-
# t = hit_data[:title] if t.empty?
|
217
|
-
t = doc.at("//td[@class='title']")
|
183
|
+
t = doc.at("//td[@class='title']|//div/table[1]/tr[4]/td/strong")
|
218
184
|
return [] unless t
|
219
|
-
|
185
|
+
|
186
|
+
titles = t.text.sub(/\w\.Imp\s?\d+\u00A0:\u00A0/, "").split " - "
|
220
187
|
case titles.size
|
221
188
|
when 0
|
222
189
|
intro, main, part = nil, "", nil
|
@@ -247,10 +214,11 @@ module RelatonItu
|
|
247
214
|
# @return [Array<Hash>]
|
248
215
|
def fetch_dates(doc)
|
249
216
|
dates = []
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
217
|
+
date = doc.at("//table/tr/td/span[contains(@id, 'Label5')]",
|
218
|
+
"//p[contains(.,'Approved in')]")
|
219
|
+
pdate = date&.text&.match(/\d{4}-\d{2}-\d{2}/).to_s || ob_date(doc)
|
220
|
+
if pdate && !pdate&.empty?
|
221
|
+
dates << { type: "published", on: pdate }
|
254
222
|
end
|
255
223
|
dates
|
256
224
|
end
|
@@ -278,36 +246,41 @@ module RelatonItu
|
|
278
246
|
# @param doc [Nokogiri::HTML::Document]
|
279
247
|
# @return [Array<Hash>]
|
280
248
|
def fetch_contributors(code)
|
249
|
+
return [] unless code
|
250
|
+
|
281
251
|
abbrev = code.sub(/-\w\s.*/, "")
|
282
252
|
case abbrev
|
283
253
|
when "ITU"
|
284
254
|
name = "International Telecommunication Union"
|
285
255
|
url = "www.itu.int"
|
286
256
|
end
|
287
|
-
[{ entity: { name: name, url: url, abbreviation: abbrev },
|
257
|
+
[{ entity: { name: name, url: url, abbreviation: abbrev },
|
258
|
+
role: [type: "publisher"] }]
|
288
259
|
end
|
289
260
|
|
290
|
-
# Fetch ICS.
|
291
|
-
# @param doc [Nokogiri::HTML::Document]
|
292
|
-
# @return [Array<Hash>]
|
293
|
-
# def fetch_ics(doc)
|
294
|
-
# doc.xpath('//th[contains(text(), "ICS")]/following-sibling::td/a').map do |i|
|
295
|
-
# code = i.text.match(/[\d\.]+/).to_s.split '.'
|
296
|
-
# { field: code[0], group: code[1], subgroup: code[2] }
|
297
|
-
# end
|
298
|
-
# end
|
299
|
-
|
300
261
|
# Fetch links.
|
301
262
|
# @param doc [Nokogiri::HTML::Document]
|
302
263
|
# @param url [String]
|
303
264
|
# @return [Array<Hash>]
|
304
265
|
def fetch_link(doc, url)
|
305
266
|
links = [{ type: "src", content: url }]
|
306
|
-
|
307
|
-
|
267
|
+
obp_elm = doc.at(
|
268
|
+
'//a[@title="Persistent link to download the PDF file"]',
|
269
|
+
"//font[contains(.,'PDF')]/../..",
|
270
|
+
)
|
271
|
+
links << typed_link("obp", obp_elm) if obp_elm
|
272
|
+
wrd_elm = doc.at("//font[contains(.,'Word')]/../..")
|
273
|
+
links << typed_link("word", wrd_elm) if wrd_elm
|
308
274
|
links
|
309
275
|
end
|
310
276
|
|
277
|
+
def typed_link(type, elm)
|
278
|
+
{
|
279
|
+
type: type,
|
280
|
+
content: URI.join(HitCollection::DOMAIN + elm[:href].strip).to_s,
|
281
|
+
}
|
282
|
+
end
|
283
|
+
|
311
284
|
# Fetch copyright.
|
312
285
|
# @param code [String]
|
313
286
|
# @param doc [Nokogiri::HTML::Document]
|
@@ -325,5 +298,4 @@ module RelatonItu
|
|
325
298
|
end
|
326
299
|
end
|
327
300
|
end
|
328
|
-
# rubocop:enable Metrics/ModuleLength
|
329
301
|
end
|
data/lib/relaton_itu/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-itu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-04
|
11
|
+
date: 2020-05-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: debase
|