relaton-itu 1.0.0 → 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/relaton_itu.rb +0 -5
- data/lib/relaton_itu/hit.rb +2 -2
- data/lib/relaton_itu/hit_collection.rb +38 -12
- data/lib/relaton_itu/itu_bibliography.rb +22 -20
- data/lib/relaton_itu/scrapper.rb +49 -77
- data/lib/relaton_itu/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 169f68ca9de0a9e01f2130919807393d53507a2fd92f98a7043337ee9c037a18
|
4
|
+
data.tar.gz: a5fc6f91b1d6c6af3b25919c77f2a3f1cc57bbc527983207d5163088ba01ea6c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3243b1b99b363cb8bb773320ad5c70f0e48de661b4db92c1e3a96991de163ff40dd6179f974ebd80507ad2b37aca4ad0e20ca549c880787752128517a4dd34a9
|
7
|
+
data.tar.gz: d40ffddd2ef5a4ec569f92ae6253148b85ae434b38fa31ec2eb27681bf3e948096137b0acf86535fade15e4ccc79e538668a6371dbb575416a8a842739cdcf97
|
data/lib/relaton_itu.rb
CHANGED
@@ -2,11 +2,6 @@ require "relaton_itu/version"
|
|
2
2
|
require "relaton_itu/itu_bibliography"
|
3
3
|
require "digest/md5"
|
4
4
|
|
5
|
-
# if defined? Relaton
|
6
|
-
# require_relative "relaton/processor"
|
7
|
-
# Relaton::Registry.instance.register(Relaton::RelatonItu::Processor)
|
8
|
-
# end
|
9
|
-
|
10
5
|
module RelatonItu
|
11
6
|
class Error < StandardError; end
|
12
7
|
|
data/lib/relaton_itu/hit.rb
CHANGED
@@ -4,9 +4,9 @@ module RelatonItu
|
|
4
4
|
# Hit.
|
5
5
|
class Hit < RelatonBib::Hit
|
6
6
|
# Parse page.
|
7
|
-
# @return [
|
7
|
+
# @return [RelatonItu::ItuBibliographicItem]
|
8
8
|
def fetch
|
9
|
-
@fetch ||= Scrapper.parse_page
|
9
|
+
@fetch ||= Scrapper.parse_page hit, hit_collection.gi_imp
|
10
10
|
end
|
11
11
|
end
|
12
12
|
end
|
@@ -7,16 +7,39 @@ require "net/http"
|
|
7
7
|
module RelatonItu
|
8
8
|
# Page of hit collection.
|
9
9
|
class HitCollection < RelatonBib::HitCollection
|
10
|
-
DOMAIN = "https://www.itu.int"
|
10
|
+
DOMAIN = "https://www.itu.int"
|
11
11
|
|
12
|
-
# @
|
12
|
+
# @return [TrueClass, FalseClass]
|
13
|
+
attr_reader :gi_imp
|
14
|
+
|
15
|
+
# @param ref [String]
|
13
16
|
# @param year [String]
|
14
|
-
def initialize(
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
17
|
+
def initialize(ref, year = nil)
|
18
|
+
text = ref.sub /(?<=\.)Imp\s?(?=\d)/, ""
|
19
|
+
super text, year
|
20
|
+
@gi_imp = /\.Imp\d/.match?(ref)
|
21
|
+
uri = URI "#{DOMAIN}/net4/ITU-T/search/GlobalSearch/Search"
|
22
|
+
data = { json: params.to_json }
|
23
|
+
resp = Net::HTTP.post(uri, data.to_json,
|
24
|
+
"Content-Type" => "application/json")
|
25
|
+
@array = hits JSON.parse(resp.body)
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
# @return [String]
|
31
|
+
def group
|
32
|
+
@group ||= if %r{(OB|Operational Bulletin) No} =~ text then "Publications"
|
33
|
+
else "Recommendations"
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
# rubocop:disable Metrics/MethodLength
|
38
|
+
|
39
|
+
# @return [Hash]
|
40
|
+
def params
|
41
|
+
{
|
42
|
+
"Input" => text,
|
20
43
|
"Start" => 0,
|
21
44
|
"Rows" => 10,
|
22
45
|
"SortBy" => "RELEVANCE",
|
@@ -61,10 +84,13 @@ module RelatonItu
|
|
61
84
|
"IP" => "",
|
62
85
|
"SearchType" => "All",
|
63
86
|
}
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
87
|
+
end
|
88
|
+
# rubocop:enable Metrics/MethodLength
|
89
|
+
|
90
|
+
# @param data [Hash]
|
91
|
+
# @return [Array<RelatonItu::Hit>]
|
92
|
+
def hits(data)
|
93
|
+
data["results"].map do |h|
|
68
94
|
code = h["Media"]["Name"]
|
69
95
|
title = h["Title"]
|
70
96
|
url = h["Redirection"]
|
@@ -19,9 +19,9 @@ module RelatonItu
|
|
19
19
|
# @return [RelatonItu::HitCollection]
|
20
20
|
def search(text, year = nil)
|
21
21
|
HitCollection.new text, year
|
22
|
-
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
23
|
-
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
24
|
-
OpenSSL::SSL::SSLError
|
22
|
+
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
23
|
+
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
24
|
+
Net::ProtocolError, OpenSSL::SSL::SSLError
|
25
25
|
raise RelatonBib::RequestError, "Could not access http://www.itu.int"
|
26
26
|
end
|
27
27
|
|
@@ -66,17 +66,17 @@ module RelatonItu
|
|
66
66
|
nil
|
67
67
|
end
|
68
68
|
|
69
|
-
def fetch_pages(hits, threads)
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
end
|
69
|
+
# def fetch_pages(hits, threads)
|
70
|
+
# workers = RelatonBib::WorkersPool.new threads
|
71
|
+
# workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
|
72
|
+
# hits.each_with_index { |hit, i| workers << { i: i, hit: hit } }
|
73
|
+
# workers.end
|
74
|
+
# workers.result.sort_by { |a| a[:i] }.map { |x| x[:hit] }
|
75
|
+
# end
|
76
76
|
|
77
77
|
def search_filter(code)
|
78
|
-
docidrx = %r{\w
|
79
|
-
c = code.match(docidrx).to_s
|
78
|
+
docidrx = %r{\w+\.\d+|\w\sSuppl\.\s\d+} # %r{^ITU-T\s[^\s]+}
|
79
|
+
c = code.sub(/Imp\s?/, "").match(docidrx).to_s
|
80
80
|
warn "[relaton-itu] (\"#{code}\") fetching..."
|
81
81
|
result = search(code)
|
82
82
|
result.select do |i|
|
@@ -93,16 +93,18 @@ module RelatonItu
|
|
93
93
|
# If no match, returns any years which caused mismatch, for error reporting
|
94
94
|
def isobib_results_filter(result, year)
|
95
95
|
missed_years = []
|
96
|
-
result.each_slice(3) do |s| # ISO website only allows 3 connections
|
97
|
-
|
98
|
-
|
96
|
+
# result.each_slice(3) do |s| # ISO website only allows 3 connections
|
97
|
+
# fetch_pages(s, 3).each do |r|
|
98
|
+
result.each do |r|
|
99
|
+
return { ret: r.fetch } if !year
|
99
100
|
|
100
|
-
|
101
|
-
|
101
|
+
/\(\d{2}\/(?<pyear>\d{4})\)/ =~ r.hit[:code]
|
102
|
+
# r.date.select { |d| d.type == "published" }.each do |d|
|
103
|
+
return { ret: r.fetch } if year == pyear
|
102
104
|
|
103
|
-
|
104
|
-
|
105
|
-
end
|
105
|
+
missed_years << pyear
|
106
|
+
# end
|
107
|
+
# end
|
106
108
|
end
|
107
109
|
{ years: missed_years }
|
108
110
|
end
|
data/lib/relaton_itu/scrapper.rb
CHANGED
@@ -3,16 +3,9 @@
|
|
3
3
|
require "nokogiri"
|
4
4
|
require "net/http"
|
5
5
|
|
6
|
-
# Capybara.request_driver :poltergeist do |app|
|
7
|
-
# Capybara::Poltergeist::Driver.new app, js_errors: false
|
8
|
-
# end
|
9
|
-
# Capybara.default_driver = :poltergeist
|
10
|
-
|
11
6
|
module RelatonItu
|
12
7
|
# Scrapper.
|
13
|
-
# rubocop:disable Metrics/ModuleLength
|
14
8
|
module Scrapper
|
15
|
-
DOMAIN = "https://www.itu.int"
|
16
9
|
ROMAN_MONTHS = %w[I II III IV V VI VII VIII IX X XI XII].freeze
|
17
10
|
|
18
11
|
TYPES = {
|
@@ -31,24 +24,19 @@ module RelatonItu
|
|
31
24
|
}.freeze
|
32
25
|
|
33
26
|
class << self
|
34
|
-
#
|
35
|
-
# @return [Array<Hash>]
|
36
|
-
# def get(text)
|
37
|
-
# iso_workers = WorkersPool.new 4
|
38
|
-
# iso_workers.worker { |hit| iso_worker(hit, iso_workers) }
|
39
|
-
# algolia_workers = start_algolia_search(text, iso_workers)
|
40
|
-
# iso_docs = iso_workers.result
|
41
|
-
# algolia_workers.end
|
42
|
-
# algolia_workers.result
|
43
|
-
# iso_docs
|
44
|
-
# end
|
27
|
+
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
45
28
|
|
46
29
|
# Parse page.
|
47
|
-
# @param
|
30
|
+
# @param hit_data [Hash]
|
48
31
|
# @return [Hash]
|
49
|
-
|
50
|
-
def parse_page(hit_data)
|
32
|
+
def parse_page(hit_data, imp = false)
|
51
33
|
url, doc = get_page hit_data[:url]
|
34
|
+
if imp
|
35
|
+
a = doc.at "//span[contains(@id, 'tab_ig_uc_rec')]/a"
|
36
|
+
return unless a
|
37
|
+
|
38
|
+
url, doc = get_page URI.join(url, a[:href]).to_s
|
39
|
+
end
|
52
40
|
|
53
41
|
# Fetch edition.
|
54
42
|
edition = doc.at("//table/tr/td/span[contains(@id, 'Label8')]/b")&.text
|
@@ -73,7 +61,7 @@ module RelatonItu
|
|
73
61
|
place: ["Geneva"],
|
74
62
|
)
|
75
63
|
end
|
76
|
-
# rubocop:enable Metrics/AbcSize
|
64
|
+
# rubocop:enable Metrics/AbcSize
|
77
65
|
|
78
66
|
private
|
79
67
|
|
@@ -96,37 +84,23 @@ module RelatonItu
|
|
96
84
|
}]
|
97
85
|
end
|
98
86
|
|
99
|
-
# Get langs.
|
100
|
-
# @param doc [Nokogiri::HTML::Document]
|
101
|
-
# @return [Array<Hash>]
|
102
|
-
# def langs(doc)
|
103
|
-
# lgs = [{ lang: 'en' }]
|
104
|
-
# doc.css('ul#lang-switcher ul li a').each do |lang_link|
|
105
|
-
# lang_path = lang_link.attr('href')
|
106
|
-
# lang = lang_path.match(%r{^\/(fr)\/})
|
107
|
-
# lgs << { lang: lang[1], path: lang_path } if lang
|
108
|
-
# end
|
109
|
-
# lgs
|
110
|
-
# end
|
111
|
-
|
112
|
-
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
113
87
|
# Get page.
|
114
88
|
# @param path [String] page's path
|
115
|
-
# @return [Array<Nokogiri::HTML::Document
|
89
|
+
# @return [Array<String, Nokogiri::HTML::Document>]
|
116
90
|
def get_page(url)
|
117
91
|
uri = URI url
|
118
|
-
resp = Net::HTTP.get_response(uri)
|
92
|
+
resp = Net::HTTP.get_response(uri)
|
119
93
|
until resp.code == "200"
|
120
94
|
uri = URI resp["location"] if resp.code =~ /^30/
|
121
|
-
resp = Net::HTTP.get_response(uri)
|
95
|
+
resp = Net::HTTP.get_response(uri)
|
122
96
|
end
|
123
97
|
[uri.to_s, Nokogiri::HTML(resp.body)]
|
124
|
-
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
125
|
-
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
126
|
-
OpenSSL::SSL::SSLError
|
98
|
+
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
99
|
+
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
100
|
+
Net::ProtocolError, OpenSSL::SSL::SSLError
|
127
101
|
raise RelatonBib::RequestError, "Could not access #{url}"
|
128
102
|
end
|
129
|
-
# rubocop:enable Metrics/
|
103
|
+
# rubocop:enable Metrics/MethodLength
|
130
104
|
|
131
105
|
# Fetch docid.
|
132
106
|
# @param doc [Nokogiri::HTML::Document]
|
@@ -135,9 +109,11 @@ module RelatonItu
|
|
135
109
|
doc.xpath(
|
136
110
|
"//span[@id='ctl00_content_main_uc_rec_main_info1_rpt_main_ctl00_lbl_rec']",
|
137
111
|
"//td[.='Identical standard:']/following-sibling::td",
|
112
|
+
"//div/table[1]/tr[4]/td/strong",
|
138
113
|
).map do |code|
|
139
|
-
id = code.text.match(%r{^.*?(?= \()}).to_s.squeeze(" ")
|
114
|
+
id = code.text.match(%r{^.*?(?= \()|\w\.Imp\s?\d+}).to_s.squeeze(" ")
|
140
115
|
type = id.match(%r{^\w+}).to_s
|
116
|
+
type = "ITU" if type == "G"
|
141
117
|
RelatonBib::DocumentIdentifier.new(type: type, id: id)
|
142
118
|
end
|
143
119
|
end
|
@@ -146,10 +122,11 @@ module RelatonItu
|
|
146
122
|
# @param doc [Nokogiri::HTML::Document]
|
147
123
|
# @return [RelatonBib::DocumentStatus, NilClass]
|
148
124
|
def fetch_status(doc)
|
149
|
-
s = doc.at("//table/tr/td/span[contains(@id, 'Label7')]"
|
125
|
+
s = doc.at("//table/tr/td/span[contains(@id, 'Label7')]",
|
126
|
+
"//p[contains(.,'Status :')]")
|
150
127
|
return unless s
|
151
128
|
|
152
|
-
status = s.text
|
129
|
+
status = s.text.include?("In force") ? "Published" : "Withdrawal"
|
153
130
|
RelatonBib::DocumentStatus.new(stage: status)
|
154
131
|
end
|
155
132
|
|
@@ -191,9 +168,7 @@ module RelatonItu
|
|
191
168
|
# @return [Array<Hash>]
|
192
169
|
def fetch_relations(doc)
|
193
170
|
doc.xpath('//div[contains(@id, "tab_sup")]//table/tr[position()>2]').map do |r|
|
194
|
-
# r_type = r.at('./td/span[contains(@id, "Label4")]/nobr').text.downcase
|
195
171
|
ref = r.at('./td/span[contains(@id, "title_e")]/nobr/a')
|
196
|
-
# url = DOMAIN + ref[:href].sub(/^\./, "/ITU-T/recommendations")
|
197
172
|
fref = RelatonBib::FormattedRef.new(content: ref.text, language: "en", script: "Latn")
|
198
173
|
bibitem = RelatonIsoBib::IsoBibliographicItem.new(formattedref: fref)
|
199
174
|
{ type: "complements", bibitem: bibitem }
|
@@ -201,22 +176,14 @@ module RelatonItu
|
|
201
176
|
end
|
202
177
|
# rubocop:enable Metrics/MethodLength
|
203
178
|
|
204
|
-
# Fetch type.
|
205
|
-
# @param doc [Nokogiri::HTML::Document]
|
206
|
-
# @return [String]
|
207
|
-
# def fetch_type(_doc)
|
208
|
-
# "recommendation"
|
209
|
-
# end
|
210
|
-
|
211
179
|
# Fetch titles.
|
212
180
|
# @param doc [Nokogiri::HTML::Document]
|
213
181
|
# @return [Array<Hash>]
|
214
182
|
def fetch_titles(doc)
|
215
|
-
|
216
|
-
# t = hit_data[:title] if t.empty?
|
217
|
-
t = doc.at("//td[@class='title']")
|
183
|
+
t = doc.at("//td[@class='title']|//div/table[1]/tr[4]/td/strong")
|
218
184
|
return [] unless t
|
219
|
-
|
185
|
+
|
186
|
+
titles = t.text.sub(/\w\.Imp\s?\d+\u00A0:\u00A0/, "").split " - "
|
220
187
|
case titles.size
|
221
188
|
when 0
|
222
189
|
intro, main, part = nil, "", nil
|
@@ -247,10 +214,11 @@ module RelatonItu
|
|
247
214
|
# @return [Array<Hash>]
|
248
215
|
def fetch_dates(doc)
|
249
216
|
dates = []
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
217
|
+
date = doc.at("//table/tr/td/span[contains(@id, 'Label5')]",
|
218
|
+
"//p[contains(.,'Approved in')]")
|
219
|
+
pdate = date&.text&.match(/\d{4}-\d{2}-\d{2}/).to_s || ob_date(doc)
|
220
|
+
if pdate && !pdate&.empty?
|
221
|
+
dates << { type: "published", on: pdate }
|
254
222
|
end
|
255
223
|
dates
|
256
224
|
end
|
@@ -278,36 +246,41 @@ module RelatonItu
|
|
278
246
|
# @param doc [Nokogiri::HTML::Document]
|
279
247
|
# @return [Array<Hash>]
|
280
248
|
def fetch_contributors(code)
|
249
|
+
return [] unless code
|
250
|
+
|
281
251
|
abbrev = code.sub(/-\w\s.*/, "")
|
282
252
|
case abbrev
|
283
253
|
when "ITU"
|
284
254
|
name = "International Telecommunication Union"
|
285
255
|
url = "www.itu.int"
|
286
256
|
end
|
287
|
-
[{ entity: { name: name, url: url, abbreviation: abbrev },
|
257
|
+
[{ entity: { name: name, url: url, abbreviation: abbrev },
|
258
|
+
role: [type: "publisher"] }]
|
288
259
|
end
|
289
260
|
|
290
|
-
# Fetch ICS.
|
291
|
-
# @param doc [Nokogiri::HTML::Document]
|
292
|
-
# @return [Array<Hash>]
|
293
|
-
# def fetch_ics(doc)
|
294
|
-
# doc.xpath('//th[contains(text(), "ICS")]/following-sibling::td/a').map do |i|
|
295
|
-
# code = i.text.match(/[\d\.]+/).to_s.split '.'
|
296
|
-
# { field: code[0], group: code[1], subgroup: code[2] }
|
297
|
-
# end
|
298
|
-
# end
|
299
|
-
|
300
261
|
# Fetch links.
|
301
262
|
# @param doc [Nokogiri::HTML::Document]
|
302
263
|
# @param url [String]
|
303
264
|
# @return [Array<Hash>]
|
304
265
|
def fetch_link(doc, url)
|
305
266
|
links = [{ type: "src", content: url }]
|
306
|
-
|
307
|
-
|
267
|
+
obp_elm = doc.at(
|
268
|
+
'//a[@title="Persistent link to download the PDF file"]',
|
269
|
+
"//font[contains(.,'PDF')]/../..",
|
270
|
+
)
|
271
|
+
links << typed_link("obp", obp_elm) if obp_elm
|
272
|
+
wrd_elm = doc.at("//font[contains(.,'Word')]/../..")
|
273
|
+
links << typed_link("word", wrd_elm) if wrd_elm
|
308
274
|
links
|
309
275
|
end
|
310
276
|
|
277
|
+
def typed_link(type, elm)
|
278
|
+
{
|
279
|
+
type: type,
|
280
|
+
content: URI.join(HitCollection::DOMAIN + elm[:href].strip).to_s,
|
281
|
+
}
|
282
|
+
end
|
283
|
+
|
311
284
|
# Fetch copyright.
|
312
285
|
# @param code [String]
|
313
286
|
# @param doc [Nokogiri::HTML::Document]
|
@@ -325,5 +298,4 @@ module RelatonItu
|
|
325
298
|
end
|
326
299
|
end
|
327
300
|
end
|
328
|
-
# rubocop:enable Metrics/ModuleLength
|
329
301
|
end
|
data/lib/relaton_itu/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-itu
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.
|
4
|
+
version: 1.0.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-04
|
11
|
+
date: 2020-05-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: debase
|