relaton-itu 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cbdb4a303ff1173ad845aa15185093b9f93caaf1b3dfce4cde8dae9094bf8def
4
- data.tar.gz: 8a096c17b4b6863596996556eb4319ee9451b6dadbf850c78578abb40424c380
3
+ metadata.gz: 169f68ca9de0a9e01f2130919807393d53507a2fd92f98a7043337ee9c037a18
4
+ data.tar.gz: a5fc6f91b1d6c6af3b25919c77f2a3f1cc57bbc527983207d5163088ba01ea6c
5
5
  SHA512:
6
- metadata.gz: ec3aed1ca2c9ba554edd6990df967369104f23efa8286ca529f0d0baffefb753c50c3b54d9030d3b65f34efcbeb096bf4674cc435e8755b5584d38e6ab485d36
7
- data.tar.gz: 18baf5f6f7b7b3473af763b286794d009350995b99a7cc7a76f343f360e5098a9ea795cda8eac36bf105328d02e88b730571e13bb793fc0ad60250979d67dbe3
6
+ metadata.gz: 3243b1b99b363cb8bb773320ad5c70f0e48de661b4db92c1e3a96991de163ff40dd6179f974ebd80507ad2b37aca4ad0e20ca549c880787752128517a4dd34a9
7
+ data.tar.gz: d40ffddd2ef5a4ec569f92ae6253148b85ae434b38fa31ec2eb27681bf3e948096137b0acf86535fade15e4ccc79e538668a6371dbb575416a8a842739cdcf97
@@ -2,11 +2,6 @@ require "relaton_itu/version"
2
2
  require "relaton_itu/itu_bibliography"
3
3
  require "digest/md5"
4
4
 
5
- # if defined? Relaton
6
- # require_relative "relaton/processor"
7
- # Relaton::Registry.instance.register(Relaton::RelatonItu::Processor)
8
- # end
9
-
10
5
  module RelatonItu
11
6
  class Error < StandardError; end
12
7
 
@@ -4,9 +4,9 @@ module RelatonItu
4
4
  # Hit.
5
5
  class Hit < RelatonBib::Hit
6
6
  # Parse page.
7
- # @return [Isobib::IsoBibliographicItem]
7
+ # @return [RelatonItu::ItuBibliographicItem]
8
8
  def fetch
9
- @fetch ||= Scrapper.parse_page @hit
9
+ @fetch ||= Scrapper.parse_page hit, hit_collection.gi_imp
10
10
  end
11
11
  end
12
12
  end
@@ -7,16 +7,39 @@ require "net/http"
7
7
  module RelatonItu
8
8
  # Page of hit collection.
9
9
  class HitCollection < RelatonBib::HitCollection
10
- DOMAIN = "https://www.itu.int".freeze
10
+ DOMAIN = "https://www.itu.int"
11
11
 
12
- # @param ref_nbr [String]
12
+ # @return [TrueClass, FalseClass]
13
+ attr_reader :gi_imp
14
+
15
+ # @param ref [String]
13
16
  # @param year [String]
14
- def initialize(ref_nbr, year = nil)
15
- super
16
- group = %r{(OB|Operational Bulletin) No} =~ text ? "Publications" : "Recommendations"
17
- url = "#{DOMAIN}/net4/ITU-T/search/GlobalSearch/Search"
18
- params = {
19
- "Input" => ref_nbr,
17
+ def initialize(ref, year = nil)
18
+ text = ref.sub /(?<=\.)Imp\s?(?=\d)/, ""
19
+ super text, year
20
+ @gi_imp = /\.Imp\d/.match?(ref)
21
+ uri = URI "#{DOMAIN}/net4/ITU-T/search/GlobalSearch/Search"
22
+ data = { json: params.to_json }
23
+ resp = Net::HTTP.post(uri, data.to_json,
24
+ "Content-Type" => "application/json")
25
+ @array = hits JSON.parse(resp.body)
26
+ end
27
+
28
+ private
29
+
30
+ # @return [String]
31
+ def group
32
+ @group ||= if %r{(OB|Operational Bulletin) No} =~ text then "Publications"
33
+ else "Recommendations"
34
+ end
35
+ end
36
+
37
+ # rubocop:disable Metrics/MethodLength
38
+
39
+ # @return [Hash]
40
+ def params
41
+ {
42
+ "Input" => text,
20
43
  "Start" => 0,
21
44
  "Rows" => 10,
22
45
  "SortBy" => "RELEVANCE",
@@ -61,10 +84,13 @@ module RelatonItu
61
84
  "IP" => "",
62
85
  "SearchType" => "All",
63
86
  }
64
- data = { json: params.to_json }
65
- resp = Net::HTTP.post(URI(url), data.to_json, "Content-Type" => "application/json")
66
- doc = JSON.parse resp.body
67
- @array = doc["results"].map do |h|
87
+ end
88
+ # rubocop:enable Metrics/MethodLength
89
+
90
+ # @param data [Hash]
91
+ # @return [Array<RelatonItu::Hit>]
92
+ def hits(data)
93
+ data["results"].map do |h|
68
94
  code = h["Media"]["Name"]
69
95
  title = h["Title"]
70
96
  url = h["Redirection"]
@@ -19,9 +19,9 @@ module RelatonItu
19
19
  # @return [RelatonItu::HitCollection]
20
20
  def search(text, year = nil)
21
21
  HitCollection.new text, year
22
- rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
23
- Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError,
24
- OpenSSL::SSL::SSLError
22
+ rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
23
+ EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
24
+ Net::ProtocolError, OpenSSL::SSL::SSLError
25
25
  raise RelatonBib::RequestError, "Could not access http://www.itu.int"
26
26
  end
27
27
 
@@ -66,17 +66,17 @@ module RelatonItu
66
66
  nil
67
67
  end
68
68
 
69
- def fetch_pages(hits, threads)
70
- workers = RelatonBib::WorkersPool.new threads
71
- workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
72
- hits.each_with_index { |hit, i| workers << { i: i, hit: hit } }
73
- workers.end
74
- workers.result.sort_by { |a| a[:i] }.map { |x| x[:hit] }
75
- end
69
+ # def fetch_pages(hits, threads)
70
+ # workers = RelatonBib::WorkersPool.new threads
71
+ # workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
72
+ # hits.each_with_index { |hit, i| workers << { i: i, hit: hit } }
73
+ # workers.end
74
+ # workers.result.sort_by { |a| a[:i] }.map { |x| x[:hit] }
75
+ # end
76
76
 
77
77
  def search_filter(code)
78
- docidrx = %r{\w+.\d+|\w\sSuppl\.\s\d+} # %r{^ITU-T\s[^\s]+}
79
- c = code.match(docidrx).to_s
78
+ docidrx = %r{\w+\.\d+|\w\sSuppl\.\s\d+} # %r{^ITU-T\s[^\s]+}
79
+ c = code.sub(/Imp\s?/, "").match(docidrx).to_s
80
80
  warn "[relaton-itu] (\"#{code}\") fetching..."
81
81
  result = search(code)
82
82
  result.select do |i|
@@ -93,16 +93,18 @@ module RelatonItu
93
93
  # If no match, returns any years which caused mismatch, for error reporting
94
94
  def isobib_results_filter(result, year)
95
95
  missed_years = []
96
- result.each_slice(3) do |s| # ISO website only allows 3 connections
97
- fetch_pages(s, 3).each do |r|
98
- return { ret: r } if !year
96
+ # result.each_slice(3) do |s| # ISO website only allows 3 connections
97
+ # fetch_pages(s, 3).each do |r|
98
+ result.each do |r|
99
+ return { ret: r.fetch } if !year
99
100
 
100
- r.date.select { |d| d.type == "published" }.each do |d|
101
- return { ret: r } if year.to_i == d.on.year
101
+ /\(\d{2}\/(?<pyear>\d{4})\)/ =~ r.hit[:code]
102
+ # r.date.select { |d| d.type == "published" }.each do |d|
103
+ return { ret: r.fetch } if year == pyear
102
104
 
103
- missed_years << d.on.year
104
- end
105
- end
105
+ missed_years << pyear
106
+ # end
107
+ # end
106
108
  end
107
109
  { years: missed_years }
108
110
  end
@@ -3,16 +3,9 @@
3
3
  require "nokogiri"
4
4
  require "net/http"
5
5
 
6
- # Capybara.request_driver :poltergeist do |app|
7
- # Capybara::Poltergeist::Driver.new app, js_errors: false
8
- # end
9
- # Capybara.default_driver = :poltergeist
10
-
11
6
  module RelatonItu
12
7
  # Scrapper.
13
- # rubocop:disable Metrics/ModuleLength
14
8
  module Scrapper
15
- DOMAIN = "https://www.itu.int"
16
9
  ROMAN_MONTHS = %w[I II III IV V VI VII VIII IX X XI XII].freeze
17
10
 
18
11
  TYPES = {
@@ -31,24 +24,19 @@ module RelatonItu
31
24
  }.freeze
32
25
 
33
26
  class << self
34
- # @param text [String]
35
- # @return [Array<Hash>]
36
- # def get(text)
37
- # iso_workers = WorkersPool.new 4
38
- # iso_workers.worker { |hit| iso_worker(hit, iso_workers) }
39
- # algolia_workers = start_algolia_search(text, iso_workers)
40
- # iso_docs = iso_workers.result
41
- # algolia_workers.end
42
- # algolia_workers.result
43
- # iso_docs
44
- # end
27
+ # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
45
28
 
46
29
  # Parse page.
47
- # @param hit [Hash]
30
+ # @param hit_data [Hash]
48
31
  # @return [Hash]
49
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
50
- def parse_page(hit_data)
32
+ def parse_page(hit_data, imp = false)
51
33
  url, doc = get_page hit_data[:url]
34
+ if imp
35
+ a = doc.at "//span[contains(@id, 'tab_ig_uc_rec')]/a"
36
+ return unless a
37
+
38
+ url, doc = get_page URI.join(url, a[:href]).to_s
39
+ end
52
40
 
53
41
  # Fetch edition.
54
42
  edition = doc.at("//table/tr/td/span[contains(@id, 'Label8')]/b")&.text
@@ -73,7 +61,7 @@ module RelatonItu
73
61
  place: ["Geneva"],
74
62
  )
75
63
  end
76
- # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
64
+ # rubocop:enable Metrics/AbcSize
77
65
 
78
66
  private
79
67
 
@@ -96,37 +84,23 @@ module RelatonItu
96
84
  }]
97
85
  end
98
86
 
99
- # Get langs.
100
- # @param doc [Nokogiri::HTML::Document]
101
- # @return [Array<Hash>]
102
- # def langs(doc)
103
- # lgs = [{ lang: 'en' }]
104
- # doc.css('ul#lang-switcher ul li a').each do |lang_link|
105
- # lang_path = lang_link.attr('href')
106
- # lang = lang_path.match(%r{^\/(fr)\/})
107
- # lgs << { lang: lang[1], path: lang_path } if lang
108
- # end
109
- # lgs
110
- # end
111
-
112
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
113
87
  # Get page.
114
88
  # @param path [String] page's path
115
- # @return [Array<Nokogiri::HTML::Document, String>]
89
+ # @return [Array<String, Nokogiri::HTML::Document>]
116
90
  def get_page(url)
117
91
  uri = URI url
118
- resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
92
+ resp = Net::HTTP.get_response(uri)
119
93
  until resp.code == "200"
120
94
  uri = URI resp["location"] if resp.code =~ /^30/
121
- resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
95
+ resp = Net::HTTP.get_response(uri)
122
96
  end
123
97
  [uri.to_s, Nokogiri::HTML(resp.body)]
124
- rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
125
- Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError,
126
- OpenSSL::SSL::SSLError
98
+ rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
99
+ EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
100
+ Net::ProtocolError, OpenSSL::SSL::SSLError
127
101
  raise RelatonBib::RequestError, "Could not access #{url}"
128
102
  end
129
- # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
103
+ # rubocop:enable Metrics/MethodLength
130
104
 
131
105
  # Fetch docid.
132
106
  # @param doc [Nokogiri::HTML::Document]
@@ -135,9 +109,11 @@ module RelatonItu
135
109
  doc.xpath(
136
110
  "//span[@id='ctl00_content_main_uc_rec_main_info1_rpt_main_ctl00_lbl_rec']",
137
111
  "//td[.='Identical standard:']/following-sibling::td",
112
+ "//div/table[1]/tr[4]/td/strong",
138
113
  ).map do |code|
139
- id = code.text.match(%r{^.*?(?= \()}).to_s.squeeze(" ")
114
+ id = code.text.match(%r{^.*?(?= \()|\w\.Imp\s?\d+}).to_s.squeeze(" ")
140
115
  type = id.match(%r{^\w+}).to_s
116
+ type = "ITU" if type == "G"
141
117
  RelatonBib::DocumentIdentifier.new(type: type, id: id)
142
118
  end
143
119
  end
@@ -146,10 +122,11 @@ module RelatonItu
146
122
  # @param doc [Nokogiri::HTML::Document]
147
123
  # @return [RelatonBib::DocumentStatus, NilClass]
148
124
  def fetch_status(doc)
149
- s = doc.at("//table/tr/td/span[contains(@id, 'Label7')]")
125
+ s = doc.at("//table/tr/td/span[contains(@id, 'Label7')]",
126
+ "//p[contains(.,'Status :')]")
150
127
  return unless s
151
128
 
152
- status = s.text == "In force" ? "Published" : "Withdrawal"
129
+ status = s.text.include?("In force") ? "Published" : "Withdrawal"
153
130
  RelatonBib::DocumentStatus.new(stage: status)
154
131
  end
155
132
 
@@ -191,9 +168,7 @@ module RelatonItu
191
168
  # @return [Array<Hash>]
192
169
  def fetch_relations(doc)
193
170
  doc.xpath('//div[contains(@id, "tab_sup")]//table/tr[position()>2]').map do |r|
194
- # r_type = r.at('./td/span[contains(@id, "Label4")]/nobr').text.downcase
195
171
  ref = r.at('./td/span[contains(@id, "title_e")]/nobr/a')
196
- # url = DOMAIN + ref[:href].sub(/^\./, "/ITU-T/recommendations")
197
172
  fref = RelatonBib::FormattedRef.new(content: ref.text, language: "en", script: "Latn")
198
173
  bibitem = RelatonIsoBib::IsoBibliographicItem.new(formattedref: fref)
199
174
  { type: "complements", bibitem: bibitem }
@@ -201,22 +176,14 @@ module RelatonItu
201
176
  end
202
177
  # rubocop:enable Metrics/MethodLength
203
178
 
204
- # Fetch type.
205
- # @param doc [Nokogiri::HTML::Document]
206
- # @return [String]
207
- # def fetch_type(_doc)
208
- # "recommendation"
209
- # end
210
-
211
179
  # Fetch titles.
212
180
  # @param doc [Nokogiri::HTML::Document]
213
181
  # @return [Array<Hash>]
214
182
  def fetch_titles(doc)
215
- # t = hit_data[:title].match(%r{(?<=\(\d{2}\/\d{4}\): ).*}).to_s
216
- # t = hit_data[:title] if t.empty?
217
- t = doc.at("//td[@class='title']")
183
+ t = doc.at("//td[@class='title']|//div/table[1]/tr[4]/td/strong")
218
184
  return [] unless t
219
- titles = t.text.split " - "
185
+
186
+ titles = t.text.sub(/\w\.Imp\s?\d+\u00A0:\u00A0/, "").split " - "
220
187
  case titles.size
221
188
  when 0
222
189
  intro, main, part = nil, "", nil
@@ -247,10 +214,11 @@ module RelatonItu
247
214
  # @return [Array<Hash>]
248
215
  def fetch_dates(doc)
249
216
  dates = []
250
- pdate = doc.at("//table/tr/td/span[contains(@id, 'Label5')]")
251
- publish_date = pdate&.text || ob_date(doc)
252
- if publish_date && !publish_date&.empty?
253
- dates << { type: "published", on: publish_date }
217
+ date = doc.at("//table/tr/td/span[contains(@id, 'Label5')]",
218
+ "//p[contains(.,'Approved in')]")
219
+ pdate = date&.text&.match(/\d{4}-\d{2}-\d{2}/).to_s || ob_date(doc)
220
+ if pdate && !pdate&.empty?
221
+ dates << { type: "published", on: pdate }
254
222
  end
255
223
  dates
256
224
  end
@@ -278,36 +246,41 @@ module RelatonItu
278
246
  # @param doc [Nokogiri::HTML::Document]
279
247
  # @return [Array<Hash>]
280
248
  def fetch_contributors(code)
249
+ return [] unless code
250
+
281
251
  abbrev = code.sub(/-\w\s.*/, "")
282
252
  case abbrev
283
253
  when "ITU"
284
254
  name = "International Telecommunication Union"
285
255
  url = "www.itu.int"
286
256
  end
287
- [{ entity: { name: name, url: url, abbreviation: abbrev }, role: [type: "publisher"] }]
257
+ [{ entity: { name: name, url: url, abbreviation: abbrev },
258
+ role: [type: "publisher"] }]
288
259
  end
289
260
 
290
- # Fetch ICS.
291
- # @param doc [Nokogiri::HTML::Document]
292
- # @return [Array<Hash>]
293
- # def fetch_ics(doc)
294
- # doc.xpath('//th[contains(text(), "ICS")]/following-sibling::td/a').map do |i|
295
- # code = i.text.match(/[\d\.]+/).to_s.split '.'
296
- # { field: code[0], group: code[1], subgroup: code[2] }
297
- # end
298
- # end
299
-
300
261
  # Fetch links.
301
262
  # @param doc [Nokogiri::HTML::Document]
302
263
  # @param url [String]
303
264
  # @return [Array<Hash>]
304
265
  def fetch_link(doc, url)
305
266
  links = [{ type: "src", content: url }]
306
- obp_elms = doc.at('//a[@title="Persistent link to download the PDF file"]')
307
- links << { type: "obp", content: DOMAIN + obp_elms[:href].strip } if obp_elms
267
+ obp_elm = doc.at(
268
+ '//a[@title="Persistent link to download the PDF file"]',
269
+ "//font[contains(.,'PDF')]/../..",
270
+ )
271
+ links << typed_link("obp", obp_elm) if obp_elm
272
+ wrd_elm = doc.at("//font[contains(.,'Word')]/../..")
273
+ links << typed_link("word", wrd_elm) if wrd_elm
308
274
  links
309
275
  end
310
276
 
277
+ def typed_link(type, elm)
278
+ {
279
+ type: type,
280
+ content: URI.join(HitCollection::DOMAIN + elm[:href].strip).to_s,
281
+ }
282
+ end
283
+
311
284
  # Fetch copyright.
312
285
  # @param code [String]
313
286
  # @param doc [Nokogiri::HTML::Document]
@@ -325,5 +298,4 @@ module RelatonItu
325
298
  end
326
299
  end
327
300
  end
328
- # rubocop:enable Metrics/ModuleLength
329
301
  end
@@ -1,3 +1,3 @@
1
1
  module RelatonItu
2
- VERSION = "1.0.0".freeze
2
+ VERSION = "1.0.1".freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-itu
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-04-25 00:00:00.000000000 Z
11
+ date: 2020-05-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: debase