relaton-itu 1.0.0 → 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cbdb4a303ff1173ad845aa15185093b9f93caaf1b3dfce4cde8dae9094bf8def
4
- data.tar.gz: 8a096c17b4b6863596996556eb4319ee9451b6dadbf850c78578abb40424c380
3
+ metadata.gz: 169f68ca9de0a9e01f2130919807393d53507a2fd92f98a7043337ee9c037a18
4
+ data.tar.gz: a5fc6f91b1d6c6af3b25919c77f2a3f1cc57bbc527983207d5163088ba01ea6c
5
5
  SHA512:
6
- metadata.gz: ec3aed1ca2c9ba554edd6990df967369104f23efa8286ca529f0d0baffefb753c50c3b54d9030d3b65f34efcbeb096bf4674cc435e8755b5584d38e6ab485d36
7
- data.tar.gz: 18baf5f6f7b7b3473af763b286794d009350995b99a7cc7a76f343f360e5098a9ea795cda8eac36bf105328d02e88b730571e13bb793fc0ad60250979d67dbe3
6
+ metadata.gz: 3243b1b99b363cb8bb773320ad5c70f0e48de661b4db92c1e3a96991de163ff40dd6179f974ebd80507ad2b37aca4ad0e20ca549c880787752128517a4dd34a9
7
+ data.tar.gz: d40ffddd2ef5a4ec569f92ae6253148b85ae434b38fa31ec2eb27681bf3e948096137b0acf86535fade15e4ccc79e538668a6371dbb575416a8a842739cdcf97
@@ -2,11 +2,6 @@ require "relaton_itu/version"
2
2
  require "relaton_itu/itu_bibliography"
3
3
  require "digest/md5"
4
4
 
5
- # if defined? Relaton
6
- # require_relative "relaton/processor"
7
- # Relaton::Registry.instance.register(Relaton::RelatonItu::Processor)
8
- # end
9
-
10
5
  module RelatonItu
11
6
  class Error < StandardError; end
12
7
 
@@ -4,9 +4,9 @@ module RelatonItu
4
4
  # Hit.
5
5
  class Hit < RelatonBib::Hit
6
6
  # Parse page.
7
- # @return [Isobib::IsoBibliographicItem]
7
+ # @return [RelatonItu::ItuBibliographicItem]
8
8
  def fetch
9
- @fetch ||= Scrapper.parse_page @hit
9
+ @fetch ||= Scrapper.parse_page hit, hit_collection.gi_imp
10
10
  end
11
11
  end
12
12
  end
@@ -7,16 +7,39 @@ require "net/http"
7
7
  module RelatonItu
8
8
  # Page of hit collection.
9
9
  class HitCollection < RelatonBib::HitCollection
10
- DOMAIN = "https://www.itu.int".freeze
10
+ DOMAIN = "https://www.itu.int"
11
11
 
12
- # @param ref_nbr [String]
12
+ # @return [TrueClass, FalseClass]
13
+ attr_reader :gi_imp
14
+
15
+ # @param ref [String]
13
16
  # @param year [String]
14
- def initialize(ref_nbr, year = nil)
15
- super
16
- group = %r{(OB|Operational Bulletin) No} =~ text ? "Publications" : "Recommendations"
17
- url = "#{DOMAIN}/net4/ITU-T/search/GlobalSearch/Search"
18
- params = {
19
- "Input" => ref_nbr,
17
+ def initialize(ref, year = nil)
18
+ text = ref.sub /(?<=\.)Imp\s?(?=\d)/, ""
19
+ super text, year
20
+ @gi_imp = /\.Imp\d/.match?(ref)
21
+ uri = URI "#{DOMAIN}/net4/ITU-T/search/GlobalSearch/Search"
22
+ data = { json: params.to_json }
23
+ resp = Net::HTTP.post(uri, data.to_json,
24
+ "Content-Type" => "application/json")
25
+ @array = hits JSON.parse(resp.body)
26
+ end
27
+
28
+ private
29
+
30
+ # @return [String]
31
+ def group
32
+ @group ||= if %r{(OB|Operational Bulletin) No} =~ text then "Publications"
33
+ else "Recommendations"
34
+ end
35
+ end
36
+
37
+ # rubocop:disable Metrics/MethodLength
38
+
39
+ # @return [Hash]
40
+ def params
41
+ {
42
+ "Input" => text,
20
43
  "Start" => 0,
21
44
  "Rows" => 10,
22
45
  "SortBy" => "RELEVANCE",
@@ -61,10 +84,13 @@ module RelatonItu
61
84
  "IP" => "",
62
85
  "SearchType" => "All",
63
86
  }
64
- data = { json: params.to_json }
65
- resp = Net::HTTP.post(URI(url), data.to_json, "Content-Type" => "application/json")
66
- doc = JSON.parse resp.body
67
- @array = doc["results"].map do |h|
87
+ end
88
+ # rubocop:enable Metrics/MethodLength
89
+
90
+ # @param data [Hash]
91
+ # @return [Array<RelatonItu::Hit>]
92
+ def hits(data)
93
+ data["results"].map do |h|
68
94
  code = h["Media"]["Name"]
69
95
  title = h["Title"]
70
96
  url = h["Redirection"]
@@ -19,9 +19,9 @@ module RelatonItu
19
19
  # @return [RelatonItu::HitCollection]
20
20
  def search(text, year = nil)
21
21
  HitCollection.new text, year
22
- rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
23
- Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError,
24
- OpenSSL::SSL::SSLError
22
+ rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
23
+ EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
24
+ Net::ProtocolError, OpenSSL::SSL::SSLError
25
25
  raise RelatonBib::RequestError, "Could not access http://www.itu.int"
26
26
  end
27
27
 
@@ -66,17 +66,17 @@ module RelatonItu
66
66
  nil
67
67
  end
68
68
 
69
- def fetch_pages(hits, threads)
70
- workers = RelatonBib::WorkersPool.new threads
71
- workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
72
- hits.each_with_index { |hit, i| workers << { i: i, hit: hit } }
73
- workers.end
74
- workers.result.sort_by { |a| a[:i] }.map { |x| x[:hit] }
75
- end
69
+ # def fetch_pages(hits, threads)
70
+ # workers = RelatonBib::WorkersPool.new threads
71
+ # workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
72
+ # hits.each_with_index { |hit, i| workers << { i: i, hit: hit } }
73
+ # workers.end
74
+ # workers.result.sort_by { |a| a[:i] }.map { |x| x[:hit] }
75
+ # end
76
76
 
77
77
  def search_filter(code)
78
- docidrx = %r{\w+.\d+|\w\sSuppl\.\s\d+} # %r{^ITU-T\s[^\s]+}
79
- c = code.match(docidrx).to_s
78
+ docidrx = %r{\w+\.\d+|\w\sSuppl\.\s\d+} # %r{^ITU-T\s[^\s]+}
79
+ c = code.sub(/Imp\s?/, "").match(docidrx).to_s
80
80
  warn "[relaton-itu] (\"#{code}\") fetching..."
81
81
  result = search(code)
82
82
  result.select do |i|
@@ -93,16 +93,18 @@ module RelatonItu
93
93
  # If no match, returns any years which caused mismatch, for error reporting
94
94
  def isobib_results_filter(result, year)
95
95
  missed_years = []
96
- result.each_slice(3) do |s| # ISO website only allows 3 connections
97
- fetch_pages(s, 3).each do |r|
98
- return { ret: r } if !year
96
+ # result.each_slice(3) do |s| # ISO website only allows 3 connections
97
+ # fetch_pages(s, 3).each do |r|
98
+ result.each do |r|
99
+ return { ret: r.fetch } if !year
99
100
 
100
- r.date.select { |d| d.type == "published" }.each do |d|
101
- return { ret: r } if year.to_i == d.on.year
101
+ /\(\d{2}\/(?<pyear>\d{4})\)/ =~ r.hit[:code]
102
+ # r.date.select { |d| d.type == "published" }.each do |d|
103
+ return { ret: r.fetch } if year == pyear
102
104
 
103
- missed_years << d.on.year
104
- end
105
- end
105
+ missed_years << pyear
106
+ # end
107
+ # end
106
108
  end
107
109
  { years: missed_years }
108
110
  end
@@ -3,16 +3,9 @@
3
3
  require "nokogiri"
4
4
  require "net/http"
5
5
 
6
- # Capybara.request_driver :poltergeist do |app|
7
- # Capybara::Poltergeist::Driver.new app, js_errors: false
8
- # end
9
- # Capybara.default_driver = :poltergeist
10
-
11
6
  module RelatonItu
12
7
  # Scrapper.
13
- # rubocop:disable Metrics/ModuleLength
14
8
  module Scrapper
15
- DOMAIN = "https://www.itu.int"
16
9
  ROMAN_MONTHS = %w[I II III IV V VI VII VIII IX X XI XII].freeze
17
10
 
18
11
  TYPES = {
@@ -31,24 +24,19 @@ module RelatonItu
31
24
  }.freeze
32
25
 
33
26
  class << self
34
- # @param text [String]
35
- # @return [Array<Hash>]
36
- # def get(text)
37
- # iso_workers = WorkersPool.new 4
38
- # iso_workers.worker { |hit| iso_worker(hit, iso_workers) }
39
- # algolia_workers = start_algolia_search(text, iso_workers)
40
- # iso_docs = iso_workers.result
41
- # algolia_workers.end
42
- # algolia_workers.result
43
- # iso_docs
44
- # end
27
+ # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
45
28
 
46
29
  # Parse page.
47
- # @param hit [Hash]
30
+ # @param hit_data [Hash]
48
31
  # @return [Hash]
49
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
50
- def parse_page(hit_data)
32
+ def parse_page(hit_data, imp = false)
51
33
  url, doc = get_page hit_data[:url]
34
+ if imp
35
+ a = doc.at "//span[contains(@id, 'tab_ig_uc_rec')]/a"
36
+ return unless a
37
+
38
+ url, doc = get_page URI.join(url, a[:href]).to_s
39
+ end
52
40
 
53
41
  # Fetch edition.
54
42
  edition = doc.at("//table/tr/td/span[contains(@id, 'Label8')]/b")&.text
@@ -73,7 +61,7 @@ module RelatonItu
73
61
  place: ["Geneva"],
74
62
  )
75
63
  end
76
- # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
64
+ # rubocop:enable Metrics/AbcSize
77
65
 
78
66
  private
79
67
 
@@ -96,37 +84,23 @@ module RelatonItu
96
84
  }]
97
85
  end
98
86
 
99
- # Get langs.
100
- # @param doc [Nokogiri::HTML::Document]
101
- # @return [Array<Hash>]
102
- # def langs(doc)
103
- # lgs = [{ lang: 'en' }]
104
- # doc.css('ul#lang-switcher ul li a').each do |lang_link|
105
- # lang_path = lang_link.attr('href')
106
- # lang = lang_path.match(%r{^\/(fr)\/})
107
- # lgs << { lang: lang[1], path: lang_path } if lang
108
- # end
109
- # lgs
110
- # end
111
-
112
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
113
87
  # Get page.
114
88
  # @param path [String] page's path
115
- # @return [Array<Nokogiri::HTML::Document, String>]
89
+ # @return [Array<String, Nokogiri::HTML::Document>]
116
90
  def get_page(url)
117
91
  uri = URI url
118
- resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
92
+ resp = Net::HTTP.get_response(uri)
119
93
  until resp.code == "200"
120
94
  uri = URI resp["location"] if resp.code =~ /^30/
121
- resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
95
+ resp = Net::HTTP.get_response(uri)
122
96
  end
123
97
  [uri.to_s, Nokogiri::HTML(resp.body)]
124
- rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
125
- Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError,
126
- OpenSSL::SSL::SSLError
98
+ rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
99
+ EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
100
+ Net::ProtocolError, OpenSSL::SSL::SSLError
127
101
  raise RelatonBib::RequestError, "Could not access #{url}"
128
102
  end
129
- # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
103
+ # rubocop:enable Metrics/MethodLength
130
104
 
131
105
  # Fetch docid.
132
106
  # @param doc [Nokogiri::HTML::Document]
@@ -135,9 +109,11 @@ module RelatonItu
135
109
  doc.xpath(
136
110
  "//span[@id='ctl00_content_main_uc_rec_main_info1_rpt_main_ctl00_lbl_rec']",
137
111
  "//td[.='Identical standard:']/following-sibling::td",
112
+ "//div/table[1]/tr[4]/td/strong",
138
113
  ).map do |code|
139
- id = code.text.match(%r{^.*?(?= \()}).to_s.squeeze(" ")
114
+ id = code.text.match(%r{^.*?(?= \()|\w\.Imp\s?\d+}).to_s.squeeze(" ")
140
115
  type = id.match(%r{^\w+}).to_s
116
+ type = "ITU" if type == "G"
141
117
  RelatonBib::DocumentIdentifier.new(type: type, id: id)
142
118
  end
143
119
  end
@@ -146,10 +122,11 @@ module RelatonItu
146
122
  # @param doc [Nokogiri::HTML::Document]
147
123
  # @return [RelatonBib::DocumentStatus, NilClass]
148
124
  def fetch_status(doc)
149
- s = doc.at("//table/tr/td/span[contains(@id, 'Label7')]")
125
+ s = doc.at("//table/tr/td/span[contains(@id, 'Label7')]",
126
+ "//p[contains(.,'Status :')]")
150
127
  return unless s
151
128
 
152
- status = s.text == "In force" ? "Published" : "Withdrawal"
129
+ status = s.text.include?("In force") ? "Published" : "Withdrawal"
153
130
  RelatonBib::DocumentStatus.new(stage: status)
154
131
  end
155
132
 
@@ -191,9 +168,7 @@ module RelatonItu
191
168
  # @return [Array<Hash>]
192
169
  def fetch_relations(doc)
193
170
  doc.xpath('//div[contains(@id, "tab_sup")]//table/tr[position()>2]').map do |r|
194
- # r_type = r.at('./td/span[contains(@id, "Label4")]/nobr').text.downcase
195
171
  ref = r.at('./td/span[contains(@id, "title_e")]/nobr/a')
196
- # url = DOMAIN + ref[:href].sub(/^\./, "/ITU-T/recommendations")
197
172
  fref = RelatonBib::FormattedRef.new(content: ref.text, language: "en", script: "Latn")
198
173
  bibitem = RelatonIsoBib::IsoBibliographicItem.new(formattedref: fref)
199
174
  { type: "complements", bibitem: bibitem }
@@ -201,22 +176,14 @@ module RelatonItu
201
176
  end
202
177
  # rubocop:enable Metrics/MethodLength
203
178
 
204
- # Fetch type.
205
- # @param doc [Nokogiri::HTML::Document]
206
- # @return [String]
207
- # def fetch_type(_doc)
208
- # "recommendation"
209
- # end
210
-
211
179
  # Fetch titles.
212
180
  # @param doc [Nokogiri::HTML::Document]
213
181
  # @return [Array<Hash>]
214
182
  def fetch_titles(doc)
215
- # t = hit_data[:title].match(%r{(?<=\(\d{2}\/\d{4}\): ).*}).to_s
216
- # t = hit_data[:title] if t.empty?
217
- t = doc.at("//td[@class='title']")
183
+ t = doc.at("//td[@class='title']|//div/table[1]/tr[4]/td/strong")
218
184
  return [] unless t
219
- titles = t.text.split " - "
185
+
186
+ titles = t.text.sub(/\w\.Imp\s?\d+\u00A0:\u00A0/, "").split " - "
220
187
  case titles.size
221
188
  when 0
222
189
  intro, main, part = nil, "", nil
@@ -247,10 +214,11 @@ module RelatonItu
247
214
  # @return [Array<Hash>]
248
215
  def fetch_dates(doc)
249
216
  dates = []
250
- pdate = doc.at("//table/tr/td/span[contains(@id, 'Label5')]")
251
- publish_date = pdate&.text || ob_date(doc)
252
- if publish_date && !publish_date&.empty?
253
- dates << { type: "published", on: publish_date }
217
+ date = doc.at("//table/tr/td/span[contains(@id, 'Label5')]",
218
+ "//p[contains(.,'Approved in')]")
219
+ pdate = date&.text&.match(/\d{4}-\d{2}-\d{2}/).to_s || ob_date(doc)
220
+ if pdate && !pdate&.empty?
221
+ dates << { type: "published", on: pdate }
254
222
  end
255
223
  dates
256
224
  end
@@ -278,36 +246,41 @@ module RelatonItu
278
246
  # @param doc [Nokogiri::HTML::Document]
279
247
  # @return [Array<Hash>]
280
248
  def fetch_contributors(code)
249
+ return [] unless code
250
+
281
251
  abbrev = code.sub(/-\w\s.*/, "")
282
252
  case abbrev
283
253
  when "ITU"
284
254
  name = "International Telecommunication Union"
285
255
  url = "www.itu.int"
286
256
  end
287
- [{ entity: { name: name, url: url, abbreviation: abbrev }, role: [type: "publisher"] }]
257
+ [{ entity: { name: name, url: url, abbreviation: abbrev },
258
+ role: [type: "publisher"] }]
288
259
  end
289
260
 
290
- # Fetch ICS.
291
- # @param doc [Nokogiri::HTML::Document]
292
- # @return [Array<Hash>]
293
- # def fetch_ics(doc)
294
- # doc.xpath('//th[contains(text(), "ICS")]/following-sibling::td/a').map do |i|
295
- # code = i.text.match(/[\d\.]+/).to_s.split '.'
296
- # { field: code[0], group: code[1], subgroup: code[2] }
297
- # end
298
- # end
299
-
300
261
  # Fetch links.
301
262
  # @param doc [Nokogiri::HTML::Document]
302
263
  # @param url [String]
303
264
  # @return [Array<Hash>]
304
265
  def fetch_link(doc, url)
305
266
  links = [{ type: "src", content: url }]
306
- obp_elms = doc.at('//a[@title="Persistent link to download the PDF file"]')
307
- links << { type: "obp", content: DOMAIN + obp_elms[:href].strip } if obp_elms
267
+ obp_elm = doc.at(
268
+ '//a[@title="Persistent link to download the PDF file"]',
269
+ "//font[contains(.,'PDF')]/../..",
270
+ )
271
+ links << typed_link("obp", obp_elm) if obp_elm
272
+ wrd_elm = doc.at("//font[contains(.,'Word')]/../..")
273
+ links << typed_link("word", wrd_elm) if wrd_elm
308
274
  links
309
275
  end
310
276
 
277
+ def typed_link(type, elm)
278
+ {
279
+ type: type,
280
+ content: URI.join(HitCollection::DOMAIN + elm[:href].strip).to_s,
281
+ }
282
+ end
283
+
311
284
  # Fetch copyright.
312
285
  # @param code [String]
313
286
  # @param doc [Nokogiri::HTML::Document]
@@ -325,5 +298,4 @@ module RelatonItu
325
298
  end
326
299
  end
327
300
  end
328
- # rubocop:enable Metrics/ModuleLength
329
301
  end
@@ -1,3 +1,3 @@
1
1
  module RelatonItu
2
- VERSION = "1.0.0".freeze
2
+ VERSION = "1.0.1".freeze
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-itu
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.0
4
+ version: 1.0.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2020-04-25 00:00:00.000000000 Z
11
+ date: 2020-05-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: debase