relaton-itu 1.0.0 → 1.2.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -2,11 +2,6 @@ require "relaton_itu/version"
2
2
  require "relaton_itu/itu_bibliography"
3
3
  require "digest/md5"
4
4
 
5
- # if defined? Relaton
6
- # require_relative "relaton/processor"
7
- # Relaton::Registry.instance.register(Relaton::RelatonItu::Processor)
8
- # end
9
-
10
5
  module RelatonItu
11
6
  class Error < StandardError; end
12
7
 
@@ -16,8 +16,9 @@ module RelatonItu
16
16
  # @param subgroup [Hash, RelatonItu::ItuGroup, NilClass]
17
17
  # @param workgroup [Hash, RelatonItu::ItuGroup, NilClass]
18
18
  def initialize(bureau:, group:, subgroup: nil, workgroup: nil)
19
- raise ArgumentError, "invalid bureau: #{bureau}" unless BUREAUS.include? bureau
20
-
19
+ unless BUREAUS.include? bureau
20
+ warn "[relaton-itu] WARNING: invalid bureau: #{bureau}"
21
+ end
21
22
  @bureau = bureau
22
23
  @group = group.is_a?(Hash) ? ItuGroup.new(group) : group
23
24
  @subgroup = subgroup.is_a?(Hash) ? ItuGroup.new(subgroup) : subgroup
@@ -28,7 +29,7 @@ module RelatonItu
28
29
  def to_xml(builder)
29
30
  builder.editorialgroup do
30
31
  builder.bureau bureau
31
- builder.group { |b| group.to_xml b }
32
+ builder.group { |b| group.to_xml b } if group
32
33
  builder.subgroup { |b| group.to_xml b } if subgroup
33
34
  builder.workgroup { |b| group.to_xml b } if workgroup
34
35
  end
@@ -36,7 +37,8 @@ module RelatonItu
36
37
 
37
38
  # @return [Hash]
38
39
  def to_hash
39
- hash = { "bureau" => bureau, "group" => group.to_hash }
40
+ hash = { "bureau" => bureau }
41
+ hash["group"] = group.to_hash if group
40
42
  hash["subgroup"] = subgroup.to_hash if subgroup
41
43
  hash["workgroup"] = workgroup.to_hash if workgroup
42
44
  hash
@@ -1,5 +1,5 @@
1
1
  module RelatonItu
2
- class HashConverter < RelatonIsoBib::HashConverter
2
+ class HashConverter < RelatonBib::HashConverter
3
3
  class << self
4
4
  private
5
5
 
@@ -9,6 +9,15 @@ module RelatonItu
9
9
 
10
10
  ret[:editorialgroup] = EditorialGroup.new eg
11
11
  end
12
+
13
+ # @param ret [Hash]
14
+ def structuredidentifier_hash_to_bib(ret)
15
+ return unless ret[:structuredidentifier]
16
+
17
+ ret[:structuredidentifier] = StructuredIdentifier.new(
18
+ ret[:structuredidentifier],
19
+ )
20
+ end
12
21
  end
13
22
  end
14
23
  end
@@ -4,9 +4,9 @@ module RelatonItu
4
4
  # Hit.
5
5
  class Hit < RelatonBib::Hit
6
6
  # Parse page.
7
- # @return [Isobib::IsoBibliographicItem]
7
+ # @return [RelatonItu::ItuBibliographicItem]
8
8
  def fetch
9
- @fetch ||= Scrapper.parse_page @hit
9
+ @fetch ||= Scrapper.parse_page hit, hit_collection.gi_imp
10
10
  end
11
11
  end
12
12
  end
@@ -7,16 +7,39 @@ require "net/http"
7
7
  module RelatonItu
8
8
  # Page of hit collection.
9
9
  class HitCollection < RelatonBib::HitCollection
10
- DOMAIN = "https://www.itu.int".freeze
10
+ DOMAIN = "https://www.itu.int"
11
11
 
12
- # @param ref_nbr [String]
12
+ # @return [TrueClass, FalseClass]
13
+ attr_reader :gi_imp
14
+
15
+ # @param ref [String]
13
16
  # @param year [String]
14
- def initialize(ref_nbr, year = nil)
15
- super
16
- group = %r{(OB|Operational Bulletin) No} =~ text ? "Publications" : "Recommendations"
17
- url = "#{DOMAIN}/net4/ITU-T/search/GlobalSearch/Search"
18
- params = {
19
- "Input" => ref_nbr,
17
+ def initialize(ref, year = nil)
18
+ text = ref.sub /(?<=\.)Imp\s?(?=\d)/, ""
19
+ super text, year
20
+ @gi_imp = /\.Imp\d/.match?(ref)
21
+ uri = URI "#{DOMAIN}/net4/ITU-T/search/GlobalSearch/Search"
22
+ data = { json: params.to_json }
23
+ resp = Net::HTTP.post(uri, data.to_json,
24
+ "Content-Type" => "application/json")
25
+ @array = hits JSON.parse(resp.body)
26
+ end
27
+
28
+ private
29
+
30
+ # @return [String]
31
+ def group
32
+ @group ||= if %r{(OB|Operational Bulletin) No} =~ text then "Publications"
33
+ else "Recommendations"
34
+ end
35
+ end
36
+
37
+ # rubocop:disable Metrics/MethodLength
38
+
39
+ # @return [Hash]
40
+ def params
41
+ {
42
+ "Input" => text,
20
43
  "Start" => 0,
21
44
  "Rows" => 10,
22
45
  "SortBy" => "RELEVANCE",
@@ -61,10 +84,13 @@ module RelatonItu
61
84
  "IP" => "",
62
85
  "SearchType" => "All",
63
86
  }
64
- data = { json: params.to_json }
65
- resp = Net::HTTP.post(URI(url), data.to_json, "Content-Type" => "application/json")
66
- doc = JSON.parse resp.body
67
- @array = doc["results"].map do |h|
87
+ end
88
+ # rubocop:enable Metrics/MethodLength
89
+
90
+ # @param data [Hash]
91
+ # @return [Array<RelatonItu::Hit>]
92
+ def hits(data)
93
+ data["results"].map do |h|
68
94
  code = h["Media"]["Name"]
69
95
  title = h["Title"]
70
96
  url = h["Redirection"]
@@ -1,5 +1,5 @@
1
1
  module RelatonItu
2
- class ItuBibliographicItem < RelatonIsoBib::IsoBibliographicItem
2
+ class ItuBibliographicItem < RelatonBib::BibliographicItem
3
3
  TYPES = %w[
4
4
  recommendation recommendation-supplement recommendation-amendment
5
5
  recommendation-corrigendum recommendation-errata recommendation-annex
@@ -7,13 +7,14 @@ module RelatonItu
7
7
  joint-itu-iso-iec
8
8
  ].freeze
9
9
 
10
+ # @params structuredidentifier [RelatonItu::StructuredIdentifier]
10
11
  def initialize(**args)
11
- @doctype = args.delete :doctype
12
- if doctype && !TYPES.include?(doctype)
13
- raise ArgumentError, "invalid doctype: #{doctype}"
12
+ # @doctype = args.delete :doctype
13
+ if args[:doctype] && !TYPES.include?(args[:doctype])
14
+ warn "[relaton-itu] WARNING: invalid doctype: #{args[:doctype]}"
14
15
  end
15
-
16
16
  super
17
+ # @doctype = args[:doctype]
17
18
  end
18
19
  end
19
20
  end
@@ -1,8 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "relaton_iso_bib"
3
+ require "relaton_bib"
4
4
  require "relaton_itu/itu_bibliographic_item"
5
5
  require "relaton_itu/editorial_group"
6
+ require "relaton_itu/structured_identifier"
6
7
  require "relaton_itu/itu_group"
7
8
  require "relaton_itu/scrapper"
8
9
  require "relaton_itu/hit_collection"
@@ -19,9 +20,9 @@ module RelatonItu
19
20
  # @return [RelatonItu::HitCollection]
20
21
  def search(text, year = nil)
21
22
  HitCollection.new text, year
22
- rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
23
- Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError,
24
- OpenSSL::SSL::SSLError
23
+ rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
24
+ EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
25
+ Net::ProtocolError, OpenSSL::SSL::SSLError
25
26
  raise RelatonBib::RequestError, "Could not access http://www.itu.int"
26
27
  end
27
28
 
@@ -66,17 +67,17 @@ module RelatonItu
66
67
  nil
67
68
  end
68
69
 
69
- def fetch_pages(hits, threads)
70
- workers = RelatonBib::WorkersPool.new threads
71
- workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
72
- hits.each_with_index { |hit, i| workers << { i: i, hit: hit } }
73
- workers.end
74
- workers.result.sort_by { |a| a[:i] }.map { |x| x[:hit] }
75
- end
70
+ # def fetch_pages(hits, threads)
71
+ # workers = RelatonBib::WorkersPool.new threads
72
+ # workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
73
+ # hits.each_with_index { |hit, i| workers << { i: i, hit: hit } }
74
+ # workers.end
75
+ # workers.result.sort_by { |a| a[:i] }.map { |x| x[:hit] }
76
+ # end
76
77
 
77
78
  def search_filter(code)
78
- docidrx = %r{\w+.\d+|\w\sSuppl\.\s\d+} # %r{^ITU-T\s[^\s]+}
79
- c = code.match(docidrx).to_s
79
+ docidrx = %r{\w+\.\d+|\w\sSuppl\.\s\d+} # %r{^ITU-T\s[^\s]+}
80
+ c = code.sub(/Imp\s?/, "").match(docidrx).to_s
80
81
  warn "[relaton-itu] (\"#{code}\") fetching..."
81
82
  result = search(code)
82
83
  result.select do |i|
@@ -93,16 +94,18 @@ module RelatonItu
93
94
  # If no match, returns any years which caused mismatch, for error reporting
94
95
  def isobib_results_filter(result, year)
95
96
  missed_years = []
96
- result.each_slice(3) do |s| # ISO website only allows 3 connections
97
- fetch_pages(s, 3).each do |r|
98
- return { ret: r } if !year
97
+ # result.each_slice(3) do |s| # ISO website only allows 3 connections
98
+ # fetch_pages(s, 3).each do |r|
99
+ result.each do |r|
100
+ return { ret: r.fetch } if !year
99
101
 
100
- r.date.select { |d| d.type == "published" }.each do |d|
101
- return { ret: r } if year.to_i == d.on.year
102
+ /\(\d{2}\/(?<pyear>\d{4})\)/ =~ r.hit[:code]
103
+ # r.date.select { |d| d.type == "published" }.each do |d|
104
+ return { ret: r.fetch } if year == pyear
102
105
 
103
- missed_years << d.on.year
104
- end
105
- end
106
+ missed_years << pyear
107
+ # end
108
+ # end
106
109
  end
107
110
  { years: missed_years }
108
111
  end
@@ -3,16 +3,9 @@
3
3
  require "nokogiri"
4
4
  require "net/http"
5
5
 
6
- # Capybara.request_driver :poltergeist do |app|
7
- # Capybara::Poltergeist::Driver.new app, js_errors: false
8
- # end
9
- # Capybara.default_driver = :poltergeist
10
-
11
6
  module RelatonItu
12
7
  # Scrapper.
13
- # rubocop:disable Metrics/ModuleLength
14
8
  module Scrapper
15
- DOMAIN = "https://www.itu.int"
16
9
  ROMAN_MONTHS = %w[I II III IV V VI VII VIII IX X XI XII].freeze
17
10
 
18
11
  TYPES = {
@@ -31,30 +24,26 @@ module RelatonItu
31
24
  }.freeze
32
25
 
33
26
  class << self
34
- # @param text [String]
35
- # @return [Array<Hash>]
36
- # def get(text)
37
- # iso_workers = WorkersPool.new 4
38
- # iso_workers.worker { |hit| iso_worker(hit, iso_workers) }
39
- # algolia_workers = start_algolia_search(text, iso_workers)
40
- # iso_docs = iso_workers.result
41
- # algolia_workers.end
42
- # algolia_workers.result
43
- # iso_docs
44
- # end
27
+ # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
45
28
 
46
29
  # Parse page.
47
- # @param hit [Hash]
30
+ # @param hit_data [Hash]
48
31
  # @return [Hash]
49
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
50
- def parse_page(hit_data)
32
+ def parse_page(hit_data, imp = false)
51
33
  url, doc = get_page hit_data[:url]
34
+ if imp
35
+ a = doc.at "//span[contains(@id, 'tab_ig_uc_rec')]/a"
36
+ return unless a
37
+
38
+ url, doc = get_page URI.join(url, a[:href]).to_s
39
+ end
52
40
 
53
41
  # Fetch edition.
54
42
  edition = doc.at("//table/tr/td/span[contains(@id, 'Label8')]/b")&.text
55
43
 
56
44
  ItuBibliographicItem.new(
57
45
  fetched: Date.today.to_s,
46
+ type: "standard",
58
47
  docid: fetch_docid(doc),
59
48
  edition: edition,
60
49
  language: ["en"],
@@ -73,7 +62,7 @@ module RelatonItu
73
62
  place: ["Geneva"],
74
63
  )
75
64
  end
76
- # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
65
+ # rubocop:enable Metrics/AbcSize
77
66
 
78
67
  private
79
68
 
@@ -96,37 +85,23 @@ module RelatonItu
96
85
  }]
97
86
  end
98
87
 
99
- # Get langs.
100
- # @param doc [Nokogiri::HTML::Document]
101
- # @return [Array<Hash>]
102
- # def langs(doc)
103
- # lgs = [{ lang: 'en' }]
104
- # doc.css('ul#lang-switcher ul li a').each do |lang_link|
105
- # lang_path = lang_link.attr('href')
106
- # lang = lang_path.match(%r{^\/(fr)\/})
107
- # lgs << { lang: lang[1], path: lang_path } if lang
108
- # end
109
- # lgs
110
- # end
111
-
112
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
113
88
  # Get page.
114
89
  # @param path [String] page's path
115
- # @return [Array<Nokogiri::HTML::Document, String>]
90
+ # @return [Array<String, Nokogiri::HTML::Document>]
116
91
  def get_page(url)
117
92
  uri = URI url
118
- resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
93
+ resp = Net::HTTP.get_response(uri)
119
94
  until resp.code == "200"
120
95
  uri = URI resp["location"] if resp.code =~ /^30/
121
- resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
96
+ resp = Net::HTTP.get_response(uri)
122
97
  end
123
98
  [uri.to_s, Nokogiri::HTML(resp.body)]
124
- rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
125
- Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError,
126
- OpenSSL::SSL::SSLError
99
+ rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
100
+ EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
101
+ Net::ProtocolError, OpenSSL::SSL::SSLError
127
102
  raise RelatonBib::RequestError, "Could not access #{url}"
128
103
  end
129
- # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
104
+ # rubocop:enable Metrics/MethodLength
130
105
 
131
106
  # Fetch docid.
132
107
  # @param doc [Nokogiri::HTML::Document]
@@ -135,9 +110,11 @@ module RelatonItu
135
110
  doc.xpath(
136
111
  "//span[@id='ctl00_content_main_uc_rec_main_info1_rpt_main_ctl00_lbl_rec']",
137
112
  "//td[.='Identical standard:']/following-sibling::td",
113
+ "//div/table[1]/tr[4]/td/strong",
138
114
  ).map do |code|
139
- id = code.text.match(%r{^.*?(?= \()}).to_s.squeeze(" ")
115
+ id = code.text.match(%r{^.*?(?= \()|\w\.Imp\s?\d+}).to_s.squeeze(" ")
140
116
  type = id.match(%r{^\w+}).to_s
117
+ type = "ITU" if type == "G"
141
118
  RelatonBib::DocumentIdentifier.new(type: type, id: id)
142
119
  end
143
120
  end
@@ -146,10 +123,11 @@ module RelatonItu
146
123
  # @param doc [Nokogiri::HTML::Document]
147
124
  # @return [RelatonBib::DocumentStatus, NilClass]
148
125
  def fetch_status(doc)
149
- s = doc.at("//table/tr/td/span[contains(@id, 'Label7')]")
126
+ s = doc.at("//table/tr/td/span[contains(@id, 'Label7')]",
127
+ "//p[contains(.,'Status :')]")
150
128
  return unless s
151
129
 
152
- status = s.text == "In force" ? "Published" : "Withdrawal"
130
+ status = s.text.include?("In force") ? "Published" : "Withdrawal"
153
131
  RelatonBib::DocumentStatus.new(stage: status)
154
132
  end
155
133
 
@@ -191,55 +169,22 @@ module RelatonItu
191
169
  # @return [Array<Hash>]
192
170
  def fetch_relations(doc)
193
171
  doc.xpath('//div[contains(@id, "tab_sup")]//table/tr[position()>2]').map do |r|
194
- # r_type = r.at('./td/span[contains(@id, "Label4")]/nobr').text.downcase
195
172
  ref = r.at('./td/span[contains(@id, "title_e")]/nobr/a')
196
- # url = DOMAIN + ref[:href].sub(/^\./, "/ITU-T/recommendations")
197
173
  fref = RelatonBib::FormattedRef.new(content: ref.text, language: "en", script: "Latn")
198
- bibitem = RelatonIsoBib::IsoBibliographicItem.new(formattedref: fref)
174
+ bibitem = ItuBibliographicItem.new(formattedref: fref, type: "standard")
199
175
  { type: "complements", bibitem: bibitem }
200
176
  end
201
177
  end
202
178
  # rubocop:enable Metrics/MethodLength
203
179
 
204
- # Fetch type.
205
- # @param doc [Nokogiri::HTML::Document]
206
- # @return [String]
207
- # def fetch_type(_doc)
208
- # "recommendation"
209
- # end
210
-
211
180
  # Fetch titles.
212
181
  # @param doc [Nokogiri::HTML::Document]
213
182
  # @return [Array<Hash>]
214
183
  def fetch_titles(doc)
215
- # t = hit_data[:title].match(%r{(?<=\(\d{2}\/\d{4}\): ).*}).to_s
216
- # t = hit_data[:title] if t.empty?
217
- t = doc.at("//td[@class='title']")
184
+ t = doc.at("//td[@class='title']|//div/table[1]/tr[4]/td/strong")
218
185
  return [] unless t
219
- titles = t.text.split " - "
220
- case titles.size
221
- when 0
222
- intro, main, part = nil, "", nil
223
- when 1
224
- intro, main, part = nil, titles[0], nil
225
- when 2
226
- if /^(Part|Partie) \d+:/ =~ titles[1]
227
- intro, main, part = nil, titles[0], titles[1]
228
- else
229
- intro, main, part = titles[0], titles[1], nil
230
- end
231
- when 3
232
- intro, main, part = titles[0], titles[1], titles[2]
233
- else
234
- intro, main, part = titles[0], titles[1], titles[2..-1]&.join(" -- ")
235
- end
236
- [{
237
- title_intro: intro,
238
- title_main: main,
239
- title_part: part,
240
- language: "en",
241
- script: "Latn",
242
- }]
186
+
187
+ RelatonBib::TypedTitleString.from_string t.text, "en", "Latn"
243
188
  end
244
189
 
245
190
  # Fetch dates
@@ -247,10 +192,11 @@ module RelatonItu
247
192
  # @return [Array<Hash>]
248
193
  def fetch_dates(doc)
249
194
  dates = []
250
- pdate = doc.at("//table/tr/td/span[contains(@id, 'Label5')]")
251
- publish_date = pdate&.text || ob_date(doc)
252
- if publish_date && !publish_date&.empty?
253
- dates << { type: "published", on: publish_date }
195
+ date = doc.at("//table/tr/td/span[contains(@id, 'Label5')]",
196
+ "//p[contains(.,'Approved in')]")
197
+ pdate = date&.text&.match(/\d{4}-\d{2}-\d{2}/).to_s || ob_date(doc)
198
+ if pdate && !pdate&.empty?
199
+ dates << { type: "published", on: pdate }
254
200
  end
255
201
  dates
256
202
  end
@@ -278,40 +224,45 @@ module RelatonItu
278
224
  # @param doc [Nokogiri::HTML::Document]
279
225
  # @return [Array<Hash>]
280
226
  def fetch_contributors(code)
227
+ return [] unless code
228
+
281
229
  abbrev = code.sub(/-\w\s.*/, "")
282
230
  case abbrev
283
231
  when "ITU"
284
232
  name = "International Telecommunication Union"
285
233
  url = "www.itu.int"
286
234
  end
287
- [{ entity: { name: name, url: url, abbreviation: abbrev }, role: [type: "publisher"] }]
235
+ [{ entity: { name: name, url: url, abbreviation: abbrev },
236
+ role: [type: "publisher"] }]
288
237
  end
289
238
 
290
- # Fetch ICS.
291
- # @param doc [Nokogiri::HTML::Document]
292
- # @return [Array<Hash>]
293
- # def fetch_ics(doc)
294
- # doc.xpath('//th[contains(text(), "ICS")]/following-sibling::td/a').map do |i|
295
- # code = i.text.match(/[\d\.]+/).to_s.split '.'
296
- # { field: code[0], group: code[1], subgroup: code[2] }
297
- # end
298
- # end
299
-
300
239
  # Fetch links.
301
240
  # @param doc [Nokogiri::HTML::Document]
302
241
  # @param url [String]
303
242
  # @return [Array<Hash>]
304
243
  def fetch_link(doc, url)
305
244
  links = [{ type: "src", content: url }]
306
- obp_elms = doc.at('//a[@title="Persistent link to download the PDF file"]')
307
- links << { type: "obp", content: DOMAIN + obp_elms[:href].strip } if obp_elms
245
+ obp_elm = doc.at(
246
+ '//a[@title="Persistent link to download the PDF file"]',
247
+ "//font[contains(.,'PDF')]/../..",
248
+ )
249
+ links << typed_link("obp", obp_elm) if obp_elm
250
+ wrd_elm = doc.at("//font[contains(.,'Word')]/../..")
251
+ links << typed_link("word", wrd_elm) if wrd_elm
308
252
  links
309
253
  end
310
254
 
255
+ def typed_link(type, elm)
256
+ {
257
+ type: type,
258
+ content: URI.join(HitCollection::DOMAIN + elm[:href].strip).to_s,
259
+ }
260
+ end
261
+
311
262
  # Fetch copyright.
312
263
  # @param code [String]
313
264
  # @param doc [Nokogiri::HTML::Document]
314
- # @return [Hash]
265
+ # @return [Array<Hash>]
315
266
  def fetch_copyright(code, doc)
316
267
  abbreviation = code.match(/^[^-]+/).to_s
317
268
  case abbreviation
@@ -321,9 +272,9 @@ module RelatonItu
321
272
  end
322
273
  fdate = doc.at("//table/tr/td/span[contains(@id, 'Label5')]")
323
274
  from = fdate&.text || ob_date(doc)
324
- { owner: { name: name, abbreviation: abbreviation, url: url }, from: from }
275
+ [{ owner: [{ name: name, abbreviation: abbreviation, url: url }],
276
+ from: from }]
325
277
  end
326
278
  end
327
279
  end
328
- # rubocop:enable Metrics/ModuleLength
329
280
  end