relaton-itu 1.0.0 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -2,11 +2,6 @@ require "relaton_itu/version"
2
2
  require "relaton_itu/itu_bibliography"
3
3
  require "digest/md5"
4
4
 
5
- # if defined? Relaton
6
- # require_relative "relaton/processor"
7
- # Relaton::Registry.instance.register(Relaton::RelatonItu::Processor)
8
- # end
9
-
10
5
  module RelatonItu
11
6
  class Error < StandardError; end
12
7
 
@@ -16,8 +16,9 @@ module RelatonItu
16
16
  # @param subgroup [Hash, RelatonItu::ItuGroup, NilClass]
17
17
  # @param workgroup [Hash, RelatonItu::ItuGroup, NilClass]
18
18
  def initialize(bureau:, group:, subgroup: nil, workgroup: nil)
19
- raise ArgumentError, "invalid bureau: #{bureau}" unless BUREAUS.include? bureau
20
-
19
+ unless BUREAUS.include? bureau
20
+ warn "[relaton-itu] WARNING: invalid bureau: #{bureau}"
21
+ end
21
22
  @bureau = bureau
22
23
  @group = group.is_a?(Hash) ? ItuGroup.new(group) : group
23
24
  @subgroup = subgroup.is_a?(Hash) ? ItuGroup.new(subgroup) : subgroup
@@ -28,7 +29,7 @@ module RelatonItu
28
29
  def to_xml(builder)
29
30
  builder.editorialgroup do
30
31
  builder.bureau bureau
31
- builder.group { |b| group.to_xml b }
32
+ builder.group { |b| group.to_xml b } if group
32
33
  builder.subgroup { |b| group.to_xml b } if subgroup
33
34
  builder.workgroup { |b| group.to_xml b } if workgroup
34
35
  end
@@ -36,7 +37,8 @@ module RelatonItu
36
37
 
37
38
  # @return [Hash]
38
39
  def to_hash
39
- hash = { "bureau" => bureau, "group" => group.to_hash }
40
+ hash = { "bureau" => bureau }
41
+ hash["group"] = group.to_hash if group
40
42
  hash["subgroup"] = subgroup.to_hash if subgroup
41
43
  hash["workgroup"] = workgroup.to_hash if workgroup
42
44
  hash
@@ -1,5 +1,5 @@
1
1
  module RelatonItu
2
- class HashConverter < RelatonIsoBib::HashConverter
2
+ class HashConverter < RelatonBib::HashConverter
3
3
  class << self
4
4
  private
5
5
 
@@ -9,6 +9,15 @@ module RelatonItu
9
9
 
10
10
  ret[:editorialgroup] = EditorialGroup.new eg
11
11
  end
12
+
13
+ # @param ret [Hash]
14
+ def structuredidentifier_hash_to_bib(ret)
15
+ return unless ret[:structuredidentifier]
16
+
17
+ ret[:structuredidentifier] = StructuredIdentifier.new(
18
+ ret[:structuredidentifier],
19
+ )
20
+ end
12
21
  end
13
22
  end
14
23
  end
@@ -4,9 +4,9 @@ module RelatonItu
4
4
  # Hit.
5
5
  class Hit < RelatonBib::Hit
6
6
  # Parse page.
7
- # @return [Isobib::IsoBibliographicItem]
7
+ # @return [RelatonItu::ItuBibliographicItem]
8
8
  def fetch
9
- @fetch ||= Scrapper.parse_page @hit
9
+ @fetch ||= Scrapper.parse_page hit, hit_collection.gi_imp
10
10
  end
11
11
  end
12
12
  end
@@ -7,16 +7,39 @@ require "net/http"
7
7
  module RelatonItu
8
8
  # Page of hit collection.
9
9
  class HitCollection < RelatonBib::HitCollection
10
- DOMAIN = "https://www.itu.int".freeze
10
+ DOMAIN = "https://www.itu.int"
11
11
 
12
- # @param ref_nbr [String]
12
+ # @return [TrueClass, FalseClass]
13
+ attr_reader :gi_imp
14
+
15
+ # @param ref [String]
13
16
  # @param year [String]
14
- def initialize(ref_nbr, year = nil)
15
- super
16
- group = %r{(OB|Operational Bulletin) No} =~ text ? "Publications" : "Recommendations"
17
- url = "#{DOMAIN}/net4/ITU-T/search/GlobalSearch/Search"
18
- params = {
19
- "Input" => ref_nbr,
17
+ def initialize(ref, year = nil)
18
+ text = ref.sub /(?<=\.)Imp\s?(?=\d)/, ""
19
+ super text, year
20
+ @gi_imp = /\.Imp\d/.match?(ref)
21
+ uri = URI "#{DOMAIN}/net4/ITU-T/search/GlobalSearch/Search"
22
+ data = { json: params.to_json }
23
+ resp = Net::HTTP.post(uri, data.to_json,
24
+ "Content-Type" => "application/json")
25
+ @array = hits JSON.parse(resp.body)
26
+ end
27
+
28
+ private
29
+
30
+ # @return [String]
31
+ def group
32
+ @group ||= if %r{(OB|Operational Bulletin) No} =~ text then "Publications"
33
+ else "Recommendations"
34
+ end
35
+ end
36
+
37
+ # rubocop:disable Metrics/MethodLength
38
+
39
+ # @return [Hash]
40
+ def params
41
+ {
42
+ "Input" => text,
20
43
  "Start" => 0,
21
44
  "Rows" => 10,
22
45
  "SortBy" => "RELEVANCE",
@@ -61,10 +84,13 @@ module RelatonItu
61
84
  "IP" => "",
62
85
  "SearchType" => "All",
63
86
  }
64
- data = { json: params.to_json }
65
- resp = Net::HTTP.post(URI(url), data.to_json, "Content-Type" => "application/json")
66
- doc = JSON.parse resp.body
67
- @array = doc["results"].map do |h|
87
+ end
88
+ # rubocop:enable Metrics/MethodLength
89
+
90
+ # @param data [Hash]
91
+ # @return [Array<RelatonItu::Hit>]
92
+ def hits(data)
93
+ data["results"].map do |h|
68
94
  code = h["Media"]["Name"]
69
95
  title = h["Title"]
70
96
  url = h["Redirection"]
@@ -1,5 +1,5 @@
1
1
  module RelatonItu
2
- class ItuBibliographicItem < RelatonIsoBib::IsoBibliographicItem
2
+ class ItuBibliographicItem < RelatonBib::BibliographicItem
3
3
  TYPES = %w[
4
4
  recommendation recommendation-supplement recommendation-amendment
5
5
  recommendation-corrigendum recommendation-errata recommendation-annex
@@ -7,13 +7,14 @@ module RelatonItu
7
7
  joint-itu-iso-iec
8
8
  ].freeze
9
9
 
10
+ # @param structuredidentifier [RelatonItu::StructuredIdentifier]
10
11
  def initialize(**args)
11
- @doctype = args.delete :doctype
12
- if doctype && !TYPES.include?(doctype)
13
- raise ArgumentError, "invalid doctype: #{doctype}"
12
+ # @doctype = args.delete :doctype
13
+ if args[:doctype] && !TYPES.include?(args[:doctype])
14
+ warn "[relaton-itu] WARNING: invalid doctype: #{args[:doctype]}"
14
15
  end
15
-
16
16
  super
17
+ # @doctype = args[:doctype]
17
18
  end
18
19
  end
19
20
  end
@@ -1,8 +1,9 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "relaton_iso_bib"
3
+ require "relaton_bib"
4
4
  require "relaton_itu/itu_bibliographic_item"
5
5
  require "relaton_itu/editorial_group"
6
+ require "relaton_itu/structured_identifier"
6
7
  require "relaton_itu/itu_group"
7
8
  require "relaton_itu/scrapper"
8
9
  require "relaton_itu/hit_collection"
@@ -19,9 +20,9 @@ module RelatonItu
19
20
  # @return [RelatonItu::HitCollection]
20
21
  def search(text, year = nil)
21
22
  HitCollection.new text, year
22
- rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
23
- Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError,
24
- OpenSSL::SSL::SSLError
23
+ rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
24
+ EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
25
+ Net::ProtocolError, OpenSSL::SSL::SSLError
25
26
  raise RelatonBib::RequestError, "Could not access http://www.itu.int"
26
27
  end
27
28
 
@@ -66,17 +67,17 @@ module RelatonItu
66
67
  nil
67
68
  end
68
69
 
69
- def fetch_pages(hits, threads)
70
- workers = RelatonBib::WorkersPool.new threads
71
- workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
72
- hits.each_with_index { |hit, i| workers << { i: i, hit: hit } }
73
- workers.end
74
- workers.result.sort_by { |a| a[:i] }.map { |x| x[:hit] }
75
- end
70
+ # def fetch_pages(hits, threads)
71
+ # workers = RelatonBib::WorkersPool.new threads
72
+ # workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
73
+ # hits.each_with_index { |hit, i| workers << { i: i, hit: hit } }
74
+ # workers.end
75
+ # workers.result.sort_by { |a| a[:i] }.map { |x| x[:hit] }
76
+ # end
76
77
 
77
78
  def search_filter(code)
78
- docidrx = %r{\w+.\d+|\w\sSuppl\.\s\d+} # %r{^ITU-T\s[^\s]+}
79
- c = code.match(docidrx).to_s
79
+ docidrx = %r{\w+\.\d+|\w\sSuppl\.\s\d+} # %r{^ITU-T\s[^\s]+}
80
+ c = code.sub(/Imp\s?/, "").match(docidrx).to_s
80
81
  warn "[relaton-itu] (\"#{code}\") fetching..."
81
82
  result = search(code)
82
83
  result.select do |i|
@@ -93,16 +94,18 @@ module RelatonItu
93
94
  # If no match, returns any years which caused mismatch, for error reporting
94
95
  def isobib_results_filter(result, year)
95
96
  missed_years = []
96
- result.each_slice(3) do |s| # ISO website only allows 3 connections
97
- fetch_pages(s, 3).each do |r|
98
- return { ret: r } if !year
97
+ # result.each_slice(3) do |s| # ISO website only allows 3 connections
98
+ # fetch_pages(s, 3).each do |r|
99
+ result.each do |r|
100
+ return { ret: r.fetch } if !year
99
101
 
100
- r.date.select { |d| d.type == "published" }.each do |d|
101
- return { ret: r } if year.to_i == d.on.year
102
+ /\(\d{2}\/(?<pyear>\d{4})\)/ =~ r.hit[:code]
103
+ # r.date.select { |d| d.type == "published" }.each do |d|
104
+ return { ret: r.fetch } if year == pyear
102
105
 
103
- missed_years << d.on.year
104
- end
105
- end
106
+ missed_years << pyear
107
+ # end
108
+ # end
106
109
  end
107
110
  { years: missed_years }
108
111
  end
@@ -3,16 +3,9 @@
3
3
  require "nokogiri"
4
4
  require "net/http"
5
5
 
6
- # Capybara.request_driver :poltergeist do |app|
7
- # Capybara::Poltergeist::Driver.new app, js_errors: false
8
- # end
9
- # Capybara.default_driver = :poltergeist
10
-
11
6
  module RelatonItu
12
7
  # Scrapper.
13
- # rubocop:disable Metrics/ModuleLength
14
8
  module Scrapper
15
- DOMAIN = "https://www.itu.int"
16
9
  ROMAN_MONTHS = %w[I II III IV V VI VII VIII IX X XI XII].freeze
17
10
 
18
11
  TYPES = {
@@ -31,30 +24,26 @@ module RelatonItu
31
24
  }.freeze
32
25
 
33
26
  class << self
34
- # @param text [String]
35
- # @return [Array<Hash>]
36
- # def get(text)
37
- # iso_workers = WorkersPool.new 4
38
- # iso_workers.worker { |hit| iso_worker(hit, iso_workers) }
39
- # algolia_workers = start_algolia_search(text, iso_workers)
40
- # iso_docs = iso_workers.result
41
- # algolia_workers.end
42
- # algolia_workers.result
43
- # iso_docs
44
- # end
27
+ # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
45
28
 
46
29
  # Parse page.
47
- # @param hit [Hash]
30
+ # @param hit_data [Hash]
48
31
  # @return [RelatonItu::ItuBibliographicItem, NilClass]
49
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
50
- def parse_page(hit_data)
32
+ def parse_page(hit_data, imp = false)
51
33
  url, doc = get_page hit_data[:url]
34
+ if imp
35
+ a = doc.at "//span[contains(@id, 'tab_ig_uc_rec')]/a"
36
+ return unless a
37
+
38
+ url, doc = get_page URI.join(url, a[:href]).to_s
39
+ end
52
40
 
53
41
  # Fetch edition.
54
42
  edition = doc.at("//table/tr/td/span[contains(@id, 'Label8')]/b")&.text
55
43
 
56
44
  ItuBibliographicItem.new(
57
45
  fetched: Date.today.to_s,
46
+ type: "standard",
58
47
  docid: fetch_docid(doc),
59
48
  edition: edition,
60
49
  language: ["en"],
@@ -73,7 +62,7 @@ module RelatonItu
73
62
  place: ["Geneva"],
74
63
  )
75
64
  end
76
- # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
65
+ # rubocop:enable Metrics/AbcSize
77
66
 
78
67
  private
79
68
 
@@ -96,37 +85,23 @@ module RelatonItu
96
85
  }]
97
86
  end
98
87
 
99
- # Get langs.
100
- # @param doc [Nokogiri::HTML::Document]
101
- # @return [Array<Hash>]
102
- # def langs(doc)
103
- # lgs = [{ lang: 'en' }]
104
- # doc.css('ul#lang-switcher ul li a').each do |lang_link|
105
- # lang_path = lang_link.attr('href')
106
- # lang = lang_path.match(%r{^\/(fr)\/})
107
- # lgs << { lang: lang[1], path: lang_path } if lang
108
- # end
109
- # lgs
110
- # end
111
-
112
- # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
113
88
  # Get page.
114
89
  # @param url [String] page's URL
115
- # @return [Array<Nokogiri::HTML::Document, String>]
90
+ # @return [Array<String, Nokogiri::HTML::Document>]
116
91
  def get_page(url)
117
92
  uri = URI url
118
- resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
93
+ resp = Net::HTTP.get_response(uri)
119
94
  until resp.code == "200"
120
95
  uri = URI resp["location"] if resp.code =~ /^30/
121
- resp = Net::HTTP.get_response(uri) # .encode("UTF-8")
96
+ resp = Net::HTTP.get_response(uri)
122
97
  end
123
98
  [uri.to_s, Nokogiri::HTML(resp.body)]
124
- rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
125
- Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError,
126
- OpenSSL::SSL::SSLError
99
+ rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
100
+ EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
101
+ Net::ProtocolError, OpenSSL::SSL::SSLError
127
102
  raise RelatonBib::RequestError, "Could not access #{url}"
128
103
  end
129
- # rubocop:enable Metrics/AbcSize, Metrics/MethodLength
104
+ # rubocop:enable Metrics/MethodLength
130
105
 
131
106
  # Fetch docid.
132
107
  # @param doc [Nokogiri::HTML::Document]
@@ -135,9 +110,11 @@ module RelatonItu
135
110
  doc.xpath(
136
111
  "//span[@id='ctl00_content_main_uc_rec_main_info1_rpt_main_ctl00_lbl_rec']",
137
112
  "//td[.='Identical standard:']/following-sibling::td",
113
+ "//div/table[1]/tr[4]/td/strong",
138
114
  ).map do |code|
139
- id = code.text.match(%r{^.*?(?= \()}).to_s.squeeze(" ")
115
+ id = code.text.match(%r{^.*?(?= \()|\w\.Imp\s?\d+}).to_s.squeeze(" ")
140
116
  type = id.match(%r{^\w+}).to_s
117
+ type = "ITU" if type == "G"
141
118
  RelatonBib::DocumentIdentifier.new(type: type, id: id)
142
119
  end
143
120
  end
@@ -146,10 +123,11 @@ module RelatonItu
146
123
  # @param doc [Nokogiri::HTML::Document]
147
124
  # @return [RelatonBib::DocumentStatus, NilClass]
148
125
  def fetch_status(doc)
149
- s = doc.at("//table/tr/td/span[contains(@id, 'Label7')]")
126
+ s = doc.at("//table/tr/td/span[contains(@id, 'Label7')]",
127
+ "//p[contains(.,'Status :')]")
150
128
  return unless s
151
129
 
152
- status = s.text == "In force" ? "Published" : "Withdrawal"
130
+ status = s.text.include?("In force") ? "Published" : "Withdrawal"
153
131
  RelatonBib::DocumentStatus.new(stage: status)
154
132
  end
155
133
 
@@ -191,55 +169,22 @@ module RelatonItu
191
169
  # @return [Array<Hash>]
192
170
  def fetch_relations(doc)
193
171
  doc.xpath('//div[contains(@id, "tab_sup")]//table/tr[position()>2]').map do |r|
194
- # r_type = r.at('./td/span[contains(@id, "Label4")]/nobr').text.downcase
195
172
  ref = r.at('./td/span[contains(@id, "title_e")]/nobr/a')
196
- # url = DOMAIN + ref[:href].sub(/^\./, "/ITU-T/recommendations")
197
173
  fref = RelatonBib::FormattedRef.new(content: ref.text, language: "en", script: "Latn")
198
- bibitem = RelatonIsoBib::IsoBibliographicItem.new(formattedref: fref)
174
+ bibitem = ItuBibliographicItem.new(formattedref: fref, type: "standard")
199
175
  { type: "complements", bibitem: bibitem }
200
176
  end
201
177
  end
202
178
  # rubocop:enable Metrics/MethodLength
203
179
 
204
- # Fetch type.
205
- # @param doc [Nokogiri::HTML::Document]
206
- # @return [String]
207
- # def fetch_type(_doc)
208
- # "recommendation"
209
- # end
210
-
211
180
  # Fetch titles.
212
181
  # @param doc [Nokogiri::HTML::Document]
213
182
  # @return [Array<Hash>]
214
183
  def fetch_titles(doc)
215
- # t = hit_data[:title].match(%r{(?<=\(\d{2}\/\d{4}\): ).*}).to_s
216
- # t = hit_data[:title] if t.empty?
217
- t = doc.at("//td[@class='title']")
184
+ t = doc.at("//td[@class='title']|//div/table[1]/tr[4]/td/strong")
218
185
  return [] unless t
219
- titles = t.text.split " - "
220
- case titles.size
221
- when 0
222
- intro, main, part = nil, "", nil
223
- when 1
224
- intro, main, part = nil, titles[0], nil
225
- when 2
226
- if /^(Part|Partie) \d+:/ =~ titles[1]
227
- intro, main, part = nil, titles[0], titles[1]
228
- else
229
- intro, main, part = titles[0], titles[1], nil
230
- end
231
- when 3
232
- intro, main, part = titles[0], titles[1], titles[2]
233
- else
234
- intro, main, part = titles[0], titles[1], titles[2..-1]&.join(" -- ")
235
- end
236
- [{
237
- title_intro: intro,
238
- title_main: main,
239
- title_part: part,
240
- language: "en",
241
- script: "Latn",
242
- }]
186
+
187
+ RelatonBib::TypedTitleString.from_string t.text, "en", "Latn"
243
188
  end
244
189
 
245
190
  # Fetch dates
@@ -247,10 +192,11 @@ module RelatonItu
247
192
  # @return [Array<Hash>]
248
193
  def fetch_dates(doc)
249
194
  dates = []
250
- pdate = doc.at("//table/tr/td/span[contains(@id, 'Label5')]")
251
- publish_date = pdate&.text || ob_date(doc)
252
- if publish_date && !publish_date&.empty?
253
- dates << { type: "published", on: publish_date }
195
+ date = doc.at("//table/tr/td/span[contains(@id, 'Label5')]",
196
+ "//p[contains(.,'Approved in')]")
197
+ pdate = date&.text&.match(/\d{4}-\d{2}-\d{2}/).to_s || ob_date(doc)
198
+ if pdate && !pdate&.empty?
199
+ dates << { type: "published", on: pdate }
254
200
  end
255
201
  dates
256
202
  end
@@ -278,40 +224,45 @@ module RelatonItu
278
224
  # @param code [String, NilClass]
279
225
  # @return [Array<Hash>]
280
226
  def fetch_contributors(code)
227
+ return [] unless code
228
+
281
229
  abbrev = code.sub(/-\w\s.*/, "")
282
230
  case abbrev
283
231
  when "ITU"
284
232
  name = "International Telecommunication Union"
285
233
  url = "www.itu.int"
286
234
  end
287
- [{ entity: { name: name, url: url, abbreviation: abbrev }, role: [type: "publisher"] }]
235
+ [{ entity: { name: name, url: url, abbreviation: abbrev },
236
+ role: [type: "publisher"] }]
288
237
  end
289
238
 
290
- # Fetch ICS.
291
- # @param doc [Nokogiri::HTML::Document]
292
- # @return [Array<Hash>]
293
- # def fetch_ics(doc)
294
- # doc.xpath('//th[contains(text(), "ICS")]/following-sibling::td/a').map do |i|
295
- # code = i.text.match(/[\d\.]+/).to_s.split '.'
296
- # { field: code[0], group: code[1], subgroup: code[2] }
297
- # end
298
- # end
299
-
300
239
  # Fetch links.
301
240
  # @param doc [Nokogiri::HTML::Document]
302
241
  # @param url [String]
303
242
  # @return [Array<Hash>]
304
243
  def fetch_link(doc, url)
305
244
  links = [{ type: "src", content: url }]
306
- obp_elms = doc.at('//a[@title="Persistent link to download the PDF file"]')
307
- links << { type: "obp", content: DOMAIN + obp_elms[:href].strip } if obp_elms
245
+ obp_elm = doc.at(
246
+ '//a[@title="Persistent link to download the PDF file"]',
247
+ "//font[contains(.,'PDF')]/../..",
248
+ )
249
+ links << typed_link("obp", obp_elm) if obp_elm
250
+ wrd_elm = doc.at("//font[contains(.,'Word')]/../..")
251
+ links << typed_link("word", wrd_elm) if wrd_elm
308
252
  links
309
253
  end
310
254
 
255
+ def typed_link(type, elm)
256
+ {
257
+ type: type,
258
+ content: URI.join(HitCollection::DOMAIN + elm[:href].strip).to_s,
259
+ }
260
+ end
261
+
311
262
  # Fetch copyright.
312
263
  # @param code [String]
313
264
  # @param doc [Nokogiri::HTML::Document]
314
- # @return [Hash]
265
+ # @return [Array<Hash>]
315
266
  def fetch_copyright(code, doc)
316
267
  abbreviation = code.match(/^[^-]+/).to_s
317
268
  case abbreviation
@@ -321,9 +272,9 @@ module RelatonItu
321
272
  end
322
273
  fdate = doc.at("//table/tr/td/span[contains(@id, 'Label5')]")
323
274
  from = fdate&.text || ob_date(doc)
324
- { owner: { name: name, abbreviation: abbreviation, url: url }, from: from }
275
+ [{ owner: [{ name: name, abbreviation: abbreviation, url: url }],
276
+ from: from }]
325
277
  end
326
278
  end
327
279
  end
328
- # rubocop:enable Metrics/ModuleLength
329
280
  end