relaton-itu 1.20.1 → 1.20.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5f782e5e67be5f75336a106dd6e2366aabf9085391d20f0e0e0424d7965e2b7f
4
- data.tar.gz: 3491eda42b19b30bdbc7568f8a8e436c6d64bfcef80298ff6ca57c50e93737b8
3
+ metadata.gz: f70f91da524595847c16b4a8cb46dc7379a254822a84b03034e9b4965560ff4a
4
+ data.tar.gz: eb69591d51a38a1ad4c99f5914f2fd7be27abaad712947ea04a603939fe65196
5
5
  SHA512:
6
- metadata.gz: 48987e9207d2a28778de5442e53ac2cba35481a219791aba02f119ed2048ec3018effffbea90d825cb3702c4e3c04285e649264542a6686163d046e52a528957
7
- data.tar.gz: 2ece9eb73b4dbdb478d2c54d635c76cd8b11f54a59357aed2a8227a7fb69a5cb5d5cf7eaefcb4b0fa6a907fb93b95afcf80e0b950e44d533fb86456cc7db9bda
6
+ metadata.gz: 0d7300e181010d2bdd2829a0a888a0db59bd3c61448e09d840ab76f3a5c24b17700c5ea6ae4598e1aebfba82f173c3a619ad9e37e9581e26d636db601d178d99
7
+ data.tar.gz: b82909697c98ac580e0eaf13e1c0f0652365fb08f2951b8a86ffc7c972959703c916596b5f210624e6ec7945900502b63ac52f7ae516f4be0420fa79065ead8f
data/Gemfile CHANGED
@@ -13,3 +13,7 @@ gem "ruby-jing"
13
13
  gem "simplecov"
14
14
  gem "vcr"
15
15
  gem "webmock"
16
+
17
+ group :development do
18
+ gem 'pry'
19
+ end
@@ -45,6 +45,9 @@ module RelatonItu
45
45
  data = { json: params.to_json }
46
46
  resp = agent.post url, data
47
47
  @array = hits JSON.parse(resp.body)
48
+ rescue Mechanize::ResponseCodeError, SocketError, Timeout::Error, Errno::ECONNRESET,
49
+ EOFError, Net::ProtocolError, OpenSSL::SSL::SSLError => e
50
+ raise RelatonBib::RequestError, "Could not access #{url}: #{e.message}"
48
51
  end
49
52
 
50
53
  def request_document # rubocop:todo Metrics/MethodLength, Metrics/AbcSize
@@ -0,0 +1,66 @@
1
+ module RelatonItu
2
+ class RadioRegulationsParser
3
+ include Relaton::Core::ArrayWrapper
4
+
5
+ ROMAN_MONTHS = %w[I II III IV V VI VII VIII IX X XI XII].freeze
6
+
7
+ def initialize(hit)
8
+ @hit = hit
9
+ end
10
+
11
+ def doc
12
+ @doc ||= hit.hit_collection.agent.get doc_url
13
+ rescue Mechanize::ResponseCodeError, SocketError, Timeout::Error, Errno::ECONNRESET,
14
+ EOFError, Net::ProtocolError, OpenSSL::SSL::SSLError => e
15
+ raise RelatonBib::RequestError, "Could not access #{url}: #{e.message}"
16
+ end
17
+
18
+ def doc_url
19
+ CGI.unescape(hit.hit[:url]).split("dest=").last
20
+ end
21
+
22
+ def fetch_edition = nil
23
+ def fetch_status = nil
24
+ def fetch_workgroup = nil
25
+ def fetch_abstract = []
26
+ def fetch_relations = []
27
+
28
+ def fetch_titles
29
+ title = doc.at("//title")&.text&.strip
30
+ return [] if title.nil? || title.empty?
31
+
32
+ RelatonBib::TypedTitleString.from_string title, "en", "Latn"
33
+ end
34
+
35
+ def fetch_dates
36
+ array(doc_date).map { |on| { type: "published", on: on } }
37
+ end
38
+
39
+ def doc_date
40
+ return @doc_date if defined? @doc_date
41
+
42
+ date_str = doc.at("//td[@class='title']/text()")&.text&.slice(/(?<=Year:\s)(?:\d{1,2}\.\w+\.)?\d{4}/)
43
+ @doc_date = date_str ? roman_to_arabic(date_str) : nil
44
+ end
45
+
46
+ def fetch_link
47
+ [RelatonBib::TypedUri.new(type: "src", content: doc_url)]
48
+ end
49
+
50
+ private
51
+
52
+ attr_reader :hit
53
+
54
+ # Convert roman month number in string date to arabic number
55
+ # @param date [String]
56
+ # @return [String]
57
+ def roman_to_arabic(date)
58
+ %r{(?<rmonth>[IVX]+)} =~ date
59
+ if ROMAN_MONTHS.index(rmonth)
60
+ month = ROMAN_MONTHS.index(rmonth) + 1
61
+ Date.parse(date.sub(%r{[IVX]+}, month.to_s)).to_s
62
+ else date
63
+ end
64
+ end
65
+ end
66
+ end
@@ -0,0 +1,191 @@
1
+ module RelatonItu
2
+ # Parse ITU Recommendations from the ITU JSON API to Relaton format.
3
+ class RecommendationParser
4
+ include Relaton::Core::ArrayWrapper
5
+
6
+ RECHDR = "https://www.itu.int/mws/api/recommendations/getRecHdrDetail?idrec=%{idrec}&lang=en".freeze
7
+ RECEDITIONS = "https://www.itu.int/mws/api/recommendations/getRecEditions?idrec=%{idrec}&lang=en".freeze
8
+ RECSUPPLEMENTS = "https://www.itu.int/mws/api/recommendations/getRecSupplements?idrec=%{idrec}&lang=en".freeze
9
+ IMPLGUIDES = "https://www.itu.int/mws/api/recommendations/getImplGuides?idrec=%{idrec}&lang=en".freeze
10
+
11
+ def initialize(hit, idrec, imp)
12
+ @hit = hit
13
+ @idrec = idrec
14
+ @imp = imp
15
+ end
16
+
17
+ def doc
18
+ @doc ||= begin
19
+ url = (imp ? IMPLGUIDES : RECHDR ) % { idrec: idrec }
20
+ resp = get_data url
21
+ imp ? resp.first : resp
22
+ end
23
+ end
24
+
25
+ # @return [String, nil]
26
+ def fetch_edition
27
+ self_edition.dig("Version")
28
+ end
29
+
30
+ # Fetch titles.
31
+ # @return [RelatonBib::TypedTitleStringCollection]
32
+ def fetch_titles
33
+ title = imp ? doc["imp_title_e"] : doc["rec_title"]
34
+ return [] if title.nil? || title.empty?
35
+
36
+ RelatonBib::TypedTitleString.from_string title, "en", "Latn"
37
+ end
38
+
39
+ # Fetch status.
40
+ # @return [RelatonBib::DocumentStatus, NilClass]
41
+ def fetch_status
42
+ inforce = imp ? imp_status : doc["status"]
43
+ return if inforce.nil? || inforce.empty?
44
+
45
+ status = inforce == "In force" ? "Published" : "Withdrawal"
46
+ RelatonBib::DocumentStatus.new(stage: status)
47
+ end
48
+
49
+ # Fetch dates
50
+ # @return [Array<Hash>]
51
+ def fetch_dates
52
+ array(doc_date).map { |on| { type: "published", on: on } }
53
+ end
54
+
55
+ # Fetch workgroup.
56
+ # @return [RelatonItu::EditorialGroup, NilClass]
57
+ def fetch_workgroup
58
+ group = itugroup(doc["sg"])
59
+ EditorialGroup.new(
60
+ bureau: hit.hit[:code].match(/(?<=-)./).to_s, group: group
61
+ )
62
+ end
63
+
64
+ # Fetch abstracts.
65
+ # @return [Array<Hash>]
66
+ def fetch_abstract
67
+ array(doc["summary"]).map do |content|
68
+ { content: content, language: "en", script: "Latn" }
69
+ end
70
+ end
71
+
72
+ # Fetch links.
73
+ # @return [Array<Hash>]
74
+ def fetch_link
75
+ link = imp ? doc["imp_dms_link"] : doc["handle_id"]
76
+ links = [{ type: "src", content: link }]
77
+ links << typed_link("pdf", doc["handle_id_pdf_link"]) if doc["handle_id_pdf_link"]
78
+ imp_word_link { |wlink| links << typed_link("word", wlink) }
79
+ links
80
+ end
81
+
82
+ def doc_date
83
+ return @doc_date if defined? @doc_date
84
+
85
+ date = imp ? doc["imp_approval_date"] : doc["approval_date"]
86
+ @doc_date = Date.parse(date).to_s rescue date
87
+ end
88
+
89
+ # Fetch relations.
90
+ # @return [Array<Hash>]
91
+ def fetch_relations
92
+ relations = []
93
+ editions.each do |ed|
94
+ next if ed["idrec"] == idrec
95
+
96
+ relations << create_relation("hasEdition", ed["title"], ed["rec_name"])
97
+ end
98
+
99
+ supplements.each { |supp| relations << create_relation("complementOf", supp["title_text"], supp["rec_name"]) }
100
+ relations
101
+ end
102
+
103
+ private
104
+
105
+ attr_reader :hit, :idrec, :imp
106
+
107
+ # Get data.
108
+ # @param url [String, nil]
109
+ # @return [Hash, Array] parsed JSON response body
110
+ def get_data(url)
111
+ JSON.parse request_document(url).body
112
+ end
113
+
114
+ def request_document(url)
115
+ hit.hit_collection.agent.get url
116
+ rescue Mechanize::ResponseCodeError, SocketError, Timeout::Error, Errno::ECONNRESET,
117
+ EOFError, Net::ProtocolError, OpenSSL::SSL::SSLError => e
118
+ raise RelatonBib::RequestError, "Could not access #{url}: #{e.message}"
119
+ end
120
+
121
+ def editions
122
+ @editions ||= begin
123
+ url = RECEDITIONS % { idrec: idrec }
124
+ get_data(url) || []
125
+ end
126
+ end
127
+
128
+ def self_edition
129
+ @self_edition ||= editions.find { |ed| ed["idrec"] == idrec }
130
+ end
131
+
132
+ def imp_status
133
+ self_edition.dig("status")
134
+ end
135
+
136
+ # @param name [String]
137
+ # @return [RelatonItu::ItuGroup]
138
+ def itugroup(name) # rubocop:disable Metrics/MethodLength
139
+ return if name.nil? || name.empty?
140
+
141
+ if name.include? "Study Group"
142
+ type = "study-group"
143
+ acronym = "SG"
144
+ elsif name.include? "Telecommunication Standardization Advisory Group"
145
+ type = "tsag"
146
+ acronym = "TSAG"
147
+ else
148
+ type = "work-group"
149
+ acronym = "WG"
150
+ end
151
+ ItuGroup.new name: name, type: type, acronym: acronym
152
+ end
153
+
154
+ def imp_word_link
155
+ return unless doc["imp_dms_link"]
156
+ @doc_page ||= request_document(doc["imp_dms_link"])
157
+ wrd_elm = @doc_page.at("//font[contains(.,'Word')]/../..")
158
+ yield wrd_elm[:href] if block_given? && wrd_elm
159
+ end
160
+
161
+ def create_relation(type, title_text, id)
162
+ title = []
163
+ if title_text && !title.empty?
164
+ title << RelatonBib::TypedTitleString.new(content: title_text, language: "en", script: "Latn")
165
+ else
166
+ fref = RelatonBib::FormattedRef.new(content: id, language: "en", script: "Latn")
167
+ end
168
+
169
+ did = RelatonBib::DocumentIdentifier.new(id: id, type: "ITU", primary: true)
170
+ item = ItuBibliographicItem.new(title: title, formattedref: fref, docid: [did])
171
+ { type: "hasEdition", bibitem: item }
172
+ end
173
+
174
+ def supplements
175
+ @supplements ||= begin
176
+ if imp
177
+ []
178
+ else
179
+ url = RECSUPPLEMENTS % { idrec: idrec }
180
+ get_data(url) || []
181
+ end
182
+ end
183
+ end
184
+
185
+ # @param type [String]
186
+ # @param url [String]
187
+ def typed_link(type, url)
188
+ { type: type, content: url }
189
+ end
190
+ end
191
+ end
@@ -2,11 +2,13 @@
2
2
 
3
3
  require "nokogiri"
4
4
  require "net/http"
5
+ require_relative "recommendation_parser"
6
+ require_relative "radio_regulations_parser"
5
7
 
6
8
  module RelatonItu
7
9
  # Scrapper.
8
- module Scrapper
9
- ROMAN_MONTHS = %w[I II III IV V VI VII VIII IX X XI XII].freeze
10
+ class Scrapper
11
+ attr_reader :hit, :imp
10
12
 
11
13
  TYPES = {
12
14
  "ISO" => "international-standard",
@@ -23,302 +25,145 @@ module RelatonItu
23
25
  "Guide" => "guide",
24
26
  }.freeze
25
27
 
26
- class << self
27
- # Parse page.
28
- # @param hit [RelatonItu::Hit]
29
- # @return [Hash]
30
- def parse_page(hit, imp: false) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
31
- doc = get_page hit
32
- return unless doc.code == "200"
33
-
34
- if imp
35
- a = doc.at "//span[contains(@id, 'tab_ig_uc_rec')]/a"
36
- return unless a
37
-
38
- doc = get_page hit, a[:href].to_s
39
- end
40
-
41
- # Fetch edition.
42
- edition = doc.at("//table/tr/td[contains(@style,'color: white')]/span[contains(@id, 'Label8')]/b")&.text
43
- docid = fetch_docid(doc, hit)
44
-
45
- ItuBibliographicItem.new(
46
- id: fetch_id(docid),
47
- fetched: Date.today.to_s,
48
- type: "standard",
49
- docid: docid,
50
- edition: edition,
51
- language: ["en"],
52
- script: ["Latn"],
53
- title: fetch_titles(doc),
54
- doctype: DocumentType.new(type: hit.hit[:type]),
55
- docstatus: fetch_status(doc),
56
- ics: [], # fetch_ics(doc),
57
- date: fetch_dates(doc),
58
- contributor: fetch_contributors(hit.hit[:code]),
59
- editorialgroup: fetch_workgroup(hit.hit[:code], doc),
60
- abstract: fetch_abstract(doc, hit),
61
- copyright: fetch_copyright(hit.hit[:code], doc),
62
- link: fetch_link(doc),
63
- relation: fetch_relations(doc),
64
- place: ["Geneva"],
65
- )
66
- end
67
-
68
- private
69
-
70
- def fetch_id(docid)
71
- docid.find(&:primary).id.gsub(/[.\s()\/-]/, "")
72
- end
73
-
74
- # Fetch abstracts.
75
- # @param doc [Mechanize::Page]
76
- # @param hit [RelatonItu::Hit]
77
- # @return [Array<Hash>]
78
- def fetch_abstract(doc, hit) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
79
- abstract_url = doc.at '//table/tr/td[contains(@style,"color: white")]/span[contains(@id, "lbl_dms")]/div'
80
- if abstract_url
81
- url = abstract_url[:onclick].match(/https?[^']+/).to_s
82
- rsp = hit.hit_collection.agent.get url
83
- d = Nokogiri::HTML rsp.body.encode(undef: :replace, replace: "")
84
- d.css("p.MsoNormal").text.gsub("\r\n", "").squeeze(" ").gsub("\u00a0", "")
85
- elsif a = doc.at('//table/tr/td/span[contains(@class, "observation")]/text()')
86
- a.text.strip
87
- end => content
88
- return [] unless content
89
-
90
- [{
91
- content: content,
92
- language: "en",
93
- script: "Latn",
94
- }]
95
- rescue Mechanize::ResponseCodeError => e
96
- Util.error "HTTP Service Unavailable: #{e.message}"
97
- []
98
- end
99
-
100
- # Get page.
101
- # @param hit [RelatonItu::Hit]
102
- # @param url [String, nil]
103
- # @return [Array<String, Nokogiri::HTML::Document>]
104
- def get_page(hit, url = nil)
105
- uri = url || hit.hit[:url]
106
- hit.hit_collection.agent.get uri
107
- rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
108
- EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
109
- Net::ProtocolError, OpenSSL::SSL::SSLError
110
- raise RelatonBib::RequestError, "Could not access #{uri}"
111
- end
112
-
113
- # Fetch docid.
114
- # @param doc [Mechanize::Page]
115
- # @param hit [RelatonItu::Hit]
116
- # @return [Hash]
117
- def fetch_docid(doc, hit)
118
- docids = hit.hit[:code].to_s.split(" | ").map { |c| createdocid(c) }
119
- docids += parse_id(doc).map { |c| createdocid c.text } if docids.empty?
120
- docids << createdocid(title) unless docids.any?
121
- docids
122
- end
123
-
124
- def parse_id(doc)
125
- doc.xpath(
126
- "//span[@id='ctl00_content_main_uc_rec_main_info1_rpt_main_ctl00_lbl_rec']",
127
- "//td[.='Identical standard:']/following-sibling::td",
128
- "//div/table[1]/tr[4]/td/strong",
129
- )
130
- end
28
+ def initialize(hit, imp: false)
29
+ @hit = hit
30
+ @imp = imp
31
+ end
131
32
 
132
- # @param text [String]
133
- # @return [RelatonBib::DocumentIdentifier]
134
- def createdocid(text) # rubocop:disable Metrics/MethodLength
135
- # %r{
136
- # ^(?<code>(?:(?:ITU-\w|ISO/IEC)\s)?[^(:]*)
137
- # (?:\s\(V(?<version>\d+)\))?
138
- # (?:\s\((?:(?<_month>\d{2})/)?(?<_year>\d{4})\))?
139
- # (?::[^(]+\((?<buldate>\d{2}\.\w{1,4}\.\d{4})\))?
140
- # (?:\s(?<corr>(?:Amd|Cor)\.\s?\d+))?
141
- # # (\s\(((?<_cormonth>\d{2})\/)?(?<_coryear>\d{4})\))?
142
- # }x =~ text.squeeze(" ")
143
- # corr&.sub!(/\.\s?/, " ")
144
- # id = [code.sub(/[[:space:]]$/, ""), corr].compact.join " "
145
- # id += " (V#{version})" if version
146
- # id += " - #{buldate}" if buldate
147
- # type = id.match(%r{^\w+}).to_s
148
- # type = "ITU" if type == "G"
149
- if text.match?(/^(?:ISO|ETSI)/)
150
- type = "ISO"
151
- text.match(/[^(]+/).to_s.strip.squeeze(" ")
152
- else
153
- pubid = Pubid.parse(text)
154
- type = pubid.prefix # == "G" ? "ITU" : pubid.prefix
155
- pubid.to_s
156
- end => id
157
- RelatonBib::DocumentIdentifier.new(type: type, id: id, primary: true)
158
- end
33
+ def self.parse_page(hit, imp: false)
34
+ new(hit, imp: imp).parse_page
35
+ end
159
36
 
160
- # Fetch status.
161
- # @param doc [Mechanize::Page]
162
- # @return [RelatonBib::DocumentStatus, NilClass]
163
- def fetch_status(doc)
164
- s = doc.at("//table/tr/td/span[contains(@id, 'Label7')]",
165
- "//p[contains(.,'Status :')]")
166
- return unless s
37
+ # Parse page.
38
+ # @return [RelatonItu::ItuBibliographicItem, nil]
39
+ def parse_page # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
40
+ return unless parser.doc
41
+
42
+ ItuBibliographicItem.new(
43
+ id: fetch_id,
44
+ fetched: Date.today.to_s,
45
+ type: "standard",
46
+ docid: docid,
47
+ edition: parser.fetch_edition,
48
+ language: ["en"],
49
+ script: ["Latn"],
50
+ title: parser.fetch_titles,
51
+ doctype: DocumentType.new(type: hit.hit[:type]),
52
+ docstatus: parser.fetch_status,
53
+ ics: [], # fetch_ics(doc),
54
+ date: parser.fetch_dates,
55
+ contributor: fetch_contributors,
56
+ editorialgroup: parser.fetch_workgroup,
57
+ abstract: parser.fetch_abstract,
58
+ copyright: fetch_copyright,
59
+ link: parser.fetch_link,
60
+ relation: parser.fetch_relations,
61
+ place: ["Geneva"],
62
+ )
63
+ end
167
64
 
168
- status = s.text.include?("In force") ? "Published" : "Withdrawal"
169
- RelatonBib::DocumentStatus.new(stage: status)
170
- end
65
+ private
171
66
 
172
- # Fetch workgroup.
173
- # @param code [String]
174
- # @param doc [Mechanize::Page]
175
- # @return [RelatonItu::EditorialGroup, NilClass]
176
- def fetch_workgroup(code, doc)
177
- wg = doc.at('//table/tr/td/span[contains(@id, "Label8")]/a')
178
- # return unless wg
67
+ def idrec
68
+ return @idrec if defined? @idrec
179
69
 
180
- group = wg && itugroup(wg.text)
181
- EditorialGroup.new(
182
- bureau: code.match(/(?<=-)./).to_s, group: group
183
- )
184
- end
70
+ @idrec = CGI.unescape(hit.hit[:url]).split("/").last.slice(/^\d+(?=-)/)&.to_i
71
+ end
185
72
 
186
- # @param name [String]
187
- # @return [RelatonItu::ItuGroup]
188
- def itugroup(name) # rubocop:disable Metrics/MethodLength
189
- if name.include? "Study Group"
190
- type = "study-group"
191
- acronym = "SG"
192
- elsif name.include? "Telecommunication Standardization Advisory Group"
193
- type = "tsag"
194
- acronym = "TSAG"
73
+ def parser
74
+ @parser ||= begin
75
+ if idrec
76
+ RecommendationParser.new hit, idrec, imp
195
77
  else
196
- type = "work-group"
197
- acronym = "WG"
198
- end
199
- ItuGroup.new name: name, type: type, acronym: acronym
200
- end
201
-
202
- # Fetch relations.
203
- # @param doc [Mechanize::Page]
204
- # @return [Array<Hash>]
205
- def fetch_relations(doc)
206
- doc.xpath('//div[contains(@id, "tab_sup")]//table/tr[position()>2]')
207
- .map do |r|
208
- ref = r.at('./td/span[contains(@id, "title_e")]/nobr/a')
209
- fref = RelatonBib::FormattedRef.new(content: ref.text, language: "en",
210
- script: "Latn")
211
- did = RelatonBib::DocumentIdentifier.new(id: ref.text, type: "ITU")
212
- bibitem = ItuBibliographicItem.new(formattedref: fref, docid: [did],
213
- type: "standard")
214
- { type: "complementOf", bibitem: bibitem }
215
- end
216
- end
217
-
218
- # Fetch titles.
219
- # @param doc [Mechanize::Page]
220
- # @return [RelatonBib::TypedTitleStringCollection]
221
- def fetch_titles(doc)
222
- t = doc.at("//td[@class='title']|//div/table[1]/tr[4]/td/strong")
223
- return [] unless t
224
-
225
- RelatonBib::TypedTitleString.from_string t.text, "en", "Latn"
226
- end
227
-
228
- # Fetch dates
229
- # @param doc [Mechanize::Page]
230
- # @return [Array<Hash>]
231
- def fetch_dates(doc) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
232
- dates = []
233
- date = doc.at("//table/tr/td/span[contains(@id, 'Label5')]",
234
- "//p[contains(.,'Approved in')]")
235
- pdate = date&.text&.match(/\d{4}-\d{2}-\d{2}/).to_s || ob_date(doc)
236
- if pdate && !pdate&.empty?
237
- dates << { type: "published", on: pdate }
238
- elsif pdate = ob_date(doc)
239
- dates << { type: "published", on: pdate }
240
- end
241
- dates
242
- end
243
-
244
- # Scrape Operational Bulletin date.
245
- # @param doc [Mechanize::Page]
246
- # @return [String]
247
- def ob_date(doc)
248
- pdate = doc.at('//table/tbody/tr/td[contains(text(), "Year:")]')
249
- return unless pdate
250
-
251
- roman_to_arabic pdate.text.match(%r{(?<=Year: )(\d{2}.\w+.)?\d{4}}).to_s
252
- end
253
-
254
- # Convert roman month number in string date to arabic number
255
- # @param date [String]
256
- # @return [String]
257
- def roman_to_arabic(date)
258
- %r{(?<rmonth>[IVX]+)} =~ date
259
- if ROMAN_MONTHS.index(rmonth)
260
- month = ROMAN_MONTHS.index(rmonth) + 1
261
- Date.parse(date.sub(%r{[IVX]+}, month.to_s)).to_s
262
- else date
78
+ RadioRegulationsParser.new hit
263
79
  end
264
80
  end
81
+ end
265
82
 
266
- # Fetch contributors
267
- # @param doc [Mechanize::Page]
268
- # @return [Array<Hash>]
269
- def fetch_contributors(code)
270
- return [] unless code
83
+ def fetch_id
84
+ docid.find(&:primary).id.gsub(/[.\s()\/-]/, "")
85
+ end
271
86
 
272
- abbrev = code.sub(/-\w\s.*/, "")
273
- case abbrev
274
- when "ITU"
275
- name = "International Telecommunication Union"
276
- url = "www.itu.int"
277
- end
278
- [{ entity: { name: name, url: url, abbreviation: abbrev },
279
- role: [type: "publisher"] }]
87
+ # Fetch docid.
88
+ # @return [Array<RelatonBib::DocumentIdentifier>]
89
+ def docid
90
+ @docid ||= begin
91
+ docids = hit.hit[:code].to_s.split(" | ").map { |c| createdocid(c) }
92
+ docids << createdocid(doc["rec_name"]) if docids.empty?
93
+ docids
280
94
  end
95
+ end
281
96
 
282
- # Fetch links.
283
- # @param doc [Mechanize::Page]
284
- # @return [Array<Hash>]
285
- def fetch_link(doc)
286
- links = [{ type: "src", content: doc.uri.to_s }]
287
- obp_elm = doc.at(
288
- '//a[@title="Persistent link to download the PDF file"]',
289
- "//font[contains(.,'PDF')]/../..",
290
- )
291
- links << typed_link("obp", obp_elm) if obp_elm
292
- wrd_elm = doc.at("//font[contains(.,'Word')]/../..")
293
- links << typed_link("word", wrd_elm) if wrd_elm
294
- links
295
- end
97
+ # @param text [String]
98
+ # @return [RelatonBib::DocumentIdentifier]
99
+ def createdocid(text) # rubocop:disable Metrics/MethodLength
100
+ # %r{
101
+ # ^(?<code>(?:(?:ITU-\w|ISO/IEC)\s)?[^(:]*)
102
+ # (?:\s\(V(?<version>\d+)\))?
103
+ # (?:\s\((?:(?<_month>\d{2})/)?(?<_year>\d{4})\))?
104
+ # (?::[^(]+\((?<buldate>\d{2}\.\w{1,4}\.\d{4})\))?
105
+ # (?:\s(?<corr>(?:Amd|Cor)\.\s?\d+))?
106
+ # # (\s\(((?<_cormonth>\d{2})\/)?(?<_coryear>\d{4})\))?
107
+ # }x =~ text.squeeze(" ")
108
+ # corr&.sub!(/\.\s?/, " ")
109
+ # id = [code.sub(/[[:space:]]$/, ""), corr].compact.join " "
110
+ # id += " (V#{version})" if version
111
+ # id += " - #{buldate}" if buldate
112
+ # type = id.match(%r{^\w+}).to_s
113
+ # type = "ITU" if type == "G"
114
+ if text.match?(/^(?:ISO|ETSI)/)
115
+ type = "ISO"
116
+ text.match(/[^(]+/).to_s.strip.squeeze(" ")
117
+ else
118
+ pubid = Pubid.parse(text)
119
+ type = pubid.prefix # == "G" ? "ITU" : pubid.prefix
120
+ pubid.to_s
121
+ end => id
122
+ RelatonBib::DocumentIdentifier.new(type: type, id: id, primary: true)
123
+ end
296
124
 
297
- # @param type [String]
298
- # @param elm [Nokogiri::XML::Element]
299
- def typed_link(type, elm)
300
- {
301
- type: type,
302
- content: URI.join(HitCollection::DOMAIN, elm[:href].strip).to_s,
303
- }
304
- end
125
+ # def fetch_data(url)
126
+ # resp = hit.hit_collection.agent.get url
127
+ # JSON.parse(resp.body)
128
+ # rescue Mechanize::ResponseCodeError => e
129
+ # Util.error "HTTP Service Unavailable: #{e.message}"
130
+ # nil
131
+ # end
132
+
133
+ # Scrape Operational Bulletin date.
134
+ # @param doc [Mechanize::Page]
135
+ # @return [String]
136
+ # def ob_date(doc)
137
+ # pdate = doc.at('//table/tbody/tr/td[contains(text(), "Year:")]')
138
+ # return unless pdate
139
+
140
+ # roman_to_arabic pdate.text.match(%r{(?<=Year: )(\d{2}.\w+.)?\d{4}}).to_s
141
+ # end
142
+
143
+ # Fetch contributors
144
+ # @return [Array<Hash>]
145
+ def fetch_contributors
146
+ return [] unless hit.hit[:code]
147
+
148
+ abbrev = hit.hit[:code].sub(/-\w\s.*/, "")
149
+ case abbrev
150
+ when "ITU"
151
+ name = "International Telecommunication Union"
152
+ url = "www.itu.int"
153
+ end
154
+ [{ entity: { name: name, url: url, abbreviation: abbrev }, role: [type: "publisher"] }]
155
+ end
305
156
 
306
- # Fetch copyright.
307
- # @param code [String]
308
- # @param doc [Mechanize::Page]
309
- # @return [Array<Hash>]
310
- def fetch_copyright(code, doc)
311
- abbreviation = code.match(/^[^-]+/).to_s
312
- case abbreviation
313
- when "ITU"
314
- name = "International Telecommunication Union"
315
- url = "www.itu.int"
316
- end
317
- fdate = doc.at("//table/tr/td/span[contains(@id, 'Label5')]")
318
- from = fdate&.text || ob_date(doc)
319
- [{ owner: [{ name: name, abbreviation: abbreviation, url: url }],
320
- from: from }]
157
+ # Fetch copyright.
158
+ # @return [Array<Hash>]
159
+ def fetch_copyright
160
+ abbreviation = hit.hit[:code].match(/^[^-]+/).to_s
161
+ case abbreviation
162
+ when "ITU"
163
+ name = "International Telecommunication Union"
164
+ url = "www.itu.int"
321
165
  end
166
+ [{ owner: [{ name: name, abbreviation: abbreviation, url: url }], from: parser.doc_date }]
322
167
  end
323
168
  end
324
169
  end
@@ -1,3 +1,3 @@
1
1
  module RelatonItu
2
- VERSION = "1.20.1".freeze
2
+ VERSION = "1.20.3".freeze
3
3
  end
@@ -14,8 +14,7 @@ module RelatonItu
14
14
  # @param ext [Nokogiri::XML::Element]
15
15
  # @return [RelatonItu::EditorialGroup]
16
16
  def fetch_editorialgroup(ext)
17
- eg = ext.at("./editorialgroup")
18
- return unless eg
17
+ return unless ext && (eg = ext.at "editorialgroup")
19
18
 
20
19
  EditorialGroup.new(
21
20
  bureau: eg.at("bureau")&.text,
@@ -51,8 +50,7 @@ module RelatonItu
51
50
  # @param ext [Nokogiri::XML::Element]
52
51
  # @return [RelatonItu::StructuredIdentifier]
53
52
  def fetch_structuredidentifier(ext)
54
- sid = ext.at "./structuredidentifier"
55
- return unless sid
53
+ return unless ext && (sid = ext.at "./structuredidentifier")
56
54
 
57
55
  br = sid.at("bureau").text
58
56
  dn = sid.at("docnumber").text
data/lib/relaton_itu.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  require "mechanize"
2
2
  require "parslet"
3
3
  require "digest/md5"
4
+ require "relaton/core"
4
5
  require "relaton/index"
5
6
  require "relaton_bib"
6
7
  require "relaton_itu/version"
data/relaton_itu.gemspec CHANGED
@@ -28,5 +28,6 @@ Gem::Specification.new do |spec|
28
28
  spec.add_dependency "mechanize", "~> 2.10"
29
29
  spec.add_dependency "parslet", "~> 2.0.0"
30
30
  spec.add_dependency "relaton-bib", "~> 1.20.0"
31
+ spec.add_dependency "relaton-core", "~> 0.0.6"
31
32
  spec.add_dependency "relaton-index", "~> 0.2.0"
32
33
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-itu
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.20.1
4
+ version: 1.20.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-01-16 00:00:00.000000000 Z
11
+ date: 2026-01-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
54
  version: 1.20.0
55
+ - !ruby/object:Gem::Dependency
56
+ name: relaton-core
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - "~>"
60
+ - !ruby/object:Gem::Version
61
+ version: 0.0.6
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - "~>"
67
+ - !ruby/object:Gem::Version
68
+ version: 0.0.6
55
69
  - !ruby/object:Gem::Dependency
56
70
  name: relaton-index
57
71
  requirement: !ruby/object:Gem::Requirement
@@ -104,6 +118,8 @@ files:
104
118
  - lib/relaton_itu/itu_group.rb
105
119
  - lib/relaton_itu/processor.rb
106
120
  - lib/relaton_itu/pubid.rb
121
+ - lib/relaton_itu/radio_regulations_parser.rb
122
+ - lib/relaton_itu/recommendation_parser.rb
107
123
  - lib/relaton_itu/scrapper.rb
108
124
  - lib/relaton_itu/structured_identifier.rb
109
125
  - lib/relaton_itu/util.rb
@@ -129,7 +145,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
129
145
  - !ruby/object:Gem::Version
130
146
  version: '0'
131
147
  requirements: []
132
- rubygems_version: 3.3.27
148
+ rubygems_version: 3.5.22
133
149
  signing_key:
134
150
  specification_version: 4
135
151
  summary: 'RelatonItu: retrieve ITU Standards for bibliographic use using the BibliographicItem