relaton-itu 1.20.1 → 1.20.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +4 -0
- data/lib/relaton_itu/hit_collection.rb +3 -0
- data/lib/relaton_itu/radio_regulations_parser.rb +66 -0
- data/lib/relaton_itu/recommendation_parser.rb +191 -0
- data/lib/relaton_itu/scrapper.rb +127 -282
- data/lib/relaton_itu/version.rb +1 -1
- data/lib/relaton_itu/xml_parser.rb +2 -4
- data/lib/relaton_itu.rb +1 -0
- data/relaton_itu.gemspec +1 -0
- metadata +19 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: f70f91da524595847c16b4a8cb46dc7379a254822a84b03034e9b4965560ff4a
|
|
4
|
+
data.tar.gz: eb69591d51a38a1ad4c99f5914f2fd7be27abaad712947ea04a603939fe65196
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 0d7300e181010d2bdd2829a0a888a0db59bd3c61448e09d840ab76f3a5c24b17700c5ea6ae4598e1aebfba82f173c3a619ad9e37e9581e26d636db601d178d99
|
|
7
|
+
data.tar.gz: b82909697c98ac580e0eaf13e1c0f0652365fb08f2951b8a86ffc7c972959703c916596b5f210624e6ec7945900502b63ac52f7ae516f4be0420fa79065ead8f
|
data/Gemfile
CHANGED
|
@@ -45,6 +45,9 @@ module RelatonItu
|
|
|
45
45
|
data = { json: params.to_json }
|
|
46
46
|
resp = agent.post url, data
|
|
47
47
|
@array = hits JSON.parse(resp.body)
|
|
48
|
+
rescue Mechanize::ResponseCodeError, SocketError, Timeout::Error, Errno::ECONNRESET,
|
|
49
|
+
EOFError, Net::ProtocolError, OpenSSL::SSL::SSLError => e
|
|
50
|
+
raise RelatonBib::RequestError, "Could not access #{url}: #{e.message}"
|
|
48
51
|
end
|
|
49
52
|
|
|
50
53
|
def request_document # rubocop:todo Metrics/MethodLength, Metrics/AbcSize
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
module RelatonItu
  # Extracts bibliographic fields for a hit whose document is an HTML page
  # (used by Scrapper when no numeric record id can be derived from the hit URL).
  # Fields that the page does not provide are returned as nil / empty arrays so
  # the class exposes the same fetch_* interface as RecommendationParser.
  class RadioRegulationsParser
    include Relaton::Core::ArrayWrapper

    # Roman numerals I..XII; the index + 1 is the month number.
    ROMAN_MONTHS = %w[I II III IV V VI VII VIII IX X XI XII].freeze

    # @param hit [RelatonItu::Hit] search hit; must expose `hit` (a Hash with :url)
    #   and `hit_collection.agent` (the HTTP agent)
    def initialize(hit)
      @hit = hit
    end

    # Fetch (once) and memoize the document page.
    # @return [Mechanize::Page] presumably a Mechanize page — whatever `agent.get` returns
    # @raise [RelatonBib::RequestError] when the page cannot be retrieved
    def doc
      @doc ||= hit.hit_collection.agent.get doc_url
    rescue Mechanize::ResponseCodeError, SocketError, Timeout::Error, Errno::ECONNRESET,
           EOFError, Net::ProtocolError, OpenSSL::SSL::SSLError => e
      # The message must use doc_url: there is no local `url` in this method, and
      # referencing it would raise NameError instead of the intended RequestError.
      raise RelatonBib::RequestError, "Could not access #{doc_url}: #{e.message}"
    end

    # The real document URL: the part of the unescaped hit URL after the last "dest=".
    # If the hit URL has no "dest=" the whole unescaped URL is returned.
    # @return [String]
    def doc_url
      CGI.unescape(hit.hit[:url]).split("dest=").last
    end

    # The page carries none of the following data, so these are constant stubs.
    def fetch_edition = nil
    def fetch_status = nil
    def fetch_workgroup = nil
    def fetch_abstract = []
    def fetch_relations = []

    # Title taken from the page's <title> element.
    # @return [RelatonBib::TypedTitleStringCollection, Array] empty array when there is no title
    def fetch_titles
      title = doc.at("//title")&.text&.strip
      return [] if title.nil? || title.empty?

      RelatonBib::TypedTitleString.from_string title, "en", "Latn"
    end

    # @return [Array<Hash>] zero or one `{ type: "published", on: String }` entries
    def fetch_dates
      # `array` comes from Relaton::Core::ArrayWrapper; presumably it turns nil into []
      # and wraps a single value — TODO confirm.
      array(doc_date).map { |on| { type: "published", on: on } }
    end

    # Publication date scraped from the "Year: ..." text of the title cell.
    # Accepts either a bare year ("2020") or "DD.<month>.YYYY" where the month may be
    # a roman numeral. Memoized with `defined?` so a nil result is cached too.
    # @return [String, nil]
    def doc_date
      return @doc_date if defined? @doc_date

      date_str = doc.at("//td[@class='title']/text()")&.text&.slice(/(?<=Year:\s)(?:\d{1,2}\.\w+\.)?\d{4}/)
      @doc_date = date_str ? roman_to_arabic(date_str) : nil
    end

    # @return [Array<RelatonBib::TypedUri>] single "src" link pointing at the document URL
    def fetch_link
      [RelatonBib::TypedUri.new(type: "src", content: doc_url)]
    end

    private

    attr_reader :hit

    # Convert roman month number in string date to arabic number
    # @param date [String] e.g. "15.XII.2020" or "2020"
    # @return [String] ISO date ("2020-12-15") when a roman month was found,
    #   otherwise the input unchanged
    def roman_to_arabic(date)
      # Named capture assigns local `rmonth` (nil when there is no roman numeral).
      %r{(?<rmonth>[IVX]+)} =~ date
      index = ROMAN_MONTHS.index(rmonth)
      return date unless index

      Date.parse(date.sub(%r{[IVX]+}, (index + 1).to_s)).to_s
    end
  end
end
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
module RelatonItu
  # Parse ITU recommendations (and, when `imp` is set, the records served by the
  # getImplGuides endpoint) from the ITU JSON web API into Relaton format.
  class RecommendationParser
    include Relaton::Core::ArrayWrapper

    RECHDR = "https://www.itu.int/mws/api/recommendations/getRecHdrDetail?idrec=%{idrec}&lang=en".freeze
    RECEDITIONS = "https://www.itu.int/mws/api/recommendations/getRecEditions?idrec=%{idrec}&lang=en".freeze
    RECSUPPLEMENTS = "https://www.itu.int/mws/api/recommendations/getRecSupplements?idrec=%{idrec}&lang=en".freeze
    IMPLGUIDES = "https://www.itu.int/mws/api/recommendations/getImplGuides?idrec=%{idrec}&lang=en".freeze

    # @param hit [RelatonItu::Hit] search hit; must expose `hit` (Hash with :code)
    #   and `hit_collection.agent` (the HTTP agent)
    # @param idrec [Integer] record id substituted into the API URLs
    # @param imp [Boolean] when truthy, use the IMPLGUIDES endpoint and its imp_* fields
    def initialize(hit, idrec, imp)
      @hit = hit
      @idrec = idrec
      @imp = imp
    end

    # Fetch (once) and memoize the parsed JSON record.
    # For `imp` the endpoint response is treated as a list and its first element is used.
    # @return [Hash, nil] presumably a JSON object — TODO confirm against the API
    def doc
      @doc ||= begin
        url = (imp ? IMPLGUIDES : RECHDR) % { idrec: idrec }
        resp = get_data url
        imp ? resp.first : resp
      end
    end

    # @return [String, nil] "Version" of this record's own edition entry; nil when
    #   the editions list has no entry for `idrec`
    def fetch_edition
      self_edition&.dig("Version")
    end

    # Fetch titles.
    # @return [RelatonBib::TypedTitleStringCollection, Array] empty array when there is no title
    def fetch_titles
      title = imp ? doc["imp_title_e"] : doc["rec_title"]
      return [] if title.nil? || title.empty?

      RelatonBib::TypedTitleString.from_string title, "en", "Latn"
    end

    # Fetch status.
    # @return [RelatonBib::DocumentStatus, NilClass]
    def fetch_status
      inforce = imp ? imp_status : doc["status"]
      return if inforce.nil? || inforce.empty?

      # Anything other than the literal "In force" is reported as "Withdrawal".
      status = inforce == "In force" ? "Published" : "Withdrawal"
      RelatonBib::DocumentStatus.new(stage: status)
    end

    # Fetch dates
    # @return [Array<Hash>] zero or one `{ type: "published", on: String }` entries
    def fetch_dates
      # `array` comes from Relaton::Core::ArrayWrapper; presumably it turns nil into []
      # and wraps a single value — TODO confirm.
      array(doc_date).map { |on| { type: "published", on: on } }
    end

    # Fetch workgroup.
    # The bureau is the single character following the first "-" in the hit code;
    # the group is built from the record's "sg" field and is nil when that is blank.
    # @return [RelatonItu::EditorialGroup]
    def fetch_workgroup
      group = itugroup(doc["sg"])
      EditorialGroup.new(
        bureau: hit.hit[:code].match(/(?<=-)./).to_s, group: group
      )
    end

    # Fetch abstracts.
    # @return [Array<Hash>]
    def fetch_abstract
      array(doc["summary"]).map do |content|
        { content: content, language: "en", script: "Latn" }
      end
    end

    # Fetch links.
    # @return [Array<Hash>] a "src" link, plus "pdf" and "word" links when available
    def fetch_link
      link = imp ? doc["imp_dms_link"] : doc["handle_id"]
      links = [{ type: "src", content: link }]
      links << typed_link("pdf", doc["handle_id_pdf_link"]) if doc["handle_id_pdf_link"]
      imp_word_link { |wlink| links << typed_link("word", wlink) }
      links
    end

    # Approval date normalized to ISO format when it can be parsed, otherwise the
    # raw value (possibly nil). Memoized with `defined?` so a nil result is cached too.
    # @return [String, nil]
    def doc_date
      return @doc_date if defined? @doc_date

      date = imp ? doc["imp_approval_date"] : doc["approval_date"]
      @doc_date = begin
        Date.parse(date).to_s
      rescue StandardError
        # Unparseable or missing date: keep whatever the API returned.
        date
      end
    end

    # Fetch relations.
    # Other editions of this record become "hasEdition" relations and supplements
    # become "complementOf" relations.
    # @return [Array<Hash>]
    def fetch_relations
      edition_relations = editions.reject { |ed| ed["idrec"] == idrec }.map do |ed|
        create_relation("hasEdition", ed["title"], ed["rec_name"])
      end
      supplement_relations = supplements.map do |supp|
        create_relation("complementOf", supp["title_text"], supp["rec_name"])
      end
      edition_relations + supplement_relations
    end

    private

    attr_reader :hit, :idrec, :imp

    # Get data.
    # @param url [String]
    # @return [Hash, Array, nil] parsed JSON body of the response
    # @raise [RelatonBib::RequestError] when the request fails
    def get_data(url)
      JSON.parse request_document(url).body
    end

    # Perform a GET through the hit collection's agent, translating network and
    # HTTP failures into RelatonBib::RequestError.
    # @param url [String]
    # @return [Object] whatever `agent.get` returns (responds to `body`)
    def request_document(url)
      hit.hit_collection.agent.get url
    rescue Mechanize::ResponseCodeError, SocketError, Timeout::Error, Errno::ECONNRESET,
           EOFError, Net::ProtocolError, OpenSSL::SSL::SSLError => e
      raise RelatonBib::RequestError, "Could not access #{url}: #{e.message}"
    end

    # @return [Array<Hash>] edition entries for this record (empty when the API returns null)
    def editions
      @editions ||= begin
        url = RECEDITIONS % { idrec: idrec }
        get_data(url) || []
      end
    end

    # @return [Hash, nil] the edition entry whose "idrec" equals this record's id
    def self_edition
      @self_edition ||= editions.find { |ed| ed["idrec"] == idrec }
    end

    # @return [String, nil] status of this record's own edition entry
    def imp_status
      self_edition&.dig("status")
    end

    # @param name [String, nil]
    # @return [RelatonItu::ItuGroup, nil] nil when the name is blank
    def itugroup(name) # rubocop:disable Metrics/MethodLength
      return if name.nil? || name.empty?

      if name.include? "Study Group"
        type = "study-group"
        acronym = "SG"
      elsif name.include? "Telecommunication Standardization Advisory Group"
        type = "tsag"
        acronym = "TSAG"
      else
        type = "work-group"
        acronym = "WG"
      end
      ItuGroup.new name: name, type: type, acronym: acronym
    end

    # Yield the href of the "Word" download found on the page behind
    # "imp_dms_link", if the record has that link and the page has such an element.
    # @yieldparam href [String]
    def imp_word_link
      return unless doc["imp_dms_link"]

      @doc_page ||= request_document(doc["imp_dms_link"])
      wrd_elm = @doc_page.at("//font[contains(.,'Word')]/../..")
      yield wrd_elm[:href] if block_given? && wrd_elm
    end

    # Build one relation entry.
    # @param type [String] relation type, e.g. "hasEdition" or "complementOf"
    # @param title_text [String, nil] title of the related document
    # @param id [String] identifier of the related document
    # @return [Hash] `{ type:, bibitem: }`
    def create_relation(type, title_text, id)
      title = []
      # Check the incoming text, not the (always empty) `title` accumulator.
      if title_text && !title_text.empty?
        title << RelatonBib::TypedTitleString.new(content: title_text, language: "en", script: "Latn")
      else
        # No usable title: fall back to a formatted reference built from the id.
        fref = RelatonBib::FormattedRef.new(content: id, language: "en", script: "Latn")
      end

      did = RelatonBib::DocumentIdentifier.new(id: id, type: "ITU", primary: true)
      item = ItuBibliographicItem.new(title: title, formattedref: fref, docid: [did])
      # Honor the requested type so supplements are not mislabeled as editions.
      { type: type, bibitem: item }
    end

    # @return [Array<Hash>] supplement entries; always empty for `imp` records
    def supplements
      @supplements ||= begin
        if imp
          []
        else
          url = RECSUPPLEMENTS % { idrec: idrec }
          get_data(url) || []
        end
      end
    end

    # @param type [String]
    # @param url [String]
    # @return [Hash]
    def typed_link(type, url)
      { type: type, content: url }
    end
  end
end
|
data/lib/relaton_itu/scrapper.rb
CHANGED
|
@@ -2,11 +2,13 @@
|
|
|
2
2
|
|
|
3
3
|
require "nokogiri"
|
|
4
4
|
require "net/http"
|
|
5
|
+
require_relative "recommendation_parser"
|
|
6
|
+
require_relative "radio_regulations_parser"
|
|
5
7
|
|
|
6
8
|
module RelatonItu
|
|
7
9
|
# Scrapper.
|
|
8
|
-
|
|
9
|
-
|
|
10
|
+
class Scrapper
|
|
11
|
+
attr_reader :hit, :imp
|
|
10
12
|
|
|
11
13
|
TYPES = {
|
|
12
14
|
"ISO" => "international-standard",
|
|
@@ -23,302 +25,145 @@ module RelatonItu
|
|
|
23
25
|
"Guide" => "guide",
|
|
24
26
|
}.freeze
|
|
25
27
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
def parse_page(hit, imp: false) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
|
31
|
-
doc = get_page hit
|
|
32
|
-
return unless doc.code == "200"
|
|
33
|
-
|
|
34
|
-
if imp
|
|
35
|
-
a = doc.at "//span[contains(@id, 'tab_ig_uc_rec')]/a"
|
|
36
|
-
return unless a
|
|
37
|
-
|
|
38
|
-
doc = get_page hit, a[:href].to_s
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
# Fetch edition.
|
|
42
|
-
edition = doc.at("//table/tr/td[contains(@style,'color: white')]/span[contains(@id, 'Label8')]/b")&.text
|
|
43
|
-
docid = fetch_docid(doc, hit)
|
|
44
|
-
|
|
45
|
-
ItuBibliographicItem.new(
|
|
46
|
-
id: fetch_id(docid),
|
|
47
|
-
fetched: Date.today.to_s,
|
|
48
|
-
type: "standard",
|
|
49
|
-
docid: docid,
|
|
50
|
-
edition: edition,
|
|
51
|
-
language: ["en"],
|
|
52
|
-
script: ["Latn"],
|
|
53
|
-
title: fetch_titles(doc),
|
|
54
|
-
doctype: DocumentType.new(type: hit.hit[:type]),
|
|
55
|
-
docstatus: fetch_status(doc),
|
|
56
|
-
ics: [], # fetch_ics(doc),
|
|
57
|
-
date: fetch_dates(doc),
|
|
58
|
-
contributor: fetch_contributors(hit.hit[:code]),
|
|
59
|
-
editorialgroup: fetch_workgroup(hit.hit[:code], doc),
|
|
60
|
-
abstract: fetch_abstract(doc, hit),
|
|
61
|
-
copyright: fetch_copyright(hit.hit[:code], doc),
|
|
62
|
-
link: fetch_link(doc),
|
|
63
|
-
relation: fetch_relations(doc),
|
|
64
|
-
place: ["Geneva"],
|
|
65
|
-
)
|
|
66
|
-
end
|
|
67
|
-
|
|
68
|
-
private
|
|
69
|
-
|
|
70
|
-
def fetch_id(docid)
|
|
71
|
-
docid.find(&:primary).id.gsub(/[.\s()\/-]/, "")
|
|
72
|
-
end
|
|
73
|
-
|
|
74
|
-
# Fetch abstracts.
|
|
75
|
-
# @param doc [Mechanize::Page]
|
|
76
|
-
# @param hit [RelatonItu::Hit]
|
|
77
|
-
# @return [Array<Hash>]
|
|
78
|
-
def fetch_abstract(doc, hit) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
|
79
|
-
abstract_url = doc.at '//table/tr/td[contains(@style,"color: white")]/span[contains(@id, "lbl_dms")]/div'
|
|
80
|
-
if abstract_url
|
|
81
|
-
url = abstract_url[:onclick].match(/https?[^']+/).to_s
|
|
82
|
-
rsp = hit.hit_collection.agent.get url
|
|
83
|
-
d = Nokogiri::HTML rsp.body.encode(undef: :replace, replace: "")
|
|
84
|
-
d.css("p.MsoNormal").text.gsub("\r\n", "").squeeze(" ").gsub("\u00a0", "")
|
|
85
|
-
elsif a = doc.at('//table/tr/td/span[contains(@class, "observation")]/text()')
|
|
86
|
-
a.text.strip
|
|
87
|
-
end => content
|
|
88
|
-
return [] unless content
|
|
89
|
-
|
|
90
|
-
[{
|
|
91
|
-
content: content,
|
|
92
|
-
language: "en",
|
|
93
|
-
script: "Latn",
|
|
94
|
-
}]
|
|
95
|
-
rescue Mechanize::ResponseCodeError => e
|
|
96
|
-
Util.error "HTTP Service Unavailable: #{e.message}"
|
|
97
|
-
[]
|
|
98
|
-
end
|
|
99
|
-
|
|
100
|
-
# Get page.
|
|
101
|
-
# @param hit [RelatonItu::Hit]
|
|
102
|
-
# @param url [String, nil]
|
|
103
|
-
# @return [Array<String, Nokogiri::HTML::Document>]
|
|
104
|
-
def get_page(hit, url = nil)
|
|
105
|
-
uri = url || hit.hit[:url]
|
|
106
|
-
hit.hit_collection.agent.get uri
|
|
107
|
-
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
|
108
|
-
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
|
109
|
-
Net::ProtocolError, OpenSSL::SSL::SSLError
|
|
110
|
-
raise RelatonBib::RequestError, "Could not access #{uri}"
|
|
111
|
-
end
|
|
112
|
-
|
|
113
|
-
# Fetch docid.
|
|
114
|
-
# @param doc [Mechanize::Page]
|
|
115
|
-
# @param hit [RelatonItu::Hit]
|
|
116
|
-
# @return [Hash]
|
|
117
|
-
def fetch_docid(doc, hit)
|
|
118
|
-
docids = hit.hit[:code].to_s.split(" | ").map { |c| createdocid(c) }
|
|
119
|
-
docids += parse_id(doc).map { |c| createdocid c.text } if docids.empty?
|
|
120
|
-
docids << createdocid(title) unless docids.any?
|
|
121
|
-
docids
|
|
122
|
-
end
|
|
123
|
-
|
|
124
|
-
def parse_id(doc)
|
|
125
|
-
doc.xpath(
|
|
126
|
-
"//span[@id='ctl00_content_main_uc_rec_main_info1_rpt_main_ctl00_lbl_rec']",
|
|
127
|
-
"//td[.='Identical standard:']/following-sibling::td",
|
|
128
|
-
"//div/table[1]/tr[4]/td/strong",
|
|
129
|
-
)
|
|
130
|
-
end
|
|
28
|
+
# Remember the search hit to scrape and whether the `imp` variant was requested.
# @param hit [RelatonItu::Hit]
# @param imp [Boolean] forwarded to the parser (defaults to false)
def initialize(hit, imp: false)
  @hit, @imp = hit, imp
end
|
|
131
32
|
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
# %r{
|
|
136
|
-
# ^(?<code>(?:(?:ITU-\w|ISO/IEC)\s)?[^(:]*)
|
|
137
|
-
# (?:\s\(V(?<version>\d+)\))?
|
|
138
|
-
# (?:\s\((?:(?<_month>\d{2})/)?(?<_year>\d{4})\))?
|
|
139
|
-
# (?::[^(]+\((?<buldate>\d{2}\.\w{1,4}\.\d{4})\))?
|
|
140
|
-
# (?:\s(?<corr>(?:Amd|Cor)\.\s?\d+))?
|
|
141
|
-
# # (\s\(((?<_cormonth>\d{2})\/)?(?<_coryear>\d{4})\))?
|
|
142
|
-
# }x =~ text.squeeze(" ")
|
|
143
|
-
# corr&.sub!(/\.\s?/, " ")
|
|
144
|
-
# id = [code.sub(/[[:space:]]$/, ""), corr].compact.join " "
|
|
145
|
-
# id += " (V#{version})" if version
|
|
146
|
-
# id += " - #{buldate}" if buldate
|
|
147
|
-
# type = id.match(%r{^\w+}).to_s
|
|
148
|
-
# type = "ITU" if type == "G"
|
|
149
|
-
if text.match?(/^(?:ISO|ETSI)/)
|
|
150
|
-
type = "ISO"
|
|
151
|
-
text.match(/[^(]+/).to_s.strip.squeeze(" ")
|
|
152
|
-
else
|
|
153
|
-
pubid = Pubid.parse(text)
|
|
154
|
-
type = pubid.prefix # == "G" ? "ITU" : pubid.prefix
|
|
155
|
-
pubid.to_s
|
|
156
|
-
end => id
|
|
157
|
-
RelatonBib::DocumentIdentifier.new(type: type, id: id, primary: true)
|
|
158
|
-
end
|
|
33
|
+
# Convenience entry point: build a scrapper for the hit and parse it right away.
# @param hit [RelatonItu::Hit]
# @param imp [Boolean]
# @return [Object, nil] the result of the instance-level #parse_page
def self.parse_page(hit, imp: false)
  scrapper = new(hit, imp: imp)
  scrapper.parse_page
end
|
|
159
36
|
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
37
|
+
# Parse page.
# Collects every bibliographic field (most of them delegated to the parser) and
# builds the bibliographic item from them.
# @return [RelatonItu::ItuBibliographicItem, nil] nil when the parser has no document
def parse_page # rubocop:disable Metrics/AbcSize, Metrics/MethodLength
  return unless parser.doc

  attrs = {
    id: fetch_id,
    fetched: Date.today.to_s,
    type: "standard",
    docid: docid,
    edition: parser.fetch_edition,
    language: ["en"],
    script: ["Latn"],
    title: parser.fetch_titles,
    doctype: DocumentType.new(type: hit.hit[:type]),
    docstatus: parser.fetch_status,
    ics: [], # fetch_ics(doc),
    date: parser.fetch_dates,
    contributor: fetch_contributors,
    editorialgroup: parser.fetch_workgroup,
    abstract: parser.fetch_abstract,
    copyright: fetch_copyright,
    link: parser.fetch_link,
    relation: parser.fetch_relations,
    place: ["Geneva"],
  }
  ItuBibliographicItem.new(**attrs)
end
|
|
167
64
|
|
|
168
|
-
|
|
169
|
-
RelatonBib::DocumentStatus.new(stage: status)
|
|
170
|
-
end
|
|
65
|
+
private
|
|
171
66
|
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
# @param doc [Mechanize::Page]
|
|
175
|
-
# @return [RelatonItu::EditorialGroup, NilClass]
|
|
176
|
-
def fetch_workgroup(code, doc)
|
|
177
|
-
wg = doc.at('//table/tr/td/span[contains(@id, "Label8")]/a')
|
|
178
|
-
# return unless wg
|
|
67
|
+
def idrec
|
|
68
|
+
return @idrec if defined? @idrec
|
|
179
69
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
bureau: code.match(/(?<=-)./).to_s, group: group
|
|
183
|
-
)
|
|
184
|
-
end
|
|
70
|
+
@idrec = CGI.unescape(hit.hit[:url]).split("/").last.slice(/^\d+(?=-)/)&.to_i
|
|
71
|
+
end
|
|
185
72
|
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
type = "study-group"
|
|
191
|
-
acronym = "SG"
|
|
192
|
-
elsif name.include? "Telecommunication Standardization Advisory Group"
|
|
193
|
-
type = "tsag"
|
|
194
|
-
acronym = "TSAG"
|
|
73
|
+
def parser
|
|
74
|
+
@parser ||= begin
|
|
75
|
+
if idrec
|
|
76
|
+
RecommendationParser.new hit, idrec, imp
|
|
195
77
|
else
|
|
196
|
-
|
|
197
|
-
acronym = "WG"
|
|
198
|
-
end
|
|
199
|
-
ItuGroup.new name: name, type: type, acronym: acronym
|
|
200
|
-
end
|
|
201
|
-
|
|
202
|
-
# Fetch relations.
|
|
203
|
-
# @param doc [Mechanize::Page]
|
|
204
|
-
# @return [Array<Hash>]
|
|
205
|
-
def fetch_relations(doc)
|
|
206
|
-
doc.xpath('//div[contains(@id, "tab_sup")]//table/tr[position()>2]')
|
|
207
|
-
.map do |r|
|
|
208
|
-
ref = r.at('./td/span[contains(@id, "title_e")]/nobr/a')
|
|
209
|
-
fref = RelatonBib::FormattedRef.new(content: ref.text, language: "en",
|
|
210
|
-
script: "Latn")
|
|
211
|
-
did = RelatonBib::DocumentIdentifier.new(id: ref.text, type: "ITU")
|
|
212
|
-
bibitem = ItuBibliographicItem.new(formattedref: fref, docid: [did],
|
|
213
|
-
type: "standard")
|
|
214
|
-
{ type: "complementOf", bibitem: bibitem }
|
|
215
|
-
end
|
|
216
|
-
end
|
|
217
|
-
|
|
218
|
-
# Fetch titles.
|
|
219
|
-
# @param doc [Mechanize::Page]
|
|
220
|
-
# @return [RelatonBib::TypedTitleStringCollection]
|
|
221
|
-
def fetch_titles(doc)
|
|
222
|
-
t = doc.at("//td[@class='title']|//div/table[1]/tr[4]/td/strong")
|
|
223
|
-
return [] unless t
|
|
224
|
-
|
|
225
|
-
RelatonBib::TypedTitleString.from_string t.text, "en", "Latn"
|
|
226
|
-
end
|
|
227
|
-
|
|
228
|
-
# Fetch dates
|
|
229
|
-
# @param doc [Mechanize::Page]
|
|
230
|
-
# @return [Array<Hash>]
|
|
231
|
-
def fetch_dates(doc) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
|
|
232
|
-
dates = []
|
|
233
|
-
date = doc.at("//table/tr/td/span[contains(@id, 'Label5')]",
|
|
234
|
-
"//p[contains(.,'Approved in')]")
|
|
235
|
-
pdate = date&.text&.match(/\d{4}-\d{2}-\d{2}/).to_s || ob_date(doc)
|
|
236
|
-
if pdate && !pdate&.empty?
|
|
237
|
-
dates << { type: "published", on: pdate }
|
|
238
|
-
elsif pdate = ob_date(doc)
|
|
239
|
-
dates << { type: "published", on: pdate }
|
|
240
|
-
end
|
|
241
|
-
dates
|
|
242
|
-
end
|
|
243
|
-
|
|
244
|
-
# Scrape Operational Bulletin date.
|
|
245
|
-
# @param doc [Mechanize::Page]
|
|
246
|
-
# @return [String]
|
|
247
|
-
def ob_date(doc)
|
|
248
|
-
pdate = doc.at('//table/tbody/tr/td[contains(text(), "Year:")]')
|
|
249
|
-
return unless pdate
|
|
250
|
-
|
|
251
|
-
roman_to_arabic pdate.text.match(%r{(?<=Year: )(\d{2}.\w+.)?\d{4}}).to_s
|
|
252
|
-
end
|
|
253
|
-
|
|
254
|
-
# Convert roman month number in string date to arabic number
|
|
255
|
-
# @param date [String]
|
|
256
|
-
# @return [String]
|
|
257
|
-
def roman_to_arabic(date)
|
|
258
|
-
%r{(?<rmonth>[IVX]+)} =~ date
|
|
259
|
-
if ROMAN_MONTHS.index(rmonth)
|
|
260
|
-
month = ROMAN_MONTHS.index(rmonth) + 1
|
|
261
|
-
Date.parse(date.sub(%r{[IVX]+}, month.to_s)).to_s
|
|
262
|
-
else date
|
|
78
|
+
RadioRegulationsParser.new hit
|
|
263
79
|
end
|
|
264
80
|
end
|
|
81
|
+
end
|
|
265
82
|
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
def fetch_contributors(code)
|
|
270
|
-
return [] unless code
|
|
83
|
+
# Derive the item id from the primary document identifier by stripping dots,
# whitespace, parentheses, slashes and hyphens.
# @return [String]
def fetch_id
  primary = docid.find(&:primary)
  primary.id.gsub(/[.\s()\/-]/, "")
end
|
|
271
86
|
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
role: [type: "publisher"] }]
|
|
87
|
+
# Fetch docid.
# Builds one identifier per " | "-separated code of the hit. When the hit has no
# code, falls back to the "rec_name" field of the parsed document. Memoized.
# @return [Array<RelatonBib::DocumentIdentifier>]
def docid
  @docid ||= begin
    docids = hit.hit[:code].to_s.split(" | ").map { |c| createdocid(c) }
    # The fetched document lives on the parser; this class has no `doc` method,
    # so a bare `doc` here would raise NameError.
    # NOTE(review): assumes parser.doc is the Hash-like JSON record — confirm for
    # parsers whose doc is an HTML page.
    docids << createdocid(parser.doc["rec_name"]) if docids.empty?
    docids
  end
end
|
|
281
96
|
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
97
|
+
# Build a primary document identifier from a raw code string.
# Codes starting with "ISO" or "ETSI" are typed "ISO" and keep only the text before
# the first "(" (trimmed, with runs of spaces collapsed); every other code is
# parsed with Pubid, whose prefix becomes the type.
# @param text [String]
# @return [RelatonBib::DocumentIdentifier]
def createdocid(text) # rubocop:disable Metrics/MethodLength
  # %r{
  #   ^(?<code>(?:(?:ITU-\w|ISO/IEC)\s)?[^(:]*)
  #   (?:\s\(V(?<version>\d+)\))?
  #   (?:\s\((?:(?<_month>\d{2})/)?(?<_year>\d{4})\))?
  #   (?::[^(]+\((?<buldate>\d{2}\.\w{1,4}\.\d{4})\))?
  #   (?:\s(?<corr>(?:Amd|Cor)\.\s?\d+))?
  #   # (\s\(((?<_cormonth>\d{2})\/)?(?<_coryear>\d{4})\))?
  # }x =~ text.squeeze(" ")
  # corr&.sub!(/\.\s?/, " ")
  # id = [code.sub(/[[:space:]]$/, ""), corr].compact.join " "
  # id += " (V#{version})" if version
  # id += " - #{buldate}" if buldate
  # type = id.match(%r{^\w+}).to_s
  # type = "ITU" if type == "G"
  if text.match?(/^(?:ISO|ETSI)/)
    type = "ISO"
    id = text.match(/[^(]+/).to_s.strip.squeeze(" ")
  else
    pubid = Pubid.parse(text)
    type = pubid.prefix # == "G" ? "ITU" : pubid.prefix
    id = pubid.to_s
  end
  RelatonBib::DocumentIdentifier.new(type: type, id: id, primary: true)
end
|
|
296
124
|
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
125
|
+
# def fetch_data(url)
|
|
126
|
+
# resp = hit.hit_collection.agent.get url
|
|
127
|
+
# JSON.parse(resp.body)
|
|
128
|
+
# rescue Mechanize::ResponseCodeError => e
|
|
129
|
+
# Util.error "HTTP Service Unavailable: #{e.message}"
|
|
130
|
+
# nil
|
|
131
|
+
# end
|
|
132
|
+
|
|
133
|
+
# Scrape Operational Bulletin date.
|
|
134
|
+
# @param doc [Mechanize::Page]
|
|
135
|
+
# @return [String]
|
|
136
|
+
# def ob_date(doc)
|
|
137
|
+
# pdate = doc.at('//table/tbody/tr/td[contains(text(), "Year:")]')
|
|
138
|
+
# return unless pdate
|
|
139
|
+
|
|
140
|
+
# roman_to_arabic pdate.text.match(%r{(?<=Year: )(\d{2}.\w+.)?\d{4}}).to_s
|
|
141
|
+
# end
|
|
142
|
+
|
|
143
|
+
# Fetch contributors
# The publisher abbreviation is the hit code with everything from the
# "-<char><space>" bureau marker onwards removed; name and url are only known
# for "ITU" and are nil otherwise.
# @return [Array<Hash>] empty when the hit has no code
def fetch_contributors
  code = hit.hit[:code]
  return [] unless code

  abbrev = code.sub(/-\w\s.*/, "")
  name, url = ["International Telecommunication Union", "www.itu.int"] if abbrev == "ITU"
  publisher = { name: name, url: url, abbreviation: abbrev }
  [{ entity: publisher, role: [{ type: "publisher" }] }]
end
|
|
305
156
|
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
name = "International Telecommunication Union"
|
|
315
|
-
url = "www.itu.int"
|
|
316
|
-
end
|
|
317
|
-
fdate = doc.at("//table/tr/td/span[contains(@id, 'Label5')]")
|
|
318
|
-
from = fdate&.text || ob_date(doc)
|
|
319
|
-
[{ owner: [{ name: name, abbreviation: abbreviation, url: url }],
|
|
320
|
-
from: from }]
|
|
157
|
+
# Fetch copyright.
# The owner abbreviation is the part of the hit code before the first "-";
# name and url are only known for "ITU" and are nil otherwise. The start date is
# the parser's document date.
# @return [Array<Hash>] empty when the hit has no code
def fetch_copyright
  code = hit.hit[:code]
  # A hit may lack a code (docid and fetch_contributors already allow for that);
  # without this guard `nil.match` would raise NoMethodError.
  return [] unless code

  abbreviation = code.match(/^[^-]+/).to_s
  case abbreviation
  when "ITU"
    name = "International Telecommunication Union"
    url = "www.itu.int"
  end
  [{ owner: [{ name: name, abbreviation: abbreviation, url: url }], from: parser.doc_date }]
end
|
|
323
168
|
end
|
|
324
169
|
end
|
data/lib/relaton_itu/version.rb
CHANGED
|
@@ -14,8 +14,7 @@ module RelatonItu
|
|
|
14
14
|
# @param ext [Nokogiri::XML::Element]
|
|
15
15
|
# @return [RelatonItu::EditorialGroup]
|
|
16
16
|
def fetch_editorialgroup(ext)
|
|
17
|
-
eg = ext.at
|
|
18
|
-
return unless eg
|
|
17
|
+
return unless ext && (eg = ext.at "editorialgroup")
|
|
19
18
|
|
|
20
19
|
EditorialGroup.new(
|
|
21
20
|
bureau: eg.at("bureau")&.text,
|
|
@@ -51,8 +50,7 @@ module RelatonItu
|
|
|
51
50
|
# @param ext [Nokogiri::XML::Element]
|
|
52
51
|
# @return [RelatonItu::StructuredIdentifier]
|
|
53
52
|
def fetch_structuredidentifier(ext)
|
|
54
|
-
sid = ext.at "./structuredidentifier"
|
|
55
|
-
return unless sid
|
|
53
|
+
return unless ext && (sid = ext.at "./structuredidentifier")
|
|
56
54
|
|
|
57
55
|
br = sid.at("bureau").text
|
|
58
56
|
dn = sid.at("docnumber").text
|
data/lib/relaton_itu.rb
CHANGED
data/relaton_itu.gemspec
CHANGED
|
@@ -28,5 +28,6 @@ Gem::Specification.new do |spec|
|
|
|
28
28
|
spec.add_dependency "mechanize", "~> 2.10"
|
|
29
29
|
spec.add_dependency "parslet", "~> 2.0.0"
|
|
30
30
|
spec.add_dependency "relaton-bib", "~> 1.20.0"
|
|
31
|
+
spec.add_dependency "relaton-core", "~> 0.0.6"
|
|
31
32
|
spec.add_dependency "relaton-index", "~> 0.2.0"
|
|
32
33
|
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: relaton-itu
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.20.1
|
|
4
|
+
version: 1.20.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Ribose Inc.
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2026-01-20 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: mechanize
|
|
@@ -52,6 +52,20 @@ dependencies:
|
|
|
52
52
|
- - "~>"
|
|
53
53
|
- !ruby/object:Gem::Version
|
|
54
54
|
version: 1.20.0
|
|
55
|
+
- !ruby/object:Gem::Dependency
|
|
56
|
+
name: relaton-core
|
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
|
58
|
+
requirements:
|
|
59
|
+
- - "~>"
|
|
60
|
+
- !ruby/object:Gem::Version
|
|
61
|
+
version: 0.0.6
|
|
62
|
+
type: :runtime
|
|
63
|
+
prerelease: false
|
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
65
|
+
requirements:
|
|
66
|
+
- - "~>"
|
|
67
|
+
- !ruby/object:Gem::Version
|
|
68
|
+
version: 0.0.6
|
|
55
69
|
- !ruby/object:Gem::Dependency
|
|
56
70
|
name: relaton-index
|
|
57
71
|
requirement: !ruby/object:Gem::Requirement
|
|
@@ -104,6 +118,8 @@ files:
|
|
|
104
118
|
- lib/relaton_itu/itu_group.rb
|
|
105
119
|
- lib/relaton_itu/processor.rb
|
|
106
120
|
- lib/relaton_itu/pubid.rb
|
|
121
|
+
- lib/relaton_itu/radio_regulations_parser.rb
|
|
122
|
+
- lib/relaton_itu/recommendation_parser.rb
|
|
107
123
|
- lib/relaton_itu/scrapper.rb
|
|
108
124
|
- lib/relaton_itu/structured_identifier.rb
|
|
109
125
|
- lib/relaton_itu/util.rb
|
|
@@ -129,7 +145,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
129
145
|
- !ruby/object:Gem::Version
|
|
130
146
|
version: '0'
|
|
131
147
|
requirements: []
|
|
132
|
-
rubygems_version: 3.
|
|
148
|
+
rubygems_version: 3.5.22
|
|
133
149
|
signing_key:
|
|
134
150
|
specification_version: 4
|
|
135
151
|
summary: 'RelatonItu: retrieve ITU Standards for bibliographic use using the BibliographicItem
|