relaton-itu 0.9.0 → 1.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/grammars/basicdoc.rng +12 -55
- data/grammars/biblio.rng +144 -49
- data/grammars/isodoc.rng +592 -29
- data/grammars/isostandard.rng +145 -472
- data/grammars/itu.rng +12 -52
- data/lib/relaton_itu.rb +0 -5
- data/lib/relaton_itu/editorial_group.rb +6 -4
- data/lib/relaton_itu/hash_converter.rb +9 -0
- data/lib/relaton_itu/hit.rb +2 -2
- data/lib/relaton_itu/hit_collection.rb +38 -12
- data/lib/relaton_itu/itu_bibliographic_item.rb +2 -2
- data/lib/relaton_itu/itu_bibliography.rb +23 -20
- data/lib/relaton_itu/scrapper.rb +52 -79
- data/lib/relaton_itu/structured_identifier.rb +37 -0
- data/lib/relaton_itu/version.rb +1 -1
- data/lib/relaton_itu/xml_parser.rb +17 -12
- data/relaton-itu.gemspec +1 -1
- metadata +5 -4
data/grammars/itu.rng
CHANGED
@@ -5,50 +5,10 @@
|
|
5
5
|
we cannot have a new default namespace: we will end up with a grammar with two different
|
6
6
|
namespaces, one for isostandard and one for csand additions. And we do not want that.
|
7
7
|
-->
|
8
|
-
<include href="
|
8
|
+
<include href="isodoc.rng">
|
9
9
|
<start>
|
10
10
|
<ref name="itu"/>
|
11
11
|
</start>
|
12
|
-
<define name="figure">
|
13
|
-
<element name="figure">
|
14
|
-
<attribute name="id">
|
15
|
-
<data type="ID"/>
|
16
|
-
</attribute>
|
17
|
-
<optional>
|
18
|
-
<ref name="tname"/>
|
19
|
-
</optional>
|
20
|
-
<choice>
|
21
|
-
<ref name="image"/>
|
22
|
-
<ref name="pre"/>
|
23
|
-
<oneOrMore>
|
24
|
-
<ref name="subfigure"/>
|
25
|
-
</oneOrMore>
|
26
|
-
</choice>
|
27
|
-
<zeroOrMore>
|
28
|
-
<ref name="fn"/>
|
29
|
-
</zeroOrMore>
|
30
|
-
<optional>
|
31
|
-
<ref name="dl"/>
|
32
|
-
</optional>
|
33
|
-
<zeroOrMore>
|
34
|
-
<ref name="note"/>
|
35
|
-
</zeroOrMore>
|
36
|
-
</element>
|
37
|
-
</define>
|
38
|
-
<define name="subfigure">
|
39
|
-
<element name="figure">
|
40
|
-
<attribute name="id">
|
41
|
-
<data type="ID"/>
|
42
|
-
</attribute>
|
43
|
-
<optional>
|
44
|
-
<ref name="tname"/>
|
45
|
-
</optional>
|
46
|
-
<choice>
|
47
|
-
<ref name="image"/>
|
48
|
-
<ref name="pre"/>
|
49
|
-
</choice>
|
50
|
-
</element>
|
51
|
-
</define>
|
52
12
|
<define name="ol">
|
53
13
|
<element name="ol">
|
54
14
|
<attribute name="id">
|
@@ -97,11 +57,6 @@
|
|
97
57
|
</optional>
|
98
58
|
</element>
|
99
59
|
</define>
|
100
|
-
<define name="workgroup">
|
101
|
-
<element name="workgroup">
|
102
|
-
<ref name="ItuGroup"/>
|
103
|
-
</element>
|
104
|
-
</define>
|
105
60
|
<define name="structuredidentifier">
|
106
61
|
<element name="structuredidentifier">
|
107
62
|
<ref name="bureau"/>
|
@@ -147,13 +102,13 @@
|
|
147
102
|
</oneOrMore>
|
148
103
|
</element>
|
149
104
|
</define>
|
150
|
-
<define name="TextElement" combine="choice">
|
151
|
-
<choice>
|
152
|
-
<ref name="add"/>
|
153
|
-
<ref name="del"/>
|
154
|
-
</choice>
|
155
|
-
</define>
|
156
105
|
</include>
|
106
|
+
<define name="TextElement" combine="choice">
|
107
|
+
<choice>
|
108
|
+
<ref name="add"/>
|
109
|
+
<ref name="del"/>
|
110
|
+
</choice>
|
111
|
+
</define>
|
157
112
|
<define name="add">
|
158
113
|
<element name="add">
|
159
114
|
<choice>
|
@@ -282,6 +237,11 @@
|
|
282
237
|
<data type="gYear"/>
|
283
238
|
</element>
|
284
239
|
</define>
|
240
|
+
<define name="workgroup">
|
241
|
+
<element name="workgroup">
|
242
|
+
<ref name="ItuGroup"/>
|
243
|
+
</element>
|
244
|
+
</define>
|
285
245
|
<define name="itu">
|
286
246
|
<element name="itu-standard">
|
287
247
|
<ref name="bibdata"/>
|
data/lib/relaton_itu.rb
CHANGED
@@ -2,11 +2,6 @@ require "relaton_itu/version"
|
|
2
2
|
require "relaton_itu/itu_bibliography"
|
3
3
|
require "digest/md5"
|
4
4
|
|
5
|
-
# if defined? Relaton
|
6
|
-
# require_relative "relaton/processor"
|
7
|
-
# Relaton::Registry.instance.register(Relaton::RelatonItu::Processor)
|
8
|
-
# end
|
9
|
-
|
10
5
|
module RelatonItu
|
11
6
|
class Error < StandardError; end
|
12
7
|
|
@@ -16,8 +16,9 @@ module RelatonItu
|
|
16
16
|
# @param subgroup [Hash, RelatonItu::ItuGroup, NilClass]
|
17
17
|
# @param workgroup [Hash, RelatonItu::ItuGroup, NilClass]
|
18
18
|
def initialize(bureau:, group:, subgroup: nil, workgroup: nil)
|
19
|
-
|
20
|
-
|
19
|
+
unless BUREAUS.include? bureau
|
20
|
+
warn "[relaton-itu] WARNING: invalid bureau: #{bureau}"
|
21
|
+
end
|
21
22
|
@bureau = bureau
|
22
23
|
@group = group.is_a?(Hash) ? ItuGroup.new(group) : group
|
23
24
|
@subgroup = subgroup.is_a?(Hash) ? ItuGroup.new(subgroup) : subgroup
|
@@ -28,7 +29,7 @@ module RelatonItu
|
|
28
29
|
def to_xml(builder)
|
29
30
|
builder.editorialgroup do
|
30
31
|
builder.bureau bureau
|
31
|
-
builder.group { |b| group.to_xml b }
|
32
|
+
builder.group { |b| group.to_xml b } if group
|
32
33
|
builder.subgroup { |b| group.to_xml b } if subgroup
|
33
34
|
builder.workgroup { |b| group.to_xml b } if workgroup
|
34
35
|
end
|
@@ -36,7 +37,8 @@ module RelatonItu
|
|
36
37
|
|
37
38
|
# @return [Hash]
|
38
39
|
def to_hash
|
39
|
-
hash = { "bureau" => bureau
|
40
|
+
hash = { "bureau" => bureau }
|
41
|
+
hash["group"] = group.to_hash if group
|
40
42
|
hash["subgroup"] = subgroup.to_hash if subgroup
|
41
43
|
hash["workgroup"] = workgroup.to_hash if workgroup
|
42
44
|
hash
|
@@ -9,6 +9,15 @@ module RelatonItu
|
|
9
9
|
|
10
10
|
ret[:editorialgroup] = EditorialGroup.new eg
|
11
11
|
end
|
12
|
+
|
13
|
+
# @param ret [Hash]
|
14
|
+
def structuredidentifier_hash_to_bib(ret)
|
15
|
+
return unless ret[:structuredidentifier]
|
16
|
+
|
17
|
+
ret[:structuredidentifier] = StructuredIdentifier.new(
|
18
|
+
ret[:structuredidentifier],
|
19
|
+
)
|
20
|
+
end
|
12
21
|
end
|
13
22
|
end
|
14
23
|
end
|
data/lib/relaton_itu/hit.rb
CHANGED
@@ -4,9 +4,9 @@ module RelatonItu
|
|
4
4
|
# Hit.
|
5
5
|
class Hit < RelatonBib::Hit
|
6
6
|
# Parse page.
|
7
|
-
# @return [
|
7
|
+
# @return [RelatonItu::ItuBibliographicItem]
|
8
8
|
def fetch
|
9
|
-
@fetch ||= Scrapper.parse_page
|
9
|
+
@fetch ||= Scrapper.parse_page hit, hit_collection.gi_imp
|
10
10
|
end
|
11
11
|
end
|
12
12
|
end
|
@@ -7,16 +7,39 @@ require "net/http"
|
|
7
7
|
module RelatonItu
|
8
8
|
# Page of hit collection.
|
9
9
|
class HitCollection < RelatonBib::HitCollection
|
10
|
-
DOMAIN = "https://www.itu.int"
|
10
|
+
DOMAIN = "https://www.itu.int"
|
11
11
|
|
12
|
-
# @
|
12
|
+
# @return [TrueClass, FalseClass]
|
13
|
+
attr_reader :gi_imp
|
14
|
+
|
15
|
+
# @param ref [String]
|
13
16
|
# @param year [String]
|
14
|
-
def initialize(
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
17
|
+
def initialize(ref, year = nil)
|
18
|
+
text = ref.sub /(?<=\.)Imp\s?(?=\d)/, ""
|
19
|
+
super text, year
|
20
|
+
@gi_imp = /\.Imp\d/.match?(ref)
|
21
|
+
uri = URI "#{DOMAIN}/net4/ITU-T/search/GlobalSearch/Search"
|
22
|
+
data = { json: params.to_json }
|
23
|
+
resp = Net::HTTP.post(uri, data.to_json,
|
24
|
+
"Content-Type" => "application/json")
|
25
|
+
@array = hits JSON.parse(resp.body)
|
26
|
+
end
|
27
|
+
|
28
|
+
private
|
29
|
+
|
30
|
+
# @return [String]
|
31
|
+
def group
|
32
|
+
@group ||= if %r{(OB|Operational Bulletin) No} =~ text then "Publications"
|
33
|
+
else "Recommendations"
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
37
|
+
# rubocop:disable Metrics/MethodLength
|
38
|
+
|
39
|
+
# @return [Hash]
|
40
|
+
def params
|
41
|
+
{
|
42
|
+
"Input" => text,
|
20
43
|
"Start" => 0,
|
21
44
|
"Rows" => 10,
|
22
45
|
"SortBy" => "RELEVANCE",
|
@@ -61,10 +84,13 @@ module RelatonItu
|
|
61
84
|
"IP" => "",
|
62
85
|
"SearchType" => "All",
|
63
86
|
}
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
87
|
+
end
|
88
|
+
# rubocop:enable Metrics/MethodLength
|
89
|
+
|
90
|
+
# @param data [Hash]
|
91
|
+
# @return [Array<RelatonItu::Hit>]
|
92
|
+
def hits(data)
|
93
|
+
data["results"].map do |h|
|
68
94
|
code = h["Media"]["Name"]
|
69
95
|
title = h["Title"]
|
70
96
|
url = h["Redirection"]
|
@@ -7,12 +7,12 @@ module RelatonItu
|
|
7
7
|
joint-itu-iso-iec
|
8
8
|
].freeze
|
9
9
|
|
10
|
+
# @params structuredidentifier [RelatonItu::StructuredIdentifier]
|
10
11
|
def initialize(**args)
|
11
12
|
@doctype = args.delete :doctype
|
12
13
|
if doctype && !TYPES.include?(doctype)
|
13
|
-
|
14
|
+
warn "[relaton-itu] WARNING: invalid doctype: #{doctype}"
|
14
15
|
end
|
15
|
-
|
16
16
|
super
|
17
17
|
end
|
18
18
|
end
|
@@ -3,6 +3,7 @@
|
|
3
3
|
require "relaton_iso_bib"
|
4
4
|
require "relaton_itu/itu_bibliographic_item"
|
5
5
|
require "relaton_itu/editorial_group"
|
6
|
+
require "relaton_itu/structured_identifier"
|
6
7
|
require "relaton_itu/itu_group"
|
7
8
|
require "relaton_itu/scrapper"
|
8
9
|
require "relaton_itu/hit_collection"
|
@@ -19,9 +20,9 @@ module RelatonItu
|
|
19
20
|
# @return [RelatonItu::HitCollection]
|
20
21
|
def search(text, year = nil)
|
21
22
|
HitCollection.new text, year
|
22
|
-
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
23
|
-
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
24
|
-
OpenSSL::SSL::SSLError
|
23
|
+
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
24
|
+
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
25
|
+
Net::ProtocolError, OpenSSL::SSL::SSLError
|
25
26
|
raise RelatonBib::RequestError, "Could not access http://www.itu.int"
|
26
27
|
end
|
27
28
|
|
@@ -66,17 +67,17 @@ module RelatonItu
|
|
66
67
|
nil
|
67
68
|
end
|
68
69
|
|
69
|
-
def fetch_pages(
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
end
|
70
|
+
# def fetch_pages(hits, threads)
|
71
|
+
# workers = RelatonBib::WorkersPool.new threads
|
72
|
+
# workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
|
73
|
+
# hits.each_with_index { |hit, i| workers << { i: i, hit: hit } }
|
74
|
+
# workers.end
|
75
|
+
# workers.result.sort_by { |a| a[:i] }.map { |x| x[:hit] }
|
76
|
+
# end
|
76
77
|
|
77
78
|
def search_filter(code)
|
78
|
-
docidrx = %r{\w
|
79
|
-
c = code.match(docidrx).to_s
|
79
|
+
docidrx = %r{\w+\.\d+|\w\sSuppl\.\s\d+} # %r{^ITU-T\s[^\s]+}
|
80
|
+
c = code.sub(/Imp\s?/, "").match(docidrx).to_s
|
80
81
|
warn "[relaton-itu] (\"#{code}\") fetching..."
|
81
82
|
result = search(code)
|
82
83
|
result.select do |i|
|
@@ -93,16 +94,18 @@ module RelatonItu
|
|
93
94
|
# If no match, returns any years which caused mismatch, for error reporting
|
94
95
|
def isobib_results_filter(result, year)
|
95
96
|
missed_years = []
|
96
|
-
result.each_slice(3) do |s| # ISO website only allows 3 connections
|
97
|
-
|
98
|
-
|
97
|
+
# result.each_slice(3) do |s| # ISO website only allows 3 connections
|
98
|
+
# fetch_pages(s, 3).each do |r|
|
99
|
+
result.each do |r|
|
100
|
+
return { ret: r.fetch } if !year
|
99
101
|
|
100
|
-
|
101
|
-
|
102
|
+
/\(\d{2}\/(?<pyear>\d{4})\)/ =~ r.hit[:code]
|
103
|
+
# r.date.select { |d| d.type == "published" }.each do |d|
|
104
|
+
return { ret: r.fetch } if year == pyear
|
102
105
|
|
103
|
-
|
104
|
-
|
105
|
-
end
|
106
|
+
missed_years << pyear
|
107
|
+
# end
|
108
|
+
# end
|
106
109
|
end
|
107
110
|
{ years: missed_years }
|
108
111
|
end
|
data/lib/relaton_itu/scrapper.rb
CHANGED
@@ -3,16 +3,9 @@
|
|
3
3
|
require "nokogiri"
|
4
4
|
require "net/http"
|
5
5
|
|
6
|
-
# Capybara.request_driver :poltergeist do |app|
|
7
|
-
# Capybara::Poltergeist::Driver.new app, js_errors: false
|
8
|
-
# end
|
9
|
-
# Capybara.default_driver = :poltergeist
|
10
|
-
|
11
6
|
module RelatonItu
|
12
7
|
# Scrapper.
|
13
|
-
# rubocop:disable Metrics/ModuleLength
|
14
8
|
module Scrapper
|
15
|
-
DOMAIN = "https://www.itu.int"
|
16
9
|
ROMAN_MONTHS = %w[I II III IV V VI VII VIII IX X XI XII].freeze
|
17
10
|
|
18
11
|
TYPES = {
|
@@ -31,24 +24,19 @@ module RelatonItu
|
|
31
24
|
}.freeze
|
32
25
|
|
33
26
|
class << self
|
34
|
-
#
|
35
|
-
# @return [Array<Hash>]
|
36
|
-
# def get(text)
|
37
|
-
# iso_workers = WorkersPool.new 4
|
38
|
-
# iso_workers.worker { |hit| iso_worker(hit, iso_workers) }
|
39
|
-
# algolia_workers = start_algolia_search(text, iso_workers)
|
40
|
-
# iso_docs = iso_workers.result
|
41
|
-
# algolia_workers.end
|
42
|
-
# algolia_workers.result
|
43
|
-
# iso_docs
|
44
|
-
# end
|
27
|
+
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
45
28
|
|
46
29
|
# Parse page.
|
47
|
-
# @param
|
30
|
+
# @param hit_data [Hash]
|
48
31
|
# @return [Hash]
|
49
|
-
|
50
|
-
def parse_page(hit_data)
|
32
|
+
def parse_page(hit_data, imp = false)
|
51
33
|
url, doc = get_page hit_data[:url]
|
34
|
+
if imp
|
35
|
+
a = doc.at "//span[contains(@id, 'tab_ig_uc_rec')]/a"
|
36
|
+
return unless a
|
37
|
+
|
38
|
+
url, doc = get_page URI.join(url, a[:href]).to_s
|
39
|
+
end
|
52
40
|
|
53
41
|
# Fetch edition.
|
54
42
|
edition = doc.at("//table/tr/td/span[contains(@id, 'Label8')]/b")&.text
|
@@ -73,7 +61,7 @@ module RelatonItu
|
|
73
61
|
place: ["Geneva"],
|
74
62
|
)
|
75
63
|
end
|
76
|
-
# rubocop:enable Metrics/AbcSize
|
64
|
+
# rubocop:enable Metrics/AbcSize
|
77
65
|
|
78
66
|
private
|
79
67
|
|
@@ -96,37 +84,23 @@ module RelatonItu
|
|
96
84
|
}]
|
97
85
|
end
|
98
86
|
|
99
|
-
# Get langs.
|
100
|
-
# @param doc [Nokogiri::HTML::Document]
|
101
|
-
# @return [Array<Hash>]
|
102
|
-
# def langs(doc)
|
103
|
-
# lgs = [{ lang: 'en' }]
|
104
|
-
# doc.css('ul#lang-switcher ul li a').each do |lang_link|
|
105
|
-
# lang_path = lang_link.attr('href')
|
106
|
-
# lang = lang_path.match(%r{^\/(fr)\/})
|
107
|
-
# lgs << { lang: lang[1], path: lang_path } if lang
|
108
|
-
# end
|
109
|
-
# lgs
|
110
|
-
# end
|
111
|
-
|
112
|
-
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
113
87
|
# Get page.
|
114
88
|
# @param path [String] page's path
|
115
|
-
# @return [Array<Nokogiri::HTML::Document
|
89
|
+
# @return [Array<String, Nokogiri::HTML::Document>]
|
116
90
|
def get_page(url)
|
117
91
|
uri = URI url
|
118
|
-
resp = Net::HTTP.get_response(uri)
|
92
|
+
resp = Net::HTTP.get_response(uri)
|
119
93
|
until resp.code == "200"
|
120
94
|
uri = URI resp["location"] if resp.code =~ /^30/
|
121
|
-
resp = Net::HTTP.get_response(uri)
|
95
|
+
resp = Net::HTTP.get_response(uri)
|
122
96
|
end
|
123
97
|
[uri.to_s, Nokogiri::HTML(resp.body)]
|
124
|
-
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
125
|
-
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
126
|
-
OpenSSL::SSL::SSLError
|
98
|
+
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
99
|
+
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
100
|
+
Net::ProtocolError, OpenSSL::SSL::SSLError
|
127
101
|
raise RelatonBib::RequestError, "Could not access #{url}"
|
128
102
|
end
|
129
|
-
# rubocop:enable Metrics/
|
103
|
+
# rubocop:enable Metrics/MethodLength
|
130
104
|
|
131
105
|
# Fetch docid.
|
132
106
|
# @param doc [Nokogiri::HTML::Document]
|
@@ -135,9 +109,11 @@ module RelatonItu
|
|
135
109
|
doc.xpath(
|
136
110
|
"//span[@id='ctl00_content_main_uc_rec_main_info1_rpt_main_ctl00_lbl_rec']",
|
137
111
|
"//td[.='Identical standard:']/following-sibling::td",
|
112
|
+
"//div/table[1]/tr[4]/td/strong",
|
138
113
|
).map do |code|
|
139
|
-
id = code.text.match(%r{^.*?(?= \()}).to_s.squeeze(" ")
|
114
|
+
id = code.text.match(%r{^.*?(?= \()|\w\.Imp\s?\d+}).to_s.squeeze(" ")
|
140
115
|
type = id.match(%r{^\w+}).to_s
|
116
|
+
type = "ITU" if type == "G"
|
141
117
|
RelatonBib::DocumentIdentifier.new(type: type, id: id)
|
142
118
|
end
|
143
119
|
end
|
@@ -146,10 +122,11 @@ module RelatonItu
|
|
146
122
|
# @param doc [Nokogiri::HTML::Document]
|
147
123
|
# @return [RelatonBib::DocumentStatus, NilClass]
|
148
124
|
def fetch_status(doc)
|
149
|
-
s = doc.at("//table/tr/td/span[contains(@id, 'Label7')]"
|
125
|
+
s = doc.at("//table/tr/td/span[contains(@id, 'Label7')]",
|
126
|
+
"//p[contains(.,'Status :')]")
|
150
127
|
return unless s
|
151
128
|
|
152
|
-
status = s.text
|
129
|
+
status = s.text.include?("In force") ? "Published" : "Withdrawal"
|
153
130
|
RelatonBib::DocumentStatus.new(stage: status)
|
154
131
|
end
|
155
132
|
|
@@ -191,9 +168,7 @@ module RelatonItu
|
|
191
168
|
# @return [Array<Hash>]
|
192
169
|
def fetch_relations(doc)
|
193
170
|
doc.xpath('//div[contains(@id, "tab_sup")]//table/tr[position()>2]').map do |r|
|
194
|
-
# r_type = r.at('./td/span[contains(@id, "Label4")]/nobr').text.downcase
|
195
171
|
ref = r.at('./td/span[contains(@id, "title_e")]/nobr/a')
|
196
|
-
# url = DOMAIN + ref[:href].sub(/^\./, "/ITU-T/recommendations")
|
197
172
|
fref = RelatonBib::FormattedRef.new(content: ref.text, language: "en", script: "Latn")
|
198
173
|
bibitem = RelatonIsoBib::IsoBibliographicItem.new(formattedref: fref)
|
199
174
|
{ type: "complements", bibitem: bibitem }
|
@@ -201,22 +176,14 @@ module RelatonItu
|
|
201
176
|
end
|
202
177
|
# rubocop:enable Metrics/MethodLength
|
203
178
|
|
204
|
-
# Fetch type.
|
205
|
-
# @param doc [Nokogiri::HTML::Document]
|
206
|
-
# @return [String]
|
207
|
-
# def fetch_type(_doc)
|
208
|
-
# "recommendation"
|
209
|
-
# end
|
210
|
-
|
211
179
|
# Fetch titles.
|
212
180
|
# @param doc [Nokogiri::HTML::Document]
|
213
181
|
# @return [Array<Hash>]
|
214
182
|
def fetch_titles(doc)
|
215
|
-
|
216
|
-
# t = hit_data[:title] if t.empty?
|
217
|
-
t = doc.at("//td[@class='title']")
|
183
|
+
t = doc.at("//td[@class='title']|//div/table[1]/tr[4]/td/strong")
|
218
184
|
return [] unless t
|
219
|
-
|
185
|
+
|
186
|
+
titles = t.text.sub(/\w\.Imp\s?\d+\u00A0:\u00A0/, "").split " - "
|
220
187
|
case titles.size
|
221
188
|
when 0
|
222
189
|
intro, main, part = nil, "", nil
|
@@ -247,10 +214,11 @@ module RelatonItu
|
|
247
214
|
# @return [Array<Hash>]
|
248
215
|
def fetch_dates(doc)
|
249
216
|
dates = []
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
217
|
+
date = doc.at("//table/tr/td/span[contains(@id, 'Label5')]",
|
218
|
+
"//p[contains(.,'Approved in')]")
|
219
|
+
pdate = date&.text&.match(/\d{4}-\d{2}-\d{2}/).to_s || ob_date(doc)
|
220
|
+
if pdate && !pdate&.empty?
|
221
|
+
dates << { type: "published", on: pdate }
|
254
222
|
end
|
255
223
|
dates
|
256
224
|
end
|
@@ -278,40 +246,45 @@ module RelatonItu
|
|
278
246
|
# @param doc [Nokogiri::HTML::Document]
|
279
247
|
# @return [Array<Hash>]
|
280
248
|
def fetch_contributors(code)
|
249
|
+
return [] unless code
|
250
|
+
|
281
251
|
abbrev = code.sub(/-\w\s.*/, "")
|
282
252
|
case abbrev
|
283
253
|
when "ITU"
|
284
254
|
name = "International Telecommunication Union"
|
285
255
|
url = "www.itu.int"
|
286
256
|
end
|
287
|
-
[{ entity: { name: name, url: url, abbreviation: abbrev },
|
257
|
+
[{ entity: { name: name, url: url, abbreviation: abbrev },
|
258
|
+
role: [type: "publisher"] }]
|
288
259
|
end
|
289
260
|
|
290
|
-
# Fetch ICS.
|
291
|
-
# @param doc [Nokogiri::HTML::Document]
|
292
|
-
# @return [Array<Hash>]
|
293
|
-
# def fetch_ics(doc)
|
294
|
-
# doc.xpath('//th[contains(text(), "ICS")]/following-sibling::td/a').map do |i|
|
295
|
-
# code = i.text.match(/[\d\.]+/).to_s.split '.'
|
296
|
-
# { field: code[0], group: code[1], subgroup: code[2] }
|
297
|
-
# end
|
298
|
-
# end
|
299
|
-
|
300
261
|
# Fetch links.
|
301
262
|
# @param doc [Nokogiri::HTML::Document]
|
302
263
|
# @param url [String]
|
303
264
|
# @return [Array<Hash>]
|
304
265
|
def fetch_link(doc, url)
|
305
266
|
links = [{ type: "src", content: url }]
|
306
|
-
|
307
|
-
|
267
|
+
obp_elm = doc.at(
|
268
|
+
'//a[@title="Persistent link to download the PDF file"]',
|
269
|
+
"//font[contains(.,'PDF')]/../..",
|
270
|
+
)
|
271
|
+
links << typed_link("obp", obp_elm) if obp_elm
|
272
|
+
wrd_elm = doc.at("//font[contains(.,'Word')]/../..")
|
273
|
+
links << typed_link("word", wrd_elm) if wrd_elm
|
308
274
|
links
|
309
275
|
end
|
310
276
|
|
277
|
+
def typed_link(type, elm)
|
278
|
+
{
|
279
|
+
type: type,
|
280
|
+
content: URI.join(HitCollection::DOMAIN + elm[:href].strip).to_s,
|
281
|
+
}
|
282
|
+
end
|
283
|
+
|
311
284
|
# Fetch copyright.
|
312
285
|
# @param code [String]
|
313
286
|
# @param doc [Nokogiri::HTML::Document]
|
314
|
-
# @return [Hash]
|
287
|
+
# @return [Array<Hash>]
|
315
288
|
def fetch_copyright(code, doc)
|
316
289
|
abbreviation = code.match(/^[^-]+/).to_s
|
317
290
|
case abbreviation
|
@@ -321,9 +294,9 @@ module RelatonItu
|
|
321
294
|
end
|
322
295
|
fdate = doc.at("//table/tr/td/span[contains(@id, 'Label5')]")
|
323
296
|
from = fdate&.text || ob_date(doc)
|
324
|
-
{ owner: { name: name, abbreviation: abbreviation, url: url },
|
297
|
+
[{ owner: [{ name: name, abbreviation: abbreviation, url: url }],
|
298
|
+
from: from }]
|
325
299
|
end
|
326
300
|
end
|
327
301
|
end
|
328
|
-
# rubocop:enable Metrics/ModuleLength
|
329
302
|
end
|