relaton-iso 0.5.2 → 0.5.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/Gemfile.lock +11 -17
- data/appveyor.yml +1 -0
- data/lib/relaton_iso/hit.rb +4 -7
- data/lib/relaton_iso/hit_collection.rb +47 -18
- data/lib/relaton_iso/iso_bibliography.rb +61 -40
- data/lib/relaton_iso/scrapper.rb +41 -43
- data/lib/relaton_iso/version.rb +1 -1
- data/relaton_iso.gemspec +0 -1
- metadata +2 -17
- data/lib/relaton_iso/hit_pages.rb +0 -96
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dd542380eec781e5113ea5455b3f0b528188306d
|
4
|
+
data.tar.gz: e457fa95214e668f42a2eda1a1ba5f0cef1e54a1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d15d13af842f09e14b48ae43c82ae362b37df6394da6b84bc146c359be029ee207bd5dc5ae87691c33d9951d4c4ff86a04542355780fb4a7918848ef665469cd
|
7
|
+
data.tar.gz: 4c0b4c5d1ac62f9418b611a59991c89cf7e29cfa1614319341162aed676d80ad7dc88f13438f1ae83010a09838d971b826e4d517b1a3f5212e863e93b9470b16
|
data/.travis.yml
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
relaton-iso (0.5.
|
5
|
-
algoliasearch
|
4
|
+
relaton-iso (0.5.4)
|
6
5
|
relaton-iec (~> 0.3.0)
|
7
6
|
relaton-iso-bib (~> 0.2.0)
|
8
7
|
|
@@ -11,14 +10,11 @@ GEM
|
|
11
10
|
specs:
|
12
11
|
addressable (2.6.0)
|
13
12
|
public_suffix (>= 2.0.2, < 4.0)
|
14
|
-
algoliasearch (1.26.0)
|
15
|
-
httpclient (~> 2.8, >= 2.8.3)
|
16
|
-
json (>= 1.5.1)
|
17
13
|
byebug (11.0.1)
|
18
14
|
coderay (1.1.2)
|
19
15
|
crack (0.4.3)
|
20
16
|
safe_yaml (~> 1.0.0)
|
21
|
-
debase (0.2.
|
17
|
+
debase (0.2.3)
|
22
18
|
debase-ruby_core_source (>= 0.10.2)
|
23
19
|
debase-ruby_core_source (0.10.5)
|
24
20
|
diff-lcs (1.3)
|
@@ -26,13 +22,12 @@ GEM
|
|
26
22
|
equivalent-xml (0.6.0)
|
27
23
|
nokogiri (>= 1.4.3)
|
28
24
|
hashdiff (0.4.0)
|
29
|
-
httpclient (2.8.3)
|
30
25
|
isoics (0.1.7)
|
31
26
|
json (2.2.0)
|
32
27
|
method_source (0.9.2)
|
33
|
-
mini_portile2 (2.
|
34
|
-
nokogiri (1.
|
35
|
-
mini_portile2 (~> 2.
|
28
|
+
mini_portile2 (2.4.0)
|
29
|
+
nokogiri (1.10.3)
|
30
|
+
mini_portile2 (~> 2.4.0)
|
36
31
|
pry (0.12.2)
|
37
32
|
coderay (~> 1.1.0)
|
38
33
|
method_source (~> 0.9.0)
|
@@ -41,22 +36,21 @@ GEM
|
|
41
36
|
pry (~> 0.10)
|
42
37
|
public_suffix (3.1.1)
|
43
38
|
rake (10.5.0)
|
44
|
-
relaton-bib (0.2.
|
39
|
+
relaton-bib (0.2.5)
|
45
40
|
addressable
|
46
|
-
nokogiri (~> 1.
|
47
|
-
relaton-iec (0.3.
|
41
|
+
nokogiri (~> 1.10)
|
42
|
+
relaton-iec (0.3.2)
|
48
43
|
addressable
|
49
44
|
relaton-iso-bib (~> 0.2.0)
|
50
|
-
relaton-iso-bib (0.2.
|
45
|
+
relaton-iso-bib (0.2.4)
|
51
46
|
isoics (~> 0.1.6)
|
52
|
-
nokogiri (~> 1.8.4)
|
53
47
|
relaton-bib (~> 0.2.0)
|
54
48
|
ruby_deep_clone (~> 0.8.0)
|
55
49
|
rspec (3.8.0)
|
56
50
|
rspec-core (~> 3.8.0)
|
57
51
|
rspec-expectations (~> 3.8.0)
|
58
52
|
rspec-mocks (~> 3.8.0)
|
59
|
-
rspec-core (3.8.
|
53
|
+
rspec-core (3.8.2)
|
60
54
|
rspec-support (~> 3.8.0)
|
61
55
|
rspec-expectations (3.8.4)
|
62
56
|
diff-lcs (>= 1.2.0, < 2.0)
|
@@ -69,7 +63,7 @@ GEM
|
|
69
63
|
rake (>= 0.8.1)
|
70
64
|
ruby_deep_clone (0.8.0)
|
71
65
|
safe_yaml (1.0.5)
|
72
|
-
simplecov (0.
|
66
|
+
simplecov (0.17.0)
|
73
67
|
docile (~> 1.1)
|
74
68
|
json (>= 1.8, < 3)
|
75
69
|
simplecov-html (~> 0.10.0)
|
data/appveyor.yml
CHANGED
data/lib/relaton_iso/hit.rb
CHANGED
@@ -29,15 +29,12 @@ module RelatonIso
|
|
29
29
|
|
30
30
|
# @return [String]
|
31
31
|
def inspect
|
32
|
-
matched_words = @hit["_highlightResult"].
|
33
|
-
|
32
|
+
# matched_words = @hit["_highlightResult"].
|
33
|
+
# reduce([]) { |a, (_k, v)| a + v["matchedWords"] }.uniq
|
34
34
|
|
35
35
|
"<#{self.class}:#{format('%#.14x', object_id << 1)} "\
|
36
|
-
"@text=\"#{@hit_collection&.
|
37
|
-
"@
|
38
|
-
"@matchedWords=#{matched_words} "\
|
39
|
-
"@category=\"#{@hit['category']}\" "\
|
40
|
-
"@title=\"#{@hit['title']}\">"
|
36
|
+
"@text=\"#{@hit_collection&.ref}\" "\
|
37
|
+
"@reference=\"#{@hit["docRef"]}\""
|
41
38
|
end
|
42
39
|
|
43
40
|
# @param builder [Nokogiri::XML::Builder]
|
@@ -6,37 +6,66 @@ module RelatonIso
|
|
6
6
|
# Page of hit collection.
|
7
7
|
class HitCollection < Array
|
8
8
|
# @return [TrueClass, FalseClass]
|
9
|
-
attr_reader :fetched
|
9
|
+
# attr_reader :fetched
|
10
10
|
|
11
11
|
# @return [RelatonIso::HitPages]
|
12
|
-
attr_reader :hit_pages
|
12
|
+
# attr_reader :hit_pages
|
13
|
+
|
14
|
+
# @return [String]
|
15
|
+
attr_reader :ref
|
13
16
|
|
14
17
|
# @param hits [Array<Hash>]
|
15
|
-
def initialize(
|
16
|
-
concat(hits.map { |h| Hit.new(h, self) })
|
17
|
-
@fetched = false
|
18
|
-
@hit_pages = hit_pages
|
18
|
+
def initialize(ref)
|
19
|
+
# concat(hits.map { |h| Hit.new(h, self) })
|
20
|
+
# @fetched = false
|
21
|
+
# @hit_pages = hit_pages
|
22
|
+
@ref = ref
|
23
|
+
%r{(?<num>\d+)(-(?<part>\d+))?} =~ ref
|
24
|
+
http = Net::HTTP.new "www.iso.org", 443
|
25
|
+
http.use_ssl = true
|
26
|
+
search = ["status=ENT_ACTIVE,ENT_PROGRESS,ENT_INACTIVE,ENT_DELETED"]
|
27
|
+
search << "docNumber=#{num}"
|
28
|
+
search << "docPartNo=#{part}" if part
|
29
|
+
q = search.join "&"
|
30
|
+
resp = http.get(
|
31
|
+
"/cms/render/live/en/sites/isoorg.advancedSearch.do?#{q}",
|
32
|
+
{ 'Accept' => 'application/json, text/plain, */*' }
|
33
|
+
)
|
34
|
+
json = JSON.parse resp.body
|
35
|
+
concat(json["standards"].map { |h| Hit.new h, self })
|
19
36
|
end
|
20
37
|
|
21
38
|
# @return [RelatonIso::HitCollection]
|
22
|
-
def fetch
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
end
|
39
|
+
# def fetch
|
40
|
+
# workers = RelatonBib::WorkersPool.new 4
|
41
|
+
# workers.worker(&:fetch)
|
42
|
+
# each do |hit|
|
43
|
+
# workers << hit
|
44
|
+
# end
|
45
|
+
# workers.end
|
46
|
+
# workers.result
|
47
|
+
# @fetched = true
|
48
|
+
# self
|
49
|
+
# end
|
33
50
|
|
34
51
|
def to_s
|
35
52
|
inspect
|
36
53
|
end
|
37
54
|
|
38
55
|
def inspect
|
39
|
-
"<#{self.class}:#{format('%#.14x', object_id << 1)} @
|
56
|
+
"<#{self.class}:#{format('%#.14x', object_id << 1)} @ref=#{@ref}>"
|
57
|
+
end
|
58
|
+
|
59
|
+
def to_xml(**opts)
|
60
|
+
builder = Nokogiri::XML::Builder.new(encoding: "UTF-8") do |xml|
|
61
|
+
xml.documents do
|
62
|
+
each do |hit|
|
63
|
+
hit.fetch
|
64
|
+
hit.to_xml xml, **opts
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
builder.to_xml
|
40
69
|
end
|
41
70
|
end
|
42
71
|
end
|
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
# require 'relaton_iso/iso_bibliographic_item'
|
4
4
|
require "relaton_iso/scrapper"
|
5
|
-
require "relaton_iso/
|
5
|
+
require "relaton_iso/hit_collection"
|
6
6
|
require "relaton_iec"
|
7
7
|
|
8
8
|
module RelatonIso
|
@@ -12,10 +12,9 @@ module RelatonIso
|
|
12
12
|
# @param text [String]
|
13
13
|
# @return [RelatonIso::HitPages]
|
14
14
|
def search(text)
|
15
|
-
|
16
|
-
rescue
|
17
|
-
|
18
|
-
# []
|
15
|
+
HitCollection.new text
|
16
|
+
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
|
17
|
+
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
|
19
18
|
raise RelatonBib::RequestError, "Could not access http://www.iso.org"
|
20
19
|
end
|
21
20
|
|
@@ -34,7 +33,7 @@ module RelatonIso
|
|
34
33
|
%r{
|
35
34
|
^(?<code1>[^\s]+\s[^/]+) # match code
|
36
35
|
/?
|
37
|
-
(?<corr>(Amd|CD Amd|Cor|CD Cor)\s\d+:?(\d{4})?(/Cor \d+:\d{4})?) # match correction
|
36
|
+
(?<corr>(Amd|DAmd|CD Amd|Cor|CD Cor)\s\d+:?(\d{4})?(/Cor \d+:\d{4})?) # match correction
|
38
37
|
}x =~ code
|
39
38
|
code = code1 if code1
|
40
39
|
|
@@ -46,14 +45,9 @@ module RelatonIso
|
|
46
45
|
end
|
47
46
|
end
|
48
47
|
code += "-1" if opts[:all_parts]
|
49
|
-
return RelatonIec::IecBibliography.get(code, year, opts) if %r[^ISO/IEC DIR]
|
48
|
+
return RelatonIec::IecBibliography.get(code, year, opts) if %r[^ISO/IEC DIR] =~ code
|
50
49
|
|
51
50
|
ret = isobib_get1(code, year, corr)
|
52
|
-
if ret.nil? && code =~ %r[^ISO\s]
|
53
|
-
c = code.gsub "ISO", "ISO/IEC"
|
54
|
-
warn "Attempting ISO/IEC retrieval"
|
55
|
-
ret = isobib_get1(c, year, corr)
|
56
|
-
end
|
57
51
|
return nil if ret.nil?
|
58
52
|
|
59
53
|
ret.to_most_recent_reference unless year || opts[:keep_year]
|
@@ -80,32 +74,63 @@ module RelatonIso
|
|
80
74
|
nil
|
81
75
|
end
|
82
76
|
|
83
|
-
def fetch_pages(s, n)
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
end
|
77
|
+
# def fetch_pages(s, n)
|
78
|
+
# workers = RelatonBib::WorkersPool.new n
|
79
|
+
# workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
|
80
|
+
# s.each_with_index { |hit, i| workers << { i: i, hit: hit } }
|
81
|
+
# workers.end
|
82
|
+
# workers.result.sort { |x, y| x[:i] <=> y[:i] }.map { |x| x[:hit] }
|
83
|
+
# end
|
90
84
|
|
85
|
+
# Search for hits. If no found then trying missed stages and ISO/IEC.
|
86
|
+
#
|
87
|
+
# @param code [String] reference without correction
|
88
|
+
# @param corr [String] correction
|
89
|
+
# @return [Array<RelatonIso::Hit>]
|
91
90
|
def isobib_search_filter(code, corr)
|
92
|
-
# docidrx = %r{^(ISO|IEC)[^0-9]*\s[0-9-]+}
|
93
|
-
# corrigrx = %r{^(ISO|IEC)[^0-9]*\s[0-9-]+:[0-9]+/}
|
94
91
|
warn "fetching #{code}..."
|
95
92
|
result = search(code)
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
93
|
+
res = search_code result, code, corr
|
94
|
+
return res unless res.empty?
|
95
|
+
|
96
|
+
# try stages
|
97
|
+
if %r{^\w+/[^/]+\s\d+} =~ code # code like ISO/IEC 123, ISO/IEC/IEE 123
|
98
|
+
res = try_stages(result, corr) do |st|
|
99
|
+
code.sub(%r{^(?<pref>[^\s]+\s)}) { "#{$~[:pref]}#{st} " }
|
100
|
+
end
|
101
|
+
return res unless res.empty?
|
102
|
+
elsif %r{^\w+\s\d+} =~ code # code like ISO 123
|
103
|
+
res = try_stages(result, corr) do |st|
|
104
|
+
code.sub(%r{^(?<pref>\w+)}) { "#{$~[:pref]}/#{st}" }
|
103
105
|
end
|
104
|
-
return
|
106
|
+
return res unless res.empty?
|
107
|
+
end
|
105
108
|
|
106
|
-
|
109
|
+
if %r{^ISO\s} =~ code # try ISO/IEC if ISO not found
|
110
|
+
warn "Attempting ISO/IEC retrieval"
|
111
|
+
c = code.sub "ISO", "ISO/IEC"
|
112
|
+
res = search_code result, c, corr
|
113
|
+
end
|
114
|
+
res
|
115
|
+
end
|
116
|
+
|
117
|
+
def try_stages(result, corr)
|
118
|
+
%w[NP WD CD DIS FDIS PRF IS].each do |st| # try stages
|
119
|
+
warn "Attempting #{st} stage retrieval"
|
120
|
+
c = yield st
|
121
|
+
res = search_code result, c, corr
|
122
|
+
return res unless res.empty?
|
123
|
+
end
|
124
|
+
[]
|
125
|
+
end
|
126
|
+
|
127
|
+
def search_code(result, code, corr)
|
128
|
+
result.select do |i|
|
129
|
+
i.hit["docRef"] =~ %r{^#{code}(?!-)} && (
|
130
|
+
corr && %r{^#{code}[\w-]*(:\d{4})?/#{corr}} =~ i.hit["docRef"] ||
|
131
|
+
%r{^#{code}[\w-]*(:\d{4})?/} !~ i.hit["docRef"] && !corr
|
132
|
+
)
|
107
133
|
end
|
108
|
-
# []
|
109
134
|
end
|
110
135
|
|
111
136
|
# Sort through the results from RelatonIso, fetching them three at a time,
|
@@ -116,17 +141,13 @@ module RelatonIso
|
|
116
141
|
# If no match, returns any years which caused mismatch, for error reporting
|
117
142
|
def isobib_results_filter(result, year)
|
118
143
|
missed_years = []
|
119
|
-
result.
|
120
|
-
|
121
|
-
next if r.nil?
|
122
|
-
return { ret: r } if !year
|
144
|
+
result.each do |s|
|
145
|
+
return { ret: s.fetch } if !year
|
123
146
|
|
124
|
-
|
125
|
-
|
147
|
+
%r{:(?<iyear>\d{4})} =~ s.hit["docRef"]
|
148
|
+
return { ret: s.fetch } if iyear == year
|
126
149
|
|
127
|
-
|
128
|
-
end
|
129
|
-
end
|
150
|
+
missed_years << iyear
|
130
151
|
end
|
131
152
|
{ years: missed_years }
|
132
153
|
end
|
data/lib/relaton_iso/scrapper.rb
CHANGED
@@ -1,14 +1,10 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require "algoliasearch"
|
4
3
|
require "relaton_iso_bib"
|
5
4
|
require "relaton_iso/hit"
|
6
5
|
require "nokogiri"
|
7
6
|
require "net/http"
|
8
7
|
|
9
|
-
Algolia.init application_id: "JCL49WV5AR",
|
10
|
-
api_key: "dd1b9e1ab383f4d4817d29cd5e96d3f0"
|
11
|
-
|
12
8
|
module RelatonIso
|
13
9
|
# Scrapper.
|
14
10
|
# rubocop:disable Metrics/ModuleLength
|
@@ -50,9 +46,8 @@ module RelatonIso
|
|
50
46
|
# @return [Hash]
|
51
47
|
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
52
48
|
def parse_page(hit_data)
|
53
|
-
|
54
|
-
|
55
|
-
doc, url = get_page "/standard/#{hit_data['path'].match(/\d+$/)}.html"
|
49
|
+
path = "/contents/data/standard#{hit_data["splitPath"]}/#{hit_data["csnumber"]}.html"
|
50
|
+
doc, url = get_page path
|
56
51
|
|
57
52
|
# Fetch edition.
|
58
53
|
edition = doc&.xpath("//strong[contains(text(), 'Edition')]/..")&.
|
@@ -67,14 +62,14 @@ module RelatonIso
|
|
67
62
|
language: langs(doc).map { |l| l[:lang] },
|
68
63
|
script: langs(doc).map { |l| script(l[:lang]) }.uniq,
|
69
64
|
titles: titles,
|
70
|
-
type: fetch_type(hit_data["
|
71
|
-
docstatus: fetch_status(doc
|
65
|
+
type: fetch_type(hit_data["docRef"]),
|
66
|
+
docstatus: fetch_status(doc),
|
72
67
|
ics: fetch_ics(doc),
|
73
|
-
dates: fetch_dates(doc, hit_data["
|
74
|
-
contributors: fetch_contributors(hit_data["
|
68
|
+
dates: fetch_dates(doc, hit_data["docRef"]),
|
69
|
+
contributors: fetch_contributors(hit_data["docRef"]),
|
75
70
|
editorialgroup: fetch_workgroup(doc),
|
76
71
|
abstract: abstract,
|
77
|
-
copyright: fetch_copyright(hit_data["
|
72
|
+
copyright: fetch_copyright(hit_data["docRef"], doc),
|
78
73
|
link: fetch_link(doc, url),
|
79
74
|
relations: fetch_relations(doc),
|
80
75
|
structuredidentifier: fetch_structuredidentifier(doc),
|
@@ -227,7 +222,7 @@ module RelatonIso
|
|
227
222
|
# @param doc [Nokogiri::HTML::Document]
|
228
223
|
# @param status [String]
|
229
224
|
# @return [Hash]
|
230
|
-
def fetch_status(doc
|
225
|
+
def fetch_status(doc)
|
231
226
|
stage, substage = doc.css("li.dropdown.active span.stage-code > strong").text.split "."
|
232
227
|
RelatonBib::DocumentStatus.new(stage: stage, substage: substage)
|
233
228
|
end
|
@@ -283,18 +278,18 @@ module RelatonIso
|
|
283
278
|
# rubocop:enable Metrics/MethodLength
|
284
279
|
|
285
280
|
# Fetch type.
|
286
|
-
# @param
|
281
|
+
# @param ref [String]
|
287
282
|
# @return [String]
|
288
|
-
def fetch_type(
|
289
|
-
|
290
|
-
|
283
|
+
def fetch_type(ref)
|
284
|
+
%r{
|
285
|
+
^(?<prefix>ISO|IWA|IEC)
|
286
|
+
(?:(/IEC|/IEEE|/PRF|/NP|/DGuide)*\s|/)
|
287
|
+
(?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))
|
288
|
+
}x =~ ref
|
291
289
|
# return "international-standard" if type_match.nil?
|
292
|
-
if TYPES[
|
293
|
-
|
294
|
-
elsif
|
295
|
-
"international-standard"
|
296
|
-
elsif type_match[1] == "IWA"
|
297
|
-
"international-workshop-agreement"
|
290
|
+
if TYPES[type] then TYPES[type]
|
291
|
+
elsif prefix == "ISO" then "international-standard"
|
292
|
+
elsif prefix == "IWA" then "international-workshop-agreement"
|
298
293
|
end
|
299
294
|
# rescue => _e
|
300
295
|
# puts 'Unknown document type: ' + title
|
@@ -305,10 +300,11 @@ module RelatonIso
|
|
305
300
|
# @param lang [String]
|
306
301
|
# @return [Hash]
|
307
302
|
def fetch_title(doc, lang)
|
308
|
-
titles = doc.at(
|
309
|
-
|
310
|
-
|
311
|
-
|
303
|
+
titles = doc.at(
|
304
|
+
"//h3[@itemprop='description'] | //h2[@itemprop='description']",
|
305
|
+
)&.text&.split " -- "
|
306
|
+
case titles&.size
|
307
|
+
when nil, 0
|
312
308
|
intro, main, part = nil, "", nil
|
313
309
|
when 1
|
314
310
|
intro, main, part = nil, titles[0], nil
|
@@ -344,10 +340,11 @@ module RelatonIso
|
|
344
340
|
# rubocop:disable Metrics/MethodLength
|
345
341
|
# Fetch dates
|
346
342
|
# @param doc [Nokogiri::HTML::Document]
|
343
|
+
# @param ref [String]
|
347
344
|
# @return [Array<Hash>]
|
348
|
-
def fetch_dates(doc,
|
345
|
+
def fetch_dates(doc, ref)
|
349
346
|
dates = []
|
350
|
-
%r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~
|
347
|
+
%r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ ref
|
351
348
|
pub_date_str = doc.xpath("//span[@itemprop='releaseDate']").text
|
352
349
|
if ref_date_str
|
353
350
|
ref_date = Date.strptime ref_date_str, "%Y"
|
@@ -368,8 +365,8 @@ module RelatonIso
|
|
368
365
|
dates
|
369
366
|
end
|
370
367
|
|
371
|
-
def fetch_contributors(
|
372
|
-
|
368
|
+
def fetch_contributors(ref)
|
369
|
+
ref.sub(/\s.*/, "").split("/").map do |abbrev|
|
373
370
|
case abbrev
|
374
371
|
when "IEC"
|
375
372
|
name = "International Electrotechnical Commission"
|
@@ -400,22 +397,23 @@ module RelatonIso
|
|
400
397
|
# @param url [String]
|
401
398
|
# @return [Array<Hash>]
|
402
399
|
def fetch_link(doc, url)
|
403
|
-
|
404
|
-
obp =
|
405
|
-
|
406
|
-
[
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
400
|
+
links = [{ type: "src", content: url }]
|
401
|
+
obp = doc.at("//a[contains(@href, '/obp/ui/')]")
|
402
|
+
links << { type: "obp", content: obp[:href] } if obp
|
403
|
+
rss = doc.at("//a[contains(@href, 'rss')]")
|
404
|
+
links << { type: "rss", content: DOMAIN + rss[:href] } if rss
|
405
|
+
pub = doc.at "//p[contains(., 'publicly available')]/a"
|
406
|
+
links << { type: "pub", content: pub[:href] } if pub
|
407
|
+
links
|
411
408
|
end
|
412
409
|
|
413
410
|
# Fetch copyright.
|
414
|
-
# @param
|
411
|
+
# @param ref [String]
|
412
|
+
# @param doc [Nokogiri::HTML::Document]
|
415
413
|
# @return [Hash]
|
416
|
-
def fetch_copyright(
|
417
|
-
owner_name =
|
418
|
-
from =
|
414
|
+
def fetch_copyright(ref, doc)
|
415
|
+
owner_name = ref.match(/.*?(?=\s)/).to_s
|
416
|
+
from = ref.match(/(?<=:)\d{4}/).to_s
|
419
417
|
if from.empty?
|
420
418
|
from = doc.xpath("//span[@itemprop='releaseDate']").text.match(/\d{4}/).to_s
|
421
419
|
end
|
data/lib/relaton_iso/version.rb
CHANGED
data/relaton_iso.gemspec
CHANGED
@@ -38,7 +38,6 @@ Gem::Specification.new do |spec|
|
|
38
38
|
spec.add_development_dependency "vcr"
|
39
39
|
spec.add_development_dependency "webmock"
|
40
40
|
|
41
|
-
spec.add_dependency "algoliasearch"
|
42
41
|
spec.add_dependency "relaton-iec", "~> 0.3.0"
|
43
42
|
spec.add_dependency "relaton-iso-bib", "~> 0.2.0"
|
44
43
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-iso
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.
|
4
|
+
version: 0.5.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-07-
|
11
|
+
date: 2019-07-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -164,20 +164,6 @@ dependencies:
|
|
164
164
|
- - ">="
|
165
165
|
- !ruby/object:Gem::Version
|
166
166
|
version: '0'
|
167
|
-
- !ruby/object:Gem::Dependency
|
168
|
-
name: algoliasearch
|
169
|
-
requirement: !ruby/object:Gem::Requirement
|
170
|
-
requirements:
|
171
|
-
- - ">="
|
172
|
-
- !ruby/object:Gem::Version
|
173
|
-
version: '0'
|
174
|
-
type: :runtime
|
175
|
-
prerelease: false
|
176
|
-
version_requirements: !ruby/object:Gem::Requirement
|
177
|
-
requirements:
|
178
|
-
- - ">="
|
179
|
-
- !ruby/object:Gem::Version
|
180
|
-
version: '0'
|
181
167
|
- !ruby/object:Gem::Dependency
|
182
168
|
name: relaton-iec
|
183
169
|
requirement: !ruby/object:Gem::Requirement
|
@@ -245,7 +231,6 @@ files:
|
|
245
231
|
- lib/relaton_iso.rb
|
246
232
|
- lib/relaton_iso/hit.rb
|
247
233
|
- lib/relaton_iso/hit_collection.rb
|
248
|
-
- lib/relaton_iso/hit_pages.rb
|
249
234
|
- lib/relaton_iso/iso_bibliography.rb
|
250
235
|
- lib/relaton_iso/scrapper.rb
|
251
236
|
- lib/relaton_iso/version.rb
|
@@ -1,96 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require "algoliasearch"
|
4
|
-
require "relaton_iso/hit_collection"
|
5
|
-
|
6
|
-
module RelatonIso
|
7
|
-
# Pages of hits.
|
8
|
-
class HitPages < Array
|
9
|
-
Algolia.init application_id: "JCL49WV5AR",
|
10
|
-
api_key: "dd1b9e1ab383f4d4817d29cd5e96d3f0"
|
11
|
-
|
12
|
-
# @return [String]
|
13
|
-
attr_reader :text
|
14
|
-
|
15
|
-
# @param text [String]
|
16
|
-
def initialize(text)
|
17
|
-
@text = text
|
18
|
-
@index = Algolia::Index.new "all_en"
|
19
|
-
resp = @index.search(text, facetFilters: ["category:standard"])
|
20
|
-
@nb_pages = resp["nbPages"]
|
21
|
-
self << HitCollection.new(resp["hits"], self)
|
22
|
-
end
|
23
|
-
|
24
|
-
# @return [RelatonIso::HitCollection]
|
25
|
-
def last
|
26
|
-
collection(@nb_pages - 1)
|
27
|
-
end
|
28
|
-
|
29
|
-
# @param i [Integer]
|
30
|
-
# @return [RelatonIso::HitCollection]
|
31
|
-
def [](idx)
|
32
|
-
# collection i
|
33
|
-
return if idx + 1 > @nb_pages
|
34
|
-
|
35
|
-
collection idx
|
36
|
-
super
|
37
|
-
end
|
38
|
-
|
39
|
-
# @return [Array]
|
40
|
-
def map(&block)
|
41
|
-
m = []
|
42
|
-
@nb_pages.times do |n|
|
43
|
-
m << yield(self[n]) if block
|
44
|
-
end
|
45
|
-
m
|
46
|
-
end
|
47
|
-
|
48
|
-
def each(&block)
|
49
|
-
@nb_pages.times do |n|
|
50
|
-
yield self[n] if block
|
51
|
-
end
|
52
|
-
end
|
53
|
-
|
54
|
-
def to_s
|
55
|
-
inspect
|
56
|
-
end
|
57
|
-
|
58
|
-
def inspect
|
59
|
-
"<#{self.class}:#{format('%#.14x', object_id << 1)} @text=#{@text} "\
|
60
|
-
"@pages=#{@nb_pages}>"
|
61
|
-
end
|
62
|
-
|
63
|
-
# @return [Integer]
|
64
|
-
def size
|
65
|
-
@nb_pages
|
66
|
-
end
|
67
|
-
|
68
|
-
def to_xml(**opts)
|
69
|
-
builder = Nokogiri::XML::Builder.new(encoding: "UTF-8") do |xml|
|
70
|
-
xml.documents do
|
71
|
-
each do |page|
|
72
|
-
page.fetch
|
73
|
-
page.each { |hit| hit.to_xml xml, **opts }
|
74
|
-
end
|
75
|
-
end
|
76
|
-
end
|
77
|
-
builder.to_xml
|
78
|
-
end
|
79
|
-
|
80
|
-
private
|
81
|
-
|
82
|
-
# @param i [Integer]
|
83
|
-
# @return [RelatonIso::HitCollection]
|
84
|
-
def collection(idx)
|
85
|
-
return if idx + 1 > @nb_pages
|
86
|
-
|
87
|
-
while Array.instance_method(:size).bind(self).call < idx + 1
|
88
|
-
resp = @index.search(@text,
|
89
|
-
facetFilters: ["category:standard"],
|
90
|
-
page: idx)
|
91
|
-
self << HitCollection.new(resp["hits"], self)
|
92
|
-
end
|
93
|
-
Array.instance_method(:[]).bind(self).call idx
|
94
|
-
end
|
95
|
-
end
|
96
|
-
end
|