relaton-iso 0.5.2 → 0.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/Gemfile.lock +11 -17
- data/appveyor.yml +1 -0
- data/lib/relaton_iso/hit.rb +4 -7
- data/lib/relaton_iso/hit_collection.rb +47 -18
- data/lib/relaton_iso/iso_bibliography.rb +61 -40
- data/lib/relaton_iso/scrapper.rb +41 -43
- data/lib/relaton_iso/version.rb +1 -1
- data/relaton_iso.gemspec +0 -1
- metadata +2 -17
- data/lib/relaton_iso/hit_pages.rb +0 -96
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dd542380eec781e5113ea5455b3f0b528188306d
|
4
|
+
data.tar.gz: e457fa95214e668f42a2eda1a1ba5f0cef1e54a1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d15d13af842f09e14b48ae43c82ae362b37df6394da6b84bc146c359be029ee207bd5dc5ae87691c33d9951d4c4ff86a04542355780fb4a7918848ef665469cd
|
7
|
+
data.tar.gz: 4c0b4c5d1ac62f9418b611a59991c89cf7e29cfa1614319341162aed676d80ad7dc88f13438f1ae83010a09838d971b826e4d517b1a3f5212e863e93b9470b16
|
data/.travis.yml
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
relaton-iso (0.5.2)
|
5
|
-
algoliasearch
|
4
|
+
relaton-iso (0.5.4)
|
6
5
|
relaton-iec (~> 0.3.0)
|
7
6
|
relaton-iso-bib (~> 0.2.0)
|
8
7
|
|
@@ -11,14 +10,11 @@ GEM
|
|
11
10
|
specs:
|
12
11
|
addressable (2.6.0)
|
13
12
|
public_suffix (>= 2.0.2, < 4.0)
|
14
|
-
algoliasearch (1.26.0)
|
15
|
-
httpclient (~> 2.8, >= 2.8.3)
|
16
|
-
json (>= 1.5.1)
|
17
13
|
byebug (11.0.1)
|
18
14
|
coderay (1.1.2)
|
19
15
|
crack (0.4.3)
|
20
16
|
safe_yaml (~> 1.0.0)
|
21
|
-
debase (0.2.
|
17
|
+
debase (0.2.3)
|
22
18
|
debase-ruby_core_source (>= 0.10.2)
|
23
19
|
debase-ruby_core_source (0.10.5)
|
24
20
|
diff-lcs (1.3)
|
@@ -26,13 +22,12 @@ GEM
|
|
26
22
|
equivalent-xml (0.6.0)
|
27
23
|
nokogiri (>= 1.4.3)
|
28
24
|
hashdiff (0.4.0)
|
29
|
-
httpclient (2.8.3)
|
30
25
|
isoics (0.1.7)
|
31
26
|
json (2.2.0)
|
32
27
|
method_source (0.9.2)
|
33
|
-
mini_portile2 (2.
|
34
|
-
nokogiri (1.
|
35
|
-
mini_portile2 (~> 2.
|
28
|
+
mini_portile2 (2.4.0)
|
29
|
+
nokogiri (1.10.3)
|
30
|
+
mini_portile2 (~> 2.4.0)
|
36
31
|
pry (0.12.2)
|
37
32
|
coderay (~> 1.1.0)
|
38
33
|
method_source (~> 0.9.0)
|
@@ -41,22 +36,21 @@ GEM
|
|
41
36
|
pry (~> 0.10)
|
42
37
|
public_suffix (3.1.1)
|
43
38
|
rake (10.5.0)
|
44
|
-
relaton-bib (0.2.
|
39
|
+
relaton-bib (0.2.5)
|
45
40
|
addressable
|
46
|
-
nokogiri (~> 1.
|
47
|
-
relaton-iec (0.3.
|
41
|
+
nokogiri (~> 1.10)
|
42
|
+
relaton-iec (0.3.2)
|
48
43
|
addressable
|
49
44
|
relaton-iso-bib (~> 0.2.0)
|
50
|
-
relaton-iso-bib (0.2.
|
45
|
+
relaton-iso-bib (0.2.4)
|
51
46
|
isoics (~> 0.1.6)
|
52
|
-
nokogiri (~> 1.8.4)
|
53
47
|
relaton-bib (~> 0.2.0)
|
54
48
|
ruby_deep_clone (~> 0.8.0)
|
55
49
|
rspec (3.8.0)
|
56
50
|
rspec-core (~> 3.8.0)
|
57
51
|
rspec-expectations (~> 3.8.0)
|
58
52
|
rspec-mocks (~> 3.8.0)
|
59
|
-
rspec-core (3.8.
|
53
|
+
rspec-core (3.8.2)
|
60
54
|
rspec-support (~> 3.8.0)
|
61
55
|
rspec-expectations (3.8.4)
|
62
56
|
diff-lcs (>= 1.2.0, < 2.0)
|
@@ -69,7 +63,7 @@ GEM
|
|
69
63
|
rake (>= 0.8.1)
|
70
64
|
ruby_deep_clone (0.8.0)
|
71
65
|
safe_yaml (1.0.5)
|
72
|
-
simplecov (0.
|
66
|
+
simplecov (0.17.0)
|
73
67
|
docile (~> 1.1)
|
74
68
|
json (>= 1.8, < 3)
|
75
69
|
simplecov-html (~> 0.10.0)
|
data/appveyor.yml
CHANGED
data/lib/relaton_iso/hit.rb
CHANGED
@@ -29,15 +29,12 @@ module RelatonIso
|
|
29
29
|
|
30
30
|
# @return [String]
|
31
31
|
def inspect
|
32
|
-
matched_words = @hit["_highlightResult"].
|
33
|
-
|
32
|
+
# matched_words = @hit["_highlightResult"].
|
33
|
+
# reduce([]) { |a, (_k, v)| a + v["matchedWords"] }.uniq
|
34
34
|
|
35
35
|
"<#{self.class}:#{format('%#.14x', object_id << 1)} "\
|
36
|
-
"@text=\"#{@hit_collection&.
|
37
|
-
"@
|
38
|
-
"@matchedWords=#{matched_words} "\
|
39
|
-
"@category=\"#{@hit['category']}\" "\
|
40
|
-
"@title=\"#{@hit['title']}\">"
|
36
|
+
"@text=\"#{@hit_collection&.ref}\" "\
|
37
|
+
"@reference=\"#{@hit["docRef"]}\""
|
41
38
|
end
|
42
39
|
|
43
40
|
# @param builder [Nokogiri::XML::Builder]
|
@@ -6,37 +6,66 @@ module RelatonIso
|
|
6
6
|
# Page of hit collection.
|
7
7
|
class HitCollection < Array
|
8
8
|
# @return [TrueClass, FalseClass]
|
9
|
-
attr_reader :fetched
|
9
|
+
# attr_reader :fetched
|
10
10
|
|
11
11
|
# @return [RelatonIso::HitPages]
|
12
|
-
attr_reader :hit_pages
|
12
|
+
# attr_reader :hit_pages
|
13
|
+
|
14
|
+
# @return [String]
|
15
|
+
attr_reader :ref
|
13
16
|
|
14
17
|
# @param hits [Array<Hash>]
|
15
|
-
def initialize(
|
16
|
-
concat(hits.map { |h| Hit.new(h, self) })
|
17
|
-
@fetched = false
|
18
|
-
@hit_pages = hit_pages
|
18
|
+
def initialize(ref)
|
19
|
+
# concat(hits.map { |h| Hit.new(h, self) })
|
20
|
+
# @fetched = false
|
21
|
+
# @hit_pages = hit_pages
|
22
|
+
@ref = ref
|
23
|
+
%r{(?<num>\d+)(-(?<part>\d+))?} =~ ref
|
24
|
+
http = Net::HTTP.new "www.iso.org", 443
|
25
|
+
http.use_ssl = true
|
26
|
+
search = ["status=ENT_ACTIVE,ENT_PROGRESS,ENT_INACTIVE,ENT_DELETED"]
|
27
|
+
search << "docNumber=#{num}"
|
28
|
+
search << "docPartNo=#{part}" if part
|
29
|
+
q = search.join "&"
|
30
|
+
resp = http.get(
|
31
|
+
"/cms/render/live/en/sites/isoorg.advancedSearch.do?#{q}",
|
32
|
+
{ 'Accept' => 'application/json, text/plain, */*' }
|
33
|
+
)
|
34
|
+
json = JSON.parse resp.body
|
35
|
+
concat(json["standards"].map { |h| Hit.new h, self })
|
19
36
|
end
|
20
37
|
|
21
38
|
# @return [RelatonIso::HitCollection]
|
22
|
-
def fetch
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
end
|
39
|
+
# def fetch
|
40
|
+
# workers = RelatonBib::WorkersPool.new 4
|
41
|
+
# workers.worker(&:fetch)
|
42
|
+
# each do |hit|
|
43
|
+
# workers << hit
|
44
|
+
# end
|
45
|
+
# workers.end
|
46
|
+
# workers.result
|
47
|
+
# @fetched = true
|
48
|
+
# self
|
49
|
+
# end
|
33
50
|
|
34
51
|
def to_s
|
35
52
|
inspect
|
36
53
|
end
|
37
54
|
|
38
55
|
def inspect
|
39
|
-
"<#{self.class}:#{format('%#.14x', object_id << 1)} @
|
56
|
+
"<#{self.class}:#{format('%#.14x', object_id << 1)} @ref=#{@ref}>"
|
57
|
+
end
|
58
|
+
|
59
|
+
def to_xml(**opts)
|
60
|
+
builder = Nokogiri::XML::Builder.new(encoding: "UTF-8") do |xml|
|
61
|
+
xml.documents do
|
62
|
+
each do |hit|
|
63
|
+
hit.fetch
|
64
|
+
hit.to_xml xml, **opts
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
builder.to_xml
|
40
69
|
end
|
41
70
|
end
|
42
71
|
end
|
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
# require 'relaton_iso/iso_bibliographic_item'
|
4
4
|
require "relaton_iso/scrapper"
|
5
|
-
require "relaton_iso/
|
5
|
+
require "relaton_iso/hit_collection"
|
6
6
|
require "relaton_iec"
|
7
7
|
|
8
8
|
module RelatonIso
|
@@ -12,10 +12,9 @@ module RelatonIso
|
|
12
12
|
# @param text [String]
|
13
13
|
# @return [RelatonIso::HitPages]
|
14
14
|
def search(text)
|
15
|
-
|
16
|
-
rescue
|
17
|
-
|
18
|
-
# []
|
15
|
+
HitCollection.new text
|
16
|
+
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError,
|
17
|
+
Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError, Net::ProtocolError
|
19
18
|
raise RelatonBib::RequestError, "Could not access http://www.iso.org"
|
20
19
|
end
|
21
20
|
|
@@ -34,7 +33,7 @@ module RelatonIso
|
|
34
33
|
%r{
|
35
34
|
^(?<code1>[^\s]+\s[^/]+) # match code
|
36
35
|
/?
|
37
|
-
(?<corr>(Amd|CD Amd|Cor|CD Cor)\s\d+:?(\d{4})?(/Cor \d+:\d{4})?) # match correction
|
36
|
+
(?<corr>(Amd|DAmd|CD Amd|Cor|CD Cor)\s\d+:?(\d{4})?(/Cor \d+:\d{4})?) # match correction
|
38
37
|
}x =~ code
|
39
38
|
code = code1 if code1
|
40
39
|
|
@@ -46,14 +45,9 @@ module RelatonIso
|
|
46
45
|
end
|
47
46
|
end
|
48
47
|
code += "-1" if opts[:all_parts]
|
49
|
-
return RelatonIec::IecBibliography.get(code, year, opts) if %r[^ISO/IEC DIR]
|
48
|
+
return RelatonIec::IecBibliography.get(code, year, opts) if %r[^ISO/IEC DIR] =~ code
|
50
49
|
|
51
50
|
ret = isobib_get1(code, year, corr)
|
52
|
-
if ret.nil? && code =~ %r[^ISO\s]
|
53
|
-
c = code.gsub "ISO", "ISO/IEC"
|
54
|
-
warn "Attempting ISO/IEC retrieval"
|
55
|
-
ret = isobib_get1(c, year, corr)
|
56
|
-
end
|
57
51
|
return nil if ret.nil?
|
58
52
|
|
59
53
|
ret.to_most_recent_reference unless year || opts[:keep_year]
|
@@ -80,32 +74,63 @@ module RelatonIso
|
|
80
74
|
nil
|
81
75
|
end
|
82
76
|
|
83
|
-
def fetch_pages(s, n)
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
end
|
77
|
+
# def fetch_pages(s, n)
|
78
|
+
# workers = RelatonBib::WorkersPool.new n
|
79
|
+
# workers.worker { |w| { i: w[:i], hit: w[:hit].fetch } }
|
80
|
+
# s.each_with_index { |hit, i| workers << { i: i, hit: hit } }
|
81
|
+
# workers.end
|
82
|
+
# workers.result.sort { |x, y| x[:i] <=> y[:i] }.map { |x| x[:hit] }
|
83
|
+
# end
|
90
84
|
|
85
|
+
# Search for hits. If no found then trying missed stages and ISO/IEC.
|
86
|
+
#
|
87
|
+
# @param code [String] reference without correction
|
88
|
+
# @param corr [String] correction
|
89
|
+
# @return [Array<RelatonIso::Hit>]
|
91
90
|
def isobib_search_filter(code, corr)
|
92
|
-
# docidrx = %r{^(ISO|IEC)[^0-9]*\s[0-9-]+}
|
93
|
-
# corrigrx = %r{^(ISO|IEC)[^0-9]*\s[0-9-]+:[0-9]+/}
|
94
91
|
warn "fetching #{code}..."
|
95
92
|
result = search(code)
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
93
|
+
res = search_code result, code, corr
|
94
|
+
return res unless res.empty?
|
95
|
+
|
96
|
+
# try stages
|
97
|
+
if %r{^\w+/[^/]+\s\d+} =~ code # code like ISO/IEC 123, ISO/IEC/IEE 123
|
98
|
+
res = try_stages(result, corr) do |st|
|
99
|
+
code.sub(%r{^(?<pref>[^\s]+\s)}) { "#{$~[:pref]}#{st} " }
|
100
|
+
end
|
101
|
+
return res unless res.empty?
|
102
|
+
elsif %r{^\w+\s\d+} =~ code # code like ISO 123
|
103
|
+
res = try_stages(result, corr) do |st|
|
104
|
+
code.sub(%r{^(?<pref>\w+)}) { "#{$~[:pref]}/#{st}" }
|
103
105
|
end
|
104
|
-
return
|
106
|
+
return res unless res.empty?
|
107
|
+
end
|
105
108
|
|
106
|
-
|
109
|
+
if %r{^ISO\s} =~ code # try ISO/IEC if ISO not found
|
110
|
+
warn "Attempting ISO/IEC retrieval"
|
111
|
+
c = code.sub "ISO", "ISO/IEC"
|
112
|
+
res = search_code result, c, corr
|
113
|
+
end
|
114
|
+
res
|
115
|
+
end
|
116
|
+
|
117
|
+
def try_stages(result, corr)
|
118
|
+
%w[NP WD CD DIS FDIS PRF IS].each do |st| # try stages
|
119
|
+
warn "Attempting #{st} stage retrieval"
|
120
|
+
c = yield st
|
121
|
+
res = search_code result, c, corr
|
122
|
+
return res unless res.empty?
|
123
|
+
end
|
124
|
+
[]
|
125
|
+
end
|
126
|
+
|
127
|
+
def search_code(result, code, corr)
|
128
|
+
result.select do |i|
|
129
|
+
i.hit["docRef"] =~ %r{^#{code}(?!-)} && (
|
130
|
+
corr && %r{^#{code}[\w-]*(:\d{4})?/#{corr}} =~ i.hit["docRef"] ||
|
131
|
+
%r{^#{code}[\w-]*(:\d{4})?/} !~ i.hit["docRef"] && !corr
|
132
|
+
)
|
107
133
|
end
|
108
|
-
# []
|
109
134
|
end
|
110
135
|
|
111
136
|
# Sort through the results from RelatonIso, fetching them three at a time,
|
@@ -116,17 +141,13 @@ module RelatonIso
|
|
116
141
|
# If no match, returns any years which caused mismatch, for error reporting
|
117
142
|
def isobib_results_filter(result, year)
|
118
143
|
missed_years = []
|
119
|
-
result.
|
120
|
-
|
121
|
-
next if r.nil?
|
122
|
-
return { ret: r } if !year
|
144
|
+
result.each do |s|
|
145
|
+
return { ret: s.fetch } if !year
|
123
146
|
|
124
|
-
|
125
|
-
|
147
|
+
%r{:(?<iyear>\d{4})} =~ s.hit["docRef"]
|
148
|
+
return { ret: s.fetch } if iyear == year
|
126
149
|
|
127
|
-
|
128
|
-
end
|
129
|
-
end
|
150
|
+
missed_years << iyear
|
130
151
|
end
|
131
152
|
{ years: missed_years }
|
132
153
|
end
|
data/lib/relaton_iso/scrapper.rb
CHANGED
@@ -1,14 +1,10 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require "algoliasearch"
|
4
3
|
require "relaton_iso_bib"
|
5
4
|
require "relaton_iso/hit"
|
6
5
|
require "nokogiri"
|
7
6
|
require "net/http"
|
8
7
|
|
9
|
-
Algolia.init application_id: "JCL49WV5AR",
|
10
|
-
api_key: "dd1b9e1ab383f4d4817d29cd5e96d3f0"
|
11
|
-
|
12
8
|
module RelatonIso
|
13
9
|
# Scrapper.
|
14
10
|
# rubocop:disable Metrics/ModuleLength
|
@@ -50,9 +46,8 @@ module RelatonIso
|
|
50
46
|
# @return [Hash]
|
51
47
|
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
52
48
|
def parse_page(hit_data)
|
53
|
-
|
54
|
-
|
55
|
-
doc, url = get_page "/standard/#{hit_data['path'].match(/\d+$/)}.html"
|
49
|
+
path = "/contents/data/standard#{hit_data["splitPath"]}/#{hit_data["csnumber"]}.html"
|
50
|
+
doc, url = get_page path
|
56
51
|
|
57
52
|
# Fetch edition.
|
58
53
|
edition = doc&.xpath("//strong[contains(text(), 'Edition')]/..")&.
|
@@ -67,14 +62,14 @@ module RelatonIso
|
|
67
62
|
language: langs(doc).map { |l| l[:lang] },
|
68
63
|
script: langs(doc).map { |l| script(l[:lang]) }.uniq,
|
69
64
|
titles: titles,
|
70
|
-
type: fetch_type(hit_data["
|
71
|
-
docstatus: fetch_status(doc
|
65
|
+
type: fetch_type(hit_data["docRef"]),
|
66
|
+
docstatus: fetch_status(doc),
|
72
67
|
ics: fetch_ics(doc),
|
73
|
-
dates: fetch_dates(doc, hit_data["
|
74
|
-
contributors: fetch_contributors(hit_data["
|
68
|
+
dates: fetch_dates(doc, hit_data["docRef"]),
|
69
|
+
contributors: fetch_contributors(hit_data["docRef"]),
|
75
70
|
editorialgroup: fetch_workgroup(doc),
|
76
71
|
abstract: abstract,
|
77
|
-
copyright: fetch_copyright(hit_data["
|
72
|
+
copyright: fetch_copyright(hit_data["docRef"], doc),
|
78
73
|
link: fetch_link(doc, url),
|
79
74
|
relations: fetch_relations(doc),
|
80
75
|
structuredidentifier: fetch_structuredidentifier(doc),
|
@@ -227,7 +222,7 @@ module RelatonIso
|
|
227
222
|
# @param doc [Nokogiri::HTML::Document]
|
228
223
|
# @param status [String]
|
229
224
|
# @return [Hash]
|
230
|
-
def fetch_status(doc
|
225
|
+
def fetch_status(doc)
|
231
226
|
stage, substage = doc.css("li.dropdown.active span.stage-code > strong").text.split "."
|
232
227
|
RelatonBib::DocumentStatus.new(stage: stage, substage: substage)
|
233
228
|
end
|
@@ -283,18 +278,18 @@ module RelatonIso
|
|
283
278
|
# rubocop:enable Metrics/MethodLength
|
284
279
|
|
285
280
|
# Fetch type.
|
286
|
-
# @param
|
281
|
+
# @param ref [String]
|
287
282
|
# @return [String]
|
288
|
-
def fetch_type(
|
289
|
-
|
290
|
-
|
283
|
+
def fetch_type(ref)
|
284
|
+
%r{
|
285
|
+
^(?<prefix>ISO|IWA|IEC)
|
286
|
+
(?:(/IEC|/IEEE|/PRF|/NP|/DGuide)*\s|/)
|
287
|
+
(?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))
|
288
|
+
}x =~ ref
|
291
289
|
# return "international-standard" if type_match.nil?
|
292
|
-
if TYPES[
|
293
|
-
|
294
|
-
elsif
|
295
|
-
"international-standard"
|
296
|
-
elsif type_match[1] == "IWA"
|
297
|
-
"international-workshop-agreement"
|
290
|
+
if TYPES[type] then TYPES[type]
|
291
|
+
elsif prefix == "ISO" then "international-standard"
|
292
|
+
elsif prefix == "IWA" then "international-workshop-agreement"
|
298
293
|
end
|
299
294
|
# rescue => _e
|
300
295
|
# puts 'Unknown document type: ' + title
|
@@ -305,10 +300,11 @@ module RelatonIso
|
|
305
300
|
# @param lang [String]
|
306
301
|
# @return [Hash]
|
307
302
|
def fetch_title(doc, lang)
|
308
|
-
titles = doc.at(
|
309
|
-
|
310
|
-
|
311
|
-
|
303
|
+
titles = doc.at(
|
304
|
+
"//h3[@itemprop='description'] | //h2[@itemprop='description']",
|
305
|
+
)&.text&.split " -- "
|
306
|
+
case titles&.size
|
307
|
+
when nil, 0
|
312
308
|
intro, main, part = nil, "", nil
|
313
309
|
when 1
|
314
310
|
intro, main, part = nil, titles[0], nil
|
@@ -344,10 +340,11 @@ module RelatonIso
|
|
344
340
|
# rubocop:disable Metrics/MethodLength
|
345
341
|
# Fetch dates
|
346
342
|
# @param doc [Nokogiri::HTML::Document]
|
343
|
+
# @param ref [String]
|
347
344
|
# @return [Array<Hash>]
|
348
|
-
def fetch_dates(doc,
|
345
|
+
def fetch_dates(doc, ref)
|
349
346
|
dates = []
|
350
|
-
%r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~
|
347
|
+
%r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ ref
|
351
348
|
pub_date_str = doc.xpath("//span[@itemprop='releaseDate']").text
|
352
349
|
if ref_date_str
|
353
350
|
ref_date = Date.strptime ref_date_str, "%Y"
|
@@ -368,8 +365,8 @@ module RelatonIso
|
|
368
365
|
dates
|
369
366
|
end
|
370
367
|
|
371
|
-
def fetch_contributors(
|
372
|
-
|
368
|
+
def fetch_contributors(ref)
|
369
|
+
ref.sub(/\s.*/, "").split("/").map do |abbrev|
|
373
370
|
case abbrev
|
374
371
|
when "IEC"
|
375
372
|
name = "International Electrotechnical Commission"
|
@@ -400,22 +397,23 @@ module RelatonIso
|
|
400
397
|
# @param url [String]
|
401
398
|
# @return [Array<Hash>]
|
402
399
|
def fetch_link(doc, url)
|
403
|
-
|
404
|
-
obp =
|
405
|
-
|
406
|
-
[
|
407
|
-
|
408
|
-
|
409
|
-
|
410
|
-
|
400
|
+
links = [{ type: "src", content: url }]
|
401
|
+
obp = doc.at("//a[contains(@href, '/obp/ui/')]")
|
402
|
+
links << { type: "obp", content: obp[:href] } if obp
|
403
|
+
rss = doc.at("//a[contains(@href, 'rss')]")
|
404
|
+
links << { type: "rss", content: DOMAIN + rss[:href] } if rss
|
405
|
+
pub = doc.at "//p[contains(., 'publicly available')]/a"
|
406
|
+
links << { type: "pub", content: pub[:href] } if pub
|
407
|
+
links
|
411
408
|
end
|
412
409
|
|
413
410
|
# Fetch copyright.
|
414
|
-
# @param
|
411
|
+
# @param ref [String]
|
412
|
+
# @param doc [Nokogiri::HTML::Document]
|
415
413
|
# @return [Hash]
|
416
|
-
def fetch_copyright(
|
417
|
-
owner_name =
|
418
|
-
from =
|
414
|
+
def fetch_copyright(ref, doc)
|
415
|
+
owner_name = ref.match(/.*?(?=\s)/).to_s
|
416
|
+
from = ref.match(/(?<=:)\d{4}/).to_s
|
419
417
|
if from.empty?
|
420
418
|
from = doc.xpath("//span[@itemprop='releaseDate']").text.match(/\d{4}/).to_s
|
421
419
|
end
|
data/lib/relaton_iso/version.rb
CHANGED
data/relaton_iso.gemspec
CHANGED
@@ -38,7 +38,6 @@ Gem::Specification.new do |spec|
|
|
38
38
|
spec.add_development_dependency "vcr"
|
39
39
|
spec.add_development_dependency "webmock"
|
40
40
|
|
41
|
-
spec.add_dependency "algoliasearch"
|
42
41
|
spec.add_dependency "relaton-iec", "~> 0.3.0"
|
43
42
|
spec.add_dependency "relaton-iso-bib", "~> 0.2.0"
|
44
43
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-iso
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.2
|
4
|
+
version: 0.5.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-07-
|
11
|
+
date: 2019-07-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -164,20 +164,6 @@ dependencies:
|
|
164
164
|
- - ">="
|
165
165
|
- !ruby/object:Gem::Version
|
166
166
|
version: '0'
|
167
|
-
- !ruby/object:Gem::Dependency
|
168
|
-
name: algoliasearch
|
169
|
-
requirement: !ruby/object:Gem::Requirement
|
170
|
-
requirements:
|
171
|
-
- - ">="
|
172
|
-
- !ruby/object:Gem::Version
|
173
|
-
version: '0'
|
174
|
-
type: :runtime
|
175
|
-
prerelease: false
|
176
|
-
version_requirements: !ruby/object:Gem::Requirement
|
177
|
-
requirements:
|
178
|
-
- - ">="
|
179
|
-
- !ruby/object:Gem::Version
|
180
|
-
version: '0'
|
181
167
|
- !ruby/object:Gem::Dependency
|
182
168
|
name: relaton-iec
|
183
169
|
requirement: !ruby/object:Gem::Requirement
|
@@ -245,7 +231,6 @@ files:
|
|
245
231
|
- lib/relaton_iso.rb
|
246
232
|
- lib/relaton_iso/hit.rb
|
247
233
|
- lib/relaton_iso/hit_collection.rb
|
248
|
-
- lib/relaton_iso/hit_pages.rb
|
249
234
|
- lib/relaton_iso/iso_bibliography.rb
|
250
235
|
- lib/relaton_iso/scrapper.rb
|
251
236
|
- lib/relaton_iso/version.rb
|
@@ -1,96 +0,0 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
|
-
require "algoliasearch"
|
4
|
-
require "relaton_iso/hit_collection"
|
5
|
-
|
6
|
-
module RelatonIso
|
7
|
-
# Pages of hits.
|
8
|
-
class HitPages < Array
|
9
|
-
Algolia.init application_id: "JCL49WV5AR",
|
10
|
-
api_key: "dd1b9e1ab383f4d4817d29cd5e96d3f0"
|
11
|
-
|
12
|
-
# @return [String]
|
13
|
-
attr_reader :text
|
14
|
-
|
15
|
-
# @param text [String]
|
16
|
-
def initialize(text)
|
17
|
-
@text = text
|
18
|
-
@index = Algolia::Index.new "all_en"
|
19
|
-
resp = @index.search(text, facetFilters: ["category:standard"])
|
20
|
-
@nb_pages = resp["nbPages"]
|
21
|
-
self << HitCollection.new(resp["hits"], self)
|
22
|
-
end
|
23
|
-
|
24
|
-
# @return [RelatonIso::HitCollection]
|
25
|
-
def last
|
26
|
-
collection(@nb_pages - 1)
|
27
|
-
end
|
28
|
-
|
29
|
-
# @param i [Integer]
|
30
|
-
# @return [RelatonIso::HitCollection]
|
31
|
-
def [](idx)
|
32
|
-
# collection i
|
33
|
-
return if idx + 1 > @nb_pages
|
34
|
-
|
35
|
-
collection idx
|
36
|
-
super
|
37
|
-
end
|
38
|
-
|
39
|
-
# @return [Array]
|
40
|
-
def map(&block)
|
41
|
-
m = []
|
42
|
-
@nb_pages.times do |n|
|
43
|
-
m << yield(self[n]) if block
|
44
|
-
end
|
45
|
-
m
|
46
|
-
end
|
47
|
-
|
48
|
-
def each(&block)
|
49
|
-
@nb_pages.times do |n|
|
50
|
-
yield self[n] if block
|
51
|
-
end
|
52
|
-
end
|
53
|
-
|
54
|
-
def to_s
|
55
|
-
inspect
|
56
|
-
end
|
57
|
-
|
58
|
-
def inspect
|
59
|
-
"<#{self.class}:#{format('%#.14x', object_id << 1)} @text=#{@text} "\
|
60
|
-
"@pages=#{@nb_pages}>"
|
61
|
-
end
|
62
|
-
|
63
|
-
# @return [Integer]
|
64
|
-
def size
|
65
|
-
@nb_pages
|
66
|
-
end
|
67
|
-
|
68
|
-
def to_xml(**opts)
|
69
|
-
builder = Nokogiri::XML::Builder.new(encoding: "UTF-8") do |xml|
|
70
|
-
xml.documents do
|
71
|
-
each do |page|
|
72
|
-
page.fetch
|
73
|
-
page.each { |hit| hit.to_xml xml, **opts }
|
74
|
-
end
|
75
|
-
end
|
76
|
-
end
|
77
|
-
builder.to_xml
|
78
|
-
end
|
79
|
-
|
80
|
-
private
|
81
|
-
|
82
|
-
# @param i [Integer]
|
83
|
-
# @return [RelatonIso::HitCollection]
|
84
|
-
def collection(idx)
|
85
|
-
return if idx + 1 > @nb_pages
|
86
|
-
|
87
|
-
while Array.instance_method(:size).bind(self).call < idx + 1
|
88
|
-
resp = @index.search(@text,
|
89
|
-
facetFilters: ["category:standard"],
|
90
|
-
page: idx)
|
91
|
-
self << HitCollection.new(resp["hits"], self)
|
92
|
-
end
|
93
|
-
Array.instance_method(:[]).bind(self).call idx
|
94
|
-
end
|
95
|
-
end
|
96
|
-
end
|