relaton-iso 0.8.0 → 0.8.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.adoc +30 -0
- data/lib/relaton_iso/hit.rb +3 -2
- data/lib/relaton_iso/hit_collection.rb +12 -27
- data/lib/relaton_iso/iso_bibliography.rb +15 -16
- data/lib/relaton_iso/scrapper.rb +36 -25
- data/lib/relaton_iso/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 23cfcc3b3c94cf0988d6087a1afacb4bcc6065249b4f6785879e8242384bf00d
|
4
|
+
data.tar.gz: f973dc6ba692b89acd1c6b4f9f75b9261ca7aff09557ed58605a7b1b8e499719
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8d88890b35c169076ceb3350eedb036116a4fe30403ae23e6e9300718d5cfd8d6d37f49a6724bdb2c4b459a7782bd429cab660d4a195bcef92ac0dc347021c6f
|
7
|
+
data.tar.gz: e68d82ea105bba5dc0ab989a35eb02062adefdb45302c17065f9b47b3bf3f3b6eb9ea4e3caa43177782235013f984ea6fe574cad8e95fe1a535319d63c8b573e
|
data/README.adoc
CHANGED
@@ -183,6 +183,36 @@ RelatonIso::IsoBibliography.get('ISO 19115', "2014", {all_parts: true}).title
|
|
183
183
|
@type="main">]
|
184
184
|
----
|
185
185
|
|
186
|
+
=== Get specific language
|
187
|
+
|
188
|
+
[source,ruby]
|
189
|
+
----
|
190
|
+
item = RelatonIso::IsoBibliography.get 'ISO 19115', nil, {lang: "en"}
|
191
|
+
item.to_xml
|
192
|
+
=> <bibitem id="ISO19115-1-2014" type="standard">
|
193
|
+
<fetched>2020-01-22</fetched>
|
194
|
+
<title type="title-intro" format="text/plain" language="en" script="Latn">Geographic information</title>
|
195
|
+
<title type="title-main" format="text/plain" language="en" script="Latn">Metadata</title>
|
196
|
+
<title type="main" format="text/plain" language="en" script="Latn">Geographic information – Metadata</title>
|
197
|
+
<uri type="src">https://www.iso.org/standard/53798.html</uri>
|
198
|
+
...
|
199
|
+
</bibitem>
|
200
|
+
|
201
|
+
item = RelatonIso::IsoBibliography.get 'ISO 19115', nil, {lang: "fr"}
|
202
|
+
item.to_xml
|
203
|
+
=> <bibitem id="ISO19115-1-2014" type="standard">
|
204
|
+
<fetched>2020-01-22</fetched>
|
205
|
+
<title type="title-intro" format="text/plain" language="en" script="Latn">Geographic information</title>
|
206
|
+
<title type="title-main" format="text/plain" language="en" script="Latn">Metadata</title>
|
207
|
+
<title type="main" format="text/plain" language="en" script="Latn">Geographic information – Metadata</title>
|
208
|
+
<title type="title-intro" format="text/plain" language="fr" script="Latn">Information géographique</title>
|
209
|
+
<title type="title-main" format="text/plain" language="fr" script="Latn">Métadonnées</title>
|
210
|
+
<title type="main" format="text/plain" language="fr" script="Latn">Information géographique – Métadonnées</title>
|
211
|
+
<uri type="src">https://www.iso.org/standard/53798.html</uri>
|
212
|
+
...
|
213
|
+
</bibitem>
|
214
|
+
----
|
215
|
+
|
186
216
|
== Development
|
187
217
|
|
188
218
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/lib/relaton_iso/hit.rb
CHANGED
@@ -7,9 +7,10 @@ module RelatonIso
|
|
7
7
|
attr_reader :hit_collection
|
8
8
|
|
9
9
|
# Parse page.
|
10
|
+
# @param lang [String, NilClass]
|
10
11
|
# @return [RelatonIsoBib::IsoBibliographicItem]
|
11
|
-
def fetch
|
12
|
-
@fetch ||= Scrapper.parse_page @hit
|
12
|
+
def fetch(lang = nil)
|
13
|
+
@fetch ||= Scrapper.parse_page @hit, lang
|
13
14
|
end
|
14
15
|
|
15
16
|
# @param builder [Nokogiri::XML::Builder]
|
@@ -10,25 +10,23 @@ module RelatonIso
|
|
10
10
|
|
11
11
|
def_delegators :@array, :<<, :[], :first, :empty?, :any?, :size
|
12
12
|
|
13
|
-
# @return [
|
14
|
-
# attr_reader :fetched
|
15
|
-
|
16
|
-
# @return [RelatonIso::HitPages]
|
17
|
-
# attr_reader :hit_pages
|
18
|
-
|
19
|
-
# @return [String]
|
13
|
+
# @return [String, NilClass]
|
20
14
|
attr_reader :text
|
21
15
|
|
22
|
-
# @param
|
16
|
+
# @param text [String] reference to search
|
23
17
|
def initialize(text)
|
24
18
|
@array = []
|
25
19
|
@text = text
|
26
|
-
%r{\s(?<num>\d+)(-(?<part
|
20
|
+
%r{\s(?<num>\d+)(-(?<part>[\d-]+))?} =~ text
|
27
21
|
http = Net::HTTP.new "www.iso.org", 443
|
28
22
|
http.use_ssl = true
|
29
23
|
search = ["status=ENT_ACTIVE,ENT_PROGRESS,ENT_INACTIVE,ENT_DELETED"]
|
30
24
|
search << "docNumber=#{num}"
|
31
25
|
search << "docPartNo=#{part}" if part
|
26
|
+
# if year
|
27
|
+
# search << "stageDateStart=#{Date.new(year.to_i).strftime("%Y-%m-%d")}"
|
28
|
+
# search << "stageDateEnd=#{Date.new(year.to_i, 12, 31).strftime("%Y-%m-%d")}"
|
29
|
+
# end
|
32
30
|
q = search.join "&"
|
33
31
|
resp = http.get("/cms/render/live/en/sites/isoorg.advancedSearch.do?#{q}",
|
34
32
|
"Accept" => "application/json, text/plain, */*")
|
@@ -55,27 +53,14 @@ module RelatonIso
|
|
55
53
|
self
|
56
54
|
end
|
57
55
|
|
58
|
-
# @
|
59
|
-
#
|
60
|
-
|
61
|
-
|
62
|
-
# workers = RelatonBib::WorkersPool.new 4
|
63
|
-
# workers.worker(&:fetch)
|
64
|
-
# @array.each do |hit|
|
65
|
-
# workers << hit
|
66
|
-
# end
|
67
|
-
# workers.end
|
68
|
-
# workers.result
|
69
|
-
# @fetched = true
|
70
|
-
# self
|
71
|
-
# end
|
72
|
-
|
73
|
-
def to_all_parts
|
56
|
+
# @param lang [String, NilClass]
|
57
|
+
# @return [RelatonIsoBib::IsoBibliographicItem]
|
58
|
+
def to_all_parts(lang = nil)
|
74
59
|
parts = @array.select { |h| !h.hit["docPart"].empty? }
|
75
60
|
hit = parts.min_by { |h| h.hit["docPart"].to_i }
|
76
|
-
return @array.first.fetch unless hit
|
61
|
+
return @array.first.fetch lang unless hit
|
77
62
|
|
78
|
-
bibitem = hit.fetch
|
63
|
+
bibitem = hit.fetch lang
|
79
64
|
bibitem.to_all_parts
|
80
65
|
parts.reject { |h| h.hit["docRef"] == hit.hit["docRef"] }.each do |hi|
|
81
66
|
isobib = RelatonIsoBib::IsoBibliographicItem.new(
|
@@ -19,24 +19,19 @@ module RelatonIso
|
|
19
19
|
raise RelatonBib::RequestError, "Could not access http://www.iso.org"
|
20
20
|
end
|
21
21
|
|
22
|
-
# @param text [String]
|
23
|
-
# @return [Array<RelatonIso::IsoBibliographicItem>]
|
24
|
-
# def search_and_fetch(text)
|
25
|
-
# Scrapper.get(text)
|
26
|
-
# end
|
27
|
-
|
28
22
|
# @param ref [String] the ISO standard Code to look up (e.g. "ISO 9000")
|
29
|
-
# @param year [String] the year the standard was published
|
30
|
-
# @param opts [Hash] options; restricted to :all_parts if all-parts
|
31
|
-
# :keep_year if undated reference should
|
23
|
+
# @param year [String, NilClass] the year the standard was published
|
24
|
+
# @param opts [Hash] options; restricted to :all_parts if all-parts
|
25
|
+
# reference is required, :keep_year if undated reference should
|
26
|
+
# return actual reference with year
|
32
27
|
# @return [String] Relaton XML serialisation of reference
|
33
|
-
def get(ref, year, opts)
|
28
|
+
def get(ref, year = nil, opts = {})
|
34
29
|
opts[:ref] = ref
|
35
30
|
|
36
31
|
%r{
|
37
32
|
^(?<code1>[^\s]+\s[^/]+) # match code
|
38
33
|
/?
|
39
|
-
(?<corr>(Amd|DAmd|(CD|WD|AWI|NP)\sAmd|Cor|CD\sCor|FDAmd)\s\d+ # correction name
|
34
|
+
(?<corr>(Amd|DAmd|(CD|WD|AWI|NP)\sAmd|Cor|CD\sCor|FDAmd|PRF\sAmd)\s\d+ # correction name
|
40
35
|
:?(\d{4})?(/Cor\s\d+:\d{4})?) # match correction year
|
41
36
|
}x =~ ref
|
42
37
|
code = code1 || ref
|
@@ -122,7 +117,7 @@ module RelatonIso
|
|
122
117
|
# @param opts [Hash]
|
123
118
|
def try_stages(result, corr, opts)
|
124
119
|
res = nil
|
125
|
-
%w[NP WD CD DIS FDIS PRF IS AWI].each do |st| # try stages
|
120
|
+
%w[NP WD CD DIS FDIS PRF IS AWI TR].each do |st| # try stages
|
126
121
|
c = yield st
|
127
122
|
res = search_code result, c, corr, opts
|
128
123
|
return res unless res.empty?
|
@@ -134,8 +129,8 @@ module RelatonIso
|
|
134
129
|
result.select do |i|
|
135
130
|
(opts[:all_parts] || i.hit["docRef"] =~ %r{^#{code}(?!-)}) && (
|
136
131
|
corr && %r{^#{code}[\w-]*(:\d{4})?/#{corr}} =~ i.hit["docRef"] ||
|
137
|
-
%r{^#{code}[\w-]*(:\d{4})?/} !~ i.hit["docRef"]
|
138
|
-
|
132
|
+
!corr && %r{^#{code}[\w-]*(:\d{4})?/} !~ i.hit["docRef"]
|
133
|
+
) # && %r{^#{code}} =~ i.hit["docRef"]
|
139
134
|
end
|
140
135
|
end
|
141
136
|
|
@@ -159,11 +154,15 @@ module RelatonIso
|
|
159
154
|
end
|
160
155
|
return { years: missed_years } unless hits.any?
|
161
156
|
|
162
|
-
return { ret: hits.first.fetch } if !opts[:all_parts] || hits.size == 1
|
157
|
+
return { ret: hits.first.fetch(opts[:lang]) } if !opts[:all_parts] || hits.size == 1
|
163
158
|
|
164
|
-
{ ret: hits.to_all_parts }
|
159
|
+
{ ret: hits.to_all_parts(opts[:lang]) }
|
165
160
|
end
|
166
161
|
|
162
|
+
# @param code [String]
|
163
|
+
# @param year [String, NilClass]
|
164
|
+
# @param corr [String, NilClass]
|
165
|
+
# @param opts [Hash]
|
167
166
|
def isobib_get1(code, year, corr, opts)
|
168
167
|
# return iev(code) if /^IEC 60050-/.match code
|
169
168
|
result = isobib_search_filter(code, corr, opts) || return
|
data/lib/relaton_iso/scrapper.rb
CHANGED
@@ -27,10 +27,11 @@ module RelatonIso
|
|
27
27
|
|
28
28
|
class << self
|
29
29
|
# Parse page.
|
30
|
-
# @param
|
30
|
+
# @param hit_data [Hash]
|
31
|
+
# @param lang [String, NilClass]
|
31
32
|
# @return [Hash]
|
32
33
|
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
33
|
-
def parse_page(hit_data)
|
34
|
+
def parse_page(hit_data, lang = nil)
|
34
35
|
path = "/contents/data/standard#{hit_data["splitPath"]}/#{hit_data["csnumber"]}.html"
|
35
36
|
doc, url = get_page path
|
36
37
|
|
@@ -38,15 +39,15 @@ module RelatonIso
|
|
38
39
|
edition = doc&.xpath("//strong[contains(text(), 'Edition')]/..")&.
|
39
40
|
children&.last&.text&.match(/\d+/)&.to_s
|
40
41
|
|
41
|
-
titles, abstract = fetch_titles_abstract(doc)
|
42
|
+
titles, abstract, langs = fetch_titles_abstract(doc, lang)
|
42
43
|
|
43
44
|
RelatonIsoBib::IsoBibliographicItem.new(
|
44
45
|
fetched: Date.today.to_s,
|
45
46
|
docid: fetch_docid(hit_data["docRef"]),
|
46
47
|
docnumber: fetch_docnumber(doc),
|
47
48
|
edition: edition,
|
48
|
-
language: langs
|
49
|
-
script: langs
|
49
|
+
language: langs.map { |l| l[:lang] },
|
50
|
+
script: langs.map { |l| script(l[:lang]) }.uniq,
|
50
51
|
title: titles,
|
51
52
|
doctype: fetch_type(hit_data["docRef"]),
|
52
53
|
docstatus: fetch_status(doc),
|
@@ -68,40 +69,46 @@ module RelatonIso
|
|
68
69
|
|
69
70
|
# Fetch titles and abstracts.
|
70
71
|
# @param doc [Nokogiri::HTML::Document]
|
72
|
+
# @param lang [String, NilClass]
|
71
73
|
# @return [Array<Array>]
|
72
74
|
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
73
|
-
def fetch_titles_abstract(doc)
|
75
|
+
def fetch_titles_abstract(doc, lang)
|
74
76
|
titles = []
|
75
77
|
abstract = []
|
76
|
-
langs(doc).
|
78
|
+
langs = languages(doc, lang).reduce([]) do |s, l|
|
77
79
|
# Don't need to get page for en. We already have it.
|
78
|
-
d =
|
79
|
-
|
80
|
+
d = l[:path] ? get_page(l[:path])[0] : doc
|
81
|
+
unless d.at("//h5[@class='help-block'][.='недоступно на русском языке']")
|
82
|
+
s << l
|
83
|
+
titles << fetch_title(d, l[:lang])
|
80
84
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
85
|
+
# Fetch abstracts.
|
86
|
+
abstract_content = d.css("div[itemprop='description'] p").text
|
87
|
+
unless abstract_content.empty?
|
88
|
+
abstract << {
|
89
|
+
content: abstract_content,
|
90
|
+
language: l[:lang],
|
91
|
+
script: script(l[:lang]),
|
92
|
+
format: "text/plain",
|
93
|
+
}
|
94
|
+
end
|
95
|
+
end
|
96
|
+
s
|
91
97
|
end
|
92
|
-
[titles, abstract]
|
98
|
+
[titles, abstract, langs]
|
93
99
|
end
|
94
100
|
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
95
101
|
|
96
|
-
#
|
102
|
+
# Returns available languages.
|
97
103
|
# @param doc [Nokogiri::HTML::Document]
|
104
|
+
# @param lang [String, NilClass]
|
98
105
|
# @return [Array<Hash>]
|
99
|
-
def
|
106
|
+
def languages(doc, lang)
|
100
107
|
lgs = [{ lang: "en" }]
|
101
108
|
doc.css("li#lang-switcher ul li a").each do |lang_link|
|
102
109
|
lang_path = lang_link.attr("href")
|
103
|
-
|
104
|
-
lgs << { lang:
|
110
|
+
l = lang_path.match(%r{^\/(fr)\/})
|
111
|
+
lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] == lang)
|
105
112
|
end
|
106
113
|
lgs
|
107
114
|
end
|
@@ -200,9 +207,12 @@ module RelatonIso
|
|
200
207
|
def fetch_relations(doc)
|
201
208
|
doc.css("ul.steps li").reduce([]) do |a, r|
|
202
209
|
r_type = r.css("strong").text
|
210
|
+
date = []
|
203
211
|
type = case r_type
|
204
212
|
when "Previously", "Will be replaced by" then "obsoletes"
|
205
213
|
when "Corrigenda/Amendments", "Revised by", "Now confirmed"
|
214
|
+
date << { type: "circulated",
|
215
|
+
on: doc.xpath('//span[@class="stage-date"]').last.text }
|
206
216
|
"updates"
|
207
217
|
else r_type
|
208
218
|
end
|
@@ -213,7 +223,7 @@ module RelatonIso
|
|
213
223
|
content: id.text, format: "text/plain",
|
214
224
|
)
|
215
225
|
bibitem = RelatonIsoBib::IsoBibliographicItem.new(
|
216
|
-
formattedref: fref,
|
226
|
+
formattedref: fref, date: date
|
217
227
|
)
|
218
228
|
{ type: type, bibitem: bibitem }
|
219
229
|
end
|
@@ -257,6 +267,7 @@ module RelatonIso
|
|
257
267
|
def script(lang)
|
258
268
|
case lang
|
259
269
|
when "en", "fr" then "Latn"
|
270
|
+
# when "ru" then "Cyrl"
|
260
271
|
end
|
261
272
|
end
|
262
273
|
|
data/lib/relaton_iso/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-iso
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.
|
4
|
+
version: 0.8.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-01-
|
11
|
+
date: 2020-01-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: byebug
|