relaton-iso 0.8.0 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.adoc +30 -0
- data/lib/relaton_iso/hit.rb +3 -2
- data/lib/relaton_iso/hit_collection.rb +12 -27
- data/lib/relaton_iso/iso_bibliography.rb +15 -16
- data/lib/relaton_iso/scrapper.rb +36 -25
- data/lib/relaton_iso/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 23cfcc3b3c94cf0988d6087a1afacb4bcc6065249b4f6785879e8242384bf00d
|
4
|
+
data.tar.gz: f973dc6ba692b89acd1c6b4f9f75b9261ca7aff09557ed58605a7b1b8e499719
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8d88890b35c169076ceb3350eedb036116a4fe30403ae23e6e9300718d5cfd8d6d37f49a6724bdb2c4b459a7782bd429cab660d4a195bcef92ac0dc347021c6f
|
7
|
+
data.tar.gz: e68d82ea105bba5dc0ab989a35eb02062adefdb45302c17065f9b47b3bf3f3b6eb9ea4e3caa43177782235013f984ea6fe574cad8e95fe1a535319d63c8b573e
|
data/README.adoc
CHANGED
@@ -183,6 +183,36 @@ RelatonIso::IsoBibliography.get('ISO 19115', "2014", {all_parts: true}).title
|
|
183
183
|
@type="main">]
|
184
184
|
----
|
185
185
|
|
186
|
+
=== Get specific language
|
187
|
+
|
188
|
+
[source,ruby]
|
189
|
+
----
|
190
|
+
item = RelatonIso::IsoBibliography.get 'ISO 19115', nil, {lang: "en"}
|
191
|
+
item.to_xml
|
192
|
+
=> <bibitem id="ISO19115-1-2014" type="standard">
|
193
|
+
<fetched>2020-01-22</fetched>
|
194
|
+
<title type="title-intro" format="text/plain" language="en" script="Latn">Geographic information</title>
|
195
|
+
<title type="title-main" format="text/plain" language="en" script="Latn">Metadata</title>
|
196
|
+
<title type="main" format="text/plain" language="en" script="Latn">Geographic information – Metadata</title>
|
197
|
+
<uri type="src">https://www.iso.org/standard/53798.html</uri>
|
198
|
+
...
|
199
|
+
</bibitem>
|
200
|
+
|
201
|
+
item = RelatonIso::IsoBibliography.get 'ISO 19115', nil, {lang: "fr"}
|
202
|
+
item.to_xml
|
203
|
+
=> <bibitem id="ISO19115-1-2014" type="standard">
|
204
|
+
<fetched>2020-01-22</fetched>
|
205
|
+
<title type="title-intro" format="text/plain" language="en" script="Latn">Geographic information</title>
|
206
|
+
<title type="title-main" format="text/plain" language="en" script="Latn">Metadata</title>
|
207
|
+
<title type="main" format="text/plain" language="en" script="Latn">Geographic information – Metadata</title>
|
208
|
+
<title type="title-intro" format="text/plain" language="fr" script="Latn">Information géographique</title>
|
209
|
+
<title type="title-main" format="text/plain" language="fr" script="Latn">Métadonnées</title>
|
210
|
+
<title type="main" format="text/plain" language="fr" script="Latn">Information géographique – Métadonnées</title>
|
211
|
+
<uri type="src">https://www.iso.org/standard/53798.html</uri>
|
212
|
+
...
|
213
|
+
</bibitem>
|
214
|
+
----
|
215
|
+
|
186
216
|
== Development
|
187
217
|
|
188
218
|
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
data/lib/relaton_iso/hit.rb
CHANGED
@@ -7,9 +7,10 @@ module RelatonIso
|
|
7
7
|
attr_reader :hit_collection
|
8
8
|
|
9
9
|
# Parse page.
|
10
|
+
# @param lang [String, NilClass]
|
10
11
|
# @return [RelatonIso::IsoBibliographicItem]
|
11
|
-
def fetch
|
12
|
-
@fetch ||= Scrapper.parse_page @hit
|
12
|
+
def fetch(lang = nil)
|
13
|
+
@fetch ||= Scrapper.parse_page @hit, lang
|
13
14
|
end
|
14
15
|
|
15
16
|
# @param builder [Nokogiri::XML::Builder]
|
@@ -10,25 +10,23 @@ module RelatonIso
|
|
10
10
|
|
11
11
|
def_delegators :@array, :<<, :[], :first, :empty?, :any?, :size
|
12
12
|
|
13
|
-
# @return [
|
14
|
-
# attr_reader :fetched
|
15
|
-
|
16
|
-
# @return [RelatonIso::HitPages]
|
17
|
-
# attr_reader :hit_pages
|
18
|
-
|
19
|
-
# @return [String]
|
13
|
+
# @return [String, NilClass]
|
20
14
|
attr_reader :text
|
21
15
|
|
22
|
-
# @param
|
16
|
+
# @param text [String] reference to search
|
23
17
|
def initialize(text)
|
24
18
|
@array = []
|
25
19
|
@text = text
|
26
|
-
%r{\s(?<num>\d+)(-(?<part
|
20
|
+
%r{\s(?<num>\d+)(-(?<part>[\d-]+))?} =~ text
|
27
21
|
http = Net::HTTP.new "www.iso.org", 443
|
28
22
|
http.use_ssl = true
|
29
23
|
search = ["status=ENT_ACTIVE,ENT_PROGRESS,ENT_INACTIVE,ENT_DELETED"]
|
30
24
|
search << "docNumber=#{num}"
|
31
25
|
search << "docPartNo=#{part}" if part
|
26
|
+
# if year
|
27
|
+
# search << "stageDateStart=#{Date.new(year.to_i).strftime("%Y-%m-%d")}"
|
28
|
+
# search << "stageDateEnd=#{Date.new(year.to_i, 12, 31).strftime("%Y-%m-%d")}"
|
29
|
+
# end
|
32
30
|
q = search.join "&"
|
33
31
|
resp = http.get("/cms/render/live/en/sites/isoorg.advancedSearch.do?#{q}",
|
34
32
|
"Accept" => "application/json, text/plain, */*")
|
@@ -55,27 +53,14 @@ module RelatonIso
|
|
55
53
|
self
|
56
54
|
end
|
57
55
|
|
58
|
-
# @
|
59
|
-
#
|
60
|
-
|
61
|
-
|
62
|
-
# workers = RelatonBib::WorkersPool.new 4
|
63
|
-
# workers.worker(&:fetch)
|
64
|
-
# @array.each do |hit|
|
65
|
-
# workers << hit
|
66
|
-
# end
|
67
|
-
# workers.end
|
68
|
-
# workers.result
|
69
|
-
# @fetched = true
|
70
|
-
# self
|
71
|
-
# end
|
72
|
-
|
73
|
-
def to_all_parts
|
56
|
+
# @param lang [String, NilClass]
|
57
|
+
# @return [RelatonIsoBib::IsoBibliographicItem]
|
58
|
+
def to_all_parts(lang = nil)
|
74
59
|
parts = @array.select { |h| !h.hit["docPart"].empty? }
|
75
60
|
hit = parts.min_by { |h| h.hit["docPart"].to_i }
|
76
|
-
return @array.first.fetch unless hit
|
61
|
+
return @array.first.fetch lang unless hit
|
77
62
|
|
78
|
-
bibitem = hit.fetch
|
63
|
+
bibitem = hit.fetch lang
|
79
64
|
bibitem.to_all_parts
|
80
65
|
parts.reject { |h| h.hit["docRef"] == hit.hit["docRef"] }.each do |hi|
|
81
66
|
isobib = RelatonIsoBib::IsoBibliographicItem.new(
|
@@ -19,24 +19,19 @@ module RelatonIso
|
|
19
19
|
raise RelatonBib::RequestError, "Could not access http://www.iso.org"
|
20
20
|
end
|
21
21
|
|
22
|
-
# @param text [String]
|
23
|
-
# @return [Array<RelatonIso::IsoBibliographicItem>]
|
24
|
-
# def search_and_fetch(text)
|
25
|
-
# Scrapper.get(text)
|
26
|
-
# end
|
27
|
-
|
28
22
|
# @param ref [String] the ISO standard Code to look up (e.g. "ISO 9000")
|
29
|
-
# @param year [String] the year the standard was published
|
30
|
-
# @param opts [Hash] options; restricted to :all_parts if all-parts
|
31
|
-
# :keep_year if undated reference should
|
23
|
+
# @param year [String, NilClass] the year the standard was published
|
24
|
+
# @param opts [Hash] options; restricted to :all_parts if all-parts
|
25
|
+
# reference is required, :keep_year if undated reference should
|
26
|
+
# return actual reference with year
|
32
27
|
# @return [String] Relaton XML serialisation of reference
|
33
|
-
def get(ref, year, opts)
|
28
|
+
def get(ref, year = nil, opts = {})
|
34
29
|
opts[:ref] = ref
|
35
30
|
|
36
31
|
%r{
|
37
32
|
^(?<code1>[^\s]+\s[^/]+) # match code
|
38
33
|
/?
|
39
|
-
(?<corr>(Amd|DAmd|(CD|WD|AWI|NP)\sAmd|Cor|CD\sCor|FDAmd)\s\d+ # correction name
|
34
|
+
(?<corr>(Amd|DAmd|(CD|WD|AWI|NP)\sAmd|Cor|CD\sCor|FDAmd|PRF\sAmd)\s\d+ # correction name
|
40
35
|
:?(\d{4})?(/Cor\s\d+:\d{4})?) # match correction year
|
41
36
|
}x =~ ref
|
42
37
|
code = code1 || ref
|
@@ -122,7 +117,7 @@ module RelatonIso
|
|
122
117
|
# @param opts [Hash]
|
123
118
|
def try_stages(result, corr, opts)
|
124
119
|
res = nil
|
125
|
-
%w[NP WD CD DIS FDIS PRF IS AWI].each do |st| # try stages
|
120
|
+
%w[NP WD CD DIS FDIS PRF IS AWI TR].each do |st| # try stages
|
126
121
|
c = yield st
|
127
122
|
res = search_code result, c, corr, opts
|
128
123
|
return res unless res.empty?
|
@@ -134,8 +129,8 @@ module RelatonIso
|
|
134
129
|
result.select do |i|
|
135
130
|
(opts[:all_parts] || i.hit["docRef"] =~ %r{^#{code}(?!-)}) && (
|
136
131
|
corr && %r{^#{code}[\w-]*(:\d{4})?/#{corr}} =~ i.hit["docRef"] ||
|
137
|
-
%r{^#{code}[\w-]*(:\d{4})?/} !~ i.hit["docRef"]
|
138
|
-
|
132
|
+
!corr && %r{^#{code}[\w-]*(:\d{4})?/} !~ i.hit["docRef"]
|
133
|
+
) # && %r{^#{code}} =~ i.hit["docRef"]
|
139
134
|
end
|
140
135
|
end
|
141
136
|
|
@@ -159,11 +154,15 @@ module RelatonIso
|
|
159
154
|
end
|
160
155
|
return { years: missed_years } unless hits.any?
|
161
156
|
|
162
|
-
return { ret: hits.first.fetch } if !opts[:all_parts] || hits.size == 1
|
157
|
+
return { ret: hits.first.fetch(opts[:lang]) } if !opts[:all_parts] || hits.size == 1
|
163
158
|
|
164
|
-
{ ret: hits.to_all_parts }
|
159
|
+
{ ret: hits.to_all_parts(opts[:lang]) }
|
165
160
|
end
|
166
161
|
|
162
|
+
# @param code [String]
|
163
|
+
# @param year [String, NilClass]
|
164
|
+
# @param corr [String, NilClass]
|
165
|
+
# @param opts [Hash]
|
167
166
|
def isobib_get1(code, year, corr, opts)
|
168
167
|
# return iev(code) if /^IEC 60050-/.match code
|
169
168
|
result = isobib_search_filter(code, corr, opts) || return
|
data/lib/relaton_iso/scrapper.rb
CHANGED
@@ -27,10 +27,11 @@ module RelatonIso
|
|
27
27
|
|
28
28
|
class << self
|
29
29
|
# Parse page.
|
30
|
-
# @param
|
30
|
+
# @param hit_data [Hash]
|
31
|
+
# @param lang [String, NilClass]
|
31
32
|
# @return [Hash]
|
32
33
|
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
33
|
-
def parse_page(hit_data)
|
34
|
+
def parse_page(hit_data, lang = nil)
|
34
35
|
path = "/contents/data/standard#{hit_data["splitPath"]}/#{hit_data["csnumber"]}.html"
|
35
36
|
doc, url = get_page path
|
36
37
|
|
@@ -38,15 +39,15 @@ module RelatonIso
|
|
38
39
|
edition = doc&.xpath("//strong[contains(text(), 'Edition')]/..")&.
|
39
40
|
children&.last&.text&.match(/\d+/)&.to_s
|
40
41
|
|
41
|
-
titles, abstract = fetch_titles_abstract(doc)
|
42
|
+
titles, abstract, langs = fetch_titles_abstract(doc, lang)
|
42
43
|
|
43
44
|
RelatonIsoBib::IsoBibliographicItem.new(
|
44
45
|
fetched: Date.today.to_s,
|
45
46
|
docid: fetch_docid(hit_data["docRef"]),
|
46
47
|
docnumber: fetch_docnumber(doc),
|
47
48
|
edition: edition,
|
48
|
-
language: langs
|
49
|
-
script: langs
|
49
|
+
language: langs.map { |l| l[:lang] },
|
50
|
+
script: langs.map { |l| script(l[:lang]) }.uniq,
|
50
51
|
title: titles,
|
51
52
|
doctype: fetch_type(hit_data["docRef"]),
|
52
53
|
docstatus: fetch_status(doc),
|
@@ -68,40 +69,46 @@ module RelatonIso
|
|
68
69
|
|
69
70
|
# Fetch titles and abstracts.
|
70
71
|
# @param doc [Nokogiri::HTML::Document]
|
72
|
+
# @param lang [String, NilClass]
|
71
73
|
# @return [Array<Array>]
|
72
74
|
# rubocop:disable Metrics/AbcSize, Metrics/MethodLength
|
73
|
-
def fetch_titles_abstract(doc)
|
75
|
+
def fetch_titles_abstract(doc, lang)
|
74
76
|
titles = []
|
75
77
|
abstract = []
|
76
|
-
langs(doc).
|
78
|
+
langs = languages(doc, lang).reduce([]) do |s, l|
|
77
79
|
# Don't need to get page for en. We already have it.
|
78
|
-
d =
|
79
|
-
|
80
|
+
d = l[:path] ? get_page(l[:path])[0] : doc
|
81
|
+
unless d.at("//h5[@class='help-block'][.='недоступно на русском языке']")
|
82
|
+
s << l
|
83
|
+
titles << fetch_title(d, l[:lang])
|
80
84
|
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
85
|
+
# Fetch abstracts.
|
86
|
+
abstract_content = d.css("div[itemprop='description'] p").text
|
87
|
+
unless abstract_content.empty?
|
88
|
+
abstract << {
|
89
|
+
content: abstract_content,
|
90
|
+
language: l[:lang],
|
91
|
+
script: script(l[:lang]),
|
92
|
+
format: "text/plain",
|
93
|
+
}
|
94
|
+
end
|
95
|
+
end
|
96
|
+
s
|
91
97
|
end
|
92
|
-
[titles, abstract]
|
98
|
+
[titles, abstract, langs]
|
93
99
|
end
|
94
100
|
# rubocop:enable Metrics/AbcSize, Metrics/MethodLength
|
95
101
|
|
96
|
-
#
|
102
|
+
# Returns available languages.
|
97
103
|
# @param doc [Nokogiri::HTML::Document]
|
104
|
+
# @param lang [String, NilClass]
|
98
105
|
# @return [Array<Hash>]
|
99
|
-
def
|
106
|
+
def languages(doc, lang)
|
100
107
|
lgs = [{ lang: "en" }]
|
101
108
|
doc.css("li#lang-switcher ul li a").each do |lang_link|
|
102
109
|
lang_path = lang_link.attr("href")
|
103
|
-
|
104
|
-
lgs << { lang:
|
110
|
+
l = lang_path.match(%r{^\/(fr)\/})
|
111
|
+
lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] == lang)
|
105
112
|
end
|
106
113
|
lgs
|
107
114
|
end
|
@@ -200,9 +207,12 @@ module RelatonIso
|
|
200
207
|
def fetch_relations(doc)
|
201
208
|
doc.css("ul.steps li").reduce([]) do |a, r|
|
202
209
|
r_type = r.css("strong").text
|
210
|
+
date = []
|
203
211
|
type = case r_type
|
204
212
|
when "Previously", "Will be replaced by" then "obsoletes"
|
205
213
|
when "Corrigenda/Amendments", "Revised by", "Now confirmed"
|
214
|
+
date << { type: "circulated",
|
215
|
+
on: doc.xpath('//span[@class="stage-date"]').last.text }
|
206
216
|
"updates"
|
207
217
|
else r_type
|
208
218
|
end
|
@@ -213,7 +223,7 @@ module RelatonIso
|
|
213
223
|
content: id.text, format: "text/plain",
|
214
224
|
)
|
215
225
|
bibitem = RelatonIsoBib::IsoBibliographicItem.new(
|
216
|
-
formattedref: fref,
|
226
|
+
formattedref: fref, date: date
|
217
227
|
)
|
218
228
|
{ type: type, bibitem: bibitem }
|
219
229
|
end
|
@@ -257,6 +267,7 @@ module RelatonIso
|
|
257
267
|
def script(lang)
|
258
268
|
case lang
|
259
269
|
when "en", "fr" then "Latn"
|
270
|
+
# when "ru" then "Cyrl"
|
260
271
|
end
|
261
272
|
end
|
262
273
|
|
data/lib/relaton_iso/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-iso
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.
|
4
|
+
version: 0.8.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-01-
|
11
|
+
date: 2020-01-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: byebug
|