relaton-iso 1.16.1 → 1.16.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/relaton_iso/processor.rb +1 -1
- data/lib/relaton_iso/scrapper.rb +380 -360
- data/lib/relaton_iso/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 13ecc04a430b1dbf256c0853f612969727c16eba72a06cb2bc74bed17745ba90
|
4
|
+
data.tar.gz: f795f63a994b843e07d4857ba3b0dd9c91ec9a3ccb408827f1bf7bdbf5f854a9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d33586bbe409f54736b694d774a52e1bef8a4cc2d7c304aebd06c5ead8b3893b6f45c65d3e5c586c5e7f9f23501b52ae6b0630c25213d6105660251d03cff94e
|
7
|
+
data.tar.gz: e7fdcb33dfa855c73ead77a514eae36d761274bafeec77c520ac8ff84a05c6a04a2b30bd80fa7d89fcabda1975eeb95f85c65d0c177da7e1694da97bc4245ccd
|
data/lib/relaton_iso/scrapper.rb
CHANGED
@@ -43,407 +43,427 @@ module RelatonIso
|
|
43
43
|
url: "www.asme.org" },
|
44
44
|
}.freeze
|
45
45
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
)
|
86
|
-
|
46
|
+
extend self
|
47
|
+
|
48
|
+
# Parse page.
|
49
|
+
# @param hit [RelatonIso::Hit]
|
50
|
+
# @param lang [String, NilClass]
|
51
|
+
# @return [RelatonIsoBib::IsoBibliographicItem]
|
52
|
+
def parse_page(hit, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
53
|
+
# path = "/contents/data/standard#{hit_data['splitPath']}/"\
|
54
|
+
# "#{hit_data['csnumber']}.html"
|
55
|
+
|
56
|
+
path = hit.hit[:path].sub("/sites/isoorg", "")
|
57
|
+
doc, url = get_page "#{path}.html"
|
58
|
+
|
59
|
+
# Fetch edition.
|
60
|
+
edition = doc.at("//div[div[.='Edition']]/text()[last()]")
|
61
|
+
&.text&.match(/\d+$/)&.to_s
|
62
|
+
hit.pubid.base.edition ||= edition if hit.pubid.base
|
63
|
+
|
64
|
+
titles, abstract, langs = fetch_titles_abstract(doc, lang)
|
65
|
+
|
66
|
+
RelatonIsoBib::IsoBibliographicItem.new(
|
67
|
+
fetched: Date.today.to_s,
|
68
|
+
docid: fetch_relaton_docids(doc, hit.pubid),
|
69
|
+
docnumber: fetch_docnumber(hit.pubid),
|
70
|
+
edition: edition,
|
71
|
+
language: langs.map { |l| l[:lang] },
|
72
|
+
script: langs.map { |l| script(l[:lang]) }.uniq,
|
73
|
+
title: titles,
|
74
|
+
doctype: fetch_type(hit.hit[:title]),
|
75
|
+
docstatus: fetch_status(doc),
|
76
|
+
ics: fetch_ics(doc),
|
77
|
+
date: fetch_dates(doc, hit.hit[:title]),
|
78
|
+
contributor: fetch_contributors(hit.hit[:title]),
|
79
|
+
editorialgroup: fetch_workgroup(doc),
|
80
|
+
abstract: abstract,
|
81
|
+
copyright: fetch_copyright(doc),
|
82
|
+
link: fetch_link(doc, url),
|
83
|
+
relation: fetch_relations(doc),
|
84
|
+
place: ["Geneva"],
|
85
|
+
structuredidentifier: fetch_structuredidentifier(hit.pubid),
|
86
|
+
)
|
87
|
+
end
|
87
88
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
89
|
+
#
|
90
|
+
# Create document ids.
|
91
|
+
#
|
92
|
+
# @param doc [Nokogiri::HTML::Document] document to parse
|
93
|
+
# @param pubid [Pubid::Iso::Identifier] publication identifier
|
94
|
+
#
|
95
|
+
# @return [Array<RelatonBib::DocumentIdentifier>]
|
96
|
+
#
|
97
|
+
def fetch_relaton_docids(doc, pubid)
|
98
|
+
pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code(doc))
|
99
|
+
[
|
100
|
+
RelatonIso::DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
|
101
|
+
RelatonBib::DocumentIdentifier.new(id: isoref(pubid), type: "iso-reference"),
|
102
|
+
RelatonIso::DocumentIdentifier.new(id: pubid, type: "URN"),
|
103
|
+
]
|
104
|
+
end
|
104
105
|
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
106
|
+
#
|
107
|
+
# Create ISO reference identifier with English language.
|
108
|
+
#
|
109
|
+
# @param [Pubid::Iso::Identifier] pubid publication identifier
|
110
|
+
#
|
111
|
+
# @return [String] English reference identifier
|
112
|
+
#
|
113
|
+
def isoref(pubid)
|
114
|
+
params = pubid.get_params.reject { |k, _| k == :typed_stage }
|
115
|
+
Pubid::Iso::Identifier.create(language: "en", **params).to_s(format: :ref_num_short)
|
116
|
+
end
|
116
117
|
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
end
|
118
|
+
private
|
119
|
+
|
120
|
+
# Fetch titles and abstracts.
|
121
|
+
# @param doc [Nokogiri::HTML::Document]
|
122
|
+
# @param lang [String, NilClass]
|
123
|
+
# @return [Array<Array>]
|
124
|
+
def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
|
125
|
+
titles = RelatonBib::TypedTitleStringCollection.new
|
126
|
+
abstract = []
|
127
|
+
langs = languages(doc, lang).reduce([]) do |s, l|
|
128
|
+
# Don't need to get page for en. We already have it.
|
129
|
+
d = l[:path] ? get_page(l[:path])[0] : doc
|
130
|
+
unless d.at("//h5[@class='help-block']" \
|
131
|
+
"[.='недоступно на русском языке']")
|
132
|
+
s << l
|
133
|
+
titles += fetch_title(d, l[:lang])
|
134
|
+
|
135
|
+
# Fetch abstracts.
|
136
|
+
abstract_content = d.xpath(
|
137
|
+
"//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
|
138
|
+
).map do |a|
|
139
|
+
a.name == "li" ? "- #{a.text}" : a.text
|
140
|
+
end.reject(&:empty?).join("\n")
|
141
|
+
unless abstract_content.empty?
|
142
|
+
abstract << {
|
143
|
+
content: abstract_content,
|
144
|
+
language: l[:lang],
|
145
|
+
script: script(l[:lang]),
|
146
|
+
format: "text/plain",
|
147
|
+
}
|
148
148
|
end
|
149
|
-
s
|
150
149
|
end
|
151
|
-
|
150
|
+
s
|
152
151
|
end
|
152
|
+
[titles, abstract, langs]
|
153
|
+
end
|
153
154
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
end
|
165
|
-
lgs
|
155
|
+
# Returns available languages.
|
156
|
+
# @param doc [Nokogiri::HTML::Document]
|
157
|
+
# @param lang [String, NilClass]
|
158
|
+
# @return [Array<Hash>]
|
159
|
+
def languages(doc, lang)
|
160
|
+
lgs = [{ lang: "en" }]
|
161
|
+
doc.css("li#lang-switcher ul li a").each do |lang_link|
|
162
|
+
lang_path = lang_link.attr("href")
|
163
|
+
l = lang_path.match(%r{^/(fr)/})
|
164
|
+
lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] == lang)
|
166
165
|
end
|
166
|
+
lgs
|
167
|
+
end
|
167
168
|
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
169
|
+
# Get page.
|
170
|
+
# @param path [String] page's path
|
171
|
+
# @return [Array<Nokogiri::HTML::Document, String>]
|
172
|
+
def get_page(path)
|
173
|
+
resp, uri = get_redirection path
|
174
|
+
doc = try_if_fail resp, uri
|
175
|
+
[doc, uri.to_s]
|
176
|
+
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
177
|
+
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
178
|
+
Net::ProtocolError, Errno::ETIMEDOUT
|
179
|
+
raise RelatonBib::RequestError, "Could not access #{uri}"
|
180
|
+
end
|
180
181
|
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
182
|
+
#
|
183
|
+
# Get the page from the given path. If the page is redirected, get the
|
184
|
+
# page from the new path.
|
185
|
+
#
|
186
|
+
# @param [String] path path to the page
|
187
|
+
#
|
188
|
+
# @return [Array<Net::HTTPOK, URI>] HTTP response and URI
|
189
|
+
# @raise [RelatonBib::RequestError] if the page is not found
|
190
|
+
#
|
191
|
+
def get_redirection(path)
|
192
|
+
url = DOMAIN + path
|
193
|
+
uri = URI url
|
194
|
+
resp = Net::HTTP.get_response(uri)
|
195
|
+
raise RelatonBib::RequestError, "#{url} not found." if %w[404 302].include? resp.code
|
196
|
+
|
197
|
+
resp.code == "301" ? get_redirection(resp["location"]) : [resp, uri]
|
198
|
+
end
|
195
199
|
|
196
|
-
|
197
|
-
|
200
|
+
#
|
201
|
+
# The iso.org site fails to respond sometimes. This method tries to get
|
202
|
+
# the response again.
|
203
|
+
#
|
204
|
+
# @param [Net::HTTPOK] resp HTTP response
|
205
|
+
# @param [URI::HTTPS] uri URI of the page
|
206
|
+
#
|
207
|
+
# @return [Nokogiri::HTML4::Document] document
|
208
|
+
# @raise [RelatonBib::RequestError] if the page could not be parsed
|
209
|
+
#
|
210
|
+
def try_if_fail(resp, uri)
|
211
|
+
10.times do
|
212
|
+
doc = Nokogiri::HTML(resp.body)
|
213
|
+
# stop trying if page has a document id
|
214
|
+
return doc if item_ref doc
|
198
215
|
|
199
|
-
|
200
|
-
# The iso.org site fails to respond sometimes. This method tries to get
|
201
|
-
# the response again.
|
202
|
-
#
|
203
|
-
# @param [Net::HTTPOK] resp HTTP response
|
204
|
-
# @param [URI::HTTPS] uri URI of the page
|
205
|
-
#
|
206
|
-
# @return [Nokogiri::HTML4::Document] document
|
207
|
-
# @raise [RelatonBib::RequestError] if the page could not be parsed
|
208
|
-
#
|
209
|
-
def try_if_fail(resp, uri)
|
210
|
-
10.times do
|
211
|
-
doc = Nokogiri::HTML(resp.body)
|
212
|
-
# stop trying if page has a document id
|
213
|
-
return doc if item_ref doc
|
214
|
-
|
215
|
-
resp = Net::HTTP.get_response(uri)
|
216
|
-
end
|
217
|
-
raise RelatonBib::RequestError, "Could not parse the page #{uri}"
|
216
|
+
resp = Net::HTTP.get_response(uri)
|
218
217
|
end
|
218
|
+
raise RelatonBib::RequestError, "Could not parse the page #{uri}"
|
219
|
+
end
|
219
220
|
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
221
|
+
#
|
222
|
+
# Generate docnumber.
|
223
|
+
#
|
224
|
+
# @param [Pubid::Iso] pubid
|
225
|
+
#
|
226
|
+
# @return [String] docnumber
|
227
|
+
#
|
228
|
+
def fetch_docnumber(pubid)
|
229
|
+
pubid.to_s.match(/\d+/)&.to_s
|
230
|
+
end
|
230
231
|
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
232
|
+
#
|
233
|
+
# Parse structuredidentifier.
|
234
|
+
#
|
235
|
+
# @param pubid [Pubid::Iso::Identifier] pubid
|
236
|
+
#
|
237
|
+
# @return [RelatonBib::StructuredIdentifier] structured identifier
|
238
|
+
#
|
239
|
+
def fetch_structuredidentifier(pubid) # rubocop:disable Metrics/MethodLength
|
240
|
+
RelatonIsoBib::StructuredIdentifier.new(
|
241
|
+
project_number: "#{pubid.publisher} #{pubid.number}",
|
242
|
+
part: pubid.part&.to_s, # &.sub(/^-/, ""),
|
243
|
+
type: pubid.publisher,
|
244
|
+
)
|
245
|
+
end
|
245
246
|
|
246
|
-
|
247
|
-
|
248
|
-
|
247
|
+
def item_ref(doc)
|
248
|
+
doc.at("//main//section/div/div/div//h1")&.text
|
249
|
+
end
|
249
250
|
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
251
|
+
# Fetch status.
|
252
|
+
# @param doc [Nokogiri::HTML::Document]
|
253
|
+
# @param status [String]
|
254
|
+
# @return [RelatonBib::DocumentStatus]
|
255
|
+
def fetch_status(doc)
|
256
|
+
stg, substg = stage_code(doc).split "."
|
257
|
+
RelatonBib::DocumentStatus.new(stage: stg, substage: substg)
|
258
|
+
end
|
258
259
|
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
260
|
+
def stage_code(doc)
|
261
|
+
doc.at("//ul[@class='dropdown-menu']/li[@class='active']" \
|
262
|
+
"/a/span[@class='stage-code']").text
|
263
|
+
end
|
263
264
|
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
265
|
+
# def stage(stg, substg)
|
266
|
+
# abbr = STGABBR[stg].is_a?(Hash) ? STGABBR[stg][substg] : STGABBR[stg]
|
267
|
+
# RelatonBib::DocumentStatus::Stage.new value: stg, abbreviation: abbr
|
268
|
+
# end
|
269
|
+
|
270
|
+
# Fetch workgroup.
|
271
|
+
# @param doc [Nokogiri::HTML::Document]
|
272
|
+
# @return [RelatonIsoBib::EditorialGroup, nil]
|
273
|
+
def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
|
274
|
+
wg = doc.at("////div[contains(., 'Technical Committe')]/following-sibling::span/a")
|
275
|
+
return unless wg
|
276
|
+
|
277
|
+
workgroup = wg.text.split "/"
|
278
|
+
type = workgroup[1]&.match(/^[A-Z]+/)&.to_s || "TC"
|
279
|
+
# {
|
280
|
+
# name: "International Organization for Standardization",
|
281
|
+
# abbreviation: "ISO",
|
282
|
+
# url: "www.iso.org",
|
283
|
+
# }
|
284
|
+
tc_numb = workgroup[1]&.match(/\d+/)&.to_s&.to_i
|
285
|
+
tc_name = wg[:title]
|
286
|
+
tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg.text,
|
287
|
+
type: type, number: tc_numb)
|
288
|
+
RelatonIsoBib::EditorialGroup.new(technical_committee: [tc])
|
289
|
+
end
|
290
|
+
|
291
|
+
# Fetch relations.
|
292
|
+
# @param doc [Nokogiri::HTML::Document]
|
293
|
+
# @return [Array<Hash>]
|
294
|
+
def fetch_relations(doc)
|
295
|
+
types = ["Now", "Now under review"]
|
296
|
+
doc.xpath("//ul[@class='steps']/li", "//div[@class='sub-step']").reduce([]) do |a, r|
|
297
|
+
type, date = relation_type(r.at("h4", "h5").text.strip, doc)
|
298
|
+
next a if types.include?(type)
|
299
|
+
|
300
|
+
a + create_relations(r, type, date)
|
289
301
|
end
|
302
|
+
end
|
290
303
|
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
date = []
|
301
|
-
type = case r_type.strip
|
302
|
-
when "Previously", "Will be replaced by" then "obsoletes"
|
303
|
-
when "Corrigenda / Amendments", "Revised by", "Now confirmed"
|
304
|
-
on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
|
305
|
-
date << { type: "circulated", on: on.text } if on
|
306
|
-
"updates"
|
307
|
-
else r_type
|
308
|
-
end
|
309
|
-
if types.include?(type) then a
|
310
|
-
else
|
311
|
-
a + r.css("a").map do |id|
|
312
|
-
docid = RelatonBib::DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
|
313
|
-
fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
|
314
|
-
bibitem = RelatonIsoBib::IsoBibliographicItem.new(
|
315
|
-
docid: [docid], formattedref: fref, date: date,
|
316
|
-
)
|
317
|
-
{ type: type, bibitem: bibitem }
|
318
|
-
end
|
304
|
+
def relation_type(type, doc)
|
305
|
+
date = []
|
306
|
+
t = case type.strip
|
307
|
+
when "Previously", "Will be replaced by" then "obsoletes"
|
308
|
+
when "Corrigenda / Amendments", "Revised by", "Now confirmed"
|
309
|
+
on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
|
310
|
+
date << { type: "circulated", on: on.text } if on
|
311
|
+
"updates"
|
312
|
+
else type
|
319
313
|
end
|
320
|
-
|
314
|
+
[t, date]
|
315
|
+
end
|
316
|
+
|
317
|
+
def create_relations(rel, type, date)
|
318
|
+
rel.css("a").map do |id|
|
319
|
+
docid = RelatonBib::DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
|
320
|
+
fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
|
321
|
+
bibitem = RelatonIsoBib::IsoBibliographicItem.new(
|
322
|
+
docid: [docid], formattedref: fref, date: date,
|
323
|
+
)
|
324
|
+
{ type: type, bibitem: bibitem }
|
321
325
|
end
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
end
|
338
|
-
# rescue => _e
|
339
|
-
# puts 'Unknown document type: ' + title
|
326
|
+
end
|
327
|
+
|
328
|
+
# Fetch type.
|
329
|
+
# @param ref [String]
|
330
|
+
# @return [String]
|
331
|
+
def fetch_type(ref)
|
332
|
+
%r{
|
333
|
+
^(?<prefix>ISO|IWA|IEC)
|
334
|
+
(?:(?:/IEC|/IEEE|/PRF|/NP|/DGuide)*\s|/)
|
335
|
+
(?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))
|
336
|
+
}x =~ ref
|
337
|
+
# return "international-standard" if type_match.nil?
|
338
|
+
if TYPES[type] then TYPES[type]
|
339
|
+
elsif prefix == "ISO" then "international-standard"
|
340
|
+
elsif prefix == "IWA" then "international-workshop-agreement"
|
340
341
|
end
|
342
|
+
# rescue => _e
|
343
|
+
# puts 'Unknown document type: ' + title
|
344
|
+
end
|
341
345
|
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
RelatonBib::TypedTitleString.
|
346
|
+
# Fetch titles.
|
347
|
+
# @param doc [Nokogiri::HTML::Document]
|
348
|
+
# @param lang [String]
|
349
|
+
# @return [Array<RelatonBib::TypedTitleString>]
|
350
|
+
def fetch_title(doc, lang) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
|
351
|
+
types = %w[title-intro title-main title-part]
|
352
|
+
ttls = titles(doc)
|
353
|
+
title = RelatonBib::TypedTitleStringCollection.new
|
354
|
+
ttls.each.with_index do |p, i|
|
355
|
+
next unless p
|
356
|
+
|
357
|
+
title << RelatonBib::TypedTitleString.new(
|
358
|
+
type: types[i], content: p, language: lang, script: script(lang),
|
359
|
+
)
|
360
|
+
end.compact
|
361
|
+
main = title.map { |t| t.title.content }.join " - "
|
362
|
+
title << RelatonBib::TypedTitleString.new(type: "main", content: main, language: lang, script: script(lang))
|
363
|
+
end
|
364
|
+
|
365
|
+
def titles(doc)
|
366
|
+
head = doc.at "//nav[contains(@class,'heading-condensed')]"
|
367
|
+
ttls = head.xpath("h2 | h3 | h4").map &:text
|
368
|
+
ttls = ttls[0].split " - " if ttls.size == 1
|
369
|
+
case ttls.size
|
370
|
+
when 0, 1 then [nil, ttls.first, nil]
|
371
|
+
else RelatonBib::TypedTitleString.intro_or_part ttls
|
354
372
|
end
|
373
|
+
end
|
355
374
|
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
end
|
375
|
+
# Return ISO script code.
|
376
|
+
# @param lang [String]
|
377
|
+
# @return [String]
|
378
|
+
def script(lang)
|
379
|
+
case lang
|
380
|
+
when "en", "fr" then "Latn"
|
381
|
+
# when "ru" then "Cyrl"
|
364
382
|
end
|
383
|
+
end
|
365
384
|
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
385
|
+
# Fetch dates
|
386
|
+
# @param doc [Nokogiri::HTML::Document]
|
387
|
+
# @param ref [String]
|
388
|
+
# @return [Array<Hash>]
|
389
|
+
def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity, Metrics/MethodLength
|
390
|
+
dates = []
|
391
|
+
%r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ ref
|
392
|
+
pub_date_str = doc.xpath("//span[@itemprop='releaseDate']").text
|
393
|
+
if ref_date_str
|
394
|
+
ref_date = Date.strptime ref_date_str, "%Y"
|
395
|
+
if pub_date_str.empty?
|
396
|
+
dates << { type: "published", on: ref_date_str }
|
397
|
+
else
|
398
|
+
pub_date = Date.strptime pub_date_str, "%Y"
|
399
|
+
if pub_date.year > ref_date.year
|
378
400
|
dates << { type: "published", on: ref_date_str }
|
401
|
+
dates << { type: "updated", on: pub_date_str }
|
379
402
|
else
|
380
|
-
|
381
|
-
if pub_date.year > ref_date.year
|
382
|
-
dates << { type: "published", on: ref_date_str }
|
383
|
-
dates << { type: "updated", on: pub_date_str }
|
384
|
-
else
|
385
|
-
dates << { type: "published", on: pub_date_str }
|
386
|
-
end
|
403
|
+
dates << { type: "published", on: pub_date_str }
|
387
404
|
end
|
388
|
-
elsif !pub_date_str.empty?
|
389
|
-
dates << { type: "published", on: pub_date_str }
|
390
405
|
end
|
391
|
-
|
406
|
+
elsif !pub_date_str.empty?
|
407
|
+
dates << { type: "published", on: pub_date_str }
|
392
408
|
end
|
409
|
+
dates
|
410
|
+
end
|
393
411
|
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
412
|
+
def fetch_contributors(ref)
|
413
|
+
ref.sub(/\s.*/, "").split("/").reduce([]) do |mem, abbrev|
|
414
|
+
publisher = PUBLISHERS[abbrev]
|
415
|
+
next mem unless publisher
|
398
416
|
|
399
|
-
|
400
|
-
|
401
|
-
end
|
402
|
-
end
|
403
|
-
# rubocop:enable Metrics/MethodLength
|
404
|
-
|
405
|
-
# Fetch ICS.
|
406
|
-
# @param doc [Nokogiri::HTML::Document]
|
407
|
-
# @return [Array<Hash>]
|
408
|
-
def fetch_ics(doc)
|
409
|
-
doc.xpath("//dl[dt/strong[.='ICS']]/dd/span/a").map do |i|
|
410
|
-
code = i.text.match(/[\d.]+/).to_s.split "."
|
411
|
-
{ field: code[0], group: code[1], subgroup: code[2] }
|
412
|
-
end
|
417
|
+
publisher[:abbreviation] = abbrev
|
418
|
+
mem << { entity: publisher, role: [type: "publisher"] }
|
413
419
|
end
|
420
|
+
end
|
414
421
|
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
links << { type: "obp", content: obp[:href] } if obp
|
423
|
-
rss = doc.at("//a[contains(@href, 'rss')]")
|
424
|
-
links << { type: "rss", content: DOMAIN + rss[:href] } if rss
|
425
|
-
pub = doc.at "//p[contains(., 'publicly available')]/a",
|
426
|
-
"//p[contains(., 'can be downloaded from the')]/a"
|
427
|
-
links << { type: "pub", content: pub[:href] } if pub
|
428
|
-
links
|
422
|
+
# Fetch ICS.
|
423
|
+
# @param doc [Nokogiri::HTML::Document]
|
424
|
+
# @return [Array<Hash>]
|
425
|
+
def fetch_ics(doc)
|
426
|
+
doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
|
427
|
+
code = i.text.match(/[\d.]+/).to_s.split "."
|
428
|
+
{ field: code[0], group: code[1], subgroup: code[2] }
|
429
429
|
end
|
430
|
+
end
|
430
431
|
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
432
|
+
#
|
433
|
+
# Fetch links.
|
434
|
+
#
|
435
|
+
# @param doc [Nokogiri::HTML::Document] document to parse
|
436
|
+
# @param url [String] document url
|
437
|
+
#
|
438
|
+
# @return [Array<Hash>]
|
439
|
+
#
|
440
|
+
def fetch_link(doc, url)
|
441
|
+
links = [{ type: "src", content: url }]
|
442
|
+
obp = doc.at("//h4[contains(@class, 'h5')]/a")
|
443
|
+
links << { type: "obp", content: obp[:href] } if obp
|
444
|
+
rss = doc.at("//a[contains(@href, 'rss')]")
|
445
|
+
links << { type: "rss", content: DOMAIN + rss[:href] } if rss
|
446
|
+
pub = doc.at "//p[contains(., 'publicly available')]/a",
|
447
|
+
"//p[contains(., 'can be downloaded from the')]/a"
|
448
|
+
links << { type: "pub", content: pub[:href] } if pub
|
449
|
+
links
|
450
|
+
end
|
451
|
+
|
452
|
+
# Fetch copyright.
|
453
|
+
# @param doc [Nokogiri::HTML::Document]
|
454
|
+
# @return [Array<Hash>]
|
455
|
+
def fetch_copyright(doc) # rubocop:disable Metrics/MethodLength
|
456
|
+
ref = item_ref doc
|
457
|
+
owner_name = ref.match(/.*?(?=\s)/).to_s
|
458
|
+
from = ref.match(/(?<=:)\d{4}/).to_s
|
459
|
+
if from.empty?
|
460
|
+
date = doc.at(
|
461
|
+
"//span[@itemprop='releaseDate']",
|
462
|
+
"//ul[@id='stages']/li[contains(@class,'active')]/ul/li[@class='active']/a/span[@class='stage-date']",
|
463
|
+
)
|
464
|
+
from = date.text.match(/\d{4}/).to_s
|
446
465
|
end
|
466
|
+
[{ owner: [{ name: owner_name }], from: from }]
|
447
467
|
end
|
448
468
|
end
|
449
469
|
end
|
data/lib/relaton_iso/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-iso
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.16.
|
4
|
+
version: 1.16.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-10-
|
11
|
+
date: 2023-10-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: algolia
|