relaton-iso 1.16.2 → 1.16.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/relaton_iso/processor.rb +1 -1
- data/lib/relaton_iso/scrapper.rb +376 -367
- data/lib/relaton_iso/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 13ecc04a430b1dbf256c0853f612969727c16eba72a06cb2bc74bed17745ba90
|
4
|
+
data.tar.gz: f795f63a994b843e07d4857ba3b0dd9c91ec9a3ccb408827f1bf7bdbf5f854a9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d33586bbe409f54736b694d774a52e1bef8a4cc2d7c304aebd06c5ead8b3893b6f45c65d3e5c586c5e7f9f23501b52ae6b0630c25213d6105660251d03cff94e
|
7
|
+
data.tar.gz: e7fdcb33dfa855c73ead77a514eae36d761274bafeec77c520ac8ff84a05c6a04a2b30bd80fa7d89fcabda1975eeb95f85c65d0c177da7e1694da97bc4245ccd
|
data/lib/relaton_iso/scrapper.rb
CHANGED
@@ -43,418 +43,427 @@ module RelatonIso
|
|
43
43
|
url: "www.asme.org" },
|
44
44
|
}.freeze
|
45
45
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
)
|
86
|
-
|
46
|
+
extend self
|
47
|
+
|
48
|
+
# Parse page.
|
49
|
+
# @param hit [RelatonIso::Hit]
|
50
|
+
# @param lang [String, NilClass]
|
51
|
+
# @return [RelatonIsoBib::IsoBibliographicItem]
|
52
|
+
def parse_page(hit, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
53
|
+
# path = "/contents/data/standard#{hit_data['splitPath']}/"\
|
54
|
+
# "#{hit_data['csnumber']}.html"
|
55
|
+
|
56
|
+
path = hit.hit[:path].sub("/sites/isoorg", "")
|
57
|
+
doc, url = get_page "#{path}.html"
|
58
|
+
|
59
|
+
# Fetch edition.
|
60
|
+
edition = doc.at("//div[div[.='Edition']]/text()[last()]")
|
61
|
+
&.text&.match(/\d+$/)&.to_s
|
62
|
+
hit.pubid.base.edition ||= edition if hit.pubid.base
|
63
|
+
|
64
|
+
titles, abstract, langs = fetch_titles_abstract(doc, lang)
|
65
|
+
|
66
|
+
RelatonIsoBib::IsoBibliographicItem.new(
|
67
|
+
fetched: Date.today.to_s,
|
68
|
+
docid: fetch_relaton_docids(doc, hit.pubid),
|
69
|
+
docnumber: fetch_docnumber(hit.pubid),
|
70
|
+
edition: edition,
|
71
|
+
language: langs.map { |l| l[:lang] },
|
72
|
+
script: langs.map { |l| script(l[:lang]) }.uniq,
|
73
|
+
title: titles,
|
74
|
+
doctype: fetch_type(hit.hit[:title]),
|
75
|
+
docstatus: fetch_status(doc),
|
76
|
+
ics: fetch_ics(doc),
|
77
|
+
date: fetch_dates(doc, hit.hit[:title]),
|
78
|
+
contributor: fetch_contributors(hit.hit[:title]),
|
79
|
+
editorialgroup: fetch_workgroup(doc),
|
80
|
+
abstract: abstract,
|
81
|
+
copyright: fetch_copyright(doc),
|
82
|
+
link: fetch_link(doc, url),
|
83
|
+
relation: fetch_relations(doc),
|
84
|
+
place: ["Geneva"],
|
85
|
+
structuredidentifier: fetch_structuredidentifier(hit.pubid),
|
86
|
+
)
|
87
|
+
end
|
87
88
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
89
|
+
#
|
90
|
+
# Create document ids.
|
91
|
+
#
|
92
|
+
# @param doc [Nokogiri::HTML::Document] document to parse
|
93
|
+
# @param pubid [Pubid::Iso::Identifier] publication identifier
|
94
|
+
#
|
95
|
+
# @return [Array<RelatonBib::DocumentIdentifier>]
|
96
|
+
#
|
97
|
+
def fetch_relaton_docids(doc, pubid)
|
98
|
+
pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code(doc))
|
99
|
+
[
|
100
|
+
RelatonIso::DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
|
101
|
+
RelatonBib::DocumentIdentifier.new(id: isoref(pubid), type: "iso-reference"),
|
102
|
+
RelatonIso::DocumentIdentifier.new(id: pubid, type: "URN"),
|
103
|
+
]
|
104
|
+
end
|
104
105
|
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
106
|
+
#
|
107
|
+
# Create ISO reference identifier with English language.
|
108
|
+
#
|
109
|
+
# @param [Pubid::Iso::Identifier] pubid publication identifier
|
110
|
+
#
|
111
|
+
# @return [String] English reference identifier
|
112
|
+
#
|
113
|
+
def isoref(pubid)
|
114
|
+
params = pubid.get_params.reject { |k, _| k == :typed_stage }
|
115
|
+
Pubid::Iso::Identifier.create(language: "en", **params).to_s(format: :ref_num_short)
|
116
|
+
end
|
116
117
|
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
end
|
118
|
+
private
|
119
|
+
|
120
|
+
# Fetch titles and abstracts.
|
121
|
+
# @param doc [Nokogiri::HTML::Document]
|
122
|
+
# @param lang [String, NilClass]
|
123
|
+
# @return [Array<Array>]
|
124
|
+
def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
|
125
|
+
titles = RelatonBib::TypedTitleStringCollection.new
|
126
|
+
abstract = []
|
127
|
+
langs = languages(doc, lang).reduce([]) do |s, l|
|
128
|
+
# Don't need to get page for en. We already have it.
|
129
|
+
d = l[:path] ? get_page(l[:path])[0] : doc
|
130
|
+
unless d.at("//h5[@class='help-block']" \
|
131
|
+
"[.='недоступно на русском языке']")
|
132
|
+
s << l
|
133
|
+
titles += fetch_title(d, l[:lang])
|
134
|
+
|
135
|
+
# Fetch abstracts.
|
136
|
+
abstract_content = d.xpath(
|
137
|
+
"//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
|
138
|
+
).map do |a|
|
139
|
+
a.name == "li" ? "- #{a.text}" : a.text
|
140
|
+
end.reject(&:empty?).join("\n")
|
141
|
+
unless abstract_content.empty?
|
142
|
+
abstract << {
|
143
|
+
content: abstract_content,
|
144
|
+
language: l[:lang],
|
145
|
+
script: script(l[:lang]),
|
146
|
+
format: "text/plain",
|
147
|
+
}
|
148
148
|
end
|
149
|
-
s
|
150
149
|
end
|
151
|
-
|
150
|
+
s
|
152
151
|
end
|
152
|
+
[titles, abstract, langs]
|
153
|
+
end
|
153
154
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
end
|
165
|
-
lgs
|
155
|
+
# Returns available languages.
|
156
|
+
# @param doc [Nokogiri::HTML::Document]
|
157
|
+
# @param lang [String, NilClass]
|
158
|
+
# @return [Array<Hash>]
|
159
|
+
def languages(doc, lang)
|
160
|
+
lgs = [{ lang: "en" }]
|
161
|
+
doc.css("li#lang-switcher ul li a").each do |lang_link|
|
162
|
+
lang_path = lang_link.attr("href")
|
163
|
+
l = lang_path.match(%r{^/(fr)/})
|
164
|
+
lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] == lang)
|
166
165
|
end
|
166
|
+
lgs
|
167
|
+
end
|
167
168
|
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
169
|
+
# Get page.
|
170
|
+
# @param path [String] page's path
|
171
|
+
# @return [Array<Nokogiri::HTML::Document, String>]
|
172
|
+
def get_page(path)
|
173
|
+
resp, uri = get_redirection path
|
174
|
+
doc = try_if_fail resp, uri
|
175
|
+
[doc, uri.to_s]
|
176
|
+
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
177
|
+
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
178
|
+
Net::ProtocolError, Errno::ETIMEDOUT
|
179
|
+
raise RelatonBib::RequestError, "Could not access #{uri}"
|
180
|
+
end
|
180
181
|
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
182
|
+
#
|
183
|
+
# Get the page from the given path. If the page is redirected, get the
|
184
|
+
# page from the new path.
|
185
|
+
#
|
186
|
+
# @param [String] path path to the page
|
187
|
+
#
|
188
|
+
# @return [Array<Net::HTTPOK, URI>] HTTP response and URI
|
189
|
+
# @raise [RelatonBib::RequestError] if the page is not found
|
190
|
+
#
|
191
|
+
def get_redirection(path)
|
192
|
+
url = DOMAIN + path
|
193
|
+
uri = URI url
|
194
|
+
resp = Net::HTTP.get_response(uri)
|
195
|
+
raise RelatonBib::RequestError, "#{url} not found." if %w[404 302].include? resp.code
|
196
|
+
|
197
|
+
resp.code == "301" ? get_redirection(resp["location"]) : [resp, uri]
|
198
|
+
end
|
195
199
|
|
196
|
-
|
197
|
-
|
200
|
+
#
|
201
|
+
# The iso.org site fails to respond sometimes. This method tries to get
|
202
|
+
# the response again.
|
203
|
+
#
|
204
|
+
# @param [Net::HTTPOK] resp HTTP response
|
205
|
+
# @param [URI::HTTPS] uri URI of the page
|
206
|
+
#
|
207
|
+
# @return [Nokogiri::HTML4::Document] document
|
208
|
+
# @raise [RelatonBib::RequestError] if the page could not be parsed
|
209
|
+
#
|
210
|
+
def try_if_fail(resp, uri)
|
211
|
+
10.times do
|
212
|
+
doc = Nokogiri::HTML(resp.body)
|
213
|
+
# stop trying if page has a document id
|
214
|
+
return doc if item_ref doc
|
198
215
|
|
199
|
-
|
200
|
-
# The iso.org site fails to respond sometimes. This method tries to get
|
201
|
-
# the response again.
|
202
|
-
#
|
203
|
-
# @param [Net::HTTPOK] resp HTTP response
|
204
|
-
# @param [URI::HTTPS] uri URI of the page
|
205
|
-
#
|
206
|
-
# @return [Nokogiri::HTML4::Document] document
|
207
|
-
# @raise [RelatonBib::RequestError] if the page could not be parsed
|
208
|
-
#
|
209
|
-
def try_if_fail(resp, uri)
|
210
|
-
10.times do
|
211
|
-
doc = Nokogiri::HTML(resp.body)
|
212
|
-
# stop trying if page has a document id
|
213
|
-
return doc if item_ref doc
|
214
|
-
|
215
|
-
resp = Net::HTTP.get_response(uri)
|
216
|
-
end
|
217
|
-
raise RelatonBib::RequestError, "Could not parse the page #{uri}"
|
216
|
+
resp = Net::HTTP.get_response(uri)
|
218
217
|
end
|
218
|
+
raise RelatonBib::RequestError, "Could not parse the page #{uri}"
|
219
|
+
end
|
219
220
|
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
221
|
+
#
|
222
|
+
# Generate docnumber.
|
223
|
+
#
|
224
|
+
# @param [Pubid::Iso::Identifier] pubid
|
225
|
+
#
|
226
|
+
# @return [String] docnumber
|
227
|
+
#
|
228
|
+
def fetch_docnumber(pubid)
|
229
|
+
pubid.to_s.match(/\d+/)&.to_s
|
230
|
+
end
|
230
231
|
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
232
|
+
#
|
233
|
+
# Parse structuredidentifier.
|
234
|
+
#
|
235
|
+
# @param pubid [Pubid::Iso::Identifier] pubid
|
236
|
+
#
|
237
|
+
# @return [RelatonBib::StructuredIdentifier] structured identifier
|
238
|
+
#
|
239
|
+
def fetch_structuredidentifier(pubid) # rubocop:disable Metrics/MethodLength
|
240
|
+
RelatonIsoBib::StructuredIdentifier.new(
|
241
|
+
project_number: "#{pubid.publisher} #{pubid.number}",
|
242
|
+
part: pubid.part&.to_s, # &.sub(/^-/, ""),
|
243
|
+
type: pubid.publisher,
|
244
|
+
)
|
245
|
+
end
|
245
246
|
|
246
|
-
|
247
|
-
|
248
|
-
|
247
|
+
def item_ref(doc)
|
248
|
+
doc.at("//main//section/div/div/div//h1")&.text
|
249
|
+
end
|
249
250
|
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
251
|
+
# Fetch status.
|
252
|
+
# @param doc [Nokogiri::HTML::Document]
|
253
|
+
# @param status [String]
|
254
|
+
# @return [RelatonBib::DocumentStatus]
|
255
|
+
def fetch_status(doc)
|
256
|
+
stg, substg = stage_code(doc).split "."
|
257
|
+
RelatonBib::DocumentStatus.new(stage: stg, substage: substg)
|
258
|
+
end
|
258
259
|
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
260
|
+
def stage_code(doc)
|
261
|
+
doc.at("//ul[@class='dropdown-menu']/li[@class='active']" \
|
262
|
+
"/a/span[@class='stage-code']").text
|
263
|
+
end
|
263
264
|
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
265
|
+
# def stage(stg, substg)
|
266
|
+
# abbr = STGABBR[stg].is_a?(Hash) ? STGABBR[stg][substg] : STGABBR[stg]
|
267
|
+
# RelatonBib::DocumentStatus::Stage.new value: stg, abbreviation: abbr
|
268
|
+
# end
|
269
|
+
|
270
|
+
# Fetch workgroup.
|
271
|
+
# @param doc [Nokogiri::HTML::Document]
|
272
|
+
# @return [RelatonIsoBib::EditorialGroup, nil]
|
273
|
+
def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
|
274
|
+
wg = doc.at("////div[contains(., 'Technical Committe')]/following-sibling::span/a")
|
275
|
+
return unless wg
|
276
|
+
|
277
|
+
workgroup = wg.text.split "/"
|
278
|
+
type = workgroup[1]&.match(/^[A-Z]+/)&.to_s || "TC"
|
279
|
+
# {
|
280
|
+
# name: "International Organization for Standardization",
|
281
|
+
# abbreviation: "ISO",
|
282
|
+
# url: "www.iso.org",
|
283
|
+
# }
|
284
|
+
tc_numb = workgroup[1]&.match(/\d+/)&.to_s&.to_i
|
285
|
+
tc_name = wg[:title]
|
286
|
+
tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg.text,
|
287
|
+
type: type, number: tc_numb)
|
288
|
+
RelatonIsoBib::EditorialGroup.new(technical_committee: [tc])
|
289
|
+
end
|
289
290
|
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
291
|
+
# Fetch relations.
|
292
|
+
# @param doc [Nokogiri::HTML::Document]
|
293
|
+
# @return [Array<Hash>]
|
294
|
+
def fetch_relations(doc)
|
295
|
+
types = ["Now", "Now under review"]
|
296
|
+
doc.xpath("//ul[@class='steps']/li", "//div[@class='sub-step']").reduce([]) do |a, r|
|
297
|
+
type, date = relation_type(r.at("h4", "h5").text.strip, doc)
|
298
|
+
next a if types.include?(type)
|
298
299
|
|
299
|
-
|
300
|
-
end
|
300
|
+
a + create_relations(r, type, date)
|
301
301
|
end
|
302
|
+
end
|
302
303
|
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
304
|
+
def relation_type(type, doc)
|
305
|
+
date = []
|
306
|
+
t = case type.strip
|
307
|
+
when "Previously", "Will be replaced by" then "obsoletes"
|
308
|
+
when "Corrigenda / Amendments", "Revised by", "Now confirmed"
|
309
|
+
on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
|
310
|
+
date << { type: "circulated", on: on.text } if on
|
311
|
+
"updates"
|
312
|
+
else type
|
313
|
+
end
|
314
|
+
[t, date]
|
315
|
+
end
|
315
316
|
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
end
|
317
|
+
def create_relations(rel, type, date)
|
318
|
+
rel.css("a").map do |id|
|
319
|
+
docid = RelatonBib::DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
|
320
|
+
fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
|
321
|
+
bibitem = RelatonIsoBib::IsoBibliographicItem.new(
|
322
|
+
docid: [docid], formattedref: fref, date: date,
|
323
|
+
)
|
324
|
+
{ type: type, bibitem: bibitem }
|
325
325
|
end
|
326
|
+
end
|
326
327
|
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
end
|
341
|
-
# rescue => _e
|
342
|
-
# puts 'Unknown document type: ' + title
|
328
|
+
# Fetch type.
|
329
|
+
# @param ref [String]
|
330
|
+
# @return [String]
|
331
|
+
def fetch_type(ref)
|
332
|
+
%r{
|
333
|
+
^(?<prefix>ISO|IWA|IEC)
|
334
|
+
(?:(?:/IEC|/IEEE|/PRF|/NP|/DGuide)*\s|/)
|
335
|
+
(?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))
|
336
|
+
}x =~ ref
|
337
|
+
# return "international-standard" if type_match.nil?
|
338
|
+
if TYPES[type] then TYPES[type]
|
339
|
+
elsif prefix == "ISO" then "international-standard"
|
340
|
+
elsif prefix == "IWA" then "international-workshop-agreement"
|
343
341
|
end
|
342
|
+
# rescue => _e
|
343
|
+
# puts 'Unknown document type: ' + title
|
344
|
+
end
|
344
345
|
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
346
|
+
# Fetch titles.
|
347
|
+
# @param doc [Nokogiri::HTML::Document]
|
348
|
+
# @param lang [String]
|
349
|
+
# @return [Array<RelatonBib::TypedTitleString>]
|
350
|
+
def fetch_title(doc, lang) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
|
351
|
+
types = %w[title-intro title-main title-part]
|
352
|
+
ttls = titles(doc)
|
353
|
+
title = RelatonBib::TypedTitleStringCollection.new
|
354
|
+
ttls.each.with_index do |p, i|
|
355
|
+
next unless p
|
356
|
+
|
357
|
+
title << RelatonBib::TypedTitleString.new(
|
358
|
+
type: types[i], content: p, language: lang, script: script(lang),
|
359
|
+
)
|
360
|
+
end.compact
|
361
|
+
main = title.map { |t| t.title.content }.join " - "
|
362
|
+
title << RelatonBib::TypedTitleString.new(type: "main", content: main, language: lang, script: script(lang))
|
363
|
+
end
|
364
|
+
|
365
|
+
def titles(doc)
|
366
|
+
head = doc.at "//nav[contains(@class,'heading-condensed')]"
|
367
|
+
ttls = head.xpath("h2 | h3 | h4").map &:text
|
368
|
+
ttls = ttls[0].split " - " if ttls.size == 1
|
369
|
+
case ttls.size
|
370
|
+
when 0, 1 then [nil, ttls.first, nil]
|
371
|
+
else RelatonBib::TypedTitleString.intro_or_part ttls
|
363
372
|
end
|
373
|
+
end
|
364
374
|
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
end
|
375
|
+
# Return ISO script code.
|
376
|
+
# @param lang [String]
|
377
|
+
# @return [String]
|
378
|
+
def script(lang)
|
379
|
+
case lang
|
380
|
+
when "en", "fr" then "Latn"
|
381
|
+
# when "ru" then "Cyrl"
|
373
382
|
end
|
383
|
+
end
|
374
384
|
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
385
|
+
# Fetch dates
|
386
|
+
# @param doc [Nokogiri::HTML::Document]
|
387
|
+
# @param ref [String]
|
388
|
+
# @return [Array<Hash>]
|
389
|
+
def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity, Metrics/MethodLength
|
390
|
+
dates = []
|
391
|
+
%r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ ref
|
392
|
+
pub_date_str = doc.xpath("//span[@itemprop='releaseDate']").text
|
393
|
+
if ref_date_str
|
394
|
+
ref_date = Date.strptime ref_date_str, "%Y"
|
395
|
+
if pub_date_str.empty?
|
396
|
+
dates << { type: "published", on: ref_date_str }
|
397
|
+
else
|
398
|
+
pub_date = Date.strptime pub_date_str, "%Y"
|
399
|
+
if pub_date.year > ref_date.year
|
386
400
|
dates << { type: "published", on: ref_date_str }
|
401
|
+
dates << { type: "updated", on: pub_date_str }
|
387
402
|
else
|
388
|
-
|
389
|
-
if pub_date.year > ref_date.year
|
390
|
-
dates << { type: "published", on: ref_date_str }
|
391
|
-
dates << { type: "updated", on: pub_date_str }
|
392
|
-
else
|
393
|
-
dates << { type: "published", on: pub_date_str }
|
394
|
-
end
|
403
|
+
dates << { type: "published", on: pub_date_str }
|
395
404
|
end
|
396
|
-
elsif !pub_date_str.empty?
|
397
|
-
dates << { type: "published", on: pub_date_str }
|
398
405
|
end
|
399
|
-
|
406
|
+
elsif !pub_date_str.empty?
|
407
|
+
dates << { type: "published", on: pub_date_str }
|
400
408
|
end
|
409
|
+
dates
|
410
|
+
end
|
401
411
|
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
412
|
+
def fetch_contributors(ref)
|
413
|
+
ref.sub(/\s.*/, "").split("/").reduce([]) do |mem, abbrev|
|
414
|
+
publisher = PUBLISHERS[abbrev]
|
415
|
+
next mem unless publisher
|
406
416
|
|
407
|
-
|
408
|
-
|
409
|
-
end
|
417
|
+
publisher[:abbreviation] = abbrev
|
418
|
+
mem << { entity: publisher, role: [type: "publisher"] }
|
410
419
|
end
|
420
|
+
end
|
411
421
|
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
end
|
422
|
+
# Fetch ICS.
|
423
|
+
# @param doc [Nokogiri::HTML::Document]
|
424
|
+
# @return [Array<Hash>]
|
425
|
+
def fetch_ics(doc)
|
426
|
+
doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
|
427
|
+
code = i.text.match(/[\d.]+/).to_s.split "."
|
428
|
+
{ field: code[0], group: code[1], subgroup: code[2] }
|
420
429
|
end
|
430
|
+
end
|
421
431
|
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
432
|
+
#
|
433
|
+
# Fetch links.
|
434
|
+
#
|
435
|
+
# @param doc [Nokogiri::HTML::Document] document to parse
|
436
|
+
# @param url [String] document url
|
437
|
+
#
|
438
|
+
# @return [Array<Hash>]
|
439
|
+
#
|
440
|
+
def fetch_link(doc, url)
|
441
|
+
links = [{ type: "src", content: url }]
|
442
|
+
obp = doc.at("//h4[contains(@class, 'h5')]/a")
|
443
|
+
links << { type: "obp", content: obp[:href] } if obp
|
444
|
+
rss = doc.at("//a[contains(@href, 'rss')]")
|
445
|
+
links << { type: "rss", content: DOMAIN + rss[:href] } if rss
|
446
|
+
pub = doc.at "//p[contains(., 'publicly available')]/a",
|
447
|
+
"//p[contains(., 'can be downloaded from the')]/a"
|
448
|
+
links << { type: "pub", content: pub[:href] } if pub
|
449
|
+
links
|
450
|
+
end
|
441
451
|
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
end
|
456
|
-
[{ owner: [{ name: owner_name }], from: from }]
|
452
|
+
# Fetch copyright.
|
453
|
+
# @param doc [Nokogiri::HTML::Document]
|
454
|
+
# @return [Array<Hash>]
|
455
|
+
def fetch_copyright(doc) # rubocop:disable Metrics/MethodLength
|
456
|
+
ref = item_ref doc
|
457
|
+
owner_name = ref.match(/.*?(?=\s)/).to_s
|
458
|
+
from = ref.match(/(?<=:)\d{4}/).to_s
|
459
|
+
if from.empty?
|
460
|
+
date = doc.at(
|
461
|
+
"//span[@itemprop='releaseDate']",
|
462
|
+
"//ul[@id='stages']/li[contains(@class,'active')]/ul/li[@class='active']/a/span[@class='stage-date']",
|
463
|
+
)
|
464
|
+
from = date.text.match(/\d{4}/).to_s
|
457
465
|
end
|
466
|
+
[{ owner: [{ name: owner_name }], from: from }]
|
458
467
|
end
|
459
468
|
end
|
460
469
|
end
|
data/lib/relaton_iso/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-iso
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.16.2
|
4
|
+
version: 1.16.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-10-
|
11
|
+
date: 2023-10-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: algolia
|