relaton-iso 1.16.2 → 1.16.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/relaton_iso/processor.rb +1 -1
- data/lib/relaton_iso/scrapper.rb +376 -367
- data/lib/relaton_iso/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 13ecc04a430b1dbf256c0853f612969727c16eba72a06cb2bc74bed17745ba90
|
4
|
+
data.tar.gz: f795f63a994b843e07d4857ba3b0dd9c91ec9a3ccb408827f1bf7bdbf5f854a9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d33586bbe409f54736b694d774a52e1bef8a4cc2d7c304aebd06c5ead8b3893b6f45c65d3e5c586c5e7f9f23501b52ae6b0630c25213d6105660251d03cff94e
|
7
|
+
data.tar.gz: e7fdcb33dfa855c73ead77a514eae36d761274bafeec77c520ac8ff84a05c6a04a2b30bd80fa7d89fcabda1975eeb95f85c65d0c177da7e1694da97bc4245ccd
|
data/lib/relaton_iso/scrapper.rb
CHANGED
@@ -43,418 +43,427 @@ module RelatonIso
|
|
43
43
|
url: "www.asme.org" },
|
44
44
|
}.freeze
|
45
45
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
)
|
86
|
-
|
46
|
+
extend self
|
47
|
+
|
48
|
+
# Parse page.
|
49
|
+
# @param hit [RelatonIso::Hit]
|
50
|
+
# @param lang [String, NilClass]
|
51
|
+
# @return [RelatonIsoBib::IsoBibliographicItem]
|
52
|
+
def parse_page(hit, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
53
|
+
# path = "/contents/data/standard#{hit_data['splitPath']}/"\
|
54
|
+
# "#{hit_data['csnumber']}.html"
|
55
|
+
|
56
|
+
path = hit.hit[:path].sub("/sites/isoorg", "")
|
57
|
+
doc, url = get_page "#{path}.html"
|
58
|
+
|
59
|
+
# Fetch edition.
|
60
|
+
edition = doc.at("//div[div[.='Edition']]/text()[last()]")
|
61
|
+
&.text&.match(/\d+$/)&.to_s
|
62
|
+
hit.pubid.base.edition ||= edition if hit.pubid.base
|
63
|
+
|
64
|
+
titles, abstract, langs = fetch_titles_abstract(doc, lang)
|
65
|
+
|
66
|
+
RelatonIsoBib::IsoBibliographicItem.new(
|
67
|
+
fetched: Date.today.to_s,
|
68
|
+
docid: fetch_relaton_docids(doc, hit.pubid),
|
69
|
+
docnumber: fetch_docnumber(hit.pubid),
|
70
|
+
edition: edition,
|
71
|
+
language: langs.map { |l| l[:lang] },
|
72
|
+
script: langs.map { |l| script(l[:lang]) }.uniq,
|
73
|
+
title: titles,
|
74
|
+
doctype: fetch_type(hit.hit[:title]),
|
75
|
+
docstatus: fetch_status(doc),
|
76
|
+
ics: fetch_ics(doc),
|
77
|
+
date: fetch_dates(doc, hit.hit[:title]),
|
78
|
+
contributor: fetch_contributors(hit.hit[:title]),
|
79
|
+
editorialgroup: fetch_workgroup(doc),
|
80
|
+
abstract: abstract,
|
81
|
+
copyright: fetch_copyright(doc),
|
82
|
+
link: fetch_link(doc, url),
|
83
|
+
relation: fetch_relations(doc),
|
84
|
+
place: ["Geneva"],
|
85
|
+
structuredidentifier: fetch_structuredidentifier(hit.pubid),
|
86
|
+
)
|
87
|
+
end
|
87
88
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
89
|
+
#
|
90
|
+
# Create document ids.
|
91
|
+
#
|
92
|
+
# @param doc [Nokogiri::HTML::Document] document to parse
|
93
|
+
# @param pubid [Pubid::Iso::Identifier] publication identifier
|
94
|
+
#
|
95
|
+
# @return [Array<RelatonBib::DocumentIdentifier>]
|
96
|
+
#
|
97
|
+
def fetch_relaton_docids(doc, pubid)
|
98
|
+
pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code(doc))
|
99
|
+
[
|
100
|
+
RelatonIso::DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
|
101
|
+
RelatonBib::DocumentIdentifier.new(id: isoref(pubid), type: "iso-reference"),
|
102
|
+
RelatonIso::DocumentIdentifier.new(id: pubid, type: "URN"),
|
103
|
+
]
|
104
|
+
end
|
104
105
|
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
106
|
+
#
|
107
|
+
# Create ISO reference identifier with English language.
|
108
|
+
#
|
109
|
+
# @param [Pubid::Iso::Identifier] pubid publication identifier
|
110
|
+
#
|
111
|
+
# @return [String] English reference identifier
|
112
|
+
#
|
113
|
+
def isoref(pubid)
|
114
|
+
params = pubid.get_params.reject { |k, _| k == :typed_stage }
|
115
|
+
Pubid::Iso::Identifier.create(language: "en", **params).to_s(format: :ref_num_short)
|
116
|
+
end
|
116
117
|
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
end
|
118
|
+
private
|
119
|
+
|
120
|
+
# Fetch titles and abstracts.
|
121
|
+
# @param doc [Nokogiri::HTML::Document]
|
122
|
+
# @param lang [String, NilClass]
|
123
|
+
# @return [Array<Array>]
|
124
|
+
def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
|
125
|
+
titles = RelatonBib::TypedTitleStringCollection.new
|
126
|
+
abstract = []
|
127
|
+
langs = languages(doc, lang).reduce([]) do |s, l|
|
128
|
+
# Don't need to get page for en. We already have it.
|
129
|
+
d = l[:path] ? get_page(l[:path])[0] : doc
|
130
|
+
unless d.at("//h5[@class='help-block']" \
|
131
|
+
"[.='недоступно на русском языке']")
|
132
|
+
s << l
|
133
|
+
titles += fetch_title(d, l[:lang])
|
134
|
+
|
135
|
+
# Fetch abstracts.
|
136
|
+
abstract_content = d.xpath(
|
137
|
+
"//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
|
138
|
+
).map do |a|
|
139
|
+
a.name == "li" ? "- #{a.text}" : a.text
|
140
|
+
end.reject(&:empty?).join("\n")
|
141
|
+
unless abstract_content.empty?
|
142
|
+
abstract << {
|
143
|
+
content: abstract_content,
|
144
|
+
language: l[:lang],
|
145
|
+
script: script(l[:lang]),
|
146
|
+
format: "text/plain",
|
147
|
+
}
|
148
148
|
end
|
149
|
-
s
|
150
149
|
end
|
151
|
-
|
150
|
+
s
|
152
151
|
end
|
152
|
+
[titles, abstract, langs]
|
153
|
+
end
|
153
154
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
end
|
165
|
-
lgs
|
155
|
+
# Returns available languages.
|
156
|
+
# @param doc [Nokogiri::HTML::Document]
|
157
|
+
# @param lang [String, NilClass]
|
158
|
+
# @return [Array<Hash>]
|
159
|
+
def languages(doc, lang)
|
160
|
+
lgs = [{ lang: "en" }]
|
161
|
+
doc.css("li#lang-switcher ul li a").each do |lang_link|
|
162
|
+
lang_path = lang_link.attr("href")
|
163
|
+
l = lang_path.match(%r{^/(fr)/})
|
164
|
+
lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] == lang)
|
166
165
|
end
|
166
|
+
lgs
|
167
|
+
end
|
167
168
|
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
169
|
+
# Get page.
|
170
|
+
# @param path [String] page's path
|
171
|
+
# @return [Array<Nokogiri::HTML::Document, String>]
|
172
|
+
def get_page(path)
|
173
|
+
resp, uri = get_redirection path
|
174
|
+
doc = try_if_fail resp, uri
|
175
|
+
[doc, uri.to_s]
|
176
|
+
rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
|
177
|
+
EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
|
178
|
+
Net::ProtocolError, Errno::ETIMEDOUT
|
179
|
+
raise RelatonBib::RequestError, "Could not access #{uri}"
|
180
|
+
end
|
180
181
|
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
182
|
+
#
|
183
|
+
# Get the page from the given path. If the page is redirected, get the
|
184
|
+
# page from the new path.
|
185
|
+
#
|
186
|
+
# @param [String] path path to the page
|
187
|
+
#
|
188
|
+
# @return [Array<Net::HTTPOK, URI>] HTTP response and URI
|
189
|
+
# @raise [RelatonBib::RequestError] if the page is not found
|
190
|
+
#
|
191
|
+
def get_redirection(path)
|
192
|
+
url = DOMAIN + path
|
193
|
+
uri = URI url
|
194
|
+
resp = Net::HTTP.get_response(uri)
|
195
|
+
raise RelatonBib::RequestError, "#{url} not found." if %w[404 302].include? resp.code
|
196
|
+
|
197
|
+
resp.code == "301" ? get_redirection(resp["location"]) : [resp, uri]
|
198
|
+
end
|
195
199
|
|
196
|
-
|
197
|
-
|
200
|
+
#
|
201
|
+
# The iso.org site fails to respond sometimes. This method tries to get
|
202
|
+
# the response again.
|
203
|
+
#
|
204
|
+
# @param [Net::HTTPOK] resp HTTP response
|
205
|
+
# @param [URI::HTTPS] uri URI of the page
|
206
|
+
#
|
207
|
+
# @return [Nokogiri::HTML4::Document] document
|
208
|
+
# @raise [RelatonBib::RequestError] if the page could not be parsed
|
209
|
+
#
|
210
|
+
def try_if_fail(resp, uri)
|
211
|
+
10.times do
|
212
|
+
doc = Nokogiri::HTML(resp.body)
|
213
|
+
# stop trying if page has a document id
|
214
|
+
return doc if item_ref doc
|
198
215
|
|
199
|
-
|
200
|
-
# The iso.org site fails to respond sometimes. This method tries to get
|
201
|
-
# the response again.
|
202
|
-
#
|
203
|
-
# @param [Net::HTTPOK] resp HTTP response
|
204
|
-
# @param [URI::HTTPS] uri URI of the page
|
205
|
-
#
|
206
|
-
# @return [Nokogiri::HTML4::Document] document
|
207
|
-
# @raise [RelatonBib::RequestError] if the page could not be parsed
|
208
|
-
#
|
209
|
-
def try_if_fail(resp, uri)
|
210
|
-
10.times do
|
211
|
-
doc = Nokogiri::HTML(resp.body)
|
212
|
-
# stop trying if page has a document id
|
213
|
-
return doc if item_ref doc
|
214
|
-
|
215
|
-
resp = Net::HTTP.get_response(uri)
|
216
|
-
end
|
217
|
-
raise RelatonBib::RequestError, "Could not parse the page #{uri}"
|
216
|
+
resp = Net::HTTP.get_response(uri)
|
218
217
|
end
|
218
|
+
raise RelatonBib::RequestError, "Could not parse the page #{uri}"
|
219
|
+
end
|
219
220
|
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
221
|
+
#
|
222
|
+
# Generate docnumber.
|
223
|
+
#
|
224
|
+
# @param [Pubid::Iso] pubid
|
225
|
+
#
|
226
|
+
# @return [String] docnumber
|
227
|
+
#
|
228
|
+
def fetch_docnumber(pubid)
|
229
|
+
pubid.to_s.match(/\d+/)&.to_s
|
230
|
+
end
|
230
231
|
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
232
|
+
#
|
233
|
+
# Parse structuredidentifier.
|
234
|
+
#
|
235
|
+
# @param pubid [Pubid::Iso::Identifier] pubid
|
236
|
+
#
|
237
|
+
# @return [RelatonBib::StructuredIdentifier] structured identifier
|
238
|
+
#
|
239
|
+
def fetch_structuredidentifier(pubid) # rubocop:disable Metrics/MethodLength
|
240
|
+
RelatonIsoBib::StructuredIdentifier.new(
|
241
|
+
project_number: "#{pubid.publisher} #{pubid.number}",
|
242
|
+
part: pubid.part&.to_s, # &.sub(/^-/, ""),
|
243
|
+
type: pubid.publisher,
|
244
|
+
)
|
245
|
+
end
|
245
246
|
|
246
|
-
|
247
|
-
|
248
|
-
|
247
|
+
def item_ref(doc)
|
248
|
+
doc.at("//main//section/div/div/div//h1")&.text
|
249
|
+
end
|
249
250
|
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
251
|
+
# Fetch status.
|
252
|
+
# @param doc [Nokogiri::HTML::Document]
|
253
|
+
# @param status [String]
|
254
|
+
# @return [Hash]
|
255
|
+
def fetch_status(doc)
|
256
|
+
stg, substg = stage_code(doc).split "."
|
257
|
+
RelatonBib::DocumentStatus.new(stage: stg, substage: substg)
|
258
|
+
end
|
258
259
|
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
260
|
+
def stage_code(doc)
|
261
|
+
doc.at("//ul[@class='dropdown-menu']/li[@class='active']" \
|
262
|
+
"/a/span[@class='stage-code']").text
|
263
|
+
end
|
263
264
|
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
265
|
+
# def stage(stg, substg)
|
266
|
+
# abbr = STGABBR[stg].is_a?(Hash) ? STGABBR[stg][substg] : STGABBR[stg]
|
267
|
+
# RelatonBib::DocumentStatus::Stage.new value: stg, abbreviation: abbr
|
268
|
+
# end
|
269
|
+
|
270
|
+
# Fetch workgroup.
|
271
|
+
# @param doc [Nokogiri::HTML::Document]
|
272
|
+
# @return [Hash]
|
273
|
+
def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
|
274
|
+
wg = doc.at("////div[contains(., 'Technical Committe')]/following-sibling::span/a")
|
275
|
+
return unless wg
|
276
|
+
|
277
|
+
workgroup = wg.text.split "/"
|
278
|
+
type = workgroup[1]&.match(/^[A-Z]+/)&.to_s || "TC"
|
279
|
+
# {
|
280
|
+
# name: "International Organization for Standardization",
|
281
|
+
# abbreviation: "ISO",
|
282
|
+
# url: "www.iso.org",
|
283
|
+
# }
|
284
|
+
tc_numb = workgroup[1]&.match(/\d+/)&.to_s&.to_i
|
285
|
+
tc_name = wg[:title]
|
286
|
+
tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg.text,
|
287
|
+
type: type, number: tc_numb)
|
288
|
+
RelatonIsoBib::EditorialGroup.new(technical_committee: [tc])
|
289
|
+
end
|
289
290
|
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
291
|
+
# Fetch relations.
|
292
|
+
# @param doc [Nokogiri::HTML::Document]
|
293
|
+
# @return [Array<Hash>]
|
294
|
+
def fetch_relations(doc)
|
295
|
+
types = ["Now", "Now under review"]
|
296
|
+
doc.xpath("//ul[@class='steps']/li", "//div[@class='sub-step']").reduce([]) do |a, r|
|
297
|
+
type, date = relation_type(r.at("h4", "h5").text.strip, doc)
|
298
|
+
next a if types.include?(type)
|
298
299
|
|
299
|
-
|
300
|
-
end
|
300
|
+
a + create_relations(r, type, date)
|
301
301
|
end
|
302
|
+
end
|
302
303
|
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
304
|
+
def relation_type(type, doc)
|
305
|
+
date = []
|
306
|
+
t = case type.strip
|
307
|
+
when "Previously", "Will be replaced by" then "obsoletes"
|
308
|
+
when "Corrigenda / Amendments", "Revised by", "Now confirmed"
|
309
|
+
on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
|
310
|
+
date << { type: "circulated", on: on.text } if on
|
311
|
+
"updates"
|
312
|
+
else type
|
313
|
+
end
|
314
|
+
[t, date]
|
315
|
+
end
|
315
316
|
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
end
|
317
|
+
def create_relations(rel, type, date)
|
318
|
+
rel.css("a").map do |id|
|
319
|
+
docid = RelatonBib::DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
|
320
|
+
fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
|
321
|
+
bibitem = RelatonIsoBib::IsoBibliographicItem.new(
|
322
|
+
docid: [docid], formattedref: fref, date: date,
|
323
|
+
)
|
324
|
+
{ type: type, bibitem: bibitem }
|
325
325
|
end
|
326
|
+
end
|
326
327
|
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
end
|
341
|
-
# rescue => _e
|
342
|
-
# puts 'Unknown document type: ' + title
|
328
|
+
# Fetch type.
|
329
|
+
# @param ref [String]
|
330
|
+
# @return [String]
|
331
|
+
def fetch_type(ref)
|
332
|
+
%r{
|
333
|
+
^(?<prefix>ISO|IWA|IEC)
|
334
|
+
(?:(?:/IEC|/IEEE|/PRF|/NP|/DGuide)*\s|/)
|
335
|
+
(?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))
|
336
|
+
}x =~ ref
|
337
|
+
# return "international-standard" if type_match.nil?
|
338
|
+
if TYPES[type] then TYPES[type]
|
339
|
+
elsif prefix == "ISO" then "international-standard"
|
340
|
+
elsif prefix == "IWA" then "international-workshop-agreement"
|
343
341
|
end
|
342
|
+
# rescue => _e
|
343
|
+
# puts 'Unknown document type: ' + title
|
344
|
+
end
|
344
345
|
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
346
|
+
# Fetch titles.
|
347
|
+
# @param doc [Nokogiri::HTML::Document]
|
348
|
+
# @param lang [String]
|
349
|
+
# @return [Array<RelatonBib::TypedTitleString>]
|
350
|
+
def fetch_title(doc, lang) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
|
351
|
+
types = %w[title-intro title-main title-part]
|
352
|
+
ttls = titles(doc)
|
353
|
+
title = RelatonBib::TypedTitleStringCollection.new
|
354
|
+
ttls.each.with_index do |p, i|
|
355
|
+
next unless p
|
356
|
+
|
357
|
+
title << RelatonBib::TypedTitleString.new(
|
358
|
+
type: types[i], content: p, language: lang, script: script(lang),
|
359
|
+
)
|
360
|
+
end.compact
|
361
|
+
main = title.map { |t| t.title.content }.join " - "
|
362
|
+
title << RelatonBib::TypedTitleString.new(type: "main", content: main, language: lang, script: script(lang))
|
363
|
+
end
|
364
|
+
|
365
|
+
def titles(doc)
|
366
|
+
head = doc.at "//nav[contains(@class,'heading-condensed')]"
|
367
|
+
ttls = head.xpath("h2 | h3 | h4").map &:text
|
368
|
+
ttls = ttls[0].split " - " if ttls.size == 1
|
369
|
+
case ttls.size
|
370
|
+
when 0, 1 then [nil, ttls.first, nil]
|
371
|
+
else RelatonBib::TypedTitleString.intro_or_part ttls
|
363
372
|
end
|
373
|
+
end
|
364
374
|
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
end
|
375
|
+
# Return ISO script code.
|
376
|
+
# @param lang [String]
|
377
|
+
# @return [String]
|
378
|
+
def script(lang)
|
379
|
+
case lang
|
380
|
+
when "en", "fr" then "Latn"
|
381
|
+
# when "ru" then "Cyrl"
|
373
382
|
end
|
383
|
+
end
|
374
384
|
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
385
|
+
# Fetch dates
|
386
|
+
# @param doc [Nokogiri::HTML::Document]
|
387
|
+
# @param ref [String]
|
388
|
+
# @return [Array<Hash>]
|
389
|
+
def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity, Metrics/MethodLength
|
390
|
+
dates = []
|
391
|
+
%r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ ref
|
392
|
+
pub_date_str = doc.xpath("//span[@itemprop='releaseDate']").text
|
393
|
+
if ref_date_str
|
394
|
+
ref_date = Date.strptime ref_date_str, "%Y"
|
395
|
+
if pub_date_str.empty?
|
396
|
+
dates << { type: "published", on: ref_date_str }
|
397
|
+
else
|
398
|
+
pub_date = Date.strptime pub_date_str, "%Y"
|
399
|
+
if pub_date.year > ref_date.year
|
386
400
|
dates << { type: "published", on: ref_date_str }
|
401
|
+
dates << { type: "updated", on: pub_date_str }
|
387
402
|
else
|
388
|
-
|
389
|
-
if pub_date.year > ref_date.year
|
390
|
-
dates << { type: "published", on: ref_date_str }
|
391
|
-
dates << { type: "updated", on: pub_date_str }
|
392
|
-
else
|
393
|
-
dates << { type: "published", on: pub_date_str }
|
394
|
-
end
|
403
|
+
dates << { type: "published", on: pub_date_str }
|
395
404
|
end
|
396
|
-
elsif !pub_date_str.empty?
|
397
|
-
dates << { type: "published", on: pub_date_str }
|
398
405
|
end
|
399
|
-
|
406
|
+
elsif !pub_date_str.empty?
|
407
|
+
dates << { type: "published", on: pub_date_str }
|
400
408
|
end
|
409
|
+
dates
|
410
|
+
end
|
401
411
|
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
412
|
+
def fetch_contributors(ref)
|
413
|
+
ref.sub(/\s.*/, "").split("/").reduce([]) do |mem, abbrev|
|
414
|
+
publisher = PUBLISHERS[abbrev]
|
415
|
+
next mem unless publisher
|
406
416
|
|
407
|
-
|
408
|
-
|
409
|
-
end
|
417
|
+
publisher[:abbreviation] = abbrev
|
418
|
+
mem << { entity: publisher, role: [type: "publisher"] }
|
410
419
|
end
|
420
|
+
end
|
411
421
|
|
412
|
-
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
end
|
422
|
+
# Fetch ICS.
|
423
|
+
# @param doc [Nokogiri::HTML::Document]
|
424
|
+
# @return [Array<Hash>]
|
425
|
+
def fetch_ics(doc)
|
426
|
+
doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
|
427
|
+
code = i.text.match(/[\d.]+/).to_s.split "."
|
428
|
+
{ field: code[0], group: code[1], subgroup: code[2] }
|
420
429
|
end
|
430
|
+
end
|
421
431
|
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
|
438
|
-
|
439
|
-
|
440
|
-
|
432
|
+
#
|
433
|
+
# Fetch links.
|
434
|
+
#
|
435
|
+
# @param doc [Nokogiri::HTML::Document] document to parse
|
436
|
+
# @param url [String] document url
|
437
|
+
#
|
438
|
+
# @return [Array<Hash>]
|
439
|
+
#
|
440
|
+
def fetch_link(doc, url)
|
441
|
+
links = [{ type: "src", content: url }]
|
442
|
+
obp = doc.at("//h4[contains(@class, 'h5')]/a")
|
443
|
+
links << { type: "obp", content: obp[:href] } if obp
|
444
|
+
rss = doc.at("//a[contains(@href, 'rss')]")
|
445
|
+
links << { type: "rss", content: DOMAIN + rss[:href] } if rss
|
446
|
+
pub = doc.at "//p[contains(., 'publicly available')]/a",
|
447
|
+
"//p[contains(., 'can be downloaded from the')]/a"
|
448
|
+
links << { type: "pub", content: pub[:href] } if pub
|
449
|
+
links
|
450
|
+
end
|
441
451
|
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
end
|
456
|
-
[{ owner: [{ name: owner_name }], from: from }]
|
452
|
+
# Fetch copyright.
|
453
|
+
# @param doc [Nokogiri::HTML::Document]
|
454
|
+
# @return [Array<Hash>]
|
455
|
+
def fetch_copyright(doc) # rubocop:disable Metrics/MethodLength
|
456
|
+
ref = item_ref doc
|
457
|
+
owner_name = ref.match(/.*?(?=\s)/).to_s
|
458
|
+
from = ref.match(/(?<=:)\d{4}/).to_s
|
459
|
+
if from.empty?
|
460
|
+
date = doc.at(
|
461
|
+
"//span[@itemprop='releaseDate']",
|
462
|
+
"//ul[@id='stages']/li[contains(@class,'active')]/ul/li[@class='active']/a/span[@class='stage-date']",
|
463
|
+
)
|
464
|
+
from = date.text.match(/\d{4}/).to_s
|
457
465
|
end
|
466
|
+
[{ owner: [{ name: owner_name }], from: from }]
|
458
467
|
end
|
459
468
|
end
|
460
469
|
end
|
data/lib/relaton_iso/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-iso
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.16.2
|
4
|
+
version: 1.16.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-10-
|
11
|
+
date: 2023-10-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: algolia
|