relaton-iso 1.19.0 → 1.19.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/bundle +1 -1
- data/lib/relaton_iso/data_fetcher.rb +74 -26
- data/lib/relaton_iso/iso_bibliography.rb +1 -1
- data/lib/relaton_iso/scrapper.rb +143 -112
- data/lib/relaton_iso/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ae7999f9e96553504fb75338485ad672b1cb176e15860c39b35913b0ed525852
|
4
|
+
data.tar.gz: 4bf9ec438dd3aebb4b81707b53c7141a466d1f8c504a88c0d6b5d60f46ea0534
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 05fab25ad1c760bde99b95b230e6616aa6205510cb77454a7c167aeb7501e967c1909ec4afe583e259312802d8172f5099937128bc7d34c8e4901cb7f48c3181
|
7
|
+
data.tar.gz: e935ceff8ab264b3f4ca6383874c427733633bd744e3a5318cb079f47bf67f580f8957ee5cea36c3a1827b064f1f7022061b13d9e4f60182200180eb50abcbe9
|
data/bin/bundle
CHANGED
@@ -27,7 +27,7 @@ m = Module.new do
|
|
27
27
|
bundler_version = nil
|
28
28
|
update_index = nil
|
29
29
|
ARGV.each_with_index do |a, i|
|
30
|
-
if update_index && update_index.succ == i && a
|
30
|
+
if update_index && update_index.succ == i && a.match?(Gem::Version::ANCHORED_VERSION_PATTERN)
|
31
31
|
bundler_version = a
|
32
32
|
end
|
33
33
|
next unless a =~ /\A--bundler(?:[= ](#{Gem::Version::VERSION_PATTERN}))?\z/
|
@@ -7,13 +7,16 @@ module RelatonIso
|
|
7
7
|
# @param [String] output output directory
|
8
8
|
# @param [String] format format of output files (yaml, bibxml, xml)
|
9
9
|
#
|
10
|
-
def initialize(output, format)
|
10
|
+
def initialize(output, format) # rubocop:disable Metrics/AbcSize
|
11
11
|
@output = output
|
12
12
|
@format = format
|
13
13
|
@ext = format.sub(/^bib/, "")
|
14
|
-
@files =
|
14
|
+
@files = Set.new
|
15
15
|
@queue = ::Queue.new
|
16
16
|
@mutex = Mutex.new
|
17
|
+
@gh_issue = Relaton::Logger::Channels::GhIssue.new "relaton/relaton-iso", "Error fetching ISO documents"
|
18
|
+
Relaton.logger_pool[:gh_issue] = Relaton::Logger::Log.new(@gh_issue, levels: [:error])
|
19
|
+
@errors = Hash.new(true)
|
17
20
|
end
|
18
21
|
|
19
22
|
def index
|
@@ -34,12 +37,12 @@ module RelatonIso
|
|
34
37
|
#
|
35
38
|
def self.fetch(output: "data", format: "yaml")
|
36
39
|
t1 = Time.now
|
37
|
-
|
40
|
+
Util.info "Started at: #{t1}"
|
38
41
|
FileUtils.mkdir_p output
|
39
42
|
new(output, format).fetch
|
40
43
|
t2 = Time.now
|
41
|
-
|
42
|
-
|
44
|
+
Util.info "Stopped at: #{t2}"
|
45
|
+
Util.info "Done in: #{(t2 - t1).round} sec."
|
43
46
|
end
|
44
47
|
|
45
48
|
#
|
@@ -48,13 +51,21 @@ module RelatonIso
|
|
48
51
|
# @return [void]
|
49
52
|
#
|
50
53
|
def fetch # rubocop:disable Metrics/AbcSize
|
51
|
-
|
54
|
+
Util.info "Scrapping ICS pages..."
|
52
55
|
fetch_ics
|
53
|
-
|
56
|
+
Util.info "(#{Time.now}) Scrapping documents..."
|
54
57
|
fetch_docs
|
55
58
|
iso_queue.save
|
56
59
|
# index.sort! { |a, b| compare_docids a, b }
|
57
60
|
index.save
|
61
|
+
repot_errors
|
62
|
+
end
|
63
|
+
|
64
|
+
def repot_errors
|
65
|
+
@errors.select { |_, v| v }.each_key do |k|
|
66
|
+
Util.error "Failed to fetch #{k}"
|
67
|
+
end
|
68
|
+
@gh_issue.create_issue
|
58
69
|
end
|
59
70
|
|
60
71
|
#
|
@@ -72,14 +83,30 @@ module RelatonIso
|
|
72
83
|
|
73
84
|
def fetch_ics_page(path)
|
74
85
|
resp = get_redirection path
|
75
|
-
|
76
|
-
|
77
|
-
|
86
|
+
unless resp
|
87
|
+
Util.error "Failed fetching ICS page #{url(path)}"
|
88
|
+
return
|
78
89
|
end
|
79
90
|
|
80
|
-
page.
|
81
|
-
|
82
|
-
|
91
|
+
page = Nokogiri::HTML(resp.body)
|
92
|
+
parse_doc_links page
|
93
|
+
parse_ics_links page
|
94
|
+
end
|
95
|
+
|
96
|
+
def parse_doc_links(page)
|
97
|
+
doc_links = page.xpath "//td[@data-title='Standard and/or project']/div/div/a"
|
98
|
+
@errors[:doc_links] &&= doc_links.empty?
|
99
|
+
doc_links.each { |item| iso_queue.add_first item[:href].split("?").first }
|
100
|
+
end
|
101
|
+
|
102
|
+
def parse_ics_links(page)
|
103
|
+
ics_links = page.xpath("//td[@data-title='ICS']/a")
|
104
|
+
@errors[:ics_links] &&= ics_links.empty?
|
105
|
+
ics_links.each { |item| @queue << item[:href] }
|
106
|
+
end
|
107
|
+
|
108
|
+
def url(path)
|
109
|
+
Scrapper::DOMAIN + path
|
83
110
|
end
|
84
111
|
|
85
112
|
#
|
@@ -88,18 +115,18 @@ module RelatonIso
|
|
88
115
|
#
|
89
116
|
# @param [String] path path to the page
|
90
117
|
#
|
91
|
-
# @return [Net::HTTPOK] HTTP response
|
118
|
+
# @return [Net::HTTPOK, nil] HTTP response
|
92
119
|
#
|
93
120
|
def get_redirection(path) # rubocop:disable Metrics/MethodLength
|
94
121
|
try = 0
|
95
|
-
uri = URI(
|
122
|
+
uri = URI url(path)
|
96
123
|
begin
|
97
124
|
get_response uri
|
98
125
|
rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNREFUSED => e
|
99
126
|
try += 1
|
100
127
|
retry if check_try try, uri
|
101
128
|
|
102
|
-
Util.
|
129
|
+
Util.warn "Failed fetching #{uri}, #{e.message}"
|
103
130
|
end
|
104
131
|
end
|
105
132
|
|
@@ -131,13 +158,10 @@ module RelatonIso
|
|
131
158
|
# @return [void]
|
132
159
|
#
|
133
160
|
def fetch_doc(docpath)
|
134
|
-
|
135
|
-
# hit = Hit.new({ path: docpath }, nil)
|
136
|
-
doc = Scrapper.parse_page docpath
|
161
|
+
doc = Scrapper.parse_page docpath, errors: @errors
|
137
162
|
@mutex.synchronize { save_doc doc, docpath }
|
138
163
|
rescue StandardError => e
|
139
|
-
Util.
|
140
|
-
"#{e.message}\n#{e.backtrace}"
|
164
|
+
Util.warn "Fail fetching document: #{url(docpath)}\n#{e.message}\n#{e.backtrace}"
|
141
165
|
end
|
142
166
|
|
143
167
|
# def compare_docids(id1, id2)
|
@@ -155,16 +179,40 @@ module RelatonIso
|
|
155
179
|
docid = doc.docidentifier.detect(&:primary)
|
156
180
|
file_name = docid.id.gsub(/[\s\/:]+/, "-").downcase
|
157
181
|
file = File.join @output, "#{file_name}.#{@ext}"
|
158
|
-
if
|
159
|
-
|
182
|
+
if File.exist?(file)
|
183
|
+
rewrite_with_same_or_newer doc, docid, file, docpath
|
160
184
|
else
|
161
|
-
|
162
|
-
index.add_or_update docid.to_h, file
|
163
|
-
File.write file, serialize(doc), encoding: "UTF-8"
|
185
|
+
write_file file, doc, docid
|
164
186
|
end
|
165
187
|
iso_queue.move_last docpath
|
166
188
|
end
|
167
189
|
|
190
|
+
def rewrite_with_same_or_newer(doc, docid, file, docpath)
|
191
|
+
hash = YAML.load_file file
|
192
|
+
item_hash = HashConverter.hash_to_bib hash
|
193
|
+
bib = ::RelatonIsoBib::IsoBibliographicItem.new(**item_hash)
|
194
|
+
if edition_greater?(doc, bib) || replace_substage98?(doc, bib)
|
195
|
+
write_file file, doc, docid
|
196
|
+
elsif @files.include?(file) && !edition_greater?(bib, doc)
|
197
|
+
Util.warn "Duplicate file `#{file}` for `#{docid.id}` from #{url(docpath)}"
|
198
|
+
end
|
199
|
+
end
|
200
|
+
|
201
|
+
def edition_greater?(doc, bib)
|
202
|
+
doc.edition && bib.edition && doc.edition.content.to_i > bib.edition.content.to_i
|
203
|
+
end
|
204
|
+
|
205
|
+
def replace_substage98?(doc, bib) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
|
206
|
+
doc.edition&.content == bib.edition&.content &&
|
207
|
+
(doc.status&.substage&.value != "98" || bib.status&.substage&.value == "98")
|
208
|
+
end
|
209
|
+
|
210
|
+
def write_file(file, doc, docid)
|
211
|
+
@files << file
|
212
|
+
index.add_or_update docid.to_h, file
|
213
|
+
File.write file, serialize(doc), encoding: "UTF-8"
|
214
|
+
end
|
215
|
+
|
168
216
|
#
|
169
217
|
# Serialize document to string.
|
170
218
|
#
|
data/lib/relaton_iso/scrapper.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
module RelatonIso
|
4
4
|
# Scrapper.
|
5
|
-
|
5
|
+
class Scrapper # rubocop:disable Metrics/ModuleLength
|
6
6
|
DOMAIN = "https://www.iso.org"
|
7
7
|
|
8
8
|
TYPES = {
|
@@ -48,57 +48,84 @@ module RelatonIso
|
|
48
48
|
url: "www.asme.org" },
|
49
49
|
}.freeze
|
50
50
|
|
51
|
-
extend self
|
51
|
+
# extend self
|
52
|
+
|
53
|
+
def initialize(lang, errors)
|
54
|
+
@lang = lang
|
55
|
+
@errors = errors
|
56
|
+
end
|
52
57
|
|
53
58
|
# Parse page.
|
54
|
-
# @param path [String]
|
55
|
-
# @param lang [String, nil]
|
59
|
+
# @param path [String] page path
|
60
|
+
# @param lang [String, nil] language
|
61
|
+
# @param errors [Hash] collection of parsing errors
|
56
62
|
# @return [RelatonIsoBib::IsoBibliographicItem]
|
57
|
-
def parse_page(path, lang
|
58
|
-
|
59
|
-
|
60
|
-
pubid = Pubid::Iso::Identifier.parse(id)
|
61
|
-
# Fetch edition.
|
62
|
-
edition = doc.at("//div[div[.='Edition']]/text()[last()]")&.text&.match(/\d+$/)&.to_s
|
63
|
-
pubid.root.edition ||= edition if pubid.base
|
63
|
+
def self.parse_page(path, lang: nil, errors: {})
|
64
|
+
new(lang, errors).parse(path)
|
65
|
+
end
|
64
66
|
|
65
|
-
|
67
|
+
def parse(path) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
68
|
+
@doc, url = get_page path
|
69
|
+
titles, abstract, langs = fetch_titles_abstract
|
66
70
|
|
67
71
|
RelatonIsoBib::IsoBibliographicItem.new(
|
68
|
-
docid: fetch_relaton_docids
|
69
|
-
docnumber: fetch_docnumber
|
72
|
+
docid: fetch_relaton_docids,
|
73
|
+
docnumber: fetch_docnumber,
|
70
74
|
edition: edition,
|
71
75
|
language: langs.map { |l| l[:lang] },
|
72
76
|
script: langs.map { |l| script(l[:lang]) }.uniq,
|
73
77
|
title: titles,
|
74
|
-
doctype: fetch_type
|
75
|
-
docstatus: fetch_status
|
76
|
-
ics: fetch_ics
|
77
|
-
date: fetch_dates
|
78
|
-
contributor: fetch_contributors
|
79
|
-
editorialgroup: fetch_workgroup
|
78
|
+
doctype: fetch_type,
|
79
|
+
docstatus: fetch_status,
|
80
|
+
ics: fetch_ics,
|
81
|
+
date: fetch_dates,
|
82
|
+
contributor: fetch_contributors,
|
83
|
+
editorialgroup: fetch_workgroup,
|
80
84
|
abstract: abstract,
|
81
|
-
copyright: fetch_copyright
|
82
|
-
link: fetch_link(
|
83
|
-
relation: fetch_relations
|
85
|
+
copyright: fetch_copyright,
|
86
|
+
link: fetch_link(url),
|
87
|
+
relation: fetch_relations,
|
84
88
|
place: ["Geneva"],
|
85
|
-
structuredidentifier: fetch_structuredidentifier
|
89
|
+
structuredidentifier: fetch_structuredidentifier,
|
86
90
|
)
|
87
91
|
end
|
88
92
|
|
93
|
+
def id
|
94
|
+
return @id if defined?(@id)
|
95
|
+
|
96
|
+
did = @doc.at("//h1/span[1]")
|
97
|
+
@errors[:id] &&= did.nil?
|
98
|
+
@id = did && did.text.split(" | ").first.strip
|
99
|
+
end
|
100
|
+
|
101
|
+
def pubid
|
102
|
+
return @pubid if @pubid
|
103
|
+
|
104
|
+
@pubid = Pubid::Iso::Identifier.parse(id)
|
105
|
+
@pubid.root.edition ||= edition if @pubid.base
|
106
|
+
@pubid
|
107
|
+
rescue StandardError => e
|
108
|
+
Util.error "Failed to parse pubid from #{id}: #{e.message}"
|
109
|
+
end
|
110
|
+
|
111
|
+
def edition
|
112
|
+
return @edition if defined?(@edition)
|
113
|
+
|
114
|
+
ed = @doc.at("//div[div[.='Edition']]/text()[last()]")
|
115
|
+
@errors[:edition] &&= ed.nil?
|
116
|
+
@edition = ed && ed.text.match(/\d+$/).to_s
|
117
|
+
end
|
118
|
+
|
89
119
|
#
|
90
120
|
# Create document ids.
|
91
121
|
#
|
92
|
-
# @param doc [Nokogiri::HTML::Document] document to parse
|
93
|
-
# @param pubid [Pubid::Iso::Identifier] publication identifier
|
94
|
-
#
|
95
122
|
# @return [Array<RelatonBib::DocumentIdentifier>]
|
96
123
|
#
|
97
|
-
def fetch_relaton_docids
|
98
|
-
pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code
|
124
|
+
def fetch_relaton_docids
|
125
|
+
pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code)
|
99
126
|
[
|
100
127
|
DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
|
101
|
-
RelatonBib::DocumentIdentifier.new(id: isoref
|
128
|
+
RelatonBib::DocumentIdentifier.new(id: isoref, type: "iso-reference"),
|
102
129
|
DocumentIdentifier.new(id: pubid, type: "URN"),
|
103
130
|
]
|
104
131
|
end
|
@@ -106,11 +133,9 @@ module RelatonIso
|
|
106
133
|
#
|
107
134
|
# Create ISO reference identifier with English language.
|
108
135
|
#
|
109
|
-
# @param [Pubid::Iso::Identifier] pubid publication identifier
|
110
|
-
#
|
111
136
|
# @return [String] English reference identifier
|
112
137
|
#
|
113
|
-
def isoref
|
138
|
+
def isoref
|
114
139
|
params = pubid.to_h.reject { |k, _| k == :typed_stage }
|
115
140
|
Pubid::Iso::Identifier.create(language: "en", **params).to_s(format: :ref_num_short)
|
116
141
|
end
|
@@ -118,20 +143,18 @@ module RelatonIso
|
|
118
143
|
private
|
119
144
|
|
120
145
|
# Fetch titles and abstracts.
|
121
|
-
# @param doc [Nokigiri::HTML::Document]
|
122
|
-
# @param lang [String, nil]
|
123
146
|
# @return [Array<Array>]
|
124
|
-
def fetch_titles_abstract
|
147
|
+
def fetch_titles_abstract # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
125
148
|
titles = RelatonBib::TypedTitleStringCollection.new
|
126
149
|
abstract = []
|
127
|
-
langs = languages
|
150
|
+
langs = languages.each_with_object([]) do |l, s|
|
128
151
|
# Don't need to get page for en. We already have it.
|
129
|
-
d = l[:path] ? get_page(l[:path])[0] : doc
|
152
|
+
d = l[:path] ? get_page(l[:path])[0] : @doc
|
130
153
|
unless d.at("//h5[@class='help-block'][.='недоступно на русском языке']")
|
131
154
|
s << l
|
132
155
|
titles += fetch_title(d, l[:lang])
|
133
156
|
|
134
|
-
abstr = parse_abstract(d, l)
|
157
|
+
abstr = parse_abstract(d, l[:lang])
|
135
158
|
abstract << abstr if abstr
|
136
159
|
end
|
137
160
|
end
|
@@ -142,23 +165,22 @@ module RelatonIso
|
|
142
165
|
abstract_content = doc.xpath(
|
143
166
|
"//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
|
144
167
|
).map { |a| a.name == "li" ? "- #{a.text}" : a.text }.reject(&:empty?).join("\n")
|
168
|
+
@errors[:abstract] &&= abstract_content.empty?
|
145
169
|
return if abstract_content.empty?
|
146
170
|
|
147
|
-
{ content: abstract_content, language: lang
|
148
|
-
script: script(lang[:lang]), format: "text/plain" }
|
171
|
+
{ content: abstract_content, language: lang, script: script(lang), format: "text/plain" }
|
149
172
|
end
|
150
173
|
|
151
174
|
# Returns available languages.
|
152
|
-
# @param doc [Nokogiri::HTML::Document]
|
153
|
-
# @param lang [String, nil]
|
154
175
|
# @return [Array<Hash>]
|
155
|
-
def languages
|
176
|
+
def languages
|
156
177
|
lgs = [{ lang: "en" }]
|
157
|
-
doc.css("li#lang-switcher ul li a").each do |lang_link|
|
178
|
+
@doc.css("li#lang-switcher ul li a").each do |lang_link|
|
158
179
|
lang_path = lang_link.attr("href")
|
159
180
|
l = lang_path.match(%r{^/(fr)/})
|
160
|
-
lgs << { lang: l[1], path: lang_path } if l && (
|
181
|
+
lgs << { lang: l[1], path: lang_path } if l && (!@lang || l[1] != @lang)
|
161
182
|
end
|
183
|
+
@errors[:language] &&= lgs.size == 1
|
162
184
|
lgs
|
163
185
|
end
|
164
186
|
|
@@ -239,7 +261,7 @@ module RelatonIso
|
|
239
261
|
10.times do
|
240
262
|
doc = Nokogiri::HTML(resp.body)
|
241
263
|
# stop trying if page has a document id
|
242
|
-
return doc if item_ref
|
264
|
+
return doc if item_ref(doc)
|
243
265
|
|
244
266
|
resp = Net::HTTP.get_response(uri)
|
245
267
|
end
|
@@ -249,22 +271,18 @@ module RelatonIso
|
|
249
271
|
#
|
250
272
|
# Generate docnumber.
|
251
273
|
#
|
252
|
-
# @param [Pubid::Iso] pubid
|
253
|
-
#
|
254
274
|
# @return [String] docnumber
|
255
275
|
#
|
256
|
-
def fetch_docnumber
|
276
|
+
def fetch_docnumber
|
257
277
|
pubid.to_s.match(/\d+/)&.to_s
|
258
278
|
end
|
259
279
|
|
260
280
|
#
|
261
281
|
# Parse structuredidentifier.
|
262
282
|
#
|
263
|
-
# @param pubid [Pubid::Iso::Identifier] pubid
|
264
|
-
#
|
265
283
|
# @return [RelatonBib::StructuredIdentifier] structured identifier
|
266
284
|
#
|
267
|
-
def fetch_structuredidentifier
|
285
|
+
def fetch_structuredidentifier # rubocop:disable Metrics/MethodLength
|
268
286
|
RelatonIsoBib::StructuredIdentifier.new(
|
269
287
|
project_number: "#{pubid.root.publisher} #{pubid.root.number}",
|
270
288
|
part: pubid.root.part&.to_s, # &.sub(/^-/, ""),
|
@@ -280,21 +298,24 @@ module RelatonIso
|
|
280
298
|
# @return [String, nil] ID
|
281
299
|
#
|
282
300
|
def item_ref(doc)
|
283
|
-
doc.at("//main//section/div/div/div//h1/span[1]")
|
301
|
+
ref = doc.at("//main//section/div/div/div//h1/span[1]")
|
302
|
+
@errors[:reference] &&= ref.nil?
|
303
|
+
ref&.text&.strip
|
284
304
|
end
|
285
305
|
|
286
306
|
# Fetch status.
|
287
|
-
# @
|
288
|
-
|
289
|
-
|
290
|
-
def fetch_status(doc)
|
291
|
-
stg, substg = stage_code(doc).split "."
|
307
|
+
# @return [RelatonBib::DocumentStatus]
|
308
|
+
def fetch_status
|
309
|
+
stg, substg = stage_code.split "."
|
292
310
|
RelatonBib::DocumentStatus.new(stage: stg, substage: substg)
|
293
311
|
end
|
294
312
|
|
295
|
-
def stage_code
|
296
|
-
|
297
|
-
|
313
|
+
def stage_code
|
314
|
+
return @stage_code if defined?(@stage_code)
|
315
|
+
|
316
|
+
stc = @doc.at("//ul[@class='dropdown-menu']/li[@class='active']/a/span[@class='stage-code']")
|
317
|
+
@errors[:stage] &&= stc.nil?
|
318
|
+
@stage_code = stc&.text
|
298
319
|
end
|
299
320
|
|
300
321
|
# def stage(stg, substg)
|
@@ -305,8 +326,9 @@ module RelatonIso
|
|
305
326
|
# Fetch workgroup.
|
306
327
|
# @param doc [Nokogiri::HTML::Document]
|
307
328
|
# @return [RelatonIsoBib::EditorialGroup, nil]
|
308
|
-
def fetch_workgroup
|
309
|
-
wg = doc.at("
|
329
|
+
def fetch_workgroup # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
|
330
|
+
wg = @doc.at("//div[contains(., 'Technical Committe')]/following-sibling::span/a")
|
331
|
+
@errors[:workgroup] &&= wg.nil?
|
310
332
|
return unless wg
|
311
333
|
|
312
334
|
workgroup = wg.text.split "/"
|
@@ -324,34 +346,34 @@ module RelatonIso
|
|
324
346
|
end
|
325
347
|
|
326
348
|
# Fetch relations.
|
327
|
-
# @param doc [Nokogiri::HTML::Document]
|
328
349
|
# @return [Array<Hash>]
|
329
|
-
def fetch_relations
|
350
|
+
def fetch_relations
|
330
351
|
types = ["Now", "Now under review"]
|
331
|
-
doc.xpath(
|
352
|
+
rels = @doc.xpath(
|
332
353
|
"//ul[@class='steps']/li", "//div[contains(@class, 'sub-step')]"
|
333
354
|
).reduce([]) do |a, r|
|
334
|
-
type, date = relation_type(r.at("h4", "h5").text.strip
|
355
|
+
type, date = relation_type(r.at("h4", "h5").text.strip)
|
335
356
|
next a if types.include?(type)
|
336
357
|
|
337
358
|
a + create_relations(r, type, date)
|
338
359
|
end
|
360
|
+
@errors[:relation] &&= rels.empty?
|
361
|
+
rels
|
339
362
|
end
|
340
363
|
|
341
364
|
#
|
342
365
|
# Parse relation type and dates.
|
343
366
|
#
|
344
367
|
# @param [String] type parsed type
|
345
|
-
# @param [Nokogiri::HTML::Document] doc document to parse
|
346
368
|
#
|
347
369
|
# @return [Array<String,Array>] type and dates
|
348
370
|
#
|
349
|
-
def relation_type(type
|
371
|
+
def relation_type(type)
|
350
372
|
date = []
|
351
373
|
t = case type.strip
|
352
374
|
when "Previously", "Will be replaced by" then "obsoletes"
|
353
375
|
when /Corrigenda|Amendments|Revised by|Now confirmed|replaced by/
|
354
|
-
on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
|
376
|
+
on = @doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
|
355
377
|
date << { type: "circulated", on: on.text } if on
|
356
378
|
"updates"
|
357
379
|
else type
|
@@ -371,9 +393,9 @@ module RelatonIso
|
|
371
393
|
# @return [Array<Hash>] Relations
|
372
394
|
#
|
373
395
|
def create_relations(rel, type, date)
|
374
|
-
rel.css("a").map do |
|
375
|
-
docid = DocumentIdentifier.new(type: "ISO", id:
|
376
|
-
fref = RelatonBib::FormattedRef.new(content:
|
396
|
+
rel.css("a").map do |rid|
|
397
|
+
docid = DocumentIdentifier.new(type: "ISO", id: rid.text, primary: true)
|
398
|
+
fref = RelatonBib::FormattedRef.new(content: rid.text, format: "text/plain")
|
377
399
|
bibitem = RelatonIsoBib::IsoBibliographicItem.new(
|
378
400
|
docid: [docid], formattedref: fref, date: date,
|
379
401
|
)
|
@@ -382,14 +404,13 @@ module RelatonIso
|
|
382
404
|
end
|
383
405
|
|
384
406
|
# Fetch type.
|
385
|
-
# @param ref [String]
|
386
407
|
# @return [String]
|
387
|
-
def fetch_type
|
408
|
+
def fetch_type
|
388
409
|
%r{
|
389
410
|
^(?<prefix>ISO|IWA|IEC)
|
390
|
-
(?:(?:/IEC|/IEEE|/PRF|/NP|/SAE|/HL7|/DGuide)*\s|/)
|
411
|
+
(?:(?:/CIE|/IEC|/IEEE|/PRF|/NP|/SAE|/HL7|/DGuide)*\s|/)
|
391
412
|
(?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|DTS|DTR|ISP|PWI|Guide|(?=\d+))
|
392
|
-
}x =~
|
413
|
+
}x =~ id
|
393
414
|
type = TYPES[type] || TYPES[prefix] || "international-standard"
|
394
415
|
RelatonIsoBib::DocumentType.new(type: type)
|
395
416
|
end
|
@@ -400,7 +421,7 @@ module RelatonIso
|
|
400
421
|
# @return [Array<RelatonBib::TypedTitleString>]
|
401
422
|
def fetch_title(doc, lang) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
|
402
423
|
types = %w[title-intro title-main title-part]
|
403
|
-
ttls =
|
424
|
+
ttls = parse_titles(doc)
|
404
425
|
title = RelatonBib::TypedTitleStringCollection.new
|
405
426
|
ttls.each.with_index do |p, i|
|
406
427
|
next unless p
|
@@ -413,9 +434,11 @@ module RelatonIso
|
|
413
434
|
title << RelatonBib::TypedTitleString.new(type: "main", content: main, language: lang, script: script(lang))
|
414
435
|
end
|
415
436
|
|
416
|
-
def
|
437
|
+
def parse_titles(doc)
|
417
438
|
# head = doc.at "//nav[contains(@class,'heading-condensed')]"
|
418
439
|
ttls = doc.xpath("//h1[@class='stdTitle']/span[position()>1]").map(&:text)
|
440
|
+
return ttls if @errors[:title] &&= ttls.empty?
|
441
|
+
|
419
442
|
ttls[0, 1] = ttls[0].split(/\s(?:-|\u2014)\s/) # if ttls.size == 1
|
420
443
|
case ttls.size
|
421
444
|
when 0, 1 then [nil, ttls.first, nil]
|
@@ -434,36 +457,42 @@ module RelatonIso
|
|
434
457
|
end
|
435
458
|
|
436
459
|
# Fetch dates
|
437
|
-
# @param doc [Nokogiri::HTML::Document]
|
438
|
-
# @param ref [String]
|
439
460
|
# @return [Array<Hash>]
|
440
|
-
def fetch_dates
|
461
|
+
def fetch_dates # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
441
462
|
dates = []
|
442
|
-
%r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~
|
443
|
-
pub_date_str = doc.at("//span[@itemprop='releaseDate']")
|
463
|
+
%r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ id
|
464
|
+
pub_date_str = @doc.at("//span[@itemprop='releaseDate']")
|
465
|
+
@errors[:date_pub] &&= pub_date_str.nil?
|
444
466
|
if ref_date_str
|
445
|
-
|
446
|
-
if pub_date_str.nil?
|
447
|
-
dates << { type: "published", on: ref_date_str }
|
448
|
-
else
|
449
|
-
pub_date = Date.strptime pub_date_str.text, "%Y"
|
450
|
-
if pub_date.year > ref_date.year
|
451
|
-
dates << { type: "published", on: ref_date_str }
|
452
|
-
dates << { type: "updated", on: pub_date_str.text }
|
453
|
-
else
|
454
|
-
dates << { type: "published", on: pub_date_str.text }
|
455
|
-
end
|
456
|
-
end
|
467
|
+
dates += parse_date_from_id ref_date_str, pub_date_str
|
457
468
|
elsif pub_date_str
|
458
469
|
dates << { type: "published", on: pub_date_str.text }
|
459
470
|
end
|
460
|
-
corr_data = doc.at "//span[@itemprop='dateModified']"
|
471
|
+
corr_data = @doc.at "//span[@itemprop='dateModified']"
|
472
|
+
@errors[:date_corr] &&= corr_data.nil?
|
461
473
|
dates << { type: "corrected", on: corr_data.text } if corr_data
|
462
474
|
dates
|
463
475
|
end
|
464
476
|
|
465
|
-
def
|
466
|
-
|
477
|
+
def parse_date_from_id(ref_date_str, pub_date_str) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
478
|
+
dates = []
|
479
|
+
ref_date = Date.strptime ref_date_str, "%Y"
|
480
|
+
if pub_date_str.nil?
|
481
|
+
dates << { type: "published", on: ref_date_str }
|
482
|
+
else
|
483
|
+
pub_date = Date.strptime pub_date_str.text, "%Y"
|
484
|
+
if pub_date.year > ref_date.year
|
485
|
+
dates << { type: "published", on: ref_date_str }
|
486
|
+
dates << { type: "updated", on: pub_date_str.text }
|
487
|
+
else
|
488
|
+
dates << { type: "published", on: pub_date_str.text }
|
489
|
+
end
|
490
|
+
end
|
491
|
+
dates
|
492
|
+
end
|
493
|
+
|
494
|
+
def fetch_contributors
|
495
|
+
id.sub(/\s.*/, "").split("/").reduce([]) do |mem, abbrev|
|
467
496
|
publisher = PUBLISHERS[abbrev]
|
468
497
|
next mem unless publisher
|
469
498
|
|
@@ -473,44 +502,46 @@ module RelatonIso
|
|
473
502
|
end
|
474
503
|
|
475
504
|
# Fetch ICS.
|
476
|
-
# @param doc [Nokogiri::HTML::Document]
|
477
505
|
# @return [Array<Hash>]
|
478
|
-
def fetch_ics
|
479
|
-
doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
|
506
|
+
def fetch_ics
|
507
|
+
ics = @doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
|
480
508
|
code = i.text.match(/[\d.]+/).to_s.split "."
|
481
509
|
{ field: code[0], group: code[1], subgroup: code[2] }
|
482
510
|
end
|
511
|
+
@errors[:ics] &&= ics.empty?
|
512
|
+
ics
|
483
513
|
end
|
484
514
|
|
485
515
|
#
|
486
516
|
# Fetch links.
|
487
517
|
#
|
488
|
-
# @param doc [Nokogiri::HTML::Document] document to parse
|
489
518
|
# @param url [String] document url
|
490
519
|
#
|
491
520
|
# @return [Array<Hash>]
|
492
521
|
#
|
493
|
-
def fetch_link(
|
522
|
+
def fetch_link(url) # rubocop:disable Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/MethodLength
|
494
523
|
links = [{ type: "src", content: url }]
|
495
|
-
obp = doc.at("//a[.='Read sample']")
|
524
|
+
obp = @doc.at("//a[.='Read sample']")
|
525
|
+
@errors[:link_obp] &&= obp.nil?
|
496
526
|
links << { type: "obp", content: obp[:href] } if obp
|
497
|
-
rss = doc.at("//a[contains(@href, 'rss')]")
|
527
|
+
rss = @doc.at("//a[contains(@href, 'rss')]")
|
528
|
+
@errors[:link_rss] &&= rss.nil?
|
498
529
|
links << { type: "rss", content: DOMAIN + rss[:href] } if rss
|
499
|
-
pub = doc.at
|
530
|
+
pub = @doc.at "//p[contains(., 'publicly available')]/a",
|
500
531
|
"//p[contains(., 'can be downloaded from the')]/a"
|
532
|
+
@errors[:link_pub] &&= pub.nil?
|
501
533
|
links << { type: "pub", content: pub[:href] } if pub
|
502
534
|
links
|
503
535
|
end
|
504
536
|
|
505
537
|
# Fetch copyright.
|
506
|
-
# @param doc [Nokogiri::HTML::Document]
|
507
538
|
# @return [Array<Hash>]
|
508
|
-
def fetch_copyright
|
509
|
-
ref = item_ref doc
|
539
|
+
def fetch_copyright # rubocop:disable Metrics/MethodLength
|
540
|
+
ref = item_ref @doc
|
510
541
|
owner_name = ref.match(/.*?(?=\s)/).to_s
|
511
542
|
from = ref.match(/(?<=:)\d{4}/).to_s
|
512
543
|
if from.empty?
|
513
|
-
date = doc.at(
|
544
|
+
date = @doc.at(
|
514
545
|
"//span[@itemprop='releaseDate']",
|
515
546
|
"//ul[@id='stages']/li[contains(@class,'active')]/ul/li[@class='active']/a/span[@class='stage-date']",
|
516
547
|
)
|
data/lib/relaton_iso/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-iso
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.19.
|
4
|
+
version: 1.19.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-10-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: pubid
|