relaton-iso 1.19.0 → 1.19.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/bundle +1 -1
- data/lib/relaton_iso/data_fetcher.rb +74 -26
- data/lib/relaton_iso/iso_bibliography.rb +1 -1
- data/lib/relaton_iso/scrapper.rb +143 -112
- data/lib/relaton_iso/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ae7999f9e96553504fb75338485ad672b1cb176e15860c39b35913b0ed525852
|
4
|
+
data.tar.gz: 4bf9ec438dd3aebb4b81707b53c7141a466d1f8c504a88c0d6b5d60f46ea0534
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 05fab25ad1c760bde99b95b230e6616aa6205510cb77454a7c167aeb7501e967c1909ec4afe583e259312802d8172f5099937128bc7d34c8e4901cb7f48c3181
|
7
|
+
data.tar.gz: e935ceff8ab264b3f4ca6383874c427733633bd744e3a5318cb079f47bf67f580f8957ee5cea36c3a1827b064f1f7022061b13d9e4f60182200180eb50abcbe9
|
data/bin/bundle
CHANGED
@@ -27,7 +27,7 @@ m = Module.new do
|
|
27
27
|
bundler_version = nil
|
28
28
|
update_index = nil
|
29
29
|
ARGV.each_with_index do |a, i|
|
30
|
-
if update_index && update_index.succ == i && a
|
30
|
+
if update_index && update_index.succ == i && a.match?(Gem::Version::ANCHORED_VERSION_PATTERN)
|
31
31
|
bundler_version = a
|
32
32
|
end
|
33
33
|
next unless a =~ /\A--bundler(?:[= ](#{Gem::Version::VERSION_PATTERN}))?\z/
|
data/lib/relaton_iso/data_fetcher.rb
CHANGED
@@ -7,13 +7,16 @@ module RelatonIso
|
|
7
7
|
# @param [String] output output directory
|
8
8
|
# @param [String] format format of output files (yaml, bibxml, xml)
|
9
9
|
#
|
10
|
-
def initialize(output, format)
|
10
|
+
def initialize(output, format) # rubocop:disable Metrics/AbcSize
|
11
11
|
@output = output
|
12
12
|
@format = format
|
13
13
|
@ext = format.sub(/^bib/, "")
|
14
|
-
@files =
|
14
|
+
@files = Set.new
|
15
15
|
@queue = ::Queue.new
|
16
16
|
@mutex = Mutex.new
|
17
|
+
@gh_issue = Relaton::Logger::Channels::GhIssue.new "relaton/relaton-iso", "Error fetching ISO documents"
|
18
|
+
Relaton.logger_pool[:gh_issue] = Relaton::Logger::Log.new(@gh_issue, levels: [:error])
|
19
|
+
@errors = Hash.new(true)
|
17
20
|
end
|
18
21
|
|
19
22
|
def index
|
@@ -34,12 +37,12 @@ module RelatonIso
|
|
34
37
|
#
|
35
38
|
def self.fetch(output: "data", format: "yaml")
|
36
39
|
t1 = Time.now
|
37
|
-
|
40
|
+
Util.info "Started at: #{t1}"
|
38
41
|
FileUtils.mkdir_p output
|
39
42
|
new(output, format).fetch
|
40
43
|
t2 = Time.now
|
41
|
-
|
42
|
-
|
44
|
+
Util.info "Stopped at: #{t2}"
|
45
|
+
Util.info "Done in: #{(t2 - t1).round} sec."
|
43
46
|
end
|
44
47
|
|
45
48
|
#
|
@@ -48,13 +51,21 @@ module RelatonIso
|
|
48
51
|
# @return [void]
|
49
52
|
#
|
50
53
|
def fetch # rubocop:disable Metrics/AbcSize
|
51
|
-
|
54
|
+
Util.info "Scrapping ICS pages..."
|
52
55
|
fetch_ics
|
53
|
-
|
56
|
+
Util.info "(#{Time.now}) Scrapping documents..."
|
54
57
|
fetch_docs
|
55
58
|
iso_queue.save
|
56
59
|
# index.sort! { |a, b| compare_docids a, b }
|
57
60
|
index.save
|
61
|
+
repot_errors
|
62
|
+
end
|
63
|
+
|
64
|
+
def repot_errors
|
65
|
+
@errors.select { |_, v| v }.each_key do |k|
|
66
|
+
Util.error "Failed to fetch #{k}"
|
67
|
+
end
|
68
|
+
@gh_issue.create_issue
|
58
69
|
end
|
59
70
|
|
60
71
|
#
|
@@ -72,14 +83,30 @@ module RelatonIso
|
|
72
83
|
|
73
84
|
def fetch_ics_page(path)
|
74
85
|
resp = get_redirection path
|
75
|
-
|
76
|
-
|
77
|
-
|
86
|
+
unless resp
|
87
|
+
Util.error "Failed fetching ICS page #{url(path)}"
|
88
|
+
return
|
78
89
|
end
|
79
90
|
|
80
|
-
page.
|
81
|
-
|
82
|
-
|
91
|
+
page = Nokogiri::HTML(resp.body)
|
92
|
+
parse_doc_links page
|
93
|
+
parse_ics_links page
|
94
|
+
end
|
95
|
+
|
96
|
+
def parse_doc_links(page)
|
97
|
+
doc_links = page.xpath "//td[@data-title='Standard and/or project']/div/div/a"
|
98
|
+
@errors[:doc_links] &&= doc_links.empty?
|
99
|
+
doc_links.each { |item| iso_queue.add_first item[:href].split("?").first }
|
100
|
+
end
|
101
|
+
|
102
|
+
def parse_ics_links(page)
|
103
|
+
ics_links = page.xpath("//td[@data-title='ICS']/a")
|
104
|
+
@errors[:ics_links] &&= ics_links.empty?
|
105
|
+
ics_links.each { |item| @queue << item[:href] }
|
106
|
+
end
|
107
|
+
|
108
|
+
def url(path)
|
109
|
+
Scrapper::DOMAIN + path
|
83
110
|
end
|
84
111
|
|
85
112
|
#
|
@@ -88,18 +115,18 @@ module RelatonIso
|
|
88
115
|
#
|
89
116
|
# @param [String] path path to the page
|
90
117
|
#
|
91
|
-
# @return [Net::HTTPOK] HTTP response
|
118
|
+
# @return [Net::HTTPOK, nil] HTTP response
|
92
119
|
#
|
93
120
|
def get_redirection(path) # rubocop:disable Metrics/MethodLength
|
94
121
|
try = 0
|
95
|
-
uri = URI(
|
122
|
+
uri = URI url(path)
|
96
123
|
begin
|
97
124
|
get_response uri
|
98
125
|
rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNREFUSED => e
|
99
126
|
try += 1
|
100
127
|
retry if check_try try, uri
|
101
128
|
|
102
|
-
Util.
|
129
|
+
Util.warn "Failed fetching #{uri}, #{e.message}"
|
103
130
|
end
|
104
131
|
end
|
105
132
|
|
@@ -131,13 +158,10 @@ module RelatonIso
|
|
131
158
|
# @return [void]
|
132
159
|
#
|
133
160
|
def fetch_doc(docpath)
|
134
|
-
|
135
|
-
# hit = Hit.new({ path: docpath }, nil)
|
136
|
-
doc = Scrapper.parse_page docpath
|
161
|
+
doc = Scrapper.parse_page docpath, errors: @errors
|
137
162
|
@mutex.synchronize { save_doc doc, docpath }
|
138
163
|
rescue StandardError => e
|
139
|
-
Util.
|
140
|
-
"#{e.message}\n#{e.backtrace}"
|
164
|
+
Util.warn "Fail fetching document: #{url(docpath)}\n#{e.message}\n#{e.backtrace}"
|
141
165
|
end
|
142
166
|
|
143
167
|
# def compare_docids(id1, id2)
|
@@ -155,16 +179,40 @@ module RelatonIso
|
|
155
179
|
docid = doc.docidentifier.detect(&:primary)
|
156
180
|
file_name = docid.id.gsub(/[\s\/:]+/, "-").downcase
|
157
181
|
file = File.join @output, "#{file_name}.#{@ext}"
|
158
|
-
if
|
159
|
-
|
182
|
+
if File.exist?(file)
|
183
|
+
rewrite_with_same_or_newer doc, docid, file, docpath
|
160
184
|
else
|
161
|
-
|
162
|
-
index.add_or_update docid.to_h, file
|
163
|
-
File.write file, serialize(doc), encoding: "UTF-8"
|
185
|
+
write_file file, doc, docid
|
164
186
|
end
|
165
187
|
iso_queue.move_last docpath
|
166
188
|
end
|
167
189
|
|
190
|
+
def rewrite_with_same_or_newer(doc, docid, file, docpath)
|
191
|
+
hash = YAML.load_file file
|
192
|
+
item_hash = HashConverter.hash_to_bib hash
|
193
|
+
bib = ::RelatonIsoBib::IsoBibliographicItem.new(**item_hash)
|
194
|
+
if edition_greater?(doc, bib) || replace_substage98?(doc, bib)
|
195
|
+
write_file file, doc, docid
|
196
|
+
elsif @files.include?(file) && !edition_greater?(bib, doc)
|
197
|
+
Util.warn "Duplicate file `#{file}` for `#{docid.id}` from #{url(docpath)}"
|
198
|
+
end
|
199
|
+
end
|
200
|
+
|
201
|
+
def edition_greater?(doc, bib)
|
202
|
+
doc.edition && bib.edition && doc.edition.content.to_i > bib.edition.content.to_i
|
203
|
+
end
|
204
|
+
|
205
|
+
def replace_substage98?(doc, bib) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
|
206
|
+
doc.edition&.content == bib.edition&.content &&
|
207
|
+
(doc.status&.substage&.value != "98" || bib.status&.substage&.value == "98")
|
208
|
+
end
|
209
|
+
|
210
|
+
def write_file(file, doc, docid)
|
211
|
+
@files << file
|
212
|
+
index.add_or_update docid.to_h, file
|
213
|
+
File.write file, serialize(doc), encoding: "UTF-8"
|
214
|
+
end
|
215
|
+
|
168
216
|
#
|
169
217
|
# Serialize document to string.
|
170
218
|
#
|
data/lib/relaton_iso/scrapper.rb
CHANGED
@@ -2,7 +2,7 @@
|
|
2
2
|
|
3
3
|
module RelatonIso
|
4
4
|
# Scrapper.
|
5
|
-
|
5
|
+
class Scrapper # rubocop:disable Metrics/ModuleLength
|
6
6
|
DOMAIN = "https://www.iso.org"
|
7
7
|
|
8
8
|
TYPES = {
|
@@ -48,57 +48,84 @@ module RelatonIso
|
|
48
48
|
url: "www.asme.org" },
|
49
49
|
}.freeze
|
50
50
|
|
51
|
-
extend self
|
51
|
+
# extend self
|
52
|
+
|
53
|
+
def initialize(lang, errors)
|
54
|
+
@lang = lang
|
55
|
+
@errors = errors
|
56
|
+
end
|
52
57
|
|
53
58
|
# Parse page.
|
54
|
-
# @param path [String]
|
55
|
-
# @param lang [String, nil]
|
59
|
+
# @param path [String] page path
|
60
|
+
# @param lang [String, nil] language
|
61
|
+
# @param errors [Hash] collection of parsing errors
|
56
62
|
# @return [RelatonIsoBib::IsoBibliographicItem]
|
57
|
-
def parse_page(path, lang
|
58
|
-
|
59
|
-
|
60
|
-
pubid = Pubid::Iso::Identifier.parse(id)
|
61
|
-
# Fetch edition.
|
62
|
-
edition = doc.at("//div[div[.='Edition']]/text()[last()]")&.text&.match(/\d+$/)&.to_s
|
63
|
-
pubid.root.edition ||= edition if pubid.base
|
63
|
+
def self.parse_page(path, lang: nil, errors: {})
|
64
|
+
new(lang, errors).parse(path)
|
65
|
+
end
|
64
66
|
|
65
|
-
|
67
|
+
def parse(path) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
68
|
+
@doc, url = get_page path
|
69
|
+
titles, abstract, langs = fetch_titles_abstract
|
66
70
|
|
67
71
|
RelatonIsoBib::IsoBibliographicItem.new(
|
68
|
-
docid: fetch_relaton_docids
|
69
|
-
docnumber: fetch_docnumber
|
72
|
+
docid: fetch_relaton_docids,
|
73
|
+
docnumber: fetch_docnumber,
|
70
74
|
edition: edition,
|
71
75
|
language: langs.map { |l| l[:lang] },
|
72
76
|
script: langs.map { |l| script(l[:lang]) }.uniq,
|
73
77
|
title: titles,
|
74
|
-
doctype: fetch_type
|
75
|
-
docstatus: fetch_status
|
76
|
-
ics: fetch_ics
|
77
|
-
date: fetch_dates
|
78
|
-
contributor: fetch_contributors
|
79
|
-
editorialgroup: fetch_workgroup
|
78
|
+
doctype: fetch_type,
|
79
|
+
docstatus: fetch_status,
|
80
|
+
ics: fetch_ics,
|
81
|
+
date: fetch_dates,
|
82
|
+
contributor: fetch_contributors,
|
83
|
+
editorialgroup: fetch_workgroup,
|
80
84
|
abstract: abstract,
|
81
|
-
copyright: fetch_copyright
|
82
|
-
link: fetch_link(
|
83
|
-
relation: fetch_relations
|
85
|
+
copyright: fetch_copyright,
|
86
|
+
link: fetch_link(url),
|
87
|
+
relation: fetch_relations,
|
84
88
|
place: ["Geneva"],
|
85
|
-
structuredidentifier: fetch_structuredidentifier
|
89
|
+
structuredidentifier: fetch_structuredidentifier,
|
86
90
|
)
|
87
91
|
end
|
88
92
|
|
93
|
+
def id
|
94
|
+
return @id if defined?(@id)
|
95
|
+
|
96
|
+
did = @doc.at("//h1/span[1]")
|
97
|
+
@errors[:id] &&= did.nil?
|
98
|
+
@id = did && did.text.split(" | ").first.strip
|
99
|
+
end
|
100
|
+
|
101
|
+
def pubid
|
102
|
+
return @pubid if @pubid
|
103
|
+
|
104
|
+
@pubid = Pubid::Iso::Identifier.parse(id)
|
105
|
+
@pubid.root.edition ||= edition if @pubid.base
|
106
|
+
@pubid
|
107
|
+
rescue StandardError => e
|
108
|
+
Util.error "Failed to parse pubid from #{id}: #{e.message}"
|
109
|
+
end
|
110
|
+
|
111
|
+
def edition
|
112
|
+
return @edition if defined?(@edition)
|
113
|
+
|
114
|
+
ed = @doc.at("//div[div[.='Edition']]/text()[last()]")
|
115
|
+
@errors[:edition] &&= ed.nil?
|
116
|
+
@edition = ed && ed.text.match(/\d+$/).to_s
|
117
|
+
end
|
118
|
+
|
89
119
|
#
|
90
120
|
# Create document ids.
|
91
121
|
#
|
92
|
-
# @param doc [Nokogiri::HTML::Document] document to parse
|
93
|
-
# @param pubid [Pubid::Iso::Identifier] publication identifier
|
94
|
-
#
|
95
122
|
# @return [Array<RelatonBib::DocumentIdentifier>]
|
96
123
|
#
|
97
|
-
def fetch_relaton_docids
|
98
|
-
pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code
|
124
|
+
def fetch_relaton_docids
|
125
|
+
pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code)
|
99
126
|
[
|
100
127
|
DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
|
101
|
-
RelatonBib::DocumentIdentifier.new(id: isoref
|
128
|
+
RelatonBib::DocumentIdentifier.new(id: isoref, type: "iso-reference"),
|
102
129
|
DocumentIdentifier.new(id: pubid, type: "URN"),
|
103
130
|
]
|
104
131
|
end
|
@@ -106,11 +133,9 @@ module RelatonIso
|
|
106
133
|
#
|
107
134
|
# Create ISO reference identifier with English language.
|
108
135
|
#
|
109
|
-
# @param [Pubid::Iso::Identifier] pubid publication identifier
|
110
|
-
#
|
111
136
|
# @return [String] English reference identifier
|
112
137
|
#
|
113
|
-
def isoref
|
138
|
+
def isoref
|
114
139
|
params = pubid.to_h.reject { |k, _| k == :typed_stage }
|
115
140
|
Pubid::Iso::Identifier.create(language: "en", **params).to_s(format: :ref_num_short)
|
116
141
|
end
|
@@ -118,20 +143,18 @@ module RelatonIso
|
|
118
143
|
private
|
119
144
|
|
120
145
|
# Fetch titles and abstracts.
|
121
|
-
# @param doc [Nokigiri::HTML::Document]
|
122
|
-
# @param lang [String, nil]
|
123
146
|
# @return [Array<Array>]
|
124
|
-
def fetch_titles_abstract
|
147
|
+
def fetch_titles_abstract # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
125
148
|
titles = RelatonBib::TypedTitleStringCollection.new
|
126
149
|
abstract = []
|
127
|
-
langs = languages
|
150
|
+
langs = languages.each_with_object([]) do |l, s|
|
128
151
|
# Don't need to get page for en. We already have it.
|
129
|
-
d = l[:path] ? get_page(l[:path])[0] : doc
|
152
|
+
d = l[:path] ? get_page(l[:path])[0] : @doc
|
130
153
|
unless d.at("//h5[@class='help-block'][.='недоступно на русском языке']")
|
131
154
|
s << l
|
132
155
|
titles += fetch_title(d, l[:lang])
|
133
156
|
|
134
|
-
abstr = parse_abstract(d, l)
|
157
|
+
abstr = parse_abstract(d, l[:lang])
|
135
158
|
abstract << abstr if abstr
|
136
159
|
end
|
137
160
|
end
|
@@ -142,23 +165,22 @@ module RelatonIso
|
|
142
165
|
abstract_content = doc.xpath(
|
143
166
|
"//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
|
144
167
|
).map { |a| a.name == "li" ? "- #{a.text}" : a.text }.reject(&:empty?).join("\n")
|
168
|
+
@errors[:abstract] &&= abstract_content.empty?
|
145
169
|
return if abstract_content.empty?
|
146
170
|
|
147
|
-
{ content: abstract_content, language: lang
|
148
|
-
script: script(lang[:lang]), format: "text/plain" }
|
171
|
+
{ content: abstract_content, language: lang, script: script(lang), format: "text/plain" }
|
149
172
|
end
|
150
173
|
|
151
174
|
# Returns available languages.
|
152
|
-
# @param doc [Nokogiri::HTML::Document]
|
153
|
-
# @param lang [String, nil]
|
154
175
|
# @return [Array<Hash>]
|
155
|
-
def languages
|
176
|
+
def languages
|
156
177
|
lgs = [{ lang: "en" }]
|
157
|
-
doc.css("li#lang-switcher ul li a").each do |lang_link|
|
178
|
+
@doc.css("li#lang-switcher ul li a").each do |lang_link|
|
158
179
|
lang_path = lang_link.attr("href")
|
159
180
|
l = lang_path.match(%r{^/(fr)/})
|
160
|
-
lgs << { lang: l[1], path: lang_path } if l && (
|
181
|
+
lgs << { lang: l[1], path: lang_path } if l && (!@lang || l[1] != @lang)
|
161
182
|
end
|
183
|
+
@errors[:language] &&= lgs.size == 1
|
162
184
|
lgs
|
163
185
|
end
|
164
186
|
|
@@ -239,7 +261,7 @@ module RelatonIso
|
|
239
261
|
10.times do
|
240
262
|
doc = Nokogiri::HTML(resp.body)
|
241
263
|
# stop trying if page has a document id
|
242
|
-
return doc if item_ref
|
264
|
+
return doc if item_ref(doc)
|
243
265
|
|
244
266
|
resp = Net::HTTP.get_response(uri)
|
245
267
|
end
|
@@ -249,22 +271,18 @@ module RelatonIso
|
|
249
271
|
#
|
250
272
|
# Generate docnumber.
|
251
273
|
#
|
252
|
-
# @param [Pubid::Iso] pubid
|
253
|
-
#
|
254
274
|
# @return [String] docnumber
|
255
275
|
#
|
256
|
-
def fetch_docnumber
|
276
|
+
def fetch_docnumber
|
257
277
|
pubid.to_s.match(/\d+/)&.to_s
|
258
278
|
end
|
259
279
|
|
260
280
|
#
|
261
281
|
# Parse structuredidentifier.
|
262
282
|
#
|
263
|
-
# @param pubid [Pubid::Iso::Identifier] pubid
|
264
|
-
#
|
265
283
|
# @return [RelatonBib::StructuredIdentifier] structured identifier
|
266
284
|
#
|
267
|
-
def fetch_structuredidentifier
|
285
|
+
def fetch_structuredidentifier # rubocop:disable Metrics/MethodLength
|
268
286
|
RelatonIsoBib::StructuredIdentifier.new(
|
269
287
|
project_number: "#{pubid.root.publisher} #{pubid.root.number}",
|
270
288
|
part: pubid.root.part&.to_s, # &.sub(/^-/, ""),
|
@@ -280,21 +298,24 @@ module RelatonIso
|
|
280
298
|
# @return [String, nil] ID
|
281
299
|
#
|
282
300
|
def item_ref(doc)
|
283
|
-
doc.at("//main//section/div/div/div//h1/span[1]")
|
301
|
+
ref = doc.at("//main//section/div/div/div//h1/span[1]")
|
302
|
+
@errors[:reference] &&= ref.nil?
|
303
|
+
ref&.text&.strip
|
284
304
|
end
|
285
305
|
|
286
306
|
# Fetch status.
|
287
|
-
# @
|
288
|
-
|
289
|
-
|
290
|
-
def fetch_status(doc)
|
291
|
-
stg, substg = stage_code(doc).split "."
|
307
|
+
# @return [RelatonBib::DocumentStatus]
|
308
|
+
def fetch_status
|
309
|
+
stg, substg = stage_code.split "."
|
292
310
|
RelatonBib::DocumentStatus.new(stage: stg, substage: substg)
|
293
311
|
end
|
294
312
|
|
295
|
-
def stage_code
|
296
|
-
|
297
|
-
|
313
|
+
def stage_code
|
314
|
+
return @stage_code if defined?(@stage_code)
|
315
|
+
|
316
|
+
stc = @doc.at("//ul[@class='dropdown-menu']/li[@class='active']/a/span[@class='stage-code']")
|
317
|
+
@errors[:stage] &&= stc.nil?
|
318
|
+
@stage_code = stc&.text
|
298
319
|
end
|
299
320
|
|
300
321
|
# def stage(stg, substg)
|
@@ -305,8 +326,9 @@ module RelatonIso
|
|
305
326
|
# Fetch workgroup.
|
306
327
|
# @param doc [Nokogiri::HTML::Document]
|
307
328
|
# @return [RelatonIsoBib::EditorialGroup, nil]
|
308
|
-
def fetch_workgroup
|
309
|
-
wg = doc.at("
|
329
|
+
def fetch_workgroup # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
|
330
|
+
wg = @doc.at("//div[contains(., 'Technical Committe')]/following-sibling::span/a")
|
331
|
+
@errors[:workgroup] &&= wg.nil?
|
310
332
|
return unless wg
|
311
333
|
|
312
334
|
workgroup = wg.text.split "/"
|
@@ -324,34 +346,34 @@ module RelatonIso
|
|
324
346
|
end
|
325
347
|
|
326
348
|
# Fetch relations.
|
327
|
-
# @param doc [Nokogiri::HTML::Document]
|
328
349
|
# @return [Array<Hash>]
|
329
|
-
def fetch_relations
|
350
|
+
def fetch_relations
|
330
351
|
types = ["Now", "Now under review"]
|
331
|
-
doc.xpath(
|
352
|
+
rels = @doc.xpath(
|
332
353
|
"//ul[@class='steps']/li", "//div[contains(@class, 'sub-step')]"
|
333
354
|
).reduce([]) do |a, r|
|
334
|
-
type, date = relation_type(r.at("h4", "h5").text.strip
|
355
|
+
type, date = relation_type(r.at("h4", "h5").text.strip)
|
335
356
|
next a if types.include?(type)
|
336
357
|
|
337
358
|
a + create_relations(r, type, date)
|
338
359
|
end
|
360
|
+
@errors[:relation] &&= rels.empty?
|
361
|
+
rels
|
339
362
|
end
|
340
363
|
|
341
364
|
#
|
342
365
|
# Parse relation type and dates.
|
343
366
|
#
|
344
367
|
# @param [String] type parsed type
|
345
|
-
# @param [Nokogiri::HTML::Document] doc document to parse
|
346
368
|
#
|
347
369
|
# @return [Array<String,Array>] type and dates
|
348
370
|
#
|
349
|
-
def relation_type(type
|
371
|
+
def relation_type(type)
|
350
372
|
date = []
|
351
373
|
t = case type.strip
|
352
374
|
when "Previously", "Will be replaced by" then "obsoletes"
|
353
375
|
when /Corrigenda|Amendments|Revised by|Now confirmed|replaced by/
|
354
|
-
on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
|
376
|
+
on = @doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
|
355
377
|
date << { type: "circulated", on: on.text } if on
|
356
378
|
"updates"
|
357
379
|
else type
|
@@ -371,9 +393,9 @@ module RelatonIso
|
|
371
393
|
# @return [Array<Hash>] Relations
|
372
394
|
#
|
373
395
|
def create_relations(rel, type, date)
|
374
|
-
rel.css("a").map do |
|
375
|
-
docid = DocumentIdentifier.new(type: "ISO", id:
|
376
|
-
fref = RelatonBib::FormattedRef.new(content:
|
396
|
+
rel.css("a").map do |rid|
|
397
|
+
docid = DocumentIdentifier.new(type: "ISO", id: rid.text, primary: true)
|
398
|
+
fref = RelatonBib::FormattedRef.new(content: rid.text, format: "text/plain")
|
377
399
|
bibitem = RelatonIsoBib::IsoBibliographicItem.new(
|
378
400
|
docid: [docid], formattedref: fref, date: date,
|
379
401
|
)
|
@@ -382,14 +404,13 @@ module RelatonIso
|
|
382
404
|
end
|
383
405
|
|
384
406
|
# Fetch type.
|
385
|
-
# @param ref [String]
|
386
407
|
# @return [String]
|
387
|
-
def fetch_type
|
408
|
+
def fetch_type
|
388
409
|
%r{
|
389
410
|
^(?<prefix>ISO|IWA|IEC)
|
390
|
-
(?:(?:/IEC|/IEEE|/PRF|/NP|/SAE|/HL7|/DGuide)*\s|/)
|
411
|
+
(?:(?:/CIE|/IEC|/IEEE|/PRF|/NP|/SAE|/HL7|/DGuide)*\s|/)
|
391
412
|
(?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|DTS|DTR|ISP|PWI|Guide|(?=\d+))
|
392
|
-
}x =~
|
413
|
+
}x =~ id
|
393
414
|
type = TYPES[type] || TYPES[prefix] || "international-standard"
|
394
415
|
RelatonIsoBib::DocumentType.new(type: type)
|
395
416
|
end
|
@@ -400,7 +421,7 @@ module RelatonIso
|
|
400
421
|
# @return [Array<RelatonBib::TypedTitleString>]
|
401
422
|
def fetch_title(doc, lang) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
|
402
423
|
types = %w[title-intro title-main title-part]
|
403
|
-
ttls =
|
424
|
+
ttls = parse_titles(doc)
|
404
425
|
title = RelatonBib::TypedTitleStringCollection.new
|
405
426
|
ttls.each.with_index do |p, i|
|
406
427
|
next unless p
|
@@ -413,9 +434,11 @@ module RelatonIso
|
|
413
434
|
title << RelatonBib::TypedTitleString.new(type: "main", content: main, language: lang, script: script(lang))
|
414
435
|
end
|
415
436
|
|
416
|
-
def
|
437
|
+
def parse_titles(doc)
|
417
438
|
# head = doc.at "//nav[contains(@class,'heading-condensed')]"
|
418
439
|
ttls = doc.xpath("//h1[@class='stdTitle']/span[position()>1]").map(&:text)
|
440
|
+
return ttls if @errors[:title] &&= ttls.empty?
|
441
|
+
|
419
442
|
ttls[0, 1] = ttls[0].split(/\s(?:-|\u2014)\s/) # if ttls.size == 1
|
420
443
|
case ttls.size
|
421
444
|
when 0, 1 then [nil, ttls.first, nil]
|
@@ -434,36 +457,42 @@ module RelatonIso
|
|
434
457
|
end
|
435
458
|
|
436
459
|
# Fetch dates
|
437
|
-
# @param doc [Nokogiri::HTML::Document]
|
438
|
-
# @param ref [String]
|
439
460
|
# @return [Array<Hash>]
|
440
|
-
def fetch_dates
|
461
|
+
def fetch_dates # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
441
462
|
dates = []
|
442
|
-
%r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~
|
443
|
-
pub_date_str = doc.at("//span[@itemprop='releaseDate']")
|
463
|
+
%r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ id
|
464
|
+
pub_date_str = @doc.at("//span[@itemprop='releaseDate']")
|
465
|
+
@errors[:date_pub] &&= pub_date_str.nil?
|
444
466
|
if ref_date_str
|
445
|
-
|
446
|
-
if pub_date_str.nil?
|
447
|
-
dates << { type: "published", on: ref_date_str }
|
448
|
-
else
|
449
|
-
pub_date = Date.strptime pub_date_str.text, "%Y"
|
450
|
-
if pub_date.year > ref_date.year
|
451
|
-
dates << { type: "published", on: ref_date_str }
|
452
|
-
dates << { type: "updated", on: pub_date_str.text }
|
453
|
-
else
|
454
|
-
dates << { type: "published", on: pub_date_str.text }
|
455
|
-
end
|
456
|
-
end
|
467
|
+
dates += parse_date_from_id ref_date_str, pub_date_str
|
457
468
|
elsif pub_date_str
|
458
469
|
dates << { type: "published", on: pub_date_str.text }
|
459
470
|
end
|
460
|
-
corr_data = doc.at "//span[@itemprop='dateModified']"
|
471
|
+
corr_data = @doc.at "//span[@itemprop='dateModified']"
|
472
|
+
@errors[:date_corr] &&= corr_data.nil?
|
461
473
|
dates << { type: "corrected", on: corr_data.text } if corr_data
|
462
474
|
dates
|
463
475
|
end
|
464
476
|
|
465
|
-
def
|
466
|
-
|
477
|
+
def parse_date_from_id(ref_date_str, pub_date_str) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
|
478
|
+
dates = []
|
479
|
+
ref_date = Date.strptime ref_date_str, "%Y"
|
480
|
+
if pub_date_str.nil?
|
481
|
+
dates << { type: "published", on: ref_date_str }
|
482
|
+
else
|
483
|
+
pub_date = Date.strptime pub_date_str.text, "%Y"
|
484
|
+
if pub_date.year > ref_date.year
|
485
|
+
dates << { type: "published", on: ref_date_str }
|
486
|
+
dates << { type: "updated", on: pub_date_str.text }
|
487
|
+
else
|
488
|
+
dates << { type: "published", on: pub_date_str.text }
|
489
|
+
end
|
490
|
+
end
|
491
|
+
dates
|
492
|
+
end
|
493
|
+
|
494
|
+
def fetch_contributors
|
495
|
+
id.sub(/\s.*/, "").split("/").reduce([]) do |mem, abbrev|
|
467
496
|
publisher = PUBLISHERS[abbrev]
|
468
497
|
next mem unless publisher
|
469
498
|
|
@@ -473,44 +502,46 @@ module RelatonIso
|
|
473
502
|
end
|
474
503
|
|
475
504
|
# Fetch ICS.
|
476
|
-
# @param doc [Nokogiri::HTML::Document]
|
477
505
|
# @return [Array<Hash>]
|
478
|
-
def fetch_ics
|
479
|
-
doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
|
506
|
+
def fetch_ics
|
507
|
+
ics = @doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
|
480
508
|
code = i.text.match(/[\d.]+/).to_s.split "."
|
481
509
|
{ field: code[0], group: code[1], subgroup: code[2] }
|
482
510
|
end
|
511
|
+
@errors[:ics] &&= ics.empty?
|
512
|
+
ics
|
483
513
|
end
|
484
514
|
|
485
515
|
#
|
486
516
|
# Fetch links.
|
487
517
|
#
|
488
|
-
# @param doc [Nokogiri::HTML::Document] document to parse
|
489
518
|
# @param url [String] document url
|
490
519
|
#
|
491
520
|
# @return [Array<Hash>]
|
492
521
|
#
|
493
|
-
def fetch_link(
|
522
|
+
def fetch_link(url) # rubocop:disable Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/MethodLength
|
494
523
|
links = [{ type: "src", content: url }]
|
495
|
-
obp = doc.at("//a[.='Read sample']")
|
524
|
+
obp = @doc.at("//a[.='Read sample']")
|
525
|
+
@errors[:link_obp] &&= obp.nil?
|
496
526
|
links << { type: "obp", content: obp[:href] } if obp
|
497
|
-
rss = doc.at("//a[contains(@href, 'rss')]")
|
527
|
+
rss = @doc.at("//a[contains(@href, 'rss')]")
|
528
|
+
@errors[:link_rss] &&= rss.nil?
|
498
529
|
links << { type: "rss", content: DOMAIN + rss[:href] } if rss
|
499
|
-
pub = doc.at
|
530
|
+
pub = @doc.at "//p[contains(., 'publicly available')]/a",
|
500
531
|
"//p[contains(., 'can be downloaded from the')]/a"
|
532
|
+
@errors[:link_pub] &&= pub.nil?
|
501
533
|
links << { type: "pub", content: pub[:href] } if pub
|
502
534
|
links
|
503
535
|
end
|
504
536
|
|
505
537
|
# Fetch copyright.
|
506
|
-
# @param doc [Nokogiri::HTML::Document]
|
507
538
|
# @return [Array<Hash>]
|
508
|
-
def fetch_copyright
|
509
|
-
ref = item_ref doc
|
539
|
+
def fetch_copyright # rubocop:disable Metrics/MethodLength
|
540
|
+
ref = item_ref @doc
|
510
541
|
owner_name = ref.match(/.*?(?=\s)/).to_s
|
511
542
|
from = ref.match(/(?<=:)\d{4}/).to_s
|
512
543
|
if from.empty?
|
513
|
-
date = doc.at(
|
544
|
+
date = @doc.at(
|
514
545
|
"//span[@itemprop='releaseDate']",
|
515
546
|
"//ul[@id='stages']/li[contains(@class,'active')]/ul/li[@class='active']/a/span[@class='stage-date']",
|
516
547
|
)
|
data/lib/relaton_iso/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: relaton-iso
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.19.
|
4
|
+
version: 1.19.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Ribose Inc.
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-
|
11
|
+
date: 2024-10-01 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: pubid
|