relaton-iso 1.19.1 → 1.19.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 05ea6b24502f419ac918fa4128bbde3fead57788c976a9a45aeff911a9d0487e
4
- data.tar.gz: dd524e78977ef203a13560590e4d4167a91f6cd07fbe3c5bee419699c0fe0b40
3
+ metadata.gz: ae7999f9e96553504fb75338485ad672b1cb176e15860c39b35913b0ed525852
4
+ data.tar.gz: 4bf9ec438dd3aebb4b81707b53c7141a466d1f8c504a88c0d6b5d60f46ea0534
5
5
  SHA512:
6
- metadata.gz: f2d5b69b515b3dc68a0cc27c0a8a59e6298849b4df54dfbf45a8cb7d52d347863eeea7adb9d44df56e4cab08ec04e3d12d1d47eb8dd5358567b38f214afbc380
7
- data.tar.gz: 114c12c049bf9da2d03e94905ac68017e4bbf61087c261bcbc21a92cee94eac7371a015ea01592def5dee256add9d814c1f3ed75dcf78ac434c29e76d26a06ca
6
+ metadata.gz: 05fab25ad1c760bde99b95b230e6616aa6205510cb77454a7c167aeb7501e967c1909ec4afe583e259312802d8172f5099937128bc7d34c8e4901cb7f48c3181
7
+ data.tar.gz: e935ceff8ab264b3f4ca6383874c427733633bd744e3a5318cb079f47bf67f580f8957ee5cea36c3a1827b064f1f7022061b13d9e4f60182200180eb50abcbe9
data/bin/bundle CHANGED
@@ -27,7 +27,7 @@ m = Module.new do
27
27
  bundler_version = nil
28
28
  update_index = nil
29
29
  ARGV.each_with_index do |a, i|
30
- if update_index && update_index.succ == i && a =~ Gem::Version::ANCHORED_VERSION_PATTERN
30
+ if update_index && update_index.succ == i && a.match?(Gem::Version::ANCHORED_VERSION_PATTERN)
31
31
  bundler_version = a
32
32
  end
33
33
  next unless a =~ /\A--bundler(?:[= ](#{Gem::Version::VERSION_PATTERN}))?\z/
@@ -7,13 +7,16 @@ module RelatonIso
7
7
  # @param [String] output output directory
8
8
  # @param [String] format format of output files (yaml, bibxml, xml)
9
9
  #
10
- def initialize(output, format)
10
+ def initialize(output, format) # rubocop:disable Metrics/AbcSize
11
11
  @output = output
12
12
  @format = format
13
13
  @ext = format.sub(/^bib/, "")
14
- @files = []
14
+ @files = Set.new
15
15
  @queue = ::Queue.new
16
16
  @mutex = Mutex.new
17
+ @gh_issue = Relaton::Logger::Channels::GhIssue.new "relaton/relaton-iso", "Error fetching ISO documents"
18
+ Relaton.logger_pool[:gh_issue] = Relaton::Logger::Log.new(@gh_issue, levels: [:error])
19
+ @errors = Hash.new(true)
17
20
  end
18
21
 
19
22
  def index
@@ -34,12 +37,12 @@ module RelatonIso
34
37
  #
35
38
  def self.fetch(output: "data", format: "yaml")
36
39
  t1 = Time.now
37
- puts "Started at: #{t1}"
40
+ Util.info "Started at: #{t1}"
38
41
  FileUtils.mkdir_p output
39
42
  new(output, format).fetch
40
43
  t2 = Time.now
41
- puts "Stopped at: #{t2}"
42
- puts "Done in: #{(t2 - t1).round} sec."
44
+ Util.info "Stopped at: #{t2}"
45
+ Util.info "Done in: #{(t2 - t1).round} sec."
43
46
  end
44
47
 
45
48
  #
@@ -48,13 +51,21 @@ module RelatonIso
48
51
  # @return [void]
49
52
  #
50
53
  def fetch # rubocop:disable Metrics/AbcSize
51
- puts "Scrapping ICS pages..."
54
+ Util.info "Scrapping ICS pages..."
52
55
  fetch_ics
53
- puts "[#{Time.now}] Scrapping documents..."
56
+ Util.info "(#{Time.now}) Scrapping documents..."
54
57
  fetch_docs
55
58
  iso_queue.save
56
59
  # index.sort! { |a, b| compare_docids a, b }
57
60
  index.save
61
+ repot_errors
62
+ end
63
+
64
+ def repot_errors
65
+ @errors.select { |_, v| v }.each_key do |k|
66
+ Util.error "Failed to fetch #{k}"
67
+ end
68
+ @gh_issue.create_issue
58
69
  end
59
70
 
60
71
  #
@@ -72,14 +83,30 @@ module RelatonIso
72
83
 
73
84
  def fetch_ics_page(path)
74
85
  resp = get_redirection path
75
- page = Nokogiri::HTML(resp.body)
76
- page.xpath("//td[@data-title='Standard and/or project']/div/div/a").each do |item|
77
- iso_queue.add_first item[:href].split("?").first
86
+ unless resp
87
+ Util.error "Failed fetching ICS page #{url(path)}"
88
+ return
78
89
  end
79
90
 
80
- page.xpath("//td[@data-title='ICS']/a").each do |item|
81
- @queue << item[:href]
82
- end
91
+ page = Nokogiri::HTML(resp.body)
92
+ parse_doc_links page
93
+ parse_ics_links page
94
+ end
95
+
96
+ def parse_doc_links(page)
97
+ doc_links = page.xpath "//td[@data-title='Standard and/or project']/div/div/a"
98
+ @errors[:doc_links] &&= doc_links.empty?
99
+ doc_links.each { |item| iso_queue.add_first item[:href].split("?").first }
100
+ end
101
+
102
+ def parse_ics_links(page)
103
+ ics_links = page.xpath("//td[@data-title='ICS']/a")
104
+ @errors[:ics_links] &&= ics_links.empty?
105
+ ics_links.each { |item| @queue << item[:href] }
106
+ end
107
+
108
+ def url(path)
109
+ Scrapper::DOMAIN + path
83
110
  end
84
111
 
85
112
  #
@@ -88,18 +115,18 @@ module RelatonIso
88
115
  #
89
116
  # @param [String] path path to the page
90
117
  #
91
- # @return [Net::HTTPOK] HTTP response
118
+ # @return [Net::HTTPOK, nil] HTTP response
92
119
  #
93
120
  def get_redirection(path) # rubocop:disable Metrics/MethodLength
94
121
  try = 0
95
- uri = URI(Scrapper::DOMAIN + path)
122
+ uri = URI url(path)
96
123
  begin
97
124
  get_response uri
98
125
  rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNREFUSED => e
99
126
  try += 1
100
127
  retry if check_try try, uri
101
128
 
102
- Util.error "Error fetching #{uri}, #{e.message}"
129
+ Util.warn "Failed fetching #{uri}, #{e.message}"
103
130
  end
104
131
  end
105
132
 
@@ -131,13 +158,10 @@ module RelatonIso
131
158
  # @return [void]
132
159
  #
133
160
  def fetch_doc(docpath)
134
- # path = docpath.sub(/\.html$/, "")
135
- # hit = Hit.new({ path: docpath }, nil)
136
- doc = Scrapper.parse_page docpath
161
+ doc = Scrapper.parse_page docpath, errors: @errors
137
162
  @mutex.synchronize { save_doc doc, docpath }
138
163
  rescue StandardError => e
139
- Util.error "Error fetching document: #{Scrapper::DOMAIN}#{docpath}\n" \
140
- "#{e.message}\n#{e.backtrace}"
164
+ Util.warn "Fail fetching document: #{url(docpath)}\n#{e.message}\n#{e.backtrace}"
141
165
  end
142
166
 
143
167
  # def compare_docids(id1, id2)
@@ -155,16 +179,40 @@ module RelatonIso
155
179
  docid = doc.docidentifier.detect(&:primary)
156
180
  file_name = docid.id.gsub(/[\s\/:]+/, "-").downcase
157
181
  file = File.join @output, "#{file_name}.#{@ext}"
158
- if @files.include? file
159
- Util.warn "Duplicate file #{file} for #{docid.id} from #{Scrapper::DOMAIN}#{docpath}"
182
+ if File.exist?(file)
183
+ rewrite_with_same_or_newer doc, docid, file, docpath
160
184
  else
161
- @files << file
162
- index.add_or_update docid.to_h, file
163
- File.write file, serialize(doc), encoding: "UTF-8"
185
+ write_file file, doc, docid
164
186
  end
165
187
  iso_queue.move_last docpath
166
188
  end
167
189
 
190
+ def rewrite_with_same_or_newer(doc, docid, file, docpath)
191
+ hash = YAML.load_file file
192
+ item_hash = HashConverter.hash_to_bib hash
193
+ bib = ::RelatonIsoBib::IsoBibliographicItem.new(**item_hash)
194
+ if edition_greater?(doc, bib) || replace_substage98?(doc, bib)
195
+ write_file file, doc, docid
196
+ elsif @files.include?(file) && !edition_greater?(bib, doc)
197
+ Util.warn "Duplicate file `#{file}` for `#{docid.id}` from #{url(docpath)}"
198
+ end
199
+ end
200
+
201
+ def edition_greater?(doc, bib)
202
+ doc.edition && bib.edition && doc.edition.content.to_i > bib.edition.content.to_i
203
+ end
204
+
205
+ def replace_substage98?(doc, bib) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
206
+ doc.edition&.content == bib.edition&.content &&
207
+ (doc.status&.substage&.value != "98" || bib.status&.substage&.value == "98")
208
+ end
209
+
210
+ def write_file(file, doc, docid)
211
+ @files << file
212
+ index.add_or_update docid.to_h, file
213
+ File.write file, serialize(doc), encoding: "UTF-8"
214
+ end
215
+
168
216
  #
169
217
  # Serialize document to string.
170
218
  #
@@ -2,7 +2,7 @@
2
2
 
3
3
  module RelatonIso
4
4
  # Scrapper.
5
- module Scrapper # rubocop:disable Metrics/ModuleLength
5
+ class Scrapper # rubocop:disable Metrics/ModuleLength
6
6
  DOMAIN = "https://www.iso.org"
7
7
 
8
8
  TYPES = {
@@ -48,57 +48,84 @@ module RelatonIso
48
48
  url: "www.asme.org" },
49
49
  }.freeze
50
50
 
51
- extend self
51
+ # extend self
52
+
53
+ def initialize(lang, errors)
54
+ @lang = lang
55
+ @errors = errors
56
+ end
52
57
 
53
58
  # Parse page.
54
- # @param path [String]
55
- # @param lang [String, nil]
59
+ # @param path [String] page path
60
+ # @param lang [String, nil] language
61
+ # @param errors [Hash] collection of parsing errors
56
62
  # @return [RelatonIsoBib::IsoBibliographicItem]
57
- def parse_page(path, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
58
- doc, url = get_page path
59
- id = doc.at("//h1/span[1]").text.split(" | ").first.strip
60
- pubid = Pubid::Iso::Identifier.parse(id)
61
- # Fetch edition.
62
- edition = doc.at("//div[div[.='Edition']]/text()[last()]")&.text&.match(/\d+$/)&.to_s
63
- pubid.root.edition ||= edition if pubid.base
63
+ def self.parse_page(path, lang: nil, errors: {})
64
+ new(lang, errors).parse(path)
65
+ end
64
66
 
65
- titles, abstract, langs = fetch_titles_abstract(doc, lang)
67
+ def parse(path) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
68
+ @doc, url = get_page path
69
+ titles, abstract, langs = fetch_titles_abstract
66
70
 
67
71
  RelatonIsoBib::IsoBibliographicItem.new(
68
- docid: fetch_relaton_docids(doc, pubid),
69
- docnumber: fetch_docnumber(pubid),
72
+ docid: fetch_relaton_docids,
73
+ docnumber: fetch_docnumber,
70
74
  edition: edition,
71
75
  language: langs.map { |l| l[:lang] },
72
76
  script: langs.map { |l| script(l[:lang]) }.uniq,
73
77
  title: titles,
74
- doctype: fetch_type(id),
75
- docstatus: fetch_status(doc),
76
- ics: fetch_ics(doc),
77
- date: fetch_dates(doc, id),
78
- contributor: fetch_contributors(id),
79
- editorialgroup: fetch_workgroup(doc),
78
+ doctype: fetch_type,
79
+ docstatus: fetch_status,
80
+ ics: fetch_ics,
81
+ date: fetch_dates,
82
+ contributor: fetch_contributors,
83
+ editorialgroup: fetch_workgroup,
80
84
  abstract: abstract,
81
- copyright: fetch_copyright(doc),
82
- link: fetch_link(doc, url),
83
- relation: fetch_relations(doc),
85
+ copyright: fetch_copyright,
86
+ link: fetch_link(url),
87
+ relation: fetch_relations,
84
88
  place: ["Geneva"],
85
- structuredidentifier: fetch_structuredidentifier(pubid),
89
+ structuredidentifier: fetch_structuredidentifier,
86
90
  )
87
91
  end
88
92
 
93
+ def id
94
+ return @id if defined?(@id)
95
+
96
+ did = @doc.at("//h1/span[1]")
97
+ @errors[:id] &&= did.nil?
98
+ @id = did && did.text.split(" | ").first.strip
99
+ end
100
+
101
+ def pubid
102
+ return @pubid if @pubid
103
+
104
+ @pubid = Pubid::Iso::Identifier.parse(id)
105
+ @pubid.root.edition ||= edition if @pubid.base
106
+ @pubid
107
+ rescue StandardError => e
108
+ Util.error "Failed to parse pubid from #{id}: #{e.message}"
109
+ end
110
+
111
+ def edition
112
+ return @edition if defined?(@edition)
113
+
114
+ ed = @doc.at("//div[div[.='Edition']]/text()[last()]")
115
+ @errors[:edition] &&= ed.nil?
116
+ @edition = ed && ed.text.match(/\d+$/).to_s
117
+ end
118
+
89
119
  #
90
120
  # Create document ids.
91
121
  #
92
- # @param doc [Nokogiri::HTML::Document] document to parse
93
- # @param pubid [Pubid::Iso::Identifier] publication identifier
94
- #
95
122
  # @return [Array<RelatonBib::DocumentIdentifier>]
96
123
  #
97
- def fetch_relaton_docids(doc, pubid)
98
- pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code(doc))
124
+ def fetch_relaton_docids
125
+ pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code)
99
126
  [
100
127
  DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
101
- RelatonBib::DocumentIdentifier.new(id: isoref(pubid), type: "iso-reference"),
128
+ RelatonBib::DocumentIdentifier.new(id: isoref, type: "iso-reference"),
102
129
  DocumentIdentifier.new(id: pubid, type: "URN"),
103
130
  ]
104
131
  end
@@ -106,11 +133,9 @@ module RelatonIso
106
133
  #
107
134
  # Create ISO reference identifier with English language.
108
135
  #
109
- # @param [Pubid::Iso::Identifier] pubid publication identifier
110
- #
111
136
  # @return [String] English reference identifier
112
137
  #
113
- def isoref(pubid)
138
+ def isoref
114
139
  params = pubid.to_h.reject { |k, _| k == :typed_stage }
115
140
  Pubid::Iso::Identifier.create(language: "en", **params).to_s(format: :ref_num_short)
116
141
  end
@@ -118,20 +143,18 @@ module RelatonIso
118
143
  private
119
144
 
120
145
  # Fetch titles and abstracts.
121
- # @param doc [Nokigiri::HTML::Document]
122
- # @param lang [String, nil]
123
146
  # @return [Array<Array>]
124
- def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
147
+ def fetch_titles_abstract # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
125
148
  titles = RelatonBib::TypedTitleStringCollection.new
126
149
  abstract = []
127
- langs = languages(doc, lang).each_with_object([]) do |l, s|
150
+ langs = languages.each_with_object([]) do |l, s|
128
151
  # Don't need to get page for en. We already have it.
129
- d = l[:path] ? get_page(l[:path])[0] : doc
152
+ d = l[:path] ? get_page(l[:path])[0] : @doc
130
153
  unless d.at("//h5[@class='help-block'][.='недоступно на русском языке']")
131
154
  s << l
132
155
  titles += fetch_title(d, l[:lang])
133
156
 
134
- abstr = parse_abstract(d, l)
157
+ abstr = parse_abstract(d, l[:lang])
135
158
  abstract << abstr if abstr
136
159
  end
137
160
  end
@@ -142,23 +165,22 @@ module RelatonIso
142
165
  abstract_content = doc.xpath(
143
166
  "//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
144
167
  ).map { |a| a.name == "li" ? "- #{a.text}" : a.text }.reject(&:empty?).join("\n")
168
+ @errors[:abstract] &&= abstract_content.empty?
145
169
  return if abstract_content.empty?
146
170
 
147
- { content: abstract_content, language: lang[:lang],
148
- script: script(lang[:lang]), format: "text/plain" }
171
+ { content: abstract_content, language: lang, script: script(lang), format: "text/plain" }
149
172
  end
150
173
 
151
174
  # Returns available languages.
152
- # @param doc [Nokogiri::HTML::Document]
153
- # @param lang [String, nil]
154
175
  # @return [Array<Hash>]
155
- def languages(doc, lang)
176
+ def languages
156
177
  lgs = [{ lang: "en" }]
157
- doc.css("li#lang-switcher ul li a").each do |lang_link|
178
+ @doc.css("li#lang-switcher ul li a").each do |lang_link|
158
179
  lang_path = lang_link.attr("href")
159
180
  l = lang_path.match(%r{^/(fr)/})
160
- lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] != lang)
181
+ lgs << { lang: l[1], path: lang_path } if l && (!@lang || l[1] != @lang)
161
182
  end
183
+ @errors[:language] &&= lgs.size == 1
162
184
  lgs
163
185
  end
164
186
 
@@ -239,7 +261,7 @@ module RelatonIso
239
261
  10.times do
240
262
  doc = Nokogiri::HTML(resp.body)
241
263
  # stop trying if page has a document id
242
- return doc if item_ref doc
264
+ return doc if item_ref(doc)
243
265
 
244
266
  resp = Net::HTTP.get_response(uri)
245
267
  end
@@ -249,22 +271,18 @@ module RelatonIso
249
271
  #
250
272
  # Generate docnumber.
251
273
  #
252
- # @param [Pubid::Iso] pubid
253
- #
254
274
  # @return [String] docnumber
255
275
  #
256
- def fetch_docnumber(pubid)
276
+ def fetch_docnumber
257
277
  pubid.to_s.match(/\d+/)&.to_s
258
278
  end
259
279
 
260
280
  #
261
281
  # Parse structuredidentifier.
262
282
  #
263
- # @param pubid [Pubid::Iso::Identifier] pubid
264
- #
265
283
  # @return [RelatonBib::StructuredIdentifier] structured identifier
266
284
  #
267
- def fetch_structuredidentifier(pubid) # rubocop:disable Metrics/MethodLength
285
+ def fetch_structuredidentifier # rubocop:disable Metrics/MethodLength
268
286
  RelatonIsoBib::StructuredIdentifier.new(
269
287
  project_number: "#{pubid.root.publisher} #{pubid.root.number}",
270
288
  part: pubid.root.part&.to_s, # &.sub(/^-/, ""),
@@ -280,21 +298,24 @@ module RelatonIso
280
298
  # @return [String, nil] ID
281
299
  #
282
300
  def item_ref(doc)
283
- doc.at("//main//section/div/div/div//h1/span[1]")&.text&.strip
301
+ ref = doc.at("//main//section/div/div/div//h1/span[1]")
302
+ @errors[:reference] &&= ref.nil?
303
+ ref&.text&.strip
284
304
  end
285
305
 
286
306
  # Fetch status.
287
- # @param doc [Nokogiri::HTML::Document]
288
- # @param status [String]
289
- # @return [Hash]
290
- def fetch_status(doc)
291
- stg, substg = stage_code(doc).split "."
307
+ # @return [RelatonBib::DocumentStatus]
308
+ def fetch_status
309
+ stg, substg = stage_code.split "."
292
310
  RelatonBib::DocumentStatus.new(stage: stg, substage: substg)
293
311
  end
294
312
 
295
- def stage_code(doc)
296
- doc.at("//ul[@class='dropdown-menu']/li[@class='active']" \
297
- "/a/span[@class='stage-code']").text
313
+ def stage_code
314
+ return @stage_code if defined?(@stage_code)
315
+
316
+ stc = @doc.at("//ul[@class='dropdown-menu']/li[@class='active']/a/span[@class='stage-code']")
317
+ @errors[:stage] &&= stc.nil?
318
+ @stage_code = stc&.text
298
319
  end
299
320
 
300
321
  # def stage(stg, substg)
@@ -305,8 +326,9 @@ module RelatonIso
305
326
  # Fetch workgroup.
306
327
  # @param doc [Nokogiri::HTML::Document]
307
328
  # @return [RelatonIsoBib::EditorialGroup, nil]
308
- def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
309
- wg = doc.at("////div[contains(., 'Technical Committe')]/following-sibling::span/a")
329
+ def fetch_workgroup # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
330
+ wg = @doc.at("//div[contains(., 'Technical Committe')]/following-sibling::span/a")
331
+ @errors[:workgroup] &&= wg.nil?
310
332
  return unless wg
311
333
 
312
334
  workgroup = wg.text.split "/"
@@ -324,34 +346,34 @@ module RelatonIso
324
346
  end
325
347
 
326
348
  # Fetch relations.
327
- # @param doc [Nokogiri::HTML::Document]
328
349
  # @return [Array<Hash>]
329
- def fetch_relations(doc)
350
+ def fetch_relations
330
351
  types = ["Now", "Now under review"]
331
- doc.xpath(
352
+ rels = @doc.xpath(
332
353
  "//ul[@class='steps']/li", "//div[contains(@class, 'sub-step')]"
333
354
  ).reduce([]) do |a, r|
334
- type, date = relation_type(r.at("h4", "h5").text.strip, doc)
355
+ type, date = relation_type(r.at("h4", "h5").text.strip)
335
356
  next a if types.include?(type)
336
357
 
337
358
  a + create_relations(r, type, date)
338
359
  end
360
+ @errors[:relation] &&= rels.empty?
361
+ rels
339
362
  end
340
363
 
341
364
  #
342
365
  # Parse relation type and dates.
343
366
  #
344
367
  # @param [String] type parsed type
345
- # @param [Nokogiri::HTML::Document] doc document to parse
346
368
  #
347
369
  # @return [Array<String,Array>] type and dates
348
370
  #
349
- def relation_type(type, doc)
371
+ def relation_type(type)
350
372
  date = []
351
373
  t = case type.strip
352
374
  when "Previously", "Will be replaced by" then "obsoletes"
353
375
  when /Corrigenda|Amendments|Revised by|Now confirmed|replaced by/
354
- on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
376
+ on = @doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
355
377
  date << { type: "circulated", on: on.text } if on
356
378
  "updates"
357
379
  else type
@@ -371,9 +393,9 @@ module RelatonIso
371
393
  # @return [Array<Hash>] Relations
372
394
  #
373
395
  def create_relations(rel, type, date)
374
- rel.css("a").map do |id|
375
- docid = DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
376
- fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
396
+ rel.css("a").map do |rid|
397
+ docid = DocumentIdentifier.new(type: "ISO", id: rid.text, primary: true)
398
+ fref = RelatonBib::FormattedRef.new(content: rid.text, format: "text/plain")
377
399
  bibitem = RelatonIsoBib::IsoBibliographicItem.new(
378
400
  docid: [docid], formattedref: fref, date: date,
379
401
  )
@@ -382,14 +404,13 @@ module RelatonIso
382
404
  end
383
405
 
384
406
  # Fetch type.
385
- # @param ref [String]
386
407
  # @return [String]
387
- def fetch_type(ref)
408
+ def fetch_type
388
409
  %r{
389
410
  ^(?<prefix>ISO|IWA|IEC)
390
- (?:(?:/IEC|/IEEE|/PRF|/NP|/SAE|/HL7|/DGuide)*\s|/)
411
+ (?:(?:/CIE|/IEC|/IEEE|/PRF|/NP|/SAE|/HL7|/DGuide)*\s|/)
391
412
  (?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|DTS|DTR|ISP|PWI|Guide|(?=\d+))
392
- }x =~ ref
413
+ }x =~ id
393
414
  type = TYPES[type] || TYPES[prefix] || "international-standard"
394
415
  RelatonIsoBib::DocumentType.new(type: type)
395
416
  end
@@ -400,7 +421,7 @@ module RelatonIso
400
421
  # @return [Array<RelatonBib::TypedTitleString>]
401
422
  def fetch_title(doc, lang) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
402
423
  types = %w[title-intro title-main title-part]
403
- ttls = titles(doc)
424
+ ttls = parse_titles(doc)
404
425
  title = RelatonBib::TypedTitleStringCollection.new
405
426
  ttls.each.with_index do |p, i|
406
427
  next unless p
@@ -413,9 +434,11 @@ module RelatonIso
413
434
  title << RelatonBib::TypedTitleString.new(type: "main", content: main, language: lang, script: script(lang))
414
435
  end
415
436
 
416
- def titles(doc)
437
+ def parse_titles(doc)
417
438
  # head = doc.at "//nav[contains(@class,'heading-condensed')]"
418
439
  ttls = doc.xpath("//h1[@class='stdTitle']/span[position()>1]").map(&:text)
440
+ return ttls if @errors[:title] &&= ttls.empty?
441
+
419
442
  ttls[0, 1] = ttls[0].split(/\s(?:-|\u2014)\s/) # if ttls.size == 1
420
443
  case ttls.size
421
444
  when 0, 1 then [nil, ttls.first, nil]
@@ -434,36 +457,42 @@ module RelatonIso
434
457
  end
435
458
 
436
459
  # Fetch dates
437
- # @param doc [Nokogiri::HTML::Document]
438
- # @param ref [String]
439
460
  # @return [Array<Hash>]
440
- def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity, Metrics/MethodLength
461
+ def fetch_dates # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
441
462
  dates = []
442
- %r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ ref
443
- pub_date_str = doc.at("//span[@itemprop='releaseDate']")
463
+ %r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ id
464
+ pub_date_str = @doc.at("//span[@itemprop='releaseDate']")
465
+ @errors[:date_pub] &&= pub_date_str.nil?
444
466
  if ref_date_str
445
- ref_date = Date.strptime ref_date_str, "%Y"
446
- if pub_date_str.nil?
447
- dates << { type: "published", on: ref_date_str }
448
- else
449
- pub_date = Date.strptime pub_date_str.text, "%Y"
450
- if pub_date.year > ref_date.year
451
- dates << { type: "published", on: ref_date_str }
452
- dates << { type: "updated", on: pub_date_str.text }
453
- else
454
- dates << { type: "published", on: pub_date_str.text }
455
- end
456
- end
467
+ dates += parse_date_from_id ref_date_str, pub_date_str
457
468
  elsif pub_date_str
458
469
  dates << { type: "published", on: pub_date_str.text }
459
470
  end
460
- corr_data = doc.at "//span[@itemprop='dateModified']"
471
+ corr_data = @doc.at "//span[@itemprop='dateModified']"
472
+ @errors[:date_corr] &&= corr_data.nil?
461
473
  dates << { type: "corrected", on: corr_data.text } if corr_data
462
474
  dates
463
475
  end
464
476
 
465
- def fetch_contributors(ref)
466
- ref.sub(/\s.*/, "").split("/").reduce([]) do |mem, abbrev|
477
+ def parse_date_from_id(ref_date_str, pub_date_str) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
478
+ dates = []
479
+ ref_date = Date.strptime ref_date_str, "%Y"
480
+ if pub_date_str.nil?
481
+ dates << { type: "published", on: ref_date_str }
482
+ else
483
+ pub_date = Date.strptime pub_date_str.text, "%Y"
484
+ if pub_date.year > ref_date.year
485
+ dates << { type: "published", on: ref_date_str }
486
+ dates << { type: "updated", on: pub_date_str.text }
487
+ else
488
+ dates << { type: "published", on: pub_date_str.text }
489
+ end
490
+ end
491
+ dates
492
+ end
493
+
494
+ def fetch_contributors
495
+ id.sub(/\s.*/, "").split("/").reduce([]) do |mem, abbrev|
467
496
  publisher = PUBLISHERS[abbrev]
468
497
  next mem unless publisher
469
498
 
@@ -473,44 +502,46 @@ module RelatonIso
473
502
  end
474
503
 
475
504
  # Fetch ICS.
476
- # @param doc [Nokogiri::HTML::Document]
477
505
  # @return [Array<Hash>]
478
- def fetch_ics(doc)
479
- doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
506
+ def fetch_ics
507
+ ics = @doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
480
508
  code = i.text.match(/[\d.]+/).to_s.split "."
481
509
  { field: code[0], group: code[1], subgroup: code[2] }
482
510
  end
511
+ @errors[:ics] &&= ics.empty?
512
+ ics
483
513
  end
484
514
 
485
515
  #
486
516
  # Fetch links.
487
517
  #
488
- # @param doc [Nokogiri::HTML::Document] document to parse
489
518
  # @param url [String] document url
490
519
  #
491
520
  # @return [Array<Hash>]
492
521
  #
493
- def fetch_link(doc, url)
522
+ def fetch_link(url) # rubocop:disable Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/MethodLength
494
523
  links = [{ type: "src", content: url }]
495
- obp = doc.at("//a[.='Read sample']")
524
+ obp = @doc.at("//a[.='Read sample']")
525
+ @errors[:link_obp] &&= obp.nil?
496
526
  links << { type: "obp", content: obp[:href] } if obp
497
- rss = doc.at("//a[contains(@href, 'rss')]")
527
+ rss = @doc.at("//a[contains(@href, 'rss')]")
528
+ @errors[:link_rss] &&= rss.nil?
498
529
  links << { type: "rss", content: DOMAIN + rss[:href] } if rss
499
- pub = doc.at "//p[contains(., 'publicly available')]/a",
530
+ pub = @doc.at "//p[contains(., 'publicly available')]/a",
500
531
  "//p[contains(., 'can be downloaded from the')]/a"
532
+ @errors[:link_pub] &&= pub.nil?
501
533
  links << { type: "pub", content: pub[:href] } if pub
502
534
  links
503
535
  end
504
536
 
505
537
  # Fetch copyright.
506
- # @param doc [Nokogiri::HTML::Document]
507
538
  # @return [Array<Hash>]
508
- def fetch_copyright(doc) # rubocop:disable Metrics/MethodLength
509
- ref = item_ref doc
539
+ def fetch_copyright # rubocop:disable Metrics/MethodLength
540
+ ref = item_ref @doc
510
541
  owner_name = ref.match(/.*?(?=\s)/).to_s
511
542
  from = ref.match(/(?<=:)\d{4}/).to_s
512
543
  if from.empty?
513
- date = doc.at(
544
+ date = @doc.at(
514
545
  "//span[@itemprop='releaseDate']",
515
546
  "//ul[@id='stages']/li[contains(@class,'active')]/ul/li[@class='active']/a/span[@class='stage-date']",
516
547
  )
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RelatonIso
4
- VERSION = "1.19.1"
4
+ VERSION = "1.19.2"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-iso
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.19.1
4
+ version: 1.19.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-07-18 00:00:00.000000000 Z
11
+ date: 2024-10-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: pubid