relaton-iso 1.19.0 → 1.19.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a66fbf3c95df32417a9bdaea666de8a154ec75d52ed1ef7a7db4a92925a57893
4
- data.tar.gz: 05dbd444b033089d02f6334e82b54372c291b6d34cb7b3722b8bf0122abf62bb
3
+ metadata.gz: ae7999f9e96553504fb75338485ad672b1cb176e15860c39b35913b0ed525852
4
+ data.tar.gz: 4bf9ec438dd3aebb4b81707b53c7141a466d1f8c504a88c0d6b5d60f46ea0534
5
5
  SHA512:
6
- metadata.gz: 70e8782fb0c925a56c329a9ff1a19de812f3055d072a113f1d0af390c7ef51904991bb615f37f912ca079f68ca5105542905f10102969c4aa5296e57b2fae8f7
7
- data.tar.gz: 7e404e058db5336e2225428512da3e3d6e507baa6ae6c19a0ba5cde0aa39c5a5326c3dca63e20448c1baafa5beadf7e8193aaa11fdff6ccec6b9ed9789d0b69e
6
+ metadata.gz: 05fab25ad1c760bde99b95b230e6616aa6205510cb77454a7c167aeb7501e967c1909ec4afe583e259312802d8172f5099937128bc7d34c8e4901cb7f48c3181
7
+ data.tar.gz: e935ceff8ab264b3f4ca6383874c427733633bd744e3a5318cb079f47bf67f580f8957ee5cea36c3a1827b064f1f7022061b13d9e4f60182200180eb50abcbe9
data/bin/bundle CHANGED
@@ -27,7 +27,7 @@ m = Module.new do
27
27
  bundler_version = nil
28
28
  update_index = nil
29
29
  ARGV.each_with_index do |a, i|
30
- if update_index && update_index.succ == i && a =~ Gem::Version::ANCHORED_VERSION_PATTERN
30
+ if update_index && update_index.succ == i && a.match?(Gem::Version::ANCHORED_VERSION_PATTERN)
31
31
  bundler_version = a
32
32
  end
33
33
  next unless a =~ /\A--bundler(?:[= ](#{Gem::Version::VERSION_PATTERN}))?\z/
@@ -7,13 +7,16 @@ module RelatonIso
7
7
  # @param [String] output output directory
8
8
  # @param [String] format format of output files (yaml, bibxml, xml)
9
9
  #
10
- def initialize(output, format)
10
+ def initialize(output, format) # rubocop:disable Metrics/AbcSize
11
11
  @output = output
12
12
  @format = format
13
13
  @ext = format.sub(/^bib/, "")
14
- @files = []
14
+ @files = Set.new
15
15
  @queue = ::Queue.new
16
16
  @mutex = Mutex.new
17
+ @gh_issue = Relaton::Logger::Channels::GhIssue.new "relaton/relaton-iso", "Error fetching ISO documents"
18
+ Relaton.logger_pool[:gh_issue] = Relaton::Logger::Log.new(@gh_issue, levels: [:error])
19
+ @errors = Hash.new(true)
17
20
  end
18
21
 
19
22
  def index
@@ -34,12 +37,12 @@ module RelatonIso
34
37
  #
35
38
  def self.fetch(output: "data", format: "yaml")
36
39
  t1 = Time.now
37
- puts "Started at: #{t1}"
40
+ Util.info "Started at: #{t1}"
38
41
  FileUtils.mkdir_p output
39
42
  new(output, format).fetch
40
43
  t2 = Time.now
41
- puts "Stopped at: #{t2}"
42
- puts "Done in: #{(t2 - t1).round} sec."
44
+ Util.info "Stopped at: #{t2}"
45
+ Util.info "Done in: #{(t2 - t1).round} sec."
43
46
  end
44
47
 
45
48
  #
@@ -48,13 +51,21 @@ module RelatonIso
48
51
  # @return [void]
49
52
  #
50
53
  def fetch # rubocop:disable Metrics/AbcSize
51
- puts "Scrapping ICS pages..."
54
+ Util.info "Scrapping ICS pages..."
52
55
  fetch_ics
53
- puts "[#{Time.now}] Scrapping documents..."
56
+ Util.info "(#{Time.now}) Scrapping documents..."
54
57
  fetch_docs
55
58
  iso_queue.save
56
59
  # index.sort! { |a, b| compare_docids a, b }
57
60
  index.save
61
+ repot_errors
62
+ end
63
+
64
+ def repot_errors
65
+ @errors.select { |_, v| v }.each_key do |k|
66
+ Util.error "Failed to fetch #{k}"
67
+ end
68
+ @gh_issue.create_issue
58
69
  end
59
70
 
60
71
  #
@@ -72,14 +83,30 @@ module RelatonIso
72
83
 
73
84
  def fetch_ics_page(path)
74
85
  resp = get_redirection path
75
- page = Nokogiri::HTML(resp.body)
76
- page.xpath("//td[@data-title='Standard and/or project']/div/div/a").each do |item|
77
- iso_queue.add_first item[:href].split("?").first
86
+ unless resp
87
+ Util.error "Failed fetching ICS page #{url(path)}"
88
+ return
78
89
  end
79
90
 
80
- page.xpath("//td[@data-title='ICS']/a").each do |item|
81
- @queue << item[:href]
82
- end
91
+ page = Nokogiri::HTML(resp.body)
92
+ parse_doc_links page
93
+ parse_ics_links page
94
+ end
95
+
96
+ def parse_doc_links(page)
97
+ doc_links = page.xpath "//td[@data-title='Standard and/or project']/div/div/a"
98
+ @errors[:doc_links] &&= doc_links.empty?
99
+ doc_links.each { |item| iso_queue.add_first item[:href].split("?").first }
100
+ end
101
+
102
+ def parse_ics_links(page)
103
+ ics_links = page.xpath("//td[@data-title='ICS']/a")
104
+ @errors[:ics_links] &&= ics_links.empty?
105
+ ics_links.each { |item| @queue << item[:href] }
106
+ end
107
+
108
+ def url(path)
109
+ Scrapper::DOMAIN + path
83
110
  end
84
111
 
85
112
  #
@@ -88,18 +115,18 @@ module RelatonIso
88
115
  #
89
116
  # @param [String] path path to the page
90
117
  #
91
- # @return [Net::HTTPOK] HTTP response
118
+ # @return [Net::HTTPOK, nil] HTTP response
92
119
  #
93
120
  def get_redirection(path) # rubocop:disable Metrics/MethodLength
94
121
  try = 0
95
- uri = URI(Scrapper::DOMAIN + path)
122
+ uri = URI url(path)
96
123
  begin
97
124
  get_response uri
98
125
  rescue Net::OpenTimeout, Net::ReadTimeout, Errno::ECONNREFUSED => e
99
126
  try += 1
100
127
  retry if check_try try, uri
101
128
 
102
- Util.error "Error fetching #{uri}, #{e.message}"
129
+ Util.warn "Failed fetching #{uri}, #{e.message}"
103
130
  end
104
131
  end
105
132
 
@@ -131,13 +158,10 @@ module RelatonIso
131
158
  # @return [void]
132
159
  #
133
160
  def fetch_doc(docpath)
134
- # path = docpath.sub(/\.html$/, "")
135
- # hit = Hit.new({ path: docpath }, nil)
136
- doc = Scrapper.parse_page docpath
161
+ doc = Scrapper.parse_page docpath, errors: @errors
137
162
  @mutex.synchronize { save_doc doc, docpath }
138
163
  rescue StandardError => e
139
- Util.error "Error fetching document: #{Scrapper::DOMAIN}#{docpath}\n" \
140
- "#{e.message}\n#{e.backtrace}"
164
+ Util.warn "Fail fetching document: #{url(docpath)}\n#{e.message}\n#{e.backtrace}"
141
165
  end
142
166
 
143
167
  # def compare_docids(id1, id2)
@@ -155,16 +179,40 @@ module RelatonIso
155
179
  docid = doc.docidentifier.detect(&:primary)
156
180
  file_name = docid.id.gsub(/[\s\/:]+/, "-").downcase
157
181
  file = File.join @output, "#{file_name}.#{@ext}"
158
- if @files.include? file
159
- Util.warn "Duplicate file #{file} for #{docid.id} from #{Scrapper::DOMAIN}#{docpath}"
182
+ if File.exist?(file)
183
+ rewrite_with_same_or_newer doc, docid, file, docpath
160
184
  else
161
- @files << file
162
- index.add_or_update docid.to_h, file
163
- File.write file, serialize(doc), encoding: "UTF-8"
185
+ write_file file, doc, docid
164
186
  end
165
187
  iso_queue.move_last docpath
166
188
  end
167
189
 
190
+ def rewrite_with_same_or_newer(doc, docid, file, docpath)
191
+ hash = YAML.load_file file
192
+ item_hash = HashConverter.hash_to_bib hash
193
+ bib = ::RelatonIsoBib::IsoBibliographicItem.new(**item_hash)
194
+ if edition_greater?(doc, bib) || replace_substage98?(doc, bib)
195
+ write_file file, doc, docid
196
+ elsif @files.include?(file) && !edition_greater?(bib, doc)
197
+ Util.warn "Duplicate file `#{file}` for `#{docid.id}` from #{url(docpath)}"
198
+ end
199
+ end
200
+
201
+ def edition_greater?(doc, bib)
202
+ doc.edition && bib.edition && doc.edition.content.to_i > bib.edition.content.to_i
203
+ end
204
+
205
+ def replace_substage98?(doc, bib) # rubocop:disable Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
206
+ doc.edition&.content == bib.edition&.content &&
207
+ (doc.status&.substage&.value != "98" || bib.status&.substage&.value == "98")
208
+ end
209
+
210
+ def write_file(file, doc, docid)
211
+ @files << file
212
+ index.add_or_update docid.to_h, file
213
+ File.write file, serialize(doc), encoding: "UTF-8"
214
+ end
215
+
168
216
  #
169
217
  # Serialize document to string.
170
218
  #
@@ -52,7 +52,7 @@ module RelatonIso
52
52
 
53
53
  ret.to_most_recent_reference
54
54
  rescue Pubid::Core::Errors::ParseError
55
- Util.info "Is not recognized as a standards identifier.", key: code
55
+ Util.warn "Is not recognized as a standards identifier.", key: code
56
56
  nil
57
57
  end
58
58
 
@@ -2,7 +2,7 @@
2
2
 
3
3
  module RelatonIso
4
4
  # Scrapper.
5
- module Scrapper # rubocop:disable Metrics/ModuleLength
5
+ class Scrapper # rubocop:disable Metrics/ModuleLength
6
6
  DOMAIN = "https://www.iso.org"
7
7
 
8
8
  TYPES = {
@@ -48,57 +48,84 @@ module RelatonIso
48
48
  url: "www.asme.org" },
49
49
  }.freeze
50
50
 
51
- extend self
51
+ # extend self
52
+
53
+ def initialize(lang, errors)
54
+ @lang = lang
55
+ @errors = errors
56
+ end
52
57
 
53
58
  # Parse page.
54
- # @param path [String]
55
- # @param lang [String, nil]
59
+ # @param path [String] page path
60
+ # @param lang [String, nil] language
61
+ # @param errors [Hash] collection of parsing errors
56
62
  # @return [RelatonIsoBib::IsoBibliographicItem]
57
- def parse_page(path, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
58
- doc, url = get_page path
59
- id = doc.at("//h1/span[1]").text.split(" | ").first.strip
60
- pubid = Pubid::Iso::Identifier.parse(id)
61
- # Fetch edition.
62
- edition = doc.at("//div[div[.='Edition']]/text()[last()]")&.text&.match(/\d+$/)&.to_s
63
- pubid.root.edition ||= edition if pubid.base
63
+ def self.parse_page(path, lang: nil, errors: {})
64
+ new(lang, errors).parse(path)
65
+ end
64
66
 
65
- titles, abstract, langs = fetch_titles_abstract(doc, lang)
67
+ def parse(path) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
68
+ @doc, url = get_page path
69
+ titles, abstract, langs = fetch_titles_abstract
66
70
 
67
71
  RelatonIsoBib::IsoBibliographicItem.new(
68
- docid: fetch_relaton_docids(doc, pubid),
69
- docnumber: fetch_docnumber(pubid),
72
+ docid: fetch_relaton_docids,
73
+ docnumber: fetch_docnumber,
70
74
  edition: edition,
71
75
  language: langs.map { |l| l[:lang] },
72
76
  script: langs.map { |l| script(l[:lang]) }.uniq,
73
77
  title: titles,
74
- doctype: fetch_type(id),
75
- docstatus: fetch_status(doc),
76
- ics: fetch_ics(doc),
77
- date: fetch_dates(doc, id),
78
- contributor: fetch_contributors(id),
79
- editorialgroup: fetch_workgroup(doc),
78
+ doctype: fetch_type,
79
+ docstatus: fetch_status,
80
+ ics: fetch_ics,
81
+ date: fetch_dates,
82
+ contributor: fetch_contributors,
83
+ editorialgroup: fetch_workgroup,
80
84
  abstract: abstract,
81
- copyright: fetch_copyright(doc),
82
- link: fetch_link(doc, url),
83
- relation: fetch_relations(doc),
85
+ copyright: fetch_copyright,
86
+ link: fetch_link(url),
87
+ relation: fetch_relations,
84
88
  place: ["Geneva"],
85
- structuredidentifier: fetch_structuredidentifier(pubid),
89
+ structuredidentifier: fetch_structuredidentifier,
86
90
  )
87
91
  end
88
92
 
93
+ def id
94
+ return @id if defined?(@id)
95
+
96
+ did = @doc.at("//h1/span[1]")
97
+ @errors[:id] &&= did.nil?
98
+ @id = did && did.text.split(" | ").first.strip
99
+ end
100
+
101
+ def pubid
102
+ return @pubid if @pubid
103
+
104
+ @pubid = Pubid::Iso::Identifier.parse(id)
105
+ @pubid.root.edition ||= edition if @pubid.base
106
+ @pubid
107
+ rescue StandardError => e
108
+ Util.error "Failed to parse pubid from #{id}: #{e.message}"
109
+ end
110
+
111
+ def edition
112
+ return @edition if defined?(@edition)
113
+
114
+ ed = @doc.at("//div[div[.='Edition']]/text()[last()]")
115
+ @errors[:edition] &&= ed.nil?
116
+ @edition = ed && ed.text.match(/\d+$/).to_s
117
+ end
118
+
89
119
  #
90
120
  # Create document ids.
91
121
  #
92
- # @param doc [Nokogiri::HTML::Document] document to parse
93
- # @param pubid [Pubid::Iso::Identifier] publication identifier
94
- #
95
122
  # @return [Array<RelatonBib::DocumentIdentifier>]
96
123
  #
97
- def fetch_relaton_docids(doc, pubid)
98
- pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code(doc))
124
+ def fetch_relaton_docids
125
+ pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code)
99
126
  [
100
127
  DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
101
- RelatonBib::DocumentIdentifier.new(id: isoref(pubid), type: "iso-reference"),
128
+ RelatonBib::DocumentIdentifier.new(id: isoref, type: "iso-reference"),
102
129
  DocumentIdentifier.new(id: pubid, type: "URN"),
103
130
  ]
104
131
  end
@@ -106,11 +133,9 @@ module RelatonIso
106
133
  #
107
134
  # Create ISO reference identifier with English language.
108
135
  #
109
- # @param [Pubid::Iso::Identifier] pubid publication identifier
110
- #
111
136
  # @return [String] English reference identifier
112
137
  #
113
- def isoref(pubid)
138
+ def isoref
114
139
  params = pubid.to_h.reject { |k, _| k == :typed_stage }
115
140
  Pubid::Iso::Identifier.create(language: "en", **params).to_s(format: :ref_num_short)
116
141
  end
@@ -118,20 +143,18 @@ module RelatonIso
118
143
  private
119
144
 
120
145
  # Fetch titles and abstracts.
121
- # @param doc [Nokigiri::HTML::Document]
122
- # @param lang [String, nil]
123
146
  # @return [Array<Array>]
124
- def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
147
+ def fetch_titles_abstract # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
125
148
  titles = RelatonBib::TypedTitleStringCollection.new
126
149
  abstract = []
127
- langs = languages(doc, lang).each_with_object([]) do |l, s|
150
+ langs = languages.each_with_object([]) do |l, s|
128
151
  # Don't need to get page for en. We already have it.
129
- d = l[:path] ? get_page(l[:path])[0] : doc
152
+ d = l[:path] ? get_page(l[:path])[0] : @doc
130
153
  unless d.at("//h5[@class='help-block'][.='недоступно на русском языке']")
131
154
  s << l
132
155
  titles += fetch_title(d, l[:lang])
133
156
 
134
- abstr = parse_abstract(d, l)
157
+ abstr = parse_abstract(d, l[:lang])
135
158
  abstract << abstr if abstr
136
159
  end
137
160
  end
@@ -142,23 +165,22 @@ module RelatonIso
142
165
  abstract_content = doc.xpath(
143
166
  "//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
144
167
  ).map { |a| a.name == "li" ? "- #{a.text}" : a.text }.reject(&:empty?).join("\n")
168
+ @errors[:abstract] &&= abstract_content.empty?
145
169
  return if abstract_content.empty?
146
170
 
147
- { content: abstract_content, language: lang[:lang],
148
- script: script(lang[:lang]), format: "text/plain" }
171
+ { content: abstract_content, language: lang, script: script(lang), format: "text/plain" }
149
172
  end
150
173
 
151
174
  # Returns available languages.
152
- # @param doc [Nokogiri::HTML::Document]
153
- # @param lang [String, nil]
154
175
  # @return [Array<Hash>]
155
- def languages(doc, lang)
176
+ def languages
156
177
  lgs = [{ lang: "en" }]
157
- doc.css("li#lang-switcher ul li a").each do |lang_link|
178
+ @doc.css("li#lang-switcher ul li a").each do |lang_link|
158
179
  lang_path = lang_link.attr("href")
159
180
  l = lang_path.match(%r{^/(fr)/})
160
- lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] != lang)
181
+ lgs << { lang: l[1], path: lang_path } if l && (!@lang || l[1] != @lang)
161
182
  end
183
+ @errors[:language] &&= lgs.size == 1
162
184
  lgs
163
185
  end
164
186
 
@@ -239,7 +261,7 @@ module RelatonIso
239
261
  10.times do
240
262
  doc = Nokogiri::HTML(resp.body)
241
263
  # stop trying if page has a document id
242
- return doc if item_ref doc
264
+ return doc if item_ref(doc)
243
265
 
244
266
  resp = Net::HTTP.get_response(uri)
245
267
  end
@@ -249,22 +271,18 @@ module RelatonIso
249
271
  #
250
272
  # Generate docnumber.
251
273
  #
252
- # @param [Pubid::Iso] pubid
253
- #
254
274
  # @return [String] docnumber
255
275
  #
256
- def fetch_docnumber(pubid)
276
+ def fetch_docnumber
257
277
  pubid.to_s.match(/\d+/)&.to_s
258
278
  end
259
279
 
260
280
  #
261
281
  # Parse structuredidentifier.
262
282
  #
263
- # @param pubid [Pubid::Iso::Identifier] pubid
264
- #
265
283
  # @return [RelatonBib::StructuredIdentifier] structured identifier
266
284
  #
267
- def fetch_structuredidentifier(pubid) # rubocop:disable Metrics/MethodLength
285
+ def fetch_structuredidentifier # rubocop:disable Metrics/MethodLength
268
286
  RelatonIsoBib::StructuredIdentifier.new(
269
287
  project_number: "#{pubid.root.publisher} #{pubid.root.number}",
270
288
  part: pubid.root.part&.to_s, # &.sub(/^-/, ""),
@@ -280,21 +298,24 @@ module RelatonIso
280
298
  # @return [String, nil] ID
281
299
  #
282
300
  def item_ref(doc)
283
- doc.at("//main//section/div/div/div//h1/span[1]")&.text&.strip
301
+ ref = doc.at("//main//section/div/div/div//h1/span[1]")
302
+ @errors[:reference] &&= ref.nil?
303
+ ref&.text&.strip
284
304
  end
285
305
 
286
306
  # Fetch status.
287
- # @param doc [Nokogiri::HTML::Document]
288
- # @param status [String]
289
- # @return [Hash]
290
- def fetch_status(doc)
291
- stg, substg = stage_code(doc).split "."
307
+ # @return [RelatonBib::DocumentStatus]
308
+ def fetch_status
309
+ stg, substg = stage_code.split "."
292
310
  RelatonBib::DocumentStatus.new(stage: stg, substage: substg)
293
311
  end
294
312
 
295
- def stage_code(doc)
296
- doc.at("//ul[@class='dropdown-menu']/li[@class='active']" \
297
- "/a/span[@class='stage-code']").text
313
+ def stage_code
314
+ return @stage_code if defined?(@stage_code)
315
+
316
+ stc = @doc.at("//ul[@class='dropdown-menu']/li[@class='active']/a/span[@class='stage-code']")
317
+ @errors[:stage] &&= stc.nil?
318
+ @stage_code = stc&.text
298
319
  end
299
320
 
300
321
  # def stage(stg, substg)
@@ -305,8 +326,9 @@ module RelatonIso
305
326
  # Fetch workgroup.
306
327
  # @param doc [Nokogiri::HTML::Document]
307
328
  # @return [RelatonIsoBib::EditorialGroup, nil]
308
- def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
309
- wg = doc.at("////div[contains(., 'Technical Committe')]/following-sibling::span/a")
329
+ def fetch_workgroup # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
330
+ wg = @doc.at("//div[contains(., 'Technical Committe')]/following-sibling::span/a")
331
+ @errors[:workgroup] &&= wg.nil?
310
332
  return unless wg
311
333
 
312
334
  workgroup = wg.text.split "/"
@@ -324,34 +346,34 @@ module RelatonIso
324
346
  end
325
347
 
326
348
  # Fetch relations.
327
- # @param doc [Nokogiri::HTML::Document]
328
349
  # @return [Array<Hash>]
329
- def fetch_relations(doc)
350
+ def fetch_relations
330
351
  types = ["Now", "Now under review"]
331
- doc.xpath(
352
+ rels = @doc.xpath(
332
353
  "//ul[@class='steps']/li", "//div[contains(@class, 'sub-step')]"
333
354
  ).reduce([]) do |a, r|
334
- type, date = relation_type(r.at("h4", "h5").text.strip, doc)
355
+ type, date = relation_type(r.at("h4", "h5").text.strip)
335
356
  next a if types.include?(type)
336
357
 
337
358
  a + create_relations(r, type, date)
338
359
  end
360
+ @errors[:relation] &&= rels.empty?
361
+ rels
339
362
  end
340
363
 
341
364
  #
342
365
  # Parse relation type and dates.
343
366
  #
344
367
  # @param [String] type parsed type
345
- # @param [Nokogiri::HTML::Document] doc document to parse
346
368
  #
347
369
  # @return [Array<String,Array>] type and dates
348
370
  #
349
- def relation_type(type, doc)
371
+ def relation_type(type)
350
372
  date = []
351
373
  t = case type.strip
352
374
  when "Previously", "Will be replaced by" then "obsoletes"
353
375
  when /Corrigenda|Amendments|Revised by|Now confirmed|replaced by/
354
- on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
376
+ on = @doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
355
377
  date << { type: "circulated", on: on.text } if on
356
378
  "updates"
357
379
  else type
@@ -371,9 +393,9 @@ module RelatonIso
371
393
  # @return [Array<Hash>] Relations
372
394
  #
373
395
  def create_relations(rel, type, date)
374
- rel.css("a").map do |id|
375
- docid = DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
376
- fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
396
+ rel.css("a").map do |rid|
397
+ docid = DocumentIdentifier.new(type: "ISO", id: rid.text, primary: true)
398
+ fref = RelatonBib::FormattedRef.new(content: rid.text, format: "text/plain")
377
399
  bibitem = RelatonIsoBib::IsoBibliographicItem.new(
378
400
  docid: [docid], formattedref: fref, date: date,
379
401
  )
@@ -382,14 +404,13 @@ module RelatonIso
382
404
  end
383
405
 
384
406
  # Fetch type.
385
- # @param ref [String]
386
407
  # @return [String]
387
- def fetch_type(ref)
408
+ def fetch_type
388
409
  %r{
389
410
  ^(?<prefix>ISO|IWA|IEC)
390
- (?:(?:/IEC|/IEEE|/PRF|/NP|/SAE|/HL7|/DGuide)*\s|/)
411
+ (?:(?:/CIE|/IEC|/IEEE|/PRF|/NP|/SAE|/HL7|/DGuide)*\s|/)
391
412
  (?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|DTS|DTR|ISP|PWI|Guide|(?=\d+))
392
- }x =~ ref
413
+ }x =~ id
393
414
  type = TYPES[type] || TYPES[prefix] || "international-standard"
394
415
  RelatonIsoBib::DocumentType.new(type: type)
395
416
  end
@@ -400,7 +421,7 @@ module RelatonIso
400
421
  # @return [Array<RelatonBib::TypedTitleString>]
401
422
  def fetch_title(doc, lang) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
402
423
  types = %w[title-intro title-main title-part]
403
- ttls = titles(doc)
424
+ ttls = parse_titles(doc)
404
425
  title = RelatonBib::TypedTitleStringCollection.new
405
426
  ttls.each.with_index do |p, i|
406
427
  next unless p
@@ -413,9 +434,11 @@ module RelatonIso
413
434
  title << RelatonBib::TypedTitleString.new(type: "main", content: main, language: lang, script: script(lang))
414
435
  end
415
436
 
416
- def titles(doc)
437
+ def parse_titles(doc)
417
438
  # head = doc.at "//nav[contains(@class,'heading-condensed')]"
418
439
  ttls = doc.xpath("//h1[@class='stdTitle']/span[position()>1]").map(&:text)
440
+ return ttls if @errors[:title] &&= ttls.empty?
441
+
419
442
  ttls[0, 1] = ttls[0].split(/\s(?:-|\u2014)\s/) # if ttls.size == 1
420
443
  case ttls.size
421
444
  when 0, 1 then [nil, ttls.first, nil]
@@ -434,36 +457,42 @@ module RelatonIso
434
457
  end
435
458
 
436
459
  # Fetch dates
437
- # @param doc [Nokogiri::HTML::Document]
438
- # @param ref [String]
439
460
  # @return [Array<Hash>]
440
- def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity, Metrics/MethodLength
461
+ def fetch_dates # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
441
462
  dates = []
442
- %r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ ref
443
- pub_date_str = doc.at("//span[@itemprop='releaseDate']")
463
+ %r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ id
464
+ pub_date_str = @doc.at("//span[@itemprop='releaseDate']")
465
+ @errors[:date_pub] &&= pub_date_str.nil?
444
466
  if ref_date_str
445
- ref_date = Date.strptime ref_date_str, "%Y"
446
- if pub_date_str.nil?
447
- dates << { type: "published", on: ref_date_str }
448
- else
449
- pub_date = Date.strptime pub_date_str.text, "%Y"
450
- if pub_date.year > ref_date.year
451
- dates << { type: "published", on: ref_date_str }
452
- dates << { type: "updated", on: pub_date_str.text }
453
- else
454
- dates << { type: "published", on: pub_date_str.text }
455
- end
456
- end
467
+ dates += parse_date_from_id ref_date_str, pub_date_str
457
468
  elsif pub_date_str
458
469
  dates << { type: "published", on: pub_date_str.text }
459
470
  end
460
- corr_data = doc.at "//span[@itemprop='dateModified']"
471
+ corr_data = @doc.at "//span[@itemprop='dateModified']"
472
+ @errors[:date_corr] &&= corr_data.nil?
461
473
  dates << { type: "corrected", on: corr_data.text } if corr_data
462
474
  dates
463
475
  end
464
476
 
465
- def fetch_contributors(ref)
466
- ref.sub(/\s.*/, "").split("/").reduce([]) do |mem, abbrev|
477
+ def parse_date_from_id(ref_date_str, pub_date_str) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
478
+ dates = []
479
+ ref_date = Date.strptime ref_date_str, "%Y"
480
+ if pub_date_str.nil?
481
+ dates << { type: "published", on: ref_date_str }
482
+ else
483
+ pub_date = Date.strptime pub_date_str.text, "%Y"
484
+ if pub_date.year > ref_date.year
485
+ dates << { type: "published", on: ref_date_str }
486
+ dates << { type: "updated", on: pub_date_str.text }
487
+ else
488
+ dates << { type: "published", on: pub_date_str.text }
489
+ end
490
+ end
491
+ dates
492
+ end
493
+
494
+ def fetch_contributors
495
+ id.sub(/\s.*/, "").split("/").reduce([]) do |mem, abbrev|
467
496
  publisher = PUBLISHERS[abbrev]
468
497
  next mem unless publisher
469
498
 
@@ -473,44 +502,46 @@ module RelatonIso
473
502
  end
474
503
 
475
504
  # Fetch ICS.
476
- # @param doc [Nokogiri::HTML::Document]
477
505
  # @return [Array<Hash>]
478
- def fetch_ics(doc)
479
- doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
506
+ def fetch_ics
507
+ ics = @doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
480
508
  code = i.text.match(/[\d.]+/).to_s.split "."
481
509
  { field: code[0], group: code[1], subgroup: code[2] }
482
510
  end
511
+ @errors[:ics] &&= ics.empty?
512
+ ics
483
513
  end
484
514
 
485
515
  #
486
516
  # Fetch links.
487
517
  #
488
- # @param doc [Nokogiri::HTML::Document] document to parse
489
518
  # @param url [String] document url
490
519
  #
491
520
  # @return [Array<Hash>]
492
521
  #
493
- def fetch_link(doc, url)
522
+ def fetch_link(url) # rubocop:disable Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/MethodLength
494
523
  links = [{ type: "src", content: url }]
495
- obp = doc.at("//a[.='Read sample']")
524
+ obp = @doc.at("//a[.='Read sample']")
525
+ @errors[:link_obp] &&= obp.nil?
496
526
  links << { type: "obp", content: obp[:href] } if obp
497
- rss = doc.at("//a[contains(@href, 'rss')]")
527
+ rss = @doc.at("//a[contains(@href, 'rss')]")
528
+ @errors[:link_rss] &&= rss.nil?
498
529
  links << { type: "rss", content: DOMAIN + rss[:href] } if rss
499
- pub = doc.at "//p[contains(., 'publicly available')]/a",
530
+ pub = @doc.at "//p[contains(., 'publicly available')]/a",
500
531
  "//p[contains(., 'can be downloaded from the')]/a"
532
+ @errors[:link_pub] &&= pub.nil?
501
533
  links << { type: "pub", content: pub[:href] } if pub
502
534
  links
503
535
  end
504
536
 
505
537
  # Fetch copyright.
506
- # @param doc [Nokogiri::HTML::Document]
507
538
  # @return [Array<Hash>]
508
- def fetch_copyright(doc) # rubocop:disable Metrics/MethodLength
509
- ref = item_ref doc
539
+ def fetch_copyright # rubocop:disable Metrics/MethodLength
540
+ ref = item_ref @doc
510
541
  owner_name = ref.match(/.*?(?=\s)/).to_s
511
542
  from = ref.match(/(?<=:)\d{4}/).to_s
512
543
  if from.empty?
513
- date = doc.at(
544
+ date = @doc.at(
514
545
  "//span[@itemprop='releaseDate']",
515
546
  "//ul[@id='stages']/li[contains(@class,'active')]/ul/li[@class='active']/a/span[@class='stage-date']",
516
547
  )
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RelatonIso
4
- VERSION = "1.19.0"
4
+ VERSION = "1.19.2"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-iso
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.19.0
4
+ version: 1.19.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-07-04 00:00:00.000000000 Z
11
+ date: 2024-10-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: pubid