relaton-iso 1.18.1 → 1.18.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,9 @@ module RelatonIso
7
7
 
8
8
  TYPES = {
9
9
  "TS" => "technical-specification",
10
+ "DTS" => "technical-specification",
10
11
  "TR" => "technical-report",
12
+ "DTR" => "technical-report",
11
13
  "PAS" => "publicly-available-specification",
12
14
  # "AWI" => "approvedWorkItem",
13
15
  # "CD" => "committeeDraft",
@@ -18,6 +20,7 @@ module RelatonIso
18
20
  # "R" => "recommendation",
19
21
  "Guide" => "guide",
20
22
  "ISO" => "international-standard",
23
+ "IEC" => "international-standard",
21
24
  "IWA" => "international-workshop-agreement",
22
25
  }.freeze
23
26
 
@@ -48,43 +51,38 @@ module RelatonIso
48
51
  extend self
49
52
 
50
53
  # Parse page.
51
- # @param hit [RelatonIso::Hit]
52
- # @param lang [String, NilClass]
54
+ # @param path [String]
55
+ # @param lang [String, nil]
53
56
  # @return [RelatonIsoBib::IsoBibliographicItem]
54
- def parse_page(hit, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
55
- # path = "/contents/data/standard#{hit_data['splitPath']}/"\
56
- # "#{hit_data['csnumber']}.html"
57
-
58
- path = hit.hit[:path].sub("/sites/isoorg", "")
59
- doc, url = get_page "#{path}.html"
60
-
57
+ def parse_page(path, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
58
+ doc, url = get_page path
59
+ id = doc.at("//nav[contains(@class,'heading-condensed')]/h1").text.split(" | ").first
60
+ pubid = Pubid::Iso::Identifier.parse(id)
61
61
  # Fetch edition.
62
- edition = doc.at("//div[div[.='Edition']]/text()[last()]")
63
- &.text&.match(/\d+$/)&.to_s
64
- hit.pubid.base.edition ||= edition if hit.pubid.base
62
+ edition = doc.at("//div[div[.='Edition']]/text()[last()]")&.text&.match(/\d+$/)&.to_s
63
+ pubid.root.edition ||= edition if pubid.base
65
64
 
66
65
  titles, abstract, langs = fetch_titles_abstract(doc, lang)
67
66
 
68
67
  RelatonIsoBib::IsoBibliographicItem.new(
69
- fetched: Date.today.to_s,
70
- docid: fetch_relaton_docids(doc, hit.pubid),
71
- docnumber: fetch_docnumber(hit.pubid),
68
+ docid: fetch_relaton_docids(doc, pubid),
69
+ docnumber: fetch_docnumber(pubid),
72
70
  edition: edition,
73
71
  language: langs.map { |l| l[:lang] },
74
72
  script: langs.map { |l| script(l[:lang]) }.uniq,
75
73
  title: titles,
76
- doctype: fetch_type(hit.hit[:title]),
74
+ doctype: fetch_type(id),
77
75
  docstatus: fetch_status(doc),
78
76
  ics: fetch_ics(doc),
79
- date: fetch_dates(doc, hit.hit[:title]),
80
- contributor: fetch_contributors(hit.hit[:title]),
77
+ date: fetch_dates(doc, id),
78
+ contributor: fetch_contributors(id),
81
79
  editorialgroup: fetch_workgroup(doc),
82
80
  abstract: abstract,
83
81
  copyright: fetch_copyright(doc),
84
82
  link: fetch_link(doc, url),
85
83
  relation: fetch_relations(doc),
86
84
  place: ["Geneva"],
87
- structuredidentifier: fetch_structuredidentifier(hit.pubid),
85
+ structuredidentifier: fetch_structuredidentifier(pubid),
88
86
  )
89
87
  end
90
88
 
@@ -99,9 +97,9 @@ module RelatonIso
99
97
  def fetch_relaton_docids(doc, pubid)
100
98
  pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code(doc))
101
99
  [
102
- RelatonIso::DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
100
+ DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
103
101
  RelatonBib::DocumentIdentifier.new(id: isoref(pubid), type: "iso-reference"),
104
- RelatonIso::DocumentIdentifier.new(id: pubid, type: "URN"),
102
+ DocumentIdentifier.new(id: pubid, type: "URN"),
105
103
  ]
106
104
  end
107
105
 
@@ -121,49 +119,45 @@ module RelatonIso
121
119
 
122
120
  # Fetch titles and abstracts.
123
121
  # @param doc [Nokigiri::HTML::Document]
124
- # @param lang [String, NilClass]
122
+ # @param lang [String, nil]
125
123
  # @return [Array<Array>]
126
- def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
124
+ def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
127
125
  titles = RelatonBib::TypedTitleStringCollection.new
128
126
  abstract = []
129
- langs = languages(doc, lang).reduce([]) do |s, l|
127
+ langs = languages(doc, lang).each_with_object([]) do |l, s|
130
128
  # Don't need to get page for en. We already have it.
131
129
  d = l[:path] ? get_page(l[:path])[0] : doc
132
- unless d.at("//h5[@class='help-block']" \
133
- "[.='недоступно на русском языке']")
130
+ unless d.at("//h5[@class='help-block'][.='недоступно на русском языке']")
134
131
  s << l
135
132
  titles += fetch_title(d, l[:lang])
136
133
 
137
- # Fetch abstracts.
138
- abstract_content = d.xpath(
139
- "//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
140
- ).map do |a|
141
- a.name == "li" ? "- #{a.text}" : a.text
142
- end.reject(&:empty?).join("\n")
143
- unless abstract_content.empty?
144
- abstract << {
145
- content: abstract_content,
146
- language: l[:lang],
147
- script: script(l[:lang]),
148
- format: "text/plain",
149
- }
150
- end
134
+ abstr = parse_abstract(d, l)
135
+ abstract << abstr if abstr
151
136
  end
152
- s
153
137
  end
154
138
  [titles, abstract, langs]
155
139
  end
156
140
 
141
+ def parse_abstract(doc, lang)
142
+ abstract_content = doc.xpath(
143
+ "//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
144
+ ).map { |a| a.name == "li" ? "- #{a.text}" : a.text }.reject(&:empty?).join("\n")
145
+ return if abstract_content.empty?
146
+
147
+ { content: abstract_content, language: lang[:lang],
148
+ script: script(lang[:lang]), format: "text/plain" }
149
+ end
150
+
157
151
  # Returns available languages.
158
152
  # @param doc [Nokogiri::HTML::Document]
159
- # @pqrqm lang [String, NilClass]
153
+ # @param lang [String, nil]
160
154
  # @return [Array<Hash>]
161
155
  def languages(doc, lang)
162
156
  lgs = [{ lang: "en" }]
163
157
  doc.css("li#lang-switcher ul li a").each do |lang_link|
164
158
  lang_path = lang_link.attr("href")
165
159
  l = lang_path.match(%r{^/(fr)/})
166
- lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] == lang)
160
+ lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] != lang)
167
161
  end
168
162
  lgs
169
163
  end
@@ -171,14 +165,21 @@ module RelatonIso
171
165
  # Get page.
172
166
  # @param path [String] page's path
173
167
  # @return [Array<Nokogiri::HTML::Document, String>]
174
- def get_page(path)
175
- resp, uri = get_redirection path
176
- doc = try_if_fail resp, uri
177
- [doc, uri.to_s]
178
- rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
179
- EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
180
- Net::ProtocolError, Errno::ETIMEDOUT
181
- raise RelatonBib::RequestError, "Could not access #{uri}"
168
+ def get_page(path) # rubocop:disable Metrics/MethodLength
169
+ try = 0
170
+ begin
171
+ resp, uri = get_redirection path
172
+ doc = try_if_fail resp, uri
173
+ [doc, uri.to_s]
174
+ rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
175
+ EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
176
+ Net::ProtocolError, Errno::ETIMEDOUT
177
+ try += 1
178
+ raise RelatonBib::RequestError, "Could not access #{DOMAIN}#{path}" if try > 3
179
+
180
+ sleep 1
181
+ retry
182
+ end
182
183
  end
183
184
 
184
185
  #
@@ -190,13 +191,37 @@ module RelatonIso
190
191
  # @return [Array<Net::HTTPOK, URI>] HTTP response and URI
191
192
  # @raise [RelatonBib::RequestError] if the page is not found
192
193
  #
193
- def get_redirection(path)
194
- url = DOMAIN + path
195
- uri = URI url
196
- resp = Net::HTTP.get_response(uri)
197
- raise RelatonBib::RequestError, "#{url} not found." if %w[404 302].include? resp.code
194
+ def get_redirection(path) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
195
+ uri = URI(DOMAIN + path)
196
+ try = 0
197
+ begin
198
+ get_response uri
199
+ rescue Errno::EPIPE => e
200
+ try += 1
201
+ retry if check_try try, uri
202
+ raise e
203
+ end
204
+ end
198
205
 
199
- resp.code == "301" ? get_redirection(resp["location"]) : [resp, uri]
206
+ def check_try(try, uri)
207
+ if try < 3
208
+ warn "Timeout fetching #{uri}, retrying..."
209
+ sleep 1
210
+ true
211
+ end
212
+ end
213
+
214
+ def get_response(uri, try = 0)
215
+ raise RelatonBib::RequestError, "#{uri} not found." if try > 3
216
+
217
+ resp = Net::HTTP.get_response(uri)
218
+ case resp.code
219
+ when "200" then [resp, uri]
220
+ when "301" then get_redirection(resp["location"])
221
+ else
222
+ sleep 1
223
+ get_response uri, try + 1
224
+ end
200
225
  end
201
226
 
202
227
  #
@@ -240,12 +265,19 @@ module RelatonIso
240
265
  #
241
266
  def fetch_structuredidentifier(pubid) # rubocop:disable Metrics/MethodLength
242
267
  RelatonIsoBib::StructuredIdentifier.new(
243
- project_number: "#{pubid.publisher} #{pubid.number}",
244
- part: pubid.part&.to_s, # &.sub(/^-/, ""),
245
- type: pubid.publisher,
268
+ project_number: "#{pubid.root.publisher} #{pubid.root.number}",
269
+ part: pubid.root.part&.to_s, # &.sub(/^-/, ""),
270
+ type: pubid.root.publisher,
246
271
  )
247
272
  end
248
273
 
274
+ #
275
+ # Parse ID from the document.
276
+ #
277
+ # @param [Nokogiri::HTML::Document] doc document to parse
278
+ #
279
+ # @return [String, nil] ID
280
+ #
249
281
  def item_ref(doc)
250
282
  doc.at("//main//section/div/div/div//h1")&.text
251
283
  end
@@ -271,7 +303,7 @@ module RelatonIso
271
303
 
272
304
  # Fetch workgroup.
273
305
  # @param doc [Nokogiri::HTML::Document]
274
- # @return [Hash]
306
+ # @return [RelatonIsoBib::EditorialGroup, nil]
275
307
  def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
276
308
  wg = doc.at("////div[contains(., 'Technical Committe')]/following-sibling::span/a")
277
309
  return unless wg
@@ -286,7 +318,7 @@ module RelatonIso
286
318
  tc_numb = workgroup[1]&.match(/\d+/)&.to_s&.to_i
287
319
  tc_name = wg[:title]
288
320
  tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg.text,
289
- type: type, number: tc_numb)
321
+ type: type, number: tc_numb)
290
322
  RelatonIsoBib::EditorialGroup.new(technical_committee: [tc])
291
323
  end
292
324
 
@@ -303,6 +335,14 @@ module RelatonIso
303
335
  end
304
336
  end
305
337
 
338
+ #
339
+ # Parse relation type and dates.
340
+ #
341
+ # @param [String] type parsed type
342
+ # @param [Nokogiri::HTML::Document] doc document to parse
343
+ #
344
+ # @return [Array<String,Array>] type and dates
345
+ #
306
346
  def relation_type(type, doc)
307
347
  date = []
308
348
  t = case type.strip
@@ -316,9 +356,20 @@ module RelatonIso
316
356
  [t, date]
317
357
  end
318
358
 
359
+ #
360
+ # Create relations.
361
+ #
362
+ # @param [Nokogiri::HTML::Element] rel relation element
363
+ # @param [String] type relation type
364
+ # @param [Hash{Symbol=>String}] date relation document date
365
+ # @option date [String] :type date type
366
+ # @option date [String] :on date
367
+ #
368
+ # @return [Array<Hash>] Relations
369
+ #
319
370
  def create_relations(rel, type, date)
320
371
  rel.css("a").map do |id|
321
- docid = RelatonBib::DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
372
+ docid = DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
322
373
  fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
323
374
  bibitem = RelatonIsoBib::IsoBibliographicItem.new(
324
375
  docid: [docid], formattedref: fref, date: date,
@@ -333,14 +384,11 @@ module RelatonIso
333
384
  def fetch_type(ref)
334
385
  %r{
335
386
  ^(?<prefix>ISO|IWA|IEC)
336
- (?:(?:/IEC|/IEEE|/PRF|/NP|/DGuide)*\s|/)
337
- (?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))
387
+ (?:(?:/IEC|/IEEE|/PRF|/NP|/SAE|/HL7|/DGuide)*\s|/)
388
+ (?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|DTS|DTR|ISP|PWI|Guide|(?=\d+))
338
389
  }x =~ ref
339
- # return "international-standard" if type_match.nil?
340
- type = TYPES[type] || TYPES[prefix]
390
+ type = TYPES[type] || TYPES[prefix] || "international-standard"
341
391
  RelatonIsoBib::DocumentType.new(type: type)
342
- # rescue => _e
343
- # puts 'Unknown document type: ' + title
344
392
  end
345
393
 
346
394
  # Fetch titles.
@@ -445,7 +493,7 @@ module RelatonIso
445
493
  links << { type: "obp", content: obp[:href] } if obp
446
494
  rss = doc.at("//a[contains(@href, 'rss')]")
447
495
  links << { type: "rss", content: DOMAIN + rss[:href] } if rss
448
- pub = doc.at "//p[contains(., 'publicly available')]/a",
496
+ pub = doc.at "//p[contains(., 'publicly available')]/a",
449
497
  "//p[contains(., 'can be downloaded from the')]/a"
450
498
  links << { type: "pub", content: pub[:href] } if pub
451
499
  links
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RelatonIso
4
- VERSION = "1.18.1"
4
+ VERSION = "1.18.2"
5
5
  end
data/lib/relaton_iso.rb CHANGED
@@ -4,10 +4,15 @@ require "nokogiri"
4
4
  require "net/http"
5
5
  require "logger"
6
6
  require "pubid-iso"
7
+ require "relaton/index"
7
8
  require "relaton_iso_bib"
8
9
  require "relaton_iso/version"
9
10
  require "relaton_iso/config"
10
11
  require "relaton_iso/util"
12
+ require "relaton_iso/hash_converter"
11
13
  require "relaton_iso/hit"
12
14
  require "relaton_iso/iso_bibliography"
13
15
  require "relaton_iso/document_identifier"
16
+ # require "relaton_iso/index"
17
+ require "relaton_iso/queue"
18
+ require "relaton_iso/data_fetcher"
data/relaton_iso.gemspec CHANGED
@@ -28,5 +28,6 @@ Gem::Specification.new do |spec|
28
28
 
29
29
  spec.add_dependency "algolia", "~> 2.3.0"
30
30
  spec.add_dependency "pubid", "~> 0.1.1"
31
+ spec.add_dependency "relaton-index", "~> 0.2.12"
31
32
  spec.add_dependency "relaton-iso-bib", "~> 1.18.0"
32
33
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-iso
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.18.1
4
+ version: 1.18.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-02-02 00:00:00.000000000 Z
11
+ date: 2024-02-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: algolia
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: 0.1.1
41
+ - !ruby/object:Gem::Dependency
42
+ name: relaton-index
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 0.2.12
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 0.2.12
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: relaton-iso-bib
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -94,11 +108,15 @@ files:
94
108
  - bin/thor
95
109
  - lib/relaton_iso.rb
96
110
  - lib/relaton_iso/config.rb
111
+ - lib/relaton_iso/data_fetcher.rb
97
112
  - lib/relaton_iso/document_identifier.rb
113
+ - lib/relaton_iso/hash_converter.rb
98
114
  - lib/relaton_iso/hit.rb
99
115
  - lib/relaton_iso/hit_collection.rb
116
+ - lib/relaton_iso/index.rb
100
117
  - lib/relaton_iso/iso_bibliography.rb
101
118
  - lib/relaton_iso/processor.rb
119
+ - lib/relaton_iso/queue.rb
102
120
  - lib/relaton_iso/scrapper.rb
103
121
  - lib/relaton_iso/util.rb
104
122
  - lib/relaton_iso/version.rb