relaton-iso 1.18.1 → 1.18.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -7,7 +7,9 @@ module RelatonIso
7
7
 
8
8
  TYPES = {
9
9
  "TS" => "technical-specification",
10
+ "DTS" => "technical-specification",
10
11
  "TR" => "technical-report",
12
+ "DTR" => "technical-report",
11
13
  "PAS" => "publicly-available-specification",
12
14
  # "AWI" => "approvedWorkItem",
13
15
  # "CD" => "committeeDraft",
@@ -18,6 +20,7 @@ module RelatonIso
18
20
  # "R" => "recommendation",
19
21
  "Guide" => "guide",
20
22
  "ISO" => "international-standard",
23
+ "IEC" => "international-standard",
21
24
  "IWA" => "international-workshop-agreement",
22
25
  }.freeze
23
26
 
@@ -48,43 +51,38 @@ module RelatonIso
48
51
  extend self
49
52
 
50
53
  # Parse page.
51
- # @param hit [RelatonIso::Hit]
52
- # @param lang [String, NilClass]
54
+ # @param path [String]
55
+ # @param lang [String, nil]
53
56
  # @return [RelatonIsoBib::IsoBibliographicItem]
54
- def parse_page(hit, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
55
- # path = "/contents/data/standard#{hit_data['splitPath']}/"\
56
- # "#{hit_data['csnumber']}.html"
57
-
58
- path = hit.hit[:path].sub("/sites/isoorg", "")
59
- doc, url = get_page "#{path}.html"
60
-
57
+ def parse_page(path, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
58
+ doc, url = get_page path
59
+ id = doc.at("//nav[contains(@class,'heading-condensed')]/h1").text.split(" | ").first
60
+ pubid = Pubid::Iso::Identifier.parse(id)
61
61
  # Fetch edition.
62
- edition = doc.at("//div[div[.='Edition']]/text()[last()]")
63
- &.text&.match(/\d+$/)&.to_s
64
- hit.pubid.base.edition ||= edition if hit.pubid.base
62
+ edition = doc.at("//div[div[.='Edition']]/text()[last()]")&.text&.match(/\d+$/)&.to_s
63
+ pubid.root.edition ||= edition if pubid.base
65
64
 
66
65
  titles, abstract, langs = fetch_titles_abstract(doc, lang)
67
66
 
68
67
  RelatonIsoBib::IsoBibliographicItem.new(
69
- fetched: Date.today.to_s,
70
- docid: fetch_relaton_docids(doc, hit.pubid),
71
- docnumber: fetch_docnumber(hit.pubid),
68
+ docid: fetch_relaton_docids(doc, pubid),
69
+ docnumber: fetch_docnumber(pubid),
72
70
  edition: edition,
73
71
  language: langs.map { |l| l[:lang] },
74
72
  script: langs.map { |l| script(l[:lang]) }.uniq,
75
73
  title: titles,
76
- doctype: fetch_type(hit.hit[:title]),
74
+ doctype: fetch_type(id),
77
75
  docstatus: fetch_status(doc),
78
76
  ics: fetch_ics(doc),
79
- date: fetch_dates(doc, hit.hit[:title]),
80
- contributor: fetch_contributors(hit.hit[:title]),
77
+ date: fetch_dates(doc, id),
78
+ contributor: fetch_contributors(id),
81
79
  editorialgroup: fetch_workgroup(doc),
82
80
  abstract: abstract,
83
81
  copyright: fetch_copyright(doc),
84
82
  link: fetch_link(doc, url),
85
83
  relation: fetch_relations(doc),
86
84
  place: ["Geneva"],
87
- structuredidentifier: fetch_structuredidentifier(hit.pubid),
85
+ structuredidentifier: fetch_structuredidentifier(pubid),
88
86
  )
89
87
  end
90
88
 
@@ -99,9 +97,9 @@ module RelatonIso
99
97
  def fetch_relaton_docids(doc, pubid)
100
98
  pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code(doc))
101
99
  [
102
- RelatonIso::DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
100
+ DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
103
101
  RelatonBib::DocumentIdentifier.new(id: isoref(pubid), type: "iso-reference"),
104
- RelatonIso::DocumentIdentifier.new(id: pubid, type: "URN"),
102
+ DocumentIdentifier.new(id: pubid, type: "URN"),
105
103
  ]
106
104
  end
107
105
 
@@ -121,49 +119,45 @@ module RelatonIso
121
119
 
122
120
  # Fetch titles and abstracts.
123
121
  # @param doc [Nokigiri::HTML::Document]
124
- # @param lang [String, NilClass]
122
+ # @param lang [String, nil]
125
123
  # @return [Array<Array>]
126
- def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
124
+ def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
127
125
  titles = RelatonBib::TypedTitleStringCollection.new
128
126
  abstract = []
129
- langs = languages(doc, lang).reduce([]) do |s, l|
127
+ langs = languages(doc, lang).each_with_object([]) do |l, s|
130
128
  # Don't need to get page for en. We already have it.
131
129
  d = l[:path] ? get_page(l[:path])[0] : doc
132
- unless d.at("//h5[@class='help-block']" \
133
- "[.='недоступно на русском языке']")
130
+ unless d.at("//h5[@class='help-block'][.='недоступно на русском языке']")
134
131
  s << l
135
132
  titles += fetch_title(d, l[:lang])
136
133
 
137
- # Fetch abstracts.
138
- abstract_content = d.xpath(
139
- "//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
140
- ).map do |a|
141
- a.name == "li" ? "- #{a.text}" : a.text
142
- end.reject(&:empty?).join("\n")
143
- unless abstract_content.empty?
144
- abstract << {
145
- content: abstract_content,
146
- language: l[:lang],
147
- script: script(l[:lang]),
148
- format: "text/plain",
149
- }
150
- end
134
+ abstr = parse_abstract(d, l)
135
+ abstract << abstr if abstr
151
136
  end
152
- s
153
137
  end
154
138
  [titles, abstract, langs]
155
139
  end
156
140
 
141
+ def parse_abstract(doc, lang)
142
+ abstract_content = doc.xpath(
143
+ "//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
144
+ ).map { |a| a.name == "li" ? "- #{a.text}" : a.text }.reject(&:empty?).join("\n")
145
+ return if abstract_content.empty?
146
+
147
+ { content: abstract_content, language: lang[:lang],
148
+ script: script(lang[:lang]), format: "text/plain" }
149
+ end
150
+
157
151
  # Returns available languages.
158
152
  # @param doc [Nokogiri::HTML::Document]
159
- # @pqrqm lang [String, NilClass]
153
+ # @param lang [String, nil]
160
154
  # @return [Array<Hash>]
161
155
  def languages(doc, lang)
162
156
  lgs = [{ lang: "en" }]
163
157
  doc.css("li#lang-switcher ul li a").each do |lang_link|
164
158
  lang_path = lang_link.attr("href")
165
159
  l = lang_path.match(%r{^/(fr)/})
166
- lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] == lang)
160
+ lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] != lang)
167
161
  end
168
162
  lgs
169
163
  end
@@ -171,14 +165,21 @@ module RelatonIso
171
165
  # Get page.
172
166
  # @param path [String] page's path
173
167
  # @return [Array<Nokogiri::HTML::Document, String>]
174
- def get_page(path)
175
- resp, uri = get_redirection path
176
- doc = try_if_fail resp, uri
177
- [doc, uri.to_s]
178
- rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
179
- EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
180
- Net::ProtocolError, Errno::ETIMEDOUT
181
- raise RelatonBib::RequestError, "Could not access #{uri}"
168
+ def get_page(path) # rubocop:disable Metrics/MethodLength
169
+ try = 0
170
+ begin
171
+ resp, uri = get_redirection path
172
+ doc = try_if_fail resp, uri
173
+ [doc, uri.to_s]
174
+ rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
175
+ EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
176
+ Net::ProtocolError, Errno::ETIMEDOUT
177
+ try += 1
178
+ raise RelatonBib::RequestError, "Could not access #{DOMAIN}#{path}" if try > 3
179
+
180
+ sleep 1
181
+ retry
182
+ end
182
183
  end
183
184
 
184
185
  #
@@ -190,13 +191,37 @@ module RelatonIso
190
191
  # @return [Array<Net::HTTPOK, URI>] HTTP response and URI
191
192
  # @raise [RelatonBib::RequestError] if the page is not found
192
193
  #
193
- def get_redirection(path)
194
- url = DOMAIN + path
195
- uri = URI url
196
- resp = Net::HTTP.get_response(uri)
197
- raise RelatonBib::RequestError, "#{url} not found." if %w[404 302].include? resp.code
194
+ def get_redirection(path) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength
195
+ uri = URI(DOMAIN + path)
196
+ try = 0
197
+ begin
198
+ get_response uri
199
+ rescue Errno::EPIPE => e
200
+ try += 1
201
+ retry if check_try try, uri
202
+ raise e
203
+ end
204
+ end
198
205
 
199
- resp.code == "301" ? get_redirection(resp["location"]) : [resp, uri]
206
+ def check_try(try, uri)
207
+ if try < 3
208
+ warn "Timeout fetching #{uri}, retrying..."
209
+ sleep 1
210
+ true
211
+ end
212
+ end
213
+
214
+ def get_response(uri, try = 0)
215
+ raise RelatonBib::RequestError, "#{uri} not found." if try > 3
216
+
217
+ resp = Net::HTTP.get_response(uri)
218
+ case resp.code
219
+ when "200" then [resp, uri]
220
+ when "301" then get_redirection(resp["location"])
221
+ else
222
+ sleep 1
223
+ get_response uri, try + 1
224
+ end
200
225
  end
201
226
 
202
227
  #
@@ -240,12 +265,19 @@ module RelatonIso
240
265
  #
241
266
  def fetch_structuredidentifier(pubid) # rubocop:disable Metrics/MethodLength
242
267
  RelatonIsoBib::StructuredIdentifier.new(
243
- project_number: "#{pubid.publisher} #{pubid.number}",
244
- part: pubid.part&.to_s, # &.sub(/^-/, ""),
245
- type: pubid.publisher,
268
+ project_number: "#{pubid.root.publisher} #{pubid.root.number}",
269
+ part: pubid.root.part&.to_s, # &.sub(/^-/, ""),
270
+ type: pubid.root.publisher,
246
271
  )
247
272
  end
248
273
 
274
+ #
275
+ # Parse ID from the document.
276
+ #
277
+ # @param [Nokogiri::HTML::Document] doc document to parse
278
+ #
279
+ # @return [String, nil] ID
280
+ #
249
281
  def item_ref(doc)
250
282
  doc.at("//main//section/div/div/div//h1")&.text
251
283
  end
@@ -271,7 +303,7 @@ module RelatonIso
271
303
 
272
304
  # Fetch workgroup.
273
305
  # @param doc [Nokogiri::HTML::Document]
274
- # @return [Hash]
306
+ # @return [RelatonIsoBib::EditorialGroup, nil]
275
307
  def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
276
308
  wg = doc.at("////div[contains(., 'Technical Committe')]/following-sibling::span/a")
277
309
  return unless wg
@@ -286,7 +318,7 @@ module RelatonIso
286
318
  tc_numb = workgroup[1]&.match(/\d+/)&.to_s&.to_i
287
319
  tc_name = wg[:title]
288
320
  tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg.text,
289
- type: type, number: tc_numb)
321
+ type: type, number: tc_numb)
290
322
  RelatonIsoBib::EditorialGroup.new(technical_committee: [tc])
291
323
  end
292
324
 
@@ -303,6 +335,14 @@ module RelatonIso
303
335
  end
304
336
  end
305
337
 
338
+ #
339
+ # Parse relation type and dates.
340
+ #
341
+ # @param [String] type parsed type
342
+ # @param [Nokogiri::HTML::Document] doc document to parse
343
+ #
344
+ # @return [Array<String,Array>] type and dates
345
+ #
306
346
  def relation_type(type, doc)
307
347
  date = []
308
348
  t = case type.strip
@@ -316,9 +356,20 @@ module RelatonIso
316
356
  [t, date]
317
357
  end
318
358
 
359
+ #
360
+ # Create relations.
361
+ #
362
+ # @param [Nokogiri::HTML::Element] rel relation element
363
+ # @param [String] type relation type
364
+ # @param [Hash{Symbol=>String}] date relation document date
365
+ # @option date [String] :type date type
366
+ # @option date [String] :on date
367
+ #
368
+ # @return [Array<Hash>] Relations
369
+ #
319
370
  def create_relations(rel, type, date)
320
371
  rel.css("a").map do |id|
321
- docid = RelatonBib::DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
372
+ docid = DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
322
373
  fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
323
374
  bibitem = RelatonIsoBib::IsoBibliographicItem.new(
324
375
  docid: [docid], formattedref: fref, date: date,
@@ -333,14 +384,11 @@ module RelatonIso
333
384
  def fetch_type(ref)
334
385
  %r{
335
386
  ^(?<prefix>ISO|IWA|IEC)
336
- (?:(?:/IEC|/IEEE|/PRF|/NP|/DGuide)*\s|/)
337
- (?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))
387
+ (?:(?:/IEC|/IEEE|/PRF|/NP|/SAE|/HL7|/DGuide)*\s|/)
388
+ (?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|DTS|DTR|ISP|PWI|Guide|(?=\d+))
338
389
  }x =~ ref
339
- # return "international-standard" if type_match.nil?
340
- type = TYPES[type] || TYPES[prefix]
390
+ type = TYPES[type] || TYPES[prefix] || "international-standard"
341
391
  RelatonIsoBib::DocumentType.new(type: type)
342
- # rescue => _e
343
- # puts 'Unknown document type: ' + title
344
392
  end
345
393
 
346
394
  # Fetch titles.
@@ -445,7 +493,7 @@ module RelatonIso
445
493
  links << { type: "obp", content: obp[:href] } if obp
446
494
  rss = doc.at("//a[contains(@href, 'rss')]")
447
495
  links << { type: "rss", content: DOMAIN + rss[:href] } if rss
448
- pub = doc.at "//p[contains(., 'publicly available')]/a",
496
+ pub = doc.at "//p[contains(., 'publicly available')]/a",
449
497
  "//p[contains(., 'can be downloaded from the')]/a"
450
498
  links << { type: "pub", content: pub[:href] } if pub
451
499
  links
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RelatonIso
4
- VERSION = "1.18.1"
4
+ VERSION = "1.18.2"
5
5
  end
data/lib/relaton_iso.rb CHANGED
@@ -4,10 +4,15 @@ require "nokogiri"
4
4
  require "net/http"
5
5
  require "logger"
6
6
  require "pubid-iso"
7
+ require "relaton/index"
7
8
  require "relaton_iso_bib"
8
9
  require "relaton_iso/version"
9
10
  require "relaton_iso/config"
10
11
  require "relaton_iso/util"
12
+ require "relaton_iso/hash_converter"
11
13
  require "relaton_iso/hit"
12
14
  require "relaton_iso/iso_bibliography"
13
15
  require "relaton_iso/document_identifier"
16
+ # require "relaton_iso/index"
17
+ require "relaton_iso/queue"
18
+ require "relaton_iso/data_fetcher"
data/relaton_iso.gemspec CHANGED
@@ -28,5 +28,6 @@ Gem::Specification.new do |spec|
28
28
 
29
29
  spec.add_dependency "algolia", "~> 2.3.0"
30
30
  spec.add_dependency "pubid", "~> 0.1.1"
31
+ spec.add_dependency "relaton-index", "~> 0.2.12"
31
32
  spec.add_dependency "relaton-iso-bib", "~> 1.18.0"
32
33
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-iso
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.18.1
4
+ version: 1.18.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2024-02-02 00:00:00.000000000 Z
11
+ date: 2024-02-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: algolia
@@ -38,6 +38,20 @@ dependencies:
38
38
  - - "~>"
39
39
  - !ruby/object:Gem::Version
40
40
  version: 0.1.1
41
+ - !ruby/object:Gem::Dependency
42
+ name: relaton-index
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: 0.2.12
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: 0.2.12
41
55
  - !ruby/object:Gem::Dependency
42
56
  name: relaton-iso-bib
43
57
  requirement: !ruby/object:Gem::Requirement
@@ -94,11 +108,15 @@ files:
94
108
  - bin/thor
95
109
  - lib/relaton_iso.rb
96
110
  - lib/relaton_iso/config.rb
111
+ - lib/relaton_iso/data_fetcher.rb
97
112
  - lib/relaton_iso/document_identifier.rb
113
+ - lib/relaton_iso/hash_converter.rb
98
114
  - lib/relaton_iso/hit.rb
99
115
  - lib/relaton_iso/hit_collection.rb
116
+ - lib/relaton_iso/index.rb
100
117
  - lib/relaton_iso/iso_bibliography.rb
101
118
  - lib/relaton_iso/processor.rb
119
+ - lib/relaton_iso/queue.rb
102
120
  - lib/relaton_iso/scrapper.rb
103
121
  - lib/relaton_iso/util.rb
104
122
  - lib/relaton_iso/version.rb