relaton-iso 1.16.1 → 1.16.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 45b4a081a62ab5a5f0a4e6f2c2cffb4950861f09e838401a68aa2208731d65ec
4
- data.tar.gz: 01521bd3e1fa7853145a390461390b7a07dfc20e1efb02c2d6d90372d03a8664
3
+ metadata.gz: 13ecc04a430b1dbf256c0853f612969727c16eba72a06cb2bc74bed17745ba90
4
+ data.tar.gz: f795f63a994b843e07d4857ba3b0dd9c91ec9a3ccb408827f1bf7bdbf5f854a9
5
5
  SHA512:
6
- metadata.gz: 0e72371e46e2d03875fce213861ab9f087fdafca4abe748436f8ccc217ee2d82b5a089c20d73062d2a022366b79294ac7b1a16f0b3f59593f79d673800286877
7
- data.tar.gz: c6fa8308f8feb86cc08ae3a1fde9e267169c5bf8b5292a6c97f4dbf7f28668b56ac111e3dc03411dfa537ce83905e87a273d866b275f376cef402a3f641a59b6
6
+ metadata.gz: d33586bbe409f54736b694d774a52e1bef8a4cc2d7c304aebd06c5ead8b3893b6f45c65d3e5c586c5e7f9f23501b52ae6b0630c25213d6105660251d03cff94e
7
+ data.tar.gz: e7fdcb33dfa855c73ead77a514eae36d761274bafeec77c520ac8ff84a05c6a04a2b30bd80fa7d89fcabda1975eeb95f85c65d0c177da7e1694da97bc4245ccd
@@ -4,7 +4,7 @@ module RelatonIso
4
4
  class Processor < Relaton::Processor
5
5
  attr_reader :idtype
6
6
 
7
- def initialize
7
+ def initialize # rubocop:disable Lint/MissingSuper
8
8
  @short = :relaton_iso
9
9
  @prefix = "ISO"
10
10
  @defaultprefix = %r{^ISO(/IEC)?\s}
@@ -43,407 +43,427 @@ module RelatonIso
43
43
  url: "www.asme.org" },
44
44
  }.freeze
45
45
 
46
- class << self
47
- # Parse page.
48
- # @param hit [RelatonIso::Hit]
49
- # @param lang [String, NilClass]
50
- # @return [RelatonIsoBib::IsoBibliographicItem]
51
- def parse_page(hit, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
52
- # path = "/contents/data/standard#{hit_data['splitPath']}/"\
53
- # "#{hit_data['csnumber']}.html"
54
-
55
- path = hit.hit[:path].sub("/sites/isoorg", "")
56
- doc, url = get_page "#{path}.html"
57
-
58
- # Fetch edition.
59
- edition = doc.at("//div[div[.='Edition']]/text()[last()]")
60
- &.text&.match(/\d+$/)&.to_s
61
- hit.pubid.base.edition ||= edition if hit.pubid.base
62
-
63
- titles, abstract, langs = fetch_titles_abstract(doc, lang)
64
-
65
- RelatonIsoBib::IsoBibliographicItem.new(
66
- fetched: Date.today.to_s,
67
- docid: fetch_relaton_docids(doc, hit.pubid),
68
- docnumber: fetch_docnumber(hit.pubid),
69
- edition: edition,
70
- language: langs.map { |l| l[:lang] },
71
- script: langs.map { |l| script(l[:lang]) }.uniq,
72
- title: titles,
73
- doctype: fetch_type(hit.hit[:title]),
74
- docstatus: fetch_status(doc),
75
- ics: fetch_ics(doc),
76
- date: fetch_dates(doc, hit.hit[:title]),
77
- contributor: fetch_contributors(hit.hit[:title]),
78
- editorialgroup: fetch_workgroup(doc),
79
- abstract: abstract,
80
- copyright: fetch_copyright(doc),
81
- link: fetch_link(doc, url),
82
- relation: fetch_relations(doc),
83
- place: ["Geneva"],
84
- structuredidentifier: fetch_structuredidentifier(hit.pubid),
85
- )
86
- end
46
+ extend self
47
+
48
+ # Parse page.
49
+ # @param hit [RelatonIso::Hit]
50
+ # @param lang [String, NilClass]
51
+ # @return [RelatonIsoBib::IsoBibliographicItem]
52
+ def parse_page(hit, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
53
+ # path = "/contents/data/standard#{hit_data['splitPath']}/"\
54
+ # "#{hit_data['csnumber']}.html"
55
+
56
+ path = hit.hit[:path].sub("/sites/isoorg", "")
57
+ doc, url = get_page "#{path}.html"
58
+
59
+ # Fetch edition.
60
+ edition = doc.at("//div[div[.='Edition']]/text()[last()]")
61
+ &.text&.match(/\d+$/)&.to_s
62
+ hit.pubid.base.edition ||= edition if hit.pubid.base
63
+
64
+ titles, abstract, langs = fetch_titles_abstract(doc, lang)
65
+
66
+ RelatonIsoBib::IsoBibliographicItem.new(
67
+ fetched: Date.today.to_s,
68
+ docid: fetch_relaton_docids(doc, hit.pubid),
69
+ docnumber: fetch_docnumber(hit.pubid),
70
+ edition: edition,
71
+ language: langs.map { |l| l[:lang] },
72
+ script: langs.map { |l| script(l[:lang]) }.uniq,
73
+ title: titles,
74
+ doctype: fetch_type(hit.hit[:title]),
75
+ docstatus: fetch_status(doc),
76
+ ics: fetch_ics(doc),
77
+ date: fetch_dates(doc, hit.hit[:title]),
78
+ contributor: fetch_contributors(hit.hit[:title]),
79
+ editorialgroup: fetch_workgroup(doc),
80
+ abstract: abstract,
81
+ copyright: fetch_copyright(doc),
82
+ link: fetch_link(doc, url),
83
+ relation: fetch_relations(doc),
84
+ place: ["Geneva"],
85
+ structuredidentifier: fetch_structuredidentifier(hit.pubid),
86
+ )
87
+ end
87
88
 
88
- #
89
- # Create document ids.
90
- #
91
- # @param doc [Nokogiri::HTML::Document] document to parse
92
- # @param pubid [Pubid::Iso::Identifier] publication identifier
93
- #
94
- # @return [Array<RelatonBib::DocumentIdentifier>]
95
- #
96
- def fetch_relaton_docids(doc, pubid)
97
- pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code(doc))
98
- [
99
- RelatonIso::DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
100
- RelatonBib::DocumentIdentifier.new(id: isoref(pubid), type: "iso-reference"),
101
- RelatonIso::DocumentIdentifier.new(id: pubid, type: "URN"),
102
- ]
103
- end
89
+ #
90
+ # Create document ids.
91
+ #
92
+ # @param doc [Nokogiri::HTML::Document] document to parse
93
+ # @param pubid [Pubid::Iso::Identifier] publication identifier
94
+ #
95
+ # @return [Array<RelatonBib::DocumentIdentifier>]
96
+ #
97
+ def fetch_relaton_docids(doc, pubid)
98
+ pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code(doc))
99
+ [
100
+ RelatonIso::DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
101
+ RelatonBib::DocumentIdentifier.new(id: isoref(pubid), type: "iso-reference"),
102
+ RelatonIso::DocumentIdentifier.new(id: pubid, type: "URN"),
103
+ ]
104
+ end
104
105
 
105
- #
106
- # Create ISO reference identifier with English language.
107
- #
108
- # @param [Pubid::Iso::Identifier] pubid publication identifier
109
- #
110
- # @return [String] English reference identifier
111
- #
112
- def isoref(pubid)
113
- params = pubid.get_params.reject { |k, _| k == :typed_stage }
114
- Pubid::Iso::Identifier.create(language: "en", **params).to_s(format: :ref_num_short)
115
- end
106
+ #
107
+ # Create ISO reference identifier with English language.
108
+ #
109
+ # @param [Pubid::Iso::Identifier] pubid publication identifier
110
+ #
111
+ # @return [String] English reference identifier
112
+ #
113
+ def isoref(pubid)
114
+ params = pubid.get_params.reject { |k, _| k == :typed_stage }
115
+ Pubid::Iso::Identifier.create(language: "en", **params).to_s(format: :ref_num_short)
116
+ end
116
117
 
117
- private
118
-
119
- # Fetch titles and abstracts.
120
- # @param doc [Nokigiri::HTML::Document]
121
- # @param lang [String, NilClass]
122
- # @return [Array<Array>]
123
- def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
124
- titles = RelatonBib::TypedTitleStringCollection.new
125
- abstract = []
126
- langs = languages(doc, lang).reduce([]) do |s, l|
127
- # Don't need to get page for en. We already have it.
128
- d = l[:path] ? get_page(l[:path])[0] : doc
129
- unless d.at("//h5[@class='help-block']" \
130
- "[.='недоступно на русском языке']")
131
- s << l
132
- titles += fetch_title(d, l[:lang])
133
-
134
- # Fetch abstracts.
135
- abstract_content = d.xpath(
136
- "//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
137
- ).map do |a|
138
- a.name == "li" ? "- #{a.text}" : a.text
139
- end.reject(&:empty?).join("\n")
140
- unless abstract_content.empty?
141
- abstract << {
142
- content: abstract_content,
143
- language: l[:lang],
144
- script: script(l[:lang]),
145
- format: "text/plain",
146
- }
147
- end
118
+ private
119
+
120
+ # Fetch titles and abstracts.
121
+ # @param doc [Nokigiri::HTML::Document]
122
+ # @param lang [String, NilClass]
123
+ # @return [Array<Array>]
124
+ def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
125
+ titles = RelatonBib::TypedTitleStringCollection.new
126
+ abstract = []
127
+ langs = languages(doc, lang).reduce([]) do |s, l|
128
+ # Don't need to get page for en. We already have it.
129
+ d = l[:path] ? get_page(l[:path])[0] : doc
130
+ unless d.at("//h5[@class='help-block']" \
131
+ "[.='недоступно на русском языке']")
132
+ s << l
133
+ titles += fetch_title(d, l[:lang])
134
+
135
+ # Fetch abstracts.
136
+ abstract_content = d.xpath(
137
+ "//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
138
+ ).map do |a|
139
+ a.name == "li" ? "- #{a.text}" : a.text
140
+ end.reject(&:empty?).join("\n")
141
+ unless abstract_content.empty?
142
+ abstract << {
143
+ content: abstract_content,
144
+ language: l[:lang],
145
+ script: script(l[:lang]),
146
+ format: "text/plain",
147
+ }
148
148
  end
149
- s
150
149
  end
151
- [titles, abstract, langs]
150
+ s
152
151
  end
152
+ [titles, abstract, langs]
153
+ end
153
154
 
154
- # Returns available languages.
155
- # @param doc [Nokogiri::HTML::Document]
156
- # @pqrqm lang [String, NilClass]
157
- # @return [Array<Hash>]
158
- def languages(doc, lang)
159
- lgs = [{ lang: "en" }]
160
- doc.css("li#lang-switcher ul li a").each do |lang_link|
161
- lang_path = lang_link.attr("href")
162
- l = lang_path.match(%r{^/(fr)/})
163
- lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] == lang)
164
- end
165
- lgs
155
+ # Returns available languages.
156
+ # @param doc [Nokogiri::HTML::Document]
157
+ # @pqrqm lang [String, NilClass]
158
+ # @return [Array<Hash>]
159
+ def languages(doc, lang)
160
+ lgs = [{ lang: "en" }]
161
+ doc.css("li#lang-switcher ul li a").each do |lang_link|
162
+ lang_path = lang_link.attr("href")
163
+ l = lang_path.match(%r{^/(fr)/})
164
+ lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] == lang)
166
165
  end
166
+ lgs
167
+ end
167
168
 
168
- # Get page.
169
- # @param path [String] page's path
170
- # @return [Array<Nokogiri::HTML::Document, String>]
171
- def get_page(path)
172
- resp, uri = get_redirection path
173
- doc = try_if_fail resp, uri
174
- [doc, uri.to_s]
175
- rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
176
- EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
177
- Net::ProtocolError, Errno::ETIMEDOUT
178
- raise RelatonBib::RequestError, "Could not access #{uri}"
179
- end
169
+ # Get page.
170
+ # @param path [String] page's path
171
+ # @return [Array<Nokogiri::HTML::Document, String>]
172
+ def get_page(path)
173
+ resp, uri = get_redirection path
174
+ doc = try_if_fail resp, uri
175
+ [doc, uri.to_s]
176
+ rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
177
+ EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
178
+ Net::ProtocolError, Errno::ETIMEDOUT
179
+ raise RelatonBib::RequestError, "Could not access #{uri}"
180
+ end
180
181
 
181
- #
182
- # Get the page from the given path. If the page is redirected, get the
183
- # page from the new path.
184
- #
185
- # @param [String] path path to the page
186
- #
187
- # @return [Array<Net::HTTPOK, URI>] HTTP response and URI
188
- # @raise [RelatonBib::RequestError] if the page is not found
189
- #
190
- def get_redirection(path)
191
- url = DOMAIN + path
192
- uri = URI url
193
- resp = Net::HTTP.get_response(uri)
194
- raise RelatonBib::RequestError, "#{url} not found." if %w[404 302].include? resp.code
182
+ #
183
+ # Get the page from the given path. If the page is redirected, get the
184
+ # page from the new path.
185
+ #
186
+ # @param [String] path path to the page
187
+ #
188
+ # @return [Array<Net::HTTPOK, URI>] HTTP response and URI
189
+ # @raise [RelatonBib::RequestError] if the page is not found
190
+ #
191
+ def get_redirection(path)
192
+ url = DOMAIN + path
193
+ uri = URI url
194
+ resp = Net::HTTP.get_response(uri)
195
+ raise RelatonBib::RequestError, "#{url} not found." if %w[404 302].include? resp.code
196
+
197
+ resp.code == "301" ? get_redirection(resp["location"]) : [resp, uri]
198
+ end
195
199
 
196
- resp.code == "301" ? get_redirection(resp["location"]) : [resp, uri]
197
- end
200
+ #
201
+ # The iso.org site fails to respond sometimes. This method tries to get
202
+ # the response again.
203
+ #
204
+ # @param [Net::HTTPOK] resp HTTP response
205
+ # @param [URI::HTTPS] uri URI of the page
206
+ #
207
+ # @return [Nokogiri::HTML4::Document] document
208
+ # @raise [RelatonBib::RequestError] if the page could not be parsed
209
+ #
210
+ def try_if_fail(resp, uri)
211
+ 10.times do
212
+ doc = Nokogiri::HTML(resp.body)
213
+ # stop trying if page has a document id
214
+ return doc if item_ref doc
198
215
 
199
- #
200
- # The iso.org site fails to respond sometimes. This method tries to get
201
- # the response again.
202
- #
203
- # @param [Net::HTTPOK] resp HTTP response
204
- # @param [URI::HTTPS] uri URI of the page
205
- #
206
- # @return [Nokogiri::HTML4::Document] document
207
- # @raise [RelatonBib::RequestError] if the page could not be parsed
208
- #
209
- def try_if_fail(resp, uri)
210
- 10.times do
211
- doc = Nokogiri::HTML(resp.body)
212
- # stop trying if page has a document id
213
- return doc if item_ref doc
214
-
215
- resp = Net::HTTP.get_response(uri)
216
- end
217
- raise RelatonBib::RequestError, "Could not parse the page #{uri}"
216
+ resp = Net::HTTP.get_response(uri)
218
217
  end
218
+ raise RelatonBib::RequestError, "Could not parse the page #{uri}"
219
+ end
219
220
 
220
- #
221
- # Generate docnumber.
222
- #
223
- # @param [Pubid::Iso] pubid
224
- #
225
- # @return [String] docnumber
226
- #
227
- def fetch_docnumber(pubid)
228
- pubid.to_s.match(/\d+/)&.to_s
229
- end
221
+ #
222
+ # Generate docnumber.
223
+ #
224
+ # @param [Pubid::Iso] pubid
225
+ #
226
+ # @return [String] docnumber
227
+ #
228
+ def fetch_docnumber(pubid)
229
+ pubid.to_s.match(/\d+/)&.to_s
230
+ end
230
231
 
231
- #
232
- # Parse structuredidentifier.
233
- #
234
- # @param pubid [Pubid::Iso::Identifier] pubid
235
- #
236
- # @return [RelatonBib::StructuredIdentifier] structured identifier
237
- #
238
- def fetch_structuredidentifier(pubid) # rubocop:disable Metrics/MethodLength
239
- RelatonIsoBib::StructuredIdentifier.new(
240
- project_number: "#{pubid.publisher} #{pubid.number}",
241
- part: pubid.part&.to_s, # &.sub(/^-/, ""),
242
- type: pubid.publisher,
243
- )
244
- end
232
+ #
233
+ # Parse structuredidentifier.
234
+ #
235
+ # @param pubid [Pubid::Iso::Identifier] pubid
236
+ #
237
+ # @return [RelatonBib::StructuredIdentifier] structured identifier
238
+ #
239
+ def fetch_structuredidentifier(pubid) # rubocop:disable Metrics/MethodLength
240
+ RelatonIsoBib::StructuredIdentifier.new(
241
+ project_number: "#{pubid.publisher} #{pubid.number}",
242
+ part: pubid.part&.to_s, # &.sub(/^-/, ""),
243
+ type: pubid.publisher,
244
+ )
245
+ end
245
246
 
246
- def item_ref(doc)
247
- doc.at("//main//section/div/div/div//h1")&.text
248
- end
247
+ def item_ref(doc)
248
+ doc.at("//main//section/div/div/div//h1")&.text
249
+ end
249
250
 
250
- # Fetch status.
251
- # @param doc [Nokogiri::HTML::Document]
252
- # @param status [String]
253
- # @return [Hash]
254
- def fetch_status(doc)
255
- stg, substg = stage_code(doc).split "."
256
- RelatonBib::DocumentStatus.new(stage: stg, substage: substg)
257
- end
251
+ # Fetch status.
252
+ # @param doc [Nokogiri::HTML::Document]
253
+ # @param status [String]
254
+ # @return [Hash]
255
+ def fetch_status(doc)
256
+ stg, substg = stage_code(doc).split "."
257
+ RelatonBib::DocumentStatus.new(stage: stg, substage: substg)
258
+ end
258
259
 
259
- def stage_code(doc)
260
- doc.at("//ul[@class='dropdown-menu']/li[@class='active']" \
261
- "/a/span[@class='stage-code']").text
262
- end
260
+ def stage_code(doc)
261
+ doc.at("//ul[@class='dropdown-menu']/li[@class='active']" \
262
+ "/a/span[@class='stage-code']").text
263
+ end
263
264
 
264
- # def stage(stg, substg)
265
- # abbr = STGABBR[stg].is_a?(Hash) ? STGABBR[stg][substg] : STGABBR[stg]
266
- # RelatonBib::DocumentStatus::Stage.new value: stg, abbreviation: abbr
267
- # end
268
-
269
- # Fetch workgroup.
270
- # @param doc [Nokogiri::HTML::Document]
271
- # @return [Hash]
272
- def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
273
- wg = doc.at("//div[@class='clearfix']")
274
- wg_link = wg.at "span/a"
275
- return unless wg_link
276
-
277
- workgroup = wg_link.text.split "/"
278
- type = workgroup[1]&.match(/^[A-Z]+/)&.to_s || "TC"
279
- # {
280
- # name: "International Organization for Standardization",
281
- # abbreviation: "ISO",
282
- # url: "www.iso.org",
283
- # }
284
- tc_numb = workgroup[1]&.match(/\d+/)&.to_s&.to_i
285
- tc_name = wg.at("span[@class='entry-title']").text
286
- tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg_link.text,
287
- type: type, number: tc_numb)
288
- RelatonIsoBib::EditorialGroup.new(technical_committee: [tc])
265
+ # def stage(stg, substg)
266
+ # abbr = STGABBR[stg].is_a?(Hash) ? STGABBR[stg][substg] : STGABBR[stg]
267
+ # RelatonBib::DocumentStatus::Stage.new value: stg, abbreviation: abbr
268
+ # end
269
+
270
+ # Fetch workgroup.
271
+ # @param doc [Nokogiri::HTML::Document]
272
+ # @return [Hash]
273
+ def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
274
+ wg = doc.at("////div[contains(., 'Technical Committe')]/following-sibling::span/a")
275
+ return unless wg
276
+
277
+ workgroup = wg.text.split "/"
278
+ type = workgroup[1]&.match(/^[A-Z]+/)&.to_s || "TC"
279
+ # {
280
+ # name: "International Organization for Standardization",
281
+ # abbreviation: "ISO",
282
+ # url: "www.iso.org",
283
+ # }
284
+ tc_numb = workgroup[1]&.match(/\d+/)&.to_s&.to_i
285
+ tc_name = wg[:title]
286
+ tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg.text,
287
+ type: type, number: tc_numb)
288
+ RelatonIsoBib::EditorialGroup.new(technical_committee: [tc])
289
+ end
290
+
291
+ # Fetch relations.
292
+ # @param doc [Nokogiri::HTML::Document]
293
+ # @return [Array<Hash>]
294
+ def fetch_relations(doc)
295
+ types = ["Now", "Now under review"]
296
+ doc.xpath("//ul[@class='steps']/li", "//div[@class='sub-step']").reduce([]) do |a, r|
297
+ type, date = relation_type(r.at("h4", "h5").text.strip, doc)
298
+ next a if types.include?(type)
299
+
300
+ a + create_relations(r, type, date)
289
301
  end
302
+ end
290
303
 
291
- # rubocop:disable Metrics/MethodLength
292
-
293
- # Fetch relations.
294
- # @param doc [Nokogiri::HTML::Document]
295
- # @return [Array<Hash>]
296
- def fetch_relations(doc) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity
297
- types = ["Now", "Now under review"]
298
- doc.xpath("//ul[@class='steps']/li", "//div[@class='sub-step']").reduce([]) do |a, r|
299
- r_type = r.at("h4", "h5").text
300
- date = []
301
- type = case r_type.strip
302
- when "Previously", "Will be replaced by" then "obsoletes"
303
- when "Corrigenda / Amendments", "Revised by", "Now confirmed"
304
- on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
305
- date << { type: "circulated", on: on.text } if on
306
- "updates"
307
- else r_type
308
- end
309
- if types.include?(type) then a
310
- else
311
- a + r.css("a").map do |id|
312
- docid = RelatonBib::DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
313
- fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
314
- bibitem = RelatonIsoBib::IsoBibliographicItem.new(
315
- docid: [docid], formattedref: fref, date: date,
316
- )
317
- { type: type, bibitem: bibitem }
318
- end
304
+ def relation_type(type, doc)
305
+ date = []
306
+ t = case type.strip
307
+ when "Previously", "Will be replaced by" then "obsoletes"
308
+ when "Corrigenda / Amendments", "Revised by", "Now confirmed"
309
+ on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
310
+ date << { type: "circulated", on: on.text } if on
311
+ "updates"
312
+ else type
319
313
  end
320
- end
314
+ [t, date]
315
+ end
316
+
317
+ def create_relations(rel, type, date)
318
+ rel.css("a").map do |id|
319
+ docid = RelatonBib::DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
320
+ fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
321
+ bibitem = RelatonIsoBib::IsoBibliographicItem.new(
322
+ docid: [docid], formattedref: fref, date: date,
323
+ )
324
+ { type: type, bibitem: bibitem }
321
325
  end
322
- # rubocop:enable Metrics/MethodLength
323
-
324
- # Fetch type.
325
- # @param ref [String]
326
- # @return [String]
327
- def fetch_type(ref)
328
- %r{
329
- ^(?<prefix>ISO|IWA|IEC)
330
- (?:(?:/IEC|/IEEE|/PRF|/NP|/DGuide)*\s|/)
331
- (?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))
332
- }x =~ ref
333
- # return "international-standard" if type_match.nil?
334
- if TYPES[type] then TYPES[type]
335
- elsif prefix == "ISO" then "international-standard"
336
- elsif prefix == "IWA" then "international-workshop-agreement"
337
- end
338
- # rescue => _e
339
- # puts 'Unknown document type: ' + title
326
+ end
327
+
328
+ # Fetch type.
329
+ # @param ref [String]
330
+ # @return [String]
331
+ def fetch_type(ref)
332
+ %r{
333
+ ^(?<prefix>ISO|IWA|IEC)
334
+ (?:(?:/IEC|/IEEE|/PRF|/NP|/DGuide)*\s|/)
335
+ (?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))
336
+ }x =~ ref
337
+ # return "international-standard" if type_match.nil?
338
+ if TYPES[type] then TYPES[type]
339
+ elsif prefix == "ISO" then "international-standard"
340
+ elsif prefix == "IWA" then "international-workshop-agreement"
340
341
  end
342
+ # rescue => _e
343
+ # puts 'Unknown document type: ' + title
344
+ end
341
345
 
342
- # Fetch titles.
343
- # @param doc [Nokogiri::HTML::Document]
344
- # @param lang [String]
345
- # @return [Array<RelatonBib::TypedTitleString>]
346
- def fetch_title(doc, lang)
347
- content = doc.at(
348
- "//nav[contains(@class,'heading-condensed')]/h2 | "\
349
- "//nav[contains(@class,'heading-condensed')]/h3",
350
- )&.text&.gsub(/\u2014/, "-")
351
- return RelatonBib::TypedTitleStringCollection.new unless content
352
-
353
- RelatonBib::TypedTitleString.from_string content, lang, script(lang)
346
+ # Fetch titles.
347
+ # @param doc [Nokogiri::HTML::Document]
348
+ # @param lang [String]
349
+ # @return [Array<RelatonBib::TypedTitleString>]
350
+ def fetch_title(doc, lang) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
351
+ types = %w[title-intro title-main title-part]
352
+ ttls = titles(doc)
353
+ title = RelatonBib::TypedTitleStringCollection.new
354
+ ttls.each.with_index do |p, i|
355
+ next unless p
356
+
357
+ title << RelatonBib::TypedTitleString.new(
358
+ type: types[i], content: p, language: lang, script: script(lang),
359
+ )
360
+ end.compact
361
+ main = title.map { |t| t.title.content }.join " - "
362
+ title << RelatonBib::TypedTitleString.new(type: "main", content: main, language: lang, script: script(lang))
363
+ end
364
+
365
+ def titles(doc)
366
+ head = doc.at "//nav[contains(@class,'heading-condensed')]"
367
+ ttls = head.xpath("h2 | h3 | h4").map &:text
368
+ ttls = ttls[0].split " - " if ttls.size == 1
369
+ case ttls.size
370
+ when 0, 1 then [nil, ttls.first, nil]
371
+ else RelatonBib::TypedTitleString.intro_or_part ttls
354
372
  end
373
+ end
355
374
 
356
- # Return ISO script code.
357
- # @param lang [String]
358
- # @return [String]
359
- def script(lang)
360
- case lang
361
- when "en", "fr" then "Latn"
362
- # when "ru" then "Cyrl"
363
- end
375
+ # Return ISO script code.
376
+ # @param lang [String]
377
+ # @return [String]
378
+ def script(lang)
379
+ case lang
380
+ when "en", "fr" then "Latn"
381
+ # when "ru" then "Cyrl"
364
382
  end
383
+ end
365
384
 
366
- # rubocop:disable Metrics/MethodLength
367
- # Fetch dates
368
- # @param doc [Nokogiri::HTML::Document]
369
- # @param ref [String]
370
- # @return [Array<Hash>]
371
- def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity
372
- dates = []
373
- %r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ ref
374
- pub_date_str = doc.xpath("//span[@itemprop='releaseDate']").text
375
- if ref_date_str
376
- ref_date = Date.strptime ref_date_str, "%Y"
377
- if pub_date_str.empty?
385
+ # Fetch dates
386
+ # @param doc [Nokogiri::HTML::Document]
387
+ # @param ref [String]
388
+ # @return [Array<Hash>]
389
+ def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity, Metrics/MethodLength
390
+ dates = []
391
+ %r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ ref
392
+ pub_date_str = doc.xpath("//span[@itemprop='releaseDate']").text
393
+ if ref_date_str
394
+ ref_date = Date.strptime ref_date_str, "%Y"
395
+ if pub_date_str.empty?
396
+ dates << { type: "published", on: ref_date_str }
397
+ else
398
+ pub_date = Date.strptime pub_date_str, "%Y"
399
+ if pub_date.year > ref_date.year
378
400
  dates << { type: "published", on: ref_date_str }
401
+ dates << { type: "updated", on: pub_date_str }
379
402
  else
380
- pub_date = Date.strptime pub_date_str, "%Y"
381
- if pub_date.year > ref_date.year
382
- dates << { type: "published", on: ref_date_str }
383
- dates << { type: "updated", on: pub_date_str }
384
- else
385
- dates << { type: "published", on: pub_date_str }
386
- end
403
+ dates << { type: "published", on: pub_date_str }
387
404
  end
388
- elsif !pub_date_str.empty?
389
- dates << { type: "published", on: pub_date_str }
390
405
  end
391
- dates
406
+ elsif !pub_date_str.empty?
407
+ dates << { type: "published", on: pub_date_str }
392
408
  end
409
+ dates
410
+ end
393
411
 
394
- def fetch_contributors(ref)
395
- ref.sub(/\s.*/, "").split("/").reduce([]) do |mem, abbrev|
396
- publisher = PUBLISHERS[abbrev]
397
- next mem unless publisher
412
+ def fetch_contributors(ref)
413
+ ref.sub(/\s.*/, "").split("/").reduce([]) do |mem, abbrev|
414
+ publisher = PUBLISHERS[abbrev]
415
+ next mem unless publisher
398
416
 
399
- publisher[:abbreviation] = abbrev
400
- mem << { entity: publisher, role: [type: "publisher"] }
401
- end
402
- end
403
- # rubocop:enable Metrics/MethodLength
404
-
405
- # Fetch ICS.
406
- # @param doc [Nokogiri::HTML::Document]
407
- # @return [Array<Hash>]
408
- def fetch_ics(doc)
409
- doc.xpath("//dl[dt/strong[.='ICS']]/dd/span/a").map do |i|
410
- code = i.text.match(/[\d.]+/).to_s.split "."
411
- { field: code[0], group: code[1], subgroup: code[2] }
412
- end
417
+ publisher[:abbreviation] = abbrev
418
+ mem << { entity: publisher, role: [type: "publisher"] }
413
419
  end
420
+ end
414
421
 
415
- # Fetch links.
416
- # @param doc [Nokogiri::HTML::Document]
417
- # @param url [String]
418
- # @return [Array<Hash>]
419
- def fetch_link(doc, url)
420
- links = [{ type: "src", content: url }]
421
- obp = doc.at_css("a#obp-preview")
422
- links << { type: "obp", content: obp[:href] } if obp
423
- rss = doc.at("//a[contains(@href, 'rss')]")
424
- links << { type: "rss", content: DOMAIN + rss[:href] } if rss
425
- pub = doc.at "//p[contains(., 'publicly available')]/a",
426
- "//p[contains(., 'can be downloaded from the')]/a"
427
- links << { type: "pub", content: pub[:href] } if pub
428
- links
422
+ # Fetch ICS.
423
+ # @param doc [Nokogiri::HTML::Document]
424
+ # @return [Array<Hash>]
425
+ def fetch_ics(doc)
426
+ doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
427
+ code = i.text.match(/[\d.]+/).to_s.split "."
428
+ { field: code[0], group: code[1], subgroup: code[2] }
429
429
  end
430
+ end
430
431
 
431
- # Fetch copyright.
432
- # @param doc [Nokogiri::HTML::Document]
433
- # @return [Array<Hash>]
434
- def fetch_copyright(doc) # rubocop:disable Metrics/MethodLength
435
- ref = item_ref doc
436
- owner_name = ref.match(/.*?(?=\s)/).to_s
437
- from = ref.match(/(?<=:)\d{4}/).to_s
438
- if from.empty?
439
- date = doc.at(
440
- "//span[@itemprop='releaseDate']",
441
- "//ul[@id='stages']/li[contains(@class,'active')]/ul/li[@class='active']/a/span[@class='stage-date']",
442
- )
443
- from = date.text.match(/\d{4}/).to_s
444
- end
445
- [{ owner: [{ name: owner_name }], from: from }]
432
+ #
433
+ # Fetch links.
434
+ #
435
+ # @param doc [Nokogiri::HTML::Document] document to parse
436
+ # @param url [String] document url
437
+ #
438
+ # @return [Array<Hash>]
439
+ #
440
+ def fetch_link(doc, url)
441
+ links = [{ type: "src", content: url }]
442
+ obp = doc.at("//h4[contains(@class, 'h5')]/a")
443
+ links << { type: "obp", content: obp[:href] } if obp
444
+ rss = doc.at("//a[contains(@href, 'rss')]")
445
+ links << { type: "rss", content: DOMAIN + rss[:href] } if rss
446
+ pub = doc.at "//p[contains(., 'publicly available')]/a",
447
+ "//p[contains(., 'can be downloaded from the')]/a"
448
+ links << { type: "pub", content: pub[:href] } if pub
449
+ links
450
+ end
451
+
452
+ # Fetch copyright.
453
+ # @param doc [Nokogiri::HTML::Document]
454
+ # @return [Array<Hash>]
455
+ def fetch_copyright(doc) # rubocop:disable Metrics/MethodLength
456
+ ref = item_ref doc
457
+ owner_name = ref.match(/.*?(?=\s)/).to_s
458
+ from = ref.match(/(?<=:)\d{4}/).to_s
459
+ if from.empty?
460
+ date = doc.at(
461
+ "//span[@itemprop='releaseDate']",
462
+ "//ul[@id='stages']/li[contains(@class,'active')]/ul/li[@class='active']/a/span[@class='stage-date']",
463
+ )
464
+ from = date.text.match(/\d{4}/).to_s
446
465
  end
466
+ [{ owner: [{ name: owner_name }], from: from }]
447
467
  end
448
468
  end
449
469
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RelatonIso
4
- VERSION = "1.16.1"
4
+ VERSION = "1.16.3"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-iso
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.16.1
4
+ version: 1.16.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-10-14 00:00:00.000000000 Z
11
+ date: 2023-10-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: algolia