relaton-iso 1.16.1 → 1.16.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 45b4a081a62ab5a5f0a4e6f2c2cffb4950861f09e838401a68aa2208731d65ec
4
- data.tar.gz: 01521bd3e1fa7853145a390461390b7a07dfc20e1efb02c2d6d90372d03a8664
3
+ metadata.gz: 13ecc04a430b1dbf256c0853f612969727c16eba72a06cb2bc74bed17745ba90
4
+ data.tar.gz: f795f63a994b843e07d4857ba3b0dd9c91ec9a3ccb408827f1bf7bdbf5f854a9
5
5
  SHA512:
6
- metadata.gz: 0e72371e46e2d03875fce213861ab9f087fdafca4abe748436f8ccc217ee2d82b5a089c20d73062d2a022366b79294ac7b1a16f0b3f59593f79d673800286877
7
- data.tar.gz: c6fa8308f8feb86cc08ae3a1fde9e267169c5bf8b5292a6c97f4dbf7f28668b56ac111e3dc03411dfa537ce83905e87a273d866b275f376cef402a3f641a59b6
6
+ metadata.gz: d33586bbe409f54736b694d774a52e1bef8a4cc2d7c304aebd06c5ead8b3893b6f45c65d3e5c586c5e7f9f23501b52ae6b0630c25213d6105660251d03cff94e
7
+ data.tar.gz: e7fdcb33dfa855c73ead77a514eae36d761274bafeec77c520ac8ff84a05c6a04a2b30bd80fa7d89fcabda1975eeb95f85c65d0c177da7e1694da97bc4245ccd
@@ -4,7 +4,7 @@ module RelatonIso
4
4
  class Processor < Relaton::Processor
5
5
  attr_reader :idtype
6
6
 
7
- def initialize
7
+ def initialize # rubocop:disable Lint/MissingSuper
8
8
  @short = :relaton_iso
9
9
  @prefix = "ISO"
10
10
  @defaultprefix = %r{^ISO(/IEC)?\s}
@@ -43,407 +43,427 @@ module RelatonIso
43
43
  url: "www.asme.org" },
44
44
  }.freeze
45
45
 
46
- class << self
47
- # Parse page.
48
- # @param hit [RelatonIso::Hit]
49
- # @param lang [String, NilClass]
50
- # @return [RelatonIsoBib::IsoBibliographicItem]
51
- def parse_page(hit, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
52
- # path = "/contents/data/standard#{hit_data['splitPath']}/"\
53
- # "#{hit_data['csnumber']}.html"
54
-
55
- path = hit.hit[:path].sub("/sites/isoorg", "")
56
- doc, url = get_page "#{path}.html"
57
-
58
- # Fetch edition.
59
- edition = doc.at("//div[div[.='Edition']]/text()[last()]")
60
- &.text&.match(/\d+$/)&.to_s
61
- hit.pubid.base.edition ||= edition if hit.pubid.base
62
-
63
- titles, abstract, langs = fetch_titles_abstract(doc, lang)
64
-
65
- RelatonIsoBib::IsoBibliographicItem.new(
66
- fetched: Date.today.to_s,
67
- docid: fetch_relaton_docids(doc, hit.pubid),
68
- docnumber: fetch_docnumber(hit.pubid),
69
- edition: edition,
70
- language: langs.map { |l| l[:lang] },
71
- script: langs.map { |l| script(l[:lang]) }.uniq,
72
- title: titles,
73
- doctype: fetch_type(hit.hit[:title]),
74
- docstatus: fetch_status(doc),
75
- ics: fetch_ics(doc),
76
- date: fetch_dates(doc, hit.hit[:title]),
77
- contributor: fetch_contributors(hit.hit[:title]),
78
- editorialgroup: fetch_workgroup(doc),
79
- abstract: abstract,
80
- copyright: fetch_copyright(doc),
81
- link: fetch_link(doc, url),
82
- relation: fetch_relations(doc),
83
- place: ["Geneva"],
84
- structuredidentifier: fetch_structuredidentifier(hit.pubid),
85
- )
86
- end
46
+ extend self
47
+
48
+ # Parse page.
49
+ # @param hit [RelatonIso::Hit]
50
+ # @param lang [String, NilClass]
51
+ # @return [RelatonIsoBib::IsoBibliographicItem]
52
+ def parse_page(hit, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
53
+ # path = "/contents/data/standard#{hit_data['splitPath']}/"\
54
+ # "#{hit_data['csnumber']}.html"
55
+
56
+ path = hit.hit[:path].sub("/sites/isoorg", "")
57
+ doc, url = get_page "#{path}.html"
58
+
59
+ # Fetch edition.
60
+ edition = doc.at("//div[div[.='Edition']]/text()[last()]")
61
+ &.text&.match(/\d+$/)&.to_s
62
+ hit.pubid.base.edition ||= edition if hit.pubid.base
63
+
64
+ titles, abstract, langs = fetch_titles_abstract(doc, lang)
65
+
66
+ RelatonIsoBib::IsoBibliographicItem.new(
67
+ fetched: Date.today.to_s,
68
+ docid: fetch_relaton_docids(doc, hit.pubid),
69
+ docnumber: fetch_docnumber(hit.pubid),
70
+ edition: edition,
71
+ language: langs.map { |l| l[:lang] },
72
+ script: langs.map { |l| script(l[:lang]) }.uniq,
73
+ title: titles,
74
+ doctype: fetch_type(hit.hit[:title]),
75
+ docstatus: fetch_status(doc),
76
+ ics: fetch_ics(doc),
77
+ date: fetch_dates(doc, hit.hit[:title]),
78
+ contributor: fetch_contributors(hit.hit[:title]),
79
+ editorialgroup: fetch_workgroup(doc),
80
+ abstract: abstract,
81
+ copyright: fetch_copyright(doc),
82
+ link: fetch_link(doc, url),
83
+ relation: fetch_relations(doc),
84
+ place: ["Geneva"],
85
+ structuredidentifier: fetch_structuredidentifier(hit.pubid),
86
+ )
87
+ end
87
88
 
88
- #
89
- # Create document ids.
90
- #
91
- # @param doc [Nokogiri::HTML::Document] document to parse
92
- # @param pubid [Pubid::Iso::Identifier] publication identifier
93
- #
94
- # @return [Array<RelatonBib::DocumentIdentifier>]
95
- #
96
- def fetch_relaton_docids(doc, pubid)
97
- pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code(doc))
98
- [
99
- RelatonIso::DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
100
- RelatonBib::DocumentIdentifier.new(id: isoref(pubid), type: "iso-reference"),
101
- RelatonIso::DocumentIdentifier.new(id: pubid, type: "URN"),
102
- ]
103
- end
89
+ #
90
+ # Create document ids.
91
+ #
92
+ # @param doc [Nokogiri::HTML::Document] document to parse
93
+ # @param pubid [Pubid::Iso::Identifier] publication identifier
94
+ #
95
+ # @return [Array<RelatonBib::DocumentIdentifier>]
96
+ #
97
+ def fetch_relaton_docids(doc, pubid)
98
+ pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code(doc))
99
+ [
100
+ RelatonIso::DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
101
+ RelatonBib::DocumentIdentifier.new(id: isoref(pubid), type: "iso-reference"),
102
+ RelatonIso::DocumentIdentifier.new(id: pubid, type: "URN"),
103
+ ]
104
+ end
104
105
 
105
- #
106
- # Create ISO reference identifier with English language.
107
- #
108
- # @param [Pubid::Iso::Identifier] pubid publication identifier
109
- #
110
- # @return [String] English reference identifier
111
- #
112
- def isoref(pubid)
113
- params = pubid.get_params.reject { |k, _| k == :typed_stage }
114
- Pubid::Iso::Identifier.create(language: "en", **params).to_s(format: :ref_num_short)
115
- end
106
+ #
107
+ # Create ISO reference identifier with English language.
108
+ #
109
+ # @param [Pubid::Iso::Identifier] pubid publication identifier
110
+ #
111
+ # @return [String] English reference identifier
112
+ #
113
+ def isoref(pubid)
114
+ params = pubid.get_params.reject { |k, _| k == :typed_stage }
115
+ Pubid::Iso::Identifier.create(language: "en", **params).to_s(format: :ref_num_short)
116
+ end
116
117
 
117
- private
118
-
119
- # Fetch titles and abstracts.
120
- # @param doc [Nokogiri::HTML::Document]
121
- # @param lang [String, NilClass]
122
- # @return [Array<Array>]
123
- def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
124
- titles = RelatonBib::TypedTitleStringCollection.new
125
- abstract = []
126
- langs = languages(doc, lang).reduce([]) do |s, l|
127
- # Don't need to get page for en. We already have it.
128
- d = l[:path] ? get_page(l[:path])[0] : doc
129
- unless d.at("//h5[@class='help-block']" \
130
- "[.='недоступно на русском языке']")
131
- s << l
132
- titles += fetch_title(d, l[:lang])
133
-
134
- # Fetch abstracts.
135
- abstract_content = d.xpath(
136
- "//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
137
- ).map do |a|
138
- a.name == "li" ? "- #{a.text}" : a.text
139
- end.reject(&:empty?).join("\n")
140
- unless abstract_content.empty?
141
- abstract << {
142
- content: abstract_content,
143
- language: l[:lang],
144
- script: script(l[:lang]),
145
- format: "text/plain",
146
- }
147
- end
118
+ private
119
+
120
+ # Fetch titles and abstracts.
121
+ # @param doc [Nokogiri::HTML::Document]
122
+ # @param lang [String, NilClass]
123
+ # @return [Array<Array>]
124
+ def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
125
+ titles = RelatonBib::TypedTitleStringCollection.new
126
+ abstract = []
127
+ langs = languages(doc, lang).reduce([]) do |s, l|
128
+ # Don't need to get page for en. We already have it.
129
+ d = l[:path] ? get_page(l[:path])[0] : doc
130
+ unless d.at("//h5[@class='help-block']" \
131
+ "[.='недоступно на русском языке']")
132
+ s << l
133
+ titles += fetch_title(d, l[:lang])
134
+
135
+ # Fetch abstracts.
136
+ abstract_content = d.xpath(
137
+ "//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
138
+ ).map do |a|
139
+ a.name == "li" ? "- #{a.text}" : a.text
140
+ end.reject(&:empty?).join("\n")
141
+ unless abstract_content.empty?
142
+ abstract << {
143
+ content: abstract_content,
144
+ language: l[:lang],
145
+ script: script(l[:lang]),
146
+ format: "text/plain",
147
+ }
148
148
  end
149
- s
150
149
  end
151
- [titles, abstract, langs]
150
+ s
152
151
  end
152
+ [titles, abstract, langs]
153
+ end
153
154
 
154
- # Returns available languages.
155
- # @param doc [Nokogiri::HTML::Document]
156
- # @param lang [String, NilClass]
157
- # @return [Array<Hash>]
158
- def languages(doc, lang)
159
- lgs = [{ lang: "en" }]
160
- doc.css("li#lang-switcher ul li a").each do |lang_link|
161
- lang_path = lang_link.attr("href")
162
- l = lang_path.match(%r{^/(fr)/})
163
- lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] == lang)
164
- end
165
- lgs
155
+ # Returns available languages.
156
+ # @param doc [Nokogiri::HTML::Document]
157
+ # @param lang [String, NilClass]
158
+ # @return [Array<Hash>]
159
+ def languages(doc, lang)
160
+ lgs = [{ lang: "en" }]
161
+ doc.css("li#lang-switcher ul li a").each do |lang_link|
162
+ lang_path = lang_link.attr("href")
163
+ l = lang_path.match(%r{^/(fr)/})
164
+ lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] == lang)
166
165
  end
166
+ lgs
167
+ end
167
168
 
168
- # Get page.
169
- # @param path [String] page's path
170
- # @return [Array<Nokogiri::HTML::Document, String>]
171
- def get_page(path)
172
- resp, uri = get_redirection path
173
- doc = try_if_fail resp, uri
174
- [doc, uri.to_s]
175
- rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
176
- EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
177
- Net::ProtocolError, Errno::ETIMEDOUT
178
- raise RelatonBib::RequestError, "Could not access #{uri}"
179
- end
169
+ # Get page.
170
+ # @param path [String] page's path
171
+ # @return [Array<Nokogiri::HTML::Document, String>]
172
+ def get_page(path)
173
+ resp, uri = get_redirection path
174
+ doc = try_if_fail resp, uri
175
+ [doc, uri.to_s]
176
+ rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
177
+ EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
178
+ Net::ProtocolError, Errno::ETIMEDOUT
179
+ raise RelatonBib::RequestError, "Could not access #{uri}"
180
+ end
180
181
 
181
- #
182
- # Get the page from the given path. If the page is redirected, get the
183
- # page from the new path.
184
- #
185
- # @param [String] path path to the page
186
- #
187
- # @return [Array<Net::HTTPOK, URI>] HTTP response and URI
188
- # @raise [RelatonBib::RequestError] if the page is not found
189
- #
190
- def get_redirection(path)
191
- url = DOMAIN + path
192
- uri = URI url
193
- resp = Net::HTTP.get_response(uri)
194
- raise RelatonBib::RequestError, "#{url} not found." if %w[404 302].include? resp.code
182
+ #
183
+ # Get the page from the given path. If the page is redirected, get the
184
+ # page from the new path.
185
+ #
186
+ # @param [String] path path to the page
187
+ #
188
+ # @return [Array<Net::HTTPOK, URI>] HTTP response and URI
189
+ # @raise [RelatonBib::RequestError] if the page is not found
190
+ #
191
+ def get_redirection(path)
192
+ url = DOMAIN + path
193
+ uri = URI url
194
+ resp = Net::HTTP.get_response(uri)
195
+ raise RelatonBib::RequestError, "#{url} not found." if %w[404 302].include? resp.code
196
+
197
+ resp.code == "301" ? get_redirection(resp["location"]) : [resp, uri]
198
+ end
195
199
 
196
- resp.code == "301" ? get_redirection(resp["location"]) : [resp, uri]
197
- end
200
+ #
201
+ # The iso.org site fails to respond sometimes. This method tries to get
202
+ # the response again.
203
+ #
204
+ # @param [Net::HTTPOK] resp HTTP response
205
+ # @param [URI::HTTPS] uri URI of the page
206
+ #
207
+ # @return [Nokogiri::HTML4::Document] document
208
+ # @raise [RelatonBib::RequestError] if the page could not be parsed
209
+ #
210
+ def try_if_fail(resp, uri)
211
+ 10.times do
212
+ doc = Nokogiri::HTML(resp.body)
213
+ # stop trying if page has a document id
214
+ return doc if item_ref doc
198
215
 
199
- #
200
- # The iso.org site fails to respond sometimes. This method tries to get
201
- # the response again.
202
- #
203
- # @param [Net::HTTPOK] resp HTTP response
204
- # @param [URI::HTTPS] uri URI of the page
205
- #
206
- # @return [Nokogiri::HTML4::Document] document
207
- # @raise [RelatonBib::RequestError] if the page could not be parsed
208
- #
209
- def try_if_fail(resp, uri)
210
- 10.times do
211
- doc = Nokogiri::HTML(resp.body)
212
- # stop trying if page has a document id
213
- return doc if item_ref doc
214
-
215
- resp = Net::HTTP.get_response(uri)
216
- end
217
- raise RelatonBib::RequestError, "Could not parse the page #{uri}"
216
+ resp = Net::HTTP.get_response(uri)
218
217
  end
218
+ raise RelatonBib::RequestError, "Could not parse the page #{uri}"
219
+ end
219
220
 
220
- #
221
- # Generate docnumber.
222
- #
223
- # @param [Pubid::Iso] pubid
224
- #
225
- # @return [String] docnumber
226
- #
227
- def fetch_docnumber(pubid)
228
- pubid.to_s.match(/\d+/)&.to_s
229
- end
221
+ #
222
+ # Generate docnumber.
223
+ #
224
+ # @param [Pubid::Iso] pubid
225
+ #
226
+ # @return [String] docnumber
227
+ #
228
+ def fetch_docnumber(pubid)
229
+ pubid.to_s.match(/\d+/)&.to_s
230
+ end
230
231
 
231
- #
232
- # Parse structuredidentifier.
233
- #
234
- # @param pubid [Pubid::Iso::Identifier] pubid
235
- #
236
- # @return [RelatonBib::StructuredIdentifier] structured identifier
237
- #
238
- def fetch_structuredidentifier(pubid) # rubocop:disable Metrics/MethodLength
239
- RelatonIsoBib::StructuredIdentifier.new(
240
- project_number: "#{pubid.publisher} #{pubid.number}",
241
- part: pubid.part&.to_s, # &.sub(/^-/, ""),
242
- type: pubid.publisher,
243
- )
244
- end
232
+ #
233
+ # Parse structuredidentifier.
234
+ #
235
+ # @param pubid [Pubid::Iso::Identifier] pubid
236
+ #
237
+ # @return [RelatonBib::StructuredIdentifier] structured identifier
238
+ #
239
+ def fetch_structuredidentifier(pubid) # rubocop:disable Metrics/MethodLength
240
+ RelatonIsoBib::StructuredIdentifier.new(
241
+ project_number: "#{pubid.publisher} #{pubid.number}",
242
+ part: pubid.part&.to_s, # &.sub(/^-/, ""),
243
+ type: pubid.publisher,
244
+ )
245
+ end
245
246
 
246
- def item_ref(doc)
247
- doc.at("//main//section/div/div/div//h1")&.text
248
- end
247
+ def item_ref(doc)
248
+ doc.at("//main//section/div/div/div//h1")&.text
249
+ end
249
250
 
250
- # Fetch status.
251
- # @param doc [Nokogiri::HTML::Document]
252
- # @param status [String]
253
- # @return [Hash]
254
- def fetch_status(doc)
255
- stg, substg = stage_code(doc).split "."
256
- RelatonBib::DocumentStatus.new(stage: stg, substage: substg)
257
- end
251
+ # Fetch status.
252
+ # @param doc [Nokogiri::HTML::Document]
253
+ # @param status [String]
254
+ # @return [RelatonBib::DocumentStatus]
255
+ def fetch_status(doc)
256
+ stg, substg = stage_code(doc).split "."
257
+ RelatonBib::DocumentStatus.new(stage: stg, substage: substg)
258
+ end
258
259
 
259
- def stage_code(doc)
260
- doc.at("//ul[@class='dropdown-menu']/li[@class='active']" \
261
- "/a/span[@class='stage-code']").text
262
- end
260
+ def stage_code(doc)
261
+ doc.at("//ul[@class='dropdown-menu']/li[@class='active']" \
262
+ "/a/span[@class='stage-code']").text
263
+ end
263
264
 
264
- # def stage(stg, substg)
265
- # abbr = STGABBR[stg].is_a?(Hash) ? STGABBR[stg][substg] : STGABBR[stg]
266
- # RelatonBib::DocumentStatus::Stage.new value: stg, abbreviation: abbr
267
- # end
268
-
269
- # Fetch workgroup.
270
- # @param doc [Nokogiri::HTML::Document]
271
- # @return [Hash]
272
- def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
273
- wg = doc.at("//div[@class='clearfix']")
274
- wg_link = wg.at "span/a"
275
- return unless wg_link
276
-
277
- workgroup = wg_link.text.split "/"
278
- type = workgroup[1]&.match(/^[A-Z]+/)&.to_s || "TC"
279
- # {
280
- # name: "International Organization for Standardization",
281
- # abbreviation: "ISO",
282
- # url: "www.iso.org",
283
- # }
284
- tc_numb = workgroup[1]&.match(/\d+/)&.to_s&.to_i
285
- tc_name = wg.at("span[@class='entry-title']").text
286
- tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg_link.text,
287
- type: type, number: tc_numb)
288
- RelatonIsoBib::EditorialGroup.new(technical_committee: [tc])
265
+ # def stage(stg, substg)
266
+ # abbr = STGABBR[stg].is_a?(Hash) ? STGABBR[stg][substg] : STGABBR[stg]
267
+ # RelatonBib::DocumentStatus::Stage.new value: stg, abbreviation: abbr
268
+ # end
269
+
270
+ # Fetch workgroup.
271
+ # @param doc [Nokogiri::HTML::Document]
272
+ # @return [RelatonIsoBib::EditorialGroup, NilClass]
273
+ def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
274
+ wg = doc.at("////div[contains(., 'Technical Committe')]/following-sibling::span/a")
275
+ return unless wg
276
+
277
+ workgroup = wg.text.split "/"
278
+ type = workgroup[1]&.match(/^[A-Z]+/)&.to_s || "TC"
279
+ # {
280
+ # name: "International Organization for Standardization",
281
+ # abbreviation: "ISO",
282
+ # url: "www.iso.org",
283
+ # }
284
+ tc_numb = workgroup[1]&.match(/\d+/)&.to_s&.to_i
285
+ tc_name = wg[:title]
286
+ tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg.text,
287
+ type: type, number: tc_numb)
288
+ RelatonIsoBib::EditorialGroup.new(technical_committee: [tc])
289
+ end
290
+
291
+ # Fetch relations.
292
+ # @param doc [Nokogiri::HTML::Document]
293
+ # @return [Array<Hash>]
294
+ def fetch_relations(doc)
295
+ types = ["Now", "Now under review"]
296
+ doc.xpath("//ul[@class='steps']/li", "//div[@class='sub-step']").reduce([]) do |a, r|
297
+ type, date = relation_type(r.at("h4", "h5").text.strip, doc)
298
+ next a if types.include?(type)
299
+
300
+ a + create_relations(r, type, date)
289
301
  end
302
+ end
290
303
 
291
- # rubocop:disable Metrics/MethodLength
292
-
293
- # Fetch relations.
294
- # @param doc [Nokogiri::HTML::Document]
295
- # @return [Array<Hash>]
296
- def fetch_relations(doc) # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity
297
- types = ["Now", "Now under review"]
298
- doc.xpath("//ul[@class='steps']/li", "//div[@class='sub-step']").reduce([]) do |a, r|
299
- r_type = r.at("h4", "h5").text
300
- date = []
301
- type = case r_type.strip
302
- when "Previously", "Will be replaced by" then "obsoletes"
303
- when "Corrigenda / Amendments", "Revised by", "Now confirmed"
304
- on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
305
- date << { type: "circulated", on: on.text } if on
306
- "updates"
307
- else r_type
308
- end
309
- if types.include?(type) then a
310
- else
311
- a + r.css("a").map do |id|
312
- docid = RelatonBib::DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
313
- fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
314
- bibitem = RelatonIsoBib::IsoBibliographicItem.new(
315
- docid: [docid], formattedref: fref, date: date,
316
- )
317
- { type: type, bibitem: bibitem }
318
- end
304
+ def relation_type(type, doc)
305
+ date = []
306
+ t = case type.strip
307
+ when "Previously", "Will be replaced by" then "obsoletes"
308
+ when "Corrigenda / Amendments", "Revised by", "Now confirmed"
309
+ on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
310
+ date << { type: "circulated", on: on.text } if on
311
+ "updates"
312
+ else type
319
313
  end
320
- end
314
+ [t, date]
315
+ end
316
+
317
+ def create_relations(rel, type, date)
318
+ rel.css("a").map do |id|
319
+ docid = RelatonBib::DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
320
+ fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
321
+ bibitem = RelatonIsoBib::IsoBibliographicItem.new(
322
+ docid: [docid], formattedref: fref, date: date,
323
+ )
324
+ { type: type, bibitem: bibitem }
321
325
  end
322
- # rubocop:enable Metrics/MethodLength
323
-
324
- # Fetch type.
325
- # @param ref [String]
326
- # @return [String]
327
- def fetch_type(ref)
328
- %r{
329
- ^(?<prefix>ISO|IWA|IEC)
330
- (?:(?:/IEC|/IEEE|/PRF|/NP|/DGuide)*\s|/)
331
- (?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))
332
- }x =~ ref
333
- # return "international-standard" if type_match.nil?
334
- if TYPES[type] then TYPES[type]
335
- elsif prefix == "ISO" then "international-standard"
336
- elsif prefix == "IWA" then "international-workshop-agreement"
337
- end
338
- # rescue => _e
339
- # puts 'Unknown document type: ' + title
326
+ end
327
+
328
+ # Fetch type.
329
+ # @param ref [String]
330
+ # @return [String]
331
+ def fetch_type(ref)
332
+ %r{
333
+ ^(?<prefix>ISO|IWA|IEC)
334
+ (?:(?:/IEC|/IEEE|/PRF|/NP|/DGuide)*\s|/)
335
+ (?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))
336
+ }x =~ ref
337
+ # return "international-standard" if type_match.nil?
338
+ if TYPES[type] then TYPES[type]
339
+ elsif prefix == "ISO" then "international-standard"
340
+ elsif prefix == "IWA" then "international-workshop-agreement"
340
341
  end
342
+ # rescue => _e
343
+ # puts 'Unknown document type: ' + title
344
+ end
341
345
 
342
- # Fetch titles.
343
- # @param doc [Nokogiri::HTML::Document]
344
- # @param lang [String]
345
- # @return [Array<RelatonBib::TypedTitleString>]
346
- def fetch_title(doc, lang)
347
- content = doc.at(
348
- "//nav[contains(@class,'heading-condensed')]/h2 | "\
349
- "//nav[contains(@class,'heading-condensed')]/h3",
350
- )&.text&.gsub(/\u2014/, "-")
351
- return RelatonBib::TypedTitleStringCollection.new unless content
352
-
353
- RelatonBib::TypedTitleString.from_string content, lang, script(lang)
346
+ # Fetch titles.
347
+ # @param doc [Nokogiri::HTML::Document]
348
+ # @param lang [String]
349
+ # @return [Array<RelatonBib::TypedTitleString>]
350
+ def fetch_title(doc, lang) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
351
+ types = %w[title-intro title-main title-part]
352
+ ttls = titles(doc)
353
+ title = RelatonBib::TypedTitleStringCollection.new
354
+ ttls.each.with_index do |p, i|
355
+ next unless p
356
+
357
+ title << RelatonBib::TypedTitleString.new(
358
+ type: types[i], content: p, language: lang, script: script(lang),
359
+ )
360
+ end.compact
361
+ main = title.map { |t| t.title.content }.join " - "
362
+ title << RelatonBib::TypedTitleString.new(type: "main", content: main, language: lang, script: script(lang))
363
+ end
364
+
365
+ def titles(doc)
366
+ head = doc.at "//nav[contains(@class,'heading-condensed')]"
367
+ ttls = head.xpath("h2 | h3 | h4").map &:text
368
+ ttls = ttls[0].split " - " if ttls.size == 1
369
+ case ttls.size
370
+ when 0, 1 then [nil, ttls.first, nil]
371
+ else RelatonBib::TypedTitleString.intro_or_part ttls
354
372
  end
373
+ end
355
374
 
356
- # Return ISO script code.
357
- # @param lang [String]
358
- # @return [String]
359
- def script(lang)
360
- case lang
361
- when "en", "fr" then "Latn"
362
- # when "ru" then "Cyrl"
363
- end
375
+ # Return ISO script code.
376
+ # @param lang [String]
377
+ # @return [String]
378
+ def script(lang)
379
+ case lang
380
+ when "en", "fr" then "Latn"
381
+ # when "ru" then "Cyrl"
364
382
  end
383
+ end
365
384
 
366
- # rubocop:disable Metrics/MethodLength
367
- # Fetch dates
368
- # @param doc [Nokogiri::HTML::Document]
369
- # @param ref [String]
370
- # @return [Array<Hash>]
371
- def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity
372
- dates = []
373
- %r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ ref
374
- pub_date_str = doc.xpath("//span[@itemprop='releaseDate']").text
375
- if ref_date_str
376
- ref_date = Date.strptime ref_date_str, "%Y"
377
- if pub_date_str.empty?
385
+ # Fetch dates
386
+ # @param doc [Nokogiri::HTML::Document]
387
+ # @param ref [String]
388
+ # @return [Array<Hash>]
389
+ def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity, Metrics/MethodLength
390
+ dates = []
391
+ %r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ ref
392
+ pub_date_str = doc.xpath("//span[@itemprop='releaseDate']").text
393
+ if ref_date_str
394
+ ref_date = Date.strptime ref_date_str, "%Y"
395
+ if pub_date_str.empty?
396
+ dates << { type: "published", on: ref_date_str }
397
+ else
398
+ pub_date = Date.strptime pub_date_str, "%Y"
399
+ if pub_date.year > ref_date.year
378
400
  dates << { type: "published", on: ref_date_str }
401
+ dates << { type: "updated", on: pub_date_str }
379
402
  else
380
- pub_date = Date.strptime pub_date_str, "%Y"
381
- if pub_date.year > ref_date.year
382
- dates << { type: "published", on: ref_date_str }
383
- dates << { type: "updated", on: pub_date_str }
384
- else
385
- dates << { type: "published", on: pub_date_str }
386
- end
403
+ dates << { type: "published", on: pub_date_str }
387
404
  end
388
- elsif !pub_date_str.empty?
389
- dates << { type: "published", on: pub_date_str }
390
405
  end
391
- dates
406
+ elsif !pub_date_str.empty?
407
+ dates << { type: "published", on: pub_date_str }
392
408
  end
409
+ dates
410
+ end
393
411
 
394
- def fetch_contributors(ref)
395
- ref.sub(/\s.*/, "").split("/").reduce([]) do |mem, abbrev|
396
- publisher = PUBLISHERS[abbrev]
397
- next mem unless publisher
412
+ def fetch_contributors(ref)
413
+ ref.sub(/\s.*/, "").split("/").reduce([]) do |mem, abbrev|
414
+ publisher = PUBLISHERS[abbrev]
415
+ next mem unless publisher
398
416
 
399
- publisher[:abbreviation] = abbrev
400
- mem << { entity: publisher, role: [type: "publisher"] }
401
- end
402
- end
403
- # rubocop:enable Metrics/MethodLength
404
-
405
- # Fetch ICS.
406
- # @param doc [Nokogiri::HTML::Document]
407
- # @return [Array<Hash>]
408
- def fetch_ics(doc)
409
- doc.xpath("//dl[dt/strong[.='ICS']]/dd/span/a").map do |i|
410
- code = i.text.match(/[\d.]+/).to_s.split "."
411
- { field: code[0], group: code[1], subgroup: code[2] }
412
- end
417
+ publisher[:abbreviation] = abbrev
418
+ mem << { entity: publisher, role: [type: "publisher"] }
413
419
  end
420
+ end
414
421
 
415
- # Fetch links.
416
- # @param doc [Nokogiri::HTML::Document]
417
- # @param url [String]
418
- # @return [Array<Hash>]
419
- def fetch_link(doc, url)
420
- links = [{ type: "src", content: url }]
421
- obp = doc.at_css("a#obp-preview")
422
- links << { type: "obp", content: obp[:href] } if obp
423
- rss = doc.at("//a[contains(@href, 'rss')]")
424
- links << { type: "rss", content: DOMAIN + rss[:href] } if rss
425
- pub = doc.at "//p[contains(., 'publicly available')]/a",
426
- "//p[contains(., 'can be downloaded from the')]/a"
427
- links << { type: "pub", content: pub[:href] } if pub
428
- links
422
+ # Fetch ICS.
423
+ # @param doc [Nokogiri::HTML::Document]
424
+ # @return [Array<Hash>]
425
+ def fetch_ics(doc)
426
+ doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
427
+ code = i.text.match(/[\d.]+/).to_s.split "."
428
+ { field: code[0], group: code[1], subgroup: code[2] }
429
429
  end
430
+ end
430
431
 
431
- # Fetch copyright.
432
- # @param doc [Nokogiri::HTML::Document]
433
- # @return [Array<Hash>]
434
- def fetch_copyright(doc) # rubocop:disable Metrics/MethodLength
435
- ref = item_ref doc
436
- owner_name = ref.match(/.*?(?=\s)/).to_s
437
- from = ref.match(/(?<=:)\d{4}/).to_s
438
- if from.empty?
439
- date = doc.at(
440
- "//span[@itemprop='releaseDate']",
441
- "//ul[@id='stages']/li[contains(@class,'active')]/ul/li[@class='active']/a/span[@class='stage-date']",
442
- )
443
- from = date.text.match(/\d{4}/).to_s
444
- end
445
- [{ owner: [{ name: owner_name }], from: from }]
432
+ #
433
+ # Fetch links.
434
+ #
435
+ # @param doc [Nokogiri::HTML::Document] document to parse
436
+ # @param url [String] document url
437
+ #
438
+ # @return [Array<Hash>]
439
+ #
440
+ def fetch_link(doc, url)
441
+ links = [{ type: "src", content: url }]
442
+ obp = doc.at("//h4[contains(@class, 'h5')]/a")
443
+ links << { type: "obp", content: obp[:href] } if obp
444
+ rss = doc.at("//a[contains(@href, 'rss')]")
445
+ links << { type: "rss", content: DOMAIN + rss[:href] } if rss
446
+ pub = doc.at "//p[contains(., 'publicly available')]/a",
447
+ "//p[contains(., 'can be downloaded from the')]/a"
448
+ links << { type: "pub", content: pub[:href] } if pub
449
+ links
450
+ end
451
+
452
+ # Fetch copyright.
453
+ # @param doc [Nokogiri::HTML::Document]
454
+ # @return [Array<Hash>]
455
+ def fetch_copyright(doc) # rubocop:disable Metrics/MethodLength
456
+ ref = item_ref doc
457
+ owner_name = ref.match(/.*?(?=\s)/).to_s
458
+ from = ref.match(/(?<=:)\d{4}/).to_s
459
+ if from.empty?
460
+ date = doc.at(
461
+ "//span[@itemprop='releaseDate']",
462
+ "//ul[@id='stages']/li[contains(@class,'active')]/ul/li[@class='active']/a/span[@class='stage-date']",
463
+ )
464
+ from = date.text.match(/\d{4}/).to_s
446
465
  end
466
+ [{ owner: [{ name: owner_name }], from: from }]
447
467
  end
448
468
  end
449
469
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RelatonIso
4
- VERSION = "1.16.1"
4
+ VERSION = "1.16.3"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-iso
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.16.1
4
+ version: 1.16.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-10-14 00:00:00.000000000 Z
11
+ date: 2023-10-21 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: algolia