relaton-iso 1.16.2 → 1.16.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 479a728a58c56799448fd6d468e0d19fe245b731119f8dcd9ae6f19a7b624e07
4
- data.tar.gz: ac89507180ca01978bfe98b68fbe02450f2c33015bd38d788752f3bf933911ad
3
+ metadata.gz: 24973dcb87074a6029a83761f4690db8c53203e7a65928ea766ddeaa61b6d167
4
+ data.tar.gz: 15f8936150781349e849ec0a89edc2ba1e24cb7a105973d6479418a7c98f9ffa
5
5
  SHA512:
6
- metadata.gz: 71cc49dc2afa8690f02f7035ec5cc13981eb620e2b8c3792456401c152a4ca8192b2ffbd7445c6c982886e61f679427a2d5afbf26e13c6ebcfffcc8d54f7e5c9
7
- data.tar.gz: 853da0772a998533c5f461ff297bef978c75e1f58b2df1fec5eff0fea6d306807420453a6ba37b1348d44e58c3a71dcf743228c7703b73fd9b7d72c9d4309598
6
+ metadata.gz: ae2ac0909781b8f8f196a259444cc55e3fc8d92eccca7f0d83da769dd27b10c0c95a3cc59e391a52e27cc0320f3149f4498b8004e8d259c98fa9bfa3947b7b81
7
+ data.tar.gz: 4143f5870f15be800efe77cac822c90e7a345c5d6613b1481e7f88598b7d50f701546eaf9138c8bbebde22ad6473a5a7c7e4f81768b3656855e43d91d7ec10d8
@@ -7,6 +7,8 @@ on:
7
7
  branches: [ master, main ]
8
8
  tags: [ v* ]
9
9
  pull_request:
10
+ schedule:
11
+ - cron: '0 0 * * *'
10
12
 
11
13
  jobs:
12
14
  rake:
@@ -4,7 +4,7 @@ module RelatonIso
4
4
  class Processor < Relaton::Processor
5
5
  attr_reader :idtype
6
6
 
7
- def initialize
7
+ def initialize # rubocop:disable Lint/MissingSuper
8
8
  @short = :relaton_iso
9
9
  @prefix = "ISO"
10
10
  @defaultprefix = %r{^ISO(/IEC)?\s}
@@ -43,418 +43,429 @@ module RelatonIso
43
43
  url: "www.asme.org" },
44
44
  }.freeze
45
45
 
46
- class << self
47
- # Parse page.
48
- # @param hit [RelatonIso::Hit]
49
- # @param lang [String, NilClass]
50
- # @return [RelatonIsoBib::IsoBibliographicItem]
51
- def parse_page(hit, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
52
- # path = "/contents/data/standard#{hit_data['splitPath']}/"\
53
- # "#{hit_data['csnumber']}.html"
54
-
55
- path = hit.hit[:path].sub("/sites/isoorg", "")
56
- doc, url = get_page "#{path}.html"
57
-
58
- # Fetch edition.
59
- edition = doc.at("//div[div[.='Edition']]/text()[last()]")
60
- &.text&.match(/\d+$/)&.to_s
61
- hit.pubid.base.edition ||= edition if hit.pubid.base
62
-
63
- titles, abstract, langs = fetch_titles_abstract(doc, lang)
64
-
65
- RelatonIsoBib::IsoBibliographicItem.new(
66
- fetched: Date.today.to_s,
67
- docid: fetch_relaton_docids(doc, hit.pubid),
68
- docnumber: fetch_docnumber(hit.pubid),
69
- edition: edition,
70
- language: langs.map { |l| l[:lang] },
71
- script: langs.map { |l| script(l[:lang]) }.uniq,
72
- title: titles,
73
- doctype: fetch_type(hit.hit[:title]),
74
- docstatus: fetch_status(doc),
75
- ics: fetch_ics(doc),
76
- date: fetch_dates(doc, hit.hit[:title]),
77
- contributor: fetch_contributors(hit.hit[:title]),
78
- editorialgroup: fetch_workgroup(doc),
79
- abstract: abstract,
80
- copyright: fetch_copyright(doc),
81
- link: fetch_link(doc, url),
82
- relation: fetch_relations(doc),
83
- place: ["Geneva"],
84
- structuredidentifier: fetch_structuredidentifier(hit.pubid),
85
- )
86
- end
46
+ extend self
47
+
48
+ # Parse page.
49
+ # @param hit [RelatonIso::Hit]
50
+ # @param lang [String, NilClass]
51
+ # @return [RelatonIsoBib::IsoBibliographicItem]
52
+ def parse_page(hit, lang = nil) # rubocop:disable Metrics/AbcSize, Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
53
+ # path = "/contents/data/standard#{hit_data['splitPath']}/"\
54
+ # "#{hit_data['csnumber']}.html"
55
+
56
+ path = hit.hit[:path].sub("/sites/isoorg", "")
57
+ doc, url = get_page "#{path}.html"
58
+
59
+ # Fetch edition.
60
+ edition = doc.at("//div[div[.='Edition']]/text()[last()]")
61
+ &.text&.match(/\d+$/)&.to_s
62
+ hit.pubid.base.edition ||= edition if hit.pubid.base
63
+
64
+ titles, abstract, langs = fetch_titles_abstract(doc, lang)
65
+
66
+ RelatonIsoBib::IsoBibliographicItem.new(
67
+ fetched: Date.today.to_s,
68
+ docid: fetch_relaton_docids(doc, hit.pubid),
69
+ docnumber: fetch_docnumber(hit.pubid),
70
+ edition: edition,
71
+ language: langs.map { |l| l[:lang] },
72
+ script: langs.map { |l| script(l[:lang]) }.uniq,
73
+ title: titles,
74
+ doctype: fetch_type(hit.hit[:title]),
75
+ docstatus: fetch_status(doc),
76
+ ics: fetch_ics(doc),
77
+ date: fetch_dates(doc, hit.hit[:title]),
78
+ contributor: fetch_contributors(hit.hit[:title]),
79
+ editorialgroup: fetch_workgroup(doc),
80
+ abstract: abstract,
81
+ copyright: fetch_copyright(doc),
82
+ link: fetch_link(doc, url),
83
+ relation: fetch_relations(doc),
84
+ place: ["Geneva"],
85
+ structuredidentifier: fetch_structuredidentifier(hit.pubid),
86
+ )
87
+ end
87
88
 
88
- #
89
- # Create document ids.
90
- #
91
- # @param doc [Nokogiri::HTML::Document] document to parse
92
- # @param pubid [Pubid::Iso::Identifier] publication identifier
93
- #
94
- # @return [Array<RelatonBib::DocumentIdentifier>]
95
- #
96
- def fetch_relaton_docids(doc, pubid)
97
- pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code(doc))
98
- [
99
- RelatonIso::DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
100
- RelatonBib::DocumentIdentifier.new(id: isoref(pubid), type: "iso-reference"),
101
- RelatonIso::DocumentIdentifier.new(id: pubid, type: "URN"),
102
- ]
103
- end
89
+ #
90
+ # Create document ids.
91
+ #
92
+ # @param doc [Nokogiri::HTML::Document] document to parse
93
+ # @param pubid [Pubid::Iso::Identifier] publication identifier
94
+ #
95
+ # @return [Array<RelatonBib::DocumentIdentifier>]
96
+ #
97
+ def fetch_relaton_docids(doc, pubid)
98
+ pubid.stage ||= Pubid::Iso::Identifier.parse_stage(stage_code(doc))
99
+ [
100
+ RelatonIso::DocumentIdentifier.new(id: pubid, type: "ISO", primary: true),
101
+ RelatonBib::DocumentIdentifier.new(id: isoref(pubid), type: "iso-reference"),
102
+ RelatonIso::DocumentIdentifier.new(id: pubid, type: "URN"),
103
+ ]
104
+ end
104
105
 
105
- #
106
- # Create ISO reference identifier with English language.
107
- #
108
- # @param [Pubid::Iso::Identifier] pubid publication identifier
109
- #
110
- # @return [String] English reference identifier
111
- #
112
- def isoref(pubid)
113
- params = pubid.get_params.reject { |k, _| k == :typed_stage }
114
- Pubid::Iso::Identifier.create(language: "en", **params).to_s(format: :ref_num_short)
115
- end
106
+ #
107
+ # Create ISO reference identifier with English language.
108
+ #
109
+ # @param [Pubid::Iso::Identifier] pubid publication identifier
110
+ #
111
+ # @return [String] English reference identifier
112
+ #
113
+ def isoref(pubid)
114
+ params = pubid.get_params.reject { |k, _| k == :typed_stage }
115
+ Pubid::Iso::Identifier.create(language: "en", **params).to_s(format: :ref_num_short)
116
+ end
116
117
 
117
- private
118
-
119
- # Fetch titles and abstracts.
120
- # @param doc [Nokigiri::HTML::Document]
121
- # @param lang [String, NilClass]
122
- # @return [Array<Array>]
123
- def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
124
- titles = RelatonBib::TypedTitleStringCollection.new
125
- abstract = []
126
- langs = languages(doc, lang).reduce([]) do |s, l|
127
- # Don't need to get page for en. We already have it.
128
- d = l[:path] ? get_page(l[:path])[0] : doc
129
- unless d.at("//h5[@class='help-block']" \
130
- "[.='недоступно на русском языке']")
131
- s << l
132
- titles += fetch_title(d, l[:lang])
133
-
134
- # Fetch abstracts.
135
- abstract_content = d.xpath(
136
- "//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
137
- ).map do |a|
138
- a.name == "li" ? "- #{a.text}" : a.text
139
- end.reject(&:empty?).join("\n")
140
- unless abstract_content.empty?
141
- abstract << {
142
- content: abstract_content,
143
- language: l[:lang],
144
- script: script(l[:lang]),
145
- format: "text/plain",
146
- }
147
- end
118
+ private
119
+
120
+ # Fetch titles and abstracts.
121
+ # @param doc [Nokogiri::HTML::Document]
122
+ # @param lang [String, NilClass]
123
+ # @return [Array<Array>]
124
+ def fetch_titles_abstract(doc, lang) # rubocop:disable Metrics/AbcSize,Metrics/MethodLength,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
125
+ titles = RelatonBib::TypedTitleStringCollection.new
126
+ abstract = []
127
+ langs = languages(doc, lang).reduce([]) do |s, l|
128
+ # Don't need to get page for en. We already have it.
129
+ d = l[:path] ? get_page(l[:path])[0] : doc
130
+ unless d.at("//h5[@class='help-block']" \
131
+ "[.='недоступно на русском языке']")
132
+ s << l
133
+ titles += fetch_title(d, l[:lang])
134
+
135
+ # Fetch abstracts.
136
+ abstract_content = d.xpath(
137
+ "//div[@itemprop='description']/p|//div[@itemprop='description']/ul/li",
138
+ ).map do |a|
139
+ a.name == "li" ? "- #{a.text}" : a.text
140
+ end.reject(&:empty?).join("\n")
141
+ unless abstract_content.empty?
142
+ abstract << {
143
+ content: abstract_content,
144
+ language: l[:lang],
145
+ script: script(l[:lang]),
146
+ format: "text/plain",
147
+ }
148
148
  end
149
- s
150
149
  end
151
- [titles, abstract, langs]
150
+ s
152
151
  end
152
+ [titles, abstract, langs]
153
+ end
153
154
 
154
- # Returns available languages.
155
- # @param doc [Nokogiri::HTML::Document]
156
- # @pqrqm lang [String, NilClass]
157
- # @return [Array<Hash>]
158
- def languages(doc, lang)
159
- lgs = [{ lang: "en" }]
160
- doc.css("li#lang-switcher ul li a").each do |lang_link|
161
- lang_path = lang_link.attr("href")
162
- l = lang_path.match(%r{^/(fr)/})
163
- lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] == lang)
164
- end
165
- lgs
155
+ # Returns available languages.
156
+ # @param doc [Nokogiri::HTML::Document]
157
+ # @param lang [String, NilClass]
158
+ # @return [Array<Hash>]
159
+ def languages(doc, lang)
160
+ lgs = [{ lang: "en" }]
161
+ doc.css("li#lang-switcher ul li a").each do |lang_link|
162
+ lang_path = lang_link.attr("href")
163
+ l = lang_path.match(%r{^/(fr)/})
164
+ lgs << { lang: l[1], path: lang_path } if l && (!lang || l[1] == lang)
166
165
  end
166
+ lgs
167
+ end
167
168
 
168
- # Get page.
169
- # @param path [String] page's path
170
- # @return [Array<Nokogiri::HTML::Document, String>]
171
- def get_page(path)
172
- resp, uri = get_redirection path
173
- doc = try_if_fail resp, uri
174
- [doc, uri.to_s]
175
- rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
176
- EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
177
- Net::ProtocolError, Errno::ETIMEDOUT
178
- raise RelatonBib::RequestError, "Could not access #{uri}"
179
- end
169
+ # Get page.
170
+ # @param path [String] page's path
171
+ # @return [Array<Nokogiri::HTML::Document, String>]
172
+ def get_page(path)
173
+ resp, uri = get_redirection path
174
+ doc = try_if_fail resp, uri
175
+ [doc, uri.to_s]
176
+ rescue SocketError, Timeout::Error, Errno::EINVAL, Errno::ECONNRESET,
177
+ EOFError, Net::HTTPBadResponse, Net::HTTPHeaderSyntaxError,
178
+ Net::ProtocolError, Errno::ETIMEDOUT
179
+ raise RelatonBib::RequestError, "Could not access #{uri}"
180
+ end
180
181
 
181
- #
182
- # Get the page from the given path. If the page is redirected, get the
183
- # page from the new path.
184
- #
185
- # @param [String] path path to the page
186
- #
187
- # @return [Array<Net::HTTPOK, URI>] HTTP response and URI
188
- # @raise [RelatonBib::RequestError] if the page is not found
189
- #
190
- def get_redirection(path)
191
- url = DOMAIN + path
192
- uri = URI url
193
- resp = Net::HTTP.get_response(uri)
194
- raise RelatonBib::RequestError, "#{url} not found." if %w[404 302].include? resp.code
182
+ #
183
+ # Get the page from the given path. If the page is redirected, get the
184
+ # page from the new path.
185
+ #
186
+ # @param [String] path path to the page
187
+ #
188
+ # @return [Array<Net::HTTPOK, URI>] HTTP response and URI
189
+ # @raise [RelatonBib::RequestError] if the page is not found
190
+ #
191
+ def get_redirection(path)
192
+ url = DOMAIN + path
193
+ uri = URI url
194
+ resp = Net::HTTP.get_response(uri)
195
+ raise RelatonBib::RequestError, "#{url} not found." if %w[404 302].include? resp.code
196
+
197
+ resp.code == "301" ? get_redirection(resp["location"]) : [resp, uri]
198
+ end
195
199
 
196
- resp.code == "301" ? get_redirection(resp["location"]) : [resp, uri]
197
- end
200
+ #
201
+ # The iso.org site fails to respond sometimes. This method tries to get
202
+ # the response again.
203
+ #
204
+ # @param [Net::HTTPOK] resp HTTP response
205
+ # @param [URI::HTTPS] uri URI of the page
206
+ #
207
+ # @return [Nokogiri::HTML4::Document] document
208
+ # @raise [RelatonBib::RequestError] if the page could not be parsed
209
+ #
210
+ def try_if_fail(resp, uri)
211
+ 10.times do
212
+ doc = Nokogiri::HTML(resp.body)
213
+ # stop trying if page has a document id
214
+ return doc if item_ref doc
198
215
 
199
- #
200
- # The iso.org site fails to respond sometimes. This method tries to get
201
- # the response again.
202
- #
203
- # @param [Net::HTTPOK] resp HTTP response
204
- # @param [URI::HTTPS] uri URI of the page
205
- #
206
- # @return [Nokogiri::HTML4::Document] document
207
- # @raise [RelatonBib::RequestError] if the page could not be parsed
208
- #
209
- def try_if_fail(resp, uri)
210
- 10.times do
211
- doc = Nokogiri::HTML(resp.body)
212
- # stop trying if page has a document id
213
- return doc if item_ref doc
214
-
215
- resp = Net::HTTP.get_response(uri)
216
- end
217
- raise RelatonBib::RequestError, "Could not parse the page #{uri}"
216
+ resp = Net::HTTP.get_response(uri)
218
217
  end
218
+ raise RelatonBib::RequestError, "Could not parse the page #{uri}"
219
+ end
219
220
 
220
- #
221
- # Generate docnumber.
222
- #
223
- # @param [Pubid::Iso] pubid
224
- #
225
- # @return [String] docnumber
226
- #
227
- def fetch_docnumber(pubid)
228
- pubid.to_s.match(/\d+/)&.to_s
229
- end
221
+ #
222
+ # Generate docnumber.
223
+ #
224
+ # @param [Pubid::Iso] pubid
225
+ #
226
+ # @return [String] docnumber
227
+ #
228
+ def fetch_docnumber(pubid)
229
+ pubid.to_s.match(/\d+/)&.to_s
230
+ end
230
231
 
231
- #
232
- # Parse structuredidentifier.
233
- #
234
- # @param pubid [Pubid::Iso::Identifier] pubid
235
- #
236
- # @return [RelatonBib::StructuredIdentifier] structured identifier
237
- #
238
- def fetch_structuredidentifier(pubid) # rubocop:disable Metrics/MethodLength
239
- RelatonIsoBib::StructuredIdentifier.new(
240
- project_number: "#{pubid.publisher} #{pubid.number}",
241
- part: pubid.part&.to_s, # &.sub(/^-/, ""),
242
- type: pubid.publisher,
243
- )
244
- end
232
+ #
233
+ # Parse structuredidentifier.
234
+ #
235
+ # @param pubid [Pubid::Iso::Identifier] pubid
236
+ #
237
+ # @return [RelatonBib::StructuredIdentifier] structured identifier
238
+ #
239
+ def fetch_structuredidentifier(pubid) # rubocop:disable Metrics/MethodLength
240
+ RelatonIsoBib::StructuredIdentifier.new(
241
+ project_number: "#{pubid.publisher} #{pubid.number}",
242
+ part: pubid.part&.to_s, # &.sub(/^-/, ""),
243
+ type: pubid.publisher,
244
+ )
245
+ end
245
246
 
246
- def item_ref(doc)
247
- doc.at("//main//section/div/div/div//h1")&.text
248
- end
247
+ def item_ref(doc)
248
+ doc.at("//main//section/div/div/div//h1")&.text
249
+ end
249
250
 
250
- # Fetch status.
251
- # @param doc [Nokogiri::HTML::Document]
252
- # @param status [String]
253
- # @return [Hash]
254
- def fetch_status(doc)
255
- stg, substg = stage_code(doc).split "."
256
- RelatonBib::DocumentStatus.new(stage: stg, substage: substg)
257
- end
251
+ # Fetch status.
252
+ # @param doc [Nokogiri::HTML::Document]
253
+ # @param status [String]
254
+ # @return [RelatonBib::DocumentStatus]
255
+ def fetch_status(doc)
256
+ stg, substg = stage_code(doc).split "."
257
+ RelatonBib::DocumentStatus.new(stage: stg, substage: substg)
258
+ end
258
259
 
259
- def stage_code(doc)
260
- doc.at("//ul[@class='dropdown-menu']/li[@class='active']" \
261
- "/a/span[@class='stage-code']").text
262
- end
260
+ def stage_code(doc)
261
+ doc.at("//ul[@class='dropdown-menu']/li[@class='active']" \
262
+ "/a/span[@class='stage-code']").text
263
+ end
263
264
 
264
- # def stage(stg, substg)
265
- # abbr = STGABBR[stg].is_a?(Hash) ? STGABBR[stg][substg] : STGABBR[stg]
266
- # RelatonBib::DocumentStatus::Stage.new value: stg, abbreviation: abbr
267
- # end
268
-
269
- # Fetch workgroup.
270
- # @param doc [Nokogiri::HTML::Document]
271
- # @return [Hash]
272
- def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
273
- wg = doc.at("////div[contains(., 'Technical Committe')]/following-sibling::span/a")
274
- return unless wg
275
-
276
- workgroup = wg.text.split "/"
277
- type = workgroup[1]&.match(/^[A-Z]+/)&.to_s || "TC"
278
- # {
279
- # name: "International Organization for Standardization",
280
- # abbreviation: "ISO",
281
- # url: "www.iso.org",
282
- # }
283
- tc_numb = workgroup[1]&.match(/\d+/)&.to_s&.to_i
284
- tc_name = wg[:title]
285
- tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg.text,
286
- type: type, number: tc_numb)
287
- RelatonIsoBib::EditorialGroup.new(technical_committee: [tc])
288
- end
265
+ # def stage(stg, substg)
266
+ # abbr = STGABBR[stg].is_a?(Hash) ? STGABBR[stg][substg] : STGABBR[stg]
267
+ # RelatonBib::DocumentStatus::Stage.new value: stg, abbreviation: abbr
268
+ # end
269
+
270
+ # Fetch workgroup.
271
+ # @param doc [Nokogiri::HTML::Document]
272
+ # @return [RelatonIsoBib::EditorialGroup, nil]
273
+ def fetch_workgroup(doc) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize,Metrics/CyclomaticComplexity,Metrics/PerceivedComplexity
274
+ wg = doc.at("////div[contains(., 'Technical Committe')]/following-sibling::span/a")
275
+ return unless wg
276
+
277
+ workgroup = wg.text.split "/"
278
+ type = workgroup[1]&.match(/^[A-Z]+/)&.to_s || "TC"
279
+ # {
280
+ # name: "International Organization for Standardization",
281
+ # abbreviation: "ISO",
282
+ # url: "www.iso.org",
283
+ # }
284
+ tc_numb = workgroup[1]&.match(/\d+/)&.to_s&.to_i
285
+ tc_name = wg[:title]
286
+ tc = RelatonBib::WorkGroup.new(name: tc_name, identifier: wg.text,
287
+ type: type, number: tc_numb)
288
+ RelatonIsoBib::EditorialGroup.new(technical_committee: [tc])
289
+ end
289
290
 
290
- # Fetch relations.
291
- # @param doc [Nokogiri::HTML::Document]
292
- # @return [Array<Hash>]
293
- def fetch_relations(doc)
294
- types = ["Now", "Now under review"]
295
- doc.xpath("//ul[@class='steps']/li", "//div[@class='sub-step']").reduce([]) do |a, r|
296
- type, date = relation_type(r.at("h4", "h5").text.strip, doc)
297
- next a if types.include?(type)
291
+ # Fetch relations.
292
+ # @param doc [Nokogiri::HTML::Document]
293
+ # @return [Array<Hash>]
294
+ def fetch_relations(doc)
295
+ types = ["Now", "Now under review"]
296
+ doc.xpath("//ul[@class='steps']/li", "//div[@class='sub-step']").reduce([]) do |a, r|
297
+ type, date = relation_type(r.at("h4", "h5").text.strip, doc)
298
+ next a if types.include?(type)
298
299
 
299
- a + create_relations(r, type, date)
300
- end
300
+ a + create_relations(r, type, date)
301
301
  end
302
+ end
302
303
 
303
- def relation_type(type, doc)
304
- date = []
305
- t = case type.strip
306
- when "Previously", "Will be replaced by" then "obsoletes"
307
- when "Corrigenda / Amendments", "Revised by", "Now confirmed"
308
- on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
309
- date << { type: "circulated", on: on.text } if on
310
- "updates"
311
- else type
312
- end
313
- [t, date]
314
- end
304
+ def relation_type(type, doc)
305
+ date = []
306
+ t = case type.strip
307
+ when "Previously", "Will be replaced by" then "obsoletes"
308
+ when "Corrigenda / Amendments", "Revised by", "Now confirmed"
309
+ on = doc.xpath('//span[@class="stage-date"][contains(., "-")]').last
310
+ date << { type: "circulated", on: on.text } if on
311
+ "updates"
312
+ else type
313
+ end
314
+ [t, date]
315
+ end
315
316
 
316
- def create_relations(rel, type, date)
317
- rel.css("a").map do |id|
318
- docid = RelatonBib::DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
319
- fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
320
- bibitem = RelatonIsoBib::IsoBibliographicItem.new(
321
- docid: [docid], formattedref: fref, date: date,
322
- )
323
- { type: type, bibitem: bibitem }
324
- end
317
+ def create_relations(rel, type, date)
318
+ rel.css("a").map do |id|
319
+ docid = RelatonBib::DocumentIdentifier.new(type: "ISO", id: id.text, primary: true)
320
+ fref = RelatonBib::FormattedRef.new(content: id.text, format: "text/plain")
321
+ bibitem = RelatonIsoBib::IsoBibliographicItem.new(
322
+ docid: [docid], formattedref: fref, date: date,
323
+ )
324
+ { type: type, bibitem: bibitem }
325
325
  end
326
+ end
326
327
 
327
- # Fetch type.
328
- # @param ref [String]
329
- # @return [String]
330
- def fetch_type(ref)
331
- %r{
332
- ^(?<prefix>ISO|IWA|IEC)
333
- (?:(?:/IEC|/IEEE|/PRF|/NP|/DGuide)*\s|/)
334
- (?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))
335
- }x =~ ref
336
- # return "international-standard" if type_match.nil?
337
- if TYPES[type] then TYPES[type]
338
- elsif prefix == "ISO" then "international-standard"
339
- elsif prefix == "IWA" then "international-workshop-agreement"
340
- end
341
- # rescue => _e
342
- # puts 'Unknown document type: ' + title
328
+ # Fetch type.
329
+ # @param ref [String]
330
+ # @return [String]
331
+ def fetch_type(ref)
332
+ %r{
333
+ ^(?<prefix>ISO|IWA|IEC)
334
+ (?:(?:/IEC|/IEEE|/PRF|/NP|/DGuide)*\s|/)
335
+ (?<type>TS|TR|PAS|AWI|CD|FDIS|NP|DIS|WD|R|Guide|(?=\d+))
336
+ }x =~ ref
337
+ # return "international-standard" if type_match.nil?
338
+ if TYPES[type] then TYPES[type]
339
+ elsif prefix == "ISO" then "international-standard"
340
+ elsif prefix == "IWA" then "international-workshop-agreement"
343
341
  end
342
+ # rescue => _e
343
+ # puts 'Unknown document type: ' + title
344
+ end
344
345
 
345
- # Fetch titles.
346
- # @param doc [Nokogiri::HTML::Document]
347
- # @param lang [String]
348
- # @return [Array<RelatonBib::TypedTitleString>]
349
- def fetch_title(doc, lang) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
350
- head = doc.at "//nav[contains(@class,'heading-condensed')]"
351
- types = { "h2" => "title-intro", "h3" => "title-main", "h4" => "title-part" }
352
- title_types = head.xpath("h2 | h3 | h4").each_with_object({}) do |t, h|
353
- h[types[t.name]] = t.text
354
- end
355
- title = RelatonBib::TypedTitleStringCollection.new
356
- title_types.each do |type, content|
357
- title << RelatonBib::TypedTitleString.new(
358
- type: type, content: content, language: lang, script: script(lang),
359
- )
360
- end
361
- main = title.map { |t| t.title.content }.join " - "
362
- title << RelatonBib::TypedTitleString.new(type: "main", content: main, language: lang, script: script(lang))
346
+ # Fetch titles.
347
+ # @param doc [Nokogiri::HTML::Document]
348
+ # @param lang [String]
349
+ # @return [Array<RelatonBib::TypedTitleString>]
350
+ def fetch_title(doc, lang) # rubocop:disable Metrics/MethodLength,Metrics/AbcSize
351
+ types = %w[title-intro title-main title-part]
352
+ ttls = titles(doc)
353
+ title = RelatonBib::TypedTitleStringCollection.new
354
+ ttls.each.with_index do |p, i|
355
+ next unless p
356
+
357
+ title << RelatonBib::TypedTitleString.new(
358
+ type: types[i], content: p, language: lang, script: script(lang),
359
+ )
360
+ end.compact
361
+ main = title.map { |t| t.title.content }.join " - "
362
+ title << RelatonBib::TypedTitleString.new(type: "main", content: main, language: lang, script: script(lang))
363
+ end
364
+
365
+ def titles(doc)
366
+ head = doc.at "//nav[contains(@class,'heading-condensed')]"
367
+ ttls = head.xpath("h2 | h3 | h4").map &:text
368
+ ttls = ttls[0].split " - " if ttls.size == 1
369
+ case ttls.size
370
+ when 0, 1 then [nil, ttls.first, nil]
371
+ else RelatonBib::TypedTitleString.intro_or_part ttls
363
372
  end
373
+ end
364
374
 
365
- # Return ISO script code.
366
- # @param lang [String]
367
- # @return [String]
368
- def script(lang)
369
- case lang
370
- when "en", "fr" then "Latn"
371
- # when "ru" then "Cyrl"
372
- end
375
+ # Return ISO script code.
376
+ # @param lang [String]
377
+ # @return [String]
378
+ def script(lang)
379
+ case lang
380
+ when "en", "fr" then "Latn"
381
+ # when "ru" then "Cyrl"
373
382
  end
383
+ end
374
384
 
375
- # Fetch dates
376
- # @param doc [Nokogiri::HTML::Document]
377
- # @param ref [String]
378
- # @return [Array<Hash>]
379
- def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity, Metrics/MethodLength
380
- dates = []
381
- %r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ ref
382
- pub_date_str = doc.xpath("//span[@itemprop='releaseDate']").text
383
- if ref_date_str
384
- ref_date = Date.strptime ref_date_str, "%Y"
385
- if pub_date_str.empty?
385
+ # Fetch dates
386
+ # @param doc [Nokogiri::HTML::Document]
387
+ # @param ref [String]
388
+ # @return [Array<Hash>]
389
+ def fetch_dates(doc, ref) # rubocop:disable Metrics/AbcSize, Metrics/PerceivedComplexity, Metrics/MethodLength
390
+ dates = []
391
+ %r{^[^\s]+\s[\d-]+:(?<ref_date_str>\d{4})} =~ ref
392
+ pub_date_str = doc.at("//span[@itemprop='releaseDate']")
393
+ if ref_date_str
394
+ ref_date = Date.strptime ref_date_str, "%Y"
395
+ if pub_date_str.nil?
396
+ dates << { type: "published", on: ref_date_str }
397
+ else
398
+ pub_date = Date.strptime pub_date_str.text, "%Y"
399
+ if pub_date.year > ref_date.year
386
400
  dates << { type: "published", on: ref_date_str }
401
+ dates << { type: "updated", on: pub_date_str.text }
387
402
  else
388
- pub_date = Date.strptime pub_date_str, "%Y"
389
- if pub_date.year > ref_date.year
390
- dates << { type: "published", on: ref_date_str }
391
- dates << { type: "updated", on: pub_date_str }
392
- else
393
- dates << { type: "published", on: pub_date_str }
394
- end
403
+ dates << { type: "published", on: pub_date_str.text }
395
404
  end
396
- elsif !pub_date_str.empty?
397
- dates << { type: "published", on: pub_date_str }
398
405
  end
399
- dates
406
+ elsif pub_date_str
407
+ dates << { type: "published", on: pub_date_str.text }
400
408
  end
409
+ corr_data = doc.at "//span[@itemprop='dateModified']"
410
+ dates << { type: "corrected", on: corr_data.text } if corr_data
411
+ dates
412
+ end
401
413
 
402
- def fetch_contributors(ref)
403
- ref.sub(/\s.*/, "").split("/").reduce([]) do |mem, abbrev|
404
- publisher = PUBLISHERS[abbrev]
405
- next mem unless publisher
414
+ def fetch_contributors(ref)
415
+ ref.sub(/\s.*/, "").split("/").reduce([]) do |mem, abbrev|
416
+ publisher = PUBLISHERS[abbrev]
417
+ next mem unless publisher
406
418
 
407
- publisher[:abbreviation] = abbrev
408
- mem << { entity: publisher, role: [type: "publisher"] }
409
- end
419
+ publisher[:abbreviation] = abbrev
420
+ mem << { entity: publisher, role: [type: "publisher"] }
410
421
  end
422
+ end
411
423
 
412
- # Fetch ICS.
413
- # @param doc [Nokogiri::HTML::Document]
414
- # @return [Array<Hash>]
415
- def fetch_ics(doc)
416
- doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
417
- code = i.text.match(/[\d.]+/).to_s.split "."
418
- { field: code[0], group: code[1], subgroup: code[2] }
419
- end
424
+ # Fetch ICS.
425
+ # @param doc [Nokogiri::HTML::Document]
426
+ # @return [Array<Hash>]
427
+ def fetch_ics(doc)
428
+ doc.xpath("//div[contains(., 'ICS')]/following-sibling::span/a").map do |i|
429
+ code = i.text.match(/[\d.]+/).to_s.split "."
430
+ { field: code[0], group: code[1], subgroup: code[2] }
420
431
  end
432
+ end
421
433
 
422
- #
423
- # Fetch links.
424
- #
425
- # @param doc [Nokogiri::HTML::Document] document to parse
426
- # @param url [String] document url
427
- #
428
- # @return [Array<Hash>]
429
- #
430
- def fetch_link(doc, url)
431
- links = [{ type: "src", content: url }]
432
- obp = doc.at("//h4[contains(@class, 'h5')]/a")
433
- links << { type: "obp", content: obp[:href] } if obp
434
- rss = doc.at("//a[contains(@href, 'rss')]")
435
- links << { type: "rss", content: DOMAIN + rss[:href] } if rss
436
- pub = doc.at "//p[contains(., 'publicly available')]/a",
437
- "//p[contains(., 'can be downloaded from the')]/a"
438
- links << { type: "pub", content: pub[:href] } if pub
439
- links
440
- end
434
+ #
435
+ # Fetch links.
436
+ #
437
+ # @param doc [Nokogiri::HTML::Document] document to parse
438
+ # @param url [String] document url
439
+ #
440
+ # @return [Array<Hash>]
441
+ #
442
+ def fetch_link(doc, url)
443
+ links = [{ type: "src", content: url }]
444
+ obp = doc.at("//h4[contains(@class, 'h5')]/a")
445
+ links << { type: "obp", content: obp[:href] } if obp
446
+ rss = doc.at("//a[contains(@href, 'rss')]")
447
+ links << { type: "rss", content: DOMAIN + rss[:href] } if rss
448
+ pub = doc.at "//p[contains(., 'publicly available')]/a",
449
+ "//p[contains(., 'can be downloaded from the')]/a"
450
+ links << { type: "pub", content: pub[:href] } if pub
451
+ links
452
+ end
441
453
 
442
- # Fetch copyright.
443
- # @param doc [Nokogiri::HTML::Document]
444
- # @return [Array<Hash>]
445
- def fetch_copyright(doc) # rubocop:disable Metrics/MethodLength
446
- ref = item_ref doc
447
- owner_name = ref.match(/.*?(?=\s)/).to_s
448
- from = ref.match(/(?<=:)\d{4}/).to_s
449
- if from.empty?
450
- date = doc.at(
451
- "//span[@itemprop='releaseDate']",
452
- "//ul[@id='stages']/li[contains(@class,'active')]/ul/li[@class='active']/a/span[@class='stage-date']",
453
- )
454
- from = date.text.match(/\d{4}/).to_s
455
- end
456
- [{ owner: [{ name: owner_name }], from: from }]
454
+ # Fetch copyright.
455
+ # @param doc [Nokogiri::HTML::Document]
456
+ # @return [Array<Hash>]
457
+ def fetch_copyright(doc) # rubocop:disable Metrics/MethodLength
458
+ ref = item_ref doc
459
+ owner_name = ref.match(/.*?(?=\s)/).to_s
460
+ from = ref.match(/(?<=:)\d{4}/).to_s
461
+ if from.empty?
462
+ date = doc.at(
463
+ "//span[@itemprop='releaseDate']",
464
+ "//ul[@id='stages']/li[contains(@class,'active')]/ul/li[@class='active']/a/span[@class='stage-date']",
465
+ )
466
+ from = date.text.match(/\d{4}/).to_s
457
467
  end
468
+ [{ owner: [{ name: owner_name }], from: from }]
458
469
  end
459
470
  end
460
471
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module RelatonIso
4
- VERSION = "1.16.2"
4
+ VERSION = "1.16.4"
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-iso
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.16.2
4
+ version: 1.16.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2023-10-20 00:00:00.000000000 Z
11
+ date: 2023-11-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: algolia