relaton-iso 2.1.1 → 2.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,443 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+ require_relative "../iso"
5
+ require_relative "scraper"
6
+
7
module Relaton
  module Iso
    #
    # Parses one ISO Open Data record (`iso_deliverables_metadata.jsonl` line)
    # into a `Relaton::Iso::ItemData`.
    #
    # See https://www.iso.org/open-data.html for the field reference.
    #
    class DataParser
      # Keyword names handed to `ItemData.new`; each value is produced by the
      # private method of the same name (see #parse).
      ATTRS = %i[
        type docidentifier docnumber edition language script title status ics
        date contributor abstract copyright source relation place
        structuredidentifier ext
      ].freeze

      # `deliverableType` value -> doctype content.
      DOCTYPES = {
        "IS" => "international-standard",
        "TS" => "technical-specification",
        "TR" => "technical-report",
        "PAS" => "publicly-available-specification",
        "GUIDE" => "guide",
        "IWA" => "international-workshop-agreement",
        "R" => "recommendation",
        "ISP" => "international-standard",
        "DATA" => "international-standard",
        "TTA" => "international-standard",
      }.freeze

      # `supplementType` value -> doctype content; checked before DOCTYPES.
      SUPPLEMENT_DOCTYPES = {
        "Amd" => "amendment",
        "Cor" => "technical-corrigendum",
        "Add" => "addendum",
      }.freeze

      DOC_URL = "https://www.iso.org/standard/%d.html"
      OBP_URL = "https://www.iso.org/obp/ui/en/#!iso:std:%d:en"
      RSS_URL = "https://www.iso.org/contents/data/standard/%s/%s/%d.detail.rss"

      #
      # @param [Hash] pub one Open Data record
      # @param [Hash{Integer=>String}] ref_index map of Open Data `id` ->
      #   `reference`, used to resolve `replaces` / `replacedBy` (which are
      #   numeric IDs in the source).
      # @param [Hash] errors error accumulator (`Hash.new(true)`); fields are
      #   AND-ed across all records by the `report_errors` machinery.
      # @param [Hash{String=>Hash}] tc_index map of TC/SC reference ->
      #   `{ "en" => title, "fr" => title }`, used to resolve the human
      #   committee label from the Open Data technical-committees dataset.
      # @param [Hash{String=>Array<String>}] amend_index map of base
      #   reference -> list of supplement (Amd/Cor/Add) references that
      #   target it. Open Data records the supplement -> base direction only
      #   via the reference string, so we pre-build the reverse map.
      # @param [Hash{String=>String}] date_index map of reference ->
      #   `publicationDate`, used to attach a `published` date to each
      #   emitted relation's bibitem when the related document is itself
      #   present in the Open Data feed.
      #
      def initialize(pub, ref_index = {}, errors = {}, tc_index = {}, amend_index = {}, date_index = {})
        @pub = pub
        @ref_index = ref_index
        @errors = errors
        @tc_index = tc_index
        @amend_index = amend_index
        @date_index = date_index
      end

      #
      # Build the bibliographic item for the record.
      #
      # @return [Relaton::Iso::ItemData] item built from every ATTRS reader
      #
      def parse
        ItemData.new(**ATTRS.each_with_object({}) { |a, h| h[a] = send(a) })
      end

      private

      def type = "standard"

      # ---- identifiers -----------------------------------------------------

      # @return [String] the record's `reference`, or "" when it is absent
      def reference
        @reference ||= @pub["reference"] || ""
      end

      # Parsed identifier for #reference. Memoized with `defined?` so that a
      # failed parse (nil) is not retried and warned about repeatedly.
      #
      # @return [Object, nil] parsed pubid, or nil when parsing raised
      def pubid
        return @pubid if defined?(@pubid)

        @pubid = begin
          ::Pubid::Iso::Identifier.parse(reference)
        rescue StandardError => e
          Util.warn "Failed to parse pubid `#{reference}`: #{e.message}"
          nil
        end
      end

      # Primary "ISO" identifier plus, when the reference parsed, the
      # "iso-reference" and "URN" variants. Falls back to the raw reference
      # string when parsing failed.
      #
      # @return [Array<Docidentifier>]
      def docidentifier
        ids = []
        if pubid
          ids << Docidentifier.new(content: pubid, type: "ISO", primary: true)
          if (ref = iso_reference_pubid)
            ids << Docidentifier.new(content: ref, type: "iso-reference")
          end
          if (urn = safe_urn_docid)
            ids << urn
          end
        else
          ids << Docidentifier.new(content: reference, type: "ISO", primary: true)
        end
        @errors[:docidentifier] &&= ids.empty?
        ids
      end

      # @return [Docidentifier, nil] URN identifier; nil when no URN pubid is
      #   available or building the identifier raised
      def safe_urn_docid
        return nil unless urn_pubid

        Docidentifier.new(content: urn_pubid, type: "URN")
      rescue StandardError
        nil
      end

      # Re-creates the identifier with `language: "en"` and without
      # `:typed_stage`.
      #
      # @return [Object, nil] the identifier, or nil when creation raised
      def iso_reference_pubid
        params = pubid.to_h.except(:typed_stage)
        ::Pubid::Iso::Identifier.create(language: "en", **params)
      rescue StandardError
        nil
      end

      # Copy of #pubid that gets the record's stage assigned when the copy
      # carries none of its own. Memoized, including a nil result.
      #
      # @return [Object, nil]
      def urn_pubid
        return @urn_pubid if defined?(@urn_pubid)

        @urn_pubid = begin
          dup_pubid = pubid.dup
          if dup_pubid.respond_to?(:stage=) && stage_dotted &&
              dup_pubid.respond_to?(:stage) && dup_pubid.stage.nil?
            dup_pubid.stage = ::Pubid::Iso::Identifier.parse_stage(stage_dotted)
          end
          dup_pubid
        rescue StandardError
          nil
        end
      end

      # First run of digits in the identifier. When the reference could not
      # be parsed the raw reference string is searched instead, mirroring the
      # fallback used by #docidentifier and #amendment_relations.
      #
      # @return [String, nil]
      def docnumber
        (pubid&.to_s || reference)[/\d+/]
      end

      # @return [Bib::Edition, nil] nil when the record has no `edition`
      def edition
        return nil unless @pub["edition"]

        Bib::Edition.new(content: @pub["edition"].to_s)
      end

      # ---- language / script ----------------------------------------------

      # @return [Array<String>] unique `languages`; ["en"] when none are given
      def language
        langs = Array(@pub["languages"]).dup
        langs << "en" if langs.empty?
        langs.uniq
      end

      # @return [Array<String>] unique scripts of the languages known to
      #   #script_for
      def script
        language.filter_map { |l| script_for(l) }.uniq
      end

      # @param [String] lang language code
      # @return [String, nil] script code; nil for any other language
      def script_for(lang)
        case lang
        when "en", "fr" then "Latn"
        when "ru" then "Cyrl"
        end
      end

      # ---- title -----------------------------------------------------------

      # @return [Array] English titles followed by French titles
      def title
        result = []
        result += titles_for("en")
        result += titles_for("fr")
        @errors[:title] &&= result.empty?
        result
      end

      # @param [String] lang key under the record's `title` hash
      # @return [Array] titles parsed from that string; [] when it is missing
      #   or empty
      def titles_for(lang)
        raw = @pub.dig("title", lang)
        return [] if raw.nil? || raw.empty?

        Bib::Title.from_string(normalize_dashes(raw), lang, script_for(lang))
      end

      # Replaces whitespace-delimited em/en dashes with " - ".
      #
      # @param [String] str
      # @return [String]
      def normalize_dashes(str)
        str.gsub(/\s[—–]\s/, " - ")
      end

      # ---- status ----------------------------------------------------------

      # Open Data exposes a 4-digit stage code (e.g. 2098 = 20.98, 6060 = 60.60).
      # Records occasionally come through with 2 or 3 digits (zero-padded).
      #
      # @return [String, nil] stage in "NN.NN" form; nil when `currentStage`
      #   is absent, blank or not numeric
      def stage_dotted
        return @stage_dotted if defined?(@stage_dotted)

        @stage_dotted = normalize_stage(@pub["currentStage"])
      end

      # Converts a raw stage value into "NN.NN" form.
      #
      # @param [Integer, String, nil] raw
      # @return [String, nil]
      def normalize_stage(raw)
        code = raw.is_a?(Numeric) ? raw.to_i.to_s : raw.to_s.strip
        case code
        when /\A\d{2}\.\d{2}\z/
          code # already dotted; `to_i` would truncate it to the stage part
        when /\A\d+\z/
          digits = format("%04d", code.to_i)
          "#{digits[0, 2]}.#{digits[2, 2]}"
        end
      end

      # @return [Bib::Status, nil] nil when no stage is known
      def status
        return nil unless stage_dotted

        stg, sub = stage_dotted.split(".")
        Bib::Status.new(
          stage: Bib::Status::Stage.new(content: stg),
          substage: sub ? Bib::Status::Stage.new(content: sub) : nil,
        )
      end

      # ---- ICS -------------------------------------------------------------

      # @return [Array<Bib::ICS>] one entry per `icsCode`; `text` is nil when
      #   the code could not be looked up
      def ics
        Array(@pub["icsCode"]).map do |code|
          info = safe_isoics_fetch(code)
          Bib::ICS.new(code: code, text: info&.description)
        end
      end

      # @return [Object, nil] `Isoics.fetch` result; nil when the lookup raised
      def safe_isoics_fetch(code)
        Isoics.fetch code
      rescue StandardError
        nil
      end

      # ---- dates -----------------------------------------------------------

      # @return [Array<Bib::Date>] single "published" date, or [] when
      #   `publicationDate` is missing or empty
      def date
        pd = @pub["publicationDate"]
        return [] if pd.nil? || pd.empty?

        [Bib::Date.new(type: "published", at: pd)]
      end

      # ---- contributors ----------------------------------------------------

      # @return [Array<Bib::Contributor>] publishers followed by the owner
      #   committee, when there is one
      def contributor
        publishers + Array(editorialgroup_contributor)
      end

      # One publisher per "/"-separated abbreviation in the part of the
      # reference before the first whitespace; abbreviations missing from
      # `Scraper::PUBLISHERS` are skipped.
      #
      # @return [Array<Bib::Contributor>]
      def publishers
        reference.sub(/\s.*/, "").split("/").filter_map do |abbrev|
          info = Scraper::PUBLISHERS[abbrev]
          next unless info

          name = Bib::TypedLocalizedString.new(content: info[:name])
          abbr = Bib::LocalizedString.new(content: abbrev)
          uri = Bib::Uri.new(content: info[:uri]) if info[:uri]
          org = Bib::Organization.new(name: [name], abbreviation: abbr, uri: [uri].compact)
          role = Bib::Contributor::Role.new(type: "publisher")
          Bib::Contributor.new(organization: org, role: [role])
        end
      end

      # Author contributor describing the record's `ownerCommittee`.
      #
      # @return [Bib::Contributor, nil] nil when `ownerCommittee` is missing
      #   or empty
      def editorialgroup_contributor
        wg = @pub["ownerCommittee"]
        return nil if wg.nil? || wg.empty?

        parts = wg.split("/")
        prefix = parts[0]
        # Leading capitals of the second path segment; "TC" when there is none.
        type = parts[1]&.match(/^[A-Z]+/)&.to_s || "TC"

        publisher = Scraper::PUBLISHERS[prefix]
        name = if publisher
                 [Bib::TypedLocalizedString.new(content: publisher[:name])]
               elsif prefix
                 [Bib::TypedLocalizedString.new(content: prefix)]
               else
                 []
               end
        abbreviation = (Bib::LocalizedString.new(content: prefix) if prefix)

        # Prefer the English committee title; fall back to the raw reference.
        label = @tc_index.dig(wg, "en") || wg
        subdivision = Bib::Subdivision.new(
          type: "technical-committee",
          subtype: type,
          name: [Bib::TypedLocalizedString.new(content: label)],
          identifier: [Bib::OrganizationType::Identifier.new(content: wg)],
        )

        role = Bib::Contributor::Role.new(
          type: "author",
          description: [Bib::LocalizedMarkedUpString.new(content: "committee")],
        )

        Bib::Contributor.new(
          role: [role],
          organization: Bib::Organization.new(
            name: name, subdivision: [subdivision], abbreviation: abbreviation,
          ),
        )
      end

      # ---- abstract --------------------------------------------------------

      # @return [Array<Bib::Abstract>] English/French abstracts taken from the
      #   record's `scope` HTML; languages with no text are skipped
      def abstract
        %w[en fr].filter_map do |lang|
          html = @pub.dig("scope", lang)
          next if html.nil? || html.empty?

          text = strip_html(html)
          next if text.empty?

          Bib::Abstract.new(content: text, language: lang, script: script_for(lang))
        end
      end

      # @param [String] html HTML fragment
      # @return [String] its text with whitespace runs collapsed to one space
      def strip_html(html)
        Nokogiri::HTML.fragment(html).text.strip.gsub(/\s+/, " ")
      end

      # ---- copyright -------------------------------------------------------

      # The year is the first 4-digit run following a colon in the reference,
      # else the first 4-digit run in `publicationDate`. The owner name is the
      # part of the reference before the first whitespace.
      #
      # @return [Array<Bib::Copyright>] [] when no year can be determined
      def copyright
        from = reference[/(?<=:)\d{4}/] ||
          @pub["publicationDate"]&.match(/\d{4}/)&.to_s
        return [] unless from && !from.empty?

        owner_name = reference.match(/.*?(?=\s)/).to_s
        name = Bib::TypedLocalizedString.new(content: owner_name)
        org = Bib::Organization.new(name: [name])
        contrib = Bib::ContributionInfo.new(organization: org)
        [Bib::Copyright.new(owner: [contrib], from: from)]
      end

      # ---- source links ----------------------------------------------------

      # @return [Array<Bib::Uri>] "src", "obp" and "rss" links built from the
      #   record `id`; [] when the record has no `id`
      def source
        id = @pub["id"]
        return [] unless id

        # The RSS path is keyed by the first two digit pairs of the
        # zero-padded, six-digit id.
        pad = format("%06d", id)
        [
          Bib::Uri.new(type: "src", content: format(DOC_URL, id)),
          Bib::Uri.new(type: "obp", content: format(OBP_URL, id)),
          Bib::Uri.new(type: "rss", content: format(RSS_URL, pad[0, 2], pad[2, 2], id)),
        ]
      end

      # ---- relations -------------------------------------------------------

      # Open Data semantics:
      #   * `replaces`   - older docs THIS one supersedes -> `obsoletes`
      #   * `replacedBy` - newer docs that supersede THIS one -> `obsoletedBy`
      # Amendments/corrigenda/addenda are stitched in via two routes:
      #   * on the BASE record, look up `@amend_index` for supplements
      #     targeting it (-> `updatedBy`); the index is pre-built in
      #     `DataFetcher#build_ref_index` because Open Data only records
      #     the supplement -> base direction via the reference string.
      #   * on the SUPPLEMENT record itself, derive the base from
      #     `pubid.base` and emit the forward `updates` relation.
      def relation
        rels = []
        rels += build_relations(@pub["replaces"], "obsoletes")
        rels += build_relations(@pub["replacedBy"], "obsoletedBy")
        rels += amendment_relations
        rels += base_relation
        rels
      end

      # @param [Array, Object, nil] ids Open Data ids of the related documents
      # @param [String] type relation type
      # @return [Array<Relation>] one relation per id found in the reference
      #   index; unknown ids are dropped
      def build_relations(ids, type)
        Array(ids).filter_map do |id|
          ref = lookup_reference(id)
          relation_for(ref, type) if ref
        end
      end

      # Resolve an Open Data id to its reference, tolerating an Integer/String
      # mismatch between the id and the index keys in either direction.
      #
      # @param [Integer, String] id
      # @return [String, nil]
      def lookup_reference(id)
        @ref_index[id] || @ref_index[id.to_s] ||
          (@ref_index[id.to_i] if id.is_a?(String) && id.match?(/\A\d+\z/))
      end

      # @return [Array<Relation>] `updatedBy` relations for the supplements
      #   indexed under this document
      def amendment_relations
        Array(@amend_index[pubid&.to_s || reference]).map do |amend_ref|
          relation_for(amend_ref, "updatedBy")
        end
      end

      # @return [Array<Relation>] single `updates` relation to `pubid.base`,
      #   or [] when the identifier has no base
      def base_relation
        return [] unless pubid&.respond_to?(:base) && pubid.base

        [relation_for(pubid.base.to_s, "updates")]
      end

      # @param [String] ref reference of the related document
      # @param [String] type relation type
      # @return [Relation] relation whose bibitem carries a `published` date
      #   when the date index has a non-empty entry for `ref`
      def relation_for(ref, type)
        docid = Docidentifier.new(content: ref, type: "ISO", primary: true)
        attrs = {
          docidentifier: [docid],
          formattedref: Bib::Formattedref.new(content: ref),
        }
        if (pub_date = @date_index[ref]) && !pub_date.empty?
          attrs[:date] = [Bib::Date.new(type: "published", at: pub_date)]
        end
        Relation.new(type: type, bibitem: ItemData.new(**attrs))
      end

      # ---- structured identifier ------------------------------------------

      # @return [StructuredIdentifier, nil] nil when the record has no `id`
      def structuredidentifier
        return nil unless @pub["id"]

        pnum = ProjectNumber.new(content: @pub["id"].to_s)
        publisher = pubid&.respond_to?(:publisher) ? pubid.publisher : nil
        StructuredIdentifier.new(project_number: pnum, type: publisher || "ISO")
      end

      # ---- place -----------------------------------------------------------

      # @return [Array<Bib::Place>]
      def place
        [Bib::Place.new(city: "Geneva")]
      end

      # ---- ext -------------------------------------------------------------

      # @return [Ext] extension block; the remaining fields are left nil
      def ext
        Ext.new(
          doctype: doctype,
          flavor: "iso",
          ics: ics,
          structuredidentifier: structuredidentifier,
          stagename: nil,
          updates_document_type: nil,
          fast_track: nil,
          price_code: nil,
        )
      end

      # Supplement type wins over deliverable type; anything unknown is
      # treated as an international standard.
      #
      # @return [Doctype]
      def doctype
        type = SUPPLEMENT_DOCTYPES[@pub["supplementType"]] ||
          DOCTYPES[@pub["deliverableType"]] ||
          "international-standard"
        Doctype.new(content: type)
      end
    end
  end
end
@@ -1,7 +1,9 @@
1
+ require_relative "../type/pubid"
2
+
1
3
  module Relaton
2
4
  module Iso
3
5
  class Docidentifier < Bib::Docidentifier
4
- attribute :content, :string
6
+ attribute :content, Type::Pubid
5
7
 
6
8
  attr_reader :pubid
7
9
 
@@ -30,7 +32,10 @@ module Relaton
30
32
  begin
31
33
  ::Pubid::Iso::Identifier.parse(value)
32
34
  rescue StandardError
33
- Util.warn "Failed to parse Pubid: #{value}"
35
+ # Suppress when type is not yet set (lutaml runs the setter
36
+ # once during init before `type` is assigned, then `initialize`
37
+ # re-runs it; only the second pass is authoritative).
38
+ Util.warn "Failed to parse Pubid: #{value}" if type
34
39
  nil
35
40
  end
36
41
  end
@@ -10,7 +10,7 @@ module Relaton
10
10
  @prefix = "ISO"
11
11
  @defaultprefix = %r{^ISO(/IEC)?\s}
12
12
  @idtype = "ISO"
13
- @datasets = %w[iso-ics]
13
+ @datasets = %w[iso-open-data iso-open-data-all]
14
14
  end
15
15
 
16
16
  # @param code [String]
@@ -23,16 +23,19 @@ module Relaton
23
23
  end
24
24
 
25
25
  #
26
- # Fetch all the documents from https://www.iso.org/standards-catalogue/browse-by-ics.html
26
+ # Fetch all the documents from the ISO Open Data programme
27
+ # (https://www.iso.org/open-data.html).
27
28
  #
28
- # @param [String] source source name (iso-rss, iso-rss-all)
29
+ # @param [String] source source name
30
+ # * `iso-open-data` - skip if upstream `Last-Modified` is unchanged
31
+ # * `iso-open-data-all` - wipe `output` and re-emit every record
29
32
  # @param [Hash] opts
30
33
  # @option opts [String] :output directory to output documents
31
34
  # @option opts [String] :format output format (xml, yaml, bibxml)
32
35
  #
33
- def fetch_data(_source, opts)
36
+ def fetch_data(source, opts)
34
37
  require_relative "data_fetcher"
35
- DataFetcher.fetch(**opts)
38
+ DataFetcher.fetch(source, **opts)
36
39
  end
37
40
 
38
41
  # @param xml [String]
@@ -0,0 +1,50 @@
1
module Relaton
  module Iso
    module Type
      # Lutaml-model attribute type that preserves `Pubid::Iso::Identifier::Base`
      # instances on the way in and stringifies them on the way out.
      #
      # The default `:string` type calls `.to_s` during `cast`, which loses the
      # parsed structure and forces `Docidentifier#content=` to re-parse the
      # human-readable form. That round-trip can render dual-type strings
      # (e.g. `"ISO/IS TR 17"` from a TR pubid with stage 60.60) that the
      # pubid-iso parslet grammar can't capture cleanly, producing
      # `Duplicate subtrees while merging result of ROOT` warnings.
      class Pubid < Lutaml::Model::Type::Value
        class << self
          # Pass the value through untouched: nil stays nil, and any other
          # value (including the uninitialized marker) is returned as given.
          def cast(value, _options = {})
            value
          end

          # Stringify the value for output, leaving nil and the uninitialized
          # marker as they are.
          def serialize(value)
            return value if value.nil? || Lutaml::Model::Utils.uninitialized?(value)

            value.to_s
          end

          def default_xsd_type
            "xs:string"
          end
        end

        # Every output format renders the wrapped value as its string form.
        def to_s = value.to_s

        def to_yaml = value.to_s

        def to_xml = value.to_s

        def to_json(*_args) = value.to_s
      end
    end
  end
end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Relaton
4
4
  module Iso
5
- VERSION = "2.1.1"
5
+ VERSION = "2.1.2"
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-iso
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.1
4
+ version: 2.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
- autorequire:
9
8
  bindir: exe
10
9
  cert_chain: []
11
- date: 2026-05-12 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: isoics
@@ -117,14 +116,10 @@ files:
117
116
  - bin/console
118
117
  - bin/rspec
119
118
  - bin/setup
120
- - grammars/basicdoc.rng
121
- - grammars/biblio-standoc.rng
122
- - grammars/biblio.rng
123
- - grammars/relaton-iso-compile.rng
124
- - grammars/relaton-iso.rng
125
119
  - lib/relaton/iso.rb
126
120
  - lib/relaton/iso/bibliography.rb
127
121
  - lib/relaton/iso/data_fetcher.rb
122
+ - lib/relaton/iso/data_parser.rb
128
123
  - lib/relaton/iso/hash_parser_v1.rb
129
124
  - lib/relaton/iso/hit.rb
130
125
  - lib/relaton/iso/hit_collection.rb
@@ -144,8 +139,8 @@ files:
144
139
  - lib/relaton/iso/model/stagename.rb
145
140
  - lib/relaton/iso/model/structured_identifier.rb
146
141
  - lib/relaton/iso/processor.rb
147
- - lib/relaton/iso/queue.rb
148
142
  - lib/relaton/iso/scraper.rb
143
+ - lib/relaton/iso/type/pubid.rb
149
144
  - lib/relaton/iso/util.rb
150
145
  - lib/relaton/iso/version.rb
151
146
  - relaton-iso.gemspec
@@ -153,7 +148,6 @@ homepage: https://github.com/relaton/relaton-iso
153
148
  licenses:
154
149
  - BSD-2-Clause
155
150
  metadata: {}
156
- post_install_message:
157
151
  rdoc_options: []
158
152
  require_paths:
159
153
  - lib
@@ -168,8 +162,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
168
162
  - !ruby/object:Gem::Version
169
163
  version: '0'
170
164
  requirements: []
171
- rubygems_version: 3.5.22
172
- signing_key:
165
+ rubygems_version: 3.6.9
173
166
  specification_version: 4
174
167
  summary: 'Relaton::Iso: retrieve ISO Standards for bibliographic use using the IsoBibliographicItem
175
168
  model'