relaton-iso 2.1.1 → 2.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,444 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "nokogiri"
4
+ require_relative "../iso"
5
+ require_relative "scraper"
6
+
7
+ module Relaton
8
+ module Iso
9
+ #
10
+ # Parses one ISO Open Data record (`iso_deliverables_metadata.jsonl` line)
11
+ # into an `Relaton::Iso::ItemData`.
12
+ #
13
+ # See https://www.iso.org/open-data.html for the field reference.
14
+ #
15
+ class DataParser
16
# Names of the private builder methods whose results `parse` passes to
# `ItemData.new` as keyword arguments (one keyword per name).
ATTRS = [
  :type, :docidentifier, :docnumber, :edition, :language, :script,
  :title, :status, :ics, :date, :contributor, :abstract, :copyright,
  :source, :relation, :place, :structuredidentifier, :ext
].freeze

# Open Data `deliverableType` code -> Relaton doctype string.
DOCTYPES = {
  "IS" => "international-standard",
  "TS" => "technical-specification",
  "TR" => "technical-report",
  "PAS" => "publicly-available-specification",
  "GUIDE" => "guide",
  "IWA" => "international-workshop-agreement",
  "R" => "recommendation",
  # ISP, DATA and TTA have no dedicated doctype and map to the generic one.
  "ISP" => "international-standard",
  "DATA" => "international-standard",
  "TTA" => "international-standard",
}.freeze

# Open Data `supplementType` code -> Relaton doctype string.
SUPPLEMENT_DOCTYPES = {
  "Amd" => "amendment",
  "Cor" => "technical-corrigendum",
  "Add" => "addendum",
  "Suppl" => "supplement",
}.freeze

# `format` templates for the links emitted by `source`; `%d` receives the
# Open Data record id, the two `%s` of RSS_URL receive slices of the
# zero-padded id.
DOC_URL = "https://www.iso.org/standard/%d.html"
OBP_URL = "https://www.iso.org/obp/ui/en/#!iso:std:%d:en"
RSS_URL = "https://www.iso.org/contents/data/standard/%s/%s/%d.detail.rss"
45
+
46
#
# Stores the record and the lookup tables used while building the item.
#
# @param [Hash] pub one Open Data record
# @param [Hash{Integer=>String}] ref_index Open Data `id` -> `reference`;
#   resolves `replaces` / `replacedBy`, which are numeric IDs in the
#   source.
# @param [Hash] errors error accumulator (`Hash.new(true)`); its fields
#   are AND-ed across all records by the `report_errors` machinery.
# @param [Hash{String=>Hash}] tc_index TC/SC reference ->
#   `{ "en" => title, "fr" => title }`; supplies the human committee
#   label from the Open Data technical-committees dataset.
# @param [Hash{String=>Array<String>}] amend_index base reference ->
#   references of the supplements (Amd/Cor/Add) targeting it. Open Data
#   records only the supplement -> base direction (via the reference
#   string), so the reverse map is pre-built by the caller.
# @param [Hash{String=>String}] date_index reference ->
#   `publicationDate`; lets each emitted relation's bibitem carry a
#   `published` date when the related document is itself in the feed.
#
def initialize(pub, ref_index = {}, errors = {}, tc_index = {}, amend_index = {}, date_index = {})
  @pub, @ref_index, @errors = pub, ref_index, errors
  @tc_index, @amend_index, @date_index = tc_index, amend_index, date_index
end
73
+
74
# Builds the `ItemData` for this record: every name in ATTRS is sent to
# self (the builders are private) and the result becomes the keyword
# argument of the same name.
def parse
  attributes = ATTRS.to_h { |attr| [attr, send(attr)] }
  ItemData.new(**attributes)
end
77
+
78
+ private
79
+
80
# Bibliographic item type; the same literal for every record.
def type
  "standard"
end
81
+
82
+ # ---- identifiers -----------------------------------------------------
83
+
84
# The record's `reference` string, or "" when the key is absent/nil.
# Memoized in @reference.
def reference
  return @reference if @reference

  raw = @pub["reference"]
  @reference = raw || ""
end
87
+
88
# Parsed `Pubid::Iso::Identifier` for `reference`, memoized. A parse
# failure is logged through `Util.warn` and memoized as nil, so the
# warning is emitted at most once per record.
def pubid
  return @pubid if defined?(@pubid)

  @pubid = ::Pubid::Iso::Identifier.parse(reference)
rescue StandardError => e
  Util.warn "Failed to parse pubid `#{reference}`: #{e.message}"
  @pubid = nil
end
98
+
99
# Document identifiers for the record.
#
# With a parsed pubid: the primary "ISO" identifier, then an
# "iso-reference" identifier and a "URN" identifier when the respective
# helpers return one. Without a pubid: a single primary "ISO" identifier
# carrying the raw reference string.
#
# Also folds `ids.empty?` into `@errors[:docidentifier]`.
def docidentifier
  ids =
    if pubid
      primary = Docidentifier.new(content: pubid, type: "ISO", primary: true)
      extras = []
      iso_ref = iso_reference_pubid
      extras << Docidentifier.new(content: iso_ref, type: "iso-reference") if iso_ref
      urn = safe_urn_docid
      extras << urn if urn
      [primary, *extras]
    else
      [Docidentifier.new(content: reference, type: "ISO", primary: true)]
    end

  @errors[:docidentifier] &&= ids.empty?
  ids
end
115
+
116
# "URN"-typed Docidentifier wrapping `urn_pubid`; nil when there is no
# URN pubid or when building the identifier raises a StandardError.
def safe_urn_docid
  urn = urn_pubid
  urn ? Docidentifier.new(content: urn, type: "URN") : nil
rescue StandardError
  nil
end
123
+
124
# Re-creates the pubid from its hash form without `:typed_stage` and with
# `language: "en"` as the default (a `:language` already present in the
# hash takes precedence). Returns nil if anything raises a StandardError.
def iso_reference_pubid
  begin
    attrs = pubid.to_h.reject { |key, _value| key == :typed_stage }
    options = { language: "en" }.merge(attrs)
    ::Pubid::Iso::Identifier.create(**options)
  rescue StandardError
    nil
  end
end
130
+
131
# Copy of `pubid` intended for the URN identifier, memoized.
#
# When the copy exposes `stage` / `stage=`, has no stage of its own and
# the record has a dotted stage code, the stage parsed from that code is
# set on the copy (the memoized `pubid` itself is not assigned to).
# Any StandardError along the way memoizes and returns nil.
def urn_pubid
  return @urn_pubid if defined?(@urn_pubid)

  copy = pubid.dup
  stage_missing = copy.respond_to?(:stage=) && stage_dotted &&
                  copy.respond_to?(:stage) && copy.stage.nil?
  copy.stage = ::Pubid::Iso::Identifier.parse_stage(stage_dotted) if stage_missing
  @urn_pubid = copy
rescue StandardError
  @urn_pubid = nil
end
145
+
146
# First run of digits in the string form of the pubid; nil when there is
# no pubid or its string form contains no digit.
def docnumber
  id = pubid
  return nil unless id

  id.to_s[/\d+/]
end
149
+
150
# `Bib::Edition` holding the record's `edition` converted to a String;
# nil when the record has no (truthy) `edition`.
def edition
  value = @pub["edition"]
  value ? Bib::Edition.new(content: value.to_s) : nil
end
155
+
156
+ # ---- language / script ----------------------------------------------
157
+
158
# De-duplicated language codes from the record's `languages`
# (a scalar is treated as a one-element list); ["en"] when none are given.
def language
  codes = Array(@pub["languages"])
  codes.empty? ? ["en"] : codes.uniq
end
163
+
164
# Distinct scripts for the record's languages, in first-seen order;
# languages for which `script_for` has no answer are skipped.
def script
  language.each_with_object([]) do |code, scripts|
    name = script_for(code)
    scripts << name if name && !scripts.include?(name)
  end
end
167
+
168
# Script code for a language code: "Latn" for "en"/"fr", "Cyrl" for "ru",
# nil for anything else.
def script_for(lang)
  { "en" => "Latn", "fr" => "Latn", "ru" => "Cyrl" }[lang]
end
174
+
175
+ # ---- title -----------------------------------------------------------
176
+
177
# English titles followed by French titles. Also folds `result.empty?`
# into `@errors[:title]`.
def title
  collected = %w[en fr].reduce([]) { |acc, lang| acc + titles_for(lang) }
  @errors[:title] &&= collected.empty?
  collected
end
184
+
185
# Titles parsed by `Bib::Title.from_string` from the record's title in
# `lang` (dashes normalized first); [] when that title is nil or empty.
def titles_for(lang)
  text = @pub.dig("title", lang)
  if !text.nil? && !text.empty?
    Bib::Title.from_string(normalize_dashes(text), lang, script_for(lang))
  else
    []
  end
end
191
+
192
# Replaces whitespace-delimited em dashes, then whitespace-delimited
# en dashes, with " - ". The two passes run in sequence on purpose: the
# second pass sees the output of the first.
def normalize_dashes(str)
  [/\s—\s/, /\s–\s/].inject(str) { |text, pattern| text.gsub(pattern, " - ") }
end
195
+
196
+ # ---- status ----------------------------------------------------------
197
+
198
# Converts the Open Data `currentStage` code into dotted "SS.ss" form
# (e.g. 2098 -> "20.98", 6060 -> "60.60"). Codes with fewer than four
# digits are left-padded with zeros (60 -> "00.60").
#
# A missing, blank, non-numeric, negative or more-than-four-digit value
# yields nil rather than a fabricated stage: `"".to_i` and `"n/a".to_i`
# are both 0, which would otherwise be reported as the code "00.00".
#
# @return [String, nil] dotted stage code, memoized (nil included)
def stage_dotted
  return @stage_dotted if defined?(@stage_dotted)

  raw = @pub["currentStage"]
  # Base 10 is forced so zero-padded strings such as "0060" are not read
  # as octal by Kernel#Integer.
  code = raw.is_a?(Numeric) ? raw.to_i : Integer(raw.to_s.strip, 10, exception: false)
  @stage_dotted = (format("%02d.%02d", *code.divmod(100)) if code&.between?(0, 9999))
end
209
+
210
# `Bib::Status` built from the two halves of the dotted stage code;
# nil when the record has no stage. The substage is nil if the dotted
# code has no second part.
def status
  dotted = stage_dotted
  return nil unless dotted

  major, minor = dotted.split(".")
  stage = Bib::Status::Stage.new(content: major)
  substage = (Bib::Status::Stage.new(content: minor) if minor)
  Bib::Status.new(stage: stage, substage: substage)
end
219
+
220
+ # ---- ICS -------------------------------------------------------------
221
+
222
# One `Bib::ICS` per code in the record's `icsCode` (a scalar counts as a
# single code). The text is the description of the looked-up ICS entry,
# or nil when the lookup returns nothing. [] when `icsCode` is absent.
def ics
  codes = @pub["icsCode"]
  return [] unless codes

  Array(codes).map do |code|
    Bib::ICS.new(code: code, text: safe_isoics_fetch(code)&.description)
  end
end
230
+
231
# `Isoics.fetch(code)`, with any StandardError turned into nil so an
# unknown code does not abort the record.
def safe_isoics_fetch(code)
  begin
    Isoics.fetch(code)
  rescue StandardError
    nil
  end
end
236
+
237
+ # ---- dates -----------------------------------------------------------
238
+
239
# Single "published" `Bib::Date` taken from the record's
# `publicationDate`; [] when that field is nil or empty.
def date
  published = @pub["publicationDate"]
  if !published.nil? && !published.empty?
    [Bib::Date.new(type: "published", at: published)]
  else
    []
  end
end
245
+
246
+ # ---- contributors ----------------------------------------------------
247
+
248
# Publisher contributors followed by the committee contributor, if any.
def contributor
  list = publishers
  committee = editorialgroup_contributor
  list + Array(committee)
end
251
+
252
# One "publisher" contributor per organization abbreviation found in the
# part of the reference before its first whitespace (split on "/").
# Abbreviations missing from `Scraper::PUBLISHERS` are skipped; the
# organization's uri list is empty when the table has no `:uri` for it.
def publishers
  prefix = reference.sub(/\s.*/, "")
  prefix.split("/").each_with_object([]) do |abbrev, found|
    info = Scraper::PUBLISHERS[abbrev]
    next unless info

    uris = info[:uri] ? [Bib::Uri.new(content: info[:uri])] : []
    organization = Bib::Organization.new(
      name: [Bib::TypedLocalizedString.new(content: info[:name])],
      abbreviation: Bib::LocalizedString.new(content: abbrev),
      uri: uris,
    )
    publisher_role = Bib::Contributor::Role.new(type: "publisher")
    found << Bib::Contributor.new(organization: organization, role: [publisher_role])
  end
end
265
+
266
# "author"/"committee" contributor describing the record's
# `ownerCommittee` (e.g. "ISO/TC 46/SC 4"); nil when that field is nil or
# empty.
#
# * organization: named after the first "/"-separated part, using the
#   full name from `Scraper::PUBLISHERS` when the part is listed there,
#   otherwise the part itself;
# * subdivision subtype: the leading capital letters of the second part,
#   "TC" when there is no second part or it has no such prefix;
# * subdivision name: the English label from `@tc_index`, falling back to
#   the raw committee string.
def editorialgroup_contributor
  committee = @pub["ownerCommittee"]
  return nil if committee.nil? || committee.empty?

  org_abbrev, group = committee.split("/")
  subtype = (group[/^[A-Z]+/] if group) || "TC"

  known = Scraper::PUBLISHERS[org_abbrev]
  org_names =
    if known
      [Bib::TypedLocalizedString.new(content: known[:name])]
    elsif org_abbrev
      [Bib::TypedLocalizedString.new(content: org_abbrev)]
    else
      []
    end
  org_abbreviation = org_abbrev ? Bib::LocalizedString.new(content: org_abbrev) : nil

  committee_label = @tc_index.dig(committee, "en") || committee
  committee_unit = Bib::Subdivision.new(
    type: "technical-committee",
    subtype: subtype,
    name: [Bib::TypedLocalizedString.new(content: committee_label)],
    identifier: [Bib::OrganizationType::Identifier.new(content: committee)],
  )

  author_role = Bib::Contributor::Role.new(
    type: "author",
    description: [Bib::LocalizedMarkedUpString.new(content: "committee")],
  )
  organization = Bib::Organization.new(
    name: org_names, subdivision: [committee_unit], abbreviation: org_abbreviation,
  )

  Bib::Contributor.new(role: [author_role], organization: organization)
end
304
+
305
+ # ---- abstract --------------------------------------------------------
306
+
307
# English and French abstracts taken from the record's `scope` HTML.
# A language is skipped when its scope is nil/empty or when nothing is
# left after the markup is stripped.
def abstract
  %w[en fr].each_with_object([]) do |lang, abstracts|
    markup = @pub.dig("scope", lang)
    next if markup.nil? || markup.empty?

    plain = strip_html(markup)
    next if plain.empty?

    abstracts << Bib::Abstract.new(content: plain, language: lang, script: script_for(lang))
  end
end
318
+
319
# Text content of an HTML fragment, trimmed, with every whitespace run
# collapsed to a single space.
def strip_html(html)
  plain = Nokogiri::HTML.fragment(html).text
  plain.strip.gsub(/\s+/, " ")
end
322
+
323
+ # ---- copyright -------------------------------------------------------
324
+
325
# Single copyright statement for the record, or [] when no year is found.
#
# The year is the first four digits following a ":" in the reference,
# otherwise the first four-digit run of `publicationDate`. The owner is
# an organization named after the part of the reference that precedes
# its first whitespace.
def copyright
  year = reference[/(?<=:)\d{4}/]
  year ||= @pub["publicationDate"]&.match(/\d{4}/)&.to_s
  return [] if year.nil? || year.empty?

  holder = reference[/.*?(?=\s)/].to_s
  organization = Bib::Organization.new(
    name: [Bib::TypedLocalizedString.new(content: holder)],
  )
  owner = Bib::ContributionInfo.new(organization: organization)
  [Bib::Copyright.new(owner: [owner], from: year)]
end
336
+
337
+ # ---- source links ----------------------------------------------------
338
+
339
# "src", "obp" and "rss" links derived from the record `id`; [] when the
# record has no id. The RSS path uses the first two pairs of digits of
# the id zero-padded to six digits (62085 -> ".../06/20/62085...").
def source
  id = @pub["id"]
  return [] unless id

  padded = format("%06d", id)
  links = {
    "src" => format(DOC_URL, id),
    "obp" => format(OBP_URL, id),
    "rss" => format(RSS_URL, padded[0, 2], padded[2, 2], id),
  }
  links.map { |kind, url| Bib::Uri.new(type: kind, content: url) }
end
350
+
351
+ # ---- relations -------------------------------------------------------
352
+
353
+ # Open Data semantics:
354
+ # * `replaces` - older docs THIS one supersedes -> `obsoletes`
355
+ # * `replacedBy` - newer docs that supersede THIS one -> `obsoletedBy`
356
+ # Amendments/corrigenda/addenda are stitched in via two routes:
357
+ # * on the BASE record, look up `@amend_index` for supplements
358
+ # targeting it (-> `updatedBy`); the index is pre-built in
359
+ # `DataFetcher#build_ref_index` because Open Data only records
360
+ # the supplement -> base direction via the reference string.
361
+ # * on the SUPPLEMENT record itself, derive the base from
362
+ # `pubid.base` and emit the forward `updates` relation.
363
# All relations of the record, in this order: "obsoletes" (from
# `replaces`), "obsoletedBy" (from `replacedBy`), the supplement
# relations and finally the relation to the base document.
def relation
  [
    build_relations(@pub["replaces"], "obsoletes"),
    build_relations(@pub["replacedBy"], "obsoletedBy"),
    amendment_relations,
    base_relation,
  ].reduce([], :+)
end
371
+
372
# Relations of `type` for every id that `@ref_index` can resolve to a
# reference. Each id is looked up as given and then as a String;
# unresolved ids are dropped.
def build_relations(ids, type)
  Array(ids).each_with_object([]) do |id, relations|
    target = @ref_index[id] || @ref_index[id.to_s]
    next unless target

    built = relation_for(target, type)
    relations << built if built
  end
end
380
+
381
# "updatedBy" relations for the supplements that `@amend_index` lists
# under this document. The lookup key is the pubid's string form, or the
# raw reference when there is no pubid.
def amendment_relations
  key = pubid&.to_s || reference
  supplements = Array(@amend_index[key])
  supplements.map { |supplement| relation_for(supplement, "updatedBy") }
end
386
+
387
# Single "updates" relation pointing at the pubid's base document; []
# when there is no pubid, it has no `base` accessor, or the base is nil.
def base_relation
  id = pubid
  base = id.base if id&.respond_to?(:base)
  base ? [relation_for(base.to_s, "updates")] : []
end
392
+
393
# Relation of `type` whose bibitem carries `ref` as primary "ISO"
# docidentifier and as formattedref, plus a "published" date when
# `@date_index` has a non-empty date for `ref`.
def relation_for(ref, type)
  pub_date = @date_index[ref]
  item_attrs = {
    docidentifier: [Docidentifier.new(content: ref, type: "ISO", primary: true)],
    formattedref: Bib::Formattedref.new(content: ref),
  }
  if pub_date && !pub_date.empty?
    item_attrs[:date] = [Bib::Date.new(type: "published", at: pub_date)]
  end
  Relation.new(type: type, bibitem: ItemData.new(**item_attrs))
end
404
+
405
+ # ---- structured identifier ------------------------------------------
406
+
407
# StructuredIdentifier whose project number is the record `id` as a
# String and whose type is the pubid's publisher ("ISO" when there is no
# pubid, no `publisher` accessor, or no publisher). nil without an id.
def structuredidentifier
  record_id = @pub["id"]
  return nil unless record_id

  id = pubid
  issuer = id.publisher if id&.respond_to?(:publisher)
  StructuredIdentifier.new(
    project_number: ProjectNumber.new(content: record_id.to_s),
    type: issuer || "ISO",
  )
end
414
+
415
+ # ---- place -----------------------------------------------------------
416
+
417
# Place of publication; the same single-element list (city "Geneva") for
# every record.
def place
  geneva = Bib::Place.new(city: "Geneva")
  [geneva]
end
420
+
421
+ # ---- ext -------------------------------------------------------------
422
+
423
# ISO-flavor extension block: doctype, ICS codes and structured
# identifier come from the record; stagename, updates_document_type,
# fast_track and price_code are always passed as nil here.
def ext
  fields = {
    doctype: doctype,
    flavor: "iso",
    ics: ics,
    structuredidentifier: structuredidentifier,
    stagename: nil,
    updates_document_type: nil,
    fast_track: nil,
    price_code: nil,
  }
  Ext.new(**fields)
end
435
+
436
# Doctype for the record: the supplement type wins when it is a known
# one, then the deliverable type, and "international-standard" is the
# last resort.
def doctype
  kind = SUPPLEMENT_DOCTYPES[@pub["supplementType"]]
  kind ||= DOCTYPES[@pub["deliverableType"]]
  kind ||= "international-standard"
  Doctype.new(content: kind)
end
442
+ end
443
+ end
444
+ end
@@ -1,7 +1,9 @@
1
+ require_relative "../type/pubid"
2
+
1
3
  module Relaton
2
4
  module Iso
3
5
  class Docidentifier < Bib::Docidentifier
4
- attribute :content, :string
6
+ attribute :content, Type::Pubid
5
7
 
6
8
  attr_reader :pubid
7
9
 
@@ -30,7 +32,10 @@ module Relaton
30
32
  begin
31
33
  ::Pubid::Iso::Identifier.parse(value)
32
34
  rescue StandardError
33
- Util.warn "Failed to parse Pubid: #{value}"
35
+ # Suppress when type is not yet set (lutaml runs the setter
36
+ # once during init before `type` is assigned, then `initialize`
37
+ # re-runs it; only the second pass is authoritative).
38
+ Util.warn "Failed to parse Pubid: #{value}" if type
34
39
  nil
35
40
  end
36
41
  end
@@ -4,7 +4,7 @@ module Relaton
4
4
  TYPES = %w[
5
5
  international-standard technical-specification technical-report publicly-available-specification
6
6
  international-workshop-agreement guide recommendation amendment technical-corrigendum directive
7
- committee-document addendum
7
+ committee-document addendum supplement
8
8
  ].freeze
9
9
 
10
10
  attribute :content, :string, values: TYPES
@@ -10,7 +10,7 @@ module Relaton
10
10
  @prefix = "ISO"
11
11
  @defaultprefix = %r{^ISO(/IEC)?\s}
12
12
  @idtype = "ISO"
13
- @datasets = %w[iso-ics]
13
+ @datasets = %w[iso-open-data iso-open-data-all]
14
14
  end
15
15
 
16
16
  # @param code [String]
@@ -23,16 +23,19 @@ module Relaton
23
23
  end
24
24
 
25
25
  #
26
- # Fetch all the documents from https://www.iso.org/standards-catalogue/browse-by-ics.html
26
+ # Fetch all the documents from the ISO Open Data programme
27
+ # (https://www.iso.org/open-data.html).
27
28
  #
28
- # @param [String] source source name (iso-rss, iso-rss-all)
29
+ # @param [String] source source name
30
+ # * `iso-open-data` - skip if upstream `Last-Modified` is unchanged
31
+ # * `iso-open-data-all` - wipe `output` and re-emit every record
29
32
  # @param [Hash] opts
30
33
  # @option opts [String] :output directory to output documents
31
34
  # @option opts [String] :format output format (xml, yaml, bibxml)
32
35
  #
33
- def fetch_data(_source, opts)
36
+ def fetch_data(source, opts)
34
37
  require_relative "data_fetcher"
35
- DataFetcher.fetch(**opts)
38
+ DataFetcher.fetch(source, **opts)
36
39
  end
37
40
 
38
41
  # @param xml [String]
@@ -0,0 +1,50 @@
1
+ module Relaton
2
+ module Iso
3
+ module Type
4
+ # Lutaml-model attribute type that preserves `Pubid::Iso::Identifier::Base`
5
+ # instances on the way in and stringifies them on the way out.
6
+ #
7
+ # The default `:string` type calls `.to_s` during `cast`, which loses the
8
+ # parsed structure and forces `Docidentifier#content=` to re-parse the
9
+ # human-readable form. That round-trip can render dual-type strings
10
+ # (e.g. `"ISO/IS TR 17"` from a TR pubid with stage 60.60) that the
11
+ # pubid-iso parslet grammar can't capture cleanly, producing
12
+ # `Duplicate subtrees while merging result of ROOT` warnings.
13
class Pubid < Lutaml::Model::Type::Value
  class << self
    # Pass-through cast: nil stays nil and every other value (including
    # lutaml's "uninitialized" marker) is returned unchanged, so an
    # identifier object given as content is not stringified on the way in.
    def cast(value, _options = {})
      return value if value.nil? || Lutaml::Model::Utils.uninitialized?(value)

      value
    end

    # String form of the value for output; nil and lutaml's
    # "uninitialized" marker are returned as they are.
    def serialize(value)
      return nil if value.nil?

      Lutaml::Model::Utils.uninitialized?(value) ? value : value.to_s
    end

    # XSD type used for this attribute type.
    def default_xsd_type
      "xs:string"
    end
  end

  # Every textual rendering below is the wrapped value's `to_s`.

  def to_s
    value.to_s
  end

  def to_yaml
    value.to_s
  end

  def to_xml
    value.to_s
  end

  def to_json(*_args)
    value.to_s
  end
end
48
+ end
49
+ end
50
+ end
@@ -2,6 +2,6 @@
2
2
 
3
3
  module Relaton
4
4
  module Iso
5
- VERSION = "2.1.1"
5
+ VERSION = "2.1.3"
6
6
  end
7
7
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: relaton-iso
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.1
4
+ version: 2.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-05-12 00:00:00.000000000 Z
11
+ date: 2026-06-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: isoics
@@ -117,14 +117,10 @@ files:
117
117
  - bin/console
118
118
  - bin/rspec
119
119
  - bin/setup
120
- - grammars/basicdoc.rng
121
- - grammars/biblio-standoc.rng
122
- - grammars/biblio.rng
123
- - grammars/relaton-iso-compile.rng
124
- - grammars/relaton-iso.rng
125
120
  - lib/relaton/iso.rb
126
121
  - lib/relaton/iso/bibliography.rb
127
122
  - lib/relaton/iso/data_fetcher.rb
123
+ - lib/relaton/iso/data_parser.rb
128
124
  - lib/relaton/iso/hash_parser_v1.rb
129
125
  - lib/relaton/iso/hit.rb
130
126
  - lib/relaton/iso/hit_collection.rb
@@ -144,8 +140,8 @@ files:
144
140
  - lib/relaton/iso/model/stagename.rb
145
141
  - lib/relaton/iso/model/structured_identifier.rb
146
142
  - lib/relaton/iso/processor.rb
147
- - lib/relaton/iso/queue.rb
148
143
  - lib/relaton/iso/scraper.rb
144
+ - lib/relaton/iso/type/pubid.rb
149
145
  - lib/relaton/iso/util.rb
150
146
  - lib/relaton/iso/version.rb
151
147
  - relaton-iso.gemspec