obp-access 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.rubocop.yml +18 -0
  3. data/.rubocop_todo.yml +33 -0
  4. data/CLAUDE.md +59 -0
  5. data/README.adoc +97 -0
  6. data/Rakefile +12 -0
  7. data/exe/obp-access +5 -0
  8. data/lib/obp/access/catalog.rb +67 -0
  9. data/lib/obp/access/cli.rb +115 -0
  10. data/lib/obp/access/converter.rb +25 -0
  11. data/lib/obp/access/deliverable.rb +109 -0
  12. data/lib/obp/access/domain_extractor.rb +63 -0
  13. data/lib/obp/access/element_registry.rb +20 -0
  14. data/lib/obp/access/elements/array.rb +64 -0
  15. data/lib/obp/access/elements/base.rb +69 -0
  16. data/lib/obp/access/elements/bibliography/bib_ref.rb +60 -0
  17. data/lib/obp/access/elements/bibliography.rb +52 -0
  18. data/lib/obp/access/elements/copyright.rb +27 -0
  19. data/lib/obp/access/elements/figure.rb +58 -0
  20. data/lib/obp/access/elements/figure_group.rb +48 -0
  21. data/lib/obp/access/elements/index.rb +113 -0
  22. data/lib/obp/access/elements/introduction.rb +31 -0
  23. data/lib/obp/access/elements/list.rb +58 -0
  24. data/lib/obp/access/elements/non_normative_note.rb +47 -0
  25. data/lib/obp/access/elements/paragraph.rb +31 -0
  26. data/lib/obp/access/elements/root.rb +122 -0
  27. data/lib/obp/access/elements/section.rb +38 -0
  28. data/lib/obp/access/elements/section_title.rb +26 -0
  29. data/lib/obp/access/elements/section_type.rb +27 -0
  30. data/lib/obp/access/elements/table_wrap.rb +47 -0
  31. data/lib/obp/access/elements/terminology/base.rb +27 -0
  32. data/lib/obp/access/elements/terminology/definition.rb +44 -0
  33. data/lib/obp/access/elements/terminology/example.rb +27 -0
  34. data/lib/obp/access/elements/terminology/note.rb +27 -0
  35. data/lib/obp/access/elements/terminology/source.rb +45 -0
  36. data/lib/obp/access/elements/terminology/tig.rb +59 -0
  37. data/lib/obp/access/elements/terminology/tig_admitted.rb +23 -0
  38. data/lib/obp/access/elements/terminology/tig_deprecated.rb +39 -0
  39. data/lib/obp/access/elements/terminology/tig_preferred.rb +23 -0
  40. data/lib/obp/access/elements/terminology.rb +28 -0
  41. data/lib/obp/access/elements/title.rb +33 -0
  42. data/lib/obp/access/fetcher.rb +63 -0
  43. data/lib/obp/access/grammar_parser.rb +135 -0
  44. data/lib/obp/access/imager.rb +39 -0
  45. data/lib/obp/access/inline_renderer.rb +97 -0
  46. data/lib/obp/access/parser.rb +82 -0
  47. data/lib/obp/access/renderer.rb +43 -0
  48. data/lib/obp/access/retriever.rb +97 -0
  49. data/lib/obp/access/urn.rb +31 -0
  50. data/lib/obp/access/version.rb +5 -0
  51. data/lib/obp/access.rb +118 -0
  52. data/lib/obp-access.rb +1 -0
  53. metadata +151 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: '08cfedb2a758cc85495717b57a184fe2e9135e77b31449f7529470db99d82fdb'
4
+ data.tar.gz: 8fdb94cb0f6f7eec0f91ee0c2161a4846725f6b8aef07ecc3fd644998f1ff616
5
+ SHA512:
6
+ metadata.gz: 363df11925ccc60638446288a06dd3bf58c4592eea3ac98459d69ed116693e4dd83fbff27a0533826e0c07affe64c9eaffd8abf6c3bdfbcded31e469f8d2eb98
7
+ data.tar.gz: c82359991c371041be7c14a11b27dcec0c255757657dfe164099c6e68809bb6c16d98f3bd319f2c809897f03b58c14d7883b12e19227f800ab8fb6aa4b4aa6e9
data/.rubocop.yml ADDED
@@ -0,0 +1,18 @@
1
+ # Auto-generated by Cimas: Do not edit it manually!
2
+ # See https://github.com/metanorma/cimas
3
+ inherit_from:
4
+ - .rubocop_todo.yml
5
+ - https://raw.githubusercontent.com/riboseinc/oss-guides/master/ci/rubocop.yml
6
+
7
+ # local repo-specific modifications
8
+ # ...
9
+
10
+ AllCops:
11
+ NewCops: enable
12
+ SuggestExtensions: false
13
+
14
+ Metrics/MethodLength:
15
+ Max: 30
16
+
17
+ Layout/LineLength:
18
+ Max: 120
data/.rubocop_todo.yml ADDED
@@ -0,0 +1,33 @@
1
+ # This configuration was generated by
2
+ # `rubocop --auto-gen-config`
3
+ # on 2026-05-12 09:41:56 UTC using RuboCop version 1.86.1.
4
+ # The point is for the user to remove these configuration records
5
+ # one by one as the offenses are removed from the code base.
6
+ # Note that changes in the inspected code, or installation of new
7
+ # versions of RuboCop, may require this file to be generated again.
8
+
9
+ # Offense count: 1
10
+ Gemspec/RequiredRubyVersion:
11
+ Exclude:
12
+ - 'obp-access.gemspec'
13
+
14
+ # Offense count: 1
15
+ # This cop supports safe autocorrection (--autocorrect).
16
+ # Configuration parameters: AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
17
+ # URISchemes: http, https
18
+ Layout/LineLength:
19
+ Max: 124
20
+
21
+ # Offense count: 3
22
+ # Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max.
23
+ Metrics/AbcSize:
24
+ Exclude:
25
+ - 'lib/obp/access/elements/bibliography/bib_ref.rb'
26
+ - 'lib/obp/access/elements/root.rb'
27
+
28
+ # Offense count: 1
29
+ # This cop supports safe autocorrection (--autocorrect).
30
+ # Configuration parameters: AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
31
+ # URISchemes: http, https
32
+ Layout/LineLength:
33
+ Max: 124
data/CLAUDE.md ADDED
@@ -0,0 +1,59 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ Ruby gem (`obp-access`) that fetches informative content (introduction, scope, terms, bibliography) from the ISO Online Browsing Platform (OBP) and converts it to NISO STS XML. Published by Ribose Inc. under BSD-2-Clause license.
8
+
9
+ ## Commands
10
+
11
+ - **Install dependencies:** `bundle install`
12
+ - **Run tests:** `bundle exec rspec`
13
+ - **Lint:** `bundle exec rubocop`
14
+ - **Run a single test:** `bundle exec rspec spec/obp/access/grammar_parser_spec.rb`
15
+ - **Console:** `bin/console` (loads the gem for interactive use)
16
+
17
+ ## Critical rules
18
+
19
+ - **NEVER use Nokogiri for XML building.** The sts gem and lutaml-model handle all XML serialization. Nokogiri is used internally by those libraries but must not appear in our application code. The `Nokogiri::XML::Builder` usage in `elements/` is legacy tech debt being migrated to the sts gem's model-driven approach.
20
+ - **Never use `send` to call private methods** (breaks encapsulation).
21
+ - **Never use `respond_to?`** (poor typing — use type checking or duck typing).
22
+ - **Never use `instance_double` in specs** — use `double` instead.
23
+
24
+ ## Architecture
25
+
26
+ Data flows through these classes in `lib/obp/access/`:
27
+
28
+ 1. **`Access`** (entry point) — orchestrates the pipeline. `Obp::Access.fetch(urn)` returns a single instance for the given URN. `Access.fetch_all(urn, languages:)` returns separate instances per language.
29
+
30
+ 2. **`Parser`** — fetches content from the ISO OBP API (`https://www.iso.org/obp/ui`) via HTTP POST with a URN payload. Parses the JSON response to extract HTML content, titles, and images.
31
+
32
+ 3. **`Converter`** — wraps the HTML source, normalizes whitespace, parses it with Nokogiri, and passes DOM nodes to the Renderer.
33
+
34
+ 4. **`Renderer`** — recursively walks DOM nodes and dispatches them to element classes registered in `ElementRegistry`. Each element class matches against CSS classes and builds NISO STS XML.
35
+
36
+ ### Element system
37
+
38
+ - **`ElementRegistry`** — explicit registry of element classes. Elements call `ElementRegistry.register(Class)` at load time.
39
+ - **`Elements::Base`** — abstract base class. Subclasses implement `self.classes` (CSS class array) and `content` (XML builder).
40
+ - **`Elements::Root`** — creates the NISO STS document skeleton (`<standard>` with `<front>`, `<body>`, `<back>`).
41
+ - **`Elements::Terminology`** sub-elements handle term entries using the `tbx:` namespace.
42
+ - Other elements: `section`, `introduction`, `bibliography`, `paragraph`, `list`, `array`, `figure`, `figure_group`, `table_wrap`, `title`, `copyright`, `index`, `non_normative_note`.
43
+
44
+ ### Supporting classes
45
+
46
+ - **`GrammarParser`** — extracts part-of-speech and gender from bold term markup.
47
+ - **`DomainExtractor`** — extracts subject-field domains from definition text.
48
+ - **`InlineRenderer`** — renders inline HTML elements (links, xrefs, italic, bold, entailed terms) to STS XML.
49
+ - **`Imager`** — downloads images from OBP. Uses `Parallel` for concurrent downloads.
50
+
51
+ ## Key dependencies
52
+
53
+ - **sts** — NISO STS gem for generating STS objects from XML
54
+ - **lutaml/model** — model serialization framework
55
+ - **parallel** — concurrent HTTP requests
56
+
57
+ ## Ruby version
58
+
59
+ Requires Ruby >= 3.1.
data/README.adoc ADDED
@@ -0,0 +1,97 @@
1
+ == ISO OBP data access in Ruby
2
+
3
+ === Background
4
+
5
+ The https://www.iso.org/obp/ui[ISO Online Browsing Platform (OBP)] is the ISO
6
+ official location to obtain informative content from ISO standards.
7
+
8
+ NOTE: Normative content of ISO standards is paywalled.
9
+
10
+ Informative content provided on the OBP includes:
11
+
12
+ * Introduction (if present)
13
+ * Scope
14
+ * Normative references
15
+ * Terms and definitions
16
+ * Bibliography
17
+
18
+ === Use cases
19
+
20
+ There are many situations where the informative content is already useful:
21
+
22
+ * ISO project editors: obtain machine-readable content of informative clauses to
23
+ start a revision without waiting for the complicated STS XML file.
24
+
25
+ * Terminology users: obtain term and definition details from a standard.
26
+
27
+ * Bibliography users: find out what documents this standard refers to.
28
+
29
+ === Usage
30
+
31
+ ==== Single language
32
+
33
+ [source,ruby]
34
+ ----
35
+ obp = Obp::Access.fetch("iso:std:iso:5598:ed-3:v1:en")
36
+ obp.to_xml(pretty: true) # => NISO STS XML string
37
+ obp.to_sts # => #<Sts::NisoSts::Standard>
38
+ obp.to_xml_file # => "/tmp/iso-std-iso-5598-.../iso-std-iso-5598-....xml"
39
+ ----
40
+
41
+ ==== Multilingual (specific languages)
42
+
43
+ [source,ruby]
44
+ ----
45
+ obp = Obp::Access.fetch("iso:std:iso:5598:ed-3:v1:en", languages: ["fr", "de"])
46
+ obp.to_xml(pretty: true) # => NISO STS XML with en/fr/de langSets
47
+ ----
48
+
49
+ ==== Multilingual (all available languages)
50
+
51
+ [source,ruby]
52
+ ----
53
+ obp = Obp::Access.fetch("iso:std:iso:5598:ed-3:v1:en", languages: :all)
54
+ obp.to_xml(pretty: true) # => NISO STS XML with all available language langSets
55
+ ----
56
+
57
+ ==== CLI
58
+
59
+ Fetch a single document (English only):
60
+
61
+ [source,shell]
62
+ ----
63
+ $ obp-access fetch iso:std:iso:5598:ed-3:v1:en
64
+ ----
65
+
66
+ Fetch with all available languages:
67
+
68
+ [source,shell]
69
+ ----
70
+ $ obp-access fetch -l all iso:std:iso:5598:ed-3:v1:en
71
+ ----
72
+
73
+ Fetch with specific languages and save to file:
74
+
75
+ [source,shell]
76
+ ----
77
+ $ obp-access fetch -l fr,de -o output/ iso:std:iso:5598:ed-3:v1:en
78
+ ----
79
+
80
+ === Generated XML
81
+
82
+ The output is NISO STS XML with TBX-Basic terminology markup:
83
+
84
+ * Terms use `<tbx:termEntry>` with `<tbx:langSet>` per language
85
+ * Grammar is encoded via `<tbx:gram>` (gender: m/f/n) and `<tbx:partOfSpeech>`
86
+ * Domains are extracted as `<tbx:subjectField>`
87
+ * Deprecated terms use `<tbx:normativeAuthorization value="deprecatedTerm"/>`
88
+
89
+ == Credits
90
+
91
+ This gem is developed, maintained and funded by
92
+ https://www.ribose.com[Ribose Inc.]
93
+
94
+ == License
95
+
96
+ The gem is available as open source under the terms of the
97
+ https://opensource.org/licenses/BSD-2-Clause[2-Clause BSD License].
data/Rakefile ADDED
@@ -0,0 +1,12 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "bundler/gem_tasks"
4
+ require "rake/testtask"
5
+
6
+ Rake::TestTask.new(:test) do |t|
7
+ t.libs << "test"
8
+ t.libs << "lib"
9
+ t.test_files = FileList["test/**/*_test.rb"]
10
+ end
11
+
12
+ task default: :test
data/exe/obp-access ADDED
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "obp/access"
4
+
5
+ Obp::Access::CLI.start(ARGV)
@@ -0,0 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "net/http"
5
+
6
module Obp
  class Access
    # In-memory view of the ISO Open Data deliverables catalog.
    #
    # The catalog is read as JSONL (one JSON object per line). Every record is
    # wrapped in a Deliverable; the query helpers below filter that list.
    class Catalog
      SOURCE_URL = "https://isopublicstorageprod.blob.core.windows.net/opendata/" \
                   "_latest/iso_deliverables_metadata/json/iso_deliverables_metadata.jsonl"

      # @return [Array<Deliverable>] every record held by this catalog
      attr_reader :deliverables

      # @param deliverables [Array<Deliverable>] already-built records
      def initialize(deliverables:)
        @deliverables = deliverables
      end

      # Builds a catalog from a local JSONL file or, when +path+ is nil, from
      # the JSONL document served at +url+.
      #
      # @param path [String, nil] local JSONL file; takes precedence over +url+
      # @param url [String] remote JSONL location
      # @return [Catalog]
      # @raise [JSON::ParserError] when a non-blank line is not valid JSON
      # @raise [RuntimeError] when the remote request is not answered with 2xx
      def self.load(path: nil, url: SOURCE_URL)
        raw = path ? read_local(path) : fetch_remote(url)
        new(deliverables: parse_jsonl(raw).map { |data| Deliverable.new(data) })
      end

      # @return [Array<Deliverable>] records answering true to +retrievable?+
      #   (memoized after the first call)
      def retrievable
        @retrievable ||= deliverables.select(&:retrievable?)
      end

      # @param type [String] value compared against +deliverable_type+
      # @return [Array<Deliverable>]
      def by_type(type)
        deliverables.select { |d| d.deliverable_type == type }
      end

      # @param code [String] ICS code looked up in each record's +ics_codes+
      # @return [Array<Deliverable>]
      def by_ics(code)
        deliverables.select { |d| d.ics_codes.include?(code) }
      end

      # @return [Integer] number of records in the catalog
      def count
        deliverables.size
      end

      class << self
        private

        # Parses JSONL text into an array of decoded records, skipping blank
        # lines. Lines are streamed with each_line so no intermediate array of
        # raw lines is built, and a malformed record is reported together with
        # its 1-based line number.
        def parse_jsonl(text)
          text.each_line.with_index(1).filter_map do |line, number|
            record = line.strip
            next if record.empty?

            JSON.parse(record)
          rescue JSON::ParserError => e
            raise JSON::ParserError, "catalog line #{number}: #{e.message}"
          end
        end

        # Reads the local file as UTF-8, dropping a leading byte-order mark if
        # present (a BOM would otherwise make the first record unparseable).
        def read_local(path)
          File.read(path, encoding: "BOM|UTF-8")
        end

        # Performs a single GET request. Net::HTTP.get_response does not follow
        # redirects, so any non-2xx answer (including 3xx) is raised as an error.
        def fetch_remote(url)
          response = Net::HTTP.get_response(URI(url))
          return response.body if response.is_a?(Net::HTTPSuccess)

          raise "Failed to fetch catalog: #{response.code} #{response.message}"
        end
      end
    end
  end
end
@@ -0,0 +1,115 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "thor"
4
+
5
module Obp
  class Access
    # Thor command-line front end exposing `fetch`, `catalog` and `retrieve`.
    class CLI < Thor
      # Tell Thor to exit with a non-zero status on its own failures (unknown
      # command, invalid option) so scripts can detect them.
      def self.exit_on_failure?
        true
      end

      desc "fetch URN", "Fetch a single document from ISO OBP by URN"
      option :output, aliases: "-o", type: :string, desc: "Output directory (default: stdout)"
      option :languages, aliases: "-l", type: :string,
                         desc: "Languages: 'all' or comma-separated (e.g. 'fr,de')"
      # Fetches one URN. With --languages every instance returned by
      # Access.fetch_all is emitted; without it the single Access.fetch result.
      def fetch(urn)
        langs = parse_languages
        if langs
          say "Fetching #{urn} (#{langs == :all ? 'all languages' : langs.join(', ')})..."
          Access.fetch_all(urn, languages: langs).each { |access| output(access) }
        else
          say "Fetching #{urn}..."
          output(Access.fetch(urn))
        end
      rescue StandardError => e
        fail_with(e)
      end

      desc "catalog", "Load and inspect the ISO Open Data catalog"
      option :path, type: :string, desc: "Local JSONL file path (default: fetch remote)"
      option :filter, type: :string, enum: %w[retrievable types], desc: "Filter mode"
      option :type, type: :string, desc: "Filter by deliverable type (IS, TS, TR, etc.)"
      option :ics, type: :string, desc: "Filter by ICS code"
      # Prints either a per-type summary (--filter types) or the first 20
      # deliverables matching the given filters.
      def catalog
        say "Loading catalog..."
        cat = Access::Catalog.load(path: options[:path])

        if options[:filter] == "types"
          print_type_summary(cat)
        else
          print_deliverables(cat)
        end
      rescue StandardError => e
        fail_with(e)
      end

      desc "retrieve", "Bulk retrieve documents from ISO OBP"
      option :output, aliases: "-o", type: :string, required: true, desc: "Output directory"
      option :path, type: :string, desc: "Local JSONL file path"
      option :concurrency, aliases: "-c", type: :numeric, default: 4, desc: "Thread concurrency"
      # Loads the catalog and hands it to a Retriever for bulk download.
      def retrieve
        say "Loading catalog..."
        cat = Access::Catalog.load(path: options[:path])
        say "Found #{cat.retrievable.size} retrievable deliverables"
        build_retriever(cat).run
      rescue StandardError => e
        fail_with(e)
      end

      private

      # Shared failure path for all commands: report the message and exit 1.
      def fail_with(error)
        say "Error: #{error.message}", :red
        exit 1
      end

      # Writes the pretty-printed XML to <output>/<urn.safe>.xml when --output
      # is given, otherwise prints it to stdout.
      def output(access)
        xml = access.to_xml(pretty: true)
        return puts(xml) unless options[:output]

        # Loaded lazily: only needed when writing to disk, and this file does
        # not otherwise require it.
        require "fileutils"
        dir = File.expand_path(options[:output])
        FileUtils.mkdir_p(dir)
        path = File.join(dir, "#{access.urn.safe}.xml")
        File.write(path, xml)
        say "Saved to #{path}", :green
      end

      # Prints the total and at most the first 20 matching deliverables.
      def print_deliverables(cat)
        filtered = apply_filters(cat)
        say "Total: #{filtered.size} deliverables"
        filtered.first(20).each do |d|
          say " #{d.reference} [#{d.deliverable_type}] stage=#{d.current_stage} langs=#{d.languages.join(',')}"
        end
        say " ... (showing first 20)" if filtered.size > 20
      end

      # Applies --filter retrievable, --type and --ics cumulatively. Passing a
      # single option gives the same result as before; combining them now
      # narrows the list instead of silently ignoring all but the first.
      def apply_filters(cat)
        list = options[:filter] == "retrievable" ? cat.retrievable : cat.deliverables
        list = list.select { |d| d.deliverable_type == options[:type] } if options[:type]
        list = list.select { |d| d.ics_codes.include?(options[:ics]) } if options[:ics]
        list
      end

      def build_retriever(cat)
        Access::Retriever.new(
          output_dir: File.expand_path(options[:output]),
          catalog: cat,
          concurrency: options[:concurrency],
        )
      end

      # @return [nil] when --languages is absent
      # @return [Symbol] :all when the value is exactly "all"
      # @return [Array<String>] otherwise the comma-separated codes, stripped,
      #   with empty entries (e.g. from "fr,,de") removed
      def parse_languages
        raw = options[:languages]
        return if raw.nil?
        return :all if raw == "all"

        raw.split(",").map(&:strip).reject(&:empty?)
      end

      # Prints one line per deliverable type. Records without a type are
      # labelled "IS" *before* sorting: sorting raw keys would compare nil
      # with String and raise.
      def print_type_summary(cat)
        groups = cat.deliverables.group_by { |d| d.deliverable_type || "IS" }
        groups.sort.each do |label, list|
          say " #{label}: #{list.size} total, #{list.count(&:published?)} published"
        end
      end
    end
  end
end
@@ -0,0 +1,25 @@
1
# frozen_string_literal: true

module Obp
  class Access
    # Turns the HTML source of an OBP document into XML by parsing it and
    # handing the resulting nodes to a Renderer.
    class Converter
      attr_reader :urn, :metas, :source

      # @param urn passed through unchanged to the Renderer
      # @param metas passed through unchanged to the Renderer
      # @param source [String] HTML markup to convert
      def initialize(urn:, metas:, source:)
        @urn = urn
        @metas = metas
        @source = source
      end

      # @return whatever Renderer#to_xml produces for the parsed nodes
      def to_xml
        Renderer.new(urn:, metas:, nodes:).to_xml
      end

      private

      # Replaces every whitespace character (newlines and tabs included) with a
      # plain space, parses the result as HTML and returns the children of the
      # `div.sts-standard` element directly under <body>.
      def nodes
        flattened = source.gsub(/[[:space:]]/, " ")
        Nokogiri::HTML(flattened).css("body > div.sts-standard").children
      end
    end
  end
end
@@ -0,0 +1,109 @@
1
+ # frozen_string_literal: true
2
+
3
module Obp
  class Access
    # One record of the deliverables catalog, plus the logic that derives an
    # OBP URN string from the record's reference, type and edition.
    class Deliverable
      # Catalog deliverable type => URN type segment (nil: no segment emitted).
      TYPE_SEGMENTS = {
        "IS" => nil, "TS" => "ts", "TR" => "tr", "R" => "r",
        "PAS" => "pas", "ISP" => "isp", "GUIDE" => "guide",
        "IWA" => "iwa", "DATA" => "data", "TTA" => "tta"
      }.freeze

      # Upper-case type words that may appear in a reference prefix and must
      # not be mistaken for an originator. A frozen Array (nine entries) is
      # used so the file does not depend on Set being loaded.
      TYPE_WORDS = TYPE_SEGMENTS.compact.keys.freeze

      # NOTE(review): presumably ISO stage codes 60.60 and 90.92 — confirm.
      PUBLISHED_STAGES = [6060, 9092].freeze

      # "<number>[-<part>[-<subpart>...]]:<year>". All part levels are captured
      # so that "29341-1-1:2011" yields number 29341 and part "1-1" instead of
      # matching further right and yielding number 1.
      NUMBER_PATTERN = /(?<number>\d+)(?:-(?<part>\d+(?:-\d+)*))?:\d{4}/

      # Everything before the first whitespace that is followed by a digit.
      PREFIX_PATTERN = /\A(.+?)\s+\d/

      # Trailing "/Amd n[:yyyy]" or "/Cor n[:yyyy]" supplement marker.
      SUPPLEMENT_SUFFIX = %r{/(?:Amd|Cor)\s+\d+(?::\d+)?$}

      attr_reader :id, :reference, :deliverable_type, :edition, :current_stage,
                  :languages, :supplement_type, :title, :publication_date,
                  :ics_codes, :owner_committee

      # @param data [Hash] decoded catalog record with String keys
      def initialize(data)
        @id = data["id"]
        assign_metadata(data)
      end

      # @return [Boolean] whether current_stage is one of PUBLISHED_STAGES
      def published?
        PUBLISHED_STAGES.include?(current_stage)
      end

      # @return [Boolean] true when the record carries no supplement type
      def base_document?
        supplement_type.nil?
      end

      # @return [Boolean] published base document with at least one language
      def retrievable?
        published? && base_document? && languages.any?
      end

      # @param language [String] language code used as the last URN segment
      # @return [Urn] wrapping e.g. "iso:std:iso-iec:tr:12345:-2:ed-1:v1:en"
      def to_urn(language: "en")
        Urn.new(build_urn(language))
      end

      # @return [String, nil] the "en" entry of the title hash
      def english_title
        title["en"]
      end

      private

      def assign_metadata(data)
        @reference = data["reference"]
        @deliverable_type = data["deliverableType"]
        @edition = data["edition"]
        @current_stage = data["currentStage"]
        @supplement_type = data["supplementType"]
        @publication_date = data["publicationDate"]
        @owner_committee = data["ownerCommittee"]
        assign_collections(data)
      end

      # Collection-valued fields default to empty containers when absent.
      def assign_collections(data)
        @languages = Array(data["languages"])
        @title = data["title"] || {}
        @ics_codes = Array(data["icsCode"])
      end

      # Joins originator, optional type, number, optional "-part", edition,
      # the fixed "v1" segment and the language with ":".
      def build_urn(language)
        parts = reference_parts
        segments = ["iso", "std", org_segment]
        type_segment = TYPE_SEGMENTS[deliverable_type]
        segments << type_segment if type_segment
        segments << parts[:number]
        segments << "-#{parts[:part]}" if parts[:part]
        segments.push("ed-#{edition}", "v1", language)
        segments.join(":")
      end

      # Originator segment: "iso", or "iso-<others>" built from the prefix
      # tokens after the first one. Type words are dropped case-insensitively
      # because references spell some of them in mixed case ("Guide").
      def org_segment
        @org_segment ||= begin
          others = prefix_tokens.drop(1).reject { |token| TYPE_WORDS.include?(token.upcase) }
          others.empty? ? "iso" : "iso-#{others.join('-').downcase}"
        end
      end

      # Number and part parsed once from the base reference. The result is a
      # Hash (always truthy) so the memoization also holds when there is no
      # part. Falls back to number "0" when the reference has no
      # "<number>:<year>" portion.
      def reference_parts
        @reference_parts ||= begin
          match = base_reference.match(NUMBER_PATTERN)
          { number: match ? match[:number] : "0", part: match && match[:part] }
        end
      end

      # Prefix of the reference split on "/" and whitespace; ["ISO"] when the
      # reference has no recognisable prefix.
      def prefix_tokens
        prefix = base_reference[PREFIX_PATTERN, 1] || "ISO"
        prefix.split(%r{[/\s]+})
      end

      # Reference without a trailing amendment/corrigendum marker. A missing
      # reference is treated as an empty string rather than raising.
      def base_reference
        @base_reference ||= reference.to_s.sub(SUPPLEMENT_SUFFIX, "")
      end
    end
  end
end
@@ -0,0 +1,63 @@
1
# frozen_string_literal: true

module Obp
  class Access
    # Pulls leading "<domain>" markers out of a node's first text child.
    #
    # Only the first child is inspected, and only when it is a text node;
    # every other child is passed through untouched.
    class DomainExtractor
      Result = Struct.new(:domains, :clean_children, keyword_init: true)

      # Optional whitespace followed by "<...>" at the very start of the text.
      DOMAIN_PATTERN = /\A\s*<([^>]+)>/
      MAX_DOMAIN_LENGTH = 50

      # @param node an object whose +children+ can be enumerated
      # @return [Result] +domains+ holds the extracted domain strings in order;
      #   +clean_children+ holds the children with the markers removed (the
      #   leading text child is replaced by a new text node, or dropped when
      #   nothing but whitespace remains)
      def self.extract(node)
        domains = []
        clean_children = []

        node.children.each_with_index do |child, index|
          if index.zero? && child.is_a?(Nokogiri::XML::Text)
            domains, remaining = extract_from_text(child.content)
            clean_children << remaining_node(remaining) unless remaining.strip.empty?
          else
            clean_children << child
          end
        end

        Result.new(domains: domains, clean_children: clean_children)
      end

      class << self
        private

        # Consumes consecutive leading "<...>" markers for as long as each one
        # is a plausible domain.
        #
        # @return [Array(Array<String>, String)] the domains and the text that
        #   follows the last accepted marker
        def extract_from_text(text)
          domains = []
          remaining = text

          while (match = DOMAIN_PATTERN.match(remaining))
            candidate = match[1].strip
            break unless valid_domain?(candidate)

            domains << candidate
            # The pattern is anchored at \A, so post_match is exactly the text
            # with the marker removed.
            remaining = match.post_match
          end

          [domains, remaining]
        end

        # A marker counts as a domain when it is non-blank, at most
        # MAX_DOMAIN_LENGTH characters long, and contains neither "(" nor a
        # run of two or more digits.
        def valid_domain?(text)
          !text.empty? &&
            text.length <= MAX_DOMAIN_LENGTH &&
            !text.include?("(") &&
            !text.match?(/\d{2,}/)
        end

        # Wraps the leftover text in a fresh text node owned by a new, empty
        # HTML document.
        def remaining_node(text)
          Nokogiri::XML::Text.new(text, Nokogiri::HTML::Document.new)
        end
      end
    end
  end
end
@@ -0,0 +1,20 @@
1
# frozen_string_literal: true

module Obp
  class Access
    # Class-level list of element classes together with a cached list of the
    # values their +classes+ methods return.
    class ElementRegistry
      class << self
        # Adds +element_class+ to the registry and invalidates the cached
        # css_classes. Registering an entry that is already present is a
        # no-op, so loading the same element file twice does not create
        # duplicates.
        #
        # @return [nil]
        def register(element_class)
          return if elements.include?(element_class)

          elements << element_class
          @css_classes = nil
        end

        # @return [Array] registered element classes, in registration order
        def elements
          @elements ||= []
        end

        # Distinct non-nil results of calling +classes+ on every registered
        # element, cached until the next successful register call.
        # NOTE(review): each entry is whatever +classes+ returns; if that is an
        # Array, this is an array of arrays — confirm consumers expect that.
        def css_classes
          @css_classes ||= elements.filter_map(&:classes).uniq
        end
      end
    end
  end
end