obp-access 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rubocop.yml +18 -0
- data/.rubocop_todo.yml +33 -0
- data/CLAUDE.md +59 -0
- data/README.adoc +97 -0
- data/Rakefile +12 -0
- data/exe/obp-access +5 -0
- data/lib/obp/access/catalog.rb +67 -0
- data/lib/obp/access/cli.rb +115 -0
- data/lib/obp/access/converter.rb +25 -0
- data/lib/obp/access/deliverable.rb +109 -0
- data/lib/obp/access/domain_extractor.rb +63 -0
- data/lib/obp/access/element_registry.rb +20 -0
- data/lib/obp/access/elements/array.rb +64 -0
- data/lib/obp/access/elements/base.rb +69 -0
- data/lib/obp/access/elements/bibliography/bib_ref.rb +60 -0
- data/lib/obp/access/elements/bibliography.rb +52 -0
- data/lib/obp/access/elements/copyright.rb +27 -0
- data/lib/obp/access/elements/figure.rb +58 -0
- data/lib/obp/access/elements/figure_group.rb +48 -0
- data/lib/obp/access/elements/index.rb +113 -0
- data/lib/obp/access/elements/introduction.rb +31 -0
- data/lib/obp/access/elements/list.rb +58 -0
- data/lib/obp/access/elements/non_normative_note.rb +47 -0
- data/lib/obp/access/elements/paragraph.rb +31 -0
- data/lib/obp/access/elements/root.rb +122 -0
- data/lib/obp/access/elements/section.rb +38 -0
- data/lib/obp/access/elements/section_title.rb +26 -0
- data/lib/obp/access/elements/section_type.rb +27 -0
- data/lib/obp/access/elements/table_wrap.rb +47 -0
- data/lib/obp/access/elements/terminology/base.rb +27 -0
- data/lib/obp/access/elements/terminology/definition.rb +44 -0
- data/lib/obp/access/elements/terminology/example.rb +27 -0
- data/lib/obp/access/elements/terminology/note.rb +27 -0
- data/lib/obp/access/elements/terminology/source.rb +45 -0
- data/lib/obp/access/elements/terminology/tig.rb +59 -0
- data/lib/obp/access/elements/terminology/tig_admitted.rb +23 -0
- data/lib/obp/access/elements/terminology/tig_deprecated.rb +39 -0
- data/lib/obp/access/elements/terminology/tig_preferred.rb +23 -0
- data/lib/obp/access/elements/terminology.rb +28 -0
- data/lib/obp/access/elements/title.rb +33 -0
- data/lib/obp/access/fetcher.rb +63 -0
- data/lib/obp/access/grammar_parser.rb +135 -0
- data/lib/obp/access/imager.rb +39 -0
- data/lib/obp/access/inline_renderer.rb +97 -0
- data/lib/obp/access/parser.rb +82 -0
- data/lib/obp/access/renderer.rb +43 -0
- data/lib/obp/access/retriever.rb +97 -0
- data/lib/obp/access/urn.rb +31 -0
- data/lib/obp/access/version.rb +5 -0
- data/lib/obp/access.rb +118 -0
- data/lib/obp-access.rb +1 -0
- metadata +151 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: '08cfedb2a758cc85495717b57a184fe2e9135e77b31449f7529470db99d82fdb'
|
|
4
|
+
data.tar.gz: 8fdb94cb0f6f7eec0f91ee0c2161a4846725f6b8aef07ecc3fd644998f1ff616
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 363df11925ccc60638446288a06dd3bf58c4592eea3ac98459d69ed116693e4dd83fbff27a0533826e0c07affe64c9eaffd8abf6c3bdfbcded31e469f8d2eb98
|
|
7
|
+
data.tar.gz: c82359991c371041be7c14a11b27dcec0c255757657dfe164099c6e68809bb6c16d98f3bd319f2c809897f03b58c14d7883b12e19227f800ab8fb6aa4b4aa6e9
|
data/.rubocop.yml
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Auto-generated by Cimas: Do not edit it manually!
|
|
2
|
+
# See https://github.com/metanorma/cimas
|
|
3
|
+
inherit_from:
|
|
4
|
+
- .rubocop_todo.yml
|
|
5
|
+
- https://raw.githubusercontent.com/riboseinc/oss-guides/master/ci/rubocop.yml
|
|
6
|
+
|
|
7
|
+
# local repo-specific modifications
|
|
8
|
+
# ...
|
|
9
|
+
|
|
10
|
+
AllCops:
|
|
11
|
+
NewCops: enable
|
|
12
|
+
SuggestExtensions: false
|
|
13
|
+
|
|
14
|
+
Metrics/MethodLength:
|
|
15
|
+
Max: 30
|
|
16
|
+
|
|
17
|
+
Layout/LineLength:
|
|
18
|
+
Max: 120
|
data/.rubocop_todo.yml
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# This configuration was generated by
|
|
2
|
+
# `rubocop --auto-gen-config`
|
|
3
|
+
# on 2026-05-12 09:41:56 UTC using RuboCop version 1.86.1.
|
|
4
|
+
# The point is for the user to remove these configuration records
|
|
5
|
+
# one by one as the offenses are removed from the code base.
|
|
6
|
+
# Note that changes in the inspected code, or installation of new
|
|
7
|
+
# versions of RuboCop, may require this file to be generated again.
|
|
8
|
+
|
|
9
|
+
# Offense count: 1
|
|
10
|
+
Gemspec/RequiredRubyVersion:
|
|
11
|
+
Exclude:
|
|
12
|
+
- 'obp-access.gemspec'
|
|
13
|
+
|
|
14
|
+
# Offense count: 1
|
|
15
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
16
|
+
# Configuration parameters: AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
|
|
17
|
+
# URISchemes: http, https
|
|
18
|
+
Layout/LineLength:
|
|
19
|
+
Max: 124
|
|
20
|
+
|
|
21
|
+
# Offense count: 3
|
|
22
|
+
# Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max.
|
|
23
|
+
Metrics/AbcSize:
|
|
24
|
+
Exclude:
|
|
25
|
+
- 'lib/obp/access/elements/bibliography/bib_ref.rb'
|
|
26
|
+
- 'lib/obp/access/elements/root.rb'
|
|
27
|
+
|
|
28
|
+
# Offense count: 1
|
|
29
|
+
# This cop supports safe autocorrection (--autocorrect).
|
|
30
|
+
# Configuration parameters: AllowHeredoc, AllowURI, AllowQualifiedName, URISchemes, AllowRBSInlineAnnotation, AllowCopDirectives, AllowedPatterns, SplitStrings.
|
|
31
|
+
# URISchemes: http, https
|
|
32
|
+
Layout/LineLength:
|
|
33
|
+
Max: 124
|
data/CLAUDE.md
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
|
|
4
|
+
|
|
5
|
+
## Project Overview
|
|
6
|
+
|
|
7
|
+
Ruby gem (`obp-access`) that fetches informative content (introduction, scope, terms, bibliography) from the ISO Online Browsing Platform (OBP) and converts it to NISO STS XML. Published by Ribose Inc. under BSD-2-Clause license.
|
|
8
|
+
|
|
9
|
+
## Commands
|
|
10
|
+
|
|
11
|
+
- **Install dependencies:** `bundle install`
|
|
12
|
+
- **Run tests:** `bundle exec rspec`
|
|
13
|
+
- **Lint:** `bundle exec rubocop`
|
|
14
|
+
- **Run a single test:** `bundle exec rspec spec/obp/access/grammar_parser_spec.rb`
|
|
15
|
+
- **Console:** `bin/console` (loads the gem for interactive use)
|
|
16
|
+
|
|
17
|
+
## Critical rules
|
|
18
|
+
|
|
19
|
+
- **NEVER use Nokogiri for XML building.** The sts gem and lutaml-model handle all XML serialization. Nokogiri is used internally by those libraries but must not appear in our application code. The `Nokogiri::XML::Builder` usage in `elements/` is legacy tech debt being migrated to the sts gem's model-driven approach.
|
|
20
|
+
- **Never use `send` to call private methods** (breaks encapsulation).
|
|
21
|
+
- **Never use `respond_to?`** (poor typing — use type checking or duck typing).
|
|
22
|
+
- **Never use `instance_double` in specs** — use `double` instead.
|
|
23
|
+
|
|
24
|
+
## Architecture
|
|
25
|
+
|
|
26
|
+
Data flows through these classes in `lib/obp/access/`:
|
|
27
|
+
|
|
28
|
+
1. **`Access`** (entry point) — orchestrates the pipeline. `Obp::Access.fetch(urn)` returns a single instance for the URN (pass `languages:` to include additional languages in the same document). `Access.fetch_all(urn, languages:)` returns separate instances, one per language.
|
|
29
|
+
|
|
30
|
+
2. **`Parser`** — fetches content from the ISO OBP API (`https://www.iso.org/obp/ui`) via HTTP POST with a URN payload. Parses the JSON response to extract HTML content, titles, and images.
|
|
31
|
+
|
|
32
|
+
3. **`Converter`** — wraps the HTML source, normalizes whitespace, parses it with Nokogiri, and passes DOM nodes to the Renderer.
|
|
33
|
+
|
|
34
|
+
4. **`Renderer`** — recursively walks DOM nodes and dispatches them to element classes registered in `ElementRegistry`. Each element class matches against CSS classes and builds NISO STS XML.
|
|
35
|
+
|
|
36
|
+
### Element system
|
|
37
|
+
|
|
38
|
+
- **`ElementRegistry`** — explicit registry of element classes. Elements call `ElementRegistry.register(Class)` at load time.
|
|
39
|
+
- **`Elements::Base`** — abstract base class. Subclasses implement `self.classes` (CSS class array) and `content` (XML builder).
|
|
40
|
+
- **`Elements::Root`** — creates the NISO STS document skeleton (`<standard>` with `<front>`, `<body>`, `<back>`).
|
|
41
|
+
- **`Elements::Terminology`** sub-elements handle term entries using the `tbx:` namespace.
|
|
42
|
+
- Other elements: `section`, `introduction`, `bibliography`, `paragraph`, `list`, `array`, `figure`, `figure_group`, `table_wrap`, `title`, `copyright`, `index`, `non_normative_note`.
|
|
43
|
+
|
|
44
|
+
### Supporting classes
|
|
45
|
+
|
|
46
|
+
- **`GrammarParser`** — extracts part-of-speech and gender from bold term markup.
|
|
47
|
+
- **`DomainExtractor`** — extracts subject-field domains from definition text.
|
|
48
|
+
- **`InlineRenderer`** — renders inline HTML elements (links, xrefs, italic, bold, entailed terms) to STS XML.
|
|
49
|
+
- **`Imager`** — downloads images from OBP. Uses `Parallel` for concurrent downloads.
|
|
50
|
+
|
|
51
|
+
## Key dependencies
|
|
52
|
+
|
|
53
|
+
- **sts** — NISO STS gem for generating STS objects from XML
|
|
54
|
+
- **lutaml/model** — model serialization framework
|
|
55
|
+
- **parallel** — concurrent HTTP requests
|
|
56
|
+
|
|
57
|
+
## Ruby version
|
|
58
|
+
|
|
59
|
+
Requires Ruby >= 3.1.
|
data/README.adoc
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
== ISO OBP data access in Ruby
|
|
2
|
+
|
|
3
|
+
=== Background
|
|
4
|
+
|
|
5
|
+
The https://www.iso.org/obp/ui[ISO Online Browsing Platform (OBP)] is ISO's
|
|
6
|
+
official location to obtain informative content from ISO standards.
|
|
7
|
+
|
|
8
|
+
NOTE: Normative content of ISO standards is paywalled.
|
|
9
|
+
|
|
10
|
+
Informative content provided on the OBP includes:
|
|
11
|
+
|
|
12
|
+
* Introduction (if present)
|
|
13
|
+
* Scope
|
|
14
|
+
* Normative references
|
|
15
|
+
* Terms and definitions
|
|
16
|
+
* Bibliography
|
|
17
|
+
|
|
18
|
+
=== Use cases
|
|
19
|
+
|
|
20
|
+
There are many situations where the informative content is already useful:
|
|
21
|
+
|
|
22
|
+
* ISO project editors: obtain machine-readable content of informative clauses to
|
|
23
|
+
start a revision without waiting for the complicated STS XML file.
|
|
24
|
+
|
|
25
|
+
* Terminology users: obtain term and definition details from a standard.
|
|
26
|
+
|
|
27
|
+
* Bibliography users: find out what documents this standard refers to.
|
|
28
|
+
|
|
29
|
+
=== Usage
|
|
30
|
+
|
|
31
|
+
==== Single language
|
|
32
|
+
|
|
33
|
+
[source,ruby]
|
|
34
|
+
----
|
|
35
|
+
obp = Obp::Access.fetch("iso:std:iso:5598:ed-3:v1:en")
|
|
36
|
+
obp.to_xml(pretty: true) # => NISO STS XML string
|
|
37
|
+
obp.to_sts # => #<Sts::NisoSts::Standard>
|
|
38
|
+
obp.to_xml_file # => "/tmp/iso-std-iso-5598-.../iso-std-iso-5598-....xml"
|
|
39
|
+
----
|
|
40
|
+
|
|
41
|
+
==== Multilingual (specific languages)
|
|
42
|
+
|
|
43
|
+
[source,ruby]
|
|
44
|
+
----
|
|
45
|
+
obp = Obp::Access.fetch("iso:std:iso:5598:ed-3:v1:en", languages: ["fr", "de"])
|
|
46
|
+
obp.to_xml(pretty: true) # => NISO STS XML with en/fr/de langSets
|
|
47
|
+
----
|
|
48
|
+
|
|
49
|
+
==== Multilingual (all available languages)
|
|
50
|
+
|
|
51
|
+
[source,ruby]
|
|
52
|
+
----
|
|
53
|
+
obp = Obp::Access.fetch("iso:std:iso:5598:ed-3:v1:en", languages: :all)
|
|
54
|
+
obp.to_xml(pretty: true) # => NISO STS XML with all available language langSets
|
|
55
|
+
----
|
|
56
|
+
|
|
57
|
+
==== CLI
|
|
58
|
+
|
|
59
|
+
Fetch a single document (English only):
|
|
60
|
+
|
|
61
|
+
[source,shell]
|
|
62
|
+
----
|
|
63
|
+
$ obp-access fetch iso:std:iso:5598:ed-3:v1:en
|
|
64
|
+
----
|
|
65
|
+
|
|
66
|
+
Fetch with all available languages:
|
|
67
|
+
|
|
68
|
+
[source,shell]
|
|
69
|
+
----
|
|
70
|
+
$ obp-access fetch -l all iso:std:iso:5598:ed-3:v1:en
|
|
71
|
+
----
|
|
72
|
+
|
|
73
|
+
Fetch with specific languages and save to file:
|
|
74
|
+
|
|
75
|
+
[source,shell]
|
|
76
|
+
----
|
|
77
|
+
$ obp-access fetch -l fr,de -o output/ iso:std:iso:5598:ed-3:v1:en
|
|
78
|
+
----
|
|
79
|
+
|
|
80
|
+
=== Generated XML
|
|
81
|
+
|
|
82
|
+
The output is NISO STS XML with TBX-Basic terminology markup:
|
|
83
|
+
|
|
84
|
+
* Terms use `<tbx:termEntry>` with `<tbx:langSet>` per language
|
|
85
|
+
* Grammar is encoded via `<tbx:gram>` (gender: m/f/n) and `<tbx:partOfSpeech>`
|
|
86
|
+
* Domains are extracted as `<tbx:subjectField>`
|
|
87
|
+
* Deprecated terms use `<tbx:normativeAuthorization value="deprecatedTerm"/>`
|
|
88
|
+
|
|
89
|
+
== Credits
|
|
90
|
+
|
|
91
|
+
This gem is developed, maintained and funded by
|
|
92
|
+
https://www.ribose.com[Ribose Inc.]
|
|
93
|
+
|
|
94
|
+
== License
|
|
95
|
+
|
|
96
|
+
The gem is available as open source under the terms of the
|
|
97
|
+
https://opensource.org/licenses/BSD-2-Clause[2-Clause BSD License].
|
data/Rakefile
ADDED
data/exe/obp-access
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require "net/http"
|
|
5
|
+
|
|
6
|
+
module Obp
  class Access
    # In-memory collection of catalog entries ("deliverables") with a few
    # query helpers. Entries are built from a JSONL document that is read
    # either from a local file or from SOURCE_URL.
    class Catalog
      SOURCE_URL = "https://isopublicstorageprod.blob.core.windows.net/opendata/" \
                   "_latest/iso_deliverables_metadata/json/iso_deliverables_metadata.jsonl"

      attr_reader :deliverables

      # @param deliverables [Array] entries exposed by this catalog
      def initialize(deliverables:)
        @deliverables = deliverables
      end

      # Builds a catalog from JSONL text.
      #
      # @param path [String, nil] local JSONL file; when given, +url+ is ignored
      # @param url [String] remote JSONL location used when +path+ is absent
      # @return [Catalog]
      # @raise [RuntimeError] when the remote fetch does not succeed
      def self.load(path: nil, url: SOURCE_URL)
        payload =
          if path
            read_local(path)
          else
            fetch_remote(url)
          end
        records = parse_jsonl(payload)
        new(deliverables: records.map { |record| Deliverable.new(record) })
      end

      # Turns JSONL text into an array of parsed values. Blank lines are
      # skipped, and so are lines whose parsed value is nil or false.
      def self.parse_jsonl(text)
        text.each_line.with_object([]) do |raw_line, parsed|
          stripped = raw_line.strip
          next if stripped.empty?

          value = JSON.parse(stripped)
          parsed << value if value
        end
      end

      # Reads the whole file at +path+.
      def self.read_local(path)
        File.read(path)
      end

      # GETs +url+ and returns the response body; any non-success response
      # is reported as an exception.
      def self.fetch_remote(url)
        reply = Net::HTTP.get_response(URI(url))
        return reply.body if reply.is_a?(Net::HTTPSuccess)

        raise "Failed to fetch catalog: #{reply.code} #{reply.message}"
      end

      private_class_method :parse_jsonl, :read_local, :fetch_remote

      # Entries answering true to +retrievable?+ (computed once, then cached).
      def retrievable
        @retrievable ||= deliverables.select { |entry| entry.retrievable? }
      end

      # Entries whose +deliverable_type+ equals +type+.
      def by_type(type)
        deliverables.select do |entry|
          entry.deliverable_type == type
        end
      end

      # Entries whose +ics_codes+ contain +code+.
      def by_ics(code)
        deliverables.select do |entry|
          entry.ics_codes.include?(code)
        end
      end

      # Number of entries in the catalog.
      def count
        deliverables.size
      end
    end
  end
end
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "fileutils"
require "thor"
|
|
4
|
+
|
|
5
|
+
module Obp
  class Access
    # Thor command-line front end with three commands: +fetch+ (one URN),
    # +catalog+ (inspect the catalog) and +retrieve+ (bulk retrieval).
    # Every command reports a StandardError as a red "Error: ..." line and
    # exits with status 1.
    class CLI < Thor
      desc "fetch URN", "Fetch a single document from ISO OBP by URN"
      option :output, aliases: "-o", type: :string, desc: "Output directory (default: stdout)"
      option :languages, aliases: "-l", type: :string,
                         desc: "Languages: 'all' or comma-separated (e.g. 'fr,de')"
      # Fetches +urn+ and emits its XML, either to stdout or into --output.
      # With --languages, one document per result of Access.fetch_all is emitted.
      def fetch(urn)
        langs = parse_languages
        if langs
          say "Fetching #{urn} (#{langs == :all ? 'all languages' : langs.join(', ')})..."
          Access.fetch_all(urn, languages: langs).each { |access| output(access) }
        else
          say "Fetching #{urn}..."
          output(Access.fetch(urn))
        end
      rescue StandardError => e
        say "Error: #{e.message}", :red
        exit 1
      end

      desc "catalog", "Load and inspect the ISO Open Data catalog"
      option :path, type: :string, desc: "Local JSONL file path (default: fetch remote)"
      option :filter, type: :string, enum: %w[retrievable types], desc: "Filter mode"
      option :type, type: :string, desc: "Filter by deliverable type (IS, TS, TR, etc.)"
      option :ics, type: :string, desc: "Filter by ICS code"
      # Loads the catalog and prints either a per-type summary
      # (--filter types) or a listing of the selected deliverables.
      def catalog
        say "Loading catalog..."
        cat = Access::Catalog.load(path: options[:path])

        if options[:filter] == "types"
          print_type_summary(cat)
          return
        end

        print_deliverables(cat)
      rescue StandardError => e
        say "Error: #{e.message}", :red
        exit 1
      end

      desc "retrieve", "Bulk retrieve documents from ISO OBP"
      option :output, aliases: "-o", type: :string, required: true, desc: "Output directory"
      option :path, type: :string, desc: "Local JSONL file path"
      option :concurrency, aliases: "-c", type: :numeric, default: 4, desc: "Thread concurrency"
      # Loads the catalog, reports how many entries are retrievable and runs
      # a Retriever over it.
      def retrieve
        say "Loading catalog..."
        cat = Access::Catalog.load(path: options[:path])
        say "Found #{cat.retrievable.size} retrievable deliverables"
        build_retriever(cat).run
      rescue StandardError => e
        say "Error: #{e.message}", :red
        exit 1
      end

      private

      # Writes the pretty-printed XML of +access+ to "<output>/<urn.safe>.xml"
      # when --output is set (creating the directory if needed); otherwise
      # prints it to stdout.
      def output(access)
        if options[:output]
          dir = File.expand_path(options[:output])
          FileUtils.mkdir_p(dir)
          path = File.join(dir, "#{access.urn.safe}.xml")
          File.write(path, access.to_xml(pretty: true))
          say "Saved to #{path}", :green
        else
          puts access.to_xml(pretty: true)
        end
      end

      # Prints the total and the first 20 entries of the filtered catalog.
      def print_deliverables(cat)
        filtered = apply_filters(cat)
        say "Total: #{filtered.size} deliverables"
        filtered.first(20).each do |d|
          say " #{d.reference} [#{d.deliverable_type}] stage=#{d.current_stage} langs=#{d.languages.join(',')}"
        end
        say " ... (showing first 20)" if filtered.size > 20
      end

      # Selects deliverables by the first option present, in this order:
      # --type, --ics, --filter retrievable. The filters are not combined.
      def apply_filters(cat)
        return cat.by_type(options[:type]) if options[:type]
        return cat.by_ics(options[:ics]) if options[:ics]
        return cat.retrievable if options[:filter] == "retrievable"

        cat.deliverables
      end

      # Builds the Retriever for the +retrieve+ command from the CLI options.
      def build_retriever(cat)
        Access::Retriever.new(
          output_dir: File.expand_path(options[:output]),
          catalog: cat,
          concurrency: options[:concurrency],
        )
      end

      # Interprets --languages.
      #
      # @return [nil] when the option is absent or names no language
      # @return [Symbol] :all when the option is exactly "all"
      # @return [Array<String>] the comma-separated codes, stripped, with
      #   blank entries (e.g. from "fr,,de" or a trailing comma) removed
      def parse_languages
        requested = options[:languages]
        return nil if requested.nil?
        return :all if requested == "all"

        codes = requested.split(",").map(&:strip).reject(&:empty?)
        # An empty list would be truthy and send `fetch` down the
        # multi-language path with nothing to fetch; treat it as "not given".
        codes.empty? ? nil : codes
      end

      # Prints one line per deliverable type with total and published counts.
      # A missing type is reported as "IS". The fallback is applied before
      # grouping so that the sort only ever compares String keys (sorting
      # pairs keyed by nil alongside pairs keyed by Strings raises
      # ArgumentError).
      def print_type_summary(cat)
        grouped = cat.deliverables.group_by { |d| d.deliverable_type || "IS" }
        grouped.sort.each do |type, list|
          published = list.count(&:published?)
          say " #{type}: #{list.size} total, #{published} published"
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
module Obp
  class Access
    # Prepares an HTML source for rendering: whitespace is flattened, the
    # markup is parsed, and the children of the "sts-standard" container are
    # handed to Renderer together with the URN and metadata.
    class Converter
      attr_reader :urn, :metas, :source

      # @param urn the document URN, passed through to the Renderer
      # @param metas document metadata, passed through to the Renderer
      # @param source [String] HTML markup to convert
      def initialize(urn:, metas:, source:)
        @urn, @metas, @source = urn, metas, source
      end

      # @return whatever Renderer#to_xml produces for the parsed nodes
      def to_xml
        renderer = Renderer.new(urn: urn, metas: metas, nodes: nodes)
        renderer.to_xml
      end

      private

      # Children of `body > div.sts-standard`, parsed from the source after
      # every whitespace character has been replaced by a plain space.
      def nodes
        flattened = source.gsub(/[[:space:]]/, " ")
        Nokogiri::HTML(flattened).css("body > div.sts-standard").children
      end
    end
  end
end
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "set" # Set is only autoloaded from Ruby 3.2 onwards

module Obp
  class Access
    # One catalog entry, built from a parsed JSON record, that knows whether
    # it can be retrieved and how to spell its URN.
    class Deliverable
      # Deliverable type => URN type segment (nil means "no segment").
      TYPE_SEGMENTS = {
        "IS" => nil, "TS" => "ts", "TR" => "tr", "R" => "r",
        "PAS" => "pas", "ISP" => "isp", "GUIDE" => "guide",
        "IWA" => "iwa", "DATA" => "data", "TTA" => "tta"
      }.freeze

      # Type words that may appear in a reference prefix and must not be
      # mistaken for an organisation name.
      TYPE_WORDS = Set.new(TYPE_SEGMENTS.compact.keys).freeze

      # Stage codes for which +published?+ answers true.
      PUBLISHED_STAGES = [6060, 9092].freeze

      # Document number, optional part (possibly multi-level such as "1-1")
      # and a four-digit year, e.g. "10303-1:2021" or "29341-1-1:2011".
      # Matching greedily from the first digit run keeps a multi-level part
      # from being mistaken for the document number.
      NUMBER_PATTERN = /(?<number>\d+)(?:-(?<part>\d+(?:-\d+)*))?:\d{4}/

      attr_reader :id, :reference, :deliverable_type, :edition, :current_stage,
                  :languages, :supplement_type, :title, :publication_date,
                  :ics_codes, :owner_committee

      # @param data [Hash] parsed record with camelCase String keys
      #   ("reference", "deliverableType", "currentStage", ...)
      def initialize(data)
        @id = data["id"]
        assign_metadata(data)
      end

      # True when the current stage is one of PUBLISHED_STAGES.
      def published?
        PUBLISHED_STAGES.include?(current_stage)
      end

      # True when the record carries no supplement type.
      def base_document?
        supplement_type.nil?
      end

      # True for published base documents that list at least one language.
      def retrievable?
        published? && base_document? && languages.any?
      end

      # @param language [String] language code used as the last URN segment
      # @return [Urn]
      def to_urn(language: "en")
        Urn.new(build_urn(language))
      end

      # @return the "en" entry of the title hash, or nil when absent
      def english_title
        title["en"]
      end

      private

      def assign_metadata(data)
        @reference = data["reference"]
        @deliverable_type = data["deliverableType"]
        @edition = data["edition"]
        @current_stage = data["currentStage"]
        @supplement_type = data["supplementType"]
        @publication_date = data["publicationDate"]
        @owner_committee = data["ownerCommittee"]
        assign_collections(data)
      end

      # Collection-valued fields default to empty containers so callers can
      # iterate or index without nil checks.
      def assign_collections(data)
        @languages = Array(data["languages"])
        @title = data["title"] || {}
        @ics_codes = Array(data["icsCode"])
      end

      # Joins the URN segments with ":" in this order: "iso", "std",
      # organisation, optional type, number, optional "-part", "ed-N", "v1",
      # language.
      def build_urn(language)
        segs = ["iso", "std", org_segment]
        type_seg = TYPE_SEGMENTS[deliverable_type]
        segs << type_seg if type_seg
        segs << extract_number
        segs << "-#{extract_part}" if extract_part
        segs.push("ed-#{edition}", "v1", language)
        segs.join(":")
      end

      # "iso", or "iso-<co-publishers>" built from the prefix tokens after
      # the first one, ignoring type words ("ISO/IEC TR" => "iso-iec").
      def org_segment
        @org_segment ||= begin
          tokens = parse_prefix_tokens
          rest = tokens[1..] || []
          org_tokens = rest.reject { |t| TYPE_WORDS.include?(t) }
          org_tokens.empty? ? "iso" : "iso-#{org_tokens.join('-').downcase}"
        end
      end

      # Document number as a String; "0" when the reference has no
      # "<number>[-<part>]:<year>" portion.
      def extract_number
        number_match ? number_match[:number] : "0"
      end

      # Part designation ("1", "1-1", ...) or nil when the reference has none.
      def extract_part
        number_match && number_match[:part]
      end

      # Match of NUMBER_PATTERN against the base reference. Cached with
      # +defined?+ so that a failed match (nil) is not recomputed.
      def number_match
        return @number_match if defined?(@number_match)

        @number_match = base_reference.match(NUMBER_PATTERN)
      end

      # Tokens of the reference text preceding the first number, split on
      # slashes and whitespace; ["ISO"] when no such prefix is found.
      def parse_prefix_tokens
        prefix = base_reference.match(/\A(.+?)\s+\d/)&.[](1) || "ISO"
        prefix.split(%r{[/\s]+})
      end

      # Reference with a trailing "/Amd N[:YYYY]" or "/Cor N[:YYYY]" removed.
      def base_reference
        @base_reference ||= reference.sub(%r{/(?:Amd|Cor)\s+\d+(?::\d+)?$}, "")
      end
    end
  end
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
module Obp
  class Access
    # Pulls leading "<domain>" markers out of the first text child of a node
    # and returns them together with the remaining children.
    class DomainExtractor
      Result = Struct.new(:domains, :clean_children, keyword_init: true)

      DOMAIN_PATTERN = /\A\s*<([^>]+)>/
      MAX_DOMAIN_LENGTH = 50

      # @param node a node whose +children+ are scanned in order
      # @return [Result] the extracted domains and the children with the
      #   domain markers removed
      def self.extract(node)
        tracker = { domains: [], clean_children: [], text_consumed: false }

        node.children.each do |child|
          process_child(child, tracker)
        end

        Result.new(**tracker.slice(:domains, :clean_children))
      end

      class << self
        private

        # Only the very first child, and only if it is a text node, is
        # searched for domains; every other child is kept unchanged.
        def process_child(child, state)
          leading_text = !state[:text_consumed] && child.is_a?(Nokogiri::XML::Text)
          return process_leading_text(child, state) if leading_text

          state[:clean_children] << child
          state[:text_consumed] = true
        end

        # Records the domains found in the text node and keeps whatever text
        # is left over, unless that remainder is blank.
        def process_leading_text(child, state)
          found, leftover = extract_from_text(child.content)
          state[:domains] = found
          state[:text_consumed] = true
          return if leftover.strip.empty?

          state[:clean_children] << remaining_node(leftover)
        end

        # Peels "<...>" markers off the front of +text+ one at a time,
        # stopping at the first candidate that is not a valid domain.
        #
        # @return [Array(Array<String>, String)] domains and remaining text
        def extract_from_text(text)
          collected = []
          rest = text.dup

          loop do
            hit = DOMAIN_PATTERN.match(rest)
            break if hit.nil?

            label = hit[1].strip
            break unless valid_domain?(label)

            collected << label
            # The pattern is anchored at the start, so the text after the
            # match is exactly the text with the marker removed.
            rest = hit.post_match
          end

          [collected, rest]
        end

        # A candidate is rejected when it is longer than MAX_DOMAIN_LENGTH,
        # contains "(" or contains two or more consecutive digits.
        def valid_domain?(text)
          return false if text.length > MAX_DOMAIN_LENGTH
          return false if text.include?("(")

          !text.match?(/\d{2,}/)
        end

        # Wraps leftover text in a new text node owned by a fresh document.
        def remaining_node(text)
          owner = Nokogiri::HTML::Document.new
          Nokogiri::XML::Text.new(text, owner)
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
module Obp
  class Access
    # Process-wide list of element classes plus a cached view of the values
    # their +classes+ methods return.
    class ElementRegistry
      # Adds +element_class+ to the registry and drops the cached
      # +css_classes+ so it is rebuilt on next access.
      #
      # @return [nil]
      def self.register(element_class)
        elements.push(element_class)
        @css_classes = nil
      end

      # @return [Array] registered element classes, in registration order
      def self.elements
        @elements ||= []
      end

      # Distinct truthy results of calling +classes+ on each registered
      # element. Results are collected as returned (not flattened).
      def self.css_classes
        return @css_classes if @css_classes

        declared = elements.filter_map { |element| element.classes }
        @css_classes = declared.uniq
      end
    end
  end
end
|