obp-access 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.rubocop.yml +18 -0
  3. data/.rubocop_todo.yml +33 -0
  4. data/CLAUDE.md +59 -0
  5. data/README.adoc +97 -0
  6. data/Rakefile +12 -0
  7. data/exe/obp-access +5 -0
  8. data/lib/obp/access/catalog.rb +67 -0
  9. data/lib/obp/access/cli.rb +115 -0
  10. data/lib/obp/access/converter.rb +25 -0
  11. data/lib/obp/access/deliverable.rb +109 -0
  12. data/lib/obp/access/domain_extractor.rb +63 -0
  13. data/lib/obp/access/element_registry.rb +20 -0
  14. data/lib/obp/access/elements/array.rb +64 -0
  15. data/lib/obp/access/elements/base.rb +69 -0
  16. data/lib/obp/access/elements/bibliography/bib_ref.rb +60 -0
  17. data/lib/obp/access/elements/bibliography.rb +52 -0
  18. data/lib/obp/access/elements/copyright.rb +27 -0
  19. data/lib/obp/access/elements/figure.rb +58 -0
  20. data/lib/obp/access/elements/figure_group.rb +48 -0
  21. data/lib/obp/access/elements/index.rb +113 -0
  22. data/lib/obp/access/elements/introduction.rb +31 -0
  23. data/lib/obp/access/elements/list.rb +58 -0
  24. data/lib/obp/access/elements/non_normative_note.rb +47 -0
  25. data/lib/obp/access/elements/paragraph.rb +31 -0
  26. data/lib/obp/access/elements/root.rb +122 -0
  27. data/lib/obp/access/elements/section.rb +38 -0
  28. data/lib/obp/access/elements/section_title.rb +26 -0
  29. data/lib/obp/access/elements/section_type.rb +27 -0
  30. data/lib/obp/access/elements/table_wrap.rb +47 -0
  31. data/lib/obp/access/elements/terminology/base.rb +27 -0
  32. data/lib/obp/access/elements/terminology/definition.rb +44 -0
  33. data/lib/obp/access/elements/terminology/example.rb +27 -0
  34. data/lib/obp/access/elements/terminology/note.rb +27 -0
  35. data/lib/obp/access/elements/terminology/source.rb +45 -0
  36. data/lib/obp/access/elements/terminology/tig.rb +59 -0
  37. data/lib/obp/access/elements/terminology/tig_admitted.rb +23 -0
  38. data/lib/obp/access/elements/terminology/tig_deprecated.rb +39 -0
  39. data/lib/obp/access/elements/terminology/tig_preferred.rb +23 -0
  40. data/lib/obp/access/elements/terminology.rb +28 -0
  41. data/lib/obp/access/elements/title.rb +33 -0
  42. data/lib/obp/access/fetcher.rb +63 -0
  43. data/lib/obp/access/grammar_parser.rb +135 -0
  44. data/lib/obp/access/imager.rb +39 -0
  45. data/lib/obp/access/inline_renderer.rb +97 -0
  46. data/lib/obp/access/parser.rb +82 -0
  47. data/lib/obp/access/renderer.rb +43 -0
  48. data/lib/obp/access/retriever.rb +97 -0
  49. data/lib/obp/access/urn.rb +31 -0
  50. data/lib/obp/access/version.rb +5 -0
  51. data/lib/obp/access.rb +118 -0
  52. data/lib/obp-access.rb +1 -0
  53. metadata +151 -0
@@ -0,0 +1,122 @@
1
+ # frozen_string_literal: true
2
+
3
module Obp
  class Access
    class Renderer
      class Elements
        # Builds the root `<standard>` document: a `front/std-meta` block
        # populated from the URN and the `metas` hash, followed by empty
        # `body` and `back` elements.
        class Root
          # Child element names of `title-wrap`, in the order the segments
          # appear when a title is split on the em dash.
          TITLE_PARTS = %w[intro main compl].freeze

          # Deliverable-type segments recognised at URN index 3, mapped to the
          # code emitted as `release-version` / `doc-type`.
          DOC_TYPES = { "ts" => "TS", "tr" => "TR" }.freeze

          # Code emitted when URN index 3 is not a recognised type segment.
          DEFAULT_DOC_TYPE = "IS"

          attr_reader :urn, :metas

          # @param urn [#raw, #to_s] identifier; `raw` is split on ":".
          # @param metas [Hash] reads the "titles", "language" and "caption" keys.
          def initialize(urn:, metas:)
            @urn = urn
            @metas = metas
          end

          # @return [Nokogiri::XML::Builder] builder holding the `<standard>` tree.
          def content # rubocop:disable Metrics/AbcSize
            Nokogiri::XML::Builder.new(namespace_inheritance: false,
                                       encoding: "UTF-8") do |xml|
              xml.standard("xmlns:ali": "http://www.niso.org/schemas/ali/1.0/",
                           "xmlns:mml": "http://www.w3.org/1998/Math/MathML",
                           "xmlns:tbx": "urn:iso:std:iso:30042:ed-2",
                           "xmlns:xlink": "http://www.w3.org/1999/xlink",
                           "dtd-version": "1.0") do
                xml.front do
                  xml.public_send(:"std-meta", "std-meta-type": "international") do
                    std_meta_content(xml)
                  end
                end
                xml.body
                xml.back
              end
            end
          end

          # @return [Nokogiri::XML::Document] the document built by {#content}.
          def to_document
            content.doc
          end

          private

          # Emits the children of `std-meta` in a fixed order.
          def std_meta_content(xml)
            render_titles(xml)
            xml.public_send(:"proj-id", ref_undated)
            xml.public_send(:"release-version", doc_type)
            render_std_ident(xml)
            xml.public_send(:"content-language", metas["language"])
            xml.public_send(:"std-ref", ref_dated, type: "dated")
            xml.public_send(:"std-ref", ref_undated, type: "undated")
            xml.public_send(:"doc-ref", ref)
            xml.public_send(:"self-uri", urn.to_s)
            render_permissions(xml)
          end

          # Emits `std-ident`. `edition` and `version` are skipped when the URN
          # has no segment at the expected position instead of raising on nil.
          def render_std_ident(xml)
            xml.public_send(:"std-ident") do
              xml.originator holder
              xml.public_send(:"doc-type", doc_type)
              xml.public_send(:"doc-number", doc_number)
              xml.edition edition if edition
              xml.version version if version
            end
          end

          def render_permissions(xml)
            year = copyright_year
            xml.permissions do
              xml.public_send(:"copyright-statement", "All rights reserved")
              xml.public_send(:"copyright-year", year) if year
              xml.public_send(:"copyright-holder", holder)
            end
          end

          # One `title-wrap` per language with a title. The title is split on
          # the em dash and at most the first three segments are emitted as
          # intro / main / compl, followed by the unsplit title as `full`.
          def render_titles(xml)
            metas["titles"].each do |language, title|
              next unless title

              xml.public_send(:"title-wrap", "xml:lang": language) do
                split = title.split("—")
                TITLE_PARTS.each_with_index do |e, i|
                  xml.public_send(e, split[i].strip) if split[i]
                end
                xml.full title
              end
            end
          end

          # First whitespace-separated token of the caption.
          def holder
            metas["caption"].split.first
          end

          def ref
            metas["caption"]
          end

          # Caption with every parenthesised group removed.
          def ref_dated
            metas["caption"].gsub(/\(.*?\)/, "")
          end

          # Caption text before the first colon.
          def ref_undated
            @ref_undated ||= metas["caption"].split(":").first
          end

          # Four digits directly following a colon in the caption, or nil.
          def copyright_year
            metas["caption"][/:(\d{4})/, 1]
          end

          def doc_type
            DOC_TYPES.fetch(urn_parts[3], DEFAULT_DOC_TYPE)
          end

          # Index of the document-number segment: 4 when index 3 holds a
          # recognised type segment ("ts"/"tr"), otherwise 3.
          # NOTE(review): a part-number segment between number and edition
          # (if `urn.raw` can carry one) is not handled here — confirm.
          def doc_number_index
            DOC_TYPES.key?(urn_parts[3]) ? 4 : 3
          end

          def doc_number
            urn_parts[doc_number_index]
          end

          # Segment after the document number without its "ed-" prefix, or nil.
          def edition
            urn_parts[doc_number_index + 1]&.delete_prefix("ed-")
          end

          # Second segment after the document number without its "v" prefix, or nil.
          def version
            urn_parts[doc_number_index + 2]&.delete_prefix("v")
          end

          def urn_parts
            @urn_parts ||= urn.raw.split(":")
          end
        end
      end
    end
  end
end
@@ -0,0 +1,38 @@
1
+ # frozen_string_literal: true
2
+
3
module Obp
  class Access
    class Renderer
      class Elements
        # Renders a numbered clause as `<sec id="sec_<id>">` containing a
        # `<label>` with the clause id.
        class Section < Base
          # Ids made only of dot-separated integers, e.g. "4" or "4.2.1".
          NUMERIC_ID = /\A\d+(\.\d+)*\z/

          # CSS classes identifying the nodes this element handles.
          def self.classes
            %w[sts-section]
          end

          # Accepts the node only when the parent check passes and the id is
          # purely numeric. Uses `match?` so the predicate yields true/false
          # rather than a match offset; `to_s` keeps a nil id falsy.
          #
          # @return [Boolean]
          def match_node?
            super && id.to_s.match?(NUMERIC_ID)
          end

          private

          # presumably read by Base to pick the parent element — confirm there.
          def insertion_target
            "body"
          end

          # @return [Nokogiri::XML::Builder] builder holding the `<sec>` element;
          #   `sec-type` is only set when SectionType.for returns a value.
          def content
            Nokogiri::XML::Builder.new do |xml|
              attrs = { id: "sec_#{id}" }
              sec_type = SectionType.for(id)
              attrs[:"sec-type"] = sec_type if sec_type
              xml.sec(**attrs) do
                xml.label id
              end
            end
          end
        end
      end
    end
  end
end

Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::Section)
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
module Obp
  class Access
    class Renderer
      class Elements
        # Splits a raw heading string into an optional leading clause label
        # and the remaining title text.
        class SectionTitle
          # Dot-separated digits at the very start, followed by at least two
          # whitespace characters.
          LABEL_PATTERN = /\A(\d+(?:\.\d+)*)\s{2,}/

          attr_reader :label, :text

          # @param raw_text [String] heading text as found in the source node.
          def initialize(raw_text)
            found = raw_text.match(LABEL_PATTERN)
            @label = found && found[1]
            # The pattern is anchored at the start, so everything after the
            # match is the title itself.
            remainder = found ? found.post_match : raw_text
            @text = remainder.strip
          end
        end
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # frozen_string_literal: true
2
+
3
module Obp
  class Access
    class Renderer
      class Elements
        # Maps a clause id to a section type string, or nil when none applies.
        class SectionType
          FIXED_TYPES = {
            "1" => "scope",
            "2" => "norm-refs",
          }.freeze

          # Fixed ids win; any other id goes through the pattern heuristic.
          #
          # @param id [String] clause id such as "3" or "3.1".
          # @return [String, nil]
          def self.for(id)
            FIXED_TYPES.fetch(id) { infer_from_pattern(id) }
          end

          # "terms" for ids of the form "<digits>.<optional digits>" and for
          # bare integers from 3 upwards; nil for everything else (deeper ids
          # such as "3.1.2" included).
          def self.infer_from_pattern(id)
            two_level = id.match?(/\A\d+\.\d*\z/)
            "terms" if two_level || (id.match?(/\A\d+\z/) && id.to_i >= 3)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
module Obp
  class Access
    class Renderer
      class Elements
        # Renders a table container as `<table-wrap>` with optional label and
        # caption, copying the inner HTML of the source `<table>`.
        class TableWrap < Base
          class << self
            # CSS classes identifying the nodes this element handles.
            def classes
              %w[sts-table-wrap fig-index]
            end
          end

          # Rejects nodes that sit below an ancestor whose class list is
          # exactly ["sts-fig"].
          def match_node?
            super && !inside_figure?
          end

          private

          def inside_figure?
            node.ancestors.any? do |ancestor|
              ancestor.classes == ["sts-fig"]
            end
          end

          # @return [Nokogiri::XML::Builder] builder holding `<table-wrap>`.
          def content
            Nokogiri::XML::Builder.new do |xml|
              xml.public_send(:"table-wrap") do
                label = caption_label
                title = caption_text
                xml.label label if label
                xml.caption { xml.title title } if title
                xml.table do
                  xml << node.at_css("table").inner_html
                end
              end
            end
          end

          # Text of the first `.sts-caption-label` descendant, if any.
          def caption_label
            @caption_label ||= node.at_css(".sts-caption-label")&.content
          end

          # Text of the first `.sts-caption` descendant, if any.
          def caption_text
            @caption_text ||= node.at_css(".sts-caption")&.content
          end
        end
      end
    end
  end
end

Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::TableWrap)
@@ -0,0 +1,27 @@
1
module Obp
  class Access
    class Renderer
      class Elements
        class Terminology
          # Shared parent for the terminology sub-elements. Mixes in
          # InlineRenderer and supplies helpers used by the subclasses.
          class Base < Elements::Base
            include InlineRenderer

            # No CSS classes of its own; concrete subclasses override this.
            def self.classes
              nil
            end

            private

            # presumably appended by Elements::Base to the insertion path so
            # content lands inside the term entry's langSet — confirm there.
            def path_suffix
              "/tbx:termEntry/tbx:langSet"
            end

            # True when the node's inner HTML contains a `<b>` tag anywhere.
            # (A separate "starts with <b>" test is implied by this one.)
            def bold_term?(node)
              node.inner_html.include?("<b>")
            end
          end
        end
      end
    end
  end
end
@@ -0,0 +1,44 @@
1
module Obp
  class Access
    class Renderer
      class Elements
        class Terminology
          # Renders a definition node as zero or more `tbx:subjectField`
          # elements followed by one `tbx:definition`, built into a fragment.
          class Definition < Base
            class << self
              # CSS classes identifying the nodes this element handles.
              def classes
                %w[sts-tbx-def]
              end
            end

            private

            def insert_method
              :prepend_child
            end

            # Builds the fragment, keeps it in @fragment and returns it.
            def content
              extracted = DomainExtractor.extract(node)
              @fragment = Nokogiri::XML::DocumentFragment.new(document)

              Nokogiri::XML::Builder.with(@fragment) do |xml|
                append_subject_fields(xml, extracted.domains)
                append_definition(xml, extracted.clean_children)
              end

              @fragment
            end

            # One `tbx:subjectField` per extracted domain, inserted as markup.
            def append_subject_fields(xml, domains)
              domains.each do |domain|
                xml.public_send(:"tbx:subjectField") do
                  xml << domain
                end
              end
            end

            # `tbx:definition` whose children are rendered inline.
            def append_definition(xml, children)
              xml.public_send(:"tbx:definition") do
                children.each do |child|
                  render_inline(xml, child)
                end
              end
            end

            # Serialises a freshly built fragment.
            def to_xml
              content.to_xml
            end
          end
        end
      end
    end
  end
end

Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::Terminology::Definition)
@@ -0,0 +1,27 @@
1
module Obp
  class Access
    class Renderer
      class Elements
        class Terminology
          # Renders an example node as `tbx:example`, with the children of
          # its `.sts-tbx-example-content` descendants rendered inline.
          class Example < Base
            class << self
              # CSS classes identifying the nodes this element handles.
              def classes
                %w[sts-tbx-example]
              end
            end

            private

            # @return [Nokogiri::XML::Builder] builder holding `tbx:example`.
            def content
              Nokogiri::XML::Builder.new do |xml|
                xml.public_send(:"tbx:example") do
                  example_parts = node.css(".sts-tbx-example-content").children
                  example_parts.each do |part|
                    render_inline(xml, part)
                  end
                end
              end
            end
          end
        end
      end
    end
  end
end

Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::Terminology::Example)
@@ -0,0 +1,27 @@
1
module Obp
  class Access
    class Renderer
      class Elements
        class Terminology
          # Renders a note node as `tbx:note`, with every direct child of the
          # node rendered inline.
          class Note < Base
            class << self
              # CSS classes identifying the nodes this element handles.
              def classes
                %w[sts-tbx-note]
              end
            end

            private

            # @return [Nokogiri::XML::Builder] builder holding `tbx:note`.
            def content
              Nokogiri::XML::Builder.new do |xml|
                xml.public_send(:"tbx:note") do
                  node.children.each do |part|
                    render_inline(xml, part)
                  end
                end
              end
            end
          end
        end
      end
    end
  end
end

Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::Terminology::Note)
@@ -0,0 +1,45 @@
1
module Obp
  class Access
    class Renderer
      class Elements
        class Terminology
          # Renders a source node as `tbx:source`. Children whose class list
          # is exactly ["sts-xref"] become `<xref>` elements; every other
          # child is inserted as text/markup with the "[SOURCE:" ... "]"
          # wrapper removed.
          class Source < Base
            # Leading "[SOURCE:" and trailing "]" of the whole string. Anchored
            # with \A / \z (not ^ / $) so a "]" that merely ends an inner line
            # of multi-line content is left alone.
            SOURCE_WRAPPER = /\A\[SOURCE:|\]\z/

            # CSS classes identifying the nodes this element handles.
            def self.classes
              %w[sts-tbx-source]
            end

            private

            # @return [Nokogiri::XML::Builder] builder holding `tbx:source`.
            def content
              Nokogiri::XML::Builder.new do |xml|
                xml.public_send(:"tbx:source") do
                  node.children.each do |child|
                    if child.classes == ["sts-xref"]
                      render_ref(xml, child)
                    else
                      render_text(xml, child)
                    end
                  end
                end
              end
            end

            # Emits a bibliographic `<xref>`; the rid is "ref_" plus whatever
            # follows the last colon of the child's href.
            def render_ref(xml, child)
              rid = child.attr("href").split(":").last
              xml.xref("ref-type": "bibr", rid: "ref_#{rid}") do
                xml << child.inner_html
              end
            end

            # Inserts the child's serialised form, stripped of surrounding
            # whitespace and of the source wrapper.
            def render_text(xml, child)
              xml << child.to_s.strip.gsub(SOURCE_WRAPPER, "")
            end
          end
        end
      end
    end
  end
end

Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::Terminology::Source)
@@ -0,0 +1,59 @@
1
module Obp
  class Access
    class Renderer
      class Elements
        class Terminology
          # Renders a term node as a `tbx:tig` group containing the term, its
          # part of speech, any genders and a normative authorization value.
          class Tig < Base
            # Positional predicate at the very end of an XPath, e.g. "div[3]".
            POSITION_SUFFIX = /\[(\d+)\]\z/

            # CSS classes identifying the nodes this element handles.
            def self.classes
              %w[sts-tbx-term]
            end

            private

            # Stripped text of the `div.sts-tbx-label` found under the parent.
            def id
              node.parent.at_css("div.sts-tbx-label").text.strip
            end

            # Zero-based position taken from the trailing "[n]" of the node's
            # path. A path without such a predicate (no same-named siblings)
            # is treated as position 1 instead of raising on a nil match.
            def index
              position = node.path[POSITION_SUFFIX, 1]
              position ? position.to_i - 1 : 0
            end

            # NOTE(review): bold markup maps to "admittedTerm" here; ISO
            # drafting conventions usually print the preferred term in bold —
            # confirm against real source markup before changing.
            def normative_authorization
              bold_term?(node) ? "admittedTerm" : "preferredTerm"
            end

            # @return [Nokogiri::XML::Builder] builder holding `tbx:tig`.
            def content
              Nokogiri::XML::Builder.new do |xml|
                xml.public_send(:"tbx:tig", id: "term_#{id}-#{index}") do
                  render_tig_content(xml)
                end
              end
            end

            # Parses the term HTML and emits term, part of speech, genders
            # and authorization, in that order.
            def render_tig_content(xml)
              result = GrammarParser.parse(parsed_html)
              xml.public_send(:"tbx:term") { xml << result.term }
              xml.public_send(:"tbx:partOfSpeech", value: result.pos)
              render_genders(xml, result.genders)
              xml.public_send(:"tbx:normativeAuthorization", value: normative_authorization)
            end

            # HTML handed to the grammar parser; subclasses may pre-process it.
            def parsed_html
              node.inner_html
            end

            # One `tbx:gram` of type "gender" per entry.
            def render_genders(xml, genders)
              return unless genders.any?

              genders.each do |gender|
                xml.public_send(:"tbx:gram", value: gender, type: "gender")
              end
            end
          end
        end
      end
    end
  end
end

Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::Terminology::Tig)
@@ -0,0 +1,23 @@
1
module Obp
  class Access
    class Renderer
      class Elements
        class Terminology
          # Tig variant for nodes explicitly classed as admitted terms; the
          # authorization value is fixed rather than derived from markup.
          class TigAdmitted < Tig
            class << self
              # CSS classes identifying the nodes this element handles.
              def classes
                %w[sts-tbx-term admittedTerm]
              end
            end

            private

            # Always "admittedTerm" for this variant.
            def normative_authorization
              "admittedTerm"
            end
          end
        end
      end
    end
  end
end

Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::Terminology::TigAdmitted)
@@ -0,0 +1,39 @@
1
module Obp
  class Access
    class Renderer
      class Elements
        class Terminology
          # Tig variant for nodes classed as deprecated terms. Rendering is
          # inherited from Tig (the former `content` override was a verbatim
          # copy); only the authorization value and the HTML handed to the
          # grammar parser differ.
          class TigDeprecated < Tig
            # Span carrying the deprecation label inside the term markup.
            DEPRECATION_LABEL = %r{<span class="sts-tbx-term-depr-label">.*?</span>}

            # CSS classes identifying the nodes this element handles.
            def self.classes
              %w[sts-tbx-term deprecatedTerm]
            end

            private

            # Always "deprecatedTerm" for this variant.
            def normative_authorization
              "deprecatedTerm"
            end

            # Inner HTML with the deprecation label span removed.
            def parsed_html
              strip_deprecation_label(node.inner_html)
            end

            # Removes every deprecation label span from the given HTML string.
            def strip_deprecation_label(html)
              html.gsub(DEPRECATION_LABEL, "")
            end
          end
        end
      end
    end
  end
end

Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::Terminology::TigDeprecated)
@@ -0,0 +1,23 @@
1
module Obp
  class Access
    class Renderer
      class Elements
        class Terminology
          # Tig variant for nodes explicitly classed as preferred terms; the
          # authorization value is fixed rather than derived from markup.
          class TigPreferred < Tig
            class << self
              # CSS classes identifying the nodes this element handles.
              def classes
                %w[sts-tbx-term preferredTerm]
              end
            end

            private

            # Always "preferredTerm" for this variant.
            def normative_authorization
              "preferredTerm"
            end
          end
        end
      end
    end
  end
end

Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::Terminology::TigPreferred)
@@ -0,0 +1,28 @@
1
module Obp
  class Access
    class Renderer
      class Elements
        # Renders a terminology clause as `term-sec` holding a label and an
        # empty `tbx:termEntry/tbx:langSet` skeleton.
        class Terminology < Base
          class << self
            # CSS classes identifying the nodes this element handles.
            def classes
              %w[sts-section sts-tbx-sec]
            end
          end

          private

          # @return [Nokogiri::XML::Builder] builder holding `term-sec`.
          def content
            Nokogiri::XML::Builder.new do |xml|
              xml.public_send(:"term-sec", id: "sec_#{id}") do
                xml.label id
                render_term_entry(xml)
              end
            end
          end

          # `tbx:termEntry` with a single `tbx:langSet` whose language comes
          # from metas["language"].
          def render_term_entry(xml)
            xml.public_send(:"tbx:termEntry", id: "term_#{id}") do
              xml.public_send(:"tbx:langSet", "xml:lang": metas["language"])
            end
          end
        end
      end
    end
  end
end

Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::Terminology)
@@ -0,0 +1,33 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Obp
4
+ class Access
5
+ class Renderer
6
+ class Elements
7
+ class Title < Base
8
+ def self.classes
9
+ %w[sts-sec-title]
10
+ end
11
+
12
+ private
13
+
14
+ def insert_method
15
+ :add_child
16
+ end
17
+
18
+ def content
19
+ Nokogiri::XML::Builder.new do |xml|
20
+ xml.title sanitize_text(section_title.text)
21
+ end
22
+ end
23
+
24
+ def section_title
25
+ @section_title ||= SectionTitle.new(node.content)
26
+ end
27
+ end
28
+ end
29
+ end
30
+ end
31
+ end
32
+
33
+ Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::Title)