obp-access 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.rubocop.yml +18 -0
  3. data/.rubocop_todo.yml +33 -0
  4. data/CLAUDE.md +59 -0
  5. data/README.adoc +97 -0
  6. data/Rakefile +12 -0
  7. data/exe/obp-access +5 -0
  8. data/lib/obp/access/catalog.rb +67 -0
  9. data/lib/obp/access/cli.rb +115 -0
  10. data/lib/obp/access/converter.rb +25 -0
  11. data/lib/obp/access/deliverable.rb +109 -0
  12. data/lib/obp/access/domain_extractor.rb +63 -0
  13. data/lib/obp/access/element_registry.rb +20 -0
  14. data/lib/obp/access/elements/array.rb +64 -0
  15. data/lib/obp/access/elements/base.rb +69 -0
  16. data/lib/obp/access/elements/bibliography/bib_ref.rb +60 -0
  17. data/lib/obp/access/elements/bibliography.rb +52 -0
  18. data/lib/obp/access/elements/copyright.rb +27 -0
  19. data/lib/obp/access/elements/figure.rb +58 -0
  20. data/lib/obp/access/elements/figure_group.rb +48 -0
  21. data/lib/obp/access/elements/index.rb +113 -0
  22. data/lib/obp/access/elements/introduction.rb +31 -0
  23. data/lib/obp/access/elements/list.rb +58 -0
  24. data/lib/obp/access/elements/non_normative_note.rb +47 -0
  25. data/lib/obp/access/elements/paragraph.rb +31 -0
  26. data/lib/obp/access/elements/root.rb +122 -0
  27. data/lib/obp/access/elements/section.rb +38 -0
  28. data/lib/obp/access/elements/section_title.rb +26 -0
  29. data/lib/obp/access/elements/section_type.rb +27 -0
  30. data/lib/obp/access/elements/table_wrap.rb +47 -0
  31. data/lib/obp/access/elements/terminology/base.rb +27 -0
  32. data/lib/obp/access/elements/terminology/definition.rb +44 -0
  33. data/lib/obp/access/elements/terminology/example.rb +27 -0
  34. data/lib/obp/access/elements/terminology/note.rb +27 -0
  35. data/lib/obp/access/elements/terminology/source.rb +45 -0
  36. data/lib/obp/access/elements/terminology/tig.rb +59 -0
  37. data/lib/obp/access/elements/terminology/tig_admitted.rb +23 -0
  38. data/lib/obp/access/elements/terminology/tig_deprecated.rb +39 -0
  39. data/lib/obp/access/elements/terminology/tig_preferred.rb +23 -0
  40. data/lib/obp/access/elements/terminology.rb +28 -0
  41. data/lib/obp/access/elements/title.rb +33 -0
  42. data/lib/obp/access/fetcher.rb +63 -0
  43. data/lib/obp/access/grammar_parser.rb +135 -0
  44. data/lib/obp/access/imager.rb +39 -0
  45. data/lib/obp/access/inline_renderer.rb +97 -0
  46. data/lib/obp/access/parser.rb +82 -0
  47. data/lib/obp/access/renderer.rb +43 -0
  48. data/lib/obp/access/retriever.rb +97 -0
  49. data/lib/obp/access/urn.rb +31 -0
  50. data/lib/obp/access/version.rb +5 -0
  51. data/lib/obp/access.rb +118 -0
  52. data/lib/obp-access.rb +1 -0
  53. metadata +151 -0
@@ -0,0 +1,64 @@
module Obp
  class Access
    class Renderer
      class Elements
        # Renders a node carrying the `sts-array` class as an <array> element
        # that wraps a single <table>.
        class Array < Base
          # CSS classes a node must carry to be handled by this element.
          def self.classes
            %w[sts-array]
          end

          private

          # Builds <array><table>…</table></array>. Each table part is only
          # emitted when the source node contains matching rows/columns.
          def content
            Nokogiri::XML::Builder.new do |xml|
              xml.array do
                xml.table do
                  render_colgroup(xml)
                  render_thead(xml)
                  render_tbody(xml)
                end
              end
            end
          end

          # Copies every <col> of the source <colgroup>, keeping only the
          # "align" and "width" attributes.
          def render_colgroup(xml)
            columns = node.css("colgroup col")
            return if columns.empty?

            xml.colgroup do
              columns.each do |column|
                xml.col column.attributes.slice("align", "width")
              end
            end
          end

          # Emits <thead> with one <tr> per source header row and one <th>
          # per source header cell.
          def render_thead(xml)
            render_rows(xml, :thead, "th")
          end

          # Emits <tbody> with one <tr> per source body row and one <td>
          # per source body cell.
          def render_tbody(xml)
            render_rows(xml, :tbody, "td")
          end

          # Shared row renderer: `section` is the table part (:thead/:tbody)
          # and `cell_name` the cell element ("th"/"td") to copy. Cell text
          # goes through sanitize_text before being written.
          def render_rows(xml, section, cell_name)
            source_rows = node.css("#{section} tr")
            return if source_rows.empty?

            xml.public_send(section) do
              source_rows.each do |source_row|
                xml.tr do
                  source_row.css(cell_name).each do |cell|
                    xml.public_send(cell_name, sanitize_text(cell.content))
                  end
                end
              end
            end
          end
        end
      end
    end
  end
end

# Make this element class known to the element registry.
Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::Array)
@@ -0,0 +1,69 @@
module Obp
  class Access
    class Renderer
      class Elements
        # Common behaviour for all element renderers. A subclass declares
        # the CSS classes it handles (`self.classes`), builds its XML in
        # `content`, and may override `insertion_target`, `path_suffix` and
        # `insert_method` to control where the XML is inserted.
        class Base
          # Elements are rendered using the NISO STS spec:
          # https://www.niso-sts.org/TagLibrary/niso-sts-TL-1-2-html/index.html
          attr_reader :document, :metas, :node

          # @param document target document; must respond to `at(path)`
          # @param metas    metadata lookup; `metas["images"]` is read by
          #                 local_image_path
          # @param node     source node this element renders
          def initialize(document:, metas:, node:)
            @document = document
            @metas = metas
            @node = node
          end

          # CSS classes handled by the element; nil here, so the base class
          # itself never matches a node that reports a class list.
          def self.classes
            nil
          end

          # True when the node's class list equals the declared classes
          # exactly (same entries, same order).
          def match_node?
            node.classes == self.class.classes
          end

          # Inserts the rendered XML into the document.
          #
          # The element's own `insertion_target` wins over the `target`
          # passed by the caller; `path_suffix`, when present, is appended
          # to whichever path is used.
          def render(target:)
            effective_target = insertion_target || target
            suffix = path_suffix
            effective_target = "#{effective_target}#{suffix}" if suffix
            document.at(effective_target).public_send(insert_method, to_xml)
          end

          private

          # Path that overrides the caller-supplied target (none by default).
          def insertion_target
            nil
          end

          # Suffix appended to the resolved target path (none by default).
          def path_suffix
            nil
          end

          # Method invoked on the target node to insert the rendered XML.
          def insert_method
            :add_child
          end

          # Last "_"-separated segment of the node's id attribute.
          # Returns nil when the node has no id (or an empty one) instead
          # of raising, so predicates such as `include?(id)` can be used
          # safely on nodes without an id attribute.
          def id
            @id ||= node.attr("id").to_s.split("_").last
          end

          # Serialises the root element produced by `content`.
          def to_xml
            content.doc.root.to_xml
          end

          # Subclasses must return a builder whose `doc.root` is the
          # rendered element.
          def content
            raise NotImplementedError
          end

          # Rewrites literal <b>/<i> tags in +text+ to <bold>/<italic>.
          def sanitize_text(text)
            text
              .gsub("<b>", "<bold>").gsub("</b>", "</bold>")
              .gsub("<i>", "<italic>").gsub("</i>", "</italic>")
          end

          # Looks up the image's "src" attribute in metas["images"].
          def local_image_path(img)
            metas["images"][img.attr("src")]
          end
        end
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
+ # frozen_string_literal: true
2
+
module Obp
  class Access
    class Renderer
      class Elements
        # Value object describing one bibliography entry, parsed from the
        # table cell holding the reference text and an optional anchor.
        class BibRef
          # Leading "[n]" marker (plus trailing whitespace) of a reference.
          REF_PATTERN = /\A\[(\d+)\]\s*/
          # A colon followed by four digits, e.g. ":2015".
          DATED_PATTERN = /:\d{4}/

          attr_reader :index, :std_ref_text, :title_text, :std_id, :type

          # @param td_node     object responding to `text` and `inner_html`
          # @param anchor_node object responding to `attr("name")`, or nil
          def initialize(td_node, anchor_node)
            text = td_node.text
            @index = parse_index(text)
            @std_id = parse_std_id(anchor_node)
            @type = infer_type(text)
            @std_ref_text, @title_text = parse_parts(td_node)
          end

          private

          # Number inside the leading "[n]" marker; 0 when there is none.
          def parse_index(text)
            text[REF_PATTERN, 1].to_i
          end

          # Anchor name with its trailing ":ref:<n>" removed; nil when the
          # anchor is missing, unnamed, or its name contains no ":ref:".
          def parse_std_id(anchor)
            name = anchor&.attr("name")
            return nil unless name&.include?(":ref:")

            name.sub(/:ref:\d+\z/, "")
          end

          # "dated" when the text contains DATED_PATTERN, else "undated".
          def infer_type(text)
            text.match?(DATED_PATTERN) ? "dated" : "undated"
          end

          # Splits the cell markup (without its "[n]" marker) into
          # [standard reference, title]; the title is nil when absent.
          def parse_parts(td_node)
            content = td_node.inner_html.sub(REF_PATTERN, "")
            if content.include?("<i>")
              split_italic_title(content)
            elsif content.include?(",")
              std_ref, title = content.split(",", 2)
              [std_ref.strip, title&.strip]
            else
              [content.strip, nil]
            end
          end

          # Handles entries whose title is wrapped in <i>…</i>. The usual
          # form is "REF, <i>Title</i>"; when no comma precedes the <i>
          # tag we still split at the tag, so the reference never keeps
          # raw <i> markup and the title is not lost.
          def split_italic_title(content)
            parts = content.split(/,\s*<i>/, 2)
            parts = content.split(/\s*<i>/, 2) if parts.size < 2
            [parts[0].to_s.strip, parts[1]&.sub("</i>", "")&.strip]
          end
        end
      end
    end
  end
end
@@ -0,0 +1,52 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "bibliography/bib_ref"
4
+
module Obp
  class Access
    class Renderer
      class Elements
        # Renders a bibliography section (a node whose classes are exactly
        # `sts-section sts-ref-list`) as a NISO STS <ref-list>.
        class Bibliography < Base
          # CSS classes a node must carry to be handled by this element.
          def self.classes
            %w[sts-section sts-ref-list]
          end

          private

          # The reference list is always inserted under "back".
          def insertion_target
            "back"
          end

          # Builds <ref-list> with one <ref> per table row. The first row
          # is skipped (presumably a heading row — confirm against the
          # source markup).
          def content
            Nokogiri::XML::Builder.new do |xml|
              xml.public_send(:"ref-list", "content-type": "bibl", id: "sec_bibl") do
                node.css("tr").drop(1).each { |row| render_ref(xml, row) }
              end
            end
          end

          # Emits one <ref> for a table row: the last cell carries the
          # reference text, the first cell may carry a named anchor.
          def render_ref(xml, row)
            td = row.css("td:last-child")
            anchor = row.at_css("td:first-child a[name]")
            ref = BibRef.new(td, anchor)

            xml.ref("content-type": "standard", id: "ref_#{ref.index}") do
              xml.label "[#{ref.index}]"
              render_std(xml, ref)
            end
          end

          # Emits <std> with its <std-ref> and optional <title>.
          def render_std(xml, ref)
            attrs = {}
            attrs["std-id"] = ref.std_id if ref.std_id
            attrs["type"] = ref.type if ref.type

            xml.std(**attrs) do
              # The STS element name is hyphenated ("std-ref"); calling
              # `xml.std_ref` would emit an element named "std_ref".
              xml.public_send(:"std-ref", ref.std_ref_text)
              xml.title ref.title_text if ref.title_text
            end
          end
        end
      end
    end
  end
end

# Make this element class known to the element registry.
Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::Bibliography)
@@ -0,0 +1,27 @@
module Obp
  class Access
    class Renderer
      class Elements
        # Renders a node carrying the `sts-copyright` class as a
        # <copyright-year> element.
        class Copyright < Base
          # CSS classes a node must carry to be handled by this element.
          def self.classes
            %w[sts-copyright]
          end

          private

          # The year is always inserted under the permissions element.
          def insertion_target
            "front/std-meta/permissions"
          end

          # Uses the first run of digits found in the node's text as the
          # year (nil when the text contains no digits).
          def content
            year = node.content[/\d+/]

            Nokogiri::XML::Builder.new do |xml|
              xml.public_send(:"copyright-year", year)
            end
          end
        end
      end
    end
  end
end

# Make this element class known to the element registry.
Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::Copyright)
@@ -0,0 +1,58 @@
module Obp
  class Access
    class Renderer
      class Elements
        # Renders an `sts-fig` node that contains at least one image as a
        # <fig> element with caption, graphic and optional legend table.
        class Figure < Base
          # CSS classes a node must carry to be handled by this element.
          def self.classes
            %w[sts-fig]
          end

          # Matches only figure nodes that actually contain an <img>.
          def match_node?
            return false unless super

            node.css("img").any?
          end

          private

          # Builds <fig>: caption parts, the graphic for the first <img>,
          # then the legend table when the node has one.
          def content
            Nokogiri::XML::Builder.new do |xml|
              xml.fig do
                render_caption(xml)
                xml.graphic("xlink:href": local_image_path(node.at_css("img")))

                legend = node.at_css("div.sts-table-wrap.fig-index")
                render_legend(xml, legend) unless legend.nil?
              end
            end
          end

          # Emits <label> and <caption><title> from the node's
          # `.sts-caption`; nothing is emitted without a caption, and
          # each part is skipped when its source element is missing.
          def render_caption(xml)
            caption = node.at_css(".sts-caption")
            return if caption.nil?

            label_node = caption.at_css(".sts-caption-label")
            title_node = caption.at_css(".sts-caption-title")

            xml.label label_node.content if label_node
            xml.caption do
              xml.title title_node.content if title_node
            end
          end

          # Emits the legend as <table-wrap content-type="legend">; the
          # source table's inner markup is inserted as-is.
          def render_legend(xml, table_node)
            legend_caption = table_node.at_css(".sts-caption")

            xml.public_send(:"table-wrap", "content-type": "legend") do
              render_legend_caption(xml, legend_caption) if legend_caption
              xml.table { xml << table_node.at_css("table").inner_html }
            end
          end

          # Emits <caption>, with a <title> when the legend caption has one.
          def render_legend_caption(xml, legend_caption)
            xml.caption do
              legend_title = legend_caption.at_css(".sts-caption-title")
              xml.title legend_title.content if legend_title
            end
          end
        end
      end
    end
  end
end

# Make this element class known to the element registry.
Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::Figure)
@@ -0,0 +1,48 @@
module Obp
  class Access
    class Renderer
      class Elements
        # Renders an `sts-fig` node that contains more than one image as
        # a <fig-group> holding one <fig> per image.
        class FigureGroup < Base
          # CSS classes a node must carry to be handled by this element.
          def self.classes
            %w[sts-fig]
          end

          # Matches only figure nodes with two or more <img> elements.
          def match_node?
            super && node.css("img").count > 1
          end

          private

          # Builds <fig-group>: the group caption followed by one <fig>
          # for every image in the node.
          def content
            Nokogiri::XML::Builder.new do |xml|
              xml.public_send(:"fig-group") do
                render_caption(xml, node)
                node.css("img").each { |img| render_figure(xml, img) }
              end
            end
          end

          # Emits the group's <label> and <caption><title>, taken from
          # the first caption label/title found under +children+.
          def render_caption(xml, children)
            render_label_and_title(xml, children)
          end

          # Emits one <fig> for +img+. Its caption is looked up in the
          # element immediately preceding the image; `previous_element`
          # is used so that a whitespace text node between the caption
          # and the image does not hide the caption.
          def render_figure(xml, img)
            xml.fig do
              render_label_and_title(xml, img.previous_element)
              xml.graphic("xlink:href": local_image_path(img))
            end
          end

          # Shared caption writer. Missing scope, label or title are
          # skipped instead of raising, matching Figure#render_caption.
          def render_label_and_title(xml, scope)
            label = scope&.at(".sts-caption-label")
            title = scope&.at(".sts-caption-title")

            xml.label label.content if label
            xml.caption do
              xml.title title.content if title
            end
          end
        end
      end
    end
  end
end

# Make this element class known to the element registry.
Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::FigureGroup)
@@ -0,0 +1,113 @@
module Obp
  class Access
    class Renderer
      class Elements
        # Renders an index section as <sec id="sec_index"> containing an
        # <index> grouped into one <index-div> per letter heading.
        class Index < Base
          # CSS classes a node must carry to be handled by this element.
          def self.classes
            %w[sts-section]
          end

          # Matches `sts-section` nodes that look like an index section.
          def match_node?
            super && index_section?
          end

          private

          # A section is an index when its id contains "sec_index" or
          # its title looks like an index title.
          def index_section?
            node_id = node.attr("id").to_s
            node_id.include?("sec_index") || index_title?
          end

          # Case-insensitive check of the section title for "index" or
          # "verzeichnis"; nil when the section has no title element.
          def index_title?
            node.at_css("h1.sts-sec-title")&.text&.match?(/index|verzeichnis/i)
          end

          # The index section is always inserted under "body".
          def insertion_target
            "body"
          end

          # Builds the <sec> with its title and the grouped <index>.
          def content
            Nokogiri::XML::Builder.new do |xml|
              xml.sec(id: "sec_index") do
                xml.title index_title_text
                xml.index do
                  render_index_divs(xml)
                end
              end
            end
          end

          # One <index-div> (titled with the letter) per group.
          def render_index_divs(xml)
            grouped_entries.each do |letter, entries|
              xml.public_send(:"index-div") do
                xml.title letter
                entries.each { |entry| render_index_entry(xml, entry) }
              end
            end
          end

          # One <index-entry>: the term followed by an <xref> per reference.
          def render_index_entry(xml, entry)
            xml.public_send(:"index-entry") do
              xml.term entry[:term]
              entry[:refs].each do |ref|
                # The reference text is plain text taken from the source
                # link, so it is passed as element content (escaped by the
                # builder) rather than inserted as raw XML markup.
                xml.xref(ref[:text], "ref-type": "sec", rid: "sec_#{ref[:rid]}")
              end
            end
          end

          # Section title text, falling back to "Index" when absent.
          def index_title_text
            node.at_css("h1.sts-sec-title")&.text || "Index"
          end

          # Walks the section's `div.sts-p` paragraphs in document order
          # and returns { letter => [entry, ...] }. Entry paragraphs that
          # appear before the first letter heading are ignored.
          def grouped_entries
            groups = {}
            current_letter = nil

            node.css("div.sts-p").each do |para|
              letter = letter_heading(para)
              if letter
                current_letter = letter
                groups[current_letter] ||= []
              elsif current_letter && index_entry?(para)
                groups[current_letter] << parse_entry(para)
              end
            end

            groups
          end

          # Returns the heading character when the paragraph consists of
          # exactly one bold character in A-Z, 0-9 or À-Ü; nil otherwise.
          def letter_heading(para)
            return nil unless para.inner_html.match?(/\A<b>[A-Z0-9À-Ü]<\/b>\z/)

            para.at_css("b")&.text
          end

          # A paragraph is an index entry when it contains an
          # `a.sts-xref` link.
          def index_entry?(para)
            !para.at_css("a.sts-xref").nil?
          end

          # Builds { term:, refs: } for an entry paragraph.
          def parse_entry(para)
            term_node = para.at_css("a.sts-xref")
            { term: entry_term_text(term_node), refs: entry_refs(para) }
          end

          # The term is the text node directly before the first link,
          # with whitespace runs collapsed; "" when the link is not
          # preceded by a text node.
          def entry_term_text(term_node)
            preceding = term_node.previous
            return "" unless preceding.is_a?(Nokogiri::XML::Text)

            preceding.text.strip.gsub(/[[:space:]]+/, " ").strip
          end

          # One { rid:, text: } per link; rid is the part of the href
          # after its last ":".
          def entry_refs(para)
            para.css("a.sts-xref").map do |xref|
              { rid: xref.attr("href").split(":").last, text: xref.text.strip }
            end
          end
        end
      end
    end
  end
end

# Make this element class known to the element registry.
Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::Index)
@@ -0,0 +1,31 @@
module Obp
  class Access
    class Renderer
      class Elements
        # Renders the foreword / introduction sections as an empty <sec>
        # placeholder under "front".
        class Introduction < Base
          # CSS classes a node must carry to be handled by this element.
          def self.classes
            %w[sts-section]
          end

          # Matches `sts-section` nodes whose id (last "_" segment) is
          # "foreword" or "intro".
          def match_node?
            return false unless super

            %w[foreword intro].include?(id)
          end

          private

          # These sections are always inserted under "front".
          def insertion_target
            "front"
          end

          # Builds <sec id="sec_<id>" sec-type="<id>"> with no children.
          def content
            section_id = "sec_#{id}"

            Nokogiri::XML::Builder.new do |xml|
              xml.sec(id: section_id, "sec-type": id)
            end
          end
        end
      end
    end
  end
end

# Make this element class known to the element registry.
Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::Introduction)
@@ -0,0 +1,58 @@
module Obp
  class Access
    class Renderer
      class Elements
        # Renders a node carrying the `list` class as a dash-type <list>.
        class List < Base
          include InlineRenderer

          # CSS classes a node must carry to be handled by this element.
          def self.classes
            %w[list]
          end

          private

          # Builds <list list-type="dash"> with one <list-item> for each
          # direct ul/li child of the node.
          def content
            Nokogiri::XML::Builder.new do |xml|
              xml.list("list-type": "dash") do
                node.xpath("./ul/li").each { |li| render_list_item(xml, li) }
              end
            end
          end

          # Wraps the rendering of one <li> in a <list-item> element.
          def render_list_item(xml, li)
            xml.public_send(:"list-item") { render_li(xml, li) }
          end

          # Renders an item's label, paragraph and nested items, in that
          # order.
          def render_li(xml, item)
            render_li_label(xml, item)
            render_li_paragraph(xml, item)
            render_nested_list(xml, item)
          end

          # Emits <label> from the item's `span.sts-label`, when present.
          def render_li_label(xml, item)
            label = item.at_css("span.sts-label")
            return unless label

            xml.label label.text
          end

          # Emits <p> with the inline content of the item's first
          # `.sts-p` descendant, when present.
          def render_li_paragraph(xml, item)
            paragraph = item.at_css(".sts-p")
            return unless paragraph

            xml.p do
              paragraph.children.each { |child| render_inline(xml, child) }
            end
          end

          # Emits the items of the first nested `.list` found in the item.
          # NOTE(review): nested items are written directly inside the
          # parent <list-item>, without a wrapping <list> — confirm this
          # is the intended output structure.
          def render_nested_list(xml, item)
            nested = item.at_css(".list")
            return unless nested

            nested.xpath("./ul/li").each { |li| render_list_item(xml, li) }
          end
        end
      end
    end
  end
end

# Make this element class known to the element registry.
Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::List)
@@ -0,0 +1,47 @@
module Obp
  class Access
    class Renderer
      class Elements
        # Renders a node carrying the `sts-non-normative-note` class as a
        # <non-normative-note> with optional <label> and <p>.
        class NonNormativeNote < Base
          include InlineRenderer

          # CSS classes a node must carry to be handled by this element.
          def self.classes
            %w[sts-non-normative-note]
          end

          private

          # Builds the note: the label text first (when the node has a
          # label element), then the inline content of its first <p>.
          def content
            label_node = node.at_css(".sts-non-normative-note-label")
            p_node = node.at_css("p")

            Nokogiri::XML::Builder.new do |xml|
              xml.public_send(:"non-normative-note") do
                xml.label label_node.text if label_node
                render_paragraph(xml, p_node) if p_node
              end
            end
          end

          # Emits <p> containing each child of +p_node+ rendered inline.
          def render_paragraph(xml, p_node)
            xml.p do
              p_node.children.each { |child| render_inline(xml, child) }
            end
          end

          # Classifies an inline child: text nodes are :text, the note's
          # own label element is :label, any other element is delegated
          # to the inherited implementation; anything else yields nil.
          def inline_type(node)
            case node
            when Nokogiri::XML::Text
              :text
            when Nokogiri::XML::Element
              return :label if node.classes == ["sts-non-normative-note-label"]

              super
            end
          end
        end
      end
    end
  end
end

# Make this element class known to the element registry.
Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::NonNormativeNote)
@@ -0,0 +1,31 @@
module Obp
  class Access
    class Renderer
      class Elements
        # Renders a node carrying the `sts-p` class as a <p> element.
        class Paragraph < Base
          include InlineRenderer

          # CSS classes a node must carry to be handled by this element.
          def self.classes
            %w[sts-p]
          end

          # Matches `sts-p` nodes unless their direct parent is an <li>.
          def match_node?
            return false unless super

            node.parent.name != "li"
          end

          private

          # Builds <p> containing each child of the node rendered inline.
          def content
            Nokogiri::XML::Builder.new do |xml|
              xml.p do
                node.children.each do |child|
                  render_inline(xml, child)
                end
              end
            end
          end
        end
      end
    end
  end
end

# Make this element class known to the element registry.
Obp::Access::ElementRegistry.register(Obp::Access::Renderer::Elements::Paragraph)