obp-access 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/.rubocop.yml +18 -0
  3. data/.rubocop_todo.yml +33 -0
  4. data/CLAUDE.md +59 -0
  5. data/README.adoc +97 -0
  6. data/Rakefile +12 -0
  7. data/exe/obp-access +5 -0
  8. data/lib/obp/access/catalog.rb +67 -0
  9. data/lib/obp/access/cli.rb +115 -0
  10. data/lib/obp/access/converter.rb +25 -0
  11. data/lib/obp/access/deliverable.rb +109 -0
  12. data/lib/obp/access/domain_extractor.rb +63 -0
  13. data/lib/obp/access/element_registry.rb +20 -0
  14. data/lib/obp/access/elements/array.rb +64 -0
  15. data/lib/obp/access/elements/base.rb +69 -0
  16. data/lib/obp/access/elements/bibliography/bib_ref.rb +60 -0
  17. data/lib/obp/access/elements/bibliography.rb +52 -0
  18. data/lib/obp/access/elements/copyright.rb +27 -0
  19. data/lib/obp/access/elements/figure.rb +58 -0
  20. data/lib/obp/access/elements/figure_group.rb +48 -0
  21. data/lib/obp/access/elements/index.rb +113 -0
  22. data/lib/obp/access/elements/introduction.rb +31 -0
  23. data/lib/obp/access/elements/list.rb +58 -0
  24. data/lib/obp/access/elements/non_normative_note.rb +47 -0
  25. data/lib/obp/access/elements/paragraph.rb +31 -0
  26. data/lib/obp/access/elements/root.rb +122 -0
  27. data/lib/obp/access/elements/section.rb +38 -0
  28. data/lib/obp/access/elements/section_title.rb +26 -0
  29. data/lib/obp/access/elements/section_type.rb +27 -0
  30. data/lib/obp/access/elements/table_wrap.rb +47 -0
  31. data/lib/obp/access/elements/terminology/base.rb +27 -0
  32. data/lib/obp/access/elements/terminology/definition.rb +44 -0
  33. data/lib/obp/access/elements/terminology/example.rb +27 -0
  34. data/lib/obp/access/elements/terminology/note.rb +27 -0
  35. data/lib/obp/access/elements/terminology/source.rb +45 -0
  36. data/lib/obp/access/elements/terminology/tig.rb +59 -0
  37. data/lib/obp/access/elements/terminology/tig_admitted.rb +23 -0
  38. data/lib/obp/access/elements/terminology/tig_deprecated.rb +39 -0
  39. data/lib/obp/access/elements/terminology/tig_preferred.rb +23 -0
  40. data/lib/obp/access/elements/terminology.rb +28 -0
  41. data/lib/obp/access/elements/title.rb +33 -0
  42. data/lib/obp/access/fetcher.rb +63 -0
  43. data/lib/obp/access/grammar_parser.rb +135 -0
  44. data/lib/obp/access/imager.rb +39 -0
  45. data/lib/obp/access/inline_renderer.rb +97 -0
  46. data/lib/obp/access/parser.rb +82 -0
  47. data/lib/obp/access/renderer.rb +43 -0
  48. data/lib/obp/access/retriever.rb +97 -0
  49. data/lib/obp/access/urn.rb +31 -0
  50. data/lib/obp/access/version.rb +5 -0
  51. data/lib/obp/access.rb +118 -0
  52. data/lib/obp-access.rb +1 -0
  53. metadata +151 -0
@@ -0,0 +1,63 @@
1
module Obp
  class Access
    # Posts the initial UI request for a URN to the OBP endpoint (API_URL)
    # and extracts the list of UI state entries from the JSON reply.
    class Fetcher
      # Browser identities; one is picked at random for each request.
      # Only :user_agent is read by this class in the code visible here.
      USER_AGENT_PROFILES = [
        {
          user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
                      "AppleWebKit/537.36 (KHTML, like Gecko) " \
                      "Chrome/131.0.0.0 Safari/537.36",
          platform: '"macOS"',
          chrome_version: "131",
        },
        {
          user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
                      "AppleWebKit/537.36 (KHTML, like Gecko) " \
                      "Chrome/130.0.0.0 Safari/537.36",
          platform: '"Windows"',
          chrome_version: "130",
        },
        {
          user_agent: "Mozilla/5.0 (X11; Linux x86_64) " \
                      "AppleWebKit/537.36 (KHTML, like Gecko) " \
                      "Chrome/131.0.0.0 Safari/537.36",
          platform: '"Linux"',
          chrome_version: "131",
        },
      ].freeze

      # @param urn [#to_s] URN of the document; interpolated into the
      #   "v-loc" form field as the URL fragment.
      def initialize(urn:)
        @urn = urn
      end

      # Performs the request and returns the parsed state entries.
      #
      # @return [Array] values of the "state" object of the reply
      # @raise [RuntimeError] on a non-2xx reply or a reply without state
      # @raise [JSON::ParserError] when the reply is not valid JSON
      def fetch_state
        parse_state(post_ui_request)
      end

      private

      # Sends the form-encoded POST and returns the HTTP response.
      def post_ui_request
        uri = URI(API_URL)
        request = Net::HTTP::Post.new(uri)
        request["User-Agent"] = USER_AGENT_PROFILES.sample[:user_agent]
        request["Accept"] = "application/json"
        request.set_form_data(
          "v-browserDetails" => 1,
          "theme" => "iso-red",
          "v-loc" => "#{API_URL}##{@urn}",
        )

        # TLS follows the scheme of API_URL instead of being forced on.
        response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == "https") do |http|
          http.request(request)
        end
        return response if response.is_a?(Net::HTTPSuccess)

        # Fail here with the status code rather than later with a JSON
        # error caused by an HTML error page.
        raise "OBP request for URN #{@urn} failed: HTTP #{response.code}"
      end

      # The reply is a JSON object whose "uidl" member is itself a JSON
      # document encoded as a string; its "state" member is an object whose
      # values are the entries callers want.
      def parse_state(response)
        uidl = JSON.parse(response.body)["uidl"]
        raise "OBP response for URN #{@urn} contains no \"uidl\" payload" unless uidl

        state = JSON.parse(uidl)["state"]
        raise "OBP response for URN #{@urn} contains no \"state\" object" unless state

        state.values
      end
    end
  end
end
@@ -0,0 +1,135 @@
1
module Obp
  class Access
    # Splits the inner HTML of a term entry into the term text, its part of
    # speech and its grammatical genders. Text inside <b>...</b> is
    # classified by BOLD_PATTERNS; everything else is treated as term text
    # unless it sits between the bold bracket markers "〈" and "〉".
    class GrammarParser
      Result = Struct.new(:term, :pos, :genders, keyword_init: true)

      # Bold markers that override the default part of speech ("noun").
      POS_MAP = {
        "adj." => "adjective",
        "Adj." => "adjective",
        "verb" => "verb",
      }.freeze

      GENDER_VALUES = %w[m f n].freeze

      # One bold run; the capture is the text between the tags.
      BOLD_SEGMENT = %r{<b>(.*?)</b>}m.freeze

      # "<term>, <gender>" written inside a single bold run.
      TERM_WITH_GENDER = /\A(?<term>.+),\s*(?<gender>[mfn])\z/.freeze

      # Ordered [predicate, handler] pairs; the first predicate that accepts
      # the stripped bold text decides which handler runs.
      BOLD_PATTERNS = [
        [->(t) { POS_MAP.key?(t) }, :handle_pos_marker],
        [->(t) { GENDER_VALUES.include?(t) }, :handle_gender_marker],
        [->(t) { t.match?(/\A[mfn],\z/) }, :handle_gender_with_comma],
        [->(t) { t.match?(/\A[mfn][,\s]+[mfn]([,\s]+[mfn])*\z/) }, :handle_multi_gender],
        [->(t) { t == "," }, :handle_comma],
        [->(t) { t == "〈" }, :handle_enter_bracket],
        [->(t) { t == "〉" }, :handle_exit_bracket],
        [->(t) { t.match?(/\A[mfn]\s+/) }, :handle_gender_qualifier],
        [->(t) { t.match?(/,.+[mfn]\z/) }, :handle_term_with_gender],
      ].freeze

      # @param inner_html [String, nil] markup of the term; nil is treated
      #   as an empty string
      # @return [Result] term (whitespace-normalised, trailing comma
      #   removed), pos ("noun" unless a POS marker was seen) and the
      #   distinct genders in order of appearance
      def self.parse(inner_html)
        state = { pos: "noun", genders: [], term_parts: [], in_bracket: false }

        parse_segments(inner_html.to_s).each do |seg|
          find_handler(seg, state[:in_bracket]).call(seg[:text], state)
        end

        Result.new(term: clean_term(state[:term_parts]), pos: state[:pos], genders: state[:genders].uniq)
      end

      class << self
        private

        # Picks the handler for a segment. Bold segments are matched on
        # their stripped text; plain segments are dropped inside brackets.
        def find_handler(seg, in_bracket)
          if seg[:bold]
            bold_handler(seg[:text].strip, in_bracket)
          elsif in_bracket
            method(:handle_skip)
          else
            method(:handle_text)
          end
        end

        # Unrecognised bold text is term text, unless inside brackets.
        def bold_handler(text, in_bracket)
          _pattern, handler = BOLD_PATTERNS.find { |pred, _| pred.call(text) }
          return method(handler) if handler

          in_bracket ? method(:handle_skip) : method(:handle_term_text)
        end

        def handle_pos_marker(text, state)
          state[:pos] = POS_MAP[text.strip]
        end

        def handle_gender_marker(text, state)
          state[:genders] << text.strip
        end

        # "m," -> "m"
        def handle_gender_with_comma(text, state)
          state[:genders] << text.strip[0]
        end

        # "m, f" -> "m", "f"
        def handle_multi_gender(text, state)
          state[:genders].concat(text.strip.scan(/[mfn]/))
        end

        def handle_enter_bracket(_text, state)
          state[:in_bracket] = true
        end

        def handle_exit_bracket(_text, state)
          state[:in_bracket] = false
        end

        # Gender letter followed by more text: only the letter is kept.
        def handle_gender_qualifier(text, state)
          state[:genders] << text.strip[0]
        end

        # "term, m" in one bold run. Text that reached this handler but
        # does not end in ", <gender>" is kept whole as term text.
        def handle_term_with_gender(text, state)
          stripped = text.strip
          match = TERM_WITH_GENDER.match(stripped)
          if match
            state[:term_parts] << match[:term].strip
            state[:genders] << match[:gender]
          else
            state[:term_parts] << stripped
          end
        end

        def handle_comma(_text, _state); end

        def handle_term_text(text, state)
          state[:term_parts] << text
        end

        def handle_text(text, state)
          state[:term_parts] << text
        end

        def handle_skip(_text, _state); end

        # Cuts the markup into { text:, bold: } hashes in document order.
        # Splitting on a pattern with one capture group yields plain text
        # at even indexes and bold text at odd indexes; limit -1 keeps the
        # positions aligned when the string ends with a bold run. Empty
        # plain pieces are dropped, empty bold runs are kept.
        def parse_segments(html)
          html.split(BOLD_SEGMENT, -1).each_with_index.filter_map do |text, index|
            bold = index.odd?
            next if !bold && text.empty?

            { text: text, bold: bold }
          end
        end

        # Joins the parts, drops one trailing comma and collapses whitespace.
        def clean_term(parts)
          parts.join.sub(/,\s*\z/, "").gsub(/\s+/, " ").strip
        end
      end
    end
  end
end
@@ -0,0 +1,39 @@
1
module Obp
  class Access
    # Finds figure images in a document's HTML and downloads them into an
    # "images" directory below +directory+.
    class Imager
      attr_reader :html, :directory

      # @param html [String] document markup to scan
      # @param directory [String] directory that receives "images/"
      def initialize(html:, directory:)
        @html = html
        @directory = directory
      end

      # Downloads every image matched by "div.sts-fig > img".
      #
      # @return [Hash{String => String}] original src attribute mapped to
      #   the local file path the image was written to
      def images
        doc = Nokogiri::HTML(html)
        # An <img> without a usable src has nothing to download; skip it
        # instead of failing on nil.
        sources = doc.search("div.sts-fig > img").filter_map do |img|
          src = img.attr("src")
          src unless src.nil? || src.empty?
        end
        images = sources.to_h { |key| [key, File.join(imgdir, key.split("/").last)] }
        download_images(images)
        images
      end

      private

      # Path of the image directory, created on first use. mkdir_p is used
      # so an already existing directory (for example from a previous run)
      # is not an error.
      def imgdir
        @imgdir ||= File.join(directory, "images").tap { |path| FileUtils.mkdir_p(path) }
      end

      def download_images(images)
        Parallel.each(images) { |key, path| download_image(key, path) }
      end

      # NOTE(review): the HTTP status is not checked, so an error page body
      # would be written to +path+ as if it were the image.
      def download_image(key, path)
        url = "#{BASE_URL}#{key}"
        blob = Net::HTTP.get_response(URI(url)).body
        File.write(path, blob, mode: "wb")
      end
    end
  end
end
@@ -0,0 +1,97 @@
1
module Obp
  class Access
    # Mixin that renders inline HTML nodes (text, emphasis, links,
    # cross-references) through an XML builder object.
    module InlineRenderer
      # Exact class list of a node => inline type. A node whose class list
      # is not one of these keys is classified by its tag name instead.
      CLASS_TYPES = {
        %w[sts-tbx-entailedTerm] => :entailed_term,
        %w[sts-xref] => :xref,
        %w[sts-std-ref] => :std_ref,
        %w[sts-label] => :label,
      }.freeze

      # Inline type => builder method that wraps the rendered children.
      CONTAINER_TYPES = { italic: :italic, bold: :bold }.freeze

      # Renders +node+ and, where applicable, its descendants into +xml+.
      def render_inline(xml, node)
        return xml.text(node.content) if node.is_a?(Nokogiri::XML::Text)

        render_node_by_type(xml, node, inline_type(node))
      end

      # Containers wrap their children, labels produce no output, unknown
      # elements are transparent (only their children are rendered).
      def render_node_by_type(xml, node, type)
        if CONTAINER_TYPES.key?(type)
          render_container(xml, node, CONTAINER_TYPES[type])
        elsif type == :label
          nil
        elsif type == :element
          render_children(xml, node)
        else
          render_named_type(xml, node, type)
        end
      end

      def render_named_type(xml, node, type)
        case type
        when :entailed_term then render_entailed_term(xml, node)
        when :xref then render_xref(xml, node)
        when :std_ref then render_std_ref(xml, node)
        when :ext_link then render_ext_link(xml, node)
        end
      end

      def render_container(xml, node, tag)
        xml.public_send(tag) { render_children(xml, node) }
      end

      def render_children(xml, node)
        node.children.each { |c| render_inline(xml, c) }
      end

      # @return [Symbol] :text, a CLASS_TYPES value, :italic, :ext_link,
      #   :bold or :element
      def inline_type(node)
        return :text if node.is_a?(Nokogiri::XML::Text)

        CLASS_TYPES.fetch(node.classes) do
          case node.name
          when "i" then :italic
          when "a" then :ext_link
          when "b" then :bold
          else :element
          end
        end
      end

      # Derives the xref "ref-type" from the visible label of the link.
      def xref_ref_type(text)
        case text
        when /\AFigure/ then "fig"
        when /\ATable/ then "table"
        when /\ANote/ then "fn"
        else "sec"
        end
      end

      private

      # Link labels come from node.text, which is already entity-decoded.
      # They are emitted with xml.text so that "&" and "<" are escaped;
      # appending them with << would insert them as raw markup.

      def render_entailed_term(xml, node)
        target = node.at_css("a").attr("href").split(":").last
        xml.public_send(:"tbx:entailedTerm", target: "term_#{target}") do
          xml.text(node.text.strip)
        end
      end

      def render_xref(xml, node)
        rid = node.attr("href").split(":").last
        # Classify on the stripped label so leading whitespace in the
        # source markup cannot turn a "Figure"/"Table"/"Note" into "sec".
        label = node.text.strip
        ref_type = xref_ref_type(label)
        xml.xref("ref-type": ref_type, rid: "#{ref_type}_#{rid}") { xml.text(label) }
      end

      def render_std_ref(xml, node)
        rid = node.attr("href").split(":").last
        xml.xref("ref-type": "bibr", rid: "ref_#{rid}") { xml.text(node.text.strip) }
      end

      def render_ext_link(xml, node)
        xml.public_send(:"ext-link",
                        "ext-link-type" => "uri",
                        "xlink:href" => node.attr("href")) { xml.text(node.text.strip) }
      end
    end
  end
end
@@ -0,0 +1,82 @@
1
module Obp
  class Access
    # Reads the fetched OBP state of one URN and exposes its title, HTML
    # content and language list, and converts the document to XML.
    class Parser
      attr_reader :urn, :directory

      # @param urn [Urn] document URN (must respond to #language and #base
      #   for #to_xml)
      # @param directory [String] directory handed to Imager for downloads
      def initialize(urn:, directory:)
        @urn = urn
        @directory = directory
      end

      # @return converted XML as produced by Converter#to_xml (memoised)
      def to_xml
        xml
      end

      # @return the "description" of the tab data entry
      # @raise [RuntimeError] when the state has no tab data
      def title
        tab_data["description"]
      end

      # @return the first "htmlContent" value found in the state
      # @raise [RuntimeError] when no state entry carries "htmlContent"
      def html
        @html ||= begin
          content = state.filter_map { |attr| attr["htmlContent"] }.first
          raise "OBP content not found for URN #{urn}" unless content

          content
        end
      end

      # Distinct, non-empty captions of state entries whose "styles"
      # include "toggle", in order of appearance.
      def available_languages
        state.filter_map do |attr|
          caption = attr["caption"]
          next if caption.nil? || caption.empty?

          caption if attr["styles"]&.include?("toggle")
        end.uniq
      end

      private

      def fetcher
        @fetcher ||= Fetcher.new(urn:)
      end

      def state
        @state ||= fetcher.fetch_state
      end

      def xml
        @xml ||= begin
          metas = {
            "titles" => titles,
            "images" => images,
            "language" => urn.language,
          }.merge(tab_data)

          Converter.new(urn:, metas:, source: html).to_xml
        end
      end

      # Last element of the first "tabs" list in the state. Raises a
      # descriptive error, like #html does, instead of failing on nil when
      # the state has no tabs.
      def tab_data
        @tab_data ||= begin
          tabs = state.filter_map { |attr| attr["tabs"] }.first
          raise "OBP tab data not found for URN #{urn}" if tabs.nil? || tabs.empty?

          tabs.last
        end
      end

      # Title per language; falls back to the URN's own language when the
      # state lists none.
      def titles
        languages = available_languages
        languages = [urn.language] if languages.empty?

        Parallel.map(languages) { |lang| fetch_title(lang) }.to_h
      end

      # Titles in other languages come from a separate fetch of the
      # sibling URN "<base>:<lang>".
      def fetch_title(lang)
        return [lang, title] if lang == urn.language

        other_urn = Urn.new("#{urn.base}:#{lang}")
        [lang, Parser.new(urn: other_urn, directory:).title]
      end

      def images
        Imager.new(html:, directory:).images
      end
    end
  end
end
@@ -0,0 +1,43 @@
1
module Obp
  class Access
    # Walks a list of source nodes and renders each one, together with its
    # descendants, into a document created by Elements::Root.
    class Renderer
      attr_reader :urn, :metas, :nodes, :document

      # @param urn   passed through to Elements::Root
      # @param metas passed to Elements::Root and to every element
      # @param nodes top-level nodes to render, in order
      def initialize(urn:, metas:, nodes:)
        @urn = urn
        @metas = metas
        @nodes = nodes
        @document = Elements::Root.new(urn:, metas:).to_document
      end

      # Renders every node into the document and serialises it.
      def to_xml
        nodes.each { |top_level| render(node: top_level) }
        document.to_xml
      end

      private

      # Renders +node+ with every registered element that matches it, then
      # descends into the node's children using the path of the first
      # rendered result as their target. Nodes whose class list is not
      # registered are skipped together with their subtree.
      def render(node:, target: nil)
        return unless css_classes_match?(node)

        ElementRegistry.elements.each do |element_class|
          element = element_class.new(document:, metas:, node:)
          next unless element.match_node?

          rendered = element.render(target:)
          parent_path = rendered.first.path

          node.children.each { |child| render(node: child, target: parent_path) }
        end
      end

      # True when the node's class list equals one of the registered ones.
      def css_classes_match?(node)
        ElementRegistry.css_classes.any?(node.classes)
      end
    end
  end
end
@@ -0,0 +1,97 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "json"
4
+ require "fileutils"
5
+
6
require "time" # Time#iso8601 lives in the "time" library on Ruby < 3.4

module Obp
  class Access
    # Fetches every retrievable deliverable of a catalog into +output_dir+
    # and records the outcome per deliverable in a JSON manifest, so that a
    # later run skips what is already recorded.
    class Retriever
      MANIFEST_FILE = "manifest.json"

      attr_reader :output_dir, :catalog, :concurrency

      # @param output_dir [String] directory for XML files and the manifest
      # @param catalog [#retrievable] source of deliverables
      # @param concurrency [Integer] number of worker threads
      def initialize(output_dir:, catalog:, concurrency: 4)
        @output_dir = output_dir
        @catalog = catalog
        @concurrency = concurrency
        # Workers run in threads and all write to the same manifest.
        @manifest_lock = Mutex.new
      end

      # Retrieves all pending deliverables and prints progress to stdout.
      def run
        FileUtils.mkdir_p(output_dir)
        pending = pending_deliverables
        total = pending.size

        if total.zero?
          puts "Nothing to retrieve — all #{catalog.retrievable.size} deliverables already fetched."
          return
        end

        puts "Retrieving #{total} deliverables to #{output_dir} (concurrency: #{concurrency})..."
        process_all(pending, total)
        puts "Done. Fetched #{total} documents."
      end

      private

      # Deliverables with no manifest entry.
      # NOTE(review): entries with status "failed" also count as present,
      # so failed deliverables are not retried on the next run — confirm
      # that this is intended.
      def pending_deliverables
        catalog.retrievable.reject { |d| manifest.key?(d.id.to_s) }
      end

      def process_all(pending, total)
        Parallel.each_with_index(pending, in_threads: concurrency) do |deliverable, i|
          process_one(deliverable, i + 1, total)
        end
      end

      # Fetches all languages of one deliverable. Any StandardError is
      # recorded in the manifest and reported instead of aborting the run.
      def process_one(deliverable, index, total)
        deliverable.languages.each { |lang| fetch_and_save(deliverable, lang) }
        record_success(deliverable)
        puts "[#{index}/#{total}] #{deliverable.reference} — OK"
      rescue StandardError => e
        record_failure(deliverable, e)
        puts "[#{index}/#{total}] #{deliverable.reference} — FAILED: #{e.message}"
      end

      # Writes "<output_dir>/<sanitised reference>/<language>.xml".
      def fetch_and_save(deliverable, language)
        urn = deliverable.to_urn(language:)
        access = Access.fetch(urn.to_s)
        xml = access.to_xml(pretty: true)

        dir = File.join(output_dir, deliverable.reference.gsub(%r{[/:\s]}, "-"))
        FileUtils.mkdir_p(dir)
        File.write(File.join(dir, "#{language}.xml"), xml)
      end

      def record_success(deliverable)
        update_manifest(
          deliverable,
          "reference" => deliverable.reference,
          "status" => "success",
          "timestamp" => Time.now.utc.iso8601,
        )
      end

      def record_failure(deliverable, error)
        update_manifest(
          deliverable,
          "reference" => deliverable.reference,
          "status" => "failed",
          "error" => error.message,
          "timestamp" => Time.now.utc.iso8601,
        )
      end

      # Stores +entry+ under the deliverable id and rewrites the manifest
      # file. Both steps happen under the lock so that one worker cannot
      # add a key while another is serialising the hash, and file writes
      # do not interleave.
      def update_manifest(deliverable, entry)
        @manifest_lock.synchronize do
          manifest[deliverable.id.to_s] = entry
          save_manifest
        end
      end

      # Manifest hash, loaded from disk on first use ({} when absent).
      def manifest
        @manifest ||= begin
          path = File.join(output_dir, MANIFEST_FILE)
          File.exist?(path) ? JSON.parse(File.read(path)) : {}
        end
      end

      def save_manifest
        path = File.join(output_dir, MANIFEST_FILE)
        File.write(path, JSON.pretty_generate(manifest))
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
module Obp
  class Access
    # Value object around a colon-separated URN string. The last segment is
    # exposed as the language, everything before it as the base.
    class Urn
      attr_reader :raw, :language, :base

      # @param raw [String] full URN, e.g. "a:b:en"
      def initialize(raw)
        @raw = raw
        *leading, @language = raw.split(":")
        @base = leading.join(":")
      end

      # The URN with every colon replaced by a hyphen (memoised).
      def safe
        @safe ||= raw.tr(":", "-")
      end

      def to_s
        raw
      end

      # Two URNs are equal when the other is a Urn with the same raw string.
      def ==(other)
        return false unless other.is_a?(self.class)

        raw == other.raw
      end
      alias eql? ==

      # Kept in line with #eql? so instances work as Hash keys.
      def hash
        raw.hash
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
module Obp
  class Access
    # Release version of the gem.
    VERSION = "0.1.1".freeze
  end
end