obp-access 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rubocop.yml +18 -0
- data/.rubocop_todo.yml +33 -0
- data/CLAUDE.md +59 -0
- data/README.adoc +97 -0
- data/Rakefile +12 -0
- data/exe/obp-access +5 -0
- data/lib/obp/access/catalog.rb +67 -0
- data/lib/obp/access/cli.rb +115 -0
- data/lib/obp/access/converter.rb +25 -0
- data/lib/obp/access/deliverable.rb +109 -0
- data/lib/obp/access/domain_extractor.rb +63 -0
- data/lib/obp/access/element_registry.rb +20 -0
- data/lib/obp/access/elements/array.rb +64 -0
- data/lib/obp/access/elements/base.rb +69 -0
- data/lib/obp/access/elements/bibliography/bib_ref.rb +60 -0
- data/lib/obp/access/elements/bibliography.rb +52 -0
- data/lib/obp/access/elements/copyright.rb +27 -0
- data/lib/obp/access/elements/figure.rb +58 -0
- data/lib/obp/access/elements/figure_group.rb +48 -0
- data/lib/obp/access/elements/index.rb +113 -0
- data/lib/obp/access/elements/introduction.rb +31 -0
- data/lib/obp/access/elements/list.rb +58 -0
- data/lib/obp/access/elements/non_normative_note.rb +47 -0
- data/lib/obp/access/elements/paragraph.rb +31 -0
- data/lib/obp/access/elements/root.rb +122 -0
- data/lib/obp/access/elements/section.rb +38 -0
- data/lib/obp/access/elements/section_title.rb +26 -0
- data/lib/obp/access/elements/section_type.rb +27 -0
- data/lib/obp/access/elements/table_wrap.rb +47 -0
- data/lib/obp/access/elements/terminology/base.rb +27 -0
- data/lib/obp/access/elements/terminology/definition.rb +44 -0
- data/lib/obp/access/elements/terminology/example.rb +27 -0
- data/lib/obp/access/elements/terminology/note.rb +27 -0
- data/lib/obp/access/elements/terminology/source.rb +45 -0
- data/lib/obp/access/elements/terminology/tig.rb +59 -0
- data/lib/obp/access/elements/terminology/tig_admitted.rb +23 -0
- data/lib/obp/access/elements/terminology/tig_deprecated.rb +39 -0
- data/lib/obp/access/elements/terminology/tig_preferred.rb +23 -0
- data/lib/obp/access/elements/terminology.rb +28 -0
- data/lib/obp/access/elements/title.rb +33 -0
- data/lib/obp/access/fetcher.rb +63 -0
- data/lib/obp/access/grammar_parser.rb +135 -0
- data/lib/obp/access/imager.rb +39 -0
- data/lib/obp/access/inline_renderer.rb +97 -0
- data/lib/obp/access/parser.rb +82 -0
- data/lib/obp/access/renderer.rb +43 -0
- data/lib/obp/access/retriever.rb +97 -0
- data/lib/obp/access/urn.rb +31 -0
- data/lib/obp/access/version.rb +5 -0
- data/lib/obp/access.rb +118 -0
- data/lib/obp-access.rb +1 -0
- metadata +151 -0
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
module Obp
  class Access
    # Sends the UI bootstrap POST for a URN and extracts the component state
    # entries from the JSON reply.
    #
    # Relies on +API_URL+ being resolvable from this namespace and on
    # +net/http+, +uri+ and +json+ having been loaded by the surrounding code.
    class Fetcher
      # Browser identities, one of which is picked at random per request.
      # Only +:user_agent+ is sent by this class; +:platform+ and
      # +:chrome_version+ are stored here but not read in this file.
      USER_AGENT_PROFILES = [
        {
          user_agent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " \
                      "AppleWebKit/537.36 (KHTML, like Gecko) " \
                      "Chrome/131.0.0.0 Safari/537.36",
          platform: '"macOS"',
          chrome_version: "131",
        },
        {
          user_agent: "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " \
                      "AppleWebKit/537.36 (KHTML, like Gecko) " \
                      "Chrome/130.0.0.0 Safari/537.36",
          platform: '"Windows"',
          chrome_version: "130",
        },
        {
          user_agent: "Mozilla/5.0 (X11; Linux x86_64) " \
                      "AppleWebKit/537.36 (KHTML, like Gecko) " \
                      "Chrome/131.0.0.0 Safari/537.36",
          platform: '"Linux"',
          chrome_version: "131",
        },
      ].freeze

      # @param urn [#to_s] document identifier; interpolated into the
      #   +v-loc+ form field and into error messages.
      def initialize(urn:)
        @urn = urn
      end

      # Performs the request and decodes the reply.
      #
      # @return [Array] the values of the "state" object found in the reply
      # @raise [RuntimeError] when the HTTP status is not 2xx, or the reply
      #   lacks the "uidl" / "state" fields
      # @raise [JSON::ParserError] when the body or the embedded "uidl"
      #   string is not valid JSON
      def fetch_state
        parse_state(post_ui_request)
      end

      private

      # Posts the form to +API_URL+ and returns the successful response.
      def post_ui_request
        uri = URI(API_URL)
        request = build_request(uri)

        # Derive TLS usage from the URL instead of assuming HTTPS.
        response = Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == "https") do |http|
          http.request(request)
        end
        return response if response.is_a?(Net::HTTPSuccess)

        # Fail here with the status code rather than later inside JSON.parse.
        raise "OBP request failed for URN #{@urn}: HTTP #{response.code}"
      end

      # Builds the POST request with a randomly chosen User-Agent.
      def build_request(uri)
        profile = USER_AGENT_PROFILES.sample
        request = Net::HTTP::Post.new(uri)
        request["User-Agent"] = profile[:user_agent]
        request["Accept"] = "application/json"
        request.set_form_data(
          "v-browserDetails" => 1,
          "theme" => "iso-red",
          "v-loc" => "#{API_URL}##{@urn}",
        )
        request
      end

      # The reply is a JSON object whose "uidl" member is itself a JSON
      # document encoded as a string; the entries of interest are the values
      # of that inner document's "state" object.
      def parse_state(response)
        payload = JSON.parse(response.body)
        uidl = payload["uidl"] if payload.is_a?(Hash)
        raise "OBP response for URN #{@urn} has no \"uidl\" payload" unless uidl.is_a?(String)

        decoded = JSON.parse(uidl)
        state = decoded["state"] if decoded.is_a?(Hash)
        raise "OBP response for URN #{@urn} has no \"state\" object" unless state.is_a?(Hash)

        state.values
      end
    end
  end
end
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
module Obp
  class Access
    # Extracts a term, its part of speech and its grammatical genders from a
    # fragment of inner HTML in which <b>...</b> runs carry the term text and
    # the grammar markers.
    #
    # The fragment is cut into bold / non-bold segments. Each bold segment is
    # classified by the first matching entry of BOLD_PATTERNS; anything
    # between a bold "〈" and a bold "〉" is ignored unless it is itself a
    # recognised marker.
    class GrammarParser
      Result = Struct.new(:term, :pos, :genders, keyword_init: true)

      # Bold markers that override the default part of speech ("noun").
      POS_MAP = {
        "adj." => "adjective",
        "Adj." => "adjective",
        "verb" => "verb",
      }.freeze

      GENDER_VALUES = %w[m f n].freeze

      # Ordered [predicate, handler] pairs; the first predicate that accepts
      # the stripped bold text decides which handler runs.
      BOLD_PATTERNS = [
        [->(t) { POS_MAP.key?(t) }, :handle_pos_marker],
        [->(t) { GENDER_VALUES.include?(t) }, :handle_gender_marker],
        [->(t) { t.match?(/\A[mfn],\z/) }, :handle_gender_with_comma],
        [->(t) { t.match?(/\A[mfn][,\s]+[mfn]([,\s]+[mfn])*\z/) }, :handle_multi_gender],
        [->(t) { t == "," }, :handle_comma],
        [->(t) { t == "〈" }, :handle_enter_bracket],
        [->(t) { t == "〉" }, :handle_exit_bracket],
        [->(t) { t.match?(/\A[mfn]\s+/) }, :handle_gender_qualifier],
        [->(t) { t.match?(/,.+[mfn]\z/) }, :handle_term_with_gender],
      ].freeze

      # One bold run; the capture is the text between the tags.
      BOLD_RUN = %r{<b>(.*?)</b>}m
      # "term, g" where g is a single gender letter at the very end.
      TERM_WITH_GENDER = /\A(?<term>.+),\s*(?<gender>[mfn])\z/
      private_constant :BOLD_RUN, :TERM_WITH_GENDER

      # @param inner_html [String, nil] HTML fragment; nil is treated as empty
      # @return [Result] term with whitespace collapsed and a trailing comma
      #   removed, pos ("noun" unless a POS marker was seen), and genders
      #   de-duplicated in order of first appearance
      def self.parse(inner_html)
        state = { pos: "noun", genders: [], term_parts: [], in_bracket: false }

        parse_segments(inner_html).each do |segment|
          find_handler(segment, state[:in_bracket]).call(segment[:text], state)
        end

        Result.new(term: clean_term(state[:term_parts]), pos: state[:pos], genders: state[:genders].uniq)
      end

      class << self
        private

        # Picks the handler for one segment. Bold text is classified on its
        # stripped form, but the handler later receives the unstripped text.
        def find_handler(seg, in_bracket)
          return bold_handler(seg[:text].strip, in_bracket) if seg[:bold]

          method(in_bracket ? :handle_skip : :handle_text)
        end

        # Unrecognised bold text is term text, except inside brackets where
        # it is dropped.
        def bold_handler(text, in_bracket)
          _predicate, handler = BOLD_PATTERNS.find { |predicate, _| predicate.call(text) }
          return method(handler) if handler

          method(in_bracket ? :handle_skip : :handle_term_text)
        end

        def handle_pos_marker(text, state)
          state[:pos] = POS_MAP[text.strip]
        end

        def handle_gender_marker(text, state)
          state[:genders] << text.strip
        end

        # "m," -> "m"
        def handle_gender_with_comma(text, state)
          state[:genders] << text.strip[0]
        end

        # "m, f" / "m f n" -> every gender letter in order
        def handle_multi_gender(text, state)
          state[:genders].concat(text.strip.scan(/[mfn]/))
        end

        def handle_enter_bracket(_text, state)
          state[:in_bracket] = true
        end

        def handle_exit_bracket(_text, state)
          state[:in_bracket] = false
        end

        # "m pl" and similar: only the leading gender letter is kept.
        def handle_gender_qualifier(text, state)
          state[:genders] << text.strip[0]
        end

        # "term, m" -> term part plus gender. The classifying predicate is
        # looser than this pattern, so text that fails it is kept whole as
        # term text.
        def handle_term_with_gender(text, state)
          stripped = text.strip
          match = TERM_WITH_GENDER.match(stripped)

          if match
            state[:term_parts] << match[:term].strip
            state[:genders] << match[:gender]
          else
            state[:term_parts] << stripped
          end
        end

        def handle_comma(_text, _state); end

        def handle_term_text(text, state)
          state[:term_parts] << text
        end

        def handle_text(text, state)
          state[:term_parts] << text
        end

        def handle_skip(_text, _state); end

        # Splits the fragment into { text:, bold: } segments in document
        # order. With one capture group and a -1 limit, String#split yields
        # plain text at even indexes and bold captures at odd indexes. Empty
        # plain pieces are dropped; bold pieces are kept even when empty.
        def parse_segments(html)
          html.to_s.split(BOLD_RUN, -1).each_with_index.filter_map do |text, index|
            bold = index.odd?
            next if text.empty? && !bold

            { text: text, bold: bold }
          end
        end

        # Joins the collected parts, drops one trailing comma and collapses
        # whitespace.
        def clean_term(parts)
          parts.join.sub(/,\s*\z/, "").gsub(/\s+/, " ").strip
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
module Obp
  class Access
    # Finds the figure images referenced by an HTML document, downloads them
    # into "<directory>/images" and reports where each one was stored.
    #
    # Relies on +BASE_URL+, Nokogiri, Parallel, FileUtils and Net::HTTP being
    # available from the surrounding code.
    class Imager
      attr_reader :html, :directory

      # @param html [String] document markup to scan
      # @param directory [String] directory under which "images" is created
      def initialize(html:, directory:)
        @html = html
        @directory = directory
      end

      # Downloads every "div.sts-fig > img" image.
      #
      # @return [Hash{String=>String}] original +src+ value => local file
      #   path; <img> elements without a usable +src+ are skipped
      def images
        sources = Nokogiri::HTML(html).search("div.sts-fig > img").filter_map do |img|
          src = img.attr("src")
          src unless src.nil? || src.split("/").last.nil?
        end

        # imgdir is only called from inside the block, so nothing is created
        # on disk when the document has no images.
        mapping = sources.to_h { |src| [src, File.join(imgdir, src.split("/").last)] }
        download_images(mapping)
        mapping
      end

      private

      # Target directory, created on first use. mkdir_p instead of mkdir so
      # an existing "images" directory (e.g. from a previous run) or a missing
      # parent does not raise.
      def imgdir
        @imgdir ||= File.join(directory, "images").tap { |path| FileUtils.mkdir_p(path) }
      end

      def download_images(images)
        Parallel.each(images) { |key, path| download_image(key, path) }
      end

      # NOTE(review): the response status is not checked, so an error page
      # body would be written as the image file — confirm whether that
      # best-effort behaviour is intended.
      def download_image(key, path)
        url = "#{BASE_URL}#{key}"
        blob = Net::HTTP.get_response(URI(url)).body
        File.write(path, blob, mode: "wb")
      end
    end
  end
end
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
module Obp
  class Access
    # Mixin that renders inline HTML nodes (text, italic, bold, links,
    # cross-references) into an XML builder.
    #
    # +xml+ is presumably a Nokogiri::XML::Builder: the code relies on
    # +xml.text+, on element creation through method calls with a block, and
    # on nodes responding to +classes+, +name+, +children+, +attr+, +text+ and
    # +at_css+.
    module InlineRenderer
      # Exact class lists (not single classes) that identify a node type.
      CLASS_TYPES = {
        %w[sts-tbx-entailedTerm] => :entailed_term,
        %w[sts-xref] => :xref,
        %w[sts-std-ref] => :std_ref,
        %w[sts-label] => :label,
      }.freeze

      # Inline types rendered as a wrapping element of the given name.
      CONTAINER_TYPES = { italic: :italic, bold: :bold }.freeze

      # Renders one node: text nodes become text, everything else is
      # dispatched on its inline type.
      def render_inline(xml, node)
        return xml.text(node.content) if node.is_a?(Nokogiri::XML::Text)

        render_node_by_type(xml, node, inline_type(node))
      end

      # Labels are dropped, generic elements are unwrapped (children only),
      # containers keep their wrapper, and the rest goes to a named renderer.
      def render_node_by_type(xml, node, type)
        return render_container(xml, node, CONTAINER_TYPES[type]) if CONTAINER_TYPES.key?(type)

        case type
        when :label then nil
        when :element then render_children(xml, node)
        else render_named_type(xml, node, type)
        end
      end

      # Types without a renderer (e.g. :text) produce nothing.
      def render_named_type(xml, node, type)
        case type
        when :entailed_term then render_entailed_term(xml, node)
        when :xref then render_xref(xml, node)
        when :std_ref then render_std_ref(xml, node)
        when :ext_link then render_ext_link(xml, node)
        end
      end

      def render_container(xml, node, tag)
        xml.public_send(tag) { render_children(xml, node) }
      end

      def render_children(xml, node)
        node.children.each { |child| render_inline(xml, child) }
      end

      # The class list wins over the tag name; unknown tags are :element.
      def inline_type(node)
        return :text if node.is_a?(Nokogiri::XML::Text)

        CLASS_TYPES.fetch(node.classes) do
          case node.name
          when "i" then :italic
          when "a" then :ext_link
          when "b" then :bold
          else :element
          end
        end
      end

      # Maps the visible label of a cross-reference to its ref-type.
      def xref_ref_type(text)
        case text
        when /\AFigure/ then "fig"
        when /\ATable/ then "table"
        when /\ANote/ then "fn"
        else "sec"
        end
      end

      private

      # The renderers below emit the label with xml.text rather than
      # `xml <<`: node.text is decoded text, and appending it with `<<` would
      # hand characters such as "&" or "<" to the builder as raw markup.

      def render_entailed_term(xml, node)
        target = node.at_css("a").attr("href").split(":").last
        xml.public_send(:"tbx:entailedTerm", target: "term_#{target}") do
          xml.text(node.text.strip)
        end
      end

      def render_xref(xml, node)
        rid = node.attr("href").split(":").last
        ref_type = xref_ref_type(node.text)
        xml.xref("ref-type": ref_type, rid: "#{ref_type}_#{rid}") { xml.text(node.text.strip) }
      end

      def render_std_ref(xml, node)
        rid = node.attr("href").split(":").last
        xml.xref("ref-type": "bibr", rid: "ref_#{rid}") { xml.text(node.text.strip) }
      end

      def render_ext_link(xml, node)
        xml.public_send(:"ext-link",
                        "ext-link-type" => "uri",
                        "xlink:href" => node.attr("href")) { xml.text(node.text.strip) }
      end
    end
  end
end
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
module Obp
  class Access
    # Turns the fetched state of one URN into XML: picks the HTML content and
    # tab metadata out of the state entries, collects titles per language,
    # downloads images and hands everything to Converter.
    class Parser
      attr_reader :urn, :directory

      # @param urn [Urn] must respond to +language+ and +base+; used to build
      #   sibling URNs for other languages
      # @param directory [String] passed on to Imager for image storage
      def initialize(urn:, directory:)
        @urn = urn
        @directory = directory
      end

      # @return the result of Converter#to_xml, memoized
      def to_xml
        xml
      end

      # @return the "description" of the tab metadata
      # @raise [RuntimeError] when the state has no tab metadata
      def title
        tab_data["description"]
      end

      # @return the first "htmlContent" value in the state, memoized
      # @raise [RuntimeError] when no entry carries "htmlContent"
      def html
        @html ||= first_state_value("htmlContent") || raise("OBP content not found for URN #{urn}")
      end

      # Captions of the entries styled as "toggle", without blanks or
      # duplicates, in state order.
      def available_languages
        toggles = state.select { |attr| attr["styles"]&.include?("toggle") }
        toggles.filter_map { |attr| attr["caption"] }.reject(&:empty?).uniq
      end

      private

      def fetcher
        @fetcher ||= Fetcher.new(urn:)
      end

      def state
        @state ||= fetcher.fetch_state
      end

      # First truthy value stored under +key+ in any state entry, or nil.
      def first_state_value(key)
        state.each do |attr|
          value = attr[key]
          return value if value
        end
        nil
      end

      # Keys from tab_data are merged last, so they take precedence over
      # "titles", "images" and "language" if they collide.
      def xml
        @xml ||= begin
          metas = {
            "titles" => titles,
            "images" => images,
            "language" => urn.language,
          }.merge(tab_data)

          Converter.new(urn:, metas:, source: html).to_xml
        end
      end

      # Last element of the first "tabs" list in the state. Raises the same
      # kind of descriptive error as #html instead of failing on nil.
      def tab_data
        @tab_data ||= begin
          data = first_state_value("tabs")&.last
          raise "OBP tab data not found for URN #{urn}" unless data

          data
        end
      end

      # [language, title] pairs for every available language; falls back to
      # the URN's own language when none are advertised.
      def titles
        languages = available_languages
        languages = [urn.language] if languages.empty?

        Parallel.map(languages) { |lang| fetch_title(lang) }.to_h
      end

      # Titles in other languages need a second Parser on the sibling URN.
      def fetch_title(lang)
        return [lang, title] if lang == urn.language

        other_urn = Urn.new("#{urn.base}:#{lang}")
        [lang, Parser.new(urn: other_urn, directory:).title]
      end

      def images
        Imager.new(html:, directory:).images
      end
    end
  end
end
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
module Obp
  class Access
    # Walks a list of source nodes and lets the registered element classes
    # render them into a document created by Elements::Root.
    class Renderer
      attr_reader :urn, :metas, :nodes, :document

      # @param urn passed to Elements::Root
      # @param metas passed to Elements::Root and to every element
      # @param nodes [Enumerable] top-level nodes to render; each must
      #   respond to +classes+ and +children+
      def initialize(urn:, metas:, nodes:)
        @urn = urn
        @metas = metas
        @nodes = nodes
        @document = Elements::Root.new(urn:, metas:).to_document
        @rendered = false
      end

      # Renders all nodes into the document and serializes it.
      #
      # Rendering happens only on the first call: the elements write into the
      # one document built in #initialize, so rendering again would append the
      # same content a second time.
      def to_xml
        render_nodes
        @document.to_xml
      end

      private

      def render_nodes
        return if @rendered

        @nodes.each { |node| render(node:) }
        @rendered = true
      end

      # Renders +node+ with every registered element class that accepts it,
      # then renders the node's children below the path of the first item
      # the element returned. Nodes whose class list is not registered are
      # skipped together with their subtree.
      def render(node:, target: nil)
        return unless css_classes_match?(node)

        ElementRegistry.elements.each do |element_class|
          element = element_class.new(document:, metas:, node:)
          next unless element.match_node?

          section_path = element.render(target:).first.path
          node.children.each { |child| render(node: child, target: section_path) }
        end
      end

      # True when the node's exact class list equals one of the registered
      # class lists.
      def css_classes_match?(node)
        ElementRegistry.css_classes.any?(node.classes)
      end
    end
  end
end
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "json"
|
|
4
|
+
require "fileutils"
|
|
5
|
+
|
|
6
|
+
module Obp
  class Access
    # Fetches every retrievable deliverable of a catalog as XML, one file per
    # language, into +output_dir+, and records the outcome of each
    # deliverable in a JSON manifest so that later runs skip deliverables
    # that already have a manifest entry.
    class Retriever
      MANIFEST_FILE = "manifest.json"

      attr_reader :output_dir, :catalog, :concurrency

      # @param output_dir [String] created if missing
      # @param catalog must respond to +retrievable+, returning deliverables
      #   that respond to +id+, +reference+, +languages+ and +to_urn+
      # @param concurrency [Integer] number of worker threads
      def initialize(output_dir:, catalog:, concurrency: 4)
        @output_dir = output_dir
        @catalog = catalog
        @concurrency = concurrency
        # Workers run in threads and all update the same manifest.
        @manifest_lock = Mutex.new
      end

      # Retrieves everything that is not yet in the manifest, printing
      # progress to stdout. Failures of individual deliverables are recorded
      # and reported but do not stop the run.
      #
      # @return [nil]
      def run
        FileUtils.mkdir_p(output_dir)
        pending = pending_deliverables
        total = pending.size

        if total.zero?
          puts "Nothing to retrieve — all #{catalog.retrievable.size} deliverables already fetched."
          return
        end

        puts "Retrieving #{total} deliverables to #{output_dir} (concurrency: #{concurrency})..."
        process_all(pending, total)
        puts "Done. Fetched #{total} documents."
      end

      private

      # NOTE(review): any manifest entry excludes a deliverable, including
      # entries with status "failed", so failures are not retried on the next
      # run — confirm that this is intended.
      def pending_deliverables
        catalog.retrievable.reject { |d| manifest.key?(d.id.to_s) }
      end

      def process_all(pending, total)
        Parallel.each_with_index(pending, in_threads: concurrency) do |deliverable, i|
          process_one(deliverable, i + 1, total)
        end
      end

      # Fetches all languages of one deliverable; any StandardError marks the
      # whole deliverable as failed.
      def process_one(deliverable, index, total)
        deliverable.languages.each { |lang| fetch_and_save(deliverable, lang) }
        record_success(deliverable)
        puts "[#{index}/#{total}] #{deliverable.reference} — OK"
      rescue StandardError => e
        record_failure(deliverable, e)
        puts "[#{index}/#{total}] #{deliverable.reference} — FAILED: #{e.message}"
      end

      # Writes "<output_dir>/<reference with / : and whitespace replaced by
      # ->/<language>.xml".
      def fetch_and_save(deliverable, language)
        urn = deliverable.to_urn(language:)
        xml = Access.fetch(urn.to_s).to_xml(pretty: true)

        dir = File.join(output_dir, deliverable.reference.gsub(%r{[/:\s]}, "-"))
        FileUtils.mkdir_p(dir)
        File.write(File.join(dir, "#{language}.xml"), xml)
      end

      def record_success(deliverable)
        record(deliverable,
               "reference" => deliverable.reference,
               "status" => "success",
               "timestamp" => timestamp)
      end

      def record_failure(deliverable, error)
        record(deliverable,
               "reference" => deliverable.reference,
               "status" => "failed",
               "error" => error.message,
               "timestamp" => timestamp)
      end

      # Stores the entry and rewrites the manifest file as one critical
      # section, so concurrent workers can neither mutate the hash while it
      # is being serialized nor interleave their file writes.
      def record(deliverable, entry)
        @manifest_lock.synchronize do
          manifest[deliverable.id.to_s] = entry
          save_manifest
        end
      end

      # Current UTC time as "YYYY-MM-DDTHH:MM:SSZ". Formatted with strftime
      # because Time#iso8601 needs `require "time"` on Ruby versions before
      # 3.4, and this file does not require it.
      def timestamp
        Time.now.utc.strftime("%Y-%m-%dT%H:%M:%SZ")
      end

      # Manifest contents, loaded from disk once (empty when no file exists).
      def manifest
        @manifest ||= begin
          path = File.join(output_dir, MANIFEST_FILE)
          File.exist?(path) ? JSON.parse(File.read(path)) : {}
        end
      end

      def save_manifest
        path = File.join(output_dir, MANIFEST_FILE)
        File.write(path, JSON.pretty_generate(manifest))
      end
    end
  end
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
module Obp
  class Access
    # Value object for a colon-separated URN string. The last segment is
    # exposed as the language and everything before it as the base. Two URNs
    # are equal, and hash alike, when their raw strings are equal.
    class Urn
      attr_reader :raw, :language, :base

      # @param raw [String] the full URN, e.g. "a:b:c" gives base "a:b" and
      #   language "c"
      def initialize(raw)
        @raw = raw
        *leading, trailing = raw.split(":")
        @language = trailing
        @base = leading.join(":")
      end

      # The raw string with every colon replaced by a hyphen (memoized).
      def safe
        @safe ||= raw.gsub(":", "-")
      end

      def to_s
        raw
      end

      # Only another Urn (or subclass instance) with the same raw string is
      # considered equal.
      def ==(other)
        return false unless other.is_a?(self.class)

        raw == other.raw
      end
      alias eql? ==

      # Consistent with #eql? so instances can be used as Hash keys.
      def hash
        raw.hash
      end
    end
  end
end
|