kreuzberg 4.3.5-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +543 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +260 -0
- data/README.md +399 -0
- data/Rakefile +34 -0
- data/Steepfile +51 -0
- data/examples/async_patterns.rb +283 -0
- data/extconf.rb +60 -0
- data/kreuzberg.gemspec +253 -0
- data/lib/kreuzberg/api_proxy.rb +125 -0
- data/lib/kreuzberg/cache_api.rb +67 -0
- data/lib/kreuzberg/cli.rb +57 -0
- data/lib/kreuzberg/cli_proxy.rb +118 -0
- data/lib/kreuzberg/config.rb +1241 -0
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/document_structure.rb +204 -0
- data/lib/kreuzberg/error_context.rb +136 -0
- data/lib/kreuzberg/errors.rb +116 -0
- data/lib/kreuzberg/extraction_api.rb +329 -0
- data/lib/kreuzberg/mcp_proxy.rb +176 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
- data/lib/kreuzberg/post_processor_protocol.rb +15 -0
- data/lib/kreuzberg/result.rb +712 -0
- data/lib/kreuzberg/setup_lib_path.rb +99 -0
- data/lib/kreuzberg/types.rb +414 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +102 -0
- data/lib/kreuzberg_rb.so +0 -0
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +1337 -0
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +677 -0
- data/spec/binding/batch_spec.rb +360 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -0
- data/spec/binding/config_validation_spec.rb +377 -0
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -0
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +732 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1253 -0
- data/spec/binding/pages_extraction_spec.rb +550 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +273 -0
- data/spec/binding/tables_spec.rb +650 -0
- data/spec/fixtures/config.toml +38 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +3 -0
- data/spec/serialization_spec.rb +134 -0
- data/spec/smoke/package_spec.rb +177 -0
- data/spec/spec_helper.rb +40 -0
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +434 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- metadata +292 -0
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
begin
|
|
4
|
+
require 'json'
|
|
5
|
+
rescue LoadError
|
|
6
|
+
require 'json/pure'
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
module Kreuzberg
|
|
10
|
+
class Result
|
|
11
|
+
# Djot structured content representation
|
|
12
|
+
#
|
|
13
|
+
# Represents document content in Djot format with structured metadata about
|
|
14
|
+
# blocks, images, links, footnotes, and other document elements.
|
|
15
|
+
#
|
|
16
|
+
class DjotContent
|
|
17
|
+
attr_reader :plain_text, :blocks, :metadata_json, :tables, :images, :links, :footnotes, :attributes
|
|
18
|
+
|
|
19
|
+
# Represents a formatted block in Djot content
|
|
20
|
+
class FormattedBlock
|
|
21
|
+
attr_reader :block_type, :children, :attributes, :content, :level
|
|
22
|
+
|
|
23
|
+
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
24
|
+
def initialize(hash_or_type = nil, children: nil, attributes: nil, content: nil, level: nil, block_type: nil)
|
|
25
|
+
if hash_or_type.is_a?(Hash)
|
|
26
|
+
# Initialize from hash
|
|
27
|
+
@block_type = hash_or_type[:block_type] || hash_or_type['block_type'] || ''
|
|
28
|
+
@children = hash_or_type[:children] || hash_or_type['children']
|
|
29
|
+
@attributes = hash_or_type[:attributes] || hash_or_type['attributes'] || {}
|
|
30
|
+
@content = hash_or_type[:content] || hash_or_type['content']
|
|
31
|
+
@level = hash_or_type[:level] || hash_or_type['level']
|
|
32
|
+
else
|
|
33
|
+
# Initialize from keyword arguments (for backward compatibility)
|
|
34
|
+
@block_type = block_type || hash_or_type || ''
|
|
35
|
+
@children = children || []
|
|
36
|
+
@attributes = attributes || {}
|
|
37
|
+
@content = content
|
|
38
|
+
@level = level
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
42
|
+
|
|
43
|
+
def to_h
|
|
44
|
+
{
|
|
45
|
+
block_type: @block_type,
|
|
46
|
+
children: @children,
|
|
47
|
+
attributes: @attributes,
|
|
48
|
+
content: @content,
|
|
49
|
+
level: @level
|
|
50
|
+
}.compact
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
# Represents an image in Djot content
|
|
55
|
+
class DjotImage
|
|
56
|
+
attr_reader :url, :alt, :title, :width, :height
|
|
57
|
+
alias src url
|
|
58
|
+
|
|
59
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
60
|
+
def initialize(hash_or_url = nil, alt: nil, title: nil, width: nil, height: nil, url: nil, src: nil)
|
|
61
|
+
if hash_or_url.is_a?(Hash)
|
|
62
|
+
# Initialize from hash (supports both 'url' and 'src' keys)
|
|
63
|
+
@url = hash_or_url[:url] || hash_or_url['url'] || hash_or_url[:src] || hash_or_url['src']
|
|
64
|
+
@alt = hash_or_url[:alt] || hash_or_url['alt']
|
|
65
|
+
@title = hash_or_url[:title] || hash_or_url['title']
|
|
66
|
+
@width = hash_or_url[:width] || hash_or_url['width']
|
|
67
|
+
@height = hash_or_url[:height] || hash_or_url['height']
|
|
68
|
+
else
|
|
69
|
+
# Initialize from keyword arguments
|
|
70
|
+
@url = url || src || hash_or_url
|
|
71
|
+
@alt = alt
|
|
72
|
+
@title = title
|
|
73
|
+
@width = width
|
|
74
|
+
@height = height
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
78
|
+
|
|
79
|
+
def to_h
|
|
80
|
+
{
|
|
81
|
+
url: @url,
|
|
82
|
+
alt: @alt,
|
|
83
|
+
title: @title,
|
|
84
|
+
width: @width,
|
|
85
|
+
height: @height
|
|
86
|
+
}.compact
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
# Represents a link in Djot content
|
|
91
|
+
class DjotLink
|
|
92
|
+
attr_reader :url, :text, :title, :link_type
|
|
93
|
+
alias href url
|
|
94
|
+
|
|
95
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
96
|
+
def initialize(hash_or_url = nil, text: nil, title: nil, url: nil, href: nil, link_type: nil)
|
|
97
|
+
if hash_or_url.is_a?(Hash)
|
|
98
|
+
# Initialize from hash (supports both 'url' and 'href' keys)
|
|
99
|
+
@url = hash_or_url[:url] || hash_or_url['url'] || hash_or_url[:href] || hash_or_url['href']
|
|
100
|
+
@text = hash_or_url[:text] || hash_or_url['text']
|
|
101
|
+
@title = hash_or_url[:title] || hash_or_url['title']
|
|
102
|
+
@link_type = hash_or_url[:link_type] || hash_or_url['link_type']
|
|
103
|
+
else
|
|
104
|
+
# Initialize from keyword arguments
|
|
105
|
+
@url = url || href || hash_or_url
|
|
106
|
+
@text = text
|
|
107
|
+
@title = title
|
|
108
|
+
@link_type = link_type
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
112
|
+
|
|
113
|
+
def to_h
|
|
114
|
+
{
|
|
115
|
+
url: @url,
|
|
116
|
+
text: @text,
|
|
117
|
+
title: @title,
|
|
118
|
+
link_type: @link_type
|
|
119
|
+
}.compact
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
# Represents a footnote in Djot content
|
|
124
|
+
class Footnote
|
|
125
|
+
attr_reader :label, :content
|
|
126
|
+
|
|
127
|
+
def initialize(label:, content:)
|
|
128
|
+
@label = label
|
|
129
|
+
@content = content
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
def to_h
|
|
133
|
+
{
|
|
134
|
+
label: @label,
|
|
135
|
+
content: @content
|
|
136
|
+
}
|
|
137
|
+
end
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
141
|
+
def initialize(hash)
|
|
142
|
+
@plain_text = hash['plain_text'] || hash[:plain_text] || ''
|
|
143
|
+
@blocks = parse_blocks(hash['blocks'] || hash[:blocks] || [])
|
|
144
|
+
@metadata_json = hash['metadata_json'] || hash[:metadata_json] || '{}'
|
|
145
|
+
@tables = hash['tables'] || hash[:tables] || []
|
|
146
|
+
@images = parse_images(hash['images'] || hash[:images] || [])
|
|
147
|
+
@links = parse_links(hash['links'] || hash[:links] || [])
|
|
148
|
+
@footnotes = parse_footnotes(hash['footnotes'] || hash[:footnotes] || [])
|
|
149
|
+
@attributes = hash['attributes'] || hash[:attributes] || {}
|
|
150
|
+
end
|
|
151
|
+
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
152
|
+
|
|
153
|
+
def metadata
|
|
154
|
+
@metadata ||= parse_metadata(@metadata_json)
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
def to_h
|
|
158
|
+
{
|
|
159
|
+
plain_text: @plain_text,
|
|
160
|
+
blocks: @blocks.map(&:to_h),
|
|
161
|
+
metadata_json: @metadata_json,
|
|
162
|
+
tables: @tables,
|
|
163
|
+
images: @images.map(&:to_h),
|
|
164
|
+
links: @links.map(&:to_h),
|
|
165
|
+
footnotes: @footnotes.map(&:to_h),
|
|
166
|
+
attributes: @attributes
|
|
167
|
+
}
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
private
|
|
171
|
+
|
|
172
|
+
def parse_metadata(metadata_json)
|
|
173
|
+
JSON.parse(metadata_json)
|
|
174
|
+
rescue JSON::ParserError
|
|
175
|
+
{}
|
|
176
|
+
end
|
|
177
|
+
|
|
178
|
+
def parse_blocks(blocks_data)
|
|
179
|
+
blocks_data.map do |block|
|
|
180
|
+
FormattedBlock.new(
|
|
181
|
+
block_type: block['block_type'] || block[:block_type] || '',
|
|
182
|
+
children: block['children'] || block[:children],
|
|
183
|
+
attributes: block['attributes'] || block[:attributes]
|
|
184
|
+
)
|
|
185
|
+
end
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
189
|
+
def parse_images(images_data)
|
|
190
|
+
images_data.map do |image|
|
|
191
|
+
DjotImage.new(
|
|
192
|
+
url: image['url'] || image[:url] || image['src'] || image[:src],
|
|
193
|
+
alt: image['alt'] || image[:alt],
|
|
194
|
+
title: image['title'] || image[:title],
|
|
195
|
+
width: image['width'] || image[:width],
|
|
196
|
+
height: image['height'] || image[:height]
|
|
197
|
+
)
|
|
198
|
+
end
|
|
199
|
+
end
|
|
200
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
201
|
+
|
|
202
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
203
|
+
def parse_links(links_data)
|
|
204
|
+
links_data.map do |link|
|
|
205
|
+
DjotLink.new(
|
|
206
|
+
url: link['url'] || link[:url] || link['href'] || link[:href],
|
|
207
|
+
text: link['text'] || link[:text],
|
|
208
|
+
title: link['title'] || link[:title],
|
|
209
|
+
link_type: link['link_type'] || link[:link_type]
|
|
210
|
+
)
|
|
211
|
+
end
|
|
212
|
+
end
|
|
213
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
214
|
+
|
|
215
|
+
def parse_footnotes(footnotes_data)
|
|
216
|
+
footnotes_data.map do |note|
|
|
217
|
+
Footnote.new(
|
|
218
|
+
label: note['label'] || note[:label],
|
|
219
|
+
content: note['content'] || note[:content]
|
|
220
|
+
)
|
|
221
|
+
end
|
|
222
|
+
end
|
|
223
|
+
end
|
|
224
|
+
end
|
|
225
|
+
end
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kreuzberg
|
|
4
|
+
class Result
|
|
5
|
+
# Structured document representation.
|
|
6
|
+
#
|
|
7
|
+
# Provides a hierarchical, tree-based representation of document content
|
|
8
|
+
# using a flat array of nodes with index-based parent/child references.
|
|
9
|
+
#
|
|
10
|
+
# @example
|
|
11
|
+
# if result.document
|
|
12
|
+
# result.document.nodes.each do |node|
|
|
13
|
+
# puts "#{node.id}: #{node.content[0..50]}"
|
|
14
|
+
# end
|
|
15
|
+
# end
|
|
16
|
+
#
|
|
17
|
+
class DocumentStructure
  attr_reader :nodes

  # Build the structure from a Hash carrying a 'nodes' (or :nodes) list;
  # the string key is consulted first.
  #
  # @param hash [Hash] raw document structure data
  def initialize(hash)
    raw_nodes = hash['nodes'] || hash[:nodes] || []
    @nodes = parse_nodes(raw_nodes)
  end

  # Serialize the structure.
  #
  # @return [Hash] +{ nodes: [...] }+ with each node converted via +to_h+
  def to_h
    serialized = @nodes.map(&:to_h)
    { nodes: serialized }
  end

  private

  # Wrap every raw entry in a DocumentNode; nil or empty input yields [].
  def parse_nodes(nodes_data)
    nothing_to_parse = nodes_data.nil? || nodes_data.empty?
    nothing_to_parse ? [] : nodes_data.map { |entry| DocumentNode.new(entry) }
  end
end
|
|
40
|
+
|
|
41
|
+
# Single node in the document structure tree.
|
|
42
|
+
#
|
|
43
|
+
# Represents a logical unit of content with deterministic ID, content,
|
|
44
|
+
# tree structure information, and metadata.
|
|
45
|
+
#
|
|
46
|
+
class DocumentNode
  attr_reader :id, :content, :parent, :children, :content_layer, :page, :page_end, :bbox, :annotations

  # Build a node from a Hash with string or symbol keys (string keys win).
  #
  # Defaults: +id+ '', +content+ {}, +content_layer+ 'body', +children+ [],
  # +annotations+ []; +parent+, +page+, +page_end+ and +bbox+ stay nil when absent.
  #
  # @param hash [Hash] raw node data
  def initialize(hash)
    assign_core_fields(hash)
    assign_tree_fields(hash)
    assign_metadata_fields(hash)
  end

  # Convert to hash
  #
  # Must stay above the +private+ marker: DocumentStructure#to_h calls this
  # method on every node, which fails with NoMethodError when it is private.
  #
  # @return [Hash] Hash representation; nil-valued fields are omitted
  #
  def to_h
    {
      id: @id,
      content: @content,
      parent: @parent,
      children: @children,
      content_layer: @content_layer,
      page: @page,
      page_end: @page_end,
      bbox: @bbox&.to_h,
      annotations: @annotations.map(&:to_h)
    }.compact
  end

  private

  # Identity and payload fields.
  def assign_core_fields(hash)
    @id = hash['id'] || hash[:id] || ''
    @content = hash['content'] || hash[:content] || {}
    @content_layer = hash['content_layer'] || hash[:content_layer] || 'body'
  end

  # Parent/child references.
  def assign_tree_fields(hash)
    @parent = hash['parent'] || hash[:parent]
    @children = parse_children(hash['children'] || hash[:children] || [])
  end

  # Page range, bounding box and annotations.
  def assign_metadata_fields(hash)
    @page = hash['page'] || hash[:page]
    @page_end = hash['page_end'] || hash[:page_end]
    @bbox = parse_bbox(hash['bbox'] || hash[:bbox])
    @annotations = parse_annotations(hash['annotations'] || hash[:annotations] || [])
  end

  # Normalize the children list to plain indices; anything that is not a
  # non-empty Array yields [].
  def parse_children(children_data)
    return [] if children_data.nil? || children_data.empty?
    return [] unless children_data.is_a?(Array)

    children_data.map { |child| extract_child_index(child) }
  end

  # A child is either a bare Integer or a Hash-like entry with an 'index' / :index key.
  def extract_child_index(child)
    return child if child.is_a?(Integer)

    child['index'] || child[:index]
  end

  # @return [DocumentBoundingBox, nil] nil when no bbox data was supplied
  def parse_bbox(bbox_data)
    return nil if bbox_data.nil?

    DocumentBoundingBox.new(bbox_data)
  end

  # Wrap every raw annotation in a DocumentAnnotation; nil or empty input yields [].
  def parse_annotations(annotations_data)
    return [] if annotations_data.nil? || annotations_data.empty?

    annotations_data.map { |ann| DocumentAnnotation.new(ann) }
  end
end
|
|
123
|
+
|
|
124
|
+
# Bounding box for document node positioning.
|
|
125
|
+
#
|
|
126
|
+
# Represents rectangular coordinates for a node within the document.
|
|
127
|
+
#
|
|
128
|
+
class DocumentBoundingBox
  attr_reader :x0, :y0, :x1, :y1

  # Read the four corner coordinates from a Hash with string or symbol keys
  # (string keys win). Present values are converted with +to_f+; missing ones stay nil.
  #
  # @param hash [Hash] raw bounding box data
  def initialize(hash)
    @x0, @y0, @x1, @y1 = %w[x0 y0 x1 y1].map { |name| extract_float(hash, name) }
  end

  # Serialize the box.
  #
  # @return [Hash] symbol-keyed coordinates; nil coordinates are omitted
  def to_h
    { x0: @x0, y0: @y0, x1: @x1, y1: @y1 }.compact
  end

  private

  # Look the key up as a String first, then as a Symbol; nil-safe float conversion.
  def extract_float(hash, key)
    raw = hash[key] || hash[key.to_sym]
    raw&.to_f
  end
end
|
|
157
|
+
|
|
158
|
+
# Annotation for a document node.
|
|
159
|
+
#
|
|
160
|
+
# Represents inline text annotations (formatting, links) with byte-range
|
|
161
|
+
# references into the node's text content.
|
|
162
|
+
#
|
|
163
|
+
class DocumentAnnotation
  attr_reader :start, :end_offset, :annotation_type, :url, :title

  # Build an annotation from a Hash with string or symbol keys (string keys win).
  # 'start' and 'end' default to 0 and are converted with +to_i+; 'kind' is a
  # nested Hash that may carry 'annotation_type', 'url' and 'title'.
  #
  # @param hash [Hash] raw annotation data
  def initialize(hash)
    @start = (hash['start'] || hash[:start] || 0).to_i
    @end_offset = (hash['end'] || hash[:end] || 0).to_i
    kind_data = hash['kind'] || hash[:kind] || {}
    parse_kind(kind_data)
  end

  # Serialize the annotation.
  #
  # @return [Hash] +{ start:, end:, kind: }+ where +kind+ always contains
  #   +annotation_type+ and contains +url+ / +title+ only when they are set
  def to_h
    optional = { url: @url, title: @title }.select { |_name, value| value }

    {
      start: @start,
      end: @end_offset,
      kind: { annotation_type: @annotation_type }.merge(optional)
    }
  end

  private

  # Populate the kind-related fields. A nil or empty kind leaves all three
  # unset (nil); otherwise a missing annotation_type falls back to 'bold'.
  def parse_kind(kind_hash)
    return if kind_hash.nil? || kind_hash.empty?

    read = ->(name) { kind_hash[name] || kind_hash[name.to_sym] }

    @annotation_type = read.call('annotation_type') || 'bold'
    @url = read.call('url')
    @title = read.call('title')
  end
end
|
|
203
|
+
end
|
|
204
|
+
end
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module Kreuzberg
|
|
6
|
+
module ErrorContext
  class << self
    # Get the error code recorded for the last operation.
    #
    # @return [Integer] Error code constant (ERROR_CODE_* values), or 0 on success;
    #   0 is also returned when the native call itself raises
    def last_error_code
      with_fallback(0) { Kreuzberg._last_error_code_native }
    end

    # Get panic context information from the last error.
    #
    # Returns a {Errors::PanicContext} object containing detailed information about
    # the last panic that occurred in the Rust core. Includes file path, line number,
    # function name, error message, and timestamp.
    #
    # @return [Errors::PanicContext, nil] Panic context if a panic occurred, nil otherwise
    #   (also nil when the native call or the JSON conversion raises)
    #
    # @example Get panic details
    #   panic = Kreuzberg::ErrorContext.last_panic_context
    #   if panic
    #     puts "Panic at #{panic.file}:#{panic.line} in #{panic.function}"
    #     puts "Message: #{panic.message}"
    #     puts "Time: #{panic.timestamp_secs}"
    #   end
    def last_panic_context
      with_fallback(nil) do
        raw_json = Kreuzberg._last_panic_context_json_native
        raw_json ? Errors::PanicContext.from_json(raw_json) : nil
      end
    end

    # Get panic context as raw JSON string.
    #
    # Returns the panic context information as a JSON string for raw access or
    # custom parsing. Returns nil if no panic has occurred.
    #
    # @return [String, nil] JSON-serialized panic context, or nil if no panic
    #
    # @example Get raw JSON panic context
    #   json = Kreuzberg::ErrorContext.last_panic_context_json
    #   if json
    #     panic_data = JSON.parse(json)
    #     puts panic_data
    #   end
    def last_panic_context_json
      with_fallback(nil) { Kreuzberg._last_panic_context_json_native }
    end

    # Get detailed error information from the last operation.
    #
    # Returns comprehensive error details including message, code, type, source location,
    # and panic information.
    #
    # @return [Hash] Hash with keys: :message, :error_code, :error_type, :source_file,
    #   :source_function, :source_line, :context_info, :is_panic; an empty Hash when
    #   the native call raises
    #
    # @example Get error details
    #   details = Kreuzberg::ErrorContext.error_details
    #   puts "Error: #{details[:message]}"
    #   puts "Code: #{details[:error_code]}"
    #   puts "Type: #{details[:error_type]}"
    def error_details
      with_fallback({}) { Kreuzberg._get_error_details_native }
    end

    # Classify an error message into a Kreuzberg error code.
    #
    # Analyzes an error message and returns the most likely error code (0-7).
    # Useful for converting third-party error messages into Kreuzberg categories.
    #
    # @param message [String] The error message to classify
    # @return [Integer] Error code (0-7); 7 when the native call raises
    #
    # Error code mapping:
    # - 0: Validation
    # - 1: Parsing
    # - 2: OCR
    # - 3: MissingDependency
    # - 4: IO
    # - 5: Plugin
    # - 6: UnsupportedFormat
    # - 7: Internal
    #
    # @example Classify an error
    #   code = Kreuzberg::ErrorContext.classify_error("File not found")
    #   if code == 4
    #     puts "This is an I/O error"
    #   end
    def classify_error(message)
      with_fallback(7) { Kreuzberg._classify_error_native(message) }
    end

    # Get the human-readable name of an error code.
    #
    # @param code [Integer] Numeric error code (0-7)
    # @return [String] Human-readable error code name (e.g., "validation", "io");
    #   "unknown" when the native call raises
    #
    # @example Get error code name
    #   name = Kreuzberg::ErrorContext.error_code_name(0)
    #   puts name  # => "validation"
    def error_code_name(code)
      with_fallback('unknown') { Kreuzberg._error_code_name_native(code) }
    end

    # Get the description of an error code.
    #
    # @param code [Integer] Numeric error code (0-7)
    # @return [String] Description of the error code; "Unknown error code" when
    #   the native call raises
    #
    # @example Get error code description
    #   desc = Kreuzberg::ErrorContext.error_code_description(0)
    #   puts desc  # => "Input validation error"
    def error_code_description(code)
      with_fallback('Unknown error code') { Kreuzberg._error_code_description_native(code) }
    end

    private

    # Run the block and return its value; any StandardError is swallowed and
    # +default+ is returned instead, so these diagnostics helpers never raise.
    def with_fallback(default)
      yield
    rescue StandardError
      default
    end
  end
end
|
|
136
|
+
end
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module Kreuzberg
|
|
6
|
+
# Numeric error codes. ErrorContext.last_error_code documents its return value
# as one of these ERROR_CODE_* constants.
#
# NOTE(review): these values differ from the 0-7 mapping documented on
# ErrorContext.classify_error (0 = Validation ... 7 = Internal) - confirm
# whether the two are intentionally separate code spaces.
ERROR_CODE_SUCCESS            = 0
ERROR_CODE_GENERIC            = 1
ERROR_CODE_PANIC              = 2
ERROR_CODE_INVALID_ARGUMENT   = 3
ERROR_CODE_IO                 = 4
ERROR_CODE_PARSING            = 5
ERROR_CODE_OCR                = 6
ERROR_CODE_MISSING_DEPENDENCY = 7
|
|
14
|
+
|
|
15
|
+
module Errors
  # Details of a panic: source location, function name, message and timestamp.
  class PanicContext
    attr_reader :file, :line, :function, :message, :timestamp_secs

    # @param file [String] source file reported for the panic
    # @param line [Integer] line number reported for the panic
    # @param function [String] function name reported for the panic
    # @param message [String] panic message
    # @param timestamp_secs [Integer] timestamp value, stored unchanged
    def initialize(file:, line:, function:, message:, timestamp_secs:)
      @file = file
      @line = line
      @function = function
      @message = message
      @timestamp_secs = timestamp_secs
    end

    # @return [String] "file:line:function: message"
    def to_s
      "#{file}:#{line}:#{function}: #{message}"
    end

    # @return [Hash] symbol-keyed representation of all five fields
    def to_h
      {
        file:,
        line:,
        function:,
        message:,
        timestamp_secs:
      }
    end

    # Build a PanicContext from its JSON form.
    #
    # Unknown keys are ignored; missing fields default to '' (strings) or 0 (numbers).
    #
    # @param json_string [String, nil] JSON object text
    # @return [PanicContext, nil] nil for nil/empty input, malformed JSON, or
    #   valid JSON that is not an object
    def self.from_json(json_string)
      return nil if json_string.nil? || json_string.empty?

      data = JSON.parse(json_string, symbolize_names: true)
      # "null", "[]", "42" etc. parse successfully but carry no panic fields;
      # without this guard they raised NoMethodError / ArgumentError below.
      return nil unless data.is_a?(Hash)

      sliced = data.slice(:file, :line, :function, :message, :timestamp_secs)
      new(**with_defaults(sliced))
    rescue JSON::ParserError
      nil
    end

    # Fill in a default for every field that is missing (or nil/false).
    def self.with_defaults(sliced)
      {
        file: sliced[:file] || '',
        line: sliced[:line] || 0,
        function: sliced[:function] || '',
        message: sliced[:message] || '',
        timestamp_secs: sliced[:timestamp_secs] || 0
      }
    end
    private_class_method :with_defaults
  end

  # Base error class for all Kreuzberg errors
  class Error < StandardError
    attr_reader :panic_context, :error_code

    # @param message [String] error message
    # @param panic_context [PanicContext, nil] panic details attached to the error, if any
    # @param error_code [Integer, nil] numeric error code attached to the error, if any
    def initialize(message, panic_context: nil, error_code: nil)
      super(message)
      @panic_context = panic_context
      @error_code = error_code
    end
  end

  # Raised when validation fails
  class ValidationError < Error; end

  # Raised when document parsing fails
  class ParsingError < Error
    attr_reader :context

    # @param context [Object, nil] extra context stored alongside the error
    def initialize(message, context: nil, panic_context: nil, error_code: nil)
      super(message, panic_context:, error_code:)
      @context = context
    end
  end

  # Raised when OCR processing fails
  class OCRError < Error
    attr_reader :context

    # @param context [Object, nil] extra context stored alongside the error
    def initialize(message, context: nil, panic_context: nil, error_code: nil)
      super(message, panic_context:, error_code:)
      @context = context
    end
  end

  # Raised when a required dependency is missing
  class MissingDependencyError < Error
    attr_reader :dependency

    # @param dependency [Object, nil] the missing dependency, stored unchanged
    def initialize(message, dependency: nil, panic_context: nil, error_code: nil)
      super(message, panic_context:, error_code:)
      @dependency = dependency
    end
  end

  # Raised when an I/O operation fails
  # (a distinct class from Ruby's top-level ::IOError; it inherits from Error)
  class IOError < Error; end

  # Raised when plugin operations fail
  class PluginError < Error; end

  # Raised when an unsupported file format or MIME type is encountered
  class UnsupportedFormatError < Error; end
end
|
|
116
|
+
end
|