kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,225 @@
1
+ # frozen_string_literal: true
2
+
3
+ begin
4
+ require 'json'
5
+ rescue LoadError
6
+ require 'json/pure'
7
+ end
8
+
9
+ module Kreuzberg
10
+ class Result
11
+ # Djot structured content representation
12
+ #
13
+ # Represents document content in Djot format with structured metadata about
14
+ # blocks, images, links, footnotes, and other document elements.
15
+ #
16
class DjotContent
  attr_reader :plain_text, :blocks, :metadata_json, :tables, :images, :links, :footnotes, :attributes

  # Represents a formatted block in Djot content
  class FormattedBlock
    attr_reader :block_type, :children, :attributes, :content, :level

    # Builds a block from either a Hash (symbol or string keys) passed as the
    # first argument, or from a block type plus keyword arguments.
    #
    # In the Hash form, +children+ stays nil when the key is absent; in the
    # keyword form it defaults to an empty Array.
    #
    # @param hash_or_type [Hash, String, nil] attribute Hash, or the block type
    # @param children [Array, nil] child blocks (keyword form)
    # @param attributes [Hash, nil] block attributes (keyword form)
    # @param content [Object, nil] block content (keyword form)
    # @param level [Object, nil] block level (keyword form)
    # @param block_type [String, nil] block type; wins over +hash_or_type+
    # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
    def initialize(hash_or_type = nil, children: nil, attributes: nil, content: nil, level: nil, block_type: nil)
      if hash_or_type.is_a?(Hash)
        # Initialize from hash
        @block_type = hash_or_type[:block_type] || hash_or_type['block_type'] || ''
        @children = hash_or_type[:children] || hash_or_type['children']
        @attributes = hash_or_type[:attributes] || hash_or_type['attributes'] || {}
        @content = hash_or_type[:content] || hash_or_type['content']
        @level = hash_or_type[:level] || hash_or_type['level']
      else
        # Initialize from keyword arguments (for backward compatibility)
        @block_type = block_type || hash_or_type || ''
        @children = children || []
        @attributes = attributes || {}
        @content = content
        @level = level
      end
    end
    # rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity

    # @return [Hash] symbol-keyed representation with nil values removed
    def to_h
      {
        block_type: @block_type,
        children: @children,
        attributes: @attributes,
        content: @content,
        level: @level
      }.compact
    end
  end

  # Represents an image in Djot content
  class DjotImage
    attr_reader :url, :alt, :title, :width, :height
    alias src url

    # Builds an image from a Hash (symbol or string keys; 'src' is accepted
    # as a fallback for 'url') or from a URL plus keyword arguments.
    #
    # @param hash_or_url [Hash, String, nil] attribute Hash, or the image URL
    # rubocop:disable Metrics/CyclomaticComplexity
    def initialize(hash_or_url = nil, alt: nil, title: nil, width: nil, height: nil, url: nil, src: nil)
      if hash_or_url.is_a?(Hash)
        # Initialize from hash (supports both 'url' and 'src' keys)
        @url = hash_or_url[:url] || hash_or_url['url'] || hash_or_url[:src] || hash_or_url['src']
        @alt = hash_or_url[:alt] || hash_or_url['alt']
        @title = hash_or_url[:title] || hash_or_url['title']
        @width = hash_or_url[:width] || hash_or_url['width']
        @height = hash_or_url[:height] || hash_or_url['height']
      else
        # Initialize from keyword arguments
        @url = url || src || hash_or_url
        @alt = alt
        @title = title
        @width = width
        @height = height
      end
    end
    # rubocop:enable Metrics/CyclomaticComplexity

    # @return [Hash] symbol-keyed representation with nil values removed
    def to_h
      {
        url: @url,
        alt: @alt,
        title: @title,
        width: @width,
        height: @height
      }.compact
    end
  end

  # Represents a link in Djot content
  class DjotLink
    attr_reader :url, :text, :title, :link_type
    alias href url

    # Builds a link from a Hash (symbol or string keys; 'href' is accepted
    # as a fallback for 'url') or from a URL plus keyword arguments.
    #
    # @param hash_or_url [Hash, String, nil] attribute Hash, or the link URL
    # rubocop:disable Metrics/CyclomaticComplexity
    def initialize(hash_or_url = nil, text: nil, title: nil, url: nil, href: nil, link_type: nil)
      if hash_or_url.is_a?(Hash)
        # Initialize from hash (supports both 'url' and 'href' keys)
        @url = hash_or_url[:url] || hash_or_url['url'] || hash_or_url[:href] || hash_or_url['href']
        @text = hash_or_url[:text] || hash_or_url['text']
        @title = hash_or_url[:title] || hash_or_url['title']
        @link_type = hash_or_url[:link_type] || hash_or_url['link_type']
      else
        # Initialize from keyword arguments
        @url = url || href || hash_or_url
        @text = text
        @title = title
        @link_type = link_type
      end
    end
    # rubocop:enable Metrics/CyclomaticComplexity

    # @return [Hash] symbol-keyed representation with nil values removed
    def to_h
      {
        url: @url,
        text: @text,
        title: @title,
        link_type: @link_type
      }.compact
    end
  end

  # Represents a footnote in Djot content
  class Footnote
    attr_reader :label, :content

    # @param label [Object] footnote label
    # @param content [Object] footnote content
    def initialize(label:, content:)
      @label = label
      @content = content
    end

    # @return [Hash] symbol-keyed representation (nil values are kept)
    def to_h
      {
        label: @label,
        content: @content
      }
    end
  end

  # Builds the content object from a Hash with string or symbol keys.
  # Missing collections default to empty; a missing 'metadata_json'
  # defaults to the string '{}'.
  #
  # @param hash [Hash] raw Djot content payload
  # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
  def initialize(hash)
    @plain_text = hash['plain_text'] || hash[:plain_text] || ''
    @blocks = parse_blocks(hash['blocks'] || hash[:blocks] || [])
    @metadata_json = hash['metadata_json'] || hash[:metadata_json] || '{}'
    @tables = hash['tables'] || hash[:tables] || []
    @images = parse_images(hash['images'] || hash[:images] || [])
    @links = parse_links(hash['links'] || hash[:links] || [])
    @footnotes = parse_footnotes(hash['footnotes'] || hash[:footnotes] || [])
    @attributes = hash['attributes'] || hash[:attributes] || {}
  end
  # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity

  # Parsed form of +metadata_json+, memoized after the first call.
  #
  # @return [Object] parsed JSON, or an empty Hash when the JSON is invalid
  def metadata
    @metadata ||= parse_metadata(@metadata_json)
  end

  # @return [Hash] symbol-keyed representation; nested blocks, images, links
  #   and footnotes are converted with their own +to_h+
  def to_h
    {
      plain_text: @plain_text,
      blocks: @blocks.map(&:to_h),
      metadata_json: @metadata_json,
      tables: @tables,
      images: @images.map(&:to_h),
      links: @links.map(&:to_h),
      footnotes: @footnotes.map(&:to_h),
      attributes: @attributes
    }
  end

  private

  def parse_metadata(metadata_json)
    JSON.parse(metadata_json)
  rescue JSON::ParserError
    {}
  end

  # Converts raw block hashes into FormattedBlock objects.
  # 'content' and 'level' are forwarded as well, so the corresponding
  # readers on FormattedBlock reflect the source data instead of being nil.
  # rubocop:disable Metrics/CyclomaticComplexity
  def parse_blocks(blocks_data)
    blocks_data.map do |block|
      FormattedBlock.new(
        block_type: block['block_type'] || block[:block_type] || '',
        children: block['children'] || block[:children],
        attributes: block['attributes'] || block[:attributes],
        content: block['content'] || block[:content],
        level: block['level'] || block[:level]
      )
    end
  end
  # rubocop:enable Metrics/CyclomaticComplexity

  # rubocop:disable Metrics/CyclomaticComplexity
  def parse_images(images_data)
    images_data.map do |image|
      DjotImage.new(
        url: image['url'] || image[:url] || image['src'] || image[:src],
        alt: image['alt'] || image[:alt],
        title: image['title'] || image[:title],
        width: image['width'] || image[:width],
        height: image['height'] || image[:height]
      )
    end
  end
  # rubocop:enable Metrics/CyclomaticComplexity

  # rubocop:disable Metrics/CyclomaticComplexity
  def parse_links(links_data)
    links_data.map do |link|
      DjotLink.new(
        url: link['url'] || link[:url] || link['href'] || link[:href],
        text: link['text'] || link[:text],
        title: link['title'] || link[:title],
        link_type: link['link_type'] || link[:link_type]
      )
    end
  end
  # rubocop:enable Metrics/CyclomaticComplexity

  def parse_footnotes(footnotes_data)
    footnotes_data.map do |note|
      Footnote.new(
        label: note['label'] || note[:label],
        content: note['content'] || note[:content]
      )
    end
  end
end
224
+ end
225
+ end
@@ -0,0 +1,204 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kreuzberg
4
+ class Result
5
+ # Structured document representation.
6
+ #
7
+ # Provides a hierarchical, tree-based representation of document content
8
+ # using a flat array of nodes with index-based parent/child references.
9
+ #
10
+ # @example
11
+ # if result.document
12
+ # result.document.nodes.each do |node|
13
+ # puts "#{node.id}: #{node.content[0..50]}"
14
+ # end
15
+ # end
16
+ #
17
class DocumentStructure
  attr_reader :nodes

  # @param hash [Hash] payload whose 'nodes' / :nodes entry lists the raw
  #   node hashes; a missing entry is treated as an empty list
  def initialize(hash)
    raw_nodes = hash['nodes'] || hash[:nodes] || []
    @nodes = parse_nodes(raw_nodes)
  end

  # Serialize the structure.
  #
  # @return [Hash] a Hash with a single :nodes key holding each node's +to_h+
  def to_h
    serialized_nodes = @nodes.map(&:to_h)
    { nodes: serialized_nodes }
  end

  private

  # Wraps every raw node hash in a DocumentNode; nil or empty input gives [].
  def parse_nodes(nodes_data)
    nothing_to_parse = nodes_data.nil? || nodes_data.empty?
    nothing_to_parse ? [] : nodes_data.map { |raw_node| DocumentNode.new(raw_node) }
  end
end
40
+
41
+ # Single node in the document structure tree.
42
+ #
43
+ # Represents a logical unit of content with deterministic ID, content,
44
+ # tree structure information, and metadata.
45
+ #
46
class DocumentNode
  attr_reader :id, :content, :parent, :children, :content_layer, :page, :page_end, :bbox, :annotations

  # @param hash [Hash] raw node data with string or symbol keys
  def initialize(hash)
    assign_core_fields(hash)
    assign_tree_fields(hash)
    assign_metadata_fields(hash)
  end

  # Convert to hash
  #
  # Kept in the public section: DocumentStructure#to_h serializes its nodes
  # with +map(&:to_h)+, which cannot reach a private method.
  #
  # @return [Hash] Hash representation with nil values removed
  #
  def to_h
    {
      id: @id,
      content: @content,
      parent: @parent,
      children: @children,
      content_layer: @content_layer,
      page: @page,
      page_end: @page_end,
      bbox: @bbox&.to_h,
      annotations: @annotations.map(&:to_h)
    }.compact
  end

  private

  # Identity and payload; defaults: id '', content {}, content_layer 'body'.
  def assign_core_fields(hash)
    @id = hash['id'] || hash[:id] || ''
    @content = hash['content'] || hash[:content] || {}
    @content_layer = hash['content_layer'] || hash[:content_layer] || 'body'
  end

  # Parent reference and the list of child indices.
  def assign_tree_fields(hash)
    @parent = hash['parent'] || hash[:parent]
    @children = parse_children(hash['children'] || hash[:children] || [])
  end

  # Page range, bounding box and annotations.
  def assign_metadata_fields(hash)
    @page = hash['page'] || hash[:page]
    @page_end = hash['page_end'] || hash[:page_end]
    @bbox = parse_bbox(hash['bbox'] || hash[:bbox])
    @annotations = parse_annotations(hash['annotations'] || hash[:annotations] || [])
  end

  # Normalizes children to a list of indices; non-Array input yields [].
  def parse_children(children_data)
    return [] if children_data.nil? || children_data.empty?

    if children_data.is_a?(Array)
      children_data.map { |c| extract_child_index(c) }
    else
      []
    end
  end

  # A child is either a bare Integer index or a Hash carrying 'index'/:index.
  def extract_child_index(child)
    if child.is_a?(Integer)
      child
    else
      child['index'] || child[:index]
    end
  end

  def parse_bbox(bbox_data)
    return nil if bbox_data.nil?

    DocumentBoundingBox.new(bbox_data)
  end

  def parse_annotations(annotations_data)
    return [] if annotations_data.nil? || annotations_data.empty?

    annotations_data.map { |ann| DocumentAnnotation.new(ann) }
  end
end
123
+
124
+ # Bounding box for document node positioning.
125
+ #
126
+ # Represents rectangular coordinates for a node within the document.
127
+ #
128
class DocumentBoundingBox
  attr_reader :x0, :y0, :x1, :y1

  # @param hash [Hash] coordinates keyed by 'x0', 'y0', 'x1', 'y1'
  #   (string or symbol keys); absent coordinates stay nil
  def initialize(hash)
    @x0, @y0, @x1, @y1 = %w[x0 y0 x1 y1].map { |name| extract_float(hash, name) }
  end

  # Serialize the box.
  #
  # @return [Hash] the coordinates that are set, keyed by symbol
  #
  def to_h
    corners = { x0: @x0, y0: @y0, x1: @x1, y1: @y1 }
    corners.compact
  end

  private

  # Looks the key up as a String first, then as a Symbol, and converts a
  # found value with +to_f+.
  def extract_float(hash, key)
    raw = hash[key] || hash[key.to_sym]
    raw&.to_f
  end
end
157
+
158
+ # Annotation for a document node.
159
+ #
160
+ # Represents inline text annotations (formatting, links) with byte-range
161
+ # references into the node's text content.
162
+ #
163
class DocumentAnnotation
  attr_reader :start, :end_offset, :annotation_type, :url, :title

  # @param hash [Hash] raw annotation with 'start', 'end' and 'kind' entries
  #   (string or symbol keys); offsets default to 0
  def initialize(hash)
    raw_start = hash['start'] || hash[:start] || 0
    raw_end = hash['end'] || hash[:end] || 0
    @start = raw_start.to_i
    @end_offset = raw_end.to_i
    parse_kind(hash['kind'] || hash[:kind] || {})
  end

  # Serialize the annotation.
  #
  # @return [Hash] :start, :end and a :kind Hash that always carries
  #   :annotation_type and carries :url / :title only when they are set
  #
  def to_h
    optional = { url: @url, title: @title }.select { |_name, value| value }
    {
      start: @start,
      end: @end_offset,
      kind: { annotation_type: @annotation_type }.merge(optional)
    }
  end

  private

  # Reads the annotation type, url and title from the kind Hash. A nil or
  # empty kind leaves all three unset; a non-empty kind without a type
  # falls back to 'bold'.
  def parse_kind(kind_hash)
    return if kind_hash.nil? || kind_hash.empty?

    lookup = ->(name) { kind_hash[name] || kind_hash[name.to_sym] }
    @annotation_type = lookup.call('annotation_type') || 'bold'
    @url = lookup.call('url')
    @title = lookup.call('title')
  end
end
203
+ end
204
+ end
@@ -0,0 +1,136 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
+ module Kreuzberg
6
module ErrorContext
  # Runs the block and returns its value. Any StandardError raised inside
  # the block is swallowed and +fallback+ is returned instead.
  def self.with_fallback(fallback)
    yield
  rescue StandardError
    fallback
  end
  private_class_method :with_fallback

  # Code of the most recent error.
  #
  # @return [Integer] Error code constant (ERROR_CODE_* values), or 0 on success
  #   or when the native call itself fails
  #
  # @example Check last error
  #   code = Kreuzberg::ErrorContext.last_error_code
  #   puts "failed with #{code}" unless code.zero?
  def self.last_error_code
    with_fallback(0) { Kreuzberg._last_error_code_native }
  end

  # Panic context of the last error, as an object.
  #
  # Fetches the JSON panic context from the native layer and turns it into a
  # {Errors::PanicContext} (file, line, function, message, timestamp).
  #
  # @return [Errors::PanicContext, nil] Panic context if a panic occurred, nil
  #   otherwise (also nil when fetching or decoding raises)
  #
  # @example Get panic details
  #   panic = Kreuzberg::ErrorContext.last_panic_context
  #   if panic
  #     puts "Panic at #{panic.file}:#{panic.line} in #{panic.function}"
  #     puts "Message: #{panic.message}"
  #     puts "Time: #{panic.timestamp_secs}"
  #   end
  def self.last_panic_context
    with_fallback(nil) do
      json_str = Kreuzberg._last_panic_context_json_native
      json_str ? Errors::PanicContext.from_json(json_str) : nil
    end
  end

  # Panic context of the last error, as the raw JSON string.
  #
  # @return [String, nil] JSON-serialized panic context, or nil if no panic
  #   (also nil when the native call raises)
  #
  # @example Get raw JSON panic context
  #   json = Kreuzberg::ErrorContext.last_panic_context_json
  #   if json
  #     panic_data = JSON.parse(json)
  #     puts panic_data
  #   end
  def self.last_panic_context_json
    with_fallback(nil) { Kreuzberg._last_panic_context_json_native }
  end

  # Detailed information about the last operation's error.
  #
  # @return [Hash] Hash with keys: :message, :error_code, :error_type, :source_file,
  #   :source_function, :source_line, :context_info, :is_panic; an empty Hash
  #   when the native call raises
  #
  # @example Get error details
  #   details = Kreuzberg::ErrorContext.error_details
  #   puts "Error: #{details[:message]}"
  #   puts "Code: #{details[:error_code]}"
  #   puts "Type: #{details[:error_type]}"
  def self.error_details
    with_fallback({}) { Kreuzberg._get_error_details_native }
  end

  # Classify an error message into a Kreuzberg error code.
  #
  # Useful for mapping third-party error messages onto Kreuzberg categories.
  #
  # @param message [String] The error message to classify
  # @return [Integer] Error code (0-7); 7 when the native call raises
  #
  # Error code mapping:
  # - 0: Validation
  # - 1: Parsing
  # - 2: OCR
  # - 3: MissingDependency
  # - 4: IO
  # - 5: Plugin
  # - 6: UnsupportedFormat
  # - 7: Internal
  #
  # @example Classify an error
  #   code = Kreuzberg::ErrorContext.classify_error("File not found")
  #   if code == 4
  #     puts "This is an I/O error"
  #   end
  def self.classify_error(message)
    with_fallback(7) { Kreuzberg._classify_error_native(message) }
  end

  # Human-readable name of an error code.
  #
  # @param code [Integer] Numeric error code (0-7)
  # @return [String] Human-readable error code name (e.g., "validation", "io"),
  #   or "unknown" when the native call raises
  #
  # @example Get error code name
  #   name = Kreuzberg::ErrorContext.error_code_name(0)
  #   puts name # => "validation"
  def self.error_code_name(code)
    with_fallback('unknown') { Kreuzberg._error_code_name_native(code) }
  end

  # Description of an error code.
  #
  # @param code [Integer] Numeric error code (0-7)
  # @return [String] Description of the error code, or "Unknown error code"
  #   when the native call raises
  #
  # @example Get error code description
  #   desc = Kreuzberg::ErrorContext.error_code_description(0)
  #   puts desc # => "Input validation error"
  def self.error_code_description(code)
    with_fallback('Unknown error code') { Kreuzberg._error_code_description_native(code) }
  end
end
136
+ end
@@ -0,0 +1,116 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+
5
module Kreuzberg
  # Numeric error codes (0 means success).
  ERROR_CODE_SUCCESS = 0
  ERROR_CODE_GENERIC = 1
  ERROR_CODE_PANIC = 2
  ERROR_CODE_INVALID_ARGUMENT = 3
  ERROR_CODE_IO = 4
  ERROR_CODE_PARSING = 5
  ERROR_CODE_OCR = 6
  ERROR_CODE_MISSING_DEPENDENCY = 7

  module Errors
    # Location and message of a panic, decoded from its JSON form.
    class PanicContext
      attr_reader :file, :line, :function, :message, :timestamp_secs

      # @param file [String] source file
      # @param line [Integer] line number
      # @param function [String] function name
      # @param message [String] panic message
      # @param timestamp_secs [Integer] timestamp in seconds
      def initialize(file:, line:, function:, message:, timestamp_secs:)
        @file = file
        @line = line
        @function = function
        @message = message
        @timestamp_secs = timestamp_secs
      end

      # @return [String] "file:line:function: message"
      def to_s
        "#{file}:#{line}:#{function}: #{message}"
      end

      # @return [Hash] symbol-keyed representation of all five fields
      def to_h
        {
          file:,
          line:,
          function:,
          message:,
          timestamp_secs:
        }
      end

      # Builds a PanicContext from a JSON object string. Unknown keys are
      # ignored and missing fields get defaults ('' or 0).
      #
      # @param json_string [String, nil] JSON-serialized panic context
      # @return [PanicContext, nil] nil when the input is nil, empty, not
      #   valid JSON, or valid JSON that is not an object
      def self.from_json(json_string)
        return nil if json_string.nil? || json_string.empty?

        data = JSON.parse(json_string, symbolize_names: true)
        # Valid JSON such as `null`, `[]` or `42` parses to a non-Hash;
        # there is nothing to slice fields from, so treat it as unparseable.
        return nil unless data.is_a?(Hash)

        sliced = data.slice(:file, :line, :function, :message, :timestamp_secs)
        new(**with_defaults(sliced))
      rescue JSON::ParserError
        nil
      end

      # Fills in a default for every field missing from +sliced+.
      def self.with_defaults(sliced)
        {
          file: sliced[:file] || '',
          line: sliced[:line] || 0,
          function: sliced[:function] || '',
          message: sliced[:message] || '',
          timestamp_secs: sliced[:timestamp_secs] || 0
        }
      end
      private_class_method :with_defaults
    end

    # Base error class for all Kreuzberg errors
    class Error < StandardError
      attr_reader :panic_context, :error_code

      # @param message [String] error message
      # @param panic_context [PanicContext, nil] panic details, if any
      # @param error_code [Integer, nil] numeric error code, if any
      def initialize(message, panic_context: nil, error_code: nil)
        super(message)
        @panic_context = panic_context
        @error_code = error_code
      end
    end

    # Raised when validation fails
    class ValidationError < Error; end

    # Raised when document parsing fails
    class ParsingError < Error
      attr_reader :context

      # @param context [Object, nil] additional context attached to the error
      def initialize(message, context: nil, panic_context: nil, error_code: nil)
        super(message, panic_context:, error_code:)
        @context = context
      end
    end

    # Raised when OCR processing fails
    class OCRError < Error
      attr_reader :context

      # @param context [Object, nil] additional context attached to the error
      def initialize(message, context: nil, panic_context: nil, error_code: nil)
        super(message, panic_context:, error_code:)
        @context = context
      end
    end

    # Raised when a required dependency is missing
    class MissingDependencyError < Error
      attr_reader :dependency

      # @param dependency [Object, nil] the dependency that is missing
      def initialize(message, dependency: nil, panic_context: nil, error_code: nil)
        super(message, panic_context:, error_code:)
        @dependency = dependency
      end
    end

    # Raised when an I/O operation fails
    class IOError < Error; end

    # Raised when plugin operations fail
    class PluginError < Error; end

    # Raised when an unsupported file format or MIME type is encountered
    class UnsupportedFormatError < Error; end
  end
end