kreuzberg 4.3.5-aarch64-linux
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +1 -0
- data/.rubocop.yml +543 -0
- data/Gemfile +8 -0
- data/Gemfile.lock +260 -0
- data/README.md +399 -0
- data/Rakefile +34 -0
- data/Steepfile +51 -0
- data/examples/async_patterns.rb +283 -0
- data/extconf.rb +60 -0
- data/kreuzberg.gemspec +253 -0
- data/lib/kreuzberg/api_proxy.rb +125 -0
- data/lib/kreuzberg/cache_api.rb +67 -0
- data/lib/kreuzberg/cli.rb +57 -0
- data/lib/kreuzberg/cli_proxy.rb +118 -0
- data/lib/kreuzberg/config.rb +1241 -0
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/document_structure.rb +204 -0
- data/lib/kreuzberg/error_context.rb +136 -0
- data/lib/kreuzberg/errors.rb +116 -0
- data/lib/kreuzberg/extraction_api.rb +329 -0
- data/lib/kreuzberg/mcp_proxy.rb +176 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
- data/lib/kreuzberg/post_processor_protocol.rb +15 -0
- data/lib/kreuzberg/result.rb +712 -0
- data/lib/kreuzberg/setup_lib_path.rb +99 -0
- data/lib/kreuzberg/types.rb +414 -0
- data/lib/kreuzberg/validator_protocol.rb +16 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +102 -0
- data/lib/kreuzberg_rb.so +0 -0
- data/lib/libpdfium.so +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +1337 -0
- data/spec/binding/async_operations_spec.rb +473 -0
- data/spec/binding/batch_operations_spec.rb +677 -0
- data/spec/binding/batch_spec.rb +360 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +85 -0
- data/spec/binding/cli_spec.rb +55 -0
- data/spec/binding/config_result_spec.rb +377 -0
- data/spec/binding/config_spec.rb +419 -0
- data/spec/binding/config_validation_spec.rb +377 -0
- data/spec/binding/embeddings_spec.rb +816 -0
- data/spec/binding/error_handling_spec.rb +399 -0
- data/spec/binding/error_recovery_spec.rb +488 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/font_config_spec.rb +220 -0
- data/spec/binding/images_spec.rb +732 -0
- data/spec/binding/keywords_extraction_spec.rb +600 -0
- data/spec/binding/metadata_types_spec.rb +1253 -0
- data/spec/binding/pages_extraction_spec.rb +550 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +273 -0
- data/spec/binding/tables_spec.rb +650 -0
- data/spec/fixtures/config.toml +38 -0
- data/spec/fixtures/config.yaml +41 -0
- data/spec/fixtures/invalid_config.toml +3 -0
- data/spec/serialization_spec.rb +134 -0
- data/spec/smoke/package_spec.rb +177 -0
- data/spec/spec_helper.rb +40 -0
- data/spec/unit/config/chunking_config_spec.rb +213 -0
- data/spec/unit/config/embedding_config_spec.rb +343 -0
- data/spec/unit/config/extraction_config_spec.rb +434 -0
- data/spec/unit/config/font_config_spec.rb +285 -0
- data/spec/unit/config/hierarchy_config_spec.rb +314 -0
- data/spec/unit/config/image_extraction_config_spec.rb +209 -0
- data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
- data/spec/unit/config/keyword_config_spec.rb +229 -0
- data/spec/unit/config/language_detection_config_spec.rb +258 -0
- data/spec/unit/config/ocr_config_spec.rb +171 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/spec/unit/config/page_config_spec.rb +221 -0
- data/spec/unit/config/pdf_config_spec.rb +267 -0
- data/spec/unit/config/postprocessor_config_spec.rb +290 -0
- data/spec/unit/config/tesseract_config_spec.rb +181 -0
- data/spec/unit/config/token_reduction_config_spec.rb +251 -0
- data/test/metadata_types_test.rb +959 -0
- metadata +292 -0
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'rbconfig'
|
|
4
|
+
require 'open3'
|
|
5
|
+
|
|
6
|
+
module Kreuzberg
  # Prepares the dynamic-loader environment so that the native libraries
  # stored in the directory one level above this file can be located.
  #
  # Only {configure} is public; every other helper is a private module method.
  module SetupLibPath
    module_function

    # Extends the platform-specific library search variables.
    #
    # - macOS (host_os matches +darwin+): DYLD_LIBRARY_PATH and
    #   DYLD_FALLBACK_LIBRARY_PATH, plus a best-effort rewrite of the bundle's
    #   libpdfium reference.
    # - Linux: LD_LIBRARY_PATH.
    # - Windows (+mswin+, +mingw+, +cygwin+): PATH, plus Cargo build output
    #   directories when they exist.
    #
    # Any other host OS is left untouched.
    def configure
      lib_dir = File.expand_path('..', __dir__ || '.')
      host_os = RbConfig::CONFIG['host_os']

      case host_os
      when /darwin/
        prepend_env('DYLD_LIBRARY_PATH', lib_dir)
        prepend_env('DYLD_FALLBACK_LIBRARY_PATH', "#{lib_dir}:/usr/local/lib:/usr/lib")
        fix_macos_install_name(lib_dir)
      when /linux/
        prepend_env('LD_LIBRARY_PATH', lib_dir)
      when /mswin|mingw|cygwin/
        # Windows uses PATH to locate DLLs
        prepend_env('PATH', lib_dir, separator: ';')
        # Also check common locations for PDFium on Windows
        setup_windows_library_paths(lib_dir)
      end
    end

    # Adds Cargo build output directories to PATH when they exist on disk.
    #
    # @param lib_dir [String] directory the relative +target/release+ lookup starts from
    def setup_windows_library_paths(lib_dir)
      # Add target/release to PATH for DLL lookup during development
      target_release = File.expand_path('../../target/release', lib_dir)
      prepend_env('PATH', target_release, separator: ';') if Dir.exist?(target_release)

      # Check for short path CARGO_TARGET_DIR (CI uses C:\t)
      cargo_target_dir = ENV.fetch('CARGO_TARGET_DIR', nil)
      return unless cargo_target_dir

      target_release_alt = File.join(cargo_target_dir, 'release')
      prepend_env('PATH', target_release_alt, separator: ';') if Dir.exist?(target_release_alt)

      # Also check for target-specific subdirectory (Windows GNU builds)
      gnu_release = File.join(cargo_target_dir, 'x86_64-pc-windows-gnu', 'release')
      prepend_env('PATH', gnu_release, separator: ';') if Dir.exist?(gnu_release)
    end
    private_class_method :setup_windows_library_paths

    # Prepends +value+ to the environment variable +key+ without duplicating
    # entries that are already present.
    #
    # +value+ may hold a single path or several paths joined by +separator+.
    # It is compared entry by entry, so calling this repeatedly with the same
    # multi-path value leaves the variable unchanged after the first call.
    # Empty entries in +value+ are ignored.
    #
    # @param key [String] name of the environment variable
    # @param value [String] one or more paths joined by +separator+
    # @param separator [String] list separator used by the variable
    # @return [String, nil] the new value of the variable, or nil when nothing was added
    def prepend_env(key, value, separator: ':')
      current = ENV.fetch(key, nil)
      existing = current.to_s.split(separator)
      missing = value.split(separator).reject { |entry| entry.empty? || existing.include?(entry) }.uniq
      return if missing.empty?

      addition = missing.join(separator)
      ENV[key] = current.nil? || current.empty? ? addition : "#{addition}#{separator}#{current}"
    end
    private_class_method :prepend_env

    # Rewrites the bundle's libpdfium reference and loader rpath when both the
    # bundle and libpdfium.dylib exist in +lib_dir+.
    #
    # @param lib_dir [String] directory expected to contain the native files
    # @return [nil] when the files are absent or a rescued error occurred
    def fix_macos_install_name(lib_dir)
      bundle = macos_bundle(lib_dir)
      return unless bundle

      ensure_install_name(bundle)
      ensure_loader_rpath(bundle)
    rescue Errno::ENOENT, IOError
      # Best effort: failing to run otool/install_name_tool (for example
      # because the tools are not installed) must not prevent loading.
      nil
    end
    private_class_method :fix_macos_install_name

    # Returns the path of kreuzberg_rb.bundle when it and libpdfium.dylib both
    # exist in +lib_dir+.
    #
    # @param lib_dir [String] directory to inspect
    # @return [String, nil] bundle path, or nil when either file is missing
    def macos_bundle(lib_dir)
      bundle = File.join(lib_dir, 'kreuzberg_rb.bundle')
      pdfium = File.join(lib_dir, 'libpdfium.dylib')
      return unless File.exist?(bundle) && File.exist?(pdfium)

      bundle
    end
    private_class_method :macos_bundle

    # Points the bundle's libpdfium dependency at @loader_path when
    # `otool -L` reports it as ./libpdfium.dylib or @rpath/libpdfium.dylib.
    #
    # Does nothing when otool exits unsuccessfully. The exit status of
    # install_name_tool is not inspected.
    #
    # @param bundle [String] path of the bundle to patch
    def ensure_install_name(bundle)
      output, status = Open3.capture2('otool', '-L', bundle)
      return unless status.success?

      replacements = {
        './libpdfium.dylib' => '@loader_path/libpdfium.dylib',
        '@rpath/libpdfium.dylib' => '@loader_path/libpdfium.dylib'
      }

      replacements.each do |current, desired|
        next unless output.include?(current)

        Open3.capture2('install_name_tool', '-change', current, desired, bundle)
      end
    end
    private_class_method :ensure_install_name

    # Adds an @loader_path rpath to the bundle unless the `otool -l` output
    # already mentions @loader_path.
    #
    # NOTE(review): this is a substring check over the whole otool output, so
    # any occurrence of "@loader_path" suppresses the rpath addition, not only
    # an existing rpath entry. Confirm that is intended.
    #
    # @param bundle [String] path of the bundle to patch
    def ensure_loader_rpath(bundle)
      rpath_output, rpath_status = Open3.capture2('otool', '-l', bundle)
      return unless rpath_status.success? && !rpath_output.include?('@loader_path')

      Open3.capture2('install_name_tool', '-add_rpath', '@loader_path', bundle)
    end
    private_class_method :ensure_loader_rpath
  end
end
|
|
@@ -0,0 +1,414 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'sorbet-runtime'
|
|
4
|
+
|
|
5
|
+
module Kreuzberg
  # Semantic element type classification.
  #
  # Categorizes text content into semantic units for downstream processing.
  # Supports the element types commonly found in Unstructured documents.
  #
  # NOTE(review): the members passed to T.any are String literals rather than
  # classes or Sorbet types; confirm sorbet-runtime accepts them when the
  # alias is first resolved. No TITLE (or other) constant is defined on this
  # alias in this file, and Element#element_type below is declared as a plain
  # String rather than as this alias.
  #
  ElementType = T.type_alias do
    T.any(
      'title',
      'narrative_text',
      'heading',
      'list_item',
      'table',
      'image',
      'page_break',
      'code_block',
      'block_quote',
      'footer',
      'header'
    )
  end

  # Bounding box coordinates for element positioning.
  #
  # Represents rectangular coordinates for an element within a page.
  #
  # @example
  #   bbox = Kreuzberg::BoundingBox.new(
  #     x0: 10.0,
  #     y0: 20.0,
  #     x1: 100.0,
  #     y1: 50.0
  #   )
  #   puts "Width: #{bbox.x1 - bbox.x0}"
  #
  class BoundingBox < T::Struct
    extend T::Sig

    const :x0, Float

    const :y0, Float

    const :x1, Float

    const :y1, Float
  end

  # Metadata for a semantic element.
  #
  # Provides contextual information about an extracted element including
  # its position within the document and custom metadata fields.
  # Every field except +additional+ may be nil.
  #
  # @example
  #   metadata = Kreuzberg::ElementMetadata.new(
  #     page_number: 1,
  #     filename: "document.pdf",
  #     coordinates: bbox,
  #     element_index: 5,
  #     additional: { "style" => "bold" }
  #   )
  #
  class ElementMetadata < T::Struct
    extend T::Sig

    const :page_number, T.nilable(Integer)

    const :filename, T.nilable(String)

    const :coordinates, T.nilable(BoundingBox)

    const :element_index, T.nilable(Integer)

    # String-to-String map of custom metadata fields.
    const :additional, T::Hash[String, String]
  end

  # Semantic element extracted from document.
  #
  # Represents a logical unit of content with semantic classification,
  # unique identifier, and metadata for tracking origin and position.
  # Compatible with Unstructured.io element format when output_format='element_based'.
  #
  # @example
  #   element = Kreuzberg::Element.new(
  #     element_id: "elem-abc123",
  #     element_type: "narrative_text",
  #     text: "This is the main content.",
  #     metadata: metadata
  #   )
  #   puts "#{element.element_type}: #{element.text}"
  #
  class Element < T::Struct
    extend T::Sig

    const :element_id, String

    # Declared as a plain String; the struct itself does not restrict the value.
    const :element_type, String

    const :text, String

    const :metadata, ElementMetadata
  end

  # Header/Heading metadata
  #
  # Represents a heading element found in the HTML document
  #
  # @example
  #   header = Kreuzberg::HeaderMetadata.new(
  #     level: 1,
  #     text: "Main Title",
  #     id: "main-title",
  #     depth: 0,
  #     html_offset: 245
  #   )
  #   puts "#{header.text} (H#{header.level})"
  #
  class HeaderMetadata < T::Struct
    extend T::Sig

    const :level, Integer

    const :text, String

    const :id, T.nilable(String)

    const :depth, Integer

    const :html_offset, Integer
  end

  # Link metadata
  #
  # Represents a link element found in the HTML document
  #
  # @example
  #   link = Kreuzberg::LinkMetadata.new(
  #     href: "https://example.com",
  #     text: "Example",
  #     title: "Example Site",
  #     link_type: "external",
  #     rel: ["noopener", "noreferrer"],
  #     attributes: { "data-id" => "123" }
  #   )
  #   puts "#{link.text} -> #{link.href}"
  #
  class LinkMetadata < T::Struct
    extend T::Sig

    const :href, String

    const :text, String

    const :title, T.nilable(String)

    const :link_type, String

    const :rel, T::Array[String]

    const :attributes, T::Hash[String, String]
  end

  # Image metadata
  #
  # Represents an image element found in the HTML document
  #
  # @example
  #   image = Kreuzberg::ImageMetadata.new(
  #     src: "images/logo.png",
  #     alt: "Company Logo",
  #     title: nil,
  #     dimensions: [200, 100],
  #     image_type: "png",
  #     attributes: { "loading" => "lazy" }
  #   )
  #   if image.dimensions
  #     width, height = image.dimensions
  #     puts "#{width}x#{height}"
  #   end
  #
  class ImageMetadata < T::Struct
    extend T::Sig

    const :src, String

    const :alt, T.nilable(String)

    const :title, T.nilable(String)

    # Integer array, or nil; the example above destructures it as width, height.
    const :dimensions, T.nilable(T::Array[Integer])

    const :image_type, String

    const :attributes, T::Hash[String, String]
  end

  # Structured data metadata
  #
  # Represents structured data (JSON-LD, microdata, etc.) found in the HTML document
  #
  # @example
  #   structured = Kreuzberg::StructuredData.new(
  #     data_type: "json-ld",
  #     raw_json: '{"@context":"https://schema.org","@type":"Article",...}',
  #     schema_type: "Article"
  #   )
  #   data = JSON.parse(structured.raw_json)
  #   puts data['@type']
  #
  class StructuredData < T::Struct
    extend T::Sig

    const :data_type, String

    const :raw_json, String

    const :schema_type, T.nilable(String)
  end

  # HTML document metadata
  #
  # Groups nilable String attributes, String-to-String maps (+open_graph+,
  # +twitter_card+, +meta_tags+) and arrays of the HeaderMetadata,
  # LinkMetadata, ImageMetadata and StructuredData structs defined above.
  # The array and hash fields are not nilable.
  #
  class HtmlMetadata < T::Struct
    extend T::Sig

    const :title, T.nilable(String)

    const :description, T.nilable(String)

    const :author, T.nilable(String)

    const :copyright, T.nilable(String)

    const :keywords, T::Array[String]

    const :canonical_url, T.nilable(String)

    const :language, T.nilable(String)

    const :text_direction, T.nilable(String)

    const :mime_type, T.nilable(String)

    const :charset, T.nilable(String)

    const :generator, T.nilable(String)

    const :viewport, T.nilable(String)

    const :theme_color, T.nilable(String)

    const :application_name, T.nilable(String)

    const :robots, T.nilable(String)

    const :open_graph, T::Hash[String, String]

    const :twitter_card, T::Hash[String, String]

    const :meta_tags, T::Hash[String, String]

    const :headers, T::Array[HeaderMetadata]

    const :links, T::Array[LinkMetadata]

    const :images, T::Array[ImageMetadata]

    const :structured_data, T::Array[StructuredData]
  end

  # Extracted keyword with relevance metadata.
  #
  # Represents a single keyword extracted from text along with its relevance score,
  # the algorithm that extracted it, and optional position information.
  #
  # @example
  #   keyword = Kreuzberg::ExtractedKeyword.new(
  #     text: "machine learning",
  #     score: 0.95,
  #     algorithm: "yake",
  #     positions: [42, 128]
  #   )
  #   puts "#{keyword.text}: #{keyword.score}"
  #
  class ExtractedKeyword < T::Struct
    extend T::Sig

    const :text, String

    const :score, Float

    const :algorithm, String

    # Optional; nil when no position information is attached.
    const :positions, T.nilable(T::Array[Integer])
  end

  # Processing warning from a pipeline stage.
  #
  # Represents a non-fatal warning generated during document processing.
  #
  # @example
  #   warning = Kreuzberg::ProcessingWarning.new(
  #     source: "ocr",
  #     message: "Low confidence on page 3"
  #   )
  #   puts "[#{warning.source}] #{warning.message}"
  #
  class ProcessingWarning < T::Struct
    extend T::Sig

    const :source, String

    const :message, String
  end

  # Bounding box for document node positioning.
  #
  # Represents rectangular coordinates for a node within the document.
  # Declares the same four Float fields as BoundingBox but is a separate class.
  #
  # @example
  #   bbox = Kreuzberg::DocumentBoundingBox.new(
  #     x0: 10.0,
  #     y0: 20.0,
  #     x1: 100.0,
  #     y1: 50.0
  #   )
  #
  class DocumentBoundingBox < T::Struct
    extend T::Sig

    const :x0, Float

    const :y0, Float

    const :x1, Float

    const :y1, Float
  end

  # Annotation for a document node.
  #
  # Provides additional metadata about document node content as a
  # String key paired with a String value.
  #
  class DocumentAnnotation < T::Struct
    extend T::Sig

    const :key, String

    const :value, String
  end

  # Single node in the document structure tree.
  #
  # Represents a logical unit of content with deterministic ID, content,
  # tree structure information, and metadata.
  #
  # @example
  #   node = Kreuzberg::DocumentNode.new(
  #     id: "node-abc123",
  #     content: "This is the content",
  #     parent: nil,
  #     children: [],
  #     content_layer: "body",
  #     page: 1,
  #     page_end: 1,
  #     bbox: bbox,
  #     annotations: []
  #   )
  #
  class DocumentNode < T::Struct
    extend T::Sig

    const :id, String

    const :content, String

    # Integer reference to the parent node, or nil (see DocumentStructure,
    # which describes parent/child references as index-based).
    const :parent, T.nilable(Integer)

    # Integer references to child nodes.
    const :children, T::Array[Integer]

    const :content_layer, String

    const :page, T.nilable(Integer)

    const :page_end, T.nilable(Integer)

    const :bbox, T.nilable(DocumentBoundingBox)

    const :annotations, T::Array[DocumentAnnotation]
  end

  # Structured document representation.
  #
  # Provides a hierarchical, tree-based representation of document content
  # using a flat array of nodes with index-based parent/child references.
  #
  # @example
  #   structure = Kreuzberg::DocumentStructure.new(
  #     nodes: [node1, node2, node3]
  #   )
  #   structure.nodes.each do |node|
  #     puts "#{node.id}: #{node.content}"
  #   end
  #
  class DocumentStructure < T::Struct
    extend T::Sig

    const :nodes, T::Array[DocumentNode]
  end
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kreuzberg
  # Contract for validator objects: anything that responds to
  # +call(result)+.
  #
  # Including this module supplies a default +call+ that raises
  # NotImplementedError, so an including class must define its own.
  #
  # @example Implementing a validator class
  #   class NonEmptyValidator
  #     include Kreuzberg::ValidatorProtocol
  #
  #     def call(result)
  #       # NOTE(review): assumes ValidationError accepts a message string - confirm.
  #       raise Kreuzberg::Errors::ValidationError, 'empty result' if result.empty?
  #     end
  #   end
  module ValidatorProtocol
    # Validates an extraction result. This default implementation always raises.
    #
    # @param result [Hash] Extraction result to validate
    # @return [void]
    # @raise [Kreuzberg::Errors::ValidationError] if validation fails (raised by implementations)
    # @raise [NotImplementedError] when the including class does not override this method
    def call(result)
      raise(NotImplementedError, format('%s must implement #call(result)', self.class))
    end
  end
end
|
data/lib/kreuzberg.rb
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative 'kreuzberg/setup_lib_path'
|
|
4
|
+
Kreuzberg::SetupLibPath.configure
|
|
5
|
+
|
|
6
|
+
require_relative 'kreuzberg/version'
|
|
7
|
+
require 'kreuzberg_rb'
|
|
8
|
+
|
|
9
|
+
# Kreuzberg is a Ruby binding for the Rust core library providing document extraction,
|
|
10
|
+
# text extraction, and OCR capabilities.
|
|
11
|
+
module Kreuzberg
  # Lazily loaded components: each constant is resolved from the named file
  # the first time it is referenced.
  autoload :Config, 'kreuzberg/config'
  autoload :Result, 'kreuzberg/result'
  autoload :CLI, 'kreuzberg/cli'
  autoload :CLIProxy, 'kreuzberg/cli_proxy'
  autoload :APIProxy, 'kreuzberg/api_proxy'
  autoload :MCPProxy, 'kreuzberg/mcp_proxy'
  autoload :Errors, 'kreuzberg/errors'
  autoload :ErrorContext, 'kreuzberg/error_context'
  autoload :PostProcessorProtocol, 'kreuzberg/post_processor_protocol'
  autoload :ValidatorProtocol, 'kreuzberg/validator_protocol'
  autoload :OcrBackendProtocol, 'kreuzberg/ocr_backend_protocol'

  # Value types; all of them are registered against the same file,
  # kreuzberg/types.
  autoload :BoundingBox, 'kreuzberg/types'
  autoload :ElementMetadata, 'kreuzberg/types'
  autoload :Element, 'kreuzberg/types'
  autoload :HtmlMetadata, 'kreuzberg/types'
  autoload :HeaderMetadata, 'kreuzberg/types'
  autoload :LinkMetadata, 'kreuzberg/types'
  autoload :ImageMetadata, 'kreuzberg/types'
  autoload :StructuredData, 'kreuzberg/types'
  autoload :ExtractedKeyword, 'kreuzberg/types'
  autoload :ProcessingWarning, 'kreuzberg/types'
  autoload :DocumentBoundingBox, 'kreuzberg/types'
  autoload :DocumentAnnotation, 'kreuzberg/types'
  autoload :DocumentNode, 'kreuzberg/types'
  autoload :DocumentStructure, 'kreuzberg/types'

  # Top-level shortcuts for two Config classes. Referencing Config here
  # resolves its autoload while this file is still being loaded.
  ExtractionConfig = Config::Extraction
  PageConfig = Config::PageConfig

  # Symbol constants naming keyword extraction algorithms.
  module KeywordAlgorithm
    YAKE = :yake
    RAKE = :rake
  end

  # Module-level counters, both starting at zero.
  # NOTE(review): presumably maintained by kreuzberg/cache_api - confirm.
  @__cache_tracker = { entries: 0, bytes: 0 }

  class << self
    # Keep the singleton methods that exist at this point reachable under
    # native_* names. They are presumably defined by the kreuzberg_rb
    # extension required above (confirm); CacheAPI and ExtractionAPI are
    # prepended to this singleton class at the bottom of the file.
    alias native_extract_file_sync extract_file_sync
    alias native_extract_bytes_sync extract_bytes_sync
    alias native_batch_extract_files_sync batch_extract_files_sync
    alias native_extract_file extract_file
    alias native_extract_bytes extract_bytes
    alias native_batch_extract_files batch_extract_files
    alias native_batch_extract_bytes_sync batch_extract_bytes_sync
    alias native_batch_extract_bytes batch_extract_bytes
    alias native_clear_cache clear_cache
    alias native_cache_stats cache_stats

    # Hide the extraction aliases from outside callers.
    # NOTE(review): native_clear_cache and native_cache_stats are not listed
    # and therefore stay public - confirm that is intentional.
    private :native_extract_file_sync, :native_extract_bytes_sync, :native_batch_extract_files_sync
    private :native_extract_file, :native_extract_bytes, :native_batch_extract_files
    private :native_batch_extract_bytes_sync, :native_batch_extract_bytes
  end

  # Expose the following methods as module functions. Each name must already
  # be defined as an instance method of Kreuzberg when its line runs
  # (module_function raises NameError otherwise); they are presumably
  # defined by the native extension - confirm.
  module_function :register_post_processor

  module_function :unregister_post_processor

  module_function :clear_post_processors

  module_function :register_validator

  module_function :unregister_validator

  module_function :clear_validators

  module_function :list_validators

  module_function :list_post_processors

  module_function :register_ocr_backend

  module_function :unregister_ocr_backend

  module_function :list_ocr_backends

  module_function :detect_mime_type

  module_function :detect_mime_type_from_path

  module_function :validate_mime_type

  module_function :get_extensions_for_mime
end
|
|
96
|
+
|
|
97
|
+
require_relative 'kreuzberg/cache_api'
|
|
98
|
+
require_relative 'kreuzberg/extraction_api'
|
|
99
|
+
require_relative 'kreuzberg/djot_content'
|
|
100
|
+
|
|
101
|
+
Kreuzberg.singleton_class.prepend(Kreuzberg::CacheAPI)
|
|
102
|
+
Kreuzberg.singleton_class.prepend(Kreuzberg::ExtractionAPI)
|
data/lib/kreuzberg_rb.so
ADDED
|
Binary file
|
data/lib/libpdfium.so
ADDED
|
Binary file
|