kreuzberg 4.3.5-aarch64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +14 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +1 -0
  5. data/.rubocop.yml +543 -0
  6. data/Gemfile +8 -0
  7. data/Gemfile.lock +260 -0
  8. data/README.md +399 -0
  9. data/Rakefile +34 -0
  10. data/Steepfile +51 -0
  11. data/examples/async_patterns.rb +283 -0
  12. data/extconf.rb +60 -0
  13. data/kreuzberg.gemspec +253 -0
  14. data/lib/kreuzberg/api_proxy.rb +125 -0
  15. data/lib/kreuzberg/cache_api.rb +67 -0
  16. data/lib/kreuzberg/cli.rb +57 -0
  17. data/lib/kreuzberg/cli_proxy.rb +118 -0
  18. data/lib/kreuzberg/config.rb +1241 -0
  19. data/lib/kreuzberg/djot_content.rb +225 -0
  20. data/lib/kreuzberg/document_structure.rb +204 -0
  21. data/lib/kreuzberg/error_context.rb +136 -0
  22. data/lib/kreuzberg/errors.rb +116 -0
  23. data/lib/kreuzberg/extraction_api.rb +329 -0
  24. data/lib/kreuzberg/mcp_proxy.rb +176 -0
  25. data/lib/kreuzberg/ocr_backend_protocol.rb +40 -0
  26. data/lib/kreuzberg/post_processor_protocol.rb +15 -0
  27. data/lib/kreuzberg/result.rb +712 -0
  28. data/lib/kreuzberg/setup_lib_path.rb +99 -0
  29. data/lib/kreuzberg/types.rb +414 -0
  30. data/lib/kreuzberg/validator_protocol.rb +16 -0
  31. data/lib/kreuzberg/version.rb +5 -0
  32. data/lib/kreuzberg.rb +102 -0
  33. data/lib/kreuzberg_rb.so +0 -0
  34. data/lib/libpdfium.so +0 -0
  35. data/sig/kreuzberg/internal.rbs +184 -0
  36. data/sig/kreuzberg.rbs +1337 -0
  37. data/spec/binding/async_operations_spec.rb +473 -0
  38. data/spec/binding/batch_operations_spec.rb +677 -0
  39. data/spec/binding/batch_spec.rb +360 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +85 -0
  42. data/spec/binding/cli_spec.rb +55 -0
  43. data/spec/binding/config_result_spec.rb +377 -0
  44. data/spec/binding/config_spec.rb +419 -0
  45. data/spec/binding/config_validation_spec.rb +377 -0
  46. data/spec/binding/embeddings_spec.rb +816 -0
  47. data/spec/binding/error_handling_spec.rb +399 -0
  48. data/spec/binding/error_recovery_spec.rb +488 -0
  49. data/spec/binding/errors_spec.rb +66 -0
  50. data/spec/binding/font_config_spec.rb +220 -0
  51. data/spec/binding/images_spec.rb +732 -0
  52. data/spec/binding/keywords_extraction_spec.rb +600 -0
  53. data/spec/binding/metadata_types_spec.rb +1253 -0
  54. data/spec/binding/pages_extraction_spec.rb +550 -0
  55. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  56. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  57. data/spec/binding/plugins/validator_spec.rb +273 -0
  58. data/spec/binding/tables_spec.rb +650 -0
  59. data/spec/fixtures/config.toml +38 -0
  60. data/spec/fixtures/config.yaml +41 -0
  61. data/spec/fixtures/invalid_config.toml +3 -0
  62. data/spec/serialization_spec.rb +134 -0
  63. data/spec/smoke/package_spec.rb +177 -0
  64. data/spec/spec_helper.rb +40 -0
  65. data/spec/unit/config/chunking_config_spec.rb +213 -0
  66. data/spec/unit/config/embedding_config_spec.rb +343 -0
  67. data/spec/unit/config/extraction_config_spec.rb +434 -0
  68. data/spec/unit/config/font_config_spec.rb +285 -0
  69. data/spec/unit/config/hierarchy_config_spec.rb +314 -0
  70. data/spec/unit/config/image_extraction_config_spec.rb +209 -0
  71. data/spec/unit/config/image_preprocessing_config_spec.rb +230 -0
  72. data/spec/unit/config/keyword_config_spec.rb +229 -0
  73. data/spec/unit/config/language_detection_config_spec.rb +258 -0
  74. data/spec/unit/config/ocr_config_spec.rb +171 -0
  75. data/spec/unit/config/output_format_spec.rb +380 -0
  76. data/spec/unit/config/page_config_spec.rb +221 -0
  77. data/spec/unit/config/pdf_config_spec.rb +267 -0
  78. data/spec/unit/config/postprocessor_config_spec.rb +290 -0
  79. data/spec/unit/config/tesseract_config_spec.rb +181 -0
  80. data/spec/unit/config/token_reduction_config_spec.rb +251 -0
  81. data/test/metadata_types_test.rb +959 -0
  82. metadata +292 -0
@@ -0,0 +1,99 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'rbconfig'
4
+ require 'open3'
5
+
6
module Kreuzberg
  # Prepares the dynamic-loader search path so that the native libraries
  # located one directory above this file can be found when the extension
  # is loaded.
  module SetupLibPath
    module_function

    # Adds the library directory to the platform-specific search variables.
    # Operating systems not matched below are left untouched.
    #
    # @return [void]
    def configure
      lib_dir = File.expand_path('..', __dir__ || '.')
      host_os = RbConfig::CONFIG['host_os']

      case host_os
      when /darwin/
        prepend_env('DYLD_LIBRARY_PATH', lib_dir)
        prepend_env('DYLD_FALLBACK_LIBRARY_PATH', "#{lib_dir}:/usr/local/lib:/usr/lib")
        fix_macos_install_name(lib_dir)
      when /linux/
        prepend_env('LD_LIBRARY_PATH', lib_dir)
      when /mswin|mingw|cygwin/
        # Windows uses PATH to locate DLLs
        prepend_env('PATH', lib_dir, separator: ';')
        # Also check common locations for PDFium on Windows
        setup_windows_library_paths(lib_dir)
      end
    end

    # Adds Cargo build output directories to PATH when they exist.
    #
    # @param lib_dir [String] directory the relative target path is resolved from
    # @return [void]
    def setup_windows_library_paths(lib_dir)
      # Add target/release to PATH for DLL lookup during development
      target_release = File.expand_path('../../target/release', lib_dir)
      prepend_env('PATH', target_release, separator: ';') if Dir.exist?(target_release)

      # Check for short path CARGO_TARGET_DIR (CI uses C:\t)
      cargo_target_dir = ENV.fetch('CARGO_TARGET_DIR', nil)
      return unless cargo_target_dir

      target_release_alt = File.join(cargo_target_dir, 'release')
      prepend_env('PATH', target_release_alt, separator: ';') if Dir.exist?(target_release_alt)

      # Also check for target-specific subdirectory (Windows GNU builds)
      gnu_release = File.join(cargo_target_dir, 'x86_64-pc-windows-gnu', 'release')
      prepend_env('PATH', gnu_release, separator: ';') if Dir.exist?(gnu_release)
    end
    private_class_method :setup_windows_library_paths

    # Prepends +value+ to the environment variable +key+ without introducing
    # duplicates.
    #
    # +value+ may itself be a separator-joined list (the macOS fallback path
    # is). Each entry is compared individually against the entries already
    # present, and only the missing ones are prepended, in their original
    # order. Comparing the whole list against single entries would never
    # match, so repeated calls would keep growing the variable.
    #
    # @param key [String] environment variable name
    # @param value [String] one path, or several joined by +separator+
    # @param separator [String] list separator used by the variable
    # @return [String, nil] the new variable value, or nil when nothing was added
    def prepend_env(key, value, separator: ':')
      current = ENV.fetch(key, nil)
      existing = current.to_s.split(separator)
      missing = value.split(separator).reject { |entry| existing.include?(entry) }.uniq
      return if missing.empty?

      addition = missing.join(separator)
      ENV[key] = current.nil? || current.empty? ? addition : "#{addition}#{separator}#{current}"
    end
    private_class_method :prepend_env

    # Best-effort rewrite of the macOS bundle's reference to libpdfium so it
    # is resolved relative to the bundle itself. Missing tools or files are
    # deliberately ignored.
    #
    # @param lib_dir [String] directory expected to hold the bundle and dylib
    # @return [void]
    def fix_macos_install_name(lib_dir)
      bundle = macos_bundle(lib_dir)
      return unless bundle

      ensure_install_name(bundle)
      ensure_loader_rpath(bundle)
    rescue Errno::ENOENT, IOError # rubocop:disable Lint/SuppressedException
    end
    private_class_method :fix_macos_install_name

    # Returns the bundle path only when both the bundle and libpdfium.dylib
    # exist in +lib_dir+.
    #
    # @param lib_dir [String]
    # @return [String, nil]
    def macos_bundle(lib_dir)
      bundle = File.join(lib_dir, 'kreuzberg_rb.bundle')
      pdfium = File.join(lib_dir, 'libpdfium.dylib')
      return unless File.exist?(bundle) && File.exist?(pdfium)

      bundle
    end
    private_class_method :macos_bundle

    # Rewrites './' and '@rpath/' references to libpdfium in the bundle's
    # dependency list (as reported by `otool -L`) to '@loader_path/'.
    #
    # @param bundle [String] path to the bundle file
    # @return [void]
    def ensure_install_name(bundle)
      output, status = Open3.capture2('otool', '-L', bundle)
      return unless status.success?

      replacements = {
        './libpdfium.dylib' => '@loader_path/libpdfium.dylib',
        '@rpath/libpdfium.dylib' => '@loader_path/libpdfium.dylib'
      }

      replacements.each do |current, desired|
        next unless output.include?(current)

        # The tool's exit status is not inspected; this step is best-effort.
        Open3.capture2('install_name_tool', '-change', current, desired, bundle)
      end
    end
    private_class_method :ensure_install_name

    # Adds an '@loader_path' rpath to the bundle when the `otool -l` output
    # does not mention '@loader_path' anywhere.
    #
    # @param bundle [String] path to the bundle file
    # @return [void]
    def ensure_loader_rpath(bundle)
      rpath_output, rpath_status = Open3.capture2('otool', '-l', bundle)
      return unless rpath_status.success? && !rpath_output.include?('@loader_path')

      Open3.capture2('install_name_tool', '-add_rpath', '@loader_path', bundle)
    end
    private_class_method :ensure_loader_rpath
  end
end
@@ -0,0 +1,414 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'sorbet-runtime'
4
+
5
module Kreuzberg
  # Semantic element type classification.
  #
  # Categorizes text content into semantic units for downstream processing.
  # Supports the element types commonly found in Unstructured documents.
  #
  # Element types are carried as plain strings (see Element#element_type).
  # Sorbet type constructors accept types, not String literals, so the alias
  # resolves to String. The values listed in this file are:
  #
  #   title, narrative_text, heading, list_item, table, image, page_break,
  #   code_block, block_quote, footer, header
  #
  # @example
  #   element_type = 'title'
  #
  ElementType = T.type_alias { String }

  # Bounding box coordinates for element positioning.
  #
  # Represents rectangular coordinates for an element within a page.
  #
  # @example
  #   bbox = Kreuzberg::BoundingBox.new(
  #     x0: 10.0,
  #     y0: 20.0,
  #     x1: 100.0,
  #     y1: 50.0
  #   )
  #   puts "Width: #{bbox.x1 - bbox.x0}"
  #
  class BoundingBox < T::Struct
    extend T::Sig

    const :x0, Float
    const :y0, Float
    const :x1, Float
    const :y1, Float
  end

  # Metadata for a semantic element.
  #
  # Provides contextual information about an extracted element including
  # its position within the document and custom metadata fields.
  #
  # @example
  #   metadata = Kreuzberg::ElementMetadata.new(
  #     page_number: 1,
  #     filename: "document.pdf",
  #     coordinates: bbox,
  #     element_index: 5,
  #     additional: { "style" => "bold" }
  #   )
  #
  class ElementMetadata < T::Struct
    extend T::Sig

    const :page_number, T.nilable(Integer)
    const :filename, T.nilable(String)
    const :coordinates, T.nilable(BoundingBox)
    const :element_index, T.nilable(Integer)
    const :additional, T::Hash[String, String]
  end

  # Semantic element extracted from document.
  #
  # Represents a logical unit of content with semantic classification,
  # unique identifier, and metadata for tracking origin and position.
  # Compatible with Unstructured.io element format when output_format='element_based'.
  #
  # @example
  #   element = Kreuzberg::Element.new(
  #     element_id: "elem-abc123",
  #     element_type: "narrative_text",
  #     text: "This is the main content.",
  #     metadata: metadata
  #   )
  #   puts "#{element.element_type}: #{element.text}"
  #
  class Element < T::Struct
    extend T::Sig

    const :element_id, String
    const :element_type, String
    const :text, String
    const :metadata, ElementMetadata
  end

  # Header/Heading metadata
  #
  # Represents a heading element found in the HTML document
  #
  # @example
  #   header = Kreuzberg::HeaderMetadata.new(
  #     level: 1,
  #     text: "Main Title",
  #     id: "main-title",
  #     depth: 0,
  #     html_offset: 245
  #   )
  #   puts "#{header.text} (H#{header.level})"
  #
  class HeaderMetadata < T::Struct
    extend T::Sig

    const :level, Integer
    const :text, String
    const :id, T.nilable(String)
    const :depth, Integer
    const :html_offset, Integer
  end

  # Link metadata
  #
  # Represents a link element found in the HTML document
  #
  # @example
  #   link = Kreuzberg::LinkMetadata.new(
  #     href: "https://example.com",
  #     text: "Example",
  #     title: "Example Site",
  #     link_type: "external",
  #     rel: ["noopener", "noreferrer"],
  #     attributes: { "data-id" => "123" }
  #   )
  #   puts "#{link.text} -> #{link.href}"
  #
  class LinkMetadata < T::Struct
    extend T::Sig

    const :href, String
    const :text, String
    const :title, T.nilable(String)
    const :link_type, String
    const :rel, T::Array[String]
    const :attributes, T::Hash[String, String]
  end

  # Image metadata
  #
  # Represents an image element found in the HTML document
  #
  # @example
  #   image = Kreuzberg::ImageMetadata.new(
  #     src: "images/logo.png",
  #     alt: "Company Logo",
  #     title: nil,
  #     dimensions: [200, 100],
  #     image_type: "png",
  #     attributes: { "loading" => "lazy" }
  #   )
  #   if image.dimensions
  #     width, height = image.dimensions
  #     puts "#{width}x#{height}"
  #   end
  #
  class ImageMetadata < T::Struct
    extend T::Sig

    const :src, String
    const :alt, T.nilable(String)
    const :title, T.nilable(String)
    const :dimensions, T.nilable(T::Array[Integer])
    const :image_type, String
    const :attributes, T::Hash[String, String]
  end

  # Structured data metadata
  #
  # Represents structured data (JSON-LD, microdata, etc.) found in the HTML document
  #
  # @example
  #   structured = Kreuzberg::StructuredData.new(
  #     data_type: "json-ld",
  #     raw_json: '{"@context":"https://schema.org","@type":"Article",...}',
  #     schema_type: "Article"
  #   )
  #   data = JSON.parse(structured.raw_json)
  #   puts data['@type']
  #
  class StructuredData < T::Struct
    extend T::Sig

    const :data_type, String
    const :raw_json, String
    const :schema_type, T.nilable(String)
  end

  # HTML document metadata
  #
  # Groups document-level string fields, three string-to-string maps
  # (open_graph, twitter_card, meta_tags) and the collected HeaderMetadata,
  # LinkMetadata, ImageMetadata and StructuredData entries.
  #
  class HtmlMetadata < T::Struct
    extend T::Sig

    const :title, T.nilable(String)
    const :description, T.nilable(String)
    const :author, T.nilable(String)
    const :copyright, T.nilable(String)
    const :keywords, T::Array[String]
    const :canonical_url, T.nilable(String)
    const :language, T.nilable(String)
    const :text_direction, T.nilable(String)
    const :mime_type, T.nilable(String)
    const :charset, T.nilable(String)
    const :generator, T.nilable(String)
    const :viewport, T.nilable(String)
    const :theme_color, T.nilable(String)
    const :application_name, T.nilable(String)
    const :robots, T.nilable(String)
    const :open_graph, T::Hash[String, String]
    const :twitter_card, T::Hash[String, String]
    const :meta_tags, T::Hash[String, String]
    const :headers, T::Array[HeaderMetadata]
    const :links, T::Array[LinkMetadata]
    const :images, T::Array[ImageMetadata]
    const :structured_data, T::Array[StructuredData]
  end

  # Extracted keyword with relevance metadata.
  #
  # Represents a single keyword extracted from text along with its relevance score,
  # the algorithm that extracted it, and optional position information.
  #
  # @example
  #   keyword = Kreuzberg::ExtractedKeyword.new(
  #     text: "machine learning",
  #     score: 0.95,
  #     algorithm: "yake",
  #     positions: [42, 128]
  #   )
  #   puts "#{keyword.text}: #{keyword.score}"
  #
  class ExtractedKeyword < T::Struct
    extend T::Sig

    const :text, String
    const :score, Float
    const :algorithm, String
    const :positions, T.nilable(T::Array[Integer])
  end

  # Processing warning from a pipeline stage.
  #
  # Represents a non-fatal warning generated during document processing.
  #
  # @example
  #   warning = Kreuzberg::ProcessingWarning.new(
  #     source: "ocr",
  #     message: "Low confidence on page 3"
  #   )
  #   puts "[#{warning.source}] #{warning.message}"
  #
  class ProcessingWarning < T::Struct
    extend T::Sig

    const :source, String
    const :message, String
  end

  # Bounding box for document node positioning.
  #
  # Represents rectangular coordinates for a node within the document.
  #
  # @example
  #   bbox = Kreuzberg::DocumentBoundingBox.new(
  #     x0: 10.0,
  #     y0: 20.0,
  #     x1: 100.0,
  #     y1: 50.0
  #   )
  #
  class DocumentBoundingBox < T::Struct
    extend T::Sig

    const :x0, Float
    const :y0, Float
    const :x1, Float
    const :y1, Float
  end

  # Annotation for a document node.
  #
  # Provides additional metadata about document node content.
  #
  class DocumentAnnotation < T::Struct
    extend T::Sig

    const :key, String
    const :value, String
  end

  # Single node in the document structure tree.
  #
  # Represents a logical unit of content with deterministic ID, content,
  # tree structure information, and metadata.
  #
  # @example
  #   node = Kreuzberg::DocumentNode.new(
  #     id: "node-abc123",
  #     content: "This is the content",
  #     parent: nil,
  #     children: [],
  #     content_layer: "body",
  #     page: 1,
  #     page_end: 1,
  #     bbox: bbox,
  #     annotations: []
  #   )
  #
  class DocumentNode < T::Struct
    extend T::Sig

    const :id, String
    const :content, String
    const :parent, T.nilable(Integer)
    const :children, T::Array[Integer]
    const :content_layer, String
    const :page, T.nilable(Integer)
    const :page_end, T.nilable(Integer)
    const :bbox, T.nilable(DocumentBoundingBox)
    const :annotations, T::Array[DocumentAnnotation]
  end

  # Structured document representation.
  #
  # Provides a hierarchical, tree-based representation of document content
  # using a flat array of nodes with index-based parent/child references.
  #
  # @example
  #   structure = Kreuzberg::DocumentStructure.new(
  #     nodes: [node1, node2, node3]
  #   )
  #   structure.nodes.each do |node|
  #     puts "#{node.id}: #{node.content}"
  #   end
  #
  class DocumentStructure < T::Struct
    extend T::Sig

    const :nodes, T::Array[DocumentNode]
  end
end
@@ -0,0 +1,16 @@
1
+ # frozen_string_literal: true
2
+
3
module Kreuzberg
  # Interface for validator objects: anything that responds to
  # +call(result)+. Including this module supplies a default +call+ that
  # raises until the including class overrides it.
  #
  # @example Implementing a validator
  #   class MyValidator
  #     include Kreuzberg::ValidatorProtocol
  #
  #     def call(result)
  #       # NOTE(review): inspect the result hash here — confirm its keys
  #       # against the extraction result schema before relying on them.
  #       raise Kreuzberg::Errors::ValidationError, 'rejected' if result.empty?
  #     end
  #   end
  module ValidatorProtocol
    # Validates an extraction result. Implementations signal failure by
    # raising; the default implementation always raises NotImplementedError.
    #
    # @param result [Hash] extraction result to validate
    # @return [void]
    # @raise [Kreuzberg::Errors::ValidationError] if validation fails
    # @raise [NotImplementedError] when the including class does not override #call
    def call(result)
      message = "#{self.class} must implement #call(result)"
      raise NotImplementedError, message
    end
  end
end
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
module Kreuzberg
  # Version string of this gem release.
  VERSION = '4.3.5'
end
data/lib/kreuzberg.rb ADDED
@@ -0,0 +1,102 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'kreuzberg/setup_lib_path'
4
+ Kreuzberg::SetupLibPath.configure
5
+
6
+ require_relative 'kreuzberg/version'
7
+ require 'kreuzberg_rb'
8
+
9
# Kreuzberg is a Ruby binding for the Rust core library providing document extraction,
# text extraction, and OCR capabilities.
module Kreuzberg
  # Ruby-side components are loaded lazily on first constant reference.
  autoload :Config, 'kreuzberg/config'
  autoload :Result, 'kreuzberg/result'
  autoload :CLI, 'kreuzberg/cli'
  autoload :CLIProxy, 'kreuzberg/cli_proxy'
  autoload :APIProxy, 'kreuzberg/api_proxy'
  autoload :MCPProxy, 'kreuzberg/mcp_proxy'
  autoload :Errors, 'kreuzberg/errors'
  autoload :ErrorContext, 'kreuzberg/error_context'
  autoload :PostProcessorProtocol, 'kreuzberg/post_processor_protocol'
  autoload :ValidatorProtocol, 'kreuzberg/validator_protocol'
  autoload :OcrBackendProtocol, 'kreuzberg/ocr_backend_protocol'

  # Every public constant defined in kreuzberg/types is registered here.
  # ElementType is included so that referencing Kreuzberg::ElementType loads
  # the file instead of raising NameError.
  autoload :ElementType, 'kreuzberg/types'
  autoload :BoundingBox, 'kreuzberg/types'
  autoload :ElementMetadata, 'kreuzberg/types'
  autoload :Element, 'kreuzberg/types'
  autoload :HtmlMetadata, 'kreuzberg/types'
  autoload :HeaderMetadata, 'kreuzberg/types'
  autoload :LinkMetadata, 'kreuzberg/types'
  autoload :ImageMetadata, 'kreuzberg/types'
  autoload :StructuredData, 'kreuzberg/types'
  autoload :ExtractedKeyword, 'kreuzberg/types'
  autoload :ProcessingWarning, 'kreuzberg/types'
  autoload :DocumentBoundingBox, 'kreuzberg/types'
  autoload :DocumentAnnotation, 'kreuzberg/types'
  autoload :DocumentNode, 'kreuzberg/types'
  autoload :DocumentStructure, 'kreuzberg/types'

  # Top-level shortcuts for two Config classes. Referencing Config here
  # triggers its autoload immediately.
  ExtractionConfig = Config::Extraction
  PageConfig = Config::PageConfig

  # Symbols identifying the supported keyword extraction algorithms.
  module KeywordAlgorithm
    YAKE = :yake
    RAKE = :rake
  end

  # NOTE(review): presumably maintained by the cache wrappers loaded after
  # this module — confirm against kreuzberg/cache_api.
  @__cache_tracker = { entries: 0, bytes: 0 }

  class << self
    # Keep the existing singleton methods reachable under native_* names.
    # NOTE(review): presumably so the modules prepended to the singleton
    # class later in this file can delegate to them — confirm there.
    alias native_extract_file_sync extract_file_sync
    alias native_extract_bytes_sync extract_bytes_sync
    alias native_batch_extract_files_sync batch_extract_files_sync
    alias native_extract_file extract_file
    alias native_extract_bytes extract_bytes
    alias native_batch_extract_files batch_extract_files
    alias native_batch_extract_bytes_sync batch_extract_bytes_sync
    alias native_batch_extract_bytes batch_extract_bytes
    alias native_clear_cache clear_cache
    alias native_cache_stats cache_stats

    # The extraction aliases are private. The two cache aliases are left
    # public, as in the original.
    private :native_extract_file_sync, :native_extract_bytes_sync, :native_batch_extract_files_sync
    private :native_extract_file, :native_extract_bytes, :native_batch_extract_files
    private :native_batch_extract_bytes_sync, :native_batch_extract_bytes
  end

  # Expose methods defined outside this file as module functions.
  # NOTE(review): presumably defined by the native extension — confirm.
  module_function :register_post_processor
  module_function :unregister_post_processor
  module_function :clear_post_processors
  module_function :register_validator
  module_function :unregister_validator
  module_function :clear_validators
  module_function :list_validators
  module_function :list_post_processors
  module_function :register_ocr_backend
  module_function :unregister_ocr_backend
  module_function :list_ocr_backends
  module_function :detect_mime_type
  module_function :detect_mime_type_from_path
  module_function :validate_mime_type
  module_function :get_extensions_for_mime
end
96
+
97
+ require_relative 'kreuzberg/cache_api'
98
+ require_relative 'kreuzberg/extraction_api'
99
+ require_relative 'kreuzberg/djot_content'
100
+
101
+ Kreuzberg.singleton_class.prepend(Kreuzberg::CacheAPI)
102
+ Kreuzberg.singleton_class.prepend(Kreuzberg::ExtractionAPI)
Binary file
data/lib/libpdfium.so ADDED
Binary file