kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -3
- data/README.md +15 -9
- data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
- data/ext/kreuzberg_rb/native/Cargo.lock +516 -324
- data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
- data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
- data/kreuzberg.gemspec +38 -4
- data/lib/kreuzberg/config.rb +34 -1
- data/lib/kreuzberg/result.rb +77 -14
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +23 -6
- data/vendor/kreuzberg/Cargo.toml +25 -11
- data/vendor/kreuzberg/README.md +13 -8
- data/vendor/kreuzberg/build.rs +17 -6
- data/vendor/kreuzberg/src/api/mod.rs +2 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
- data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
- data/vendor/kreuzberg/src/core/config.rs +49 -1
- data/vendor/kreuzberg/src/core/extractor.rs +134 -2
- data/vendor/kreuzberg/src/core/mod.rs +4 -2
- data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
- data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
- data/vendor/kreuzberg/src/extraction/html.rs +24 -8
- data/vendor/kreuzberg/src/extraction/image.rs +124 -1
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
- data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
- data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
- data/vendor/kreuzberg/src/extractors/email.rs +29 -15
- data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
- data/vendor/kreuzberg/src/extractors/html.rs +29 -15
- data/vendor/kreuzberg/src/extractors/image.rs +25 -4
- data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
- data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
- data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
- data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +194 -17
- data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
- data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +7 -2
- data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
- data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
- data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
- data/vendor/kreuzberg/src/lib.rs +10 -2
- data/vendor/kreuzberg/src/mcp/mod.rs +2 -0
- data/vendor/kreuzberg/src/mcp/server.rs +14 -12
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
- data/vendor/kreuzberg/src/pdf/error.rs +8 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
- data/vendor/kreuzberg/src/pdf/mod.rs +14 -2
- data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
- data/vendor/kreuzberg/src/pdf/table.rs +26 -2
- data/vendor/kreuzberg/src/pdf/text.rs +89 -7
- data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
- data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
- data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
- data/vendor/kreuzberg/src/text/mod.rs +6 -0
- data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
- data/vendor/kreuzberg/src/types.rs +173 -21
- data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
- data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
- data/vendor/kreuzberg/tests/config_features.rs +15 -1
- data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/email_integration.rs +2 -0
- data/vendor/kreuzberg/tests/error_handling.rs +43 -34
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/image_integration.rs +2 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
- data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
- data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
- data/vendor/kreuzberg/tests/security_validation.rs +1 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
- data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
- data/vendor/rb-sys/Cargo.lock +15 -15
- data/vendor/rb-sys/Cargo.toml +4 -4
- data/vendor/rb-sys/Cargo.toml.orig +4 -4
- data/vendor/rb-sys/bin/release.sh +9 -8
- data/vendor/rb-sys/build/features.rs +5 -2
- data/vendor/rb-sys/build/main.rs +55 -15
- data/vendor/rb-sys/build/stable_api_config.rs +4 -2
- data/vendor/rb-sys/build/version.rs +3 -1
- data/vendor/rb-sys/src/macros.rs +2 -2
- data/vendor/rb-sys/src/special_consts.rs +1 -1
- data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
- data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
- data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
- data/vendor/rb-sys/src/stable_api.rs +0 -1
- data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
- metadata +11 -10
- data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
- data/vendor/rb-sys/.cargo-ok +0 -1
- data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
use super::StableApiDefinition;
|
|
2
2
|
use crate::{
|
|
3
|
-
|
|
3
|
+
debug_ruby_assert_type,
|
|
4
4
|
internal::{RArray, RString, RTypedData},
|
|
5
5
|
ruby_value_type::RUBY_T_DATA,
|
|
6
|
-
value_type,
|
|
6
|
+
value_type, VALUE,
|
|
7
7
|
};
|
|
8
8
|
use std::{
|
|
9
9
|
ffi::c_void,
|
|
@@ -307,7 +307,11 @@ impl StableApiDefinition for Definition {
|
|
|
307
307
|
|
|
308
308
|
#[inline]
|
|
309
309
|
unsafe fn rtypeddata_type(&self, obj: VALUE) -> *const crate::rb_data_type_t {
|
|
310
|
-
debug_ruby_assert_type!(
|
|
310
|
+
debug_ruby_assert_type!(
|
|
311
|
+
obj,
|
|
312
|
+
RUBY_T_DATA,
|
|
313
|
+
"rtypeddata_type called on non-T_DATA object"
|
|
314
|
+
);
|
|
311
315
|
|
|
312
316
|
let rdata = obj as *const RTypedData;
|
|
313
317
|
(*rdata).type_
|
|
@@ -315,7 +319,11 @@ impl StableApiDefinition for Definition {
|
|
|
315
319
|
|
|
316
320
|
#[inline]
|
|
317
321
|
unsafe fn rtypeddata_get_data(&self, obj: VALUE) -> *mut c_void {
|
|
318
|
-
debug_ruby_assert_type!(
|
|
322
|
+
debug_ruby_assert_type!(
|
|
323
|
+
obj,
|
|
324
|
+
RUBY_T_DATA,
|
|
325
|
+
"rtypeddata_get_data called on non-T_DATA object"
|
|
326
|
+
);
|
|
319
327
|
|
|
320
328
|
// For Ruby 3.0 and lower, simply return the data field
|
|
321
329
|
let rdata = obj as *const RTypedData;
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
use super::StableApiDefinition;
|
|
2
2
|
use crate::{
|
|
3
|
-
|
|
3
|
+
debug_ruby_assert_type,
|
|
4
4
|
internal::{RArray, RString, RTypedData},
|
|
5
5
|
ruby_value_type::RUBY_T_DATA,
|
|
6
|
-
value_type,
|
|
6
|
+
value_type, VALUE,
|
|
7
7
|
};
|
|
8
8
|
use std::{
|
|
9
9
|
ffi::c_void,
|
|
@@ -300,7 +300,11 @@ impl StableApiDefinition for Definition {
|
|
|
300
300
|
|
|
301
301
|
#[inline]
|
|
302
302
|
unsafe fn rtypeddata_type(&self, obj: VALUE) -> *const crate::rb_data_type_t {
|
|
303
|
-
debug_ruby_assert_type!(
|
|
303
|
+
debug_ruby_assert_type!(
|
|
304
|
+
obj,
|
|
305
|
+
RUBY_T_DATA,
|
|
306
|
+
"rtypeddata_type called on non-T_DATA object"
|
|
307
|
+
);
|
|
304
308
|
|
|
305
309
|
let rdata = obj as *const RTypedData;
|
|
306
310
|
(*rdata).type_
|
|
@@ -308,7 +312,11 @@ impl StableApiDefinition for Definition {
|
|
|
308
312
|
|
|
309
313
|
#[inline]
|
|
310
314
|
unsafe fn rtypeddata_get_data(&self, obj: VALUE) -> *mut c_void {
|
|
311
|
-
debug_ruby_assert_type!(
|
|
315
|
+
debug_ruby_assert_type!(
|
|
316
|
+
obj,
|
|
317
|
+
RUBY_T_DATA,
|
|
318
|
+
"rtypeddata_get_data called on non-T_DATA object"
|
|
319
|
+
);
|
|
312
320
|
|
|
313
321
|
// For Ruby 3.1 and lower, simply return the data field
|
|
314
322
|
let rdata = obj as *const RTypedData;
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
use super::StableApiDefinition;
|
|
2
2
|
use crate::{
|
|
3
|
-
|
|
3
|
+
debug_ruby_assert_type,
|
|
4
4
|
internal::{RArray, RString, RTypedData},
|
|
5
5
|
ruby_value_type::RUBY_T_DATA,
|
|
6
|
-
value_type,
|
|
6
|
+
value_type, VALUE,
|
|
7
7
|
};
|
|
8
8
|
use std::{
|
|
9
9
|
ffi::c_void,
|
|
@@ -298,7 +298,11 @@ impl StableApiDefinition for Definition {
|
|
|
298
298
|
|
|
299
299
|
#[inline]
|
|
300
300
|
unsafe fn rtypeddata_type(&self, obj: VALUE) -> *const crate::rb_data_type_t {
|
|
301
|
-
debug_ruby_assert_type!(
|
|
301
|
+
debug_ruby_assert_type!(
|
|
302
|
+
obj,
|
|
303
|
+
RUBY_T_DATA,
|
|
304
|
+
"rtypeddata_type called on non-T_DATA object"
|
|
305
|
+
);
|
|
302
306
|
|
|
303
307
|
let rdata = obj as *const RTypedData;
|
|
304
308
|
(*rdata).type_
|
|
@@ -306,7 +310,11 @@ impl StableApiDefinition for Definition {
|
|
|
306
310
|
|
|
307
311
|
#[inline]
|
|
308
312
|
unsafe fn rtypeddata_get_data(&self, obj: VALUE) -> *mut c_void {
|
|
309
|
-
debug_ruby_assert_type!(
|
|
313
|
+
debug_ruby_assert_type!(
|
|
314
|
+
obj,
|
|
315
|
+
RUBY_T_DATA,
|
|
316
|
+
"rtypeddata_get_data called on non-T_DATA object"
|
|
317
|
+
);
|
|
310
318
|
|
|
311
319
|
// For Ruby 3.2 and lower, simply return the data field
|
|
312
320
|
let rdata = obj as *const RTypedData;
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
use super::StableApiDefinition;
|
|
2
2
|
use crate::{
|
|
3
|
-
|
|
3
|
+
debug_ruby_assert_type,
|
|
4
4
|
internal::{RArray, RString},
|
|
5
5
|
ruby_value_type::RUBY_T_DATA,
|
|
6
|
-
value_type,
|
|
6
|
+
value_type, VALUE,
|
|
7
7
|
};
|
|
8
8
|
use std::{
|
|
9
9
|
os::raw::{c_char, c_long},
|
|
@@ -284,7 +284,11 @@ impl StableApiDefinition for Definition {
|
|
|
284
284
|
|
|
285
285
|
#[inline]
|
|
286
286
|
unsafe fn rtypeddata_embedded_p(&self, obj: VALUE) -> bool {
|
|
287
|
-
debug_ruby_assert_type!(
|
|
287
|
+
debug_ruby_assert_type!(
|
|
288
|
+
obj,
|
|
289
|
+
RUBY_T_DATA,
|
|
290
|
+
"rtypeddata_embedded_p called on non-T_DATA object"
|
|
291
|
+
);
|
|
288
292
|
|
|
289
293
|
let rdata = obj as *const crate::internal::RTypedData;
|
|
290
294
|
let typed_flag = (*rdata).typed_flag;
|
|
@@ -298,7 +302,11 @@ impl StableApiDefinition for Definition {
|
|
|
298
302
|
|
|
299
303
|
#[inline]
|
|
300
304
|
unsafe fn rtypeddata_type(&self, obj: VALUE) -> *const crate::rb_data_type_t {
|
|
301
|
-
debug_ruby_assert_type!(
|
|
305
|
+
debug_ruby_assert_type!(
|
|
306
|
+
obj,
|
|
307
|
+
RUBY_T_DATA,
|
|
308
|
+
"rtypeddata_type called on non-T_DATA object"
|
|
309
|
+
);
|
|
302
310
|
|
|
303
311
|
let rdata = obj as *const crate::internal::RTypedData;
|
|
304
312
|
(*rdata).type_
|
|
@@ -306,14 +314,19 @@ impl StableApiDefinition for Definition {
|
|
|
306
314
|
|
|
307
315
|
#[inline]
|
|
308
316
|
unsafe fn rtypeddata_get_data(&self, obj: VALUE) -> *mut std::ffi::c_void {
|
|
309
|
-
debug_ruby_assert_type!(
|
|
317
|
+
debug_ruby_assert_type!(
|
|
318
|
+
obj,
|
|
319
|
+
RUBY_T_DATA,
|
|
320
|
+
"rtypeddata_get_data called on non-T_DATA object"
|
|
321
|
+
);
|
|
310
322
|
|
|
311
323
|
if self.rtypeddata_embedded_p(obj) {
|
|
312
324
|
// For embedded data, calculate pointer based on struct layout
|
|
313
325
|
// The formula matches Ruby's implementation:
|
|
314
326
|
// embedded_typed_data_size = sizeof(RTypedData) - sizeof(void *)
|
|
315
327
|
const EMBEDDED_TYPED_DATA_SIZE: usize =
|
|
316
|
-
std::mem::size_of::<crate::internal::RTypedData>()
|
|
328
|
+
std::mem::size_of::<crate::internal::RTypedData>()
|
|
329
|
+
- std::mem::size_of::<*mut std::ffi::c_void>();
|
|
317
330
|
|
|
318
331
|
// Return address after the header as the data pointer
|
|
319
332
|
(obj as *mut u8).add(EMBEDDED_TYPED_DATA_SIZE) as *mut std::ffi::c_void
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
use super::StableApiDefinition;
|
|
2
2
|
use crate::{
|
|
3
|
-
|
|
3
|
+
debug_ruby_assert_type,
|
|
4
4
|
internal::{RArray, RString, RTypedData},
|
|
5
5
|
ruby_value_type::RUBY_T_DATA,
|
|
6
|
-
value_type,
|
|
6
|
+
value_type, VALUE,
|
|
7
7
|
};
|
|
8
8
|
use std::{
|
|
9
9
|
ffi::c_void,
|
|
@@ -285,7 +285,11 @@ impl StableApiDefinition for Definition {
|
|
|
285
285
|
|
|
286
286
|
#[inline]
|
|
287
287
|
unsafe fn rtypeddata_embedded_p(&self, obj: VALUE) -> bool {
|
|
288
|
-
debug_ruby_assert_type!(
|
|
288
|
+
debug_ruby_assert_type!(
|
|
289
|
+
obj,
|
|
290
|
+
RUBY_T_DATA,
|
|
291
|
+
"rtypeddata_embedded_p called on non-T_DATA object"
|
|
292
|
+
);
|
|
289
293
|
|
|
290
294
|
let rdata = obj as *const RTypedData;
|
|
291
295
|
let typed_flag = (*rdata).typed_flag;
|
|
@@ -299,7 +303,11 @@ impl StableApiDefinition for Definition {
|
|
|
299
303
|
|
|
300
304
|
#[inline]
|
|
301
305
|
unsafe fn rtypeddata_type(&self, obj: VALUE) -> *const crate::rb_data_type_t {
|
|
302
|
-
debug_ruby_assert_type!(
|
|
306
|
+
debug_ruby_assert_type!(
|
|
307
|
+
obj,
|
|
308
|
+
RUBY_T_DATA,
|
|
309
|
+
"rtypeddata_type called on non-T_DATA object"
|
|
310
|
+
);
|
|
303
311
|
|
|
304
312
|
let rdata = obj as *const RTypedData;
|
|
305
313
|
(*rdata).type_
|
|
@@ -307,7 +315,11 @@ impl StableApiDefinition for Definition {
|
|
|
307
315
|
|
|
308
316
|
#[inline]
|
|
309
317
|
unsafe fn rtypeddata_get_data(&self, obj: VALUE) -> *mut c_void {
|
|
310
|
-
debug_ruby_assert_type!(
|
|
318
|
+
debug_ruby_assert_type!(
|
|
319
|
+
obj,
|
|
320
|
+
RUBY_T_DATA,
|
|
321
|
+
"rtypeddata_get_data called on non-T_DATA object"
|
|
322
|
+
);
|
|
311
323
|
|
|
312
324
|
if self.rtypeddata_embedded_p(obj) {
|
|
313
325
|
// For embedded data, calculate pointer based on struct layout
|
|
@@ -226,7 +226,6 @@ mod compiled;
|
|
|
226
226
|
use compiled as api;
|
|
227
227
|
|
|
228
228
|
#[cfg(stable_api_include_rust_impl)]
|
|
229
|
-
#[cfg_attr(ruby_eq_2_6, path = "stable_api/ruby_2_6.rs")]
|
|
230
229
|
#[cfg_attr(ruby_eq_2_7, path = "stable_api/ruby_2_7.rs")]
|
|
231
230
|
#[cfg_attr(ruby_eq_3_0, path = "stable_api/ruby_3_0.rs")]
|
|
232
231
|
#[cfg_attr(ruby_eq_3_1, path = "stable_api/ruby_3_1.rs")]
|
|
@@ -3,8 +3,8 @@
|
|
|
3
3
|
use std::{
|
|
4
4
|
fmt::Formatter,
|
|
5
5
|
sync::{
|
|
6
|
-
Arc,
|
|
7
6
|
atomic::{AtomicIsize, Ordering},
|
|
7
|
+
Arc,
|
|
8
8
|
},
|
|
9
9
|
};
|
|
10
10
|
|
|
@@ -59,13 +59,11 @@ mod mri {
|
|
|
59
59
|
|
|
60
60
|
unsafe {
|
|
61
61
|
if is_ruby_vm_started() {
|
|
62
|
-
// On Windows, ssize_t is i32 even on 64-bit, so cast i64 to i32
|
|
63
62
|
#[cfg(all(target_pointer_width = "64", target_os = "windows"))]
|
|
64
63
|
rb_gc_adjust_memory_usage(delta as i32);
|
|
65
64
|
|
|
66
65
|
#[cfg(not(all(target_pointer_width = "64", target_os = "windows")))]
|
|
67
66
|
rb_gc_adjust_memory_usage(delta);
|
|
68
|
-
|
|
69
67
|
delta as isize
|
|
70
68
|
} else {
|
|
71
69
|
0
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: kreuzberg
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 4.0.0.pre.rc.6
|
|
4
|
+
version: 4.0.0.pre.rc.7
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Na'aman Hirschfeld
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exe
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-12-
|
|
11
|
+
date: 2025-12-13 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|
|
@@ -186,6 +186,7 @@ files:
|
|
|
186
186
|
- Steepfile
|
|
187
187
|
- examples/async_patterns.rb
|
|
188
188
|
- ext/kreuzberg_rb/extconf.rb
|
|
189
|
+
- ext/kreuzberg_rb/native/.cargo/config.toml
|
|
189
190
|
- ext/kreuzberg_rb/native/Cargo.lock
|
|
190
191
|
- ext/kreuzberg_rb/native/Cargo.toml
|
|
191
192
|
- ext/kreuzberg_rb/native/README.md
|
|
@@ -241,6 +242,7 @@ files:
|
|
|
241
242
|
- vendor/kreuzberg/src/api/types.rs
|
|
242
243
|
- vendor/kreuzberg/src/cache/mod.rs
|
|
243
244
|
- vendor/kreuzberg/src/chunking/mod.rs
|
|
245
|
+
- vendor/kreuzberg/src/chunking/processor.rs
|
|
244
246
|
- vendor/kreuzberg/src/core/batch_mode.rs
|
|
245
247
|
- vendor/kreuzberg/src/core/config.rs
|
|
246
248
|
- vendor/kreuzberg/src/core/extractor.rs
|
|
@@ -277,7 +279,6 @@ files:
|
|
|
277
279
|
- vendor/kreuzberg/src/extractors/epub.rs
|
|
278
280
|
- vendor/kreuzberg/src/extractors/excel.rs
|
|
279
281
|
- vendor/kreuzberg/src/extractors/fictionbook.rs
|
|
280
|
-
- vendor/kreuzberg/src/extractors/fictionbook.rs.backup2
|
|
281
282
|
- vendor/kreuzberg/src/extractors/html.rs
|
|
282
283
|
- vendor/kreuzberg/src/extractors/image.rs
|
|
283
284
|
- vendor/kreuzberg/src/extractors/jats.rs
|
|
@@ -309,6 +310,7 @@ files:
|
|
|
309
310
|
- vendor/kreuzberg/src/keywords/types.rs
|
|
310
311
|
- vendor/kreuzberg/src/keywords/yake.rs
|
|
311
312
|
- vendor/kreuzberg/src/language_detection/mod.rs
|
|
313
|
+
- vendor/kreuzberg/src/language_detection/processor.rs
|
|
312
314
|
- vendor/kreuzberg/src/lib.rs
|
|
313
315
|
- vendor/kreuzberg/src/mcp/mod.rs
|
|
314
316
|
- vendor/kreuzberg/src/mcp/server.rs
|
|
@@ -341,6 +343,7 @@ files:
|
|
|
341
343
|
- vendor/kreuzberg/src/stopwords/mod.rs
|
|
342
344
|
- vendor/kreuzberg/src/text/mod.rs
|
|
343
345
|
- vendor/kreuzberg/src/text/quality.rs
|
|
346
|
+
- vendor/kreuzberg/src/text/quality_processor.rs
|
|
344
347
|
- vendor/kreuzberg/src/text/string_utils.rs
|
|
345
348
|
- vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs
|
|
346
349
|
- vendor/kreuzberg/src/text/token_reduction/config.rs
|
|
@@ -469,7 +472,6 @@ files:
|
|
|
469
472
|
- vendor/kreuzberg/tests/typst_behavioral_tests.rs
|
|
470
473
|
- vendor/kreuzberg/tests/typst_extractor_tests.rs
|
|
471
474
|
- vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs
|
|
472
|
-
- vendor/rb-sys/.cargo-ok
|
|
473
475
|
- vendor/rb-sys/.cargo_vcs_info.json
|
|
474
476
|
- vendor/rb-sys/Cargo.lock
|
|
475
477
|
- vendor/rb-sys/Cargo.toml
|
|
@@ -492,7 +494,6 @@ files:
|
|
|
492
494
|
- vendor/rb-sys/src/stable_api.rs
|
|
493
495
|
- vendor/rb-sys/src/stable_api/compiled.c
|
|
494
496
|
- vendor/rb-sys/src/stable_api/compiled.rs
|
|
495
|
-
- vendor/rb-sys/src/stable_api/ruby_2_6.rs
|
|
496
497
|
- vendor/rb-sys/src/stable_api/ruby_2_7.rs
|
|
497
498
|
- vendor/rb-sys/src/stable_api/ruby_3_0.rs
|
|
498
499
|
- vendor/rb-sys/src/stable_api/ruby_3_1.rs
|
|
@@ -507,13 +508,13 @@ homepage: https://github.com/kreuzberg-dev/kreuzberg
|
|
|
507
508
|
licenses:
|
|
508
509
|
- MIT
|
|
509
510
|
metadata:
|
|
510
|
-
|
|
511
|
+
homepage_uri: https://github.com/kreuzberg-dev/kreuzberg
|
|
512
|
+
source_code_uri: https://github.com/kreuzberg-dev/kreuzberg
|
|
511
513
|
changelog_uri: https://github.com/kreuzberg-dev/kreuzberg/blob/main/CHANGELOG.md
|
|
512
514
|
documentation_uri: https://docs.kreuzberg.dev
|
|
513
|
-
|
|
514
|
-
keywords: document-intelligence,document-extraction,ocr,rust,bindings
|
|
515
|
+
bug_tracker_uri: https://github.com/kreuzberg-dev/kreuzberg/issues
|
|
515
516
|
rubygems_mfa_required: 'true'
|
|
516
|
-
|
|
517
|
+
keywords: document-intelligence,document-extraction,ocr,rust,bindings
|
|
517
518
|
post_install_message:
|
|
518
519
|
rdoc_options: []
|
|
519
520
|
require_paths:
|
|
@@ -529,7 +530,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
529
530
|
- !ruby/object:Gem::Version
|
|
530
531
|
version: '0'
|
|
531
532
|
requirements: []
|
|
532
|
-
rubygems_version: 3.
|
|
533
|
+
rubygems_version: 3.5.22
|
|
533
534
|
signing_key:
|
|
534
535
|
specification_version: 4
|
|
535
536
|
summary: High-performance document intelligence framework
|