kreuzberg 4.2.6 → 4.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +7 -4
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +36 -9
  5. data/ext/kreuzberg_rb/native/Cargo.toml +32 -0
  6. data/ext/kreuzberg_rb/native/src/config/types.rs +4 -2
  7. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -1
  8. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +1 -1
  9. data/ext/kreuzberg_rb/native/src/result.rs +5 -3
  10. data/lib/kreuzberg/version.rb +1 -1
  11. data/sig/kreuzberg.rbs +228 -37
  12. data/spec/binding/batch_operations_spec.rb +2 -0
  13. data/vendor/Cargo.toml +3 -2
  14. data/vendor/kreuzberg/Cargo.toml +2 -1
  15. data/vendor/kreuzberg/README.md +1 -1
  16. data/vendor/kreuzberg/src/api/error.rs +29 -1
  17. data/vendor/kreuzberg/src/api/handlers.rs +28 -25
  18. data/vendor/kreuzberg/src/api/openapi.rs +14 -1
  19. data/vendor/kreuzberg/src/chunking/config.rs +2 -37
  20. data/vendor/kreuzberg/src/chunking/core.rs +78 -2
  21. data/vendor/kreuzberg/src/chunking/mod.rs +1 -1
  22. data/vendor/kreuzberg/src/chunking/processor.rs +15 -17
  23. data/vendor/kreuzberg/src/core/config/extraction/env.rs +13 -9
  24. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +12 -12
  25. data/vendor/kreuzberg/src/core/config/mod.rs +1 -1
  26. data/vendor/kreuzberg/src/core/config/processing.rs +65 -8
  27. data/vendor/kreuzberg/src/core/config_validation/mod.rs +8 -0
  28. data/vendor/kreuzberg/src/core/config_validation/sections.rs +5 -0
  29. data/vendor/kreuzberg/src/core/extractor/batch.rs +9 -9
  30. data/vendor/kreuzberg/src/core/extractor/file.rs +4 -2
  31. data/vendor/kreuzberg/src/core/extractor/legacy.rs +7 -7
  32. data/vendor/kreuzberg/src/core/extractor/sync.rs +3 -3
  33. data/vendor/kreuzberg/src/core/pipeline/execution.rs +2 -1
  34. data/vendor/kreuzberg/src/core/pipeline/features.rs +16 -22
  35. data/vendor/kreuzberg/src/core/pipeline/format.rs +20 -18
  36. data/vendor/kreuzberg/src/core/pipeline/tests.rs +40 -35
  37. data/vendor/kreuzberg/src/extraction/email.rs +31 -19
  38. data/vendor/kreuzberg/src/extraction/excel.rs +6 -5
  39. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +6 -1
  40. data/vendor/kreuzberg/src/extraction/html/types.rs +4 -3
  41. data/vendor/kreuzberg/src/extraction/libreoffice.rs +10 -9
  42. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +10 -8
  43. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +8 -4
  44. data/vendor/kreuzberg/src/extraction/structured.rs +5 -4
  45. data/vendor/kreuzberg/src/extraction/transform/content.rs +1 -1
  46. data/vendor/kreuzberg/src/extraction/transform/mod.rs +10 -7
  47. data/vendor/kreuzberg/src/extractors/archive.rs +7 -5
  48. data/vendor/kreuzberg/src/extractors/bibtex.rs +34 -17
  49. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +7 -10
  50. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +4 -2
  51. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +3 -2
  52. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +1 -1
  53. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +2 -4
  54. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -1
  55. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +4 -5
  56. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +1 -1
  57. data/vendor/kreuzberg/src/extractors/docbook.rs +1 -1
  58. data/vendor/kreuzberg/src/extractors/docx.rs +32 -24
  59. data/vendor/kreuzberg/src/extractors/email.rs +5 -3
  60. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +10 -10
  61. data/vendor/kreuzberg/src/extractors/epub/mod.rs +7 -3
  62. data/vendor/kreuzberg/src/extractors/excel.rs +8 -6
  63. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -1
  64. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +15 -10
  65. data/vendor/kreuzberg/src/extractors/html.rs +1 -1
  66. data/vendor/kreuzberg/src/extractors/image.rs +3 -3
  67. data/vendor/kreuzberg/src/extractors/jats/mod.rs +1 -1
  68. data/vendor/kreuzberg/src/extractors/jupyter.rs +11 -9
  69. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +4 -3
  70. data/vendor/kreuzberg/src/extractors/latex/mod.rs +1 -1
  71. data/vendor/kreuzberg/src/extractors/markdown.rs +6 -4
  72. data/vendor/kreuzberg/src/extractors/odt.rs +38 -21
  73. data/vendor/kreuzberg/src/extractors/opml/core.rs +1 -1
  74. data/vendor/kreuzberg/src/extractors/opml/parser.rs +13 -9
  75. data/vendor/kreuzberg/src/extractors/orgmode.rs +11 -9
  76. data/vendor/kreuzberg/src/extractors/pdf/mod.rs +10 -3
  77. data/vendor/kreuzberg/src/extractors/pptx.rs +13 -11
  78. data/vendor/kreuzberg/src/extractors/rst.rs +15 -13
  79. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +22 -21
  80. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +1 -1
  81. data/vendor/kreuzberg/src/extractors/structured.rs +10 -5
  82. data/vendor/kreuzberg/src/extractors/text.rs +2 -2
  83. data/vendor/kreuzberg/src/extractors/typst.rs +11 -5
  84. data/vendor/kreuzberg/src/extractors/xml.rs +1 -1
  85. data/vendor/kreuzberg/src/keywords/processor.rs +9 -8
  86. data/vendor/kreuzberg/src/language_detection/processor.rs +6 -5
  87. data/vendor/kreuzberg/src/lib.rs +1 -1
  88. data/vendor/kreuzberg/src/mcp/errors.rs +7 -6
  89. data/vendor/kreuzberg/src/mcp/format.rs +5 -4
  90. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +3 -2
  91. data/vendor/kreuzberg/src/ocr/hocr.rs +4 -2
  92. data/vendor/kreuzberg/src/ocr/processor/execution.rs +128 -14
  93. data/vendor/kreuzberg/src/ocr/processor/validation.rs +129 -0
  94. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +36 -6
  95. data/vendor/kreuzberg/src/ocr/types.rs +3 -4
  96. data/vendor/kreuzberg/src/ocr/validation.rs +14 -0
  97. data/vendor/kreuzberg/src/pdf/metadata.rs +1 -0
  98. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +3 -2
  99. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +5 -4
  100. data/vendor/kreuzberg/src/plugins/ocr.rs +5 -4
  101. data/vendor/kreuzberg/src/plugins/processor/mod.rs +13 -12
  102. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +3 -2
  103. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +3 -2
  104. data/vendor/kreuzberg/src/plugins/validator/mod.rs +15 -14
  105. data/vendor/kreuzberg/src/text/quality.rs +13 -13
  106. data/vendor/kreuzberg/src/text/quality_processor.rs +7 -6
  107. data/vendor/kreuzberg/src/types/djot.rs +15 -4
  108. data/vendor/kreuzberg/src/types/extraction.rs +24 -4
  109. data/vendor/kreuzberg/src/types/formats.rs +9 -5
  110. data/vendor/kreuzberg/src/types/metadata.rs +68 -7
  111. data/vendor/kreuzberg/src/types/mod.rs +7 -5
  112. data/vendor/kreuzberg/src/types/page.rs +9 -0
  113. data/vendor/kreuzberg/src/types/tables.rs +2 -0
  114. data/vendor/kreuzberg/tests/concurrency_stress.rs +2 -1
  115. data/vendor/kreuzberg/tests/config_behavioral.rs +12 -16
  116. data/vendor/kreuzberg/tests/config_features.rs +19 -11
  117. data/vendor/kreuzberg/tests/config_loading_tests.rs +9 -9
  118. data/vendor/kreuzberg/tests/contract_mcp.rs +2 -2
  119. data/vendor/kreuzberg/tests/core_integration.rs +5 -6
  120. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
  121. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +1 -1
  122. data/vendor/kreuzberg/tests/pipeline_integration.rs +36 -32
  123. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +19 -13
  124. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +3 -2
  125. data/vendor/kreuzberg/tests/plugin_system.rs +7 -6
  126. data/vendor/kreuzberg/tests/plugin_validator_test.rs +1 -1
  127. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -1
  128. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  129. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +3 -2
  130. data/vendor/kreuzberg-ffi/kreuzberg.h +32 -0
  131. data/vendor/kreuzberg-ffi/src/error.rs +56 -0
  132. data/vendor/kreuzberg-ffi/src/helpers.rs +6 -5
  133. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +1 -1
  134. data/vendor/kreuzberg-ffi/src/result.rs +2 -1
  135. data/vendor/kreuzberg-ffi/src/result_view.rs +3 -2
  136. data/vendor/kreuzberg-ffi/src/string_intern.rs +3 -3
  137. data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +2 -2
  138. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  139. metadata +2 -2
@@ -12,6 +12,7 @@ use kreuzberg::plugins::registry::{DocumentExtractorRegistry, ValidatorRegistry}
12
12
  use kreuzberg::plugins::{DocumentExtractor, Plugin, Validator};
13
13
  use kreuzberg::types::{ExtractionResult, Metadata};
14
14
  use kreuzberg::{KreuzbergError, Result};
15
+ use std::borrow::Cow;
15
16
  use std::path::Path;
16
17
  use std::sync::Arc;
17
18
 
@@ -119,7 +120,7 @@ impl DocumentExtractor for MockExtractor {
119
120
  ) -> Result<ExtractionResult> {
120
121
  Ok(ExtractionResult {
121
122
  content: format!("Extracted by {}: {}", self.name, String::from_utf8_lossy(content)),
122
- mime_type: mime_type.to_string(),
123
+ mime_type: Cow::Owned(mime_type.to_string()),
123
124
  metadata: Metadata::default(),
124
125
  tables: vec![],
125
126
  detected_languages: None,
@@ -28,9 +28,10 @@ serde_json = { workspace = true }
28
28
  serde = { workspace = true }
29
29
  async-trait = { workspace = true }
30
30
  tokio = { workspace = true }
31
- html-to-markdown-rs = { version = "2.24.1", default-features = false }
31
+ html-to-markdown-rs = { version = "2.24.4", default-features = false }
32
32
  rayon = { version = "1.11", optional = true }
33
33
  log = "0.4"
34
+ ahash = "0.8"
34
35
 
35
36
  [target.'cfg(all(windows, target_env = "gnu"))'.dependencies]
36
37
  kreuzberg = { path = "../kreuzberg", features = [
@@ -1,6 +1,7 @@
1
1
  use criterion::{BenchmarkId, Criterion, Throughput, criterion_group, criterion_main};
2
2
  use kreuzberg::types::{Chunk, ChunkMetadata, ExtractionResult, Metadata, PageStructure, PageUnitType};
3
3
  use kreuzberg_ffi::{CExtractionResultView, kreuzberg_get_result_view};
4
+ use std::borrow::Cow;
4
5
  use std::ffi::CString;
5
6
  use std::hint;
6
7
  use std::mem;
@@ -63,7 +64,7 @@ fn create_test_result(content_size: usize, chunk_count: usize) -> ExtractionResu
63
64
 
64
65
  ExtractionResult {
65
66
  content,
66
- mime_type: "application/pdf".to_string(),
67
+ mime_type: Cow::Borrowed("application/pdf"),
67
68
  metadata,
68
69
  tables: vec![],
69
70
  detected_languages: Some(vec!["en".to_string(), "de".to_string()]),
@@ -109,7 +110,7 @@ fn bench_copy_based_approach(c: &mut Criterion) {
109
110
 
110
111
  b.iter(|| {
111
112
  let content_cstr = CString::new(result.content.as_str()).unwrap();
112
- let mime_cstr = CString::new(result.mime_type.as_str()).unwrap();
113
+ let mime_cstr = CString::new(&*result.mime_type).unwrap();
113
114
  let language_cstr = result
114
115
  .metadata
115
116
  .language
@@ -1208,6 +1208,38 @@ const char *kreuzberg_error_code_description(uint32_t code);
1208
1208
  */
1209
1209
  struct CErrorDetails kreuzberg_get_error_details(void);
1210
1210
 
1211
+ /**
1212
+ * Heap-allocated variant of `kreuzberg_get_error_details` that returns a pointer.
1213
+ *
1214
+ * This is the preferred variant for language bindings (Java, Go, C#) where
1215
+ * returning structs by value across FFI boundaries causes ABI issues,
1216
+ * particularly on ARM64.
1217
+ *
1218
+ * The returned pointer must be freed with `kreuzberg_free_error_details()`.
1219
+ * Returns NULL if allocation fails.
1220
+ *
1221
+ * # C Signature
1222
+ *
1223
+ * ```c
1224
+ * CErrorDetails* kreuzberg_get_error_details_ptr(void);
1225
+ * ```
1226
+ */
1227
+ struct CErrorDetails *kreuzberg_get_error_details_ptr(void);
1228
+
1229
+ /**
1230
+ * Frees a `CErrorDetails` pointer returned by `kreuzberg_get_error_details_ptr()`.
1231
+ *
1232
+ * This function frees all internal string fields and the struct itself.
1233
+ * Passing NULL is a no-op.
1234
+ *
1235
+ * # C Signature
1236
+ *
1237
+ * ```c
1238
+ * void kreuzberg_free_error_details(CErrorDetails* details);
1239
+ * ```
1240
+ */
1241
+ void kreuzberg_free_error_details(struct CErrorDetails *details);
1242
+
1211
1243
  /**
1212
1244
  * Classifies an error based on the error message string.
1213
1245
  *
@@ -540,6 +540,62 @@ pub extern "C" fn kreuzberg_get_error_details() -> CErrorDetails {
540
540
  }
541
541
  }
542
542
 
543
+ /// Heap-allocated variant of `kreuzberg_get_error_details` that returns a pointer.
544
+ ///
545
+ /// This is the preferred variant for language bindings (Java, Go, C#) where
546
+ /// returning structs by value across FFI boundaries causes ABI issues,
547
+ /// particularly on ARM64.
548
+ ///
549
+ /// The returned pointer must be freed with `kreuzberg_free_error_details()`.
550
+ /// Returns NULL if allocation fails.
551
+ ///
552
+ /// # C Signature
553
+ ///
554
+ /// ```c
555
+ /// CErrorDetails* kreuzberg_get_error_details_ptr(void);
556
+ /// ```
557
+ #[unsafe(no_mangle)]
558
+ pub extern "C" fn kreuzberg_get_error_details_ptr() -> *mut CErrorDetails {
559
+ let details = kreuzberg_get_error_details();
560
+ Box::into_raw(Box::new(details))
561
+ }
562
+
563
+ /// Frees a `CErrorDetails` pointer returned by `kreuzberg_get_error_details_ptr()`.
564
+ ///
565
+ /// This function frees all internal string fields and the struct itself.
566
+ /// Passing NULL is a no-op.
567
+ ///
568
+ /// # C Signature
569
+ ///
570
+ /// ```c
571
+ /// void kreuzberg_free_error_details(CErrorDetails* details);
572
+ /// ```
573
+ #[unsafe(no_mangle)]
574
+ pub extern "C" fn kreuzberg_free_error_details(details: *mut CErrorDetails) {
575
+ if details.is_null() {
576
+ return;
577
+ }
578
+ unsafe {
579
+ let details = Box::from_raw(details);
580
+ // Free all non-null string fields
581
+ if !details.message.is_null() {
582
+ let _ = CString::from_raw(details.message);
583
+ }
584
+ if !details.error_type.is_null() {
585
+ let _ = CString::from_raw(details.error_type);
586
+ }
587
+ if !details.source_file.is_null() {
588
+ let _ = CString::from_raw(details.source_file);
589
+ }
590
+ if !details.source_function.is_null() {
591
+ let _ = CString::from_raw(details.source_function);
592
+ }
593
+ if !details.context_info.is_null() {
594
+ let _ = CString::from_raw(details.context_info);
595
+ }
596
+ }
597
+ }
598
+
543
599
  /// Classifies an error based on the error message string.
544
600
  ///
545
601
  /// Analyzes an error message and attempts to classify it into one of the standard
@@ -81,7 +81,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
81
81
  );
82
82
 
83
83
  let mime_type_guard = CStringGuard::new(
84
- CString::new(mime_type).map_err(|e| format!("Failed to convert MIME type to C string: {}", e))?,
84
+ CString::new(mime_type.to_string()).map_err(|e| format!("Failed to convert MIME type to C string: {}", e))?,
85
85
  );
86
86
 
87
87
  let language_guard = match &metadata.language {
@@ -213,6 +213,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
213
213
  mod tests {
214
214
  use super::*;
215
215
  use kreuzberg::types::{Chunk, ChunkMetadata, ExtractionResult, Metadata, Table};
216
+ use std::borrow::Cow;
216
217
  use std::ffi::CStr;
217
218
 
218
219
  #[test]
@@ -352,7 +353,7 @@ mod tests {
352
353
  fn test_to_c_extraction_result_basic() {
353
354
  let result = ExtractionResult {
354
355
  content: "Test content".to_string(),
355
- mime_type: "text/plain".to_string(),
356
+ mime_type: Cow::Borrowed("text/plain"),
356
357
  metadata: Metadata::default(),
357
358
  tables: vec![],
358
359
  detected_languages: None,
@@ -391,7 +392,7 @@ mod tests {
391
392
  fn test_to_c_extraction_result_with_null_bytes() {
392
393
  let result = ExtractionResult {
393
394
  content: "Test\0content with null".to_string(),
394
- mime_type: "text/plain".to_string(),
395
+ mime_type: Cow::Borrowed("text/plain"),
395
396
  metadata: Metadata::default(),
396
397
  tables: vec![],
397
398
  detected_languages: None,
@@ -440,7 +441,7 @@ mod tests {
440
441
 
441
442
  let result = ExtractionResult {
442
443
  content: "Test content".to_string(),
443
- mime_type: "text/plain".to_string(),
444
+ mime_type: Cow::Borrowed("text/plain"),
444
445
  metadata,
445
446
  tables: vec![],
446
447
  detected_languages: Some(vec!["en".to_string(), "de".to_string()]),
@@ -519,7 +520,7 @@ mod tests {
519
520
 
520
521
  let result = ExtractionResult {
521
522
  content: "Test content".to_string(),
522
- mime_type: "text/plain".to_string(),
523
+ mime_type: Cow::Borrowed("text/plain"),
523
524
  metadata: Metadata::default(),
524
525
  tables: vec![table],
525
526
  detected_languages: None,
@@ -160,7 +160,7 @@ impl OcrBackend for FfiOcrBackend {
160
160
 
161
161
  Ok(ExtractionResult {
162
162
  content: result_text,
163
- mime_type: "text/plain".to_string(),
163
+ mime_type: std::borrow::Cow::Borrowed("text/plain"),
164
164
  metadata: kreuzberg::types::Metadata::default(),
165
165
  tables: vec![],
166
166
  detected_languages: None,
@@ -368,6 +368,7 @@ pub unsafe extern "C" fn kreuzberg_result_get_metadata_field(
368
368
  #[cfg(test)]
369
369
  mod tests {
370
370
  use super::*;
371
+ use std::borrow::Cow;
371
372
  use std::ffi::CStr;
372
373
 
373
374
  fn create_test_result() -> ExtractionResult {
@@ -389,7 +390,7 @@ mod tests {
389
390
 
390
391
  ExtractionResult {
391
392
  content: "Sample content for testing".to_string(),
392
- mime_type: "text/plain".to_string(),
393
+ mime_type: Cow::Borrowed("text/plain"),
393
394
  metadata,
394
395
  tables: vec![],
395
396
  detected_languages: Some(vec!["en".to_string(), "de".to_string()]),
@@ -398,6 +398,7 @@ pub unsafe extern "C" fn kreuzberg_view_get_mime_type(
398
398
  mod tests {
399
399
  use super::*;
400
400
  use kreuzberg::types::{Metadata, PageStructure, PageUnitType};
401
+ use std::borrow::Cow;
401
402
  use std::mem;
402
403
 
403
404
  fn create_test_result() -> ExtractionResult {
@@ -419,7 +420,7 @@ mod tests {
419
420
 
420
421
  ExtractionResult {
421
422
  content: "Sample content for zero-copy testing".to_string(),
422
- mime_type: "text/plain".to_string(),
423
+ mime_type: Cow::Borrowed("text/plain"),
423
424
  metadata,
424
425
  tables: vec![],
425
426
  detected_languages: Some(vec!["en".to_string(), "de".to_string()]),
@@ -712,7 +713,7 @@ mod tests {
712
713
  fn test_view_all_counts_zero() {
713
714
  let result = ExtractionResult {
714
715
  content: "Minimal content".to_string(),
715
- mime_type: "text/plain".to_string(),
716
+ mime_type: Cow::Borrowed("text/plain"),
716
717
  metadata: Metadata::default(),
717
718
  tables: vec![],
718
719
  detected_languages: None,
@@ -39,7 +39,7 @@
39
39
  //! ```
40
40
 
41
41
  use crate::{clear_last_error, set_last_error};
42
- use std::collections::HashMap;
42
+ use ahash::AHashMap;
43
43
  use std::ffi::{CStr, CString};
44
44
  use std::os::raw::c_char;
45
45
  use std::ptr;
@@ -82,7 +82,7 @@ struct InternedString {
82
82
  /// Global string interning table.
83
83
  struct StringInternTable {
84
84
  /// Map from string content to interned entry
85
- strings: HashMap<String, InternedString>,
85
+ strings: AHashMap<String, InternedString>,
86
86
 
87
87
  /// Total number of intern requests
88
88
  total_requests: usize,
@@ -95,7 +95,7 @@ impl StringInternTable {
95
95
  /// Create new intern table with pre-populated common strings.
96
96
  fn new() -> Self {
97
97
  let mut table = Self {
98
- strings: HashMap::new(),
98
+ strings: AHashMap::new(),
99
99
  total_requests: 0,
100
100
  cache_hits: 0,
101
101
  };
@@ -72,8 +72,8 @@ ocr:
72
72
  enabled: true
73
73
  backend: tesseract
74
74
  chunking:
75
- max_chars: 1000
76
- max_overlap: 100
75
+ max_characters: 1000
76
+ overlap: 100
77
77
  "#;
78
78
 
79
79
  fs::write(&config_path, yaml_content).unwrap();
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.2.6"
3
+ version = "4.2.7"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.2.6
4
+ version: 4.2.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-01-31 00:00:00.000000000 Z
11
+ date: 2026-02-01 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler