kreuzberg 4.1.2 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
  5. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  6. data/kreuzberg.gemspec +13 -1
  7. data/lib/kreuzberg/cli.rb +16 -6
  8. data/lib/kreuzberg/cli_proxy.rb +3 -1
  9. data/lib/kreuzberg/config.rb +121 -39
  10. data/lib/kreuzberg/djot_content.rb +225 -0
  11. data/lib/kreuzberg/extraction_api.rb +20 -4
  12. data/lib/kreuzberg/result.rb +12 -2
  13. data/lib/kreuzberg/version.rb +1 -1
  14. data/lib/kreuzberg.rb +1 -0
  15. data/sig/kreuzberg.rbs +28 -12
  16. data/spec/binding/batch_operations_spec.rb +80 -0
  17. data/spec/binding/batch_spec.rb +6 -5
  18. data/spec/binding/error_recovery_spec.rb +3 -3
  19. data/spec/binding/metadata_types_spec.rb +77 -57
  20. data/spec/binding/tables_spec.rb +11 -2
  21. data/spec/serialization_spec.rb +134 -0
  22. data/spec/unit/config/output_format_spec.rb +380 -0
  23. data/vendor/Cargo.toml +1 -1
  24. data/vendor/kreuzberg/Cargo.toml +1 -1
  25. data/vendor/kreuzberg/README.md +1 -1
  26. data/vendor/kreuzberg/src/api/startup.rs +15 -1
  27. data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
  28. data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
  29. data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
  30. data/vendor/kreuzberg/src/core/io.rs +7 -7
  31. data/vendor/kreuzberg/src/core/mime.rs +4 -4
  32. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  33. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
  34. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  35. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  36. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  37. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  38. data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
  39. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
  40. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
  41. data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
  42. data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
  43. data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
  44. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  45. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  46. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  47. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  48. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  49. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  50. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  51. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  52. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  53. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  54. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  55. data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
  56. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  57. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  58. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  59. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  60. data/vendor/kreuzberg/tests/core_integration.rs +57 -57
  61. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  62. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  63. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  64. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  65. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  66. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  67. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  68. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  69. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  70. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  71. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  72. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  73. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  74. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  75. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  76. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  77. data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
  78. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  79. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  80. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  81. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  82. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  83. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  84. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  85. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  86. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  87. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  88. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  89. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  90. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  91. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  92. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
  93. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  94. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  95. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  96. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  97. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  98. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  99. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  100. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  101. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  102. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  103. metadata +12 -2
@@ -26,7 +26,7 @@ async fn test_fastembed_embedding_generation() {
26
26
  let result = model.embed(texts.clone(), None);
27
27
  assert!(result.is_ok(), "Failed to generate embeddings: {:?}", result.err());
28
28
 
29
- let embeddings = result.unwrap();
29
+ let embeddings = result.expect("Operation failed");
30
30
  assert_eq!(embeddings.len(), 3, "Expected 3 embeddings");
31
31
 
32
32
  for (i, embedding) in embeddings.iter().enumerate() {
@@ -64,7 +64,7 @@ async fn test_fastembed_batch_processing() {
64
64
 
65
65
  assert!(result.is_ok(), "Batch embedding failed: {:?}", result.err());
66
66
 
67
- let embeddings = result.unwrap();
67
+ let embeddings = result.expect("Operation failed");
68
68
  assert_eq!(embeddings.len(), 50, "Expected 50 embeddings");
69
69
 
70
70
  println!(
@@ -96,7 +96,7 @@ async fn test_fastembed_different_models() {
96
96
  let result = m.embed(test_text.clone(), None);
97
97
  assert!(result.is_ok(), "Failed to generate embedding for {}", description);
98
98
 
99
- let embeddings = result.unwrap();
99
+ let embeddings = result.expect("Operation failed");
100
100
  assert_eq!(embeddings.len(), 1);
101
101
  assert_eq!(
102
102
  embeddings[0].len(),
@@ -197,7 +197,7 @@ async fn test_generate_embeddings_for_chunks_basic() {
197
197
  for (i, chunk) in chunks.iter().enumerate() {
198
198
  assert!(chunk.embedding.is_some(), "Chunk {} missing embedding", i);
199
199
 
200
- let embedding = chunk.embedding.as_ref().unwrap();
200
+ let embedding = chunk.embedding.as_ref().expect("Operation failed");
201
201
  assert_eq!(embedding.len(), 384, "Chunk {} has wrong embedding dimensions", i);
202
202
 
203
203
  let sum: f32 = embedding.iter().sum();
@@ -269,8 +269,8 @@ async fn test_generate_embeddings_for_chunks_normalization() {
269
269
 
270
270
  generate_embeddings_for_chunks(&mut chunks_norm, &config_norm).expect("Failed to generate normalized embeddings");
271
271
 
272
- let embedding_no_norm = chunks_no_norm[0].embedding.as_ref().unwrap();
273
- let embedding_norm = chunks_norm[0].embedding.as_ref().unwrap();
272
+ let embedding_no_norm = chunks_no_norm[0].embedding.as_ref().expect("Operation failed");
273
+ let embedding_norm = chunks_norm[0].embedding.as_ref().expect("Operation failed");
274
274
 
275
275
  let magnitude_no_norm: f32 = embedding_no_norm.iter().map(|x| x * x).sum::<f32>().sqrt();
276
276
  let magnitude_norm: f32 = embedding_norm.iter().map(|x| x * x).sum::<f32>().sqrt();
@@ -560,7 +560,7 @@ async fn test_generate_embeddings_for_chunks_batch_size() {
560
560
  i
561
561
  );
562
562
  assert_eq!(
563
- chunk.embedding.as_ref().unwrap().len(),
563
+ chunk.embedding.as_ref().expect("Operation failed").len(),
564
564
  384,
565
565
  "Chunk {} has wrong dimensions",
566
566
  i
@@ -612,7 +612,7 @@ async fn test_generate_embeddings_chunking_integration() {
612
612
  for (i, chunk) in chunking_result.chunks.iter().enumerate() {
613
613
  assert!(chunk.embedding.is_some(), "Chunk {} missing embedding", i);
614
614
 
615
- let embedding = chunk.embedding.as_ref().unwrap();
615
+ let embedding = chunk.embedding.as_ref().expect("Operation failed");
616
616
  assert_eq!(embedding.len(), 384, "Chunk {} has wrong embedding dimensions", i);
617
617
 
618
618
  let magnitude: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
@@ -56,7 +56,7 @@ async fn test_simple_typst_document_extraction() {
56
56
  return;
57
57
  }
58
58
 
59
- let extraction = result.unwrap();
59
+ let extraction = result.expect("Operation failed");
60
60
 
61
61
  assert_eq!(extraction.mime_type, "text/x-typst", "MIME type should be preserved");
62
62
 
@@ -145,7 +145,7 @@ async fn test_minimal_typst_document_extraction() {
145
145
  return;
146
146
  }
147
147
 
148
- let extraction = result.unwrap();
148
+ let extraction = result.expect("Operation failed");
149
149
 
150
150
  assert!(
151
151
  !extraction.content.is_empty(),
@@ -189,7 +189,7 @@ async fn test_heading_hierarchy_extraction() {
189
189
  return;
190
190
  }
191
191
 
192
- let extraction = result.unwrap();
192
+ let extraction = result.expect("Operation failed");
193
193
 
194
194
  assert!(!extraction.content.is_empty(), "Document should extract content");
195
195
 
@@ -269,7 +269,7 @@ async fn test_metadata_extraction() {
269
269
  return;
270
270
  }
271
271
 
272
- let extraction = result.unwrap();
272
+ let extraction = result.expect("Operation failed");
273
273
 
274
274
  if let Some(title) = extraction.metadata.additional.get("title") {
275
275
  assert!(
@@ -330,7 +330,7 @@ async fn test_advanced_typst_document_extraction() {
330
330
  return;
331
331
  }
332
332
 
333
- let extraction = result.unwrap();
333
+ let extraction = result.expect("Operation failed");
334
334
 
335
335
  assert!(
336
336
  extraction.metadata.additional.contains_key("title"),
@@ -411,7 +411,7 @@ async fn test_typst_reader_extraction() {
411
411
  return;
412
412
  }
413
413
 
414
- let extraction = result.unwrap();
414
+ let extraction = result.expect("Operation failed");
415
415
 
416
416
  assert!(
417
417
  !extraction.content.is_empty(),
@@ -454,7 +454,7 @@ async fn test_undergradmath_extraction() {
454
454
  return;
455
455
  }
456
456
 
457
- let extraction = result.unwrap();
457
+ let extraction = result.expect("Operation failed");
458
458
 
459
459
  assert!(
460
460
  !extraction.content.is_empty(),
@@ -534,7 +534,7 @@ async fn test_formatting_preservation() {
534
534
  return;
535
535
  }
536
536
 
537
- let extraction = result.unwrap();
537
+ let extraction = result.expect("Operation failed");
538
538
 
539
539
  assert!(
540
540
  extraction.content.contains("*") || extraction.content.contains("bold"),
@@ -576,7 +576,7 @@ async fn test_large_document_extraction() {
576
576
  return;
577
577
  }
578
578
 
579
- let extraction = result.unwrap();
579
+ let extraction = result.expect("Operation failed");
580
580
 
581
581
  assert!(
582
582
  !extraction.content.is_empty(),
@@ -7,9 +7,9 @@ use kreuzberg::extraction::excel::read_excel_file;
7
7
  fn test_xlsx_full_metadata_extraction() {
8
8
  let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
9
9
  .parent()
10
- .unwrap()
10
+ .expect("Operation failed")
11
11
  .parent()
12
- .unwrap();
12
+ .expect("Operation failed");
13
13
  let test_file = workspace_root.join("test_documents/office/excel.xlsx");
14
14
 
15
15
  if !test_file.exists() {
@@ -17,7 +17,8 @@ fn test_xlsx_full_metadata_extraction() {
17
17
  return;
18
18
  }
19
19
 
20
- let result = read_excel_file(test_file.to_str().unwrap()).expect("Should extract XLSX successfully");
20
+ let file_path = test_file.to_str().expect("File path should be valid UTF-8");
21
+ let result = read_excel_file(file_path).expect("Should extract XLSX successfully");
21
22
 
22
23
  assert!(!result.sheets.is_empty(), "Should have at least one sheet");
23
24
 
@@ -34,9 +35,9 @@ fn test_xlsx_full_metadata_extraction() {
34
35
  fn test_xlsx_multi_sheet_metadata() {
35
36
  let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
36
37
  .parent()
37
- .unwrap()
38
+ .expect("Operation failed")
38
39
  .parent()
39
- .unwrap();
40
+ .expect("Operation failed");
40
41
  let test_file = workspace_root.join("test_documents/spreadsheets/excel_multi_sheet.xlsx");
41
42
 
42
43
  if !test_file.exists() {
@@ -44,7 +45,8 @@ fn test_xlsx_multi_sheet_metadata() {
44
45
  return;
45
46
  }
46
47
 
47
- let result = read_excel_file(test_file.to_str().unwrap()).expect("Should extract multi-sheet XLSX successfully");
48
+ let file_path = test_file.to_str().expect("File path should be valid UTF-8");
49
+ let result = read_excel_file(file_path).expect("Should extract multi-sheet XLSX successfully");
48
50
 
49
51
  assert!(
50
52
  result.sheets.len() > 1,
@@ -65,9 +67,9 @@ fn test_xlsx_multi_sheet_metadata() {
65
67
  fn test_xlsx_minimal_metadata_extraction() {
66
68
  let workspace_root = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
67
69
  .parent()
68
- .unwrap()
70
+ .expect("Operation failed")
69
71
  .parent()
70
- .unwrap();
72
+ .expect("Operation failed");
71
73
  let test_file = workspace_root.join("test_documents/spreadsheets/test_01.xlsx");
72
74
 
73
75
  if !test_file.exists() {
@@ -75,7 +77,8 @@ fn test_xlsx_minimal_metadata_extraction() {
75
77
  return;
76
78
  }
77
79
 
78
- let result = read_excel_file(test_file.to_str().unwrap()).expect("Should extract XLSX successfully");
80
+ let file_path = test_file.to_str().expect("File path should be valid UTF-8");
81
+ let result = read_excel_file(file_path).expect("Should extract XLSX successfully");
79
82
 
80
83
  assert!(!result.sheets.is_empty(), "Content should not be empty");
81
84
  assert!(
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "kreuzberg-tesseract"
3
- version = "4.1.2"
3
+ version = "4.2.1"
4
4
  edition = "2024"
5
5
  rust-version = "1.91"
6
6
  authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: kreuzberg
3
3
  version: !ruby/object:Gem::Version
4
- version: 4.1.2
4
+ version: 4.2.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Na'aman Hirschfeld
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2026-01-25 00:00:00.000000000 Z
11
+ date: 2026-01-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -209,6 +209,7 @@ files:
209
209
  - ext/kreuzberg_rb/native/include/msvc_compat/strings.h
210
210
  - ext/kreuzberg_rb/native/include/strings.h
211
211
  - ext/kreuzberg_rb/native/include/unistd.h
212
+ - ext/kreuzberg_rb/native/libpdfium.so
212
213
  - ext/kreuzberg_rb/native/src/batch.rs
213
214
  - ext/kreuzberg_rb/native/src/config/mod.rs
214
215
  - ext/kreuzberg_rb/native/src/config/types.rs
@@ -232,6 +233,7 @@ files:
232
233
  - lib/kreuzberg/cli.rb
233
234
  - lib/kreuzberg/cli_proxy.rb
234
235
  - lib/kreuzberg/config.rb
236
+ - lib/kreuzberg/djot_content.rb
235
237
  - lib/kreuzberg/error_context.rb
236
238
  - lib/kreuzberg/errors.rb
237
239
  - lib/kreuzberg/extraction_api.rb
@@ -271,6 +273,7 @@ files:
271
273
  - spec/fixtures/config.toml
272
274
  - spec/fixtures/config.yaml
273
275
  - spec/fixtures/invalid_config.toml
276
+ - spec/serialization_spec.rb
274
277
  - spec/smoke/package_spec.rb
275
278
  - spec/spec_helper.rb
276
279
  - spec/unit/config/chunking_config_spec.rb
@@ -283,6 +286,7 @@ files:
283
286
  - spec/unit/config/keyword_config_spec.rb
284
287
  - spec/unit/config/language_detection_config_spec.rb
285
288
  - spec/unit/config/ocr_config_spec.rb
289
+ - spec/unit/config/output_format_spec.rb
286
290
  - spec/unit/config/page_config_spec.rb
287
291
  - spec/unit/config/pdf_config_spec.rb
288
292
  - spec/unit/config/postprocessor_config_spec.rb
@@ -588,6 +592,7 @@ files:
588
592
  - vendor/kreuzberg/src/plugins/registry/ocr.rs
589
593
  - vendor/kreuzberg/src/plugins/registry/processor.rs
590
594
  - vendor/kreuzberg/src/plugins/registry/validator.rs
595
+ - vendor/kreuzberg/src/plugins/startup_validation.rs
591
596
  - vendor/kreuzberg/src/plugins/traits.rs
592
597
  - vendor/kreuzberg/src/plugins/validator/mod.rs
593
598
  - vendor/kreuzberg/src/plugins/validator/registry.rs
@@ -705,6 +710,7 @@ files:
705
710
  - vendor/kreuzberg/stopwords/zh_stopwords.json
706
711
  - vendor/kreuzberg/stopwords/zu_stopwords.json
707
712
  - vendor/kreuzberg/tests/api_chunk.rs
713
+ - vendor/kreuzberg/tests/api_consistency.rs
708
714
  - vendor/kreuzberg/tests/api_embed.rs
709
715
  - vendor/kreuzberg/tests/api_extract_multipart.rs
710
716
  - vendor/kreuzberg/tests/api_large_pdf_extraction.rs
@@ -716,9 +722,11 @@ files:
716
722
  - vendor/kreuzberg/tests/batch_processing.rs
717
723
  - vendor/kreuzberg/tests/bibtex_parity_test.rs
718
724
  - vendor/kreuzberg/tests/concurrency_stress.rs
725
+ - vendor/kreuzberg/tests/config_behavioral.rs
719
726
  - vendor/kreuzberg/tests/config_features.rs
720
727
  - vendor/kreuzberg/tests/config_integration_test.rs
721
728
  - vendor/kreuzberg/tests/config_loading_tests.rs
729
+ - vendor/kreuzberg/tests/contract_mcp.rs
722
730
  - vendor/kreuzberg/tests/core_integration.rs
723
731
  - vendor/kreuzberg/tests/csv_integration.rs
724
732
  - vendor/kreuzberg/tests/data/hierarchy_ground_truth.json
@@ -740,6 +748,7 @@ files:
740
748
  - vendor/kreuzberg/tests/keywords_quality.rs
741
749
  - vendor/kreuzberg/tests/latex_extractor_tests.rs
742
750
  - vendor/kreuzberg/tests/markdown_extractor_tests.rs
751
+ - vendor/kreuzberg/tests/mcp_integration.rs
743
752
  - vendor/kreuzberg/tests/mime_detection.rs
744
753
  - vendor/kreuzberg/tests/ocr_configuration.rs
745
754
  - vendor/kreuzberg/tests/ocr_errors.rs
@@ -766,6 +775,7 @@ files:
766
775
  - vendor/kreuzberg/tests/rst_extractor_tests.rs
767
776
  - vendor/kreuzberg/tests/rtf_extractor_tests.rs
768
777
  - vendor/kreuzberg/tests/security_validation.rs
778
+ - vendor/kreuzberg/tests/serialization_integration.rs
769
779
  - vendor/kreuzberg/tests/stopwords_integration_test.rs
770
780
  - vendor/kreuzberg/tests/test_fastembed.rs
771
781
  - vendor/kreuzberg/tests/typst_behavioral_tests.rs