kreuzberg 4.1.1 → 4.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +4 -4
- data/README.md +8 -5
- data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +23 -13
- data/kreuzberg.gemspec +14 -2
- data/lib/kreuzberg/api_proxy.rb +0 -1
- data/lib/kreuzberg/cli_proxy.rb +0 -1
- data/lib/kreuzberg/config.rb +70 -35
- data/lib/kreuzberg/mcp_proxy.rb +0 -1
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +5 -1
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +3 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +55 -53
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
- data/vendor/kreuzberg-tesseract/build.rs +4 -4
- data/vendor/kreuzberg-tesseract/src/lib.rs +6 -6
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +3 -3
- metadata +13 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9a1c9adffca7d75c142bd661f1d481b1aee00d97c6f62dcc70292f37978bcc17
|
|
4
|
+
data.tar.gz: 227af2ed45bff1dfa9afebd69220d15a41b2e476bf97f8a83173d21aab8b88e1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 0d1b0081f89a73f5422e68a714fc415f6d290dd8be7cf0ba6d454cfdf1938ebdac4919358b25d6e5a0bc1a209e1b165062a0341d28cde1b3fa0595bffec837f5
|
|
7
|
+
data.tar.gz: fc5a5f29309c29fbbf63ba035cf5e462e78b15c2afc239fb333bd8b6e70ef061219822ed4d533f81bb35cdd62db84da8c01e8f172561b0f7fb802b848b491c0a
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.1.1)
|
|
4
|
+
kreuzberg (4.2.0)
|
|
5
5
|
|
|
6
6
|
GEM
|
|
7
7
|
remote: https://rubygems.org/
|
|
@@ -121,7 +121,7 @@ GEM
|
|
|
121
121
|
rubocop (~> 1.81)
|
|
122
122
|
ruby-progressbar (1.13.0)
|
|
123
123
|
securerandom (0.4.1)
|
|
124
|
-
sorbet-runtime (0.6.
|
|
124
|
+
sorbet-runtime (0.6.12897)
|
|
125
125
|
steep (1.10.0)
|
|
126
126
|
activesupport (>= 5.1)
|
|
127
127
|
concurrent-ruby (>= 1.1.10)
|
|
@@ -207,7 +207,7 @@ CHECKSUMS
|
|
|
207
207
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
208
208
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
209
209
|
json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
|
|
210
|
-
kreuzberg (4.1.1)
|
|
210
|
+
kreuzberg (4.2.0)
|
|
211
211
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
212
212
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
213
213
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
@@ -242,7 +242,7 @@ CHECKSUMS
|
|
|
242
242
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
243
243
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
244
244
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
245
|
-
sorbet-runtime (0.6.
|
|
245
|
+
sorbet-runtime (0.6.12897) sha256=0348ab8803c4c3646977fee298083ded9b7e74d5b34b50c567c63eb7e36eb286
|
|
246
246
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
247
247
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
248
248
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.1.1" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.0" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -86,10 +86,13 @@ gem 'kreuzberg'
|
|
|
86
86
|
|
|
87
87
|
### System Requirements
|
|
88
88
|
|
|
89
|
-
- **Ruby 2.
|
|
89
|
+
- **Ruby 3.2.0 or higher** required (including Ruby 4.x)
|
|
90
|
+
- Ruby 4.0+ is fully supported with no code changes required
|
|
90
91
|
- Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.22.x for embeddings support
|
|
91
92
|
- Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
|
|
92
93
|
|
|
94
|
+
**Ruby 4.0 Compatibility:** Kreuzberg is fully compatible with Ruby 4.0 (released December 25, 2025) and all Ruby 4.x versions. All tests pass with 100% compatibility. The gem compiles without any breaking changes. Key Ruby 4.0 features like Ruby Box, ZJIT compiler, and Ractor improvements work seamlessly with Kreuzberg.
|
|
95
|
+
|
|
93
96
|
|
|
94
97
|
|
|
95
98
|
## Quick Start
|
|
@@ -202,9 +205,9 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
|
|
|
202
205
|
|
|
203
206
|
## Features
|
|
204
207
|
|
|
205
|
-
### Supported File Formats (
|
|
208
|
+
### Supported File Formats (57+)
|
|
206
209
|
|
|
207
|
-
|
|
210
|
+
57 file formats across 8 major categories with intelligent format detection and comprehensive metadata extraction.
|
|
208
211
|
|
|
209
212
|
#### Office Documents
|
|
210
213
|
|
|
@@ -230,7 +233,7 @@ puts "Processing time: #{result.metadata&.dig('processing_time')}ms"
|
|
|
230
233
|
|----------|---------|----------|
|
|
231
234
|
| **Markup** | `.html`, `.htm`, `.xhtml`, `.xml`, `.svg` | DOM parsing, metadata (Open Graph, Twitter Card), link extraction |
|
|
232
235
|
| **Structured Data** | `.json`, `.yaml`, `.yml`, `.toml`, `.csv`, `.tsv` | Schema detection, nested structures, validation |
|
|
233
|
-
| **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, reStructuredText, Org Mode |
|
|
236
|
+
| **Text & Markdown** | `.txt`, `.md`, `.markdown`, `.djot`, `.rst`, `.org`, `.rtf` | CommonMark, GFM, Djot, reStructuredText, Org Mode |
|
|
234
237
|
|
|
235
238
|
#### Email & Archives
|
|
236
239
|
|
|
@@ -31,7 +31,7 @@ embeddings = ["kreuzberg/embeddings"]
|
|
|
31
31
|
|
|
32
32
|
[dependencies]
|
|
33
33
|
async-trait = "0.1.89"
|
|
34
|
-
kreuzberg = { path = "../../../../../crates/kreuzberg", default-features = false, features = [
|
|
34
|
+
kreuzberg = { path = "../../../vendor/kreuzberg", default-features = false, features = [
|
|
35
35
|
"pdf",
|
|
36
36
|
"excel",
|
|
37
37
|
"office",
|
|
@@ -51,7 +51,7 @@ kreuzberg = { path = "../../../../../crates/kreuzberg", default-features = false
|
|
|
51
51
|
"bundled-pdfium",
|
|
52
52
|
"tokio-runtime",
|
|
53
53
|
] }
|
|
54
|
-
kreuzberg-ffi = { path = "../../../../../crates/kreuzberg-ffi" }
|
|
54
|
+
kreuzberg-ffi = { path = "../../../vendor/kreuzberg-ffi" }
|
|
55
55
|
magnus = { git = "https://github.com/matsadler/magnus", rev = "f6db11769efb517427bf7f121f9c32e18b059b38", features = [
|
|
56
56
|
"rb-sys",
|
|
57
57
|
] }
|
|
Binary file
|
|
@@ -1025,8 +1025,10 @@ pub fn config_from_file(path: String) -> Result<RHash, Error> {
|
|
|
1025
1025
|
.and_then(|v| magnus::RHash::try_convert(v).map_err(|_| validation_error("Config must be a Hash")))
|
|
1026
1026
|
}
|
|
1027
1027
|
|
|
1028
|
-
/// Discover extraction config from current directory
|
|
1028
|
+
/// Discover extraction config from current directory or parent directories
|
|
1029
1029
|
pub fn config_discover() -> Result<Value, Error> {
|
|
1030
|
+
use std::path::PathBuf;
|
|
1031
|
+
|
|
1030
1032
|
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
1031
1033
|
|
|
1032
1034
|
// Search for config files in order of precedence
|
|
@@ -1038,19 +1040,27 @@ pub fn config_discover() -> Result<Value, Error> {
|
|
|
1038
1040
|
(".kreuzbergrc", "json"),
|
|
1039
1041
|
];
|
|
1040
1042
|
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1043
|
+
// Start from current directory and search up to parent directories
|
|
1044
|
+
let mut current_dir: Option<PathBuf> = std::env::current_dir().ok();
|
|
1045
|
+
|
|
1046
|
+
while let Some(dir) = current_dir {
|
|
1047
|
+
for (name, format) in &config_files {
|
|
1048
|
+
let config_path = dir.join(name);
|
|
1049
|
+
if let Ok(content) = fs::read_to_string(&config_path) {
|
|
1050
|
+
let json_value: serde_json::Value = match *format {
|
|
1051
|
+
"toml" => toml::from_str(&content)
|
|
1052
|
+
.map_err(|e| validation_error(format!("Invalid TOML in {}: {}", config_path.display(), e)))?,
|
|
1053
|
+
"yaml" => serde_yaml_ng::from_str(&content)
|
|
1054
|
+
.map_err(|e| validation_error(format!("Invalid YAML in {}: {}", config_path.display(), e)))?,
|
|
1055
|
+
"json" => serde_json::from_str(&content)
|
|
1056
|
+
.map_err(|e| validation_error(format!("Invalid JSON in {}: {}", config_path.display(), e)))?,
|
|
1057
|
+
_ => unreachable!(),
|
|
1058
|
+
};
|
|
1059
|
+
return json_value_to_ruby(&ruby, &json_value);
|
|
1060
|
+
}
|
|
1053
1061
|
}
|
|
1062
|
+
// Move to parent directory
|
|
1063
|
+
current_dir = dir.parent().map(|p| p.to_path_buf());
|
|
1054
1064
|
}
|
|
1055
1065
|
|
|
1056
1066
|
// Return nil if no config found
|
data/kreuzberg.gemspec
CHANGED
|
@@ -130,10 +130,22 @@ vendor_files = Dir.chdir(__dir__) do
|
|
|
130
130
|
kreuzberg_files + kreuzberg_ffi_files + kreuzberg_tesseract_files + rb_sys_files + workspace_toml
|
|
131
131
|
end
|
|
132
132
|
|
|
133
|
+
# When vendor files exist, get ext/ files from filesystem (to include modified Cargo.toml
|
|
134
|
+
# with vendor paths) instead of from git (which has original 5-level crate paths)
|
|
135
|
+
ext_files_from_fs = Dir.chdir(__dir__) do
|
|
136
|
+
Dir.glob('ext/**/*', File::FNM_DOTMATCH)
|
|
137
|
+
.reject { |f| File.directory?(f) }
|
|
138
|
+
.reject { |f| f.include?('/target/') }
|
|
139
|
+
.grep_v(/\.(swp|bak|tmp)$/)
|
|
140
|
+
.grep_v(/~$/)
|
|
141
|
+
end
|
|
142
|
+
|
|
133
143
|
files = if (ruby_files + core_files + ffi_files).empty?
|
|
134
144
|
fallback_files
|
|
135
145
|
elsif vendor_files.any?
|
|
136
|
-
|
|
146
|
+
# Use ext/ files from filesystem (modified by vendor script) + non-ext ruby files from git
|
|
147
|
+
non_ext_ruby_files = ruby_files.reject { |f| f.start_with?('ext/') }
|
|
148
|
+
non_ext_ruby_files + ext_files_from_fs + vendor_files
|
|
137
149
|
else
|
|
138
150
|
ruby_files + core_files + ffi_files
|
|
139
151
|
end
|
|
@@ -165,7 +177,7 @@ Gem::Specification.new do |spec|
|
|
|
165
177
|
DESC
|
|
166
178
|
spec.homepage = 'https://github.com/kreuzberg-dev/kreuzberg'
|
|
167
179
|
spec.license = 'MIT'
|
|
168
|
-
spec.required_ruby_version = '>= 3.2.0'
|
|
180
|
+
spec.required_ruby_version = '>= 3.2.0', '< 5.0'
|
|
169
181
|
|
|
170
182
|
spec.metadata = {
|
|
171
183
|
'homepage_uri' => spec.homepage,
|
data/lib/kreuzberg/api_proxy.rb
CHANGED
data/lib/kreuzberg/cli_proxy.rb
CHANGED
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -717,7 +717,7 @@ module Kreuzberg
|
|
|
717
717
|
:ocr, :chunking, :language_detection, :pdf_options,
|
|
718
718
|
:image_extraction, :image_preprocessing, :postprocessor,
|
|
719
719
|
:token_reduction, :keywords, :html_options, :pages,
|
|
720
|
-
:max_concurrent_extractions
|
|
720
|
+
:max_concurrent_extractions, :output_format, :result_format
|
|
721
721
|
|
|
722
722
|
# Load configuration from a file.
|
|
723
723
|
#
|
|
@@ -738,7 +738,7 @@ module Kreuzberg
|
|
|
738
738
|
use_cache enable_quality_processing force_ocr ocr chunking
|
|
739
739
|
language_detection pdf_options image_extraction image_preprocessing
|
|
740
740
|
postprocessor token_reduction keywords html_options pages
|
|
741
|
-
max_concurrent_extractions
|
|
741
|
+
max_concurrent_extractions output_format result_format
|
|
742
742
|
].freeze
|
|
743
743
|
|
|
744
744
|
# Aliases for backward compatibility
|
|
@@ -789,41 +789,67 @@ module Kreuzberg
|
|
|
789
789
|
new(**normalize_hash_keys(hash))
|
|
790
790
|
end
|
|
791
791
|
|
|
792
|
-
def initialize(
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
792
|
+
def initialize(hash = nil,
|
|
793
|
+
use_cache: true,
|
|
794
|
+
enable_quality_processing: false,
|
|
795
|
+
force_ocr: false,
|
|
796
|
+
ocr: nil,
|
|
797
|
+
chunking: nil,
|
|
798
|
+
language_detection: nil,
|
|
799
|
+
pdf_options: nil,
|
|
800
|
+
image_extraction: nil,
|
|
801
|
+
image_preprocessing: nil,
|
|
802
|
+
postprocessor: nil,
|
|
803
|
+
token_reduction: nil,
|
|
804
|
+
keywords: nil,
|
|
805
|
+
html_options: nil,
|
|
806
|
+
pages: nil,
|
|
807
|
+
max_concurrent_extractions: nil,
|
|
808
|
+
output_format: nil,
|
|
809
|
+
result_format: nil)
|
|
810
|
+
kwargs = {
|
|
811
|
+
use_cache: use_cache, enable_quality_processing: enable_quality_processing,
|
|
812
|
+
force_ocr: force_ocr, ocr: ocr, chunking: chunking, language_detection: language_detection,
|
|
813
|
+
pdf_options: pdf_options, image_extraction: image_extraction,
|
|
814
|
+
image_preprocessing: image_preprocessing, postprocessor: postprocessor,
|
|
815
|
+
token_reduction: token_reduction, keywords: keywords, html_options: html_options,
|
|
816
|
+
pages: pages, max_concurrent_extractions: max_concurrent_extractions,
|
|
817
|
+
output_format: output_format, result_format: result_format
|
|
818
|
+
}
|
|
819
|
+
extracted = extract_from_hash(hash, kwargs)
|
|
820
|
+
|
|
821
|
+
assign_attributes(extracted)
|
|
822
|
+
end
|
|
823
|
+
|
|
824
|
+
def extract_from_hash(hash, defaults)
|
|
825
|
+
return defaults unless hash.is_a?(Hash)
|
|
826
|
+
|
|
827
|
+
hash = hash.transform_keys(&:to_sym)
|
|
828
|
+
defaults.merge(hash.slice(*defaults.keys))
|
|
829
|
+
end
|
|
830
|
+
|
|
831
|
+
def assign_attributes(params)
|
|
832
|
+
@use_cache = params[:use_cache] ? true : false
|
|
833
|
+
@enable_quality_processing = params[:enable_quality_processing] ? true : false
|
|
834
|
+
@force_ocr = params[:force_ocr] ? true : false
|
|
835
|
+
@ocr = normalize_config(params[:ocr], OCR)
|
|
836
|
+
@chunking = normalize_config(params[:chunking], Chunking)
|
|
837
|
+
@language_detection = normalize_config(params[:language_detection], LanguageDetection)
|
|
838
|
+
@pdf_options = normalize_config(params[:pdf_options], PDF)
|
|
839
|
+
@image_extraction = normalize_config(params[:image_extraction], ImageExtraction)
|
|
840
|
+
@image_preprocessing = normalize_config(params[:image_preprocessing], ImagePreprocessing)
|
|
841
|
+
@postprocessor = normalize_config(params[:postprocessor], PostProcessor)
|
|
842
|
+
@token_reduction = normalize_config(params[:token_reduction], TokenReduction)
|
|
843
|
+
@keywords = normalize_config(params[:keywords], Keywords)
|
|
844
|
+
@html_options = normalize_config(params[:html_options], HtmlOptions)
|
|
845
|
+
@pages = normalize_config(params[:pages], PageConfig)
|
|
846
|
+
@max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
|
|
847
|
+
@output_format = params[:output_format]&.to_s
|
|
848
|
+
@result_format = params[:result_format]&.to_s
|
|
824
849
|
end
|
|
825
850
|
|
|
826
851
|
# rubocop:disable Metrics/CyclomaticComplexity
|
|
852
|
+
# rubocop:disable Metrics/MethodLength
|
|
827
853
|
def to_h
|
|
828
854
|
{
|
|
829
855
|
use_cache: @use_cache,
|
|
@@ -840,9 +866,12 @@ module Kreuzberg
|
|
|
840
866
|
keywords: @keywords&.to_h,
|
|
841
867
|
html_options: @html_options&.to_h,
|
|
842
868
|
pages: @pages&.to_h,
|
|
843
|
-
max_concurrent_extractions: @max_concurrent_extractions
|
|
869
|
+
max_concurrent_extractions: @max_concurrent_extractions,
|
|
870
|
+
output_format: @output_format,
|
|
871
|
+
result_format: @result_format
|
|
844
872
|
}.compact
|
|
845
873
|
end
|
|
874
|
+
# rubocop:enable Metrics/MethodLength
|
|
846
875
|
# rubocop:enable Metrics/CyclomaticComplexity
|
|
847
876
|
|
|
848
877
|
# Serialize configuration to JSON string
|
|
@@ -981,6 +1010,10 @@ module Kreuzberg
|
|
|
981
1010
|
@pages = normalize_config(value, PageConfig)
|
|
982
1011
|
when :max_concurrent_extractions
|
|
983
1012
|
@max_concurrent_extractions = value&.to_i
|
|
1013
|
+
when :output_format
|
|
1014
|
+
@output_format = value&.to_s
|
|
1015
|
+
when :result_format
|
|
1016
|
+
@result_format = value&.to_s
|
|
984
1017
|
else
|
|
985
1018
|
raise ArgumentError, "Unknown configuration key: #{key}"
|
|
986
1019
|
end
|
|
@@ -1028,6 +1061,8 @@ module Kreuzberg
|
|
|
1028
1061
|
@html_options = merged.html_options
|
|
1029
1062
|
@pages = merged.pages
|
|
1030
1063
|
@max_concurrent_extractions = merged.max_concurrent_extractions
|
|
1064
|
+
@output_format = merged.output_format
|
|
1065
|
+
@result_format = merged.result_format
|
|
1031
1066
|
end
|
|
1032
1067
|
end
|
|
1033
1068
|
end
|
data/lib/kreuzberg/mcp_proxy.rb
CHANGED
data/lib/kreuzberg/version.rb
CHANGED
data/sig/kreuzberg.rbs
CHANGED
|
@@ -202,6 +202,8 @@ module Kreuzberg
|
|
|
202
202
|
attr_reader html_options: HtmlOptions?
|
|
203
203
|
attr_reader pages: PageConfig?
|
|
204
204
|
attr_reader max_concurrent_extractions: Integer?
|
|
205
|
+
attr_reader output_format: String?
|
|
206
|
+
attr_reader result_format: String?
|
|
205
207
|
|
|
206
208
|
def self.from_file: (String path) -> Extraction
|
|
207
209
|
def initialize: (
|
|
@@ -219,7 +221,9 @@ module Kreuzberg
|
|
|
219
221
|
?keywords: (Keywords | Hash[Symbol, untyped])?,
|
|
220
222
|
?html_options: (HtmlOptions | Hash[Symbol, untyped])?,
|
|
221
223
|
?pages: (PageConfig | Hash[Symbol, untyped])?,
|
|
222
|
-
?max_concurrent_extractions: Integer
|
|
224
|
+
?max_concurrent_extractions: Integer?,
|
|
225
|
+
?output_format: String?,
|
|
226
|
+
?result_format: String?
|
|
223
227
|
) -> void
|
|
224
228
|
def to_h: () -> Hash[Symbol, untyped]
|
|
225
229
|
|
|
@@ -592,4 +592,84 @@ RSpec.describe 'Batch Operations' do
|
|
|
592
592
|
paths.each { |p| FileUtils.rm_f(p) }
|
|
593
593
|
end
|
|
594
594
|
end
|
|
595
|
+
|
|
596
|
+
describe 'batch with output and result formats' do
|
|
597
|
+
it 'batch processes with output_format' do
|
|
598
|
+
paths = []
|
|
599
|
+
file = Tempfile.new(['format_test', '.txt']).tap do |f|
|
|
600
|
+
f.write('Test content for output format')
|
|
601
|
+
f.close
|
|
602
|
+
end
|
|
603
|
+
paths << file.path
|
|
604
|
+
|
|
605
|
+
config = Kreuzberg::Config::Extraction.new(output_format: 'markdown')
|
|
606
|
+
results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
|
|
607
|
+
|
|
608
|
+
expect(results).to be_an Array
|
|
609
|
+
expect(results.length).to eq 1
|
|
610
|
+
expect(results[0]).to be_a Kreuzberg::Result
|
|
611
|
+
|
|
612
|
+
paths.each { |p| FileUtils.rm_f(p) }
|
|
613
|
+
end
|
|
614
|
+
|
|
615
|
+
it 'batch processes with result_format' do
|
|
616
|
+
paths = []
|
|
617
|
+
file = Tempfile.new(['format_test', '.txt']).tap do |f|
|
|
618
|
+
f.write('Test content for result format')
|
|
619
|
+
f.close
|
|
620
|
+
end
|
|
621
|
+
paths << file.path
|
|
622
|
+
|
|
623
|
+
config = Kreuzberg::Config::Extraction.new(result_format: 'unified')
|
|
624
|
+
results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
|
|
625
|
+
|
|
626
|
+
expect(results).to be_an Array
|
|
627
|
+
expect(results.length).to eq 1
|
|
628
|
+
expect(results[0]).to be_a Kreuzberg::Result
|
|
629
|
+
|
|
630
|
+
paths.each { |p| FileUtils.rm_f(p) }
|
|
631
|
+
end
|
|
632
|
+
|
|
633
|
+
it 'batch processes with both output and result formats' do
|
|
634
|
+
paths = []
|
|
635
|
+
file = Tempfile.new(['format_test', '.txt']).tap do |f|
|
|
636
|
+
f.write('Test content for both formats')
|
|
637
|
+
f.close
|
|
638
|
+
end
|
|
639
|
+
paths << file.path
|
|
640
|
+
|
|
641
|
+
config = Kreuzberg::Config::Extraction.new(
|
|
642
|
+
output_format: 'plain',
|
|
643
|
+
result_format: 'element_based'
|
|
644
|
+
)
|
|
645
|
+
results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
|
|
646
|
+
|
|
647
|
+
expect(results).to be_an Array
|
|
648
|
+
expect(results.length).to eq 1
|
|
649
|
+
expect(results[0]).to be_a Kreuzberg::Result
|
|
650
|
+
|
|
651
|
+
paths.each { |p| FileUtils.rm_f(p) }
|
|
652
|
+
end
|
|
653
|
+
|
|
654
|
+
it 'batch processes with chunking and output_format' do
|
|
655
|
+
paths = []
|
|
656
|
+
file = Tempfile.new(['format_test', '.txt']).tap do |f|
|
|
657
|
+
f.write('Test content ' * 100)
|
|
658
|
+
f.close
|
|
659
|
+
end
|
|
660
|
+
paths << file.path
|
|
661
|
+
|
|
662
|
+
config = Kreuzberg::Config::Extraction.new(
|
|
663
|
+
output_format: 'markdown',
|
|
664
|
+
chunking: { max_chars: 1000 }
|
|
665
|
+
)
|
|
666
|
+
results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
|
|
667
|
+
|
|
668
|
+
expect(results).to be_an Array
|
|
669
|
+
expect(results.length).to eq 1
|
|
670
|
+
expect(results[0]).to be_a Kreuzberg::Result
|
|
671
|
+
|
|
672
|
+
paths.each { |p| FileUtils.rm_f(p) }
|
|
673
|
+
end
|
|
674
|
+
end
|
|
595
675
|
end
|
|
@@ -1154,64 +1154,11 @@ RSpec.describe 'Kreuzberg Metadata Types' do
|
|
|
1154
1154
|
|
|
1155
1155
|
describe 'Thread Safety: Concurrent Extraction' do
|
|
1156
1156
|
it 'handles concurrent extraction safely' do
|
|
1157
|
-
test_files = []
|
|
1158
|
-
results = []
|
|
1159
|
-
errors = []
|
|
1157
|
+
test_files = create_concurrent_test_files
|
|
1158
|
+
results, errors = run_concurrent_extractions(test_files)
|
|
1160
1159
|
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
<html>
|
|
1164
|
-
<head>
|
|
1165
|
-
<title>Concurrent Test #{i}</title>
|
|
1166
|
-
<meta name="description" content="Test document #{i}">
|
|
1167
|
-
<meta name="keywords" content="test#{i}, concurrent, thread-safe">
|
|
1168
|
-
</head>
|
|
1169
|
-
<body>
|
|
1170
|
-
<h1>Test Document #{i}</h1>
|
|
1171
|
-
<p>Content for test #{i}</p>
|
|
1172
|
-
<a href="/page-#{i}">Link #{i}</a>
|
|
1173
|
-
<img src="image-#{i}.jpg" alt="Image #{i}">
|
|
1174
|
-
</body>
|
|
1175
|
-
</html>
|
|
1176
|
-
HTML
|
|
1177
|
-
test_files << create_test_html_file(html_content)
|
|
1178
|
-
end
|
|
1179
|
-
|
|
1180
|
-
begin
|
|
1181
|
-
threads = test_files.map do |file|
|
|
1182
|
-
Thread.new do
|
|
1183
|
-
result = Kreuzberg.extract_file_sync(path: file)
|
|
1184
|
-
results << result
|
|
1185
|
-
rescue StandardError => e
|
|
1186
|
-
errors << e
|
|
1187
|
-
end
|
|
1188
|
-
end
|
|
1189
|
-
|
|
1190
|
-
threads.each(&:join)
|
|
1191
|
-
|
|
1192
|
-
expect(errors).to be_empty
|
|
1193
|
-
|
|
1194
|
-
expect(results.length).to eq(5)
|
|
1195
|
-
results.each do |result|
|
|
1196
|
-
expect(result).to be_a(Kreuzberg::Result)
|
|
1197
|
-
expect(result.metadata).not_to be_nil
|
|
1198
|
-
|
|
1199
|
-
metadata = result.metadata
|
|
1200
|
-
next unless metadata.is_a?(Kreuzberg::HtmlMetadata)
|
|
1201
|
-
|
|
1202
|
-
expect(metadata.title).not_to be_nil
|
|
1203
|
-
expect(metadata.description).not_to be_nil
|
|
1204
|
-
expect(metadata.keywords).to be_a(Array)
|
|
1205
|
-
expect(metadata.headers).to be_a(Array)
|
|
1206
|
-
expect(metadata.links).to be_a(Array)
|
|
1207
|
-
expect(metadata.images).to be_a(Array)
|
|
1208
|
-
end
|
|
1209
|
-
|
|
1210
|
-
titles = results.map { |r| r.metadata.is_a?(Kreuzberg::HtmlMetadata) ? r.metadata.title : r.metadata['title'] }
|
|
1211
|
-
expect(titles.uniq.length).to eq(5)
|
|
1212
|
-
ensure
|
|
1213
|
-
test_files.each { |f| FileUtils.rm_f(f) }
|
|
1214
|
-
end
|
|
1160
|
+
expect(results).not_to be_empty
|
|
1161
|
+
verify_concurrent_results(results, errors, test_files)
|
|
1215
1162
|
end
|
|
1216
1163
|
end
|
|
1217
1164
|
|
|
@@ -1225,4 +1172,77 @@ RSpec.describe 'Kreuzberg Metadata Types' do
|
|
|
1225
1172
|
file.close
|
|
1226
1173
|
file.path
|
|
1227
1174
|
end
|
|
1175
|
+
|
|
1176
|
+
def create_concurrent_test_files
|
|
1177
|
+
test_files = []
|
|
1178
|
+
5.times do |i|
|
|
1179
|
+
html_content = <<~HTML
|
|
1180
|
+
<html>
|
|
1181
|
+
<head>
|
|
1182
|
+
<title>Concurrent Test #{i}</title>
|
|
1183
|
+
<meta name="description" content="Test document #{i}">
|
|
1184
|
+
<meta name="keywords" content="test#{i}, concurrent, thread-safe">
|
|
1185
|
+
</head>
|
|
1186
|
+
<body>
|
|
1187
|
+
<h1>Test Document #{i}</h1>
|
|
1188
|
+
<p>Content for test #{i}</p>
|
|
1189
|
+
<a href="/page-#{i}">Link #{i}</a>
|
|
1190
|
+
<img src="image-#{i}.jpg" alt="Image #{i}">
|
|
1191
|
+
</body>
|
|
1192
|
+
</html>
|
|
1193
|
+
HTML
|
|
1194
|
+
test_files << create_test_html_file(html_content)
|
|
1195
|
+
end
|
|
1196
|
+
test_files
|
|
1197
|
+
end
|
|
1198
|
+
|
|
1199
|
+
def run_concurrent_extractions(test_files)
|
|
1200
|
+
results = []
|
|
1201
|
+
errors = []
|
|
1202
|
+
|
|
1203
|
+
threads = test_files.map do |file|
|
|
1204
|
+
Thread.new do
|
|
1205
|
+
result = Kreuzberg.extract_file_sync(path: file)
|
|
1206
|
+
results << result
|
|
1207
|
+
rescue StandardError => e
|
|
1208
|
+
errors << e
|
|
1209
|
+
end
|
|
1210
|
+
end
|
|
1211
|
+
|
|
1212
|
+
threads.each(&:join)
|
|
1213
|
+
[results, errors]
|
|
1214
|
+
end
|
|
1215
|
+
|
|
1216
|
+
def verify_concurrent_results(results, errors, test_files)
|
|
1217
|
+
expect(errors).to be_empty
|
|
1218
|
+
expect(results.length).to eq(5)
|
|
1219
|
+
|
|
1220
|
+
results.each do |result|
|
|
1221
|
+
expect(result).to be_a(Kreuzberg::Result)
|
|
1222
|
+
expect(result.metadata).not_to be_nil
|
|
1223
|
+
|
|
1224
|
+
metadata = result.metadata
|
|
1225
|
+
next unless metadata.is_a?(Kreuzberg::HtmlMetadata)
|
|
1226
|
+
|
|
1227
|
+
verify_metadata_fields(metadata)
|
|
1228
|
+
end
|
|
1229
|
+
|
|
1230
|
+
titles = extract_titles(results)
|
|
1231
|
+
expect(titles.uniq.length).to eq(5)
|
|
1232
|
+
ensure
|
|
1233
|
+
test_files.each { |f| FileUtils.rm_f(f) }
|
|
1234
|
+
end
|
|
1235
|
+
|
|
1236
|
+
def verify_metadata_fields(metadata)
|
|
1237
|
+
expect(metadata.title).not_to be_nil
|
|
1238
|
+
expect(metadata.description).not_to be_nil
|
|
1239
|
+
expect(metadata.keywords).to be_a(Array)
|
|
1240
|
+
expect(metadata.headers).to be_a(Array)
|
|
1241
|
+
expect(metadata.links).to be_a(Array)
|
|
1242
|
+
expect(metadata.images).to be_a(Array)
|
|
1243
|
+
end
|
|
1244
|
+
|
|
1245
|
+
def extract_titles(results)
|
|
1246
|
+
results.map { |r| r.metadata.is_a?(Kreuzberg::HtmlMetadata) ? r.metadata.title : r.metadata['title'] }
|
|
1247
|
+
end
|
|
1228
1248
|
end
|