kreuzberg 4.1.2 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
- data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
- data/kreuzberg.gemspec +13 -1
- data/lib/kreuzberg/cli.rb +16 -6
- data/lib/kreuzberg/cli_proxy.rb +3 -1
- data/lib/kreuzberg/config.rb +121 -39
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/extraction_api.rb +20 -4
- data/lib/kreuzberg/result.rb +12 -2
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +1 -0
- data/sig/kreuzberg.rbs +28 -12
- data/spec/binding/batch_operations_spec.rb +80 -0
- data/spec/binding/batch_spec.rb +6 -5
- data/spec/binding/error_recovery_spec.rb +3 -3
- data/spec/binding/metadata_types_spec.rb +77 -57
- data/spec/binding/tables_spec.rb +11 -2
- data/spec/serialization_spec.rb +134 -0
- data/spec/unit/config/output_format_spec.rb +380 -0
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/startup.rs +15 -1
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
- data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
- data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
- data/vendor/kreuzberg/src/core/io.rs +7 -7
- data/vendor/kreuzberg/src/core/mime.rs +4 -4
- data/vendor/kreuzberg/src/embeddings.rs +4 -4
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
- data/vendor/kreuzberg/src/mcp/format.rs +237 -39
- data/vendor/kreuzberg/src/mcp/params.rs +26 -33
- data/vendor/kreuzberg/src/mcp/server.rs +6 -3
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
- data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
- data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
- data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
- data/vendor/kreuzberg/tests/api_embed.rs +84 -50
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
- data/vendor/kreuzberg/tests/api_tests.rs +298 -139
- data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
- data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
- data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
- data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
- data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
- data/vendor/kreuzberg/tests/config_features.rs +19 -15
- data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
- data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
- data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
- data/vendor/kreuzberg/tests/core_integration.rs +57 -57
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
- data/vendor/kreuzberg/tests/email_integration.rs +7 -7
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/error_handling.rs +13 -11
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
- data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
- data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
- data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
- data/vendor/kreuzberg/tests/page_markers.rs +1 -1
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
- data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
- data/vendor/kreuzberg/tests/security_validation.rs +20 -19
- data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
- data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +12 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: '010412940492f83bc170a4a4efc644ac7e3818502734035523796287837a3893'
|
|
4
|
+
data.tar.gz: af24cba007cc58283d678d4b15251ebae3a5740f60ade9a54cc19973a2022a82
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ad67348bec54a01ca3592ed72e9b2b8bc9e711a37e11b40ada31466c67f834132fc0de278c53a1c014fa6751da7abebae934cff2a9cc1835f7e056c895a273cb
|
|
7
|
+
data.tar.gz: ca2cdb076a5d1af67f0e807978a966d1a391cc286bcdf5499544e3403196140918a54674beab77ea09fc0e8bc7ab66f357da5d984326a511b1d21643a3d6cf41
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.1
|
|
4
|
+
kreuzberg (4.2.1)
|
|
5
5
|
|
|
6
6
|
GEM
|
|
7
7
|
remote: https://rubygems.org/
|
|
@@ -207,7 +207,7 @@ CHECKSUMS
|
|
|
207
207
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
208
208
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
209
209
|
json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
|
|
210
|
-
kreuzberg (4.1
|
|
210
|
+
kreuzberg (4.2.1)
|
|
211
211
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
212
212
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
213
213
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.1
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.1" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -716,9 +716,9 @@ dependencies = [
|
|
|
716
716
|
|
|
717
717
|
[[package]]
|
|
718
718
|
name = "cc"
|
|
719
|
-
version = "1.2.
|
|
719
|
+
version = "1.2.54"
|
|
720
720
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
721
|
-
checksum = "
|
|
721
|
+
checksum = "6354c81bbfd62d9cfa9cb3c773c2b7b2a3a482d569de977fd0e961f6e7c00583"
|
|
722
722
|
dependencies = [
|
|
723
723
|
"find-msvc-tools",
|
|
724
724
|
"jobserver",
|
|
@@ -2027,9 +2027,9 @@ dependencies = [
|
|
|
2027
2027
|
|
|
2028
2028
|
[[package]]
|
|
2029
2029
|
name = "html-to-markdown-rs"
|
|
2030
|
-
version = "2.23.
|
|
2030
|
+
version = "2.23.4"
|
|
2031
2031
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2032
|
-
checksum = "
|
|
2032
|
+
checksum = "ffbf49450676163bdf69fac2acf72674fcf2d2aaf690aa247368c567cc9afb2a"
|
|
2033
2033
|
dependencies = [
|
|
2034
2034
|
"astral-tl",
|
|
2035
2035
|
"base64 0.22.1",
|
|
@@ -2627,7 +2627,7 @@ dependencies = [
|
|
|
2627
2627
|
|
|
2628
2628
|
[[package]]
|
|
2629
2629
|
name = "kreuzberg"
|
|
2630
|
-
version = "4.
|
|
2630
|
+
version = "4.2.0"
|
|
2631
2631
|
dependencies = [
|
|
2632
2632
|
"ahash",
|
|
2633
2633
|
"async-trait",
|
|
@@ -2706,17 +2706,18 @@ dependencies = [
|
|
|
2706
2706
|
"uuid",
|
|
2707
2707
|
"whatlang",
|
|
2708
2708
|
"yake-rust",
|
|
2709
|
-
"zip 7.
|
|
2709
|
+
"zip 7.2.0",
|
|
2710
2710
|
]
|
|
2711
2711
|
|
|
2712
2712
|
[[package]]
|
|
2713
2713
|
name = "kreuzberg-ffi"
|
|
2714
|
-
version = "4.
|
|
2714
|
+
version = "4.2.0"
|
|
2715
2715
|
dependencies = [
|
|
2716
2716
|
"async-trait",
|
|
2717
2717
|
"cbindgen",
|
|
2718
2718
|
"html-to-markdown-rs",
|
|
2719
2719
|
"kreuzberg",
|
|
2720
|
+
"log",
|
|
2720
2721
|
"serde",
|
|
2721
2722
|
"serde_json",
|
|
2722
2723
|
"tokio",
|
|
@@ -2767,14 +2768,14 @@ dependencies = [
|
|
|
2767
2768
|
|
|
2768
2769
|
[[package]]
|
|
2769
2770
|
name = "kreuzberg-tesseract"
|
|
2770
|
-
version = "4.
|
|
2771
|
+
version = "4.2.0"
|
|
2771
2772
|
dependencies = [
|
|
2772
2773
|
"cc",
|
|
2773
2774
|
"cmake",
|
|
2774
2775
|
"libc",
|
|
2775
2776
|
"reqwest 0.13.1",
|
|
2776
2777
|
"thiserror 2.0.18",
|
|
2777
|
-
"zip 7.
|
|
2778
|
+
"zip 7.2.0",
|
|
2778
2779
|
]
|
|
2779
2780
|
|
|
2780
2781
|
[[package]]
|
|
@@ -5022,9 +5023,9 @@ dependencies = [
|
|
|
5022
5023
|
|
|
5023
5024
|
[[package]]
|
|
5024
5025
|
name = "rmcp"
|
|
5025
|
-
version = "0.
|
|
5026
|
+
version = "0.14.0"
|
|
5026
5027
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5027
|
-
checksum = "
|
|
5028
|
+
checksum = "0a621b37a548ff6ab6292d57841eb25785a7f146d89391a19c9f199414bd13da"
|
|
5028
5029
|
dependencies = [
|
|
5029
5030
|
"async-trait",
|
|
5030
5031
|
"axum",
|
|
@@ -5054,9 +5055,9 @@ dependencies = [
|
|
|
5054
5055
|
|
|
5055
5056
|
[[package]]
|
|
5056
5057
|
name = "rmcp-macros"
|
|
5057
|
-
version = "0.
|
|
5058
|
+
version = "0.14.0"
|
|
5058
5059
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5059
|
-
checksum = "
|
|
5060
|
+
checksum = "6b79ed92303f9262db79575aa8c3652581668e9d136be6fd0b9ededa78954c95"
|
|
5060
5061
|
dependencies = [
|
|
5061
5062
|
"darling 0.23.0",
|
|
5062
5063
|
"proc-macro2",
|
|
@@ -6013,6 +6014,7 @@ checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d"
|
|
|
6013
6014
|
dependencies = [
|
|
6014
6015
|
"deranged",
|
|
6015
6016
|
"itoa",
|
|
6017
|
+
"js-sys",
|
|
6016
6018
|
"num-conv",
|
|
6017
6019
|
"powerfmt",
|
|
6018
6020
|
"serde",
|
|
@@ -6405,6 +6407,12 @@ version = "0.25.1"
|
|
|
6405
6407
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6406
6408
|
checksum = "d2df906b07856748fa3f6e0ad0cbaa047052d4a7dd609e231c4f72cee8c36f31"
|
|
6407
6409
|
|
|
6410
|
+
[[package]]
|
|
6411
|
+
name = "typed-path"
|
|
6412
|
+
version = "0.12.1"
|
|
6413
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6414
|
+
checksum = "e43ffa54726cdc9ea78392023ffe9fe9cf9ac779e1c6fcb0d23f9862e3879d20"
|
|
6415
|
+
|
|
6408
6416
|
[[package]]
|
|
6409
6417
|
name = "typeid"
|
|
6410
6418
|
version = "1.0.3"
|
|
@@ -6698,9 +6706,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
|
|
6698
6706
|
|
|
6699
6707
|
[[package]]
|
|
6700
6708
|
name = "uuid"
|
|
6701
|
-
version = "1.
|
|
6709
|
+
version = "1.20.0"
|
|
6702
6710
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6703
|
-
checksum = "
|
|
6711
|
+
checksum = "ee48d38b119b0cd71fe4141b30f5ba9c7c5d9f4e7a3a8b4a674e4b6ef789976f"
|
|
6704
6712
|
dependencies = [
|
|
6705
6713
|
"getrandom 0.3.4",
|
|
6706
6714
|
"js-sys",
|
|
@@ -7498,9 +7506,9 @@ dependencies = [
|
|
|
7498
7506
|
|
|
7499
7507
|
[[package]]
|
|
7500
7508
|
name = "zip"
|
|
7501
|
-
version = "7.
|
|
7509
|
+
version = "7.2.0"
|
|
7502
7510
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
7503
|
-
checksum = "
|
|
7511
|
+
checksum = "c42e33efc22a0650c311c2ef19115ce232583abbe80850bc8b66509ebef02de0"
|
|
7504
7512
|
dependencies = [
|
|
7505
7513
|
"aes",
|
|
7506
7514
|
"bzip2",
|
|
@@ -7518,6 +7526,7 @@ dependencies = [
|
|
|
7518
7526
|
"ppmd-rust",
|
|
7519
7527
|
"sha1",
|
|
7520
7528
|
"time",
|
|
7529
|
+
"typed-path",
|
|
7521
7530
|
"zeroize",
|
|
7522
7531
|
"zopfli",
|
|
7523
7532
|
"zstd",
|
|
Binary file
|
data/kreuzberg.gemspec
CHANGED
|
@@ -130,10 +130,22 @@ vendor_files = Dir.chdir(__dir__) do
|
|
|
130
130
|
kreuzberg_files + kreuzberg_ffi_files + kreuzberg_tesseract_files + rb_sys_files + workspace_toml
|
|
131
131
|
end
|
|
132
132
|
|
|
133
|
+
# When vendor files exist, get ext/ files from filesystem (to include modified Cargo.toml
|
|
134
|
+
# with vendor paths) instead of from git (which has original 5-level crate paths)
|
|
135
|
+
ext_files_from_fs = Dir.chdir(__dir__) do
|
|
136
|
+
Dir.glob('ext/**/*', File::FNM_DOTMATCH)
|
|
137
|
+
.reject { |f| File.directory?(f) }
|
|
138
|
+
.reject { |f| f.include?('/target/') }
|
|
139
|
+
.grep_v(/\.(swp|bak|tmp)$/)
|
|
140
|
+
.grep_v(/~$/)
|
|
141
|
+
end
|
|
142
|
+
|
|
133
143
|
files = if (ruby_files + core_files + ffi_files).empty?
|
|
134
144
|
fallback_files
|
|
135
145
|
elsif vendor_files.any?
|
|
136
|
-
|
|
146
|
+
# Use ext/ files from filesystem (modified by vendor script) + non-ext ruby files from git
|
|
147
|
+
non_ext_ruby_files = ruby_files.reject { |f| f.start_with?('ext/') }
|
|
148
|
+
non_ext_ruby_files + ext_files_from_fs + vendor_files
|
|
137
149
|
else
|
|
138
150
|
ruby_files + core_files + ffi_files
|
|
139
151
|
end
|
data/lib/kreuzberg/cli.rb
CHANGED
|
@@ -8,24 +8,34 @@ module Kreuzberg
|
|
|
8
8
|
|
|
9
9
|
# Extract content from a file using the CLI
|
|
10
10
|
#
|
|
11
|
-
# @param
|
|
11
|
+
# @param path_or_nil [String, nil] Path to the file (positional, for backward compatibility)
|
|
12
|
+
# @param path [String] Path to the file (keyword argument)
|
|
12
13
|
# @param output [String] Output format ("text", "json", "markdown")
|
|
13
14
|
# @param ocr [Boolean] Enable OCR
|
|
14
15
|
# @return [String] Extracted content
|
|
15
16
|
#
|
|
16
|
-
def extract(path, output: 'text', ocr: false)
|
|
17
|
-
|
|
17
|
+
def extract(path_or_nil = nil, path: nil, output: 'text', ocr: false)
|
|
18
|
+
# Support both positional and keyword argument for path (backward compatibility)
|
|
19
|
+
actual_path = path_or_nil || path
|
|
20
|
+
raise ArgumentError, 'path is required' if actual_path.nil?
|
|
21
|
+
|
|
22
|
+
args = ['extract', actual_path, '--format', output]
|
|
18
23
|
args.push('--ocr', ocr ? 'true' : 'false')
|
|
19
24
|
CLIProxy.call(args)
|
|
20
25
|
end
|
|
21
26
|
|
|
22
27
|
# Detect MIME type of a file using the CLI
|
|
23
28
|
#
|
|
24
|
-
# @param
|
|
29
|
+
# @param path_or_nil [String, nil] Path to the file (positional, for backward compatibility)
|
|
30
|
+
# @param path [String] Path to the file (keyword argument)
|
|
25
31
|
# @return [String] MIME type
|
|
26
32
|
#
|
|
27
|
-
def detect(path)
|
|
28
|
-
|
|
33
|
+
def detect(path_or_nil = nil, path: nil)
|
|
34
|
+
# Support both positional and keyword argument for path (backward compatibility)
|
|
35
|
+
actual_path = path_or_nil || path
|
|
36
|
+
raise ArgumentError, 'path is required' if actual_path.nil?
|
|
37
|
+
|
|
38
|
+
CLIProxy.call(['detect', actual_path]).strip
|
|
29
39
|
end
|
|
30
40
|
|
|
31
41
|
# Get CLI version
|
data/lib/kreuzberg/cli_proxy.rb
CHANGED
|
@@ -91,11 +91,13 @@ module Kreuzberg
|
|
|
91
91
|
lib_path.join('bin', binary_name),
|
|
92
92
|
lib_path.join(binary_name),
|
|
93
93
|
root_path.join('../../crates/kreuzberg-cli/target/release', binary_name),
|
|
94
|
-
root_path.join('../../target/release', binary_name)
|
|
94
|
+
root_path.join('../../target/release', binary_name),
|
|
95
|
+
root_path.join('../../target/debug', binary_name)
|
|
95
96
|
]
|
|
96
97
|
|
|
97
98
|
workspace_root = root_path.parent&.parent
|
|
98
99
|
paths << workspace_root.join('target', 'release', binary_name) if workspace_root
|
|
100
|
+
paths << workspace_root.join('target', 'debug', binary_name) if workspace_root
|
|
99
101
|
|
|
100
102
|
paths
|
|
101
103
|
end
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -715,9 +715,12 @@ module Kreuzberg
|
|
|
715
715
|
class Extraction
|
|
716
716
|
attr_reader :use_cache, :enable_quality_processing, :force_ocr,
|
|
717
717
|
:ocr, :chunking, :language_detection, :pdf_options,
|
|
718
|
-
:
|
|
718
|
+
:images, :image_preprocessing, :postprocessor,
|
|
719
719
|
:token_reduction, :keywords, :html_options, :pages,
|
|
720
|
-
:max_concurrent_extractions
|
|
720
|
+
:max_concurrent_extractions, :output_format, :result_format
|
|
721
|
+
|
|
722
|
+
# Alias for backward compatibility - image_extraction is the canonical name
|
|
723
|
+
alias image_extraction images
|
|
721
724
|
|
|
722
725
|
# Load configuration from a file.
|
|
723
726
|
#
|
|
@@ -738,7 +741,7 @@ module Kreuzberg
|
|
|
738
741
|
use_cache enable_quality_processing force_ocr ocr chunking
|
|
739
742
|
language_detection pdf_options image_extraction image_preprocessing
|
|
740
743
|
postprocessor token_reduction keywords html_options pages
|
|
741
|
-
max_concurrent_extractions
|
|
744
|
+
max_concurrent_extractions output_format result_format
|
|
742
745
|
].freeze
|
|
743
746
|
|
|
744
747
|
# Aliases for backward compatibility
|
|
@@ -746,6 +749,12 @@ module Kreuzberg
|
|
|
746
749
|
images: :image_extraction
|
|
747
750
|
}.freeze
|
|
748
751
|
|
|
752
|
+
# Valid output format values (case-insensitive, normalized internally)
|
|
753
|
+
VALID_OUTPUT_FORMATS = %w[plain markdown html djot].freeze
|
|
754
|
+
|
|
755
|
+
# Valid result format values (case-insensitive, normalized internally)
|
|
756
|
+
VALID_RESULT_FORMATS = %w[unified elements element_based].freeze
|
|
757
|
+
|
|
749
758
|
def self.from_file(path)
|
|
750
759
|
hash = Kreuzberg._config_from_file_native(path)
|
|
751
760
|
new(**normalize_hash_keys(hash))
|
|
@@ -789,41 +798,87 @@ module Kreuzberg
|
|
|
789
798
|
new(**normalize_hash_keys(hash))
|
|
790
799
|
end
|
|
791
800
|
|
|
792
|
-
def initialize(
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
801
|
+
def initialize(hash = nil,
|
|
802
|
+
use_cache: true,
|
|
803
|
+
enable_quality_processing: false,
|
|
804
|
+
force_ocr: false,
|
|
805
|
+
ocr: nil,
|
|
806
|
+
chunking: nil,
|
|
807
|
+
language_detection: nil,
|
|
808
|
+
pdf_options: nil,
|
|
809
|
+
image_extraction: nil,
|
|
810
|
+
image_preprocessing: nil,
|
|
811
|
+
postprocessor: nil,
|
|
812
|
+
token_reduction: nil,
|
|
813
|
+
keywords: nil,
|
|
814
|
+
html_options: nil,
|
|
815
|
+
pages: nil,
|
|
816
|
+
max_concurrent_extractions: nil,
|
|
817
|
+
output_format: nil,
|
|
818
|
+
result_format: nil)
|
|
819
|
+
kwargs = {
|
|
820
|
+
use_cache: use_cache, enable_quality_processing: enable_quality_processing,
|
|
821
|
+
force_ocr: force_ocr, ocr: ocr, chunking: chunking, language_detection: language_detection,
|
|
822
|
+
pdf_options: pdf_options, image_extraction: image_extraction,
|
|
823
|
+
image_preprocessing: image_preprocessing, postprocessor: postprocessor,
|
|
824
|
+
token_reduction: token_reduction, keywords: keywords, html_options: html_options,
|
|
825
|
+
pages: pages, max_concurrent_extractions: max_concurrent_extractions,
|
|
826
|
+
output_format: output_format, result_format: result_format
|
|
827
|
+
}
|
|
828
|
+
extracted = extract_from_hash(hash, kwargs)
|
|
829
|
+
|
|
830
|
+
assign_attributes(extracted)
|
|
831
|
+
end
|
|
832
|
+
|
|
833
|
+
def extract_from_hash(hash, defaults)
|
|
834
|
+
return defaults unless hash.is_a?(Hash)
|
|
835
|
+
|
|
836
|
+
hash = hash.transform_keys(&:to_sym)
|
|
837
|
+
defaults.merge(hash.slice(*defaults.keys))
|
|
838
|
+
end
|
|
839
|
+
|
|
840
|
+
def assign_attributes(params)
|
|
841
|
+
@use_cache = params[:use_cache] ? true : false
|
|
842
|
+
@enable_quality_processing = params[:enable_quality_processing] ? true : false
|
|
843
|
+
@force_ocr = params[:force_ocr] ? true : false
|
|
844
|
+
@ocr = normalize_config(params[:ocr], OCR)
|
|
845
|
+
@chunking = normalize_config(params[:chunking], Chunking)
|
|
846
|
+
@language_detection = normalize_config(params[:language_detection], LanguageDetection)
|
|
847
|
+
@pdf_options = normalize_config(params[:pdf_options], PDF)
|
|
848
|
+
@images = normalize_config(params[:image_extraction], ImageExtraction)
|
|
849
|
+
@image_preprocessing = normalize_config(params[:image_preprocessing], ImagePreprocessing)
|
|
850
|
+
@postprocessor = normalize_config(params[:postprocessor], PostProcessor)
|
|
851
|
+
@token_reduction = normalize_config(params[:token_reduction], TokenReduction)
|
|
852
|
+
@keywords = normalize_config(params[:keywords], Keywords)
|
|
853
|
+
@html_options = normalize_config(params[:html_options], HtmlOptions)
|
|
854
|
+
@pages = normalize_config(params[:pages], PageConfig)
|
|
855
|
+
@max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
|
|
856
|
+
@output_format = validate_output_format(params[:output_format])
|
|
857
|
+
@result_format = validate_result_format(params[:result_format])
|
|
858
|
+
end
|
|
859
|
+
|
|
860
|
+
def validate_output_format(value)
|
|
861
|
+
return nil if value.nil?
|
|
862
|
+
|
|
863
|
+
str_value = value.to_s.downcase
|
|
864
|
+
return str_value if VALID_OUTPUT_FORMATS.include?(str_value)
|
|
865
|
+
|
|
866
|
+
raise ArgumentError,
|
|
867
|
+
"Invalid output_format: #{value}. Valid values: #{VALID_OUTPUT_FORMATS.join(', ')}"
|
|
868
|
+
end
|
|
869
|
+
|
|
870
|
+
def validate_result_format(value)
|
|
871
|
+
return nil if value.nil?
|
|
872
|
+
|
|
873
|
+
str_value = value.to_s.downcase
|
|
874
|
+
return str_value if VALID_RESULT_FORMATS.include?(str_value)
|
|
875
|
+
|
|
876
|
+
raise ArgumentError,
|
|
877
|
+
"Invalid result_format: #{value}. Valid values: #{VALID_RESULT_FORMATS.join(', ')}"
|
|
824
878
|
end
|
|
825
879
|
|
|
826
880
|
# rubocop:disable Metrics/CyclomaticComplexity
|
|
881
|
+
# rubocop:disable Metrics/MethodLength
|
|
827
882
|
def to_h
|
|
828
883
|
{
|
|
829
884
|
use_cache: @use_cache,
|
|
@@ -833,16 +888,19 @@ module Kreuzberg
|
|
|
833
888
|
chunking: @chunking&.to_h,
|
|
834
889
|
language_detection: @language_detection&.to_h,
|
|
835
890
|
pdf_options: @pdf_options&.to_h,
|
|
836
|
-
|
|
891
|
+
images: @images&.to_h,
|
|
837
892
|
image_preprocessing: @image_preprocessing&.to_h,
|
|
838
893
|
postprocessor: @postprocessor&.to_h,
|
|
839
894
|
token_reduction: @token_reduction&.to_h,
|
|
840
895
|
keywords: @keywords&.to_h,
|
|
841
896
|
html_options: @html_options&.to_h,
|
|
842
897
|
pages: @pages&.to_h,
|
|
843
|
-
max_concurrent_extractions: @max_concurrent_extractions
|
|
898
|
+
max_concurrent_extractions: @max_concurrent_extractions,
|
|
899
|
+
output_format: @output_format,
|
|
900
|
+
result_format: @result_format
|
|
844
901
|
}.compact
|
|
845
902
|
end
|
|
903
|
+
# rubocop:enable Metrics/MethodLength
|
|
846
904
|
# rubocop:enable Metrics/CyclomaticComplexity
|
|
847
905
|
|
|
848
906
|
# Serialize configuration to JSON string
|
|
@@ -966,7 +1024,7 @@ module Kreuzberg
|
|
|
966
1024
|
when :pdf_options
|
|
967
1025
|
@pdf_options = normalize_config(value, PDF)
|
|
968
1026
|
when :image_extraction
|
|
969
|
-
@
|
|
1027
|
+
@images = normalize_config(value, ImageExtraction)
|
|
970
1028
|
when :image_preprocessing
|
|
971
1029
|
@image_preprocessing = normalize_config(value, ImagePreprocessing)
|
|
972
1030
|
when :postprocessor
|
|
@@ -981,6 +1039,10 @@ module Kreuzberg
|
|
|
981
1039
|
@pages = normalize_config(value, PageConfig)
|
|
982
1040
|
when :max_concurrent_extractions
|
|
983
1041
|
@max_concurrent_extractions = value&.to_i
|
|
1042
|
+
when :output_format
|
|
1043
|
+
@output_format = validate_output_format(value)
|
|
1044
|
+
when :result_format
|
|
1045
|
+
@result_format = validate_result_format(value)
|
|
984
1046
|
else
|
|
985
1047
|
raise ArgumentError, "Unknown configuration key: #{key}"
|
|
986
1048
|
end
|
|
@@ -1002,6 +1064,24 @@ module Kreuzberg
|
|
|
1002
1064
|
nil
|
|
1003
1065
|
end
|
|
1004
1066
|
|
|
1067
|
+
# Set output_format attribute
|
|
1068
|
+
#
|
|
1069
|
+
# @param value [String, nil] Output format value
|
|
1070
|
+
# @return [String, nil] The value that was set
|
|
1071
|
+
#
|
|
1072
|
+
def output_format=(value)
|
|
1073
|
+
@output_format = validate_output_format(value)
|
|
1074
|
+
end
|
|
1075
|
+
|
|
1076
|
+
# Set result_format attribute
|
|
1077
|
+
#
|
|
1078
|
+
# @param value [String, nil] Result format value
|
|
1079
|
+
# @return [String, nil] The value that was set
|
|
1080
|
+
#
|
|
1081
|
+
def result_format=(value)
|
|
1082
|
+
@result_format = validate_result_format(value)
|
|
1083
|
+
end
|
|
1084
|
+
|
|
1005
1085
|
private
|
|
1006
1086
|
|
|
1007
1087
|
def normalize_config(value, klass)
|
|
@@ -1020,7 +1100,7 @@ module Kreuzberg
|
|
|
1020
1100
|
@chunking = merged.chunking
|
|
1021
1101
|
@language_detection = merged.language_detection
|
|
1022
1102
|
@pdf_options = merged.pdf_options
|
|
1023
|
-
@
|
|
1103
|
+
@images = merged.image_extraction
|
|
1024
1104
|
@image_preprocessing = merged.image_preprocessing
|
|
1025
1105
|
@postprocessor = merged.postprocessor
|
|
1026
1106
|
@token_reduction = merged.token_reduction
|
|
@@ -1028,6 +1108,8 @@ module Kreuzberg
|
|
|
1028
1108
|
@html_options = merged.html_options
|
|
1029
1109
|
@pages = merged.pages
|
|
1030
1110
|
@max_concurrent_extractions = merged.max_concurrent_extractions
|
|
1111
|
+
@output_format = merged.output_format
|
|
1112
|
+
@result_format = merged.result_format
|
|
1031
1113
|
end
|
|
1032
1114
|
end
|
|
1033
1115
|
end
|