kreuzberg 4.2.0 → 4.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
- data/lib/kreuzberg/cli.rb +16 -6
- data/lib/kreuzberg/cli_proxy.rb +3 -1
- data/lib/kreuzberg/config.rb +59 -28
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/extraction_api.rb +20 -4
- data/lib/kreuzberg/result.rb +12 -2
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +1 -0
- data/sig/kreuzberg.rbs +23 -11
- data/spec/binding/batch_spec.rb +6 -5
- data/spec/binding/config_spec.rb +1 -1
- data/spec/binding/error_recovery_spec.rb +3 -3
- data/spec/binding/tables_spec.rb +11 -2
- data/spec/unit/config/extraction_config_spec.rb +2 -2
- data/spec/unit/config/output_format_spec.rb +18 -18
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +3 -2
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +60 -0
- data/vendor/kreuzberg/src/api/handlers.rs +153 -32
- data/vendor/kreuzberg/src/api/mod.rs +2 -0
- data/vendor/kreuzberg/src/api/openapi.rs +141 -0
- data/vendor/kreuzberg/src/api/router.rs +24 -2
- data/vendor/kreuzberg/src/api/startup.rs +21 -1
- data/vendor/kreuzberg/src/api/types.rs +50 -4
- data/vendor/kreuzberg/src/core/config/processing.rs +8 -1
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
- data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
- data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
- data/vendor/kreuzberg/src/core/io.rs +7 -7
- data/vendor/kreuzberg/src/core/mime.rs +4 -4
- data/vendor/kreuzberg/src/extraction/excel.rs +246 -9
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
- data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
- data/vendor/kreuzberg/tests/config_behavioral.rs +14 -12
- data/vendor/kreuzberg/tests/core_integration.rs +2 -4
- data/vendor/kreuzberg/tests/mime_detection.rs +3 -2
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +284 -1
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +56 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
- data/vendor/kreuzberg-ffi/src/helpers.rs +13 -1
- data/vendor/kreuzberg-ffi/src/lib.rs +8 -5
- data/vendor/kreuzberg-ffi/src/memory.rs +35 -1
- data/vendor/kreuzberg-ffi/src/types.rs +8 -5
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +5 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 1765714785cbe89567dcb13ed0c1e1b79c2da7a2143a0d0b4653c5578a3ada84
|
|
4
|
+
data.tar.gz: 64d6db5e4d88992920f37fe9a3e28ab08b5bdd0b28385da570e8207e67d90f34
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: cbb71395a285ddb1a74101fc935ebb8266b81c6172043a128721ab37fad583c7202e559f9e8cb2534bf110721bf20e2d0cbe6838554c772831c56bc09583bf75
|
|
7
|
+
data.tar.gz: b752cf56da8810211e5efd5e5d69f136eb7d0a3d5e27e985b81dff18bde442f0033b15962823ad7e3c5a27e080d02b6a6df1726bffe4aa21eaf89f56a5c6b56f
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.2.
|
|
4
|
+
kreuzberg (4.2.2)
|
|
5
5
|
|
|
6
6
|
GEM
|
|
7
7
|
remote: https://rubygems.org/
|
|
@@ -207,7 +207,7 @@ CHECKSUMS
|
|
|
207
207
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
208
208
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
209
209
|
json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
|
|
210
|
-
kreuzberg (4.2.
|
|
210
|
+
kreuzberg (4.2.2)
|
|
211
211
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
212
212
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
213
213
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.2" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -716,9 +716,9 @@ dependencies = [
|
|
|
716
716
|
|
|
717
717
|
[[package]]
|
|
718
718
|
name = "cc"
|
|
719
|
-
version = "1.2.
|
|
719
|
+
version = "1.2.54"
|
|
720
720
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
721
|
-
checksum = "
|
|
721
|
+
checksum = "6354c81bbfd62d9cfa9cb3c773c2b7b2a3a482d569de977fd0e961f6e7c00583"
|
|
722
722
|
dependencies = [
|
|
723
723
|
"find-msvc-tools",
|
|
724
724
|
"jobserver",
|
|
@@ -2027,9 +2027,9 @@ dependencies = [
|
|
|
2027
2027
|
|
|
2028
2028
|
[[package]]
|
|
2029
2029
|
name = "html-to-markdown-rs"
|
|
2030
|
-
version = "2.23.
|
|
2030
|
+
version = "2.23.4"
|
|
2031
2031
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2032
|
-
checksum = "
|
|
2032
|
+
checksum = "ffbf49450676163bdf69fac2acf72674fcf2d2aaf690aa247368c567cc9afb2a"
|
|
2033
2033
|
dependencies = [
|
|
2034
2034
|
"astral-tl",
|
|
2035
2035
|
"base64 0.22.1",
|
|
@@ -2627,7 +2627,7 @@ dependencies = [
|
|
|
2627
2627
|
|
|
2628
2628
|
[[package]]
|
|
2629
2629
|
name = "kreuzberg"
|
|
2630
|
-
version = "4.
|
|
2630
|
+
version = "4.2.0"
|
|
2631
2631
|
dependencies = [
|
|
2632
2632
|
"ahash",
|
|
2633
2633
|
"async-trait",
|
|
@@ -2706,17 +2706,18 @@ dependencies = [
|
|
|
2706
2706
|
"uuid",
|
|
2707
2707
|
"whatlang",
|
|
2708
2708
|
"yake-rust",
|
|
2709
|
-
"zip 7.
|
|
2709
|
+
"zip 7.2.0",
|
|
2710
2710
|
]
|
|
2711
2711
|
|
|
2712
2712
|
[[package]]
|
|
2713
2713
|
name = "kreuzberg-ffi"
|
|
2714
|
-
version = "4.
|
|
2714
|
+
version = "4.2.0"
|
|
2715
2715
|
dependencies = [
|
|
2716
2716
|
"async-trait",
|
|
2717
2717
|
"cbindgen",
|
|
2718
2718
|
"html-to-markdown-rs",
|
|
2719
2719
|
"kreuzberg",
|
|
2720
|
+
"log",
|
|
2720
2721
|
"serde",
|
|
2721
2722
|
"serde_json",
|
|
2722
2723
|
"tokio",
|
|
@@ -2767,14 +2768,14 @@ dependencies = [
|
|
|
2767
2768
|
|
|
2768
2769
|
[[package]]
|
|
2769
2770
|
name = "kreuzberg-tesseract"
|
|
2770
|
-
version = "4.
|
|
2771
|
+
version = "4.2.0"
|
|
2771
2772
|
dependencies = [
|
|
2772
2773
|
"cc",
|
|
2773
2774
|
"cmake",
|
|
2774
2775
|
"libc",
|
|
2775
2776
|
"reqwest 0.13.1",
|
|
2776
2777
|
"thiserror 2.0.18",
|
|
2777
|
-
"zip 7.
|
|
2778
|
+
"zip 7.2.0",
|
|
2778
2779
|
]
|
|
2779
2780
|
|
|
2780
2781
|
[[package]]
|
|
@@ -5022,9 +5023,9 @@ dependencies = [
|
|
|
5022
5023
|
|
|
5023
5024
|
[[package]]
|
|
5024
5025
|
name = "rmcp"
|
|
5025
|
-
version = "0.
|
|
5026
|
+
version = "0.14.0"
|
|
5026
5027
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5027
|
-
checksum = "
|
|
5028
|
+
checksum = "0a621b37a548ff6ab6292d57841eb25785a7f146d89391a19c9f199414bd13da"
|
|
5028
5029
|
dependencies = [
|
|
5029
5030
|
"async-trait",
|
|
5030
5031
|
"axum",
|
|
@@ -5054,9 +5055,9 @@ dependencies = [
|
|
|
5054
5055
|
|
|
5055
5056
|
[[package]]
|
|
5056
5057
|
name = "rmcp-macros"
|
|
5057
|
-
version = "0.
|
|
5058
|
+
version = "0.14.0"
|
|
5058
5059
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5059
|
-
checksum = "
|
|
5060
|
+
checksum = "6b79ed92303f9262db79575aa8c3652581668e9d136be6fd0b9ededa78954c95"
|
|
5060
5061
|
dependencies = [
|
|
5061
5062
|
"darling 0.23.0",
|
|
5062
5063
|
"proc-macro2",
|
|
@@ -6013,6 +6014,7 @@ checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d"
|
|
|
6013
6014
|
dependencies = [
|
|
6014
6015
|
"deranged",
|
|
6015
6016
|
"itoa",
|
|
6017
|
+
"js-sys",
|
|
6016
6018
|
"num-conv",
|
|
6017
6019
|
"powerfmt",
|
|
6018
6020
|
"serde",
|
|
@@ -6405,6 +6407,12 @@ version = "0.25.1"
|
|
|
6405
6407
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6406
6408
|
checksum = "d2df906b07856748fa3f6e0ad0cbaa047052d4a7dd609e231c4f72cee8c36f31"
|
|
6407
6409
|
|
|
6410
|
+
[[package]]
|
|
6411
|
+
name = "typed-path"
|
|
6412
|
+
version = "0.12.1"
|
|
6413
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6414
|
+
checksum = "e43ffa54726cdc9ea78392023ffe9fe9cf9ac779e1c6fcb0d23f9862e3879d20"
|
|
6415
|
+
|
|
6408
6416
|
[[package]]
|
|
6409
6417
|
name = "typeid"
|
|
6410
6418
|
version = "1.0.3"
|
|
@@ -6698,9 +6706,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
|
|
6698
6706
|
|
|
6699
6707
|
[[package]]
|
|
6700
6708
|
name = "uuid"
|
|
6701
|
-
version = "1.
|
|
6709
|
+
version = "1.20.0"
|
|
6702
6710
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6703
|
-
checksum = "
|
|
6711
|
+
checksum = "ee48d38b119b0cd71fe4141b30f5ba9c7c5d9f4e7a3a8b4a674e4b6ef789976f"
|
|
6704
6712
|
dependencies = [
|
|
6705
6713
|
"getrandom 0.3.4",
|
|
6706
6714
|
"js-sys",
|
|
@@ -7498,9 +7506,9 @@ dependencies = [
|
|
|
7498
7506
|
|
|
7499
7507
|
[[package]]
|
|
7500
7508
|
name = "zip"
|
|
7501
|
-
version = "7.
|
|
7509
|
+
version = "7.2.0"
|
|
7502
7510
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
7503
|
-
checksum = "
|
|
7511
|
+
checksum = "c42e33efc22a0650c311c2ef19115ce232583abbe80850bc8b66509ebef02de0"
|
|
7504
7512
|
dependencies = [
|
|
7505
7513
|
"aes",
|
|
7506
7514
|
"bzip2",
|
|
@@ -7518,6 +7526,7 @@ dependencies = [
|
|
|
7518
7526
|
"ppmd-rust",
|
|
7519
7527
|
"sha1",
|
|
7520
7528
|
"time",
|
|
7529
|
+
"typed-path",
|
|
7521
7530
|
"zeroize",
|
|
7522
7531
|
"zopfli",
|
|
7523
7532
|
"zstd",
|
data/lib/kreuzberg/cli.rb
CHANGED
|
@@ -8,24 +8,34 @@ module Kreuzberg
|
|
|
8
8
|
|
|
9
9
|
# Extract content from a file using the CLI
|
|
10
10
|
#
|
|
11
|
-
# @param
|
|
11
|
+
# @param path_or_nil [String, nil] Path to the file (positional, for backward compatibility)
|
|
12
|
+
# @param path [String] Path to the file (keyword argument)
|
|
12
13
|
# @param output [String] Output format ("text", "json", "markdown")
|
|
13
14
|
# @param ocr [Boolean] Enable OCR
|
|
14
15
|
# @return [String] Extracted content
|
|
15
16
|
#
|
|
16
|
-
def extract(path, output: 'text', ocr: false)
|
|
17
|
-
|
|
17
|
+
def extract(path_or_nil = nil, path: nil, output: 'text', ocr: false)
|
|
18
|
+
# Support both positional and keyword argument for path (backward compatibility)
|
|
19
|
+
actual_path = path_or_nil || path
|
|
20
|
+
raise ArgumentError, 'path is required' if actual_path.nil?
|
|
21
|
+
|
|
22
|
+
args = ['extract', actual_path, '--format', output]
|
|
18
23
|
args.push('--ocr', ocr ? 'true' : 'false')
|
|
19
24
|
CLIProxy.call(args)
|
|
20
25
|
end
|
|
21
26
|
|
|
22
27
|
# Detect MIME type of a file using the CLI
|
|
23
28
|
#
|
|
24
|
-
# @param
|
|
29
|
+
# @param path_or_nil [String, nil] Path to the file (positional, for backward compatibility)
|
|
30
|
+
# @param path [String] Path to the file (keyword argument)
|
|
25
31
|
# @return [String] MIME type
|
|
26
32
|
#
|
|
27
|
-
def detect(path)
|
|
28
|
-
|
|
33
|
+
def detect(path_or_nil = nil, path: nil)
|
|
34
|
+
# Support both positional and keyword argument for path (backward compatibility)
|
|
35
|
+
actual_path = path_or_nil || path
|
|
36
|
+
raise ArgumentError, 'path is required' if actual_path.nil?
|
|
37
|
+
|
|
38
|
+
CLIProxy.call(['detect', actual_path]).strip
|
|
29
39
|
end
|
|
30
40
|
|
|
31
41
|
# Get CLI version
|
data/lib/kreuzberg/cli_proxy.rb
CHANGED
|
@@ -91,11 +91,13 @@ module Kreuzberg
|
|
|
91
91
|
lib_path.join('bin', binary_name),
|
|
92
92
|
lib_path.join(binary_name),
|
|
93
93
|
root_path.join('../../crates/kreuzberg-cli/target/release', binary_name),
|
|
94
|
-
root_path.join('../../target/release', binary_name)
|
|
94
|
+
root_path.join('../../target/release', binary_name),
|
|
95
|
+
root_path.join('../../target/debug', binary_name)
|
|
95
96
|
]
|
|
96
97
|
|
|
97
98
|
workspace_root = root_path.parent&.parent
|
|
98
99
|
paths << workspace_root.join('target', 'release', binary_name) if workspace_root
|
|
100
|
+
paths << workspace_root.join('target', 'debug', binary_name) if workspace_root
|
|
99
101
|
|
|
100
102
|
paths
|
|
101
103
|
end
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -684,13 +684,6 @@ module Kreuzberg
|
|
|
684
684
|
# image = Config::ImageExtraction.new(extract_images: true, target_dpi: 600)
|
|
685
685
|
# config = Extraction.new(image_extraction: image)
|
|
686
686
|
#
|
|
687
|
-
# @example With preprocessing
|
|
688
|
-
# preprocessing = Config::ImagePreprocessing.new(
|
|
689
|
-
# binarization_method: "sauvola",
|
|
690
|
-
# denoise: true
|
|
691
|
-
# )
|
|
692
|
-
# config = Extraction.new(image_preprocessing: preprocessing)
|
|
693
|
-
#
|
|
694
687
|
# @example With post-processing
|
|
695
688
|
# postprocessor = Config::PostProcessor.new(
|
|
696
689
|
# enabled: true,
|
|
@@ -708,17 +701,19 @@ module Kreuzberg
|
|
|
708
701
|
# language_detection: Config::LanguageDetection.new(enabled: true),
|
|
709
702
|
# pdf_options: Config::PDF.new(extract_images: true, passwords: ["secret"]),
|
|
710
703
|
# image_extraction: Config::ImageExtraction.new(target_dpi: 600),
|
|
711
|
-
# image_preprocessing: Config::ImagePreprocessing.new(denoise: true),
|
|
712
704
|
# postprocessor: Config::PostProcessor.new(enabled: true)
|
|
713
705
|
# )
|
|
714
706
|
#
|
|
715
707
|
class Extraction
|
|
716
708
|
attr_reader :use_cache, :enable_quality_processing, :force_ocr,
|
|
717
709
|
:ocr, :chunking, :language_detection, :pdf_options,
|
|
718
|
-
:
|
|
710
|
+
:images, :postprocessor,
|
|
719
711
|
:token_reduction, :keywords, :html_options, :pages,
|
|
720
712
|
:max_concurrent_extractions, :output_format, :result_format
|
|
721
713
|
|
|
714
|
+
# Alias for backward compatibility - image_extraction is the canonical name
|
|
715
|
+
alias image_extraction images
|
|
716
|
+
|
|
722
717
|
# Load configuration from a file.
|
|
723
718
|
#
|
|
724
719
|
# Detects the file format from the extension (.toml, .yaml, .json)
|
|
@@ -736,7 +731,7 @@ module Kreuzberg
|
|
|
736
731
|
# Keys that are allowed in the Extraction config
|
|
737
732
|
ALLOWED_KEYS = %i[
|
|
738
733
|
use_cache enable_quality_processing force_ocr ocr chunking
|
|
739
|
-
language_detection pdf_options image_extraction
|
|
734
|
+
language_detection pdf_options image_extraction
|
|
740
735
|
postprocessor token_reduction keywords html_options pages
|
|
741
736
|
max_concurrent_extractions output_format result_format
|
|
742
737
|
].freeze
|
|
@@ -746,6 +741,12 @@ module Kreuzberg
|
|
|
746
741
|
images: :image_extraction
|
|
747
742
|
}.freeze
|
|
748
743
|
|
|
744
|
+
# Valid output format values (case-insensitive, normalized internally)
|
|
745
|
+
VALID_OUTPUT_FORMATS = %w[plain markdown html djot].freeze
|
|
746
|
+
|
|
747
|
+
# Valid result format values (case-insensitive, normalized internally)
|
|
748
|
+
VALID_RESULT_FORMATS = %w[unified elements element_based].freeze
|
|
749
|
+
|
|
749
750
|
def self.from_file(path)
|
|
750
751
|
hash = Kreuzberg._config_from_file_native(path)
|
|
751
752
|
new(**normalize_hash_keys(hash))
|
|
@@ -791,14 +792,13 @@ module Kreuzberg
|
|
|
791
792
|
|
|
792
793
|
def initialize(hash = nil,
|
|
793
794
|
use_cache: true,
|
|
794
|
-
enable_quality_processing:
|
|
795
|
+
enable_quality_processing: true,
|
|
795
796
|
force_ocr: false,
|
|
796
797
|
ocr: nil,
|
|
797
798
|
chunking: nil,
|
|
798
799
|
language_detection: nil,
|
|
799
800
|
pdf_options: nil,
|
|
800
801
|
image_extraction: nil,
|
|
801
|
-
image_preprocessing: nil,
|
|
802
802
|
postprocessor: nil,
|
|
803
803
|
token_reduction: nil,
|
|
804
804
|
keywords: nil,
|
|
@@ -811,7 +811,7 @@ module Kreuzberg
|
|
|
811
811
|
use_cache: use_cache, enable_quality_processing: enable_quality_processing,
|
|
812
812
|
force_ocr: force_ocr, ocr: ocr, chunking: chunking, language_detection: language_detection,
|
|
813
813
|
pdf_options: pdf_options, image_extraction: image_extraction,
|
|
814
|
-
|
|
814
|
+
postprocessor: postprocessor,
|
|
815
815
|
token_reduction: token_reduction, keywords: keywords, html_options: html_options,
|
|
816
816
|
pages: pages, max_concurrent_extractions: max_concurrent_extractions,
|
|
817
817
|
output_format: output_format, result_format: result_format
|
|
@@ -836,20 +836,38 @@ module Kreuzberg
|
|
|
836
836
|
@chunking = normalize_config(params[:chunking], Chunking)
|
|
837
837
|
@language_detection = normalize_config(params[:language_detection], LanguageDetection)
|
|
838
838
|
@pdf_options = normalize_config(params[:pdf_options], PDF)
|
|
839
|
-
@
|
|
840
|
-
@image_preprocessing = normalize_config(params[:image_preprocessing], ImagePreprocessing)
|
|
839
|
+
@images = normalize_config(params[:image_extraction], ImageExtraction)
|
|
841
840
|
@postprocessor = normalize_config(params[:postprocessor], PostProcessor)
|
|
842
841
|
@token_reduction = normalize_config(params[:token_reduction], TokenReduction)
|
|
843
842
|
@keywords = normalize_config(params[:keywords], Keywords)
|
|
844
843
|
@html_options = normalize_config(params[:html_options], HtmlOptions)
|
|
845
844
|
@pages = normalize_config(params[:pages], PageConfig)
|
|
846
845
|
@max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
|
|
847
|
-
@output_format = params[:output_format]
|
|
848
|
-
@result_format = params[:result_format]
|
|
846
|
+
@output_format = validate_output_format(params[:output_format])
|
|
847
|
+
@result_format = validate_result_format(params[:result_format])
|
|
848
|
+
end
|
|
849
|
+
|
|
850
|
+
def validate_output_format(value)
|
|
851
|
+
return nil if value.nil?
|
|
852
|
+
|
|
853
|
+
str_value = value.to_s.downcase
|
|
854
|
+
return str_value if VALID_OUTPUT_FORMATS.include?(str_value)
|
|
855
|
+
|
|
856
|
+
raise ArgumentError,
|
|
857
|
+
"Invalid output_format: #{value}. Valid values: #{VALID_OUTPUT_FORMATS.join(', ')}"
|
|
858
|
+
end
|
|
859
|
+
|
|
860
|
+
def validate_result_format(value)
|
|
861
|
+
return nil if value.nil?
|
|
862
|
+
|
|
863
|
+
str_value = value.to_s.downcase
|
|
864
|
+
return str_value if VALID_RESULT_FORMATS.include?(str_value)
|
|
865
|
+
|
|
866
|
+
raise ArgumentError,
|
|
867
|
+
"Invalid result_format: #{value}. Valid values: #{VALID_RESULT_FORMATS.join(', ')}"
|
|
849
868
|
end
|
|
850
869
|
|
|
851
870
|
# rubocop:disable Metrics/CyclomaticComplexity
|
|
852
|
-
# rubocop:disable Metrics/MethodLength
|
|
853
871
|
def to_h
|
|
854
872
|
{
|
|
855
873
|
use_cache: @use_cache,
|
|
@@ -859,8 +877,7 @@ module Kreuzberg
|
|
|
859
877
|
chunking: @chunking&.to_h,
|
|
860
878
|
language_detection: @language_detection&.to_h,
|
|
861
879
|
pdf_options: @pdf_options&.to_h,
|
|
862
|
-
|
|
863
|
-
image_preprocessing: @image_preprocessing&.to_h,
|
|
880
|
+
images: @images&.to_h,
|
|
864
881
|
postprocessor: @postprocessor&.to_h,
|
|
865
882
|
token_reduction: @token_reduction&.to_h,
|
|
866
883
|
keywords: @keywords&.to_h,
|
|
@@ -871,7 +888,6 @@ module Kreuzberg
|
|
|
871
888
|
result_format: @result_format
|
|
872
889
|
}.compact
|
|
873
890
|
end
|
|
874
|
-
# rubocop:enable Metrics/MethodLength
|
|
875
891
|
# rubocop:enable Metrics/CyclomaticComplexity
|
|
876
892
|
|
|
877
893
|
# Serialize configuration to JSON string
|
|
@@ -995,9 +1011,7 @@ module Kreuzberg
|
|
|
995
1011
|
when :pdf_options
|
|
996
1012
|
@pdf_options = normalize_config(value, PDF)
|
|
997
1013
|
when :image_extraction
|
|
998
|
-
@
|
|
999
|
-
when :image_preprocessing
|
|
1000
|
-
@image_preprocessing = normalize_config(value, ImagePreprocessing)
|
|
1014
|
+
@images = normalize_config(value, ImageExtraction)
|
|
1001
1015
|
when :postprocessor
|
|
1002
1016
|
@postprocessor = normalize_config(value, PostProcessor)
|
|
1003
1017
|
when :token_reduction
|
|
@@ -1011,9 +1025,9 @@ module Kreuzberg
|
|
|
1011
1025
|
when :max_concurrent_extractions
|
|
1012
1026
|
@max_concurrent_extractions = value&.to_i
|
|
1013
1027
|
when :output_format
|
|
1014
|
-
@output_format = value
|
|
1028
|
+
@output_format = validate_output_format(value)
|
|
1015
1029
|
when :result_format
|
|
1016
|
-
@result_format = value
|
|
1030
|
+
@result_format = validate_result_format(value)
|
|
1017
1031
|
else
|
|
1018
1032
|
raise ArgumentError, "Unknown configuration key: #{key}"
|
|
1019
1033
|
end
|
|
@@ -1035,6 +1049,24 @@ module Kreuzberg
|
|
|
1035
1049
|
nil
|
|
1036
1050
|
end
|
|
1037
1051
|
|
|
1052
|
+
# Set output_format attribute
|
|
1053
|
+
#
|
|
1054
|
+
# @param value [String, nil] Output format value
|
|
1055
|
+
# @return [String, nil] The value that was set
|
|
1056
|
+
#
|
|
1057
|
+
def output_format=(value)
|
|
1058
|
+
@output_format = validate_output_format(value)
|
|
1059
|
+
end
|
|
1060
|
+
|
|
1061
|
+
# Set result_format attribute
|
|
1062
|
+
#
|
|
1063
|
+
# @param value [String, nil] Result format value
|
|
1064
|
+
# @return [String, nil] The value that was set
|
|
1065
|
+
#
|
|
1066
|
+
def result_format=(value)
|
|
1067
|
+
@result_format = validate_result_format(value)
|
|
1068
|
+
end
|
|
1069
|
+
|
|
1038
1070
|
private
|
|
1039
1071
|
|
|
1040
1072
|
def normalize_config(value, klass)
|
|
@@ -1053,8 +1085,7 @@ module Kreuzberg
|
|
|
1053
1085
|
@chunking = merged.chunking
|
|
1054
1086
|
@language_detection = merged.language_detection
|
|
1055
1087
|
@pdf_options = merged.pdf_options
|
|
1056
|
-
@
|
|
1057
|
-
@image_preprocessing = merged.image_preprocessing
|
|
1088
|
+
@images = merged.image_extraction
|
|
1058
1089
|
@postprocessor = merged.postprocessor
|
|
1059
1090
|
@token_reduction = merged.token_reduction
|
|
1060
1091
|
@keywords = merged.keywords
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
begin
|
|
4
|
+
require 'json'
|
|
5
|
+
rescue LoadError
|
|
6
|
+
require 'json/pure'
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
module Kreuzberg
|
|
10
|
+
class Result
|
|
11
|
+
# Djot structured content representation
|
|
12
|
+
#
|
|
13
|
+
# Represents document content in Djot format with structured metadata about
|
|
14
|
+
# blocks, images, links, footnotes, and other document elements.
|
|
15
|
+
#
|
|
16
|
+
class DjotContent
|
|
17
|
+
attr_reader :plain_text, :blocks, :metadata_json, :tables, :images, :links, :footnotes, :attributes
|
|
18
|
+
|
|
19
|
+
# Represents a formatted block in Djot content
|
|
20
|
+
class FormattedBlock
|
|
21
|
+
attr_reader :block_type, :children, :attributes, :content, :level
|
|
22
|
+
|
|
23
|
+
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
24
|
+
def initialize(hash_or_type = nil, children: nil, attributes: nil, content: nil, level: nil, block_type: nil)
|
|
25
|
+
if hash_or_type.is_a?(Hash)
|
|
26
|
+
# Initialize from hash
|
|
27
|
+
@block_type = hash_or_type[:block_type] || hash_or_type['block_type'] || ''
|
|
28
|
+
@children = hash_or_type[:children] || hash_or_type['children']
|
|
29
|
+
@attributes = hash_or_type[:attributes] || hash_or_type['attributes'] || {}
|
|
30
|
+
@content = hash_or_type[:content] || hash_or_type['content']
|
|
31
|
+
@level = hash_or_type[:level] || hash_or_type['level']
|
|
32
|
+
else
|
|
33
|
+
# Initialize from keyword arguments (for backward compatibility)
|
|
34
|
+
@block_type = block_type || hash_or_type || ''
|
|
35
|
+
@children = children || []
|
|
36
|
+
@attributes = attributes || {}
|
|
37
|
+
@content = content
|
|
38
|
+
@level = level
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
42
|
+
|
|
43
|
+
def to_h
|
|
44
|
+
{
|
|
45
|
+
block_type: @block_type,
|
|
46
|
+
children: @children,
|
|
47
|
+
attributes: @attributes,
|
|
48
|
+
content: @content,
|
|
49
|
+
level: @level
|
|
50
|
+
}.compact
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
# Represents an image in Djot content
|
|
55
|
+
class DjotImage
|
|
56
|
+
attr_reader :url, :alt, :title, :width, :height
|
|
57
|
+
alias src url
|
|
58
|
+
|
|
59
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
60
|
+
def initialize(hash_or_url = nil, alt: nil, title: nil, width: nil, height: nil, url: nil, src: nil)
|
|
61
|
+
if hash_or_url.is_a?(Hash)
|
|
62
|
+
# Initialize from hash (supports both 'url' and 'src' keys)
|
|
63
|
+
@url = hash_or_url[:url] || hash_or_url['url'] || hash_or_url[:src] || hash_or_url['src']
|
|
64
|
+
@alt = hash_or_url[:alt] || hash_or_url['alt']
|
|
65
|
+
@title = hash_or_url[:title] || hash_or_url['title']
|
|
66
|
+
@width = hash_or_url[:width] || hash_or_url['width']
|
|
67
|
+
@height = hash_or_url[:height] || hash_or_url['height']
|
|
68
|
+
else
|
|
69
|
+
# Initialize from keyword arguments
|
|
70
|
+
@url = url || src || hash_or_url
|
|
71
|
+
@alt = alt
|
|
72
|
+
@title = title
|
|
73
|
+
@width = width
|
|
74
|
+
@height = height
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
78
|
+
|
|
79
|
+
def to_h
|
|
80
|
+
{
|
|
81
|
+
url: @url,
|
|
82
|
+
alt: @alt,
|
|
83
|
+
title: @title,
|
|
84
|
+
width: @width,
|
|
85
|
+
height: @height
|
|
86
|
+
}.compact
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
# Represents a link in Djot content
|
|
91
|
+
class DjotLink
|
|
92
|
+
attr_reader :url, :text, :title, :link_type
|
|
93
|
+
alias href url
|
|
94
|
+
|
|
95
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
96
|
+
def initialize(hash_or_url = nil, text: nil, title: nil, url: nil, href: nil, link_type: nil)
|
|
97
|
+
if hash_or_url.is_a?(Hash)
|
|
98
|
+
# Initialize from hash (supports both 'url' and 'href' keys)
|
|
99
|
+
@url = hash_or_url[:url] || hash_or_url['url'] || hash_or_url[:href] || hash_or_url['href']
|
|
100
|
+
@text = hash_or_url[:text] || hash_or_url['text']
|
|
101
|
+
@title = hash_or_url[:title] || hash_or_url['title']
|
|
102
|
+
@link_type = hash_or_url[:link_type] || hash_or_url['link_type']
|
|
103
|
+
else
|
|
104
|
+
# Initialize from keyword arguments
|
|
105
|
+
@url = url || href || hash_or_url
|
|
106
|
+
@text = text
|
|
107
|
+
@title = title
|
|
108
|
+
@link_type = link_type
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
112
|
+
|
|
113
|
+
def to_h
|
|
114
|
+
{
|
|
115
|
+
url: @url,
|
|
116
|
+
text: @text,
|
|
117
|
+
title: @title,
|
|
118
|
+
link_type: @link_type
|
|
119
|
+
}.compact
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
# Represents a footnote in Djot content
|
|
124
|
+
class Footnote
|
|
125
|
+
attr_reader :label, :content
|
|
126
|
+
|
|
127
|
+
def initialize(label:, content:)
|
|
128
|
+
@label = label
|
|
129
|
+
@content = content
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
def to_h
|
|
133
|
+
{
|
|
134
|
+
label: @label,
|
|
135
|
+
content: @content
|
|
136
|
+
}
|
|
137
|
+
end
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
141
|
+
def initialize(hash)
|
|
142
|
+
@plain_text = hash['plain_text'] || hash[:plain_text] || ''
|
|
143
|
+
@blocks = parse_blocks(hash['blocks'] || hash[:blocks] || [])
|
|
144
|
+
@metadata_json = hash['metadata_json'] || hash[:metadata_json] || '{}'
|
|
145
|
+
@tables = hash['tables'] || hash[:tables] || []
|
|
146
|
+
@images = parse_images(hash['images'] || hash[:images] || [])
|
|
147
|
+
@links = parse_links(hash['links'] || hash[:links] || [])
|
|
148
|
+
@footnotes = parse_footnotes(hash['footnotes'] || hash[:footnotes] || [])
|
|
149
|
+
@attributes = hash['attributes'] || hash[:attributes] || {}
|
|
150
|
+
end
|
|
151
|
+
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
152
|
+
|
|
153
|
+
def metadata
|
|
154
|
+
@metadata ||= parse_metadata(@metadata_json)
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
def to_h
|
|
158
|
+
{
|
|
159
|
+
plain_text: @plain_text,
|
|
160
|
+
blocks: @blocks.map(&:to_h),
|
|
161
|
+
metadata_json: @metadata_json,
|
|
162
|
+
tables: @tables,
|
|
163
|
+
images: @images.map(&:to_h),
|
|
164
|
+
links: @links.map(&:to_h),
|
|
165
|
+
footnotes: @footnotes.map(&:to_h),
|
|
166
|
+
attributes: @attributes
|
|
167
|
+
}
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
private
|
|
171
|
+
|
|
172
|
+
def parse_metadata(metadata_json)
|
|
173
|
+
JSON.parse(metadata_json)
|
|
174
|
+
rescue JSON::ParserError
|
|
175
|
+
{}
|
|
176
|
+
end
|
|
177
|
+
|
|
178
|
+
def parse_blocks(blocks_data)
|
|
179
|
+
blocks_data.map do |block|
|
|
180
|
+
FormattedBlock.new(
|
|
181
|
+
block_type: block['block_type'] || block[:block_type] || '',
|
|
182
|
+
children: block['children'] || block[:children],
|
|
183
|
+
attributes: block['attributes'] || block[:attributes]
|
|
184
|
+
)
|
|
185
|
+
end
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
189
|
+
def parse_images(images_data)
|
|
190
|
+
images_data.map do |image|
|
|
191
|
+
DjotImage.new(
|
|
192
|
+
url: image['url'] || image[:url] || image['src'] || image[:src],
|
|
193
|
+
alt: image['alt'] || image[:alt],
|
|
194
|
+
title: image['title'] || image[:title],
|
|
195
|
+
width: image['width'] || image[:width],
|
|
196
|
+
height: image['height'] || image[:height]
|
|
197
|
+
)
|
|
198
|
+
end
|
|
199
|
+
end
|
|
200
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
201
|
+
|
|
202
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
203
|
+
def parse_links(links_data)
|
|
204
|
+
links_data.map do |link|
|
|
205
|
+
DjotLink.new(
|
|
206
|
+
url: link['url'] || link[:url] || link['href'] || link[:href],
|
|
207
|
+
text: link['text'] || link[:text],
|
|
208
|
+
title: link['title'] || link[:title],
|
|
209
|
+
link_type: link['link_type'] || link[:link_type]
|
|
210
|
+
)
|
|
211
|
+
end
|
|
212
|
+
end
|
|
213
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
214
|
+
|
|
215
|
+
def parse_footnotes(footnotes_data)
|
|
216
|
+
footnotes_data.map do |note|
|
|
217
|
+
Footnote.new(
|
|
218
|
+
label: note['label'] || note[:label],
|
|
219
|
+
content: note['content'] || note[:content]
|
|
220
|
+
)
|
|
221
|
+
end
|
|
222
|
+
end
|
|
223
|
+
end
|
|
224
|
+
end
|
|
225
|
+
end
|