kreuzberg 4.2.0 → 4.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
- data/lib/kreuzberg/cli.rb +16 -6
- data/lib/kreuzberg/cli_proxy.rb +3 -1
- data/lib/kreuzberg/config.rb +56 -9
- data/lib/kreuzberg/djot_content.rb +225 -0
- data/lib/kreuzberg/extraction_api.rb +20 -4
- data/lib/kreuzberg/result.rb +12 -2
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +1 -0
- data/sig/kreuzberg.rbs +23 -11
- data/spec/binding/batch_spec.rb +6 -5
- data/spec/binding/error_recovery_spec.rb +3 -3
- data/spec/binding/tables_spec.rb +11 -2
- data/spec/unit/config/output_format_spec.rb +18 -18
- data/vendor/Cargo.toml +1 -1
- data/vendor/kreuzberg/Cargo.toml +1 -1
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/startup.rs +15 -1
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
- data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
- data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
- data/vendor/kreuzberg/src/core/io.rs +7 -7
- data/vendor/kreuzberg/src/core/mime.rs +4 -4
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
- data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
- data/vendor/kreuzberg/tests/config_behavioral.rs +14 -12
- data/vendor/kreuzberg/tests/core_integration.rs +2 -4
- data/vendor/kreuzberg/tests/mime_detection.rs +3 -2
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +284 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: '010412940492f83bc170a4a4efc644ac7e3818502734035523796287837a3893'
|
|
4
|
+
data.tar.gz: af24cba007cc58283d678d4b15251ebae3a5740f60ade9a54cc19973a2022a82
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: ad67348bec54a01ca3592ed72e9b2b8bc9e711a37e11b40ada31466c67f834132fc0de278c53a1c014fa6751da7abebae934cff2a9cc1835f7e056c895a273cb
|
|
7
|
+
data.tar.gz: ca2cdb076a5d1af67f0e807978a966d1a391cc286bcdf5499544e3403196140918a54674beab77ea09fc0e8bc7ab66f357da5d984326a511b1d21643a3d6cf41
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.2.0)
|
|
4
|
+
kreuzberg (4.2.1)
|
|
5
5
|
|
|
6
6
|
GEM
|
|
7
7
|
remote: https://rubygems.org/
|
|
@@ -207,7 +207,7 @@ CHECKSUMS
|
|
|
207
207
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
208
208
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
209
209
|
json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
|
|
210
|
-
kreuzberg (4.2.0)
|
|
210
|
+
kreuzberg (4.2.1)
|
|
211
211
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
212
212
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
213
213
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.0" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.1" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
data/ext/kreuzberg_rb/native/Cargo.lock
CHANGED
|
@@ -716,9 +716,9 @@ dependencies = [
|
|
|
716
716
|
|
|
717
717
|
[[package]]
|
|
718
718
|
name = "cc"
|
|
719
|
-
version = "1.2.
|
|
719
|
+
version = "1.2.54"
|
|
720
720
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
721
|
-
checksum = "
|
|
721
|
+
checksum = "6354c81bbfd62d9cfa9cb3c773c2b7b2a3a482d569de977fd0e961f6e7c00583"
|
|
722
722
|
dependencies = [
|
|
723
723
|
"find-msvc-tools",
|
|
724
724
|
"jobserver",
|
|
@@ -2027,9 +2027,9 @@ dependencies = [
|
|
|
2027
2027
|
|
|
2028
2028
|
[[package]]
|
|
2029
2029
|
name = "html-to-markdown-rs"
|
|
2030
|
-
version = "2.23.
|
|
2030
|
+
version = "2.23.4"
|
|
2031
2031
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
2032
|
-
checksum = "
|
|
2032
|
+
checksum = "ffbf49450676163bdf69fac2acf72674fcf2d2aaf690aa247368c567cc9afb2a"
|
|
2033
2033
|
dependencies = [
|
|
2034
2034
|
"astral-tl",
|
|
2035
2035
|
"base64 0.22.1",
|
|
@@ -2627,7 +2627,7 @@ dependencies = [
|
|
|
2627
2627
|
|
|
2628
2628
|
[[package]]
|
|
2629
2629
|
name = "kreuzberg"
|
|
2630
|
-
version = "4.
|
|
2630
|
+
version = "4.2.0"
|
|
2631
2631
|
dependencies = [
|
|
2632
2632
|
"ahash",
|
|
2633
2633
|
"async-trait",
|
|
@@ -2706,17 +2706,18 @@ dependencies = [
|
|
|
2706
2706
|
"uuid",
|
|
2707
2707
|
"whatlang",
|
|
2708
2708
|
"yake-rust",
|
|
2709
|
-
"zip 7.
|
|
2709
|
+
"zip 7.2.0",
|
|
2710
2710
|
]
|
|
2711
2711
|
|
|
2712
2712
|
[[package]]
|
|
2713
2713
|
name = "kreuzberg-ffi"
|
|
2714
|
-
version = "4.
|
|
2714
|
+
version = "4.2.0"
|
|
2715
2715
|
dependencies = [
|
|
2716
2716
|
"async-trait",
|
|
2717
2717
|
"cbindgen",
|
|
2718
2718
|
"html-to-markdown-rs",
|
|
2719
2719
|
"kreuzberg",
|
|
2720
|
+
"log",
|
|
2720
2721
|
"serde",
|
|
2721
2722
|
"serde_json",
|
|
2722
2723
|
"tokio",
|
|
@@ -2767,14 +2768,14 @@ dependencies = [
|
|
|
2767
2768
|
|
|
2768
2769
|
[[package]]
|
|
2769
2770
|
name = "kreuzberg-tesseract"
|
|
2770
|
-
version = "4.
|
|
2771
|
+
version = "4.2.0"
|
|
2771
2772
|
dependencies = [
|
|
2772
2773
|
"cc",
|
|
2773
2774
|
"cmake",
|
|
2774
2775
|
"libc",
|
|
2775
2776
|
"reqwest 0.13.1",
|
|
2776
2777
|
"thiserror 2.0.18",
|
|
2777
|
-
"zip 7.
|
|
2778
|
+
"zip 7.2.0",
|
|
2778
2779
|
]
|
|
2779
2780
|
|
|
2780
2781
|
[[package]]
|
|
@@ -5022,9 +5023,9 @@ dependencies = [
|
|
|
5022
5023
|
|
|
5023
5024
|
[[package]]
|
|
5024
5025
|
name = "rmcp"
|
|
5025
|
-
version = "0.
|
|
5026
|
+
version = "0.14.0"
|
|
5026
5027
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5027
|
-
checksum = "
|
|
5028
|
+
checksum = "0a621b37a548ff6ab6292d57841eb25785a7f146d89391a19c9f199414bd13da"
|
|
5028
5029
|
dependencies = [
|
|
5029
5030
|
"async-trait",
|
|
5030
5031
|
"axum",
|
|
@@ -5054,9 +5055,9 @@ dependencies = [
|
|
|
5054
5055
|
|
|
5055
5056
|
[[package]]
|
|
5056
5057
|
name = "rmcp-macros"
|
|
5057
|
-
version = "0.
|
|
5058
|
+
version = "0.14.0"
|
|
5058
5059
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
5059
|
-
checksum = "
|
|
5060
|
+
checksum = "6b79ed92303f9262db79575aa8c3652581668e9d136be6fd0b9ededa78954c95"
|
|
5060
5061
|
dependencies = [
|
|
5061
5062
|
"darling 0.23.0",
|
|
5062
5063
|
"proc-macro2",
|
|
@@ -6013,6 +6014,7 @@ checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d"
|
|
|
6013
6014
|
dependencies = [
|
|
6014
6015
|
"deranged",
|
|
6015
6016
|
"itoa",
|
|
6017
|
+
"js-sys",
|
|
6016
6018
|
"num-conv",
|
|
6017
6019
|
"powerfmt",
|
|
6018
6020
|
"serde",
|
|
@@ -6405,6 +6407,12 @@ version = "0.25.1"
|
|
|
6405
6407
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6406
6408
|
checksum = "d2df906b07856748fa3f6e0ad0cbaa047052d4a7dd609e231c4f72cee8c36f31"
|
|
6407
6409
|
|
|
6410
|
+
[[package]]
|
|
6411
|
+
name = "typed-path"
|
|
6412
|
+
version = "0.12.1"
|
|
6413
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6414
|
+
checksum = "e43ffa54726cdc9ea78392023ffe9fe9cf9ac779e1c6fcb0d23f9862e3879d20"
|
|
6415
|
+
|
|
6408
6416
|
[[package]]
|
|
6409
6417
|
name = "typeid"
|
|
6410
6418
|
version = "1.0.3"
|
|
@@ -6698,9 +6706,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
|
|
|
6698
6706
|
|
|
6699
6707
|
[[package]]
|
|
6700
6708
|
name = "uuid"
|
|
6701
|
-
version = "1.
|
|
6709
|
+
version = "1.20.0"
|
|
6702
6710
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
6703
|
-
checksum = "
|
|
6711
|
+
checksum = "ee48d38b119b0cd71fe4141b30f5ba9c7c5d9f4e7a3a8b4a674e4b6ef789976f"
|
|
6704
6712
|
dependencies = [
|
|
6705
6713
|
"getrandom 0.3.4",
|
|
6706
6714
|
"js-sys",
|
|
@@ -7498,9 +7506,9 @@ dependencies = [
|
|
|
7498
7506
|
|
|
7499
7507
|
[[package]]
|
|
7500
7508
|
name = "zip"
|
|
7501
|
-
version = "7.
|
|
7509
|
+
version = "7.2.0"
|
|
7502
7510
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
7503
|
-
checksum = "
|
|
7511
|
+
checksum = "c42e33efc22a0650c311c2ef19115ce232583abbe80850bc8b66509ebef02de0"
|
|
7504
7512
|
dependencies = [
|
|
7505
7513
|
"aes",
|
|
7506
7514
|
"bzip2",
|
|
@@ -7518,6 +7526,7 @@ dependencies = [
|
|
|
7518
7526
|
"ppmd-rust",
|
|
7519
7527
|
"sha1",
|
|
7520
7528
|
"time",
|
|
7529
|
+
"typed-path",
|
|
7521
7530
|
"zeroize",
|
|
7522
7531
|
"zopfli",
|
|
7523
7532
|
"zstd",
|
data/lib/kreuzberg/cli.rb
CHANGED
|
@@ -8,24 +8,34 @@ module Kreuzberg
|
|
|
8
8
|
|
|
9
9
|
# Extract content from a file using the CLI
|
|
10
10
|
#
|
|
11
|
-
# @param
|
|
11
|
+
# @param path_or_nil [String, nil] Path to the file (positional, for backward compatibility)
|
|
12
|
+
# @param path [String] Path to the file (keyword argument)
|
|
12
13
|
# @param output [String] Output format ("text", "json", "markdown")
|
|
13
14
|
# @param ocr [Boolean] Enable OCR
|
|
14
15
|
# @return [String] Extracted content
|
|
15
16
|
#
|
|
16
|
-
def extract(path, output: 'text', ocr: false)
|
|
17
|
-
|
|
17
|
+
def extract(path_or_nil = nil, path: nil, output: 'text', ocr: false)
|
|
18
|
+
# Support both positional and keyword argument for path (backward compatibility)
|
|
19
|
+
actual_path = path_or_nil || path
|
|
20
|
+
raise ArgumentError, 'path is required' if actual_path.nil?
|
|
21
|
+
|
|
22
|
+
args = ['extract', actual_path, '--format', output]
|
|
18
23
|
args.push('--ocr', ocr ? 'true' : 'false')
|
|
19
24
|
CLIProxy.call(args)
|
|
20
25
|
end
|
|
21
26
|
|
|
22
27
|
# Detect MIME type of a file using the CLI
|
|
23
28
|
#
|
|
24
|
-
# @param
|
|
29
|
+
# @param path_or_nil [String, nil] Path to the file (positional, for backward compatibility)
|
|
30
|
+
# @param path [String] Path to the file (keyword argument)
|
|
25
31
|
# @return [String] MIME type
|
|
26
32
|
#
|
|
27
|
-
def detect(path)
|
|
28
|
-
|
|
33
|
+
def detect(path_or_nil = nil, path: nil)
|
|
34
|
+
# Support both positional and keyword argument for path (backward compatibility)
|
|
35
|
+
actual_path = path_or_nil || path
|
|
36
|
+
raise ArgumentError, 'path is required' if actual_path.nil?
|
|
37
|
+
|
|
38
|
+
CLIProxy.call(['detect', actual_path]).strip
|
|
29
39
|
end
|
|
30
40
|
|
|
31
41
|
# Get CLI version
|
data/lib/kreuzberg/cli_proxy.rb
CHANGED
|
@@ -91,11 +91,13 @@ module Kreuzberg
|
|
|
91
91
|
lib_path.join('bin', binary_name),
|
|
92
92
|
lib_path.join(binary_name),
|
|
93
93
|
root_path.join('../../crates/kreuzberg-cli/target/release', binary_name),
|
|
94
|
-
root_path.join('../../target/release', binary_name)
|
|
94
|
+
root_path.join('../../target/release', binary_name),
|
|
95
|
+
root_path.join('../../target/debug', binary_name)
|
|
95
96
|
]
|
|
96
97
|
|
|
97
98
|
workspace_root = root_path.parent&.parent
|
|
98
99
|
paths << workspace_root.join('target', 'release', binary_name) if workspace_root
|
|
100
|
+
paths << workspace_root.join('target', 'debug', binary_name) if workspace_root
|
|
99
101
|
|
|
100
102
|
paths
|
|
101
103
|
end
|
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -715,10 +715,13 @@ module Kreuzberg
|
|
|
715
715
|
class Extraction
|
|
716
716
|
attr_reader :use_cache, :enable_quality_processing, :force_ocr,
|
|
717
717
|
:ocr, :chunking, :language_detection, :pdf_options,
|
|
718
|
-
:
|
|
718
|
+
:images, :image_preprocessing, :postprocessor,
|
|
719
719
|
:token_reduction, :keywords, :html_options, :pages,
|
|
720
720
|
:max_concurrent_extractions, :output_format, :result_format
|
|
721
721
|
|
|
722
|
+
# Alias for backward compatibility - image_extraction is the canonical name
|
|
723
|
+
alias image_extraction images
|
|
724
|
+
|
|
722
725
|
# Load configuration from a file.
|
|
723
726
|
#
|
|
724
727
|
# Detects the file format from the extension (.toml, .yaml, .json)
|
|
@@ -746,6 +749,12 @@ module Kreuzberg
|
|
|
746
749
|
images: :image_extraction
|
|
747
750
|
}.freeze
|
|
748
751
|
|
|
752
|
+
# Valid output format values (case-insensitive, normalized internally)
|
|
753
|
+
VALID_OUTPUT_FORMATS = %w[plain markdown html djot].freeze
|
|
754
|
+
|
|
755
|
+
# Valid result format values (case-insensitive, normalized internally)
|
|
756
|
+
VALID_RESULT_FORMATS = %w[unified elements element_based].freeze
|
|
757
|
+
|
|
749
758
|
def self.from_file(path)
|
|
750
759
|
hash = Kreuzberg._config_from_file_native(path)
|
|
751
760
|
new(**normalize_hash_keys(hash))
|
|
@@ -836,7 +845,7 @@ module Kreuzberg
|
|
|
836
845
|
@chunking = normalize_config(params[:chunking], Chunking)
|
|
837
846
|
@language_detection = normalize_config(params[:language_detection], LanguageDetection)
|
|
838
847
|
@pdf_options = normalize_config(params[:pdf_options], PDF)
|
|
839
|
-
@
|
|
848
|
+
@images = normalize_config(params[:image_extraction], ImageExtraction)
|
|
840
849
|
@image_preprocessing = normalize_config(params[:image_preprocessing], ImagePreprocessing)
|
|
841
850
|
@postprocessor = normalize_config(params[:postprocessor], PostProcessor)
|
|
842
851
|
@token_reduction = normalize_config(params[:token_reduction], TokenReduction)
|
|
@@ -844,8 +853,28 @@ module Kreuzberg
|
|
|
844
853
|
@html_options = normalize_config(params[:html_options], HtmlOptions)
|
|
845
854
|
@pages = normalize_config(params[:pages], PageConfig)
|
|
846
855
|
@max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
|
|
847
|
-
@output_format = params[:output_format]
|
|
848
|
-
@result_format = params[:result_format]
|
|
856
|
+
@output_format = validate_output_format(params[:output_format])
|
|
857
|
+
@result_format = validate_result_format(params[:result_format])
|
|
858
|
+
end
|
|
859
|
+
|
|
860
|
+
def validate_output_format(value)
|
|
861
|
+
return nil if value.nil?
|
|
862
|
+
|
|
863
|
+
str_value = value.to_s.downcase
|
|
864
|
+
return str_value if VALID_OUTPUT_FORMATS.include?(str_value)
|
|
865
|
+
|
|
866
|
+
raise ArgumentError,
|
|
867
|
+
"Invalid output_format: #{value}. Valid values: #{VALID_OUTPUT_FORMATS.join(', ')}"
|
|
868
|
+
end
|
|
869
|
+
|
|
870
|
+
def validate_result_format(value)
|
|
871
|
+
return nil if value.nil?
|
|
872
|
+
|
|
873
|
+
str_value = value.to_s.downcase
|
|
874
|
+
return str_value if VALID_RESULT_FORMATS.include?(str_value)
|
|
875
|
+
|
|
876
|
+
raise ArgumentError,
|
|
877
|
+
"Invalid result_format: #{value}. Valid values: #{VALID_RESULT_FORMATS.join(', ')}"
|
|
849
878
|
end
|
|
850
879
|
|
|
851
880
|
# rubocop:disable Metrics/CyclomaticComplexity
|
|
@@ -859,7 +888,7 @@ module Kreuzberg
|
|
|
859
888
|
chunking: @chunking&.to_h,
|
|
860
889
|
language_detection: @language_detection&.to_h,
|
|
861
890
|
pdf_options: @pdf_options&.to_h,
|
|
862
|
-
|
|
891
|
+
images: @images&.to_h,
|
|
863
892
|
image_preprocessing: @image_preprocessing&.to_h,
|
|
864
893
|
postprocessor: @postprocessor&.to_h,
|
|
865
894
|
token_reduction: @token_reduction&.to_h,
|
|
@@ -995,7 +1024,7 @@ module Kreuzberg
|
|
|
995
1024
|
when :pdf_options
|
|
996
1025
|
@pdf_options = normalize_config(value, PDF)
|
|
997
1026
|
when :image_extraction
|
|
998
|
-
@
|
|
1027
|
+
@images = normalize_config(value, ImageExtraction)
|
|
999
1028
|
when :image_preprocessing
|
|
1000
1029
|
@image_preprocessing = normalize_config(value, ImagePreprocessing)
|
|
1001
1030
|
when :postprocessor
|
|
@@ -1011,9 +1040,9 @@ module Kreuzberg
|
|
|
1011
1040
|
when :max_concurrent_extractions
|
|
1012
1041
|
@max_concurrent_extractions = value&.to_i
|
|
1013
1042
|
when :output_format
|
|
1014
|
-
@output_format = value
|
|
1043
|
+
@output_format = validate_output_format(value)
|
|
1015
1044
|
when :result_format
|
|
1016
|
-
@result_format = value
|
|
1045
|
+
@result_format = validate_result_format(value)
|
|
1017
1046
|
else
|
|
1018
1047
|
raise ArgumentError, "Unknown configuration key: #{key}"
|
|
1019
1048
|
end
|
|
@@ -1035,6 +1064,24 @@ module Kreuzberg
|
|
|
1035
1064
|
nil
|
|
1036
1065
|
end
|
|
1037
1066
|
|
|
1067
|
+
# Set output_format attribute
|
|
1068
|
+
#
|
|
1069
|
+
# @param value [String, nil] Output format value
|
|
1070
|
+
# @return [String, nil] The value that was set
|
|
1071
|
+
#
|
|
1072
|
+
def output_format=(value)
|
|
1073
|
+
@output_format = validate_output_format(value)
|
|
1074
|
+
end
|
|
1075
|
+
|
|
1076
|
+
# Set result_format attribute
|
|
1077
|
+
#
|
|
1078
|
+
# @param value [String, nil] Result format value
|
|
1079
|
+
# @return [String, nil] The value that was set
|
|
1080
|
+
#
|
|
1081
|
+
def result_format=(value)
|
|
1082
|
+
@result_format = validate_result_format(value)
|
|
1083
|
+
end
|
|
1084
|
+
|
|
1038
1085
|
private
|
|
1039
1086
|
|
|
1040
1087
|
def normalize_config(value, klass)
|
|
@@ -1053,7 +1100,7 @@ module Kreuzberg
|
|
|
1053
1100
|
@chunking = merged.chunking
|
|
1054
1101
|
@language_detection = merged.language_detection
|
|
1055
1102
|
@pdf_options = merged.pdf_options
|
|
1056
|
-
@
|
|
1103
|
+
@images = merged.image_extraction
|
|
1057
1104
|
@image_preprocessing = merged.image_preprocessing
|
|
1058
1105
|
@postprocessor = merged.postprocessor
|
|
1059
1106
|
@token_reduction = merged.token_reduction
|
|
data/lib/kreuzberg/djot_content.rb
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
begin
|
|
4
|
+
require 'json'
|
|
5
|
+
rescue LoadError
|
|
6
|
+
require 'json/pure'
|
|
7
|
+
end
|
|
8
|
+
|
|
9
|
+
module Kreuzberg
|
|
10
|
+
class Result
|
|
11
|
+
# Djot structured content representation
|
|
12
|
+
#
|
|
13
|
+
# Represents document content in Djot format with structured metadata about
|
|
14
|
+
# blocks, images, links, footnotes, and other document elements.
|
|
15
|
+
#
|
|
16
|
+
class DjotContent
|
|
17
|
+
attr_reader :plain_text, :blocks, :metadata_json, :tables, :images, :links, :footnotes, :attributes
|
|
18
|
+
|
|
19
|
+
# Represents a formatted block in Djot content
|
|
20
|
+
class FormattedBlock
|
|
21
|
+
attr_reader :block_type, :children, :attributes, :content, :level
|
|
22
|
+
|
|
23
|
+
# rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
24
|
+
def initialize(hash_or_type = nil, children: nil, attributes: nil, content: nil, level: nil, block_type: nil)
|
|
25
|
+
if hash_or_type.is_a?(Hash)
|
|
26
|
+
# Initialize from hash
|
|
27
|
+
@block_type = hash_or_type[:block_type] || hash_or_type['block_type'] || ''
|
|
28
|
+
@children = hash_or_type[:children] || hash_or_type['children']
|
|
29
|
+
@attributes = hash_or_type[:attributes] || hash_or_type['attributes'] || {}
|
|
30
|
+
@content = hash_or_type[:content] || hash_or_type['content']
|
|
31
|
+
@level = hash_or_type[:level] || hash_or_type['level']
|
|
32
|
+
else
|
|
33
|
+
# Initialize from keyword arguments (for backward compatibility)
|
|
34
|
+
@block_type = block_type || hash_or_type || ''
|
|
35
|
+
@children = children || []
|
|
36
|
+
@attributes = attributes || {}
|
|
37
|
+
@content = content
|
|
38
|
+
@level = level
|
|
39
|
+
end
|
|
40
|
+
end
|
|
41
|
+
# rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
42
|
+
|
|
43
|
+
def to_h
|
|
44
|
+
{
|
|
45
|
+
block_type: @block_type,
|
|
46
|
+
children: @children,
|
|
47
|
+
attributes: @attributes,
|
|
48
|
+
content: @content,
|
|
49
|
+
level: @level
|
|
50
|
+
}.compact
|
|
51
|
+
end
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
# Represents an image in Djot content
|
|
55
|
+
class DjotImage
|
|
56
|
+
attr_reader :url, :alt, :title, :width, :height
|
|
57
|
+
alias src url
|
|
58
|
+
|
|
59
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
60
|
+
def initialize(hash_or_url = nil, alt: nil, title: nil, width: nil, height: nil, url: nil, src: nil)
|
|
61
|
+
if hash_or_url.is_a?(Hash)
|
|
62
|
+
# Initialize from hash (supports both 'url' and 'src' keys)
|
|
63
|
+
@url = hash_or_url[:url] || hash_or_url['url'] || hash_or_url[:src] || hash_or_url['src']
|
|
64
|
+
@alt = hash_or_url[:alt] || hash_or_url['alt']
|
|
65
|
+
@title = hash_or_url[:title] || hash_or_url['title']
|
|
66
|
+
@width = hash_or_url[:width] || hash_or_url['width']
|
|
67
|
+
@height = hash_or_url[:height] || hash_or_url['height']
|
|
68
|
+
else
|
|
69
|
+
# Initialize from keyword arguments
|
|
70
|
+
@url = url || src || hash_or_url
|
|
71
|
+
@alt = alt
|
|
72
|
+
@title = title
|
|
73
|
+
@width = width
|
|
74
|
+
@height = height
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
78
|
+
|
|
79
|
+
def to_h
|
|
80
|
+
{
|
|
81
|
+
url: @url,
|
|
82
|
+
alt: @alt,
|
|
83
|
+
title: @title,
|
|
84
|
+
width: @width,
|
|
85
|
+
height: @height
|
|
86
|
+
}.compact
|
|
87
|
+
end
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
# Represents a link in Djot content
|
|
91
|
+
class DjotLink
|
|
92
|
+
attr_reader :url, :text, :title, :link_type
|
|
93
|
+
alias href url
|
|
94
|
+
|
|
95
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
96
|
+
def initialize(hash_or_url = nil, text: nil, title: nil, url: nil, href: nil, link_type: nil)
|
|
97
|
+
if hash_or_url.is_a?(Hash)
|
|
98
|
+
# Initialize from hash (supports both 'url' and 'href' keys)
|
|
99
|
+
@url = hash_or_url[:url] || hash_or_url['url'] || hash_or_url[:href] || hash_or_url['href']
|
|
100
|
+
@text = hash_or_url[:text] || hash_or_url['text']
|
|
101
|
+
@title = hash_or_url[:title] || hash_or_url['title']
|
|
102
|
+
@link_type = hash_or_url[:link_type] || hash_or_url['link_type']
|
|
103
|
+
else
|
|
104
|
+
# Initialize from keyword arguments
|
|
105
|
+
@url = url || href || hash_or_url
|
|
106
|
+
@text = text
|
|
107
|
+
@title = title
|
|
108
|
+
@link_type = link_type
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
112
|
+
|
|
113
|
+
def to_h
|
|
114
|
+
{
|
|
115
|
+
url: @url,
|
|
116
|
+
text: @text,
|
|
117
|
+
title: @title,
|
|
118
|
+
link_type: @link_type
|
|
119
|
+
}.compact
|
|
120
|
+
end
|
|
121
|
+
end
|
|
122
|
+
|
|
123
|
+
# Represents a footnote in Djot content
|
|
124
|
+
class Footnote
|
|
125
|
+
attr_reader :label, :content
|
|
126
|
+
|
|
127
|
+
def initialize(label:, content:)
|
|
128
|
+
@label = label
|
|
129
|
+
@content = content
|
|
130
|
+
end
|
|
131
|
+
|
|
132
|
+
def to_h
|
|
133
|
+
{
|
|
134
|
+
label: @label,
|
|
135
|
+
content: @content
|
|
136
|
+
}
|
|
137
|
+
end
|
|
138
|
+
end
|
|
139
|
+
|
|
140
|
+
# rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
141
|
+
def initialize(hash)
|
|
142
|
+
@plain_text = hash['plain_text'] || hash[:plain_text] || ''
|
|
143
|
+
@blocks = parse_blocks(hash['blocks'] || hash[:blocks] || [])
|
|
144
|
+
@metadata_json = hash['metadata_json'] || hash[:metadata_json] || '{}'
|
|
145
|
+
@tables = hash['tables'] || hash[:tables] || []
|
|
146
|
+
@images = parse_images(hash['images'] || hash[:images] || [])
|
|
147
|
+
@links = parse_links(hash['links'] || hash[:links] || [])
|
|
148
|
+
@footnotes = parse_footnotes(hash['footnotes'] || hash[:footnotes] || [])
|
|
149
|
+
@attributes = hash['attributes'] || hash[:attributes] || {}
|
|
150
|
+
end
|
|
151
|
+
# rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
|
|
152
|
+
|
|
153
|
+
def metadata
|
|
154
|
+
@metadata ||= parse_metadata(@metadata_json)
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
def to_h
|
|
158
|
+
{
|
|
159
|
+
plain_text: @plain_text,
|
|
160
|
+
blocks: @blocks.map(&:to_h),
|
|
161
|
+
metadata_json: @metadata_json,
|
|
162
|
+
tables: @tables,
|
|
163
|
+
images: @images.map(&:to_h),
|
|
164
|
+
links: @links.map(&:to_h),
|
|
165
|
+
footnotes: @footnotes.map(&:to_h),
|
|
166
|
+
attributes: @attributes
|
|
167
|
+
}
|
|
168
|
+
end
|
|
169
|
+
|
|
170
|
+
private
|
|
171
|
+
|
|
172
|
+
def parse_metadata(metadata_json)
|
|
173
|
+
JSON.parse(metadata_json)
|
|
174
|
+
rescue JSON::ParserError
|
|
175
|
+
{}
|
|
176
|
+
end
|
|
177
|
+
|
|
178
|
+
def parse_blocks(blocks_data)
|
|
179
|
+
blocks_data.map do |block|
|
|
180
|
+
FormattedBlock.new(
|
|
181
|
+
block_type: block['block_type'] || block[:block_type] || '',
|
|
182
|
+
children: block['children'] || block[:children],
|
|
183
|
+
attributes: block['attributes'] || block[:attributes]
|
|
184
|
+
)
|
|
185
|
+
end
|
|
186
|
+
end
|
|
187
|
+
|
|
188
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
189
|
+
def parse_images(images_data)
|
|
190
|
+
images_data.map do |image|
|
|
191
|
+
DjotImage.new(
|
|
192
|
+
url: image['url'] || image[:url] || image['src'] || image[:src],
|
|
193
|
+
alt: image['alt'] || image[:alt],
|
|
194
|
+
title: image['title'] || image[:title],
|
|
195
|
+
width: image['width'] || image[:width],
|
|
196
|
+
height: image['height'] || image[:height]
|
|
197
|
+
)
|
|
198
|
+
end
|
|
199
|
+
end
|
|
200
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
201
|
+
|
|
202
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
203
|
+
def parse_links(links_data)
|
|
204
|
+
links_data.map do |link|
|
|
205
|
+
DjotLink.new(
|
|
206
|
+
url: link['url'] || link[:url] || link['href'] || link[:href],
|
|
207
|
+
text: link['text'] || link[:text],
|
|
208
|
+
title: link['title'] || link[:title],
|
|
209
|
+
link_type: link['link_type'] || link[:link_type]
|
|
210
|
+
)
|
|
211
|
+
end
|
|
212
|
+
end
|
|
213
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
214
|
+
|
|
215
|
+
def parse_footnotes(footnotes_data)
|
|
216
|
+
footnotes_data.map do |note|
|
|
217
|
+
Footnote.new(
|
|
218
|
+
label: note['label'] || note[:label],
|
|
219
|
+
content: note['content'] || note[:content]
|
|
220
|
+
)
|
|
221
|
+
end
|
|
222
|
+
end
|
|
223
|
+
end
|
|
224
|
+
end
|
|
225
|
+
end
|
|
data/lib/kreuzberg/extraction_api.rb
CHANGED
|
@@ -15,11 +15,15 @@ module Kreuzberg
|
|
|
15
15
|
# @example Extract with explicit MIME type
|
|
16
16
|
# @example Extract with OCR enabled
|
|
17
17
|
def extract_file_sync(path:, mime_type: nil, config: nil)
|
|
18
|
+
# Validate that the file exists
|
|
19
|
+
path_str = path.to_s
|
|
20
|
+
raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
|
|
21
|
+
|
|
18
22
|
opts = normalize_config(config)
|
|
19
23
|
hash = if mime_type
|
|
20
|
-
native_extract_file_sync(
|
|
24
|
+
native_extract_file_sync(path_str, mime_type.to_s, **opts)
|
|
21
25
|
else
|
|
22
|
-
native_extract_file_sync(
|
|
26
|
+
native_extract_file_sync(path_str, **opts)
|
|
23
27
|
end
|
|
24
28
|
result = Result.new(hash)
|
|
25
29
|
record_cache_entry!(result, opts)
|
|
@@ -53,6 +57,8 @@ module Kreuzberg
|
|
|
53
57
|
# response = HTTParty.get("https://example.com/document.docx")
|
|
54
58
|
# result = Kreuzberg.extract_bytes_sync(response.body, "application/vnd.openxmlformats-officedocument.wordprocessingml.document")
|
|
55
59
|
def extract_bytes_sync(data:, mime_type:, config: nil)
|
|
60
|
+
raise TypeError, "mime_type must be a String, got #{mime_type.inspect}" if mime_type.nil?
|
|
61
|
+
|
|
56
62
|
opts = normalize_config(config)
|
|
57
63
|
hash = native_extract_bytes_sync(data.to_s, mime_type.to_s, **opts)
|
|
58
64
|
result = Result.new(hash)
|
|
@@ -92,6 +98,12 @@ module Kreuzberg
|
|
|
92
98
|
# config = Kreuzberg::Config::Extraction.new(force_ocr: true)
|
|
93
99
|
# results = Kreuzberg.batch_extract_files_sync(paths, config: config)
|
|
94
100
|
def batch_extract_files_sync(paths:, config: nil)
|
|
101
|
+
# Validate that all files exist
|
|
102
|
+
paths.each do |path|
|
|
103
|
+
path_str = path.to_s
|
|
104
|
+
raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
|
|
105
|
+
end
|
|
106
|
+
|
|
95
107
|
opts = normalize_config(config)
|
|
96
108
|
hashes = native_batch_extract_files_sync(paths.map(&:to_s), **opts)
|
|
97
109
|
results = hashes.map { |hash| Result.new(hash) }
|
|
@@ -130,11 +142,15 @@ module Kreuzberg
|
|
|
130
142
|
# )
|
|
131
143
|
# result = Kreuzberg.extract_file("document.pdf", config: config)
|
|
132
144
|
def extract_file(path:, mime_type: nil, config: nil)
|
|
145
|
+
# Validate that the file exists
|
|
146
|
+
path_str = path.to_s
|
|
147
|
+
raise Errors::IOError, "File not found: #{path_str}" unless File.exist?(path_str)
|
|
148
|
+
|
|
133
149
|
opts = normalize_config(config)
|
|
134
150
|
hash = if mime_type
|
|
135
|
-
native_extract_file(
|
|
151
|
+
native_extract_file(path_str, mime_type.to_s, **opts)
|
|
136
152
|
else
|
|
137
|
-
native_extract_file(
|
|
153
|
+
native_extract_file(path_str, **opts)
|
|
138
154
|
end
|
|
139
155
|
result = Result.new(hash)
|
|
140
156
|
record_cache_entry!(result, opts)
|