kreuzberg 4.2.0 → 4.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
  5. data/lib/kreuzberg/cli.rb +16 -6
  6. data/lib/kreuzberg/cli_proxy.rb +3 -1
  7. data/lib/kreuzberg/config.rb +59 -28
  8. data/lib/kreuzberg/djot_content.rb +225 -0
  9. data/lib/kreuzberg/extraction_api.rb +20 -4
  10. data/lib/kreuzberg/result.rb +12 -2
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/lib/kreuzberg.rb +1 -0
  13. data/sig/kreuzberg.rbs +23 -11
  14. data/spec/binding/batch_spec.rb +6 -5
  15. data/spec/binding/config_spec.rb +1 -1
  16. data/spec/binding/error_recovery_spec.rb +3 -3
  17. data/spec/binding/tables_spec.rb +11 -2
  18. data/spec/unit/config/extraction_config_spec.rb +2 -2
  19. data/spec/unit/config/output_format_spec.rb +18 -18
  20. data/vendor/Cargo.toml +1 -1
  21. data/vendor/kreuzberg/Cargo.toml +3 -2
  22. data/vendor/kreuzberg/README.md +1 -1
  23. data/vendor/kreuzberg/src/api/error.rs +60 -0
  24. data/vendor/kreuzberg/src/api/handlers.rs +153 -32
  25. data/vendor/kreuzberg/src/api/mod.rs +2 -0
  26. data/vendor/kreuzberg/src/api/openapi.rs +141 -0
  27. data/vendor/kreuzberg/src/api/router.rs +24 -2
  28. data/vendor/kreuzberg/src/api/startup.rs +21 -1
  29. data/vendor/kreuzberg/src/api/types.rs +50 -4
  30. data/vendor/kreuzberg/src/core/config/processing.rs +8 -1
  31. data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
  32. data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
  33. data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
  34. data/vendor/kreuzberg/src/core/io.rs +7 -7
  35. data/vendor/kreuzberg/src/core/mime.rs +4 -4
  36. data/vendor/kreuzberg/src/extraction/excel.rs +246 -9
  37. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
  38. data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
  39. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
  40. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
  41. data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
  42. data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
  43. data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
  44. data/vendor/kreuzberg/tests/config_behavioral.rs +14 -12
  45. data/vendor/kreuzberg/tests/core_integration.rs +2 -4
  46. data/vendor/kreuzberg/tests/mime_detection.rs +3 -2
  47. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +284 -1
  48. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +56 -0
  49. data/vendor/kreuzberg-ffi/kreuzberg.h +7 -2
  50. data/vendor/kreuzberg-ffi/src/helpers.rs +13 -1
  51. data/vendor/kreuzberg-ffi/src/lib.rs +8 -5
  52. data/vendor/kreuzberg-ffi/src/memory.rs +35 -1
  53. data/vendor/kreuzberg-ffi/src/types.rs +8 -5
  54. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  55. metadata +5 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9a1c9adffca7d75c142bd661f1d481b1aee00d97c6f62dcc70292f37978bcc17
4
- data.tar.gz: 227af2ed45bff1dfa9afebd69220d15a41b2e476bf97f8a83173d21aab8b88e1
3
+ metadata.gz: 1765714785cbe89567dcb13ed0c1e1b79c2da7a2143a0d0b4653c5578a3ada84
4
+ data.tar.gz: 64d6db5e4d88992920f37fe9a3e28ab08b5bdd0b28385da570e8207e67d90f34
5
5
  SHA512:
6
- metadata.gz: 0d1b0081f89a73f5422e68a714fc415f6d290dd8be7cf0ba6d454cfdf1938ebdac4919358b25d6e5a0bc1a209e1b165062a0341d28cde1b3fa0595bffec837f5
7
- data.tar.gz: fc5a5f29309c29fbbf63ba035cf5e462e78b15c2afc239fb333bd8b6e70ef061219822ed4d533f81bb35cdd62db84da8c01e8f172561b0f7fb802b848b491c0a
6
+ metadata.gz: cbb71395a285ddb1a74101fc935ebb8266b81c6172043a128721ab37fad583c7202e559f9e8cb2534bf110721bf20e2d0cbe6838554c772831c56bc09583bf75
7
+ data.tar.gz: b752cf56da8810211e5efd5e5d69f136eb7d0a3d5e27e985b81dff18bde442f0033b15962823ad7e3c5a27e080d02b6a6df1726bffe4aa21eaf89f56a5c6b56f
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.2.0)
4
+ kreuzberg (4.2.2)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -207,7 +207,7 @@ CHECKSUMS
207
207
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
208
208
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
209
209
  json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
210
- kreuzberg (4.2.0)
210
+ kreuzberg (4.2.2)
211
211
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
212
212
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
213
213
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.0" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.2" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
@@ -716,9 +716,9 @@ dependencies = [
716
716
 
717
717
  [[package]]
718
718
  name = "cc"
719
- version = "1.2.53"
719
+ version = "1.2.54"
720
720
  source = "registry+https://github.com/rust-lang/crates.io-index"
721
- checksum = "755d2fce177175ffca841e9a06afdb2c4ab0f593d53b4dee48147dfaade85932"
721
+ checksum = "6354c81bbfd62d9cfa9cb3c773c2b7b2a3a482d569de977fd0e961f6e7c00583"
722
722
  dependencies = [
723
723
  "find-msvc-tools",
724
724
  "jobserver",
@@ -2027,9 +2027,9 @@ dependencies = [
2027
2027
 
2028
2028
  [[package]]
2029
2029
  name = "html-to-markdown-rs"
2030
- version = "2.23.1"
2030
+ version = "2.23.4"
2031
2031
  source = "registry+https://github.com/rust-lang/crates.io-index"
2032
- checksum = "e1f4d6781ac8dd203853803d27054ca4153c7fd0f3956cb7fc95dc06f42a1c46"
2032
+ checksum = "ffbf49450676163bdf69fac2acf72674fcf2d2aaf690aa247368c567cc9afb2a"
2033
2033
  dependencies = [
2034
2034
  "astral-tl",
2035
2035
  "base64 0.22.1",
@@ -2627,7 +2627,7 @@ dependencies = [
2627
2627
 
2628
2628
  [[package]]
2629
2629
  name = "kreuzberg"
2630
- version = "4.1.0"
2630
+ version = "4.2.0"
2631
2631
  dependencies = [
2632
2632
  "ahash",
2633
2633
  "async-trait",
@@ -2706,17 +2706,18 @@ dependencies = [
2706
2706
  "uuid",
2707
2707
  "whatlang",
2708
2708
  "yake-rust",
2709
- "zip 7.1.0",
2709
+ "zip 7.2.0",
2710
2710
  ]
2711
2711
 
2712
2712
  [[package]]
2713
2713
  name = "kreuzberg-ffi"
2714
- version = "4.1.0"
2714
+ version = "4.2.0"
2715
2715
  dependencies = [
2716
2716
  "async-trait",
2717
2717
  "cbindgen",
2718
2718
  "html-to-markdown-rs",
2719
2719
  "kreuzberg",
2720
+ "log",
2720
2721
  "serde",
2721
2722
  "serde_json",
2722
2723
  "tokio",
@@ -2767,14 +2768,14 @@ dependencies = [
2767
2768
 
2768
2769
  [[package]]
2769
2770
  name = "kreuzberg-tesseract"
2770
- version = "4.1.0"
2771
+ version = "4.2.0"
2771
2772
  dependencies = [
2772
2773
  "cc",
2773
2774
  "cmake",
2774
2775
  "libc",
2775
2776
  "reqwest 0.13.1",
2776
2777
  "thiserror 2.0.18",
2777
- "zip 7.1.0",
2778
+ "zip 7.2.0",
2778
2779
  ]
2779
2780
 
2780
2781
  [[package]]
@@ -5022,9 +5023,9 @@ dependencies = [
5022
5023
 
5023
5024
  [[package]]
5024
5025
  name = "rmcp"
5025
- version = "0.13.0"
5026
+ version = "0.14.0"
5026
5027
  source = "registry+https://github.com/rust-lang/crates.io-index"
5027
- checksum = "d1815dbc06c414d720f8bc1951eccd66bc99efc6376331f1e7093a119b3eb508"
5028
+ checksum = "0a621b37a548ff6ab6292d57841eb25785a7f146d89391a19c9f199414bd13da"
5028
5029
  dependencies = [
5029
5030
  "async-trait",
5030
5031
  "axum",
@@ -5054,9 +5055,9 @@ dependencies = [
5054
5055
 
5055
5056
  [[package]]
5056
5057
  name = "rmcp-macros"
5057
- version = "0.13.0"
5058
+ version = "0.14.0"
5058
5059
  source = "registry+https://github.com/rust-lang/crates.io-index"
5059
- checksum = "11f0bc7008fa102e771a76c6d2c9b253be3f2baa5964e060464d038ae1cbc573"
5060
+ checksum = "6b79ed92303f9262db79575aa8c3652581668e9d136be6fd0b9ededa78954c95"
5060
5061
  dependencies = [
5061
5062
  "darling 0.23.0",
5062
5063
  "proc-macro2",
@@ -6013,6 +6014,7 @@ checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d"
6013
6014
  dependencies = [
6014
6015
  "deranged",
6015
6016
  "itoa",
6017
+ "js-sys",
6016
6018
  "num-conv",
6017
6019
  "powerfmt",
6018
6020
  "serde",
@@ -6405,6 +6407,12 @@ version = "0.25.1"
6405
6407
  source = "registry+https://github.com/rust-lang/crates.io-index"
6406
6408
  checksum = "d2df906b07856748fa3f6e0ad0cbaa047052d4a7dd609e231c4f72cee8c36f31"
6407
6409
 
6410
+ [[package]]
6411
+ name = "typed-path"
6412
+ version = "0.12.1"
6413
+ source = "registry+https://github.com/rust-lang/crates.io-index"
6414
+ checksum = "e43ffa54726cdc9ea78392023ffe9fe9cf9ac779e1c6fcb0d23f9862e3879d20"
6415
+
6408
6416
  [[package]]
6409
6417
  name = "typeid"
6410
6418
  version = "1.0.3"
@@ -6698,9 +6706,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
6698
6706
 
6699
6707
  [[package]]
6700
6708
  name = "uuid"
6701
- version = "1.19.0"
6709
+ version = "1.20.0"
6702
6710
  source = "registry+https://github.com/rust-lang/crates.io-index"
6703
- checksum = "e2e054861b4bd027cd373e18e8d8d8e6548085000e41290d95ce0c373a654b4a"
6711
+ checksum = "ee48d38b119b0cd71fe4141b30f5ba9c7c5d9f4e7a3a8b4a674e4b6ef789976f"
6704
6712
  dependencies = [
6705
6713
  "getrandom 0.3.4",
6706
6714
  "js-sys",
@@ -7498,9 +7506,9 @@ dependencies = [
7498
7506
 
7499
7507
  [[package]]
7500
7508
  name = "zip"
7501
- version = "7.1.0"
7509
+ version = "7.2.0"
7502
7510
  source = "registry+https://github.com/rust-lang/crates.io-index"
7503
- checksum = "9013f1222db8a6d680f13a7ccdc60a781199cd09c2fa4eff58e728bb181757fc"
7511
+ checksum = "c42e33efc22a0650c311c2ef19115ce232583abbe80850bc8b66509ebef02de0"
7504
7512
  dependencies = [
7505
7513
  "aes",
7506
7514
  "bzip2",
@@ -7518,6 +7526,7 @@ dependencies = [
7518
7526
  "ppmd-rust",
7519
7527
  "sha1",
7520
7528
  "time",
7529
+ "typed-path",
7521
7530
  "zeroize",
7522
7531
  "zopfli",
7523
7532
  "zstd",
data/lib/kreuzberg/cli.rb CHANGED
@@ -8,24 +8,34 @@ module Kreuzberg
8
8
 
9
9
  # Extract content from a file using the CLI
10
10
  #
11
- # @param path [String] Path to the file
11
+ # @param path_or_nil [String, nil] Path to the file (positional, for backward compatibility)
12
+ # @param path [String] Path to the file (keyword argument)
12
13
  # @param output [String] Output format ("text", "json", "markdown")
13
14
  # @param ocr [Boolean] Enable OCR
14
15
  # @return [String] Extracted content
15
16
  #
16
- def extract(path, output: 'text', ocr: false)
17
- args = ['extract', path, '--format', output]
17
def extract(path_or_nil = nil, path: nil, output: 'text', ocr: false)
  # The path may arrive positionally (legacy call style) or via the +path:+
  # keyword; the positional value takes precedence when both are supplied.
  target = path_or_nil || path
  raise ArgumentError, 'path is required' if target.nil?

  # The CLI receives the OCR switch as an explicit 'true'/'false' string.
  ocr_flag = ocr ? 'true' : 'false'
  CLIProxy.call(['extract', target, '--format', output, '--ocr', ocr_flag])
end
21
26
 
22
27
  # Detect MIME type of a file using the CLI
23
28
  #
24
- # @param path [String] Path to the file
29
+ # @param path_or_nil [String, nil] Path to the file (positional, for backward compatibility)
30
+ # @param path [String] Path to the file (keyword argument)
25
31
  # @return [String] MIME type
26
32
  #
27
- def detect(path)
28
- CLIProxy.call(['detect', path]).strip
33
def detect(path_or_nil = nil, path: nil)
  # The path may arrive positionally (legacy call style) or via the +path:+
  # keyword; the positional value takes precedence when both are supplied.
  target = path_or_nil || path
  raise ArgumentError, 'path is required' if target.nil?

  # Surrounding whitespace is stripped from the CLI output before returning.
  cli_output = CLIProxy.call(['detect', target])
  cli_output.strip
end
30
40
 
31
41
  # Get CLI version
@@ -91,11 +91,13 @@ module Kreuzberg
91
91
  lib_path.join('bin', binary_name),
92
92
  lib_path.join(binary_name),
93
93
  root_path.join('../../crates/kreuzberg-cli/target/release', binary_name),
94
- root_path.join('../../target/release', binary_name)
94
+ root_path.join('../../target/release', binary_name),
95
+ root_path.join('../../target/debug', binary_name)
95
96
  ]
96
97
 
97
98
  workspace_root = root_path.parent&.parent
98
99
  paths << workspace_root.join('target', 'release', binary_name) if workspace_root
100
+ paths << workspace_root.join('target', 'debug', binary_name) if workspace_root
99
101
 
100
102
  paths
101
103
  end
@@ -684,13 +684,6 @@ module Kreuzberg
684
684
  # image = Config::ImageExtraction.new(extract_images: true, target_dpi: 600)
685
685
  # config = Extraction.new(image_extraction: image)
686
686
  #
687
- # @example With preprocessing
688
- # preprocessing = Config::ImagePreprocessing.new(
689
- # binarization_method: "sauvola",
690
- # denoise: true
691
- # )
692
- # config = Extraction.new(image_preprocessing: preprocessing)
693
- #
694
687
  # @example With post-processing
695
688
  # postprocessor = Config::PostProcessor.new(
696
689
  # enabled: true,
@@ -708,17 +701,19 @@ module Kreuzberg
708
701
  # language_detection: Config::LanguageDetection.new(enabled: true),
709
702
  # pdf_options: Config::PDF.new(extract_images: true, passwords: ["secret"]),
710
703
  # image_extraction: Config::ImageExtraction.new(target_dpi: 600),
711
- # image_preprocessing: Config::ImagePreprocessing.new(denoise: true),
712
704
  # postprocessor: Config::PostProcessor.new(enabled: true)
713
705
  # )
714
706
  #
715
707
  class Extraction
716
708
  attr_reader :use_cache, :enable_quality_processing, :force_ocr,
717
709
  :ocr, :chunking, :language_detection, :pdf_options,
718
- :image_extraction, :image_preprocessing, :postprocessor,
710
+ :images, :postprocessor,
719
711
  :token_reduction, :keywords, :html_options, :pages,
720
712
  :max_concurrent_extractions, :output_format, :result_format
721
713
 
714
+ # Alias for backward compatibility - image_extraction is the canonical name
715
+ alias image_extraction images
716
+
722
717
  # Load configuration from a file.
723
718
  #
724
719
  # Detects the file format from the extension (.toml, .yaml, .json)
@@ -736,7 +731,7 @@ module Kreuzberg
736
731
  # Keys that are allowed in the Extraction config
737
732
  ALLOWED_KEYS = %i[
738
733
  use_cache enable_quality_processing force_ocr ocr chunking
739
- language_detection pdf_options image_extraction image_preprocessing
734
+ language_detection pdf_options image_extraction
740
735
  postprocessor token_reduction keywords html_options pages
741
736
  max_concurrent_extractions output_format result_format
742
737
  ].freeze
@@ -746,6 +741,12 @@ module Kreuzberg
746
741
  images: :image_extraction
747
742
  }.freeze
748
743
 
744
+ # Valid output format values (case-insensitive, normalized internally)
745
+ VALID_OUTPUT_FORMATS = %w[plain markdown html djot].freeze
746
+
747
+ # Valid result format values (case-insensitive, normalized internally)
748
+ VALID_RESULT_FORMATS = %w[unified elements element_based].freeze
749
+
749
750
  def self.from_file(path)
750
751
  hash = Kreuzberg._config_from_file_native(path)
751
752
  new(**normalize_hash_keys(hash))
@@ -791,14 +792,13 @@ module Kreuzberg
791
792
 
792
793
  def initialize(hash = nil,
793
794
  use_cache: true,
794
- enable_quality_processing: false,
795
+ enable_quality_processing: true,
795
796
  force_ocr: false,
796
797
  ocr: nil,
797
798
  chunking: nil,
798
799
  language_detection: nil,
799
800
  pdf_options: nil,
800
801
  image_extraction: nil,
801
- image_preprocessing: nil,
802
802
  postprocessor: nil,
803
803
  token_reduction: nil,
804
804
  keywords: nil,
@@ -811,7 +811,7 @@ module Kreuzberg
811
811
  use_cache: use_cache, enable_quality_processing: enable_quality_processing,
812
812
  force_ocr: force_ocr, ocr: ocr, chunking: chunking, language_detection: language_detection,
813
813
  pdf_options: pdf_options, image_extraction: image_extraction,
814
- image_preprocessing: image_preprocessing, postprocessor: postprocessor,
814
+ postprocessor: postprocessor,
815
815
  token_reduction: token_reduction, keywords: keywords, html_options: html_options,
816
816
  pages: pages, max_concurrent_extractions: max_concurrent_extractions,
817
817
  output_format: output_format, result_format: result_format
@@ -836,20 +836,38 @@ module Kreuzberg
836
836
  @chunking = normalize_config(params[:chunking], Chunking)
837
837
  @language_detection = normalize_config(params[:language_detection], LanguageDetection)
838
838
  @pdf_options = normalize_config(params[:pdf_options], PDF)
839
- @image_extraction = normalize_config(params[:image_extraction], ImageExtraction)
840
- @image_preprocessing = normalize_config(params[:image_preprocessing], ImagePreprocessing)
839
+ @images = normalize_config(params[:image_extraction], ImageExtraction)
841
840
  @postprocessor = normalize_config(params[:postprocessor], PostProcessor)
842
841
  @token_reduction = normalize_config(params[:token_reduction], TokenReduction)
843
842
  @keywords = normalize_config(params[:keywords], Keywords)
844
843
  @html_options = normalize_config(params[:html_options], HtmlOptions)
845
844
  @pages = normalize_config(params[:pages], PageConfig)
846
845
  @max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
847
- @output_format = params[:output_format]&.to_s
848
- @result_format = params[:result_format]&.to_s
846
+ @output_format = validate_output_format(params[:output_format])
847
+ @result_format = validate_result_format(params[:result_format])
848
+ end
849
+
850
# Normalizes an output format to its lowercase string form.
#
# @param value [#to_s, nil] Requested output format
# @return [String, nil] Lowercased format, or nil when +value+ is nil
# @raise [ArgumentError] if the format is not listed in VALID_OUTPUT_FORMATS
def validate_output_format(value)
  return nil if value.nil?

  normalized = value.to_s.downcase
  unless VALID_OUTPUT_FORMATS.include?(normalized)
    raise ArgumentError,
          "Invalid output_format: #{value}. Valid values: #{VALID_OUTPUT_FORMATS.join(', ')}"
  end

  normalized
end
859
+
860
# Normalizes a result format to its lowercase string form.
#
# @param value [#to_s, nil] Requested result format
# @return [String, nil] Lowercased format, or nil when +value+ is nil
# @raise [ArgumentError] if the format is not listed in VALID_RESULT_FORMATS
def validate_result_format(value)
  return nil if value.nil?

  normalized = value.to_s.downcase
  unless VALID_RESULT_FORMATS.include?(normalized)
    raise ArgumentError,
          "Invalid result_format: #{value}. Valid values: #{VALID_RESULT_FORMATS.join(', ')}"
  end

  normalized
end
850
869
 
851
870
  # rubocop:disable Metrics/CyclomaticComplexity
852
- # rubocop:disable Metrics/MethodLength
853
871
  def to_h
854
872
  {
855
873
  use_cache: @use_cache,
@@ -859,8 +877,7 @@ module Kreuzberg
859
877
  chunking: @chunking&.to_h,
860
878
  language_detection: @language_detection&.to_h,
861
879
  pdf_options: @pdf_options&.to_h,
862
- image_extraction: @image_extraction&.to_h,
863
- image_preprocessing: @image_preprocessing&.to_h,
880
+ images: @images&.to_h,
864
881
  postprocessor: @postprocessor&.to_h,
865
882
  token_reduction: @token_reduction&.to_h,
866
883
  keywords: @keywords&.to_h,
@@ -871,7 +888,6 @@ module Kreuzberg
871
888
  result_format: @result_format
872
889
  }.compact
873
890
  end
874
- # rubocop:enable Metrics/MethodLength
875
891
  # rubocop:enable Metrics/CyclomaticComplexity
876
892
 
877
893
  # Serialize configuration to JSON string
@@ -995,9 +1011,7 @@ module Kreuzberg
995
1011
  when :pdf_options
996
1012
  @pdf_options = normalize_config(value, PDF)
997
1013
  when :image_extraction
998
- @image_extraction = normalize_config(value, ImageExtraction)
999
- when :image_preprocessing
1000
- @image_preprocessing = normalize_config(value, ImagePreprocessing)
1014
+ @images = normalize_config(value, ImageExtraction)
1001
1015
  when :postprocessor
1002
1016
  @postprocessor = normalize_config(value, PostProcessor)
1003
1017
  when :token_reduction
@@ -1011,9 +1025,9 @@ module Kreuzberg
1011
1025
  when :max_concurrent_extractions
1012
1026
  @max_concurrent_extractions = value&.to_i
1013
1027
  when :output_format
1014
- @output_format = value&.to_s
1028
+ @output_format = validate_output_format(value)
1015
1029
  when :result_format
1016
- @result_format = value&.to_s
1030
+ @result_format = validate_result_format(value)
1017
1031
  else
1018
1032
  raise ArgumentError, "Unknown configuration key: #{key}"
1019
1033
  end
@@ -1035,6 +1049,24 @@ module Kreuzberg
1035
1049
  nil
1036
1050
  end
1037
1051
 
1052
+ # Set output_format attribute
1053
+ #
1054
+ # @param value [String, nil] Output format value
1055
+ # @return [String, nil] The value that was set
1056
+ #
1057
def output_format=(value)
  # Run the value through validate_output_format before storing it, so an
  # unsupported format raises instead of being assigned.
  checked_format = validate_output_format(value)
  @output_format = checked_format
end
1060
+
1061
+ # Set result_format attribute
1062
+ #
1063
+ # @param value [String, nil] Result format value
1064
+ # @return [String, nil] The value that was set
1065
+ #
1066
def result_format=(value)
  # Run the value through validate_result_format before storing it, so an
  # unsupported format raises instead of being assigned.
  checked_format = validate_result_format(value)
  @result_format = checked_format
end
1069
+
1038
1070
  private
1039
1071
 
1040
1072
  def normalize_config(value, klass)
@@ -1053,8 +1085,7 @@ module Kreuzberg
1053
1085
  @chunking = merged.chunking
1054
1086
  @language_detection = merged.language_detection
1055
1087
  @pdf_options = merged.pdf_options
1056
- @image_extraction = merged.image_extraction
1057
- @image_preprocessing = merged.image_preprocessing
1088
+ @images = merged.image_extraction
1058
1089
  @postprocessor = merged.postprocessor
1059
1090
  @token_reduction = merged.token_reduction
1060
1091
  @keywords = merged.keywords
@@ -0,0 +1,225 @@
1
+ # frozen_string_literal: true
2
+
3
+ begin
4
+ require 'json'
5
+ rescue LoadError
6
+ require 'json/pure'
7
+ end
8
+
9
module Kreuzberg
  class Result
    # Djot structured content representation
    #
    # Represents document content in Djot format with structured metadata about
    # blocks, images, links, footnotes, and other document elements.
    #
    # Every lookup accepts both String and Symbol keys, so instances can be built
    # from either style of hash.
    #
    class DjotContent
      attr_reader :plain_text, :blocks, :metadata_json, :tables, :images, :links, :footnotes, :attributes

      # Represents a formatted block in Djot content
      class FormattedBlock
        attr_reader :block_type, :children, :attributes, :content, :level

        # Build a block either from a single hash or from keyword arguments.
        #
        # @param hash_or_type [Hash, String, nil] A hash describing the block
        #   (String or Symbol keys), or the block type as a positional value
        # @param children [Array, nil] Child blocks (defaults to [] in keyword form)
        # @param attributes [Hash, nil] Block attributes (defaults to {})
        # @param content [Object, nil] Block content
        # @param level [Object, nil] Block level
        # @param block_type [String, nil] Block type; wins over the positional value
        #
        # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
        def initialize(hash_or_type = nil, children: nil, attributes: nil, content: nil, level: nil, block_type: nil)
          if hash_or_type.is_a?(Hash)
            # Initialize from hash
            @block_type = hash_or_type[:block_type] || hash_or_type['block_type'] || ''
            @children = hash_or_type[:children] || hash_or_type['children']
            @attributes = hash_or_type[:attributes] || hash_or_type['attributes'] || {}
            @content = hash_or_type[:content] || hash_or_type['content']
            @level = hash_or_type[:level] || hash_or_type['level']
          else
            # Initialize from keyword arguments (for backward compatibility)
            @block_type = block_type || hash_or_type || ''
            @children = children || []
            @attributes = attributes || {}
            @content = content
            @level = level
          end
        end
        # rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity

        # @return [Hash] Symbol-keyed hash of the block; nil fields are omitted
        def to_h
          {
            block_type: @block_type,
            children: @children,
            attributes: @attributes,
            content: @content,
            level: @level
          }.compact
        end
      end

      # Represents an image in Djot content
      class DjotImage
        attr_reader :url, :alt, :title, :width, :height
        alias src url

        # Build an image either from a single hash or from keyword arguments.
        #
        # @param hash_or_url [Hash, String, nil] A hash describing the image
        #   ('url' is preferred over 'src'), or the URL as a positional value
        # @param alt [String, nil] Alternative text
        # @param title [String, nil] Image title
        # @param width [Object, nil] Image width
        # @param height [Object, nil] Image height
        # @param url [String, nil] Image URL; wins over +src+ and the positional value
        # @param src [String, nil] Alternative name for +url+
        #
        # rubocop:disable Metrics/CyclomaticComplexity
        def initialize(hash_or_url = nil, alt: nil, title: nil, width: nil, height: nil, url: nil, src: nil)
          if hash_or_url.is_a?(Hash)
            # Initialize from hash (supports both 'url' and 'src' keys)
            @url = hash_or_url[:url] || hash_or_url['url'] || hash_or_url[:src] || hash_or_url['src']
            @alt = hash_or_url[:alt] || hash_or_url['alt']
            @title = hash_or_url[:title] || hash_or_url['title']
            @width = hash_or_url[:width] || hash_or_url['width']
            @height = hash_or_url[:height] || hash_or_url['height']
          else
            # Initialize from keyword arguments
            @url = url || src || hash_or_url
            @alt = alt
            @title = title
            @width = width
            @height = height
          end
        end
        # rubocop:enable Metrics/CyclomaticComplexity

        # @return [Hash] Symbol-keyed hash of the image; nil fields are omitted
        def to_h
          {
            url: @url,
            alt: @alt,
            title: @title,
            width: @width,
            height: @height
          }.compact
        end
      end

      # Represents a link in Djot content
      class DjotLink
        attr_reader :url, :text, :title, :link_type
        alias href url

        # Build a link either from a single hash or from keyword arguments.
        #
        # @param hash_or_url [Hash, String, nil] A hash describing the link
        #   ('url' is preferred over 'href'), or the URL as a positional value
        # @param text [String, nil] Link text
        # @param title [String, nil] Link title
        # @param url [String, nil] Link target; wins over +href+ and the positional value
        # @param href [String, nil] Alternative name for +url+
        # @param link_type [String, nil] Link type
        #
        # rubocop:disable Metrics/CyclomaticComplexity
        def initialize(hash_or_url = nil, text: nil, title: nil, url: nil, href: nil, link_type: nil)
          if hash_or_url.is_a?(Hash)
            # Initialize from hash (supports both 'url' and 'href' keys)
            @url = hash_or_url[:url] || hash_or_url['url'] || hash_or_url[:href] || hash_or_url['href']
            @text = hash_or_url[:text] || hash_or_url['text']
            @title = hash_or_url[:title] || hash_or_url['title']
            @link_type = hash_or_url[:link_type] || hash_or_url['link_type']
          else
            # Initialize from keyword arguments
            @url = url || href || hash_or_url
            @text = text
            @title = title
            @link_type = link_type
          end
        end
        # rubocop:enable Metrics/CyclomaticComplexity

        # @return [Hash] Symbol-keyed hash of the link; nil fields are omitted
        def to_h
          {
            url: @url,
            text: @text,
            title: @title,
            link_type: @link_type
          }.compact
        end
      end

      # Represents a footnote in Djot content
      class Footnote
        attr_reader :label, :content

        # @param label [Object] Footnote label
        # @param content [Object] Footnote content
        def initialize(label:, content:)
          @label = label
          @content = content
        end

        # @return [Hash] Symbol-keyed hash of the footnote (nil fields are kept)
        def to_h
          {
            label: @label,
            content: @content
          }
        end
      end

      # Build the content from a hash with String or Symbol keys.
      #
      # Missing sections fall back to empty values ('' / [] / {} / '{}').
      # Blocks, images, links and footnotes are wrapped in their value classes;
      # +tables+ and +attributes+ are stored as given.
      #
      # @param hash [Hash] Djot content data
      #
      # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
      def initialize(hash)
        @plain_text = hash['plain_text'] || hash[:plain_text] || ''
        @blocks = parse_blocks(hash['blocks'] || hash[:blocks] || [])
        @metadata_json = hash['metadata_json'] || hash[:metadata_json] || '{}'
        @tables = hash['tables'] || hash[:tables] || []
        @images = parse_images(hash['images'] || hash[:images] || [])
        @links = parse_links(hash['links'] || hash[:links] || [])
        @footnotes = parse_footnotes(hash['footnotes'] || hash[:footnotes] || [])
        @attributes = hash['attributes'] || hash[:attributes] || {}
      end
      # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity

      # Parsed form of +metadata_json+, computed on first access and memoized.
      #
      # @return [Object] Parsed JSON value, or {} when the JSON cannot be parsed
      def metadata
        @metadata ||= parse_metadata(@metadata_json)
      end

      # @return [Hash] Symbol-keyed hash with nested value objects converted via +to_h+
      def to_h
        {
          plain_text: @plain_text,
          blocks: @blocks.map(&:to_h),
          metadata_json: @metadata_json,
          tables: @tables,
          images: @images.map(&:to_h),
          links: @links.map(&:to_h),
          footnotes: @footnotes.map(&:to_h),
          attributes: @attributes
        }
      end

      private

      # Parse the metadata JSON string, falling back to {} when it is malformed
      # or is not a String at all (JSON.parse raises TypeError in that case).
      def parse_metadata(metadata_json)
        JSON.parse(metadata_json)
      rescue JSON::ParserError, TypeError
        {}
      end

      # Wrap raw block hashes in FormattedBlock objects.
      #
      # +content+ and +level+ are forwarded along with the other fields so that
      # they are not lost. Children are passed through as given (not wrapped).
      #
      # rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
      def parse_blocks(blocks_data)
        blocks_data.map do |block|
          FormattedBlock.new(
            block_type: block['block_type'] || block[:block_type] || '',
            children: block['children'] || block[:children],
            attributes: block['attributes'] || block[:attributes],
            content: block['content'] || block[:content],
            level: block['level'] || block[:level]
          )
        end
      end
      # rubocop:enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity

      # Wrap raw image hashes in DjotImage objects ('url' preferred over 'src').
      #
      # rubocop:disable Metrics/CyclomaticComplexity
      def parse_images(images_data)
        images_data.map do |image|
          DjotImage.new(
            url: image['url'] || image[:url] || image['src'] || image[:src],
            alt: image['alt'] || image[:alt],
            title: image['title'] || image[:title],
            width: image['width'] || image[:width],
            height: image['height'] || image[:height]
          )
        end
      end
      # rubocop:enable Metrics/CyclomaticComplexity

      # Wrap raw link hashes in DjotLink objects ('url' preferred over 'href').
      #
      # rubocop:disable Metrics/CyclomaticComplexity
      def parse_links(links_data)
        links_data.map do |link|
          DjotLink.new(
            url: link['url'] || link[:url] || link['href'] || link[:href],
            text: link['text'] || link[:text],
            title: link['title'] || link[:title],
            link_type: link['link_type'] || link[:link_type]
          )
        end
      end
      # rubocop:enable Metrics/CyclomaticComplexity

      # Wrap raw footnote hashes in Footnote objects.
      def parse_footnotes(footnotes_data)
        footnotes_data.map do |note|
          Footnote.new(
            label: note['label'] || note[:label],
            content: note['content'] || note[:content]
          )
        end
      end
    end
  end
end