kreuzberg 4.1.2 → 4.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +26 -17
  5. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  6. data/kreuzberg.gemspec +13 -1
  7. data/lib/kreuzberg/cli.rb +16 -6
  8. data/lib/kreuzberg/cli_proxy.rb +3 -1
  9. data/lib/kreuzberg/config.rb +121 -39
  10. data/lib/kreuzberg/djot_content.rb +225 -0
  11. data/lib/kreuzberg/extraction_api.rb +20 -4
  12. data/lib/kreuzberg/result.rb +12 -2
  13. data/lib/kreuzberg/version.rb +1 -1
  14. data/lib/kreuzberg.rb +1 -0
  15. data/sig/kreuzberg.rbs +28 -12
  16. data/spec/binding/batch_operations_spec.rb +80 -0
  17. data/spec/binding/batch_spec.rb +6 -5
  18. data/spec/binding/error_recovery_spec.rb +3 -3
  19. data/spec/binding/metadata_types_spec.rb +77 -57
  20. data/spec/binding/tables_spec.rb +11 -2
  21. data/spec/serialization_spec.rb +134 -0
  22. data/spec/unit/config/output_format_spec.rb +380 -0
  23. data/vendor/Cargo.toml +1 -1
  24. data/vendor/kreuzberg/Cargo.toml +1 -1
  25. data/vendor/kreuzberg/README.md +1 -1
  26. data/vendor/kreuzberg/src/api/startup.rs +15 -1
  27. data/vendor/kreuzberg/src/core/config_validation/sections.rs +16 -4
  28. data/vendor/kreuzberg/src/core/extractor/file.rs +1 -2
  29. data/vendor/kreuzberg/src/core/extractor/mod.rs +2 -1
  30. data/vendor/kreuzberg/src/core/io.rs +7 -7
  31. data/vendor/kreuzberg/src/core/mime.rs +4 -4
  32. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  33. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +6 -0
  34. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  35. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  36. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  37. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  38. data/vendor/kreuzberg/src/plugins/mod.rs +1 -0
  39. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +251 -5
  40. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +150 -2
  41. data/vendor/kreuzberg/src/plugins/registry/processor.rs +213 -5
  42. data/vendor/kreuzberg/src/plugins/registry/validator.rs +220 -4
  43. data/vendor/kreuzberg/src/plugins/startup_validation.rs +385 -0
  44. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  45. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  46. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  47. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  48. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  49. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  50. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  51. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  52. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  53. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  54. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  55. data/vendor/kreuzberg/tests/config_behavioral.rs +416 -0
  56. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  57. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  58. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  59. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  60. data/vendor/kreuzberg/tests/core_integration.rs +57 -57
  61. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  62. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  63. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  64. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  65. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  66. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  67. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  68. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  69. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  70. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  71. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  72. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  73. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  74. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  75. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  76. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  77. data/vendor/kreuzberg/tests/mime_detection.rs +75 -43
  78. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  79. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  80. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  81. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  82. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  83. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  84. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  85. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  86. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  87. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  88. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  89. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  90. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  91. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  92. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +324 -31
  93. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  94. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  95. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  96. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  97. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  98. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  99. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  100. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  101. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  102. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  103. metadata +12 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6550daabf58e5e396576e5a83c6a53f226e677f9c129920c9990bba309fbd7ba
4
- data.tar.gz: 9595aa468666391d08a0962db589bbbc50d5bd1c8532e101efa234f6c523d7c5
3
+ metadata.gz: '010412940492f83bc170a4a4efc644ac7e3818502734035523796287837a3893'
4
+ data.tar.gz: af24cba007cc58283d678d4b15251ebae3a5740f60ade9a54cc19973a2022a82
5
5
  SHA512:
6
- metadata.gz: 0dea911deebe061515dd4cbff2b76b3a7947c68f196fcc576001d42d80386f6c53f8ed63e0e4acb8e719ad6f95c21e689df7aef5f6cbbbc0d1c92ef96ddb673c
7
- data.tar.gz: 0df091f80f7c73dda0c17d89d4aa0571cd01f0f2b697b187fd9bae28f8dbcf96cd2e3a269f9831a442b8cf46ce40608586d3d6a242d84bb394fe6056cba3b492
6
+ metadata.gz: ad67348bec54a01ca3592ed72e9b2b8bc9e711a37e11b40ada31466c67f834132fc0de278c53a1c014fa6751da7abebae934cff2a9cc1835f7e056c895a273cb
7
+ data.tar.gz: ca2cdb076a5d1af67f0e807978a966d1a391cc286bcdf5499544e3403196140918a54674beab77ea09fc0e8bc7ab66f357da5d984326a511b1d21643a3d6cf41
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.1.2)
4
+ kreuzberg (4.2.1)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -207,7 +207,7 @@ CHECKSUMS
207
207
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
208
208
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
209
209
  json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
210
- kreuzberg (4.1.2)
210
+ kreuzberg (4.2.1)
211
211
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
212
212
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
213
213
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.1.2" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.1" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
data/ext/kreuzberg_rb/native/Cargo.lock CHANGED
@@ -716,9 +716,9 @@ dependencies = [
716
716
 
717
717
  [[package]]
718
718
  name = "cc"
719
- version = "1.2.53"
719
+ version = "1.2.54"
720
720
  source = "registry+https://github.com/rust-lang/crates.io-index"
721
- checksum = "755d2fce177175ffca841e9a06afdb2c4ab0f593d53b4dee48147dfaade85932"
721
+ checksum = "6354c81bbfd62d9cfa9cb3c773c2b7b2a3a482d569de977fd0e961f6e7c00583"
722
722
  dependencies = [
723
723
  "find-msvc-tools",
724
724
  "jobserver",
@@ -2027,9 +2027,9 @@ dependencies = [
2027
2027
 
2028
2028
  [[package]]
2029
2029
  name = "html-to-markdown-rs"
2030
- version = "2.23.1"
2030
+ version = "2.23.4"
2031
2031
  source = "registry+https://github.com/rust-lang/crates.io-index"
2032
- checksum = "e1f4d6781ac8dd203853803d27054ca4153c7fd0f3956cb7fc95dc06f42a1c46"
2032
+ checksum = "ffbf49450676163bdf69fac2acf72674fcf2d2aaf690aa247368c567cc9afb2a"
2033
2033
  dependencies = [
2034
2034
  "astral-tl",
2035
2035
  "base64 0.22.1",
@@ -2627,7 +2627,7 @@ dependencies = [
2627
2627
 
2628
2628
  [[package]]
2629
2629
  name = "kreuzberg"
2630
- version = "4.1.0"
2630
+ version = "4.2.0"
2631
2631
  dependencies = [
2632
2632
  "ahash",
2633
2633
  "async-trait",
@@ -2706,17 +2706,18 @@ dependencies = [
2706
2706
  "uuid",
2707
2707
  "whatlang",
2708
2708
  "yake-rust",
2709
- "zip 7.1.0",
2709
+ "zip 7.2.0",
2710
2710
  ]
2711
2711
 
2712
2712
  [[package]]
2713
2713
  name = "kreuzberg-ffi"
2714
- version = "4.1.0"
2714
+ version = "4.2.0"
2715
2715
  dependencies = [
2716
2716
  "async-trait",
2717
2717
  "cbindgen",
2718
2718
  "html-to-markdown-rs",
2719
2719
  "kreuzberg",
2720
+ "log",
2720
2721
  "serde",
2721
2722
  "serde_json",
2722
2723
  "tokio",
@@ -2767,14 +2768,14 @@ dependencies = [
2767
2768
 
2768
2769
  [[package]]
2769
2770
  name = "kreuzberg-tesseract"
2770
- version = "4.1.0"
2771
+ version = "4.2.0"
2771
2772
  dependencies = [
2772
2773
  "cc",
2773
2774
  "cmake",
2774
2775
  "libc",
2775
2776
  "reqwest 0.13.1",
2776
2777
  "thiserror 2.0.18",
2777
- "zip 7.1.0",
2778
+ "zip 7.2.0",
2778
2779
  ]
2779
2780
 
2780
2781
  [[package]]
@@ -5022,9 +5023,9 @@ dependencies = [
5022
5023
 
5023
5024
  [[package]]
5024
5025
  name = "rmcp"
5025
- version = "0.13.0"
5026
+ version = "0.14.0"
5026
5027
  source = "registry+https://github.com/rust-lang/crates.io-index"
5027
- checksum = "d1815dbc06c414d720f8bc1951eccd66bc99efc6376331f1e7093a119b3eb508"
5028
+ checksum = "0a621b37a548ff6ab6292d57841eb25785a7f146d89391a19c9f199414bd13da"
5028
5029
  dependencies = [
5029
5030
  "async-trait",
5030
5031
  "axum",
@@ -5054,9 +5055,9 @@ dependencies = [
5054
5055
 
5055
5056
  [[package]]
5056
5057
  name = "rmcp-macros"
5057
- version = "0.13.0"
5058
+ version = "0.14.0"
5058
5059
  source = "registry+https://github.com/rust-lang/crates.io-index"
5059
- checksum = "11f0bc7008fa102e771a76c6d2c9b253be3f2baa5964e060464d038ae1cbc573"
5060
+ checksum = "6b79ed92303f9262db79575aa8c3652581668e9d136be6fd0b9ededa78954c95"
5060
5061
  dependencies = [
5061
5062
  "darling 0.23.0",
5062
5063
  "proc-macro2",
@@ -6013,6 +6014,7 @@ checksum = "91e7d9e3bb61134e77bde20dd4825b97c010155709965fedf0f49bb138e52a9d"
6013
6014
  dependencies = [
6014
6015
  "deranged",
6015
6016
  "itoa",
6017
+ "js-sys",
6016
6018
  "num-conv",
6017
6019
  "powerfmt",
6018
6020
  "serde",
@@ -6405,6 +6407,12 @@ version = "0.25.1"
6405
6407
  source = "registry+https://github.com/rust-lang/crates.io-index"
6406
6408
  checksum = "d2df906b07856748fa3f6e0ad0cbaa047052d4a7dd609e231c4f72cee8c36f31"
6407
6409
 
6410
+ [[package]]
6411
+ name = "typed-path"
6412
+ version = "0.12.1"
6413
+ source = "registry+https://github.com/rust-lang/crates.io-index"
6414
+ checksum = "e43ffa54726cdc9ea78392023ffe9fe9cf9ac779e1c6fcb0d23f9862e3879d20"
6415
+
6408
6416
  [[package]]
6409
6417
  name = "typeid"
6410
6418
  version = "1.0.3"
@@ -6698,9 +6706,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821"
6698
6706
 
6699
6707
  [[package]]
6700
6708
  name = "uuid"
6701
- version = "1.19.0"
6709
+ version = "1.20.0"
6702
6710
  source = "registry+https://github.com/rust-lang/crates.io-index"
6703
- checksum = "e2e054861b4bd027cd373e18e8d8d8e6548085000e41290d95ce0c373a654b4a"
6711
+ checksum = "ee48d38b119b0cd71fe4141b30f5ba9c7c5d9f4e7a3a8b4a674e4b6ef789976f"
6704
6712
  dependencies = [
6705
6713
  "getrandom 0.3.4",
6706
6714
  "js-sys",
@@ -7498,9 +7506,9 @@ dependencies = [
7498
7506
 
7499
7507
  [[package]]
7500
7508
  name = "zip"
7501
- version = "7.1.0"
7509
+ version = "7.2.0"
7502
7510
  source = "registry+https://github.com/rust-lang/crates.io-index"
7503
- checksum = "9013f1222db8a6d680f13a7ccdc60a781199cd09c2fa4eff58e728bb181757fc"
7511
+ checksum = "c42e33efc22a0650c311c2ef19115ce232583abbe80850bc8b66509ebef02de0"
7504
7512
  dependencies = [
7505
7513
  "aes",
7506
7514
  "bzip2",
@@ -7518,6 +7526,7 @@ dependencies = [
7518
7526
  "ppmd-rust",
7519
7527
  "sha1",
7520
7528
  "time",
7529
+ "typed-path",
7521
7530
  "zeroize",
7522
7531
  "zopfli",
7523
7532
  "zstd",
data/kreuzberg.gemspec CHANGED
@@ -130,10 +130,22 @@ vendor_files = Dir.chdir(__dir__) do
130
130
  kreuzberg_files + kreuzberg_ffi_files + kreuzberg_tesseract_files + rb_sys_files + workspace_toml
131
131
  end
132
132
 
133
+ # When vendor files exist, get ext/ files from filesystem (to include modified Cargo.toml
134
+ # with vendor paths) instead of from git (which has original 5-level crate paths)
135
+ ext_files_from_fs = Dir.chdir(__dir__) do
136
+ Dir.glob('ext/**/*', File::FNM_DOTMATCH)
137
+ .reject { |f| File.directory?(f) }
138
+ .reject { |f| f.include?('/target/') }
139
+ .grep_v(/\.(swp|bak|tmp)$/)
140
+ .grep_v(/~$/)
141
+ end
142
+
133
143
  files = if (ruby_files + core_files + ffi_files).empty?
134
144
  fallback_files
135
145
  elsif vendor_files.any?
136
- ruby_files + vendor_files
146
+ # Use ext/ files from filesystem (modified by vendor script) + non-ext ruby files from git
147
+ non_ext_ruby_files = ruby_files.reject { |f| f.start_with?('ext/') }
148
+ non_ext_ruby_files + ext_files_from_fs + vendor_files
137
149
  else
138
150
  ruby_files + core_files + ffi_files
139
151
  end
data/lib/kreuzberg/cli.rb CHANGED
@@ -8,24 +8,34 @@ module Kreuzberg
8
8
 
9
9
  # Extract content from a file using the CLI
10
10
  #
11
- # @param path [String] Path to the file
11
+ # @param path_or_nil [String, nil] Path to the file (positional, for backward compatibility)
12
+ # @param path [String] Path to the file (keyword argument)
12
13
  # @param output [String] Output format ("text", "json", "markdown")
13
14
  # @param ocr [Boolean] Enable OCR
14
15
  # @return [String] Extracted content
15
16
  #
16
- def extract(path, output: 'text', ocr: false)
17
- args = ['extract', path, '--format', output]
17
+ def extract(path_or_nil = nil, path: nil, output: 'text', ocr: false)
18
+ # Support both positional and keyword argument for path (backward compatibility)
19
+ actual_path = path_or_nil || path
20
+ raise ArgumentError, 'path is required' if actual_path.nil?
21
+
22
+ args = ['extract', actual_path, '--format', output]
18
23
  args.push('--ocr', ocr ? 'true' : 'false')
19
24
  CLIProxy.call(args)
20
25
  end
21
26
 
22
27
  # Detect MIME type of a file using the CLI
23
28
  #
24
- # @param path [String] Path to the file
29
+ # @param path_or_nil [String, nil] Path to the file (positional, for backward compatibility)
30
+ # @param path [String] Path to the file (keyword argument)
25
31
  # @return [String] MIME type
26
32
  #
27
- def detect(path)
28
- CLIProxy.call(['detect', path]).strip
33
+ def detect(path_or_nil = nil, path: nil)
34
+ # Support both positional and keyword argument for path (backward compatibility)
35
+ actual_path = path_or_nil || path
36
+ raise ArgumentError, 'path is required' if actual_path.nil?
37
+
38
+ CLIProxy.call(['detect', actual_path]).strip
29
39
  end
30
40
 
31
41
  # Get CLI version
data/lib/kreuzberg/cli_proxy.rb CHANGED
@@ -91,11 +91,13 @@ module Kreuzberg
91
91
  lib_path.join('bin', binary_name),
92
92
  lib_path.join(binary_name),
93
93
  root_path.join('../../crates/kreuzberg-cli/target/release', binary_name),
94
- root_path.join('../../target/release', binary_name)
94
+ root_path.join('../../target/release', binary_name),
95
+ root_path.join('../../target/debug', binary_name)
95
96
  ]
96
97
 
97
98
  workspace_root = root_path.parent&.parent
98
99
  paths << workspace_root.join('target', 'release', binary_name) if workspace_root
100
+ paths << workspace_root.join('target', 'debug', binary_name) if workspace_root
99
101
 
100
102
  paths
101
103
  end
data/lib/kreuzberg/config.rb CHANGED
@@ -715,9 +715,12 @@ module Kreuzberg
715
715
  class Extraction
716
716
  attr_reader :use_cache, :enable_quality_processing, :force_ocr,
717
717
  :ocr, :chunking, :language_detection, :pdf_options,
718
- :image_extraction, :image_preprocessing, :postprocessor,
718
+ :images, :image_preprocessing, :postprocessor,
719
719
  :token_reduction, :keywords, :html_options, :pages,
720
- :max_concurrent_extractions
720
+ :max_concurrent_extractions, :output_format, :result_format
721
+
722
+ # Alias for backward compatibility - image_extraction is the canonical name
723
+ alias image_extraction images
721
724
 
722
725
  # Load configuration from a file.
723
726
  #
@@ -738,7 +741,7 @@ module Kreuzberg
738
741
  use_cache enable_quality_processing force_ocr ocr chunking
739
742
  language_detection pdf_options image_extraction image_preprocessing
740
743
  postprocessor token_reduction keywords html_options pages
741
- max_concurrent_extractions
744
+ max_concurrent_extractions output_format result_format
742
745
  ].freeze
743
746
 
744
747
  # Aliases for backward compatibility
@@ -746,6 +749,12 @@ module Kreuzberg
746
749
  images: :image_extraction
747
750
  }.freeze
748
751
 
752
+ # Valid output format values (case-insensitive, normalized internally)
753
+ VALID_OUTPUT_FORMATS = %w[plain markdown html djot].freeze
754
+
755
+ # Valid result format values (case-insensitive, normalized internally)
756
+ VALID_RESULT_FORMATS = %w[unified elements element_based].freeze
757
+
749
758
  def self.from_file(path)
750
759
  hash = Kreuzberg._config_from_file_native(path)
751
760
  new(**normalize_hash_keys(hash))
@@ -789,41 +798,87 @@ module Kreuzberg
789
798
  new(**normalize_hash_keys(hash))
790
799
  end
791
800
 
792
- def initialize(
793
- use_cache: true,
794
- enable_quality_processing: false,
795
- force_ocr: false,
796
- ocr: nil,
797
- chunking: nil,
798
- language_detection: nil,
799
- pdf_options: nil,
800
- image_extraction: nil,
801
- image_preprocessing: nil,
802
- postprocessor: nil,
803
- token_reduction: nil,
804
- keywords: nil,
805
- html_options: nil,
806
- pages: nil,
807
- max_concurrent_extractions: nil
808
- )
809
- @use_cache = use_cache ? true : false
810
- @enable_quality_processing = enable_quality_processing ? true : false
811
- @force_ocr = force_ocr ? true : false
812
- @ocr = normalize_config(ocr, OCR)
813
- @chunking = normalize_config(chunking, Chunking)
814
- @language_detection = normalize_config(language_detection, LanguageDetection)
815
- @pdf_options = normalize_config(pdf_options, PDF)
816
- @image_extraction = normalize_config(image_extraction, ImageExtraction)
817
- @image_preprocessing = normalize_config(image_preprocessing, ImagePreprocessing)
818
- @postprocessor = normalize_config(postprocessor, PostProcessor)
819
- @token_reduction = normalize_config(token_reduction, TokenReduction)
820
- @keywords = normalize_config(keywords, Keywords)
821
- @html_options = normalize_config(html_options, HtmlOptions)
822
- @pages = normalize_config(pages, PageConfig)
823
- @max_concurrent_extractions = max_concurrent_extractions&.to_i
801
+ def initialize(hash = nil,
802
+ use_cache: true,
803
+ enable_quality_processing: false,
804
+ force_ocr: false,
805
+ ocr: nil,
806
+ chunking: nil,
807
+ language_detection: nil,
808
+ pdf_options: nil,
809
+ image_extraction: nil,
810
+ image_preprocessing: nil,
811
+ postprocessor: nil,
812
+ token_reduction: nil,
813
+ keywords: nil,
814
+ html_options: nil,
815
+ pages: nil,
816
+ max_concurrent_extractions: nil,
817
+ output_format: nil,
818
+ result_format: nil)
819
+ kwargs = {
820
+ use_cache: use_cache, enable_quality_processing: enable_quality_processing,
821
+ force_ocr: force_ocr, ocr: ocr, chunking: chunking, language_detection: language_detection,
822
+ pdf_options: pdf_options, image_extraction: image_extraction,
823
+ image_preprocessing: image_preprocessing, postprocessor: postprocessor,
824
+ token_reduction: token_reduction, keywords: keywords, html_options: html_options,
825
+ pages: pages, max_concurrent_extractions: max_concurrent_extractions,
826
+ output_format: output_format, result_format: result_format
827
+ }
828
+ extracted = extract_from_hash(hash, kwargs)
829
+
830
+ assign_attributes(extracted)
831
+ end
832
+
833
+ def extract_from_hash(hash, defaults)
834
+ return defaults unless hash.is_a?(Hash)
835
+
836
+ hash = hash.transform_keys(&:to_sym)
837
+ defaults.merge(hash.slice(*defaults.keys))
838
+ end
839
+
840
+ def assign_attributes(params)
841
+ @use_cache = params[:use_cache] ? true : false
842
+ @enable_quality_processing = params[:enable_quality_processing] ? true : false
843
+ @force_ocr = params[:force_ocr] ? true : false
844
+ @ocr = normalize_config(params[:ocr], OCR)
845
+ @chunking = normalize_config(params[:chunking], Chunking)
846
+ @language_detection = normalize_config(params[:language_detection], LanguageDetection)
847
+ @pdf_options = normalize_config(params[:pdf_options], PDF)
848
+ @images = normalize_config(params[:image_extraction], ImageExtraction)
849
+ @image_preprocessing = normalize_config(params[:image_preprocessing], ImagePreprocessing)
850
+ @postprocessor = normalize_config(params[:postprocessor], PostProcessor)
851
+ @token_reduction = normalize_config(params[:token_reduction], TokenReduction)
852
+ @keywords = normalize_config(params[:keywords], Keywords)
853
+ @html_options = normalize_config(params[:html_options], HtmlOptions)
854
+ @pages = normalize_config(params[:pages], PageConfig)
855
+ @max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
856
+ @output_format = validate_output_format(params[:output_format])
857
+ @result_format = validate_result_format(params[:result_format])
858
+ end
859
+
860
+ def validate_output_format(value)
861
+ return nil if value.nil?
862
+
863
+ str_value = value.to_s.downcase
864
+ return str_value if VALID_OUTPUT_FORMATS.include?(str_value)
865
+
866
+ raise ArgumentError,
867
+ "Invalid output_format: #{value}. Valid values: #{VALID_OUTPUT_FORMATS.join(', ')}"
868
+ end
869
+
870
+ def validate_result_format(value)
871
+ return nil if value.nil?
872
+
873
+ str_value = value.to_s.downcase
874
+ return str_value if VALID_RESULT_FORMATS.include?(str_value)
875
+
876
+ raise ArgumentError,
877
+ "Invalid result_format: #{value}. Valid values: #{VALID_RESULT_FORMATS.join(', ')}"
824
878
  end
825
879
 
826
880
  # rubocop:disable Metrics/CyclomaticComplexity
881
+ # rubocop:disable Metrics/MethodLength
827
882
  def to_h
828
883
  {
829
884
  use_cache: @use_cache,
@@ -833,16 +888,19 @@ module Kreuzberg
833
888
  chunking: @chunking&.to_h,
834
889
  language_detection: @language_detection&.to_h,
835
890
  pdf_options: @pdf_options&.to_h,
836
- image_extraction: @image_extraction&.to_h,
891
+ images: @images&.to_h,
837
892
  image_preprocessing: @image_preprocessing&.to_h,
838
893
  postprocessor: @postprocessor&.to_h,
839
894
  token_reduction: @token_reduction&.to_h,
840
895
  keywords: @keywords&.to_h,
841
896
  html_options: @html_options&.to_h,
842
897
  pages: @pages&.to_h,
843
- max_concurrent_extractions: @max_concurrent_extractions
898
+ max_concurrent_extractions: @max_concurrent_extractions,
899
+ output_format: @output_format,
900
+ result_format: @result_format
844
901
  }.compact
845
902
  end
903
+ # rubocop:enable Metrics/MethodLength
846
904
  # rubocop:enable Metrics/CyclomaticComplexity
847
905
 
848
906
  # Serialize configuration to JSON string
@@ -966,7 +1024,7 @@ module Kreuzberg
966
1024
  when :pdf_options
967
1025
  @pdf_options = normalize_config(value, PDF)
968
1026
  when :image_extraction
969
- @image_extraction = normalize_config(value, ImageExtraction)
1027
+ @images = normalize_config(value, ImageExtraction)
970
1028
  when :image_preprocessing
971
1029
  @image_preprocessing = normalize_config(value, ImagePreprocessing)
972
1030
  when :postprocessor
@@ -981,6 +1039,10 @@ module Kreuzberg
981
1039
  @pages = normalize_config(value, PageConfig)
982
1040
  when :max_concurrent_extractions
983
1041
  @max_concurrent_extractions = value&.to_i
1042
+ when :output_format
1043
+ @output_format = validate_output_format(value)
1044
+ when :result_format
1045
+ @result_format = validate_result_format(value)
984
1046
  else
985
1047
  raise ArgumentError, "Unknown configuration key: #{key}"
986
1048
  end
@@ -1002,6 +1064,24 @@ module Kreuzberg
1002
1064
  nil
1003
1065
  end
1004
1066
 
1067
+ # Set output_format attribute
1068
+ #
1069
+ # @param value [String, nil] Output format value
1070
+ # @return [String, nil] The value that was set
1071
+ #
1072
+ def output_format=(value)
1073
+ @output_format = validate_output_format(value)
1074
+ end
1075
+
1076
+ # Set result_format attribute
1077
+ #
1078
+ # @param value [String, nil] Result format value
1079
+ # @return [String, nil] The value that was set
1080
+ #
1081
+ def result_format=(value)
1082
+ @result_format = validate_result_format(value)
1083
+ end
1084
+
1005
1085
  private
1006
1086
 
1007
1087
  def normalize_config(value, klass)
@@ -1020,7 +1100,7 @@ module Kreuzberg
1020
1100
  @chunking = merged.chunking
1021
1101
  @language_detection = merged.language_detection
1022
1102
  @pdf_options = merged.pdf_options
1023
- @image_extraction = merged.image_extraction
1103
+ @images = merged.image_extraction
1024
1104
  @image_preprocessing = merged.image_preprocessing
1025
1105
  @postprocessor = merged.postprocessor
1026
1106
  @token_reduction = merged.token_reduction
@@ -1028,6 +1108,8 @@ module Kreuzberg
1028
1108
  @html_options = merged.html_options
1029
1109
  @pages = merged.pages
1030
1110
  @max_concurrent_extractions = merged.max_concurrent_extractions
1111
+ @output_format = merged.output_format
1112
+ @result_format = merged.result_format
1031
1113
  end
1032
1114
  end
1033
1115
  end