kreuzberg 4.1.2 → 4.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/libpdfium.so +0 -0
  5. data/kreuzberg.gemspec +13 -1
  6. data/lib/kreuzberg/config.rb +70 -35
  7. data/lib/kreuzberg/version.rb +1 -1
  8. data/sig/kreuzberg.rbs +5 -1
  9. data/spec/binding/batch_operations_spec.rb +80 -0
  10. data/spec/binding/metadata_types_spec.rb +77 -57
  11. data/spec/serialization_spec.rb +134 -0
  12. data/spec/unit/config/output_format_spec.rb +380 -0
  13. data/vendor/Cargo.toml +1 -1
  14. data/vendor/kreuzberg/Cargo.toml +1 -1
  15. data/vendor/kreuzberg/README.md +1 -1
  16. data/vendor/kreuzberg/src/embeddings.rs +4 -4
  17. data/vendor/kreuzberg/src/mcp/format.rs +237 -39
  18. data/vendor/kreuzberg/src/mcp/params.rs +26 -33
  19. data/vendor/kreuzberg/src/mcp/server.rs +6 -3
  20. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +16 -23
  21. data/vendor/kreuzberg/tests/api_chunk.rs +40 -30
  22. data/vendor/kreuzberg/tests/api_consistency.rs +349 -0
  23. data/vendor/kreuzberg/tests/api_embed.rs +84 -50
  24. data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +8 -2
  25. data/vendor/kreuzberg/tests/api_tests.rs +298 -139
  26. data/vendor/kreuzberg/tests/archive_integration.rs +63 -56
  27. data/vendor/kreuzberg/tests/batch_orchestration.rs +22 -14
  28. data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +13 -13
  29. data/vendor/kreuzberg/tests/batch_processing.rs +13 -9
  30. data/vendor/kreuzberg/tests/bibtex_parity_test.rs +10 -10
  31. data/vendor/kreuzberg/tests/concurrency_stress.rs +10 -6
  32. data/vendor/kreuzberg/tests/config_behavioral.rs +414 -0
  33. data/vendor/kreuzberg/tests/config_features.rs +19 -15
  34. data/vendor/kreuzberg/tests/config_integration_test.rs +68 -68
  35. data/vendor/kreuzberg/tests/config_loading_tests.rs +71 -62
  36. data/vendor/kreuzberg/tests/contract_mcp.rs +314 -0
  37. data/vendor/kreuzberg/tests/core_integration.rs +55 -53
  38. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +23 -23
  39. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +15 -14
  40. data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +4 -4
  41. data/vendor/kreuzberg/tests/email_integration.rs +7 -7
  42. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +2 -2
  43. data/vendor/kreuzberg/tests/error_handling.rs +13 -11
  44. data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +2 -2
  45. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  46. data/vendor/kreuzberg/tests/instrumentation_test.rs +18 -13
  47. data/vendor/kreuzberg/tests/jats_extractor_tests.rs +17 -17
  48. data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +9 -9
  49. data/vendor/kreuzberg/tests/keywords_integration.rs +25 -25
  50. data/vendor/kreuzberg/tests/keywords_quality.rs +9 -9
  51. data/vendor/kreuzberg/tests/latex_extractor_tests.rs +2 -2
  52. data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +1 -1
  53. data/vendor/kreuzberg/tests/mcp_integration.rs +849 -0
  54. data/vendor/kreuzberg/tests/mime_detection.rs +72 -41
  55. data/vendor/kreuzberg/tests/ocr_errors.rs +10 -4
  56. data/vendor/kreuzberg/tests/ocr_language_registry.rs +1 -1
  57. data/vendor/kreuzberg/tests/ocr_stress.rs +3 -3
  58. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +6 -6
  59. data/vendor/kreuzberg/tests/opml_extractor_tests.rs +2 -2
  60. data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +2 -2
  61. data/vendor/kreuzberg/tests/page_markers.rs +1 -1
  62. data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +6 -6
  63. data/vendor/kreuzberg/tests/pdf_text_merging.rs +2 -2
  64. data/vendor/kreuzberg/tests/pipeline_integration.rs +77 -61
  65. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +97 -77
  66. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +78 -61
  67. data/vendor/kreuzberg/tests/plugin_system.rs +49 -46
  68. data/vendor/kreuzberg/tests/plugin_validator_test.rs +109 -97
  69. data/vendor/kreuzberg/tests/pptx_regression_tests.rs +40 -30
  70. data/vendor/kreuzberg/tests/registry_integration_tests.rs +26 -23
  71. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +1 -1
  72. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +18 -18
  73. data/vendor/kreuzberg/tests/security_validation.rs +20 -19
  74. data/vendor/kreuzberg/tests/serialization_integration.rs +112 -0
  75. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +36 -36
  76. data/vendor/kreuzberg/tests/test_fastembed.rs +8 -8
  77. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +9 -9
  78. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +12 -9
  79. data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
  80. metadata +10 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 6550daabf58e5e396576e5a83c6a53f226e677f9c129920c9990bba309fbd7ba
4
- data.tar.gz: 9595aa468666391d08a0962db589bbbc50d5bd1c8532e101efa234f6c523d7c5
3
+ metadata.gz: 9a1c9adffca7d75c142bd661f1d481b1aee00d97c6f62dcc70292f37978bcc17
4
+ data.tar.gz: 227af2ed45bff1dfa9afebd69220d15a41b2e476bf97f8a83173d21aab8b88e1
5
5
  SHA512:
6
- metadata.gz: 0dea911deebe061515dd4cbff2b76b3a7947c68f196fcc576001d42d80386f6c53f8ed63e0e4acb8e719ad6f95c21e689df7aef5f6cbbbc0d1c92ef96ddb673c
7
- data.tar.gz: 0df091f80f7c73dda0c17d89d4aa0571cd01f0f2b697b187fd9bae28f8dbcf96cd2e3a269f9831a442b8cf46ce40608586d3d6a242d84bb394fe6056cba3b492
6
+ metadata.gz: 0d1b0081f89a73f5422e68a714fc415f6d290dd8be7cf0ba6d454cfdf1938ebdac4919358b25d6e5a0bc1a209e1b165062a0341d28cde1b3fa0595bffec837f5
7
+ data.tar.gz: fc5a5f29309c29fbbf63ba035cf5e462e78b15c2afc239fb333bd8b6e70ef061219822ed4d533f81bb35cdd62db84da8c01e8f172561b0f7fb802b848b491c0a
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.1.2)
4
+ kreuzberg (4.2.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -207,7 +207,7 @@ CHECKSUMS
207
207
  i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
208
208
  io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
209
209
  json (2.18.0) sha256=b10506aee4183f5cf49e0efc48073d7b75843ce3782c68dbeb763351c08fd505
210
- kreuzberg (4.1.2)
210
+ kreuzberg (4.2.0)
211
211
  language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
212
212
  lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
213
213
  listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
data/README.md CHANGED
@@ -22,7 +22,7 @@
22
22
  <img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
23
23
  </a>
24
24
  <a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
25
- <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.1.2" alt="Go">
25
+ <img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.2.0" alt="Go">
26
26
  </a>
27
27
  <a href="https://www.nuget.org/packages/Kreuzberg/">
28
28
  <img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
data/kreuzberg.gemspec CHANGED
@@ -130,10 +130,22 @@ vendor_files = Dir.chdir(__dir__) do
130
130
  kreuzberg_files + kreuzberg_ffi_files + kreuzberg_tesseract_files + rb_sys_files + workspace_toml
131
131
  end
132
132
 
133
+ # When vendor files exist, get ext/ files from filesystem (to include modified Cargo.toml
134
+ # with vendor paths) instead of from git (which has original 5-level crate paths)
135
+ ext_files_from_fs = Dir.chdir(__dir__) do
136
+ Dir.glob('ext/**/*', File::FNM_DOTMATCH)
137
+ .reject { |f| File.directory?(f) }
138
+ .reject { |f| f.include?('/target/') }
139
+ .grep_v(/\.(swp|bak|tmp)$/)
140
+ .grep_v(/~$/)
141
+ end
142
+
133
143
  files = if (ruby_files + core_files + ffi_files).empty?
134
144
  fallback_files
135
145
  elsif vendor_files.any?
136
- ruby_files + vendor_files
146
+ # Use ext/ files from filesystem (modified by vendor script) + non-ext ruby files from git
147
+ non_ext_ruby_files = ruby_files.reject { |f| f.start_with?('ext/') }
148
+ non_ext_ruby_files + ext_files_from_fs + vendor_files
137
149
  else
138
150
  ruby_files + core_files + ffi_files
139
151
  end
@@ -717,7 +717,7 @@ module Kreuzberg
717
717
  :ocr, :chunking, :language_detection, :pdf_options,
718
718
  :image_extraction, :image_preprocessing, :postprocessor,
719
719
  :token_reduction, :keywords, :html_options, :pages,
720
- :max_concurrent_extractions
720
+ :max_concurrent_extractions, :output_format, :result_format
721
721
 
722
722
  # Load configuration from a file.
723
723
  #
@@ -738,7 +738,7 @@ module Kreuzberg
738
738
  use_cache enable_quality_processing force_ocr ocr chunking
739
739
  language_detection pdf_options image_extraction image_preprocessing
740
740
  postprocessor token_reduction keywords html_options pages
741
- max_concurrent_extractions
741
+ max_concurrent_extractions output_format result_format
742
742
  ].freeze
743
743
 
744
744
  # Aliases for backward compatibility
@@ -789,41 +789,67 @@ module Kreuzberg
789
789
  new(**normalize_hash_keys(hash))
790
790
  end
791
791
 
792
- def initialize(
793
- use_cache: true,
794
- enable_quality_processing: false,
795
- force_ocr: false,
796
- ocr: nil,
797
- chunking: nil,
798
- language_detection: nil,
799
- pdf_options: nil,
800
- image_extraction: nil,
801
- image_preprocessing: nil,
802
- postprocessor: nil,
803
- token_reduction: nil,
804
- keywords: nil,
805
- html_options: nil,
806
- pages: nil,
807
- max_concurrent_extractions: nil
808
- )
809
- @use_cache = use_cache ? true : false
810
- @enable_quality_processing = enable_quality_processing ? true : false
811
- @force_ocr = force_ocr ? true : false
812
- @ocr = normalize_config(ocr, OCR)
813
- @chunking = normalize_config(chunking, Chunking)
814
- @language_detection = normalize_config(language_detection, LanguageDetection)
815
- @pdf_options = normalize_config(pdf_options, PDF)
816
- @image_extraction = normalize_config(image_extraction, ImageExtraction)
817
- @image_preprocessing = normalize_config(image_preprocessing, ImagePreprocessing)
818
- @postprocessor = normalize_config(postprocessor, PostProcessor)
819
- @token_reduction = normalize_config(token_reduction, TokenReduction)
820
- @keywords = normalize_config(keywords, Keywords)
821
- @html_options = normalize_config(html_options, HtmlOptions)
822
- @pages = normalize_config(pages, PageConfig)
823
- @max_concurrent_extractions = max_concurrent_extractions&.to_i
792
+ def initialize(hash = nil,
793
+ use_cache: true,
794
+ enable_quality_processing: false,
795
+ force_ocr: false,
796
+ ocr: nil,
797
+ chunking: nil,
798
+ language_detection: nil,
799
+ pdf_options: nil,
800
+ image_extraction: nil,
801
+ image_preprocessing: nil,
802
+ postprocessor: nil,
803
+ token_reduction: nil,
804
+ keywords: nil,
805
+ html_options: nil,
806
+ pages: nil,
807
+ max_concurrent_extractions: nil,
808
+ output_format: nil,
809
+ result_format: nil)
810
+ kwargs = {
811
+ use_cache: use_cache, enable_quality_processing: enable_quality_processing,
812
+ force_ocr: force_ocr, ocr: ocr, chunking: chunking, language_detection: language_detection,
813
+ pdf_options: pdf_options, image_extraction: image_extraction,
814
+ image_preprocessing: image_preprocessing, postprocessor: postprocessor,
815
+ token_reduction: token_reduction, keywords: keywords, html_options: html_options,
816
+ pages: pages, max_concurrent_extractions: max_concurrent_extractions,
817
+ output_format: output_format, result_format: result_format
818
+ }
819
+ extracted = extract_from_hash(hash, kwargs)
820
+
821
+ assign_attributes(extracted)
822
+ end
823
+
824
+ def extract_from_hash(hash, defaults)
825
+ return defaults unless hash.is_a?(Hash)
826
+
827
+ hash = hash.transform_keys(&:to_sym)
828
+ defaults.merge(hash.slice(*defaults.keys))
829
+ end
830
+
831
+ def assign_attributes(params)
832
+ @use_cache = params[:use_cache] ? true : false
833
+ @enable_quality_processing = params[:enable_quality_processing] ? true : false
834
+ @force_ocr = params[:force_ocr] ? true : false
835
+ @ocr = normalize_config(params[:ocr], OCR)
836
+ @chunking = normalize_config(params[:chunking], Chunking)
837
+ @language_detection = normalize_config(params[:language_detection], LanguageDetection)
838
+ @pdf_options = normalize_config(params[:pdf_options], PDF)
839
+ @image_extraction = normalize_config(params[:image_extraction], ImageExtraction)
840
+ @image_preprocessing = normalize_config(params[:image_preprocessing], ImagePreprocessing)
841
+ @postprocessor = normalize_config(params[:postprocessor], PostProcessor)
842
+ @token_reduction = normalize_config(params[:token_reduction], TokenReduction)
843
+ @keywords = normalize_config(params[:keywords], Keywords)
844
+ @html_options = normalize_config(params[:html_options], HtmlOptions)
845
+ @pages = normalize_config(params[:pages], PageConfig)
846
+ @max_concurrent_extractions = params[:max_concurrent_extractions]&.to_i
847
+ @output_format = params[:output_format]&.to_s
848
+ @result_format = params[:result_format]&.to_s
824
849
  end
825
850
 
826
851
  # rubocop:disable Metrics/CyclomaticComplexity
852
+ # rubocop:disable Metrics/MethodLength
827
853
  def to_h
828
854
  {
829
855
  use_cache: @use_cache,
@@ -840,9 +866,12 @@ module Kreuzberg
840
866
  keywords: @keywords&.to_h,
841
867
  html_options: @html_options&.to_h,
842
868
  pages: @pages&.to_h,
843
- max_concurrent_extractions: @max_concurrent_extractions
869
+ max_concurrent_extractions: @max_concurrent_extractions,
870
+ output_format: @output_format,
871
+ result_format: @result_format
844
872
  }.compact
845
873
  end
874
+ # rubocop:enable Metrics/MethodLength
846
875
  # rubocop:enable Metrics/CyclomaticComplexity
847
876
 
848
877
  # Serialize configuration to JSON string
@@ -981,6 +1010,10 @@ module Kreuzberg
981
1010
  @pages = normalize_config(value, PageConfig)
982
1011
  when :max_concurrent_extractions
983
1012
  @max_concurrent_extractions = value&.to_i
1013
+ when :output_format
1014
+ @output_format = value&.to_s
1015
+ when :result_format
1016
+ @result_format = value&.to_s
984
1017
  else
985
1018
  raise ArgumentError, "Unknown configuration key: #{key}"
986
1019
  end
@@ -1028,6 +1061,8 @@ module Kreuzberg
1028
1061
  @html_options = merged.html_options
1029
1062
  @pages = merged.pages
1030
1063
  @max_concurrent_extractions = merged.max_concurrent_extractions
1064
+ @output_format = merged.output_format
1065
+ @result_format = merged.result_format
1031
1066
  end
1032
1067
  end
1033
1068
  end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Kreuzberg
4
- VERSION = '4.1.2'
4
+ VERSION = '4.2.0'
5
5
  end
data/sig/kreuzberg.rbs CHANGED
@@ -202,6 +202,8 @@ module Kreuzberg
202
202
  attr_reader html_options: HtmlOptions?
203
203
  attr_reader pages: PageConfig?
204
204
  attr_reader max_concurrent_extractions: Integer?
205
+ attr_reader output_format: String?
206
+ attr_reader result_format: String?
205
207
 
206
208
  def self.from_file: (String path) -> Extraction
207
209
  def initialize: (
@@ -219,7 +221,9 @@ module Kreuzberg
219
221
  ?keywords: (Keywords | Hash[Symbol, untyped])?,
220
222
  ?html_options: (HtmlOptions | Hash[Symbol, untyped])?,
221
223
  ?pages: (PageConfig | Hash[Symbol, untyped])?,
222
- ?max_concurrent_extractions: Integer?
224
+ ?max_concurrent_extractions: Integer?,
225
+ ?output_format: String?,
226
+ ?result_format: String?
223
227
  ) -> void
224
228
  def to_h: () -> Hash[Symbol, untyped]
225
229
 
@@ -592,4 +592,84 @@ RSpec.describe 'Batch Operations' do
592
592
  paths.each { |p| FileUtils.rm_f(p) }
593
593
  end
594
594
  end
595
+
596
+ describe 'batch with output and result formats' do
597
+ it 'batch processes with output_format' do
598
+ paths = []
599
+ file = Tempfile.new(['format_test', '.txt']).tap do |f|
600
+ f.write('Test content for output format')
601
+ f.close
602
+ end
603
+ paths << file.path
604
+
605
+ config = Kreuzberg::Config::Extraction.new(output_format: 'markdown')
606
+ results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
607
+
608
+ expect(results).to be_an Array
609
+ expect(results.length).to eq 1
610
+ expect(results[0]).to be_a Kreuzberg::Result
611
+
612
+ paths.each { |p| FileUtils.rm_f(p) }
613
+ end
614
+
615
+ it 'batch processes with result_format' do
616
+ paths = []
617
+ file = Tempfile.new(['format_test', '.txt']).tap do |f|
618
+ f.write('Test content for result format')
619
+ f.close
620
+ end
621
+ paths << file.path
622
+
623
+ config = Kreuzberg::Config::Extraction.new(result_format: 'unified')
624
+ results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
625
+
626
+ expect(results).to be_an Array
627
+ expect(results.length).to eq 1
628
+ expect(results[0]).to be_a Kreuzberg::Result
629
+
630
+ paths.each { |p| FileUtils.rm_f(p) }
631
+ end
632
+
633
+ it 'batch processes with both output and result formats' do
634
+ paths = []
635
+ file = Tempfile.new(['format_test', '.txt']).tap do |f|
636
+ f.write('Test content for both formats')
637
+ f.close
638
+ end
639
+ paths << file.path
640
+
641
+ config = Kreuzberg::Config::Extraction.new(
642
+ output_format: 'plain',
643
+ result_format: 'element_based'
644
+ )
645
+ results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
646
+
647
+ expect(results).to be_an Array
648
+ expect(results.length).to eq 1
649
+ expect(results[0]).to be_a Kreuzberg::Result
650
+
651
+ paths.each { |p| FileUtils.rm_f(p) }
652
+ end
653
+
654
+ it 'batch processes with chunking and output_format' do
655
+ paths = []
656
+ file = Tempfile.new(['format_test', '.txt']).tap do |f|
657
+ f.write('Test content ' * 100)
658
+ f.close
659
+ end
660
+ paths << file.path
661
+
662
+ config = Kreuzberg::Config::Extraction.new(
663
+ output_format: 'markdown',
664
+ chunking: { max_chars: 1000 }
665
+ )
666
+ results = Kreuzberg.batch_extract_files_sync(paths: paths, config: config)
667
+
668
+ expect(results).to be_an Array
669
+ expect(results.length).to eq 1
670
+ expect(results[0]).to be_a Kreuzberg::Result
671
+
672
+ paths.each { |p| FileUtils.rm_f(p) }
673
+ end
674
+ end
595
675
  end
@@ -1154,64 +1154,11 @@ RSpec.describe 'Kreuzberg Metadata Types' do
1154
1154
 
1155
1155
  describe 'Thread Safety: Concurrent Extraction' do
1156
1156
  it 'handles concurrent extraction safely' do
1157
- test_files = []
1158
- results = []
1159
- errors = []
1157
+ test_files = create_concurrent_test_files
1158
+ results, errors = run_concurrent_extractions(test_files)
1160
1159
 
1161
- 5.times do |i|
1162
- html_content = <<~HTML
1163
- <html>
1164
- <head>
1165
- <title>Concurrent Test #{i}</title>
1166
- <meta name="description" content="Test document #{i}">
1167
- <meta name="keywords" content="test#{i}, concurrent, thread-safe">
1168
- </head>
1169
- <body>
1170
- <h1>Test Document #{i}</h1>
1171
- <p>Content for test #{i}</p>
1172
- <a href="/page-#{i}">Link #{i}</a>
1173
- <img src="image-#{i}.jpg" alt="Image #{i}">
1174
- </body>
1175
- </html>
1176
- HTML
1177
- test_files << create_test_html_file(html_content)
1178
- end
1179
-
1180
- begin
1181
- threads = test_files.map do |file|
1182
- Thread.new do
1183
- result = Kreuzberg.extract_file_sync(path: file)
1184
- results << result
1185
- rescue StandardError => e
1186
- errors << e
1187
- end
1188
- end
1189
-
1190
- threads.each(&:join)
1191
-
1192
- expect(errors).to be_empty
1193
-
1194
- expect(results.length).to eq(5)
1195
- results.each do |result|
1196
- expect(result).to be_a(Kreuzberg::Result)
1197
- expect(result.metadata).not_to be_nil
1198
-
1199
- metadata = result.metadata
1200
- next unless metadata.is_a?(Kreuzberg::HtmlMetadata)
1201
-
1202
- expect(metadata.title).not_to be_nil
1203
- expect(metadata.description).not_to be_nil
1204
- expect(metadata.keywords).to be_a(Array)
1205
- expect(metadata.headers).to be_a(Array)
1206
- expect(metadata.links).to be_a(Array)
1207
- expect(metadata.images).to be_a(Array)
1208
- end
1209
-
1210
- titles = results.map { |r| r.metadata.is_a?(Kreuzberg::HtmlMetadata) ? r.metadata.title : r.metadata['title'] }
1211
- expect(titles.uniq.length).to eq(5)
1212
- ensure
1213
- test_files.each { |f| FileUtils.rm_f(f) }
1214
- end
1160
+ expect(results).not_to be_empty
1161
+ verify_concurrent_results(results, errors, test_files)
1215
1162
  end
1216
1163
  end
1217
1164
 
@@ -1225,4 +1172,77 @@ RSpec.describe 'Kreuzberg Metadata Types' do
1225
1172
  file.close
1226
1173
  file.path
1227
1174
  end
1175
+
1176
+ def create_concurrent_test_files
1177
+ test_files = []
1178
+ 5.times do |i|
1179
+ html_content = <<~HTML
1180
+ <html>
1181
+ <head>
1182
+ <title>Concurrent Test #{i}</title>
1183
+ <meta name="description" content="Test document #{i}">
1184
+ <meta name="keywords" content="test#{i}, concurrent, thread-safe">
1185
+ </head>
1186
+ <body>
1187
+ <h1>Test Document #{i}</h1>
1188
+ <p>Content for test #{i}</p>
1189
+ <a href="/page-#{i}">Link #{i}</a>
1190
+ <img src="image-#{i}.jpg" alt="Image #{i}">
1191
+ </body>
1192
+ </html>
1193
+ HTML
1194
+ test_files << create_test_html_file(html_content)
1195
+ end
1196
+ test_files
1197
+ end
1198
+
1199
+ def run_concurrent_extractions(test_files)
1200
+ results = []
1201
+ errors = []
1202
+
1203
+ threads = test_files.map do |file|
1204
+ Thread.new do
1205
+ result = Kreuzberg.extract_file_sync(path: file)
1206
+ results << result
1207
+ rescue StandardError => e
1208
+ errors << e
1209
+ end
1210
+ end
1211
+
1212
+ threads.each(&:join)
1213
+ [results, errors]
1214
+ end
1215
+
1216
+ def verify_concurrent_results(results, errors, test_files)
1217
+ expect(errors).to be_empty
1218
+ expect(results.length).to eq(5)
1219
+
1220
+ results.each do |result|
1221
+ expect(result).to be_a(Kreuzberg::Result)
1222
+ expect(result.metadata).not_to be_nil
1223
+
1224
+ metadata = result.metadata
1225
+ next unless metadata.is_a?(Kreuzberg::HtmlMetadata)
1226
+
1227
+ verify_metadata_fields(metadata)
1228
+ end
1229
+
1230
+ titles = extract_titles(results)
1231
+ expect(titles.uniq.length).to eq(5)
1232
+ ensure
1233
+ test_files.each { |f| FileUtils.rm_f(f) }
1234
+ end
1235
+
1236
+ def verify_metadata_fields(metadata)
1237
+ expect(metadata.title).not_to be_nil
1238
+ expect(metadata.description).not_to be_nil
1239
+ expect(metadata.keywords).to be_a(Array)
1240
+ expect(metadata.headers).to be_a(Array)
1241
+ expect(metadata.links).to be_a(Array)
1242
+ expect(metadata.images).to be_a(Array)
1243
+ end
1244
+
1245
+ def extract_titles(results)
1246
+ results.map { |r| r.metadata.is_a?(Kreuzberg::HtmlMetadata) ? r.metadata.title : r.metadata['title'] }
1247
+ end
1228
1248
  end
@@ -0,0 +1,134 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Cross-language serialization tests for Ruby bindings
4
+ #
5
+ # Validates that ExtractionConfig serializes consistently with other language bindings
6
+
7
+ require 'json'
8
+ require 'spec_helper'
9
+
10
+ RSpec.describe Kreuzberg::ExtractionConfig do
11
+ describe '#to_h' do
12
+ it 'serializes minimal config to hash' do
13
+ config = described_class.new
14
+ hash = config.to_h
15
+
16
+ expect(hash).to be_a(Hash)
17
+ expect(hash).to have_key(:use_cache)
18
+ expect(hash).to have_key(:enable_quality_processing)
19
+ expect(hash).to have_key(:force_ocr)
20
+ end
21
+
22
+ it 'serializes config with all fields' do
23
+ config = described_class.new(
24
+ use_cache: true,
25
+ enable_quality_processing: true,
26
+ force_ocr: false
27
+ )
28
+
29
+ hash = config.to_h
30
+
31
+ expect(hash[:use_cache]).to be(true)
32
+ expect(hash[:enable_quality_processing]).to be(true)
33
+ expect(hash[:force_ocr]).to be(false)
34
+ end
35
+
36
+ it 'preserves field values after serialization' do
37
+ original = described_class.new(
38
+ use_cache: false,
39
+ enable_quality_processing: true
40
+ )
41
+
42
+ hash = original.to_h
43
+
44
+ expect(hash[:use_cache]).to be(false)
45
+ expect(hash[:enable_quality_processing]).to be(true)
46
+ end
47
+ end
48
+
49
+ describe '#to_json' do
50
+ it 'serializes to JSON' do
51
+ config = described_class.new(use_cache: true)
52
+ json = config.to_json
53
+
54
+ expect(json).to be_a(String)
55
+
56
+ parsed = JSON.parse(json, symbolize_names: true)
57
+ expect(parsed).to have_key(:use_cache)
58
+ expect(parsed[:use_cache]).to be(true)
59
+ end
60
+
61
+ it 'produces valid JSON' do
62
+ config = described_class.new
63
+ json = config.to_json
64
+
65
+ expect { JSON.parse(json) }.not_to raise_error
66
+ end
67
+
68
+ it 'uses snake_case field names' do
69
+ config = described_class.new(use_cache: true)
70
+ json = config.to_json
71
+
72
+ expect(json).to include('use_cache')
73
+ expect(json).not_to include('useCache')
74
+ end
75
+ end
76
+
77
+ describe 'round-trip serialization' do
78
+ it 'survives serialization -> deserialization -> serialization' do
79
+ config1 = described_class.new(
80
+ use_cache: true,
81
+ enable_quality_processing: false
82
+ )
83
+
84
+ json1 = config1.to_json
85
+ hash1 = JSON.parse(json1, symbolize_names: true)
86
+
87
+ config2 = described_class.new(hash1)
88
+ json2 = config2.to_json
89
+
90
+ # JSON strings should be equivalent
91
+ expect(JSON.parse(json1)).to eq(JSON.parse(json2))
92
+ end
93
+ end
94
+
95
+ describe 'field consistency' do
96
+ it 'includes all mandatory fields' do
97
+ config = described_class.new
98
+ hash = config.to_h
99
+
100
+ mandatory_fields = %i[use_cache enable_quality_processing force_ocr]
101
+ mandatory_fields.each do |field|
102
+ expect(hash).to have_key(field)
103
+ end
104
+ end
105
+
106
+ it 'handles nested ocr config' do
107
+ config = described_class.new(
108
+ ocr: {
109
+ backend: 'tesseract',
110
+ language: 'eng'
111
+ }
112
+ )
113
+
114
+ hash = config.to_h
115
+
116
+ expect(hash).to have_key(:ocr)
117
+ expect(hash[:ocr][:backend]).to eq('tesseract')
118
+ expect(hash[:ocr][:language]).to eq('eng')
119
+ end
120
+ end
121
+
122
+ describe 'immutability' do
123
+ it 'does not modify original config during serialization' do
124
+ config = described_class.new(use_cache: true)
125
+
126
+ json1 = config.to_json
127
+ json2 = config.to_json
128
+ json3 = config.to_json
129
+
130
+ expect(json1).to eq(json2)
131
+ expect(json2).to eq(json3)
132
+ end
133
+ end
134
+ end