kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
data/spec/binding/tables_spec.rb
DELETED
|
@@ -1,641 +0,0 @@
|
|
|
1
|
-
# frozen_string_literal: true
|
|
2
|
-
|
|
3
|
-
require 'spec_helper'
|
|
4
|
-
|
|
5
|
-
RSpec.describe 'Table Extraction Quality' do
|
|
6
|
-
describe 'table structure extraction' do
|
|
7
|
-
let(:pdf_path) { test_document_path('pdf/table_document.pdf') }
|
|
8
|
-
|
|
9
|
-
it 'extracts table rows, columns, and headers' do
|
|
10
|
-
config = Kreuzberg::Config::Extraction.new
|
|
11
|
-
|
|
12
|
-
begin
|
|
13
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
14
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
15
|
-
skip 'Test PDF file not available'
|
|
16
|
-
end
|
|
17
|
-
|
|
18
|
-
expect(result).not_to be_nil
|
|
19
|
-
expect(result.tables).not_to be_nil
|
|
20
|
-
unless result.tables.empty?
|
|
21
|
-
table = result.tables.first
|
|
22
|
-
expect(table).to be_a(Kreuzberg::Result::Table)
|
|
23
|
-
expect(table.cells).not_to be_nil
|
|
24
|
-
expect(table.cells).to be_a(Array)
|
|
25
|
-
end
|
|
26
|
-
end
|
|
27
|
-
|
|
28
|
-
it 'returns cell arrays with consistent structure' do
|
|
29
|
-
config = Kreuzberg::Config::Extraction.new
|
|
30
|
-
|
|
31
|
-
begin
|
|
32
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
33
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
34
|
-
skip 'Test PDF file not available'
|
|
35
|
-
end
|
|
36
|
-
|
|
37
|
-
if result.tables && !result.tables.empty?
|
|
38
|
-
expect(result.tables).to all(
|
|
39
|
-
be_a(Kreuzberg::Types::Table).and(
|
|
40
|
-
have_attributes(cells: be_a(Array))
|
|
41
|
-
)
|
|
42
|
-
)
|
|
43
|
-
end
|
|
44
|
-
end
|
|
45
|
-
|
|
46
|
-
it 'provides page number for each table' do
|
|
47
|
-
config = Kreuzberg::Config::Extraction.new
|
|
48
|
-
|
|
49
|
-
begin
|
|
50
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
51
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
52
|
-
skip 'Test PDF file not available'
|
|
53
|
-
end
|
|
54
|
-
|
|
55
|
-
if result.tables && !result.tables.empty?
|
|
56
|
-
result.tables.each do |table|
|
|
57
|
-
expect(table.page_number).not_to be_nil
|
|
58
|
-
expect(table.page_number).to be_a(Integer)
|
|
59
|
-
expect(table.page_number).to be > 0
|
|
60
|
-
end
|
|
61
|
-
end
|
|
62
|
-
end
|
|
63
|
-
|
|
64
|
-
it 'detects proper row and column counts' do
|
|
65
|
-
config = Kreuzberg::Config::Extraction.new
|
|
66
|
-
|
|
67
|
-
begin
|
|
68
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
69
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
70
|
-
skip 'Test PDF file not available'
|
|
71
|
-
end
|
|
72
|
-
|
|
73
|
-
if result.tables && !result.tables.empty?
|
|
74
|
-
table = result.tables.first
|
|
75
|
-
unless table.cells.empty?
|
|
76
|
-
first_row_cols = table.cells.first.length
|
|
77
|
-
expect(first_row_cols).to be > 0
|
|
78
|
-
expect(first_row_cols).to be_a(Integer)
|
|
79
|
-
end
|
|
80
|
-
end
|
|
81
|
-
end
|
|
82
|
-
end
|
|
83
|
-
|
|
84
|
-
describe 'table markdown conversion accuracy' do
|
|
85
|
-
let(:pdf_path) { test_document_path('pdf/table_document.pdf') }
|
|
86
|
-
|
|
87
|
-
it 'generates markdown representation for tables' do
|
|
88
|
-
config = Kreuzberg::Config::Extraction.new
|
|
89
|
-
|
|
90
|
-
begin
|
|
91
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
92
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
93
|
-
skip 'Test PDF file not available'
|
|
94
|
-
end
|
|
95
|
-
|
|
96
|
-
if result.tables && !result.tables.empty?
|
|
97
|
-
result.tables.each do |table|
|
|
98
|
-
expect(table.markdown).not_to be_nil
|
|
99
|
-
expect(table.markdown).to be_a(String)
|
|
100
|
-
# If table has cells, markdown must not be empty
|
|
101
|
-
if table.cells && !table.cells.empty?
|
|
102
|
-
expect(table.markdown).not_to be_empty, 'Markdown must not be empty when table has cells'
|
|
103
|
-
end
|
|
104
|
-
end
|
|
105
|
-
end
|
|
106
|
-
end
|
|
107
|
-
|
|
108
|
-
it 'markdown contains pipe delimiters for table structure' do
|
|
109
|
-
config = Kreuzberg::Config::Extraction.new
|
|
110
|
-
|
|
111
|
-
begin
|
|
112
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
113
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
114
|
-
skip 'Test PDF file not available'
|
|
115
|
-
end
|
|
116
|
-
|
|
117
|
-
if result.tables && !result.tables.empty?
|
|
118
|
-
result.tables.each do |table|
|
|
119
|
-
# If table has cells and markdown, it must contain pipes
|
|
120
|
-
if table.cells && !table.cells.empty? && !table.markdown.empty?
|
|
121
|
-
expect(table.markdown).to include('|'), 'Markdown table must include pipe separators for cells'
|
|
122
|
-
end
|
|
123
|
-
end
|
|
124
|
-
end
|
|
125
|
-
end
|
|
126
|
-
|
|
127
|
-
it 'markdown format is consistent with cell data' do
|
|
128
|
-
config = Kreuzberg::Config::Extraction.new
|
|
129
|
-
|
|
130
|
-
begin
|
|
131
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
132
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
133
|
-
skip 'Test PDF file not available'
|
|
134
|
-
end
|
|
135
|
-
|
|
136
|
-
if result.tables && !result.tables.empty?
|
|
137
|
-
table = result.tables.first
|
|
138
|
-
unless table.cells.empty?
|
|
139
|
-
row_count = table.cells.length
|
|
140
|
-
expect(row_count).to be > 0
|
|
141
|
-
expect(row_count).to be_a(Integer)
|
|
142
|
-
end
|
|
143
|
-
end
|
|
144
|
-
end
|
|
145
|
-
end
|
|
146
|
-
|
|
147
|
-
describe 'cell content preservation' do
|
|
148
|
-
let(:pdf_path) { test_document_path('pdf/table_document.pdf') }
|
|
149
|
-
|
|
150
|
-
it 'preserves text content in cells accurately' do
|
|
151
|
-
config = Kreuzberg::Config::Extraction.new
|
|
152
|
-
|
|
153
|
-
begin
|
|
154
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
155
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
156
|
-
skip 'Test PDF file not available'
|
|
157
|
-
end
|
|
158
|
-
|
|
159
|
-
if result.tables && !result.tables.empty?
|
|
160
|
-
result.tables.each do |table|
|
|
161
|
-
table.cells.each do |row|
|
|
162
|
-
row.each do |cell|
|
|
163
|
-
expect(cell).to be_a(String)
|
|
164
|
-
expect(cell).not_to be_nil
|
|
165
|
-
end
|
|
166
|
-
end
|
|
167
|
-
end
|
|
168
|
-
end
|
|
169
|
-
end
|
|
170
|
-
|
|
171
|
-
it 'handles cells with numeric content' do
|
|
172
|
-
config = Kreuzberg::Config::Extraction.new
|
|
173
|
-
|
|
174
|
-
begin
|
|
175
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
176
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
177
|
-
skip 'Test PDF file not available'
|
|
178
|
-
end
|
|
179
|
-
|
|
180
|
-
if result.tables && !result.tables.empty?
|
|
181
|
-
result.tables.each do |table|
|
|
182
|
-
table.cells.each do |row|
|
|
183
|
-
row.each do |cell|
|
|
184
|
-
expect(cell).not_to be_nil
|
|
185
|
-
end
|
|
186
|
-
end
|
|
187
|
-
end
|
|
188
|
-
end
|
|
189
|
-
end
|
|
190
|
-
|
|
191
|
-
it 'preserves whitespace and formatting in cells' do
|
|
192
|
-
config = Kreuzberg::Config::Extraction.new
|
|
193
|
-
|
|
194
|
-
begin
|
|
195
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
196
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
197
|
-
skip 'Test PDF file not available'
|
|
198
|
-
end
|
|
199
|
-
|
|
200
|
-
if result.tables && !result.tables.empty?
|
|
201
|
-
result.tables.each do |table|
|
|
202
|
-
expect(table.cells).not_to be_empty
|
|
203
|
-
expect(table.cells).to all(all(be_a(String)))
|
|
204
|
-
end
|
|
205
|
-
end
|
|
206
|
-
end
|
|
207
|
-
|
|
208
|
-
it 'handles empty cells correctly' do
|
|
209
|
-
config = Kreuzberg::Config::Extraction.new
|
|
210
|
-
|
|
211
|
-
begin
|
|
212
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
213
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
214
|
-
skip 'Test PDF file not available'
|
|
215
|
-
end
|
|
216
|
-
|
|
217
|
-
if result.tables && !result.tables.empty?
|
|
218
|
-
result.tables.each do |table|
|
|
219
|
-
expect(table.cells).to be_a(Array)
|
|
220
|
-
expect(table.cells).to all(all(be_a(String)))
|
|
221
|
-
end
|
|
222
|
-
end
|
|
223
|
-
end
|
|
224
|
-
end
|
|
225
|
-
|
|
226
|
-
describe 'format-specific table handling' do
|
|
227
|
-
let(:pdf_path) { test_document_path('pdf/table_document.pdf') }
|
|
228
|
-
|
|
229
|
-
it 'extracts tables from PDF documents' do
|
|
230
|
-
config = Kreuzberg::Config::Extraction.new
|
|
231
|
-
|
|
232
|
-
begin
|
|
233
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
234
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
235
|
-
skip 'Test PDF file not available'
|
|
236
|
-
end
|
|
237
|
-
|
|
238
|
-
expect(result).not_to be_nil
|
|
239
|
-
expect(result.tables).not_to be_nil
|
|
240
|
-
expect(result.tables).to be_a(Array)
|
|
241
|
-
end
|
|
242
|
-
|
|
243
|
-
it 'extracts tables from Office formats' do
|
|
244
|
-
config = Kreuzberg::Config::Extraction.new
|
|
245
|
-
|
|
246
|
-
begin
|
|
247
|
-
result = Kreuzberg.extract_file(path: test_document_path('office/document.docx'), config: config)
|
|
248
|
-
expect(result).not_to be_nil
|
|
249
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
250
|
-
skip 'DOCX test file not available'
|
|
251
|
-
end
|
|
252
|
-
end
|
|
253
|
-
|
|
254
|
-
it 'handles PDF tables with different layouts' do
|
|
255
|
-
config = Kreuzberg::Config::Extraction.new
|
|
256
|
-
|
|
257
|
-
begin
|
|
258
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
259
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
260
|
-
skip 'Test PDF file not available'
|
|
261
|
-
end
|
|
262
|
-
|
|
263
|
-
if result.tables && !result.tables.empty?
|
|
264
|
-
result.tables.each do |table|
|
|
265
|
-
expect(table.cells).not_to be_nil
|
|
266
|
-
expect(table.markdown).not_to be_nil
|
|
267
|
-
end
|
|
268
|
-
end
|
|
269
|
-
end
|
|
270
|
-
|
|
271
|
-
it 'respects extraction configuration for tables' do
|
|
272
|
-
config = Kreuzberg::Config::Extraction.new
|
|
273
|
-
|
|
274
|
-
begin
|
|
275
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
276
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
277
|
-
skip 'Test PDF file not available'
|
|
278
|
-
end
|
|
279
|
-
|
|
280
|
-
expect(result).not_to be_nil
|
|
281
|
-
expect(result.tables).not_to be_nil
|
|
282
|
-
end
|
|
283
|
-
end
|
|
284
|
-
|
|
285
|
-
describe 'table boundary detection' do
|
|
286
|
-
let(:pdf_path) { test_document_path('pdf/table_document.pdf') }
|
|
287
|
-
|
|
288
|
-
it 'correctly identifies table boundaries' do
|
|
289
|
-
config = Kreuzberg::Config::Extraction.new
|
|
290
|
-
|
|
291
|
-
begin
|
|
292
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
293
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
294
|
-
skip 'Test PDF file not available'
|
|
295
|
-
end
|
|
296
|
-
|
|
297
|
-
if result.tables && !result.tables.empty?
|
|
298
|
-
result.tables.each do |table|
|
|
299
|
-
expect(table.cells.length).to be > 0
|
|
300
|
-
table.cells.each do |row|
|
|
301
|
-
expect(row.length).to be > 0
|
|
302
|
-
end
|
|
303
|
-
end
|
|
304
|
-
end
|
|
305
|
-
end
|
|
306
|
-
|
|
307
|
-
it 'separates adjacent tables correctly' do
|
|
308
|
-
config = Kreuzberg::Config::Extraction.new
|
|
309
|
-
|
|
310
|
-
begin
|
|
311
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
312
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
313
|
-
skip 'Test PDF file not available'
|
|
314
|
-
end
|
|
315
|
-
|
|
316
|
-
if result.tables && result.tables.length > 1
|
|
317
|
-
table_count = result.tables.length
|
|
318
|
-
expect(table_count).to be > 1
|
|
319
|
-
result.tables.each do |table|
|
|
320
|
-
expect(table.cells).not_to be_nil
|
|
321
|
-
expect(table.cells.length).to be > 0
|
|
322
|
-
end
|
|
323
|
-
end
|
|
324
|
-
end
|
|
325
|
-
|
|
326
|
-
it 'maintains consistent column alignment across rows' do
|
|
327
|
-
config = Kreuzberg::Config::Extraction.new
|
|
328
|
-
|
|
329
|
-
begin
|
|
330
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
331
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
332
|
-
skip 'Test PDF file not available'
|
|
333
|
-
end
|
|
334
|
-
|
|
335
|
-
if result.tables && !result.tables.empty?
|
|
336
|
-
table = result.tables.first
|
|
337
|
-
if table.cells.length > 1
|
|
338
|
-
first_row_cols = table.cells.first.length
|
|
339
|
-
table.cells.each do |row|
|
|
340
|
-
expect(row.length).to eq(first_row_cols)
|
|
341
|
-
end
|
|
342
|
-
end
|
|
343
|
-
end
|
|
344
|
-
end
|
|
345
|
-
end
|
|
346
|
-
|
|
347
|
-
describe 'performance with large tables' do
|
|
348
|
-
let(:pdf_path) { test_document_path('pdf/table_document.pdf') }
|
|
349
|
-
|
|
350
|
-
it 'extracts large tables with 100+ rows efficiently' do
|
|
351
|
-
config = Kreuzberg::Config::Extraction.new
|
|
352
|
-
|
|
353
|
-
begin
|
|
354
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
355
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
356
|
-
skip 'Test PDF file not available'
|
|
357
|
-
end
|
|
358
|
-
|
|
359
|
-
expect(result).not_to be_nil
|
|
360
|
-
expect(result.tables).to be_a(Array)
|
|
361
|
-
end
|
|
362
|
-
|
|
363
|
-
it 'maintains data integrity for large tables' do
|
|
364
|
-
config = Kreuzberg::Config::Extraction.new
|
|
365
|
-
|
|
366
|
-
begin
|
|
367
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
368
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
369
|
-
skip 'Test PDF file not available'
|
|
370
|
-
end
|
|
371
|
-
|
|
372
|
-
if result.tables && !result.tables.empty?
|
|
373
|
-
result.tables.each do |table|
|
|
374
|
-
expect(table.cells).not_to be_nil
|
|
375
|
-
expect(table.cells).to all(all(be_a(String)))
|
|
376
|
-
end
|
|
377
|
-
end
|
|
378
|
-
end
|
|
379
|
-
|
|
380
|
-
it 'handles tables with varying column counts' do
|
|
381
|
-
config = Kreuzberg::Config::Extraction.new
|
|
382
|
-
|
|
383
|
-
begin
|
|
384
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
385
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
386
|
-
skip 'Test PDF file not available'
|
|
387
|
-
end
|
|
388
|
-
|
|
389
|
-
if result.tables && !result.tables.empty?
|
|
390
|
-
result.tables.each do |table|
|
|
391
|
-
expect(table.cells.length).to be >= 0
|
|
392
|
-
end
|
|
393
|
-
end
|
|
394
|
-
end
|
|
395
|
-
end
|
|
396
|
-
|
|
397
|
-
describe 'table serialization and conversion' do
|
|
398
|
-
let(:pdf_path) { test_document_path('pdf/table_document.pdf') }
|
|
399
|
-
|
|
400
|
-
it 'serializes table to hash correctly' do
|
|
401
|
-
config = Kreuzberg::Config::Extraction.new
|
|
402
|
-
|
|
403
|
-
begin
|
|
404
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
405
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
406
|
-
skip 'Test PDF file not available'
|
|
407
|
-
end
|
|
408
|
-
|
|
409
|
-
if result.tables && !result.tables.empty?
|
|
410
|
-
table = result.tables.first
|
|
411
|
-
table_hash = table.to_h
|
|
412
|
-
|
|
413
|
-
expect(table_hash).to be_a(Hash)
|
|
414
|
-
expect(table_hash).to have_key(:cells)
|
|
415
|
-
expect(table_hash).to have_key(:markdown)
|
|
416
|
-
expect(table_hash).to have_key(:page_number)
|
|
417
|
-
end
|
|
418
|
-
end
|
|
419
|
-
|
|
420
|
-
it 'preserves table data through serialization' do
|
|
421
|
-
config = Kreuzberg::Config::Extraction.new
|
|
422
|
-
|
|
423
|
-
begin
|
|
424
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
425
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
426
|
-
skip 'Test PDF file not available'
|
|
427
|
-
end
|
|
428
|
-
|
|
429
|
-
if result.tables && !result.tables.empty?
|
|
430
|
-
table = result.tables.first
|
|
431
|
-
table_hash = table.to_h
|
|
432
|
-
|
|
433
|
-
expect(table_hash[:cells]).to eq(table.cells)
|
|
434
|
-
expect(table_hash[:markdown]).to eq(table.markdown)
|
|
435
|
-
expect(table_hash[:page_number]).to eq(table.page_number)
|
|
436
|
-
end
|
|
437
|
-
end
|
|
438
|
-
|
|
439
|
-
it 'converts result with tables to JSON' do
|
|
440
|
-
config = Kreuzberg::Config::Extraction.new
|
|
441
|
-
|
|
442
|
-
begin
|
|
443
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
444
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
445
|
-
skip 'Test PDF file not available'
|
|
446
|
-
end
|
|
447
|
-
|
|
448
|
-
expect(result).not_to be_nil
|
|
449
|
-
json_str = result.to_json
|
|
450
|
-
expect(json_str).to be_a(String)
|
|
451
|
-
expect(json_str.length).to be > 0
|
|
452
|
-
end
|
|
453
|
-
end
|
|
454
|
-
|
|
455
|
-
describe 'table extraction with page context' do
|
|
456
|
-
let(:pdf_path) { test_document_path('pdf/table_document.pdf') }
|
|
457
|
-
|
|
458
|
-
it 'associates tables with correct page numbers' do
|
|
459
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
460
|
-
pages: Kreuzberg::Config::PageConfig.new(extract_pages: true)
|
|
461
|
-
)
|
|
462
|
-
|
|
463
|
-
begin
|
|
464
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
465
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
466
|
-
skip 'Test PDF file not available'
|
|
467
|
-
end
|
|
468
|
-
|
|
469
|
-
if result.tables && !result.tables.empty?
|
|
470
|
-
result.tables.each do |table|
|
|
471
|
-
expect(table.page_number).to be > 0
|
|
472
|
-
expect(table.page_number).to be <= result.page_count
|
|
473
|
-
end
|
|
474
|
-
end
|
|
475
|
-
end
|
|
476
|
-
|
|
477
|
-
it 'extracts tables from specific pages when available' do
|
|
478
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
479
|
-
pages: Kreuzberg::Config::PageConfig.new(extract_pages: true)
|
|
480
|
-
)
|
|
481
|
-
|
|
482
|
-
begin
|
|
483
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
484
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
485
|
-
skip 'Test PDF file not available'
|
|
486
|
-
end
|
|
487
|
-
|
|
488
|
-
if result.pages && !result.pages.empty?
|
|
489
|
-
result.pages.each do |page|
|
|
490
|
-
expect(page.page_number).not_to be_nil
|
|
491
|
-
next unless page.tables
|
|
492
|
-
|
|
493
|
-
page.tables.each do |table|
|
|
494
|
-
expect(table.page_number).to eq(page.page_number)
|
|
495
|
-
end
|
|
496
|
-
end
|
|
497
|
-
end
|
|
498
|
-
end
|
|
499
|
-
|
|
500
|
-
it 'maintains table consistency across page and global results' do
|
|
501
|
-
config = Kreuzberg::Config::Extraction.new(
|
|
502
|
-
pages: Kreuzberg::Config::PageConfig.new(extract_pages: true)
|
|
503
|
-
)
|
|
504
|
-
|
|
505
|
-
begin
|
|
506
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
507
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
508
|
-
skip 'Test PDF file not available'
|
|
509
|
-
end
|
|
510
|
-
|
|
511
|
-
if result.tables && !result.tables.empty? && result.pages && !result.pages.empty?
|
|
512
|
-
global_table_count = result.tables.length
|
|
513
|
-
page_table_count = result.pages.sum { |page| page.tables&.length || 0 }
|
|
514
|
-
|
|
515
|
-
expect(page_table_count).to eq(global_table_count)
|
|
516
|
-
end
|
|
517
|
-
end
|
|
518
|
-
end
|
|
519
|
-
|
|
520
|
-
describe 'table handling edge cases' do
|
|
521
|
-
let(:pdf_path) { test_document_path('pdf/table_document.pdf') }
|
|
522
|
-
|
|
523
|
-
it 'handles documents with no tables gracefully' do
|
|
524
|
-
config = Kreuzberg::Config::Extraction.new
|
|
525
|
-
|
|
526
|
-
begin
|
|
527
|
-
result = Kreuzberg.extract_file('test.txt', config: config)
|
|
528
|
-
expect(result).not_to be_nil
|
|
529
|
-
expect(result.tables).to be_a(Array) if result.tables
|
|
530
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
531
|
-
skip 'Text file not available for testing'
|
|
532
|
-
end
|
|
533
|
-
end
|
|
534
|
-
|
|
535
|
-
it 'handles single-cell tables' do
|
|
536
|
-
config = Kreuzberg::Config::Extraction.new
|
|
537
|
-
|
|
538
|
-
begin
|
|
539
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
540
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
541
|
-
skip 'Test PDF file not available'
|
|
542
|
-
end
|
|
543
|
-
|
|
544
|
-
if result.tables && !result.tables.empty?
|
|
545
|
-
result.tables.each do |table|
|
|
546
|
-
expect(table.cells).to be_a(Array)
|
|
547
|
-
end
|
|
548
|
-
end
|
|
549
|
-
end
|
|
550
|
-
|
|
551
|
-
it 'handles tables with long cell content' do
|
|
552
|
-
config = Kreuzberg::Config::Extraction.new
|
|
553
|
-
|
|
554
|
-
begin
|
|
555
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
556
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
557
|
-
skip 'Test PDF file not available'
|
|
558
|
-
end
|
|
559
|
-
|
|
560
|
-
if result.tables && !result.tables.empty?
|
|
561
|
-
result.tables.each do |table|
|
|
562
|
-
table.cells.each do |row|
|
|
563
|
-
row.each do |cell|
|
|
564
|
-
expect(cell).to be_a(String)
|
|
565
|
-
expect(cell.length).to be >= 0
|
|
566
|
-
end
|
|
567
|
-
end
|
|
568
|
-
end
|
|
569
|
-
end
|
|
570
|
-
end
|
|
571
|
-
|
|
572
|
-
it 'handles tables with special characters' do
|
|
573
|
-
config = Kreuzberg::Config::Extraction.new
|
|
574
|
-
|
|
575
|
-
begin
|
|
576
|
-
result = Kreuzberg.extract_file(path: pdf_path, config: config)
|
|
577
|
-
rescue Kreuzberg::Errors::ValidationError
|
|
578
|
-
skip 'Test PDF file not available'
|
|
579
|
-
end
|
|
580
|
-
|
|
581
|
-
if result.tables && !result.tables.empty?
|
|
582
|
-
result.tables.each do |table|
|
|
583
|
-
expect(table.cells).to all(all(be_a(String)))
|
|
584
|
-
end
|
|
585
|
-
end
|
|
586
|
-
end
|
|
587
|
-
end
|
|
588
|
-
|
|
589
|
-
describe 'Table Struct validation' do
|
|
590
|
-
it 'creates Table struct with all fields' do
|
|
591
|
-
table = Kreuzberg::Result::Table.new(
|
|
592
|
-
cells: [%w[Header1 Header2], %w[Value1 Value2]],
|
|
593
|
-
markdown: '| Header1 | Header2 |\n|---------|--------|\n| Value1 | Value2 |',
|
|
594
|
-
page_number: 1
|
|
595
|
-
)
|
|
596
|
-
|
|
597
|
-
expect(table.cells).to eq([%w[Header1 Header2], %w[Value1 Value2]])
|
|
598
|
-
expect(table.markdown).to include('Header1')
|
|
599
|
-
expect(table.page_number).to eq(1)
|
|
600
|
-
end
|
|
601
|
-
|
|
602
|
-
it 'converts Table struct to hash' do
|
|
603
|
-
table = Kreuzberg::Result::Table.new(
|
|
604
|
-
cells: [%w[A B], %w[C D]],
|
|
605
|
-
markdown: '| A | B |\n|---|---|\n| C | D |',
|
|
606
|
-
page_number: 2
|
|
607
|
-
)
|
|
608
|
-
|
|
609
|
-
table_hash = table.to_h
|
|
610
|
-
|
|
611
|
-
expect(table_hash).to be_a(Hash)
|
|
612
|
-
expect(table_hash[:cells]).to eq([%w[A B], %w[C D]])
|
|
613
|
-
expect(table_hash[:markdown]).to include('A')
|
|
614
|
-
expect(table_hash[:page_number]).to eq(2)
|
|
615
|
-
end
|
|
616
|
-
|
|
617
|
-
it 'handles Table struct with empty cells' do
|
|
618
|
-
table = Kreuzberg::Result::Table.new(
|
|
619
|
-
cells: [],
|
|
620
|
-
markdown: '',
|
|
621
|
-
page_number: 1
|
|
622
|
-
)
|
|
623
|
-
|
|
624
|
-
expect(table.cells).to eq([])
|
|
625
|
-
expect(table.markdown).to eq('')
|
|
626
|
-
expect(table.page_number).to eq(1)
|
|
627
|
-
end
|
|
628
|
-
|
|
629
|
-
it 'handles Table struct with nil values' do
|
|
630
|
-
table = Kreuzberg::Result::Table.new(
|
|
631
|
-
cells: nil,
|
|
632
|
-
markdown: nil,
|
|
633
|
-
page_number: 0
|
|
634
|
-
)
|
|
635
|
-
|
|
636
|
-
expect(table.cells).to be_nil
|
|
637
|
-
expect(table.markdown).to be_nil
|
|
638
|
-
expect(table.page_number).to eq(0)
|
|
639
|
-
end
|
|
640
|
-
end
|
|
641
|
-
end
|