kreuzberg 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +534 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +157 -0
- data/README.md +421 -0
- data/Rakefile +25 -0
- data/Steepfile +47 -0
- data/examples/async_patterns.rb +340 -0
- data/ext/kreuzberg_rb/extconf.rb +35 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
- data/ext/kreuzberg_rb/native/README.md +425 -0
- data/ext/kreuzberg_rb/native/build.rs +17 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
- data/ext/kreuzberg_rb/native/include/strings.h +20 -0
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
- data/extconf.rb +28 -0
- data/kreuzberg.gemspec +105 -0
- data/lib/kreuzberg/api_proxy.rb +142 -0
- data/lib/kreuzberg/cache_api.rb +45 -0
- data/lib/kreuzberg/cli.rb +55 -0
- data/lib/kreuzberg/cli_proxy.rb +127 -0
- data/lib/kreuzberg/config.rb +684 -0
- data/lib/kreuzberg/errors.rb +50 -0
- data/lib/kreuzberg/extraction_api.rb +84 -0
- data/lib/kreuzberg/mcp_proxy.rb +186 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
- data/lib/kreuzberg/post_processor_protocol.rb +86 -0
- data/lib/kreuzberg/result.rb +216 -0
- data/lib/kreuzberg/setup_lib_path.rb +79 -0
- data/lib/kreuzberg/validator_protocol.rb +89 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +82 -0
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +468 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +87 -0
- data/spec/binding/cli_spec.rb +54 -0
- data/spec/binding/config_spec.rb +345 -0
- data/spec/binding/config_validation_spec.rb +283 -0
- data/spec/binding/error_handling_spec.rb +213 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +274 -0
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +39 -0
- data/spec/fixtures/config.yaml +42 -0
- data/spec/fixtures/invalid_config.toml +4 -0
- data/spec/smoke/package_spec.rb +178 -0
- data/spec/spec_helper.rb +42 -0
- data/vendor/kreuzberg/Cargo.toml +134 -0
- data/vendor/kreuzberg/README.md +175 -0
- data/vendor/kreuzberg/build.rs +460 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -0
- data/vendor/kreuzberg/src/api/handlers.rs +199 -0
- data/vendor/kreuzberg/src/api/mod.rs +79 -0
- data/vendor/kreuzberg/src/api/server.rs +353 -0
- data/vendor/kreuzberg/src/api/types.rs +170 -0
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
- data/vendor/kreuzberg/src/core/config.rs +1032 -0
- data/vendor/kreuzberg/src/core/extractor.rs +903 -0
- data/vendor/kreuzberg/src/core/io.rs +327 -0
- data/vendor/kreuzberg/src/core/mime.rs +615 -0
- data/vendor/kreuzberg/src/core/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
- data/vendor/kreuzberg/src/embeddings.rs +323 -0
- data/vendor/kreuzberg/src/error.rs +431 -0
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
- data/vendor/kreuzberg/src/extraction/email.rs +854 -0
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
- data/vendor/kreuzberg/src/extraction/html.rs +553 -0
- data/vendor/kreuzberg/src/extraction/image.rs +368 -0
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
- data/vendor/kreuzberg/src/extraction/table.rs +328 -0
- data/vendor/kreuzberg/src/extraction/text.rs +269 -0
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
- data/vendor/kreuzberg/src/extractors/email.rs +129 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
- data/vendor/kreuzberg/src/extractors/html.rs +410 -0
- data/vendor/kreuzberg/src/extractors/image.rs +195 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
- data/vendor/kreuzberg/src/extractors/text.rs +242 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
- data/vendor/kreuzberg/src/image/dpi.rs +164 -0
- data/vendor/kreuzberg/src/image/mod.rs +6 -0
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
- data/vendor/kreuzberg/src/image/resize.rs +89 -0
- data/vendor/kreuzberg/src/keywords/config.rs +154 -0
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
- data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
- data/vendor/kreuzberg/src/keywords/types.rs +68 -0
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
- data/vendor/kreuzberg/src/lib.rs +102 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
- data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
- data/vendor/kreuzberg/src/ocr/error.rs +37 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
- data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
- data/vendor/kreuzberg/src/ocr/types.rs +393 -0
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
- data/vendor/kreuzberg/src/pdf/table.rs +420 -0
- data/vendor/kreuzberg/src/pdf/text.rs +161 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
- data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
- data/vendor/kreuzberg/src/text/mod.rs +19 -0
- data/vendor/kreuzberg/src/text/quality.rs +697 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
- data/vendor/kreuzberg/src/types.rs +873 -0
- data/vendor/kreuzberg/src/utils/mod.rs +17 -0
- data/vendor/kreuzberg/src/utils/quality.rs +959 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -0
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
- data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
- data/vendor/kreuzberg/tests/config_features.rs +580 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
- data/vendor/kreuzberg/tests/core_integration.rs +493 -0
- data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
- data/vendor/kreuzberg/tests/security_validation.rs +404 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
- metadata +471 -0
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'spec_helper'
|
|
4
|
+
|
|
5
|
+
RSpec.describe 'Validator Plugin System' do
|
|
6
|
+
let(:test_pdf) { test_document_path('text/contract_test.txt') }
|
|
7
|
+
|
|
8
|
+
# Remove every registered validator once each example has finished, so a
# validator registered by one example is not still registered for the next.
after { Kreuzberg.clear_validators }
|
|
11
|
+
|
|
12
|
+
# Validators supplied as plain callables (lambdas). Each example registers one
# callable under a name and then runs a synchronous extraction on the shared
# `test_pdf` fixture path.
describe 'registering validator as Proc' do
  it 'registers and executes Proc validator during extraction' do
    # Flipped by the lambda; proves the extraction actually invoked it.
    validator_called = false
    validator = lambda do |_result|
      validator_called = true
    end

    Kreuzberg.register_validator('check_called', validator)
    Kreuzberg.extract_file_sync(test_pdf)

    expect(validator_called).to be true
  end

  it 'allows extraction to proceed when validator passes' do
    # The argument is deliberately ignored, hence the underscore prefix.
    validator = lambda do |_result|
      # Validation passes - do nothing
    end

    Kreuzberg.register_validator('pass_validator', validator)
    result = Kreuzberg.extract_file_sync(test_pdf)

    expect(result).to be_a(Kreuzberg::Result)
    expect(result.content).not_to be_empty
  end

  it 'prevents extraction when validator raises ValidationError' do
    # The threshold is presumably far above the fixture's content length, so the
    # validator is expected to raise for it (see the expectation below).
    validator = lambda do |result|
      if result['content'].length < 10_000_000
        raise Kreuzberg::Errors::ValidationError, 'Content too short for this test'
      end
    end

    Kreuzberg.register_validator('min_length', validator)

    # The error raised inside the validator must surface from the extraction call.
    expect do
      Kreuzberg.extract_file_sync(test_pdf)
    end.to raise_error(Kreuzberg::Errors::ValidationError, /Content too short/)
  end
end
|
|
51
|
+
|
|
52
|
+
# Validators supplied as objects whose class includes Kreuzberg::ValidatorProtocol
# and implements `call(result)`.
#
# The validator classes are built with `Class.new` and kept in local variables
# rather than declared with the `class` keyword: a `class` statement inside an
# example block defines a top-level constant that outlives the example.
describe 'registering validator as class' do
  it 'registers and executes class-based validator' do
    minimum_length_validator = Class.new do
      include Kreuzberg::ValidatorProtocol

      # @param min_length [Integer] content length below which validation fails
      def initialize(min_length)
        @min_length = min_length
      end

      # @param result [Hash] extraction result; only 'content' is read here
      # @raise [Kreuzberg::Errors::ValidationError] when the content is shorter
      #   than the configured minimum
      def call(result)
        return unless result['content'].length < @min_length

        raise Kreuzberg::Errors::ValidationError, "Content too short: #{result['content'].length} < #{@min_length}"
      end
    end

    validator = minimum_length_validator.new(10)
    Kreuzberg.register_validator('min_length', validator)
    result = Kreuzberg.extract_file_sync(test_pdf)

    expect(result).to be_a(Kreuzberg::Result)
    expect(result.content.length).to be >= 10
  end

  it 'validates based on content characteristics' do
    non_empty_validator = Class.new do
      include Kreuzberg::ValidatorProtocol

      # @param result [Hash] extraction result; only 'content' is read here
      # @raise [Kreuzberg::Errors::ValidationError] when the content is blank
      #   after stripping surrounding whitespace
      def call(result)
        return unless result['content'].strip.empty?

        raise Kreuzberg::Errors::ValidationError, 'Content cannot be empty'
      end
    end

    validator = non_empty_validator.new
    Kreuzberg.register_validator('non_empty', validator)
    result = Kreuzberg.extract_file_sync(test_pdf)

    expect(result.content.strip).not_to be_empty
  end
end
|
|
94
|
+
|
|
95
|
+
# Checks what the validator callable is handed: the examples capture the
# argument and inspect it after the extraction has run.
describe 'validator receives correct parameters' do
  it 'receives result hash with all required fields' do
    received_result = nil
    capture = ->(result) { received_result = result }

    Kreuzberg.register_validator('capture', capture)
    Kreuzberg.extract_file_sync(test_pdf)

    expect(received_result).to be_a(Hash)
    # Same four keys, checked in the same order, one expectation per key.
    %w[content mime_type metadata tables].each do |field|
      expect(received_result).to have_key(field)
    end
  end

  it 'receives correct content in result hash' do
    received_content = nil
    capture = ->(result) { received_content = result['content'] }

    Kreuzberg.register_validator('capture_content', capture)
    extraction = Kreuzberg.extract_file_sync(test_pdf)

    # The content seen by the validator must equal the content returned to the caller.
    expect(received_content).to eq(extraction.content)
  end
end
|
|
124
|
+
|
|
125
|
+
# Behaviour when more than one validator is registered at the same time.
describe 'multiple validators' do
  it 'executes all registered validators' do
    first_called = false
    second_called = false

    first = ->(_result) { first_called = true }
    second = ->(_result) { second_called = true }

    Kreuzberg.register_validator('val1', first)
    Kreuzberg.register_validator('val2', second)
    Kreuzberg.extract_file_sync(test_pdf)

    expect(first_called).to be true
    expect(second_called).to be true
  end

  it 'stops execution if any validator fails' do
    failing = ->(_result) { raise Kreuzberg::Errors::ValidationError, 'First validator failed' }
    # Raises a different error class, so the expectation below would fail if
    # this validator's error were the one surfaced by the extraction.
    unreachable = ->(_result) { raise StandardError, 'This should not be reached' }

    Kreuzberg.register_validator('fail_first', failing)
    Kreuzberg.register_validator('never_reached', unreachable)

    expect { Kreuzberg.extract_file_sync(test_pdf) }
      .to raise_error(Kreuzberg::Errors::ValidationError, /First validator failed/)
  end
end
|
|
163
|
+
|
|
164
|
+
# Removing a single validator by the name it was registered under.
describe 'unregister_validator' do
  it 'removes a registered validator by name' do
    # Would abort the extraction if it were still registered.
    rejecting = ->(_result) { raise Kreuzberg::Errors::ValidationError, 'Should not be called' }

    Kreuzberg.register_validator('removable', rejecting)
    Kreuzberg.unregister_validator('removable')

    expect { Kreuzberg.extract_file_sync(test_pdf) }.not_to raise_error
  end

  it 'does not affect other registered validators' do
    first_kept_called = false
    last_kept_called = false

    first_kept = ->(_result) { first_kept_called = true }
    removed = ->(_result) { raise Kreuzberg::Errors::ValidationError, 'Should not be called' }
    last_kept = ->(_result) { last_kept_called = true }

    Kreuzberg.register_validator('keep1', first_kept)
    Kreuzberg.register_validator('remove', removed)
    Kreuzberg.register_validator('keep3', last_kept)

    Kreuzberg.unregister_validator('remove')
    Kreuzberg.extract_file_sync(test_pdf)

    # Both validators that stayed registered must still have run.
    expect(first_kept_called).to be true
    expect(last_kept_called).to be true
  end
end
|
|
205
|
+
|
|
206
|
+
# Removing every registered validator in one call.
describe 'clear_validators' do
  it 'removes all registered validators' do
    # Either validator would abort the extraction if it were still registered.
    first = ->(_result) { raise Kreuzberg::Errors::ValidationError, 'Should not be called 1' }
    second = ->(_result) { raise Kreuzberg::Errors::ValidationError, 'Should not be called 2' }

    Kreuzberg.register_validator('val1', first)
    Kreuzberg.register_validator('val2', second)

    Kreuzberg.clear_validators

    expect { Kreuzberg.extract_file_sync(test_pdf) }.not_to raise_error
  end
end
|
|
226
|
+
|
|
227
|
+
# Listing the names of the currently registered validators.
describe 'list_validators' do
  # Every example starts from an empty registry so that the listing assertions
  # only see validators registered inside the example itself. Cleanup after the
  # example is left to the file-level `after` hook, which also calls
  # Kreuzberg.clear_validators.
  before { Kreuzberg.clear_validators }

  it 'returns empty array when no validators registered' do
    validators = Kreuzberg.list_validators

    expect(validators).to be_an(Array)
    expect(validators).to be_empty
  end

  it 'returns validator names after registration' do
    # No-op validator: only its registered name matters here.
    validator = ->(_result) {}
    Kreuzberg.register_validator('test-validator', validator)

    validators = Kreuzberg.list_validators

    expect(validators).to include('test-validator')
  end

  it 'returns all registered validator names' do
    validator1 = ->(_result) {}
    validator2 = ->(_result) {}
    validator3 = ->(_result) {}

    Kreuzberg.register_validator('validator-one', validator1)
    Kreuzberg.register_validator('validator-two', validator2)
    Kreuzberg.register_validator('validator-three', validator3)

    validators = Kreuzberg.list_validators

    # contain_exactly ignores ordering but rejects missing or extra names.
    expect(validators).to contain_exactly('validator-one', 'validator-two', 'validator-three')
  end

  it 'reflects changes after unregistration' do
    validator = ->(_result) {}
    Kreuzberg.register_validator('temp-validator', validator)

    validators_before = Kreuzberg.list_validators
    expect(validators_before).to include('temp-validator')

    Kreuzberg.unregister_validator('temp-validator')

    validators_after = Kreuzberg.list_validators
    expect(validators_after).not_to include('temp-validator')
  end
end
|
|
274
|
+
end
|
data/spec/examples.txt
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
example_id | status | run_time |
|
|
2
|
+
---------------------------------------------------------------------------------- | ------ | --------------- |
|
|
3
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:1:1] | failed | 0.00173 seconds |
|
|
4
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:1:2] | failed | 0.0018 seconds |
|
|
5
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:2:1] | failed | 0.00192 seconds |
|
|
6
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:2:2] | failed | 0.00581 seconds |
|
|
7
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:3:1] | failed | 0.00184 seconds |
|
|
8
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:4:1] | passed | 0.00088 seconds |
|
|
9
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:4:2] | passed | 0.00045 seconds |
|
|
10
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:5:1] | passed | 0.00007 seconds |
|
|
11
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:5:2] | passed | 0.00052 seconds |
|
|
12
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:6:1:1] | passed | 0.00012 seconds |
|
|
13
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:6:1:2] | passed | 0.00079 seconds |
|
|
14
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:6:2:1] | passed | 0.00004 seconds |
|
|
15
|
+
./spec/binding/plugins/ocr_backend_spec.rb[1:6:2:2] | passed | 0.00029 seconds |
|
|
16
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:1:1] | failed | 0.00139 seconds |
|
|
17
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:1:2] | failed | 0.00153 seconds |
|
|
18
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:1:3] | failed | 0.0014 seconds |
|
|
19
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:2:1] | failed | 0.00182 seconds |
|
|
20
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:2:2] | failed | 0.00209 seconds |
|
|
21
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:3:1] | failed | 0.00165 seconds |
|
|
22
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:4:1] | failed | 0.00142 seconds |
|
|
23
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:4:2] | failed | 0.00148 seconds |
|
|
24
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:5:1] | failed | 0.00148 seconds |
|
|
25
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:6:1] | passed | 0.0001 seconds |
|
|
26
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:6:2] | passed | 0.00011 seconds |
|
|
27
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:7:1] | passed | 0.00003 seconds |
|
|
28
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:7:2] | passed | 0.00002 seconds |
|
|
29
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:7:3] | passed | 0.00003 seconds |
|
|
30
|
+
./spec/binding/plugins/postprocessor_spec.rb[1:7:4] | passed | 0.00006 seconds |
|
|
31
|
+
./spec/binding/plugins/validator_spec.rb[1:1:1] | failed | 0.00154 seconds |
|
|
32
|
+
./spec/binding/plugins/validator_spec.rb[1:1:2] | failed | 0.00171 seconds |
|
|
33
|
+
./spec/binding/plugins/validator_spec.rb[1:1:3] | passed | 0.00099 seconds |
|
|
34
|
+
./spec/binding/plugins/validator_spec.rb[1:2:1] | failed | 0.00186 seconds |
|
|
35
|
+
./spec/binding/plugins/validator_spec.rb[1:2:2] | failed | 0.0016 seconds |
|
|
36
|
+
./spec/binding/plugins/validator_spec.rb[1:3:1] | failed | 0.00182 seconds |
|
|
37
|
+
./spec/binding/plugins/validator_spec.rb[1:3:2] | failed | 0.0128 seconds |
|
|
38
|
+
./spec/binding/plugins/validator_spec.rb[1:4:1] | failed | 0.00156 seconds |
|
|
39
|
+
./spec/binding/plugins/validator_spec.rb[1:4:2] | passed | 0.0001 seconds |
|
|
40
|
+
./spec/binding/plugins/validator_spec.rb[1:5:1] | failed | 0.00445 seconds |
|
|
41
|
+
./spec/binding/plugins/validator_spec.rb[1:5:2] | failed | 0.00198 seconds |
|
|
42
|
+
./spec/binding/plugins/validator_spec.rb[1:6:1] | failed | 0.00179 seconds |
|
|
43
|
+
./spec/binding/plugins/validator_spec.rb[1:7:1] | passed | 0.00068 seconds |
|
|
44
|
+
./spec/binding/plugins/validator_spec.rb[1:7:2] | passed | 0.00088 seconds |
|
|
45
|
+
./spec/binding/plugins/validator_spec.rb[1:7:3] | passed | 0.00045 seconds |
|
|
46
|
+
./spec/binding/plugins/validator_spec.rb[1:7:4] | passed | 0.00004 seconds |
|
|
47
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/email_spec.rb[1:1] | passed | 0.01048 seconds |
|
|
48
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/html_spec.rb[1:1] | passed | 1.95 seconds |
|
|
49
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/html_spec.rb[1:2] | passed | 0.00031 seconds |
|
|
50
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/image_spec.rb[1:1] | passed | 0.0027 seconds |
|
|
51
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/ocr_spec.rb[1:1] | passed | 0.04721 seconds |
|
|
52
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/ocr_spec.rb[1:2] | passed | 0.04402 seconds |
|
|
53
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/ocr_spec.rb[1:3] | passed | 3.41 seconds |
|
|
54
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/ocr_spec.rb[1:4] | passed | 0.34493 seconds |
|
|
55
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/ocr_spec.rb[1:5] | passed | 0.33223 seconds |
|
|
56
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:1] | passed | 2.74 seconds |
|
|
57
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:2] | passed | 0.00021 seconds |
|
|
58
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:3] | passed | 0.00035 seconds |
|
|
59
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:4] | passed | 0.00021 seconds |
|
|
60
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:5] | passed | 0.0003 seconds |
|
|
61
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:6] | passed | 0.00027 seconds |
|
|
62
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:7] | passed | 0.00023 seconds |
|
|
63
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:8] | passed | 0.00016 seconds |
|
|
64
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:9] | passed | 2.65 seconds |
|
|
65
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:10] | passed | 0.0003 seconds |
|
|
66
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:11] | passed | 0.0002 seconds |
|
|
67
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:12] | passed | 0.00984 seconds |
|
|
68
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:13] | passed | 0.00096 seconds |
|
|
69
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:14] | passed | 0.00115 seconds |
|
|
70
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:15] | passed | 0.00038 seconds |
|
|
71
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/office_spec.rb[1:16] | passed | 0.00448 seconds |
|
|
72
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:1] | passed | 0.99668 seconds |
|
|
73
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:2] | passed | 4.11 seconds |
|
|
74
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:3] | passed | 0.00451 seconds |
|
|
75
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:4] | passed | 0.07588 seconds |
|
|
76
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:5] | passed | 0.00339 seconds |
|
|
77
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:6] | passed | 0.00238 seconds |
|
|
78
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:7] | passed | 0.24683 seconds |
|
|
79
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:8] | passed | 0.07999 seconds |
|
|
80
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:9] | passed | 0.01214 seconds |
|
|
81
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:10] | passed | 0.00095 seconds |
|
|
82
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:11] | passed | 0.03728 seconds |
|
|
83
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:12] | passed | 0.01741 seconds |
|
|
84
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:13] | passed | 0.0036 seconds |
|
|
85
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/pdf_spec.rb[1:14] | passed | 0.89424 seconds |
|
|
86
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[1:1] | passed | 0.00228 seconds |
|
|
87
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[1:2] | passed | 0.0012 seconds |
|
|
88
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[2:1] | passed | 0.0008 seconds |
|
|
89
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[2:2] | passed | 0.00119 seconds |
|
|
90
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[2:3] | passed | 0.0013 seconds |
|
|
91
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[3:1] | passed | 0.00184 seconds |
|
|
92
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[3:2] | passed | 0.00053 seconds |
|
|
93
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[3:3] | passed | 0.00004 seconds |
|
|
94
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[4:1] | passed | 0.00049 seconds |
|
|
95
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[4:2] | passed | 0.00006 seconds |
|
|
96
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[4:3] | passed | 0.00005 seconds |
|
|
97
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[5:1] | passed | 0.00007 seconds |
|
|
98
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[5:2] | passed | 0.00011 seconds |
|
|
99
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[6:1] | passed | 0.00003 seconds |
|
|
100
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/plugin_apis_spec.rb[6:2] | passed | 0.00002 seconds |
|
|
101
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/structured_spec.rb[1:1] | passed | 0.00101 seconds |
|
|
102
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/structured_spec.rb[1:2] | passed | 0.00041 seconds |
|
|
103
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/structured_spec.rb[1:3] | passed | 0.00035 seconds |
|
|
104
|
+
/Users/naamanhirschfeld/workspace/kreuzberg/e2e/ruby/spec/xml_spec.rb[1:1] | passed | 0.00078 seconds |
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Test configuration file for Kreuzberg Ruby bindings
|
|
2
|
+
|
|
3
|
+
use_cache = false
|
|
4
|
+
enable_quality_processing = true
|
|
5
|
+
force_ocr = true
|
|
6
|
+
|
|
7
|
+
[ocr]
|
|
8
|
+
backend = "tesseract"
|
|
9
|
+
language = "deu"
|
|
10
|
+
|
|
11
|
+
[chunking]
|
|
12
|
+
max_chars = 500
|
|
13
|
+
max_overlap = 100
|
|
14
|
+
preset = "fast"
|
|
15
|
+
|
|
16
|
+
[language_detection]
|
|
17
|
+
enabled = true
|
|
18
|
+
min_confidence = 0.9
|
|
19
|
+
|
|
20
|
+
[pdf_options]
|
|
21
|
+
extract_images = true
|
|
22
|
+
passwords = ["secret", "backup"]
|
|
23
|
+
extract_metadata = true
|
|
24
|
+
|
|
25
|
+
[images]
|
|
26
|
+
extract_images = true
|
|
27
|
+
target_dpi = 600
|
|
28
|
+
max_image_dimension = 2000
|
|
29
|
+
auto_adjust_dpi = false
|
|
30
|
+
min_dpi = 150
|
|
31
|
+
max_dpi = 600
|
|
32
|
+
|
|
33
|
+
[postprocessor]
|
|
34
|
+
enabled = true
|
|
35
|
+
enabled_processors = ["quality", "formatting"]
|
|
36
|
+
|
|
37
|
+
[token_reduction]
|
|
38
|
+
mode = "moderate"
|
|
39
|
+
preserve_important_words = true
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Test configuration file for Kreuzberg Ruby bindings
|
|
2
|
+
|
|
3
|
+
use_cache: false
|
|
4
|
+
enable_quality_processing: true
|
|
5
|
+
force_ocr: true
|
|
6
|
+
|
|
7
|
+
ocr:
|
|
8
|
+
backend: tesseract
|
|
9
|
+
language: fra
|
|
10
|
+
|
|
11
|
+
chunking:
|
|
12
|
+
max_chars: 750
|
|
13
|
+
max_overlap: 150
|
|
14
|
+
preset: balanced
|
|
15
|
+
|
|
16
|
+
language_detection:
|
|
17
|
+
enabled: true
|
|
18
|
+
min_confidence: 0.85
|
|
19
|
+
|
|
20
|
+
pdf_options:
|
|
21
|
+
extract_images: false
|
|
22
|
+
passwords:
|
|
23
|
+
- password1
|
|
24
|
+
- password2
|
|
25
|
+
extract_metadata: true
|
|
26
|
+
|
|
27
|
+
images:
|
|
28
|
+
extract_images: true
|
|
29
|
+
target_dpi: 300
|
|
30
|
+
max_image_dimension: 4096
|
|
31
|
+
auto_adjust_dpi: true
|
|
32
|
+
min_dpi: 72
|
|
33
|
+
max_dpi: 600
|
|
34
|
+
|
|
35
|
+
postprocessor:
|
|
36
|
+
enabled: false
|
|
37
|
+
disabled_processors:
|
|
38
|
+
- token_reduction
|
|
39
|
+
|
|
40
|
+
token_reduction:
|
|
41
|
+
mode: light
|
|
42
|
+
preserve_important_words: false
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Basic smoke tests to verify package structure and imports work
|
|
4
|
+
require 'stringio'
|
|
5
|
+
|
|
6
|
+
RSpec.describe 'Kreuzberg package' do
|
|
7
|
+
describe 'import and structure' do
|
|
8
|
+
it 'can be required without errors' do
|
|
9
|
+
expect { require 'kreuzberg' }.not_to raise_error
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
it 'has a version constant' do
|
|
13
|
+
expect(Kreuzberg::VERSION).not_to be_nil
|
|
14
|
+
expect(Kreuzberg::VERSION).to be_a(String)
|
|
15
|
+
expect(Kreuzberg::VERSION).to match(/^\d+\.\d+\.\d+/)
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
describe 'public API exports' do
|
|
20
|
+
describe 'configuration classes' do
|
|
21
|
+
it 'exports Config::Extraction' do
|
|
22
|
+
expect(defined?(Kreuzberg::Config::Extraction)).to eq('constant')
|
|
23
|
+
end
|
|
24
|
+
|
|
25
|
+
it 'exports Config::OCR' do
|
|
26
|
+
expect(defined?(Kreuzberg::Config::OCR)).to eq('constant')
|
|
27
|
+
end
|
|
28
|
+
|
|
29
|
+
it 'exports Config::Chunking' do
|
|
30
|
+
expect(defined?(Kreuzberg::Config::Chunking)).to eq('constant')
|
|
31
|
+
end
|
|
32
|
+
|
|
33
|
+
it 'exports Config::LanguageDetection' do
|
|
34
|
+
expect(defined?(Kreuzberg::Config::LanguageDetection)).to eq('constant')
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
it 'exports Config::PDF' do
|
|
38
|
+
expect(defined?(Kreuzberg::Config::PDF)).to eq('constant')
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
it 'exports Config::HtmlOptions' do
|
|
42
|
+
expect(defined?(Kreuzberg::Config::HtmlOptions)).to eq('constant')
|
|
43
|
+
end
|
|
44
|
+
|
|
45
|
+
it 'exports Config::Keywords' do
|
|
46
|
+
expect(defined?(Kreuzberg::Config::Keywords)).to eq('constant')
|
|
47
|
+
end
|
|
48
|
+
end
|
|
49
|
+
|
|
50
|
+
describe 'result classes' do
|
|
51
|
+
it 'exports Result' do
|
|
52
|
+
expect(defined?(Kreuzberg::Result)).to eq('constant')
|
|
53
|
+
end
|
|
54
|
+
|
|
55
|
+
it 'exports Result::Table' do
|
|
56
|
+
expect(defined?(Kreuzberg::Result::Table)).to eq('constant')
|
|
57
|
+
end
|
|
58
|
+
|
|
59
|
+
it 'exports Result::Chunk' do
|
|
60
|
+
expect(defined?(Kreuzberg::Result::Chunk)).to eq('constant')
|
|
61
|
+
end
|
|
62
|
+
|
|
63
|
+
it 'exports Result::Image' do
|
|
64
|
+
expect(defined?(Kreuzberg::Result::Image)).to eq('constant')
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
|
|
68
|
+
describe 'exception classes' do
|
|
69
|
+
it 'exports Errors::Error' do
|
|
70
|
+
expect(defined?(Kreuzberg::Errors::Error)).to eq('constant')
|
|
71
|
+
end
|
|
72
|
+
|
|
73
|
+
it 'exports Errors::ValidationError' do
|
|
74
|
+
expect(defined?(Kreuzberg::Errors::ValidationError)).to eq('constant')
|
|
75
|
+
end
|
|
76
|
+
|
|
77
|
+
it 'exports Errors::ParsingError' do
|
|
78
|
+
expect(defined?(Kreuzberg::Errors::ParsingError)).to eq('constant')
|
|
79
|
+
end
|
|
80
|
+
|
|
81
|
+
it 'exports Errors::OCRError' do
|
|
82
|
+
expect(defined?(Kreuzberg::Errors::OCRError)).to eq('constant')
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
it 'exports Errors::MissingDependencyError' do
|
|
86
|
+
expect(defined?(Kreuzberg::Errors::MissingDependencyError)).to eq('constant')
|
|
87
|
+
end
|
|
88
|
+
|
|
89
|
+
it 'exports Errors::IOError' do
|
|
90
|
+
expect(defined?(Kreuzberg::Errors::IOError)).to eq('constant')
|
|
91
|
+
end
|
|
92
|
+
|
|
93
|
+
it 'exports Errors::PluginError' do
|
|
94
|
+
expect(defined?(Kreuzberg::Errors::PluginError)).to eq('constant')
|
|
95
|
+
end
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
describe 'extraction functions (sync)' do
|
|
99
|
+
it 'exports extract_file_sync' do
|
|
100
|
+
expect(Kreuzberg).to respond_to(:extract_file_sync)
|
|
101
|
+
end
|
|
102
|
+
|
|
103
|
+
it 'exports extract_bytes_sync' do
|
|
104
|
+
expect(Kreuzberg).to respond_to(:extract_bytes_sync)
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
it 'exports batch_extract_files_sync' do
|
|
108
|
+
expect(Kreuzberg).to respond_to(:batch_extract_files_sync)
|
|
109
|
+
end
|
|
110
|
+
end
|
|
111
|
+
|
|
112
|
+
describe 'extraction functions (async)' do
|
|
113
|
+
it 'exports extract_file' do
|
|
114
|
+
expect(Kreuzberg).to respond_to(:extract_file)
|
|
115
|
+
end
|
|
116
|
+
|
|
117
|
+
it 'exports extract_bytes' do
|
|
118
|
+
expect(Kreuzberg).to respond_to(:extract_bytes)
|
|
119
|
+
end
|
|
120
|
+
|
|
121
|
+
it 'exports batch_extract_files' do
|
|
122
|
+
expect(Kreuzberg).to respond_to(:batch_extract_files)
|
|
123
|
+
end
|
|
124
|
+
end
|
|
125
|
+
|
|
126
|
+
describe 'utility modules' do
|
|
127
|
+
it 'exports CLI' do
|
|
128
|
+
expect(defined?(Kreuzberg::CLI)).to eq('constant')
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
it 'exports CLIProxy' do
|
|
132
|
+
expect(defined?(Kreuzberg::CLIProxy)).to eq('constant')
|
|
133
|
+
end
|
|
134
|
+
|
|
135
|
+
it 'exports APIProxy' do
|
|
136
|
+
expect(defined?(Kreuzberg::APIProxy)).to eq('constant')
|
|
137
|
+
end
|
|
138
|
+
|
|
139
|
+
it 'exports MCPProxy' do
|
|
140
|
+
expect(defined?(Kreuzberg::MCPProxy)).to eq('constant')
|
|
141
|
+
end
|
|
142
|
+
end
|
|
143
|
+
end
|
|
144
|
+
|
|
145
|
+
describe 'module structure' do
|
|
146
|
+
it 'defines Kreuzberg as a module' do
|
|
147
|
+
expect(Kreuzberg).to be_a(Module)
|
|
148
|
+
end
|
|
149
|
+
|
|
150
|
+
it 'defines Kreuzberg::Config as a module' do
|
|
151
|
+
expect(Kreuzberg::Config).to be_a(Module)
|
|
152
|
+
end
|
|
153
|
+
|
|
154
|
+
it 'defines Kreuzberg::Errors as a module' do
|
|
155
|
+
expect(Kreuzberg::Errors).to be_a(Module)
|
|
156
|
+
end
|
|
157
|
+
end
|
|
158
|
+
|
|
159
|
+
describe 'basic extraction smoke tests' do
|
|
160
|
+
it 'extracts inline text via bytes API' do
|
|
161
|
+
bytes = StringIO.new('Hello from Kreuzberg')
|
|
162
|
+
result = Kreuzberg.extract_bytes_sync(bytes.string, 'text/plain')
|
|
163
|
+
|
|
164
|
+
expect(result.content).to include('Hello')
|
|
165
|
+
expect(result.mime_type).to eq('text/plain')
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
it 'extracts from small temp file via sync API' do
|
|
169
|
+
file = create_test_file('Simple document for smoke testing')
|
|
170
|
+
result = Kreuzberg.extract_file_sync(file)
|
|
171
|
+
|
|
172
|
+
expect(result.content).to include('Simple document')
|
|
173
|
+
expect(result.mime_type).to eq('text/plain')
|
|
174
|
+
ensure
|
|
175
|
+
File.delete(file) if file && File.exist?(file)
|
|
176
|
+
end
|
|
177
|
+
end
|
|
178
|
+
end
|