kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
data/lib/kreuzberg/config.rb
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require 'json'
|
|
4
|
-
|
|
5
3
|
module Kreuzberg
|
|
6
4
|
module Config
|
|
5
|
+
# OCR configuration
|
|
6
|
+
#
|
|
7
7
|
# @example
|
|
8
|
+
# ocr = OCR.new(backend: "tesseract", language: "eng")
|
|
9
|
+
#
|
|
8
10
|
class OCR
|
|
9
11
|
attr_reader :backend, :language, :tesseract_config
|
|
10
12
|
|
|
@@ -37,7 +39,6 @@ module Kreuzberg
|
|
|
37
39
|
end
|
|
38
40
|
end
|
|
39
41
|
|
|
40
|
-
# Tesseract OCR engine configuration
|
|
41
42
|
class Tesseract
|
|
42
43
|
attr_reader :options
|
|
43
44
|
|
|
@@ -71,7 +72,6 @@ module Kreuzberg
|
|
|
71
72
|
class Chunking
|
|
72
73
|
attr_reader :max_chars, :max_overlap, :preset, :embedding, :enabled
|
|
73
74
|
|
|
74
|
-
# rubocop:disable Metrics/CyclomaticComplexity
|
|
75
75
|
def initialize(
|
|
76
76
|
max_chars: nil,
|
|
77
77
|
max_overlap: nil,
|
|
@@ -81,17 +81,11 @@ module Kreuzberg
|
|
|
81
81
|
chunk_overlap: nil,
|
|
82
82
|
enabled: true
|
|
83
83
|
)
|
|
84
|
-
# rubocop:enable Metrics/CyclomaticComplexity
|
|
85
84
|
resolved_size = chunk_size || max_chars || 1000
|
|
86
85
|
resolved_overlap = chunk_overlap || max_overlap || 200
|
|
87
86
|
|
|
88
87
|
@max_chars = resolved_size.to_i
|
|
89
88
|
@max_overlap = resolved_overlap.to_i
|
|
90
|
-
|
|
91
|
-
# Validate positive values
|
|
92
|
-
raise ArgumentError, "max_chars must be a positive integer, got #{@max_chars}" if @max_chars.negative?
|
|
93
|
-
raise ArgumentError, "max_overlap must be a positive integer, got #{@max_overlap}" if @max_overlap.negative?
|
|
94
|
-
|
|
95
89
|
@preset = preset&.to_s
|
|
96
90
|
@embedding = normalize_embedding(embedding)
|
|
97
91
|
@enabled = boolean_or_nil(enabled)
|
|
@@ -126,7 +120,6 @@ module Kreuzberg
|
|
|
126
120
|
end
|
|
127
121
|
end
|
|
128
122
|
|
|
129
|
-
# Embedding model configuration for document chunking
|
|
130
123
|
class Embedding
|
|
131
124
|
attr_reader :model, :normalize, :batch_size, :show_download_progress, :cache_dir
|
|
132
125
|
|
|
@@ -197,86 +190,18 @@ module Kreuzberg
|
|
|
197
190
|
end
|
|
198
191
|
end
|
|
199
192
|
|
|
200
|
-
# Font configuration for PDF rendering
|
|
201
|
-
#
|
|
202
|
-
# @example
|
|
203
|
-
# font_config = FontConfig.new(enabled: true, custom_font_dirs: ["/usr/share/fonts"])
|
|
204
|
-
#
|
|
205
|
-
class FontConfig
|
|
206
|
-
attr_accessor :enabled, :custom_font_dirs
|
|
207
|
-
|
|
208
|
-
def initialize(enabled: true, custom_font_dirs: nil)
|
|
209
|
-
@enabled = enabled ? true : false
|
|
210
|
-
@custom_font_dirs = custom_font_dirs
|
|
211
|
-
end
|
|
212
|
-
|
|
213
|
-
def to_h
|
|
214
|
-
{
|
|
215
|
-
enabled: @enabled,
|
|
216
|
-
custom_font_dirs: @custom_font_dirs
|
|
217
|
-
}.compact
|
|
218
|
-
end
|
|
219
|
-
end
|
|
220
|
-
|
|
221
|
-
# Hierarchy detection configuration
|
|
222
|
-
#
|
|
223
|
-
# @example
|
|
224
|
-
# hierarchy = Hierarchy.new(enabled: true, k_clusters: 6, include_bbox: true)
|
|
225
|
-
#
|
|
226
|
-
class Hierarchy
|
|
227
|
-
attr_reader :enabled, :k_clusters, :include_bbox, :ocr_coverage_threshold
|
|
228
|
-
|
|
229
|
-
def initialize(
|
|
230
|
-
enabled: true,
|
|
231
|
-
k_clusters: 6,
|
|
232
|
-
include_bbox: true,
|
|
233
|
-
ocr_coverage_threshold: nil
|
|
234
|
-
)
|
|
235
|
-
@enabled = enabled ? true : false
|
|
236
|
-
@k_clusters = k_clusters&.to_i || 6
|
|
237
|
-
@include_bbox = include_bbox ? true : false
|
|
238
|
-
@ocr_coverage_threshold = ocr_coverage_threshold&.to_f
|
|
239
|
-
end
|
|
240
|
-
|
|
241
|
-
def to_h
|
|
242
|
-
{
|
|
243
|
-
enabled: @enabled,
|
|
244
|
-
k_clusters: @k_clusters,
|
|
245
|
-
include_bbox: @include_bbox,
|
|
246
|
-
ocr_coverage_threshold: @ocr_coverage_threshold
|
|
247
|
-
}.compact
|
|
248
|
-
end
|
|
249
|
-
|
|
250
|
-
def self.from_h(hash)
|
|
251
|
-
return nil if hash.nil?
|
|
252
|
-
return hash if hash.is_a?(self)
|
|
253
|
-
|
|
254
|
-
new(**hash.transform_keys(&:to_sym)) if hash.is_a?(Hash)
|
|
255
|
-
end
|
|
256
|
-
end
|
|
257
|
-
|
|
258
193
|
# PDF-specific options
|
|
259
194
|
#
|
|
260
195
|
# @example
|
|
261
196
|
# pdf = PDF.new(extract_images: true, passwords: ["secret", "backup"])
|
|
262
197
|
#
|
|
263
|
-
# @example With font configuration
|
|
264
|
-
# font_config = FontConfig.new(enabled: true, custom_font_dirs: ["/usr/share/fonts"])
|
|
265
|
-
# pdf = PDF.new(extract_images: true, font_config: font_config)
|
|
266
|
-
#
|
|
267
|
-
# @example With hierarchy configuration
|
|
268
|
-
# hierarchy = Hierarchy.new(enabled: true, k_clusters: 6)
|
|
269
|
-
# pdf = PDF.new(extract_images: true, hierarchy: hierarchy)
|
|
270
|
-
#
|
|
271
198
|
class PDF
|
|
272
|
-
attr_reader :extract_images, :passwords, :extract_metadata
|
|
199
|
+
attr_reader :extract_images, :passwords, :extract_metadata
|
|
273
200
|
|
|
274
201
|
def initialize(
|
|
275
202
|
extract_images: false,
|
|
276
203
|
passwords: nil,
|
|
277
|
-
extract_metadata: true
|
|
278
|
-
font_config: nil,
|
|
279
|
-
hierarchy: nil
|
|
204
|
+
extract_metadata: true
|
|
280
205
|
)
|
|
281
206
|
@extract_images = extract_images ? true : false
|
|
282
207
|
@passwords = if passwords.is_a?(Array)
|
|
@@ -285,45 +210,15 @@ module Kreuzberg
|
|
|
285
210
|
(passwords ? [passwords.to_s] : nil)
|
|
286
211
|
end
|
|
287
212
|
@extract_metadata = extract_metadata ? true : false
|
|
288
|
-
@font_config = normalize_font_config(font_config)
|
|
289
|
-
@hierarchy = normalize_hierarchy(hierarchy)
|
|
290
213
|
end
|
|
291
214
|
|
|
292
215
|
def to_h
|
|
293
216
|
{
|
|
294
217
|
extract_images: @extract_images,
|
|
295
218
|
passwords: @passwords,
|
|
296
|
-
extract_metadata: @extract_metadata
|
|
297
|
-
font_config: @font_config&.to_h,
|
|
298
|
-
hierarchy: @hierarchy&.to_h
|
|
219
|
+
extract_metadata: @extract_metadata
|
|
299
220
|
}.compact
|
|
300
221
|
end
|
|
301
|
-
|
|
302
|
-
def font_config=(value)
|
|
303
|
-
@font_config = normalize_font_config(value)
|
|
304
|
-
end
|
|
305
|
-
|
|
306
|
-
def hierarchy=(value)
|
|
307
|
-
@hierarchy = normalize_hierarchy(value)
|
|
308
|
-
end
|
|
309
|
-
|
|
310
|
-
private
|
|
311
|
-
|
|
312
|
-
def normalize_font_config(value)
|
|
313
|
-
return nil if value.nil?
|
|
314
|
-
return value if value.is_a?(FontConfig)
|
|
315
|
-
return FontConfig.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)
|
|
316
|
-
|
|
317
|
-
raise ArgumentError, "Expected #{FontConfig}, Hash, or nil, got #{value.class}"
|
|
318
|
-
end
|
|
319
|
-
|
|
320
|
-
def normalize_hierarchy(value)
|
|
321
|
-
return nil if value.nil?
|
|
322
|
-
return value if value.is_a?(Hierarchy)
|
|
323
|
-
return Hierarchy.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)
|
|
324
|
-
|
|
325
|
-
raise ArgumentError, "Expected #{Hierarchy}, Hash, or nil, got #{value.class}"
|
|
326
|
-
end
|
|
327
222
|
end
|
|
328
223
|
|
|
329
224
|
# Image extraction configuration
|
|
@@ -394,8 +289,6 @@ module Kreuzberg
|
|
|
394
289
|
attr_reader :target_dpi, :auto_rotate, :deskew, :denoise,
|
|
395
290
|
:contrast_enhance, :binarization_method, :invert_colors
|
|
396
291
|
|
|
397
|
-
VALID_BINARIZATION_METHODS = %w[otsu sauvola niblack wolf bradley adaptive].freeze
|
|
398
|
-
|
|
399
292
|
def initialize(
|
|
400
293
|
target_dpi: 300,
|
|
401
294
|
auto_rotate: true,
|
|
@@ -413,12 +306,10 @@ module Kreuzberg
|
|
|
413
306
|
@binarization_method = binarization_method.to_s
|
|
414
307
|
@invert_colors = invert_colors ? true : false
|
|
415
308
|
|
|
416
|
-
|
|
417
|
-
return if
|
|
309
|
+
valid_methods = %w[otsu sauvola adaptive]
|
|
310
|
+
return if valid_methods.include?(@binarization_method)
|
|
418
311
|
|
|
419
|
-
|
|
420
|
-
raise ArgumentError,
|
|
421
|
-
"Invalid binarization_method: #{@binarization_method}. Valid methods are: #{valid_methods}"
|
|
312
|
+
raise ArgumentError, "binarization_method must be one of: #{valid_methods.join(', ')}"
|
|
422
313
|
end
|
|
423
314
|
|
|
424
315
|
def to_h
|
|
@@ -448,16 +339,14 @@ module Kreuzberg
|
|
|
448
339
|
class TokenReduction
|
|
449
340
|
attr_reader :mode, :preserve_important_words
|
|
450
341
|
|
|
451
|
-
VALID_MODES = %w[off light moderate aggressive maximum].freeze
|
|
452
|
-
|
|
453
342
|
def initialize(mode: 'off', preserve_important_words: true)
|
|
454
343
|
@mode = mode.to_s
|
|
455
344
|
@preserve_important_words = preserve_important_words ? true : false
|
|
456
345
|
|
|
457
|
-
|
|
458
|
-
return if
|
|
346
|
+
valid_modes = %w[off light moderate aggressive maximum]
|
|
347
|
+
return if valid_modes.include?(@mode)
|
|
459
348
|
|
|
460
|
-
raise ArgumentError, "
|
|
349
|
+
raise ArgumentError, "mode must be one of: #{valid_modes.join(', ')}"
|
|
461
350
|
end
|
|
462
351
|
|
|
463
352
|
def to_h
|
|
@@ -468,7 +357,6 @@ module Kreuzberg
|
|
|
468
357
|
end
|
|
469
358
|
end
|
|
470
359
|
|
|
471
|
-
# HTML preprocessing configuration for content extraction
|
|
472
360
|
class HtmlPreprocessing
|
|
473
361
|
attr_reader :enabled, :preset, :remove_navigation, :remove_forms
|
|
474
362
|
|
|
@@ -497,7 +385,6 @@ module Kreuzberg
|
|
|
497
385
|
end
|
|
498
386
|
end
|
|
499
387
|
|
|
500
|
-
# HTML rendering options for document conversion
|
|
501
388
|
class HtmlOptions
|
|
502
389
|
attr_reader :options
|
|
503
390
|
|
|
@@ -525,7 +412,6 @@ module Kreuzberg
|
|
|
525
412
|
end
|
|
526
413
|
end
|
|
527
414
|
|
|
528
|
-
# YAKE keyword extraction parameters
|
|
529
415
|
class KeywordYakeParams
|
|
530
416
|
attr_reader :window_size
|
|
531
417
|
|
|
@@ -538,7 +424,6 @@ module Kreuzberg
|
|
|
538
424
|
end
|
|
539
425
|
end
|
|
540
426
|
|
|
541
|
-
# RAKE keyword extraction parameters
|
|
542
427
|
class KeywordRakeParams
|
|
543
428
|
attr_reader :min_word_length, :max_words_per_phrase
|
|
544
429
|
|
|
@@ -555,7 +440,6 @@ module Kreuzberg
|
|
|
555
440
|
end
|
|
556
441
|
end
|
|
557
442
|
|
|
558
|
-
# Keyword extraction configuration for document analysis
|
|
559
443
|
class Keywords
|
|
560
444
|
attr_reader :algorithm, :max_keywords, :min_score, :ngram_range,
|
|
561
445
|
:language, :yake_params, :rake_params
|
|
@@ -601,36 +485,6 @@ module Kreuzberg
|
|
|
601
485
|
end
|
|
602
486
|
end
|
|
603
487
|
|
|
604
|
-
# Page tracking configuration for multi-page documents
|
|
605
|
-
#
|
|
606
|
-
# @example Enable page extraction
|
|
607
|
-
# pages = PageConfig.new(extract_pages: true)
|
|
608
|
-
#
|
|
609
|
-
# @example Enable page markers in content
|
|
610
|
-
# pages = PageConfig.new(insert_page_markers: true, marker_format: "--- PAGE {page_num} ---")
|
|
611
|
-
#
|
|
612
|
-
class PageConfig
|
|
613
|
-
attr_reader :extract_pages, :insert_page_markers, :marker_format
|
|
614
|
-
|
|
615
|
-
def initialize(
|
|
616
|
-
extract_pages: false,
|
|
617
|
-
insert_page_markers: false,
|
|
618
|
-
marker_format: "\n\n<!-- PAGE {page_num} -->\n\n"
|
|
619
|
-
)
|
|
620
|
-
@extract_pages = extract_pages ? true : false
|
|
621
|
-
@insert_page_markers = insert_page_markers ? true : false
|
|
622
|
-
@marker_format = marker_format.to_s
|
|
623
|
-
end
|
|
624
|
-
|
|
625
|
-
def to_h
|
|
626
|
-
{
|
|
627
|
-
extract_pages: @extract_pages,
|
|
628
|
-
insert_page_markers: @insert_page_markers,
|
|
629
|
-
marker_format: @marker_format
|
|
630
|
-
}
|
|
631
|
-
end
|
|
632
|
-
end
|
|
633
|
-
|
|
634
488
|
# Post-processor configuration
|
|
635
489
|
#
|
|
636
490
|
# @example Enable all post-processors
|
|
@@ -715,7 +569,7 @@ module Kreuzberg
|
|
|
715
569
|
attr_reader :use_cache, :enable_quality_processing, :force_ocr,
|
|
716
570
|
:ocr, :chunking, :language_detection, :pdf_options,
|
|
717
571
|
:image_extraction, :image_preprocessing, :postprocessor,
|
|
718
|
-
:token_reduction, :keywords, :html_options,
|
|
572
|
+
:token_reduction, :keywords, :html_options,
|
|
719
573
|
:max_concurrent_extractions
|
|
720
574
|
|
|
721
575
|
# Load configuration from a file.
|
|
@@ -734,6 +588,7 @@ module Kreuzberg
|
|
|
734
588
|
#
|
|
735
589
|
def self.from_file(path)
|
|
736
590
|
hash = Kreuzberg._config_from_file_native(path)
|
|
591
|
+
# Convert string keys to symbols for keyword arguments
|
|
737
592
|
new(**hash.transform_keys(&:to_sym))
|
|
738
593
|
end
|
|
739
594
|
|
|
@@ -754,6 +609,7 @@ module Kreuzberg
|
|
|
754
609
|
hash = Kreuzberg._config_discover_native
|
|
755
610
|
return nil if hash.nil?
|
|
756
611
|
|
|
612
|
+
# Convert string keys to symbols for keyword arguments
|
|
757
613
|
new(**hash.transform_keys(&:to_sym))
|
|
758
614
|
end
|
|
759
615
|
|
|
@@ -771,7 +627,6 @@ module Kreuzberg
|
|
|
771
627
|
token_reduction: nil,
|
|
772
628
|
keywords: nil,
|
|
773
629
|
html_options: nil,
|
|
774
|
-
pages: nil,
|
|
775
630
|
max_concurrent_extractions: nil
|
|
776
631
|
)
|
|
777
632
|
@use_cache = use_cache ? true : false
|
|
@@ -787,11 +642,10 @@ module Kreuzberg
|
|
|
787
642
|
@token_reduction = normalize_config(token_reduction, TokenReduction)
|
|
788
643
|
@keywords = normalize_config(keywords, Keywords)
|
|
789
644
|
@html_options = normalize_config(html_options, HtmlOptions)
|
|
790
|
-
@pages = normalize_config(pages, PageConfig)
|
|
791
645
|
@max_concurrent_extractions = max_concurrent_extractions&.to_i
|
|
792
646
|
end
|
|
793
647
|
|
|
794
|
-
# rubocop:disable Metrics/
|
|
648
|
+
# rubocop:disable Metrics/PerceivedComplexity
|
|
795
649
|
def to_h
|
|
796
650
|
{
|
|
797
651
|
use_cache: @use_cache,
|
|
@@ -807,130 +661,24 @@ module Kreuzberg
|
|
|
807
661
|
token_reduction: @token_reduction&.to_h,
|
|
808
662
|
keywords: @keywords&.to_h,
|
|
809
663
|
html_options: @html_options&.to_h,
|
|
810
|
-
pages: @pages&.to_h,
|
|
811
664
|
max_concurrent_extractions: @max_concurrent_extractions
|
|
812
665
|
}.compact
|
|
813
666
|
end
|
|
814
|
-
# rubocop:enable Metrics/
|
|
815
|
-
|
|
816
|
-
# Serialize configuration to JSON string
|
|
817
|
-
#
|
|
818
|
-
# @return [String] JSON representation of the configuration
|
|
819
|
-
#
|
|
820
|
-
# @example
|
|
821
|
-
# config = Extraction.new(use_cache: true)
|
|
822
|
-
# json = config.to_json
|
|
823
|
-
# puts json # => "{\"use_cache\":true,...}"
|
|
824
|
-
#
|
|
825
|
-
def to_json(*_args)
|
|
826
|
-
json_hash = to_h
|
|
827
|
-
# Convert to JSON directly - the native function has issues
|
|
828
|
-
JSON.generate(json_hash)
|
|
829
|
-
end
|
|
830
|
-
|
|
831
|
-
# Get a field from the configuration
|
|
832
|
-
#
|
|
833
|
-
# Supports dot notation for nested fields (e.g., "ocr.backend")
|
|
834
|
-
#
|
|
835
|
-
# @param field_name [String, Symbol] Field name to retrieve
|
|
836
|
-
# @return [Object, nil] Parsed field value, or nil if field doesn't exist
|
|
837
|
-
#
|
|
838
|
-
# @example Get a top-level field
|
|
839
|
-
# config = Extraction.new(use_cache: true)
|
|
840
|
-
# config.get_field("use_cache") # => true
|
|
841
|
-
#
|
|
842
|
-
# @example Get a nested field
|
|
843
|
-
# config = Extraction.new(ocr: OCR.new(backend: "tesseract"))
|
|
844
|
-
# config.get_field("ocr.backend") # => "tesseract"
|
|
845
|
-
#
|
|
846
|
-
def get_field(field_name)
|
|
847
|
-
json_hash = to_h
|
|
848
|
-
field_path = field_name.to_s.split('.')
|
|
849
|
-
|
|
850
|
-
# Navigate the nested hash using the field path
|
|
851
|
-
field_path.reduce(json_hash) do |current, key|
|
|
852
|
-
case current
|
|
853
|
-
when Hash
|
|
854
|
-
# Check both symbol and string keys, prefer symbol if exists
|
|
855
|
-
if current.key?(key.to_sym)
|
|
856
|
-
current[key.to_sym]
|
|
857
|
-
elsif current.key?(key.to_s)
|
|
858
|
-
current[key.to_s]
|
|
859
|
-
end
|
|
860
|
-
end
|
|
861
|
-
end
|
|
862
|
-
end
|
|
863
|
-
|
|
864
|
-
# Merge another configuration into this one
|
|
865
|
-
#
|
|
866
|
-
# Returns a new configuration with fields from the other config overriding
|
|
867
|
-
# fields from this config (shallow merge).
|
|
868
|
-
#
|
|
869
|
-
# @param other [Extraction, Hash] Configuration to merge
|
|
870
|
-
# @return [Extraction] New merged configuration
|
|
871
|
-
#
|
|
872
|
-
# @example
|
|
873
|
-
# base = Extraction.new(use_cache: true, force_ocr: false)
|
|
874
|
-
# override = Extraction.new(force_ocr: true)
|
|
875
|
-
# merged = base.merge(override)
|
|
876
|
-
# merged.use_cache # => true
|
|
877
|
-
# merged.force_ocr # => true
|
|
878
|
-
#
|
|
879
|
-
def merge(other)
|
|
880
|
-
other_config = other.is_a?(Extraction) ? other : Extraction.new(**other)
|
|
881
|
-
# Merge the two config hashes
|
|
882
|
-
merged_hash = to_h.merge(other_config.to_h)
|
|
883
|
-
Extraction.new(**merged_hash)
|
|
884
|
-
end
|
|
885
|
-
|
|
886
|
-
# Merge another configuration into this one (mutating)
|
|
887
|
-
#
|
|
888
|
-
# Modifies this configuration in-place by merging fields from another config.
|
|
889
|
-
#
|
|
890
|
-
# @param other [Extraction, Hash] Configuration to merge
|
|
891
|
-
# @return [self]
|
|
892
|
-
#
|
|
893
|
-
# @example
|
|
894
|
-
# base = Extraction.new(use_cache: true, force_ocr: false)
|
|
895
|
-
# override = Extraction.new(force_ocr: true)
|
|
896
|
-
# base.merge!(override)
|
|
897
|
-
# base.use_cache # => true
|
|
898
|
-
# base.force_ocr # => true
|
|
899
|
-
#
|
|
900
|
-
def merge!(other)
|
|
901
|
-
other_config = other.is_a?(Extraction) ? other : Extraction.new(**other)
|
|
902
|
-
merged = merge(other_config)
|
|
903
|
-
update_from_merged(merged)
|
|
904
|
-
self
|
|
905
|
-
end
|
|
667
|
+
# rubocop:enable Metrics/PerceivedComplexity
|
|
906
668
|
|
|
907
669
|
private
|
|
908
670
|
|
|
909
671
|
def normalize_config(value, klass)
|
|
910
672
|
return nil if value.nil?
|
|
911
673
|
return value if value.is_a?(klass)
|
|
674
|
+
# Convert string keys to symbols for keyword arguments
|
|
912
675
|
return klass.new(**value.transform_keys(&:to_sym)) if value.is_a?(Hash)
|
|
913
676
|
|
|
914
677
|
raise ArgumentError, "Expected #{klass}, Hash, or nil, got #{value.class}"
|
|
915
678
|
end
|
|
916
|
-
|
|
917
|
-
def update_from_merged(merged)
|
|
918
|
-
@use_cache = merged.use_cache
|
|
919
|
-
@enable_quality_processing = merged.enable_quality_processing
|
|
920
|
-
@force_ocr = merged.force_ocr
|
|
921
|
-
@ocr = merged.ocr
|
|
922
|
-
@chunking = merged.chunking
|
|
923
|
-
@language_detection = merged.language_detection
|
|
924
|
-
@pdf_options = merged.pdf_options
|
|
925
|
-
@image_extraction = merged.image_extraction
|
|
926
|
-
@image_preprocessing = merged.image_preprocessing
|
|
927
|
-
@postprocessor = merged.postprocessor
|
|
928
|
-
@token_reduction = merged.token_reduction
|
|
929
|
-
@keywords = merged.keywords
|
|
930
|
-
@html_options = merged.html_options
|
|
931
|
-
@pages = merged.pages
|
|
932
|
-
@max_concurrent_extractions = merged.max_concurrent_extractions
|
|
933
|
-
end
|
|
934
679
|
end
|
|
680
|
+
|
|
681
|
+
# Backwards compatibility aliases
|
|
682
|
+
Ocr = OCR
|
|
935
683
|
end
|
|
936
684
|
end
|
data/lib/kreuzberg/errors.rb
CHANGED
|
@@ -1,75 +1,9 @@
|
|
|
1
1
|
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
require 'json'
|
|
4
|
-
|
|
5
3
|
module Kreuzberg
|
|
6
|
-
ERROR_CODE_SUCCESS = 0
|
|
7
|
-
ERROR_CODE_GENERIC = 1
|
|
8
|
-
ERROR_CODE_PANIC = 2
|
|
9
|
-
ERROR_CODE_INVALID_ARGUMENT = 3
|
|
10
|
-
ERROR_CODE_IO = 4
|
|
11
|
-
ERROR_CODE_PARSING = 5
|
|
12
|
-
ERROR_CODE_OCR = 6
|
|
13
|
-
ERROR_CODE_MISSING_DEPENDENCY = 7
|
|
14
|
-
|
|
15
4
|
module Errors
|
|
16
|
-
class PanicContext
|
|
17
|
-
attr_reader :file, :line, :function, :message, :timestamp_secs
|
|
18
|
-
|
|
19
|
-
def initialize(file:, line:, function:, message:, timestamp_secs:)
|
|
20
|
-
@file = file
|
|
21
|
-
@line = line
|
|
22
|
-
@function = function
|
|
23
|
-
@message = message
|
|
24
|
-
@timestamp_secs = timestamp_secs
|
|
25
|
-
end
|
|
26
|
-
|
|
27
|
-
def to_s
|
|
28
|
-
"#{file}:#{line}:#{function}: #{message}"
|
|
29
|
-
end
|
|
30
|
-
|
|
31
|
-
def to_h
|
|
32
|
-
{
|
|
33
|
-
file:,
|
|
34
|
-
line:,
|
|
35
|
-
function:,
|
|
36
|
-
message:,
|
|
37
|
-
timestamp_secs:
|
|
38
|
-
}
|
|
39
|
-
end
|
|
40
|
-
|
|
41
|
-
def self.from_json(json_string)
|
|
42
|
-
return nil if json_string.nil? || json_string.empty?
|
|
43
|
-
|
|
44
|
-
data = JSON.parse(json_string, symbolize_names: true)
|
|
45
|
-
sliced = data.slice(:file, :line, :function, :message, :timestamp_secs)
|
|
46
|
-
new(**with_defaults(sliced))
|
|
47
|
-
rescue JSON::ParserError
|
|
48
|
-
nil
|
|
49
|
-
end
|
|
50
|
-
|
|
51
|
-
def self.with_defaults(sliced)
|
|
52
|
-
{
|
|
53
|
-
file: sliced[:file] || '',
|
|
54
|
-
line: sliced[:line] || 0,
|
|
55
|
-
function: sliced[:function] || '',
|
|
56
|
-
message: sliced[:message] || '',
|
|
57
|
-
timestamp_secs: sliced[:timestamp_secs] || 0
|
|
58
|
-
}
|
|
59
|
-
end
|
|
60
|
-
private_class_method :with_defaults
|
|
61
|
-
end
|
|
62
|
-
|
|
63
5
|
# Base error class for all Kreuzberg errors
|
|
64
|
-
class Error < StandardError
|
|
65
|
-
attr_reader :panic_context, :error_code
|
|
66
|
-
|
|
67
|
-
def initialize(message, panic_context: nil, error_code: nil)
|
|
68
|
-
super(message)
|
|
69
|
-
@panic_context = panic_context
|
|
70
|
-
@error_code = error_code
|
|
71
|
-
end
|
|
72
|
-
end
|
|
6
|
+
class Error < StandardError; end
|
|
73
7
|
|
|
74
8
|
# Raised when validation fails
|
|
75
9
|
class ValidationError < Error; end
|
|
@@ -78,8 +12,8 @@ module Kreuzberg
|
|
|
78
12
|
class ParsingError < Error
|
|
79
13
|
attr_reader :context
|
|
80
14
|
|
|
81
|
-
def initialize(message, context: nil
|
|
82
|
-
super(message
|
|
15
|
+
def initialize(message, context: nil)
|
|
16
|
+
super(message)
|
|
83
17
|
@context = context
|
|
84
18
|
end
|
|
85
19
|
end
|
|
@@ -88,8 +22,8 @@ module Kreuzberg
|
|
|
88
22
|
class OCRError < Error
|
|
89
23
|
attr_reader :context
|
|
90
24
|
|
|
91
|
-
def initialize(message, context: nil
|
|
92
|
-
super(message
|
|
25
|
+
def initialize(message, context: nil)
|
|
26
|
+
super(message)
|
|
93
27
|
@context = context
|
|
94
28
|
end
|
|
95
29
|
end
|
|
@@ -98,8 +32,8 @@ module Kreuzberg
|
|
|
98
32
|
class MissingDependencyError < Error
|
|
99
33
|
attr_reader :dependency
|
|
100
34
|
|
|
101
|
-
def initialize(message, dependency: nil
|
|
102
|
-
super(message
|
|
35
|
+
def initialize(message, dependency: nil)
|
|
36
|
+
super(message)
|
|
103
37
|
@dependency = dependency
|
|
104
38
|
end
|
|
105
39
|
end
|