kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -9,10 +9,7 @@ use crate::{batch_extract_bytes, cache, extract_bytes};
|
|
|
9
9
|
|
|
10
10
|
use super::{
|
|
11
11
|
error::ApiError,
|
|
12
|
-
types::{
|
|
13
|
-
ApiState, CacheClearResponse, CacheStatsResponse, EmbedRequest, EmbedResponse, ExtractResponse, HealthResponse,
|
|
14
|
-
InfoResponse,
|
|
15
|
-
},
|
|
12
|
+
types::{ApiState, CacheClearResponse, CacheStatsResponse, ExtractResponse, HealthResponse, InfoResponse},
|
|
16
13
|
};
|
|
17
14
|
|
|
18
15
|
/// Extract endpoint handler.
|
|
@@ -27,24 +24,15 @@ use super::{
|
|
|
27
24
|
///
|
|
28
25
|
/// # Size Limits
|
|
29
26
|
///
|
|
30
|
-
/// Request body size limits are enforced at the router layer via `
|
|
27
|
+
/// Request body size limits are enforced at the router layer via `RequestBodyLimitLayer`.
|
|
31
28
|
/// Default limits:
|
|
32
29
|
/// - Total request body: 100 MB (all files + form data combined)
|
|
33
|
-
/// - Individual multipart fields:
|
|
30
|
+
/// - Individual multipart fields: Controlled by Axum's default multipart limits
|
|
34
31
|
///
|
|
35
|
-
/// Limits can be configured via environment variables or programmatically when creating the router.
|
|
36
32
|
/// If a request exceeds the size limit, it will be rejected with HTTP 413 (Payload Too Large).
|
|
37
33
|
///
|
|
38
34
|
/// The server's default config (loaded from kreuzberg.toml/yaml/json via discovery)
|
|
39
35
|
/// is used as the base, and any per-request config overrides those defaults.
|
|
40
|
-
#[cfg_attr(
|
|
41
|
-
feature = "otel",
|
|
42
|
-
tracing::instrument(
|
|
43
|
-
name = "api.extract",
|
|
44
|
-
skip(state, multipart),
|
|
45
|
-
fields(files_count = tracing::field::Empty)
|
|
46
|
-
)
|
|
47
|
-
)]
|
|
48
36
|
pub async fn extract_handler(
|
|
49
37
|
State(state): State<ApiState>,
|
|
50
38
|
mut multipart: Multipart,
|
|
@@ -95,9 +83,6 @@ pub async fn extract_handler(
|
|
|
95
83
|
)));
|
|
96
84
|
}
|
|
97
85
|
|
|
98
|
-
#[cfg(feature = "otel")]
|
|
99
|
-
tracing::Span::current().record("files_count", files.len());
|
|
100
|
-
|
|
101
86
|
if files.len() == 1 {
|
|
102
87
|
let (data, mime_type, _file_name) = files
|
|
103
88
|
.into_iter()
|
|
@@ -109,14 +94,18 @@ pub async fn extract_handler(
|
|
|
109
94
|
|
|
110
95
|
let files_data: Vec<(Vec<u8>, String)> = files.into_iter().map(|(data, mime, _name)| (data, mime)).collect();
|
|
111
96
|
|
|
112
|
-
let
|
|
97
|
+
let file_refs: Vec<(&[u8], &str)> = files_data
|
|
98
|
+
.iter()
|
|
99
|
+
.map(|(data, mime)| (data.as_slice(), mime.as_str()))
|
|
100
|
+
.collect();
|
|
101
|
+
|
|
102
|
+
let results = batch_extract_bytes(file_refs, &config).await?;
|
|
113
103
|
Ok(Json(results))
|
|
114
104
|
}
|
|
115
105
|
|
|
116
106
|
/// Health check endpoint handler.
|
|
117
107
|
///
|
|
118
108
|
/// GET /health
|
|
119
|
-
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.health"))]
|
|
120
109
|
pub async fn health_handler() -> Json<HealthResponse> {
|
|
121
110
|
Json(HealthResponse {
|
|
122
111
|
status: "healthy".to_string(),
|
|
@@ -127,7 +116,6 @@ pub async fn health_handler() -> Json<HealthResponse> {
|
|
|
127
116
|
/// Server info endpoint handler.
|
|
128
117
|
///
|
|
129
118
|
/// GET /info
|
|
130
|
-
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.info"))]
|
|
131
119
|
pub async fn info_handler() -> Json<InfoResponse> {
|
|
132
120
|
Json(InfoResponse {
|
|
133
121
|
version: env!("CARGO_PKG_VERSION").to_string(),
|
|
@@ -145,7 +133,6 @@ pub async fn info_handler() -> Json<InfoResponse> {
|
|
|
145
133
|
/// - Current directory cannot be determined
|
|
146
134
|
/// - Cache directory path contains non-UTF8 characters
|
|
147
135
|
/// - Cache metadata retrieval fails
|
|
148
|
-
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.cache_stats"))]
|
|
149
136
|
pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError> {
|
|
150
137
|
let cache_dir = std::env::current_dir()
|
|
151
138
|
.map_err(|e| {
|
|
@@ -185,7 +172,6 @@ pub async fn cache_stats_handler() -> Result<Json<CacheStatsResponse>, ApiError>
|
|
|
185
172
|
/// - Current directory cannot be determined
|
|
186
173
|
/// - Cache directory path contains non-UTF8 characters
|
|
187
174
|
/// - Cache clearing operation fails
|
|
188
|
-
#[cfg_attr(feature = "otel", tracing::instrument(name = "api.cache_clear"))]
|
|
189
175
|
pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError> {
|
|
190
176
|
let cache_dir = std::env::current_dir()
|
|
191
177
|
.map_err(|e| {
|
|
@@ -211,110 +197,3 @@ pub async fn cache_clear_handler() -> Result<Json<CacheClearResponse>, ApiError>
|
|
|
211
197
|
freed_mb,
|
|
212
198
|
}))
|
|
213
199
|
}
|
|
214
|
-
|
|
215
|
-
/// Embedding endpoint handler.
|
|
216
|
-
///
|
|
217
|
-
/// POST /embed
|
|
218
|
-
///
|
|
219
|
-
/// Accepts JSON body with:
|
|
220
|
-
/// - `texts`: Array of strings to generate embeddings for
|
|
221
|
-
/// - `config` (optional): Embedding configuration (model, batch size, cache_dir)
|
|
222
|
-
///
|
|
223
|
-
/// Returns embeddings for each input text.
|
|
224
|
-
///
|
|
225
|
-
/// # Errors
|
|
226
|
-
///
|
|
227
|
-
/// Returns `ApiError::Internal` if:
|
|
228
|
-
/// - Embeddings feature is not enabled
|
|
229
|
-
/// - ONNX Runtime is not available
|
|
230
|
-
/// - Model initialization fails
|
|
231
|
-
/// - Embedding generation fails
|
|
232
|
-
#[cfg(feature = "embeddings")]
|
|
233
|
-
#[cfg_attr(
|
|
234
|
-
feature = "otel",
|
|
235
|
-
tracing::instrument(
|
|
236
|
-
name = "api.embed",
|
|
237
|
-
skip(request),
|
|
238
|
-
fields(
|
|
239
|
-
texts_count = request.texts.len(),
|
|
240
|
-
model = tracing::field::Empty
|
|
241
|
-
)
|
|
242
|
-
)
|
|
243
|
-
)]
|
|
244
|
-
pub async fn embed_handler(Json(request): Json<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
|
|
245
|
-
use crate::types::{Chunk, ChunkMetadata};
|
|
246
|
-
|
|
247
|
-
if request.texts.is_empty() {
|
|
248
|
-
return Err(ApiError::validation(crate::error::KreuzbergError::validation(
|
|
249
|
-
"No texts provided for embedding generation",
|
|
250
|
-
)));
|
|
251
|
-
}
|
|
252
|
-
|
|
253
|
-
// Use default config if none provided
|
|
254
|
-
let config = request.config.unwrap_or_default();
|
|
255
|
-
|
|
256
|
-
// Create chunks from input texts
|
|
257
|
-
let mut chunks: Vec<Chunk> = request
|
|
258
|
-
.texts
|
|
259
|
-
.iter()
|
|
260
|
-
.enumerate()
|
|
261
|
-
.map(|(idx, text)| Chunk {
|
|
262
|
-
content: text.clone(),
|
|
263
|
-
embedding: None,
|
|
264
|
-
metadata: ChunkMetadata {
|
|
265
|
-
byte_start: 0,
|
|
266
|
-
byte_end: text.len(),
|
|
267
|
-
token_count: None,
|
|
268
|
-
chunk_index: idx,
|
|
269
|
-
total_chunks: request.texts.len(),
|
|
270
|
-
first_page: None,
|
|
271
|
-
last_page: None,
|
|
272
|
-
},
|
|
273
|
-
})
|
|
274
|
-
.collect();
|
|
275
|
-
|
|
276
|
-
// Generate embeddings
|
|
277
|
-
crate::embeddings::generate_embeddings_for_chunks(&mut chunks, &config).map_err(ApiError::internal)?;
|
|
278
|
-
|
|
279
|
-
// Extract embeddings from chunks
|
|
280
|
-
let embeddings: Vec<Vec<f32>> = chunks
|
|
281
|
-
.into_iter()
|
|
282
|
-
.map(|chunk| {
|
|
283
|
-
chunk.embedding.ok_or_else(|| {
|
|
284
|
-
ApiError::internal(crate::error::KreuzbergError::Other(
|
|
285
|
-
"Failed to generate embedding for text".to_string(),
|
|
286
|
-
))
|
|
287
|
-
})
|
|
288
|
-
})
|
|
289
|
-
.collect::<Result<Vec<_>, _>>()?;
|
|
290
|
-
|
|
291
|
-
let dimensions = embeddings.first().map(|e| e.len()).unwrap_or(0);
|
|
292
|
-
|
|
293
|
-
// Get model name from config
|
|
294
|
-
let model_name = match &config.model {
|
|
295
|
-
crate::core::config::EmbeddingModelType::Preset { name } => name.clone(),
|
|
296
|
-
#[cfg(feature = "embeddings")]
|
|
297
|
-
crate::core::config::EmbeddingModelType::FastEmbed { model, .. } => model.clone(),
|
|
298
|
-
crate::core::config::EmbeddingModelType::Custom { .. } => "custom".to_string(),
|
|
299
|
-
};
|
|
300
|
-
|
|
301
|
-
#[cfg(feature = "otel")]
|
|
302
|
-
tracing::Span::current().record("model", &model_name);
|
|
303
|
-
|
|
304
|
-
Ok(Json(EmbedResponse {
|
|
305
|
-
embeddings,
|
|
306
|
-
model: model_name,
|
|
307
|
-
dimensions,
|
|
308
|
-
count: request.texts.len(),
|
|
309
|
-
}))
|
|
310
|
-
}
|
|
311
|
-
|
|
312
|
-
/// Embedding endpoint handler (when embeddings feature is disabled).
|
|
313
|
-
///
|
|
314
|
-
/// Returns an error indicating embeddings feature is not enabled.
|
|
315
|
-
#[cfg(not(feature = "embeddings"))]
|
|
316
|
-
pub async fn embed_handler(Json(_request): Json<EmbedRequest>) -> Result<Json<EmbedResponse>, ApiError> {
|
|
317
|
-
Err(ApiError::internal(crate::error::KreuzbergError::MissingDependency(
|
|
318
|
-
"Embeddings feature is not enabled. Rebuild with --features embeddings".to_string(),
|
|
319
|
-
)))
|
|
320
|
-
}
|
|
@@ -6,11 +6,8 @@
|
|
|
6
6
|
//! # Endpoints
|
|
7
7
|
//!
|
|
8
8
|
//! - `POST /extract` - Extract text from uploaded files (multipart form data)
|
|
9
|
-
//! - `POST /embed` - Generate embeddings for text (JSON body with texts array)
|
|
10
9
|
//! - `GET /health` - Health check endpoint
|
|
11
10
|
//! - `GET /info` - Server information
|
|
12
|
-
//! - `GET /cache/stats` - Get cache statistics
|
|
13
|
-
//! - `DELETE /cache/clear` - Clear all cached files
|
|
14
11
|
//!
|
|
15
12
|
//! # Examples
|
|
16
13
|
//!
|
|
@@ -65,17 +62,6 @@
|
|
|
65
62
|
//!
|
|
66
63
|
//! # Server info
|
|
67
64
|
//! curl http://localhost:8000/info
|
|
68
|
-
//!
|
|
69
|
-
//! # Cache statistics
|
|
70
|
-
//! curl http://localhost:8000/cache/stats
|
|
71
|
-
//!
|
|
72
|
-
//! # Clear cache
|
|
73
|
-
//! curl -X DELETE http://localhost:8000/cache/clear
|
|
74
|
-
//!
|
|
75
|
-
//! # Generate embeddings
|
|
76
|
-
//! curl -X POST http://localhost:8000/embed \
|
|
77
|
-
//! -H "Content-Type: application/json" \
|
|
78
|
-
//! -d '{"texts":["Hello world","Second text"]}'
|
|
79
65
|
//! ```
|
|
80
66
|
|
|
81
67
|
mod error;
|
|
@@ -85,10 +71,9 @@ mod types;
|
|
|
85
71
|
|
|
86
72
|
pub use error::ApiError;
|
|
87
73
|
pub use server::{
|
|
88
|
-
create_router, create_router_with_limits,
|
|
89
|
-
serve_default, serve_with_config, serve_with_config_and_limits, serve_with_server_config,
|
|
74
|
+
create_router, create_router_with_limits, serve, serve_default, serve_with_config, serve_with_config_and_limits,
|
|
90
75
|
};
|
|
91
76
|
pub use types::{
|
|
92
|
-
ApiSizeLimits, ApiState, CacheClearResponse, CacheStatsResponse,
|
|
93
|
-
|
|
77
|
+
ApiSizeLimits, ApiState, CacheClearResponse, CacheStatsResponse, ErrorResponse, ExtractResponse, HealthResponse,
|
|
78
|
+
InfoResponse,
|
|
94
79
|
};
|
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
//! API server setup and configuration.
|
|
2
2
|
|
|
3
|
-
use std::{
|
|
3
|
+
use std::{
|
|
4
|
+
net::{IpAddr, SocketAddr},
|
|
5
|
+
sync::Arc,
|
|
6
|
+
};
|
|
4
7
|
|
|
5
8
|
use axum::{
|
|
6
9
|
Router,
|
|
7
|
-
extract::DefaultBodyLimit,
|
|
8
10
|
routing::{delete, get, post},
|
|
9
11
|
};
|
|
10
12
|
use tower_http::{
|
|
@@ -13,79 +15,60 @@ use tower_http::{
|
|
|
13
15
|
trace::TraceLayer,
|
|
14
16
|
};
|
|
15
17
|
|
|
16
|
-
use crate::{ExtractionConfig, Result
|
|
18
|
+
use crate::{ExtractionConfig, Result};
|
|
17
19
|
|
|
18
20
|
use super::{
|
|
19
|
-
handlers::{
|
|
20
|
-
cache_clear_handler, cache_stats_handler, embed_handler, extract_handler, health_handler, info_handler,
|
|
21
|
-
},
|
|
21
|
+
handlers::{cache_clear_handler, cache_stats_handler, extract_handler, health_handler, info_handler},
|
|
22
22
|
types::{ApiSizeLimits, ApiState},
|
|
23
23
|
};
|
|
24
24
|
|
|
25
|
-
///
|
|
26
|
-
///
|
|
27
|
-
///
|
|
28
|
-
///
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
};
|
|
71
|
-
|
|
72
|
-
// Apply environment variable overrides with proper logging
|
|
73
|
-
config.apply_env_overrides()?;
|
|
74
|
-
|
|
75
|
-
tracing::info!(
|
|
76
|
-
"Server configuration loaded: host={}, port={}, request_body_limit={} MB, multipart_field_limit={} MB, CORS={}",
|
|
77
|
-
config.host,
|
|
78
|
-
config.port,
|
|
79
|
-
config.max_request_body_mb(),
|
|
80
|
-
config.max_multipart_field_mb(),
|
|
81
|
-
if config.cors_allows_all() {
|
|
82
|
-
"allow all origins".to_string()
|
|
83
|
-
} else {
|
|
84
|
-
format!("{} specific origins", config.cors_origins.len())
|
|
25
|
+
/// Parse size limits from environment variables.
|
|
26
|
+
///
|
|
27
|
+
/// Reads `KREUZBERG_MAX_UPLOAD_SIZE_MB` to configure upload size limits.
|
|
28
|
+
/// Falls back to default (100 MB) if not set or invalid.
|
|
29
|
+
fn parse_size_limits_from_env() -> ApiSizeLimits {
|
|
30
|
+
match std::env::var("KREUZBERG_MAX_UPLOAD_SIZE_MB") {
|
|
31
|
+
Ok(value) => match value.parse::<usize>() {
|
|
32
|
+
Ok(mb) if mb > 0 => {
|
|
33
|
+
tracing::info!(
|
|
34
|
+
"Upload size limit configured from environment: {} MB ({} bytes)",
|
|
35
|
+
mb,
|
|
36
|
+
mb * 1024 * 1024
|
|
37
|
+
);
|
|
38
|
+
ApiSizeLimits::from_mb(mb, mb)
|
|
39
|
+
}
|
|
40
|
+
Ok(_) => {
|
|
41
|
+
tracing::warn!("Invalid KREUZBERG_MAX_UPLOAD_SIZE_MB value (must be > 0), using default 100 MB");
|
|
42
|
+
let limits = ApiSizeLimits::default();
|
|
43
|
+
tracing::info!(
|
|
44
|
+
"Upload size limit: 100 MB (default, {} bytes)",
|
|
45
|
+
limits.max_request_body_bytes
|
|
46
|
+
);
|
|
47
|
+
limits
|
|
48
|
+
}
|
|
49
|
+
Err(e) => {
|
|
50
|
+
tracing::warn!(
|
|
51
|
+
"Failed to parse KREUZBERG_MAX_UPLOAD_SIZE_MB='{}': {}, using default 100 MB",
|
|
52
|
+
value,
|
|
53
|
+
e
|
|
54
|
+
);
|
|
55
|
+
let limits = ApiSizeLimits::default();
|
|
56
|
+
tracing::info!(
|
|
57
|
+
"Upload size limit: 100 MB (default, {} bytes)",
|
|
58
|
+
limits.max_request_body_bytes
|
|
59
|
+
);
|
|
60
|
+
limits
|
|
61
|
+
}
|
|
62
|
+
},
|
|
63
|
+
Err(_) => {
|
|
64
|
+
let limits = ApiSizeLimits::default();
|
|
65
|
+
tracing::info!(
|
|
66
|
+
"Upload size limit: 100 MB (default, {} bytes)",
|
|
67
|
+
limits.max_request_body_bytes
|
|
68
|
+
);
|
|
69
|
+
limits
|
|
85
70
|
}
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
Ok(config)
|
|
71
|
+
}
|
|
89
72
|
}
|
|
90
73
|
|
|
91
74
|
/// Create the API router with all routes configured.
|
|
@@ -148,58 +131,15 @@ pub fn create_router(config: ExtractionConfig) -> Router {
|
|
|
148
131
|
/// # }
|
|
149
132
|
/// ```
|
|
150
133
|
pub fn create_router_with_limits(config: ExtractionConfig, limits: ApiSizeLimits) -> Router {
|
|
151
|
-
create_router_with_limits_and_server_config(config, limits, ServerConfig::default())
|
|
152
|
-
}
|
|
153
|
-
|
|
154
|
-
/// Create the API router with custom size limits and server configuration.
|
|
155
|
-
///
|
|
156
|
-
/// This function provides full control over request limits, CORS, and server settings via ServerConfig.
|
|
157
|
-
///
|
|
158
|
-
/// # Arguments
|
|
159
|
-
///
|
|
160
|
-
/// * `config` - Default extraction configuration. Per-request configs override these defaults.
|
|
161
|
-
/// * `limits` - Size limits for request bodies and multipart uploads.
|
|
162
|
-
/// * `server_config` - Server configuration including host, port, and CORS settings.
|
|
163
|
-
///
|
|
164
|
-
/// # Examples
|
|
165
|
-
///
|
|
166
|
-
/// ```no_run
|
|
167
|
-
/// use kreuzberg::{ExtractionConfig, api::create_router_with_limits, core::ServerConfig};
|
|
168
|
-
///
|
|
169
|
-
/// # #[tokio::main]
|
|
170
|
-
/// # async fn main() -> kreuzberg::Result<()> {
|
|
171
|
-
/// let extraction_config = ExtractionConfig::default();
|
|
172
|
-
/// let mut server_config = ServerConfig::default();
|
|
173
|
-
/// server_config.cors_origins = vec!["https://example.com".to_string()];
|
|
174
|
-
/// let router = create_router_with_limits_and_server_config(
|
|
175
|
-
/// extraction_config,
|
|
176
|
-
/// Default::default(),
|
|
177
|
-
/// server_config
|
|
178
|
-
/// );
|
|
179
|
-
/// # Ok(())
|
|
180
|
-
/// # }
|
|
181
|
-
/// ```
|
|
182
|
-
pub fn create_router_with_limits_and_server_config(
|
|
183
|
-
config: ExtractionConfig,
|
|
184
|
-
limits: ApiSizeLimits,
|
|
185
|
-
server_config: ServerConfig,
|
|
186
|
-
) -> Router {
|
|
187
134
|
let state = ApiState {
|
|
188
135
|
default_config: Arc::new(config),
|
|
189
136
|
};
|
|
190
137
|
|
|
191
|
-
//
|
|
192
|
-
let cors_layer = if
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
list of allowed origins (e.g., 'https://app.example.com,https://api.example.com')"
|
|
197
|
-
);
|
|
198
|
-
CorsLayer::new().allow_origin(Any).allow_methods(Any).allow_headers(Any)
|
|
199
|
-
} else {
|
|
200
|
-
let origins: Vec<_> = server_config
|
|
201
|
-
.cors_origins
|
|
202
|
-
.iter()
|
|
138
|
+
// SECURITY WARNING: The default allows all origins for development convenience,
|
|
139
|
+
let cors_layer = if let Ok(origins_str) = std::env::var("KREUZBERG_CORS_ORIGINS") {
|
|
140
|
+
let origins: Vec<_> = origins_str
|
|
141
|
+
.split(',')
|
|
142
|
+
.filter(|s| !s.trim().is_empty())
|
|
203
143
|
.filter_map(|s| s.trim().parse::<axum::http::HeaderValue>().ok())
|
|
204
144
|
.collect();
|
|
205
145
|
|
|
@@ -211,21 +151,26 @@ pub fn create_router_with_limits_and_server_config(
|
|
|
211
151
|
.allow_headers(Any)
|
|
212
152
|
} else {
|
|
213
153
|
tracing::warn!(
|
|
214
|
-
"
|
|
154
|
+
"KREUZBERG_CORS_ORIGINS set but empty/invalid - falling back to permissive CORS. \
|
|
215
155
|
This allows CSRF attacks. Set explicit origins for production."
|
|
216
156
|
);
|
|
217
157
|
CorsLayer::new().allow_origin(Any).allow_methods(Any).allow_headers(Any)
|
|
218
158
|
}
|
|
159
|
+
} else {
|
|
160
|
+
tracing::warn!(
|
|
161
|
+
"CORS configured to allow all origins (default). This permits CSRF attacks. \
|
|
162
|
+
For production, set KREUZBERG_CORS_ORIGINS environment variable to comma-separated \
|
|
163
|
+
list of allowed origins (e.g., 'https://app.example.com,https://api.example.com')"
|
|
164
|
+
);
|
|
165
|
+
CorsLayer::new().allow_origin(Any).allow_methods(Any).allow_headers(Any)
|
|
219
166
|
};
|
|
220
167
|
|
|
221
168
|
Router::new()
|
|
222
169
|
.route("/extract", post(extract_handler))
|
|
223
|
-
.route("/embed", post(embed_handler))
|
|
224
170
|
.route("/health", get(health_handler))
|
|
225
171
|
.route("/info", get(info_handler))
|
|
226
172
|
.route("/cache/stats", get(cache_stats_handler))
|
|
227
173
|
.route("/cache/clear", delete(cache_clear_handler))
|
|
228
|
-
.layer(DefaultBodyLimit::max(limits.max_request_body_bytes))
|
|
229
174
|
.layer(RequestBodyLimitLayer::new(limits.max_request_body_bytes))
|
|
230
175
|
.layer(cors_layer)
|
|
231
176
|
.layer(TraceLayer::new_for_http())
|
|
@@ -278,18 +223,13 @@ pub fn create_router_with_limits_and_server_config(
|
|
|
278
223
|
/// # Production: set to comma-separated list of allowed origins
|
|
279
224
|
/// export KREUZBERG_CORS_ORIGINS="https://app.example.com,https://api.example.com"
|
|
280
225
|
///
|
|
281
|
-
/// # Upload size
|
|
282
|
-
///
|
|
283
|
-
/// export KREUZBERG_MAX_REQUEST_BODY_BYTES=104857600 # 100 MB
|
|
284
|
-
/// export KREUZBERG_MAX_MULTIPART_FIELD_BYTES=104857600 # 100 MB per file
|
|
285
|
-
///
|
|
286
|
-
/// # Legacy approach (in MB, applies to both limits):
|
|
287
|
-
/// export KREUZBERG_MAX_UPLOAD_SIZE_MB=100 # 100 MB
|
|
226
|
+
/// # Upload size limit (default: 100 MB)
|
|
227
|
+
/// export KREUZBERG_MAX_UPLOAD_SIZE_MB=200
|
|
288
228
|
///
|
|
289
229
|
/// python -m kreuzberg.api
|
|
290
230
|
/// ```
|
|
291
231
|
pub async fn serve(host: impl AsRef<str>, port: u16) -> Result<()> {
|
|
292
|
-
let
|
|
232
|
+
let config = match ExtractionConfig::discover()? {
|
|
293
233
|
Some(config) => {
|
|
294
234
|
tracing::info!("Loaded extraction config from discovered file");
|
|
295
235
|
config
|
|
@@ -300,13 +240,9 @@ pub async fn serve(host: impl AsRef<str>, port: u16) -> Result<()> {
|
|
|
300
240
|
}
|
|
301
241
|
};
|
|
302
242
|
|
|
303
|
-
let
|
|
304
|
-
let limits = ApiSizeLimits::new(
|
|
305
|
-
server_config.max_request_body_bytes,
|
|
306
|
-
server_config.max_multipart_field_bytes,
|
|
307
|
-
);
|
|
243
|
+
let limits = parse_size_limits_from_env();
|
|
308
244
|
|
|
309
|
-
serve_with_config_and_limits(host, port,
|
|
245
|
+
serve_with_config_and_limits(host, port, config, limits).await
|
|
310
246
|
}
|
|
311
247
|
|
|
312
248
|
/// Start the API server with explicit config.
|
|
@@ -368,23 +304,13 @@ pub async fn serve_with_config_and_limits(
|
|
|
368
304
|
config: ExtractionConfig,
|
|
369
305
|
limits: ApiSizeLimits,
|
|
370
306
|
) -> Result<()> {
|
|
371
|
-
use std::net::IpAddr;
|
|
372
|
-
|
|
373
307
|
let ip: IpAddr = host
|
|
374
308
|
.as_ref()
|
|
375
309
|
.parse()
|
|
376
310
|
.map_err(|e| crate::error::KreuzbergError::validation(format!("Invalid host address: {}", e)))?;
|
|
377
311
|
|
|
378
|
-
let server_config = ServerConfig {
|
|
379
|
-
host: host.as_ref().to_string(),
|
|
380
|
-
port,
|
|
381
|
-
max_request_body_bytes: limits.max_request_body_bytes,
|
|
382
|
-
max_multipart_field_bytes: limits.max_multipart_field_bytes,
|
|
383
|
-
..Default::default()
|
|
384
|
-
};
|
|
385
|
-
|
|
386
312
|
let addr = SocketAddr::new(ip, port);
|
|
387
|
-
let app =
|
|
313
|
+
let app = create_router_with_limits(config, limits);
|
|
388
314
|
|
|
389
315
|
tracing::info!("Starting Kreuzberg API server on http://{}:{}", ip, port);
|
|
390
316
|
|
|
@@ -399,70 +325,6 @@ pub async fn serve_with_config_and_limits(
|
|
|
399
325
|
Ok(())
|
|
400
326
|
}
|
|
401
327
|
|
|
402
|
-
/// Start the API server with explicit extraction config and server config.
|
|
403
|
-
///
|
|
404
|
-
/// This function accepts a fully-configured ServerConfig, including CORS origins,
|
|
405
|
-
/// size limits, host, and port. It respects all ServerConfig fields without
|
|
406
|
-
/// re-parsing environment variables, making it ideal for CLI usage where
|
|
407
|
-
/// configuration precedence has already been applied.
|
|
408
|
-
///
|
|
409
|
-
/// # Arguments
|
|
410
|
-
///
|
|
411
|
-
/// * `extraction_config` - Default extraction configuration for all requests
|
|
412
|
-
/// * `server_config` - Server configuration including host, port, CORS, and size limits
|
|
413
|
-
///
|
|
414
|
-
/// # Examples
|
|
415
|
-
///
|
|
416
|
-
/// ```no_run
|
|
417
|
-
/// use kreuzberg::{ExtractionConfig, api::serve_with_server_config, core::ServerConfig};
|
|
418
|
-
///
|
|
419
|
-
/// #[tokio::main]
|
|
420
|
-
/// async fn main() -> kreuzberg::Result<()> {
|
|
421
|
-
/// let extraction_config = ExtractionConfig::default();
|
|
422
|
-
/// let mut server_config = ServerConfig::default();
|
|
423
|
-
/// server_config.host = "0.0.0.0".to_string();
|
|
424
|
-
/// server_config.port = 3000;
|
|
425
|
-
/// server_config.cors_origins = vec!["https://example.com".to_string()];
|
|
426
|
-
///
|
|
427
|
-
/// serve_with_server_config(extraction_config, server_config).await?;
|
|
428
|
-
/// Ok(())
|
|
429
|
-
/// }
|
|
430
|
-
/// ```
|
|
431
|
-
pub async fn serve_with_server_config(extraction_config: ExtractionConfig, server_config: ServerConfig) -> Result<()> {
|
|
432
|
-
use std::net::IpAddr;
|
|
433
|
-
|
|
434
|
-
let ip: IpAddr = server_config
|
|
435
|
-
.host
|
|
436
|
-
.parse()
|
|
437
|
-
.map_err(|e| crate::error::KreuzbergError::validation(format!("Invalid host address: {}", e)))?;
|
|
438
|
-
|
|
439
|
-
let limits = ApiSizeLimits::new(
|
|
440
|
-
server_config.max_request_body_bytes,
|
|
441
|
-
server_config.max_multipart_field_bytes,
|
|
442
|
-
);
|
|
443
|
-
|
|
444
|
-
let addr = SocketAddr::new(ip, server_config.port);
|
|
445
|
-
let app = create_router_with_limits_and_server_config(extraction_config, limits, server_config.clone());
|
|
446
|
-
|
|
447
|
-
tracing::info!(
|
|
448
|
-
"Starting Kreuzberg API server on http://{}:{} (request_body_limit={} MB, multipart_field_limit={} MB)",
|
|
449
|
-
ip,
|
|
450
|
-
server_config.port,
|
|
451
|
-
server_config.max_request_body_mb(),
|
|
452
|
-
server_config.max_multipart_field_mb()
|
|
453
|
-
);
|
|
454
|
-
|
|
455
|
-
let listener = tokio::net::TcpListener::bind(addr)
|
|
456
|
-
.await
|
|
457
|
-
.map_err(crate::error::KreuzbergError::Io)?;
|
|
458
|
-
|
|
459
|
-
axum::serve(listener, app)
|
|
460
|
-
.await
|
|
461
|
-
.map_err(|e| crate::error::KreuzbergError::Other(e.to_string()))?;
|
|
462
|
-
|
|
463
|
-
Ok(())
|
|
464
|
-
}
|
|
465
|
-
|
|
466
328
|
/// Start the API server with default host and port.
|
|
467
329
|
///
|
|
468
330
|
/// Defaults: host = "127.0.0.1", port = 8000
|
|
@@ -473,7 +335,6 @@ pub async fn serve_default() -> Result<()> {
|
|
|
473
335
|
}
|
|
474
336
|
|
|
475
337
|
#[cfg(test)]
|
|
476
|
-
#[allow(unsafe_code)]
|
|
477
338
|
mod tests {
|
|
478
339
|
use super::*;
|
|
479
340
|
|
|
@@ -489,30 +350,4 @@ mod tests {
|
|
|
489
350
|
let router = create_router(config);
|
|
490
351
|
assert!(size_of_val(&router) > 0);
|
|
491
352
|
}
|
|
492
|
-
|
|
493
|
-
#[test]
|
|
494
|
-
fn test_create_router_with_limits() {
|
|
495
|
-
let config = ExtractionConfig::default();
|
|
496
|
-
let limits = ApiSizeLimits::from_mb(50, 50);
|
|
497
|
-
let _router = create_router_with_limits(config, limits);
|
|
498
|
-
}
|
|
499
|
-
|
|
500
|
-
#[test]
|
|
501
|
-
fn test_create_router_with_server_config() {
|
|
502
|
-
let extraction_config = ExtractionConfig::default();
|
|
503
|
-
let limits = ApiSizeLimits::from_mb(100, 100);
|
|
504
|
-
let server_config = ServerConfig::default();
|
|
505
|
-
let _router = create_router_with_limits_and_server_config(extraction_config, limits, server_config);
|
|
506
|
-
}
|
|
507
|
-
|
|
508
|
-
#[test]
|
|
509
|
-
fn test_server_config_cors_handling() {
|
|
510
|
-
let extraction_config = ExtractionConfig::default();
|
|
511
|
-
let limits = ApiSizeLimits::default();
|
|
512
|
-
let server_config = ServerConfig {
|
|
513
|
-
cors_origins: vec!["https://example.com".to_string()],
|
|
514
|
-
..Default::default()
|
|
515
|
-
};
|
|
516
|
-
let _router = create_router_with_limits_and_server_config(extraction_config, limits, server_config);
|
|
517
|
-
}
|
|
518
353
|
}
|