kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
data/sig/kreuzberg.rbs
CHANGED
|
@@ -250,7 +250,56 @@ module Kreuzberg
|
|
|
250
250
|
tables: Array[table_hash]?,
|
|
251
251
|
detected_languages: Array[String]?,
|
|
252
252
|
chunks: Array[chunk_hash]?,
|
|
253
|
-
images: Array[image_hash]
|
|
253
|
+
images: Array[image_hash]?,
|
|
254
|
+
elements: Array[element_hash]?,
|
|
255
|
+
djot_content: djot_content_hash?
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
type djot_content_hash = {
|
|
259
|
+
plain_text: String,
|
|
260
|
+
blocks: Array[formatted_block_hash],
|
|
261
|
+
metadata_json: String,
|
|
262
|
+
tables: Array[table_hash],
|
|
263
|
+
images: Array[djot_image_hash],
|
|
264
|
+
links: Array[djot_link_hash],
|
|
265
|
+
footnotes: Array[footnote_hash],
|
|
266
|
+
attributes: Hash[String, attributes_hash]?
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
type formatted_block_hash = {
|
|
270
|
+
block_type: String,
|
|
271
|
+
level: Integer?,
|
|
272
|
+
content: String?,
|
|
273
|
+
children: Array[formatted_block_hash]?,
|
|
274
|
+
attributes: attributes_hash?
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
type djot_image_hash = {
|
|
278
|
+
url: String,
|
|
279
|
+
alt: String?,
|
|
280
|
+
title: String?,
|
|
281
|
+
attributes: attributes_hash?
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
type djot_link_hash = {
|
|
285
|
+
url: String,
|
|
286
|
+
text: String,
|
|
287
|
+
title: String?,
|
|
288
|
+
link_type: String?
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
type footnote_hash = {
|
|
292
|
+
label: String,
|
|
293
|
+
content: String
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
type attributes_hash = Hash[String, String | Integer | bool | Array[String] | nil]
|
|
297
|
+
|
|
298
|
+
type element_hash = {
|
|
299
|
+
element_id: String,
|
|
300
|
+
element_type: String,
|
|
301
|
+
text: String,
|
|
302
|
+
metadata: Hash[String, untyped]?
|
|
254
303
|
}
|
|
255
304
|
|
|
256
305
|
type table_hash = {
|
|
@@ -359,6 +408,60 @@ module Kreuzberg
|
|
|
359
408
|
def to_h: () -> image_hash
|
|
360
409
|
end
|
|
361
410
|
|
|
411
|
+
# Structured Djot document representation
|
|
412
|
+
class DjotContent
|
|
413
|
+
attr_reader plain_text: String
|
|
414
|
+
attr_reader blocks: Array[DjotContent::FormattedBlock]
|
|
415
|
+
attr_reader metadata: Hash[untyped, untyped]
|
|
416
|
+
attr_reader tables: Array[Table]
|
|
417
|
+
attr_reader images: Array[DjotContent::DjotImage]
|
|
418
|
+
attr_reader links: Array[DjotContent::DjotLink]
|
|
419
|
+
attr_reader footnotes: Array[DjotContent::Footnote]
|
|
420
|
+
attr_reader attributes: Hash[String, untyped]?
|
|
421
|
+
|
|
422
|
+
def initialize: (djot_content_hash hash) -> void
|
|
423
|
+
def to_h: () -> djot_content_hash
|
|
424
|
+
|
|
425
|
+
class FormattedBlock
|
|
426
|
+
attr_reader block_type: String
|
|
427
|
+
attr_reader level: Integer?
|
|
428
|
+
attr_reader content: String?
|
|
429
|
+
attr_reader children: Array[FormattedBlock]?
|
|
430
|
+
attr_reader attributes: Hash[String, untyped]?
|
|
431
|
+
|
|
432
|
+
def initialize: (formatted_block_hash hash) -> void
|
|
433
|
+
def to_h: () -> formatted_block_hash
|
|
434
|
+
end
|
|
435
|
+
|
|
436
|
+
class DjotImage
|
|
437
|
+
attr_reader url: String
|
|
438
|
+
attr_reader alt: String?
|
|
439
|
+
attr_reader title: String?
|
|
440
|
+
attr_reader attributes: Hash[String, untyped]?
|
|
441
|
+
|
|
442
|
+
def initialize: (djot_image_hash hash) -> void
|
|
443
|
+
def to_h: () -> djot_image_hash
|
|
444
|
+
end
|
|
445
|
+
|
|
446
|
+
class DjotLink
|
|
447
|
+
attr_reader url: String
|
|
448
|
+
attr_reader text: String
|
|
449
|
+
attr_reader title: String?
|
|
450
|
+
attr_reader link_type: String?
|
|
451
|
+
|
|
452
|
+
def initialize: (djot_link_hash hash) -> void
|
|
453
|
+
def to_h: () -> djot_link_hash
|
|
454
|
+
end
|
|
455
|
+
|
|
456
|
+
class Footnote
|
|
457
|
+
attr_reader label: String
|
|
458
|
+
attr_reader content: String
|
|
459
|
+
|
|
460
|
+
def initialize: (label: String, content: String) -> void
|
|
461
|
+
def to_h: () -> footnote_hash
|
|
462
|
+
end
|
|
463
|
+
end
|
|
464
|
+
|
|
362
465
|
attr_reader content: String
|
|
363
466
|
attr_reader mime_type: String
|
|
364
467
|
attr_reader metadata: Hash[untyped, untyped]
|
|
@@ -367,6 +470,7 @@ module Kreuzberg
|
|
|
367
470
|
attr_reader detected_languages: Array[String]?
|
|
368
471
|
attr_reader chunks: Array[Chunk]?
|
|
369
472
|
attr_reader images: Array[Image]?
|
|
473
|
+
attr_reader djot_content: DjotContent?
|
|
370
474
|
|
|
371
475
|
def initialize: (extraction_result_hash hash) -> void
|
|
372
476
|
def to_h: () -> Hash[Symbol, untyped]
|
data/vendor/Cargo.toml
CHANGED
|
@@ -3,7 +3,7 @@ members = ["kreuzberg", "kreuzberg-tesseract", "kreuzberg-ffi"]
|
|
|
3
3
|
resolver = "2"
|
|
4
4
|
|
|
5
5
|
[workspace.package]
|
|
6
|
-
version = "4.0.8"
|
|
6
|
+
version = "4.1.0"
|
|
7
7
|
edition = "2024"
|
|
8
8
|
rust-version = "1.91"
|
|
9
9
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -29,7 +29,7 @@ serde = { version = "1.0.228", features = ["derive"] }
|
|
|
29
29
|
serde_json = "1.0.149"
|
|
30
30
|
|
|
31
31
|
# Error handling
|
|
32
|
-
thiserror = "2.0.
|
|
32
|
+
thiserror = "2.0.18"
|
|
33
33
|
anyhow = "1.0"
|
|
34
34
|
|
|
35
35
|
# Async utilities
|
|
@@ -47,7 +47,7 @@ hex = "0.4.3"
|
|
|
47
47
|
toml = "0.9.11"
|
|
48
48
|
num_cpus = "1.17.0"
|
|
49
49
|
once_cell = "1.21.3"
|
|
50
|
-
html-to-markdown-rs = { version = "2.
|
|
50
|
+
html-to-markdown-rs = { version = "2.23.4", default-features = false }
|
|
51
51
|
reqwest = { version = "0.13.1", default-features = false, features = ["json", "rustls"] }
|
|
52
52
|
image = { version = "0.25.9", default-features = false }
|
|
53
53
|
lzma-rust2 = { version = "0.15.7" }
|
data/vendor/kreuzberg/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "kreuzberg"
|
|
3
|
-
version = "4.0.8"
|
|
3
|
+
version = "4.1.0"
|
|
4
4
|
edition = "2024"
|
|
5
5
|
rust-version = "1.91"
|
|
6
6
|
authors = ["Na'aman Hirschfeld <nhirschfeld@gmail.com>"]
|
|
@@ -136,6 +136,7 @@ regex = "1.12.2"
|
|
|
136
136
|
serde = { workspace = true }
|
|
137
137
|
serde_json = { workspace = true }
|
|
138
138
|
serde_yaml_ng = "0.10.0"
|
|
139
|
+
jotdown = "0.9"
|
|
139
140
|
toml = { workspace = true }
|
|
140
141
|
mime_guess = "2.0"
|
|
141
142
|
rmp-serde = "1.3"
|
|
@@ -152,7 +153,7 @@ lopdf = { version = "0.39.0", optional = true }
|
|
|
152
153
|
calamine = { version = "0.32.0", features = ["dates"], optional = true }
|
|
153
154
|
polars = { version = "0.52.0", default-features = false, features = ["ipc"], optional = true }
|
|
154
155
|
roxmltree = { version = "0.21.1", optional = true }
|
|
155
|
-
zip = { version = "7.
|
|
156
|
+
zip = { version = "7.2.0", optional = true }
|
|
156
157
|
mail-parser = { version = "0.11.1", optional = true }
|
|
157
158
|
msg_parser = { version = "0.1.1", optional = true }
|
|
158
159
|
html-to-markdown-rs = { workspace = true, features = [
|
|
@@ -215,7 +216,7 @@ smartcore = { version = "0.4", default-features = false, features = ["serde"] }
|
|
|
215
216
|
tempfile = { workspace = true }
|
|
216
217
|
filetime = "0.2"
|
|
217
218
|
tar = "0.4.44"
|
|
218
|
-
zip = "7.
|
|
219
|
+
zip = "7.2.0"
|
|
219
220
|
serial_test = "3.3.1"
|
|
220
221
|
anyhow = { workspace = true }
|
|
221
222
|
tokio-test = "0.4"
|
data/vendor/kreuzberg/README.md
CHANGED
|
@@ -17,7 +17,7 @@ High-performance document intelligence library for Rust. Extract text, metadata,
|
|
|
17
17
|
|
|
18
18
|
This is the core Rust library that powers the Python, TypeScript, and Ruby bindings.
|
|
19
19
|
|
|
20
|
-
> **🚀 Version 4.0.8 Release**
|
|
20
|
+
> **🚀 Version 4.1.0 Release**
|
|
21
21
|
> This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
|
|
22
22
|
>
|
|
23
23
|
> **Note**: The Rust crate is not currently published to crates.io for this RC. Use git dependencies or language bindings (Python, TypeScript, Ruby) instead.
|
|
data/vendor/kreuzberg/src/api/config.rs
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
//! API server configuration loading.
|
|
2
|
+
|
|
3
|
+
use crate::{Result, core::ServerConfig};
|
|
4
|
+
|
|
5
|
+
/// Load ServerConfig with proper precedence order.
|
|
6
|
+
///
|
|
7
|
+
/// This function implements the configuration hierarchy:
|
|
8
|
+
/// 1. File (if provided)
|
|
9
|
+
/// 2. Environment variables (via apply_env_overrides)
|
|
10
|
+
/// 3. Defaults
|
|
11
|
+
///
|
|
12
|
+
/// The config file can be in flat format (server settings at root) or nested format
|
|
13
|
+
/// (server settings under [server] section alongside other configs like [ocr]).
|
|
14
|
+
///
|
|
15
|
+
/// # Arguments
|
|
16
|
+
///
|
|
17
|
+
/// * `config_path` - Optional path to a ServerConfig file (TOML, YAML, or JSON)
|
|
18
|
+
///
|
|
19
|
+
/// # Returns
|
|
20
|
+
///
|
|
21
|
+
/// A configured ServerConfig with proper precedence applied.
|
|
22
|
+
///
|
|
23
|
+
/// # Errors
|
|
24
|
+
///
|
|
25
|
+
/// Returns an error if:
|
|
26
|
+
/// - The config file path is provided but cannot be read
|
|
27
|
+
/// - The config file contains invalid server configuration
|
|
28
|
+
/// - Environment variable overrides contain invalid values
|
|
29
|
+
///
|
|
30
|
+
/// # Examples
|
|
31
|
+
///
|
|
32
|
+
/// ```no_run
|
|
33
|
+
/// use kreuzberg::api::load_server_config;
|
|
34
|
+
/// use std::path::Path;
|
|
35
|
+
///
|
|
36
|
+
/// # fn example() -> kreuzberg::Result<()> {
|
|
37
|
+
/// // Load from file with env overrides
|
|
38
|
+
/// let config = load_server_config(Some(Path::new("server.toml")))?;
|
|
39
|
+
///
|
|
40
|
+
/// // Or use defaults with env overrides
|
|
41
|
+
/// let config = load_server_config(None)?;
|
|
42
|
+
/// # Ok(())
|
|
43
|
+
/// # }
|
|
44
|
+
/// ```
|
|
45
|
+
pub fn load_server_config(config_path: Option<&std::path::Path>) -> Result<ServerConfig> {
|
|
46
|
+
let mut config = if let Some(path) = config_path {
|
|
47
|
+
ServerConfig::from_file(path)?
|
|
48
|
+
} else {
|
|
49
|
+
ServerConfig::default()
|
|
50
|
+
};
|
|
51
|
+
|
|
52
|
+
// Apply environment variable overrides with proper logging
|
|
53
|
+
config.apply_env_overrides()?;
|
|
54
|
+
|
|
55
|
+
tracing::info!(
|
|
56
|
+
"Server configuration loaded: host={}, port={}, request_body_limit={} MB, multipart_field_limit={} MB, CORS={}",
|
|
57
|
+
config.host,
|
|
58
|
+
config.port,
|
|
59
|
+
config.max_request_body_mb(),
|
|
60
|
+
config.max_multipart_field_mb(),
|
|
61
|
+
if config.cors_allows_all() {
|
|
62
|
+
"allow all origins".to_string()
|
|
63
|
+
} else {
|
|
64
|
+
format!("{} specific origins", config.cors_origins.len())
|
|
65
|
+
}
|
|
66
|
+
);
|
|
67
|
+
|
|
68
|
+
Ok(config)
|
|
69
|
+
}
|
|
data/vendor/kreuzberg/src/api/handlers.rs
CHANGED
|
@@ -10,8 +10,8 @@ use crate::{batch_extract_bytes, cache, extract_bytes};
|
|
|
10
10
|
use super::{
|
|
11
11
|
error::ApiError,
|
|
12
12
|
types::{
|
|
13
|
-
ApiState, CacheClearResponse, CacheStatsResponse,
|
|
14
|
-
InfoResponse,
|
|
13
|
+
ApiState, CacheClearResponse, CacheStatsResponse, ChunkRequest, ChunkResponse, EmbedRequest, EmbedResponse,
|
|
14
|
+
ExtractResponse, HealthResponse, InfoResponse,
|
|
15
15
|
},
|
|
16
16
|
};
|
|
17
17
|
|
|
@@ -85,6 +85,25 @@ pub async fn extract_handler(
|
|
|
85
85
|
)))
|
|
86
86
|
})?;
|
|
87
87
|
}
|
|
88
|
+
"output_format" => {
|
|
89
|
+
let format_str = field
|
|
90
|
+
.text()
|
|
91
|
+
.await
|
|
92
|
+
.map_err(|e| ApiError::validation(crate::error::KreuzbergError::validation(e.to_string())))?;
|
|
93
|
+
|
|
94
|
+
config.output_format = match format_str.to_lowercase().as_str() {
|
|
95
|
+
"plain" => crate::core::config::OutputFormat::Plain,
|
|
96
|
+
"markdown" => crate::core::config::OutputFormat::Markdown,
|
|
97
|
+
"djot" => crate::core::config::OutputFormat::Djot,
|
|
98
|
+
"html" => crate::core::config::OutputFormat::Html,
|
|
99
|
+
_ => {
|
|
100
|
+
return Err(ApiError::validation(crate::error::KreuzbergError::validation(format!(
|
|
101
|
+
"Invalid output_format: '{}'. Valid values: 'plain', 'markdown', 'djot', 'html'",
|
|
102
|
+
format_str
|
|
103
|
+
))));
|
|
104
|
+
}
|
|
105
|
+
};
|
|
106
|
+
}
|
|
88
107
|
_ => {}
|
|
89
108
|
}
|
|
90
109
|
}
|
|
@@ -318,3 +337,81 @@ pub async fn embed_handler(Json(_request): Json<EmbedRequest>) -> Result<Json<Em
|
|
|
318
337
|
"Embeddings feature is not enabled. Rebuild with --features embeddings".to_string(),
|
|
319
338
|
)))
|
|
320
339
|
}
|
|
340
|
+
|
|
341
|
+
/// Chunk text endpoint handler.
|
|
342
|
+
///
|
|
343
|
+
/// POST /chunk
|
|
344
|
+
///
|
|
345
|
+
/// Accepts JSON body with text and optional configuration.
|
|
346
|
+
/// Returns chunks with metadata.
|
|
347
|
+
#[cfg_attr(
|
|
348
|
+
feature = "otel",
|
|
349
|
+
tracing::instrument(
|
|
350
|
+
name = "api.chunk",
|
|
351
|
+
skip(request),
|
|
352
|
+
fields(text_length = request.text.len(), chunker_type = request.chunker_type.as_str())
|
|
353
|
+
)
|
|
354
|
+
)]
|
|
355
|
+
pub async fn chunk_handler(Json(request): Json<ChunkRequest>) -> Result<Json<ChunkResponse>, ApiError> {
|
|
356
|
+
use super::types::{ChunkItem, ChunkingConfigResponse};
|
|
357
|
+
use crate::chunking::{ChunkerType, ChunkingConfig, chunk_text};
|
|
358
|
+
|
|
359
|
+
// Validate input
|
|
360
|
+
if request.text.is_empty() {
|
|
361
|
+
return Err(ApiError::validation(crate::error::KreuzbergError::validation(
|
|
362
|
+
"Text cannot be empty",
|
|
363
|
+
)));
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
// Parse chunker_type
|
|
367
|
+
let chunker_type = match request.chunker_type.to_lowercase().as_str() {
|
|
368
|
+
"text" | "" => ChunkerType::Text,
|
|
369
|
+
"markdown" => ChunkerType::Markdown,
|
|
370
|
+
other => {
|
|
371
|
+
return Err(ApiError::validation(crate::error::KreuzbergError::validation(format!(
|
|
372
|
+
"Invalid chunker_type: '{}'. Valid values: 'text', 'markdown'",
|
|
373
|
+
other
|
|
374
|
+
))));
|
|
375
|
+
}
|
|
376
|
+
};
|
|
377
|
+
|
|
378
|
+
// Build config with defaults
|
|
379
|
+
let cfg = request.config.unwrap_or_default();
|
|
380
|
+
let config = ChunkingConfig {
|
|
381
|
+
max_characters: cfg.max_characters.unwrap_or(2000),
|
|
382
|
+
overlap: cfg.overlap.unwrap_or(100),
|
|
383
|
+
trim: cfg.trim.unwrap_or(true),
|
|
384
|
+
chunker_type,
|
|
385
|
+
};
|
|
386
|
+
|
|
387
|
+
// Perform chunking
|
|
388
|
+
let result = chunk_text(&request.text, &config, None).map_err(ApiError::internal)?;
|
|
389
|
+
|
|
390
|
+
// Transform to response
|
|
391
|
+
let chunks = result
|
|
392
|
+
.chunks
|
|
393
|
+
.into_iter()
|
|
394
|
+
.map(|chunk| ChunkItem {
|
|
395
|
+
content: chunk.content,
|
|
396
|
+
byte_start: chunk.metadata.byte_start,
|
|
397
|
+
byte_end: chunk.metadata.byte_end,
|
|
398
|
+
chunk_index: chunk.metadata.chunk_index,
|
|
399
|
+
total_chunks: chunk.metadata.total_chunks,
|
|
400
|
+
first_page: chunk.metadata.first_page,
|
|
401
|
+
last_page: chunk.metadata.last_page,
|
|
402
|
+
})
|
|
403
|
+
.collect();
|
|
404
|
+
|
|
405
|
+
Ok(Json(ChunkResponse {
|
|
406
|
+
chunks,
|
|
407
|
+
chunk_count: result.chunk_count,
|
|
408
|
+
config: ChunkingConfigResponse {
|
|
409
|
+
max_characters: config.max_characters,
|
|
410
|
+
overlap: config.overlap,
|
|
411
|
+
trim: config.trim,
|
|
412
|
+
chunker_type: format!("{:?}", config.chunker_type).to_lowercase(),
|
|
413
|
+
},
|
|
414
|
+
input_size_bytes: request.text.len(),
|
|
415
|
+
chunker_type: request.chunker_type.to_lowercase(),
|
|
416
|
+
}))
|
|
417
|
+
}
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
//!
|
|
8
8
|
//! - `POST /extract` - Extract text from uploaded files (multipart form data)
|
|
9
9
|
//! - `POST /embed` - Generate embeddings for text (JSON body with texts array)
|
|
10
|
+
//! - `POST /chunk` - Chunk text into smaller pieces (JSON body with text and config)
|
|
10
11
|
//! - `GET /health` - Health check endpoint
|
|
11
12
|
//! - `GET /info` - Server information
|
|
12
13
|
//! - `GET /cache/stats` - Get cache statistics
|
|
@@ -76,19 +77,25 @@
|
|
|
76
77
|
//! curl -X POST http://localhost:8000/embed \
|
|
77
78
|
//! -H "Content-Type: application/json" \
|
|
78
79
|
//! -d '{"texts":["Hello world","Second text"]}'
|
|
80
|
+
//!
|
|
81
|
+
//! # Chunk text
|
|
82
|
+
//! curl -X POST http://localhost:8000/chunk \
|
|
83
|
+
//! -H "Content-Type: application/json" \
|
|
84
|
+
//! -d '{"text":"Long text to chunk...","chunker_type":"text"}'
|
|
79
85
|
//! ```
|
|
80
86
|
|
|
87
|
+
mod config;
|
|
81
88
|
mod error;
|
|
82
89
|
mod handlers;
|
|
83
|
-
mod
|
|
90
|
+
mod router;
|
|
91
|
+
mod startup;
|
|
84
92
|
mod types;
|
|
85
93
|
|
|
94
|
+
pub use config::load_server_config;
|
|
86
95
|
pub use error::ApiError;
|
|
87
|
-
pub use
|
|
88
|
-
|
|
89
|
-
serve_default, serve_with_config, serve_with_config_and_limits, serve_with_server_config,
|
|
90
|
-
};
|
|
96
|
+
pub use router::{create_router, create_router_with_limits, create_router_with_limits_and_server_config};
|
|
97
|
+
pub use startup::{serve, serve_default, serve_with_config, serve_with_config_and_limits, serve_with_server_config};
|
|
91
98
|
pub use types::{
|
|
92
|
-
ApiSizeLimits, ApiState, CacheClearResponse, CacheStatsResponse,
|
|
93
|
-
ExtractResponse, HealthResponse, InfoResponse,
|
|
99
|
+
ApiSizeLimits, ApiState, CacheClearResponse, CacheStatsResponse, ChunkRequest, ChunkResponse, EmbedRequest,
|
|
100
|
+
EmbedResponse, ErrorResponse, ExtractResponse, HealthResponse, InfoResponse,
|
|
94
101
|
};
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
//! API router setup and configuration.
|
|
2
|
+
|
|
3
|
+
use std::sync::Arc;
|
|
4
|
+
|
|
5
|
+
use axum::{
|
|
6
|
+
Router,
|
|
7
|
+
extract::DefaultBodyLimit,
|
|
8
|
+
routing::{delete, get, post},
|
|
9
|
+
};
|
|
10
|
+
use tower_http::{
|
|
11
|
+
cors::{AllowOrigin, Any, CorsLayer},
|
|
12
|
+
limit::RequestBodyLimitLayer,
|
|
13
|
+
trace::TraceLayer,
|
|
14
|
+
};
|
|
15
|
+
|
|
16
|
+
use crate::{ExtractionConfig, core::ServerConfig};
|
|
17
|
+
|
|
18
|
+
use super::{
|
|
19
|
+
handlers::{
|
|
20
|
+
cache_clear_handler, cache_stats_handler, chunk_handler, embed_handler, extract_handler, health_handler,
|
|
21
|
+
info_handler,
|
|
22
|
+
},
|
|
23
|
+
types::{ApiSizeLimits, ApiState},
|
|
24
|
+
};
|
|
25
|
+
|
|
26
|
+
/// Create the API router with all routes configured.
|
|
27
|
+
///
|
|
28
|
+
/// This is public to allow users to embed the router in their own applications.
|
|
29
|
+
///
|
|
30
|
+
/// # Arguments
|
|
31
|
+
///
|
|
32
|
+
/// * `config` - Default extraction configuration. Per-request configs override these defaults.
|
|
33
|
+
///
|
|
34
|
+
/// # Examples
|
|
35
|
+
///
|
|
36
|
+
/// ```no_run
|
|
37
|
+
/// use kreuzberg::{ExtractionConfig, api::create_router};
|
|
38
|
+
///
|
|
39
|
+
/// # #[tokio::main]
|
|
40
|
+
/// # async fn main() {
|
|
41
|
+
/// // Create router with default config and size limits
|
|
42
|
+
/// let config = ExtractionConfig::default();
|
|
43
|
+
/// let router = create_router(config);
|
|
44
|
+
/// # }
|
|
45
|
+
/// ```
|
|
46
|
+
pub fn create_router(config: ExtractionConfig) -> Router {
|
|
47
|
+
create_router_with_limits(config, ApiSizeLimits::default())
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
/// Create the API router with custom size limits.
|
|
51
|
+
///
|
|
52
|
+
/// This allows fine-grained control over request body and multipart field size limits.
|
|
53
|
+
///
|
|
54
|
+
/// # Arguments
|
|
55
|
+
///
|
|
56
|
+
/// * `config` - Default extraction configuration. Per-request configs override these defaults.
|
|
57
|
+
/// * `limits` - Size limits for request bodies and multipart uploads.
|
|
58
|
+
///
|
|
59
|
+
/// # Examples
|
|
60
|
+
///
|
|
61
|
+
/// ```no_run
|
|
62
|
+
/// use kreuzberg::{ExtractionConfig, api::{create_router_with_limits, ApiSizeLimits}};
|
|
63
|
+
///
|
|
64
|
+
/// # #[tokio::main]
|
|
65
|
+
/// # async fn main() {
|
|
66
|
+
/// // Create router with 50 MB limits
|
|
67
|
+
/// let config = ExtractionConfig::default();
|
|
68
|
+
/// let limits = ApiSizeLimits::from_mb(50, 50);
|
|
69
|
+
/// let router = create_router_with_limits(config, limits);
|
|
70
|
+
/// # }
|
|
71
|
+
/// ```
|
|
72
|
+
///
|
|
73
|
+
/// ```no_run
|
|
74
|
+
/// use kreuzberg::{ExtractionConfig, api::{create_router_with_limits, ApiSizeLimits}};
|
|
75
|
+
/// use tower_http::limit::RequestBodyLimitLayer;
|
|
76
|
+
///
|
|
77
|
+
/// # #[tokio::main]
|
|
78
|
+
/// # async fn main() {
|
|
79
|
+
/// // Custom limits for very large documents (500 MB)
|
|
80
|
+
/// let config = ExtractionConfig::default();
|
|
81
|
+
/// let limits = ApiSizeLimits::from_mb(500, 500);
|
|
82
|
+
/// let router = create_router_with_limits(config, limits);
|
|
83
|
+
/// # }
|
|
84
|
+
/// ```
|
|
85
|
+
pub fn create_router_with_limits(config: ExtractionConfig, limits: ApiSizeLimits) -> Router {
|
|
86
|
+
create_router_with_limits_and_server_config(config, limits, ServerConfig::default())
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
/// Create the API router with custom size limits and server configuration.
|
|
90
|
+
///
|
|
91
|
+
/// This function provides full control over request limits, CORS, and server settings via ServerConfig.
|
|
92
|
+
///
|
|
93
|
+
/// # Arguments
|
|
94
|
+
///
|
|
95
|
+
/// * `config` - Default extraction configuration. Per-request configs override these defaults.
|
|
96
|
+
/// * `limits` - Size limits for request bodies and multipart uploads.
|
|
97
|
+
/// * `server_config` - Server configuration including host, port, and CORS settings.
|
|
98
|
+
///
|
|
99
|
+
/// # Examples
|
|
100
|
+
///
|
|
101
|
+
/// ```no_run
|
|
102
|
+
/// use kreuzberg::{ExtractionConfig, api::create_router_with_limits, core::ServerConfig};
|
|
103
|
+
///
|
|
104
|
+
/// # #[tokio::main]
|
|
105
|
+
/// # async fn main() -> kreuzberg::Result<()> {
|
|
106
|
+
/// let extraction_config = ExtractionConfig::default();
|
|
107
|
+
/// let mut server_config = ServerConfig::default();
|
|
108
|
+
/// server_config.cors_origins = vec!["https://example.com".to_string()];
|
|
109
|
+
/// let router = create_router_with_limits_and_server_config(
|
|
110
|
+
/// extraction_config,
|
|
111
|
+
/// Default::default(),
|
|
112
|
+
/// server_config
|
|
113
|
+
/// );
|
|
114
|
+
/// # Ok(())
|
|
115
|
+
/// # }
|
|
116
|
+
/// ```
|
|
117
|
+
pub fn create_router_with_limits_and_server_config(
|
|
118
|
+
config: ExtractionConfig,
|
|
119
|
+
limits: ApiSizeLimits,
|
|
120
|
+
server_config: ServerConfig,
|
|
121
|
+
) -> Router {
|
|
122
|
+
let state = ApiState {
|
|
123
|
+
default_config: Arc::new(config),
|
|
124
|
+
};
|
|
125
|
+
|
|
126
|
+
// CORS configuration based on ServerConfig
|
|
127
|
+
let cors_layer = if server_config.cors_allows_all() {
|
|
128
|
+
tracing::warn!(
|
|
129
|
+
"CORS configured to allow all origins (default). This permits CSRF attacks. \
|
|
130
|
+
For production, set KREUZBERG_CORS_ORIGINS environment variable to comma-separated \
|
|
131
|
+
list of allowed origins (e.g., 'https://app.example.com,https://api.example.com')"
|
|
132
|
+
);
|
|
133
|
+
CorsLayer::new().allow_origin(Any).allow_methods(Any).allow_headers(Any)
|
|
134
|
+
} else {
|
|
135
|
+
let origins: Vec<_> = server_config
|
|
136
|
+
.cors_origins
|
|
137
|
+
.iter()
|
|
138
|
+
.filter_map(|s| s.trim().parse::<axum::http::HeaderValue>().ok())
|
|
139
|
+
.collect();
|
|
140
|
+
|
|
141
|
+
if !origins.is_empty() {
|
|
142
|
+
tracing::info!("CORS configured with {} explicit allowed origin(s)", origins.len());
|
|
143
|
+
CorsLayer::new()
|
|
144
|
+
.allow_origin(AllowOrigin::list(origins))
|
|
145
|
+
.allow_methods(Any)
|
|
146
|
+
.allow_headers(Any)
|
|
147
|
+
} else {
|
|
148
|
+
tracing::warn!(
|
|
149
|
+
"CORS origins configured but empty/invalid - falling back to permissive CORS. \
|
|
150
|
+
This allows CSRF attacks. Set explicit origins for production."
|
|
151
|
+
);
|
|
152
|
+
CorsLayer::new().allow_origin(Any).allow_methods(Any).allow_headers(Any)
|
|
153
|
+
}
|
|
154
|
+
};
|
|
155
|
+
|
|
156
|
+
Router::new()
|
|
157
|
+
.route("/extract", post(extract_handler))
|
|
158
|
+
.route("/embed", post(embed_handler))
|
|
159
|
+
.route("/chunk", post(chunk_handler))
|
|
160
|
+
.route("/health", get(health_handler))
|
|
161
|
+
.route("/info", get(info_handler))
|
|
162
|
+
.route("/cache/stats", get(cache_stats_handler))
|
|
163
|
+
.route("/cache/clear", delete(cache_clear_handler))
|
|
164
|
+
.layer(DefaultBodyLimit::max(limits.max_request_body_bytes))
|
|
165
|
+
.layer(RequestBodyLimitLayer::new(limits.max_request_body_bytes))
|
|
166
|
+
.layer(cors_layer)
|
|
167
|
+
.layer(TraceLayer::new_for_http())
|
|
168
|
+
.with_state(state)
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Building a router from the default extraction config must not panic.
    #[test]
    fn test_create_router() {
        let _router = create_router(ExtractionConfig::default());
    }

    /// Smoke check that the constructed router is a non-zero-sized value.
    #[test]
    fn test_router_has_routes() {
        let router = create_router(ExtractionConfig::default());
        assert!(std::mem::size_of_val(&router) > 0);
    }

    /// Building a router with explicit 50 MB limits must not panic.
    #[test]
    fn test_create_router_with_limits() {
        let _router = create_router_with_limits(ExtractionConfig::default(), ApiSizeLimits::from_mb(50, 50));
    }

    /// Building a router with explicit limits and a default server config must not panic.
    #[test]
    fn test_create_router_with_server_config() {
        let _router = create_router_with_limits_and_server_config(
            ExtractionConfig::default(),
            ApiSizeLimits::from_mb(100, 100),
            ServerConfig::default(),
        );
    }

    /// Building a router with one explicit CORS origin must not panic.
    #[test]
    fn test_server_config_cors_handling() {
        let mut server_config = ServerConfig::default();
        server_config.cors_origins = vec!["https://example.com".to_string()];

        let _router = create_router_with_limits_and_server_config(
            ExtractionConfig::default(),
            ApiSizeLimits::default(),
            server_config,
        );
    }
}
|