kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -25,6 +25,14 @@ typedef struct ExtractionConfig ExtractionConfig;
|
|
|
25
25
|
typedef struct ExtractionResult ExtractionResult;
|
|
26
26
|
|
|
27
27
|
|
|
28
|
+
/**
|
|
29
|
+
* Opaque builder struct for constructing ExtractionConfig.
|
|
30
|
+
*
|
|
31
|
+
* Use kreuzberg_config_builder_new() to create, set fields with setters,
|
|
32
|
+
* then finalize with kreuzberg_config_builder_build().
|
|
33
|
+
*/
|
|
34
|
+
typedef struct ConfigBuilder ConfigBuilder;
|
|
35
|
+
|
|
28
36
|
typedef struct Option_ErrorCallback Option_ErrorCallback;
|
|
29
37
|
|
|
30
38
|
/**
|
|
@@ -658,22 +666,6 @@ int kreuzberg_extract_batch_parallel(const char *const *files,
|
|
|
658
666
|
* - `json_config` must be a valid null-terminated C string
|
|
659
667
|
* - The returned pointer must be freed with `kreuzberg_config_free`
|
|
660
668
|
* - Returns NULL if parsing fails (error available via `kreuzberg_last_error`)
|
|
661
|
-
*
|
|
662
|
-
* # Example (C)
|
|
663
|
-
*
|
|
664
|
-
* ```c
|
|
665
|
-
* const char* config_json = "{\"use_cache\": true, \"ocr\": {\"backend\": \"tesseract\"}}";
|
|
666
|
-
* ExtractionConfig* config = kreuzberg_config_from_json(config_json);
|
|
667
|
-
* if (config == NULL) {
|
|
668
|
-
* printf("Error: %s\n", kreuzberg_last_error());
|
|
669
|
-
* return 1;
|
|
670
|
-
* }
|
|
671
|
-
*
|
|
672
|
-
* // Use config...
|
|
673
|
-
* // char* result = kreuzberg_extract_file_with_config("doc.pdf", config);
|
|
674
|
-
*
|
|
675
|
-
* kreuzberg_config_free(config);
|
|
676
|
-
* ```
|
|
677
669
|
*/
|
|
678
670
|
ExtractionConfig *kreuzberg_config_from_json(const char *json_config);
|
|
679
671
|
|
|
@@ -685,30 +677,12 @@ ExtractionConfig *kreuzberg_config_from_json(const char *json_config);
|
|
|
685
677
|
* - `config` must be a pointer previously returned by a config creation function
|
|
686
678
|
* - `config` can be NULL (no-op)
|
|
687
679
|
* - `config` must not be used after this call
|
|
688
|
-
*
|
|
689
|
-
* # Example (C)
|
|
690
|
-
*
|
|
691
|
-
* ```c
|
|
692
|
-
* ExtractionConfig* config = kreuzberg_config_from_json("{...}");
|
|
693
|
-
* if (config != NULL) {
|
|
694
|
-
* // Use config...
|
|
695
|
-
* kreuzberg_config_free(config);
|
|
696
|
-
* }
|
|
697
|
-
* ```
|
|
698
680
|
*/
|
|
699
681
|
void kreuzberg_config_free(ExtractionConfig *config);
|
|
700
682
|
|
|
701
683
|
/**
|
|
702
684
|
* Validate a JSON config string without parsing it.
|
|
703
685
|
*
|
|
704
|
-
* This function checks if a JSON config string is valid and would parse correctly,
|
|
705
|
-
* without allocating the full ExtractionConfig structure. Useful for validation
|
|
706
|
-
* before committing to parsing.
|
|
707
|
-
*
|
|
708
|
-
* # Arguments
|
|
709
|
-
*
|
|
710
|
-
* * `json_config` - Null-terminated C string containing JSON configuration
|
|
711
|
-
*
|
|
712
686
|
* # Returns
|
|
713
687
|
*
|
|
714
688
|
* - 1 if valid (would parse successfully)
|
|
@@ -717,238 +691,312 @@ void kreuzberg_config_free(ExtractionConfig *config);
|
|
|
717
691
|
* # Safety
|
|
718
692
|
*
|
|
719
693
|
* - `json_config` must be a valid null-terminated C string
|
|
720
|
-
*
|
|
721
|
-
* # Example (C)
|
|
722
|
-
*
|
|
723
|
-
* ```c
|
|
724
|
-
* const char* config_json = "{\"use_cache\": true}";
|
|
725
|
-
* if (kreuzberg_config_is_valid(config_json)) {
|
|
726
|
-
* ExtractionConfig* config = kreuzberg_config_from_json(config_json);
|
|
727
|
-
* // Use config...
|
|
728
|
-
* kreuzberg_config_free(config);
|
|
729
|
-
* } else {
|
|
730
|
-
* printf("Invalid config: %s\n", kreuzberg_last_error());
|
|
731
|
-
* }
|
|
732
|
-
* ```
|
|
733
694
|
*/
|
|
734
695
|
int32_t kreuzberg_config_is_valid(const char *json_config);
|
|
735
696
|
|
|
736
697
|
/**
|
|
737
698
|
* Serialize an ExtractionConfig to JSON string.
|
|
738
699
|
*
|
|
739
|
-
*
|
|
740
|
-
* bindings to serialize configs without reimplementing serialization logic.
|
|
700
|
+
* # Safety
|
|
741
701
|
*
|
|
742
|
-
*
|
|
702
|
+
* - `config` must be a valid pointer to an ExtractionConfig
|
|
703
|
+
* - The returned pointer must be freed with `kreuzberg_free_string`
|
|
704
|
+
*/
|
|
705
|
+
char *kreuzberg_config_to_json(const ExtractionConfig *config);
|
|
706
|
+
|
|
707
|
+
/**
|
|
708
|
+
* Get a specific field from config as JSON string.
|
|
743
709
|
*
|
|
744
|
-
*
|
|
710
|
+
* # Safety
|
|
711
|
+
*
|
|
712
|
+
* - `config` must be a valid pointer to an ExtractionConfig
|
|
713
|
+
* - `field_name` must be a valid null-terminated C string
|
|
714
|
+
*/
|
|
715
|
+
char *kreuzberg_config_get_field(const ExtractionConfig *config, const char *field_name);
|
|
716
|
+
|
|
717
|
+
/**
|
|
718
|
+
* Merge two configs (override takes precedence over base).
|
|
745
719
|
*
|
|
746
720
|
* # Returns
|
|
747
721
|
*
|
|
748
|
-
*
|
|
749
|
-
*
|
|
722
|
+
* - 1 on success
|
|
723
|
+
* - 0 on error (check `kreuzberg_last_error`)
|
|
750
724
|
*
|
|
751
725
|
* # Safety
|
|
752
726
|
*
|
|
753
|
-
* - `
|
|
754
|
-
* - `
|
|
755
|
-
|
|
727
|
+
* - `base` must be a valid mutable pointer to an ExtractionConfig
|
|
728
|
+
* - `override_config` must be a valid pointer to an ExtractionConfig
|
|
729
|
+
*/
|
|
730
|
+
int32_t kreuzberg_config_merge(ExtractionConfig *base, const ExtractionConfig *override_config);
|
|
731
|
+
|
|
732
|
+
/**
|
|
733
|
+
* Load an ExtractionConfig from a file (returns JSON string).
|
|
756
734
|
*
|
|
757
|
-
* #
|
|
735
|
+
* # Safety
|
|
758
736
|
*
|
|
759
|
-
*
|
|
760
|
-
*
|
|
761
|
-
* if (config != NULL) {
|
|
762
|
-
* char* json = kreuzberg_config_to_json(config);
|
|
763
|
-
* if (json != NULL) {
|
|
764
|
-
* printf("Serialized: %s\n", json);
|
|
765
|
-
* kreuzberg_free_string(json);
|
|
766
|
-
* }
|
|
767
|
-
* kreuzberg_config_free(config);
|
|
768
|
-
* }
|
|
769
|
-
* ```
|
|
737
|
+
* - `file_path` must be a valid null-terminated C string
|
|
738
|
+
* - The returned string must be freed with `kreuzberg_free_string`
|
|
770
739
|
*/
|
|
771
|
-
char *
|
|
740
|
+
char *kreuzberg_load_extraction_config_from_file(const char *file_path);
|
|
772
741
|
|
|
773
742
|
/**
|
|
774
|
-
*
|
|
743
|
+
* Load an ExtractionConfig from a file (returns pointer to config struct).
|
|
775
744
|
*
|
|
776
|
-
*
|
|
777
|
-
* representation. Supports dot notation for nested fields (e.g., "ocr.backend").
|
|
745
|
+
* # Safety
|
|
778
746
|
*
|
|
779
|
-
*
|
|
747
|
+
* - `path` must be a valid null-terminated C string
|
|
748
|
+
* - The returned pointer must be freed with `kreuzberg_config_free`
|
|
749
|
+
*/
|
|
750
|
+
ExtractionConfig *kreuzberg_config_from_file(const char *path);
|
|
751
|
+
|
|
752
|
+
/**
|
|
753
|
+
* Discover and load an ExtractionConfig by searching parent directories.
|
|
780
754
|
*
|
|
781
|
-
*
|
|
782
|
-
* * `field_name` - Null-terminated C string with field path (e.g., "use_cache", "ocr.backend")
|
|
755
|
+
* # Safety
|
|
783
756
|
*
|
|
784
|
-
*
|
|
757
|
+
* - The returned string must be freed with `kreuzberg_free_string`
|
|
758
|
+
*/
|
|
759
|
+
char *kreuzberg_config_discover(void);
|
|
760
|
+
|
|
761
|
+
/**
|
|
762
|
+
* List available embedding preset names.
|
|
785
763
|
*
|
|
786
|
-
*
|
|
787
|
-
* - The field doesn't exist
|
|
788
|
-
* - An error occurs during serialization
|
|
764
|
+
* # Safety
|
|
789
765
|
*
|
|
790
|
-
*
|
|
766
|
+
* - Returned string is a JSON array and must be freed with `kreuzberg_free_string`
|
|
767
|
+
*/
|
|
768
|
+
char *kreuzberg_list_embedding_presets(void);
|
|
769
|
+
|
|
770
|
+
/**
|
|
771
|
+
* Get a specific embedding preset by name.
|
|
791
772
|
*
|
|
792
773
|
* # Safety
|
|
793
774
|
*
|
|
794
|
-
* - `
|
|
795
|
-
* -
|
|
796
|
-
|
|
775
|
+
* - `name` must be a valid null-terminated C string
|
|
776
|
+
* - Returned string is JSON object and must be freed with `kreuzberg_free_string`
|
|
777
|
+
*/
|
|
778
|
+
char *kreuzberg_get_embedding_preset(const char *name);
|
|
779
|
+
|
|
780
|
+
/**
|
|
781
|
+
* Create a new config builder.
|
|
797
782
|
*
|
|
798
|
-
*
|
|
783
|
+
* Returns an opaque pointer to ConfigBuilder. Must be freed with
|
|
784
|
+
* kreuzberg_config_builder_free() or consumed by kreuzberg_config_builder_build().
|
|
799
785
|
*
|
|
800
|
-
*
|
|
801
|
-
* ExtractionConfig* config = kreuzberg_config_from_json(
|
|
802
|
-
* "{\"use_cache\": true, \"ocr\": {\"backend\": \"tesseract\"}}"
|
|
803
|
-
* );
|
|
804
|
-
* if (config != NULL) {
|
|
805
|
-
* char* use_cache = kreuzberg_config_get_field(config, "use_cache");
|
|
806
|
-
* char* backend = kreuzberg_config_get_field(config, "ocr.backend");
|
|
786
|
+
* # Safety
|
|
807
787
|
*
|
|
808
|
-
*
|
|
809
|
-
*
|
|
810
|
-
* kreuzberg_free_string(use_cache);
|
|
811
|
-
* }
|
|
788
|
+
* The returned pointer must be freed with kreuzberg_config_builder_free()
|
|
789
|
+
* or passed to kreuzberg_config_builder_build().
|
|
812
790
|
*
|
|
813
|
-
*
|
|
814
|
-
* printf("backend: %s\n", backend);
|
|
815
|
-
* kreuzberg_free_string(backend);
|
|
816
|
-
* }
|
|
791
|
+
* # Example (C)
|
|
817
792
|
*
|
|
818
|
-
*
|
|
819
|
-
*
|
|
793
|
+
* ```c
|
|
794
|
+
* ConfigBuilder* builder = kreuzberg_config_builder_new();
|
|
795
|
+
* kreuzberg_config_builder_set_use_cache(builder, 1);
|
|
796
|
+
* ExtractionConfig* config = kreuzberg_config_builder_build(builder);
|
|
797
|
+
* // builder is now consumed, don't call kreuzberg_config_builder_free
|
|
798
|
+
* kreuzberg_config_free(config);
|
|
820
799
|
* ```
|
|
821
800
|
*/
|
|
822
|
-
|
|
801
|
+
struct ConfigBuilder *kreuzberg_config_builder_new(void);
|
|
823
802
|
|
|
824
803
|
/**
|
|
825
|
-
*
|
|
826
|
-
*
|
|
827
|
-
* Performs a shallow merge of two ExtractionConfig structures, where fields
|
|
828
|
-
* from `override_config` take precedence over fields in `base`. The `base`
|
|
829
|
-
* config is modified in-place.
|
|
804
|
+
* Set the use_cache field.
|
|
830
805
|
*
|
|
831
806
|
* # Arguments
|
|
832
807
|
*
|
|
833
|
-
* * `
|
|
834
|
-
* * `
|
|
808
|
+
* * `builder` - Non-null pointer to ConfigBuilder
|
|
809
|
+
* * `use_cache` - 1 for true, 0 for false
|
|
835
810
|
*
|
|
836
811
|
* # Returns
|
|
837
812
|
*
|
|
838
|
-
* -
|
|
839
|
-
* - 0 on error (check `kreuzberg_last_error`)
|
|
813
|
+
* 0 on success, -1 on error (NULL builder)
|
|
840
814
|
*
|
|
841
815
|
* # Safety
|
|
842
816
|
*
|
|
843
|
-
*
|
|
844
|
-
* - `
|
|
845
|
-
* -
|
|
846
|
-
|
|
817
|
+
* This function is meant to be called from C/FFI code. The caller must ensure:
|
|
818
|
+
* - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
|
|
819
|
+
* - The pointer must be properly aligned and point to a valid ConfigBuilder instance
|
|
820
|
+
*/
|
|
821
|
+
int32_t kreuzberg_config_builder_set_use_cache(struct ConfigBuilder *builder,
|
|
822
|
+
int32_t use_cache);
|
|
823
|
+
|
|
824
|
+
/**
|
|
825
|
+
* Set OCR configuration from JSON.
|
|
847
826
|
*
|
|
848
|
-
* #
|
|
827
|
+
* # Arguments
|
|
849
828
|
*
|
|
850
|
-
*
|
|
851
|
-
*
|
|
852
|
-
* "{\"use_cache\": true, \"force_ocr\": false}"
|
|
853
|
-
* );
|
|
854
|
-
* ExtractionConfig* override = kreuzberg_config_from_json(
|
|
855
|
-
* "{\"force_ocr\": true}"
|
|
856
|
-
* );
|
|
829
|
+
* * `builder` - Non-null pointer to ConfigBuilder
|
|
830
|
+
* * `ocr_json` - JSON string like `{"backend": "tesseract", "languages": ["en"]}`
|
|
857
831
|
*
|
|
858
|
-
*
|
|
859
|
-
* // base now has: use_cache=true, force_ocr=true
|
|
860
|
-
* char* json = kreuzberg_config_to_json(base);
|
|
861
|
-
* printf("Merged config: %s\n", json);
|
|
862
|
-
* kreuzberg_free_string(json);
|
|
863
|
-
* }
|
|
832
|
+
* # Returns
|
|
864
833
|
*
|
|
865
|
-
*
|
|
866
|
-
*
|
|
867
|
-
*
|
|
834
|
+
* 0 on success, -1 on error (check kreuzberg_last_error)
|
|
835
|
+
*
|
|
836
|
+
* # Safety
|
|
837
|
+
*
|
|
838
|
+
* This function is meant to be called from C/FFI code. The caller must ensure:
|
|
839
|
+
* - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
|
|
840
|
+
* - The pointer must be properly aligned and point to a valid ConfigBuilder instance
|
|
841
|
+
* - `ocr_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
|
|
842
|
+
* - The string pointer must remain valid for the duration of the function call
|
|
868
843
|
*/
|
|
869
|
-
int32_t
|
|
844
|
+
int32_t kreuzberg_config_builder_set_ocr(struct ConfigBuilder *builder,
|
|
845
|
+
const char *ocr_json);
|
|
870
846
|
|
|
871
847
|
/**
|
|
872
|
-
*
|
|
848
|
+
* Set PDF configuration from JSON.
|
|
849
|
+
*
|
|
850
|
+
* # Arguments
|
|
851
|
+
*
|
|
852
|
+
* * `builder` - Non-null pointer to ConfigBuilder
|
|
853
|
+
* * `pdf_json` - JSON string for PDF config
|
|
854
|
+
*
|
|
855
|
+
* # Returns
|
|
873
856
|
*
|
|
874
|
-
*
|
|
857
|
+
* 0 on success, -1 on error
|
|
875
858
|
*
|
|
876
859
|
* # Safety
|
|
877
860
|
*
|
|
878
|
-
*
|
|
879
|
-
* -
|
|
880
|
-
* -
|
|
861
|
+
* This function is meant to be called from C/FFI code. The caller must ensure:
|
|
862
|
+
* - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
|
|
863
|
+
* - The pointer must be properly aligned and point to a valid ConfigBuilder instance
|
|
864
|
+
* - `pdf_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
|
|
865
|
+
* - The string pointer must remain valid for the duration of the function call
|
|
881
866
|
*/
|
|
882
|
-
|
|
867
|
+
int32_t kreuzberg_config_builder_set_pdf(struct ConfigBuilder *builder,
|
|
868
|
+
const char *pdf_json);
|
|
883
869
|
|
|
884
870
|
/**
|
|
885
|
-
*
|
|
871
|
+
* Set chunking configuration from JSON.
|
|
872
|
+
*
|
|
873
|
+
* # Arguments
|
|
874
|
+
*
|
|
875
|
+
* * `builder` - Non-null pointer to ConfigBuilder
|
|
876
|
+
* * `chunking_json` - JSON string for chunking config
|
|
877
|
+
*
|
|
878
|
+
* # Returns
|
|
879
|
+
*
|
|
880
|
+
* 0 on success, -1 on error
|
|
886
881
|
*
|
|
887
882
|
* # Safety
|
|
888
883
|
*
|
|
889
|
-
*
|
|
890
|
-
* -
|
|
891
|
-
* -
|
|
884
|
+
* This function is meant to be called from C/FFI code. The caller must ensure:
|
|
885
|
+
* - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
|
|
886
|
+
* - The pointer must be properly aligned and point to a valid ConfigBuilder instance
|
|
887
|
+
* - `chunking_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
|
|
888
|
+
* - The string pointer must remain valid for the duration of the function call
|
|
889
|
+
*/
|
|
890
|
+
int32_t kreuzberg_config_builder_set_chunking(struct ConfigBuilder *builder,
|
|
891
|
+
const char *chunking_json);
|
|
892
|
+
|
|
893
|
+
/**
|
|
894
|
+
* Set image extraction configuration from JSON.
|
|
892
895
|
*
|
|
893
|
-
* #
|
|
896
|
+
* # Arguments
|
|
894
897
|
*
|
|
895
|
-
*
|
|
896
|
-
*
|
|
897
|
-
*
|
|
898
|
-
*
|
|
899
|
-
*
|
|
900
|
-
*
|
|
901
|
-
*
|
|
902
|
-
*
|
|
898
|
+
* * `builder` - Non-null pointer to ConfigBuilder
|
|
899
|
+
* * `image_json` - JSON string for image extraction config
|
|
900
|
+
*
|
|
901
|
+
* # Returns
|
|
902
|
+
*
|
|
903
|
+
* 0 on success, -1 on error
|
|
904
|
+
*
|
|
905
|
+
* # Safety
|
|
906
|
+
*
|
|
907
|
+
* This function is meant to be called from C/FFI code. The caller must ensure:
|
|
908
|
+
* - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
|
|
909
|
+
* - The pointer must be properly aligned and point to a valid ConfigBuilder instance
|
|
910
|
+
* - `image_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
|
|
911
|
+
* - The string pointer must remain valid for the duration of the function call
|
|
903
912
|
*/
|
|
904
|
-
|
|
913
|
+
int32_t kreuzberg_config_builder_set_image_extraction(struct ConfigBuilder *builder,
|
|
914
|
+
const char *image_json);
|
|
905
915
|
|
|
906
916
|
/**
|
|
907
|
-
*
|
|
917
|
+
* Set post-processor configuration from JSON.
|
|
908
918
|
*
|
|
909
|
-
*
|
|
910
|
-
* - `kreuzberg.toml`
|
|
911
|
-
* - `kreuzberg.json`
|
|
919
|
+
* # Arguments
|
|
912
920
|
*
|
|
913
|
-
*
|
|
921
|
+
* * `builder` - Non-null pointer to ConfigBuilder
|
|
922
|
+
* * `pp_json` - JSON string for post-processor config
|
|
923
|
+
*
|
|
924
|
+
* # Returns
|
|
925
|
+
*
|
|
926
|
+
* 0 on success, -1 on error
|
|
914
927
|
*
|
|
915
928
|
* # Safety
|
|
916
929
|
*
|
|
917
|
-
*
|
|
918
|
-
* -
|
|
930
|
+
* This function is meant to be called from C/FFI code. The caller must ensure:
|
|
931
|
+
* - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
|
|
932
|
+
* - The pointer must be properly aligned and point to a valid ConfigBuilder instance
|
|
933
|
+
* - `pp_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
|
|
934
|
+
* - The string pointer must remain valid for the duration of the function call
|
|
935
|
+
*/
|
|
936
|
+
int32_t kreuzberg_config_builder_set_post_processor(struct ConfigBuilder *builder,
|
|
937
|
+
const char *pp_json);
|
|
938
|
+
|
|
939
|
+
/**
|
|
940
|
+
* Set language detection configuration from JSON.
|
|
919
941
|
*
|
|
920
|
-
* #
|
|
942
|
+
* # Arguments
|
|
921
943
|
*
|
|
922
|
-
*
|
|
923
|
-
*
|
|
924
|
-
*
|
|
925
|
-
*
|
|
926
|
-
*
|
|
927
|
-
*
|
|
928
|
-
*
|
|
944
|
+
* * `builder` - Non-null pointer to ConfigBuilder
|
|
945
|
+
* * `ld_json` - JSON string for language detection config
|
|
946
|
+
*
|
|
947
|
+
* # Returns
|
|
948
|
+
*
|
|
949
|
+
* 0 on success, -1 on error
|
|
950
|
+
*
|
|
951
|
+
* # Safety
|
|
952
|
+
*
|
|
953
|
+
* This function is meant to be called from C/FFI code. The caller must ensure:
|
|
954
|
+
* - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
|
|
955
|
+
* - The pointer must be properly aligned and point to a valid ConfigBuilder instance
|
|
956
|
+
* - `ld_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
|
|
957
|
+
* - The string pointer must remain valid for the duration of the function call
|
|
929
958
|
*/
|
|
930
|
-
|
|
959
|
+
int32_t kreuzberg_config_builder_set_language_detection(struct ConfigBuilder *builder,
|
|
960
|
+
const char *ld_json);
|
|
931
961
|
|
|
932
962
|
/**
|
|
933
|
-
*
|
|
963
|
+
* Build the final ExtractionConfig and consume the builder.
|
|
964
|
+
*
|
|
965
|
+
* After calling this function, the builder pointer is invalid and must not be used.
|
|
966
|
+
* The returned ExtractionConfig must be freed with kreuzberg_config_free().
|
|
967
|
+
*
|
|
968
|
+
* # Arguments
|
|
969
|
+
*
|
|
970
|
+
* * `builder` - Non-null pointer to ConfigBuilder (will be consumed)
|
|
971
|
+
*
|
|
972
|
+
* # Returns
|
|
973
|
+
*
|
|
974
|
+
* Pointer to ExtractionConfig on success, NULL on error
|
|
934
975
|
*
|
|
935
976
|
* # Safety
|
|
936
977
|
*
|
|
937
|
-
* -
|
|
938
|
-
* -
|
|
978
|
+
* - `builder` is consumed and must not be used after this call
|
|
979
|
+
* - Do NOT call kreuzberg_config_builder_free() after this function
|
|
980
|
+
* - The returned ExtractionConfig must be freed with kreuzberg_config_free()
|
|
939
981
|
*/
|
|
940
|
-
|
|
982
|
+
ExtractionConfig *kreuzberg_config_builder_build(struct ConfigBuilder *builder);
|
|
941
983
|
|
|
942
984
|
/**
|
|
943
|
-
*
|
|
985
|
+
* Free a ConfigBuilder without building.
|
|
986
|
+
*
|
|
987
|
+
* Use this to discard a builder without creating a config.
|
|
988
|
+
* Do NOT call this after kreuzberg_config_builder_build() (builder is already consumed).
|
|
989
|
+
*
|
|
990
|
+
* # Arguments
|
|
991
|
+
*
|
|
992
|
+
* * `builder` - Pointer to ConfigBuilder, can be NULL (no-op)
|
|
944
993
|
*
|
|
945
994
|
* # Safety
|
|
946
995
|
*
|
|
947
|
-
* - `
|
|
948
|
-
* -
|
|
949
|
-
* - Returns NULL on error (check `kreuzberg_last_error`)
|
|
996
|
+
* - `builder` can be NULL (no-op)
|
|
997
|
+
* - Do NOT call this after kreuzberg_config_builder_build()
|
|
950
998
|
*/
|
|
951
|
-
|
|
999
|
+
void kreuzberg_config_builder_free(struct ConfigBuilder *builder);
|
|
952
1000
|
|
|
953
1001
|
/**
|
|
954
1002
|
* Returns the validation error code (0).
|
|
@@ -1370,6 +1418,127 @@ struct CBatchResult *kreuzberg_batch_extract_bytes_sync(const struct CBytesWithM
|
|
|
1370
1418
|
uintptr_t count,
|
|
1371
1419
|
const char *config_json);
|
|
1372
1420
|
|
|
1421
|
+
/**
|
|
1422
|
+
* Parse HeadingStyle from string to discriminant.
|
|
1423
|
+
*
|
|
1424
|
+
* Valid values: "atx", "underlined", "atx_closed" | "atx-closed"
|
|
1425
|
+
* Returns: 0 = Atx, 1 = Underlined, 2 = AtxClosed, -1 = Invalid
|
|
1426
|
+
*
|
|
1427
|
+
* # Safety
|
|
1428
|
+
*
|
|
1429
|
+
* - `value` must be a valid null-terminated C string or NULL
|
|
1430
|
+
*/
|
|
1431
|
+
int32_t kreuzberg_parse_heading_style(const char *value);
|
|
1432
|
+
|
|
1433
|
+
/**
|
|
1434
|
+
* Convert HeadingStyle discriminant to string.
|
|
1435
|
+
*
|
|
1436
|
+
* Returns: pointer to static string, or NULL for invalid discriminant
|
|
1437
|
+
*/
|
|
1438
|
+
const char *kreuzberg_heading_style_to_string(int32_t discriminant);
|
|
1439
|
+
|
|
1440
|
+
/**
|
|
1441
|
+
* Parse CodeBlockStyle from string to discriminant.
|
|
1442
|
+
*
|
|
1443
|
+
* Valid values: "indented", "backticks", "tildes"
|
|
1444
|
+
* Returns: 0 = Indented, 1 = Backticks, 2 = Tildes, -1 = Invalid
|
|
1445
|
+
*
|
|
1446
|
+
* # Safety
|
|
1447
|
+
*
|
|
1448
|
+
* - `value` must be a valid null-terminated C string or NULL
|
|
1449
|
+
*/
|
|
1450
|
+
int32_t kreuzberg_parse_code_block_style(const char *value);
|
|
1451
|
+
|
|
1452
|
+
/**
|
|
1453
|
+
* Convert CodeBlockStyle discriminant to string.
|
|
1454
|
+
*/
|
|
1455
|
+
const char *kreuzberg_code_block_style_to_string(int32_t discriminant);
|
|
1456
|
+
|
|
1457
|
+
/**
|
|
1458
|
+
* Parse HighlightStyle from string to discriminant.
|
|
1459
|
+
*
|
|
1460
|
+
* Valid values: "double_equal" | "==" | "double-equal", "html", "bold", "none"
|
|
1461
|
+
* Returns: 0 = DoubleEqual, 1 = Html, 2 = Bold, 3 = None, -1 = Invalid
|
|
1462
|
+
*
|
|
1463
|
+
* # Safety
|
|
1464
|
+
*
|
|
1465
|
+
* - `value` must be a valid null-terminated C string or NULL
|
|
1466
|
+
*/
|
|
1467
|
+
int32_t kreuzberg_parse_highlight_style(const char *value);
|
|
1468
|
+
|
|
1469
|
+
/**
|
|
1470
|
+
* Convert HighlightStyle discriminant to string.
|
|
1471
|
+
*/
|
|
1472
|
+
const char *kreuzberg_highlight_style_to_string(int32_t discriminant);
|
|
1473
|
+
|
|
1474
|
+
/**
|
|
1475
|
+
* Parse ListIndentType from string to discriminant.
|
|
1476
|
+
*
|
|
1477
|
+
* Valid values: "spaces", "tabs"
|
|
1478
|
+
* Returns: 0 = Spaces, 1 = Tabs, -1 = Invalid
|
|
1479
|
+
*
|
|
1480
|
+
* # Safety
|
|
1481
|
+
*
|
|
1482
|
+
* - `value` must be a valid null-terminated C string or NULL
|
|
1483
|
+
*/
|
|
1484
|
+
int32_t kreuzberg_parse_list_indent_type(const char *value);
|
|
1485
|
+
|
|
1486
|
+
/**
|
|
1487
|
+
* Convert ListIndentType discriminant to string.
|
|
1488
|
+
*/
|
|
1489
|
+
const char *kreuzberg_list_indent_type_to_string(int32_t discriminant);
|
|
1490
|
+
|
|
1491
|
+
/**
|
|
1492
|
+
* Parse WhitespaceMode from string to discriminant.
|
|
1493
|
+
*
|
|
1494
|
+
* Valid values: "default", "preserve", "preserve_inner", "collapse"
|
|
1495
|
+
* Returns: 0 = Default, 1 = Preserve, 2 = PreserveInner, 3 = Collapse, -1 = Invalid
|
|
1496
|
+
*
|
|
1497
|
+
* # Safety
|
|
1498
|
+
*
|
|
1499
|
+
* - `value` must be a valid null-terminated C string or NULL
|
|
1500
|
+
*/
|
|
1501
|
+
int32_t kreuzberg_parse_whitespace_mode(const char *value);
|
|
1502
|
+
|
|
1503
|
+
/**
|
|
1504
|
+
* Convert WhitespaceMode discriminant to string.
|
|
1505
|
+
*/
|
|
1506
|
+
const char *kreuzberg_whitespace_mode_to_string(int32_t discriminant);
|
|
1507
|
+
|
|
1508
|
+
/**
|
|
1509
|
+
* Parse NewlineStyle from string to discriminant.
|
|
1510
|
+
*
|
|
1511
|
+
* Valid values: "default", "spaces", "backslash"
|
|
1512
|
+
* Returns: 0 = Default, 1 = Spaces, 2 = Backslash, -1 = Invalid
|
|
1513
|
+
*
|
|
1514
|
+
* # Safety
|
|
1515
|
+
*
|
|
1516
|
+
* - `value` must be a valid null-terminated C string or NULL
|
|
1517
|
+
*/
|
|
1518
|
+
int32_t kreuzberg_parse_newline_style(const char *value);
|
|
1519
|
+
|
|
1520
|
+
/**
|
|
1521
|
+
* Convert NewlineStyle discriminant to string.
|
|
1522
|
+
*/
|
|
1523
|
+
const char *kreuzberg_newline_style_to_string(int32_t discriminant);
|
|
1524
|
+
|
|
1525
|
+
/**
|
|
1526
|
+
* Parse PreprocessingPreset from string to discriminant.
|
|
1527
|
+
*
|
|
1528
|
+
* Valid values: "none", "conservative", "aggressive"
|
|
1529
|
+
* Returns: 0 = None, 1 = Conservative, 2 = Aggressive, -1 = Invalid
|
|
1530
|
+
*
|
|
1531
|
+
* # Safety
|
|
1532
|
+
*
|
|
1533
|
+
* - `value` must be a valid null-terminated C string or NULL
|
|
1534
|
+
*/
|
|
1535
|
+
int32_t kreuzberg_parse_preprocessing_preset(const char *value);
|
|
1536
|
+
|
|
1537
|
+
/**
|
|
1538
|
+
* Convert PreprocessingPreset discriminant to string.
|
|
1539
|
+
*/
|
|
1540
|
+
const char *kreuzberg_preprocessing_preset_to_string(int32_t discriminant);
|
|
1541
|
+
|
|
1373
1542
|
/**
|
|
1374
1543
|
* Free a batch result returned by batch extraction functions.
|
|
1375
1544
|
*
|