kreuzberg 4.8.4 → 4.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +6 -3
- data/ext/kreuzberg_rb/native/Cargo.lock +130 -109
- data/ext/kreuzberg_rb/native/Cargo.toml +3 -3
- data/ext/kreuzberg_rb/native/src/config/types.rs +22 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +7 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +1 -0
- data/ext/kreuzberg_rb/native/src/result.rs +46 -0
- data/lib/kreuzberg/errors.rb +3 -0
- data/lib/kreuzberg/result.rb +52 -5
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +111 -19
- data/vendor/Cargo.toml +8 -8
- data/vendor/kreuzberg/Cargo.toml +9 -9
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/error.rs +1 -0
- data/vendor/kreuzberg/src/api/handlers.rs +75 -2
- data/vendor/kreuzberg/src/api/types.rs +11 -2
- data/vendor/kreuzberg/src/cancellation.rs +105 -0
- data/vendor/kreuzberg/src/chunking/boundary_detection.rs +496 -0
- data/vendor/kreuzberg/src/chunking/core.rs +122 -10
- data/vendor/kreuzberg/src/chunking/mod.rs +9 -10
- data/vendor/kreuzberg/src/chunking/semantic/merge.rs +477 -0
- data/vendor/kreuzberg/src/chunking/semantic/mod.rs +393 -0
- data/vendor/kreuzberg/src/chunking/semantic/topic.rs +224 -0
- data/vendor/kreuzberg/src/chunking/tokenizer_cache.rs +3 -3
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +89 -1
- data/vendor/kreuzberg/src/core/config/layout.rs +8 -0
- data/vendor/kreuzberg/src/core/config/llm.rs +47 -1
- data/vendor/kreuzberg/src/core/config/ocr.rs +16 -1
- data/vendor/kreuzberg/src/core/config/processing.rs +63 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +1 -1
- data/vendor/kreuzberg/src/core/extractor/file.rs +1 -1
- data/vendor/kreuzberg/src/core/extractor/sync.rs +24 -21
- data/vendor/kreuzberg/src/core/formats.rs +2 -2
- data/vendor/kreuzberg/src/core/mime.rs +3 -3
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +2 -2
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +4 -1
- data/vendor/kreuzberg/src/doc_orientation.rs +22 -4
- data/vendor/kreuzberg/src/embeddings/mod.rs +253 -18
- data/vendor/kreuzberg/src/error.rs +6 -0
- data/vendor/kreuzberg/src/extraction/derive.rs +6 -1
- data/vendor/kreuzberg/src/extraction/docx/drawing.rs +2 -4
- data/vendor/kreuzberg/src/extraction/docx/mod.rs +185 -0
- data/vendor/kreuzberg/src/extraction/html/structure.rs +5 -7
- data/vendor/kreuzberg/src/extraction/image.rs +1 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +1 -0
- data/vendor/kreuzberg/src/extraction/pst.rs +6 -7
- data/vendor/kreuzberg/src/extraction/transform/document_tree.rs +3 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +5 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +38 -50
- data/vendor/kreuzberg/src/extractors/doc.rs +4 -1
- data/vendor/kreuzberg/src/extractors/docbook.rs +8 -12
- data/vendor/kreuzberg/src/extractors/docx.rs +16 -5
- data/vendor/kreuzberg/src/extractors/excel.rs +5 -2
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +2 -4
- data/vendor/kreuzberg/src/extractors/html.rs +173 -1
- data/vendor/kreuzberg/src/extractors/image.rs +268 -37
- data/vendor/kreuzberg/src/extractors/iwork/keynote.rs +4 -1
- data/vendor/kreuzberg/src/extractors/iwork/mod.rs +4 -8
- data/vendor/kreuzberg/src/extractors/iwork/numbers.rs +4 -1
- data/vendor/kreuzberg/src/extractors/iwork/pages.rs +4 -1
- data/vendor/kreuzberg/src/extractors/markdown.rs +22 -32
- data/vendor/kreuzberg/src/extractors/mdx.rs +22 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +7 -12
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +4 -0
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +161 -49
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +148 -13
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +47 -1
- data/vendor/kreuzberg/src/extractors/ppt.rs +3 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +3 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +4 -5
- data/vendor/kreuzberg/src/keywords/mod.rs +6 -10
- data/vendor/kreuzberg/src/language_detection/mod.rs +6 -10
- data/vendor/kreuzberg/src/layout/engine.rs +9 -2
- data/vendor/kreuzberg/src/layout/mod.rs +17 -6
- data/vendor/kreuzberg/src/layout/models/rtdetr.rs +5 -2
- data/vendor/kreuzberg/src/layout/models/slanet.rs +5 -2
- data/vendor/kreuzberg/src/layout/models/table_classifier.rs +5 -2
- data/vendor/kreuzberg/src/layout/models/tatr.rs +5 -2
- data/vendor/kreuzberg/src/layout/models/yolo.rs +2 -1
- data/vendor/kreuzberg/src/layout/session.rs +4 -51
- data/vendor/kreuzberg/src/lib.rs +2 -0
- data/vendor/kreuzberg/src/llm/mod.rs +2 -0
- data/vendor/kreuzberg/src/llm/structured.rs +7 -3
- data/vendor/kreuzberg/src/llm/usage.rs +40 -0
- data/vendor/kreuzberg/src/llm/vlm_embeddings.rs +5 -3
- data/vendor/kreuzberg/src/llm/vlm_ocr.rs +6 -3
- data/vendor/kreuzberg/src/mcp/errors.rs +18 -0
- data/vendor/kreuzberg/src/mcp/params.rs +19 -1
- data/vendor/kreuzberg/src/mcp/server.rs +15 -4
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +8 -16
- data/vendor/kreuzberg/src/ort_discovery.rs +75 -1
- data/vendor/kreuzberg/src/paddle_ocr/backend.rs +43 -5
- data/vendor/kreuzberg/src/pdf/bindings.rs +40 -15
- data/vendor/kreuzberg/src/pdf/error.rs +3 -0
- data/vendor/kreuzberg/src/pdf/fonts.rs +2 -2
- data/vendor/kreuzberg/src/pdf/images.rs +1 -1
- data/vendor/kreuzberg/src/pdf/layout_runner.rs +1 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +1 -1
- data/vendor/kreuzberg/src/pdf/oxide/table.rs +6 -0
- data/vendor/kreuzberg/src/pdf/oxide/text.rs +1 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +1 -1
- data/vendor/kreuzberg/src/pdf/structure/bridge.rs +1 -1
- data/vendor/kreuzberg/src/pdf/structure/layout_classify.rs +2 -3
- data/vendor/kreuzberg/src/pdf/structure/pipeline.rs +116 -15
- data/vendor/kreuzberg/src/pdf/text.rs +2 -1
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +11 -11
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +30 -31
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +2 -27
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +2 -1
- data/vendor/kreuzberg/src/types/extraction.rs +42 -1
- data/vendor/kreuzberg/src/types/internal.rs +18 -0
- data/vendor/kreuzberg/src/types/mod.rs +5 -0
- data/vendor/kreuzberg/src/types/page.rs +26 -1
- data/vendor/kreuzberg/src/utils/markdown_utils.rs +40 -0
- data/vendor/kreuzberg/src/utils/mod.rs +1 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +3 -3
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +2 -2
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +2 -2
- data/vendor/kreuzberg/src/utils/string_utils.rs +7 -7
- data/vendor/kreuzberg/tests/cross_format_parity.rs +9 -4
- data/vendor/kreuzberg/tests/llm_integration.rs +9 -7
- data/vendor/kreuzberg/tests/ocr_content_integrity.rs +154 -0
- data/vendor/kreuzberg/tests/pdf_image_extraction_tests.rs +42 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +5 -5
- data/vendor/kreuzberg-ffi/kreuzberg.h +168 -15
- data/vendor/kreuzberg-ffi/src/cancellation.rs +167 -0
- data/vendor/kreuzberg-ffi/src/error.rs +32 -7
- data/vendor/kreuzberg-ffi/src/helpers.rs +13 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -7
- data/vendor/kreuzberg-ffi/src/memory.rs +30 -11
- data/vendor/kreuzberg-ffi/src/result.rs +71 -0
- data/vendor/kreuzberg-ffi/src/types.rs +19 -16
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-paddle-ocr/src/ocr_lite.rs +21 -0
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/build.rs +11 -5
- metadata +15 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: bb7b77bae36a5da34ce209fbf1ea7c0a68aef4b22f8b373b908f9c113f404ef5
|
|
4
|
+
data.tar.gz: a6c8667aee6ae2c9e11d45fc98fcb355561fec6e4a7d51d852664bd6367af8cc
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7569a4914ab4a4d440a0c74e622a9f26f7189b62bc9c2d05fc5e857a32c8fabde8eb854edef34e94bd95d5357e44137c1573e7ce68db45ed85c26dbe31e6972b
|
|
7
|
+
data.tar.gz: 9741106549d7bf79cc1ae34a07f686cca1bf6a4c19fcb01b40cd8f1372166c8e9c3a0321e1e26e416ebb98d413ee1c9093d247949292a7c07a85594ea1df508e
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.9.0" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -39,10 +39,13 @@
|
|
|
39
39
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/pkgs/container/kreuzberg">
|
|
40
40
|
<img src="https://img.shields.io/badge/Docker-007ec6?logo=docker&logoColor=white" alt="Docker">
|
|
41
41
|
</a>
|
|
42
|
+
<a href="https://artifacthub.io/packages/search?repo=kreuzberg">
|
|
43
|
+
<img src="https://img.shields.io/endpoint?url=https://artifacthub.io/badge/repository/kreuzberg" alt="Artifact Hub">
|
|
44
|
+
</a>
|
|
42
45
|
|
|
43
46
|
<!-- Project Info -->
|
|
44
47
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/blob/main/LICENSE">
|
|
45
|
-
<img src="https://img.shields.io/badge/License-
|
|
48
|
+
<img src="https://img.shields.io/badge/License-MIT-007ec6" alt="License">
|
|
46
49
|
</a>
|
|
47
50
|
<a href="https://docs.kreuzberg.dev">
|
|
48
51
|
<img src="https://img.shields.io/badge/docs-kreuzberg.dev-007ec6" alt="Documentation">
|
|
@@ -419,7 +422,7 @@ Contributions are welcome! See [Contributing Guide](https://github.com/kreuzberg
|
|
|
419
422
|
|
|
420
423
|
## License
|
|
421
424
|
|
|
422
|
-
|
|
425
|
+
MIT License - see LICENSE file for details.
|
|
423
426
|
|
|
424
427
|
## Support
|
|
425
428
|
|