kreuzberg 4.4.6 → 4.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +8 -8
- data/README.md +4 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +110 -239
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -7
- data/ext/kreuzberg_rb/native/src/batch.rs +116 -15
- data/ext/kreuzberg_rb/native/src/config/mod.rs +21 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +151 -33
- data/ext/kreuzberg_rb/native/src/helpers.rs +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +12 -10
- data/lib/kreuzberg/config.rb +116 -9
- data/lib/kreuzberg/version.rb +1 -1
- data/sig/kreuzberg.rbs +34 -6
- data/vendor/Cargo.toml +9 -8
- data/vendor/kreuzberg/Cargo.toml +82 -41
- data/vendor/kreuzberg/README.md +2 -1
- data/vendor/kreuzberg/src/api/handlers.rs +5 -4
- data/vendor/kreuzberg/src/cache/core.rs +2 -1
- data/vendor/kreuzberg/src/cache/utilities.rs +1 -1
- data/vendor/kreuzberg/src/chunking/core.rs +1 -1
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -1
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +12 -4
- data/vendor/kreuzberg/src/core/config/acceleration.rs +55 -0
- data/vendor/kreuzberg/src/core/config/concurrency.rs +57 -0
- data/vendor/kreuzberg/src/core/config/email.rs +58 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +153 -2
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +62 -0
- data/vendor/kreuzberg/src/core/config/extraction/file_config.rs +115 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +2 -0
- data/vendor/kreuzberg/src/core/config/layout.rs +41 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +13 -2
- data/vendor/kreuzberg/src/core/config/ocr.rs +492 -26
- data/vendor/kreuzberg/src/core/config/pdf.rs +12 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +127 -5
- data/vendor/kreuzberg/src/core/extractor/batch.rs +202 -187
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +37 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +160 -38
- data/vendor/kreuzberg/src/core/extractor/sync.rs +35 -55
- data/vendor/kreuzberg/src/core/mod.rs +1 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +0 -1
- data/vendor/kreuzberg/src/core/pipeline/features.rs +2 -0
- data/vendor/kreuzberg/src/doc_orientation.rs +303 -0
- data/vendor/kreuzberg/src/embeddings/engine.rs +449 -0
- data/vendor/kreuzberg/src/embeddings/mod.rs +664 -0
- data/vendor/kreuzberg/src/extraction/docx/drawing.rs +14 -18
- data/vendor/kreuzberg/src/extraction/docx/math.rs +63 -73
- data/vendor/kreuzberg/src/extraction/docx/parser.rs +17 -19
- data/vendor/kreuzberg/src/extraction/docx/section.rs +5 -7
- data/vendor/kreuzberg/src/extraction/docx/table.rs +10 -10
- data/vendor/kreuzberg/src/extraction/email.rs +148 -28
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +0 -100
- data/vendor/kreuzberg/src/extraction/html/mod.rs +0 -4
- data/vendor/kreuzberg/src/extraction/html/processor.rs +0 -661
- data/vendor/kreuzberg/src/extraction/image.rs +29 -13
- data/vendor/kreuzberg/src/extraction/xml.rs +2 -2
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +4 -6
- data/vendor/kreuzberg/src/extractors/email.rs +20 -2
- data/vendor/kreuzberg/src/extractors/image.rs +198 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +6 -10
- data/vendor/kreuzberg/src/extractors/mdx.rs +6 -10
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +80 -23
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +353 -36
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +1250 -76
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +12 -22
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +0 -6
- data/vendor/kreuzberg/src/extractors/typst.rs +5 -7
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -1
- data/vendor/kreuzberg/src/keywords/yake.rs +1 -1
- data/vendor/kreuzberg/src/layout/engine.rs +331 -0
- data/vendor/kreuzberg/src/layout/error.rs +15 -0
- data/vendor/kreuzberg/src/layout/inference_timings.rs +40 -0
- data/vendor/kreuzberg/src/layout/mod.rs +123 -0
- data/vendor/kreuzberg/src/layout/model_manager.rs +253 -0
- data/vendor/kreuzberg/src/layout/models/mod.rs +41 -0
- data/vendor/kreuzberg/src/layout/models/rtdetr.rs +344 -0
- data/vendor/kreuzberg/src/layout/models/tatr.rs +981 -0
- data/vendor/kreuzberg/src/layout/models/yolo.rs +342 -0
- data/vendor/kreuzberg/src/layout/postprocessing/heuristics.rs +183 -0
- data/vendor/kreuzberg/src/layout/postprocessing/mod.rs +2 -0
- data/vendor/kreuzberg/src/layout/postprocessing/nms.rs +35 -0
- data/vendor/kreuzberg/src/layout/preprocessing.rs +169 -0
- data/vendor/kreuzberg/src/layout/session.rs +79 -0
- data/vendor/kreuzberg/src/layout/types.rs +269 -0
- data/vendor/kreuzberg/src/lib.rs +26 -5
- data/vendor/kreuzberg/src/mcp/params.rs +5 -0
- data/vendor/kreuzberg/src/mcp/server.rs +36 -2
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +45 -6
- data/vendor/kreuzberg/src/model_cache.rs +76 -0
- data/vendor/kreuzberg/src/model_download.rs +79 -0
- data/vendor/kreuzberg/src/ocr/cache.rs +4 -0
- data/vendor/kreuzberg/src/ocr/conversion.rs +117 -9
- data/vendor/kreuzberg/src/ocr/layout_assembly.rs +1102 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +2 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +619 -30
- data/vendor/kreuzberg/src/ocr/table/mod.rs +6 -303
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +11 -1
- data/vendor/kreuzberg/src/ocr/types.rs +9 -0
- data/vendor/kreuzberg/src/ort_discovery.rs +19 -4
- data/vendor/kreuzberg/src/paddle_ocr/backend.rs +81 -35
- data/vendor/kreuzberg/src/paddle_ocr/config.rs +60 -0
- data/vendor/kreuzberg/src/paddle_ocr/mod.rs +3 -1
- data/vendor/kreuzberg/src/paddle_ocr/model_manager.rs +511 -85
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +9 -7
- data/vendor/kreuzberg/src/pdf/hierarchy/extraction.rs +5 -10
- data/vendor/kreuzberg/src/pdf/images.rs +282 -2
- data/vendor/kreuzberg/src/pdf/layout_runner.rs +708 -0
- data/vendor/kreuzberg/src/pdf/markdown/adapters.rs +327 -0
- data/vendor/kreuzberg/src/pdf/markdown/assembly.rs +162 -6
- data/vendor/kreuzberg/src/pdf/markdown/bridge.rs +1935 -676
- data/vendor/kreuzberg/src/pdf/markdown/classify.rs +494 -24
- data/vendor/kreuzberg/src/pdf/markdown/constants.rs +10 -8
- data/vendor/kreuzberg/src/pdf/markdown/content.rs +72 -0
- data/vendor/kreuzberg/src/pdf/markdown/content_convert.rs +1060 -0
- data/vendor/kreuzberg/src/pdf/markdown/geometry.rs +281 -0
- data/vendor/kreuzberg/src/pdf/markdown/layout_classify.rs +892 -0
- data/vendor/kreuzberg/src/pdf/markdown/lines.rs +4 -3
- data/vendor/kreuzberg/src/pdf/markdown/mod.rs +13 -2
- data/vendor/kreuzberg/src/pdf/markdown/paragraphs.rs +295 -263
- data/vendor/kreuzberg/src/pdf/markdown/pipeline.rs +1035 -204
- data/vendor/kreuzberg/src/pdf/markdown/regions/assignment.rs +538 -0
- data/vendor/kreuzberg/src/pdf/markdown/regions/heading.rs +476 -0
- data/vendor/kreuzberg/src/pdf/markdown/regions/layout_validation.rs +147 -0
- data/vendor/kreuzberg/src/pdf/markdown/regions/merge.rs +282 -0
- data/vendor/kreuzberg/src/pdf/markdown/regions/mod.rs +1319 -0
- data/vendor/kreuzberg/src/pdf/markdown/regions/reading_order.rs +480 -0
- data/vendor/kreuzberg/src/pdf/markdown/regions/table_recognition.rs +359 -0
- data/vendor/kreuzberg/src/pdf/markdown/regions/tables.rs +331 -0
- data/vendor/kreuzberg/src/pdf/markdown/render.rs +729 -21
- data/vendor/kreuzberg/src/pdf/markdown/text_repair.rs +1057 -0
- data/vendor/kreuzberg/src/pdf/markdown/types.rs +47 -2
- data/vendor/kreuzberg/src/pdf/mod.rs +19 -0
- data/vendor/kreuzberg/src/pdf/oxide_text.rs +110 -0
- data/vendor/kreuzberg/src/pdf/table.rs +2 -21
- data/vendor/kreuzberg/src/pdf/table_reconstruct.rs +496 -0
- data/vendor/kreuzberg/src/pdf/text.rs +30 -0
- data/vendor/kreuzberg/src/pdf/text_data.rs +372 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +1 -1
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +2 -9
- data/vendor/kreuzberg/src/types/ocr_elements.rs +8 -2
- data/vendor/kreuzberg/src/types/serde_helpers.rs +1 -63
- data/vendor/kreuzberg/src/utils/mod.rs +1 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +6 -9
- data/vendor/kreuzberg/src/utils/timing.rs +59 -0
- data/vendor/kreuzberg/tests/batch_orchestration.rs +41 -24
- data/vendor/kreuzberg/tests/batch_processing.rs +29 -15
- data/vendor/kreuzberg/tests/concurrency_stress.rs +3 -4
- data/vendor/kreuzberg/tests/core_integration.rs +11 -7
- data/vendor/kreuzberg/tests/paddle_ocr_integration.rs +267 -0
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +4 -0
- data/vendor/kreuzberg/tests/pdf_markdown_quality.rs +160 -0
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +1 -0
- data/vendor/kreuzberg/tests/pdfium_linking.rs +4 -1
- data/vendor/kreuzberg-ffi/Cargo.toml +7 -6
- data/vendor/kreuzberg-ffi/README.md +4 -1
- data/vendor/kreuzberg-ffi/kreuzberg.h +40 -136
- data/vendor/kreuzberg-ffi/src/config/loader.rs +1 -1
- data/vendor/kreuzberg-ffi/src/config_builder.rs +48 -364
- data/vendor/kreuzberg-ffi/src/extraction.rs +50 -22
- data/vendor/kreuzberg-ffi/src/helpers.rs +26 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +10 -4
- data/vendor/kreuzberg-ffi/tests/c/test_batch.c +5 -5
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +4 -4
- data/vendor/kreuzberg-paddle-ocr/src/angle_net.rs +26 -9
- data/vendor/kreuzberg-paddle-ocr/src/base_net.rs +12 -9
- data/vendor/kreuzberg-paddle-ocr/src/crnn_net.rs +124 -11
- data/vendor/kreuzberg-paddle-ocr/src/db_net.rs +39 -20
- data/vendor/kreuzberg-paddle-ocr/src/ocr_lite.rs +148 -17
- data/vendor/kreuzberg-paddle-ocr/src/ocr_utils.rs +67 -52
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +9 -2
- data/vendor/kreuzberg-pdfium-render/src/bindings/wasm_bindings.rs +24 -44
- data/vendor/kreuzberg-pdfium-render/src/bindings.rs +0 -8
- data/vendor/kreuzberg-pdfium-render/src/error.rs +0 -4
- data/vendor/kreuzberg-pdfium-render/src/lib.rs +19 -27
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/extraction.rs +56 -57
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/field/private.rs +1 -0
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/text.rs +13 -0
- data/vendor/kreuzberg-pdfium-render/src/pdf/document.rs +0 -6
- data/vendor/kreuzberg-pdfium-render/src/pdf/points.rs +1 -1
- data/vendor/kreuzberg-pdfium-render/src/utils.rs +2 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +4 -4
- data/vendor/kreuzberg-tesseract/src/api.rs +474 -7
- data/vendor/kreuzberg-tesseract/src/leptonica.rs +750 -0
- data/vendor/kreuzberg-tesseract/src/lib.rs +8 -4
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +175 -8
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +219 -0
- metadata +49 -10
- data/vendor/kreuzberg/src/embeddings.rs +0 -514
- data/vendor/kreuzberg/tests/content_parity_debug.rs +0 -280
- data/vendor/kreuzberg/tests/debug_table_cells.rs +0 -56
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/dump_pdf_markdown.rs +0 -116
- data/vendor/kreuzberg/tests/pdf_markdown_all_docs.rs +0 -281
- data/vendor/kreuzberg/tests/test_fastembed.rs +0 -642
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/flatten.rs +0 -6
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9d245ca9cadfb5b07cab9c8709854cfd0eb488684b6c5fbc6866b891f162b0d0
|
|
4
|
+
data.tar.gz: 31ab57a13cef6881bc58c52058b1ade5ca13af00ed5d81df1fc79853ca816e8d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 5cada46dd61ecb89dd9a7ffdeeb5df86ca338b5864785d80753e12f0967d23e7183360dde623732c8579e9ba78a9b8ff26bdc978ee744a842a75c33bb3877784
|
|
7
|
+
data.tar.gz: 12b1b780c4065379cb7d0fde912bebfac9ae5ea48a33600e4331d705c65d58a05393fa42902e0d1d7b0d5dd2ba98b1dd231fc29a9819a60cb1a8f398146e1b9d
|
data/Gemfile.lock
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
PATH
|
|
2
2
|
remote: .
|
|
3
3
|
specs:
|
|
4
|
-
kreuzberg (4.4.6)
|
|
4
|
+
kreuzberg (4.5.1)
|
|
5
5
|
rb_sys (~> 0.9.119)
|
|
6
6
|
sorbet-runtime (~> 0.5)
|
|
7
7
|
|
|
@@ -49,7 +49,7 @@ GEM
|
|
|
49
49
|
i18n (1.14.8)
|
|
50
50
|
concurrent-ruby (~> 1.0)
|
|
51
51
|
io-console (0.8.2)
|
|
52
|
-
json (2.19.
|
|
52
|
+
json (2.19.2)
|
|
53
53
|
json-schema (6.2.0)
|
|
54
54
|
addressable (~> 2.8)
|
|
55
55
|
bigdecimal (>= 3.1, < 5)
|
|
@@ -60,7 +60,7 @@ GEM
|
|
|
60
60
|
rb-fsevent (~> 0.10, >= 0.10.3)
|
|
61
61
|
rb-inotify (~> 0.9, >= 0.9.10)
|
|
62
62
|
logger (1.7.0)
|
|
63
|
-
mcp (0.
|
|
63
|
+
mcp (0.9.0)
|
|
64
64
|
json-schema (>= 4.1)
|
|
65
65
|
method_source (1.1.0)
|
|
66
66
|
minitest (6.0.2)
|
|
@@ -134,7 +134,7 @@ GEM
|
|
|
134
134
|
rubocop (~> 1.81)
|
|
135
135
|
ruby-progressbar (1.13.0)
|
|
136
136
|
securerandom (0.4.1)
|
|
137
|
-
sorbet-runtime (0.6.
|
|
137
|
+
sorbet-runtime (0.6.13055)
|
|
138
138
|
steep (1.10.0)
|
|
139
139
|
activesupport (>= 5.1)
|
|
140
140
|
concurrent-ruby (>= 1.1.10)
|
|
@@ -220,14 +220,14 @@ CHECKSUMS
|
|
|
220
220
|
fileutils (1.8.0) sha256=8c6b1df54e2540bdb2f39258f08af78853aa70bad52b4d394bbc6424593c6e02
|
|
221
221
|
i18n (1.14.8) sha256=285778639134865c5e0f6269e0b818256017e8cde89993fdfcbfb64d088824a5
|
|
222
222
|
io-console (0.8.2) sha256=d6e3ae7a7cc7574f4b8893b4fca2162e57a825b223a177b7afa236c5ef9814cc
|
|
223
|
-
json (2.19.
|
|
223
|
+
json (2.19.2) sha256=e7e1bd318b2c37c4ceee2444841c86539bc462e81f40d134cf97826cb14e83cf
|
|
224
224
|
json-schema (6.2.0) sha256=e8bff46ed845a22c1ab2bd0d7eccf831c01fe23bb3920caa4c74db4306813666
|
|
225
|
-
kreuzberg (4.4.6)
|
|
225
|
+
kreuzberg (4.5.1)
|
|
226
226
|
language_server-protocol (3.17.0.5) sha256=fd1e39a51a28bf3eec959379985a72e296e9f9acfce46f6a79d31ca8760803cc
|
|
227
227
|
lint_roller (1.1.0) sha256=2c0c845b632a7d172cb849cc90c1bce937a28c5c8ccccb50dfd46a485003cc87
|
|
228
228
|
listen (3.10.0) sha256=c6e182db62143aeccc2e1960033bebe7445309c7272061979bb098d03760c9d2
|
|
229
229
|
logger (1.7.0) sha256=196edec7cc44b66cfb40f9755ce11b392f21f7967696af15d274dde7edff0203
|
|
230
|
-
mcp (0.
|
|
230
|
+
mcp (0.9.0) sha256=a0a3737b0ac9df0772f4ef7e2b013c260ddbcf217a5d50a66bff0baeddf03e47
|
|
231
231
|
method_source (1.1.0) sha256=181301c9c45b731b4769bc81e8860e72f9161ad7d66dd99103c9ab84f560f5c5
|
|
232
232
|
minitest (6.0.2) sha256=db6e57956f6ecc6134683b4c87467d6dd792323c7f0eea7b93f66bd284adbc3d
|
|
233
233
|
mutex_m (0.3.0) sha256=cfcb04ac16b69c4813777022fdceda24e9f798e48092a2b817eb4c0a782b0751
|
|
@@ -259,7 +259,7 @@ CHECKSUMS
|
|
|
259
259
|
rubocop-rspec (3.9.0) sha256=8fa70a3619408237d789aeecfb9beef40576acc855173e60939d63332fdb55e2
|
|
260
260
|
ruby-progressbar (1.13.0) sha256=80fc9c47a9b640d6834e0dc7b3c94c9df37f08cb072b7761e4a71e22cff29b33
|
|
261
261
|
securerandom (0.4.1) sha256=cc5193d414a4341b6e225f0cb4446aceca8e50d5e1888743fac16987638ea0b1
|
|
262
|
-
sorbet-runtime (0.6.
|
|
262
|
+
sorbet-runtime (0.6.13055) sha256=c8ae8c81310e0a28d290b11f44ddca59659b7d7f13752c0ef5d16964bbb84d18
|
|
263
263
|
steep (1.10.0) sha256=1b295b55f9aaff1b8d3ee42453ee55bc2a1078fda0268f288edb2dc014f4d7d1
|
|
264
264
|
strscan (3.1.7) sha256=5f76462b94a3ea50b44973225b7d75b2cb96d4e1bee9ef1319b99ca117b72c8c
|
|
265
265
|
terminal-table (4.0.0) sha256=f504793203f8251b2ea7c7068333053f0beeea26093ec9962e62ea79f94301d2
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.4.6" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.5.1" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|
|
@@ -47,6 +47,9 @@
|
|
|
47
47
|
<a href="https://docs.kreuzberg.dev">
|
|
48
48
|
<img src="https://img.shields.io/badge/docs-kreuzberg.dev-blue" alt="Documentation">
|
|
49
49
|
</a>
|
|
50
|
+
<a href="https://huggingface.co/Kreuzberg">
|
|
51
|
+
<img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-yellow" alt="Hugging Face">
|
|
52
|
+
</a>
|
|
50
53
|
</div>
|
|
51
54
|
|
|
52
55
|
<img width="1128" height="191" alt="Banner2" src="https://github.com/user-attachments/assets/419fc06c-8313-4324-b159-4b4d3cfce5c0" />
|