kreuzberg 4.5.4 → 4.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +73 -474
- data/ext/kreuzberg_rb/native/Cargo.toml +2 -2
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +0 -2
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +2 -1
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +0 -1
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +0 -1
- data/ext/kreuzberg_rb/native/src/result.rs +85 -2
- data/lib/kreuzberg/version.rb +1 -1
- data/vendor/Cargo.toml +6 -5
- data/vendor/kreuzberg/Cargo.toml +7 -8
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/cache/core.rs +47 -42
- data/vendor/kreuzberg/src/cache/mod.rs +13 -13
- data/vendor/kreuzberg/src/chunking/core.rs +10 -2
- data/vendor/kreuzberg/src/chunking/mod.rs +3 -3
- data/vendor/kreuzberg/src/chunking/processor.rs +209 -1
- data/vendor/kreuzberg/src/chunking/yaml_section.rs +604 -0
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +1 -1
- data/vendor/kreuzberg/src/core/config/concurrency.rs +79 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +32 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +9 -5
- data/vendor/kreuzberg/src/core/extractor/batch.rs +3 -7
- data/vendor/kreuzberg/src/core/extractor/file.rs +17 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +2 -3
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +1 -0
- data/vendor/kreuzberg/src/core/io.rs +87 -0
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +4 -8
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +3 -5
- data/vendor/kreuzberg/src/core/pipeline/features.rs +38 -70
- data/vendor/kreuzberg/src/core/pipeline/format.rs +23 -13
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +9 -9
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +47 -20
- data/vendor/kreuzberg/src/doc_orientation.rs +1 -1
- data/vendor/kreuzberg/src/error.rs +1 -1
- data/vendor/kreuzberg/src/extraction/archive/gzip.rs +54 -0
- data/vendor/kreuzberg/src/extraction/archive/mod.rs +6 -4
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +58 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +64 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +56 -0
- data/vendor/kreuzberg/src/extraction/docx/parser.rs +83 -35
- data/vendor/kreuzberg/src/extraction/email.rs +592 -2
- data/vendor/kreuzberg/src/extraction/html/converter.rs +12 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +1 -0
- data/vendor/kreuzberg/src/extraction/html/structure.rs +1415 -0
- data/vendor/kreuzberg/src/extraction/hwp/model.rs +19 -12
- data/vendor/kreuzberg/src/extraction/hwp/parser.rs +7 -13
- data/vendor/kreuzberg/src/extraction/hwp/reader.rs +1 -2
- data/vendor/kreuzberg/src/extraction/image_ocr.rs +40 -13
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +12 -33
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +4 -11
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +4 -11
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +30 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +4 -11
- data/vendor/kreuzberg/src/extraction/ppt/mod.rs +43 -5
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +12 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +239 -17
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +40 -1
- data/vendor/kreuzberg/src/extraction/structured.rs +106 -74
- data/vendor/kreuzberg/src/extraction/transform/document_tree.rs +5 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +4 -0
- data/vendor/kreuzberg/src/extraction/xml.rs +88 -90
- data/vendor/kreuzberg/src/extractors/archive.rs +124 -23
- data/vendor/kreuzberg/src/extractors/bibtex.rs +151 -12
- data/vendor/kreuzberg/src/extractors/citation.rs +24 -3
- data/vendor/kreuzberg/src/extractors/csv.rs +261 -33
- data/vendor/kreuzberg/src/extractors/dbf.rs +116 -18
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +335 -4
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +1 -2
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +1 -1
- data/vendor/kreuzberg/src/extractors/doc.rs +89 -2
- data/vendor/kreuzberg/src/extractors/docbook.rs +670 -21
- data/vendor/kreuzberg/src/extractors/docx.rs +239 -173
- data/vendor/kreuzberg/src/extractors/email.rs +107 -13
- data/vendor/kreuzberg/src/extractors/epub/content.rs +20 -8
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +73 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +148 -4
- data/vendor/kreuzberg/src/extractors/excel.rs +43 -3
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +757 -50
- data/vendor/kreuzberg/src/extractors/html.rs +9 -1
- data/vendor/kreuzberg/src/extractors/hwp.rs +17 -2
- data/vendor/kreuzberg/src/extractors/image.rs +43 -10
- data/vendor/kreuzberg/src/extractors/iwork/keynote.rs +58 -15
- data/vendor/kreuzberg/src/extractors/iwork/mod.rs +18 -14
- data/vendor/kreuzberg/src/extractors/iwork/numbers.rs +32 -15
- data/vendor/kreuzberg/src/extractors/iwork/pages.rs +41 -15
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +69 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +8 -0
- data/vendor/kreuzberg/src/extractors/jats/mod.rs +479 -6
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +2 -1
- data/vendor/kreuzberg/src/extractors/jupyter.rs +325 -20
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +676 -3
- data/vendor/kreuzberg/src/extractors/markdown.rs +425 -182
- data/vendor/kreuzberg/src/extractors/markdown_utils.rs +169 -0
- data/vendor/kreuzberg/src/extractors/mdx.rs +402 -110
- data/vendor/kreuzberg/src/extractors/mod.rs +23 -26
- data/vendor/kreuzberg/src/extractors/odt.rs +620 -20
- data/vendor/kreuzberg/src/extractors/opml/core.rs +10 -3
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +186 -1
- data/vendor/kreuzberg/src/extractors/orgmode.rs +507 -6
- data/vendor/kreuzberg/src/extractors/pdf/mod.rs +58 -46
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +232 -76
- data/vendor/kreuzberg/src/extractors/ppt.rs +110 -2
- data/vendor/kreuzberg/src/extractors/pptx.rs +21 -5
- data/vendor/kreuzberg/src/extractors/rst.rs +516 -21
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +126 -5
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +563 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +105 -3
- data/vendor/kreuzberg/src/extractors/text.rs +19 -153
- data/vendor/kreuzberg/src/extractors/typst.rs +431 -3
- data/vendor/kreuzberg/src/extractors/xml.rs +122 -5
- data/vendor/kreuzberg/src/keywords/mod.rs +1 -3
- data/vendor/kreuzberg/src/keywords/processor.rs +7 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +2 -4
- data/vendor/kreuzberg/src/language_detection/processor.rs +5 -0
- data/vendor/kreuzberg/src/layout/model_manager.rs +5 -1
- data/vendor/kreuzberg/src/layout/models/rtdetr.rs +2 -1
- data/vendor/kreuzberg/src/layout/models/slanet.rs +5 -5
- data/vendor/kreuzberg/src/layout/models/table_classifier.rs +5 -5
- data/vendor/kreuzberg/src/layout/models/tatr.rs +5 -5
- data/vendor/kreuzberg/src/layout/models/yolo.rs +2 -1
- data/vendor/kreuzberg/src/layout/session.rs +10 -4
- data/vendor/kreuzberg/src/lib.rs +1 -0
- data/vendor/kreuzberg/src/mcp/format.rs +4 -0
- data/vendor/kreuzberg/src/ocr/language_registry.rs +3 -3
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +6 -5
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +20 -16
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +9 -4
- data/vendor/kreuzberg/src/ocr/validation.rs +126 -127
- data/vendor/kreuzberg/src/paddle_ocr/backend.rs +4 -3
- data/vendor/kreuzberg/src/paddle_ocr/model_manager.rs +21 -44
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +35 -8
- data/vendor/kreuzberg/src/pdf/images.rs +274 -12
- data/vendor/kreuzberg/src/pdf/layout_runner.rs +20 -15
- data/vendor/kreuzberg/src/pdf/markdown/bridge.rs +10 -3
- data/vendor/kreuzberg/src/pdf/markdown/classify.rs +6 -7
- data/vendor/kreuzberg/src/pdf/markdown/paragraphs.rs +4 -0
- data/vendor/kreuzberg/src/pdf/markdown/pipeline.rs +14 -20
- data/vendor/kreuzberg/src/pdf/markdown/regions/heading.rs +3 -1
- data/vendor/kreuzberg/src/pdf/rendering.rs +9 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +2 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +8 -12
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +4 -1
- data/vendor/kreuzberg/src/plugins/ocr.rs +36 -23
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +8 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +1 -3
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +2 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +6 -17
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +2 -0
- data/vendor/kreuzberg/src/plugins/startup_validation.rs +4 -4
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +11 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +4 -12
- data/vendor/kreuzberg/src/rendering/markdown.rs +620 -0
- data/vendor/kreuzberg/src/rendering/mod.rs +11 -0
- data/vendor/kreuzberg/src/rendering/plain.rs +288 -0
- data/vendor/kreuzberg/src/text/mod.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +5 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +6 -229
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +2 -3
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +2 -3
- data/vendor/kreuzberg/src/types/builder.rs +958 -0
- data/vendor/kreuzberg/src/types/document_structure.rs +295 -5
- data/vendor/kreuzberg/src/types/extraction.rs +29 -3
- data/vendor/kreuzberg/src/types/formats.rs +4 -0
- data/vendor/kreuzberg/src/types/mod.rs +1 -0
- data/vendor/kreuzberg/src/utils/mod.rs +23 -0
- data/vendor/kreuzberg/src/utils/quality/mod.rs +4 -3
- data/vendor/kreuzberg/src/utils/xml_utils.rs +7 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +3 -3
- data/vendor/kreuzberg/tests/csv_embedding_quality.rs +142 -0
- data/vendor/kreuzberg/tests/document_structure_tests.rs +3911 -0
- data/vendor/kreuzberg/tests/instrumentation_test.rs +2 -2
- data/vendor/kreuzberg/tests/iwork_integration.rs +2 -2
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -2
- data/vendor/kreuzberg/tests/ocr_stress.rs +9 -9
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +1 -1
- data/vendor/kreuzberg/tests/paddle_ocr_integration.rs +2 -2
- data/vendor/kreuzberg/tests/pdf_integration.rs +25 -1
- data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -75
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +313 -35
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +28 -30
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +46 -46
- data/vendor/kreuzberg/tests/xml_embedding_quality.rs +137 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +4 -4
- data/vendor/kreuzberg-ffi/kreuzberg.h +3 -3
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +15 -1
- data/vendor/kreuzberg-ffi/src/helpers.rs +5 -0
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +4 -32
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +3 -16
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +5 -40
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +4 -32
- data/vendor/kreuzberg-ffi/src/result.rs +1 -0
- data/vendor/kreuzberg-ffi/src/result_pool.rs +15 -1
- data/vendor/kreuzberg-ffi/src/result_view.rs +2 -0
- data/vendor/kreuzberg-paddle-ocr/Cargo.toml +1 -1
- data/vendor/kreuzberg-paddle-ocr/src/angle_net.rs +9 -4
- data/vendor/kreuzberg-paddle-ocr/src/constants.rs +33 -0
- data/vendor/kreuzberg-paddle-ocr/src/crnn_net.rs +13 -8
- data/vendor/kreuzberg-paddle-ocr/src/db_net.rs +3 -8
- data/vendor/kreuzberg-paddle-ocr/src/lib.rs +1 -0
- data/vendor/kreuzberg-paddle-ocr/src/ocr_lite.rs +9 -27
- data/vendor/kreuzberg-pdfium-render/Cargo.toml +1 -1
- data/vendor/kreuzberg-pdfium-render/src/lib.rs +4 -5
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/form.rs +5 -70
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/annotation/private.rs +2 -124
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/annotation.rs +2 -170
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/annotations.rs +0 -12
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/render_config.rs +1 -84
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page.rs +0 -1
- data/vendor/kreuzberg-pdfium-render/src/pdf.rs +0 -1
- data/vendor/kreuzberg-tesseract/Cargo.toml +1 -1
- data/vendor/kreuzberg-tesseract/src/api.rs +16 -10
- data/vendor/kreuzberg-tesseract/src/lib.rs +1 -1
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +7 -7
- metadata +14 -35
- data/.gitignore +0 -14
- data/.rspec +0 -3
- data/.rubocop.yaml +0 -1
- data/.rubocop.yml +0 -543
- data/Gemfile +0 -8
- data/Gemfile.lock +0 -274
- data/Rakefile +0 -34
- data/Steepfile +0 -51
- data/examples/async_patterns.rb +0 -282
- data/extconf.rb +0 -60
- data/kreuzberg.gemspec +0 -253
- data/spec/fixtures/config.toml +0 -38
- data/spec/fixtures/config.yaml +0 -41
- data/spec/fixtures/invalid_config.toml +0 -3
- data/test/metadata_types_test.rb +0 -959
- data/vendor/kreuzberg-pdfium-render/src/pdf/appearance_mode.rs +0 -39
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/annotation/popup.rs +0 -64
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/annotation/redacted.rs +0 -64
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/annotation/variable_text.rs +0 -118
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/annotation/widget.rs +0 -86
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/annotation/xfa_widget.rs +0 -86
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/field/button.rs +0 -64
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/field/checkbox.rs +0 -142
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/field/combo.rs +0 -129
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/field/list.rs +0 -111
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/field/option.rs +0 -36
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/field/options.rs +0 -156
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/field/private.rs +0 -518
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/field/radio.rs +0 -140
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/field/signature.rs +0 -63
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/field/text.rs +0 -151
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/field/unknown.rs +0 -62
- data/vendor/kreuzberg-pdfium-render/src/pdf/document/page/field.rs +0 -436
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 203e9719bcf3cf2cda1252dcd7a5c5782e7b73936a304b626a351894d4fcd909
|
|
4
|
+
data.tar.gz: 2c02a45c882ef6b6b6935896e9334c46f012aaf8bc6f6669fa3c0110b67398e5
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: d3dde81c8c38b1ee99bed3cae32e477e4c8941d401c6449fc9c3eec3608a5b771b47c20ab3a9679ccf75059fed5e6c09f9d91eefed83a7d9dc59eebf7acb5626
|
|
7
|
+
data.tar.gz: e590247800d9752175985ee3b8ad0c89c5926f1afa0669a881cf476455c8514332880d30ff35d46a2836cf9cdc18b752296fb06a545f42129b548b5675180a71
|
data/README.md
CHANGED
|
@@ -22,7 +22,7 @@
|
|
|
22
22
|
<img src="https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java&color=007ec6" alt="Java">
|
|
23
23
|
</a>
|
|
24
24
|
<a href="https://github.com/kreuzberg-dev/kreuzberg/releases">
|
|
25
|
-
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.5.4" alt="Go">
|
|
25
|
+
<img src="https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go&color=007ec6&filter=v4.6.0" alt="Go">
|
|
26
26
|
</a>
|
|
27
27
|
<a href="https://www.nuget.org/packages/Kreuzberg/">
|
|
28
28
|
<img src="https://img.shields.io/nuget/v/Kreuzberg?label=C%23&color=007ec6" alt="C#">
|