kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +5 -3
  3. data/README.md +15 -9
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
  5. data/ext/kreuzberg_rb/native/Cargo.lock +516 -324
  6. data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
  8. data/kreuzberg.gemspec +38 -4
  9. data/lib/kreuzberg/config.rb +34 -1
  10. data/lib/kreuzberg/result.rb +77 -14
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/sig/kreuzberg.rbs +23 -6
  13. data/vendor/kreuzberg/Cargo.toml +25 -11
  14. data/vendor/kreuzberg/README.md +13 -8
  15. data/vendor/kreuzberg/build.rs +17 -6
  16. data/vendor/kreuzberg/src/api/mod.rs +2 -0
  17. data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
  18. data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
  19. data/vendor/kreuzberg/src/core/config.rs +49 -1
  20. data/vendor/kreuzberg/src/core/extractor.rs +134 -2
  21. data/vendor/kreuzberg/src/core/mod.rs +4 -2
  22. data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
  23. data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
  24. data/vendor/kreuzberg/src/extraction/html.rs +24 -8
  25. data/vendor/kreuzberg/src/extraction/image.rs +124 -1
  26. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
  27. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
  28. data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
  29. data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
  30. data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
  31. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  32. data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
  33. data/vendor/kreuzberg/src/extractors/email.rs +29 -15
  34. data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
  35. data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
  36. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
  37. data/vendor/kreuzberg/src/extractors/html.rs +29 -15
  38. data/vendor/kreuzberg/src/extractors/image.rs +25 -4
  39. data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
  40. data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
  41. data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
  42. data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
  43. data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
  44. data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
  45. data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
  46. data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
  47. data/vendor/kreuzberg/src/extractors/pdf.rs +194 -17
  48. data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
  49. data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
  50. data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
  51. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  52. data/vendor/kreuzberg/src/extractors/text.rs +7 -2
  53. data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
  54. data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
  55. data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
  56. data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
  57. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
  58. data/vendor/kreuzberg/src/lib.rs +10 -2
  59. data/vendor/kreuzberg/src/mcp/mod.rs +2 -0
  60. data/vendor/kreuzberg/src/mcp/server.rs +14 -12
  61. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
  62. data/vendor/kreuzberg/src/pdf/error.rs +8 -0
  63. data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
  64. data/vendor/kreuzberg/src/pdf/mod.rs +14 -2
  65. data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
  66. data/vendor/kreuzberg/src/pdf/table.rs +26 -2
  67. data/vendor/kreuzberg/src/pdf/text.rs +89 -7
  68. data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
  69. data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
  70. data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
  71. data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
  72. data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
  73. data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
  74. data/vendor/kreuzberg/src/text/mod.rs +6 -0
  75. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
  76. data/vendor/kreuzberg/src/types.rs +173 -21
  77. data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
  78. data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
  79. data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
  80. data/vendor/kreuzberg/tests/config_features.rs +15 -1
  81. data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
  82. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
  83. data/vendor/kreuzberg/tests/email_integration.rs +2 -0
  84. data/vendor/kreuzberg/tests/error_handling.rs +43 -34
  85. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  86. data/vendor/kreuzberg/tests/image_integration.rs +2 -0
  87. data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
  88. data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
  89. data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
  90. data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
  91. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
  92. data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
  93. data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
  94. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
  95. data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
  96. data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
  97. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
  98. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
  99. data/vendor/kreuzberg/tests/security_validation.rs +1 -0
  100. data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
  101. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
  102. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
  103. data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
  104. data/vendor/rb-sys/Cargo.lock +15 -15
  105. data/vendor/rb-sys/Cargo.toml +4 -4
  106. data/vendor/rb-sys/Cargo.toml.orig +4 -4
  107. data/vendor/rb-sys/bin/release.sh +9 -8
  108. data/vendor/rb-sys/build/features.rs +5 -2
  109. data/vendor/rb-sys/build/main.rs +55 -15
  110. data/vendor/rb-sys/build/stable_api_config.rs +4 -2
  111. data/vendor/rb-sys/build/version.rs +3 -1
  112. data/vendor/rb-sys/src/macros.rs +2 -2
  113. data/vendor/rb-sys/src/special_consts.rs +1 -1
  114. data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
  115. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
  116. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
  117. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
  118. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
  119. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
  120. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
  121. data/vendor/rb-sys/src/stable_api.rs +0 -1
  122. data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
  123. metadata +11 -10
  124. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  125. data/vendor/rb-sys/.cargo-ok +0 -1
  126. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9fc516b53481da9460a558cdfd860a53274d43091d8f750d76250e32059758be
4
- data.tar.gz: 886d27a343998685e302d771e3e517d0ed23620fba52fd9412c0d780f8034580
3
+ metadata.gz: 86ef74ea08ce1971136cf72aac3e1082a25bd60f10675f9d9e8cab7659076189
4
+ data.tar.gz: 7635098e6aa3d1aeaa238e88f993aece6b3adf46497f6423fcc92fa0c75e478f
5
5
  SHA512:
6
- metadata.gz: 22ffa8ab5ec78709a6b09927a21ae5ef08fc8c24cccfe94a14a54f47fa087935fb84c80719bc6a00266210178d453baaa798961fd00b91bbd484d20f4f73ddd8
7
- data.tar.gz: 859ae29e4fc31def124754482f018d022580e7f4ef7a1a7662dab5a57d8a7620c0de77a218c39ff033f30af2272961fd1cc9f6460690553d7a266b9916816828
6
+ metadata.gz: e29f2972b1384283dee4a11de61e3de8330a4a523ae754dea6995729a0cc03485efc22dc966846328f998d3fa0e91b973fa8e072169c177dcae432f3b1655118
7
+ data.tar.gz: 3c6146bbca9b009ddb2b1dd107ef767ffd921747555a973390eebd36c4074e4b520380b0a4d4925ed8e2e3f6c4aef351505a1d4b4d0aa0ceabafef28e8c517b7
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.0.0.pre.rc.6)
4
+ kreuzberg (4.0.0.pre.rc.7)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -34,7 +34,7 @@ GEM
34
34
  fileutils (1.8.0)
35
35
  i18n (1.14.7)
36
36
  concurrent-ruby (~> 1.0)
37
- json (2.17.1)
37
+ json (2.18.0)
38
38
  language_server-protocol (3.17.0.5)
39
39
  lint_roller (1.1.0)
40
40
  listen (3.9.0)
@@ -42,7 +42,7 @@ GEM
42
42
  rb-inotify (~> 0.9, >= 0.9.10)
43
43
  logger (1.7.0)
44
44
  method_source (1.1.0)
45
- minitest (5.26.2)
45
+ minitest (5.27.0)
46
46
  mutex_m (0.3.0)
47
47
  parallel (1.27.0)
48
48
  parser (3.3.10.0)
@@ -136,6 +136,8 @@ GEM
136
136
  PLATFORMS
137
137
  arm64-darwin-23
138
138
  arm64-darwin-24
139
+ x64-mingw-ucrt
140
+ x86_64-linux
139
141
 
140
142
  DEPENDENCIES
141
143
  bundler (~> 4.0)
data/README.md CHANGED
@@ -1,22 +1,28 @@
1
- # Kreuzberg for Ruby
1
+ # Kreuzberg
2
+
3
+ [![Rust](https://img.shields.io/crates/v/kreuzberg?label=Rust)](https://crates.io/crates/kreuzberg)
4
+ [![Python](https://img.shields.io/pypi/v/kreuzberg?label=Python)](https://pypi.org/project/kreuzberg/)
5
+ [![TypeScript](https://img.shields.io/npm/v/@kreuzberg/node?label=TypeScript)](https://www.npmjs.com/package/@kreuzberg/node)
6
+ [![WASM](https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM)](https://www.npmjs.com/package/@kreuzberg/wasm)
7
+ [![Ruby](https://img.shields.io/gem/v/kreuzberg?label=Ruby)](https://rubygems.org/gems/kreuzberg)
8
+ [![Java](https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java)](https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg)
9
+ [![Go](https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go)](https://pkg.go.dev/github.com/kreuzberg-dev/kreuzberg)
10
+ [![C#](https://img.shields.io/nuget/v/Goldziher.Kreuzberg?label=C%23)](https://www.nuget.org/packages/Goldziher.Kreuzberg/)
2
11
 
3
- [![RubyGems](https://img.shields.io/gem/v/kreuzberg)](https://rubygems.org/gems/kreuzberg)
4
- [![Crates.io](https://img.shields.io/crates/v/kreuzberg)](https://crates.io/crates/kreuzberg)
5
- [![PyPI](https://img.shields.io/pypi/v/kreuzberg)](https://pypi.org/project/kreuzberg/)
6
- [![npm](https://img.shields.io/npm/v/kreuzberg)](https://www.npmjs.com/package/kreuzberg)
7
12
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
8
- [![Documentation](https://img.shields.io/badge/docs-kreuzberg.dev-blue)](https://kreuzberg.dev)
13
+ [![Documentation](https://img.shields.io/badge/docs-kreuzberg.dev-blue)](https://kreuzberg.dev/)
14
+ [![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
9
15
 
10
16
  High-performance document intelligence for Ruby, powered by Rust.
11
17
 
12
- Extract text, tables, images, and metadata from 30+ file formats including PDF, DOCX, PPTX, XLSX, images, and more.
18
+ Extract text, tables, images, and metadata from 56 file formats including PDF, DOCX, PPTX, XLSX, images, and more.
13
19
 
14
20
  > **🚀 Version 4.0.0 Release Candidate**
15
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
16
22
 
17
23
  ## Features
18
24
 
19
- - **30+ File Formats**: PDF, DOCX, PPTX, XLSX, images, HTML, Markdown, XML, JSON, and more
25
+ - **56 File Formats**: PDF, DOCX, PPTX, XLSX, images, HTML, Markdown, XML, JSON, and more
20
26
  - **OCR Support**: Built-in Tesseract OCR for scanned documents and images
21
27
  - **High Performance**: Rust-powered extraction for native-level performance
22
28
  - **Table Extraction**: Extract structured tables from documents
@@ -421,6 +427,6 @@ Contributions are welcome! Please see [CONTRIBUTING.md](../../CONTRIBUTING.md) f
421
427
 
422
428
  ## Links
423
429
 
424
- - **Documentation**: https://docs.kreuzberg.dev
430
+ - **Documentation**: https://kreuzberg.dev
425
431
  - **GitHub**: https://github.com/kreuzberg-dev/kreuzberg
426
432
  - **Issues**: https://github.com/kreuzberg-dev/kreuzberg/issues
data/ext/kreuzberg_rb/native/.cargo/config.toml ADDED
@@ -0,0 +1,2 @@
1
+ [build]
2
+ rustflags = ["-A", "unpredictable-function-pointer-comparisons"]