kreuzberg 4.0.0.pre.rc.6 → 4.0.0.pre.rc.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +5 -5
  3. data/README.md +15 -9
  4. data/ext/kreuzberg_rb/native/.cargo/config.toml +2 -0
  5. data/ext/kreuzberg_rb/native/Cargo.lock +511 -325
  6. data/ext/kreuzberg_rb/native/Cargo.toml +13 -3
  7. data/ext/kreuzberg_rb/native/src/lib.rs +139 -2
  8. data/kreuzberg.gemspec +38 -4
  9. data/lib/kreuzberg/config.rb +34 -1
  10. data/lib/kreuzberg/result.rb +77 -14
  11. data/lib/kreuzberg/version.rb +1 -1
  12. data/sig/kreuzberg.rbs +23 -6
  13. data/vendor/kreuzberg/Cargo.toml +32 -11
  14. data/vendor/kreuzberg/README.md +54 -8
  15. data/vendor/kreuzberg/build.rs +549 -132
  16. data/vendor/kreuzberg/src/chunking/mod.rs +1279 -79
  17. data/vendor/kreuzberg/src/chunking/processor.rs +220 -0
  18. data/vendor/kreuzberg/src/core/config.rs +49 -1
  19. data/vendor/kreuzberg/src/core/extractor.rs +134 -2
  20. data/vendor/kreuzberg/src/core/mod.rs +4 -2
  21. data/vendor/kreuzberg/src/core/pipeline.rs +188 -1
  22. data/vendor/kreuzberg/src/extraction/docx.rs +358 -0
  23. data/vendor/kreuzberg/src/extraction/html.rs +24 -8
  24. data/vendor/kreuzberg/src/extraction/image.rs +124 -1
  25. data/vendor/kreuzberg/src/extraction/libreoffice.rs +1 -2
  26. data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -3
  27. data/vendor/kreuzberg/src/extraction/pptx.rs +187 -87
  28. data/vendor/kreuzberg/src/extractors/archive.rs +1 -0
  29. data/vendor/kreuzberg/src/extractors/bibtex.rs +1 -0
  30. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  31. data/vendor/kreuzberg/src/extractors/docx.rs +50 -17
  32. data/vendor/kreuzberg/src/extractors/email.rs +29 -15
  33. data/vendor/kreuzberg/src/extractors/epub.rs +1 -0
  34. data/vendor/kreuzberg/src/extractors/excel.rs +2 -0
  35. data/vendor/kreuzberg/src/extractors/fictionbook.rs +1 -0
  36. data/vendor/kreuzberg/src/extractors/html.rs +29 -15
  37. data/vendor/kreuzberg/src/extractors/image.rs +25 -4
  38. data/vendor/kreuzberg/src/extractors/jats.rs +3 -0
  39. data/vendor/kreuzberg/src/extractors/jupyter.rs +1 -0
  40. data/vendor/kreuzberg/src/extractors/latex.rs +1 -0
  41. data/vendor/kreuzberg/src/extractors/markdown.rs +1 -0
  42. data/vendor/kreuzberg/src/extractors/mod.rs +78 -14
  43. data/vendor/kreuzberg/src/extractors/odt.rs +3 -3
  44. data/vendor/kreuzberg/src/extractors/opml.rs +1 -0
  45. data/vendor/kreuzberg/src/extractors/orgmode.rs +1 -0
  46. data/vendor/kreuzberg/src/extractors/pdf.rs +197 -17
  47. data/vendor/kreuzberg/src/extractors/pptx.rs +32 -13
  48. data/vendor/kreuzberg/src/extractors/rst.rs +1 -0
  49. data/vendor/kreuzberg/src/extractors/rtf.rs +3 -4
  50. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  51. data/vendor/kreuzberg/src/extractors/text.rs +7 -2
  52. data/vendor/kreuzberg/src/extractors/typst.rs +1 -0
  53. data/vendor/kreuzberg/src/extractors/xml.rs +27 -15
  54. data/vendor/kreuzberg/src/keywords/processor.rs +9 -1
  55. data/vendor/kreuzberg/src/language_detection/mod.rs +43 -0
  56. data/vendor/kreuzberg/src/language_detection/processor.rs +219 -0
  57. data/vendor/kreuzberg/src/lib.rs +10 -2
  58. data/vendor/kreuzberg/src/mcp/mod.rs +3 -0
  59. data/vendor/kreuzberg/src/mcp/server.rs +120 -12
  60. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +2 -0
  61. data/vendor/kreuzberg/src/pdf/bundled.rs +328 -0
  62. data/vendor/kreuzberg/src/pdf/error.rs +8 -0
  63. data/vendor/kreuzberg/src/pdf/metadata.rs +238 -95
  64. data/vendor/kreuzberg/src/pdf/mod.rs +18 -2
  65. data/vendor/kreuzberg/src/pdf/rendering.rs +1 -2
  66. data/vendor/kreuzberg/src/pdf/table.rs +26 -2
  67. data/vendor/kreuzberg/src/pdf/text.rs +89 -7
  68. data/vendor/kreuzberg/src/plugins/extractor.rs +34 -3
  69. data/vendor/kreuzberg/src/plugins/mod.rs +3 -0
  70. data/vendor/kreuzberg/src/plugins/ocr.rs +22 -3
  71. data/vendor/kreuzberg/src/plugins/processor.rs +8 -0
  72. data/vendor/kreuzberg/src/plugins/registry.rs +2 -0
  73. data/vendor/kreuzberg/src/plugins/validator.rs +11 -0
  74. data/vendor/kreuzberg/src/text/mod.rs +6 -0
  75. data/vendor/kreuzberg/src/text/quality_processor.rs +219 -0
  76. data/vendor/kreuzberg/src/types.rs +173 -21
  77. data/vendor/kreuzberg/tests/archive_integration.rs +2 -0
  78. data/vendor/kreuzberg/tests/batch_processing.rs +5 -3
  79. data/vendor/kreuzberg/tests/concurrency_stress.rs +14 -6
  80. data/vendor/kreuzberg/tests/config_features.rs +15 -1
  81. data/vendor/kreuzberg/tests/config_loading_tests.rs +1 -0
  82. data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +2 -0
  83. data/vendor/kreuzberg/tests/email_integration.rs +2 -0
  84. data/vendor/kreuzberg/tests/error_handling.rs +43 -34
  85. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  86. data/vendor/kreuzberg/tests/image_integration.rs +2 -0
  87. data/vendor/kreuzberg/tests/mime_detection.rs +17 -16
  88. data/vendor/kreuzberg/tests/ocr_configuration.rs +4 -0
  89. data/vendor/kreuzberg/tests/ocr_errors.rs +22 -0
  90. data/vendor/kreuzberg/tests/ocr_quality.rs +2 -0
  91. data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -21
  92. data/vendor/kreuzberg/tests/pdf_integration.rs +2 -0
  93. data/vendor/kreuzberg/tests/pdfium_linking.rs +374 -0
  94. data/vendor/kreuzberg/tests/pipeline_integration.rs +25 -0
  95. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +5 -0
  96. data/vendor/kreuzberg/tests/plugin_system.rs +6 -0
  97. data/vendor/kreuzberg/tests/registry_integration_tests.rs +1 -0
  98. data/vendor/kreuzberg/tests/rst_extractor_tests.rs +2 -0
  99. data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -1
  100. data/vendor/kreuzberg/tests/security_validation.rs +1 -0
  101. data/vendor/kreuzberg/tests/test_fastembed.rs +45 -23
  102. data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +1 -0
  103. data/vendor/kreuzberg/tests/typst_extractor_tests.rs +3 -2
  104. data/vendor/rb-sys/.cargo_vcs_info.json +2 -2
  105. data/vendor/rb-sys/Cargo.lock +15 -15
  106. data/vendor/rb-sys/Cargo.toml +4 -4
  107. data/vendor/rb-sys/Cargo.toml.orig +4 -4
  108. data/vendor/rb-sys/build/features.rs +5 -2
  109. data/vendor/rb-sys/build/main.rs +55 -15
  110. data/vendor/rb-sys/build/stable_api_config.rs +4 -2
  111. data/vendor/rb-sys/build/version.rs +3 -1
  112. data/vendor/rb-sys/src/lib.rs +1 -0
  113. data/vendor/rb-sys/src/macros.rs +2 -2
  114. data/vendor/rb-sys/src/special_consts.rs +1 -1
  115. data/vendor/rb-sys/src/stable_api/compiled.rs +1 -1
  116. data/vendor/rb-sys/src/stable_api/ruby_2_7.rs +12 -4
  117. data/vendor/rb-sys/src/stable_api/ruby_3_0.rs +12 -4
  118. data/vendor/rb-sys/src/stable_api/ruby_3_1.rs +12 -4
  119. data/vendor/rb-sys/src/stable_api/ruby_3_2.rs +12 -4
  120. data/vendor/rb-sys/src/stable_api/ruby_3_3.rs +19 -6
  121. data/vendor/rb-sys/src/stable_api/ruby_3_4.rs +17 -5
  122. data/vendor/rb-sys/src/stable_api.rs +0 -1
  123. data/vendor/rb-sys/src/tracking_allocator.rs +1 -3
  124. metadata +13 -10
  125. data/vendor/kreuzberg/src/extractors/fictionbook.rs.backup2 +0 -738
  126. data/vendor/rb-sys/.cargo-ok +0 -1
  127. data/vendor/rb-sys/src/stable_api/ruby_2_6.rs +0 -316
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9fc516b53481da9460a558cdfd860a53274d43091d8f750d76250e32059758be
4
- data.tar.gz: 886d27a343998685e302d771e3e517d0ed23620fba52fd9412c0d780f8034580
3
+ metadata.gz: 916533f5e53e159638c5efcf50ca58ca16705ad2df86095b013fbfe7ed8f4cfb
4
+ data.tar.gz: dd0d04f68f00849b83e996b806af808400383400a29fdabd618108c63c6c9ffa
5
5
  SHA512:
6
- metadata.gz: 22ffa8ab5ec78709a6b09927a21ae5ef08fc8c24cccfe94a14a54f47fa087935fb84c80719bc6a00266210178d453baaa798961fd00b91bbd484d20f4f73ddd8
7
- data.tar.gz: 859ae29e4fc31def124754482f018d022580e7f4ef7a1a7662dab5a57d8a7620c0de77a218c39ff033f30af2272961fd1cc9f6460690553d7a266b9916816828
6
+ metadata.gz: b8b2db6d787185dd3764ed749bb5079528d8306bd6ce3a8b1458ab65561c8002ce2741e2d21a3cc7cff391fe44b9634a9aa0afc6112b4844ed5490bd109d52dc
7
+ data.tar.gz: 45a0ea17841640ed4e6bcaf0b96476cbde875dd6436305395ba7f32483ae8cf54aef0aed2cc7351ad4c7bed82cce74764a27659e5f2f215cda39b0027fef9b48
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- kreuzberg (4.0.0.pre.rc.6)
4
+ kreuzberg (4.0.0.pre.rc.8)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
@@ -24,7 +24,7 @@ GEM
24
24
  bigdecimal (3.3.1)
25
25
  byebug (12.0.0)
26
26
  coderay (1.1.3)
27
- concurrent-ruby (1.3.5)
27
+ concurrent-ruby (1.3.6)
28
28
  connection_pool (3.0.2)
29
29
  csv (3.3.5)
30
30
  diff-lcs (1.6.2)
@@ -34,7 +34,7 @@ GEM
34
34
  fileutils (1.8.0)
35
35
  i18n (1.14.7)
36
36
  concurrent-ruby (~> 1.0)
37
- json (2.17.1)
37
+ json (2.18.0)
38
38
  language_server-protocol (3.17.0.5)
39
39
  lint_roller (1.1.0)
40
40
  listen (3.9.0)
@@ -42,7 +42,7 @@ GEM
42
42
  rb-inotify (~> 0.9, >= 0.9.10)
43
43
  logger (1.7.0)
44
44
  method_source (1.1.0)
45
- minitest (5.26.2)
45
+ minitest (5.27.0)
46
46
  mutex_m (0.3.0)
47
47
  parallel (1.27.0)
48
48
  parser (3.3.10.0)
@@ -58,7 +58,7 @@ GEM
58
58
  racc (1.8.1)
59
59
  rainbow (3.1.1)
60
60
  rake (13.3.1)
61
- rake-compiler (1.3.0)
61
+ rake-compiler (1.3.1)
62
62
  rake
63
63
  rake-compiler-dock (1.10.0)
64
64
  rb-fsevent (0.11.2)
data/README.md CHANGED
@@ -1,22 +1,28 @@
1
- # Kreuzberg for Ruby
1
+ # Kreuzberg
2
+
3
+ [![Rust](https://img.shields.io/crates/v/kreuzberg?label=Rust)](https://crates.io/crates/kreuzberg)
4
+ [![Python](https://img.shields.io/pypi/v/kreuzberg?label=Python)](https://pypi.org/project/kreuzberg/)
5
+ [![TypeScript](https://img.shields.io/npm/v/@kreuzberg/node?label=TypeScript)](https://www.npmjs.com/package/@kreuzberg/node)
6
+ [![WASM](https://img.shields.io/npm/v/@kreuzberg/wasm?label=WASM)](https://www.npmjs.com/package/@kreuzberg/wasm)
7
+ [![Ruby](https://img.shields.io/gem/v/kreuzberg?label=Ruby)](https://rubygems.org/gems/kreuzberg)
8
+ [![Java](https://img.shields.io/maven-central/v/dev.kreuzberg/kreuzberg?label=Java)](https://central.sonatype.com/artifact/dev.kreuzberg/kreuzberg)
9
+ [![Go](https://img.shields.io/github/v/tag/kreuzberg-dev/kreuzberg?label=Go)](https://pkg.go.dev/github.com/kreuzberg-dev/kreuzberg)
10
+ [![C#](https://img.shields.io/nuget/v/Goldziher.Kreuzberg?label=C%23)](https://www.nuget.org/packages/Goldziher.Kreuzberg/)
2
11
 
3
- [![RubyGems](https://img.shields.io/gem/v/kreuzberg)](https://rubygems.org/gems/kreuzberg)
4
- [![Crates.io](https://img.shields.io/crates/v/kreuzberg)](https://crates.io/crates/kreuzberg)
5
- [![PyPI](https://img.shields.io/pypi/v/kreuzberg)](https://pypi.org/project/kreuzberg/)
6
- [![npm](https://img.shields.io/npm/v/kreuzberg)](https://www.npmjs.com/package/kreuzberg)
7
12
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
8
- [![Documentation](https://img.shields.io/badge/docs-kreuzberg.dev-blue)](https://kreuzberg.dev)
13
+ [![Documentation](https://img.shields.io/badge/docs-kreuzberg.dev-blue)](https://kreuzberg.dev/)
14
+ [![Discord](https://img.shields.io/badge/Discord-Join%20our%20community-7289da)](https://discord.gg/pXxagNK2zN)
9
15
 
10
16
  High-performance document intelligence for Ruby, powered by Rust.
11
17
 
12
- Extract text, tables, images, and metadata from 30+ file formats including PDF, DOCX, PPTX, XLSX, images, and more.
18
+ Extract text, tables, images, and metadata from 56 file formats including PDF, DOCX, PPTX, XLSX, images, and more.
13
19
 
14
20
  > **🚀 Version 4.0.0 Release Candidate**
15
21
  > This is a pre-release version. We invite you to test the library and [report any issues](https://github.com/kreuzberg-dev/kreuzberg/issues) you encounter.
16
22
 
17
23
  ## Features
18
24
 
19
- - **30+ File Formats**: PDF, DOCX, PPTX, XLSX, images, HTML, Markdown, XML, JSON, and more
25
+ - **56 File Formats**: PDF, DOCX, PPTX, XLSX, images, HTML, Markdown, XML, JSON, and more
20
26
  - **OCR Support**: Built-in Tesseract OCR for scanned documents and images
21
27
  - **High Performance**: Rust-powered extraction for native-level performance
22
28
  - **Table Extraction**: Extract structured tables from documents
@@ -421,6 +427,6 @@ Contributions are welcome! Please see [CONTRIBUTING.md](../../CONTRIBUTING.md) f
421
427
 
422
428
  ## Links
423
429
 
424
- - **Documentation**: https://docs.kreuzberg.dev
430
+ - **Documentation**: https://kreuzberg.dev
425
431
  - **GitHub**: https://github.com/kreuzberg-dev/kreuzberg
426
432
  - **Issues**: https://github.com/kreuzberg-dev/kreuzberg/issues
data/ext/kreuzberg_rb/native/.cargo/config.toml ADDED
@@ -0,0 +1,2 @@
1
+ [build]
2
+ rustflags = ["-A", "unpredictable-function-pointer-comparisons", "-A", "fn_ptr_eq"]