kreuzberg 4.0.8 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (308) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile.lock +2 -2
  3. data/README.md +1 -1
  4. data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
  5. data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
  6. data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
  7. data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
  8. data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
  9. data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
  10. data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
  11. data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
  12. data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
  13. data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
  14. data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
  15. data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
  16. data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
  17. data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
  18. data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
  19. data/ext/kreuzberg_rb/native/src/result.rs +326 -0
  20. data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
  21. data/lib/kreuzberg/config.rb +66 -0
  22. data/lib/kreuzberg/result.rb +107 -2
  23. data/lib/kreuzberg/types.rb +104 -0
  24. data/lib/kreuzberg/version.rb +1 -1
  25. data/lib/kreuzberg.rb +0 -4
  26. data/sig/kreuzberg.rbs +105 -1
  27. data/vendor/Cargo.toml +3 -3
  28. data/vendor/kreuzberg/Cargo.toml +4 -3
  29. data/vendor/kreuzberg/README.md +1 -1
  30. data/vendor/kreuzberg/src/api/config.rs +69 -0
  31. data/vendor/kreuzberg/src/api/handlers.rs +99 -2
  32. data/vendor/kreuzberg/src/api/mod.rs +14 -7
  33. data/vendor/kreuzberg/src/api/router.rs +214 -0
  34. data/vendor/kreuzberg/src/api/startup.rs +243 -0
  35. data/vendor/kreuzberg/src/api/types.rs +78 -0
  36. data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
  37. data/vendor/kreuzberg/src/cache/core.rs +428 -0
  38. data/vendor/kreuzberg/src/cache/mod.rs +21 -843
  39. data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
  40. data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
  41. data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
  42. data/vendor/kreuzberg/src/chunking/config.rs +52 -0
  43. data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
  44. data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
  45. data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
  46. data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
  47. data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
  48. data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
  49. data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
  50. data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
  51. data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
  52. data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
  53. data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
  54. data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
  55. data/vendor/kreuzberg/src/core/config/page.rs +57 -0
  56. data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
  57. data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
  58. data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
  59. data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
  60. data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
  61. data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
  62. data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
  63. data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
  64. data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
  65. data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
  66. data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
  67. data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
  68. data/vendor/kreuzberg/src/core/mod.rs +4 -1
  69. data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
  70. data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
  71. data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
  72. data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
  73. data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
  74. data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
  75. data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
  76. data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
  77. data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
  78. data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
  79. data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
  80. data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
  81. data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
  82. data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
  83. data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
  84. data/vendor/kreuzberg/src/embeddings.rs +136 -13
  85. data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
  86. data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
  87. data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
  88. data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
  89. data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
  90. data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
  91. data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
  92. data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
  93. data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
  94. data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
  95. data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
  96. data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
  97. data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
  98. data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
  99. data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
  100. data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
  101. data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
  102. data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
  103. data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
  104. data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
  105. data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
  106. data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
  107. data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
  108. data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
  109. data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
  110. data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
  111. data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
  112. data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
  113. data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
  114. data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
  115. data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
  116. data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
  117. data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
  118. data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
  119. data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
  120. data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
  121. data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
  122. data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
  123. data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
  124. data/vendor/kreuzberg/src/extractors/email.rs +2 -0
  125. data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
  126. data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
  127. data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
  128. data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
  129. data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
  130. data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
  131. data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
  132. data/vendor/kreuzberg/src/extractors/html.rs +80 -8
  133. data/vendor/kreuzberg/src/extractors/image.rs +8 -1
  134. data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
  135. data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
  136. data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
  137. data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
  138. data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
  139. data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
  140. data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
  141. data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
  142. data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
  143. data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
  144. data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
  145. data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
  146. data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
  147. data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
  148. data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
  149. data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
  150. data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
  151. data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
  152. data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
  153. data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
  154. data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
  155. data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
  156. data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
  157. data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
  158. data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
  159. data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
  160. data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
  161. data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
  162. data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
  163. data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
  164. data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
  165. data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
  166. data/vendor/kreuzberg/src/extractors/text.rs +4 -0
  167. data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
  168. data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
  169. data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
  170. data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
  171. data/vendor/kreuzberg/src/lib.rs +2 -2
  172. data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
  173. data/vendor/kreuzberg/src/mcp/format.rs +211 -0
  174. data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
  175. data/vendor/kreuzberg/src/mcp/params.rs +196 -0
  176. data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
  177. data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
  178. data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
  179. data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
  180. data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
  181. data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
  182. data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
  183. data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
  184. data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
  185. data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
  186. data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
  187. data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
  188. data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
  189. data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
  190. data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
  191. data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
  192. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
  193. data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
  194. data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
  195. data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
  196. data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
  197. data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
  198. data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
  199. data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
  200. data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
  201. data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
  202. data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
  203. data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
  204. data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
  205. data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
  206. data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
  207. data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
  208. data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
  209. data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
  210. data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
  211. data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
  212. data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
  213. data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
  214. data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
  215. data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
  216. data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
  217. data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
  218. data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
  219. data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
  220. data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
  221. data/vendor/kreuzberg/src/text/quality.rs +1 -1
  222. data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
  223. data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
  224. data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
  225. data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
  226. data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
  227. data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
  228. data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
  229. data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
  230. data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
  231. data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
  232. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
  233. data/vendor/kreuzberg/src/types/djot.rs +209 -0
  234. data/vendor/kreuzberg/src/types/extraction.rs +301 -0
  235. data/vendor/kreuzberg/src/types/formats.rs +443 -0
  236. data/vendor/kreuzberg/src/types/metadata.rs +560 -0
  237. data/vendor/kreuzberg/src/types/mod.rs +281 -0
  238. data/vendor/kreuzberg/src/types/page.rs +182 -0
  239. data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
  240. data/vendor/kreuzberg/src/types/tables.rs +39 -0
  241. data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
  242. data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
  243. data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
  244. data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
  245. data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
  246. data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
  247. data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
  248. data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
  249. data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
  250. data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
  251. data/vendor/kreuzberg/tests/api_embed.rs +6 -9
  252. data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
  253. data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
  254. data/vendor/kreuzberg/tests/core_integration.rs +1 -0
  255. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
  256. data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
  257. data/vendor/kreuzberg/tests/format_integration.rs +2 -0
  258. data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
  259. data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
  260. data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
  261. data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
  262. data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
  263. data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
  264. data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
  265. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
  266. data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
  267. data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
  268. data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
  269. data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
  270. data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
  271. data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
  272. data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
  273. data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
  274. data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
  275. data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
  276. data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
  277. data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
  278. data/vendor/kreuzberg-ffi/src/error.rs +46 -14
  279. data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
  280. data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
  281. data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
  282. data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
  283. data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
  284. data/vendor/kreuzberg-ffi/src/result.rs +148 -122
  285. data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
  286. data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
  287. metadata +200 -28
  288. data/vendor/kreuzberg/src/api/server.rs +0 -518
  289. data/vendor/kreuzberg/src/core/config.rs +0 -1914
  290. data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
  291. data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
  292. data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
  293. data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
  294. data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
  295. data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
  296. data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
  297. data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
  298. data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
  299. data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
  300. data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
  301. data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
  302. data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
  303. data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
  304. data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
  305. data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
  306. data/vendor/kreuzberg/src/types.rs +0 -1713
  307. data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
  308. data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
@@ -215,26 +215,149 @@ pub fn get_or_init_model(
215
215
  return Ok(Arc::clone(cached_model));
216
216
  }
217
217
 
218
- let mut init_options = InitOptions::new(model);
219
- init_options = init_options.with_cache_dir(cache_directory);
220
-
221
- let embedding_model = TextEmbedding::try_new(init_options).map_err(|e| {
222
- let error_msg = e.to_string();
223
-
224
- if error_msg.contains("onnxruntime")
225
- || error_msg.contains("ORT")
226
- || error_msg.contains("libonnxruntime")
227
- || error_msg.contains("onnxruntime.dll")
228
- || error_msg.contains("Unable to load")
229
- || error_msg.contains("library load failed")
218
+ // Check if ONNX Runtime library exists and set ORT_DYLIB_PATH if needed
219
+ // This prevents panics that cannot unwind through FFI boundaries
220
+ fn ensure_onnx_available() -> Result<(), String> {
221
+ // Check if ORT_DYLIB_PATH is already set and valid
222
+ if let Ok(path) = std::env::var("ORT_DYLIB_PATH") {
223
+ if std::path::Path::new(&path).exists() {
224
+ return Ok(());
225
+ }
226
+ }
227
+
228
+ // Check common installation paths and set ORT_DYLIB_PATH if found
229
+ #[cfg(target_os = "macos")]
230
+ {
231
+ let paths = vec![
232
+ "/opt/homebrew/lib/libonnxruntime.dylib",
233
+ "/usr/local/lib/libonnxruntime.dylib",
234
+ ];
235
+ for path in paths {
236
+ if std::path::Path::new(path).exists() {
237
+ // Set ORT_DYLIB_PATH so the ort crate can find it
238
+ // SAFETY: We're setting an environment variable before any threads are spawned
239
+ // in this module, and we're the only ones setting this variable
240
+ #[allow(unsafe_code)]
241
+ unsafe {
242
+ std::env::set_var("ORT_DYLIB_PATH", path);
243
+ }
244
+ return Ok(());
245
+ }
246
+ }
247
+ }
248
+
249
+ #[cfg(target_os = "linux")]
250
+ {
251
+ let paths = vec![
252
+ "/usr/lib/libonnxruntime.so",
253
+ "/usr/local/lib/libonnxruntime.so",
254
+ "/usr/lib/x86_64-linux-gnu/libonnxruntime.so",
255
+ "/usr/lib/aarch64-linux-gnu/libonnxruntime.so",
256
+ ];
257
+ for path in paths {
258
+ if std::path::Path::new(path).exists() {
259
+ // SAFETY: We're setting an environment variable before any threads are spawned
260
+ // in this module, and we're the only ones setting this variable
261
+ #[allow(unsafe_code)]
262
+ unsafe {
263
+ std::env::set_var("ORT_DYLIB_PATH", path);
264
+ }
265
+ return Ok(());
266
+ }
267
+ }
268
+ }
269
+
270
+ #[cfg(target_os = "windows")]
271
+ {
272
+ let paths = vec![
273
+ "C:\\Program Files\\onnxruntime\\bin\\onnxruntime.dll",
274
+ "C:\\Windows\\System32\\onnxruntime.dll",
275
+ ];
276
+ for path in paths {
277
+ if std::path::Path::new(path).exists() {
278
+ // SAFETY: We're setting an environment variable before any threads are spawned
279
+ // in this module, and we're the only ones setting this variable
280
+ #[allow(unsafe_code)]
281
+ unsafe {
282
+ std::env::set_var("ORT_DYLIB_PATH", path);
283
+ }
284
+ return Ok(());
285
+ }
286
+ }
287
+ }
288
+
289
+ Err("ONNX Runtime library not found in common installation paths".to_string())
290
+ }
291
+
292
+ if let Err(e) = ensure_onnx_available() {
293
+ return Err(crate::KreuzbergError::MissingDependency(format!(
294
+ "{}. {}",
295
+ e,
296
+ onnx_runtime_install_message()
297
+ )));
298
+ }
299
+
300
+ // Wrap the entire embedding initialization with catch_unwind to handle panics from ONNX Runtime
301
+ // ONNX Runtime can panic when the library is not found, which causes issues in FFI contexts
302
+ // This includes both InitOptions::new and TextEmbedding::try_new as both can trigger ONNX Runtime loading
303
+ let embedding_model = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
304
+ let mut init_options = InitOptions::new(model);
305
+ init_options = init_options.with_cache_dir(cache_directory);
306
+ TextEmbedding::try_new(init_options)
307
+ }))
308
+ .map_err(|panic_payload| {
309
+ // Convert panic to a KreuzbergError
310
+ let panic_msg = if let Some(s) = panic_payload.downcast_ref::<&str>() {
311
+ s.to_string()
312
+ } else if let Some(s) = panic_payload.downcast_ref::<String>() {
313
+ s.clone()
314
+ } else {
315
+ "Unknown panic during ONNX Runtime initialization".to_string()
316
+ };
317
+
318
+ // Check if this looks like an ONNX Runtime missing dependency error
319
+ if panic_msg.contains("onnxruntime")
320
+ || panic_msg.contains("ORT")
321
+ || panic_msg.contains("libonnxruntime")
322
+ || panic_msg.contains("onnxruntime.dll")
323
+ || panic_msg.contains("Unable to load")
324
+ || panic_msg.contains("library load failed")
325
+ || panic_msg.contains("attempting to load")
326
+ || panic_msg.contains("An error occurred while")
230
327
  {
231
328
  crate::KreuzbergError::MissingDependency(format!("ONNX Runtime - {}", onnx_runtime_install_message()))
232
329
  } else {
233
330
  crate::KreuzbergError::Plugin {
234
- message: format!("Failed to initialize embedding model: {}", e),
331
+ message: format!("ONNX Runtime initialization panicked: {}", panic_msg),
235
332
  plugin_name: "embeddings".to_string(),
236
333
  }
237
334
  }
335
+ })
336
+ .and_then(|result| {
337
+ // Map fastembed errors to KreuzbergError
338
+ result.map_err(|e| {
339
+ let error_msg = e.to_string();
340
+
341
+ if error_msg.contains("onnxruntime")
342
+ || error_msg.contains("ORT")
343
+ || error_msg.contains("libonnxruntime")
344
+ || error_msg.contains("onnxruntime.dll")
345
+ || error_msg.contains("Unable to load")
346
+ || error_msg.contains("library load failed")
347
+ || error_msg.contains("attempting to load")
348
+ || error_msg.contains("An error occurred while")
349
+ {
350
+ crate::KreuzbergError::MissingDependency(format!(
351
+ "ONNX Runtime - {}",
352
+ onnx_runtime_install_message()
353
+ ))
354
+ } else {
355
+ crate::KreuzbergError::Plugin {
356
+ message: format!("Failed to initialize embedding model: {}", e),
357
+ plugin_name: "embeddings".to_string(),
358
+ }
359
+ }
360
+ })
238
361
  })?;
239
362
 
240
363
  let leaked_model = LeakedModel::new(embedding_model);
@@ -1,13 +1,21 @@
1
1
  //! Archive extraction functionality.
2
2
  //!
3
3
  //! This module provides functions for extracting file lists and contents from archives.
4
+ //! Supported formats:
5
+ //! - ZIP archives
6
+ //! - TAR archives (including compressed TAR.GZ, TAR.BZ2)
7
+ //! - 7Z archives
8
+ //!
9
+ //! Each format has its own submodule with specialized extraction logic.
10
+
11
+ mod sevenz;
12
+ mod tar;
13
+ mod zip;
4
14
 
5
- use crate::error::{KreuzbergError, Result};
6
- use sevenz_rust2::{ArchiveReader, Password};
7
- use std::collections::HashMap;
8
- use std::io::{Cursor, Read};
9
- use tar::Archive as TarArchive;
10
- use zip::ZipArchive;
15
+ // Re-export all public functions for backward compatibility
16
+ pub use sevenz::{extract_7z_metadata, extract_7z_text_content};
17
+ pub use tar::{extract_tar_metadata, extract_tar_text_content};
18
+ pub use zip::{extract_zip_metadata, extract_zip_text_content};
11
19
 
12
20
  /// Archive metadata extracted from an archive file.
13
21
  #[derive(Debug, Clone)]
@@ -33,223 +41,17 @@ pub struct ArchiveEntry {
33
41
  pub is_dir: bool,
34
42
  }
35
43
 
36
- /// Extract metadata from a ZIP archive.
37
- pub fn extract_zip_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
38
- let cursor = Cursor::new(bytes);
39
- let mut archive =
40
- ZipArchive::new(cursor).map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP archive: {}", e)))?;
41
-
42
- let mut file_list = Vec::with_capacity(archive.len());
43
- let mut total_size = 0u64;
44
-
45
- for i in 0..archive.len() {
46
- let file = archive
47
- .by_index(i)
48
- .map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP entry: {}", e)))?;
49
-
50
- let path = file.name().to_string();
51
- let size = file.size();
52
- let is_dir = file.is_dir();
53
-
54
- if !is_dir {
55
- total_size += size;
56
- }
57
-
58
- file_list.push(ArchiveEntry { path, size, is_dir });
59
- }
60
-
61
- Ok(ArchiveMetadata {
62
- format: "ZIP".to_string(),
63
- file_list,
64
- file_count: archive.len(),
65
- total_size,
66
- })
67
- }
68
-
69
- /// Extract metadata from a TAR archive.
70
- pub fn extract_tar_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
71
- let cursor = Cursor::new(bytes);
72
- let mut archive = TarArchive::new(cursor);
73
-
74
- let estimated_entries = bytes.len().saturating_div(512).max(16);
75
- let mut file_list = Vec::with_capacity(estimated_entries);
76
- let mut total_size = 0u64;
77
- let mut file_count = 0;
78
-
79
- let entries = archive
80
- .entries()
81
- .map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR archive: {}", e)))?;
82
-
83
- for entry_result in entries {
84
- let entry = entry_result.map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR entry: {}", e)))?;
85
-
86
- let path = entry
87
- .path()
88
- .map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR entry path: {}", e)))?
89
- .to_string_lossy()
90
- .to_string();
91
-
92
- let size = entry.size();
93
- let is_dir = entry.header().entry_type().is_dir();
94
-
95
- if !is_dir {
96
- total_size += size;
97
- }
98
-
99
- file_count += 1;
100
- file_list.push(ArchiveEntry { path, size, is_dir });
101
- }
102
-
103
- Ok(ArchiveMetadata {
104
- format: "TAR".to_string(),
105
- file_list,
106
- file_count,
107
- total_size,
108
- })
109
- }
110
-
111
- /// Extract text content from files within a ZIP archive.
112
- ///
113
- /// Only extracts files with common text extensions: .txt, .md, .json, .xml, .html, .csv, .log
114
- pub fn extract_zip_text_content(bytes: &[u8]) -> Result<HashMap<String, String>> {
115
- let cursor = Cursor::new(bytes);
116
- let mut archive =
117
- ZipArchive::new(cursor).map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP archive: {}", e)))?;
118
-
119
- let estimated_text_files = archive.len().saturating_mul(3).saturating_div(10).max(2);
120
- let mut contents = HashMap::with_capacity(estimated_text_files);
121
- let text_extensions = [
122
- ".txt", ".md", ".json", ".xml", ".html", ".csv", ".log", ".yaml", ".toml",
123
- ];
124
-
125
- for i in 0..archive.len() {
126
- let mut file = archive
127
- .by_index(i)
128
- .map_err(|e| KreuzbergError::parsing(format!("Failed to read ZIP entry: {}", e)))?;
129
-
130
- let path = file.name().to_string();
131
-
132
- if !file.is_dir() && text_extensions.iter().any(|ext| path.to_lowercase().ends_with(ext)) {
133
- let estimated_size = (file.size() as usize).min(10 * 1024 * 1024);
134
- let mut content = String::with_capacity(estimated_size);
135
- if file.read_to_string(&mut content).is_ok() {
136
- contents.insert(path, content);
137
- }
138
- }
139
- }
140
-
141
- Ok(contents)
142
- }
143
-
144
- /// Extract text content from files within a TAR archive.
145
- ///
146
- /// Only extracts files with common text extensions: .txt, .md, .json, .xml, .html, .csv, .log
147
- pub fn extract_tar_text_content(bytes: &[u8]) -> Result<HashMap<String, String>> {
148
- let cursor = Cursor::new(bytes);
149
- let mut archive = TarArchive::new(cursor);
150
-
151
- let estimated_text_files = bytes.len().saturating_div(1024 * 10).min(100);
152
- let mut contents = HashMap::with_capacity(estimated_text_files.max(2));
153
- let text_extensions = [
154
- ".txt", ".md", ".json", ".xml", ".html", ".csv", ".log", ".yaml", ".toml",
155
- ];
156
-
157
- let entries = archive
158
- .entries()
159
- .map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR archive: {}", e)))?;
160
-
161
- for entry_result in entries {
162
- let mut entry =
163
- entry_result.map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR entry: {}", e)))?;
164
-
165
- let path = entry
166
- .path()
167
- .map_err(|e| KreuzbergError::parsing(format!("Failed to read TAR entry path: {}", e)))?
168
- .to_string_lossy()
169
- .to_string();
170
-
171
- if !entry.header().entry_type().is_dir() && text_extensions.iter().any(|ext| path.to_lowercase().ends_with(ext))
172
- {
173
- let estimated_size = (entry.size().min(10 * 1024 * 1024)) as usize;
174
- let mut content = String::with_capacity(estimated_size);
175
- if entry.read_to_string(&mut content).is_ok() {
176
- contents.insert(path, content);
177
- }
178
- }
179
- }
180
-
181
- Ok(contents)
182
- }
183
-
184
- /// Extract metadata from a 7z archive.
185
- pub fn extract_7z_metadata(bytes: &[u8]) -> Result<ArchiveMetadata> {
186
- let cursor = Cursor::new(bytes);
187
- let archive = ArchiveReader::new(cursor, Password::empty())
188
- .map_err(|e| KreuzbergError::parsing(format!("Failed to read 7z archive: {}", e)))?;
189
-
190
- let mut file_list = Vec::new();
191
- let mut total_size = 0u64;
192
-
193
- for entry in &archive.archive().files {
194
- let path = entry.name().to_string();
195
- let size = entry.size();
196
- let is_dir = entry.is_directory();
197
-
198
- if !is_dir {
199
- total_size += size;
200
- }
201
-
202
- file_list.push(ArchiveEntry { path, size, is_dir });
203
- }
204
-
205
- let file_count = file_list.len();
206
-
207
- Ok(ArchiveMetadata {
208
- format: "7Z".to_string(),
209
- file_list,
210
- file_count,
211
- total_size,
212
- })
213
- }
214
-
215
- /// Extract text content from files within a 7z archive.
216
- ///
217
- /// Only extracts files with common text extensions: .txt, .md, .json, .xml, .html, .csv, .log
218
- pub fn extract_7z_text_content(bytes: &[u8]) -> Result<HashMap<String, String>> {
219
- let cursor = Cursor::new(bytes);
220
- let mut archive = ArchiveReader::new(cursor, Password::empty())
221
- .map_err(|e| KreuzbergError::parsing(format!("Failed to read 7z archive: {}", e)))?;
222
-
223
- let mut contents = HashMap::new();
224
- let text_extensions = [
225
- ".txt", ".md", ".json", ".xml", ".html", ".csv", ".log", ".yaml", ".toml",
226
- ];
227
-
228
- archive
229
- .for_each_entries(|entry, reader| {
230
- let path = entry.name().to_string();
231
-
232
- if !entry.is_directory() && text_extensions.iter().any(|ext| path.to_lowercase().ends_with(ext)) {
233
- let mut content = Vec::new();
234
- if let Ok(_) = reader.read_to_end(&mut content)
235
- && let Ok(text) = String::from_utf8(content)
236
- {
237
- contents.insert(path, text);
238
- }
239
- }
240
- Ok(true)
241
- })
242
- .map_err(|e| KreuzbergError::parsing(format!("Failed to read 7z entries: {}", e)))?;
243
-
244
- Ok(contents)
245
- }
44
+ /// Common text file extensions that should be extracted from archives.
45
+ pub(crate) const TEXT_EXTENSIONS: &[&str] = &[
46
+ ".txt", ".md", ".json", ".xml", ".html", ".csv", ".log", ".yaml", ".toml",
47
+ ];
246
48
 
247
49
  #[cfg(test)]
248
50
  mod tests {
249
51
  use super::*;
250
- use std::io::Write;
251
- use tar::Builder as TarBuilder;
252
- use zip::write::{FileOptions, ZipWriter};
52
+ use ::tar::Builder as TarBuilder;
53
+ use ::zip::write::{FileOptions, ZipWriter};
54
+ use std::io::{Cursor, Write};
253
55
 
254
56
  #[test]
255
57
  fn test_extract_zip_metadata() {
@@ -283,14 +85,14 @@ mod tests {
283
85
  let mut tar = TarBuilder::new(&mut cursor);
284
86
 
285
87
  let data1 = b"Hello, World!";
286
- let mut header1 = tar::Header::new_gnu();
88
+ let mut header1 = ::tar::Header::new_gnu();
287
89
  header1.set_path("test.txt").unwrap();
288
90
  header1.set_size(data1.len() as u64);
289
91
  header1.set_cksum();
290
92
  tar.append(&header1, &data1[..]).unwrap();
291
93
 
292
94
  let data2 = b"# Header";
293
- let mut header2 = tar::Header::new_gnu();
95
+ let mut header2 = ::tar::Header::new_gnu();
294
96
  header2.set_path("dir/file.md").unwrap();
295
97
  header2.set_size(data2.len() as u64);
296
98
  header2.set_cksum();
@@ -339,14 +141,14 @@ mod tests {
339
141
  let mut tar = TarBuilder::new(&mut cursor);
340
142
 
341
143
  let data1 = b"Hello, World!";
342
- let mut header1 = tar::Header::new_gnu();
144
+ let mut header1 = ::tar::Header::new_gnu();
343
145
  header1.set_path("test.txt").unwrap();
344
146
  header1.set_size(data1.len() as u64);
345
147
  header1.set_cksum();
346
148
  tar.append(&header1, &data1[..]).unwrap();
347
149
 
348
150
  let data2 = b"# README";
349
- let mut header2 = tar::Header::new_gnu();
151
+ let mut header2 = ::tar::Header::new_gnu();
350
152
  header2.set_path("readme.md").unwrap();
351
153
  header2.set_size(data2.len() as u64);
352
154
  header2.set_cksum();
@@ -413,15 +215,15 @@ mod tests {
413
215
  {
414
216
  let mut tar = TarBuilder::new(&mut cursor);
415
217
 
416
- let mut header_dir = tar::Header::new_gnu();
218
+ let mut header_dir = ::tar::Header::new_gnu();
417
219
  header_dir.set_path("dir1/").unwrap();
418
220
  header_dir.set_size(0);
419
- header_dir.set_entry_type(tar::EntryType::Directory);
221
+ header_dir.set_entry_type(::tar::EntryType::Directory);
420
222
  header_dir.set_cksum();
421
223
  tar.append(&header_dir, &[][..]).unwrap();
422
224
 
423
225
  let data = b"content1";
424
- let mut header1 = tar::Header::new_gnu();
226
+ let mut header1 = ::tar::Header::new_gnu();
425
227
  header1.set_path("dir1/file1.txt").unwrap();
426
228
  header1.set_size(data.len() as u64);
427
229
  header1.set_cksum();
@@ -447,7 +249,7 @@ mod tests {
447
249
  let mut tar = TarBuilder::new(&mut tar_data);
448
250
 
449
251
  let data = b"Hello from gzip!";
450
- let mut header = tar::Header::new_gnu();
252
+ let mut header = ::tar::Header::new_gnu();
451
253
  header.set_path("test.txt").unwrap();
452
254
  header.set_size(data.len() as u64);
453
255
  header.set_cksum();
@@ -464,20 +266,20 @@ mod tests {
464
266
 
465
267
  #[test]
466
268
  fn test_extract_7z_metadata_with_files() {
467
- use sevenz_rust2::{ArchiveEntry, ArchiveWriter};
269
+ use sevenz_rust2::{ArchiveEntry as SevenzEntry, ArchiveWriter};
468
270
 
469
271
  let cursor = {
470
272
  let cursor = Cursor::new(Vec::new());
471
273
  let mut sz = ArchiveWriter::new(cursor).unwrap();
472
274
 
473
275
  sz.push_archive_entry(
474
- ArchiveEntry::new_file("test.txt"),
276
+ SevenzEntry::new_file("test.txt"),
475
277
  Some(Cursor::new(b"Hello 7z!".to_vec())),
476
278
  )
477
279
  .unwrap();
478
280
 
479
281
  sz.push_archive_entry(
480
- ArchiveEntry::new_file("data.json"),
282
+ SevenzEntry::new_file("data.json"),
481
283
  Some(Cursor::new(b"{\"key\":\"value\"}".to_vec())),
482
284
  )
483
285
  .unwrap();
@@ -538,7 +340,7 @@ mod tests {
538
340
  let mut inner_tar = TarBuilder::new(&mut inner_cursor);
539
341
 
540
342
  let data = b"Nested content";
541
- let mut header = tar::Header::new_gnu();
343
+ let mut header = ::tar::Header::new_gnu();
542
344
  header.set_path("inner.txt").unwrap();
543
345
  header.set_size(data.len() as u64);
544
346
  header.set_cksum();
@@ -552,14 +354,14 @@ mod tests {
552
354
  {
553
355
  let mut outer_tar = TarBuilder::new(&mut outer_cursor);
554
356
 
555
- let mut header1 = tar::Header::new_gnu();
357
+ let mut header1 = ::tar::Header::new_gnu();
556
358
  header1.set_path("archive.tar").unwrap();
557
359
  header1.set_size(inner_bytes.len() as u64);
558
360
  header1.set_cksum();
559
361
  outer_tar.append(&header1, &inner_bytes[..]).unwrap();
560
362
 
561
363
  let data = b"Outer content";
562
- let mut header2 = tar::Header::new_gnu();
364
+ let mut header2 = ::tar::Header::new_gnu();
563
365
  header2.set_path("readme.txt").unwrap();
564
366
  header2.set_size(data.len() as u64);
565
367
  header2.set_cksum();
@@ -579,6 +381,8 @@ mod tests {
579
381
 
580
382
  #[test]
581
383
  fn test_extract_zip_corrupted_data() {
384
+ use crate::error::KreuzbergError;
385
+
582
386
  let mut valid_cursor = Cursor::new(Vec::new());
583
387
  {
584
388
  let mut zip = ZipWriter::new(&mut valid_cursor);
@@ -608,7 +412,7 @@ mod tests {
608
412
  let mut tar = TarBuilder::new(&mut valid_cursor);
609
413
 
610
414
  let data = b"content";
611
- let mut header = tar::Header::new_gnu();
415
+ let mut header = ::tar::Header::new_gnu();
612
416
  header.set_path("test.txt").unwrap();
613
417
  header.set_size(data.len() as u64);
614
418
  header.set_cksum();
@@ -704,7 +508,7 @@ mod tests {
704
508
  ];
705
509
 
706
510
  for (path, data) in files {
707
- let mut header = tar::Header::new_gnu();
511
+ let mut header = ::tar::Header::new_gnu();
708
512
  header.set_path(path).unwrap();
709
513
  header.set_size(data.len() as u64);
710
514
  header.set_cksum();
@@ -839,20 +643,20 @@ mod tests {
839
643
 
840
644
  #[test]
841
645
  fn test_extract_7z_text_content() {
842
- use sevenz_rust2::{ArchiveEntry, ArchiveWriter};
646
+ use sevenz_rust2::{ArchiveEntry as SevenzEntry, ArchiveWriter};
843
647
 
844
648
  let cursor = {
845
649
  let cursor = Cursor::new(Vec::new());
846
650
  let mut sz = ArchiveWriter::new(cursor).unwrap();
847
651
 
848
652
  sz.push_archive_entry(
849
- ArchiveEntry::new_file("test.txt"),
653
+ SevenzEntry::new_file("test.txt"),
850
654
  Some(Cursor::new(b"Hello 7z text!".to_vec())),
851
655
  )
852
656
  .unwrap();
853
657
 
854
658
  sz.push_archive_entry(
855
- ArchiveEntry::new_file("readme.md"),
659
+ SevenzEntry::new_file("readme.md"),
856
660
  Some(Cursor::new(b"# 7z README".to_vec())),
857
661
  )
858
662
  .unwrap();
@@ -894,7 +698,7 @@ mod tests {
894
698
 
895
699
  let large_content = "y".repeat(50_000);
896
700
 
897
- let mut header = tar::Header::new_gnu();
701
+ let mut header = ::tar::Header::new_gnu();
898
702
  header.set_path("large.txt").unwrap();
899
703
  header.set_size(large_content.len() as u64);
900
704
  header.set_cksum();
@@ -947,6 +751,8 @@ mod tests {
947
751
 
948
752
  #[test]
949
753
  fn test_extract_7z_corrupted_data() {
754
+ use crate::error::KreuzbergError;
755
+
950
756
  let invalid_7z_data = vec![0x37, 0x7A, 0xBC, 0xAF, 0x27, 0x1C, 0x00];
951
757
 
952
758
  let result = extract_7z_metadata(&invalid_7z_data);