kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265):
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
@@ -0,0 +1,425 @@
1
+ //! Archive extractors for ZIP, TAR, and 7z formats.
2
+
3
+ use crate::Result;
4
+ use crate::core::config::ExtractionConfig;
5
+ use crate::extraction::archive::{
6
+ ArchiveMetadata as ExtractedMetadata, extract_7z_metadata, extract_7z_text_content, extract_tar_metadata,
7
+ extract_tar_text_content, extract_zip_metadata, extract_zip_text_content,
8
+ };
9
+ use crate::plugins::{DocumentExtractor, Plugin};
10
+ use crate::types::{ArchiveMetadata, ExtractionResult, Metadata};
11
+ use async_trait::async_trait;
12
+ use std::collections::HashMap;
13
+
14
+ /// Build an ExtractionResult from archive metadata and text contents.
15
+ ///
16
+ /// This helper function eliminates duplication across ZIP/TAR/7Z extractors by centralizing
17
+ /// the logic for transforming extracted metadata into the final result structure.
18
+ fn build_archive_result(
19
+ extraction_metadata: ExtractedMetadata,
20
+ text_contents: HashMap<String, String>,
21
+ format_name: &str,
22
+ mime_type: &str,
23
+ ) -> ExtractionResult {
24
+ let file_names: Vec<String> = extraction_metadata
25
+ .file_list
26
+ .iter()
27
+ .map(|entry| entry.path.clone())
28
+ .collect();
29
+
30
+ let archive_metadata = ArchiveMetadata {
31
+ format: format_name.to_string(),
32
+ file_count: extraction_metadata.file_count,
33
+ file_list: file_names,
34
+ total_size: extraction_metadata.total_size as usize,
35
+ compressed_size: None,
36
+ };
37
+
38
+ let mut additional = HashMap::new();
39
+ let file_details: Vec<serde_json::Value> = extraction_metadata
40
+ .file_list
41
+ .iter()
42
+ .map(|entry| {
43
+ serde_json::json!({
44
+ "path": entry.path,
45
+ "size": entry.size,
46
+ "is_dir": entry.is_dir,
47
+ })
48
+ })
49
+ .collect();
50
+ additional.insert("files".to_string(), serde_json::json!(file_details));
51
+
52
+ let mut output = format!(
53
+ "{} Archive ({} files, {} bytes)\n\n",
54
+ format_name, extraction_metadata.file_count, extraction_metadata.total_size
55
+ );
56
+ output.push_str("Files:\n");
57
+ for entry in &extraction_metadata.file_list {
58
+ output.push_str(&format!("- {} ({} bytes)\n", entry.path, entry.size));
59
+ }
60
+
61
+ if !text_contents.is_empty() {
62
+ output.push_str("\n\nText File Contents:\n\n");
63
+ for (path, content) in text_contents {
64
+ output.push_str(&format!("=== {} ===\n{}\n\n", path, content));
65
+ }
66
+ }
67
+
68
+ ExtractionResult {
69
+ content: output,
70
+ mime_type: mime_type.to_string(),
71
+ metadata: Metadata {
72
+ format: Some(crate::types::FormatMetadata::Archive(archive_metadata)),
73
+ additional,
74
+ ..Default::default()
75
+ },
76
+ tables: vec![],
77
+ detected_languages: None,
78
+ chunks: None,
79
+ images: None,
80
+ }
81
+ }
82
+
83
/// ZIP archive extractor.
///
/// Extracts file lists and text content from ZIP archives. The type is a
/// field-less unit struct, so every instance is interchangeable.
#[derive(Debug, Clone, Default)]
pub struct ZipExtractor;

impl ZipExtractor {
    /// Create a new ZIP extractor.
    ///
    /// Equivalent to [`ZipExtractor::default`].
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}
100
+
101
+ impl Plugin for ZipExtractor {
102
+ fn name(&self) -> &str {
103
+ "zip-extractor"
104
+ }
105
+
106
+ fn version(&self) -> String {
107
+ env!("CARGO_PKG_VERSION").to_string()
108
+ }
109
+
110
+ fn initialize(&self) -> Result<()> {
111
+ Ok(())
112
+ }
113
+
114
+ fn shutdown(&self) -> Result<()> {
115
+ Ok(())
116
+ }
117
+
118
+ fn description(&self) -> &str {
119
+ "Extracts file lists and text content from ZIP archives"
120
+ }
121
+
122
+ fn author(&self) -> &str {
123
+ "Kreuzberg Team"
124
+ }
125
+ }
126
+
127
+ #[async_trait]
128
+ impl DocumentExtractor for ZipExtractor {
129
+ async fn extract_bytes(
130
+ &self,
131
+ content: &[u8],
132
+ mime_type: &str,
133
+ _config: &ExtractionConfig,
134
+ ) -> Result<ExtractionResult> {
135
+ let extraction_metadata = extract_zip_metadata(content)?;
136
+ let text_contents = extract_zip_text_content(content)?;
137
+ Ok(build_archive_result(
138
+ extraction_metadata,
139
+ text_contents,
140
+ "ZIP",
141
+ mime_type,
142
+ ))
143
+ }
144
+
145
+ fn supported_mime_types(&self) -> &[&str] {
146
+ &["application/zip", "application/x-zip-compressed"]
147
+ }
148
+
149
+ fn priority(&self) -> i32 {
150
+ 50
151
+ }
152
+ }
153
+
154
/// TAR archive extractor.
///
/// Extracts file lists and text content from TAR archives. The type is a
/// field-less unit struct, so every instance is interchangeable.
#[derive(Debug, Clone, Default)]
pub struct TarExtractor;

impl TarExtractor {
    /// Create a new TAR extractor.
    ///
    /// Equivalent to [`TarExtractor::default`].
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}
171
+
172
+ impl Plugin for TarExtractor {
173
+ fn name(&self) -> &str {
174
+ "tar-extractor"
175
+ }
176
+
177
+ fn version(&self) -> String {
178
+ env!("CARGO_PKG_VERSION").to_string()
179
+ }
180
+
181
+ fn initialize(&self) -> Result<()> {
182
+ Ok(())
183
+ }
184
+
185
+ fn shutdown(&self) -> Result<()> {
186
+ Ok(())
187
+ }
188
+
189
+ fn description(&self) -> &str {
190
+ "Extracts file lists and text content from TAR archives"
191
+ }
192
+
193
+ fn author(&self) -> &str {
194
+ "Kreuzberg Team"
195
+ }
196
+ }
197
+
198
+ #[async_trait]
199
+ impl DocumentExtractor for TarExtractor {
200
+ async fn extract_bytes(
201
+ &self,
202
+ content: &[u8],
203
+ mime_type: &str,
204
+ _config: &ExtractionConfig,
205
+ ) -> Result<ExtractionResult> {
206
+ let extraction_metadata = extract_tar_metadata(content)?;
207
+ let text_contents = extract_tar_text_content(content)?;
208
+ Ok(build_archive_result(
209
+ extraction_metadata,
210
+ text_contents,
211
+ "TAR",
212
+ mime_type,
213
+ ))
214
+ }
215
+
216
+ fn supported_mime_types(&self) -> &[&str] {
217
+ &[
218
+ "application/x-tar",
219
+ "application/tar",
220
+ "application/x-gtar",
221
+ "application/x-ustar",
222
+ ]
223
+ }
224
+
225
+ fn priority(&self) -> i32 {
226
+ 50
227
+ }
228
+ }
229
+
230
/// 7z archive extractor.
///
/// Extracts file lists and text content from 7z archives. The type is a
/// field-less unit struct, so every instance is interchangeable.
#[derive(Debug, Clone, Default)]
pub struct SevenZExtractor;

impl SevenZExtractor {
    /// Create a new 7z extractor.
    ///
    /// Equivalent to [`SevenZExtractor::default`].
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}
247
+
248
+ impl Plugin for SevenZExtractor {
249
+ fn name(&self) -> &str {
250
+ "7z-extractor"
251
+ }
252
+
253
+ fn version(&self) -> String {
254
+ env!("CARGO_PKG_VERSION").to_string()
255
+ }
256
+
257
+ fn initialize(&self) -> Result<()> {
258
+ Ok(())
259
+ }
260
+
261
+ fn shutdown(&self) -> Result<()> {
262
+ Ok(())
263
+ }
264
+
265
+ fn description(&self) -> &str {
266
+ "Extracts file lists and text content from 7z archives"
267
+ }
268
+
269
+ fn author(&self) -> &str {
270
+ "Kreuzberg Team"
271
+ }
272
+ }
273
+
274
+ #[async_trait]
275
+ impl DocumentExtractor for SevenZExtractor {
276
+ async fn extract_bytes(
277
+ &self,
278
+ content: &[u8],
279
+ mime_type: &str,
280
+ _config: &ExtractionConfig,
281
+ ) -> Result<ExtractionResult> {
282
+ let extraction_metadata = extract_7z_metadata(content)?;
283
+ let text_contents = extract_7z_text_content(content)?;
284
+ Ok(build_archive_result(
285
+ extraction_metadata,
286
+ text_contents,
287
+ "7Z",
288
+ mime_type,
289
+ ))
290
+ }
291
+
292
+ fn supported_mime_types(&self) -> &[&str] {
293
+ &["application/x-7z-compressed"]
294
+ }
295
+
296
+ fn priority(&self) -> i32 {
297
+ 50
298
+ }
299
+ }
300
+
301
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{Cursor, Write};
    use tar::Builder as TarBuilder;
    use zip::write::{FileOptions, ZipWriter};

    /// Borrow the archive metadata from `result`.
    ///
    /// Panics when the format metadata is missing or is not the `Archive` variant,
    /// which fails the calling test.
    fn expect_archive_metadata(result: &ExtractionResult) -> &ArchiveMetadata {
        match result
            .metadata
            .format
            .as_ref()
            .expect("Expected format metadata to be present")
        {
            crate::types::FormatMetadata::Archive(meta) => meta,
            _ => panic!("Expected Archive metadata"),
        }
    }

    /// A ZIP archive with one text file yields its listing, content, and metadata.
    #[tokio::test]
    async fn test_zip_extractor() {
        let extractor = ZipExtractor::new();

        // Build a one-entry ZIP archive in memory.
        let mut cursor = Cursor::new(Vec::new());
        {
            let mut zip = ZipWriter::new(&mut cursor);
            let options = FileOptions::<'_, ()>::default();

            zip.start_file("test.txt", options).unwrap();
            zip.write_all(b"Hello, World!").unwrap();

            zip.finish().unwrap();
        }

        let bytes = cursor.into_inner();
        let config = ExtractionConfig::default();

        let result = extractor
            .extract_bytes(&bytes, "application/zip", &config)
            .await
            .unwrap();

        assert_eq!(result.mime_type, "application/zip");
        assert!(result.content.contains("ZIP Archive"));
        assert!(result.content.contains("test.txt"));
        assert!(result.content.contains("Hello, World!"));

        let archive_meta = expect_archive_metadata(&result);
        assert_eq!(archive_meta.format, "ZIP");
        assert_eq!(archive_meta.file_count, 1);
    }

    /// A TAR archive with one text file yields its listing, content, and metadata.
    #[tokio::test]
    async fn test_tar_extractor() {
        let extractor = TarExtractor::new();

        // Build a one-entry TAR archive in memory.
        let mut cursor = Cursor::new(Vec::new());
        {
            let mut tar = TarBuilder::new(&mut cursor);

            let data = b"Hello, World!";
            let mut header = tar::Header::new_gnu();
            header.set_path("test.txt").unwrap();
            header.set_size(data.len() as u64);
            header.set_cksum();
            tar.append(&header, &data[..]).unwrap();

            tar.finish().unwrap();
        }

        let bytes = cursor.into_inner();
        let config = ExtractionConfig::default();

        let result = extractor
            .extract_bytes(&bytes, "application/x-tar", &config)
            .await
            .unwrap();

        assert_eq!(result.mime_type, "application/x-tar");
        assert!(result.content.contains("TAR Archive"));
        assert!(result.content.contains("test.txt"));
        assert!(result.content.contains("Hello, World!"));

        let archive_meta = expect_archive_metadata(&result);
        assert_eq!(archive_meta.format, "TAR");
        assert_eq!(archive_meta.file_count, 1);
    }

    /// Bytes that are not a ZIP archive are reported as an error.
    #[tokio::test]
    async fn test_zip_extractor_invalid() {
        let extractor = ZipExtractor::new();
        let invalid_bytes = vec![0, 1, 2, 3, 4, 5];
        let config = ExtractionConfig::default();

        let result = extractor
            .extract_bytes(&invalid_bytes, "application/zip", &config)
            .await;
        assert!(result.is_err());
    }

    /// Bytes that are not a TAR archive are reported as an error.
    #[tokio::test]
    async fn test_tar_extractor_invalid() {
        let extractor = TarExtractor::new();
        let invalid_bytes = vec![0, 1, 2, 3, 4, 5];
        let config = ExtractionConfig::default();

        let result = extractor
            .extract_bytes(&invalid_bytes, "application/x-tar", &config)
            .await;
        assert!(result.is_err());
    }

    #[test]
    fn test_zip_plugin_interface() {
        let extractor = ZipExtractor::new();
        assert_eq!(extractor.name(), "zip-extractor");
        assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
        assert!(extractor.supported_mime_types().contains(&"application/zip"));
        assert_eq!(extractor.priority(), 50);
    }

    #[test]
    fn test_tar_plugin_interface() {
        let extractor = TarExtractor::new();
        assert_eq!(extractor.name(), "tar-extractor");
        assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
        assert!(extractor.supported_mime_types().contains(&"application/x-tar"));
        assert!(extractor.supported_mime_types().contains(&"application/tar"));
        assert_eq!(extractor.priority(), 50);
    }

    /// Covers the 7z extractor's plugin metadata, mirroring the ZIP and TAR tests.
    #[test]
    fn test_7z_plugin_interface() {
        let extractor = SevenZExtractor::new();
        assert_eq!(extractor.name(), "7z-extractor");
        assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
        assert!(
            extractor
                .supported_mime_types()
                .contains(&"application/x-7z-compressed")
        );
        assert_eq!(extractor.priority(), 50);
    }
}