kreuzberg 4.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +8 -0
  3. data/.rspec +3 -0
  4. data/.rubocop.yaml +534 -0
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +157 -0
  7. data/README.md +421 -0
  8. data/Rakefile +25 -0
  9. data/Steepfile +47 -0
  10. data/examples/async_patterns.rb +340 -0
  11. data/ext/kreuzberg_rb/extconf.rb +35 -0
  12. data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
  13. data/ext/kreuzberg_rb/native/README.md +425 -0
  14. data/ext/kreuzberg_rb/native/build.rs +17 -0
  15. data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
  16. data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
  17. data/ext/kreuzberg_rb/native/include/strings.h +20 -0
  18. data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
  19. data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
  20. data/extconf.rb +28 -0
  21. data/kreuzberg.gemspec +105 -0
  22. data/lib/kreuzberg/api_proxy.rb +142 -0
  23. data/lib/kreuzberg/cache_api.rb +45 -0
  24. data/lib/kreuzberg/cli.rb +55 -0
  25. data/lib/kreuzberg/cli_proxy.rb +127 -0
  26. data/lib/kreuzberg/config.rb +684 -0
  27. data/lib/kreuzberg/errors.rb +50 -0
  28. data/lib/kreuzberg/extraction_api.rb +84 -0
  29. data/lib/kreuzberg/mcp_proxy.rb +186 -0
  30. data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
  31. data/lib/kreuzberg/post_processor_protocol.rb +86 -0
  32. data/lib/kreuzberg/result.rb +216 -0
  33. data/lib/kreuzberg/setup_lib_path.rb +79 -0
  34. data/lib/kreuzberg/validator_protocol.rb +89 -0
  35. data/lib/kreuzberg/version.rb +5 -0
  36. data/lib/kreuzberg.rb +82 -0
  37. data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
  38. data/sig/kreuzberg/internal.rbs +184 -0
  39. data/sig/kreuzberg.rbs +468 -0
  40. data/spec/binding/cache_spec.rb +227 -0
  41. data/spec/binding/cli_proxy_spec.rb +87 -0
  42. data/spec/binding/cli_spec.rb +54 -0
  43. data/spec/binding/config_spec.rb +345 -0
  44. data/spec/binding/config_validation_spec.rb +283 -0
  45. data/spec/binding/error_handling_spec.rb +213 -0
  46. data/spec/binding/errors_spec.rb +66 -0
  47. data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
  48. data/spec/binding/plugins/postprocessor_spec.rb +269 -0
  49. data/spec/binding/plugins/validator_spec.rb +274 -0
  50. data/spec/examples.txt +104 -0
  51. data/spec/fixtures/config.toml +39 -0
  52. data/spec/fixtures/config.yaml +42 -0
  53. data/spec/fixtures/invalid_config.toml +4 -0
  54. data/spec/smoke/package_spec.rb +178 -0
  55. data/spec/spec_helper.rb +42 -0
  56. data/vendor/kreuzberg/Cargo.toml +134 -0
  57. data/vendor/kreuzberg/README.md +175 -0
  58. data/vendor/kreuzberg/build.rs +460 -0
  59. data/vendor/kreuzberg/src/api/error.rs +81 -0
  60. data/vendor/kreuzberg/src/api/handlers.rs +199 -0
  61. data/vendor/kreuzberg/src/api/mod.rs +79 -0
  62. data/vendor/kreuzberg/src/api/server.rs +353 -0
  63. data/vendor/kreuzberg/src/api/types.rs +170 -0
  64. data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
  65. data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
  66. data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
  67. data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
  68. data/vendor/kreuzberg/src/core/config.rs +1032 -0
  69. data/vendor/kreuzberg/src/core/extractor.rs +903 -0
  70. data/vendor/kreuzberg/src/core/io.rs +327 -0
  71. data/vendor/kreuzberg/src/core/mime.rs +615 -0
  72. data/vendor/kreuzberg/src/core/mod.rs +42 -0
  73. data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
  74. data/vendor/kreuzberg/src/embeddings.rs +323 -0
  75. data/vendor/kreuzberg/src/error.rs +431 -0
  76. data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
  77. data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
  78. data/vendor/kreuzberg/src/extraction/email.rs +854 -0
  79. data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
  80. data/vendor/kreuzberg/src/extraction/html.rs +553 -0
  81. data/vendor/kreuzberg/src/extraction/image.rs +368 -0
  82. data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
  83. data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
  84. data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
  85. data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
  86. data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
  87. data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
  88. data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
  89. data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
  90. data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
  91. data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
  92. data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
  93. data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
  94. data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
  95. data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
  96. data/vendor/kreuzberg/src/extraction/table.rs +328 -0
  97. data/vendor/kreuzberg/src/extraction/text.rs +269 -0
  98. data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
  99. data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
  100. data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
  101. data/vendor/kreuzberg/src/extractors/email.rs +129 -0
  102. data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
  103. data/vendor/kreuzberg/src/extractors/html.rs +410 -0
  104. data/vendor/kreuzberg/src/extractors/image.rs +195 -0
  105. data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
  106. data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
  107. data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
  108. data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
  109. data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
  110. data/vendor/kreuzberg/src/extractors/text.rs +242 -0
  111. data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
  112. data/vendor/kreuzberg/src/image/dpi.rs +164 -0
  113. data/vendor/kreuzberg/src/image/mod.rs +6 -0
  114. data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
  115. data/vendor/kreuzberg/src/image/resize.rs +89 -0
  116. data/vendor/kreuzberg/src/keywords/config.rs +154 -0
  117. data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
  118. data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
  119. data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
  120. data/vendor/kreuzberg/src/keywords/types.rs +68 -0
  121. data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
  122. data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
  123. data/vendor/kreuzberg/src/lib.rs +102 -0
  124. data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
  125. data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
  126. data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
  127. data/vendor/kreuzberg/src/ocr/error.rs +37 -0
  128. data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
  129. data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
  130. data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
  131. data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
  132. data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
  133. data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
  134. data/vendor/kreuzberg/src/ocr/types.rs +393 -0
  135. data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
  136. data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
  137. data/vendor/kreuzberg/src/pdf/error.rs +122 -0
  138. data/vendor/kreuzberg/src/pdf/images.rs +139 -0
  139. data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
  140. data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
  141. data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
  142. data/vendor/kreuzberg/src/pdf/table.rs +420 -0
  143. data/vendor/kreuzberg/src/pdf/text.rs +161 -0
  144. data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
  145. data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
  146. data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
  147. data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
  148. data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
  149. data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
  150. data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
  151. data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
  152. data/vendor/kreuzberg/src/text/mod.rs +19 -0
  153. data/vendor/kreuzberg/src/text/quality.rs +697 -0
  154. data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
  155. data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
  156. data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
  157. data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
  158. data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
  159. data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
  160. data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
  161. data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
  162. data/vendor/kreuzberg/src/types.rs +873 -0
  163. data/vendor/kreuzberg/src/utils/mod.rs +17 -0
  164. data/vendor/kreuzberg/src/utils/quality.rs +959 -0
  165. data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
  166. data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
  167. data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
  168. data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
  169. data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
  170. data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
  171. data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
  172. data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
  173. data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
  174. data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
  175. data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
  176. data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
  177. data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
  178. data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
  179. data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
  180. data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
  181. data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
  182. data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
  183. data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
  184. data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
  185. data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
  186. data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
  187. data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
  188. data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
  189. data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
  190. data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
  191. data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
  192. data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
  193. data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
  194. data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
  195. data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
  196. data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
  197. data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
  198. data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
  199. data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
  200. data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
  201. data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
  202. data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
  203. data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
  204. data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
  205. data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
  206. data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
  207. data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
  208. data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
  209. data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
  210. data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
  211. data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
  212. data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
  213. data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
  214. data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
  215. data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
  216. data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
  217. data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
  218. data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
  219. data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
  220. data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
  221. data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
  222. data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
  223. data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
  224. data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
  225. data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
  226. data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
  227. data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
  228. data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
  229. data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
  230. data/vendor/kreuzberg/tests/api_tests.rs +966 -0
  231. data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
  232. data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
  233. data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
  234. data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
  235. data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
  236. data/vendor/kreuzberg/tests/config_features.rs +580 -0
  237. data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
  238. data/vendor/kreuzberg/tests/core_integration.rs +493 -0
  239. data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
  240. data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
  241. data/vendor/kreuzberg/tests/email_integration.rs +325 -0
  242. data/vendor/kreuzberg/tests/error_handling.rs +393 -0
  243. data/vendor/kreuzberg/tests/format_integration.rs +159 -0
  244. data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
  245. data/vendor/kreuzberg/tests/image_integration.rs +253 -0
  246. data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
  247. data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
  248. data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
  249. data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
  250. data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
  251. data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
  252. data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
  253. data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
  254. data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
  255. data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
  256. data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
  257. data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
  258. data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
  259. data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
  260. data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
  261. data/vendor/kreuzberg/tests/security_validation.rs +404 -0
  262. data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
  263. data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
  264. data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
  265. metadata +471 -0
@@ -0,0 +1,873 @@
1
+ use serde::{Deserialize, Serialize};
2
+ use std::collections::HashMap;
3
+
4
+ #[cfg(feature = "pdf")]
5
+ use crate::pdf::metadata::PdfMetadata;
6
+
7
+ // ============================================================================
8
+ // ============================================================================
9
+
10
+ /// General extraction result used by the core extraction API.
11
+ ///
12
+ /// This is the main result type returned by all extraction functions.
13
+ #[derive(Debug, Clone, Serialize, Deserialize)]
14
+ pub struct ExtractionResult {
15
+ pub content: String, // presumably the extracted document text — TODO confirm against the extractors
16
+ pub mime_type: String, // MIME type string; NOTE(review): detected vs. caller-supplied is not visible here
17
+ pub metadata: Metadata, // common fields plus optional format-specific metadata (see `Metadata`)
18
+ pub tables: Vec<Table>, // no skip attribute: serialized even when empty
19
+ #[serde(skip_serializing_if = "Option::is_none")] // field is left out of serialized output when `None`
20
+ pub detected_languages: Option<Vec<String>>, // NOTE(review): presumably language codes — exact format not visible here
21
+
22
+ /// Text chunks when chunking is enabled.
23
+ ///
24
+ /// When chunking configuration is provided, the content is split into
25
+ /// overlapping chunks for efficient processing. Each chunk contains the text,
26
+ /// optional embeddings (if enabled), and metadata about its position.
27
+ #[serde(skip_serializing_if = "Option::is_none")]
28
+ pub chunks: Option<Vec<Chunk>>,
29
+
30
+ /// Extracted images from the document.
31
+ ///
32
+ /// When image extraction is enabled via `ImageExtractionConfig`, this field
33
+ /// contains all images found in the document with their raw data and metadata.
34
+ /// Each image may optionally contain a nested `ocr_result` if OCR was performed.
35
+ #[serde(skip_serializing_if = "Option::is_none")]
36
+ pub images: Option<Vec<ExtractedImage>>, // note: `ExtractedImage::ocr_result` can hold another `ExtractionResult` (mutually recursive types)
37
+ }
38
+
39
+ /// Format-specific metadata (discriminated union).
40
+ ///
41
+ /// Only one format type can exist per extraction result. This provides
42
+ /// type-safe, clean metadata without nested optionals.
43
+ #[derive(Debug, Clone, Serialize, Deserialize)]
44
+ #[serde(tag = "format_type", rename_all = "snake_case")] // internally tagged: the snake_case variant name is stored in a `format_type` field next to the variant's own fields
45
+ pub enum FormatMetadata {
46
+ #[cfg(feature = "pdf")] // this variant only exists when the crate's `pdf` feature is enabled
47
+ Pdf(PdfMetadata),
48
+ Excel(ExcelMetadata),
49
+ Email(EmailMetadata),
50
+ Pptx(PptxMetadata), // `PptxMetadata` is not defined in the visible part of this file
51
+ Archive(ArchiveMetadata),
52
+ Image(ImageMetadata),
53
+ Xml(XmlMetadata),
54
+ Text(TextMetadata),
55
+ Html(Box<HtmlMetadata>), // the only boxed payload; NOTE(review): presumably to keep the enum small, HtmlMetadata has many fields — confirm
56
+ Ocr(OcrMetadata),
57
+ }
58
+
59
+ /// Extraction result metadata.
60
+ ///
61
+ /// Contains common fields applicable to all formats, format-specific metadata
62
+ /// via a discriminated union, and additional custom fields from postprocessors.
63
+ #[derive(Debug, Clone, Serialize, Deserialize, Default)] // Default: every `Option` field is `None` and `additional` is an empty map
64
+ pub struct Metadata {
65
+ /// Language of the document (ISO 639 code)
66
+ #[serde(skip_serializing_if = "Option::is_none")]
67
+ pub language: Option<String>,
68
+
69
+ /// Document date (format varies by source)
70
+ #[serde(skip_serializing_if = "Option::is_none")]
71
+ pub date: Option<String>,
72
+
73
+ /// Document subject/description
74
+ #[serde(skip_serializing_if = "Option::is_none")]
75
+ pub subject: Option<String>,
76
+
77
+ /// Format-specific metadata (discriminated union)
78
+ ///
79
+ /// Contains detailed metadata specific to the document format.
80
+ /// Serializes with a `format_type` discriminator field.
81
+ #[serde(flatten, skip_serializing_if = "Option::is_none")] // flatten: the variant's fields and `format_type` are written at this struct's level, not under a `format` key
82
+ pub format: Option<FormatMetadata>,
83
+
84
+ /// Image preprocessing metadata (when OCR preprocessing was applied)
85
+ #[serde(skip_serializing_if = "Option::is_none")]
86
+ pub image_preprocessing: Option<ImagePreprocessingMetadata>,
87
+
88
+ /// JSON schema (for structured data extraction)
89
+ #[serde(skip_serializing_if = "Option::is_none")]
90
+ pub json_schema: Option<serde_json::Value>,
91
+
92
+ /// Error metadata (for batch operations)
93
+ #[serde(skip_serializing_if = "Option::is_none")]
94
+ pub error: Option<ErrorMetadata>,
95
+
96
+ /// Additional custom fields from postprocessors.
97
+ ///
98
+ /// This flattened HashMap allows Python/TypeScript postprocessors to add
99
+ /// arbitrary fields (entity extraction, keyword extraction, etc.).
100
+ /// Fields are merged at the root level during serialization.
101
+ #[serde(flatten)] // NOTE(review): these keys share one namespace with the named fields above and the flattened `format` variant — confirm collisions and round-tripping are handled
102
+ pub additional: HashMap<String, serde_json::Value>,
103
+ }
104
+
105
+ /// Excel/spreadsheet metadata.
106
+ ///
107
+ /// Contains information about sheets in Excel, LibreOffice Calc, and other
108
+ /// spreadsheet formats (.xlsx, .xls, .ods, etc.).
109
+ #[derive(Debug, Clone, Serialize, Deserialize)]
110
+ pub struct ExcelMetadata {
111
+ /// Total number of sheets in the workbook
112
+ pub sheet_count: usize, // NOTE(review): presumably equals `sheet_names.len()`; the type does not enforce it — confirm against the producer
113
+ /// Names of all sheets in order
114
+ pub sheet_names: Vec<String>,
115
+ }
116
+
117
+ /// Email metadata extracted from .eml and .msg files.
118
+ ///
119
+ /// Includes sender/recipient information, message ID, and attachment list.
120
+ #[derive(Debug, Clone, Serialize, Deserialize)]
121
+ pub struct EmailMetadata {
122
+ /// Sender's email address
123
+ #[serde(skip_serializing_if = "Option::is_none")]
124
+ pub from_email: Option<String>,
125
+
126
+ /// Sender's display name
127
+ #[serde(skip_serializing_if = "Option::is_none")]
128
+ pub from_name: Option<String>,
129
+
130
+ /// Primary recipients
131
+ pub to_emails: Vec<String>, // the recipient lists carry no skip attribute, so they are serialized even when empty
132
+ /// CC recipients
133
+ pub cc_emails: Vec<String>,
134
+ /// BCC recipients
135
+ pub bcc_emails: Vec<String>,
136
+
137
+ /// Message-ID header value
138
+ #[serde(skip_serializing_if = "Option::is_none")]
139
+ pub message_id: Option<String>,
140
+
141
+ /// List of attachment filenames
142
+ pub attachments: Vec<String>, // names only; attachment contents are not part of this struct
143
+ }
144
+
145
+ /// Archive (ZIP/TAR/7Z) metadata.
146
+ ///
147
+ /// Extracted from compressed archive files containing file lists and size information.
148
+ #[derive(Debug, Clone, Serialize, Deserialize)]
149
+ pub struct ArchiveMetadata {
150
+ /// Archive format ("ZIP", "TAR", "7Z", etc.)
151
+ pub format: String, // when this struct is the flattened `Metadata::format` variant, this is written as a top-level `format` key
152
+ /// Total number of files in the archive
153
+ pub file_count: usize,
154
+ /// List of file paths within the archive
155
+ pub file_list: Vec<String>,
156
+ /// Total uncompressed size in bytes
157
+ pub total_size: usize, // NOTE(review): `usize` is 32 bits on 32-bit targets — confirm archives over 4 GiB are not a concern there
158
+
159
+ /// Compressed size in bytes (if available)
160
+ #[serde(skip_serializing_if = "Option::is_none")]
161
+ pub compressed_size: Option<usize>,
162
+ }
163
+
164
+ /// Image metadata extracted from image files.
165
+ ///
166
+ /// Includes dimensions, format, and EXIF data.
167
+ #[derive(Debug, Clone, Serialize, Deserialize)]
168
+ pub struct ImageMetadata {
169
+ /// Image width in pixels
170
+ pub width: u32,
171
+ /// Image height in pixels
172
+ pub height: u32,
173
+ /// Image format (e.g., "PNG", "JPEG", "TIFF")
174
+ pub format: String,
175
+ /// EXIF metadata tags
176
+ pub exif: HashMap<String, String>, // string keys to string values; NOTE(review): key naming scheme is not visible here. Always serialized, even when empty
177
+ }
178
+
179
+ /// XML metadata extracted during XML parsing.
180
+ ///
181
+ /// Provides statistics about XML document structure.
182
+ #[derive(Debug, Clone, Serialize, Deserialize)] // both fields have same-named, same-typed counterparts in `XmlExtractionResult`
183
+ pub struct XmlMetadata {
184
+ /// Total number of XML elements processed
185
+ pub element_count: usize,
186
+ /// List of unique element tag names (sorted)
187
+ pub unique_elements: Vec<String>, // NOTE(review): sort order and uniqueness are promised by the docs, not enforced by `Vec` — confirm in the XML extractor
188
+ }
189
+
190
+ /// Text/Markdown metadata.
191
+ ///
192
+ /// Extracted from plain text and Markdown files. Includes word counts and,
193
+ /// for Markdown, structural elements like headers and links.
194
+ #[derive(Debug, Clone, Serialize, Deserialize)]
195
+ pub struct TextMetadata {
196
+ /// Number of lines in the document
197
+ pub line_count: usize,
198
+ /// Number of words
199
+ pub word_count: usize, // NOTE(review): word-splitting rule is not visible here
200
+ /// Number of characters
201
+ pub character_count: usize, // NOTE(review): unclear whether this counts bytes or Unicode characters — confirm
202
+
203
+ /// Markdown headers (headings text only, for Markdown files)
204
+ #[serde(skip_serializing_if = "Option::is_none")]
205
+ pub headers: Option<Vec<String>>,
206
+
207
+ /// Markdown links as (text, url) tuples (for Markdown files)
208
+ #[serde(skip_serializing_if = "Option::is_none")]
209
+ pub links: Option<Vec<(String, String)>>, // serde writes each tuple as a two-element sequence
210
+
211
+ /// Code blocks as (language, code) tuples (for Markdown files)
212
+ #[serde(skip_serializing_if = "Option::is_none")]
213
+ pub code_blocks: Option<Vec<(String, String)>>, // NOTE(review): how a fence without a language tag is represented is not visible here
214
+ }
215
+
216
+ /// HTML metadata extracted from HTML documents.
217
+ ///
218
+ /// Includes meta tags, Open Graph data, Twitter Card metadata, and link relations.
219
+ #[derive(Debug, Clone, Serialize, Deserialize, Default)] // Default: every field is `None`
220
+ pub struct HtmlMetadata {
221
+ #[serde(skip_serializing_if = "Option::is_none")] // same attribute on every field: absent values are left out of serialized output
222
+ pub title: Option<String>, // NOTE(review): presumably the document <title> — confirm in the HTML extractor
223
+
224
+ #[serde(skip_serializing_if = "Option::is_none")]
225
+ pub description: Option<String>, // presumably the `description` meta tag — TODO confirm
226
+
227
+ #[serde(skip_serializing_if = "Option::is_none")]
228
+ pub keywords: Option<String>, // a single string, not a list; NOTE(review): presumably the raw `keywords` meta content — confirm
229
+
230
+ #[serde(skip_serializing_if = "Option::is_none")]
231
+ pub author: Option<String>, // presumably the `author` meta tag — TODO confirm
232
+
233
+ #[serde(skip_serializing_if = "Option::is_none")]
234
+ pub canonical: Option<String>, // presumably the canonical URL — TODO confirm
235
+
236
+ #[serde(skip_serializing_if = "Option::is_none")]
237
+ pub base_href: Option<String>, // presumably the href of a <base> element — TODO confirm
238
+
239
+ #[serde(skip_serializing_if = "Option::is_none")]
240
+ pub og_title: Option<String>, // og_* fields: the Open Graph data mentioned in the struct docs; presumably `og:title` etc. — confirm
241
+
242
+ #[serde(skip_serializing_if = "Option::is_none")]
243
+ pub og_description: Option<String>,
244
+
245
+ #[serde(skip_serializing_if = "Option::is_none")]
246
+ pub og_image: Option<String>,
247
+
248
+ #[serde(skip_serializing_if = "Option::is_none")]
249
+ pub og_url: Option<String>,
250
+
251
+ #[serde(skip_serializing_if = "Option::is_none")]
252
+ pub og_type: Option<String>,
253
+
254
+ #[serde(skip_serializing_if = "Option::is_none")]
255
+ pub og_site_name: Option<String>,
256
+
257
+ #[serde(skip_serializing_if = "Option::is_none")]
258
+ pub twitter_card: Option<String>, // twitter_* fields: the Twitter Card metadata mentioned in the struct docs; presumably `twitter:card` etc. — confirm
259
+
260
+ #[serde(skip_serializing_if = "Option::is_none")]
261
+ pub twitter_title: Option<String>,
262
+
263
+ #[serde(skip_serializing_if = "Option::is_none")]
264
+ pub twitter_description: Option<String>,
265
+
266
+ #[serde(skip_serializing_if = "Option::is_none")]
267
+ pub twitter_image: Option<String>,
268
+
269
+ #[serde(skip_serializing_if = "Option::is_none")]
270
+ pub twitter_site: Option<String>,
271
+
272
+ #[serde(skip_serializing_if = "Option::is_none")]
273
+ pub twitter_creator: Option<String>,
274
+
275
+ #[serde(skip_serializing_if = "Option::is_none")]
276
+ pub link_author: Option<String>, // link_* fields: the link relations mentioned in the struct docs; presumably <link rel="..."> targets — confirm
277
+
278
+ #[serde(skip_serializing_if = "Option::is_none")]
279
+ pub link_license: Option<String>,
280
+
281
+ #[serde(skip_serializing_if = "Option::is_none")]
282
+ pub link_alternate: Option<String>, // a single value; NOTE(review): behaviour with several alternate links is not visible here
283
+ }
284
+
285
+ /// OCR processing metadata.
286
+ ///
287
+ /// Captures information about OCR processing configuration and results.
288
+ #[derive(Debug, Clone, Serialize, Deserialize)]
289
+ pub struct OcrMetadata {
290
+ /// OCR language code(s) used
291
+ pub language: String, // one string even for several codes; NOTE(review): separator convention is not visible here
292
+ /// Tesseract Page Segmentation Mode (PSM)
293
+ pub psm: i32,
294
+ /// Output format (e.g., "text", "hocr")
295
+ pub output_format: String,
296
+ /// Number of tables detected
297
+ pub table_count: usize,
298
+
299
+ #[serde(skip_serializing_if = "Option::is_none")]
300
+ pub table_rows: Option<usize>, // NOTE(review): presumably a detected table's row count; which table when `table_count` > 1 is not visible here
301
+
302
+ #[serde(skip_serializing_if = "Option::is_none")]
303
+ pub table_cols: Option<usize>, // NOTE(review): presumably the matching column count — confirm against the OCR processor
304
+ }
305
+
306
+ /// Error metadata (for batch operations).
307
+ #[derive(Debug, Clone, Serialize, Deserialize)] // stored in `Metadata::error`
308
+ pub struct ErrorMetadata {
309
+ pub error_type: String, // NOTE(review): presumably an error kind/variant name — confirm against the batch code
310
+ pub message: String, // NOTE(review): presumably the human-readable error text — confirm
311
+ }
312
+
313
+ /// Extracted table structure.
314
+ ///
315
+ /// Represents a table detected and extracted from a document (PDF, image, etc.).
316
+ /// Tables are converted to both structured cell data and Markdown format.
317
+ #[derive(Debug, Clone, Serialize, Deserialize)]
318
+ pub struct Table {
319
+ /// Table cells as a 2D vector (rows × columns)
320
+ pub cells: Vec<Vec<String>>, // outer Vec = rows, inner Vec = cells of one row; the type does not force rows to have equal length
321
+ /// Markdown representation of the table
322
+ pub markdown: String,
323
+ /// Page number where the table was found (1-indexed)
324
+ pub page_number: usize, // not optional; NOTE(review): value used for sources without pages is not visible here
325
+ }
326
+
327
+ /// A text chunk with optional embedding and metadata.
328
+ ///
329
+ /// Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
330
+ /// contains the text content, optional embedding vector (if embedding generation
331
+ /// is configured), and metadata about its position in the document.
332
+ #[derive(Debug, Clone, Serialize, Deserialize)] // collected in `ExtractionResult::chunks`
333
+ pub struct Chunk {
334
+ /// The text content of this chunk.
335
+ pub content: String,
336
+
337
+ /// Optional embedding vector for this chunk.
338
+ ///
339
+ /// Only populated when `EmbeddingConfig` is provided in chunking configuration.
340
+ /// The dimensionality depends on the chosen embedding model.
341
+ #[serde(skip_serializing_if = "Option::is_none")] // left out of serialized output when no embedding is present
342
+ pub embedding: Option<Vec<f32>>,
343
+
344
+ /// Metadata about this chunk's position and properties.
345
+ pub metadata: ChunkMetadata,
346
+ }
347
+
348
+ /// Metadata about a chunk's position in the original document.
349
+ #[derive(Debug, Clone, Serialize, Deserialize)]
350
+ pub struct ChunkMetadata {
351
+ /// Character offset where this chunk starts in the original text.
352
+ pub char_start: usize, // NOTE(review): unclear whether offsets are byte offsets or character indices — confirm in the chunker
353
+
354
+ /// Character offset where this chunk ends in the original text.
355
+ pub char_end: usize, // NOTE(review): inclusive vs. exclusive end is not stated — confirm
356
+
357
+ /// Number of tokens in this chunk (if available).
358
+ ///
359
+ /// This is calculated by the embedding model's tokenizer if embeddings are enabled.
360
+ #[serde(skip_serializing_if = "Option::is_none")]
361
+ pub token_count: Option<usize>,
362
+
363
+ /// Zero-based index of this chunk in the document.
364
+ pub chunk_index: usize,
365
+
366
+ /// Total number of chunks in the document.
367
+ pub total_chunks: usize, // NOTE(review): presumably `chunk_index < total_chunks` always holds — not enforced by the type
368
+ }
369
+
370
+ /// Extracted image from a document.
371
+ ///
372
+ /// Contains raw image data, metadata, and optional nested OCR results.
373
+ /// Raw bytes allow cross-language compatibility - users can convert to
374
+ /// PIL.Image (Python), Sharp (Node.js), or other formats as needed.
375
+ #[derive(Debug, Clone, Serialize, Deserialize)]
376
+ pub struct ExtractedImage {
377
+ /// Raw image data (PNG, JPEG, WebP, etc. bytes)
378
+ pub data: Vec<u8>, // NOTE(review): no bytes/base64 attribute here, so serde writes this as a sequence of numbers — confirm that is intended for JSON output
379
+
380
+ /// Image format (e.g., "jpeg", "png", "webp")
381
+ pub format: String,
382
+
383
+ /// Zero-indexed position of this image in the document/page
384
+ pub image_index: usize,
385
+
386
+ /// Page/slide number where image was found (1-indexed)
387
+ #[serde(skip_serializing_if = "Option::is_none")]
388
+ pub page_number: Option<usize>,
389
+
390
+ /// Image width in pixels
391
+ #[serde(skip_serializing_if = "Option::is_none")]
392
+ pub width: Option<u32>,
393
+
394
+ /// Image height in pixels
395
+ #[serde(skip_serializing_if = "Option::is_none")]
396
+ pub height: Option<u32>,
397
+
398
+ /// Colorspace information (e.g., "RGB", "CMYK", "Gray")
399
+ #[serde(skip_serializing_if = "Option::is_none")]
400
+ pub colorspace: Option<String>,
401
+
402
+ /// Bits per color component (e.g., 8, 16)
403
+ #[serde(skip_serializing_if = "Option::is_none")]
404
+ pub bits_per_component: Option<u32>,
405
+
406
+ /// Whether this image is a mask image
407
+ #[serde(default)] // a missing `is_mask` key deserializes to `false`; the field is always serialized
408
+ pub is_mask: bool,
409
+
410
+ /// Optional description of the image
411
+ #[serde(skip_serializing_if = "Option::is_none")]
412
+ pub description: Option<String>,
413
+
414
+ /// Nested OCR extraction result (if image was OCRed)
415
+ ///
416
+ /// When OCR is performed on this image, the result is embedded here
417
+ /// rather than in a separate collection, making the relationship explicit.
418
+ #[serde(skip_serializing_if = "Option::is_none")]
419
+ pub ocr_result: Option<Box<ExtractionResult>>, // boxed: the nested result lives on the heap; `ExtractionResult::images` in turn holds `ExtractedImage`s
420
+ }
421
+
422
+ /// Excel workbook representation.
423
+ ///
424
+ /// Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
425
+ /// extracted content and metadata.
426
+ #[derive(Debug, Clone, Serialize, Deserialize)]
427
+ pub struct ExcelWorkbook {
428
+ /// All sheets in the workbook
429
+ pub sheets: Vec<ExcelSheet>, // NOTE(review): presumably in workbook order — confirm in the Excel extractor
430
+ /// Workbook-level metadata (author, creation date, etc.)
431
+ pub metadata: HashMap<String, String>, // free-form string pairs, unlike the typed `ExcelMetadata`; key names are not visible here
432
+ }
433
+
434
+ /// Single Excel worksheet.
435
+ ///
436
+ /// Represents one sheet from an Excel workbook with its content
437
+ /// converted to Markdown format and dimensional statistics.
438
+ #[derive(Debug, Clone, Serialize, Deserialize)] // element type of `ExcelWorkbook::sheets`
439
+ pub struct ExcelSheet {
440
+ /// Sheet name as it appears in Excel
441
+ pub name: String,
442
+ /// Sheet content converted to Markdown tables
443
+ pub markdown: String,
444
+ /// Number of rows
445
+ pub row_count: usize, // NOTE(review): whether empty trailing rows are counted is not visible here
446
+ /// Number of columns
447
+ pub col_count: usize, // NOTE(review): whether empty trailing columns are counted is not visible here
448
+ /// Total number of non-empty cells
449
+ pub cell_count: usize,
450
+ }
451
+
452
+ /// XML extraction result.
453
+ ///
454
+ /// Carries the text extracted from an XML file plus simple
455
+ /// structural statistics (element count and distinct element names).
456
+ #[derive(Debug, Clone, Serialize, Deserialize)]
457
+ pub struct XmlExtractionResult {
458
+ /// Extracted text content, with the XML structure filtered out.
459
+ pub content: String,
460
+ /// Total number of XML elements processed.
461
+ pub element_count: usize,
462
+ /// Unique element names found in the document (sorted).
463
+ pub unique_elements: Vec<String>,
464
+ }
465
+
466
+ /// Plain text and Markdown extraction result.
467
+ ///
468
+ /// Holds the extracted text and its line/word/character counts. The three
469
+ /// `Option` fields are documented as Markdown-only and are omitted from serialized output when `None`.
470
+ #[derive(Debug, Clone, Serialize, Deserialize)]
471
+ pub struct TextExtractionResult {
472
+ /// Extracted text content.
473
+ pub content: String,
474
+ /// Number of lines in the content.
475
+ pub line_count: usize,
476
+ /// Number of words in the content.
477
+ pub word_count: usize,
478
+ /// Number of characters in the content.
479
+ pub character_count: usize,
480
+ /// Markdown headers, text only (Markdown files only). Skipped during serialization when `None`.
481
+ #[serde(skip_serializing_if = "Option::is_none")]
482
+ pub headers: Option<Vec<String>>,
483
+ /// Markdown links as `(text, URL)` tuples (Markdown files only). Skipped during serialization when `None`.
484
+ #[serde(skip_serializing_if = "Option::is_none")]
485
+ pub links: Option<Vec<(String, String)>>,
486
+ /// Code blocks as `(language, code)` tuples (Markdown files only). Skipped during serialization when `None`.
487
+ #[serde(skip_serializing_if = "Option::is_none")]
488
+ pub code_blocks: Option<Vec<(String, String)>>,
489
+ }
490
+
491
+ /// PowerPoint (PPTX) extraction result.
492
+ ///
493
+ /// Holds the extracted slide text, presentation metadata, slide/image/table counts, and the extracted images.
494
+ #[derive(Debug, Clone, Serialize, Deserialize)]
495
+ pub struct PptxExtractionResult {
496
+ /// Extracted text content from all slides.
497
+ pub content: String,
498
+ /// Presentation-level metadata (see [`PptxMetadata`]).
499
+ pub metadata: PptxMetadata,
500
+ /// Total number of slides.
501
+ pub slide_count: usize,
502
+ /// Total number of embedded images.
503
+ pub image_count: usize,
504
+ /// Total number of tables (only the count is stored in this struct).
505
+ pub table_count: usize,
506
+ /// Images extracted from the presentation.
507
+ pub images: Vec<ExtractedImage>,
508
+ }
509
+
510
+ /// PowerPoint presentation metadata.
511
+ ///
512
+ /// Document-level metadata extracted from the PPTX file; each textual field is optional.
513
+ #[derive(Debug, Clone, Serialize, Deserialize)]
514
+ pub struct PptxMetadata {
515
+ /// Presentation title, if present.
516
+ pub title: Option<String>,
517
+ /// Author name, if present.
518
+ pub author: Option<String>,
519
+ /// Description/comments, if present.
520
+ pub description: Option<String>,
521
+ /// Summary text, if present.
522
+ pub summary: Option<String>,
523
+ /// Names of the fonts used in the presentation.
524
+ pub fonts: Vec<String>,
525
+ }
526
+
527
+ /// Email extraction result.
528
+ ///
529
+ /// Complete representation of an extracted email message (.eml or .msg):
530
+ /// addressing headers, body variants, attachments, and remaining metadata.
531
+ #[derive(Debug, Clone, Serialize, Deserialize)]
532
+ pub struct EmailExtractionResult {
533
+ /// Email subject line, if present.
534
+ pub subject: Option<String>,
535
+ /// Sender email address, if present.
536
+ pub from_email: Option<String>,
537
+ /// Primary ("To") recipient email addresses.
538
+ pub to_emails: Vec<String>,
539
+ /// CC recipient email addresses.
540
+ pub cc_emails: Vec<String>,
541
+ /// BCC recipient email addresses.
542
+ pub bcc_emails: Vec<String>,
543
+ /// Email date/timestamp, kept as a string.
544
+ pub date: Option<String>,
545
+ /// Message-ID header value, if present.
546
+ pub message_id: Option<String>,
547
+ /// Plain text version of the email body, if present.
548
+ pub plain_text: Option<String>,
549
+ /// HTML version of the email body, if present.
550
+ pub html_content: Option<String>,
551
+ /// Cleaned/processed text content (always present, unlike the optional body variants).
552
+ pub cleaned_text: String,
553
+ /// Attachments found in the email (see [`EmailAttachment`]).
554
+ pub attachments: Vec<EmailAttachment>,
555
+ /// Additional email headers and metadata as string key/value pairs.
556
+ pub metadata: HashMap<String, String>,
557
+ }
558
+
559
+ /// Email attachment representation.
560
+ ///
561
+ /// Describes one attachment: its metadata and, when extracted, its raw bytes.
562
+ #[derive(Debug, Clone, Serialize, Deserialize)]
563
+ pub struct EmailAttachment {
564
+ /// Attachment name (from the Content-Disposition header), if present.
565
+ pub name: Option<String>,
566
+ /// Filename of the attachment, if present.
567
+ pub filename: Option<String>,
568
+ /// MIME type of the attachment, if present.
569
+ pub mime_type: Option<String>,
570
+ /// Size in bytes, if known.
571
+ pub size: Option<usize>,
572
+ /// Whether this attachment is an image.
573
+ pub is_image: bool,
574
+ /// Raw attachment bytes; `None` when the data was not extracted.
575
+ pub data: Option<Vec<u8>>,
576
+ }
577
+
578
+ /// OCR extraction result.
579
+ ///
580
+ /// Outcome of running OCR on an image or scanned document:
581
+ /// the recognized text, free-form metadata, and any detected tables.
582
+ #[derive(Debug, Clone, Serialize, Deserialize)]
583
+ pub struct OcrExtractionResult {
584
+ /// Recognized text content.
585
+ pub content: String,
586
+ /// Original MIME type of the processed image.
587
+ pub mime_type: String,
588
+ /// OCR processing metadata as arbitrary JSON values (confidence scores, language, etc.).
589
+ pub metadata: HashMap<String, serde_json::Value>,
590
+ /// Tables detected and extracted via OCR (see [`OcrTable`]).
591
+ pub tables: Vec<OcrTable>,
592
+ }
593
+
594
+ /// Table detected via OCR.
595
+ ///
596
+ /// Represents a table structure recognized during OCR processing, in both cell-grid and Markdown form.
597
+ #[derive(Debug, Clone, Serialize, Deserialize)]
598
+ pub struct OcrTable {
599
+ /// Table cells as a 2D vector (rows × columns); each inner `Vec` is one row.
600
+ pub cells: Vec<Vec<String>>,
601
+ /// Markdown representation of the table.
602
+ pub markdown: String,
603
+ /// Page number where the table was found (1-indexed).
604
+ pub page_number: usize,
605
+ }
606
+
607
+ /// Image preprocessing configuration for OCR.
608
+ ///
609
+ /// These settings control how images are preprocessed before OCR to improve
610
+ /// text recognition quality. Different preprocessing strategies work better
611
+ /// for different document types. With `#[serde(default)]`, fields missing from deserialized input take their `Default` values.
612
+ #[derive(Debug, Clone, Serialize, Deserialize)]
613
+ #[serde(default)]
614
+ pub struct ImagePreprocessingConfig {
615
+ /// Target DPI for the image (300 is standard, 600 for small text). Default: 300.
616
+ pub target_dpi: i32,
617
+
618
+ /// Auto-detect and correct image rotation. Default: `true`.
619
+ pub auto_rotate: bool,
620
+
621
+ /// Correct skew (tilted images). Default: `true`.
622
+ pub deskew: bool,
623
+
624
+ /// Remove noise from the image. Default: `false`.
625
+ pub denoise: bool,
626
+
627
+ /// Enhance contrast for better text visibility. Default: `false`.
628
+ pub contrast_enhance: bool,
629
+
630
+ /// Binarization method: "otsu", "sauvola", "adaptive". Default: "otsu".
631
+ pub binarization_method: String,
632
+
633
+ /// Invert colors (white text on black → black on white). Default: `false`.
634
+ pub invert_colors: bool,
635
+ }
636
+
637
+ impl Default for ImagePreprocessingConfig { // baseline preprocessing settings
638
+ fn default() -> ImagePreprocessingConfig {
639
+ ImagePreprocessingConfig {
640
+ target_dpi: 300, // the "standard" DPI named in the field docs
641
+ auto_rotate: true, // orientation fixes are on by default...
642
+ deskew: true, // ...and so is skew correction
643
+ denoise: false, // pixel-altering filters are opt-in
644
+ contrast_enhance: false,
645
+ binarization_method: String::from("otsu"),
646
+ invert_colors: false,
647
+ }
648
+ }
649
+ }
650
+
651
+ /// Tesseract OCR configuration.
652
+ ///
653
+ /// Provides fine-grained control over Tesseract OCR engine parameters.
654
+ /// Most users can use the defaults, but these settings allow optimization
655
+ /// for specific document types (invoices, handwriting, etc.). With `#[serde(default)]`, fields missing from deserialized input take their `Default` values.
656
+ #[derive(Debug, Clone, Serialize, Deserialize)]
657
+ #[serde(default)]
658
+ pub struct TesseractConfig {
659
+ /// Language code (e.g., "eng", "deu", "fra"). Default: "eng".
660
+ pub language: String,
661
+
662
+ /// Page Segmentation Mode (0-13). Default: 3.
663
+ ///
664
+ /// Common values:
665
+ /// - 3: Fully automatic page segmentation (default)
666
+ /// - 6: Assume a single uniform block of text
667
+ /// - 11: Sparse text with no particular order
668
+ pub psm: i32,
669
+
670
+ /// Output format ("text" or "markdown"). Default: "markdown".
671
+ pub output_format: String,
672
+
673
+ /// OCR Engine Mode (0-3). Default: 3.
674
+ ///
675
+ /// - 0: Legacy engine only
676
+ /// - 1: Neural nets (LSTM) only (usually best)
677
+ /// - 2: Legacy + LSTM
678
+ /// - 3: Default (based on what's available)
679
+ pub oem: i32,
680
+
681
+ /// Minimum confidence threshold (0.0-100.0). Default: 0.0.
682
+ ///
683
+ /// Words with confidence below this threshold may be rejected or flagged.
684
+ pub min_confidence: f64,
685
+
686
+ /// Image preprocessing configuration. Default: `None`; omitted from serialized output when `None`.
687
+ ///
688
+ /// Controls how images are preprocessed before OCR. Can significantly
689
+ /// improve quality for scanned documents or low-quality images.
690
+ #[serde(skip_serializing_if = "Option::is_none")]
691
+ pub preprocessing: Option<ImagePreprocessingConfig>,
692
+
693
+ /// Enable automatic table detection and reconstruction. Default: `true`.
694
+ pub enable_table_detection: bool,
695
+
696
+ /// Minimum confidence threshold for table detection (0.0-1.0). Default: 0.0.
697
+ pub table_min_confidence: f64,
698
+
699
+ /// Column threshold for table detection (pixels). Default: 50.
700
+ pub table_column_threshold: i32,
701
+
702
+ /// Row threshold ratio for table detection (0.0-1.0). Default: 0.5.
703
+ pub table_row_threshold_ratio: f64,
704
+
705
+ /// Enable OCR result caching. Default: `true`.
706
+ pub use_cache: bool,
707
+
708
+ /// Use pre-adapted templates for character classification. Default: `true`.
709
+ pub classify_use_pre_adapted_templates: bool,
710
+
711
+ /// Enable N-gram language model. Default: `false`.
712
+ pub language_model_ngram_on: bool,
713
+
714
+ /// Don't reject good words during block-level processing. Default: `true`.
715
+ pub tessedit_dont_blkrej_good_wds: bool,
716
+
717
+ /// Don't reject good words during row-level processing. Default: `true`.
718
+ pub tessedit_dont_rowrej_good_wds: bool,
719
+
720
+ /// Enable dictionary correction. Default: `true`.
721
+ pub tessedit_enable_dict_correction: bool,
722
+
723
+ /// Whitelist of allowed characters (empty = all allowed). Default: empty.
724
+ pub tessedit_char_whitelist: String,
725
+
726
+ /// Blacklist of forbidden characters (empty = none forbidden). Default: empty.
727
+ pub tessedit_char_blacklist: String,
728
+
729
+ /// Use primary language params model. Default: `true`.
730
+ pub tessedit_use_primary_params_model: bool,
731
+
732
+ /// Variable-width space detection. Default: `true`.
733
+ pub textord_space_size_is_variable: bool,
734
+
735
+ /// Use adaptive thresholding method. Default: `false`.
736
+ pub thresholding_method: bool,
737
+ }
738
+
739
+ impl Default for TesseractConfig { // baseline Tesseract settings
740
+ fn default() -> TesseractConfig {
741
+ TesseractConfig {
742
+ language: String::from("eng"),
743
+ psm: 3, // PSM 3: fully automatic page segmentation
744
+ output_format: String::from("markdown"),
745
+ oem: 3, // OEM 3: engine chosen from what is available
746
+ min_confidence: 0.0, // lowest value of the documented 0.0-100.0 range
747
+ preprocessing: None, // no preprocessing override by default
748
+ enable_table_detection: true,
749
+ table_min_confidence: 0.0,
750
+ table_column_threshold: 50, // pixels
751
+ table_row_threshold_ratio: 0.5,
752
+ use_cache: true,
753
+ classify_use_pre_adapted_templates: true,
754
+ language_model_ngram_on: false,
755
+ tessedit_dont_blkrej_good_wds: true,
756
+ tessedit_dont_rowrej_good_wds: true,
757
+ tessedit_enable_dict_correction: true,
758
+ tessedit_char_whitelist: String::new(), // empty = all characters allowed
759
+ tessedit_char_blacklist: String::new(), // empty = no characters forbidden
760
+ tessedit_use_primary_params_model: true,
761
+ textord_space_size_is_variable: true,
762
+ thresholding_method: false,
763
+ }
764
+ }
765
+ }
766
+
767
+ /// Image preprocessing metadata.
768
+ ///
769
+ /// Records the transformations applied to an image during OCR preprocessing,
770
+ /// including DPI normalization, resizing, and resampling.
771
+ #[derive(Debug, Clone, Serialize, Deserialize)]
772
+ pub struct ImagePreprocessingMetadata {
773
+ /// Original image dimensions as `(width, height)` in pixels.
774
+ pub original_dimensions: (usize, usize),
775
+ /// Original image DPI as `(horizontal, vertical)`.
776
+ pub original_dpi: (f64, f64),
777
+ /// Target DPI taken from the configuration.
778
+ pub target_dpi: i32,
779
+ /// Scaling factor applied to the image.
780
+ pub scale_factor: f64,
781
+ /// Whether the DPI was auto-adjusted based on content.
782
+ pub auto_adjusted: bool,
783
+ /// Final DPI after processing.
784
+ pub final_dpi: i32,
785
+ /// New dimensions after resizing; `None` if the image was not resized.
786
+ pub new_dimensions: Option<(usize, usize)>,
787
+ /// Name of the resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.).
788
+ pub resample_method: String,
789
+ /// Whether dimensions were clamped to `max_image_dimension`.
790
+ pub dimension_clamped: bool,
791
+ /// Calculated optimal DPI (only when `auto_adjust_dpi` is enabled).
792
+ pub calculated_dpi: Option<i32>,
793
+ /// Whether the resize was skipped (dimensions already optimal).
794
+ pub skipped_resize: bool,
795
+ /// Error message if the resize failed; `None` otherwise.
796
+ pub resize_error: Option<String>,
797
+ }
798
+
799
+ /// Image extraction configuration (internal use).
800
+ ///
801
+ /// **Note:** This is an internal type used for image preprocessing (DPI and size limits).
802
+ /// For the main extraction configuration, see [`crate::core::config::ExtractionConfig`].
803
+ #[derive(Debug, Clone, Serialize, Deserialize)]
804
+ pub struct ExtractionConfig {
805
+ /// Target DPI for image normalization. Default: 300.
806
+ pub target_dpi: i32,
807
+ /// Maximum image dimension (width or height). Default: 4096.
808
+ pub max_image_dimension: i32,
809
+ /// Whether to auto-adjust DPI based on content. Default: `true`.
810
+ pub auto_adjust_dpi: bool,
811
+ /// Minimum DPI threshold. Default: 72.
812
+ pub min_dpi: i32,
813
+ /// Maximum DPI threshold. Default: 600.
814
+ pub max_dpi: i32,
815
+ }
816
+
817
+ impl Default for ExtractionConfig { // baseline DPI and size limits
818
+ fn default() -> ExtractionConfig {
819
+ ExtractionConfig {
820
+ target_dpi: 300,
821
+ max_image_dimension: 4_096, // cap applied to width or height
822
+ auto_adjust_dpi: true,
823
+ min_dpi: 72, // lower DPI threshold
824
+ max_dpi: 600, // upper DPI threshold
825
+ }
826
+ }
827
+ }
828
+
829
+ /// Cache statistics.
830
+ ///
831
+ /// Summarizes the extraction result cache: file count, total size,
832
+ /// available disk space, and the ages of the oldest and newest entries.
833
+ #[derive(Debug, Clone, Serialize, Deserialize)]
834
+ pub struct CacheStats {
835
+ /// Total number of cached files.
836
+ pub total_files: usize,
837
+ /// Total cache size in megabytes.
838
+ pub total_size_mb: f64,
839
+ /// Available disk space in megabytes.
840
+ pub available_space_mb: f64,
841
+ /// Age of the oldest cached file in days.
842
+ pub oldest_file_age_days: f64,
843
+ /// Age of the newest cached file in days.
844
+ pub newest_file_age_days: f64,
845
+ }
846
+
847
+ /// Pandoc extraction result.
848
+ ///
849
+ /// Outcome of extracting content from a document using Pandoc:
850
+ /// the text plus whatever metadata Pandoc was able to extract.
851
+ #[derive(Debug, Clone, Serialize, Deserialize)]
852
+ pub struct PandocExtractionResult {
853
+ /// Extracted text content.
854
+ pub content: String,
855
+ /// Metadata extracted by Pandoc as arbitrary JSON values (varies by format).
856
+ pub metadata: HashMap<String, serde_json::Value>,
857
+ }
858
+
859
+ /// LibreOffice conversion result.
860
+ ///
861
+ /// Outcome of converting a legacy office document (e.g., .doc, .ppt)
862
+ /// to a modern format using LibreOffice: the converted bytes and format identifiers.
863
+ #[derive(Debug, Clone, Serialize, Deserialize)]
864
+ pub struct LibreOfficeConversionResult {
865
+ /// Raw bytes of the converted file.
866
+ pub converted_bytes: Vec<u8>,
867
+ /// Original format identifier.
868
+ pub original_format: String,
869
+ /// Target format identifier.
870
+ pub target_format: String,
871
+ /// MIME type of the converted (target) file.
872
+ pub target_mime: String,
873
+ }