mindee-lite 5.0.0.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (510) hide show
  1. checksums.yaml +7 -0
  2. data/.editorconfig +24 -0
  3. data/.gitattributes +14 -0
  4. data/.gitignore +76 -0
  5. data/.gitmodules +3 -0
  6. data/.pre-commit-config.yaml +36 -0
  7. data/.rubocop.yml +49 -0
  8. data/.yardopts +4 -0
  9. data/CHANGELOG.md +515 -0
  10. data/CODE_OF_CONDUCT.md +129 -0
  11. data/CONTRIBUTING.md +107 -0
  12. data/Gemfile +14 -0
  13. data/LICENSE +21 -0
  14. data/README.md +42 -0
  15. data/Rakefile +40 -0
  16. data/Steepfile +30 -0
  17. data/bin/console +14 -0
  18. data/bin/mindee.rb +30 -0
  19. data/bin/v1/parser.rb +153 -0
  20. data/bin/v1/products.rb +88 -0
  21. data/bin/v2/parser.rb +235 -0
  22. data/bin/v2/products.rb +34 -0
  23. data/docs/code_samples/bank_account_details_v1.txt +24 -0
  24. data/docs/code_samples/bank_account_details_v2.txt +24 -0
  25. data/docs/code_samples/bank_statement_fr_v2_async.txt +24 -0
  26. data/docs/code_samples/barcode_reader_v1.txt +24 -0
  27. data/docs/code_samples/cropper_v1.txt +21 -0
  28. data/docs/code_samples/default.txt +30 -0
  29. data/docs/code_samples/default_async.txt +29 -0
  30. data/docs/code_samples/expense_receipts_v5.txt +25 -0
  31. data/docs/code_samples/expense_receipts_v5_async.txt +24 -0
  32. data/docs/code_samples/financial_document_v1.txt +25 -0
  33. data/docs/code_samples/financial_document_v1_async.txt +24 -0
  34. data/docs/code_samples/idcard_fr_v1.txt +24 -0
  35. data/docs/code_samples/idcard_fr_v2.txt +24 -0
  36. data/docs/code_samples/international_id_v2_async.txt +24 -0
  37. data/docs/code_samples/invoice_splitter_v1_async.txt +24 -0
  38. data/docs/code_samples/invoices_v4.txt +25 -0
  39. data/docs/code_samples/invoices_v4_async.txt +24 -0
  40. data/docs/code_samples/multi_receipts_detector_v1.txt +24 -0
  41. data/docs/code_samples/passport_v1.txt +24 -0
  42. data/docs/code_samples/resume_v1_async.txt +24 -0
  43. data/docs/code_samples/v2_classification.txt +30 -0
  44. data/docs/code_samples/v2_crop.txt +30 -0
  45. data/docs/code_samples/v2_extraction.txt +42 -0
  46. data/docs/code_samples/v2_extraction_webhook.txt +45 -0
  47. data/docs/code_samples/v2_ocr.txt +30 -0
  48. data/docs/code_samples/v2_split.txt +30 -0
  49. data/docs/code_samples/workflow_execution.txt +28 -0
  50. data/docs/code_samples/workflow_polling.txt +35 -0
  51. data/examples/auto_invoice_splitter_extraction.rb +48 -0
  52. data/examples/auto_multi_receipts_detector_extraction.rb +30 -0
  53. data/lib/mindee/dependency.rb +29 -0
  54. data/lib/mindee/error/mindee_error.rb +17 -0
  55. data/lib/mindee/error/mindee_http_error.rb +36 -0
  56. data/lib/mindee/error/mindee_http_error_v2.rb +45 -0
  57. data/lib/mindee/error/mindee_http_unknown_error_v2.rb +18 -0
  58. data/lib/mindee/error/mindee_input_error.rb +30 -0
  59. data/lib/mindee/error.rb +6 -0
  60. data/lib/mindee/geometry/min_max.rb +23 -0
  61. data/lib/mindee/geometry/point.rb +41 -0
  62. data/lib/mindee/geometry/polygon.rb +37 -0
  63. data/lib/mindee/geometry/quadrilateral.rb +50 -0
  64. data/lib/mindee/geometry/utils.rb +88 -0
  65. data/lib/mindee/geometry.rb +7 -0
  66. data/lib/mindee/http/.rubocop.yml +7 -0
  67. data/lib/mindee/http/http_error_handler.rb +106 -0
  68. data/lib/mindee/http/response_validation.rb +81 -0
  69. data/lib/mindee/http.rb +3 -0
  70. data/lib/mindee/image/extracted_image.rb +89 -0
  71. data/lib/mindee/image/image_compressor.rb +29 -0
  72. data/lib/mindee/image/image_extractor.rb +118 -0
  73. data/lib/mindee/image/image_utils.rb +165 -0
  74. data/lib/mindee/image.rb +6 -0
  75. data/lib/mindee/input/base_parameters.rb +149 -0
  76. data/lib/mindee/input/local_response.rb +80 -0
  77. data/lib/mindee/input/polling_options.rb +26 -0
  78. data/lib/mindee/input/sources/base64_input_source.rb +31 -0
  79. data/lib/mindee/input/sources/bytes_input_source.rb +21 -0
  80. data/lib/mindee/input/sources/file_input_source.rb +20 -0
  81. data/lib/mindee/input/sources/local_input_source.rb +216 -0
  82. data/lib/mindee/input/sources/path_input_source.rb +20 -0
  83. data/lib/mindee/input/sources/url_input_source.rb +130 -0
  84. data/lib/mindee/input/sources.rb +8 -0
  85. data/lib/mindee/input.rb +4 -0
  86. data/lib/mindee/logging/logger.rb +24 -0
  87. data/lib/mindee/logging.rb +3 -0
  88. data/lib/mindee/page_options.rb +24 -0
  89. data/lib/mindee/pdf/extracted_pdf.rb +70 -0
  90. data/lib/mindee/pdf/pdf_compressor.rb +121 -0
  91. data/lib/mindee/pdf/pdf_extractor.rb +121 -0
  92. data/lib/mindee/pdf/pdf_processor.rb +91 -0
  93. data/lib/mindee/pdf/pdf_tools.rb +201 -0
  94. data/lib/mindee/pdf.rb +7 -0
  95. data/lib/mindee/v1/client.rb +490 -0
  96. data/lib/mindee/v1/extraction/multi_receipts_extractor.rb +32 -0
  97. data/lib/mindee/v1/extraction.rb +3 -0
  98. data/lib/mindee/v1/http/.rubocop.yml +7 -0
  99. data/lib/mindee/v1/http/endpoint.rb +221 -0
  100. data/lib/mindee/v1/http/workflow_endpoint.rb +93 -0
  101. data/lib/mindee/v1/http.rb +4 -0
  102. data/lib/mindee/v1/parsing/common/api_request.rb +38 -0
  103. data/lib/mindee/v1/parsing/common/api_response.rb +63 -0
  104. data/lib/mindee/v1/parsing/common/document.rb +86 -0
  105. data/lib/mindee/v1/parsing/common/execution.rb +78 -0
  106. data/lib/mindee/v1/parsing/common/execution_file.rb +26 -0
  107. data/lib/mindee/v1/parsing/common/execution_priority.rb +38 -0
  108. data/lib/mindee/v1/parsing/common/extras/cropper_extra.rb +32 -0
  109. data/lib/mindee/v1/parsing/common/extras/extras.rb +62 -0
  110. data/lib/mindee/v1/parsing/common/extras/full_text_ocr_extra.rb +35 -0
  111. data/lib/mindee/v1/parsing/common/extras/rag_extra.rb +28 -0
  112. data/lib/mindee/v1/parsing/common/extras.rb +6 -0
  113. data/lib/mindee/v1/parsing/common/inference.rb +69 -0
  114. data/lib/mindee/v1/parsing/common/job.rb +48 -0
  115. data/lib/mindee/v1/parsing/common/ocr/mvision_v1.rb +52 -0
  116. data/lib/mindee/v1/parsing/common/ocr/ocr.rb +180 -0
  117. data/lib/mindee/v1/parsing/common/ocr.rb +3 -0
  118. data/lib/mindee/v1/parsing/common/orientation.rb +28 -0
  119. data/lib/mindee/v1/parsing/common/page.rb +49 -0
  120. data/lib/mindee/v1/parsing/common/prediction.rb +19 -0
  121. data/lib/mindee/v1/parsing/common/product.rb +26 -0
  122. data/lib/mindee/v1/parsing/common/workflow_response.rb +30 -0
  123. data/lib/mindee/v1/parsing/common.rb +15 -0
  124. data/lib/mindee/v1/parsing/standard/abstract_field.rb +74 -0
  125. data/lib/mindee/v1/parsing/standard/address_field.rb +51 -0
  126. data/lib/mindee/v1/parsing/standard/amount_field.rb +28 -0
  127. data/lib/mindee/v1/parsing/standard/base_field.rb +30 -0
  128. data/lib/mindee/v1/parsing/standard/boolean_field.rb +29 -0
  129. data/lib/mindee/v1/parsing/standard/classification_field.rb +18 -0
  130. data/lib/mindee/v1/parsing/standard/company_registration_field.rb +45 -0
  131. data/lib/mindee/v1/parsing/standard/date_field.rb +40 -0
  132. data/lib/mindee/v1/parsing/standard/feature_field.rb +26 -0
  133. data/lib/mindee/v1/parsing/standard/locale_field.rb +52 -0
  134. data/lib/mindee/v1/parsing/standard/payment_details_field.rb +44 -0
  135. data/lib/mindee/v1/parsing/standard/position_field.rb +61 -0
  136. data/lib/mindee/v1/parsing/standard/string_field.rb +26 -0
  137. data/lib/mindee/v1/parsing/standard/tax_field.rb +110 -0
  138. data/lib/mindee/v1/parsing/standard.rb +15 -0
  139. data/lib/mindee/v1/parsing/universal/universal_list_field.rb +60 -0
  140. data/lib/mindee/v1/parsing/universal/universal_object_field.rb +123 -0
  141. data/lib/mindee/v1/parsing/universal.rb +4 -0
  142. data/lib/mindee/v1/parsing.rb +5 -0
  143. data/lib/mindee/v1/product/.rubocop.yml +12 -0
  144. data/lib/mindee/v1/product/barcode_reader/barcode_reader_v1.rb +47 -0
  145. data/lib/mindee/v1/product/barcode_reader/barcode_reader_v1_document.rb +47 -0
  146. data/lib/mindee/v1/product/barcode_reader/barcode_reader_v1_page.rb +38 -0
  147. data/lib/mindee/v1/product/cropper/cropper_v1.rb +47 -0
  148. data/lib/mindee/v1/product/cropper/cropper_v1_document.rb +15 -0
  149. data/lib/mindee/v1/product/cropper/cropper_v1_page.rb +55 -0
  150. data/lib/mindee/v1/product/financial_document/financial_document_v1.rb +47 -0
  151. data/lib/mindee/v1/product/financial_document/financial_document_v1_document.rb +329 -0
  152. data/lib/mindee/v1/product/financial_document/financial_document_v1_line_item.rb +124 -0
  153. data/lib/mindee/v1/product/financial_document/financial_document_v1_line_items.rb +64 -0
  154. data/lib/mindee/v1/product/financial_document/financial_document_v1_page.rb +38 -0
  155. data/lib/mindee/v1/product/fr/bank_account_details/bank_account_details_v1.rb +49 -0
  156. data/lib/mindee/v1/product/fr/bank_account_details/bank_account_details_v1_document.rb +49 -0
  157. data/lib/mindee/v1/product/fr/bank_account_details/bank_account_details_v1_page.rb +40 -0
  158. data/lib/mindee/v1/product/fr/bank_account_details/bank_account_details_v2.rb +49 -0
  159. data/lib/mindee/v1/product/fr/bank_account_details/bank_account_details_v2_bban.rb +63 -0
  160. data/lib/mindee/v1/product/fr/bank_account_details/bank_account_details_v2_document.rb +60 -0
  161. data/lib/mindee/v1/product/fr/bank_account_details/bank_account_details_v2_page.rb +40 -0
  162. data/lib/mindee/v1/product/fr/bank_statement/bank_statement_v2.rb +49 -0
  163. data/lib/mindee/v1/product/fr/bank_statement/bank_statement_v2_document.rb +169 -0
  164. data/lib/mindee/v1/product/fr/bank_statement/bank_statement_v2_page.rb +40 -0
  165. data/lib/mindee/v1/product/fr/bank_statement/bank_statement_v2_transaction.rb +78 -0
  166. data/lib/mindee/v1/product/fr/bank_statement/bank_statement_v2_transactions.rb +56 -0
  167. data/lib/mindee/v1/product/fr/id_card/id_card_v1.rb +49 -0
  168. data/lib/mindee/v1/product/fr/id_card/id_card_v1_document.rb +106 -0
  169. data/lib/mindee/v1/product/fr/id_card/id_card_v1_page.rb +57 -0
  170. data/lib/mindee/v1/product/fr/id_card/id_card_v2.rb +49 -0
  171. data/lib/mindee/v1/product/fr/id_card/id_card_v2_document.rb +143 -0
  172. data/lib/mindee/v1/product/fr/id_card/id_card_v2_page.rb +65 -0
  173. data/lib/mindee/v1/product/international_id/international_id_v2.rb +47 -0
  174. data/lib/mindee/v1/product/international_id/international_id_v2_document.rb +164 -0
  175. data/lib/mindee/v1/product/international_id/international_id_v2_page.rb +38 -0
  176. data/lib/mindee/v1/product/invoice/invoice_v4.rb +47 -0
  177. data/lib/mindee/v1/product/invoice/invoice_v4_document.rb +300 -0
  178. data/lib/mindee/v1/product/invoice/invoice_v4_line_item.rb +124 -0
  179. data/lib/mindee/v1/product/invoice/invoice_v4_line_items.rb +64 -0
  180. data/lib/mindee/v1/product/invoice/invoice_v4_page.rb +38 -0
  181. data/lib/mindee/v1/product/invoice_splitter/invoice_splitter_v1.rb +47 -0
  182. data/lib/mindee/v1/product/invoice_splitter/invoice_splitter_v1_document.rb +66 -0
  183. data/lib/mindee/v1/product/invoice_splitter/invoice_splitter_v1_invoice_page_group.rb +58 -0
  184. data/lib/mindee/v1/product/invoice_splitter/invoice_splitter_v1_invoice_page_groups.rb +50 -0
  185. data/lib/mindee/v1/product/invoice_splitter/invoice_splitter_v1_page.rb +38 -0
  186. data/lib/mindee/v1/product/multi_receipts_detector/multi_receipts_detector_v1.rb +47 -0
  187. data/lib/mindee/v1/product/multi_receipts_detector/multi_receipts_detector_v1_document.rb +38 -0
  188. data/lib/mindee/v1/product/multi_receipts_detector/multi_receipts_detector_v1_page.rb +38 -0
  189. data/lib/mindee/v1/product/passport/passport_v1.rb +47 -0
  190. data/lib/mindee/v1/product/passport/passport_v1_document.rb +112 -0
  191. data/lib/mindee/v1/product/passport/passport_v1_page.rb +38 -0
  192. data/lib/mindee/v1/product/receipt/receipt_v5.rb +47 -0
  193. data/lib/mindee/v1/product/receipt/receipt_v5_document.rb +187 -0
  194. data/lib/mindee/v1/product/receipt/receipt_v5_line_item.rb +88 -0
  195. data/lib/mindee/v1/product/receipt/receipt_v5_line_items.rb +56 -0
  196. data/lib/mindee/v1/product/receipt/receipt_v5_page.rb +38 -0
  197. data/lib/mindee/v1/product/resume/resume_v1.rb +47 -0
  198. data/lib/mindee/v1/product/resume/resume_v1_certificate.rb +82 -0
  199. data/lib/mindee/v1/product/resume/resume_v1_certificates.rb +60 -0
  200. data/lib/mindee/v1/product/resume/resume_v1_document.rb +340 -0
  201. data/lib/mindee/v1/product/resume/resume_v1_education.rb +106 -0
  202. data/lib/mindee/v1/product/resume/resume_v1_educations.rb +66 -0
  203. data/lib/mindee/v1/product/resume/resume_v1_language.rb +66 -0
  204. data/lib/mindee/v1/product/resume/resume_v1_languages.rb +56 -0
  205. data/lib/mindee/v1/product/resume/resume_v1_page.rb +38 -0
  206. data/lib/mindee/v1/product/resume/resume_v1_professional_experience.rb +122 -0
  207. data/lib/mindee/v1/product/resume/resume_v1_professional_experiences.rb +70 -0
  208. data/lib/mindee/v1/product/resume/resume_v1_social_networks_url.rb +66 -0
  209. data/lib/mindee/v1/product/resume/resume_v1_social_networks_urls.rb +56 -0
  210. data/lib/mindee/v1/product/universal/universal.rb +48 -0
  211. data/lib/mindee/v1/product/universal/universal_document.rb +35 -0
  212. data/lib/mindee/v1/product/universal/universal_page.rb +54 -0
  213. data/lib/mindee/v1/product/universal/universal_prediction.rb +128 -0
  214. data/lib/mindee/v1/product.rb +18 -0
  215. data/lib/mindee/v1.rb +7 -0
  216. data/lib/mindee/v2/client.rb +132 -0
  217. data/lib/mindee/v2/file_operation/crop.rb +51 -0
  218. data/lib/mindee/v2/file_operation/crop_files.rb +25 -0
  219. data/lib/mindee/v2/file_operation/split.rb +37 -0
  220. data/lib/mindee/v2/file_operation/split_files.rb +25 -0
  221. data/lib/mindee/v2/file_operation.rb +6 -0
  222. data/lib/mindee/v2/http/.rubocop.yml +7 -0
  223. data/lib/mindee/v2/http/api_v2_settings.rb +65 -0
  224. data/lib/mindee/v2/http/mindee_api_v2.rb +230 -0
  225. data/lib/mindee/v2/http.rb +4 -0
  226. data/lib/mindee/v2/parsing/base_inference.rb +44 -0
  227. data/lib/mindee/v2/parsing/base_response.rb +15 -0
  228. data/lib/mindee/v2/parsing/common_response.rb +20 -0
  229. data/lib/mindee/v2/parsing/error_item.rb +21 -0
  230. data/lib/mindee/v2/parsing/error_response.rb +51 -0
  231. data/lib/mindee/v2/parsing/field/base_field.rb +63 -0
  232. data/lib/mindee/v2/parsing/field/field_confidence.rb +128 -0
  233. data/lib/mindee/v2/parsing/field/field_location.rb +33 -0
  234. data/lib/mindee/v2/parsing/field/inference_fields.rb +105 -0
  235. data/lib/mindee/v2/parsing/field/list_field.rb +79 -0
  236. data/lib/mindee/v2/parsing/field/object_field.rb +138 -0
  237. data/lib/mindee/v2/parsing/field/simple_field.rb +60 -0
  238. data/lib/mindee/v2/parsing/field.rb +9 -0
  239. data/lib/mindee/v2/parsing/inference_active_options.rb +67 -0
  240. data/lib/mindee/v2/parsing/inference_file.rb +38 -0
  241. data/lib/mindee/v2/parsing/inference_job.rb +25 -0
  242. data/lib/mindee/v2/parsing/inference_model.rb +30 -0
  243. data/lib/mindee/v2/parsing/job.rb +93 -0
  244. data/lib/mindee/v2/parsing/job_response.rb +30 -0
  245. data/lib/mindee/v2/parsing/job_webhook.rb +59 -0
  246. data/lib/mindee/v2/parsing/rag_metadata.rb +17 -0
  247. data/lib/mindee/v2/parsing/raw_text.rb +27 -0
  248. data/lib/mindee/v2/parsing/raw_text_page.rb +24 -0
  249. data/lib/mindee/v2/parsing/search/pagination_metadata.rb +44 -0
  250. data/lib/mindee/v2/parsing/search/search_model.rb +38 -0
  251. data/lib/mindee/v2/parsing/search/search_models.rb +34 -0
  252. data/lib/mindee/v2/parsing/search/search_response.rb +38 -0
  253. data/lib/mindee/v2/parsing/search.rb +6 -0
  254. data/lib/mindee/v2/parsing.rb +16 -0
  255. data/lib/mindee/v2/product/base_product.rb +28 -0
  256. data/lib/mindee/v2/product/classification/classification.rb +20 -0
  257. data/lib/mindee/v2/product/classification/classification_classifier.rb +25 -0
  258. data/lib/mindee/v2/product/classification/classification_inference.rb +35 -0
  259. data/lib/mindee/v2/product/classification/classification_response.rb +32 -0
  260. data/lib/mindee/v2/product/classification/classification_result.rb +27 -0
  261. data/lib/mindee/v2/product/classification/params/classification_parameters.rb +47 -0
  262. data/lib/mindee/v2/product/crop/crop.rb +20 -0
  263. data/lib/mindee/v2/product/crop/crop_inference.rb +34 -0
  264. data/lib/mindee/v2/product/crop/crop_item.rb +39 -0
  265. data/lib/mindee/v2/product/crop/crop_response.rb +40 -0
  266. data/lib/mindee/v2/product/crop/crop_result.rb +34 -0
  267. data/lib/mindee/v2/product/crop/params/crop_parameters.rb +47 -0
  268. data/lib/mindee/v2/product/extraction/extraction.rb +21 -0
  269. data/lib/mindee/v2/product/extraction/extraction_inference.rb +40 -0
  270. data/lib/mindee/v2/product/extraction/extraction_response.rb +32 -0
  271. data/lib/mindee/v2/product/extraction/extraction_result.rb +44 -0
  272. data/lib/mindee/v2/product/extraction/params/data_schema.rb +51 -0
  273. data/lib/mindee/v2/product/extraction/params/data_schema_field.rb +69 -0
  274. data/lib/mindee/v2/product/extraction/params/data_schema_replace.rb +39 -0
  275. data/lib/mindee/v2/product/extraction/params/extraction_parameters.rb +125 -0
  276. data/lib/mindee/v2/product/ocr/ocr.rb +20 -0
  277. data/lib/mindee/v2/product/ocr/ocr_inference.rb +34 -0
  278. data/lib/mindee/v2/product/ocr/ocr_page.rb +33 -0
  279. data/lib/mindee/v2/product/ocr/ocr_response.rb +32 -0
  280. data/lib/mindee/v2/product/ocr/ocr_result.rb +34 -0
  281. data/lib/mindee/v2/product/ocr/ocr_word.rb +29 -0
  282. data/lib/mindee/v2/product/ocr/params/ocr_parameters.rb +47 -0
  283. data/lib/mindee/v2/product/split/params/split_parameters.rb +48 -0
  284. data/lib/mindee/v2/product/split/split.rb +19 -0
  285. data/lib/mindee/v2/product/split/split_inference.rb +34 -0
  286. data/lib/mindee/v2/product/split/split_range.rb +38 -0
  287. data/lib/mindee/v2/product/split/split_response.rb +40 -0
  288. data/lib/mindee/v2/product/split/split_result.rb +34 -0
  289. data/lib/mindee/v2/product.rb +7 -0
  290. data/lib/mindee/v2.rb +7 -0
  291. data/lib/mindee/version.rb +26 -0
  292. data/lib/mindee.rb +135 -0
  293. data/mindee-lite.gemspec +36 -0
  294. data/mindee.gemspec +44 -0
  295. data/sig/custom/marcel.rbs +3 -0
  296. data/sig/custom/mini_magick.rbs +31 -0
  297. data/sig/custom/net_http.rbs +43 -0
  298. data/sig/custom/origami.rbs +59 -0
  299. data/sig/mindee/dependency.rbs +13 -0
  300. data/sig/mindee/error/mindee_error.rbs +13 -0
  301. data/sig/mindee/error/mindee_http_error.rbs +17 -0
  302. data/sig/mindee/error/mindee_http_error_v2.rbs +15 -0
  303. data/sig/mindee/error/mindee_http_unknown_error_v2.rbs +9 -0
  304. data/sig/mindee/error/mindee_input_error.rbs +18 -0
  305. data/sig/mindee/geometry/min_max.rbs +11 -0
  306. data/sig/mindee/geometry/point.rbs +14 -0
  307. data/sig/mindee/geometry/polygon.rbs +12 -0
  308. data/sig/mindee/geometry/quadrilateral.rbs +15 -0
  309. data/sig/mindee/geometry/utils.rbs +13 -0
  310. data/sig/mindee/http/http_error_handler.rbs +15 -0
  311. data/sig/mindee/http/response_validation.rbs +11 -0
  312. data/sig/mindee/image/extracted_image.rbs +21 -0
  313. data/sig/mindee/image/image_compressor.rbs +8 -0
  314. data/sig/mindee/image/image_extractor.rbs +13 -0
  315. data/sig/mindee/image/image_utils.rbs +19 -0
  316. data/sig/mindee/input/base_parameters.rbs +35 -0
  317. data/sig/mindee/input/local_response.rbs +14 -0
  318. data/sig/mindee/input/polling_options.rbs +12 -0
  319. data/sig/mindee/input/sources/base64_input_source.rbs +11 -0
  320. data/sig/mindee/input/sources/bytes_input_source.rbs +10 -0
  321. data/sig/mindee/input/sources/file_input_source.rbs +10 -0
  322. data/sig/mindee/input/sources/local_input_source.rbs +30 -0
  323. data/sig/mindee/input/sources/path_input_source.rbs +10 -0
  324. data/sig/mindee/input/sources/url_input_source.rbs +20 -0
  325. data/sig/mindee/logging/logger.rbs +11 -0
  326. data/sig/mindee/page_options.rbs +11 -0
  327. data/sig/mindee/pdf/extracted_pdf.rbs +17 -0
  328. data/sig/mindee/pdf/pdf_compressor.rbs +15 -0
  329. data/sig/mindee/pdf/pdf_extractor.rbs +19 -0
  330. data/sig/mindee/pdf/pdf_processor.rbs +12 -0
  331. data/sig/mindee/pdf/pdf_tools.rbs +31 -0
  332. data/sig/mindee/v1/client.rbs +84 -0
  333. data/sig/mindee/v1/extraction/multi_receipts_extractor.rbs +8 -0
  334. data/sig/mindee/v1/http/endpoint.rbs +41 -0
  335. data/sig/mindee/v1/http/workflow_endpoint.rbs +22 -0
  336. data/sig/mindee/v1/parsing/common/api_request.rbs +22 -0
  337. data/sig/mindee/v1/parsing/common/api_response.rbs +31 -0
  338. data/sig/mindee/v1/parsing/common/document.rbs +32 -0
  339. data/sig/mindee/v1/parsing/common/execution.rbs +26 -0
  340. data/sig/mindee/v1/parsing/common/execution_file.rbs +16 -0
  341. data/sig/mindee/v1/parsing/common/execution_priority.rbs +16 -0
  342. data/sig/mindee/v1/parsing/common/extras/cropper_extra.rbs +18 -0
  343. data/sig/mindee/v1/parsing/common/extras/extras.rbs +24 -0
  344. data/sig/mindee/v1/parsing/common/extras/full_text_ocr_extra.rbs +22 -0
  345. data/sig/mindee/v1/parsing/common/extras/rag_extra.rbs +19 -0
  346. data/sig/mindee/v1/parsing/common/inference.rbs +31 -0
  347. data/sig/mindee/v1/parsing/common/job.rbs +24 -0
  348. data/sig/mindee/v1/parsing/common/ocr/mvision_v1.rbs +20 -0
  349. data/sig/mindee/v1/parsing/common/ocr/ocr.rbs +56 -0
  350. data/sig/mindee/v1/parsing/common/orientation.rbs +15 -0
  351. data/sig/mindee/v1/parsing/common/page.rbs +19 -0
  352. data/sig/mindee/v1/parsing/common/prediction.rbs +14 -0
  353. data/sig/mindee/v1/parsing/common/product.rbs +16 -0
  354. data/sig/mindee/v1/parsing/common/workflow_response.rbs +22 -0
  355. data/sig/mindee/v1/parsing/standard/abstract_field.rbs +30 -0
  356. data/sig/mindee/v1/parsing/standard/address_field.rbs +28 -0
  357. data/sig/mindee/v1/parsing/standard/amount_field.rbs +16 -0
  358. data/sig/mindee/v1/parsing/standard/base_field.rbs +16 -0
  359. data/sig/mindee/v1/parsing/standard/boolean_field.rbs +16 -0
  360. data/sig/mindee/v1/parsing/standard/classification_field.rbs +12 -0
  361. data/sig/mindee/v1/parsing/standard/company_registration_field.rbs +20 -0
  362. data/sig/mindee/v1/parsing/standard/date_field.rbs +20 -0
  363. data/sig/mindee/v1/parsing/standard/feature_field.rbs +12 -0
  364. data/sig/mindee/v1/parsing/standard/locale_field.rbs +24 -0
  365. data/sig/mindee/v1/parsing/standard/payment_details_field.rbs +19 -0
  366. data/sig/mindee/v1/parsing/standard/position_field.rbs +26 -0
  367. data/sig/mindee/v1/parsing/standard/string_field.rbs +16 -0
  368. data/sig/mindee/v1/parsing/standard/tax_field.rbs +33 -0
  369. data/sig/mindee/v1/parsing/universal/universal_list_field.rbs +21 -0
  370. data/sig/mindee/v1/parsing/universal/universal_object_field.rbs +38 -0
  371. data/sig/mindee/v1/product/barcode_reader/barcode_reader_v1.rbs +13 -0
  372. data/sig/mindee/v1/product/barcode_reader/barcode_reader_v1_document.rbs +16 -0
  373. data/sig/mindee/v1/product/barcode_reader/barcode_reader_v1_page.rbs +17 -0
  374. data/sig/mindee/v1/product/cropper/cropper_v1.rbs +13 -0
  375. data/sig/mindee/v1/product/cropper/cropper_v1_document.rbs +14 -0
  376. data/sig/mindee/v1/product/cropper/cropper_v1_page.rbs +19 -0
  377. data/sig/mindee/v1/product/financial_document/financial_document_v1.rbs +13 -0
  378. data/sig/mindee/v1/product/financial_document/financial_document_v1_document.rbs +49 -0
  379. data/sig/mindee/v1/product/financial_document/financial_document_v1_line_item.rbs +35 -0
  380. data/sig/mindee/v1/product/financial_document/financial_document_v1_line_items.rbs +15 -0
  381. data/sig/mindee/v1/product/financial_document/financial_document_v1_page.rbs +17 -0
  382. data/sig/mindee/v1/product/fr/bank_account_details/bank_account_details_v1.rbs +15 -0
  383. data/sig/mindee/v1/product/fr/bank_account_details/bank_account_details_v1_document.rbs +19 -0
  384. data/sig/mindee/v1/product/fr/bank_account_details/bank_account_details_v1_page.rbs +19 -0
  385. data/sig/mindee/v1/product/fr/bank_account_details/bank_account_details_v2.rbs +15 -0
  386. data/sig/mindee/v1/product/fr/bank_account_details/bank_account_details_v2_bban.rbs +25 -0
  387. data/sig/mindee/v1/product/fr/bank_account_details/bank_account_details_v2_document.rbs +20 -0
  388. data/sig/mindee/v1/product/fr/bank_account_details/bank_account_details_v2_page.rbs +19 -0
  389. data/sig/mindee/v1/product/fr/bank_statement/bank_statement_v2.rbs +15 -0
  390. data/sig/mindee/v1/product/fr/bank_statement/bank_statement_v2_document.rbs +31 -0
  391. data/sig/mindee/v1/product/fr/bank_statement/bank_statement_v2_page.rbs +19 -0
  392. data/sig/mindee/v1/product/fr/bank_statement/bank_statement_v2_transaction.rbs +27 -0
  393. data/sig/mindee/v1/product/fr/bank_statement/bank_statement_v2_transactions.rbs +17 -0
  394. data/sig/mindee/v1/product/fr/id_card/id_card_v1.rbs +15 -0
  395. data/sig/mindee/v1/product/fr/id_card/id_card_v1_document.rbs +26 -0
  396. data/sig/mindee/v1/product/fr/id_card/id_card_v1_page.rbs +20 -0
  397. data/sig/mindee/v1/product/fr/id_card/id_card_v2.rbs +15 -0
  398. data/sig/mindee/v1/product/fr/id_card/id_card_v2_document.rbs +31 -0
  399. data/sig/mindee/v1/product/fr/id_card/id_card_v2_page.rbs +21 -0
  400. data/sig/mindee/v1/product/international_id/international_id_v2.rbs +13 -0
  401. data/sig/mindee/v1/product/international_id/international_id_v2_document.rbs +31 -0
  402. data/sig/mindee/v1/product/international_id/international_id_v2_page.rbs +17 -0
  403. data/sig/mindee/v1/product/invoice/invoice_v4.rbs +13 -0
  404. data/sig/mindee/v1/product/invoice/invoice_v4_document.rbs +45 -0
  405. data/sig/mindee/v1/product/invoice/invoice_v4_line_item.rbs +35 -0
  406. data/sig/mindee/v1/product/invoice/invoice_v4_line_items.rbs +15 -0
  407. data/sig/mindee/v1/product/invoice/invoice_v4_page.rbs +17 -0
  408. data/sig/mindee/v1/product/invoice_splitter/invoice_splitter_v1.rbs +13 -0
  409. data/sig/mindee/v1/product/invoice_splitter/invoice_splitter_v1_document.rbs +17 -0
  410. data/sig/mindee/v1/product/invoice_splitter/invoice_splitter_v1_invoice_page_group.rbs +21 -0
  411. data/sig/mindee/v1/product/invoice_splitter/invoice_splitter_v1_invoice_page_groups.rbs +15 -0
  412. data/sig/mindee/v1/product/invoice_splitter/invoice_splitter_v1_page.rbs +17 -0
  413. data/sig/mindee/v1/product/multi_receipts_detector/multi_receipts_detector_v1.rbs +14 -0
  414. data/sig/mindee/v1/product/multi_receipts_detector/multi_receipts_detector_v1_document.rbs +15 -0
  415. data/sig/mindee/v1/product/multi_receipts_detector/multi_receipts_detector_v1_page.rbs +17 -0
  416. data/sig/mindee/v1/product/passport/passport_v1.rbs +13 -0
  417. data/sig/mindee/v1/product/passport/passport_v1_document.rbs +25 -0
  418. data/sig/mindee/v1/product/passport/passport_v1_page.rbs +17 -0
  419. data/sig/mindee/v1/product/receipt/receipt_v5.rbs +13 -0
  420. data/sig/mindee/v1/product/receipt/receipt_v5_document.rbs +33 -0
  421. data/sig/mindee/v1/product/receipt/receipt_v5_line_item.rbs +27 -0
  422. data/sig/mindee/v1/product/receipt/receipt_v5_line_items.rbs +15 -0
  423. data/sig/mindee/v1/product/receipt/receipt_v5_page.rbs +17 -0
  424. data/sig/mindee/v1/product/resume/resume_v1.rbs +13 -0
  425. data/sig/mindee/v1/product/resume/resume_v1_certificate.rbs +27 -0
  426. data/sig/mindee/v1/product/resume/resume_v1_certificates.rbs +17 -0
  427. data/sig/mindee/v1/product/resume/resume_v1_document.rbs +69 -0
  428. data/sig/mindee/v1/product/resume/resume_v1_education.rbs +33 -0
  429. data/sig/mindee/v1/product/resume/resume_v1_educations.rbs +17 -0
  430. data/sig/mindee/v1/product/resume/resume_v1_language.rbs +23 -0
  431. data/sig/mindee/v1/product/resume/resume_v1_languages.rbs +17 -0
  432. data/sig/mindee/v1/product/resume/resume_v1_page.rbs +19 -0
  433. data/sig/mindee/v1/product/resume/resume_v1_professional_experience.rbs +37 -0
  434. data/sig/mindee/v1/product/resume/resume_v1_professional_experiences.rbs +17 -0
  435. data/sig/mindee/v1/product/resume/resume_v1_social_networks_url.rbs +23 -0
  436. data/sig/mindee/v1/product/resume/resume_v1_social_networks_urls.rbs +17 -0
  437. data/sig/mindee/v1/product/universal/universal.rbs +16 -0
  438. data/sig/mindee/v1/product/universal/universal_document.rbs +12 -0
  439. data/sig/mindee/v1/product/universal/universal_page.rbs +18 -0
  440. data/sig/mindee/v1/product/universal/universal_prediction.rbs +30 -0
  441. data/sig/mindee/v2/client.rbs +29 -0
  442. data/sig/mindee/v2/file_operation/crop.rbs +10 -0
  443. data/sig/mindee/v2/file_operation/crop_files.rbs +9 -0
  444. data/sig/mindee/v2/file_operation/split.rbs +11 -0
  445. data/sig/mindee/v2/file_operation/split_files.rbs +9 -0
  446. data/sig/mindee/v2/http/api_v2_settings.rbs +27 -0
  447. data/sig/mindee/v2/http/mindee_api_v2.rbs +52 -0
  448. data/sig/mindee/v2/parsing/base_inference.rbs +18 -0
  449. data/sig/mindee/v2/parsing/base_response.rbs +11 -0
  450. data/sig/mindee/v2/parsing/common_response.rbs +12 -0
  451. data/sig/mindee/v2/parsing/error_item.rbs +13 -0
  452. data/sig/mindee/v2/parsing/error_response.rbs +20 -0
  453. data/sig/mindee/v2/parsing/field/base_field.rbs +17 -0
  454. data/sig/mindee/v2/parsing/field/field_confidence.rbs +30 -0
  455. data/sig/mindee/v2/parsing/field/field_location.rbs +16 -0
  456. data/sig/mindee/v2/parsing/field/inference_fields.rbs +20 -0
  457. data/sig/mindee/v2/parsing/field/list_field.rbs +23 -0
  458. data/sig/mindee/v2/parsing/field/object_field.rbs +27 -0
  459. data/sig/mindee/v2/parsing/field/simple_field.rbs +16 -0
  460. data/sig/mindee/v2/parsing/inference_active_options.rbs +26 -0
  461. data/sig/mindee/v2/parsing/inference_file.rbs +17 -0
  462. data/sig/mindee/v2/parsing/inference_job.rbs +13 -0
  463. data/sig/mindee/v2/parsing/inference_model.rbs +12 -0
  464. data/sig/mindee/v2/parsing/job.rbs +24 -0
  465. data/sig/mindee/v2/parsing/job_response.rbs +14 -0
  466. data/sig/mindee/v2/parsing/job_webhook.rbs +19 -0
  467. data/sig/mindee/v2/parsing/rag_metadata.rbs +13 -0
  468. data/sig/mindee/v2/parsing/raw_text.rbs +12 -0
  469. data/sig/mindee/v2/parsing/raw_text_page.rbs +11 -0
  470. data/sig/mindee/v2/parsing/search/pagination_metadata.rbs +20 -0
  471. data/sig/mindee/v2/parsing/search/search_model.rbs +19 -0
  472. data/sig/mindee/v2/parsing/search/search_response.rbs +17 -0
  473. data/sig/mindee/v2/parsing/search_models.rbs +14 -0
  474. data/sig/mindee/v2/product/base_product.rbs +19 -0
  475. data/sig/mindee/v2/product/classification/classification.rbs +10 -0
  476. data/sig/mindee/v2/product/classification/classification_classifier.rbs +15 -0
  477. data/sig/mindee/v2/product/classification/classification_inference.rbs +15 -0
  478. data/sig/mindee/v2/product/classification/classification_response.rbs +23 -0
  479. data/sig/mindee/v2/product/classification/classification_result.rbs +15 -0
  480. data/sig/mindee/v2/product/classification/params/classification_parameters/classification_parameters.rbs +23 -0
  481. data/sig/mindee/v2/product/crop/crop.rbs +10 -0
  482. data/sig/mindee/v2/product/crop/crop_inference.rbs +14 -0
  483. data/sig/mindee/v2/product/crop/crop_item.rbs +18 -0
  484. data/sig/mindee/v2/product/crop/crop_response.rbs +25 -0
  485. data/sig/mindee/v2/product/crop/crop_result.rbs +14 -0
  486. data/sig/mindee/v2/product/crop/params/crop_parameters/crop_parameters.rbs +23 -0
  487. data/sig/mindee/v2/product/extraction/extraction.rbs +15 -0
  488. data/sig/mindee/v2/product/extraction/extraction_inference.rbs +19 -0
  489. data/sig/mindee/v2/product/extraction/extraction_response.rbs +24 -0
  490. data/sig/mindee/v2/product/extraction/extraction_result.rbs +18 -0
  491. data/sig/mindee/v2/product/extraction/params/data_schema.rbs +21 -0
  492. data/sig/mindee/v2/product/extraction/params/data_schema_field.rbs +29 -0
  493. data/sig/mindee/v2/product/extraction/params/data_schema_replace.rbs +21 -0
  494. data/sig/mindee/v2/product/extraction/params/extraction_parameters.rbs +38 -0
  495. data/sig/mindee/v2/product/ocr/ocr.rbs +10 -0
  496. data/sig/mindee/v2/product/ocr/ocr_inference.rbs +14 -0
  497. data/sig/mindee/v2/product/ocr/ocr_page.rbs +15 -0
  498. data/sig/mindee/v2/product/ocr/ocr_response.rbs +23 -0
  499. data/sig/mindee/v2/product/ocr/ocr_result.rbs +14 -0
  500. data/sig/mindee/v2/product/ocr/ocr_word.rbs +15 -0
  501. data/sig/mindee/v2/product/ocr/params/ocr_parameters/ocr_parameters.rbs +24 -0
  502. data/sig/mindee/v2/product/split/params/split_parameters/split_parameters.rbs +23 -0
  503. data/sig/mindee/v2/product/split/split.rbs +10 -0
  504. data/sig/mindee/v2/product/split/split_inference.rbs +14 -0
  505. data/sig/mindee/v2/product/split/split_range.rbs +18 -0
  506. data/sig/mindee/v2/product/split/split_response.rbs +25 -0
  507. data/sig/mindee/v2/product/split/split_result.rbs +14 -0
  508. data/sig/mindee/version.rbs +6 -0
  509. data/sig/mindee.rbs +62 -0
  510. metadata +600 -0
@@ -0,0 +1,216 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'stringio'
4
+ require 'marcel'
5
+ require 'fileutils'
6
+
7
+ require_relative '../../dependency'
8
+ require_relative '../../pdf' if Mindee::Dependency.all_deps_available?
9
+ require_relative '../../image' if Mindee::Dependency.all_deps_available?
10
+
11
+ module Mindee
12
+ module Input
13
+ # Document source handling.
14
+ module Source
15
# MIME types accepted by the server for uploaded documents.
ALLOWED_MIME_TYPES = %w[
  application/pdf
  image/heic
  image/png
  image/jpeg
  image/tiff
  image/webp
].freeze
24
+
25
+ # Base class for loading documents.
26
+ class LocalInputSource
27
+ # @return [String]
28
+ attr_reader :filename
29
+ # @return [String]
30
+ attr_reader :file_mimetype
31
+ # @return [StringIO | File]
32
+ attr_reader :io_stream
33
+
34
# Wraps an already-opened stream and validates its MIME type.
#
# The MIME type is detected through Marcel. With `repair_pdf` enabled the detection relies on
# the stream only (the file name is not passed along); if the detected type is rejected and the
# file name ends in `.pdf` (case-insensitive), a repair is attempted through {#fix_pdf!} and the
# type is checked again.
# @param io_stream [StringIO, File]
# @param filename [String]
# @param repair_pdf [bool] Whether to attempt a repair when the detected MIME type is rejected.
# @raise [Mindee::Error::MindeeMimeTypeError] If the MIME type is not part of ALLOWED_MIME_TYPES.
# @raise [Mindee::Error::MindeePDFError] If a repair is attempted and no usable PDF header is found.
def initialize(io_stream, filename, repair_pdf: false)
  @io_stream = io_stream
  @filename = filename
  @file_mimetype = if repair_pdf
                     Marcel::MimeType.for @io_stream
                   else
                     Marcel::MimeType.for @io_stream, name: @filename
                   end

  unless ALLOWED_MIME_TYPES.include?(@file_mimetype)
    fix_pdf! if repair_pdf && filename.downcase.end_with?('.pdf')
    raise Error::MindeeMimeTypeError, @file_mimetype.to_s unless ALLOWED_MIME_TYPES.include?(@file_mimetype)
  end

  # Only reported once the input has actually been accepted.
  logger.debug("Loaded new input #{@filename} from #{self.class}")
end
59
+
60
# Legacy entry point: the argument is ignored and the call is forwarded to {#fix_pdf!}.
# @deprecated See {#fix_pdf!} or {#self.fix_pdf} instead.
def rescue_broken_pdf(_)
  fix_pdf!
end
64
+
65
# Tells whether the detected MIME type is `application/pdf`.
# @return [bool]
def pdf?
  detected = @file_mimetype.to_s
  detected == 'application/pdf'
end
69
+
70
# Repairs the PDF held by this source, then re-detects its MIME type from the repaired stream.
# The internal stream is replaced by the output of {LocalInputSource.fix_pdf}.
# @param maximum_offset [Integer] Maximum offset to look for the PDF header.
# @return [void]
# @raise [Mindee::Error::MindeePDFError]
def fix_pdf!(maximum_offset: 500)
  @io_stream = LocalInputSource.fix_pdf(@io_stream, maximum_offset: maximum_offset)
  @io_stream.rewind
  @file_mimetype = Marcel::MimeType.for(@io_stream)
end
79
+
80
# Attempts to fix the PDF data in the given stream by dropping everything that precedes the
# first `%PDF-` marker.
# @param stream [StringIO] The stream to fix. It is scanned from its beginning.
# @param maximum_offset [Integer] Maximum offset to look for the PDF header.
# @return [StringIO] A new stream starting at the `%PDF-` marker, positioned at its beginning.
# @raise [Mindee::Error::MindeePDFError] If the marker is missing, too far, or nothing follows it.
def self.fix_pdf(stream, maximum_offset: 500)
  # The offset check below is absolute, so the scan must not start from wherever a previous
  # reader left the cursor.
  stream.rewind
  stream.gets('%PDF-')
  raise Error::MindeePDFError if stream.eof? || stream.pos > maximum_offset

  # `gets` consumed the 5-byte marker: step back so that it is part of the output.
  stream.pos = stream.pos - 5
  out_stream = StringIO.new
  out_stream << stream.read
  # Hand back a stream that can be read right away.
  out_stream.rewind
  out_stream
end
93
+
94
# Cuts a PDF file according to the provided options, and replaces the internal stream with
# the result of `PDF::PDFProcessor.parse`.
# @param options [PageOptions, nil] Page cutting/merge options:
#
#  * `:page_indexes` Zero-based list of page indexes.
#  * `:operation` Operation to apply on the document, given the `page_indexes` specified:
#      * `:KEEP_ONLY` - keep only the specified pages, and remove all others.
#      * `:REMOVE` - remove the specified pages, and keep all others.
#  * `:on_min_pages` Apply the operation only if document has at least this many pages.
def apply_page_options(options)
  @io_stream.seek(0)
  processed_stream = PDF::PDFProcessor.parse(@io_stream, options)
  @io_stream = processed_stream
end
106
+
107
# Legacy alias that forwards its argument to {#apply_page_options}.
# @deprecated Use {#apply_page_options} instead.
# @see #apply_page_options
def process_pdf(options)
  apply_page_options(options)
end
112
+
113
# Reads the whole document from the start of the stream.
# The stream is rewound afterwards and, unless told otherwise, closed.
# @param close [bool] Whether to close the stream once it has been read.
# @return [Array] The raw data, followed by a Hash holding the unicode-escaped `:filename`.
def read_contents(close: true)
  logger.debug("Reading data from: #{@filename}")
  @io_stream.seek(0)
  # Avoids needlessly re-packing some files
  data = @io_stream.read
  @io_stream.rewind
  @io_stream.close if close

  escaped_name = Mindee::Input::Source.convert_to_unicode_escape(@filename)
  [data, { filename: escaped_name }]
end
125
+
126
# Writes the document to disk, creating missing parent directories.
# When `path` is an existing directory or ends with `/`, the source's own file name is used
# inside of it; otherwise `path` is taken as the destination file.
# The stream is rewound before and after the write.
# @param path [String] Path to write the file to.
def write_to_file(path)
  base = path || ''
  into_directory = File.directory?(base) || path.to_s.end_with?('/')
  destination = into_directory ? File.join(base, @filename) : path

  full_path = File.expand_path(destination || '')
  FileUtils.mkdir_p(File.dirname(full_path))

  @io_stream.rewind
  File.binwrite(full_path, @io_stream.read || '')
  logger.debug("Wrote file successfully to #{full_path}")
  @io_stream.rewind
end
141
+
142
# Returns the page count for a document.
# Anything that is not a PDF counts as a single page.
# @return [Integer]
# @raise [NotImplementedError] If the optional dependencies are not available.
def page_count
  deps_ready = Mindee::Dependency.all_deps_available?
  raise NotImplementedError, Mindee::Dependency::MINDEE_DEPENDENCIES_LOAD_ERROR unless deps_ready

  return 1 unless pdf?

  @io_stream.seek(0)
  Mindee::PDF::PDFProcessor.open_pdf(@io_stream).pages.size
end
155
+
156
# Compresses the file and replaces the internal stream with the (rewound) result.
# PDFs are handed to `Mindee::PDF::PDFCompressor.compress_pdf`, any other input to
# `Mindee::Image::ImageCompressor.compress_image`.
# @param [Integer] quality Quality of the output file.
# @param [Integer, nil] max_width Maximum width (Ignored for PDFs).
# @param [Integer, nil] max_height Maximum height (Ignored for PDFs).
# @param [bool] force_source_text Whether to force the operation on PDFs with source text.
#   Forwarded to the PDF compressor as `force_source_text_compression`.
#   WARNING: this operation is strongly discouraged.
# @param [bool] disable_source_text If the PDF has source text, whether to re-apply it to the original or
#   not. Needs force_source_text to work.
# @raise [NotImplementedError] If the optional dependencies are not available.
def compress!(quality: 85, max_width: nil, max_height: nil, force_source_text: false, disable_source_text: true)
  deps_ready = Mindee::Dependency.all_deps_available?
  raise NotImplementedError, Mindee::Dependency::MINDEE_DEPENDENCIES_LOAD_ERROR unless deps_ready

  @io_stream =
    if pdf?
      Mindee::PDF::PDFCompressor.compress_pdf(
        @io_stream,
        quality: quality,
        force_source_text_compression: force_source_text,
        disable_source_text: disable_source_text
      )
    else
      Mindee::Image::ImageCompressor.compress_image(
        @io_stream,
        quality: quality,
        max_width: max_width,
        max_height: max_height
      )
    end
  @io_stream.rewind
end
188
+
189
# Checks whether the file has source text, as reported by `Mindee::PDF::PDFTools.source_text?`.
# @return [bool] `true` if the file is a PDF and has source text.
# @raise [NotImplementedError] If the optional dependencies are not available.
def source_text?
  deps_ready = Mindee::Dependency.all_deps_available?
  raise NotImplementedError, Mindee::Dependency::MINDEE_DEPENDENCIES_LOAD_ERROR unless deps_ready

  Mindee::PDF::PDFTools.source_text?(@io_stream)
end
198
+ end
199
+
200
# Replaces every multi-byte character by a `\u` escape holding its code point in lowercase
# hexadecimal (zero-padded to at least four digits).
# Single-byte characters are kept as is.
# @param string [String]
# @return A clean String.
def self.convert_to_unicode_escape(string)
  string.each_char.each_with_object(''.dup) do |char, escaped|
    escaped << (char.bytesize > 1 ? "\\u#{format('%04x', char.unpack1('U'))}" : char)
  end
end
214
+ end
215
+ end
216
+ end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'stringio'
4
+
5
+ module Mindee
6
+ module Input
7
+ # Document source handling.
8
+ module Source
9
# Load a document from a path.
class PathInputSource < LocalInputSource
  # Opens the file in binary mode and hands it over to the parent class, using the base name
  # of the path as file name.
  # @param filepath [String]
  # @param repair_pdf [bool]
  def initialize(filepath, repair_pdf: false)
    io_stream = File.new(filepath, 'rb')
    begin
      super(io_stream, File.basename(filepath), repair_pdf: repair_pdf)
    rescue StandardError
      # The handle was opened here: do not leak it when the parent rejects the input.
      io_stream.close unless io_stream.closed?
      raise
    end
  end
end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,130 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'net/http'
4
+ require 'uri'
5
+ require 'fileutils'
6
+ require_relative '../../logging'
7
+
8
+ module Mindee
9
+ module Input
10
+ module Source
11
+ # Load a remote document from a file url.
12
+ class URLInputSource
13
+ # @return [String]
14
+ attr_reader :url
15
+
16
# @param url [String] URL of the remote document. Must start with `https://`.
# @raise [Mindee::Error::MindeeInputError] If the URL does not start with `https://`.
def initialize(url)
  secure = url.start_with?('https://')
  raise Error::MindeeInputError, 'URL must be HTTPS' unless secure

  logger.debug("URL input: #{url}")

  @url = url
end
23
+
24
# Downloads the file from the URL and saves it to the specified path.
# The destination directory is not created by this method.
#
# @param path [String] Path to save the file to.
# @param filename [String, nil] Optional name to give to the file.
# @param username [String, nil] Optional username for authentication.
# @param password [String, nil] Optional password for authentication.
# @param token [String, nil] Optional token for JWT-based authentication.
# @param max_redirects [Integer] Maximum amount of redirects to follow.
# @return [String] The full path of the saved file.
def write_to_file(path, filename: nil, username: nil, password: nil, token: nil, max_redirects: 3)
  response_body = fetch_file_content(username: username, password: password, token: token,
                                     max_redirects: max_redirects)

  filename = fill_filename(filename)

  full_path = File.join(path.chomp('/'), filename)
  # Downloaded documents are binary (PDFs, images): write the bytes untouched.
  File.binwrite(full_path, response_body)

  full_path
end
44
+
45
# Downloads the file from the url, and returns a BytesInputSource wrapper object for it.
#
# @param filename [String, nil] Optional name to give to the file.
# @param username [String, nil] Optional username for authentication.
# @param password [String, nil] Optional password for authentication.
# @param token [String, nil] Optional token for JWT-based authentication.
# @param max_redirects [Integer] Maximum amount of redirects to follow.
# @return [BytesInputSource] Input source holding the downloaded content.
def as_local_input_source(filename: nil, username: nil, password: nil, token: nil, max_redirects: 3)
  resolved_name = fill_filename(filename)
  content = fetch_file_content(username: username, password: password, token: token,
                               max_redirects: max_redirects)
  buffer = StringIO.new(content)

  BytesInputSource.new(buffer.read || '', resolved_name || '')
end
61
+
62
# Fetches the file content from the URL with a GET request.
# When both a token and a username/password pair are given, basic authentication is applied
# after the bearer header.
#
# @param username [String, nil] Optional username for authentication.
# @param password [String, nil] Optional password for authentication.
# @param token [String, nil] Optional token for JWT-based authentication.
# @param max_redirects [Integer] Maximum amount of redirects to follow.
# @return [String] The downloaded file content.
# @raise [Mindee::Error::MindeeAPIError] If the final status code is outside of the 200-299 range.
def fetch_file_content(username: nil, password: nil, token: nil, max_redirects: 3)
  target = URI.parse(@url)
  get_request = Net::HTTP::Get.new(target)

  get_request['Authorization'] = "Bearer #{token}" if token
  get_request.basic_auth(username, password) if username && password

  response = make_request(target, get_request, max_redirects)
  status = response.code.to_i
  raise Error::MindeeAPIError, "Failed to download file: HTTP status code #{response.code}" if status > 299
  raise Error::MindeeAPIError, "Failed to download file: Invalid response code #{response.code}." if status < 200

  response.body
end
85
+
86
+ private
87
+
88
# Returns the last segment of the URI path, or an empty string when there is none.
# @param uri [URI::Generic]
# @return [String]
def extract_filename_from_url(uri)
  basename = File.basename(uri.path.to_s)
  return '' if basename.empty?

  basename
end
92
+
93
# Picks the file name to use for the download.
# Falls back to the last segment of the URL when no name is given; a name that is empty or
# has no extension is replaced by a generated one.
# @param filename [String, nil]
# @return [String]
def fill_filename(filename)
  candidate = filename || extract_filename_from_url(URI.parse(@url))
  return candidate unless candidate.empty? || File.extname(candidate).empty?

  generate_file_name(extension: get_file_extension(candidate))
end
100
+
101
# Sends the request over TLS and follows redirections, up to `max_redirects` of them.
# Each redirection is followed with a fresh GET request built from the new location only.
# Once the redirect budget is spent, the redirection response itself is returned.
# @param uri [URI::Generic] Target of the request.
# @param request [Net::HTTPRequest] Request to send.
# @param max_redirects [Integer] Remaining amount of redirects to follow.
# @return [Net::HTTPResponse]
# @raise [Mindee::Error::MindeeInputError] If a redirection carries no `Location` header.
def make_request(uri, request, max_redirects)
  Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) do |http|
    response = http.request(request)
    next response unless response.is_a?(Net::HTTPRedirection) && max_redirects.positive?

    location = response['location']
    raise Error::MindeeInputError, 'No location in redirection header.' if location.nil?

    # `Location` may be a relative reference: resolve it against the URI that was just
    # requested, otherwise the follow-up request would have no host to connect to.
    new_uri = URI.join(uri.to_s, location)
    make_request(new_uri, Net::HTTP::Get.new(new_uri), max_redirects - 1)
  end
end
116
+
117
# Returns the lowercased extension of a file name (leading dot included), or nil when it has none.
# @param filename [String]
# @return [String, nil]
def get_file_extension(filename)
  extension = File.extname(filename)
  return nil if extension.empty?

  extension.downcase
end
121
+
122
# Builds a temporary file name out of the current time and 8 random base-36 characters.
# @param extension [String, nil] Extension to append, `.tmp` when omitted.
# @return [String]
def generate_file_name(extension: nil)
  suffix = extension || '.tmp'
  token = Array.new(8) { rand(36).to_s(36) }.join
  timestamp = Time.now.strftime('%Y-%m-%d_%H-%M-%S')
  "mindee_temp_#{timestamp}_#{token}#{suffix}"
end
127
+ end
128
+ end
129
+ end
130
+ end
@@ -0,0 +1,8 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'sources/local_input_source'
4
+ require_relative 'sources/bytes_input_source'
5
+ require_relative 'sources/base64_input_source'
6
+ require_relative 'sources/file_input_source'
7
+ require_relative 'sources/path_input_source'
8
+ require_relative 'sources/url_input_source'
@@ -0,0 +1,4 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'input/polling_options'
4
+ require_relative 'input/sources'
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'logger'
4
+
5
+ module Mindee
6
# Mindee logging module.
#
# Builds a module-level logger when the file is loaded, configured from the environment:
# * `MINDEE_LOG_LEVEL` - severity name, case-insensitive; defaults to `WARN`.
# * `MINDEE_LOG_OUTPUT` - `stderr` (default) or `stdout`.
module Logging
  # Resolves a severity name to its `Logger::Severity` value.
  # Unknown names fall back to `WARN` after a warning, so that a typo in the environment does
  # not prevent the library from loading.
  # @param name [String] Severity name, e.g. `DEBUG` or `info`.
  # @return [Integer]
  def self.resolve_log_level(name)
    normalized = name.to_s.strip.upcase
    known_levels = Logger::Severity.constants.map(&:to_s)
    return Logger::Severity.const_get(normalized) if known_levels.include?(normalized)

    warn "Invalid MINDEE_LOG_LEVEL='#{name}', defaulting to 'WARN'"
    Logger::WARN
  end
  private_class_method :resolve_log_level

  log_level = ENV.fetch('MINDEE_LOG_LEVEL', 'WARN')
  log_output = ENV.fetch('MINDEE_LOG_OUTPUT', 'stderr')
  @logger = case log_output
            when 'stderr'
              Logger.new($stderr)
            when 'stdout'
              Logger.new($stdout)
            else
              warn "Invalid MINDEE_LOG_OUTPUT='#{log_output}', defaulting to 'stderr'"
              Logger.new($stderr)
            end
  @logger.level = resolve_log_level(log_level)

  class << self
    attr_accessor :logger
  end
end
24
+ end
@@ -0,0 +1,3 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'logging/logger'
@@ -0,0 +1,24 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Mindee
4
# Page options for parse calls.
# @!attribute page_indexes [Array[Integer]] Zero-based list of page indexes.
# @!attribute operation [:KEEP_ONLY, :REMOVE] Operation to apply on the document, given the specified page indexes:
#  * `:KEEP_ONLY` - keep only the specified pages, and remove all others.
#  * `:REMOVE` - remove the specified pages, and keep all others.
# @!attribute on_min_pages [Integer, nil] Apply the operation only if the document has at least this many pages.
class PageOptions
  attr_accessor :page_indexes, :operation, :on_min_pages

  # Keys may be given as Strings or Symbols; a missing key falls back to its default
  # (no page index, `:KEEP_ONLY`, no minimum page count).
  # @param params [Hash, nil]
  def initialize(params: {})
    params ||= {} # : Hash[Symbol, untyped]
    symbolized = params.transform_keys(&:to_sym)
    no_indexes = [] # : Array[Integer]

    @page_indexes = symbolized.fetch(:page_indexes, no_indexes)
    @operation = symbolized.fetch(:operation, :KEEP_ONLY)
    @on_min_pages = symbolized.fetch(:on_min_pages, nil)
  end
end
24
+ end
@@ -0,0 +1,70 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Mindee
4
+ # PDF Extraction Module.
5
+ module PDF
6
# An extracted sub-PDF, kept in memory together with its file name.
class ExtractedPDF
  # Byte contents of the pdf
  # @return [StringIO]
  attr_reader :pdf_bytes

  # Name of the file.
  # @return [String]
  attr_reader :filename

  # A `File` is read in full into a new StringIO; any other stream is stored as given.
  # @param pdf_stream [StringIO, File]
  # @param filename [String]
  def initialize(pdf_stream, filename)
    @filename = filename

    if pdf_stream.is_a?(File)
      pdf_stream.rewind
      @pdf_bytes = StringIO.new(pdf_stream.read)
    else
      @pdf_bytes = pdf_stream
    end
  end

  # Retrieves the page count for a given pdf.
  # @return [Integer]
  # @raise [Mindee::Error::MindeePDFError] If the stream cannot be opened as a PDF.
  def page_count
    current_pdf = Mindee::PDF::PDFProcessor.open_pdf(pdf_bytes)
    current_pdf.pages.size
  rescue TypeError, Origami::InvalidPDFError
    raise Error::MindeePDFError, 'Could not retrieve page count from Extracted PDF object.'
  end

  # Writes the contents of the current PDF object to a file.
  # The file is written to `output_path` exactly as given: no extension is appended.
  # @param output_path [String] Path to write to. Its parent directory must exist.
  # @param override [bool] Whether to override the destination file.
  #   NOTE(review): an existing destination is currently overwritten whatever this flag says;
  #   enforcing it would change the default behaviour - confirm the intended contract.
  # @raise [Mindee::Error::MindeePDFError] If the path is a directory, or its parent does not exist.
  def write_to_file(output_path, override: false) # rubocop:disable Lint/UnusedMethodArgument
    raise Error::MindeePDFError, 'Provided path is not a file' if File.directory?(output_path)

    # The parent directory check must not depend on `override`: it used to make every call
    # with `override: true` fail.
    parent_directory = File.expand_path('..', output_path)
    raise Error::MindeePDFError, 'Invalid save path provided' unless File.exist?(parent_directory)

    @pdf_bytes.rewind if @pdf_bytes.respond_to?(:rewind)
    File.binwrite(output_path, @pdf_bytes.read.to_s)
    @pdf_bytes.rewind if @pdf_bytes.respond_to?(:rewind)
  end

  # Returns the current PDF object as a usable BytesInputSource.
  # The stream is rewound before and after being read.
  # @return [Mindee::Input::Source::BytesInputSource]
  # @raise [Mindee::Error::MindeePDFError] If there is no stream to read from.
  def as_input_source
    raise Error::MindeePDFError, 'Bytes object is nil.' if @pdf_bytes.nil?

    @pdf_bytes.rewind if @pdf_bytes.respond_to?(:rewind)
    data = @pdf_bytes.read || ''
    @pdf_bytes.rewind if @pdf_bytes.respond_to?(:rewind)

    Mindee::Input::Source::BytesInputSource.new(data, @filename)
  end
end
69
+ end
70
+ end
@@ -0,0 +1,121 @@
1
+ # frozen_string_literal: true
2
+
3
+ Mindee::Dependency.require_all_deps!
4
+ require 'pdf-reader'
5
+
6
+ # Shorthand for pdf-reader's PDF namespace, to avoid mixups with the local Origami fork.
7
+ PDFReader = PDF
8
+
9
+ module Mindee
10
+ module PDF
11
+ # Image compressor module to handle PDF compression.
12
+ module PDFCompressor
13
# Compresses each page of a provided PDF stream.
# When source text is detected and `force_source_text_compression` isn't set, the input is
# returned untouched.
# @param pdf_data [StringIO] StringIO handle of the file.
# @param quality [Integer] Compression quality (70-100 for most JPG images in the test dataset).
# @param force_source_text_compression [bool] If true, compresses PDFs even when they hold source text.
# @param disable_source_text [bool] If true, doesn't re-apply source text to the original PDF.
# @return [StringIO] The compressed PDF, or `pdf_data` itself when the operation is aborted.
def self.compress_pdf(pdf_data, quality: 85, force_source_text_compression: false, disable_source_text: true)
  if PDFTools.source_text?(pdf_data)
    unless force_source_text_compression
      logger.warn('Source-text detected in input PDF. Aborting operation.')
      return pdf_data
    end

    # Text is only re-injected when `disable_source_text` is off: each warning has to match
    # what is actually going to happen to the text.
    if disable_source_text
      logger.warn('Source-file contains text, but disable_source_text flag is ignored. ' \
                  'Resulting file will not contain any embedded text.')
    else
      logger.warn('Re-writing PDF source-text is an EXPERIMENTAL feature.')
    end
  end

  pdf_data.rewind
  pdf = Origami::PDF.read(pdf_data)
  pages = process_pdf_pages(pdf, quality)

  output_pdf = create_output_pdf(pages, disable_source_text, pdf_data)

  output_stream = StringIO.new
  output_pdf.save(output_stream)
  output_stream
end
43
+
44
# Processes every page of the PDF, in order.
# Each page is retrieved through `PDFProcessor.get_page` and compressed along with its media box.
# @param pdf [Origami::PDF] The Origami PDF object to process.
# @param quality [Integer] Compression quality.
# @return [Array<Origami::Page>] Processed pages.
def self.process_pdf_pages(pdf, quality)
  pdf.pages.each_with_index.map do |page, page_index|
    page_stream = Mindee::PDF::PDFProcessor.get_page(pdf, page_index)
    process_pdf_page(page_stream, page_index, quality, page[:MediaBox])
  end
end
54
+
55
# Creates the output PDF out of the processed pages.
# The `pages` array is rotated in place before anything else happens.
# @param pages [Array<Origami::Page>] Processed pages.
# @param disable_source_text [bool] Whether to disable source text.
# @param pdf_data [StringIO] Original PDF data.
# @return [Origami::PDF] Output PDF object.
def self.create_output_pdf(pages, disable_source_text, pdf_data)
  output_pdf = Origami::PDF.new
  # NOTE(review): with two pages or more, the first page is moved to the end of the array.
  # Presumably this compensates for the page order produced by `append_page` - confirm.
  pages.rotate!(1) if pages.count > 1

  inject_text(pdf_data, pages) unless disable_source_text

  pages.each_with_object(output_pdf) { |page, document| document.append_page(page) }
end
70
+
71
# Extracts text from a source text PDF, and injects it into a newly-created one.
# Every text run of an original page is appended, as a text object, to the content stream of
# the page holding the same index in `pages`.
# @param pdf_data [StringIO] Stream representation of the PDF.
# @param pages [Array<Origami::Page>] Array of pages containing the rasterized version of the initial pages.
def self.inject_text(pdf_data, pages)
  reader = PDFReader::Reader.new(pdf_data)

  reader.pages.each_with_index do |original_page, index|
    break if index >= pages.length

    receiver = PDFReader::Reader::PageTextReceiver.new
    original_page.walk(receiver)

    receiver.runs.each do |text_run|
      x = text_run.origin.x
      y = text_run.origin.y
      # The text ends up inside a PDF literal string, which is delimited by parentheses:
      # backslashes and parentheses must be escaped or they corrupt the content stream.
      text = text_run.text.gsub(/[\\()]/) { |char| "\\#{char}" }
      font_size = text_run.font_size

      content_stream = Origami::Stream.new
      content_stream.dictionary[:Filter] = :FlateDecode
      content_stream.data = "BT\n/F1 #{font_size} Tf\n#{x} #{y} Td\n(#{text}) Tj\nET\n"

      pages[index].Contents.data += content_stream.data
    end
  end
end
97
+
98
# Takes in a page stream, converts it to an image, and applies the result onto a new Origami PDF page.
# @param page_stream [StringIO] Stream representation of a single page from the initial PDF.
# @param page_index [Integer] Index of the current page, used to name the image XObject (`X1`, `X2`...).
# @param image_quality [Integer] Quality to apply to the rasterized page.
# @param media_box [Array<Integer>, nil] Extracted media box from the page. Can be nil.
# @return [Origami::Page]
def self.process_pdf_page(page_stream, page_index, image_quality, media_box)
  page = Origami::Page.new
  image = Mindee::Image::ImageUtils.pdf_to_magick_image(page_stream, image_quality)
  width, height = Mindee::Image::ImageUtils.calculate_dimensions_from_media_box(image, media_box)

  xobject = PDF::PDFTools.create_xobject(image)
  PDF::PDFTools.set_xobject_properties(xobject, image)

  xobject_name = "X#{page_index + 1}"
  PDF::PDFTools.add_content_to_page(page, xobject_name, width, height)
  page.add_xobject(xobject, xobject_name)

  PDF::PDFTools.set_page_dimensions(page, width, height)
  page
end
119
+ end
120
+ end
121
+ end