mindee-lite 5.0.0.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (510) hide show
  1. checksums.yaml +7 -0
  2. data/.editorconfig +24 -0
  3. data/.gitattributes +14 -0
  4. data/.gitignore +76 -0
  5. data/.gitmodules +3 -0
  6. data/.pre-commit-config.yaml +36 -0
  7. data/.rubocop.yml +49 -0
  8. data/.yardopts +4 -0
  9. data/CHANGELOG.md +515 -0
  10. data/CODE_OF_CONDUCT.md +129 -0
  11. data/CONTRIBUTING.md +107 -0
  12. data/Gemfile +14 -0
  13. data/LICENSE +21 -0
  14. data/README.md +42 -0
  15. data/Rakefile +40 -0
  16. data/Steepfile +30 -0
  17. data/bin/console +14 -0
  18. data/bin/mindee.rb +30 -0
  19. data/bin/v1/parser.rb +153 -0
  20. data/bin/v1/products.rb +88 -0
  21. data/bin/v2/parser.rb +235 -0
  22. data/bin/v2/products.rb +34 -0
  23. data/docs/code_samples/bank_account_details_v1.txt +24 -0
  24. data/docs/code_samples/bank_account_details_v2.txt +24 -0
  25. data/docs/code_samples/bank_statement_fr_v2_async.txt +24 -0
  26. data/docs/code_samples/barcode_reader_v1.txt +24 -0
  27. data/docs/code_samples/cropper_v1.txt +21 -0
  28. data/docs/code_samples/default.txt +30 -0
  29. data/docs/code_samples/default_async.txt +29 -0
  30. data/docs/code_samples/expense_receipts_v5.txt +25 -0
  31. data/docs/code_samples/expense_receipts_v5_async.txt +24 -0
  32. data/docs/code_samples/financial_document_v1.txt +25 -0
  33. data/docs/code_samples/financial_document_v1_async.txt +24 -0
  34. data/docs/code_samples/idcard_fr_v1.txt +24 -0
  35. data/docs/code_samples/idcard_fr_v2.txt +24 -0
  36. data/docs/code_samples/international_id_v2_async.txt +24 -0
  37. data/docs/code_samples/invoice_splitter_v1_async.txt +24 -0
  38. data/docs/code_samples/invoices_v4.txt +25 -0
  39. data/docs/code_samples/invoices_v4_async.txt +24 -0
  40. data/docs/code_samples/multi_receipts_detector_v1.txt +24 -0
  41. data/docs/code_samples/passport_v1.txt +24 -0
  42. data/docs/code_samples/resume_v1_async.txt +24 -0
  43. data/docs/code_samples/v2_classification.txt +30 -0
  44. data/docs/code_samples/v2_crop.txt +30 -0
  45. data/docs/code_samples/v2_extraction.txt +42 -0
  46. data/docs/code_samples/v2_extraction_webhook.txt +45 -0
  47. data/docs/code_samples/v2_ocr.txt +30 -0
  48. data/docs/code_samples/v2_split.txt +30 -0
  49. data/docs/code_samples/workflow_execution.txt +28 -0
  50. data/docs/code_samples/workflow_polling.txt +35 -0
  51. data/examples/auto_invoice_splitter_extraction.rb +48 -0
  52. data/examples/auto_multi_receipts_detector_extraction.rb +30 -0
  53. data/lib/mindee/dependency.rb +29 -0
  54. data/lib/mindee/error/mindee_error.rb +17 -0
  55. data/lib/mindee/error/mindee_http_error.rb +36 -0
  56. data/lib/mindee/error/mindee_http_error_v2.rb +45 -0
  57. data/lib/mindee/error/mindee_http_unknown_error_v2.rb +18 -0
  58. data/lib/mindee/error/mindee_input_error.rb +30 -0
  59. data/lib/mindee/error.rb +6 -0
  60. data/lib/mindee/geometry/min_max.rb +23 -0
  61. data/lib/mindee/geometry/point.rb +41 -0
  62. data/lib/mindee/geometry/polygon.rb +37 -0
  63. data/lib/mindee/geometry/quadrilateral.rb +50 -0
  64. data/lib/mindee/geometry/utils.rb +88 -0
  65. data/lib/mindee/geometry.rb +7 -0
  66. data/lib/mindee/http/.rubocop.yml +7 -0
  67. data/lib/mindee/http/http_error_handler.rb +106 -0
  68. data/lib/mindee/http/response_validation.rb +81 -0
  69. data/lib/mindee/http.rb +3 -0
  70. data/lib/mindee/image/extracted_image.rb +89 -0
  71. data/lib/mindee/image/image_compressor.rb +29 -0
  72. data/lib/mindee/image/image_extractor.rb +118 -0
  73. data/lib/mindee/image/image_utils.rb +165 -0
  74. data/lib/mindee/image.rb +6 -0
  75. data/lib/mindee/input/base_parameters.rb +149 -0
  76. data/lib/mindee/input/local_response.rb +80 -0
  77. data/lib/mindee/input/polling_options.rb +26 -0
  78. data/lib/mindee/input/sources/base64_input_source.rb +31 -0
  79. data/lib/mindee/input/sources/bytes_input_source.rb +21 -0
  80. data/lib/mindee/input/sources/file_input_source.rb +20 -0
  81. data/lib/mindee/input/sources/local_input_source.rb +216 -0
  82. data/lib/mindee/input/sources/path_input_source.rb +20 -0
  83. data/lib/mindee/input/sources/url_input_source.rb +130 -0
  84. data/lib/mindee/input/sources.rb +8 -0
  85. data/lib/mindee/input.rb +4 -0
  86. data/lib/mindee/logging/logger.rb +24 -0
  87. data/lib/mindee/logging.rb +3 -0
  88. data/lib/mindee/page_options.rb +24 -0
  89. data/lib/mindee/pdf/extracted_pdf.rb +70 -0
  90. data/lib/mindee/pdf/pdf_compressor.rb +121 -0
  91. data/lib/mindee/pdf/pdf_extractor.rb +121 -0
  92. data/lib/mindee/pdf/pdf_processor.rb +91 -0
  93. data/lib/mindee/pdf/pdf_tools.rb +201 -0
  94. data/lib/mindee/pdf.rb +7 -0
  95. data/lib/mindee/v1/client.rb +490 -0
  96. data/lib/mindee/v1/extraction/multi_receipts_extractor.rb +32 -0
  97. data/lib/mindee/v1/extraction.rb +3 -0
  98. data/lib/mindee/v1/http/.rubocop.yml +7 -0
  99. data/lib/mindee/v1/http/endpoint.rb +221 -0
  100. data/lib/mindee/v1/http/workflow_endpoint.rb +93 -0
  101. data/lib/mindee/v1/http.rb +4 -0
  102. data/lib/mindee/v1/parsing/common/api_request.rb +38 -0
  103. data/lib/mindee/v1/parsing/common/api_response.rb +63 -0
  104. data/lib/mindee/v1/parsing/common/document.rb +86 -0
  105. data/lib/mindee/v1/parsing/common/execution.rb +78 -0
  106. data/lib/mindee/v1/parsing/common/execution_file.rb +26 -0
  107. data/lib/mindee/v1/parsing/common/execution_priority.rb +38 -0
  108. data/lib/mindee/v1/parsing/common/extras/cropper_extra.rb +32 -0
  109. data/lib/mindee/v1/parsing/common/extras/extras.rb +62 -0
  110. data/lib/mindee/v1/parsing/common/extras/full_text_ocr_extra.rb +35 -0
  111. data/lib/mindee/v1/parsing/common/extras/rag_extra.rb +28 -0
  112. data/lib/mindee/v1/parsing/common/extras.rb +6 -0
  113. data/lib/mindee/v1/parsing/common/inference.rb +69 -0
  114. data/lib/mindee/v1/parsing/common/job.rb +48 -0
  115. data/lib/mindee/v1/parsing/common/ocr/mvision_v1.rb +52 -0
  116. data/lib/mindee/v1/parsing/common/ocr/ocr.rb +180 -0
  117. data/lib/mindee/v1/parsing/common/ocr.rb +3 -0
  118. data/lib/mindee/v1/parsing/common/orientation.rb +28 -0
  119. data/lib/mindee/v1/parsing/common/page.rb +49 -0
  120. data/lib/mindee/v1/parsing/common/prediction.rb +19 -0
  121. data/lib/mindee/v1/parsing/common/product.rb +26 -0
  122. data/lib/mindee/v1/parsing/common/workflow_response.rb +30 -0
  123. data/lib/mindee/v1/parsing/common.rb +15 -0
  124. data/lib/mindee/v1/parsing/standard/abstract_field.rb +74 -0
  125. data/lib/mindee/v1/parsing/standard/address_field.rb +51 -0
  126. data/lib/mindee/v1/parsing/standard/amount_field.rb +28 -0
  127. data/lib/mindee/v1/parsing/standard/base_field.rb +30 -0
  128. data/lib/mindee/v1/parsing/standard/boolean_field.rb +29 -0
  129. data/lib/mindee/v1/parsing/standard/classification_field.rb +18 -0
  130. data/lib/mindee/v1/parsing/standard/company_registration_field.rb +45 -0
  131. data/lib/mindee/v1/parsing/standard/date_field.rb +40 -0
  132. data/lib/mindee/v1/parsing/standard/feature_field.rb +26 -0
  133. data/lib/mindee/v1/parsing/standard/locale_field.rb +52 -0
  134. data/lib/mindee/v1/parsing/standard/payment_details_field.rb +44 -0
  135. data/lib/mindee/v1/parsing/standard/position_field.rb +61 -0
  136. data/lib/mindee/v1/parsing/standard/string_field.rb +26 -0
  137. data/lib/mindee/v1/parsing/standard/tax_field.rb +110 -0
  138. data/lib/mindee/v1/parsing/standard.rb +15 -0
  139. data/lib/mindee/v1/parsing/universal/universal_list_field.rb +60 -0
  140. data/lib/mindee/v1/parsing/universal/universal_object_field.rb +123 -0
  141. data/lib/mindee/v1/parsing/universal.rb +4 -0
  142. data/lib/mindee/v1/parsing.rb +5 -0
  143. data/lib/mindee/v1/product/.rubocop.yml +12 -0
  144. data/lib/mindee/v1/product/barcode_reader/barcode_reader_v1.rb +47 -0
  145. data/lib/mindee/v1/product/barcode_reader/barcode_reader_v1_document.rb +47 -0
  146. data/lib/mindee/v1/product/barcode_reader/barcode_reader_v1_page.rb +38 -0
  147. data/lib/mindee/v1/product/cropper/cropper_v1.rb +47 -0
  148. data/lib/mindee/v1/product/cropper/cropper_v1_document.rb +15 -0
  149. data/lib/mindee/v1/product/cropper/cropper_v1_page.rb +55 -0
  150. data/lib/mindee/v1/product/financial_document/financial_document_v1.rb +47 -0
  151. data/lib/mindee/v1/product/financial_document/financial_document_v1_document.rb +329 -0
  152. data/lib/mindee/v1/product/financial_document/financial_document_v1_line_item.rb +124 -0
  153. data/lib/mindee/v1/product/financial_document/financial_document_v1_line_items.rb +64 -0
  154. data/lib/mindee/v1/product/financial_document/financial_document_v1_page.rb +38 -0
  155. data/lib/mindee/v1/product/fr/bank_account_details/bank_account_details_v1.rb +49 -0
  156. data/lib/mindee/v1/product/fr/bank_account_details/bank_account_details_v1_document.rb +49 -0
  157. data/lib/mindee/v1/product/fr/bank_account_details/bank_account_details_v1_page.rb +40 -0
  158. data/lib/mindee/v1/product/fr/bank_account_details/bank_account_details_v2.rb +49 -0
  159. data/lib/mindee/v1/product/fr/bank_account_details/bank_account_details_v2_bban.rb +63 -0
  160. data/lib/mindee/v1/product/fr/bank_account_details/bank_account_details_v2_document.rb +60 -0
  161. data/lib/mindee/v1/product/fr/bank_account_details/bank_account_details_v2_page.rb +40 -0
  162. data/lib/mindee/v1/product/fr/bank_statement/bank_statement_v2.rb +49 -0
  163. data/lib/mindee/v1/product/fr/bank_statement/bank_statement_v2_document.rb +169 -0
  164. data/lib/mindee/v1/product/fr/bank_statement/bank_statement_v2_page.rb +40 -0
  165. data/lib/mindee/v1/product/fr/bank_statement/bank_statement_v2_transaction.rb +78 -0
  166. data/lib/mindee/v1/product/fr/bank_statement/bank_statement_v2_transactions.rb +56 -0
  167. data/lib/mindee/v1/product/fr/id_card/id_card_v1.rb +49 -0
  168. data/lib/mindee/v1/product/fr/id_card/id_card_v1_document.rb +106 -0
  169. data/lib/mindee/v1/product/fr/id_card/id_card_v1_page.rb +57 -0
  170. data/lib/mindee/v1/product/fr/id_card/id_card_v2.rb +49 -0
  171. data/lib/mindee/v1/product/fr/id_card/id_card_v2_document.rb +143 -0
  172. data/lib/mindee/v1/product/fr/id_card/id_card_v2_page.rb +65 -0
  173. data/lib/mindee/v1/product/international_id/international_id_v2.rb +47 -0
  174. data/lib/mindee/v1/product/international_id/international_id_v2_document.rb +164 -0
  175. data/lib/mindee/v1/product/international_id/international_id_v2_page.rb +38 -0
  176. data/lib/mindee/v1/product/invoice/invoice_v4.rb +47 -0
  177. data/lib/mindee/v1/product/invoice/invoice_v4_document.rb +300 -0
  178. data/lib/mindee/v1/product/invoice/invoice_v4_line_item.rb +124 -0
  179. data/lib/mindee/v1/product/invoice/invoice_v4_line_items.rb +64 -0
  180. data/lib/mindee/v1/product/invoice/invoice_v4_page.rb +38 -0
  181. data/lib/mindee/v1/product/invoice_splitter/invoice_splitter_v1.rb +47 -0
  182. data/lib/mindee/v1/product/invoice_splitter/invoice_splitter_v1_document.rb +66 -0
  183. data/lib/mindee/v1/product/invoice_splitter/invoice_splitter_v1_invoice_page_group.rb +58 -0
  184. data/lib/mindee/v1/product/invoice_splitter/invoice_splitter_v1_invoice_page_groups.rb +50 -0
  185. data/lib/mindee/v1/product/invoice_splitter/invoice_splitter_v1_page.rb +38 -0
  186. data/lib/mindee/v1/product/multi_receipts_detector/multi_receipts_detector_v1.rb +47 -0
  187. data/lib/mindee/v1/product/multi_receipts_detector/multi_receipts_detector_v1_document.rb +38 -0
  188. data/lib/mindee/v1/product/multi_receipts_detector/multi_receipts_detector_v1_page.rb +38 -0
  189. data/lib/mindee/v1/product/passport/passport_v1.rb +47 -0
  190. data/lib/mindee/v1/product/passport/passport_v1_document.rb +112 -0
  191. data/lib/mindee/v1/product/passport/passport_v1_page.rb +38 -0
  192. data/lib/mindee/v1/product/receipt/receipt_v5.rb +47 -0
  193. data/lib/mindee/v1/product/receipt/receipt_v5_document.rb +187 -0
  194. data/lib/mindee/v1/product/receipt/receipt_v5_line_item.rb +88 -0
  195. data/lib/mindee/v1/product/receipt/receipt_v5_line_items.rb +56 -0
  196. data/lib/mindee/v1/product/receipt/receipt_v5_page.rb +38 -0
  197. data/lib/mindee/v1/product/resume/resume_v1.rb +47 -0
  198. data/lib/mindee/v1/product/resume/resume_v1_certificate.rb +82 -0
  199. data/lib/mindee/v1/product/resume/resume_v1_certificates.rb +60 -0
  200. data/lib/mindee/v1/product/resume/resume_v1_document.rb +340 -0
  201. data/lib/mindee/v1/product/resume/resume_v1_education.rb +106 -0
  202. data/lib/mindee/v1/product/resume/resume_v1_educations.rb +66 -0
  203. data/lib/mindee/v1/product/resume/resume_v1_language.rb +66 -0
  204. data/lib/mindee/v1/product/resume/resume_v1_languages.rb +56 -0
  205. data/lib/mindee/v1/product/resume/resume_v1_page.rb +38 -0
  206. data/lib/mindee/v1/product/resume/resume_v1_professional_experience.rb +122 -0
  207. data/lib/mindee/v1/product/resume/resume_v1_professional_experiences.rb +70 -0
  208. data/lib/mindee/v1/product/resume/resume_v1_social_networks_url.rb +66 -0
  209. data/lib/mindee/v1/product/resume/resume_v1_social_networks_urls.rb +56 -0
  210. data/lib/mindee/v1/product/universal/universal.rb +48 -0
  211. data/lib/mindee/v1/product/universal/universal_document.rb +35 -0
  212. data/lib/mindee/v1/product/universal/universal_page.rb +54 -0
  213. data/lib/mindee/v1/product/universal/universal_prediction.rb +128 -0
  214. data/lib/mindee/v1/product.rb +18 -0
  215. data/lib/mindee/v1.rb +7 -0
  216. data/lib/mindee/v2/client.rb +132 -0
  217. data/lib/mindee/v2/file_operation/crop.rb +51 -0
  218. data/lib/mindee/v2/file_operation/crop_files.rb +25 -0
  219. data/lib/mindee/v2/file_operation/split.rb +37 -0
  220. data/lib/mindee/v2/file_operation/split_files.rb +25 -0
  221. data/lib/mindee/v2/file_operation.rb +6 -0
  222. data/lib/mindee/v2/http/.rubocop.yml +7 -0
  223. data/lib/mindee/v2/http/api_v2_settings.rb +65 -0
  224. data/lib/mindee/v2/http/mindee_api_v2.rb +230 -0
  225. data/lib/mindee/v2/http.rb +4 -0
  226. data/lib/mindee/v2/parsing/base_inference.rb +44 -0
  227. data/lib/mindee/v2/parsing/base_response.rb +15 -0
  228. data/lib/mindee/v2/parsing/common_response.rb +20 -0
  229. data/lib/mindee/v2/parsing/error_item.rb +21 -0
  230. data/lib/mindee/v2/parsing/error_response.rb +51 -0
  231. data/lib/mindee/v2/parsing/field/base_field.rb +63 -0
  232. data/lib/mindee/v2/parsing/field/field_confidence.rb +128 -0
  233. data/lib/mindee/v2/parsing/field/field_location.rb +33 -0
  234. data/lib/mindee/v2/parsing/field/inference_fields.rb +105 -0
  235. data/lib/mindee/v2/parsing/field/list_field.rb +79 -0
  236. data/lib/mindee/v2/parsing/field/object_field.rb +138 -0
  237. data/lib/mindee/v2/parsing/field/simple_field.rb +60 -0
  238. data/lib/mindee/v2/parsing/field.rb +9 -0
  239. data/lib/mindee/v2/parsing/inference_active_options.rb +67 -0
  240. data/lib/mindee/v2/parsing/inference_file.rb +38 -0
  241. data/lib/mindee/v2/parsing/inference_job.rb +25 -0
  242. data/lib/mindee/v2/parsing/inference_model.rb +30 -0
  243. data/lib/mindee/v2/parsing/job.rb +93 -0
  244. data/lib/mindee/v2/parsing/job_response.rb +30 -0
  245. data/lib/mindee/v2/parsing/job_webhook.rb +59 -0
  246. data/lib/mindee/v2/parsing/rag_metadata.rb +17 -0
  247. data/lib/mindee/v2/parsing/raw_text.rb +27 -0
  248. data/lib/mindee/v2/parsing/raw_text_page.rb +24 -0
  249. data/lib/mindee/v2/parsing/search/pagination_metadata.rb +44 -0
  250. data/lib/mindee/v2/parsing/search/search_model.rb +38 -0
  251. data/lib/mindee/v2/parsing/search/search_models.rb +34 -0
  252. data/lib/mindee/v2/parsing/search/search_response.rb +38 -0
  253. data/lib/mindee/v2/parsing/search.rb +6 -0
  254. data/lib/mindee/v2/parsing.rb +16 -0
  255. data/lib/mindee/v2/product/base_product.rb +28 -0
  256. data/lib/mindee/v2/product/classification/classification.rb +20 -0
  257. data/lib/mindee/v2/product/classification/classification_classifier.rb +25 -0
  258. data/lib/mindee/v2/product/classification/classification_inference.rb +35 -0
  259. data/lib/mindee/v2/product/classification/classification_response.rb +32 -0
  260. data/lib/mindee/v2/product/classification/classification_result.rb +27 -0
  261. data/lib/mindee/v2/product/classification/params/classification_parameters.rb +47 -0
  262. data/lib/mindee/v2/product/crop/crop.rb +20 -0
  263. data/lib/mindee/v2/product/crop/crop_inference.rb +34 -0
  264. data/lib/mindee/v2/product/crop/crop_item.rb +39 -0
  265. data/lib/mindee/v2/product/crop/crop_response.rb +40 -0
  266. data/lib/mindee/v2/product/crop/crop_result.rb +34 -0
  267. data/lib/mindee/v2/product/crop/params/crop_parameters.rb +47 -0
  268. data/lib/mindee/v2/product/extraction/extraction.rb +21 -0
  269. data/lib/mindee/v2/product/extraction/extraction_inference.rb +40 -0
  270. data/lib/mindee/v2/product/extraction/extraction_response.rb +32 -0
  271. data/lib/mindee/v2/product/extraction/extraction_result.rb +44 -0
  272. data/lib/mindee/v2/product/extraction/params/data_schema.rb +51 -0
  273. data/lib/mindee/v2/product/extraction/params/data_schema_field.rb +69 -0
  274. data/lib/mindee/v2/product/extraction/params/data_schema_replace.rb +39 -0
  275. data/lib/mindee/v2/product/extraction/params/extraction_parameters.rb +125 -0
  276. data/lib/mindee/v2/product/ocr/ocr.rb +20 -0
  277. data/lib/mindee/v2/product/ocr/ocr_inference.rb +34 -0
  278. data/lib/mindee/v2/product/ocr/ocr_page.rb +33 -0
  279. data/lib/mindee/v2/product/ocr/ocr_response.rb +32 -0
  280. data/lib/mindee/v2/product/ocr/ocr_result.rb +34 -0
  281. data/lib/mindee/v2/product/ocr/ocr_word.rb +29 -0
  282. data/lib/mindee/v2/product/ocr/params/ocr_parameters.rb +47 -0
  283. data/lib/mindee/v2/product/split/params/split_parameters.rb +48 -0
  284. data/lib/mindee/v2/product/split/split.rb +19 -0
  285. data/lib/mindee/v2/product/split/split_inference.rb +34 -0
  286. data/lib/mindee/v2/product/split/split_range.rb +38 -0
  287. data/lib/mindee/v2/product/split/split_response.rb +40 -0
  288. data/lib/mindee/v2/product/split/split_result.rb +34 -0
  289. data/lib/mindee/v2/product.rb +7 -0
  290. data/lib/mindee/v2.rb +7 -0
  291. data/lib/mindee/version.rb +26 -0
  292. data/lib/mindee.rb +135 -0
  293. data/mindee-lite.gemspec +36 -0
  294. data/mindee.gemspec +44 -0
  295. data/sig/custom/marcel.rbs +3 -0
  296. data/sig/custom/mini_magick.rbs +31 -0
  297. data/sig/custom/net_http.rbs +43 -0
  298. data/sig/custom/origami.rbs +59 -0
  299. data/sig/mindee/dependency.rbs +13 -0
  300. data/sig/mindee/error/mindee_error.rbs +13 -0
  301. data/sig/mindee/error/mindee_http_error.rbs +17 -0
  302. data/sig/mindee/error/mindee_http_error_v2.rbs +15 -0
  303. data/sig/mindee/error/mindee_http_unknown_error_v2.rbs +9 -0
  304. data/sig/mindee/error/mindee_input_error.rbs +18 -0
  305. data/sig/mindee/geometry/min_max.rbs +11 -0
  306. data/sig/mindee/geometry/point.rbs +14 -0
  307. data/sig/mindee/geometry/polygon.rbs +12 -0
  308. data/sig/mindee/geometry/quadrilateral.rbs +15 -0
  309. data/sig/mindee/geometry/utils.rbs +13 -0
  310. data/sig/mindee/http/http_error_handler.rbs +15 -0
  311. data/sig/mindee/http/response_validation.rbs +11 -0
  312. data/sig/mindee/image/extracted_image.rbs +21 -0
  313. data/sig/mindee/image/image_compressor.rbs +8 -0
  314. data/sig/mindee/image/image_extractor.rbs +13 -0
  315. data/sig/mindee/image/image_utils.rbs +19 -0
  316. data/sig/mindee/input/base_parameters.rbs +35 -0
  317. data/sig/mindee/input/local_response.rbs +14 -0
  318. data/sig/mindee/input/polling_options.rbs +12 -0
  319. data/sig/mindee/input/sources/base64_input_source.rbs +11 -0
  320. data/sig/mindee/input/sources/bytes_input_source.rbs +10 -0
  321. data/sig/mindee/input/sources/file_input_source.rbs +10 -0
  322. data/sig/mindee/input/sources/local_input_source.rbs +30 -0
  323. data/sig/mindee/input/sources/path_input_source.rbs +10 -0
  324. data/sig/mindee/input/sources/url_input_source.rbs +20 -0
  325. data/sig/mindee/logging/logger.rbs +11 -0
  326. data/sig/mindee/page_options.rbs +11 -0
  327. data/sig/mindee/pdf/extracted_pdf.rbs +17 -0
  328. data/sig/mindee/pdf/pdf_compressor.rbs +15 -0
  329. data/sig/mindee/pdf/pdf_extractor.rbs +19 -0
  330. data/sig/mindee/pdf/pdf_processor.rbs +12 -0
  331. data/sig/mindee/pdf/pdf_tools.rbs +31 -0
  332. data/sig/mindee/v1/client.rbs +84 -0
  333. data/sig/mindee/v1/extraction/multi_receipts_extractor.rbs +8 -0
  334. data/sig/mindee/v1/http/endpoint.rbs +41 -0
  335. data/sig/mindee/v1/http/workflow_endpoint.rbs +22 -0
  336. data/sig/mindee/v1/parsing/common/api_request.rbs +22 -0
  337. data/sig/mindee/v1/parsing/common/api_response.rbs +31 -0
  338. data/sig/mindee/v1/parsing/common/document.rbs +32 -0
  339. data/sig/mindee/v1/parsing/common/execution.rbs +26 -0
  340. data/sig/mindee/v1/parsing/common/execution_file.rbs +16 -0
  341. data/sig/mindee/v1/parsing/common/execution_priority.rbs +16 -0
  342. data/sig/mindee/v1/parsing/common/extras/cropper_extra.rbs +18 -0
  343. data/sig/mindee/v1/parsing/common/extras/extras.rbs +24 -0
  344. data/sig/mindee/v1/parsing/common/extras/full_text_ocr_extra.rbs +22 -0
  345. data/sig/mindee/v1/parsing/common/extras/rag_extra.rbs +19 -0
  346. data/sig/mindee/v1/parsing/common/inference.rbs +31 -0
  347. data/sig/mindee/v1/parsing/common/job.rbs +24 -0
  348. data/sig/mindee/v1/parsing/common/ocr/mvision_v1.rbs +20 -0
  349. data/sig/mindee/v1/parsing/common/ocr/ocr.rbs +56 -0
  350. data/sig/mindee/v1/parsing/common/orientation.rbs +15 -0
  351. data/sig/mindee/v1/parsing/common/page.rbs +19 -0
  352. data/sig/mindee/v1/parsing/common/prediction.rbs +14 -0
  353. data/sig/mindee/v1/parsing/common/product.rbs +16 -0
  354. data/sig/mindee/v1/parsing/common/workflow_response.rbs +22 -0
  355. data/sig/mindee/v1/parsing/standard/abstract_field.rbs +30 -0
  356. data/sig/mindee/v1/parsing/standard/address_field.rbs +28 -0
  357. data/sig/mindee/v1/parsing/standard/amount_field.rbs +16 -0
  358. data/sig/mindee/v1/parsing/standard/base_field.rbs +16 -0
  359. data/sig/mindee/v1/parsing/standard/boolean_field.rbs +16 -0
  360. data/sig/mindee/v1/parsing/standard/classification_field.rbs +12 -0
  361. data/sig/mindee/v1/parsing/standard/company_registration_field.rbs +20 -0
  362. data/sig/mindee/v1/parsing/standard/date_field.rbs +20 -0
  363. data/sig/mindee/v1/parsing/standard/feature_field.rbs +12 -0
  364. data/sig/mindee/v1/parsing/standard/locale_field.rbs +24 -0
  365. data/sig/mindee/v1/parsing/standard/payment_details_field.rbs +19 -0
  366. data/sig/mindee/v1/parsing/standard/position_field.rbs +26 -0
  367. data/sig/mindee/v1/parsing/standard/string_field.rbs +16 -0
  368. data/sig/mindee/v1/parsing/standard/tax_field.rbs +33 -0
  369. data/sig/mindee/v1/parsing/universal/universal_list_field.rbs +21 -0
  370. data/sig/mindee/v1/parsing/universal/universal_object_field.rbs +38 -0
  371. data/sig/mindee/v1/product/barcode_reader/barcode_reader_v1.rbs +13 -0
  372. data/sig/mindee/v1/product/barcode_reader/barcode_reader_v1_document.rbs +16 -0
  373. data/sig/mindee/v1/product/barcode_reader/barcode_reader_v1_page.rbs +17 -0
  374. data/sig/mindee/v1/product/cropper/cropper_v1.rbs +13 -0
  375. data/sig/mindee/v1/product/cropper/cropper_v1_document.rbs +14 -0
  376. data/sig/mindee/v1/product/cropper/cropper_v1_page.rbs +19 -0
  377. data/sig/mindee/v1/product/financial_document/financial_document_v1.rbs +13 -0
  378. data/sig/mindee/v1/product/financial_document/financial_document_v1_document.rbs +49 -0
  379. data/sig/mindee/v1/product/financial_document/financial_document_v1_line_item.rbs +35 -0
  380. data/sig/mindee/v1/product/financial_document/financial_document_v1_line_items.rbs +15 -0
  381. data/sig/mindee/v1/product/financial_document/financial_document_v1_page.rbs +17 -0
  382. data/sig/mindee/v1/product/fr/bank_account_details/bank_account_details_v1.rbs +15 -0
  383. data/sig/mindee/v1/product/fr/bank_account_details/bank_account_details_v1_document.rbs +19 -0
  384. data/sig/mindee/v1/product/fr/bank_account_details/bank_account_details_v1_page.rbs +19 -0
  385. data/sig/mindee/v1/product/fr/bank_account_details/bank_account_details_v2.rbs +15 -0
  386. data/sig/mindee/v1/product/fr/bank_account_details/bank_account_details_v2_bban.rbs +25 -0
  387. data/sig/mindee/v1/product/fr/bank_account_details/bank_account_details_v2_document.rbs +20 -0
  388. data/sig/mindee/v1/product/fr/bank_account_details/bank_account_details_v2_page.rbs +19 -0
  389. data/sig/mindee/v1/product/fr/bank_statement/bank_statement_v2.rbs +15 -0
  390. data/sig/mindee/v1/product/fr/bank_statement/bank_statement_v2_document.rbs +31 -0
  391. data/sig/mindee/v1/product/fr/bank_statement/bank_statement_v2_page.rbs +19 -0
  392. data/sig/mindee/v1/product/fr/bank_statement/bank_statement_v2_transaction.rbs +27 -0
  393. data/sig/mindee/v1/product/fr/bank_statement/bank_statement_v2_transactions.rbs +17 -0
  394. data/sig/mindee/v1/product/fr/id_card/id_card_v1.rbs +15 -0
  395. data/sig/mindee/v1/product/fr/id_card/id_card_v1_document.rbs +26 -0
  396. data/sig/mindee/v1/product/fr/id_card/id_card_v1_page.rbs +20 -0
  397. data/sig/mindee/v1/product/fr/id_card/id_card_v2.rbs +15 -0
  398. data/sig/mindee/v1/product/fr/id_card/id_card_v2_document.rbs +31 -0
  399. data/sig/mindee/v1/product/fr/id_card/id_card_v2_page.rbs +21 -0
  400. data/sig/mindee/v1/product/international_id/international_id_v2.rbs +13 -0
  401. data/sig/mindee/v1/product/international_id/international_id_v2_document.rbs +31 -0
  402. data/sig/mindee/v1/product/international_id/international_id_v2_page.rbs +17 -0
  403. data/sig/mindee/v1/product/invoice/invoice_v4.rbs +13 -0
  404. data/sig/mindee/v1/product/invoice/invoice_v4_document.rbs +45 -0
  405. data/sig/mindee/v1/product/invoice/invoice_v4_line_item.rbs +35 -0
  406. data/sig/mindee/v1/product/invoice/invoice_v4_line_items.rbs +15 -0
  407. data/sig/mindee/v1/product/invoice/invoice_v4_page.rbs +17 -0
  408. data/sig/mindee/v1/product/invoice_splitter/invoice_splitter_v1.rbs +13 -0
  409. data/sig/mindee/v1/product/invoice_splitter/invoice_splitter_v1_document.rbs +17 -0
  410. data/sig/mindee/v1/product/invoice_splitter/invoice_splitter_v1_invoice_page_group.rbs +21 -0
  411. data/sig/mindee/v1/product/invoice_splitter/invoice_splitter_v1_invoice_page_groups.rbs +15 -0
  412. data/sig/mindee/v1/product/invoice_splitter/invoice_splitter_v1_page.rbs +17 -0
  413. data/sig/mindee/v1/product/multi_receipts_detector/multi_receipts_detector_v1.rbs +14 -0
  414. data/sig/mindee/v1/product/multi_receipts_detector/multi_receipts_detector_v1_document.rbs +15 -0
  415. data/sig/mindee/v1/product/multi_receipts_detector/multi_receipts_detector_v1_page.rbs +17 -0
  416. data/sig/mindee/v1/product/passport/passport_v1.rbs +13 -0
  417. data/sig/mindee/v1/product/passport/passport_v1_document.rbs +25 -0
  418. data/sig/mindee/v1/product/passport/passport_v1_page.rbs +17 -0
  419. data/sig/mindee/v1/product/receipt/receipt_v5.rbs +13 -0
  420. data/sig/mindee/v1/product/receipt/receipt_v5_document.rbs +33 -0
  421. data/sig/mindee/v1/product/receipt/receipt_v5_line_item.rbs +27 -0
  422. data/sig/mindee/v1/product/receipt/receipt_v5_line_items.rbs +15 -0
  423. data/sig/mindee/v1/product/receipt/receipt_v5_page.rbs +17 -0
  424. data/sig/mindee/v1/product/resume/resume_v1.rbs +13 -0
  425. data/sig/mindee/v1/product/resume/resume_v1_certificate.rbs +27 -0
  426. data/sig/mindee/v1/product/resume/resume_v1_certificates.rbs +17 -0
  427. data/sig/mindee/v1/product/resume/resume_v1_document.rbs +69 -0
  428. data/sig/mindee/v1/product/resume/resume_v1_education.rbs +33 -0
  429. data/sig/mindee/v1/product/resume/resume_v1_educations.rbs +17 -0
  430. data/sig/mindee/v1/product/resume/resume_v1_language.rbs +23 -0
  431. data/sig/mindee/v1/product/resume/resume_v1_languages.rbs +17 -0
  432. data/sig/mindee/v1/product/resume/resume_v1_page.rbs +19 -0
  433. data/sig/mindee/v1/product/resume/resume_v1_professional_experience.rbs +37 -0
  434. data/sig/mindee/v1/product/resume/resume_v1_professional_experiences.rbs +17 -0
  435. data/sig/mindee/v1/product/resume/resume_v1_social_networks_url.rbs +23 -0
  436. data/sig/mindee/v1/product/resume/resume_v1_social_networks_urls.rbs +17 -0
  437. data/sig/mindee/v1/product/universal/universal.rbs +16 -0
  438. data/sig/mindee/v1/product/universal/universal_document.rbs +12 -0
  439. data/sig/mindee/v1/product/universal/universal_page.rbs +18 -0
  440. data/sig/mindee/v1/product/universal/universal_prediction.rbs +30 -0
  441. data/sig/mindee/v2/client.rbs +29 -0
  442. data/sig/mindee/v2/file_operation/crop.rbs +10 -0
  443. data/sig/mindee/v2/file_operation/crop_files.rbs +9 -0
  444. data/sig/mindee/v2/file_operation/split.rbs +11 -0
  445. data/sig/mindee/v2/file_operation/split_files.rbs +9 -0
  446. data/sig/mindee/v2/http/api_v2_settings.rbs +27 -0
  447. data/sig/mindee/v2/http/mindee_api_v2.rbs +52 -0
  448. data/sig/mindee/v2/parsing/base_inference.rbs +18 -0
  449. data/sig/mindee/v2/parsing/base_response.rbs +11 -0
  450. data/sig/mindee/v2/parsing/common_response.rbs +12 -0
  451. data/sig/mindee/v2/parsing/error_item.rbs +13 -0
  452. data/sig/mindee/v2/parsing/error_response.rbs +20 -0
  453. data/sig/mindee/v2/parsing/field/base_field.rbs +17 -0
  454. data/sig/mindee/v2/parsing/field/field_confidence.rbs +30 -0
  455. data/sig/mindee/v2/parsing/field/field_location.rbs +16 -0
  456. data/sig/mindee/v2/parsing/field/inference_fields.rbs +20 -0
  457. data/sig/mindee/v2/parsing/field/list_field.rbs +23 -0
  458. data/sig/mindee/v2/parsing/field/object_field.rbs +27 -0
  459. data/sig/mindee/v2/parsing/field/simple_field.rbs +16 -0
  460. data/sig/mindee/v2/parsing/inference_active_options.rbs +26 -0
  461. data/sig/mindee/v2/parsing/inference_file.rbs +17 -0
  462. data/sig/mindee/v2/parsing/inference_job.rbs +13 -0
  463. data/sig/mindee/v2/parsing/inference_model.rbs +12 -0
  464. data/sig/mindee/v2/parsing/job.rbs +24 -0
  465. data/sig/mindee/v2/parsing/job_response.rbs +14 -0
  466. data/sig/mindee/v2/parsing/job_webhook.rbs +19 -0
  467. data/sig/mindee/v2/parsing/rag_metadata.rbs +13 -0
  468. data/sig/mindee/v2/parsing/raw_text.rbs +12 -0
  469. data/sig/mindee/v2/parsing/raw_text_page.rbs +11 -0
  470. data/sig/mindee/v2/parsing/search/pagination_metadata.rbs +20 -0
  471. data/sig/mindee/v2/parsing/search/search_model.rbs +19 -0
  472. data/sig/mindee/v2/parsing/search/search_response.rbs +17 -0
  473. data/sig/mindee/v2/parsing/search_models.rbs +14 -0
  474. data/sig/mindee/v2/product/base_product.rbs +19 -0
  475. data/sig/mindee/v2/product/classification/classification.rbs +10 -0
  476. data/sig/mindee/v2/product/classification/classification_classifier.rbs +15 -0
  477. data/sig/mindee/v2/product/classification/classification_inference.rbs +15 -0
  478. data/sig/mindee/v2/product/classification/classification_response.rbs +23 -0
  479. data/sig/mindee/v2/product/classification/classification_result.rbs +15 -0
  480. data/sig/mindee/v2/product/classification/params/classification_parameters/classification_parameters.rbs +23 -0
  481. data/sig/mindee/v2/product/crop/crop.rbs +10 -0
  482. data/sig/mindee/v2/product/crop/crop_inference.rbs +14 -0
  483. data/sig/mindee/v2/product/crop/crop_item.rbs +18 -0
  484. data/sig/mindee/v2/product/crop/crop_response.rbs +25 -0
  485. data/sig/mindee/v2/product/crop/crop_result.rbs +14 -0
  486. data/sig/mindee/v2/product/crop/params/crop_parameters/crop_parameters.rbs +23 -0
  487. data/sig/mindee/v2/product/extraction/extraction.rbs +15 -0
  488. data/sig/mindee/v2/product/extraction/extraction_inference.rbs +19 -0
  489. data/sig/mindee/v2/product/extraction/extraction_response.rbs +24 -0
  490. data/sig/mindee/v2/product/extraction/extraction_result.rbs +18 -0
  491. data/sig/mindee/v2/product/extraction/params/data_schema.rbs +21 -0
  492. data/sig/mindee/v2/product/extraction/params/data_schema_field.rbs +29 -0
  493. data/sig/mindee/v2/product/extraction/params/data_schema_replace.rbs +21 -0
  494. data/sig/mindee/v2/product/extraction/params/extraction_parameters.rbs +38 -0
  495. data/sig/mindee/v2/product/ocr/ocr.rbs +10 -0
  496. data/sig/mindee/v2/product/ocr/ocr_inference.rbs +14 -0
  497. data/sig/mindee/v2/product/ocr/ocr_page.rbs +15 -0
  498. data/sig/mindee/v2/product/ocr/ocr_response.rbs +23 -0
  499. data/sig/mindee/v2/product/ocr/ocr_result.rbs +14 -0
  500. data/sig/mindee/v2/product/ocr/ocr_word.rbs +15 -0
  501. data/sig/mindee/v2/product/ocr/params/ocr_parameters/ocr_parameters.rbs +24 -0
  502. data/sig/mindee/v2/product/split/params/split_parameters/split_parameters.rbs +23 -0
  503. data/sig/mindee/v2/product/split/split.rbs +10 -0
  504. data/sig/mindee/v2/product/split/split_inference.rbs +14 -0
  505. data/sig/mindee/v2/product/split/split_range.rbs +18 -0
  506. data/sig/mindee/v2/product/split/split_response.rbs +25 -0
  507. data/sig/mindee/v2/product/split/split_result.rbs +14 -0
  508. data/sig/mindee/version.rbs +6 -0
  509. data/sig/mindee.rbs +62 -0
  510. metadata +600 -0
@@ -0,0 +1,121 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Mindee
4
+ # Pdf Extraction Module.
5
+ module PDF
6
+ # Pdf extraction class.
7
+ class PDFExtractor
8
+ # @param local_input [Mindee::Input::Source::LocalInputSource]
9
+ def initialize(local_input)
10
+ unless Mindee::Dependency.all_deps_available?
11
+ raise NotImplementedError, Mindee::Dependency::MINDEE_DEPENDENCIES_LOAD_ERROR
12
+ end
13
+
14
+ @filename = local_input.filename
15
+ if local_input.pdf?
16
+ @source_pdf = local_input.io_stream
17
+ else
18
+ pdf_image = Image::ImageExtractor.attach_image_as_new_file(local_input.io_stream)
19
+ io_buffer = StringIO.new
20
+ pdf_image.save(io_buffer)
21
+
22
+ @source_pdf = io_buffer
23
+ end
24
+ end
25
+
26
+ # Retrieves the page count for the Pdf object.
27
+ # @return [Integer]
28
+ def page_count
29
+ Mindee::PDF::PDFProcessor.open_pdf(@source_pdf).pages.size
30
+ end
31
+
32
+ # Creates a new Pdf from pages and save it into a buffer.
33
+ # @param page_indexes [Array<Integer>] List of page number to use for merging in the original Pdf.
34
+ # @return [StringIO] The buffer containing the new Pdf.
35
+ def cut_pages(page_indexes)
36
+ options = PageOptions.new(params: {
37
+ page_indexes: page_indexes,
38
+ })
39
+
40
+ Mindee::PDF::PDFProcessor.parse(@source_pdf, options)
41
+ end
42
+
43
+ # Extract the sub-documents from the main pdf, based on the given list of page indexes.
44
+ # @param page_indexes [Array<Array<Integer>>] List of page number to use for merging in the original Pdf.
45
+ # @return [Array<Mindee::PDF::ExtractedPDF>] The buffer containing the new Pdf.
46
+ def extract_sub_documents(page_indexes)
47
+ extracted_pdfs = [] # @type var extracted_pdfs: Array[Mindee::PDF::ExtractedPDF]
48
+ extension = File.extname(@filename)
49
+ basename = File.basename(@filename, extension)
50
+ page_indexes.each do |page_index_list|
51
+ if page_index_list.nil? || page_index_list.empty?
52
+ raise Error::MindeePDFError, "Empty indexes aren't allowed for extraction #{page_index_list}"
53
+ end
54
+
55
+ page_index_list.each do |page_index|
56
+ if (page_index > page_count) || page_index.negative?
57
+ raise Error::MindeePDFError,
58
+ "Index #{page_index} is out of range."
59
+ end
60
+ end
61
+ formatted_max_index = format('%03d', page_index_list[-1] + 1).to_s
62
+ field_filename = "#{basename}_#{format('%03d',
63
+ page_index_list[0] + 1)}-#{formatted_max_index}#{extension}"
64
+ extracted_pdf = Mindee::PDF::ExtractedPDF.new(cut_pages(page_index_list),
65
+ field_filename)
66
+ extracted_pdfs << extracted_pdf
67
+ end
68
+ extracted_pdfs
69
+ end
70
+
71
+ # rubocop:disable Metrics/CyclomaticComplexity
72
+ # rubocop:disable Metrics/PerceivedComplexity
73
+
74
+ # Extracts invoices as complete PDFs from the document.
75
+ # @param page_indexes [Array<Array<Integer>, InvoiceSplitterV1InvoicePageGroup>]
76
+ # @param strict [bool]
77
+ # @return [Array<Mindee::PDF::ExtractedPDF>]
78
+ def extract_invoices(page_indexes, strict: false)
79
+ raise Error::MindeePDFError, 'No indexes provided.' if page_indexes.empty?
80
+
81
+ if page_indexes[0].is_a?(Array) && page_indexes[0].all?(Integer)
82
+ page_indexes_as_array = page_indexes # @type var page_indexes : Array[Array[Integer]]
83
+ return extract_sub_documents(page_indexes_as_array)
84
+ end
85
+ p_ids = page_indexes # @type var page_indexes: Product::InvoiceSplitter::InvoiceSplitterV1InvoicePageGroups
86
+ return extract_sub_documents(p_ids.map(&:page_indexes)) unless strict
87
+
88
+ correct_page_indexes = [] # @type var correct_page_indexes: Array[Array[Integer]]
89
+ current_list = [] # @type var current_list: Array[Integer]
90
+ previous_confidence = nil
91
+ p_ids.each_with_index do |p_i, i|
92
+ page_index = p_i # @type var page_index: Product::InvoiceSplitter::InvoiceSplitterV1InvoicePageGroup
93
+ confidence = page_index.confidence.to_f
94
+ page_list = page_index.page_indexes
95
+
96
+ if confidence >= 0.5 && previous_confidence.nil?
97
+ current_list = page_list
98
+ elsif confidence >= 0.5 && i < p_ids.length - 1
99
+ correct_page_indexes << current_list
100
+ current_list = page_list
101
+ elsif confidence < 0.5 && i == p_ids.length - 1
102
+ current_list.concat page_list
103
+ correct_page_indexes << current_list
104
+ else
105
+ correct_page_indexes << current_list
106
+ correct_page_indexes << page_list
107
+ end
108
+ previous_confidence = confidence
109
+ end
110
+ extract_sub_documents(correct_page_indexes)
111
+ end
112
+
113
+ # rubocop:enable Metrics/CyclomaticComplexity
114
+ # rubocop:enable Metrics/PerceivedComplexity
115
+
116
+ private
117
+
118
+ attr_reader :source_pdf, :filename
119
+ end
120
+ end
121
+ end
@@ -0,0 +1,91 @@
1
+ # frozen_string_literal: true
2
+
3
+ Mindee::Dependency.require_all_deps!
4
+ require 'origami'
5
+ require_relative 'pdf_tools'
6
+
7
+ module Mindee
8
+ module PDF
9
+ # PDF document processing
10
+ module PDFProcessor
11
+ Origami::PDF.class_eval { include PDFTools }
12
+ # @param io_stream [StreamIO]
13
+ # @param options [PageOptions, Hash]
14
+ # @return [StringIO]
15
+ def self.parse(io_stream, options)
16
+ current_pdf = open_pdf(io_stream)
17
+ pages_count = current_pdf.pages.size
18
+ return current_pdf.to_io_stream if options.on_min_pages.to_i > pages_count
19
+
20
+ all_pages = (0..(pages_count - 1)).to_a
21
+
22
+ if options.operation == :KEEP_ONLY
23
+ pages_to_remove = indexes_from_keep(options.page_indexes, all_pages)
24
+ elsif options.operation == :REMOVE
25
+ pages_to_remove = indexes_from_remove(options.page_indexes, all_pages)
26
+ else
27
+ raise ArgumentError, "operation must be one of :KEEP_ONLY or :REMOVE, sent '#{options.operation}'"
28
+ end
29
+
30
+ current_pdf.delete_pages_at(pages_to_remove) if pages_to_remove.to_a != all_pages.to_a
31
+ current_pdf.to_io_stream
32
+ end
33
+
34
+ # @param page_indexes [Array<Integer>]
35
+ # @param all_pages [Array<Integer>]
36
+ def self.indexes_from_keep(page_indexes, all_pages)
37
+ pages_to_keep = Set.new
38
+ page_indexes.each do |idx|
39
+ idx = (all_pages.length - (idx + 2)) if idx.negative?
40
+ page = all_pages[idx]
41
+ next if page.nil?
42
+
43
+ pages_to_keep << page
44
+ end
45
+ (all_pages.to_set - pages_to_keep).to_a
46
+ end
47
+
48
+ # @param page_indexes [Array<Integer>]
49
+ # @param all_pages [Array<Integer>]
50
+ def self.indexes_from_remove(page_indexes, all_pages)
51
+ pages_to_remove = Set.new
52
+ page_indexes.each do |idx|
53
+ idx = (all_pages.length - (idx + 2)) if idx.negative?
54
+ page = all_pages[idx]
55
+ next if page.nil?
56
+
57
+ pages_to_remove << page
58
+ end
59
+ end
60
+
61
+ # @param io_stream [StringIO]
62
+ # @return [Origami::PDF]
63
+ def self.open_pdf(io_stream)
64
+ unless PDFTools.pdf_header?(io_stream)
65
+ raise Origami::InvalidPDFError,
66
+ 'Input stream does not contain a PDF header.'
67
+ end
68
+
69
+ pdf_parser = Origami::PDF::LinearParser.new({ verbosity: Origami::Parser::VERBOSE_QUIET })
70
+ io_stream.seek(0)
71
+ pdf_parser.parse(io_stream)
72
+ end
73
+
74
+ # Retrieves a PDF document's page.
75
+ #
76
+ # @param [Origami::PDF] pdf_doc Origami PDF handle.
77
+ # @param [Integer] page_id Page ID.
78
+ # @return [StringIO]
79
+ def self.get_page(pdf_doc, page_id)
80
+ stream = StringIO.new
81
+ pdf_doc.save(stream)
82
+
83
+ options = PageOptions.new(params: {
84
+ page_indexes: [page_id - 1],
85
+ })
86
+
87
+ parse(stream, options)
88
+ end
89
+ end
90
+ end
91
+ end
@@ -0,0 +1,201 @@
1
+ # frozen_string_literal: true
2
+
3
+ Mindee::Dependency.require_all_deps!
4
+ require 'origami'
5
+
6
+ module Mindee
7
+ module PDF
8
+ # Collection of miscellaneous PDF operations,as well as some monkey-patching for Origami.
9
+ module PDFTools
10
+ # Converts the current PDF document into a binary-encoded StringIO stream.
11
+ #
12
+ # @param [Hash] params Optional settings to override default processing flags.
13
+ # - :delinearize [bool] (default: true) Whether to convert a linearized PDF to its full form.
14
+ # - :recompile [bool] (default: true) Whether to recompile the PDF after processing.
15
+ # - :decrypt [bool] (default: false) Whether to attempt to decrypt the PDF.
16
+ # - Other keys such as :intent, :rebuild_xrefs, :noindent, and :obfuscate may be modified automatically.
17
+ #
18
+ # @return [StringIO] A binary-encoded stream representing the processed PDF.
19
+ def to_io_stream(params = {})
20
+ options = {
21
+ delinearize: true,
22
+ recompile: true,
23
+ decrypt: false,
24
+ noindent: nil,
25
+ }
26
+ options.update(params)
27
+
28
+ if frozen? # incompatible flags with frozen doc (signed)
29
+ options[:recompile] = nil
30
+ options[:rebuild_xrefs] = nil
31
+ options[:noindent] = nil
32
+ options[:obfuscate] = false
33
+ end
34
+ load_all_objects unless @loaded
35
+
36
+ intents_as_pdfa1 if options[:intent].to_s =~ %r{pdf[/-]?A1?/i}
37
+ delinearize! if options[:delinearize] && linearized?
38
+ compile(options) if options[:recompile]
39
+
40
+ io_stream = StringIO.new(output(options))
41
+ io_stream.set_encoding Encoding::BINARY
42
+ io_stream
43
+ end
44
+
45
+ # Checks a PDFs stream content for text operators
46
+ # See https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf page 243-251.
47
+ # @param [StringIO] stream Stream object from a PDFs page.
48
+ # @return [bool] `true` if a text operator is found in the stream.
49
+ def self.stream_has_text?(stream)
50
+ data = stream.data
51
+ return false if data.nil? || data.empty?
52
+
53
+ text_operators = ['Tc', 'Tw', 'Th', 'TL', 'Tf', 'Tk', 'Tr', 'Tm', 'T*', 'Tj', 'TJ', "'", '"']
54
+ text_operators.any? { |op| data.include?(op) }
55
+ end
56
+
57
+ # Checks whether a stream contains a PDF header near the beginning.
58
+ # @param [StringIO] io_stream Binary-encoded stream.
59
+ # @param [Integer] maximum_offset Maximum allowed offset to find '%PDF-'.
60
+ # @return [bool] `true` when the stream appears to be a PDF.
61
+ def self.pdf_header?(io_stream, maximum_offset: 500)
62
+ initial_pos = nil
63
+ initial_pos = io_stream.pos if io_stream.respond_to?(:pos)
64
+ io_stream.seek(0)
65
+ io_stream.gets('%PDF-')
66
+ !(io_stream.eof? || io_stream.pos > maximum_offset)
67
+ rescue TypeError, IOError, SystemCallError
68
+ false
69
+ ensure
70
+ io_stream.seek(initial_pos) if !initial_pos.nil? && io_stream.respond_to?(:seek)
71
+ end
72
+
73
+ # Checks whether the file has source_text. Sends false if the file isn't a PDF.
74
+ # @param [StringIO] pdf_data Abinary-encoded stream representing the PDF file.
75
+ # @return [bool] `true` if the pdf has source text, false otherwise.
76
+ def self.source_text?(pdf_data)
77
+ return false unless pdf_header?(pdf_data)
78
+
79
+ begin
80
+ pdf_data.rewind
81
+ pdf = Origami::PDF.read(pdf_data)
82
+
83
+ pdf.each_page do |page|
84
+ next unless page[:Contents]
85
+
86
+ contents = page[:Contents].solve
87
+ contents = [contents] unless contents.is_a?(Origami::Array)
88
+
89
+ contents.each do |stream_ref|
90
+ stream = stream_ref.solve
91
+ return true if stream_has_text?(stream)
92
+ end
93
+ end
94
+
95
+ false
96
+ end
97
+
98
+ false
99
+ rescue Origami::InvalidPDFError
100
+ false
101
+ end
102
+
103
+ # Creates an image XObject from the provided image.
104
+ #
105
+ # Converts the given image to a binary stream using Mindee's image utilities, then creates
106
+ # an Origami::Graphics::ImageXObject with a JPEG filter.
107
+ #
108
+ # @param [MiniMagick::Image] image An image object with the necessary data and structure.
109
+ # @return [Origami::Graphics::ImageXObject] The created image XObject.
110
+ def self.create_xobject(image)
111
+ image_io = Mindee::Image::ImageUtils.image_to_stringio(image)
112
+ Origami::Graphics::ImageXObject.from_image_file(image_io, 'jpg')
113
+ end
114
+
115
+ # Sets properties on the provided image XObject based on image metadata.
116
+ #
117
+ # @param [Origami::Graphics::ImageXObject] xobject The image XObject to update.
118
+ # @param [Hash] image A hash containing image metadata (such as width, height, properties, etc.).
119
+ def self.set_xobject_properties(xobject, image)
120
+ xobject.dictionary[:BitsPerComponent] = 8
121
+ xobject.dictionary[:Filter] = determine_filter(image)
122
+ xobject.dictionary[:Width] = image[:width]
123
+ xobject.dictionary[:Height] = image[:height]
124
+ xobject.dictionary[:ColorSpace] = determine_colorspace(image)
125
+ end
126
+
127
+ # Determines the appropriate filter for an image based on its properties.
128
+ #
129
+ # @param [Hash] image The image data hash containing properties.
130
+ # @return [Symbol] One of :FlateDecode, :LZWDecode or :DCTDecode.
131
+ def self.determine_filter(image)
132
+ filter = image.data['properties']['filter']
133
+ case filter
134
+ when %r{Zip}i then :FlateDecode
135
+ when %r{LZW}i then :LZWDecode
136
+ else :DCTDecode
137
+ end
138
+ end
139
+
140
+ # Determines the colorspace for an image based on its metadata.
141
+ #
142
+ # @param [Hash] image The image data hash.
143
+ # @return [Symbol] One of :DeviceCMYK, :DeviceGray or :DeviceRGB.
144
+ def self.determine_colorspace(image)
145
+ colorspace = image.data['colorspace']
146
+ case colorspace
147
+ when 'CMYK' then :DeviceCMYK
148
+ when 'Gray', 'PseudoClass Gray' then :DeviceGray
149
+ else :DeviceRGB
150
+ end
151
+ end
152
+
153
+ # Adds a content stream to the specified PDF page to display an image XObject.
154
+ #
155
+ # @param [Origami::Page] page The PDF page to which content will be added.
156
+ # @param [String] xobject_name The name identifying the XObject.
157
+ # @param [Integer] width The width for the transformation matrix.
158
+ # @param [Integer] height The height for the transformation matrix.
159
+ def self.add_content_to_page(page, xobject_name, width, height)
160
+ content = "q\n#{width} 0 0 #{height} 0 0 cm\n/#{xobject_name} Do\nQ\n"
161
+ content_stream = Origami::Stream.new(content)
162
+ page.Contents = content_stream
163
+ end
164
+
165
+ # Sets the dimensions for the specified PDF page.
166
+ #
167
+ # @param [Origami::Page] page The PDF page whose dimensions are being set.
168
+ # @param [Numeric] width The target width of the page.
169
+ # @param [Numeric] height The target height of the page.
170
+ def self.set_page_dimensions(page, width, height)
171
+ page[:MediaBox] = [0, 0, width, height]
172
+ page[:CropBox] = [0, 0, width, height]
173
+ end
174
+
175
+ # Processes an image into an image XObject for PDF embedding.
176
+ #
177
+ # @param [MiniMagick::Image, StringIO] image_data The raw image data.
178
+ # @param [Integer] image_quality The quality setting for image compression.
179
+ # @param [Numeric] width The desired width of the output image.
180
+ # @param [Numeric] height The desired height of the output image.
181
+ # @return [Origami::Graphics::ImageXObject] The resulting image XObject.
182
+ def self.process_image_xobject(image_data, image_quality, width, height)
183
+ compressed_data = Image::ImageCompressor.compress_image(
184
+ image_data,
185
+ quality: image_quality,
186
+ max_width: width,
187
+ max_height: height
188
+ )
189
+
190
+ new_image = Origami::Graphics::ImageXObject.new
191
+ new_image.data = compressed_data
192
+ new_image.Width = width
193
+ new_image.Height = height
194
+ new_image.ColorSpace = :DeviceRGB
195
+ new_image.BitsPerComponent = 8
196
+
197
+ new_image
198
+ end
199
+ end
200
+ end
201
+ end
data/lib/mindee/pdf.rb ADDED
@@ -0,0 +1,7 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'pdf/extracted_pdf'
4
+ require_relative 'pdf/pdf_compressor'
5
+ require_relative 'pdf/pdf_extractor'
6
+ require_relative 'pdf/pdf_processor'
7
+ require_relative 'pdf/pdf_tools'