mindee-lite 5.0.0.beta1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.editorconfig +24 -0
- data/.gitattributes +14 -0
- data/.gitignore +76 -0
- data/.gitmodules +3 -0
- data/.pre-commit-config.yaml +36 -0
- data/.rubocop.yml +49 -0
- data/.yardopts +4 -0
- data/CHANGELOG.md +515 -0
- data/CODE_OF_CONDUCT.md +129 -0
- data/CONTRIBUTING.md +107 -0
- data/Gemfile +14 -0
- data/LICENSE +21 -0
- data/README.md +42 -0
- data/Rakefile +40 -0
- data/Steepfile +30 -0
- data/bin/console +14 -0
- data/bin/mindee.rb +30 -0
- data/bin/v1/parser.rb +153 -0
- data/bin/v1/products.rb +88 -0
- data/bin/v2/parser.rb +235 -0
- data/bin/v2/products.rb +34 -0
- data/docs/code_samples/bank_account_details_v1.txt +24 -0
- data/docs/code_samples/bank_account_details_v2.txt +24 -0
- data/docs/code_samples/bank_statement_fr_v2_async.txt +24 -0
- data/docs/code_samples/barcode_reader_v1.txt +24 -0
- data/docs/code_samples/cropper_v1.txt +21 -0
- data/docs/code_samples/default.txt +30 -0
- data/docs/code_samples/default_async.txt +29 -0
- data/docs/code_samples/expense_receipts_v5.txt +25 -0
- data/docs/code_samples/expense_receipts_v5_async.txt +24 -0
- data/docs/code_samples/financial_document_v1.txt +25 -0
- data/docs/code_samples/financial_document_v1_async.txt +24 -0
- data/docs/code_samples/idcard_fr_v1.txt +24 -0
- data/docs/code_samples/idcard_fr_v2.txt +24 -0
- data/docs/code_samples/international_id_v2_async.txt +24 -0
- data/docs/code_samples/invoice_splitter_v1_async.txt +24 -0
- data/docs/code_samples/invoices_v4.txt +25 -0
- data/docs/code_samples/invoices_v4_async.txt +24 -0
- data/docs/code_samples/multi_receipts_detector_v1.txt +24 -0
- data/docs/code_samples/passport_v1.txt +24 -0
- data/docs/code_samples/resume_v1_async.txt +24 -0
- data/docs/code_samples/v2_classification.txt +30 -0
- data/docs/code_samples/v2_crop.txt +30 -0
- data/docs/code_samples/v2_extraction.txt +42 -0
- data/docs/code_samples/v2_extraction_webhook.txt +45 -0
- data/docs/code_samples/v2_ocr.txt +30 -0
- data/docs/code_samples/v2_split.txt +30 -0
- data/docs/code_samples/workflow_execution.txt +28 -0
- data/docs/code_samples/workflow_polling.txt +35 -0
- data/examples/auto_invoice_splitter_extraction.rb +48 -0
- data/examples/auto_multi_receipts_detector_extraction.rb +30 -0
- data/lib/mindee/dependency.rb +29 -0
- data/lib/mindee/error/mindee_error.rb +17 -0
- data/lib/mindee/error/mindee_http_error.rb +36 -0
- data/lib/mindee/error/mindee_http_error_v2.rb +45 -0
- data/lib/mindee/error/mindee_http_unknown_error_v2.rb +18 -0
- data/lib/mindee/error/mindee_input_error.rb +30 -0
- data/lib/mindee/error.rb +6 -0
- data/lib/mindee/geometry/min_max.rb +23 -0
- data/lib/mindee/geometry/point.rb +41 -0
- data/lib/mindee/geometry/polygon.rb +37 -0
- data/lib/mindee/geometry/quadrilateral.rb +50 -0
- data/lib/mindee/geometry/utils.rb +88 -0
- data/lib/mindee/geometry.rb +7 -0
- data/lib/mindee/http/.rubocop.yml +7 -0
- data/lib/mindee/http/http_error_handler.rb +106 -0
- data/lib/mindee/http/response_validation.rb +81 -0
- data/lib/mindee/http.rb +3 -0
- data/lib/mindee/image/extracted_image.rb +89 -0
- data/lib/mindee/image/image_compressor.rb +29 -0
- data/lib/mindee/image/image_extractor.rb +118 -0
- data/lib/mindee/image/image_utils.rb +165 -0
- data/lib/mindee/image.rb +6 -0
- data/lib/mindee/input/base_parameters.rb +149 -0
- data/lib/mindee/input/local_response.rb +80 -0
- data/lib/mindee/input/polling_options.rb +26 -0
- data/lib/mindee/input/sources/base64_input_source.rb +31 -0
- data/lib/mindee/input/sources/bytes_input_source.rb +21 -0
- data/lib/mindee/input/sources/file_input_source.rb +20 -0
- data/lib/mindee/input/sources/local_input_source.rb +216 -0
- data/lib/mindee/input/sources/path_input_source.rb +20 -0
- data/lib/mindee/input/sources/url_input_source.rb +130 -0
- data/lib/mindee/input/sources.rb +8 -0
- data/lib/mindee/input.rb +4 -0
- data/lib/mindee/logging/logger.rb +24 -0
- data/lib/mindee/logging.rb +3 -0
- data/lib/mindee/page_options.rb +24 -0
- data/lib/mindee/pdf/extracted_pdf.rb +70 -0
- data/lib/mindee/pdf/pdf_compressor.rb +121 -0
- data/lib/mindee/pdf/pdf_extractor.rb +121 -0
- data/lib/mindee/pdf/pdf_processor.rb +91 -0
- data/lib/mindee/pdf/pdf_tools.rb +201 -0
- data/lib/mindee/pdf.rb +7 -0
- data/lib/mindee/v1/client.rb +490 -0
- data/lib/mindee/v1/extraction/multi_receipts_extractor.rb +32 -0
- data/lib/mindee/v1/extraction.rb +3 -0
- data/lib/mindee/v1/http/.rubocop.yml +7 -0
- data/lib/mindee/v1/http/endpoint.rb +221 -0
- data/lib/mindee/v1/http/workflow_endpoint.rb +93 -0
- data/lib/mindee/v1/http.rb +4 -0
- data/lib/mindee/v1/parsing/common/api_request.rb +38 -0
- data/lib/mindee/v1/parsing/common/api_response.rb +63 -0
- data/lib/mindee/v1/parsing/common/document.rb +86 -0
- data/lib/mindee/v1/parsing/common/execution.rb +78 -0
- data/lib/mindee/v1/parsing/common/execution_file.rb +26 -0
- data/lib/mindee/v1/parsing/common/execution_priority.rb +38 -0
- data/lib/mindee/v1/parsing/common/extras/cropper_extra.rb +32 -0
- data/lib/mindee/v1/parsing/common/extras/extras.rb +62 -0
- data/lib/mindee/v1/parsing/common/extras/full_text_ocr_extra.rb +35 -0
- data/lib/mindee/v1/parsing/common/extras/rag_extra.rb +28 -0
- data/lib/mindee/v1/parsing/common/extras.rb +6 -0
- data/lib/mindee/v1/parsing/common/inference.rb +69 -0
- data/lib/mindee/v1/parsing/common/job.rb +48 -0
- data/lib/mindee/v1/parsing/common/ocr/mvision_v1.rb +52 -0
- data/lib/mindee/v1/parsing/common/ocr/ocr.rb +180 -0
- data/lib/mindee/v1/parsing/common/ocr.rb +3 -0
- data/lib/mindee/v1/parsing/common/orientation.rb +28 -0
- data/lib/mindee/v1/parsing/common/page.rb +49 -0
- data/lib/mindee/v1/parsing/common/prediction.rb +19 -0
- data/lib/mindee/v1/parsing/common/product.rb +26 -0
- data/lib/mindee/v1/parsing/common/workflow_response.rb +30 -0
- data/lib/mindee/v1/parsing/common.rb +15 -0
- data/lib/mindee/v1/parsing/standard/abstract_field.rb +74 -0
- data/lib/mindee/v1/parsing/standard/address_field.rb +51 -0
- data/lib/mindee/v1/parsing/standard/amount_field.rb +28 -0
- data/lib/mindee/v1/parsing/standard/base_field.rb +30 -0
- data/lib/mindee/v1/parsing/standard/boolean_field.rb +29 -0
- data/lib/mindee/v1/parsing/standard/classification_field.rb +18 -0
- data/lib/mindee/v1/parsing/standard/company_registration_field.rb +45 -0
- data/lib/mindee/v1/parsing/standard/date_field.rb +40 -0
- data/lib/mindee/v1/parsing/standard/feature_field.rb +26 -0
- data/lib/mindee/v1/parsing/standard/locale_field.rb +52 -0
- data/lib/mindee/v1/parsing/standard/payment_details_field.rb +44 -0
- data/lib/mindee/v1/parsing/standard/position_field.rb +61 -0
- data/lib/mindee/v1/parsing/standard/string_field.rb +26 -0
- data/lib/mindee/v1/parsing/standard/tax_field.rb +110 -0
- data/lib/mindee/v1/parsing/standard.rb +15 -0
- data/lib/mindee/v1/parsing/universal/universal_list_field.rb +60 -0
- data/lib/mindee/v1/parsing/universal/universal_object_field.rb +123 -0
- data/lib/mindee/v1/parsing/universal.rb +4 -0
- data/lib/mindee/v1/parsing.rb +5 -0
- data/lib/mindee/v1/product/.rubocop.yml +12 -0
- data/lib/mindee/v1/product/barcode_reader/barcode_reader_v1.rb +47 -0
- data/lib/mindee/v1/product/barcode_reader/barcode_reader_v1_document.rb +47 -0
- data/lib/mindee/v1/product/barcode_reader/barcode_reader_v1_page.rb +38 -0
- data/lib/mindee/v1/product/cropper/cropper_v1.rb +47 -0
- data/lib/mindee/v1/product/cropper/cropper_v1_document.rb +15 -0
- data/lib/mindee/v1/product/cropper/cropper_v1_page.rb +55 -0
- data/lib/mindee/v1/product/financial_document/financial_document_v1.rb +47 -0
- data/lib/mindee/v1/product/financial_document/financial_document_v1_document.rb +329 -0
- data/lib/mindee/v1/product/financial_document/financial_document_v1_line_item.rb +124 -0
- data/lib/mindee/v1/product/financial_document/financial_document_v1_line_items.rb +64 -0
- data/lib/mindee/v1/product/financial_document/financial_document_v1_page.rb +38 -0
- data/lib/mindee/v1/product/fr/bank_account_details/bank_account_details_v1.rb +49 -0
- data/lib/mindee/v1/product/fr/bank_account_details/bank_account_details_v1_document.rb +49 -0
- data/lib/mindee/v1/product/fr/bank_account_details/bank_account_details_v1_page.rb +40 -0
- data/lib/mindee/v1/product/fr/bank_account_details/bank_account_details_v2.rb +49 -0
- data/lib/mindee/v1/product/fr/bank_account_details/bank_account_details_v2_bban.rb +63 -0
- data/lib/mindee/v1/product/fr/bank_account_details/bank_account_details_v2_document.rb +60 -0
- data/lib/mindee/v1/product/fr/bank_account_details/bank_account_details_v2_page.rb +40 -0
- data/lib/mindee/v1/product/fr/bank_statement/bank_statement_v2.rb +49 -0
- data/lib/mindee/v1/product/fr/bank_statement/bank_statement_v2_document.rb +169 -0
- data/lib/mindee/v1/product/fr/bank_statement/bank_statement_v2_page.rb +40 -0
- data/lib/mindee/v1/product/fr/bank_statement/bank_statement_v2_transaction.rb +78 -0
- data/lib/mindee/v1/product/fr/bank_statement/bank_statement_v2_transactions.rb +56 -0
- data/lib/mindee/v1/product/fr/id_card/id_card_v1.rb +49 -0
- data/lib/mindee/v1/product/fr/id_card/id_card_v1_document.rb +106 -0
- data/lib/mindee/v1/product/fr/id_card/id_card_v1_page.rb +57 -0
- data/lib/mindee/v1/product/fr/id_card/id_card_v2.rb +49 -0
- data/lib/mindee/v1/product/fr/id_card/id_card_v2_document.rb +143 -0
- data/lib/mindee/v1/product/fr/id_card/id_card_v2_page.rb +65 -0
- data/lib/mindee/v1/product/international_id/international_id_v2.rb +47 -0
- data/lib/mindee/v1/product/international_id/international_id_v2_document.rb +164 -0
- data/lib/mindee/v1/product/international_id/international_id_v2_page.rb +38 -0
- data/lib/mindee/v1/product/invoice/invoice_v4.rb +47 -0
- data/lib/mindee/v1/product/invoice/invoice_v4_document.rb +300 -0
- data/lib/mindee/v1/product/invoice/invoice_v4_line_item.rb +124 -0
- data/lib/mindee/v1/product/invoice/invoice_v4_line_items.rb +64 -0
- data/lib/mindee/v1/product/invoice/invoice_v4_page.rb +38 -0
- data/lib/mindee/v1/product/invoice_splitter/invoice_splitter_v1.rb +47 -0
- data/lib/mindee/v1/product/invoice_splitter/invoice_splitter_v1_document.rb +66 -0
- data/lib/mindee/v1/product/invoice_splitter/invoice_splitter_v1_invoice_page_group.rb +58 -0
- data/lib/mindee/v1/product/invoice_splitter/invoice_splitter_v1_invoice_page_groups.rb +50 -0
- data/lib/mindee/v1/product/invoice_splitter/invoice_splitter_v1_page.rb +38 -0
- data/lib/mindee/v1/product/multi_receipts_detector/multi_receipts_detector_v1.rb +47 -0
- data/lib/mindee/v1/product/multi_receipts_detector/multi_receipts_detector_v1_document.rb +38 -0
- data/lib/mindee/v1/product/multi_receipts_detector/multi_receipts_detector_v1_page.rb +38 -0
- data/lib/mindee/v1/product/passport/passport_v1.rb +47 -0
- data/lib/mindee/v1/product/passport/passport_v1_document.rb +112 -0
- data/lib/mindee/v1/product/passport/passport_v1_page.rb +38 -0
- data/lib/mindee/v1/product/receipt/receipt_v5.rb +47 -0
- data/lib/mindee/v1/product/receipt/receipt_v5_document.rb +187 -0
- data/lib/mindee/v1/product/receipt/receipt_v5_line_item.rb +88 -0
- data/lib/mindee/v1/product/receipt/receipt_v5_line_items.rb +56 -0
- data/lib/mindee/v1/product/receipt/receipt_v5_page.rb +38 -0
- data/lib/mindee/v1/product/resume/resume_v1.rb +47 -0
- data/lib/mindee/v1/product/resume/resume_v1_certificate.rb +82 -0
- data/lib/mindee/v1/product/resume/resume_v1_certificates.rb +60 -0
- data/lib/mindee/v1/product/resume/resume_v1_document.rb +340 -0
- data/lib/mindee/v1/product/resume/resume_v1_education.rb +106 -0
- data/lib/mindee/v1/product/resume/resume_v1_educations.rb +66 -0
- data/lib/mindee/v1/product/resume/resume_v1_language.rb +66 -0
- data/lib/mindee/v1/product/resume/resume_v1_languages.rb +56 -0
- data/lib/mindee/v1/product/resume/resume_v1_page.rb +38 -0
- data/lib/mindee/v1/product/resume/resume_v1_professional_experience.rb +122 -0
- data/lib/mindee/v1/product/resume/resume_v1_professional_experiences.rb +70 -0
- data/lib/mindee/v1/product/resume/resume_v1_social_networks_url.rb +66 -0
- data/lib/mindee/v1/product/resume/resume_v1_social_networks_urls.rb +56 -0
- data/lib/mindee/v1/product/universal/universal.rb +48 -0
- data/lib/mindee/v1/product/universal/universal_document.rb +35 -0
- data/lib/mindee/v1/product/universal/universal_page.rb +54 -0
- data/lib/mindee/v1/product/universal/universal_prediction.rb +128 -0
- data/lib/mindee/v1/product.rb +18 -0
- data/lib/mindee/v1.rb +7 -0
- data/lib/mindee/v2/client.rb +132 -0
- data/lib/mindee/v2/file_operation/crop.rb +51 -0
- data/lib/mindee/v2/file_operation/crop_files.rb +25 -0
- data/lib/mindee/v2/file_operation/split.rb +37 -0
- data/lib/mindee/v2/file_operation/split_files.rb +25 -0
- data/lib/mindee/v2/file_operation.rb +6 -0
- data/lib/mindee/v2/http/.rubocop.yml +7 -0
- data/lib/mindee/v2/http/api_v2_settings.rb +65 -0
- data/lib/mindee/v2/http/mindee_api_v2.rb +230 -0
- data/lib/mindee/v2/http.rb +4 -0
- data/lib/mindee/v2/parsing/base_inference.rb +44 -0
- data/lib/mindee/v2/parsing/base_response.rb +15 -0
- data/lib/mindee/v2/parsing/common_response.rb +20 -0
- data/lib/mindee/v2/parsing/error_item.rb +21 -0
- data/lib/mindee/v2/parsing/error_response.rb +51 -0
- data/lib/mindee/v2/parsing/field/base_field.rb +63 -0
- data/lib/mindee/v2/parsing/field/field_confidence.rb +128 -0
- data/lib/mindee/v2/parsing/field/field_location.rb +33 -0
- data/lib/mindee/v2/parsing/field/inference_fields.rb +105 -0
- data/lib/mindee/v2/parsing/field/list_field.rb +79 -0
- data/lib/mindee/v2/parsing/field/object_field.rb +138 -0
- data/lib/mindee/v2/parsing/field/simple_field.rb +60 -0
- data/lib/mindee/v2/parsing/field.rb +9 -0
- data/lib/mindee/v2/parsing/inference_active_options.rb +67 -0
- data/lib/mindee/v2/parsing/inference_file.rb +38 -0
- data/lib/mindee/v2/parsing/inference_job.rb +25 -0
- data/lib/mindee/v2/parsing/inference_model.rb +30 -0
- data/lib/mindee/v2/parsing/job.rb +93 -0
- data/lib/mindee/v2/parsing/job_response.rb +30 -0
- data/lib/mindee/v2/parsing/job_webhook.rb +59 -0
- data/lib/mindee/v2/parsing/rag_metadata.rb +17 -0
- data/lib/mindee/v2/parsing/raw_text.rb +27 -0
- data/lib/mindee/v2/parsing/raw_text_page.rb +24 -0
- data/lib/mindee/v2/parsing/search/pagination_metadata.rb +44 -0
- data/lib/mindee/v2/parsing/search/search_model.rb +38 -0
- data/lib/mindee/v2/parsing/search/search_models.rb +34 -0
- data/lib/mindee/v2/parsing/search/search_response.rb +38 -0
- data/lib/mindee/v2/parsing/search.rb +6 -0
- data/lib/mindee/v2/parsing.rb +16 -0
- data/lib/mindee/v2/product/base_product.rb +28 -0
- data/lib/mindee/v2/product/classification/classification.rb +20 -0
- data/lib/mindee/v2/product/classification/classification_classifier.rb +25 -0
- data/lib/mindee/v2/product/classification/classification_inference.rb +35 -0
- data/lib/mindee/v2/product/classification/classification_response.rb +32 -0
- data/lib/mindee/v2/product/classification/classification_result.rb +27 -0
- data/lib/mindee/v2/product/classification/params/classification_parameters.rb +47 -0
- data/lib/mindee/v2/product/crop/crop.rb +20 -0
- data/lib/mindee/v2/product/crop/crop_inference.rb +34 -0
- data/lib/mindee/v2/product/crop/crop_item.rb +39 -0
- data/lib/mindee/v2/product/crop/crop_response.rb +40 -0
- data/lib/mindee/v2/product/crop/crop_result.rb +34 -0
- data/lib/mindee/v2/product/crop/params/crop_parameters.rb +47 -0
- data/lib/mindee/v2/product/extraction/extraction.rb +21 -0
- data/lib/mindee/v2/product/extraction/extraction_inference.rb +40 -0
- data/lib/mindee/v2/product/extraction/extraction_response.rb +32 -0
- data/lib/mindee/v2/product/extraction/extraction_result.rb +44 -0
- data/lib/mindee/v2/product/extraction/params/data_schema.rb +51 -0
- data/lib/mindee/v2/product/extraction/params/data_schema_field.rb +69 -0
- data/lib/mindee/v2/product/extraction/params/data_schema_replace.rb +39 -0
- data/lib/mindee/v2/product/extraction/params/extraction_parameters.rb +125 -0
- data/lib/mindee/v2/product/ocr/ocr.rb +20 -0
- data/lib/mindee/v2/product/ocr/ocr_inference.rb +34 -0
- data/lib/mindee/v2/product/ocr/ocr_page.rb +33 -0
- data/lib/mindee/v2/product/ocr/ocr_response.rb +32 -0
- data/lib/mindee/v2/product/ocr/ocr_result.rb +34 -0
- data/lib/mindee/v2/product/ocr/ocr_word.rb +29 -0
- data/lib/mindee/v2/product/ocr/params/ocr_parameters.rb +47 -0
- data/lib/mindee/v2/product/split/params/split_parameters.rb +48 -0
- data/lib/mindee/v2/product/split/split.rb +19 -0
- data/lib/mindee/v2/product/split/split_inference.rb +34 -0
- data/lib/mindee/v2/product/split/split_range.rb +38 -0
- data/lib/mindee/v2/product/split/split_response.rb +40 -0
- data/lib/mindee/v2/product/split/split_result.rb +34 -0
- data/lib/mindee/v2/product.rb +7 -0
- data/lib/mindee/v2.rb +7 -0
- data/lib/mindee/version.rb +26 -0
- data/lib/mindee.rb +135 -0
- data/mindee-lite.gemspec +36 -0
- data/mindee.gemspec +44 -0
- data/sig/custom/marcel.rbs +3 -0
- data/sig/custom/mini_magick.rbs +31 -0
- data/sig/custom/net_http.rbs +43 -0
- data/sig/custom/origami.rbs +59 -0
- data/sig/mindee/dependency.rbs +13 -0
- data/sig/mindee/error/mindee_error.rbs +13 -0
- data/sig/mindee/error/mindee_http_error.rbs +17 -0
- data/sig/mindee/error/mindee_http_error_v2.rbs +15 -0
- data/sig/mindee/error/mindee_http_unknown_error_v2.rbs +9 -0
- data/sig/mindee/error/mindee_input_error.rbs +18 -0
- data/sig/mindee/geometry/min_max.rbs +11 -0
- data/sig/mindee/geometry/point.rbs +14 -0
- data/sig/mindee/geometry/polygon.rbs +12 -0
- data/sig/mindee/geometry/quadrilateral.rbs +15 -0
- data/sig/mindee/geometry/utils.rbs +13 -0
- data/sig/mindee/http/http_error_handler.rbs +15 -0
- data/sig/mindee/http/response_validation.rbs +11 -0
- data/sig/mindee/image/extracted_image.rbs +21 -0
- data/sig/mindee/image/image_compressor.rbs +8 -0
- data/sig/mindee/image/image_extractor.rbs +13 -0
- data/sig/mindee/image/image_utils.rbs +19 -0
- data/sig/mindee/input/base_parameters.rbs +35 -0
- data/sig/mindee/input/local_response.rbs +14 -0
- data/sig/mindee/input/polling_options.rbs +12 -0
- data/sig/mindee/input/sources/base64_input_source.rbs +11 -0
- data/sig/mindee/input/sources/bytes_input_source.rbs +10 -0
- data/sig/mindee/input/sources/file_input_source.rbs +10 -0
- data/sig/mindee/input/sources/local_input_source.rbs +30 -0
- data/sig/mindee/input/sources/path_input_source.rbs +10 -0
- data/sig/mindee/input/sources/url_input_source.rbs +20 -0
- data/sig/mindee/logging/logger.rbs +11 -0
- data/sig/mindee/page_options.rbs +11 -0
- data/sig/mindee/pdf/extracted_pdf.rbs +17 -0
- data/sig/mindee/pdf/pdf_compressor.rbs +15 -0
- data/sig/mindee/pdf/pdf_extractor.rbs +19 -0
- data/sig/mindee/pdf/pdf_processor.rbs +12 -0
- data/sig/mindee/pdf/pdf_tools.rbs +31 -0
- data/sig/mindee/v1/client.rbs +84 -0
- data/sig/mindee/v1/extraction/multi_receipts_extractor.rbs +8 -0
- data/sig/mindee/v1/http/endpoint.rbs +41 -0
- data/sig/mindee/v1/http/workflow_endpoint.rbs +22 -0
- data/sig/mindee/v1/parsing/common/api_request.rbs +22 -0
- data/sig/mindee/v1/parsing/common/api_response.rbs +31 -0
- data/sig/mindee/v1/parsing/common/document.rbs +32 -0
- data/sig/mindee/v1/parsing/common/execution.rbs +26 -0
- data/sig/mindee/v1/parsing/common/execution_file.rbs +16 -0
- data/sig/mindee/v1/parsing/common/execution_priority.rbs +16 -0
- data/sig/mindee/v1/parsing/common/extras/cropper_extra.rbs +18 -0
- data/sig/mindee/v1/parsing/common/extras/extras.rbs +24 -0
- data/sig/mindee/v1/parsing/common/extras/full_text_ocr_extra.rbs +22 -0
- data/sig/mindee/v1/parsing/common/extras/rag_extra.rbs +19 -0
- data/sig/mindee/v1/parsing/common/inference.rbs +31 -0
- data/sig/mindee/v1/parsing/common/job.rbs +24 -0
- data/sig/mindee/v1/parsing/common/ocr/mvision_v1.rbs +20 -0
- data/sig/mindee/v1/parsing/common/ocr/ocr.rbs +56 -0
- data/sig/mindee/v1/parsing/common/orientation.rbs +15 -0
- data/sig/mindee/v1/parsing/common/page.rbs +19 -0
- data/sig/mindee/v1/parsing/common/prediction.rbs +14 -0
- data/sig/mindee/v1/parsing/common/product.rbs +16 -0
- data/sig/mindee/v1/parsing/common/workflow_response.rbs +22 -0
- data/sig/mindee/v1/parsing/standard/abstract_field.rbs +30 -0
- data/sig/mindee/v1/parsing/standard/address_field.rbs +28 -0
- data/sig/mindee/v1/parsing/standard/amount_field.rbs +16 -0
- data/sig/mindee/v1/parsing/standard/base_field.rbs +16 -0
- data/sig/mindee/v1/parsing/standard/boolean_field.rbs +16 -0
- data/sig/mindee/v1/parsing/standard/classification_field.rbs +12 -0
- data/sig/mindee/v1/parsing/standard/company_registration_field.rbs +20 -0
- data/sig/mindee/v1/parsing/standard/date_field.rbs +20 -0
- data/sig/mindee/v1/parsing/standard/feature_field.rbs +12 -0
- data/sig/mindee/v1/parsing/standard/locale_field.rbs +24 -0
- data/sig/mindee/v1/parsing/standard/payment_details_field.rbs +19 -0
- data/sig/mindee/v1/parsing/standard/position_field.rbs +26 -0
- data/sig/mindee/v1/parsing/standard/string_field.rbs +16 -0
- data/sig/mindee/v1/parsing/standard/tax_field.rbs +33 -0
- data/sig/mindee/v1/parsing/universal/universal_list_field.rbs +21 -0
- data/sig/mindee/v1/parsing/universal/universal_object_field.rbs +38 -0
- data/sig/mindee/v1/product/barcode_reader/barcode_reader_v1.rbs +13 -0
- data/sig/mindee/v1/product/barcode_reader/barcode_reader_v1_document.rbs +16 -0
- data/sig/mindee/v1/product/barcode_reader/barcode_reader_v1_page.rbs +17 -0
- data/sig/mindee/v1/product/cropper/cropper_v1.rbs +13 -0
- data/sig/mindee/v1/product/cropper/cropper_v1_document.rbs +14 -0
- data/sig/mindee/v1/product/cropper/cropper_v1_page.rbs +19 -0
- data/sig/mindee/v1/product/financial_document/financial_document_v1.rbs +13 -0
- data/sig/mindee/v1/product/financial_document/financial_document_v1_document.rbs +49 -0
- data/sig/mindee/v1/product/financial_document/financial_document_v1_line_item.rbs +35 -0
- data/sig/mindee/v1/product/financial_document/financial_document_v1_line_items.rbs +15 -0
- data/sig/mindee/v1/product/financial_document/financial_document_v1_page.rbs +17 -0
- data/sig/mindee/v1/product/fr/bank_account_details/bank_account_details_v1.rbs +15 -0
- data/sig/mindee/v1/product/fr/bank_account_details/bank_account_details_v1_document.rbs +19 -0
- data/sig/mindee/v1/product/fr/bank_account_details/bank_account_details_v1_page.rbs +19 -0
- data/sig/mindee/v1/product/fr/bank_account_details/bank_account_details_v2.rbs +15 -0
- data/sig/mindee/v1/product/fr/bank_account_details/bank_account_details_v2_bban.rbs +25 -0
- data/sig/mindee/v1/product/fr/bank_account_details/bank_account_details_v2_document.rbs +20 -0
- data/sig/mindee/v1/product/fr/bank_account_details/bank_account_details_v2_page.rbs +19 -0
- data/sig/mindee/v1/product/fr/bank_statement/bank_statement_v2.rbs +15 -0
- data/sig/mindee/v1/product/fr/bank_statement/bank_statement_v2_document.rbs +31 -0
- data/sig/mindee/v1/product/fr/bank_statement/bank_statement_v2_page.rbs +19 -0
- data/sig/mindee/v1/product/fr/bank_statement/bank_statement_v2_transaction.rbs +27 -0
- data/sig/mindee/v1/product/fr/bank_statement/bank_statement_v2_transactions.rbs +17 -0
- data/sig/mindee/v1/product/fr/id_card/id_card_v1.rbs +15 -0
- data/sig/mindee/v1/product/fr/id_card/id_card_v1_document.rbs +26 -0
- data/sig/mindee/v1/product/fr/id_card/id_card_v1_page.rbs +20 -0
- data/sig/mindee/v1/product/fr/id_card/id_card_v2.rbs +15 -0
- data/sig/mindee/v1/product/fr/id_card/id_card_v2_document.rbs +31 -0
- data/sig/mindee/v1/product/fr/id_card/id_card_v2_page.rbs +21 -0
- data/sig/mindee/v1/product/international_id/international_id_v2.rbs +13 -0
- data/sig/mindee/v1/product/international_id/international_id_v2_document.rbs +31 -0
- data/sig/mindee/v1/product/international_id/international_id_v2_page.rbs +17 -0
- data/sig/mindee/v1/product/invoice/invoice_v4.rbs +13 -0
- data/sig/mindee/v1/product/invoice/invoice_v4_document.rbs +45 -0
- data/sig/mindee/v1/product/invoice/invoice_v4_line_item.rbs +35 -0
- data/sig/mindee/v1/product/invoice/invoice_v4_line_items.rbs +15 -0
- data/sig/mindee/v1/product/invoice/invoice_v4_page.rbs +17 -0
- data/sig/mindee/v1/product/invoice_splitter/invoice_splitter_v1.rbs +13 -0
- data/sig/mindee/v1/product/invoice_splitter/invoice_splitter_v1_document.rbs +17 -0
- data/sig/mindee/v1/product/invoice_splitter/invoice_splitter_v1_invoice_page_group.rbs +21 -0
- data/sig/mindee/v1/product/invoice_splitter/invoice_splitter_v1_invoice_page_groups.rbs +15 -0
- data/sig/mindee/v1/product/invoice_splitter/invoice_splitter_v1_page.rbs +17 -0
- data/sig/mindee/v1/product/multi_receipts_detector/multi_receipts_detector_v1.rbs +14 -0
- data/sig/mindee/v1/product/multi_receipts_detector/multi_receipts_detector_v1_document.rbs +15 -0
- data/sig/mindee/v1/product/multi_receipts_detector/multi_receipts_detector_v1_page.rbs +17 -0
- data/sig/mindee/v1/product/passport/passport_v1.rbs +13 -0
- data/sig/mindee/v1/product/passport/passport_v1_document.rbs +25 -0
- data/sig/mindee/v1/product/passport/passport_v1_page.rbs +17 -0
- data/sig/mindee/v1/product/receipt/receipt_v5.rbs +13 -0
- data/sig/mindee/v1/product/receipt/receipt_v5_document.rbs +33 -0
- data/sig/mindee/v1/product/receipt/receipt_v5_line_item.rbs +27 -0
- data/sig/mindee/v1/product/receipt/receipt_v5_line_items.rbs +15 -0
- data/sig/mindee/v1/product/receipt/receipt_v5_page.rbs +17 -0
- data/sig/mindee/v1/product/resume/resume_v1.rbs +13 -0
- data/sig/mindee/v1/product/resume/resume_v1_certificate.rbs +27 -0
- data/sig/mindee/v1/product/resume/resume_v1_certificates.rbs +17 -0
- data/sig/mindee/v1/product/resume/resume_v1_document.rbs +69 -0
- data/sig/mindee/v1/product/resume/resume_v1_education.rbs +33 -0
- data/sig/mindee/v1/product/resume/resume_v1_educations.rbs +17 -0
- data/sig/mindee/v1/product/resume/resume_v1_language.rbs +23 -0
- data/sig/mindee/v1/product/resume/resume_v1_languages.rbs +17 -0
- data/sig/mindee/v1/product/resume/resume_v1_page.rbs +19 -0
- data/sig/mindee/v1/product/resume/resume_v1_professional_experience.rbs +37 -0
- data/sig/mindee/v1/product/resume/resume_v1_professional_experiences.rbs +17 -0
- data/sig/mindee/v1/product/resume/resume_v1_social_networks_url.rbs +23 -0
- data/sig/mindee/v1/product/resume/resume_v1_social_networks_urls.rbs +17 -0
- data/sig/mindee/v1/product/universal/universal.rbs +16 -0
- data/sig/mindee/v1/product/universal/universal_document.rbs +12 -0
- data/sig/mindee/v1/product/universal/universal_page.rbs +18 -0
- data/sig/mindee/v1/product/universal/universal_prediction.rbs +30 -0
- data/sig/mindee/v2/client.rbs +29 -0
- data/sig/mindee/v2/file_operation/crop.rbs +10 -0
- data/sig/mindee/v2/file_operation/crop_files.rbs +9 -0
- data/sig/mindee/v2/file_operation/split.rbs +11 -0
- data/sig/mindee/v2/file_operation/split_files.rbs +9 -0
- data/sig/mindee/v2/http/api_v2_settings.rbs +27 -0
- data/sig/mindee/v2/http/mindee_api_v2.rbs +52 -0
- data/sig/mindee/v2/parsing/base_inference.rbs +18 -0
- data/sig/mindee/v2/parsing/base_response.rbs +11 -0
- data/sig/mindee/v2/parsing/common_response.rbs +12 -0
- data/sig/mindee/v2/parsing/error_item.rbs +13 -0
- data/sig/mindee/v2/parsing/error_response.rbs +20 -0
- data/sig/mindee/v2/parsing/field/base_field.rbs +17 -0
- data/sig/mindee/v2/parsing/field/field_confidence.rbs +30 -0
- data/sig/mindee/v2/parsing/field/field_location.rbs +16 -0
- data/sig/mindee/v2/parsing/field/inference_fields.rbs +20 -0
- data/sig/mindee/v2/parsing/field/list_field.rbs +23 -0
- data/sig/mindee/v2/parsing/field/object_field.rbs +27 -0
- data/sig/mindee/v2/parsing/field/simple_field.rbs +16 -0
- data/sig/mindee/v2/parsing/inference_active_options.rbs +26 -0
- data/sig/mindee/v2/parsing/inference_file.rbs +17 -0
- data/sig/mindee/v2/parsing/inference_job.rbs +13 -0
- data/sig/mindee/v2/parsing/inference_model.rbs +12 -0
- data/sig/mindee/v2/parsing/job.rbs +24 -0
- data/sig/mindee/v2/parsing/job_response.rbs +14 -0
- data/sig/mindee/v2/parsing/job_webhook.rbs +19 -0
- data/sig/mindee/v2/parsing/rag_metadata.rbs +13 -0
- data/sig/mindee/v2/parsing/raw_text.rbs +12 -0
- data/sig/mindee/v2/parsing/raw_text_page.rbs +11 -0
- data/sig/mindee/v2/parsing/search/pagination_metadata.rbs +20 -0
- data/sig/mindee/v2/parsing/search/search_model.rbs +19 -0
- data/sig/mindee/v2/parsing/search/search_response.rbs +17 -0
- data/sig/mindee/v2/parsing/search_models.rbs +14 -0
- data/sig/mindee/v2/product/base_product.rbs +19 -0
- data/sig/mindee/v2/product/classification/classification.rbs +10 -0
- data/sig/mindee/v2/product/classification/classification_classifier.rbs +15 -0
- data/sig/mindee/v2/product/classification/classification_inference.rbs +15 -0
- data/sig/mindee/v2/product/classification/classification_response.rbs +23 -0
- data/sig/mindee/v2/product/classification/classification_result.rbs +15 -0
- data/sig/mindee/v2/product/classification/params/classification_parameters/classification_parameters.rbs +23 -0
- data/sig/mindee/v2/product/crop/crop.rbs +10 -0
- data/sig/mindee/v2/product/crop/crop_inference.rbs +14 -0
- data/sig/mindee/v2/product/crop/crop_item.rbs +18 -0
- data/sig/mindee/v2/product/crop/crop_response.rbs +25 -0
- data/sig/mindee/v2/product/crop/crop_result.rbs +14 -0
- data/sig/mindee/v2/product/crop/params/crop_parameters/crop_parameters.rbs +23 -0
- data/sig/mindee/v2/product/extraction/extraction.rbs +15 -0
- data/sig/mindee/v2/product/extraction/extraction_inference.rbs +19 -0
- data/sig/mindee/v2/product/extraction/extraction_response.rbs +24 -0
- data/sig/mindee/v2/product/extraction/extraction_result.rbs +18 -0
- data/sig/mindee/v2/product/extraction/params/data_schema.rbs +21 -0
- data/sig/mindee/v2/product/extraction/params/data_schema_field.rbs +29 -0
- data/sig/mindee/v2/product/extraction/params/data_schema_replace.rbs +21 -0
- data/sig/mindee/v2/product/extraction/params/extraction_parameters.rbs +38 -0
- data/sig/mindee/v2/product/ocr/ocr.rbs +10 -0
- data/sig/mindee/v2/product/ocr/ocr_inference.rbs +14 -0
- data/sig/mindee/v2/product/ocr/ocr_page.rbs +15 -0
- data/sig/mindee/v2/product/ocr/ocr_response.rbs +23 -0
- data/sig/mindee/v2/product/ocr/ocr_result.rbs +14 -0
- data/sig/mindee/v2/product/ocr/ocr_word.rbs +15 -0
- data/sig/mindee/v2/product/ocr/params/ocr_parameters/ocr_parameters.rbs +24 -0
- data/sig/mindee/v2/product/split/params/split_parameters/split_parameters.rbs +23 -0
- data/sig/mindee/v2/product/split/split.rbs +10 -0
- data/sig/mindee/v2/product/split/split_inference.rbs +14 -0
- data/sig/mindee/v2/product/split/split_range.rbs +18 -0
- data/sig/mindee/v2/product/split/split_response.rbs +25 -0
- data/sig/mindee/v2/product/split/split_result.rbs +14 -0
- data/sig/mindee/version.rbs +6 -0
- data/sig/mindee.rbs +62 -0
- metadata +600 -0
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Mindee
|
|
4
|
+
# Pdf Extraction Module.
|
|
5
|
+
module PDF
|
|
6
|
+
# Pdf extraction class.
|
|
7
|
+
class PDFExtractor
|
|
8
|
+
      # @param local_input [Mindee::Input::Source::LocalInputSource] Input to extract from; its stream is used as-is when it is a Pdf, otherwise it is first passed through Image::ImageExtractor.attach_image_as_new_file.
|
|
9
|
+
      def initialize(local_input)
|
|
10
|
+
        unless Mindee::Dependency.all_deps_available?
|
|
11
|
+
          raise NotImplementedError, Mindee::Dependency::MINDEE_DEPENDENCIES_LOAD_ERROR
|
|
12
|
+
        end
|
|
13
|
+
|
|
14
|
+
        @filename = local_input.filename
|
|
15
|
+
        if local_input.pdf?
|
|
16
|
+
          @source_pdf = local_input.io_stream
|
|
17
|
+
        else
|
|
18
|
+
          pdf_image = Image::ImageExtractor.attach_image_as_new_file(local_input.io_stream)
|
|
19
|
+
          io_buffer = StringIO.new
|
|
20
|
+
          pdf_image.save(io_buffer)
|
|
21
|
+
|
|
22
|
+
          @source_pdf = io_buffer
|
|
23
|
+
        end
|
|
24
|
+
      end
|
|
25
|
+
|
|
26
|
+
      # Retrieves the page count of the source Pdf, opening it through PDFProcessor.open_pdf on every call.
|
|
27
|
+
      # @return [Integer] Number of pages in the source Pdf.
|
|
28
|
+
      def page_count
|
|
29
|
+
        Mindee::PDF::PDFProcessor.open_pdf(@source_pdf).pages.size
|
|
30
|
+
      end
|
|
31
|
+
|
|
32
|
+
# Creates a new PDF from a selection of pages and saves it into a buffer.
# @param page_indexes [Array<Integer>] List of page numbers of the original PDF to use.
# @return [StringIO] The buffer containing the new PDF.
def cut_pages(page_indexes)
  selection = { page_indexes: page_indexes }
  Mindee::PDF::PDFProcessor.parse(@source_pdf, PageOptions.new(params: selection))
end
|
|
42
|
+
|
|
43
|
+
# Extracts sub-documents from the main PDF, one per list of page indexes.
# Each sub-document is named "<basename>_<first page>-<last page><extension>", where the page
# numbers are the first and last index of the list plus one, zero-padded to three digits.
# @param page_indexes [Array<Array<Integer>>] Lists of zero-based page indexes, one list per sub-document.
# @return [Array<Mindee::PDF::ExtractedPDF>] The extracted sub-documents.
# @raise [Error::MindeePDFError] if a list is nil or empty, or if an index is negative or past the last page.
def extract_sub_documents(page_indexes)
  extension = File.extname(@filename)
  basename = File.basename(@filename, extension)
  total_pages = nil

  page_indexes.map do |page_index_list|
    if page_index_list.nil? || page_index_list.empty?
      raise Error::MindeePDFError, "Empty indexes aren't allowed for extraction #{page_index_list}"
    end

    # Counting pages re-opens the PDF, so do it once, and only when there is something to validate.
    total_pages ||= page_count
    page_index_list.each do |page_index|
      # Indexes are zero-based: the last valid index is total_pages - 1.
      next unless page_index.negative? || page_index >= total_pages

      raise Error::MindeePDFError, "Index #{page_index} is out of range."
    end

    first_page = format('%03d', page_index_list.first + 1)
    last_page = format('%03d', page_index_list.last + 1)
    field_filename = "#{basename}_#{first_page}-#{last_page}#{extension}"
    Mindee::PDF::ExtractedPDF.new(cut_pages(page_index_list), field_filename)
  end
end
|
|
70
|
+
|
|
71
|
+
# rubocop:disable Metrics/CyclomaticComplexity
|
|
72
|
+
# rubocop:disable Metrics/PerceivedComplexity
|
|
73
|
+
|
|
74
|
+
# Extracts invoices as complete PDFs from the document.
# Accepts either plain lists of page indexes, or page groups exposing `page_indexes` and `confidence`.
# In strict mode the groups are regrouped according to whether their confidence reaches 0.5.
# @param page_indexes [Array<Array<Integer>, InvoiceSplitterV1InvoicePageGroup>]
# @param strict [bool]
# @return [Array<Mindee::PDF::ExtractedPDF>]
# @raise [Error::MindeePDFError] if no indexes are provided.
def extract_invoices(page_indexes, strict: false)
  raise Error::MindeePDFError, 'No indexes provided.' if page_indexes.empty?

  if page_indexes[0].is_a?(Array) && page_indexes[0].all?(Integer)
    page_indexes_as_array = page_indexes # @type var page_indexes : Array[Array[Integer]]
    return extract_sub_documents(page_indexes_as_array)
  end
  p_ids = page_indexes # @type var page_indexes: Product::InvoiceSplitter::InvoiceSplitterV1InvoicePageGroups
  return extract_sub_documents(p_ids.map(&:page_indexes)) unless strict

  correct_page_indexes = [] # @type var correct_page_indexes: Array[Array[Integer]]
  current_list = [] # @type var current_list: Array[Integer]
  final_position = p_ids.length - 1
  p_ids.each_with_index do |p_i, i|
    page_index = p_i # @type var page_index: Product::InvoiceSplitter::InvoiceSplitterV1InvoicePageGroup
    confidence = page_index.confidence.to_f
    page_list = page_index.page_indexes
    confident = confidence >= 0.5
    at_end = i == final_position

    if confident && i.zero?
      # First group: it only opens the pending list.
      current_list = page_list
    elsif confident && !at_end
      # Flush the pending list and start a new one.
      correct_page_indexes << current_list
      current_list = page_list
    elsif confidence < 0.5 && at_end
      # A low-confidence last group is appended to the pending list (this mutates that list in place).
      current_list.concat(page_list)
      correct_page_indexes << current_list
    else
      correct_page_indexes.push(current_list, page_list)
    end
  end
  extract_sub_documents(correct_page_indexes)
end
|
|
112
|
+
|
|
113
|
+
# rubocop:enable Metrics/CyclomaticComplexity
|
|
114
|
+
# rubocop:enable Metrics/PerceivedComplexity
|
|
115
|
+
|
|
116
|
+
private
|
|
117
|
+
|
|
118
|
+
attr_reader :source_pdf, :filename
|
|
119
|
+
end
|
|
120
|
+
end
|
|
121
|
+
end
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
Mindee::Dependency.require_all_deps!
|
|
4
|
+
require 'origami'
|
|
5
|
+
require_relative 'pdf_tools'
|
|
6
|
+
|
|
7
|
+
module Mindee
|
|
8
|
+
module PDF
|
|
9
|
+
# PDF document processing
|
|
10
|
+
module PDFProcessor
|
|
11
|
+
Origami::PDF.class_eval { include PDFTools }
|
|
12
|
+
# Applies the given page options to a PDF and returns the resulting document as a stream.
# @param io_stream [StringIO]
# @param options [PageOptions, Hash]
# @return [StringIO]
# @raise [ArgumentError] if the operation is neither :KEEP_ONLY nor :REMOVE.
def self.parse(io_stream, options)
  current_pdf = open_pdf(io_stream)
  pages_count = current_pdf.pages.size
  # Documents with fewer pages than on_min_pages are returned without any page removal.
  return current_pdf.to_io_stream if options.on_min_pages.to_i > pages_count

  all_pages = (0...pages_count).to_a
  pages_to_remove =
    case options.operation
    when :KEEP_ONLY then indexes_from_keep(options.page_indexes, all_pages)
    when :REMOVE then indexes_from_remove(options.page_indexes, all_pages)
    else
      raise ArgumentError, "operation must be one of :KEEP_ONLY or :REMOVE, sent '#{options.operation}'"
    end

  # Removing every page is skipped: the document is returned whole instead.
  current_pdf.delete_pages_at(pages_to_remove) unless pages_to_remove.to_a == all_pages.to_a
  current_pdf.to_io_stream
end
|
|
33
|
+
|
|
34
|
+
# Computes which pages must be deleted so that only the requested pages remain.
# Negative indexes count from the end of the document (-1 is the last page, -2 the one before);
# indexes that don't match any page are ignored.
# @param page_indexes [Array<Integer>] Indexes of the pages to keep.
# @param all_pages [Array<Integer>] Indexes of every page in the document.
# @return [Array<Integer>] Indexes of the pages to remove, in the order of all_pages.
def self.indexes_from_keep(page_indexes, all_pages)
  # Array#[] already resolves negative indexes and returns nil when out of range.
  pages_to_keep = page_indexes.map { |idx| all_pages[idx] }.compact
  all_pages - pages_to_keep
end
|
|
47
|
+
|
|
48
|
+
# Resolves the requested indexes into the list of existing pages to delete.
# Negative indexes count from the end of the document (-1 is the last page); indexes that
# don't match any page are ignored and duplicates are removed.
# @param page_indexes [Array<Integer>] Indexes of the pages to remove.
# @param all_pages [Array<Integer>] Indexes of every page in the document.
# @return [Array<Integer>] Indexes of the pages to remove.
def self.indexes_from_remove(page_indexes, all_pages)
  # Return the resolved pages, not the raw request: `each` alone would hand back page_indexes untouched.
  page_indexes.map { |idx| all_pages[idx] }.compact.uniq
end
|
|
60
|
+
|
|
61
|
+
# Parses a stream into an Origami PDF document, reading it from the start.
# @param io_stream [StringIO]
# @return [Origami::PDF]
# @raise [Origami::InvalidPDFError] if PDFTools.pdf_header? finds no PDF header in the stream.
def self.open_pdf(io_stream)
  header_found = PDFTools.pdf_header?(io_stream)
  raise Origami::InvalidPDFError, 'Input stream does not contain a PDF header.' unless header_found

  quiet_parser = Origami::PDF::LinearParser.new({ verbosity: Origami::Parser::VERBOSE_QUIET })
  io_stream.seek(0)
  quiet_parser.parse(io_stream)
end
|
|
73
|
+
|
|
74
|
+
# Retrieves a single page of a PDF document.
# The document is saved into a temporary buffer, which is then run through `parse`.
#
# @param [Origami::PDF] pdf_doc Origami PDF handle.
# @param [Integer] page_id Page ID; the page index used is page_id - 1.
# @return [StringIO]
def self.get_page(pdf_doc, page_id)
  buffer = StringIO.new
  pdf_doc.save(buffer)

  single_page = PageOptions.new(params: { page_indexes: [page_id - 1] })
  parse(buffer, single_page)
end
|
|
89
|
+
end
|
|
90
|
+
end
|
|
91
|
+
end
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
Mindee::Dependency.require_all_deps!
|
|
4
|
+
require 'origami'
|
|
5
|
+
|
|
6
|
+
module Mindee
|
|
7
|
+
module PDF
|
|
8
|
+
# Collection of miscellaneous PDF operations, as well as some monkey-patching for Origami.
|
|
9
|
+
module PDFTools
|
|
10
|
+
# Converts the current PDF document into a binary-encoded StringIO stream.
#
# @param [Hash] params Optional settings to override default processing flags.
#   - :delinearize [bool] (default: true) Whether to convert a linearized PDF to its full form.
#   - :recompile [bool] (default: true) Whether to recompile the PDF after processing.
#   - :decrypt [bool] (default: false) Whether to attempt to decrypt the PDF.
#   - :intent [String] When it names PDF/A (case-insensitive, e.g. 'PDF/A1', 'pdf-a'),
#     `intents_as_pdfa1` is called before output.
#   - :recompile, :rebuild_xrefs, :noindent and :obfuscate are overridden when the document is frozen.
#
# @return [StringIO] A binary-encoded stream representing the processed PDF.
def to_io_stream(params = {})
  options = { delinearize: true, recompile: true, decrypt: false, noindent: nil }.merge(params)

  if frozen? # incompatible flags with frozen doc (signed)
    options[:recompile] = nil
    options[:rebuild_xrefs] = nil
    options[:noindent] = nil
    options[:obfuscate] = false
  end
  load_all_objects unless @loaded

  # The `i` flag must sit outside the %r{} delimiters; written inside, "/i" is matched literally
  # and the intent never matches.
  intents_as_pdfa1 if options[:intent].to_s.match?(%r{pdf[/-]?A1?}i)
  delinearize! if options[:delinearize] && linearized?
  compile(options) if options[:recompile]

  io_stream = StringIO.new(output(options))
  io_stream.set_encoding Encoding::BINARY
  io_stream
end
|
|
44
|
+
|
|
45
|
+
# Checks a PDF stream's content for text operators.
# See https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf page 243-251.
# The check is a plain substring search of each operator in the stream's data.
# @param [StringIO] stream Stream object from a PDF page.
# @return [bool] `true` if a text operator is found in the stream.
def self.stream_has_text?(stream)
  data = stream.data
  return false if data.nil? || data.empty?

  %w[Tc Tw Th TL Tf Tk Tr Tm T* Tj TJ ' "].each do |operator|
    return true if data.include?(operator)
  end
  false
end
|
|
56
|
+
|
|
57
|
+
# Checks whether a stream contains a PDF header near the beginning.
# The stream is read from its start and its initial position is restored afterwards.
# @param [StringIO] io_stream Binary-encoded stream.
# @param [Integer] maximum_offset Maximum allowed offset for the end of the '%PDF-' marker.
# @return [bool] `true` when the stream appears to be a PDF; `false` when the marker is missing, too far
#   from the start, is the very last thing in the stream, or when reading fails.
def self.pdf_header?(io_stream, maximum_offset: 500)
  initial_pos = nil
  initial_pos = io_stream.pos if io_stream.respond_to?(:pos)
  io_stream.seek(0)
  # Bound the read: without a limit, a large non-PDF stream would be loaded entirely into memory
  # just to learn that the marker is absent.
  chunk = io_stream.gets('%PDF-', [maximum_offset, 0].max)
  return false if chunk.nil? || !chunk.end_with?('%PDF-')

  !io_stream.eof? && io_stream.pos <= maximum_offset
rescue TypeError, IOError, SystemCallError
  false
ensure
  io_stream.seek(initial_pos) if !initial_pos.nil? && io_stream.respond_to?(:seek)
end
|
|
72
|
+
|
|
73
|
+
# Checks whether the file has source text. Returns false if the file isn't a PDF.
# Every page's content streams are inspected with `stream_has_text?`; the first hit wins.
# @param [StringIO] pdf_data A binary-encoded stream representing the PDF file.
# @return [bool] `true` if the pdf has source text, false otherwise.
def self.source_text?(pdf_data)
  return false unless pdf_header?(pdf_data)

  pdf_data.rewind
  Origami::PDF.read(pdf_data).each_page do |page|
    page_contents = page[:Contents]
    next unless page_contents

    resolved = page_contents.solve
    # A page may carry a single content stream or an array of them.
    stream_refs = resolved.is_a?(Origami::Array) ? resolved : [resolved]
    stream_refs.each do |stream_ref|
      return true if stream_has_text?(stream_ref.solve)
    end
  end

  false
rescue Origami::InvalidPDFError
  false
end
|
|
102
|
+
|
|
103
|
+
# Creates an image XObject from the provided image.
#
# The image is first turned into a stream by Mindee::Image::ImageUtils.image_to_stringio, then handed
# to Origami::Graphics::ImageXObject.from_image_file with the 'jpg' format.
#
# @param [MiniMagick::Image] image An image object with the necessary data and structure.
# @return [Origami::Graphics::ImageXObject] The created image XObject.
def self.create_xobject(image)
  Origami::Graphics::ImageXObject.from_image_file(
    Mindee::Image::ImageUtils.image_to_stringio(image),
    'jpg'
  )
end
|
|
114
|
+
|
|
115
|
+
# Fills in the dictionary of an image XObject from the image's metadata.
# Sets :BitsPerComponent (always 8), :Filter, :Width, :Height and :ColorSpace, in that order.
#
# @param [Origami::Graphics::ImageXObject] xobject The image XObject to update.
# @param [Hash] image A hash containing image metadata (such as width, height, properties, etc.).
def self.set_xobject_properties(xobject, image)
  properties = xobject.dictionary
  properties[:BitsPerComponent] = 8
  properties[:Filter] = determine_filter(image)
  properties[:Width] = image[:width]
  properties[:Height] = image[:height]
  properties[:ColorSpace] = determine_colorspace(image)
end
|
|
126
|
+
|
|
127
|
+
# Picks the PDF stream filter matching the image's `properties.filter` metadata.
# Values containing "Zip" or "LZW" (case-insensitive) map to their decoders; anything else is DCT.
#
# @param [Hash] image The image data hash containing properties.
# @return [Symbol] One of :FlateDecode, :LZWDecode or :DCTDecode.
def self.determine_filter(image)
  case image.data['properties']['filter']
  when %r{Zip}i
    :FlateDecode
  when %r{LZW}i
    :LZWDecode
  else
    :DCTDecode
  end
end
|
|
139
|
+
|
|
140
|
+
# Maps the image's `colorspace` metadata to a PDF device colorspace.
# 'CMYK' and the gray variants are recognized; everything else falls back to RGB.
#
# @param [Hash] image The image data hash.
# @return [Symbol] One of :DeviceCMYK, :DeviceGray or :DeviceRGB.
def self.determine_colorspace(image)
  known_spaces = {
    'CMYK' => :DeviceCMYK,
    'Gray' => :DeviceGray,
    'PseudoClass Gray' => :DeviceGray,
  }
  known_spaces.fetch(image.data['colorspace'], :DeviceRGB)
end
|
|
152
|
+
|
|
153
|
+
# Replaces the page's content with a stream that draws an image XObject.
# The stream scales the coordinate system by width x height, then paints the named XObject.
#
# @param [Origami::Page] page The PDF page to which content will be added.
# @param [String] xobject_name The name identifying the XObject.
# @param [Integer] width The width for the transformation matrix.
# @param [Integer] height The height for the transformation matrix.
def self.add_content_to_page(page, xobject_name, width, height)
  page.Contents = Origami::Stream.new("q\n#{width} 0 0 #{height} 0 0 cm\n/#{xobject_name} Do\nQ\n")
end
|
|
164
|
+
|
|
165
|
+
# Sets both the media box and the crop box of a PDF page to a width x height rectangle
# anchored at the origin.
#
# @param [Origami::Page] page The PDF page whose dimensions are being set.
# @param [Numeric] width The target width of the page.
# @param [Numeric] height The target height of the page.
def self.set_page_dimensions(page, width, height)
  origin_x = origin_y = 0
  page[:MediaBox] = [origin_x, origin_y, width, height]
  page[:CropBox] = [origin_x, origin_y, width, height]
end
|
|
174
|
+
|
|
175
|
+
# Compresses an image and wraps the result into an image XObject for PDF embedding.
# The XObject is declared as 8 bits per component, DeviceRGB, with the requested width and height.
#
# @param [MiniMagick::Image, StringIO] image_data The raw image data.
# @param [Integer] image_quality The quality setting for image compression.
# @param [Numeric] width The desired width of the output image.
# @param [Numeric] height The desired height of the output image.
# @return [Origami::Graphics::ImageXObject] The resulting image XObject.
def self.process_image_xobject(image_data, image_quality, width, height)
  limits = { quality: image_quality, max_width: width, max_height: height }
  compressed_data = Image::ImageCompressor.compress_image(image_data, **limits)

  Origami::Graphics::ImageXObject.new.tap do |xobject|
    xobject.data = compressed_data
    xobject.Width = width
    xobject.Height = height
    xobject.ColorSpace = :DeviceRGB
    xobject.BitsPerComponent = 8
  end
end
|
|
199
|
+
end
|
|
200
|
+
end
|
|
201
|
+
end
|