invoice2data 0.4.7__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {invoice2data-0.4.7/src/invoice2data.egg-info → invoice2data-0.5.0}/PKG-INFO +5 -5
- {invoice2data-0.4.7 → invoice2data-0.5.0}/README.md +3 -2
- {invoice2data-0.4.7 → invoice2data-0.5.0}/pyproject.toml +3 -5
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/__init__.py +1 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/__main__.py +64 -22
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/invoice_template.py +11 -12
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/loader.py +11 -14
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/parsers/lines.py +29 -33
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/parsers/regex.py +6 -9
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/parsers/static.py +2 -4
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/plugins/lines.py +2 -3
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/plugins/tables.py +22 -24
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/utils.py +1 -3
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/input/gvision.py +2 -3
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/input/ocrmypdf.py +7 -11
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/input/pdfminer_wrapper.py +3 -5
- invoice2data-0.5.0/src/invoice2data/input/pdfplumber.py +46 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/input/pdftotext.py +3 -4
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/input/tesseract.py +3 -6
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/output/to_csv.py +2 -4
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/output/to_json.py +3 -5
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/output/to_xml.py +5 -7
- {invoice2data-0.4.7 → invoice2data-0.5.0/src/invoice2data.egg-info}/PKG-INFO +5 -5
- {invoice2data-0.4.7 → invoice2data-0.5.0}/tests/test_cli.py +4 -5
- {invoice2data-0.4.7 → invoice2data-0.5.0}/tests/test_extraction.py +1 -2
- {invoice2data-0.4.7 → invoice2data-0.5.0}/tests/test_invoice_template.py +10 -12
- {invoice2data-0.4.7 → invoice2data-0.5.0}/tests/test_lib.py +27 -5
- {invoice2data-0.4.7 → invoice2data-0.5.0}/tests/test_loader.py +1 -1
- invoice2data-0.4.7/src/invoice2data/input/pdfplumber.py +0 -66
- {invoice2data-0.4.7 → invoice2data-0.5.0}/LICENSE.md +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/setup.cfg +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/__init__.py +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/parsers/__init__.py +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/parsers/__interface__.py +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/plugins/__init__.py +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/plugins/__interface__.py +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/au/au.com.opal.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/au/au.com.telstra.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/be/be.accor.invest.ibis.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/be/be.accor.invest.novotel.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/be/be.boucherie.pochet.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/be/be.cebeo.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/be/be.eg_retail.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/be/be.lampiris.facture-dacompte.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/be/be.lampiris.factuur.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/be/be.lampiris.regularisation.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/be/be.melchior-vins.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/be/be.proximus.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/be/be.scarlet.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/be/be.securex.social.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/ch/ch.pcengines.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.AzureInterior.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.amazon.aws.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.apple.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.apps4rent.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.binarylife.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.bloomberg.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.cloudflare.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.cloudns.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.datadoghq.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.digitalocean.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.envato.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.eur.aliexpress.json +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.expressvpn.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.expressvpn_prio6.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.flipkart.WSRetail.json +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.ftserussell.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.github.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.globalsign.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.google.adwords.hk.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.hetzner.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.hobohost.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.jamiepro.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.linode.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.microsoftonline.hk-v2017.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.microsoftonline.hk.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.mongodb.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.namecheap.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.namesilo.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.newrelic.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.nl.lenovo.digitalriver.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.nmmn.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.nodisto.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.nyse.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.oyo.invoice.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.packtpub.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.pixartprinting.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.runbox.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.sammymaystone.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.scaleway.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.textmaster.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.tmx.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.travis-ci.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.twitter.de.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.twitter.uk.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.twitter.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.upwork.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.usersnap.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/com/com.vultr.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/de/de.amazon.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/de/de.bettina-kast.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/de/de.digikey.com.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/de/de.hosteurope.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/de/de.notebooksbilligerBillPay.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/de/de.ovh.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/de/de.qualityhosting.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/de/de.united-domains.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/es/com.mob-barcelona.caterina.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/es/com.pepephone.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/es/es.amazon.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/es/es.digimobile.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/es/es.supplies24.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/co.mooncard.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.adobe.ie.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.akretion.fr.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.amazon.aws.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.ateliercopieservice.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.chauffeur-prive.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.coriolis.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.easyjet.fr.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.eaudugrandlyon.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.godaddy.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.google.ie.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.hootsuite.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.jeanbesson.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.ldlc.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.linkedin.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.mention.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.microsoft.ie.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.myflyingbox.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.officetimeline.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.orange-business.mobile.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.ovh.fr.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.rs-online.fr.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.saur.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.soyoustart.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/com.vinci-autoroutes.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/dolibarr.generique.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/eu.trainline.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.actn.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.airfrance.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.also.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.amazon.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.assurance-epargne-pension.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.bouyguestelecom.adsl-fiber.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.bouyguestelecom.mobile.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.butagaz.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.chronopost.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.dirafi.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.domaine-achat.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.easytrip.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.edf.entreprises.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.edf.pme.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.finagaz.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.fountain.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.free.adsl-fiber.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.free.mobile.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.free.mobile2.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.futur.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.ge-iroise.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.google.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.greffe-tc-lyon.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.hiscox.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.internetsatellite.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.jpg.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.kubii.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.laposte.boutique.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.laposte.coliposte.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.lecab.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.leroymerlin.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.maaf.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.mediapart.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.moneo-resto.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.mouser.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.mycelium-roulement.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.napsis.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.nexity.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.orange.fibre.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.orange.fixedline.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.prestaclic.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.publicationannoncelegale.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.sfr.adsl-fiber.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.sfr.mobile.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.sosh.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.teledec.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/fr.topoffice.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/net.online.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/fr/net.scaleway.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.accor.rhine.opco hotels.json +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.action.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.agrisneltank.json +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.albron.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.anwb.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.argos.json +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.be.coolblue.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.begra.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.blokker.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.bouwmans.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.bp.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.buijtendijk.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.bunq.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.cpe.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.esso_eg_services.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.esso_eg_services_v2.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.farnell.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.fedex.json +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.ferbox.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.fletcher.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.gamma.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.goos.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.gulf.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.ipparking.paleiskwartier.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.karwei.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.kav.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.koffiehenk.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.kuwait-q8.json +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.makro.json +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.marktplaats.json +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.megekko.json +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.momentsenmore.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.ns.invoice.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.odido.json +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.ok.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.parkmobile.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.praxis.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.reclameland.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.saeco.philips.eluscious.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.shell_nederland.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.shell_schellenkens.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.simpel.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.tango.json +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.total_express.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.total_ototol.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.total_servauto_ned.json +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.transip.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.tuynder.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.valk.exclusief.hotel.json +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.valk.exclusief.restaurant.json +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.vistaprint.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.vodafone.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.wasco.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.weid.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.yezzer.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/nl/nl.zinkunie.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/pl/pl.bmw-fs.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/pl/pl.insert.subiekt-gt.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/pl/pl.insert.subiekt-nexo.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/pl/pl.ksef.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/pl/pl.orlen.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/pl/pl.p4.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/extract/templates/pl/pl.paypro.yml +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/input/__init__.py +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/input/text.py +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/output/__init__.py +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data/py.typed +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data.egg-info/SOURCES.txt +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data.egg-info/dependency_links.txt +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data.egg-info/entry_points.txt +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data.egg-info/requires.txt +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/src/invoice2data.egg-info/top_level.txt +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/tests/test_gvision.py +0 -0
- {invoice2data-0.4.7 → invoice2data-0.5.0}/tests/test_main.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: invoice2data
|
|
3
|
-
Version: 0.4.7
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: Python parser to extract data from pdf invoice
|
|
5
5
|
Author: Manuel Riel
|
|
6
6
|
License: MIT
|
|
@@ -12,7 +12,6 @@ Keywords: python,data-mining,accounting,invoice,pdf,parcing
|
|
|
12
12
|
Classifier: Programming Language :: Python :: 3
|
|
13
13
|
Classifier: License :: OSI Approved :: MIT License
|
|
14
14
|
Classifier: Operating System :: OS Independent
|
|
15
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
16
15
|
Classifier: Programming Language :: Python :: 3.10
|
|
17
16
|
Classifier: Programming Language :: Python :: 3.11
|
|
18
17
|
Classifier: Programming Language :: Python :: 3.12
|
|
@@ -26,7 +25,7 @@ Classifier: Topic :: Office/Business :: Financial :: Accounting
|
|
|
26
25
|
Classifier: Topic :: Office/Business :: Financial
|
|
27
26
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
28
27
|
Classifier: Development Status :: 5 - Production/Stable
|
|
29
|
-
Requires-Python: >=3.9
|
|
28
|
+
Requires-Python: >=3.10
|
|
30
29
|
Description-Content-Type: text/markdown
|
|
31
30
|
License-File: LICENSE.md
|
|
32
31
|
Requires-Dist: click>=8.0.1
|
|
@@ -178,7 +177,7 @@ Using in-house templates
|
|
|
178
177
|
See `invoice2data/extract/templates` for existing templates. Just extend
|
|
179
178
|
the list to add your own. If deployed by a bigger organisation, there
|
|
180
179
|
should be an interface to edit templates for new suppliers. 80-20 rule.
|
|
181
|
-
For a short tutorial on how to add new templates, see [tutorial
|
|
180
|
+
For a short tutorial on how to add new templates, see the [template creation tutorial][tutorial].
|
|
182
181
|
|
|
183
182
|
Templates are based on Yaml or JSON. They define one or more keywords to find
|
|
184
183
|
the right template, one or more exclude_keywords to further narrow it down
|
|
@@ -237,7 +236,7 @@ It can be installed on most distributions by:
|
|
|
237
236
|
## Development
|
|
238
237
|
|
|
239
238
|
If you are interested in improving this project, have a look at our
|
|
240
|
-
[
|
|
239
|
+
[contributor guide] to get you started quickly.
|
|
241
240
|
|
|
242
241
|
## Roadmap and open tasks
|
|
243
242
|
|
|
@@ -287,3 +286,4 @@ To learn more, see the [Contributor Guide].
|
|
|
287
286
|
[license]: https://invoice2data.readthedocs.io/latest/license.html
|
|
288
287
|
[contributor guide]: https://invoice2data.readthedocs.io/latest/contributing.html
|
|
289
288
|
[command-line reference]: https://invoice2data.readthedocs.io/latest/usage.html
|
|
289
|
+
[tutorial]: https://invoice2data.readthedocs.io/latest/tutorial.html
|
|
@@ -125,7 +125,7 @@ Using in-house templates
|
|
|
125
125
|
See `invoice2data/extract/templates` for existing templates. Just extend
|
|
126
126
|
the list to add your own. If deployed by a bigger organisation, there
|
|
127
127
|
should be an interface to edit templates for new suppliers. 80-20 rule.
|
|
128
|
-
For a short tutorial on how to add new templates, see [tutorial
|
|
128
|
+
For a short tutorial on how to add new templates, see the [template creation tutorial][tutorial].
|
|
129
129
|
|
|
130
130
|
Templates are based on Yaml or JSON. They define one or more keywords to find
|
|
131
131
|
the right template, one or more exclude_keywords to further narrow it down
|
|
@@ -184,7 +184,7 @@ It can be installed on most distributions by:
|
|
|
184
184
|
## Development
|
|
185
185
|
|
|
186
186
|
If you are interested in improving this project, have a look at our
|
|
187
|
-
[
|
|
187
|
+
[contributor guide] to get you started quickly.
|
|
188
188
|
|
|
189
189
|
## Roadmap and open tasks
|
|
190
190
|
|
|
@@ -234,3 +234,4 @@ To learn more, see the [Contributor Guide].
|
|
|
234
234
|
[license]: https://invoice2data.readthedocs.io/latest/license.html
|
|
235
235
|
[contributor guide]: https://invoice2data.readthedocs.io/latest/contributing.html
|
|
236
236
|
[command-line reference]: https://invoice2data.readthedocs.io/latest/usage.html
|
|
237
|
+
[tutorial]: https://invoice2data.readthedocs.io/latest/tutorial.html
|
|
@@ -6,16 +6,15 @@ build-backend = "setuptools.build_meta"
|
|
|
6
6
|
name = "invoice2data"
|
|
7
7
|
authors = [{ name = "Manuel Riel" }]
|
|
8
8
|
description = "Python parser to extract data from pdf invoice"
|
|
9
|
-
version = "0.4.7"
|
|
9
|
+
version = "0.5.0"
|
|
10
10
|
keywords = ["python", "data-mining", "accounting", "invoice", "pdf", "parcing"]
|
|
11
11
|
license = { text = "MIT" }
|
|
12
12
|
readme = "README.md"
|
|
13
|
-
requires-python = ">=3.9"
|
|
13
|
+
requires-python = ">=3.10"
|
|
14
14
|
classifiers = [
|
|
15
15
|
"Programming Language :: Python :: 3",
|
|
16
16
|
"License :: OSI Approved :: MIT License",
|
|
17
17
|
"Operating System :: OS Independent",
|
|
18
|
-
"Programming Language :: Python :: 3.9",
|
|
19
18
|
"Programming Language :: Python :: 3.10",
|
|
20
19
|
"Programming Language :: Python :: 3.11",
|
|
21
20
|
"Programming Language :: Python :: 3.12",
|
|
@@ -64,6 +63,7 @@ docs = [
|
|
|
64
63
|
"sphinx-autobuild >=2021.3.14",
|
|
65
64
|
"sphinx-click >=3.0.2",
|
|
66
65
|
"sphinx-mermaid >=0.0.7",
|
|
66
|
+
"sphinxcontrib-svg2pdfconverter >=2.1.0",
|
|
67
67
|
]
|
|
68
68
|
mypy = ["mypy >=0.930"]
|
|
69
69
|
typeguard = ["typeguard >=2.13.3"]
|
|
@@ -161,8 +161,6 @@ extend-ignore = [
|
|
|
161
161
|
"S101", # use of assert Activate later
|
|
162
162
|
"S603",
|
|
163
163
|
"UP031", # Use fstring instead of % identifier
|
|
164
|
-
"UP006", # PEP 585 (list vs List): defer typing modernization to its own PR
|
|
165
|
-
"UP035", # deprecated typing imports: defer typing modernization to its own PR
|
|
166
164
|
"E501", # fix this when updating docstrings
|
|
167
165
|
# "DOC106",
|
|
168
166
|
# "DOC107",
|
|
@@ -9,10 +9,6 @@ from copy import deepcopy
|
|
|
9
9
|
from os.path import join
|
|
10
10
|
from typing import Any
|
|
11
11
|
from typing import ClassVar
|
|
12
|
-
from typing import Dict
|
|
13
|
-
from typing import List
|
|
14
|
-
from typing import Optional
|
|
15
|
-
from typing import Tuple
|
|
16
12
|
|
|
17
13
|
import click
|
|
18
14
|
|
|
@@ -110,9 +106,9 @@ if not logger.handlers:
|
|
|
110
106
|
|
|
111
107
|
def extract_data(
|
|
112
108
|
invoicefile: str,
|
|
113
|
-
templates:
|
|
109
|
+
templates: list[InvoiceTemplate] | None = None,
|
|
114
110
|
input_module: Any = None,
|
|
115
|
-
) ->
|
|
111
|
+
) -> dict[str, Any]:
|
|
116
112
|
"""Extracts structured data from PDF/image invoices.
|
|
117
113
|
|
|
118
114
|
This function uses the text extracted from a PDF file or image and
|
|
@@ -123,14 +119,14 @@ def extract_data(
|
|
|
123
119
|
|
|
124
120
|
Args:
|
|
125
121
|
invoicefile (str): Path of electronic invoice file in PDF, JPEG, PNG
|
|
126
|
-
templates (
|
|
122
|
+
templates (list[InvoiceTemplate] | None): List of instances of class `InvoiceTemplate`.
|
|
127
123
|
Templates are loaded using `read_template` function in `loader.py`.
|
|
128
124
|
input_module (Any, optional): Library to be used to extract text
|
|
129
125
|
from the given `invoicefile`.
|
|
130
126
|
Choices: {'pdftotext', 'pdfminer', 'tesseract', 'text'}.
|
|
131
127
|
|
|
132
128
|
Returns:
|
|
133
|
-
|
|
129
|
+
dict[str, Any]: Extracted and matched fields, or an empty dict if text extraction fails or no template matches.
|
|
134
130
|
|
|
135
131
|
Notes:
|
|
136
132
|
Import the required `input_module` when using invoice2data as a library.
|
|
@@ -152,7 +148,15 @@ def extract_data(
|
|
|
152
148
|
elif input_module is None:
|
|
153
149
|
input_module = text if invoicefile.lower().endswith(".txt") else pdftotext
|
|
154
150
|
|
|
155
|
-
|
|
151
|
+
try:
|
|
152
|
+
extracted_str = input_module.to_text(invoicefile)
|
|
153
|
+
except Exception:
|
|
154
|
+
logger.exception(
|
|
155
|
+
"Failed to extract text from %s using %s",
|
|
156
|
+
invoicefile,
|
|
157
|
+
input_module.__name__,
|
|
158
|
+
)
|
|
159
|
+
return {}
|
|
156
160
|
if not isinstance(extracted_str, str) or not extracted_str.strip():
|
|
157
161
|
logger.error(
|
|
158
162
|
"Failed to extract text from %s using %s",
|
|
@@ -192,14 +196,14 @@ def extract_data(
|
|
|
192
196
|
|
|
193
197
|
def extract_data_fallback_ocrmypdf(
|
|
194
198
|
invoicefile: str,
|
|
195
|
-
templates:
|
|
199
|
+
templates: list[InvoiceTemplate],
|
|
196
200
|
input_module: Any,
|
|
197
|
-
) ->
|
|
201
|
+
) -> tuple[str, str, list[InvoiceTemplate]]:
|
|
198
202
|
logger.debug("Trying OCR extraction with ocrmypdf")
|
|
199
203
|
extracted_str = ocrmypdf.to_text(invoicefile)
|
|
200
204
|
|
|
201
205
|
# Convert the filter object to a list
|
|
202
|
-
templates_matched:
|
|
206
|
+
templates_matched: list[InvoiceTemplate] = list(
|
|
203
207
|
filter(lambda t: t.matches_input(extracted_str), templates)
|
|
204
208
|
)
|
|
205
209
|
templates_matched.sort(key=lambda k: k["priority"], reverse=True)
|
|
@@ -211,6 +215,44 @@ def extract_data_fallback_ocrmypdf(
|
|
|
211
215
|
return extracted_str, invoicefile, []
|
|
212
216
|
|
|
213
217
|
|
|
218
|
+
class Invoice2Data:
|
|
219
|
+
"""Object-oriented interface around :func:`extract_data`.
|
|
220
|
+
|
|
221
|
+
Holds a reusable set of templates so several invoices can be processed
|
|
222
|
+
without reloading templates each time.
|
|
223
|
+
|
|
224
|
+
Args:
|
|
225
|
+
load_built_in_templates (bool): Load the bundled templates on init.
|
|
226
|
+
Defaults to True.
|
|
227
|
+
"""
|
|
228
|
+
|
|
229
|
+
def __init__(self, load_built_in_templates: bool = True) -> None:
|
|
230
|
+
self.templates: list[InvoiceTemplate] = []
|
|
231
|
+
if load_built_in_templates:
|
|
232
|
+
self.templates += read_templates()
|
|
233
|
+
|
|
234
|
+
def read_templates(self, path: str) -> None:
|
|
235
|
+
"""Add templates from a user folder to this instance.
|
|
236
|
+
|
|
237
|
+
Args:
|
|
238
|
+
path (str): Folder containing .yml/.json templates to load.
|
|
239
|
+
"""
|
|
240
|
+
self.templates += read_templates(os.path.abspath(path))
|
|
241
|
+
|
|
242
|
+
def extract_data(self, path: str, input_module: Any = None) -> dict[str, Any]:
|
|
243
|
+
"""Extract data from an invoice using this instance's templates.
|
|
244
|
+
|
|
245
|
+
Args:
|
|
246
|
+
path (str): Path to the invoice file.
|
|
247
|
+
input_module (Any): Text-extraction module to use. Defaults to None
|
|
248
|
+
(auto-detect between text and pdftotext).
|
|
249
|
+
|
|
250
|
+
Returns:
|
|
251
|
+
dict[str, Any]: Extracted fields, or an empty dict if none matched.
|
|
252
|
+
"""
|
|
253
|
+
return extract_data(path, self.templates, input_module)
|
|
254
|
+
|
|
255
|
+
|
|
214
256
|
@click.command()
|
|
215
257
|
@click.option(
|
|
216
258
|
"--input-reader",
|
|
@@ -268,17 +310,17 @@ def extract_data_fallback_ocrmypdf(
|
|
|
268
310
|
)
|
|
269
311
|
@click.version_option()
|
|
270
312
|
def main(
|
|
271
|
-
input_reader:
|
|
313
|
+
input_reader: str | None,
|
|
272
314
|
output_format: str,
|
|
273
315
|
output_date_format: str,
|
|
274
316
|
output_name: str,
|
|
275
317
|
debug: bool,
|
|
276
|
-
copy:
|
|
277
|
-
move:
|
|
318
|
+
copy: str | None,
|
|
319
|
+
move: str | None,
|
|
278
320
|
filename_format: str,
|
|
279
|
-
template_folder:
|
|
321
|
+
template_folder: str | None,
|
|
280
322
|
exclude_built_in_templates: bool,
|
|
281
|
-
input_files:
|
|
323
|
+
input_files: tuple[Any, ...],
|
|
282
324
|
) -> None:
|
|
283
325
|
"""Extract data from PDF files and output it in a structured format."""
|
|
284
326
|
if debug:
|
|
@@ -315,8 +357,8 @@ def main(
|
|
|
315
357
|
|
|
316
358
|
|
|
317
359
|
def _load_templates(
|
|
318
|
-
template_folder:
|
|
319
|
-
) ->
|
|
360
|
+
template_folder: str | None, exclude_built_in_templates: bool
|
|
361
|
+
) -> list[Any]:
|
|
320
362
|
"""Load templates from the specified folder."""
|
|
321
363
|
templates = []
|
|
322
364
|
if template_folder:
|
|
@@ -328,9 +370,9 @@ def _load_templates(
|
|
|
328
370
|
|
|
329
371
|
def _process_and_move_copy(
|
|
330
372
|
filename: str,
|
|
331
|
-
res:
|
|
332
|
-
copy:
|
|
333
|
-
move:
|
|
373
|
+
res: dict[str, Any],
|
|
374
|
+
copy: str | None,
|
|
375
|
+
move: str | None,
|
|
334
376
|
filename_format: str,
|
|
335
377
|
) -> None:
|
|
336
378
|
"""Process the extracted data and copy/move the file."""
|
|
@@ -5,11 +5,10 @@ Templates are initially read from .yml files and then kept as class.
|
|
|
5
5
|
|
|
6
6
|
import re
|
|
7
7
|
import unicodedata
|
|
8
|
+
from collections import OrderedDict as OrderedDictType
|
|
8
9
|
from logging import getLogger
|
|
9
10
|
from pprint import pformat
|
|
10
11
|
from typing import Any
|
|
11
|
-
from typing import Dict
|
|
12
|
-
from typing import OrderedDict as OrderedDictType
|
|
13
12
|
|
|
14
13
|
import dateparser # type: ignore[import-untyped]
|
|
15
14
|
|
|
@@ -68,7 +67,7 @@ class InvoiceTemplate(OrderedDictType[str, Any]):
|
|
|
68
67
|
super().__init__(*args, **kwargs)
|
|
69
68
|
|
|
70
69
|
# Merge template-specific options with defaults
|
|
71
|
-
self.options:
|
|
70
|
+
self.options: dict[str, Any] = OPTIONS_DEFAULT.copy()
|
|
72
71
|
|
|
73
72
|
if "options" in self:
|
|
74
73
|
self.options.update(self["options"])
|
|
@@ -234,7 +233,7 @@ class InvoiceTemplate(OrderedDictType[str, Any]):
|
|
|
234
233
|
|
|
235
234
|
def extract(
|
|
236
235
|
self, optimized_str: str, invoice_file: str, input_module: Any
|
|
237
|
-
) ->
|
|
236
|
+
) -> dict[str, Any]:
|
|
238
237
|
"""Extracts data from the optimized string using the template.
|
|
239
238
|
|
|
240
239
|
Args:
|
|
@@ -243,7 +242,7 @@ class InvoiceTemplate(OrderedDictType[str, Any]):
|
|
|
243
242
|
input_module (Any): The input module used.
|
|
244
243
|
|
|
245
244
|
Returns:
|
|
246
|
-
|
|
245
|
+
dict[str, Any]: The extracted data.
|
|
247
246
|
|
|
248
247
|
"""
|
|
249
248
|
output = _initialize_output_and_log(self, optimized_str)
|
|
@@ -274,7 +273,7 @@ class InvoiceTemplate(OrderedDictType[str, Any]):
|
|
|
274
273
|
|
|
275
274
|
def _initialize_output_and_log(
|
|
276
275
|
self: InvoiceTemplate, optimized_str: str
|
|
277
|
-
) ->
|
|
276
|
+
) -> dict[str, Any]:
|
|
278
277
|
"""Initialize the output dictionary and log debug information."""
|
|
279
278
|
logger.debug("START optimized_str ========================\n" + optimized_str)
|
|
280
279
|
logger.debug("END optimized_str ==========================")
|
|
@@ -296,7 +295,7 @@ def _initialize_output_and_log(
|
|
|
296
295
|
|
|
297
296
|
def _handle_area(
|
|
298
297
|
self: InvoiceTemplate,
|
|
299
|
-
v:
|
|
298
|
+
v: dict[str, Any],
|
|
300
299
|
input_module: Any,
|
|
301
300
|
invoice_file: str,
|
|
302
301
|
optimized_str: str,
|
|
@@ -317,9 +316,9 @@ def _handle_area(
|
|
|
317
316
|
def _handle_parser(
|
|
318
317
|
self: InvoiceTemplate,
|
|
319
318
|
k: str,
|
|
320
|
-
v:
|
|
319
|
+
v: dict[str, Any],
|
|
321
320
|
optimized_str_for_parser: str,
|
|
322
|
-
output:
|
|
321
|
+
output: dict[str, Any],
|
|
323
322
|
) -> None:
|
|
324
323
|
"""Handle parsing using different parsers."""
|
|
325
324
|
if v["parser"] in PARSERS_MAPPING:
|
|
@@ -334,7 +333,7 @@ def _handle_parser(
|
|
|
334
333
|
|
|
335
334
|
|
|
336
335
|
def _handle_legacy_syntax(
|
|
337
|
-
self: InvoiceTemplate, k: str, v: Any, optimized_str: str, output:
|
|
336
|
+
self: InvoiceTemplate, k: str, v: Any, optimized_str: str, output: dict[str, Any]
|
|
338
337
|
) -> None:
|
|
339
338
|
"""Handle legacy syntax for backward compatibility."""
|
|
340
339
|
result = None
|
|
@@ -361,8 +360,8 @@ def _handle_legacy_syntax(
|
|
|
361
360
|
|
|
362
361
|
|
|
363
362
|
def _check_required_fields(
|
|
364
|
-
self: InvoiceTemplate, output:
|
|
365
|
-
) ->
|
|
363
|
+
self: InvoiceTemplate, output: dict[str, Any]
|
|
364
|
+
) -> dict[str, Any]:
|
|
366
365
|
"""Check if all required fields are present in the output."""
|
|
367
366
|
if "required_fields" not in self.keys():
|
|
368
367
|
required_fields = ["date", "amount", "invoice_number", "issuer"]
|
|
@@ -6,12 +6,9 @@ Templates are initially read from .yml or .json files and then kept as class.
|
|
|
6
6
|
import codecs
|
|
7
7
|
import json
|
|
8
8
|
import os
|
|
9
|
+
from collections.abc import Callable
|
|
9
10
|
from logging import getLogger
|
|
10
11
|
from typing import Any
|
|
11
|
-
from typing import Callable
|
|
12
|
-
from typing import Dict
|
|
13
|
-
from typing import List
|
|
14
|
-
from typing import Optional
|
|
15
12
|
from typing import cast
|
|
16
13
|
|
|
17
14
|
|
|
@@ -32,7 +29,7 @@ logger = getLogger(__name__)
|
|
|
32
29
|
|
|
33
30
|
def ordered_load(
|
|
34
31
|
stream: str, loader: Callable[[str], Any] = json.loads
|
|
35
|
-
) ->
|
|
32
|
+
) -> list[InvoiceTemplate]:
|
|
36
33
|
"""Loads a stream of JSON data.
|
|
37
34
|
|
|
38
35
|
Args:
|
|
@@ -40,7 +37,7 @@ def ordered_load(
|
|
|
40
37
|
loader (Callable[[str], Any], optional): JSON loader function. Defaults to json.loads.
|
|
41
38
|
|
|
42
39
|
Returns:
|
|
43
|
-
|
|
40
|
+
list[InvoiceTemplate]: List of InvoiceTemplate objects.
|
|
44
41
|
"""
|
|
45
42
|
output = []
|
|
46
43
|
|
|
@@ -54,22 +51,22 @@ def ordered_load(
|
|
|
54
51
|
for tpl in tpl_stream:
|
|
55
52
|
tpl = prepare_template(tpl)
|
|
56
53
|
if tpl:
|
|
57
|
-
output.append(InvoiceTemplate(cast(
|
|
54
|
+
output.append(InvoiceTemplate(cast(dict[str, Any], tpl)))
|
|
58
55
|
|
|
59
56
|
return output
|
|
60
57
|
|
|
61
58
|
|
|
62
|
-
def read_templates(folder:
|
|
59
|
+
def read_templates(folder: str | None = None) -> list[InvoiceTemplate]:
|
|
63
60
|
"""Load YAML templates from template folder. Return list of dicts.
|
|
64
61
|
|
|
65
62
|
Use built-in templates if no folder is set.
|
|
66
63
|
|
|
67
64
|
Args:
|
|
68
|
-
folder (
|
|
65
|
+
folder (str | None): User-defined folder where templates are stored.
|
|
69
66
|
If None, uses built-in templates.
|
|
70
67
|
|
|
71
68
|
Returns:
|
|
72
|
-
|
|
69
|
+
list[InvoiceTemplate]: List of InvoiceTemplate objects.
|
|
73
70
|
|
|
74
71
|
Examples:
|
|
75
72
|
>>> templates = read_templates("./src/invoice2data/extract/templates/au")
|
|
@@ -109,20 +106,20 @@ def read_templates(folder: Optional[str] = None) -> List[InvoiceTemplate]:
|
|
|
109
106
|
tpl = prepare_template(tpl)
|
|
110
107
|
|
|
111
108
|
if tpl:
|
|
112
|
-
output.append(InvoiceTemplate(cast(
|
|
109
|
+
output.append(InvoiceTemplate(cast(dict[str, Any], tpl)))
|
|
113
110
|
|
|
114
111
|
logger.info("Loaded %d templates from %s", len(output), folder)
|
|
115
112
|
return output
|
|
116
113
|
|
|
117
114
|
|
|
118
|
-
def prepare_template(tpl:
|
|
115
|
+
def prepare_template(tpl: dict[str, Any]) -> dict[str, Any] | None:
|
|
119
116
|
"""Prepare a template for use.
|
|
120
117
|
|
|
121
118
|
Args:
|
|
122
|
-
tpl (
|
|
119
|
+
tpl (dict[str, Any]): Template dictionary.
|
|
123
120
|
|
|
124
121
|
Returns:
|
|
125
|
-
|
|
122
|
+
dict[str, Any] | None: Processed template dictionary.
|
|
126
123
|
"""
|
|
127
124
|
# Test if all required fields are in template
|
|
128
125
|
if "keywords" not in tpl:
|
|
@@ -5,12 +5,8 @@ Initial work and maintenance by Holger Brunn @hbrunn
|
|
|
5
5
|
|
|
6
6
|
import re
|
|
7
7
|
from logging import getLogger
|
|
8
|
+
from re import Match
|
|
8
9
|
from typing import Any
|
|
9
|
-
from typing import Dict
|
|
10
|
-
from typing import List
|
|
11
|
-
from typing import Match
|
|
12
|
-
from typing import Optional
|
|
13
|
-
from typing import Union
|
|
14
10
|
|
|
15
11
|
|
|
16
12
|
# from ..invoice_template import InvoiceTemplate # type: ignore[unused-ignore]
|
|
@@ -20,7 +16,7 @@ logger = getLogger(__name__)
|
|
|
20
16
|
DEFAULT_OPTIONS = {"line_separator": r"\n"}
|
|
21
17
|
|
|
22
18
|
|
|
23
|
-
def parse_line(patterns:
|
|
19
|
+
def parse_line(patterns: str | list[str], line: str) -> Match[str] | None:
|
|
24
20
|
"""Parse a line using a given pattern or list of patterns.
|
|
25
21
|
|
|
26
22
|
This function searches for a match in the given line using the provided
|
|
@@ -28,11 +24,11 @@ def parse_line(patterns: Union[str, List[str]], line: str) -> Optional[Match[str
|
|
|
28
24
|
object; otherwise, it returns None.
|
|
29
25
|
|
|
30
26
|
Args:
|
|
31
|
-
patterns (
|
|
27
|
+
patterns (str | list[str]): The pattern(s) to search for.
|
|
32
28
|
line (str): The line to parse.
|
|
33
29
|
|
|
34
30
|
Returns:
|
|
35
|
-
|
|
31
|
+
Match[str] | None: A match object if a match is found, otherwise None.
|
|
36
32
|
"""
|
|
37
33
|
patterns = patterns if isinstance(patterns, list) else [patterns]
|
|
38
34
|
for pattern in patterns:
|
|
@@ -43,11 +39,11 @@ def parse_line(patterns: Union[str, List[str]], line: str) -> Optional[Match[str
|
|
|
43
39
|
|
|
44
40
|
|
|
45
41
|
def parse_block( # noqa: RUF100 C901
|
|
46
|
-
template:
|
|
42
|
+
template: dict[str, Any],
|
|
47
43
|
field: str,
|
|
48
|
-
settings:
|
|
44
|
+
settings: dict[str, Any],
|
|
49
45
|
content: str,
|
|
50
|
-
) ->
|
|
46
|
+
) -> list[dict[str, Any]]:
|
|
51
47
|
"""Parse a block of lines to extract data.
|
|
52
48
|
|
|
53
49
|
This function parses a block of lines from an invoice to extract data
|
|
@@ -56,13 +52,13 @@ def parse_block( # noqa: RUF100 C901
|
|
|
56
52
|
based on the configuration.
|
|
57
53
|
|
|
58
54
|
Args:
|
|
59
|
-
template (
|
|
55
|
+
template (dict[str, Any]): The template containing extraction rules.
|
|
60
56
|
field (str): The name of the field to extract.
|
|
61
|
-
settings (
|
|
57
|
+
settings (dict[str, Any]): The settings for the extraction rule.
|
|
62
58
|
content (str): The text content to parse.
|
|
63
59
|
|
|
64
60
|
Returns:
|
|
65
|
-
|
|
61
|
+
list[dict[str, Any]]: A list of dictionaries, where each dictionary
|
|
66
62
|
represents an extracted row with field-value pairs.
|
|
67
63
|
"""
|
|
68
64
|
# Validate settings
|
|
@@ -72,8 +68,8 @@ def parse_block( # noqa: RUF100 C901
|
|
|
72
68
|
|
|
73
69
|
logger.debug("START lines block content ========================\n%s", content)
|
|
74
70
|
logger.debug("END lines block content ==========================")
|
|
75
|
-
lines:
|
|
76
|
-
current_row:
|
|
71
|
+
lines: list[dict[str, Any]] = []
|
|
72
|
+
current_row: dict[str, Any] = {}
|
|
77
73
|
|
|
78
74
|
# We assume that structured line fields may either be individual lines or
|
|
79
75
|
# they may be main line items with descriptions or details following beneath.
|
|
@@ -167,21 +163,21 @@ def parse_block( # noqa: RUF100 C901
|
|
|
167
163
|
|
|
168
164
|
|
|
169
165
|
def parse_by_rule(
|
|
170
|
-
template:
|
|
166
|
+
template: dict[str, Any],
|
|
171
167
|
field: str,
|
|
172
|
-
rule:
|
|
168
|
+
rule: dict[str, Any],
|
|
173
169
|
content: str,
|
|
174
|
-
) ->
|
|
170
|
+
) -> list[dict[str, Any]]:
|
|
175
171
|
"""Parse lines from a block of text based on a rule.
|
|
176
172
|
|
|
177
173
|
Args:
|
|
178
|
-
template (
|
|
174
|
+
template (dict[str, Any]): The template dictionary.
|
|
179
175
|
field (str): The field name.
|
|
180
|
-
rule (
|
|
176
|
+
rule (dict[str, Any]): The rule dictionary.
|
|
181
177
|
content (str): The text content to parse.
|
|
182
178
|
|
|
183
179
|
Returns:
|
|
184
|
-
|
|
180
|
+
list[dict[str, Any]]: The parsed lines.
|
|
185
181
|
"""
|
|
186
182
|
# First apply default options.
|
|
187
183
|
settings = DEFAULT_OPTIONS.copy()
|
|
@@ -227,21 +223,21 @@ def parse_by_rule(
|
|
|
227
223
|
|
|
228
224
|
|
|
229
225
|
def parse(
|
|
230
|
-
template:
|
|
226
|
+
template: dict[str, Any],
|
|
231
227
|
field: str,
|
|
232
|
-
settings:
|
|
228
|
+
settings: dict[str, Any],
|
|
233
229
|
content: str,
|
|
234
|
-
) ->
|
|
230
|
+
) -> list[dict[str, Any]]:
|
|
235
231
|
"""Parse lines from the content based on the given settings.
|
|
236
232
|
|
|
237
233
|
Args:
|
|
238
|
-
template (
|
|
234
|
+
template (dict[str, Any]): The template dictionary.
|
|
239
235
|
field (str): The field name.
|
|
240
|
-
settings (
|
|
236
|
+
settings (dict[str, Any]): The settings dictionary.
|
|
241
237
|
content (str): The text content to parse.
|
|
242
238
|
|
|
243
239
|
Returns:
|
|
244
|
-
|
|
240
|
+
list[dict[str, Any]]: The parsed lines.
|
|
245
241
|
"""
|
|
246
242
|
if "rules" in settings:
|
|
247
243
|
# One field can have multiple sets of line-parsing rules
|
|
@@ -262,16 +258,16 @@ def parse(
|
|
|
262
258
|
|
|
263
259
|
|
|
264
260
|
def parse_current_row(
|
|
265
|
-
match:
|
|
266
|
-
) ->
|
|
261
|
+
match: Match[str] | None, current_row: dict[str, Any]
|
|
262
|
+
) -> dict[str, Any]:
|
|
267
263
|
"""Parse the current row data.
|
|
268
264
|
|
|
269
265
|
Args:
|
|
270
|
-
match (
|
|
271
|
-
current_row (
|
|
266
|
+
match (Match[str] | None): The match object.
|
|
267
|
+
current_row (dict[str, Any]): The current row dictionary.
|
|
272
268
|
|
|
273
269
|
Returns:
|
|
274
|
-
|
|
270
|
+
dict[str, Any]: The updated current row dictionary.
|
|
275
271
|
"""
|
|
276
272
|
if match:
|
|
277
273
|
for field, value in match.groupdict().items():
|
|
@@ -15,9 +15,6 @@ import logging
|
|
|
15
15
|
import re
|
|
16
16
|
from collections import OrderedDict
|
|
17
17
|
from typing import Any
|
|
18
|
-
from typing import Dict
|
|
19
|
-
from typing import List
|
|
20
|
-
from typing import Optional
|
|
21
18
|
|
|
22
19
|
from ..utils import _apply_grouping
|
|
23
20
|
|
|
@@ -28,7 +25,7 @@ logger = logging.getLogger(__name__)
|
|
|
28
25
|
def parse(
|
|
29
26
|
template: Any,
|
|
30
27
|
field: str,
|
|
31
|
-
settings:
|
|
28
|
+
settings: dict[str, Any],
|
|
32
29
|
content: str,
|
|
33
30
|
legacy: bool = False,
|
|
34
31
|
) -> Any:
|
|
@@ -37,7 +34,7 @@ def parse(
|
|
|
37
34
|
Args:
|
|
38
35
|
template (Any): The template object.
|
|
39
36
|
field (str): The name of the field to extract.
|
|
40
|
-
settings (
|
|
37
|
+
settings (dict[str, Any]): The settings for the field extraction.
|
|
41
38
|
content (str): The text content to parse.
|
|
42
39
|
legacy (bool, optional): Whether to use legacy parsing. Defaults to False.
|
|
43
40
|
|
|
@@ -64,7 +61,7 @@ def parse(
|
|
|
64
61
|
return result
|
|
65
62
|
|
|
66
63
|
|
|
67
|
-
def _extract_matches(settings:
|
|
64
|
+
def _extract_matches(settings: dict[str, Any], content: str) -> list[Any] | None:
|
|
68
65
|
"""Extract matches from the content using the given regexes."""
|
|
69
66
|
if isinstance(settings["regex"], list):
|
|
70
67
|
regexes = settings["regex"]
|
|
@@ -100,8 +97,8 @@ def _extract_matches(settings: Dict[str, Any], content: str) -> Optional[List[An
|
|
|
100
97
|
|
|
101
98
|
|
|
102
99
|
def _apply_type_coercion(
|
|
103
|
-
template: Any, settings:
|
|
104
|
-
) ->
|
|
100
|
+
template: Any, settings: dict[str, Any], result: list[Any]
|
|
101
|
+
) -> list[Any]:
|
|
105
102
|
"""Apply type coercion to the extracted values."""
|
|
106
103
|
if "type" in settings:
|
|
107
104
|
for k, v in enumerate(result):
|
|
@@ -109,7 +106,7 @@ def _apply_type_coercion(
|
|
|
109
106
|
return result
|
|
110
107
|
|
|
111
108
|
|
|
112
|
-
def _remove_duplicates(legacy: bool, result:
|
|
109
|
+
def _remove_duplicates(legacy: bool, result: Any | None) -> Any | None:
|
|
113
110
|
"""Remove duplicate values from the result."""
|
|
114
111
|
if isinstance(result, list):
|
|
115
112
|
if legacy:
|