invoice2data 0.4.2__tar.gz → 0.4.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {invoice2data-0.4.2/src/invoice2data.egg-info → invoice2data-0.4.3}/PKG-INFO +14 -5
- {invoice2data-0.4.2 → invoice2data-0.4.3}/README.md +13 -4
- {invoice2data-0.4.2 → invoice2data-0.4.3}/setup.cfg +1 -1
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/invoice_template.py +37 -29
- invoice2data-0.4.3/src/invoice2data/extract/loader.py +109 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/parsers/__interface__.py +0 -2
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/parsers/lines.py +41 -11
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/parsers/regex.py +20 -3
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/parsers/static.py +3 -3
- invoice2data-0.4.3/src/invoice2data/extract/plugins/tables.py +99 -0
- invoice2data-0.4.3/src/invoice2data/extract/templates/be/be.accor.invest.ibis.yml +73 -0
- invoice2data-0.4.3/src/invoice2data/extract/templates/com/com.AzureInterior.yml +209 -0
- invoice2data-0.4.3/src/invoice2data/extract/templates/com/com.expressvpn_prio6.yml +49 -0
- invoice2data-0.4.3/src/invoice2data/extract/templates/com/com.namecheap.yml +37 -0
- invoice2data-0.4.3/src/invoice2data/extract/templates/com/com.nl.lenovo.digitalriver.yml +60 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.free.adsl-fiber.yml +6 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.action.yml +3 -1
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.albron.yml +3 -1
- invoice2data-0.4.3/src/invoice2data/extract/templates/nl/nl.be.coolblue.yml +117 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.gamma.yml +3 -1
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.karwei.yml +3 -1
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.koffiehenk.yml +9 -3
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.praxis.yml +14 -5
- invoice2data-0.4.3/src/invoice2data/extract/templates/nl/nl.saeco.philips.eluscious.yml +92 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.transip.yml +3 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.vodafone.yml +19 -16
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.yezzer.yml +3 -0
- invoice2data-0.4.3/src/invoice2data/extract/templates/pl/pl.insert.subiekt-gt.yml +43 -0
- invoice2data-0.4.3/src/invoice2data/extract/templates/pl/pl.insert.subiekt-nexo.yml +33 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/input/gvision.py +1 -1
- invoice2data-0.4.3/src/invoice2data/input/ocrmypdf.py +146 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/input/pdfminer_wrapper.py +1 -1
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/input/pdfplumber.py +3 -3
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/input/pdftotext.py +3 -3
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/input/tesseract.py +33 -9
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/input/text.py +1 -1
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/main.py +93 -21
- {invoice2data-0.4.2 → invoice2data-0.4.3/src/invoice2data.egg-info}/PKG-INFO +14 -5
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data.egg-info/SOURCES.txt +12 -2
- {invoice2data-0.4.2 → invoice2data-0.4.3}/tests/test_cli.py +97 -23
- invoice2data-0.4.3/tests/test_invoice_template.py +146 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/tests/test_lib.py +26 -1
- invoice2data-0.4.3/tests/test_loader.py +117 -0
- invoice2data-0.4.2/src/invoice2data/extract/loader.py +0 -114
- invoice2data-0.4.2/src/invoice2data/extract/plugins/tables.py +0 -62
- invoice2data-0.4.2/src/invoice2data/extract/templates/com/com.flipkart.WSRetail.yml +0 -16
- invoice2data-0.4.2/src/invoice2data/extract/templates/com/com.namecheap.yml +0 -15
- {invoice2data-0.4.2 → invoice2data-0.4.3}/LICENSE.txt +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/MANIFEST.in +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/setup.py +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/__init__.py +0 -0
- {invoice2data-0.4.2/src/invoice2data/output → invoice2data-0.4.3/src/invoice2data/extract}/__init__.py +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/parsers/__init__.py +0 -0
- {invoice2data-0.4.2/src/invoice2data/input → invoice2data-0.4.3/src/invoice2data/extract/plugins}/__init__.py +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/plugins/__interface__.py +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/plugins/lines.py +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/au/au.com.opal.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/au/au.com.telstra.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/be/be.boucherie.pochet.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/be/be.cebeo.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/be/be.lampiris.facture-dacompte.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/be/be.lampiris.factuur.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/be/be.lampiris.regularisation.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/be/be.melchior-vins.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/be/be.proximus.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/be/be.scarlet.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/be/be.securex.social.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/ch/ch.pcengines.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.amazon.aws.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.apple.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.apps4rent.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.binarylife.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.bloomberg.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.cloudns.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.datadoghq.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.digitalocean.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.envato.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.expressvpn.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.ftserussell.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.github.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.globalsign.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.google.adwords.hk.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.hobohost.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.jamiepro.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.linode.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.microsoftonline.hk-v2017.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.microsoftonline.hk.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.mongodb.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.namesilo.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.newrelic.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.nmmn.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.nodisto.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.nyse.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.oyo.invoice.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.packtpub.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.pixartprinting.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.sammymaystone.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.scaleway.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.textmaster.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.tmx.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.travis-ci.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.twitter.de.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.twitter.uk.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.twitter.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.upwork.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.usersnap.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/de/de.amazon.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/de/de.bettina-kast.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/de/de.digikey.com.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/de/de.hosteurope.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/de/de.notebooksbilligerBillPay.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/de/de.ovh.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/de/de.qualityhosting.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/de/de.united-domains.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/es/com.pepephone.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/es/es.supplies24.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/co.mooncard.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.adobe.ie.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.akretion.fr.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.amazon.aws.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.ateliercopieservice.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.chauffeur-prive.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.coriolis.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.easyjet.fr.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.eaudugrandlyon.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.godaddy.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.google.ie.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.hootsuite.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.jeanbesson.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.ldlc.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.linkedin.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.mention.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.microsoft.ie.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.myflyingbox.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.officetimeline.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.orange-business.mobile.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.ovh.fr.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.rs-online.fr.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.saur.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.soyoustart.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.vinci-autoroutes.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/dolibarr.generique.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/eu.trainline.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.actn.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.airfrance.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.also.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.amazon.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.assurance-epargne-pension.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.bouyguestelecom.adsl-fiber.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.bouyguestelecom.mobile.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.butagaz.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.chronopost.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.dirafi.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.domaine-achat.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.easytrip.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.edf.entreprises.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.edf.pme.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.finagaz.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.fountain.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.free.mobile.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.free.mobile2.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.futur.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.ge-iroise.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.greffe-tc-lyon.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.hiscox.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.internetsatellite.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.jpg.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.kubii.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.laposte.boutique.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.laposte.coliposte.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.lecab.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.leroymerlin.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.maaf.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.mediapart.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.moneo-resto.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.mouser.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.mycelium-roulement.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.napsis.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.nexity.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.orange.fibre.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.orange.fixedline.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.prestaclic.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.publicationannoncelegale.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.sfr.adsl-fiber.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.sfr.mobile.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.sosh.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.teledec.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.topoffice.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/net.online.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/net.scaleway.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.begra.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.blokker.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.bunq.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.cpe.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.farnell.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.ferbox.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.goos.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.kav.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.reclameland.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.simpel.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.tuynder.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.vistaprint.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.wasco.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.weid.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.zinkunie.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/pl/pl.bmw-fs.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/pl/pl.orlen.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/pl/pl.p4.yml +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/extract/templates/pl/pl.paypro.yml +0 -0
- {invoice2data-0.4.2/src/invoice2data/extract/plugins → invoice2data-0.4.3/src/invoice2data/input}/__init__.py +0 -0
- {invoice2data-0.4.2/src/invoice2data/extract → invoice2data-0.4.3/src/invoice2data/output}/__init__.py +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/output/to_csv.py +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/output/to_json.py +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data/output/to_xml.py +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data.egg-info/dependency_links.txt +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data.egg-info/entry_points.txt +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data.egg-info/requires.txt +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/src/invoice2data.egg-info/top_level.txt +0 -0
- {invoice2data-0.4.2 → invoice2data-0.4.3}/tests/test_extraction.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: invoice2data
|
|
3
|
-
Version: 0.4.2
|
|
3
|
+
Version: 0.4.3
|
|
4
4
|
Summary: Python parser to extract data from pdf invoice
|
|
5
5
|
Home-page: https://github.com/invoice-x/invoice2data
|
|
6
6
|
Author: Manuel Riel
|
|
@@ -36,9 +36,9 @@ A command line tool and Python library to support your accounting
|
|
|
36
36
|
process.
|
|
37
37
|
|
|
38
38
|
1. extracts text from PDF files using different techniques, like
|
|
39
|
-
`pdftotext`, `text`, `pdfminer`, `pdfplumber` or OCR --
|
|
39
|
+
`pdftotext`, `text`, `ocrmypdf`, `pdfminer`, `pdfplumber` or OCR -- `tesseract`, or
|
|
40
40
|
`gvision` (Google Cloud Vision).
|
|
41
|
-
2. searches for regex in the result using a YAML-based template system
|
|
41
|
+
2. searches for regex in the result using a YAML or JSON-based template system
|
|
42
42
|
3. saves results as CSV, JSON or XML or renames PDF files to match the content.
|
|
43
43
|
|
|
44
44
|
With the flexible template system you can:
|
|
@@ -108,6 +108,7 @@ Choose any of the following input readers:
|
|
|
108
108
|
- tesseract `invoice2data --input-reader tesseract invoice.pdf`
|
|
109
109
|
- pdfminer.six `invoice2data --input-reader pdfminer invoice.pdf`
|
|
110
110
|
- pdfplumber `invoice2data --input-reader pdfplumber invoice.pdf`
|
|
111
|
+
- ocrmypdf `invoice2data --input-reader ocrmypdf invoice.pdf`
|
|
111
112
|
- gvision `invoice2data --input-reader gvision invoice.pdf` (needs `GOOGLE_APPLICATION_CREDENTIALS` env var)
|
|
112
113
|
|
|
113
114
|
Choose any of the following output formats:
|
|
@@ -123,7 +124,7 @@ Save output file with custom name or a specific folder
|
|
|
123
124
|
**Note:** You must specify the `output-format` in order to create
|
|
124
125
|
`output-name`
|
|
125
126
|
|
|
126
|
-
Specify folder with yml templates. (e.g.
|
|
127
|
+
Specify folder with yml templates. (e.g. your suppliers)
|
|
127
128
|
|
|
128
129
|
`invoice2data --template-folder ACME-templates invoice.pdf`
|
|
129
130
|
|
|
@@ -166,7 +167,7 @@ the list to add your own. If deployed by a bigger organisation, there
|
|
|
166
167
|
should be an interface to edit templates for new suppliers. 80-20 rule.
|
|
167
168
|
For a short tutorial on how to add new templates, see [TUTORIAL.md](TUTORIAL.md).
|
|
168
169
|
|
|
169
|
-
Templates are based on Yaml. They define one or more keywords to find
|
|
170
|
+
Templates are based on Yaml or JSON. They define one or more keywords to find
|
|
170
171
|
the right template, one or more exclude_keywords to further narrow it down
|
|
171
172
|
and regexp for fields to be extracted. They could also be a static value,
|
|
172
173
|
like the full company name.
|
|
@@ -210,6 +211,14 @@ The lines package has multiple settings:
|
|
|
210
211
|
- skip_line > Optional. If first_line is passed, this pattern indicates which sub-lines will be skipped and their data not recorded. This is useful if tables span multiple pages and you need to skip over page numbers or headers that appear mid-table.
|
|
211
212
|
- last_line > Optional. If first_line is passed, this pattern denotes the final line of the sub-lines and is included in the output data.
|
|
212
213
|
|
|
214
|
+
:warning: Invoice2data uses a yaml templating system. The yaml templates are loaded with [pyyaml](https://github.com/yaml/pyyaml) which is a pure python implementation. (thus rather slow)
|
|
215
|
+
As an alternative, JSON templates can be used, which are natively better supported by Python.
|
|
216
|
+
|
|
217
|
+
The performance with yaml templates can be greatly increased **10x** by using [libyaml](https://github.com/yaml/libyaml)
|
|
218
|
+
On Debian/Ubuntu-based distributions it can be installed with:
|
|
219
|
+
`sudo apt-get install libyaml-dev`
|
|
220
|
+
|
|
221
|
+
|
|
213
222
|
## Development
|
|
214
223
|
|
|
215
224
|
If you are interested in improving this project, have a look at our
|
|
@@ -8,9 +8,9 @@ A command line tool and Python library to support your accounting
|
|
|
8
8
|
process.
|
|
9
9
|
|
|
10
10
|
1. extracts text from PDF files using different techniques, like
|
|
11
|
-
`pdftotext`, `text`, `pdfminer`, `pdfplumber` or OCR --
|
|
11
|
+
`pdftotext`, `text`, `ocrmypdf`, `pdfminer`, `pdfplumber` or OCR -- `tesseract`, or
|
|
12
12
|
`gvision` (Google Cloud Vision).
|
|
13
|
-
2. searches for regex in the result using a YAML-based template system
|
|
13
|
+
2. searches for regex in the result using a YAML or JSON-based template system
|
|
14
14
|
3. saves results as CSV, JSON or XML or renames PDF files to match the content.
|
|
15
15
|
|
|
16
16
|
With the flexible template system you can:
|
|
@@ -80,6 +80,7 @@ Choose any of the following input readers:
|
|
|
80
80
|
- tesseract `invoice2data --input-reader tesseract invoice.pdf`
|
|
81
81
|
- pdfminer.six `invoice2data --input-reader pdfminer invoice.pdf`
|
|
82
82
|
- pdfplumber `invoice2data --input-reader pdfplumber invoice.pdf`
|
|
83
|
+
- ocrmypdf `invoice2data --input-reader ocrmypdf invoice.pdf`
|
|
83
84
|
- gvision `invoice2data --input-reader gvision invoice.pdf` (needs `GOOGLE_APPLICATION_CREDENTIALS` env var)
|
|
84
85
|
|
|
85
86
|
Choose any of the following output formats:
|
|
@@ -95,7 +96,7 @@ Save output file with custom name or a specific folder
|
|
|
95
96
|
**Note:** You must specify the `output-format` in order to create
|
|
96
97
|
`output-name`
|
|
97
98
|
|
|
98
|
-
Specify folder with yml templates. (e.g.
|
|
99
|
+
Specify folder with yml templates. (e.g. your suppliers)
|
|
99
100
|
|
|
100
101
|
`invoice2data --template-folder ACME-templates invoice.pdf`
|
|
101
102
|
|
|
@@ -138,7 +139,7 @@ the list to add your own. If deployed by a bigger organisation, there
|
|
|
138
139
|
should be an interface to edit templates for new suppliers. 80-20 rule.
|
|
139
140
|
For a short tutorial on how to add new templates, see [TUTORIAL.md](TUTORIAL.md).
|
|
140
141
|
|
|
141
|
-
Templates are based on Yaml. They define one or more keywords to find
|
|
142
|
+
Templates are based on Yaml or JSON. They define one or more keywords to find
|
|
142
143
|
the right template, one or more exclude_keywords to further narrow it down
|
|
143
144
|
and regexp for fields to be extracted. They could also be a static value,
|
|
144
145
|
like the full company name.
|
|
@@ -182,6 +183,14 @@ The lines package has multiple settings:
|
|
|
182
183
|
- skip_line > Optional. If first_line is passed, this pattern indicates which sub-lines will be skipped and their data not recorded. This is useful if tables span multiple pages and you need to skip over page numbers or headers that appear mid-table.
|
|
183
184
|
- last_line > Optional. If first_line is passed, this pattern denotes the final line of the sub-lines and is included in the output data.
|
|
184
185
|
|
|
186
|
+
:warning: Invoice2data uses a yaml templating system. The yaml templates are loaded with [pyyaml](https://github.com/yaml/pyyaml) which is a pure python implementation. (thus rather slow)
|
|
187
|
+
As an alternative, JSON templates can be used, which are natively better supported by Python.
|
|
188
|
+
|
|
189
|
+
The performance with yaml templates can be greatly increased **10x** by using [libyaml](https://github.com/yaml/libyaml)
|
|
190
|
+
On Debian/Ubuntu-based distributions it can be installed with:
|
|
191
|
+
`sudo apt-get install libyaml-dev`
|
|
192
|
+
|
|
193
|
+
|
|
185
194
|
## Development
|
|
186
195
|
|
|
187
196
|
If you are interested in improving this project, have a look at our
|
|
@@ -7,15 +7,16 @@ Templates are initially read from .yml files and then kept as class.
|
|
|
7
7
|
import re
|
|
8
8
|
import dateparser
|
|
9
9
|
import unicodedata
|
|
10
|
-
import
|
|
10
|
+
from logging import getLogger
|
|
11
|
+
from pprint import pformat
|
|
11
12
|
from collections import OrderedDict
|
|
12
13
|
from . import parsers
|
|
13
14
|
from .plugins import lines, tables
|
|
14
|
-
# Area extraction is currently added for pdftotext and tesseract (which uses pdftotext)
|
|
15
|
-
from ..input import pdftotext, tesseract
|
|
15
|
+
# Area extraction is currently added for pdftotext, ocrmypdf and tesseract (which uses pdftotext)
|
|
16
|
+
from ..input import pdftotext, ocrmypdf, tesseract
|
|
16
17
|
from typing import Optional
|
|
17
18
|
|
|
18
|
-
logger =
|
|
19
|
+
logger = getLogger(__name__)
|
|
19
20
|
|
|
20
21
|
OPTIONS_DEFAULT = {
|
|
21
22
|
"remove_whitespace": False,
|
|
@@ -41,7 +42,7 @@ class InvoiceTemplate(OrderedDict):
|
|
|
41
42
|
-------
|
|
42
43
|
prepare_input(extracted_str)
|
|
43
44
|
Input raw string and do transformations, as set in template file.
|
|
44
|
-
matches_input(
|
|
45
|
+
matches_input(extracted_str)
|
|
45
46
|
See if string matches keywords set in template file
|
|
46
47
|
parse_number(value)
|
|
47
48
|
Parse number, remove decimal separator and add other options
|
|
@@ -59,12 +60,15 @@ class InvoiceTemplate(OrderedDict):
|
|
|
59
60
|
# Merge template-specific options with defaults
|
|
60
61
|
self.options = OPTIONS_DEFAULT.copy()
|
|
61
62
|
|
|
62
|
-
for lang in self.options["languages"]:
|
|
63
|
-
assert len(lang) == 2, "lang code must have 2 letters"
|
|
64
|
-
|
|
65
63
|
if "options" in self:
|
|
66
64
|
self.options.update(self["options"])
|
|
67
65
|
|
|
66
|
+
for lang in self.options["languages"]:
|
|
67
|
+
assert len(lang) == 2, (
|
|
68
|
+
"Error in Template %s lang code must have 2 letters"
|
|
69
|
+
% self["template_name"]
|
|
70
|
+
)
|
|
71
|
+
|
|
68
72
|
# Set issuer, if it doesn't exist.
|
|
69
73
|
if "issuer" not in self.keys():
|
|
70
74
|
self["issuer"] = self["keywords"][0]
|
|
@@ -82,7 +86,7 @@ class InvoiceTemplate(OrderedDict):
|
|
|
82
86
|
|
|
83
87
|
# Remove accents
|
|
84
88
|
if self.options["remove_accents"]:
|
|
85
|
-
optimized_str =
|
|
89
|
+
optimized_str = re.sub('[\u0300-\u0362]', '', unicodedata.normalize('NFKD', optimized_str))
|
|
86
90
|
|
|
87
91
|
# Convert to lower case
|
|
88
92
|
if self.options["lowercase"]:
|
|
@@ -90,40 +94,44 @@ class InvoiceTemplate(OrderedDict):
|
|
|
90
94
|
|
|
91
95
|
# Specific replace
|
|
92
96
|
for replace in self.options["replace"]:
|
|
93
|
-
assert len(replace) == 2,
|
|
97
|
+
assert len(replace) == 2, (
|
|
98
|
+
"Error in Template %s A replace should be a list of exactly 2 elements."
|
|
99
|
+
% self["template_name"]
|
|
100
|
+
)
|
|
94
101
|
optimized_str = re.sub(replace[0], replace[1], optimized_str)
|
|
95
102
|
|
|
96
103
|
return optimized_str
|
|
97
104
|
|
|
98
|
-
def matches_input(self,
|
|
105
|
+
def matches_input(self, extracted_str: str) -> bool:
|
|
99
106
|
"""See if string matches all keyword patterns and no exclude_keyword patterns set in template file.
|
|
100
107
|
|
|
101
108
|
Args:
|
|
102
|
-
|
|
109
|
+
extracted_str: String of the text from OCR of the pdf before applying options defined in the template.
|
|
103
110
|
|
|
104
111
|
Return:
|
|
105
112
|
Boolean
|
|
106
113
|
- True if all keywords are found and none of the exclude_keywords are found.
|
|
107
114
|
- False if either not all keywords are found or at least one exclude_keyword is found."""
|
|
108
115
|
|
|
109
|
-
if all([re.search(keyword,
|
|
116
|
+
if all([re.search(keyword, extracted_str) for keyword in self["keywords"]]):
|
|
110
117
|
# All keyword patterns matched
|
|
111
118
|
if self["exclude_keywords"]:
|
|
112
|
-
if any([re.search(exclude_keyword,
|
|
119
|
+
if any([re.search(exclude_keyword, extracted_str) for exclude_keyword in self["exclude_keywords"]]):
|
|
113
120
|
# At least one exclude_keyword matches
|
|
114
|
-
logger.debug("Template: %s
|
|
121
|
+
logger.debug("Template: %s | Keywords matched. Exclude keyword found!", self["template_name"])
|
|
115
122
|
return False
|
|
116
123
|
# No exclude_keywords or none match, template is good
|
|
117
|
-
logger.debug("Template: %s
|
|
124
|
+
logger.debug("Template: %s | Keywords matched. No exclude keywords found.", self["template_name"])
|
|
118
125
|
return True
|
|
119
126
|
else:
|
|
120
|
-
logger.debug("Template: %s
|
|
127
|
+
logger.debug("Template: %s | Failed to match all keywords.", self["template_name"])
|
|
121
128
|
return False
|
|
122
129
|
|
|
123
130
|
def parse_number(self, value):
|
|
124
|
-
assert (
|
|
125
|
-
|
|
126
|
-
|
|
131
|
+
assert value.count(self.options["decimal_separator"]) < 2, (
|
|
132
|
+
"Error in Template %s Decimal separator cannot be present several times"
|
|
133
|
+
% self["template_name"]
|
|
134
|
+
)
|
|
127
135
|
# replace decimal separator by a |
|
|
128
136
|
amount_pipe = value.replace(self.options["decimal_separator"], "|")
|
|
129
137
|
# remove all possible thousands separators
|
|
@@ -177,7 +185,7 @@ class InvoiceTemplate(OrderedDict):
|
|
|
177
185
|
self.options["date_formats"],
|
|
178
186
|
)
|
|
179
187
|
logger.debug(
|
|
180
|
-
"Float parsing: decimal separator
|
|
188
|
+
"Float parsing: decimal separator=[%s]", self.options["decimal_separator"]
|
|
181
189
|
)
|
|
182
190
|
logger.debug("keywords=%s", self["keywords"])
|
|
183
191
|
logger.debug(self.options)
|
|
@@ -191,7 +199,7 @@ class InvoiceTemplate(OrderedDict):
|
|
|
191
199
|
# v is the value
|
|
192
200
|
if isinstance(v, dict):
|
|
193
201
|
# Options were supplied to this field
|
|
194
|
-
if "area" in v and input_module in (pdftotext, tesseract):
|
|
202
|
+
if "area" in v and input_module in (pdftotext, ocrmypdf, tesseract):
|
|
195
203
|
# Area is currently only supported for pdftotext
|
|
196
204
|
# area is optional and re-extracts the text being searched
|
|
197
205
|
# This obviously has a performance impact, so use wisely
|
|
@@ -199,10 +207,9 @@ class InvoiceTemplate(OrderedDict):
|
|
|
199
207
|
logger.debug(f"Area was specified with parameters {v['area']}")
|
|
200
208
|
# Extract the text for the specified area
|
|
201
209
|
# Do NOT overwrite optimized_str. We're inside a loop and it will affect all other fields!
|
|
202
|
-
optimized_str_area = input_module.to_text(invoice_file, v['area'])
|
|
210
|
+
optimized_str_area = input_module.to_text(invoice_file, v['area'])
|
|
203
211
|
# Log the text
|
|
204
|
-
logger.debug("START pdftotext area result
|
|
205
|
-
logger.debug(optimized_str_area)
|
|
212
|
+
logger.debug("START pdftotext area result ===========================\n%s", optimized_str_area)
|
|
206
213
|
logger.debug("END pdftotext area result =============================")
|
|
207
214
|
optimized_str_for_parser = optimized_str_area
|
|
208
215
|
else:
|
|
@@ -217,11 +224,11 @@ class InvoiceTemplate(OrderedDict):
|
|
|
217
224
|
if value:
|
|
218
225
|
output[k] = value
|
|
219
226
|
else:
|
|
220
|
-
logger.
|
|
227
|
+
logger.warning("Failed to parse field %s with parser %s", k, v["parser"])
|
|
221
228
|
else:
|
|
222
|
-
logger.
|
|
229
|
+
logger.error("Field %s has unknown parser %s set", k, v["parser"])
|
|
223
230
|
else:
|
|
224
|
-
logger.
|
|
231
|
+
logger.error("Field %s doesn't have parser specified", k)
|
|
225
232
|
elif k.startswith("static_"):
|
|
226
233
|
logger.debug("field=%s | static value=%s", k, v)
|
|
227
234
|
output[k.replace("static_", "")] = v
|
|
@@ -261,7 +268,8 @@ class InvoiceTemplate(OrderedDict):
|
|
|
261
268
|
|
|
262
269
|
if set(required_fields).issubset(output.keys()):
|
|
263
270
|
output["desc"] = "Invoice from %s" % (self["issuer"])
|
|
264
|
-
logger.debug(output)
|
|
271
|
+
logger.debug("\n %s", pformat(output, indent=2))
|
|
272
|
+
# when python 3.7 support stops add sort_dicts=False,
|
|
265
273
|
return output
|
|
266
274
|
else:
|
|
267
275
|
fields = list(set(output.keys()))
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This module abstracts templates for invoice providers.
|
|
3
|
+
|
|
4
|
+
Templates are initially read from .yml or .json files and then kept as class.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
import json
|
|
9
|
+
try:
|
|
10
|
+
from yaml import load, YAMLError, CSafeLoader as SafeLoader
|
|
11
|
+
except ImportError: # pragma: no cover
|
|
12
|
+
from yaml import load, SafeLoader, YAMLError
|
|
13
|
+
import pkg_resources
|
|
14
|
+
from logging import getLogger
|
|
15
|
+
from .invoice_template import InvoiceTemplate
|
|
16
|
+
import codecs
|
|
17
|
+
|
|
18
|
+
logger = getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def read_templates(folder=None):
|
|
22
|
+
"""
|
|
23
|
+
Load yaml templates from template folder. Return list of dicts.
|
|
24
|
+
|
|
25
|
+
Use built-in templates if no folder is set.
|
|
26
|
+
|
|
27
|
+
Parameters
|
|
28
|
+
----------
|
|
29
|
+
folder : str
|
|
30
|
+
user defined folder where they stores their files, if None uses built-in templates
|
|
31
|
+
|
|
32
|
+
Returns
|
|
33
|
+
-------
|
|
34
|
+
output : Instance of `InvoiceTemplate`
|
|
35
|
+
template which match based on keywords
|
|
36
|
+
|
|
37
|
+
Examples
|
|
38
|
+
--------
|
|
39
|
+
|
|
40
|
+
>>> read_template("home/duskybomb/invoice-templates/")
|
|
41
|
+
InvoiceTemplate([('issuer', 'OYO'), ('fields', {'amount': 'Grand Total\\s+Rs (\\d+)',
|
|
42
|
+
'date': 'Date:\\s(\\d{1,2}\\/\\d{1,2}\\/\\d{1,4})', 'invoice_number': '([A-Z0-9]+)\\s+Cash at Hotel'}),
|
|
43
|
+
('keywords', ['OYO', 'Oravel', 'Stays']), ('options', {'currency': 'INR', 'decimal_separator': '.'}),
|
|
44
|
+
('template_name', 'com.oyo.invoice.yml'), ('exclude_keywords', [])])
|
|
45
|
+
|
|
46
|
+
After reading the template you can use the result as an instance of `InvoiceTemplate` to extract fields from
|
|
47
|
+
`extract_data()`
|
|
48
|
+
|
|
49
|
+
>>> my_template = InvoiceTemplate([('issuer', 'OYO'), ('fields', {'amount': 'Grand Total\\s+Rs (\\d+)',
|
|
50
|
+
'date': 'Date:\\s(\\d{1,2}\\/\\d{1,2}\\/\\d{1,4})', 'invoice_number': '([A-Z0-9]+)\\s+Cash at Hotel'}),
|
|
51
|
+
('keywords', ['OYO', 'Oravel', 'Stays']), ('options', {'currency': 'INR', 'decimal_separator': '.'}),
|
|
52
|
+
('template_name', 'com.oyo.invoice.yml'), ('exclude_keywords', [])])
|
|
53
|
+
>>> extract_data("invoice2data/test/pdfs/oyo.pdf", my_template, pdftotext)
|
|
54
|
+
{'issuer': 'OYO', 'amount': 1939.0, 'date': datetime.datetime(2017, 12, 31, 0, 0), 'invoice_number': 'IBZY2087',
|
|
55
|
+
'currency': 'INR', 'desc': 'Invoice IBZY2087 from OYO'}
|
|
56
|
+
|
|
57
|
+
"""
|
|
58
|
+
|
|
59
|
+
output = []
|
|
60
|
+
|
|
61
|
+
if folder is None:
|
|
62
|
+
folder = pkg_resources.resource_filename(__name__, "templates")
|
|
63
|
+
|
|
64
|
+
for path, subdirs, files in os.walk(folder):
|
|
65
|
+
for name in sorted(files):
|
|
66
|
+
with codecs.open(
|
|
67
|
+
os.path.join(path, name), encoding="utf-8"
|
|
68
|
+
) as template_file:
|
|
69
|
+
if name.endswith(".yml"):
|
|
70
|
+
try:
|
|
71
|
+
tpl = load(template_file.read(), Loader=SafeLoader)
|
|
72
|
+
except YAMLError as error:
|
|
73
|
+
logger.warning("Failed to load %s template:\n%s", name, error)
|
|
74
|
+
continue
|
|
75
|
+
else:
|
|
76
|
+
try:
|
|
77
|
+
tpl = json.loads(template_file.read())
|
|
78
|
+
except ValueError as error:
|
|
79
|
+
logger.warning("json Loader Failed to load %s template:\n%s", name, error)
|
|
80
|
+
tpl["template_name"] = name
|
|
81
|
+
|
|
82
|
+
# Test if all required fields are in template
|
|
83
|
+
if "keywords" not in tpl.keys():
|
|
84
|
+
logger.warning(
|
|
85
|
+
"Failed to load template %s Missing mandatory 'keywords' field.",
|
|
86
|
+
name,
|
|
87
|
+
)
|
|
88
|
+
continue
|
|
89
|
+
|
|
90
|
+
# Convert keywords to list, if only one
|
|
91
|
+
if not isinstance(tpl["keywords"], list):
|
|
92
|
+
tpl["keywords"] = [tpl["keywords"]]
|
|
93
|
+
|
|
94
|
+
# Set excluded_keywords as empty list, if not provided
|
|
95
|
+
if "exclude_keywords" not in tpl.keys():
|
|
96
|
+
tpl["exclude_keywords"] = []
|
|
97
|
+
|
|
98
|
+
# Convert excluded_keywords to list, if only one
|
|
99
|
+
if not isinstance(tpl["exclude_keywords"], list):
|
|
100
|
+
tpl["exclude_keywords"] = [tpl["exclude_keywords"]]
|
|
101
|
+
|
|
102
|
+
if "priority" not in tpl.keys():
|
|
103
|
+
tpl["priority"] = 5
|
|
104
|
+
|
|
105
|
+
output.append(InvoiceTemplate(tpl))
|
|
106
|
+
|
|
107
|
+
logger.info("Loaded %d templates from %s", len(output), folder)
|
|
108
|
+
|
|
109
|
+
return output
|
|
@@ -5,9 +5,9 @@ Initial work and maintenance by Holger Brunn @hbrunn
|
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
import re
|
|
8
|
-
import
|
|
8
|
+
from logging import getLogger
|
|
9
9
|
|
|
10
|
-
logger =
|
|
10
|
+
logger = getLogger(__name__)
|
|
11
11
|
|
|
12
12
|
DEFAULT_OPTIONS = {"line_separator": r"\n"}
|
|
13
13
|
|
|
@@ -23,8 +23,14 @@ def parse_line(patterns, line):
|
|
|
23
23
|
|
|
24
24
|
def parse_block(template, field, settings, content):
|
|
25
25
|
# Validate settings
|
|
26
|
-
assert "line" in settings,
|
|
27
|
-
|
|
26
|
+
assert "line" in settings, (
|
|
27
|
+
"Error in Template %s Line regex missing" % template["template_name"]
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
logger.debug(
|
|
31
|
+
"START lines block content ========================\n%s", content
|
|
32
|
+
)
|
|
33
|
+
logger.debug("END lines block content ==========================")
|
|
28
34
|
lines = []
|
|
29
35
|
current_row = {}
|
|
30
36
|
|
|
@@ -91,7 +97,7 @@ def parse_block(template, field, settings, content):
|
|
|
91
97
|
skip_line_results = [re.search(settings["skip_line"], line)]
|
|
92
98
|
if any(skip_line_results):
|
|
93
99
|
# There was at least one match to a skip_line
|
|
94
|
-
logger.debug("skip_line match on *%s*", line)
|
|
100
|
+
logger.debug("skip_line match on \ns*%s*", line)
|
|
95
101
|
continue
|
|
96
102
|
# If none of those have continued the loop, check if this is just a normal line
|
|
97
103
|
match = parse_line(settings["line"], line)
|
|
@@ -101,7 +107,7 @@ def parse_block(template, field, settings, content):
|
|
|
101
107
|
current_row = parse_current_row(match, current_row)
|
|
102
108
|
continue
|
|
103
109
|
# If the line doesn't match anything, log and continue to next line
|
|
104
|
-
logger.debug("
|
|
110
|
+
logger.debug("The following line doesn't match anything:\n*%s*", line)
|
|
105
111
|
if current_row:
|
|
106
112
|
# All lines processed, so append whatever the final current_row was to output
|
|
107
113
|
lines.append(current_row)
|
|
@@ -115,14 +121,18 @@ def parse_block(template, field, settings, content):
|
|
|
115
121
|
return lines
|
|
116
122
|
|
|
117
123
|
|
|
118
|
-
def
|
|
124
|
+
def parse_by_rule(template, field, rule, content):
|
|
119
125
|
# First apply default options.
|
|
120
126
|
settings = DEFAULT_OPTIONS.copy()
|
|
121
|
-
settings.update(
|
|
127
|
+
settings.update(rule)
|
|
122
128
|
|
|
123
129
|
# Validate settings
|
|
124
|
-
assert "start" in settings,
|
|
125
|
-
|
|
130
|
+
assert "start" in settings, (
|
|
131
|
+
"Error in Template %s Lines start regex missing" % template["template_name"]
|
|
132
|
+
)
|
|
133
|
+
assert "end" in settings, (
|
|
134
|
+
"Error in Template %s Lines end regex missing" % template["template_name"]
|
|
135
|
+
)
|
|
126
136
|
|
|
127
137
|
blocks_count = 0
|
|
128
138
|
lines = []
|
|
@@ -131,12 +141,13 @@ def parse(template, field, _settings, content):
|
|
|
131
141
|
while True:
|
|
132
142
|
start = re.search(settings["start"], content)
|
|
133
143
|
if not start:
|
|
144
|
+
logger.debug("Failed to find lines block start")
|
|
134
145
|
break
|
|
135
146
|
content = content[start.end():]
|
|
136
147
|
|
|
137
148
|
end = re.search(settings["end"], content)
|
|
138
149
|
if not end:
|
|
139
|
-
logger.
|
|
150
|
+
logger.debug("Failed to find lines block end")
|
|
140
151
|
break
|
|
141
152
|
|
|
142
153
|
blocks_count += 1
|
|
@@ -152,6 +163,25 @@ def parse(template, field, _settings, content):
|
|
|
152
163
|
return lines
|
|
153
164
|
|
|
154
165
|
|
|
166
|
+
def parse(template, field, settings, content):
|
|
167
|
+
if "rules" in settings:
|
|
168
|
+
# One field can have multiple sets of line-parsing rules
|
|
169
|
+
rules = settings['rules']
|
|
170
|
+
else:
|
|
171
|
+
# Original syntax stored line-parsing rules in top field YAML object
|
|
172
|
+
keys = ('start', 'end', 'line', 'first_line', 'last_line', 'skip_line', 'types')
|
|
173
|
+
rules = [{k: v for k, v in settings.items() if k in keys}]
|
|
174
|
+
|
|
175
|
+
lines = []
|
|
176
|
+
for i, rule in enumerate(rules):
|
|
177
|
+
logger.debug("Testing Rules set #%s", i)
|
|
178
|
+
new_lines = parse_by_rule(template, field, rule, content)
|
|
179
|
+
if new_lines is not None:
|
|
180
|
+
lines += new_lines
|
|
181
|
+
|
|
182
|
+
return lines
|
|
183
|
+
|
|
184
|
+
|
|
155
185
|
def parse_current_row(match, current_row):
|
|
156
186
|
# Parse the current row data
|
|
157
187
|
for field, value in match.groupdict().items():
|
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
#!/usr/bin/python
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
1
3
|
# SPDX-License-Identifier: MIT
|
|
2
4
|
|
|
3
5
|
"""
|
|
@@ -32,13 +34,15 @@ def parse(template, field, settings, content, legacy=False):
|
|
|
32
34
|
for regex in regexes:
|
|
33
35
|
if not isinstance(regex, str):
|
|
34
36
|
logger.warning("Field \"%s\" regex is not a string (%s)", field, str(regex))
|
|
37
|
+
|
|
35
38
|
continue
|
|
36
39
|
matches = re.findall(regex, content)
|
|
37
|
-
logger.debug("field
|
|
40
|
+
logger.debug("field=\033[1m\033[93m%s\033[0m | regex=\033[36m%s\033[0m | matches=\033[1m\033[92m%s\033[0m"
|
|
41
|
+
, field, settings["regex"], matches)
|
|
38
42
|
if matches:
|
|
39
43
|
for match in matches:
|
|
40
44
|
if isinstance(match, tuple):
|
|
41
|
-
logger.warning("Regex can't contain multiple capturing groups
|
|
45
|
+
logger.warning("Regex can't contain multiple capturing groups %s", regex)
|
|
42
46
|
return None
|
|
43
47
|
result += matches
|
|
44
48
|
|
|
@@ -47,15 +51,28 @@ def parse(template, field, settings, content, legacy=False):
|
|
|
47
51
|
result[k] = template.coerce_type(v, settings["type"])
|
|
48
52
|
|
|
49
53
|
if "group" in settings:
|
|
54
|
+
result = list(filter(None, result))
|
|
50
55
|
if settings["group"] == "sum":
|
|
51
56
|
result = sum(result)
|
|
57
|
+
elif settings["group"] == "min":
|
|
58
|
+
result = min(result)
|
|
59
|
+
elif settings["group"] == "max":
|
|
60
|
+
result = max(result)
|
|
61
|
+
elif settings["group"] == "first":
|
|
62
|
+
result = result[0]
|
|
63
|
+
elif settings["group"] == "last":
|
|
64
|
+
result = result[-1]
|
|
65
|
+
elif settings["group"] == "join":
|
|
66
|
+
result = " ".join(str(v) for v in result)
|
|
52
67
|
else:
|
|
53
|
-
logger.warning("Unsupported grouping method: "
|
|
68
|
+
logger.warning("Unsupported grouping method: %s", settings["group"])
|
|
54
69
|
return None
|
|
55
70
|
else:
|
|
71
|
+
|
|
56
72
|
# Remove duplicates maintaining the order by default (it's more
|
|
57
73
|
# natural). Don't do that for legacy parsing to keep backward
|
|
58
74
|
# compatibility.
|
|
75
|
+
|
|
59
76
|
if legacy:
|
|
60
77
|
result = list(set(result))
|
|
61
78
|
else:
|
|
@@ -4,9 +4,9 @@
|
|
|
4
4
|
Pseudo-parser returning a static (predefined) value
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
-
import
|
|
7
|
+
from logging import getLogger
|
|
8
8
|
|
|
9
|
-
logger =
|
|
9
|
+
logger = getLogger(__name__)
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
def parse(template, field, settings, content):
|
|
@@ -14,6 +14,6 @@ def parse(template, field, settings, content):
|
|
|
14
14
|
logger.warning("Field \"%s\" doesn't have static value specified", field)
|
|
15
15
|
return None
|
|
16
16
|
|
|
17
|
-
logger.debug("field=%s | value
|
|
17
|
+
logger.debug("field=%s | value=['%s']", field, settings["value"])
|
|
18
18
|
|
|
19
19
|
return settings["value"]
|