invoice2data 0.4.1__tar.gz → 0.4.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219) hide show
  1. {invoice2data-0.4.1/src/invoice2data.egg-info → invoice2data-0.4.3}/PKG-INFO +16 -6
  2. {invoice2data-0.4.1 → invoice2data-0.4.3}/README.md +13 -4
  3. {invoice2data-0.4.1 → invoice2data-0.4.3}/setup.cfg +3 -2
  4. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/invoice_template.py +37 -29
  5. invoice2data-0.4.3/src/invoice2data/extract/loader.py +109 -0
  6. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/parsers/__interface__.py +0 -2
  7. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/parsers/lines.py +41 -11
  8. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/parsers/regex.py +20 -3
  9. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/parsers/static.py +3 -3
  10. invoice2data-0.4.3/src/invoice2data/extract/plugins/tables.py +99 -0
  11. invoice2data-0.4.3/src/invoice2data/extract/templates/be/be.accor.invest.ibis.yml +73 -0
  12. invoice2data-0.4.3/src/invoice2data/extract/templates/com/com.AzureInterior.yml +209 -0
  13. invoice2data-0.4.3/src/invoice2data/extract/templates/com/com.expressvpn_prio6.yml +49 -0
  14. invoice2data-0.4.3/src/invoice2data/extract/templates/com/com.namecheap.yml +37 -0
  15. invoice2data-0.4.3/src/invoice2data/extract/templates/com/com.nl.lenovo.digitalriver.yml +60 -0
  16. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.free.adsl-fiber.yml +6 -0
  17. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.action.yml +3 -1
  18. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.albron.yml +3 -1
  19. invoice2data-0.4.3/src/invoice2data/extract/templates/nl/nl.be.coolblue.yml +117 -0
  20. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.gamma.yml +3 -1
  21. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.karwei.yml +3 -1
  22. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.koffiehenk.yml +9 -3
  23. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.praxis.yml +14 -5
  24. invoice2data-0.4.3/src/invoice2data/extract/templates/nl/nl.saeco.philips.eluscious.yml +92 -0
  25. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.transip.yml +3 -0
  26. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.vodafone.yml +19 -16
  27. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.yezzer.yml +3 -0
  28. invoice2data-0.4.3/src/invoice2data/extract/templates/pl/pl.insert.subiekt-gt.yml +43 -0
  29. invoice2data-0.4.3/src/invoice2data/extract/templates/pl/pl.insert.subiekt-nexo.yml +33 -0
  30. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/input/gvision.py +1 -1
  31. invoice2data-0.4.3/src/invoice2data/input/ocrmypdf.py +146 -0
  32. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/input/pdfminer_wrapper.py +1 -1
  33. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/input/pdfplumber.py +3 -3
  34. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/input/pdftotext.py +3 -3
  35. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/input/tesseract.py +38 -11
  36. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/input/text.py +1 -1
  37. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/main.py +93 -21
  38. {invoice2data-0.4.1 → invoice2data-0.4.3/src/invoice2data.egg-info}/PKG-INFO +16 -6
  39. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data.egg-info/SOURCES.txt +15 -2
  40. invoice2data-0.4.3/tests/test_cli.py +334 -0
  41. invoice2data-0.4.3/tests/test_extraction.py +63 -0
  42. invoice2data-0.4.3/tests/test_invoice_template.py +146 -0
  43. invoice2data-0.4.3/tests/test_lib.py +130 -0
  44. invoice2data-0.4.3/tests/test_loader.py +117 -0
  45. invoice2data-0.4.1/src/invoice2data/extract/loader.py +0 -114
  46. invoice2data-0.4.1/src/invoice2data/extract/plugins/tables.py +0 -62
  47. invoice2data-0.4.1/src/invoice2data/extract/templates/com/com.flipkart.WSRetail.yml +0 -16
  48. invoice2data-0.4.1/src/invoice2data/extract/templates/com/com.namecheap.yml +0 -15
  49. {invoice2data-0.4.1 → invoice2data-0.4.3}/LICENSE.txt +0 -0
  50. {invoice2data-0.4.1 → invoice2data-0.4.3}/MANIFEST.in +0 -0
  51. {invoice2data-0.4.1 → invoice2data-0.4.3}/setup.py +0 -0
  52. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/__init__.py +0 -0
  53. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/__init__.py +0 -0
  54. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/parsers/__init__.py +0 -0
  55. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/plugins/__init__.py +0 -0
  56. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/plugins/__interface__.py +0 -0
  57. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/plugins/lines.py +0 -0
  58. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/au/au.com.opal.yml +0 -0
  59. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/au/au.com.telstra.yml +0 -0
  60. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/be/be.boucherie.pochet.yml +0 -0
  61. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/be/be.cebeo.yml +0 -0
  62. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/be/be.lampiris.facture-dacompte.yml +0 -0
  63. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/be/be.lampiris.factuur.yml +0 -0
  64. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/be/be.lampiris.regularisation.yml +0 -0
  65. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/be/be.melchior-vins.yml +0 -0
  66. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/be/be.proximus.yml +0 -0
  67. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/be/be.scarlet.yml +0 -0
  68. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/be/be.securex.social.yml +0 -0
  69. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/ch/ch.pcengines.yml +0 -0
  70. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.amazon.aws.yml +0 -0
  71. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.apple.yml +0 -0
  72. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.apps4rent.yml +0 -0
  73. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.binarylife.yml +0 -0
  74. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.bloomberg.yml +0 -0
  75. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.cloudns.yml +0 -0
  76. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.datadoghq.yml +0 -0
  77. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.digitalocean.yml +0 -0
  78. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.envato.yml +0 -0
  79. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.expressvpn.yml +0 -0
  80. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.ftserussell.yml +0 -0
  81. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.github.yml +0 -0
  82. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.globalsign.yml +0 -0
  83. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.google.adwords.hk.yml +0 -0
  84. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.hobohost.yml +0 -0
  85. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.jamiepro.yml +0 -0
  86. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.linode.yml +0 -0
  87. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.microsoftonline.hk-v2017.yml +0 -0
  88. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.microsoftonline.hk.yml +0 -0
  89. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.mongodb.yml +0 -0
  90. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.namesilo.yml +0 -0
  91. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.newrelic.yml +0 -0
  92. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.nmmn.yml +0 -0
  93. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.nodisto.yml +0 -0
  94. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.nyse.yml +0 -0
  95. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.oyo.invoice.yml +0 -0
  96. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.packtpub.yml +0 -0
  97. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.pixartprinting.yml +0 -0
  98. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.sammymaystone.yml +0 -0
  99. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.scaleway.yml +0 -0
  100. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.textmaster.yml +0 -0
  101. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.tmx.yml +0 -0
  102. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.travis-ci.yml +0 -0
  103. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.twitter.de.yml +0 -0
  104. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.twitter.uk.yml +0 -0
  105. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.twitter.yml +0 -0
  106. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.upwork.yml +0 -0
  107. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/com/com.usersnap.yml +0 -0
  108. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/de/de.amazon.yml +0 -0
  109. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/de/de.bettina-kast.yml +0 -0
  110. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/de/de.digikey.com.yml +0 -0
  111. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/de/de.hosteurope.yml +0 -0
  112. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/de/de.notebooksbilligerBillPay.yml +0 -0
  113. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/de/de.ovh.yml +0 -0
  114. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/de/de.qualityhosting.yml +0 -0
  115. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/de/de.united-domains.yml +0 -0
  116. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/es/com.pepephone.yml +0 -0
  117. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/es/es.supplies24.yml +0 -0
  118. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/co.mooncard.yml +0 -0
  119. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.adobe.ie.yml +0 -0
  120. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.akretion.fr.yml +0 -0
  121. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.amazon.aws.yml +0 -0
  122. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.ateliercopieservice.yml +0 -0
  123. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.chauffeur-prive.yml +0 -0
  124. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.coriolis.yml +0 -0
  125. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.easyjet.fr.yml +0 -0
  126. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.eaudugrandlyon.yml +0 -0
  127. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.godaddy.yml +0 -0
  128. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.google.ie.yml +0 -0
  129. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.hootsuite.yml +0 -0
  130. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.jeanbesson.yml +0 -0
  131. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.ldlc.yml +0 -0
  132. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.linkedin.yml +0 -0
  133. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.mention.yml +0 -0
  134. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.microsoft.ie.yml +0 -0
  135. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.myflyingbox.yml +0 -0
  136. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.officetimeline.yml +0 -0
  137. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.orange-business.mobile.yml +0 -0
  138. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.ovh.fr.yml +0 -0
  139. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.rs-online.fr.yml +0 -0
  140. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.saur.yml +0 -0
  141. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.soyoustart.yml +0 -0
  142. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/com.vinci-autoroutes.yml +0 -0
  143. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/dolibarr.generique.yml +0 -0
  144. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/eu.trainline.yml +0 -0
  145. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.actn.yml +0 -0
  146. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.airfrance.yml +0 -0
  147. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.also.yml +0 -0
  148. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.amazon.yml +0 -0
  149. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.assurance-epargne-pension.yml +0 -0
  150. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.bouyguestelecom.adsl-fiber.yml +0 -0
  151. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.bouyguestelecom.mobile.yml +0 -0
  152. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.butagaz.yml +0 -0
  153. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.chronopost.yml +0 -0
  154. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.dirafi.yml +0 -0
  155. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.domaine-achat.yml +0 -0
  156. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.easytrip.yml +0 -0
  157. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.edf.entreprises.yml +0 -0
  158. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.edf.pme.yml +0 -0
  159. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.finagaz.yml +0 -0
  160. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.fountain.yml +0 -0
  161. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.free.mobile.yml +0 -0
  162. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.free.mobile2.yml +0 -0
  163. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.futur.yml +0 -0
  164. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.ge-iroise.yml +0 -0
  165. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.greffe-tc-lyon.yml +0 -0
  166. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.hiscox.yml +0 -0
  167. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.internetsatellite.yml +0 -0
  168. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.jpg.yml +0 -0
  169. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.kubii.yml +0 -0
  170. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.laposte.boutique.yml +0 -0
  171. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.laposte.coliposte.yml +0 -0
  172. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.lecab.yml +0 -0
  173. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.leroymerlin.yml +0 -0
  174. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.maaf.yml +0 -0
  175. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.mediapart.yml +0 -0
  176. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.moneo-resto.yml +0 -0
  177. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.mouser.yml +0 -0
  178. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.mycelium-roulement.yml +0 -0
  179. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.napsis.yml +0 -0
  180. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.nexity.yml +0 -0
  181. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.orange.fibre.yml +0 -0
  182. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.orange.fixedline.yml +0 -0
  183. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.prestaclic.yml +0 -0
  184. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.publicationannoncelegale.yml +0 -0
  185. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.sfr.adsl-fiber.yml +0 -0
  186. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.sfr.mobile.yml +0 -0
  187. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.sosh.yml +0 -0
  188. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.teledec.yml +0 -0
  189. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/fr.topoffice.yml +0 -0
  190. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/net.online.yml +0 -0
  191. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/fr/net.scaleway.yml +0 -0
  192. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.begra.yml +0 -0
  193. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.blokker.yml +0 -0
  194. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.bunq.yml +0 -0
  195. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.cpe.yml +0 -0
  196. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.farnell.yml +0 -0
  197. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.ferbox.yml +0 -0
  198. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.goos.yml +0 -0
  199. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.kav.yml +0 -0
  200. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.reclameland.yml +0 -0
  201. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.simpel.yml +0 -0
  202. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.tuynder.yml +0 -0
  203. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.vistaprint.yml +0 -0
  204. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.wasco.yml +0 -0
  205. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.weid.yml +0 -0
  206. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/nl/nl.zinkunie.yml +0 -0
  207. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/pl/pl.bmw-fs.yml +0 -0
  208. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/pl/pl.orlen.yml +0 -0
  209. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/pl/pl.p4.yml +0 -0
  210. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/extract/templates/pl/pl.paypro.yml +0 -0
  211. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/input/__init__.py +0 -0
  212. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/output/__init__.py +0 -0
  213. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/output/to_csv.py +0 -0
  214. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/output/to_json.py +0 -0
  215. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data/output/to_xml.py +0 -0
  216. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data.egg-info/dependency_links.txt +0 -0
  217. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data.egg-info/entry_points.txt +0 -0
  218. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data.egg-info/requires.txt +0 -0
  219. {invoice2data-0.4.1 → invoice2data-0.4.3}/src/invoice2data.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: invoice2data
3
- Version: 0.4.1
3
+ Version: 0.4.3
4
4
  Summary: Python parser to extract data from pdf invoice
5
5
  Home-page: https://github.com/invoice-x/invoice2data
6
6
  Author: Manuel Riel
@@ -15,10 +15,11 @@ Classifier: Operating System :: POSIX
15
15
  Classifier: Operating System :: Unix
16
16
  Classifier: Operating System :: Microsoft :: Windows
17
17
  Classifier: License :: OSI Approved :: MIT License
18
- Classifier: Programming Language :: Python :: 3.6
19
18
  Classifier: Programming Language :: Python :: 3.7
20
19
  Classifier: Programming Language :: Python :: 3.8
21
20
  Classifier: Programming Language :: Python :: 3.9
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Programming Language :: Python :: 3.11
22
23
  Classifier: Topic :: Office/Business :: Financial
23
24
  Classifier: Topic :: Office/Business :: Financial :: Accounting
24
25
  Description-Content-Type: text/markdown
@@ -35,9 +36,9 @@ A command line tool and Python library to support your accounting
35
36
  process.
36
37
 
37
38
  1. extracts text from PDF files using different techniques, like
38
- `pdftotext`, `text`, `pdfminer`, `pdfplumber` or OCR -- `tesseract`, or
39
+ `pdftotext`, `text`, `ocrmypdf`, `pdfminer`, `pdfplumber` or OCR -- `tesseract`, or
39
40
  `gvision` (Google Cloud Vision).
40
- 2. searches for regex in the result using a YAML-based template system
41
+ 2. searches for regex in the result using a YAML or JSON-based template system
41
42
  3. saves results as CSV, JSON or XML or renames PDF files to match the content.
42
43
 
43
44
  With the flexible template system you can:
@@ -107,6 +108,7 @@ Choose any of the following input readers:
107
108
  - tesseract `invoice2data --input-reader tesseract invoice.pdf`
108
109
  - pdfminer.six `invoice2data --input-reader pdfminer invoice.pdf`
109
110
  - pdfplumber `invoice2data --input-reader pdfplumber invoice.pdf`
111
+ - ocrmypdf `invoice2data --input-reader ocrmypdf invoice.pdf`
110
112
  - gvision `invoice2data --input-reader gvision invoice.pdf` (needs `GOOGLE_APPLICATION_CREDENTIALS` env var)
111
113
 
112
114
  Choose any of the following output formats:
@@ -122,7 +124,7 @@ Save output file with custom name or a specific folder
122
124
  **Note:** You must specify the `output-format` in order to create
123
125
  `output-name`
124
126
 
125
- Specify folder with yml templates. (e.g. your suppliers)
127
+ Specify folder with yml templates. (e.g. your suppliers)
126
128
 
127
129
  `invoice2data --template-folder ACME-templates invoice.pdf`
128
130
 
@@ -165,7 +167,7 @@ the list to add your own. If deployed by a bigger organisation, there
165
167
  should be an interface to edit templates for new suppliers. 80-20 rule.
166
168
  For a short tutorial on how to add new templates, see [TUTORIAL.md](TUTORIAL.md).
167
169
 
168
- Templates are based on Yaml. They define one or more keywords to find
170
+ Templates are based on Yaml or JSON. They define one or more keywords to find
169
171
  the right template, one or more exclude_keywords to further narrow it down
170
172
  and regexp for fields to be extracted. They could also be a static value,
171
173
  like the full company name.
@@ -209,6 +211,14 @@ The lines package has multiple settings:
209
211
  - skip_line > Optional. If first_line is passed, this pattern indicates which sub-lines will be skipped and their data not recorded. This is useful if tables span multiple pages and you need to skip over page numbers or headers that appear mid-table.
210
212
  - last_line > Optional. If first_line is passed, this pattern denotes the final line of the sub-lines and is included in the output data.
211
213
 
214
+ :warning: Invoice2data uses a YAML templating system. The YAML templates are loaded with [pyyaml](https://github.com/yaml/pyyaml), which is a pure Python implementation (and thus rather slow).
215
+ As an alternative, JSON templates can be used, which are natively better supported by Python.
216
+
217
+ The performance with yaml templates can be greatly increased **10x** by using [libyaml](https://github.com/yaml/libyaml)
218
+ It can be installed on most distributions by:
219
+ `sudo apt-get install libyaml-dev`
220
+
221
+
212
222
  ## Development
213
223
 
214
224
  If you are interested in improving this project, have a look at our
@@ -8,9 +8,9 @@ A command line tool and Python library to support your accounting
8
8
  process.
9
9
 
10
10
  1. extracts text from PDF files using different techniques, like
11
- `pdftotext`, `text`, `pdfminer`, `pdfplumber` or OCR -- `tesseract`, or
11
+ `pdftotext`, `text`, `ocrmypdf`, `pdfminer`, `pdfplumber` or OCR -- `tesseract`, or
12
12
  `gvision` (Google Cloud Vision).
13
- 2. searches for regex in the result using a YAML-based template system
13
+ 2. searches for regex in the result using a YAML or JSON-based template system
14
14
  3. saves results as CSV, JSON or XML or renames PDF files to match the content.
15
15
 
16
16
  With the flexible template system you can:
@@ -80,6 +80,7 @@ Choose any of the following input readers:
80
80
  - tesseract `invoice2data --input-reader tesseract invoice.pdf`
81
81
  - pdfminer.six `invoice2data --input-reader pdfminer invoice.pdf`
82
82
  - pdfplumber `invoice2data --input-reader pdfplumber invoice.pdf`
83
+ - ocrmypdf `invoice2data --input-reader ocrmypdf invoice.pdf`
83
84
  - gvision `invoice2data --input-reader gvision invoice.pdf` (needs `GOOGLE_APPLICATION_CREDENTIALS` env var)
84
85
 
85
86
  Choose any of the following output formats:
@@ -95,7 +96,7 @@ Save output file with custom name or a specific folder
95
96
  **Note:** You must specify the `output-format` in order to create
96
97
  `output-name`
97
98
 
98
- Specify folder with yml templates. (e.g. your suppliers)
99
+ Specify folder with yml templates. (e.g. your suppliers)
99
100
 
100
101
  `invoice2data --template-folder ACME-templates invoice.pdf`
101
102
 
@@ -138,7 +139,7 @@ the list to add your own. If deployed by a bigger organisation, there
138
139
  should be an interface to edit templates for new suppliers. 80-20 rule.
139
140
  For a short tutorial on how to add new templates, see [TUTORIAL.md](TUTORIAL.md).
140
141
 
141
- Templates are based on Yaml. They define one or more keywords to find
142
+ Templates are based on Yaml or JSON. They define one or more keywords to find
142
143
  the right template, one or more exclude_keywords to further narrow it down
143
144
  and regexp for fields to be extracted. They could also be a static value,
144
145
  like the full company name.
@@ -182,6 +183,14 @@ The lines package has multiple settings:
182
183
  - skip_line > Optional. If first_line is passed, this pattern indicates which sub-lines will be skipped and their data not recorded. This is useful if tables span multiple pages and you need to skip over page numbers or headers that appear mid-table.
183
184
  - last_line > Optional. If first_line is passed, this pattern denotes the final line of the sub-lines and is included in the output data.
184
185
 
186
+ :warning: Invoice2data uses a YAML templating system. The YAML templates are loaded with [pyyaml](https://github.com/yaml/pyyaml), which is a pure Python implementation (and thus rather slow).
187
+ As an alternative, JSON templates can be used, which are natively better supported by Python.
188
+
189
+ The performance with yaml templates can be greatly increased **10x** by using [libyaml](https://github.com/yaml/libyaml)
190
+ It can be installed on most distributions by:
191
+ `sudo apt-get install libyaml-dev`
192
+
193
+
185
194
  ## Development
186
195
 
187
196
  If you are interested in improving this project, have a look at our
@@ -2,7 +2,7 @@
2
2
  name = invoice2data
3
3
  author = Manuel Riel
4
4
  description = Python parser to extract data from pdf invoice
5
- version = 0.4.1
5
+ version = 0.4.3
6
6
  url = https://github.com/invoice-x/invoice2data
7
7
  keywords =
8
8
  pdf
@@ -17,10 +17,11 @@ classifiers =
17
17
  Operating System :: Unix
18
18
  Operating System :: Microsoft :: Windows
19
19
  License :: OSI Approved :: MIT License
20
- Programming Language :: Python :: 3.6
21
20
  Programming Language :: Python :: 3.7
22
21
  Programming Language :: Python :: 3.8
23
22
  Programming Language :: Python :: 3.9
23
+ Programming Language :: Python :: 3.10
24
+ Programming Language :: Python :: 3.11
24
25
  Topic :: Office/Business :: Financial
25
26
  Topic :: Office/Business :: Financial :: Accounting
26
27
  long_description = file: README.md
@@ -7,15 +7,16 @@ Templates are initially read from .yml files and then kept as class.
7
7
  import re
8
8
  import dateparser
9
9
  import unicodedata
10
- import logging
10
+ from logging import getLogger
11
+ from pprint import pformat
11
12
  from collections import OrderedDict
12
13
  from . import parsers
13
14
  from .plugins import lines, tables
14
- # Area extraction is currently added for pdftotext and tesseract (which uses pdftotext)
15
- from ..input import pdftotext, tesseract
15
+ # Area extraction is currently added for pdftotext, ocrmypdf and tesseract (which uses pdftotext)
16
+ from ..input import pdftotext, ocrmypdf, tesseract
16
17
  from typing import Optional
17
18
 
18
- logger = logging.getLogger(__name__)
19
+ logger = getLogger(__name__)
19
20
 
20
21
  OPTIONS_DEFAULT = {
21
22
  "remove_whitespace": False,
@@ -41,7 +42,7 @@ class InvoiceTemplate(OrderedDict):
41
42
  -------
42
43
  prepare_input(extracted_str)
43
44
  Input raw string and do transformations, as set in template file.
44
- matches_input(optimized_str)
45
+ matches_input(extracted_str)
45
46
  See if string matches keywords set in template file
46
47
  parse_number(value)
47
48
  Parse number, remove decimal separator and add other options
@@ -59,12 +60,15 @@ class InvoiceTemplate(OrderedDict):
59
60
  # Merge template-specific options with defaults
60
61
  self.options = OPTIONS_DEFAULT.copy()
61
62
 
62
- for lang in self.options["languages"]:
63
- assert len(lang) == 2, "lang code must have 2 letters"
64
-
65
63
  if "options" in self:
66
64
  self.options.update(self["options"])
67
65
 
66
+ for lang in self.options["languages"]:
67
+ assert len(lang) == 2, (
68
+ "Error in Template %s lang code must have 2 letters"
69
+ % self["template_name"]
70
+ )
71
+
68
72
  # Set issuer, if it doesn't exist.
69
73
  if "issuer" not in self.keys():
70
74
  self["issuer"] = self["keywords"][0]
@@ -82,7 +86,7 @@ class InvoiceTemplate(OrderedDict):
82
86
 
83
87
  # Remove accents
84
88
  if self.options["remove_accents"]:
85
- optimized_str = unicodedata.normalize('NFKD', optimized_str).encode('ascii', 'ignore').decode('ascii')
89
+ optimized_str = re.sub('[\u0300-\u0362]', '', unicodedata.normalize('NFKD', optimized_str))
86
90
 
87
91
  # Convert to lower case
88
92
  if self.options["lowercase"]:
@@ -90,40 +94,44 @@ class InvoiceTemplate(OrderedDict):
90
94
 
91
95
  # Specific replace
92
96
  for replace in self.options["replace"]:
93
- assert len(replace) == 2, "A replace should be a list of exactly 2 elements."
97
+ assert len(replace) == 2, (
98
+ "Error in Template %s A replace should be a list of exactly 2 elements."
99
+ % self["template_name"]
100
+ )
94
101
  optimized_str = re.sub(replace[0], replace[1], optimized_str)
95
102
 
96
103
  return optimized_str
97
104
 
98
- def matches_input(self, optimized_str: str) -> bool:
105
+ def matches_input(self, extracted_str: str) -> bool:
99
106
  """See if string matches all keyword patterns and no exclude_keyword patterns set in template file.
100
107
 
101
108
  Args:
102
- optimized_str: String of the text from OCR of the pdf after applying options defined in the template.
109
+ extracted_str: String of the text from OCR of the pdf before applying options defined in the template.
103
110
 
104
111
  Return:
105
112
  Boolean
106
113
  - True if all keywords are found and none of the exclude_keywords are found.
107
114
  - False if either not all keywords are found or at least one exclude_keyword is found."""
108
115
 
109
- if all([re.search(keyword, optimized_str) for keyword in self["keywords"]]):
116
+ if all([re.search(keyword, extracted_str) for keyword in self["keywords"]]):
110
117
  # All keyword patterns matched
111
118
  if self["exclude_keywords"]:
112
- if any([re.search(exclude_keyword, optimized_str) for exclude_keyword in self["exclude_keywords"]]):
119
+ if any([re.search(exclude_keyword, extracted_str) for exclude_keyword in self["exclude_keywords"]]):
113
120
  # At least one exclude_keyword matches
114
- logger.debug("Template: %s. Keywords matched. Exclude keyword found!", self["template_name"])
121
+ logger.debug("Template: %s | Keywords matched. Exclude keyword found!", self["template_name"])
115
122
  return False
116
123
  # No exclude_keywords or none match, template is good
117
- logger.debug("Template: %s. Keywords matched. No exclude keywords found.", self["template_name"])
124
+ logger.debug("Template: %s | Keywords matched. No exclude keywords found.", self["template_name"])
118
125
  return True
119
126
  else:
120
- logger.debug("Template: %s. Failed to match all keywords.", self["template_name"])
127
+ logger.debug("Template: %s | Failed to match all keywords.", self["template_name"])
121
128
  return False
122
129
 
123
130
  def parse_number(self, value):
124
- assert (
125
- value.count(self.options["decimal_separator"]) < 2
126
- ), "Decimal separator cannot be present several times"
131
+ assert value.count(self.options["decimal_separator"]) < 2, (
132
+ "Error in Template %s Decimal separator cannot be present several times"
133
+ % self["template_name"]
134
+ )
127
135
  # replace decimal separator by a |
128
136
  amount_pipe = value.replace(self.options["decimal_separator"], "|")
129
137
  # remove all possible thousands separators
@@ -177,7 +185,7 @@ class InvoiceTemplate(OrderedDict):
177
185
  self.options["date_formats"],
178
186
  )
179
187
  logger.debug(
180
- "Float parsing: decimal separator=%s", self.options["decimal_separator"]
188
+ "Float parsing: decimal separator=[%s]", self.options["decimal_separator"]
181
189
  )
182
190
  logger.debug("keywords=%s", self["keywords"])
183
191
  logger.debug(self.options)
@@ -191,7 +199,7 @@ class InvoiceTemplate(OrderedDict):
191
199
  # v is the value
192
200
  if isinstance(v, dict):
193
201
  # Options were supplied to this field
194
- if "area" in v and input_module in (pdftotext, tesseract):
202
+ if "area" in v and input_module in (pdftotext, ocrmypdf, tesseract):
195
203
  # Area is currently only supported for pdftotext
196
204
  # area is optional and re-extracts the text being searched
197
205
  # This obviously has a performance impact, so use wisely
@@ -199,10 +207,9 @@ class InvoiceTemplate(OrderedDict):
199
207
  logger.debug(f"Area was specified with parameters {v['area']}")
200
208
  # Extract the text for the specified area
201
209
  # Do NOT overwrite optimized_str. We're inside a loop and it will affect all other fields!
202
- optimized_str_area = input_module.to_text(invoice_file, v['area']).decode("utf-8")
210
+ optimized_str_area = input_module.to_text(invoice_file, v['area'])
203
211
  # Log the text
204
- logger.debug("START pdftotext area result ===========================")
205
- logger.debug(optimized_str_area)
212
+ logger.debug("START pdftotext area result ===========================\n%s", optimized_str_area)
206
213
  logger.debug("END pdftotext area result =============================")
207
214
  optimized_str_for_parser = optimized_str_area
208
215
  else:
@@ -217,11 +224,11 @@ class InvoiceTemplate(OrderedDict):
217
224
  if value:
218
225
  output[k] = value
219
226
  else:
220
- logger.error("Failed to parse field %s with parser %s", k, v["parser"])
227
+ logger.warning("Failed to parse field %s with parser %s", k, v["parser"])
221
228
  else:
222
- logger.warning("Field %s has unknown parser %s set", k, v["parser"])
229
+ logger.error("Field %s has unknown parser %s set", k, v["parser"])
223
230
  else:
224
- logger.warning("Field %s doesn't have parser specified", k)
231
+ logger.error("Field %s doesn't have parser specified", k)
225
232
  elif k.startswith("static_"):
226
233
  logger.debug("field=%s | static value=%s", k, v)
227
234
  output[k.replace("static_", "")] = v
@@ -261,7 +268,8 @@ class InvoiceTemplate(OrderedDict):
261
268
 
262
269
  if set(required_fields).issubset(output.keys()):
263
270
  output["desc"] = "Invoice from %s" % (self["issuer"])
264
- logger.debug(output)
271
+ logger.debug("\n %s", pformat(output, indent=2))
272
+ # when python 3.7 support stops add sort_dicts=False,
265
273
  return output
266
274
  else:
267
275
  fields = list(set(output.keys()))
@@ -0,0 +1,109 @@
1
+ """
2
+ This module abstracts templates for invoice providers.
3
+
4
+ Templates are initially read from .yml or .json files and then kept as class.
5
+ """
6
+
7
+ import os
8
+ import json
9
+ try:
10
+ from yaml import load, YAMLError, CSafeLoader as SafeLoader
11
+ except ImportError: # pragma: no cover
12
+ from yaml import load, SafeLoader, YAMLError
13
+ import pkg_resources
14
+ from logging import getLogger
15
+ from .invoice_template import InvoiceTemplate
16
+ import codecs
17
+
18
+ logger = getLogger(__name__)
19
+
20
+
21
+ def read_templates(folder=None):
22
+ """
23
+ Load yaml templates from template folder. Return list of dicts.
24
+
25
+ Use built-in templates if no folder is set.
26
+
27
+ Parameters
28
+ ----------
29
+ folder : str
30
+ user defined folder where they stores their files, if None uses built-in templates
31
+
32
+ Returns
33
+ -------
34
+ output : Instance of `InvoiceTemplate`
35
+ template which match based on keywords
36
+
37
+ Examples
38
+ --------
39
+
40
+ >>> read_template("home/duskybomb/invoice-templates/")
41
+ InvoiceTemplate([('issuer', 'OYO'), ('fields', {'amount': 'Grand Total\\s+Rs (\\d+)',
42
+ 'date': 'Date:\\s(\\d{1,2}\\/\\d{1,2}\\/\\d{1,4})', 'invoice_number': '([A-Z0-9]+)\\s+Cash at Hotel'}),
43
+ ('keywords', ['OYO', 'Oravel', 'Stays']), ('options', {'currency': 'INR', 'decimal_separator': '.'}),
44
+ ('template_name', 'com.oyo.invoice.yml'), ('exclude_keywords', [])])
45
+
46
+ After reading the template you can use the result as an instance of `InvoiceTemplate` to extract fields from
47
+ `extract_data()`
48
+
49
+ >>> my_template = InvoiceTemplate([('issuer', 'OYO'), ('fields', {'amount': 'Grand Total\\s+Rs (\\d+)',
50
+ 'date': 'Date:\\s(\\d{1,2}\\/\\d{1,2}\\/\\d{1,4})', 'invoice_number': '([A-Z0-9]+)\\s+Cash at Hotel'}),
51
+ ('keywords', ['OYO', 'Oravel', 'Stays']), ('options', {'currency': 'INR', 'decimal_separator': '.'}),
52
+ ('template_name', 'com.oyo.invoice.yml'), ('exclude_keywords', [])])
53
+ >>> extract_data("invoice2data/test/pdfs/oyo.pdf", my_template, pdftotext)
54
+ {'issuer': 'OYO', 'amount': 1939.0, 'date': datetime.datetime(2017, 12, 31, 0, 0), 'invoice_number': 'IBZY2087',
55
+ 'currency': 'INR', 'desc': 'Invoice IBZY2087 from OYO'}
56
+
57
+ """
58
+
59
+ output = []
60
+
61
+ if folder is None:
62
+ folder = pkg_resources.resource_filename(__name__, "templates")
63
+
64
+ for path, subdirs, files in os.walk(folder):
65
+ for name in sorted(files):
66
+ with codecs.open(
67
+ os.path.join(path, name), encoding="utf-8"
68
+ ) as template_file:
69
+ if name.endswith(".yml"):
70
+ try:
71
+ tpl = load(template_file.read(), Loader=SafeLoader)
72
+ except YAMLError as error:
73
+ logger.warning("Failed to load %s template:\n%s", name, error)
74
+ continue
75
+ else:
76
+ try:
77
+ tpl = json.loads(template_file.read())
78
+ except ValueError as error:
79
+ logger.warning("json Loader Failed to load %s template:\n%s", name, error)
80
+ tpl["template_name"] = name
81
+
82
+ # Test if all required fields are in template
83
+ if "keywords" not in tpl.keys():
84
+ logger.warning(
85
+ "Failed to load template %s Missing mandatory 'keywords' field.",
86
+ name,
87
+ )
88
+ continue
89
+
90
+ # Convert keywords to list, if only one
91
+ if not isinstance(tpl["keywords"], list):
92
+ tpl["keywords"] = [tpl["keywords"]]
93
+
94
+ # Set excluded_keywords as empty list, if not provided
95
+ if "exclude_keywords" not in tpl.keys():
96
+ tpl["exclude_keywords"] = []
97
+
98
+ # Convert excluded_keywords to list, if only one
99
+ if not isinstance(tpl["exclude_keywords"], list):
100
+ tpl["exclude_keywords"] = [tpl["exclude_keywords"]]
101
+
102
+ if "priority" not in tpl.keys():
103
+ tpl["priority"] = 5
104
+
105
+ output.append(InvoiceTemplate(tpl))
106
+
107
+ logger.info("Loaded %d templates from %s", len(output), folder)
108
+
109
+ return output
@@ -1,5 +1,3 @@
1
- # SPDX-License-Identifier: MIT
2
-
3
1
  """
4
2
  Interface for fields parsers.
5
3
 
@@ -5,9 +5,9 @@ Initial work and maintenance by Holger Brunn @hbrunn
5
5
  """
6
6
 
7
7
  import re
8
- import logging
8
+ from logging import getLogger
9
9
 
10
- logger = logging.getLogger(__name__)
10
+ logger = getLogger(__name__)
11
11
 
12
12
  DEFAULT_OPTIONS = {"line_separator": r"\n"}
13
13
 
@@ -23,8 +23,14 @@ def parse_line(patterns, line):
23
23
 
24
24
  def parse_block(template, field, settings, content):
25
25
  # Validate settings
26
- assert "line" in settings, "Line regex missing"
27
-
26
+ assert "line" in settings, (
27
+ "Error in Template %s Line regex missing" % template["template_name"]
28
+ )
29
+
30
+ logger.debug(
31
+ "START lines block content ========================\n%s", content
32
+ )
33
+ logger.debug("END lines block content ==========================")
28
34
  lines = []
29
35
  current_row = {}
30
36
 
@@ -91,7 +97,7 @@ def parse_block(template, field, settings, content):
91
97
  skip_line_results = [re.search(settings["skip_line"], line)]
92
98
  if any(skip_line_results):
93
99
  # There was at least one match to a skip_line
94
- logger.debug("skip_line match on *%s*", line)
100
+ logger.debug("skip_line match on \ns*%s*", line)
95
101
  continue
96
102
  # If none of those have continued the loop, check if this is just a normal line
97
103
  match = parse_line(settings["line"], line)
@@ -101,7 +107,7 @@ def parse_block(template, field, settings, content):
101
107
  current_row = parse_current_row(match, current_row)
102
108
  continue
103
109
  # If the line doesn't match anything, log and continue to next line
104
- logger.debug("ignoring *%s* because it doesn't match anything", line)
110
+ logger.debug("The following line doesn't match anything:\n*%s*", line)
105
111
  if current_row:
106
112
  # All lines processed, so append whatever the final current_row was to output
107
113
  lines.append(current_row)
@@ -115,14 +121,18 @@ def parse_block(template, field, settings, content):
115
121
  return lines
116
122
 
117
123
 
118
- def parse(template, field, _settings, content):
124
+ def parse_by_rule(template, field, rule, content):
119
125
  # First apply default options.
120
126
  settings = DEFAULT_OPTIONS.copy()
121
- settings.update(_settings)
127
+ settings.update(rule)
122
128
 
123
129
  # Validate settings
124
- assert "start" in settings, "Lines start regex missing"
125
- assert "end" in settings, "Lines end regex missing"
130
+ assert "start" in settings, (
131
+ "Error in Template %s Lines start regex missing" % template["template_name"]
132
+ )
133
+ assert "end" in settings, (
134
+ "Error in Template %s Lines end regex missing" % template["template_name"]
135
+ )
126
136
 
127
137
  blocks_count = 0
128
138
  lines = []
@@ -131,12 +141,13 @@ def parse(template, field, _settings, content):
131
141
  while True:
132
142
  start = re.search(settings["start"], content)
133
143
  if not start:
144
+ logger.debug("Failed to find lines block start")
134
145
  break
135
146
  content = content[start.end():]
136
147
 
137
148
  end = re.search(settings["end"], content)
138
149
  if not end:
139
- logger.warning("Failed to find lines block end")
150
+ logger.debug("Failed to find lines block end")
140
151
  break
141
152
 
142
153
  blocks_count += 1
@@ -152,6 +163,25 @@ def parse(template, field, _settings, content):
152
163
  return lines
153
164
 
154
165
 
166
+ def parse(template, field, settings, content):
167
+ if "rules" in settings:
168
+ # One field can have multiple sets of line-parsing rules
169
+ rules = settings['rules']
170
+ else:
171
+ # Original syntax stored line-parsing rules in top field YAML object
172
+ keys = ('start', 'end', 'line', 'first_line', 'last_line', 'skip_line', 'types')
173
+ rules = [{k: v for k, v in settings.items() if k in keys}]
174
+
175
+ lines = []
176
+ for i, rule in enumerate(rules):
177
+ logger.debug("Testing Rules set #%s", i)
178
+ new_lines = parse_by_rule(template, field, rule, content)
179
+ if new_lines is not None:
180
+ lines += new_lines
181
+
182
+ return lines
183
+
184
+
155
185
  def parse_current_row(match, current_row):
156
186
  # Parse the current row data
157
187
  for field, value in match.groupdict().items():
@@ -1,3 +1,5 @@
1
+ #!/usr/bin/python
2
+ # -*- coding: utf-8 -*-
1
3
  # SPDX-License-Identifier: MIT
2
4
 
3
5
  """
@@ -32,13 +34,15 @@ def parse(template, field, settings, content, legacy=False):
32
34
  for regex in regexes:
33
35
  if not isinstance(regex, str):
34
36
  logger.warning("Field \"%s\" regex is not a string (%s)", field, str(regex))
37
+
35
38
  continue
36
39
  matches = re.findall(regex, content)
37
- logger.debug("field=%s | regex=%s | matches=%s", field, settings["regex"], matches)
40
+ logger.debug("field=\033[1m\033[93m%s\033[0m | regex=\033[36m%s\033[0m | matches=\033[1m\033[92m%s\033[0m"
41
+ , field, settings["regex"], matches)
38
42
  if matches:
39
43
  for match in matches:
40
44
  if isinstance(match, tuple):
41
- logger.warning("Regex can't contain multiple capturing groups (\"" + regex + "\")")
45
+ logger.warning("Regex can't contain multiple capturing groups %s", regex)
42
46
  return None
43
47
  result += matches
44
48
 
@@ -47,15 +51,28 @@ def parse(template, field, settings, content, legacy=False):
47
51
  result[k] = template.coerce_type(v, settings["type"])
48
52
 
49
53
  if "group" in settings:
54
+ result = list(filter(None, result))
50
55
  if settings["group"] == "sum":
51
56
  result = sum(result)
57
+ elif settings["group"] == "min":
58
+ result = min(result)
59
+ elif settings["group"] == "max":
60
+ result = max(result)
61
+ elif settings["group"] == "first":
62
+ result = result[0]
63
+ elif settings["group"] == "last":
64
+ result = result[-1]
65
+ elif settings["group"] == "join":
66
+ result = " ".join(str(v) for v in result)
52
67
  else:
53
- logger.warning("Unsupported grouping method: " + settings["group"])
68
+ logger.warning("Unsupported grouping method: %s", settings["group"])
54
69
  return None
55
70
  else:
71
+
56
72
  # Remove duplicates maintaining the order by default (it's more
57
73
  # natural). Don't do that for legacy parsing to keep backward
58
74
  # compatibility.
75
+
59
76
  if legacy:
60
77
  result = list(set(result))
61
78
  else:
@@ -4,9 +4,9 @@
4
4
  Pseudo-parser returning a static (predefined) value
5
5
  """
6
6
 
7
- import logging
7
+ from logging import getLogger
8
8
 
9
- logger = logging.getLogger(__name__)
9
+ logger = getLogger(__name__)
10
10
 
11
11
 
12
12
  def parse(template, field, settings, content):
@@ -14,6 +14,6 @@ def parse(template, field, settings, content):
14
14
  logger.warning("Field \"%s\" doesn't have static value specified", field)
15
15
  return None
16
16
 
17
- logger.debug("field=%s | value=%s", field, settings["value"])
17
+ logger.debug("field=%s | value=['%s']", field, settings["value"])
18
18
 
19
19
  return settings["value"]