mikrowerk-edi-invoicing 0.3.6__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. {mikrowerk_edi_invoicing-0.3.6/mikrowerk_edi_invoicing.egg-info → mikrowerk_edi_invoicing-0.5.0}/PKG-INFO +3 -2
  2. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/__init__.py +18 -1
  3. mikrowerk_edi_invoicing-0.5.0/edi_invoice_parser/model/__init__.py +19 -0
  4. mikrowerk_edi_invoicing-0.5.0/edi_invoice_parser/model/trade_document_types.py +139 -0
  5. mikrowerk_edi_invoicing-0.5.0/edi_invoice_parser/parse_plain_pdf_file.py +5 -0
  6. mikrowerk_edi_invoicing-0.5.0/edi_invoice_parser/pdf_llm_parser/__init__.py +5 -0
  7. mikrowerk_edi_invoicing-0.5.0/edi_invoice_parser/pdf_llm_parser/google_gemini_parser.py +145 -0
  8. mikrowerk_edi_invoicing-0.5.0/edi_invoice_parser/tests/test_parse_plain_pdf_invoice.py +71 -0
  9. mikrowerk_edi_invoicing-0.5.0/edi_invoice_parser/util/timer_helper.py +11 -0
  10. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0/mikrowerk_edi_invoicing.egg-info}/PKG-INFO +3 -2
  11. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/mikrowerk_edi_invoicing.egg-info/SOURCES.txt +12 -0
  12. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/mikrowerk_edi_invoicing.egg-info/requires.txt +1 -0
  13. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/setup.py +5 -9
  14. mikrowerk_edi_invoicing-0.3.6/edi_invoice_parser/model/__init__.py +0 -4
  15. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/LICENSE +0 -0
  16. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/README.md +0 -0
  17. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/cii_dom_parser/__init__.py +0 -0
  18. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/cii_dom_parser/dom_elements_helper.py +0 -0
  19. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/cii_dom_parser/models/__init__.py +0 -0
  20. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/cii_dom_parser/models/accounting.py +0 -0
  21. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/cii_dom_parser/models/container.py +0 -0
  22. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/cii_dom_parser/models/delivery.py +0 -0
  23. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/cii_dom_parser/models/document.py +0 -0
  24. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/cii_dom_parser/models/elements.py +0 -0
  25. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/cii_dom_parser/models/fields.py +0 -0
  26. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/cii_dom_parser/models/note.py +0 -0
  27. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/cii_dom_parser/models/party.py +0 -0
  28. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/cii_dom_parser/models/payment.py +0 -0
  29. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/cii_dom_parser/models/product.py +0 -0
  30. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/cii_dom_parser/models/references.py +0 -0
  31. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/cii_dom_parser/models/trade.py +0 -0
  32. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/cii_dom_parser/models/tradelines.py +0 -0
  33. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/cii_dom_parser/pdf.py +0 -0
  34. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/cii_dom_parser/utils.py +0 -0
  35. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/cii_dom_parser/xml_cii_dom_parser.py +0 -0
  36. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/cii_dom_parser/xmp_schema.py +0 -0
  37. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/cross_industry_invoice_mapper.py +0 -0
  38. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/model/x_rechnung.py +0 -0
  39. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/model/xml_abstract_x_rechnung_parser.py +0 -0
  40. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/tests/__init__.py +0 -0
  41. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/tests/test_iban_handling.py +0 -0
  42. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/tests/test_parse_x_rechnung.py +0 -0
  43. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/ubl_sax_parser/__init__.py +0 -0
  44. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/ubl_sax_parser/xml_ubl_sax_parser.py +0 -0
  45. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/util/__init__.py +0 -0
  46. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/edi_invoice_parser/util/file_helper.py +0 -0
  47. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/mikrowerk_edi_invoicing.egg-info/dependency_links.txt +0 -0
  48. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/mikrowerk_edi_invoicing.egg-info/top_level.txt +0 -0
  49. {mikrowerk_edi_invoicing-0.3.6 → mikrowerk_edi_invoicing-0.5.0}/setup.cfg +0 -0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mikrowerk_edi_invoicing
3
- Version: 0.3.6
4
- Summary: Parser for EDI invoices in CII or UBL format
3
+ Version: 0.5.0
4
+ Summary: Parser for EDI invoices in CII or UBL format or plain pdf with LLM support
5
5
  Author: Mikrowerk a Gammadata Division
6
6
  Author-email: info@mikrowerk.com
7
7
  License: GNU Affero General Public License v3
@@ -22,6 +22,7 @@ Requires-Dist: factur-x==3.6
22
22
  Requires-Dist: jsonpickle~=4.0.1
23
23
  Requires-Dist: parameterized
24
24
  Requires-Dist: schwifty
25
+ Requires-Dist: google
25
26
  Dynamic: author
26
27
  Dynamic: author-email
27
28
  Dynamic: classifier
@@ -2,6 +2,9 @@ from .cross_industry_invoice_mapper import parse_and_map_x_rechnung
2
2
  from .model.x_rechnung import (XRechnung, XRechnungTradeParty, XRechnungTradeAddress, XRechnungTradeContact,
3
3
  XRechnungPaymentMeans, XRechnungBankAccount, XRechnungCurrency, XRechnungTradeLine,
4
4
  XRechnungAppliedTradeTax, XRechnungFinancialCard)
5
+ from .model.trade_document_types import TradeDocument, TradeParty, TradePartyAddress, TradeCurrency, TradePartyContact, \
6
+ TradeLine, PaymentMeans, AppliedTradeTax, BankAccount, FinancialCard, ubl_doc_codes
7
+ from .parse_plain_pdf_file import analyze_document
5
8
 
6
9
  __all__ = ["parse_and_map_x_rechnung",
7
10
  "XRechnung",
@@ -13,5 +16,19 @@ __all__ = ["parse_and_map_x_rechnung",
13
16
  "XRechnungCurrency",
14
17
  "XRechnungTradeLine",
15
18
  "XRechnungAppliedTradeTax",
16
- "XRechnungFinancialCard"
19
+ "XRechnungFinancialCard",
20
+ "analyze_document",
21
+ "TradeDocument",
22
+ "TradeParty",
23
+ "TradePartyAddress",
24
+ "TradeCurrency",
25
+ "TradePartyContact",
26
+ "TradeLine",
27
+ "PaymentMeans",
28
+ "AppliedTradeTax",
29
+ "BankAccount",
30
+ "FinancialCard",
31
+ "ubl_doc_codes"
17
32
  ]
33
+
34
+ version = "0.5.0"
@@ -0,0 +1,19 @@
1
+ from .x_rechnung import XRechnung
2
+ from .trade_document_types import TradeDocument, TradeParty, TradePartyAddress, TradeCurrency, TradePartyContact, \
3
+ TradeLine, PaymentMeans, AppliedTradeTax, BankAccount, FinancialCard, ubl_doc_codes
4
+ from .xml_abstract_x_rechnung_parser import XMLAbstractXRechnungParser
5
+
6
+ __all__ = ["XRechnung",
7
+ "XMLAbstractXRechnungParser",
8
+ "TradeDocument",
9
+ "TradeParty",
10
+ "TradePartyAddress",
11
+ "TradeCurrency",
12
+ "TradePartyContact",
13
+ "TradeLine",
14
+ "PaymentMeans",
15
+ "AppliedTradeTax",
16
+ "BankAccount",
17
+ "FinancialCard",
18
+ "ubl_doc_codes"
19
+ ]
@@ -0,0 +1,139 @@
1
+ from dataclasses import dataclass
2
+ from decimal import Decimal
3
+ import datetime
4
+
5
+ """
6
+ UBL Document Type (XML Root) Description UNCL 1001 Code (Example)
7
+ """
8
+
9
+ ubl_doc_codes = {
10
+ "ApplicationResponse": ('ApplicationResponse', 431, 'Response to an application/message'),
11
+ "Catalogue": ('Catalogue', 71, 'Product catalogue'),
12
+ "CatalogueRequest": ('CatalogueRequest', 171, 'Catalogue request'),
13
+ "CreditNote": ('CreditNote', 381, 'Commercial Credit note'),
14
+ "DebitNote": ('DebitNote', 383, 'Debit note'),
15
+ "DespatchAdvice": ('DespatchAdvice', 250, 'Despatch advice (Advance Ship Notice)'),
16
+ "Invoice": ('Invoice', 380, 'Commercial Invoice'),
17
+ "Order": ('Order', 220, 'Order'),
18
+ "OrderChange": ('OrderChange', 222, 'Order change'),
19
+ "OrderResponse": ('OrderResponse', 255, 'Order response (confirmation/rejection)'),
20
+ "Quotation": ('Quotation', 83, 'Quotation'),
21
+ "RequestForQuotation": ('RequestForQuotation', 135, 'Request for quotation'),
22
+ "RemittanceAdvice": ('RemittanceAdvice', 256, 'Remittance advice'),
23
+ "Statement": ('Statement', 86, 'Account statement / Balance confirmation'),
24
+ "UtilityStatement": ('Utility statement', 490, 'Statement (electricity, gas, etc.)'),
25
+ }
26
+
27
+
28
+ @dataclass
29
+ class TradePartyAddress:
30
+ post_code: str = None
31
+ city_name: str = None
32
+ country_id: str = None
33
+ country_subdivision_id: str = None
34
+ address_line_1: str = None
35
+ address_line_2: str = None
36
+ address_line_3: str = None
37
+
38
+
39
+ @dataclass
40
+ class TradePartyContact:
41
+ name: str
42
+ department_name: str = None
43
+ telephone: str = None
44
+ fax: str = None
45
+ email: str = None
46
+
47
+
48
+ @dataclass
49
+ class TradeParty:
50
+ name: str
51
+ vat_registration_number: str = None
52
+ fiscal_registration_number: str = None
53
+ legal_registration_number: str = None
54
+ address: TradePartyAddress = None
55
+ contact: TradePartyContact = None
56
+
57
+
58
+ @dataclass
59
+ class AppliedTradeTax:
60
+ name: str
61
+ type_code: str = None
62
+ category_code: str = None
63
+ applicable_percent: Decimal = None
64
+ basis_amount: Decimal = None
65
+ calculated_amount: Decimal = None
66
+
67
+
68
+ @dataclass
69
+ class TradeLine:
70
+ pos_number: int
71
+ article_code: str = None
72
+ name: str = None
73
+ description: str = None
74
+ quantity: Decimal = None
75
+ unit_of_measure: str = None
76
+ unit_price: Decimal = None
77
+ total_net: Decimal = None
78
+ tax: AppliedTradeTax = None
79
+ total_amount: Decimal = None
80
+
81
+
82
+ @dataclass
83
+ class TradeCurrency:
84
+ amount: Decimal
85
+ currency_code: str
86
+
87
+
88
+ @dataclass
89
+ class BankAccount:
90
+ iban: str
91
+ bic: str = None
92
+ name: str = None
93
+
94
+
95
+ @dataclass
96
+ class FinancialCard:
97
+ id: str
98
+ cardholder_name: str | None = None
99
+
100
+
101
+ @dataclass
102
+ class PaymentMeans:
103
+ id: str = None
104
+ type_code: str = None
105
+ information: str = None
106
+ financial_card: FinancialCard = None
107
+ payee_account: BankAccount = None
108
+
109
+
110
+ @dataclass
111
+ class TradeDocument:
112
+ """
113
+ Model of a Trade Document
114
+ """
115
+ name: str
116
+ doc_type_code: tuple # Document Type Code: ubl_doc_codes
117
+ doc_id: str = None
118
+ issued_date_time: datetime = None # 'Date'
119
+ languages: str = None # 'Languages'
120
+ notes: str = None # 'Notes'
121
+ sender_reference: str = None # 'Buyer Reference'
122
+ receiver_reference: str = None
123
+ dispatch_reference: str = None
124
+ sales_order_reference: str = None
125
+ sender: TradeParty = None
126
+ receiver: TradeParty = None
127
+ currency_code: str = None # 'Currency Code'
128
+ payment_means: PaymentMeans = None
129
+ payment_terms: str = None # 'Payment Terms'
130
+ line_total_amount: Decimal = None # 'Line Total Amount'
131
+ charge_total_amount: Decimal = None # 'Charge Total Amount'
132
+ allowance_total_amount: Decimal = None # 'Allowance Total Amount'
133
+ tax_basis_total_amount: Decimal = None
134
+ tax_total_amount: Decimal = None # 'Tax Grand Total Amount'
135
+ grand_total_amount: Decimal = None # 'Grand Total Amount'
136
+ total_prepaid_amount: Decimal = None # 'Total Prepaid Amount'
137
+ due_payable_amount: Decimal = None # 'Due Payable Amount'
138
+ trade_line_items: [TradeLine] = None
139
+ applicable_trade_taxes: [AppliedTradeTax] = None
@@ -0,0 +1,5 @@
1
+ from .pdf_llm_parser.google_gemini_parser import analyze_document as google_analyze_document
2
+
3
+
4
+ def analyze_document(pdf_binary: bytes, api_key=None, model: str = None, prompt=None) -> dict:
5
+ return google_analyze_document(pdf_binary, api_key=api_key, model=model, prompt=prompt)
@@ -0,0 +1,5 @@
1
+ from .google_gemini_parser import analyze_document as google_gemini_parser
2
+
3
+ __all__ = ['google_gemini_parser']
4
+
5
+ version = "0.0.1"
@@ -0,0 +1,145 @@
1
+ import json
2
+ import logging
3
+ import os
4
+
5
+ from google import genai
6
+ from google.genai import types
7
+
8
+ DEFAULT_LLM_MODEL = "gemini-2.5-flash-lite"
9
+
10
+ _logger = logging.getLogger(__name__)
11
+
12
+ DEFAULT_PROMPT = """
13
+ Bitte analysiere das beigefügte Dokument und extrahiere alle relevanten Informationen als strukturiertes JSON.
14
+ Verwende exakt das folgende JSON-Schema und beachte die Anweisungen.
15
+
16
+ Wichtige Hinweise:
17
+ - Gib NUR valides JSON zurück. Kein umgebender Text oder Markdown-Formatierung.
18
+ - Bei fehlenden Werten verwende den JSON-Wert null.
19
+ - Gib alle Beträge als Zahlen (float oder integer), nicht als Strings.
20
+ - Formatiere alle Datumsangaben im Format YYYY-MM-DD.
21
+ - Gib das land als "ISO 3166-2" Code an, wenn nicht angegeben nehme "DE" an.
22
+ - Erkenne den Dokumententyp und klassifiziere ihn als: "Rechnung", "Gutschrift", "Auftrag", "Angebot", "Anfrage", "Auftragsbestätigung", "Bestellung"
23
+ - Erkenne für eine Gutschrift auch Varianten wie Credit Note, Rechnungsgutschrift, Erstattung
24
+ - Erkenne für eine Rechnung auch Varianten wie: "Rechnung","Proforma Rechnung","R-Nr.","Rechnung-Nr.","Rech.-Nr.","Invoice No.","Invoice Number","Kostenrechnung"
25
+ - Erkenne für eine Auftragsbestätigung auch Varianten wie: "Bestätigung", "Vertrag","Order Acknowledgement", "Order Confirmation"
26
+ - Erkenne für die Referenz auch Varianten wie: "Projektnummer", "Auftragsnummer", "Bestellnummer", "Referenznummer", "Kundenreferenz", "Reference"
27
+ - Ist der Dokumententyp nicht erkennbar klassifiziere ihn als: "sonstiges"
28
+ - Sind keine Positionen vorhanden erstelle eine zusammenfassung des Textes und die Summen oder Beträge, falls vorhanden
29
+ - Ist der Dokumententyp ist nicht erkennbar erstelle eine Zusammenfassung des Textes
30
+ - Erkenne auch Varianten wie: "MWST.", "VAT", "Ust.", etc. für die Umsatzsteuer.
31
+ - Erkenne für die Währung Varianten wie: "EUR", "€", "$", "USD", "CHF" oder sonstige Währungsbezeichnung oder Symbol
32
+ - Wenn keine Währung abgeben ist nimm "EUR" an
33
+ - Erkenne für das fälligkeitsdatum auch Varianten wie: "zahlbar bis", "fällig", "fällig bis", "zu liefern bis", "gültig bis"
34
+ - Erkenne ob eine Rechnung bereits bezahlt ist, ja oder nein, und vermerke die Zahlungsmethode wie: "Paypal", "Kreditkarte", "Überweisung", "Barzahlung"
35
+ - Eine Rechnung ist bezahlt "ja", wenn die Zahlungsmethode wie: "Paypal", "Kreditkarte" ist.
36
+ - Ernenne für die Steuernummer des Absenders auch: "Tax Number", "VAT Number", "VAT ID", "MWST ID", "UST ID", "Ust-id"
37
+
38
+ JSON-Schema:
39
+ {
40
+ "dokumenttyp": "string",
41
+ "dokumentnummer": "string",
42
+ "referenz": "string",
43
+ "dokumentendatum": "YYYY-MM-DD",
44
+ "fälligkeitsdatum": "YYYY-MM-DD",
45
+ "absender": {
46
+ "name": "string",
47
+ "adresse": "string",
48
+ "plz": "string",
49
+ "ort": "string",
50
+ "land": "string",
51
+ "steuernummer": "string",
52
+ },
53
+ "empfänger": {
54
+ "name": "string",
55
+ "adresse": "string",
56
+ "plz": "string",
57
+ "ort": "string",
58
+ "land": "string",
59
+ },
60
+ "positionen": [
61
+ {
62
+ "POS": "number",
63
+ "Bezeichnung": "string",
64
+ "Menge": "number",
65
+ "Einheit": "string",
66
+ "einzelpreis": "number",
67
+ "Betrag": "number",
68
+ "mwst_satz": "number"
69
+ }
70
+ ],
71
+ "summen": {
72
+ "nettobetrag": "number",
73
+ "umsatzsteuer": "number",
74
+ "rechnungsbetrag": "number"
75
+ "währung": "string"
76
+ },
77
+ "zahlungshinweise": "string",
78
+ "zahlungsmethode": "string"
79
+ "bezahlt": "string",
80
+ "weitere_info": "string",
81
+ "bankverbindung": "string"
82
+ "zusammenfassung": "string",
83
+ }
84
+ """
85
+
86
+
87
+ def analyze_document(pdf_binary: bytes, api_key, model: str, prompt) -> dict:
88
+ """
89
+ Analysiert eine PDF-Rechnung mit Gemini, extrahiert Informationen
90
+ und gibt sie als Python-Dictionary zurück.
91
+ """
92
+
93
+ if not pdf_binary:
94
+ _logger.error("Error no binaries supplied")
95
+ raise ValueError("Error no binaries supplied")
96
+
97
+ if api_key:
98
+ if len(api_key) < 20:
99
+ raise RuntimeError("Der API Key scheint ungültig oder zu kurz zu sein.")
100
+ else:
101
+ api_key = os.environ.get("GEMINI_API_KEY", None)
102
+ if api_key is None:
103
+ raise RuntimeError("Die Umgebungsvariable 'GEMINI_API_KEY' wurde nicht gefunden.")
104
+ if len(api_key) < 20:
105
+ raise RuntimeError("Der API Key scheint ungültig oder zu kurz zu sein.")
106
+
107
+ api_key = api_key.strip()
108
+ client = genai.Client(api_key=api_key)
109
+ if not model:
110
+ model = DEFAULT_LLM_MODEL
111
+ if not prompt:
112
+ prompt = DEFAULT_PROMPT
113
+
114
+ _logger.info("\nSende Anfrage an die Gemini API...")
115
+ # Sende die Anfrage mit dem Prompt und der hochgeladenen Datei
116
+ _logger.info(f"use API-KEY: {api_key} length: {len(api_key)}")
117
+ try:
118
+ response = client.models.generate_content(
119
+ model=model,
120
+ contents=[
121
+ types.Part.from_bytes(
122
+ data=pdf_binary,
123
+ mime_type='application/pdf',
124
+ ),
125
+ prompt])
126
+ except KeyError:
127
+ raise UserWarning("Fehler: Die Umgebungsvariable GOOGLE_API_KEY wurde nicht gefunden.\n"
128
+ "Bitte setzen Sie den Schlüssel, z.B. mit 'export GOOGLE_API_KEY=\"DEIN_API_SCHLÜSSEL\"'")
129
+
130
+ except Exception as e:
131
+ raise UserWarning(f"Ein unerwarteter Fehler ist aufgetreten: {e}")
132
+
133
+ # Bereinige und parse die Antwort
134
+ try:
135
+ # Manchmal gibt das Modell die Antwort in einem Markdown-Codeblock zurück.
136
+ # Dieser Code entfernt die Markierungen, um reines JSON zu erhalten.
137
+ cleaned_response = response.text.strip().replace("```json", "").replace("```", "").strip()
138
+
139
+ # Parse den JSON-String in ein Python-Dictionary
140
+ extracted_data = json.loads(cleaned_response)
141
+ return extracted_data
142
+ except json.JSONDecodeError:
143
+ raise RuntimeError("Fehler beim Parsen der Modell-Antwort:", response.text)
144
+ except Exception as e:
145
+ raise RuntimeError(f"Ein unerwarteter Fehler ist aufgetreten: {e}")
@@ -0,0 +1,71 @@
1
+ import unittest
2
+ import os
3
+ import pathlib
4
+ from parameterized import parameterized
5
+ import json
6
+
7
+ from . import get_checked_file_path
8
+ from edi_invoice_parser.parse_plain_pdf_file import analyze_document
9
+
10
+
11
+ class TestPlainPdfInvoiceParser(unittest.TestCase):
12
+ @parameterized.expand([
13
+ ('pdf', 'plain_pdf_invoices/25313 - Rechnung Konzepthausevent - 16-09-2025.pdf'),
14
+ ('pdf', 'plain_pdf_invoices/57856 - 2509-7377.pdf'),
15
+ ('pdf', 'plain_pdf_invoices/Bestätigung griffty GmbH VA 150919 SCCON_25.pdf'),
16
+ # ('pdf', 'plain_pdf_invoices/Invoice INV-1013.pdf'),
17
+ # ('pdf', 'plain_pdf_invoices/LGM-2509-784_Griffity_GmbH_10_09_2025_Rechnung.pdf'),
18
+ # ('pdf', 'plain_pdf_invoices/Order GO-0741243.pdf'),
19
+ # ('pdf', 'plain_pdf_invoices/Rechnung 30250628.pdf'),
20
+ # ('pdf', 'plain_pdf_invoices/Rechnung-202511899-11267.pdf'),
21
+ # ('pdf', 'plain_pdf_invoices/TS Rechnung TS2025-10586.pdf'),
22
+ # ('pdf', 'plain_pdf_invoices/Verkaufsrechnung 01-137334.pdf'),
23
+ ])
24
+ def test_parse_pdf_invoice(self, file_type, file_path):
25
+ _file_path, _exists, _is_dir = get_checked_file_path(file_path, __file__)
26
+ self.assertEqual(file_type, 'pdf', "Only 'pdf' filetype is supported")
27
+ self.assertTrue(_exists, f"file does not exist: {_file_path}")
28
+ self.test_api_key_is_available()
29
+ api_key = os.environ.get("GEMINI_API_KEY")
30
+
31
+ filepath = pathlib.Path(_file_path)
32
+ binary = filepath.read_bytes()
33
+ invoice_data = analyze_document(binary, api_key=api_key)
34
+ self.assertIsNotNone(invoice_data, "No result retrieved")
35
+ print("\n------------------------------------------------------------")
36
+ print(f"\nDokument Date: {file_path}")
37
+ print("\n--- Extrahierte Dokumentdaten ---")
38
+ # Beispielhafter Zugriff auf einzelne Daten
39
+ print(f"\nDokumenttyp: {invoice_data.get('dokumenttyp', None)}")
40
+ print(f"\ndokumentnummer: {invoice_data.get('dokumentnummer', None)}")
41
+ print(f"\nAbsender: {invoice_data.get('absender', {}).get('name', None)}")
42
+ print(f"\nEmpfänger: {invoice_data.get('empfänger', {}).get('name', None)}")
43
+ print(f"Fälligkeitsdatum: {invoice_data.get('fälligkeitsdatum', None)}")
44
+ print(f"Gesamtbetrag: {invoice_data.get('summen', {}).get('rechnungsbetrag', 'None')} €")
45
+ print("\n------------------------------------------------------------")
46
+ # Gib das Dictionary als formatierten JSON-String aus
47
+ print(json.dumps(invoice_data, indent=2, ensure_ascii=False))
48
+
49
+ _out_file_path = _file_path.replace('.pdf', '.json')
50
+ with open(_out_file_path, "w") as f:
51
+ f.write(json.dumps(invoice_data, indent=2, ensure_ascii=False))
52
+ print(f"written result json to {_out_file_path}")
53
+ print("\n---------------------------------")
54
+
55
+ def test_api_key_is_available(self):
56
+ """
57
+ Prüft, ob die Umgebungsvariable 'PROD_API_KEY' gesetzt ist.
58
+
59
+ WICHTIG: Gib niemals den Inhalt des Keys in Logs aus!
60
+ Prüfe nur, ob er existiert oder valide aussieht (z.B. Länge).
61
+ """
62
+
63
+ # Hier liest Python die Umgebungsvariable
64
+ api_key = os.environ.get("GEMINI_API_KEY")
65
+
66
+ assert api_key is not None, "Die Umgebungsvariable 'GEMINI_API_KEY' wurde nicht gefunden."
67
+ assert len(api_key) > 20, "Der API Key scheint ungültig oder zu kurz zu sein."
68
+
69
+
70
+ if __name__ == '__main__':
71
+ unittest.main()
@@ -0,0 +1,11 @@
1
+ from timeit import default_timer as timer
2
+
3
+
4
+ def timer_func(func):
5
+ def wrapper(*args, **kwargs):
6
+ t1 = timer()
7
+ result = func(*args, **kwargs)
8
+ t2 = timer()
9
+ print(f'{func.__name__}() executed in {(t2-t1):.6f}s')
10
+ return result
11
+ return wrapper
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: mikrowerk_edi_invoicing
3
- Version: 0.3.6
4
- Summary: Parser for EDI invoices in CII or UBL format
3
+ Version: 0.5.0
4
+ Summary: Parser for EDI invoices in CII or UBL format or plain pdf with LLM support
5
5
  Author: Mikrowerk a Gammadata Division
6
6
  Author-email: info@mikrowerk.com
7
7
  License: GNU Affero General Public License v3
@@ -22,6 +22,7 @@ Requires-Dist: factur-x==3.6
22
22
  Requires-Dist: jsonpickle~=4.0.1
23
23
  Requires-Dist: parameterized
24
24
  Requires-Dist: schwifty
25
+ Requires-Dist: google
25
26
  Dynamic: author
26
27
  Dynamic: author-email
27
28
  Dynamic: classifier
@@ -3,6 +3,7 @@ README.md
3
3
  setup.py
4
4
  ./edi_invoice_parser/__init__.py
5
5
  ./edi_invoice_parser/cross_industry_invoice_mapper.py
6
+ ./edi_invoice_parser/parse_plain_pdf_file.py
6
7
  ./edi_invoice_parser/cii_dom_parser/__init__.py
7
8
  ./edi_invoice_parser/cii_dom_parser/dom_elements_helper.py
8
9
  ./edi_invoice_parser/cii_dom_parser/pdf.py
@@ -24,17 +25,23 @@ setup.py
24
25
  ./edi_invoice_parser/cii_dom_parser/models/trade.py
25
26
  ./edi_invoice_parser/cii_dom_parser/models/tradelines.py
26
27
  ./edi_invoice_parser/model/__init__.py
28
+ ./edi_invoice_parser/model/trade_document_types.py
27
29
  ./edi_invoice_parser/model/x_rechnung.py
28
30
  ./edi_invoice_parser/model/xml_abstract_x_rechnung_parser.py
31
+ ./edi_invoice_parser/pdf_llm_parser/__init__.py
32
+ ./edi_invoice_parser/pdf_llm_parser/google_gemini_parser.py
29
33
  ./edi_invoice_parser/tests/__init__.py
30
34
  ./edi_invoice_parser/tests/test_iban_handling.py
35
+ ./edi_invoice_parser/tests/test_parse_plain_pdf_invoice.py
31
36
  ./edi_invoice_parser/tests/test_parse_x_rechnung.py
32
37
  ./edi_invoice_parser/ubl_sax_parser/__init__.py
33
38
  ./edi_invoice_parser/ubl_sax_parser/xml_ubl_sax_parser.py
34
39
  ./edi_invoice_parser/util/__init__.py
35
40
  ./edi_invoice_parser/util/file_helper.py
41
+ ./edi_invoice_parser/util/timer_helper.py
36
42
  edi_invoice_parser/__init__.py
37
43
  edi_invoice_parser/cross_industry_invoice_mapper.py
44
+ edi_invoice_parser/parse_plain_pdf_file.py
38
45
  edi_invoice_parser/cii_dom_parser/__init__.py
39
46
  edi_invoice_parser/cii_dom_parser/dom_elements_helper.py
40
47
  edi_invoice_parser/cii_dom_parser/pdf.py
@@ -56,15 +63,20 @@ edi_invoice_parser/cii_dom_parser/models/references.py
56
63
  edi_invoice_parser/cii_dom_parser/models/trade.py
57
64
  edi_invoice_parser/cii_dom_parser/models/tradelines.py
58
65
  edi_invoice_parser/model/__init__.py
66
+ edi_invoice_parser/model/trade_document_types.py
59
67
  edi_invoice_parser/model/x_rechnung.py
60
68
  edi_invoice_parser/model/xml_abstract_x_rechnung_parser.py
69
+ edi_invoice_parser/pdf_llm_parser/__init__.py
70
+ edi_invoice_parser/pdf_llm_parser/google_gemini_parser.py
61
71
  edi_invoice_parser/tests/__init__.py
62
72
  edi_invoice_parser/tests/test_iban_handling.py
73
+ edi_invoice_parser/tests/test_parse_plain_pdf_invoice.py
63
74
  edi_invoice_parser/tests/test_parse_x_rechnung.py
64
75
  edi_invoice_parser/ubl_sax_parser/__init__.py
65
76
  edi_invoice_parser/ubl_sax_parser/xml_ubl_sax_parser.py
66
77
  edi_invoice_parser/util/__init__.py
67
78
  edi_invoice_parser/util/file_helper.py
79
+ edi_invoice_parser/util/timer_helper.py
68
80
  mikrowerk_edi_invoicing.egg-info/PKG-INFO
69
81
  mikrowerk_edi_invoicing.egg-info/SOURCES.txt
70
82
  mikrowerk_edi_invoicing.egg-info/dependency_links.txt
@@ -10,3 +10,4 @@ factur-x==3.6
10
10
  jsonpickle~=4.0.1
11
11
  parameterized
12
12
  schwifty
13
+ google
@@ -16,7 +16,7 @@ setuptools.setup(
16
16
  name="mikrowerk_edi_invoicing",
17
17
 
18
18
  # version of the module
19
- version="0.3.6",
19
+ version="0.5.0",
20
20
 
21
21
  # Name of Author
22
22
  author="Mikrowerk a Gammadata Division",
@@ -25,23 +25,18 @@ setuptools.setup(
25
25
  author_email="info@mikrowerk.com",
26
26
 
27
27
  # #Small Description about module
28
- description="Parser for EDI invoices in CII or UBL format",
28
+ description="Parser for EDI invoices in CII or UBL format or plain pdf with LLM support",
29
29
 
30
30
  # Specifying that we are using markdown file for description
31
31
  long_description=long_description,
32
32
  long_description_content_type="text/markdown",
33
33
 
34
- # Any link to reach this module, ***if*** you have any webpage or github profile
35
- # url="https://github.com/username/",
36
- packages=setuptools.find_packages(exclude=["tests_*", "tests"]),
34
+ packages=setuptools.find_packages(exclude=["tests_*", "tests", "pdfparser"]),
37
35
 
38
36
  package_dir={"": "."},
39
37
  include_package_data=True,
40
38
  package_data={'': ['*.yaml', '*.jinja2', '*.sh']},
41
39
 
42
- # if module has dependencies i.e. if your package rely on other package at pypi.org
43
- # then you must add there, in order to download every requirement of package
44
-
45
40
  install_requires=[
46
41
  "lxml",
47
42
  "pypdf",
@@ -55,11 +50,12 @@ setuptools.setup(
55
50
  "jsonpickle~=4.0.1",
56
51
  "parameterized",
57
52
  "schwifty",
53
+ "google"
58
54
  ],
59
55
 
60
56
  license="GNU Affero General Public License v3 ",
61
57
 
62
- # classifiers like program is suitable for python3, just leave as it is.
58
+ # classifiers like program are suitable for python3, leave as it is.
63
59
  classifiers=[
64
60
  "Programming Language :: Python :: 3",
65
61
  "License :: OSI Approved :: GNU Affero General Public License v3 or later (AGPLv3+)",
@@ -1,4 +0,0 @@
1
- from .x_rechnung import XRechnung
2
- from .xml_abstract_x_rechnung_parser import XMLAbstractXRechnungParser
3
-
4
- __all__ = ["XRechnung", "XMLAbstractXRechnungParser"]