docid 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,508 @@
1
+ """
2
+ Ekstraktory danych z dokumentów OCR.
3
+
4
+ Każdy ekstraktor specjalizuje się w określonym typie dokumentu
5
+ i wyciąga dane kanoniczne potrzebne do generowania ID.
6
+ """
7
+
8
+ import logging
9
+ import re
10
+ from abc import ABC, abstractmethod
11
+ from dataclasses import dataclass
12
+ from enum import Enum
13
+ from typing import Any, Dict, List, Optional, Tuple
14
+
15
+ from ..ocr_processor import DocumentOCRResult
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class DocumentCategory(Enum):
    """Closed set of document kinds an extraction result can be tagged with."""

    # Each member's value is the lowercase string identifier of the category.
    INVOICE = "invoice"
    RECEIPT = "receipt"
    CONTRACT = "contract"
    BANK_STATEMENT = "bank_statement"
    UNKNOWN = "unknown"
27
+
28
+
29
@dataclass
class ExtractionResult:
    """Data extracted from a single document.

    Only ``category`` and ``confidence`` are required; every other field
    defaults to ``None`` and is filled in by whichever extractor produced
    the result.
    """

    category: DocumentCategory
    confidence: float

    # --- fields shared by all document kinds ---
    document_date: Optional[str] = None
    issuer_nip: Optional[str] = None

    # --- invoice fields ---
    invoice_number: Optional[str] = None
    buyer_nip: Optional[str] = None
    gross_amount: Optional[str] = None
    net_amount: Optional[str] = None
    vat_amount: Optional[str] = None

    # --- receipt fields ---
    receipt_number: Optional[str] = None
    cash_register_number: Optional[str] = None

    # --- contract fields ---
    contract_number: Optional[str] = None
    party2_nip: Optional[str] = None
    contract_type: Optional[str] = None

    # --- bank statement fields ---
    account_number: Optional[str] = None
    statement_number: Optional[str] = None

    # --- raw data kept alongside the canonical fields ---
    raw_text: Optional[str] = None
    all_extracted: Optional[Dict[str, Any]] = None
62
+
63
+
64
class BaseExtractor(ABC):
    """Abstract base class for document-type-specific extractors.

    Concrete extractors implement :meth:`can_extract` and :meth:`extract`.
    The ``_normalize_*`` helpers canonicalise amounts, NIP numbers and dates
    so that all extractors emit values in the same textual form.
    """

    @abstractmethod
    def can_extract(self, ocr_result: DocumentOCRResult) -> Tuple[bool, float]:
        """Report whether this extractor can process the document.

        Returns:
            Tuple ``(can_process, confidence)``.
        """

    @abstractmethod
    def extract(self, ocr_result: DocumentOCRResult) -> ExtractionResult:
        """Extract the document's data."""

    def _normalize_amount(self, amount: str) -> str:
        """Normalize a monetary amount to the ``X.XX`` format.

        Everything except digits, ``,`` and ``.`` is discarded and both
        separators are treated alike. The last separator is the decimal
        point when it is followed by one or two digits; any earlier
        separators are then thousands separators. When there are several
        separators and every group after the first has exactly three digits
        (``1.234.567``), all of them are thousands separators.

        Returns:
            The amount with two decimals, or ``""`` for empty or unparseable
            input.
        """
        if not amount:
            return ""

        cleaned = re.sub(r'[^\d,\.]', '', amount).replace(',', '.')
        groups = cleaned.split('.')
        if len(groups) > 1:
            fraction = groups[-1]
            if len(fraction) in (1, 2):
                # "1.234.5" / "1.234.56": only the last separator is decimal.
                cleaned = ''.join(groups[:-1]) + '.' + fraction
            elif len(groups) > 2 and all(len(g) == 3 for g in groups[1:]):
                # "1.234.567": thousands separators only, no decimals.
                cleaned = ''.join(groups)
            # A single separator followed by three or more digits is
            # ambiguous and is left for float() to interpret as a decimal.

        try:
            return f"{float(cleaned):.2f}"
        except ValueError:
            return ""

    def _normalize_nip(self, nip: str) -> str:
        """Normalize a NIP (Polish tax ID) to its bare form.

        Whitespace and dashes are removed. A leading ``PL`` country prefix
        is dropped when exactly ten digits follow it; any other value is
        returned with only the whitespace and dashes stripped.

        Returns:
            The normalized identifier, or ``""`` for empty input.
        """
        if not nip:
            return ""
        compact = re.sub(r'[\s\-]', '', nip)
        prefixed = re.fullmatch(r'PL(\d{10})', compact, re.IGNORECASE)
        return prefixed.group(1) if prefixed else compact

    def _normalize_date(self, date_str: str) -> str:
        """Normalize a date to ``YYYY-MM-DD``.

        The first date found in ``date_str`` is used. Formats are tried in
        this order: ``YYYY-MM-DD``, ``DD-MM-YYYY``, ``DD-MM-YY`` (two-digit
        years are assumed to be 20YY), with ``/``, ``-`` or ``.`` as
        separators.

        Returns:
            The normalized date, ``""`` for empty input, or ``date_str``
            unchanged when no supported format is found.
        """
        if not date_str:
            return ""

        iso = re.search(r'\b(\d{4})[/\-\.](\d{2})[/\-\.](\d{2})\b', date_str)
        if iso:
            year, month, day = iso.groups()
            return f"{year}-{month}-{day}"

        long_year = re.search(r'\b(\d{2})[/\-\.](\d{2})[/\-\.](\d{4})\b', date_str)
        if long_year:
            day, month, year = long_year.groups()
            return f"{year}-{month}-{day}"

        short_year = re.search(r'\b(\d{2})[/\-\.](\d{2})[/\-\.](\d{2})\b', date_str)
        if short_year:
            day, month, year = short_year.groups()
            return f"20{year}-{month}-{day}"

        return date_str
123
+
124
+
125
class InvoiceExtractor(BaseExtractor):
    """Extractor for VAT invoices."""

    INVOICE_KEYWORDS = [
        'faktura', 'fv', 'rachunek', 'invoice',
        'sprzedawca', 'nabywca', 'nip', 'vat',
        'brutto', 'netto', 'podatek'
    ]

    # Exactly one monetary amount: either digit groups of three joined by
    # '.', ',' or a same-line whitespace character with optional 1-2 decimals
    # ("1 234,56"), or a plain number with optional 1-2 decimals ("1234,56").
    # Line breaks are never crossed, so numbers in neighbouring columns or
    # on the next line are not glued into one amount.
    _AMOUNT_PATTERN = (
        r'(?:\d{1,3}(?:(?:[.,]|[^\S\r\n])\d{3})+(?:[.,]\d{1,2})?'
        r'|\d+(?:[.,]\d{1,2})?)(?!\d)'
    )

    def can_extract(self, ocr_result: DocumentOCRResult) -> Tuple[bool, float]:
        """Score how invoice-like the document is.

        Each keyword from ``INVOICE_KEYWORDS`` found in the text adds 0.15;
        the presence of at least one detected NIP, amount and invoice number
        adds 0.2 each. The score is capped at 1.0.

        Returns:
            Tuple ``(score > 0.4, score)``.
        """
        text_lower = ocr_result.full_text.lower()

        keyword_count = sum(1 for kw in self.INVOICE_KEYWORDS if kw in text_lower)

        has_nips = len(ocr_result.detected_nips) >= 1
        has_amounts = len(ocr_result.detected_amounts) >= 1
        has_invoice_num = len(ocr_result.detected_invoice_numbers) >= 1

        confidence = min(1.0, keyword_count * 0.15 +
                         (0.2 if has_nips else 0) +
                         (0.2 if has_amounts else 0) +
                         (0.2 if has_invoice_num else 0))

        return confidence > 0.4, confidence

    def extract(self, ocr_result: DocumentOCRResult) -> ExtractionResult:
        """Build an invoice ``ExtractionResult`` from the OCR output.

        The first detected NIP is taken as the seller's and the second as the
        buyer's. The result's confidence is the OCR average confidence.
        """
        text = ocr_result.full_text
        nips = ocr_result.detected_nips

        seller_nip = nips[0] if nips else None
        buyer_nip = nips[1] if len(nips) > 1 else None

        invoice_number = self._find_invoice_number(text, ocr_result.detected_invoice_numbers)
        issue_date = self._find_issue_date(text, ocr_result.detected_dates)
        gross_amount, net_amount, vat_amount = self._find_amounts(text, ocr_result.detected_amounts)

        return ExtractionResult(
            category=DocumentCategory.INVOICE,
            confidence=ocr_result.average_confidence,
            document_date=issue_date,
            issuer_nip=seller_nip,
            buyer_nip=buyer_nip,
            invoice_number=invoice_number,
            gross_amount=gross_amount,
            net_amount=net_amount,
            vat_amount=vat_amount,
            raw_text=text,
            all_extracted={
                'detected_nips': ocr_result.detected_nips,
                'detected_amounts': ocr_result.detected_amounts,
                'detected_dates': ocr_result.detected_dates,
                'detected_invoice_numbers': ocr_result.detected_invoice_numbers,
            }
        )

    def _find_invoice_number(self, text: str, detected: List[str]) -> Optional[str]:
        """Find the invoice number.

        Looks for a token containing at least one digit after an invoice
        keyword; falls back to the first pre-detected number.

        Returns:
            The upper-cased number from the text, the first entry of
            ``detected``, or ``None``.
        """
        # The word boundaries keep the keywords from matching inside longer
        # words and from swallowing a prefix of the number itself.
        patterns = [
            r'(?i)\b(?:faktura|fv|rachunek|dokumentu)\b\s*(?:vat)?\s*(?:nr|numer)?[:\s]+([A-Z0-9\/\-]*\d+[A-Z0-9\/\-]*)',
            r'(?i)\b(?:nr|numer)\b\s*(?:faktury|fv|dokumentu)?[:\s]+([A-Z0-9\/\-]*\d+[A-Z0-9\/\-]*)',
        ]

        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                return match.group(1).strip().upper()

        return detected[0] if detected else None

    def _find_issue_date(self, text: str, detected: List[str]) -> Optional[str]:
        """Find the issue date, normalized by ``_normalize_date``.

        Dates labelled in the text (both year-first and day-first forms) take
        precedence over the first pre-detected date.
        """
        patterns = [
            r'(?i)data\s*wystawienia[:\s]*(\d{2,4}[.\-/]\d{2}[.\-/]\d{2,4})',
            r'(?i)wystawion[ao]\s*(?:dnia)?[:\s]*(\d{2,4}[.\-/]\d{2}[.\-/]\d{2,4})',
            r'(?i)data[:\s]*(\d{2,4}[.\-/]\d{2}[.\-/]\d{2,4})',
        ]

        for pattern in patterns:
            match = re.search(pattern, text)
            if match:
                return self._normalize_date(match.group(1))

        return self._normalize_date(detected[0]) if detected else None

    def _labelled_amount(self, pattern: str, text: str) -> Optional[str]:
        """Return the normalized first capture of ``pattern`` in ``text``, or ``None``."""
        match = re.search(pattern, text, re.IGNORECASE)
        return self._normalize_amount(match.group(1)) if match else None

    def _find_amounts(self, text: str, detected: List[str]) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        """Find the gross, net and VAT amounts.

        Each amount is the single number following its label. For VAT, a
        leading percentage (the tax rate, e.g. ``VAT 23% 230,00``) is skipped
        and a number directly followed by ``%`` is never taken as the amount.
        When no gross amount is labelled, the largest parseable pre-detected
        amount is used.

        Returns:
            Tuple ``(gross, net, vat)``; each item is a normalized amount or
            ``None``.
        """
        amount = '(' + self._AMOUNT_PATTERN + ')'

        gross = self._labelled_amount(r'brutto[:\s]*' + amount, text)
        net = self._labelled_amount(r'netto[:\s]*' + amount, text)
        vat = self._labelled_amount(
            r'(?:vat|podatek)[:\s]*'
            r'(?:\d{1,2}(?:[.,]\d+)?\s*%[:\s]*)?'  # optional tax rate
            + amount + r'(?!\s*%)',
            text,
        )

        if not gross and detected:
            # Skip entries that fail to parse instead of counting them as 0,
            # so an all-garbage list does not produce a fake "0.00" total.
            parsed = [
                float(normalized)
                for normalized in map(self._normalize_amount, detected)
                if normalized
            ]
            if parsed:
                gross = f"{max(parsed):.2f}"

        return gross, net, vat
250
+
251
+
252
class ReceiptExtractor(BaseExtractor):
    """Extractor for fiscal receipts."""

    RECEIPT_KEYWORDS = [
        'paragon', 'fiskalny', 'kasa', 'sprzedaż',
        'gotówka', 'karta', 'reszta', 'ptu', 'suma'
    ]

    # Exactly one monetary amount: either digit groups of three joined by
    # '.', ',' or a same-line whitespace character with optional 1-2 decimals
    # ("1 234,56"), or a plain number with optional 1-2 decimals ("1234,56").
    # Line breaks are never crossed, so the next line's number is not glued
    # onto the total.
    _AMOUNT_PATTERN = (
        r'(?:\d{1,3}(?:(?:[.,]|[^\S\r\n])\d{3})+(?:[.,]\d{1,2})?'
        r'|\d+(?:[.,]\d{1,2})?)(?!\d)'
    )

    def can_extract(self, ocr_result: DocumentOCRResult) -> Tuple[bool, float]:
        """Score how receipt-like the document is.

        Each keyword from ``RECEIPT_KEYWORDS`` found in the text adds 0.15,
        a fiscal marker ("fiskaln"/"paragon") adds 0.3 and a PTU mention or
        any percentage adds 0.2. The score is capped at 1.0.

        Returns:
            Tuple ``(score > 0.4, score)``.
        """
        text_lower = ocr_result.full_text.lower()

        keyword_count = sum(1 for kw in self.RECEIPT_KEYWORDS if kw in text_lower)

        has_fiscal_markers = 'fiskaln' in text_lower or 'paragon' in text_lower
        has_ptu = 'ptu' in text_lower or bool(re.search(r'\d+%', text_lower))

        confidence = min(1.0, keyword_count * 0.15 +
                         (0.3 if has_fiscal_markers else 0) +
                         (0.2 if has_ptu else 0))

        return confidence > 0.4, confidence

    def extract(self, ocr_result: DocumentOCRResult) -> ExtractionResult:
        """Build a receipt ``ExtractionResult`` from the OCR output.

        The first detected NIP is taken as the seller's and the first detected
        date as the receipt date. The result's confidence is the OCR average
        confidence.
        """
        text = ocr_result.full_text

        seller_nip = ocr_result.detected_nips[0] if ocr_result.detected_nips else None
        receipt_date = ocr_result.detected_dates[0] if ocr_result.detected_dates else None
        gross_amount = self._find_total_amount(text, ocr_result.detected_amounts)
        receipt_num, cash_register = self._find_receipt_identifiers(text)

        return ExtractionResult(
            category=DocumentCategory.RECEIPT,
            confidence=ocr_result.average_confidence,
            document_date=self._normalize_date(receipt_date) if receipt_date else None,
            issuer_nip=seller_nip,
            gross_amount=gross_amount,
            receipt_number=receipt_num,
            cash_register_number=cash_register,
            raw_text=text,
            all_extracted={
                'detected_nips': ocr_result.detected_nips,
                'detected_amounts': ocr_result.detected_amounts,
                'detected_dates': ocr_result.detected_dates,
            }
        )

    def _find_total_amount(self, text: str, detected: List[str]) -> Optional[str]:
        """Find the receipt total.

        Tries the labels "suma" (optionally followed by the currency code
        "PLN"), "razem" and "do zapłaty", in that order; falls back to the
        largest parseable pre-detected amount.

        Returns:
            The normalized total, or ``None`` when nothing usable is found.
        """
        amount = '(' + self._AMOUNT_PATTERN + ')'
        patterns = [
            r'suma\s*(?:pln)?[:\s]*' + amount,
            r'razem[:\s]*' + amount,
            r'do zapłaty[:\s]*' + amount,
        ]

        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return self._normalize_amount(match.group(1))

        # Skip entries that fail to parse instead of counting them as 0,
        # so an all-garbage list does not produce a fake "0.00" total.
        parsed = [
            float(normalized)
            for normalized in map(self._normalize_amount, detected or [])
            if normalized
        ]
        if parsed:
            return f"{max(parsed):.2f}"

        return None

    def _find_receipt_identifiers(self, text: str) -> Tuple[Optional[str], Optional[str]]:
        """Find the receipt number and the cash register number.

        Returns:
            Tuple ``(receipt_number, cash_register_number)``; each item is a
            digit string or ``None``.
        """
        receipt_match = re.search(r'(?:nr|numer)\s*(?:paragonu)?[:\s]*(\d+)', text, re.IGNORECASE)
        receipt_num = receipt_match.group(1) if receipt_match else None

        cash_match = re.search(r'(?:kasa|stanowisko)[:\s]*(\d+)', text, re.IGNORECASE)
        cash_register = cash_match.group(1) if cash_match else None

        return receipt_num, cash_register
343
+
344
+
345
class ContractExtractor(BaseExtractor):
    """Extractor for contracts."""

    CONTRACT_KEYWORDS = [
        'umowa', 'kontrakt', 'porozumienie', 'zlecenie',
        'strona', 'wykonawca', 'zamawiający', 'zleceniodawca',
        'przedmiot', 'wynagrodzenie', 'termin'
    ]

    def can_extract(self, ocr_result: DocumentOCRResult) -> Tuple[bool, float]:
        """Score how contract-like the document is.

        Each keyword from ``CONTRACT_KEYWORDS`` found in the text adds 0.1,
        a contract header ("umowa"/"kontrakt") adds 0.3 and a party mention
        ("strona"/"wykonawca") adds 0.2. The score is capped at 1.0.

        Returns:
            Tuple ``(score > 0.4, score)``.
        """
        text_lower = ocr_result.full_text.lower()

        keyword_count = sum(1 for kw in self.CONTRACT_KEYWORDS if kw in text_lower)

        has_contract_header = 'umowa' in text_lower or 'kontrakt' in text_lower
        has_parties = 'strona' in text_lower or 'wykonawca' in text_lower

        confidence = min(1.0, keyword_count * 0.1 +
                         (0.3 if has_contract_header else 0) +
                         (0.2 if has_parties else 0))

        return confidence > 0.4, confidence

    def extract(self, ocr_result: DocumentOCRResult) -> ExtractionResult:
        """Build a contract ``ExtractionResult`` from the OCR output.

        The first two detected NIPs are taken as the contracting parties.
        The result's confidence is the OCR average confidence.
        """
        text = ocr_result.full_text
        nips = ocr_result.detected_nips

        party1_nip = nips[0] if nips else None
        party2_nip = nips[1] if len(nips) > 1 else None

        contract_date = self._find_contract_date(text, ocr_result.detected_dates)
        contract_number = self._find_contract_number(text)
        contract_type = self._find_contract_type(text)

        return ExtractionResult(
            category=DocumentCategory.CONTRACT,
            confidence=ocr_result.average_confidence,
            document_date=contract_date,
            issuer_nip=party1_nip,
            party2_nip=party2_nip,
            contract_number=contract_number,
            contract_type=contract_type,
            raw_text=text,
            all_extracted={
                'detected_nips': ocr_result.detected_nips,
                'detected_dates': ocr_result.detected_dates,
            }
        )

    def _find_contract_date(self, text: str, detected: List[str]) -> Optional[str]:
        """Find the contract date, normalized by ``_normalize_date``.

        Dates labelled in the text (year-first or day-first, matching what
        ``InvoiceExtractor`` accepts) take precedence over the first
        pre-detected date.
        """
        patterns = [
            r'zawarta\s*(?:w\s*dniu)?[:\s]*(\d{2,4}[.\-/]\d{2}[.\-/]\d{2,4})',
            r'dnia[:\s]*(\d{2,4}[.\-/]\d{2}[.\-/]\d{2,4})',
            r'data[:\s]*(\d{2,4}[.\-/]\d{2}[.\-/]\d{2,4})',
        ]

        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return self._normalize_date(match.group(1))

        return self._normalize_date(detected[0]) if detected else None

    def _find_contract_number(self, text: str) -> Optional[str]:
        """Find the contract number.

        The number must contain at least one digit; otherwise the word that
        follows "umowa" (as in "Umowa o dzieło" or "Umowa najmu") would be
        returned as the number.

        Returns:
            The upper-cased number, or ``None`` when none is found.
        """
        patterns = [
            r'\bumowa\b\s*(?:nr|numer)?[:\s]*([A-Z0-9\/\-]*\d[A-Z0-9\/\-]*)',
            r'\b(?:nr|numer)\s*(?:umowy)?[:\s]*([A-Z0-9\/\-]*\d[A-Z0-9\/\-]*)',
        ]

        for pattern in patterns:
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                return match.group(1).strip().upper()

        return None

    def _find_contract_type(self, text: str) -> Optional[str]:
        """Classify the contract by the first known keyword present in the text.

        Keywords are checked in the order they are listed below, not in the
        order they occur in the text.

        Returns:
            The type code, or ``None`` when no keyword is present.
        """
        types = {
            'zlecenie': 'ZLECENIE',
            'o dzieło': 'DZIELO',
            'najmu': 'NAJEM',
            'sprzedaży': 'SPRZEDAZ',
            'współpracy': 'WSPOLPRACA',
            'o pracę': 'PRACA',
        }

        text_lower = text.lower()
        for keyword, contract_type in types.items():
            if keyword in text_lower:
                return contract_type

        return None
442
+
443
+
444
class DocumentExtractor:
    """Top-level document extractor.

    Delegates to the specialised extractor whose ``can_extract`` reports the
    highest confidence for the given OCR result.
    """

    def __init__(self):
        invoice = InvoiceExtractor()
        receipt = ReceiptExtractor()
        contract = ContractExtractor()
        # Order matters: on equal confidence the earlier extractor wins.
        self.extractors: List[BaseExtractor] = [invoice, receipt, contract]

    def extract(self, ocr_result: DocumentOCRResult) -> ExtractionResult:
        """Extract data with the best-matching extractor.

        When no extractor accepts the document, a result tagged
        ``DocumentCategory.UNKNOWN`` is returned, carrying the first detected
        date and NIP (if any) and the raw OCR detections.
        """
        chosen = None
        top_score = 0.0

        for candidate in self.extractors:
            accepted, score = candidate.can_extract(ocr_result)
            if not accepted:
                continue
            # Strict comparison keeps the earliest extractor on ties.
            if score > top_score:
                chosen, top_score = candidate, score

        if not chosen:
            logger.warning("Could not determine document type, returning unknown")
            dates = ocr_result.detected_dates
            nips = ocr_result.detected_nips
            return ExtractionResult(
                category=DocumentCategory.UNKNOWN,
                confidence=ocr_result.average_confidence,
                document_date=dates[0] if dates else None,
                issuer_nip=nips[0] if nips else None,
                raw_text=ocr_result.full_text,
                all_extracted={
                    'detected_nips': ocr_result.detected_nips,
                    'detected_amounts': ocr_result.detected_amounts,
                    'detected_dates': ocr_result.detected_dates,
                    'detected_invoice_numbers': ocr_result.detected_invoice_numbers,
                }
            )

        logger.info(f"Using {chosen.__class__.__name__} with confidence {top_score:.2f}")
        return chosen.extract(ocr_result)

    def extract_all(self, ocr_result: DocumentOCRResult) -> List[ExtractionResult]:
        """Run every extractor that accepts the document.

        Each result's ``confidence`` is overwritten with the score reported
        by that extractor's ``can_extract``. Useful for comparing results.

        Returns:
            The results sorted by confidence, highest first.
        """
        matched = []

        for candidate in self.extractors:
            accepted, score = candidate.can_extract(ocr_result)
            if not accepted:
                continue
            outcome = candidate.extract(ocr_result)
            outcome.confidence = score
            matched.append(outcome)

        matched.sort(key=lambda outcome: outcome.confidence, reverse=True)
        return matched