docid 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,579 @@
1
+ """
2
+ Procesor OCR zoptymalizowany dla CPU.
3
+
4
+ Używa PaddleOCR jako głównego silnika (najlepszy stosunek jakość/wydajność na CPU),
5
+ z fallbackiem na Tesseract dla kompatybilności.
6
+ """
7
+
8
+ import logging
9
+ import os
10
+ import re
11
+ from abc import ABC, abstractmethod
12
+ from dataclasses import dataclass, field
13
+ from enum import Enum
14
+ from pathlib import Path
15
+ from typing import List, Optional, Tuple, Union
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
class OCREngine(Enum):
    """OCR back ends known to this module, keyed by a lowercase identifier."""

    # PaddleOCR: best quality on CPU.
    PADDLE = "paddle"
    # Tesseract: fast fallback engine.
    TESSERACT = "tesseract"
    # EasyOCR: slower but accurate.
    EASYOCR = "easyocr"
25
+
26
+
27
@dataclass
class OCRResult:
    """OCR output for a single fragment of text."""

    text: str
    confidence: float
    # Bounding rectangle as (x1, y1, x2, y2); None when no position is known.
    bbox: Optional[Tuple[int, int, int, int]] = None

    def __str__(self) -> str:
        """Return just the recognised text."""
        return self.text
36
+
37
+
38
@dataclass
class DocumentOCRResult:
    """Complete OCR output for one processed image (or one page of a PDF)."""

    full_text: str
    lines: List[OCRResult]
    average_confidence: float
    engine_used: OCREngine

    # Metadata about the run.
    source_file: Optional[str] = None
    processing_time_ms: Optional[float] = None

    # Structures detected in the recognised text; each defaults to a fresh
    # empty list per instance.
    detected_nips: List[str] = field(default_factory=list)
    detected_amounts: List[str] = field(default_factory=list)
    detected_dates: List[str] = field(default_factory=list)
    detected_invoice_numbers: List[str] = field(default_factory=list)
55
+
56
+
57
class BaseOCRProcessor(ABC):
    """Common interface for OCR back ends plus shared text-mining helpers.

    Subclasses implement :meth:`process_image` and :meth:`process_pdf`.  The
    regex-based extraction of NIPs, amounts, dates and invoice numbers from
    recognised text lives here so every engine shares it.
    """

    # All patterns are compiled once, when the class is created, instead of
    # being rebuilt on every call.

    # Weights applied to the first nine NIP digits; the weighted sum modulo 11
    # must equal the tenth digit.
    _NIP_WEIGHTS = (6, 5, 7, 2, 3, 4, 5, 6, 7)

    # The trailing/leading digit guards stop a longer digit run (for example
    # an 11-digit number) from being reported as a NIP via its first 10 digits.
    _NIP_PATTERNS = tuple(
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'NIP[:\s]*(\d{3}[-\s]?\d{3}[-\s]?\d{2}[-\s]?\d{2})(?!\d)',
            r'NIP[:\s]*(\d{10})(?!\d)',
            r'(?<!\d)(\d{3}-\d{3}-\d{2}-\d{2})(?!\d)',
            r'(?<!\d)(\d{10})(?!\d)',  # 10 digits without any label
        )
    )

    # Guards shared by the amount patterns:
    #  * not preceded by a digit or by "<digit><separator>", and
    #  * not followed by more digits (optionally after another separator),
    # so fragments of dates or dotted numbers such as "12.03.2024" are not
    # reported as the amount "12.03".
    _AMOUNT_START = r'(?<!\d)(?<!\d[.,])'
    _AMOUNT_END = r'(?![.,]?\d)'

    # Thousands may be separated by a plain or non-breaking space only.  A
    # newline or tab is deliberately excluded: it would fuse numbers from
    # adjacent lines/columns and would survive normalisation.
    _GROUPED_AMOUNT = (
        _AMOUNT_START + r'(\d{1,3}(?:[ \xa0]?\d{3})*[,.]\d{2})' + _AMOUNT_END
    )

    _AMOUNT_PATTERNS = (
        re.compile(_GROUPED_AMOUNT + r'\s*(?:zł|PLN|złotych)?', re.IGNORECASE),
        re.compile(
            r'(?:brutto|netto|razem|suma|do zapłaty)[:\s]*' + _GROUPED_AMOUNT,
            re.IGNORECASE,
        ),
        re.compile(
            _AMOUNT_START + r'(\d+[,.]\d{2})' + _AMOUNT_END + r'\s*(?:zł|PLN)',
            re.IGNORECASE,
        ),
    )

    _DATE_PATTERNS = tuple(
        re.compile(pattern)
        for pattern in (
            r'\b(\d{2}[-\.\/]\d{2}[-\.\/]\d{4})\b',  # DD-MM-YYYY
            r'\b(\d{4}[-\.\/]\d{2}[-\.\/]\d{2})\b',  # YYYY-MM-DD
            r'\b(\d{2}[-\.\/]\d{2}[-\.\/]\d{2})\b',  # DD-MM-YY
        )
    )

    # "(?<![^\W\d_])" means "not preceded by a letter": the keyword must not be
    # the tail of another word, otherwise e.g. the "f" of "Ref 12/2024" starts
    # a bogus invoice number.
    _INVOICE_PATTERNS = tuple(
        re.compile(r'(?<![^\W\d_])' + pattern, re.IGNORECASE)
        for pattern in (
            r'(?:faktura|fv|rachunek|nr)[:\s]*([A-Z0-9\/\-]+\d+[A-Z0-9\/\-]*)',
            r'(?:numer|nr)[:\s]*([A-Z]{1,3}[\s\/\-]?\d{1,4}[\s\/\-]?\d{2,4}[\s\/\-]?\d{2,6})',
            r'(FV[\s\/\-]?\d+[\s\/\-]?\d*[\s\/\-]?\d*)',
            r'(F[\s\/\-]?\d+[\s\/\-]?\d{4})',
        )
    )

    @abstractmethod
    def process_image(self, image_path: Union[str, Path]) -> "DocumentOCRResult":
        """Run OCR on a single image file and return its result."""

    @abstractmethod
    def process_pdf(self, pdf_path: Union[str, Path]) -> List["DocumentOCRResult"]:
        """Run OCR on a PDF and return one result per page."""

    def extract_structured_data(self, text: str) -> dict:
        """Extract structured values from OCR text.

        Args:
            text: Recognised text to search.

        Returns:
            A dict with the keys ``'nips'``, ``'amounts'``, ``'dates'`` and
            ``'invoice_numbers'``, each mapping to a list of unique strings
            in order of discovery.
        """
        return {
            'nips': self._find_nips(text),
            'amounts': self._find_amounts(text),
            'dates': self._find_dates(text),
            'invoice_numbers': self._find_invoice_numbers(text),
        }

    def _find_nips(self, text: str) -> List[str]:
        """Return every checksum-valid NIP in *text* as a bare 10-digit string."""
        results: List[str] = []
        for pattern in self._NIP_PATTERNS:
            for match in pattern.findall(text):
                # Normalise: drop the optional separators.
                nip = re.sub(r'[\s\-]', '', match)
                if len(nip) != 10 or not nip.isdigit():
                    continue
                # zip() stops after the nine weights, leaving the check digit.
                checksum = sum(
                    int(digit) * weight
                    for digit, weight in zip(nip, self._NIP_WEIGHTS)
                )
                if checksum % 11 == int(nip[9]) and nip not in results:
                    results.append(nip)
        return results

    def _find_amounts(self, text: str) -> List[str]:
        """Return monetary amounts found in *text*.

        Amounts are normalised to plain digits with a dot as the decimal
        separator, e.g. ``"1 234,56"`` becomes ``"1234.56"``.
        """
        results: List[str] = []
        for pattern in self._AMOUNT_PATTERNS:
            for match in pattern.findall(text):
                amount = match.replace('\xa0', '').replace(' ', '')
                amount = amount.replace(',', '.')
                if amount not in results:
                    results.append(amount)
        return results

    def _find_dates(self, text: str) -> List[str]:
        """Return date-like strings found in *text*, exactly as written.

        No calendar validation is performed; only the digit layout is checked.
        """
        results: List[str] = []
        for pattern in self._DATE_PATTERNS:
            for match in pattern.findall(text):
                if match not in results:
                    results.append(match)
        return results

    def _find_invoice_numbers(self, text: str) -> List[str]:
        """Return candidate invoice numbers found in *text*, upper-cased.

        Candidates shorter than four characters are discarded.
        """
        results: List[str] = []
        for pattern in self._INVOICE_PATTERNS:
            for match in pattern.findall(text):
                normalized = match.strip().upper()
                if len(normalized) >= 4 and normalized not in results:
                    results.append(normalized)
        return results
163
+
164
+
165
class PaddleOCRProcessor(BaseOCRProcessor):
    """OCR processor backed by PaddleOCR.

    The underlying ``PaddleOCR`` object is created lazily on first use, so
    constructing this class does not import ``paddleocr``.
    """

    def __init__(
        self,
        lang: str = 'pl',  # 'pl', 'en', 'latin'
        use_gpu: bool = False,
        det_model_dir: Optional[str] = None,
        rec_model_dir: Optional[str] = None,
    ):
        """Store the engine configuration; no model is loaded yet.

        Args:
            lang: Language code; ``'pl'`` is mapped to ``'en'`` when the
                engine is created.
            use_gpu: Forwarded to ``PaddleOCR``.
            det_model_dir: Optional detection model directory, forwarded to
                ``PaddleOCR``.
            rec_model_dir: Optional recognition model directory, forwarded to
                ``PaddleOCR``.
        """
        self.lang = lang
        self.use_gpu = use_gpu
        self._ocr = None
        self._det_model_dir = det_model_dir
        self._rec_model_dir = rec_model_dir

    def _init_ocr(self):
        """Create the PaddleOCR engine on first call; later calls are no-ops.

        Raises:
            ImportError: If ``paddleocr`` (or one of its imports) is missing.
        """
        if self._ocr is not None:
            return

        try:
            from paddleocr import PaddleOCR

            # Polish is run through the 'en' model.
            # NOTE(review): the original author states the English model
            # handles Polish characters well; PaddleOCR also appears to accept
            # 'pl' directly - confirm which gives better diacritics.
            lang = 'en' if self.lang == 'pl' else self.lang

            self._ocr = PaddleOCR(
                use_angle_cls=True,
                lang=lang,
                use_gpu=self.use_gpu,
                det_model_dir=self._det_model_dir,
                rec_model_dir=self._rec_model_dir,
                # CPU optimisations
                enable_mkldnn=True,
                cpu_threads=4,
            )
        except ImportError as err:
            # Chain the cause so the missing module name is not lost.
            raise ImportError(
                "PaddleOCR not installed. Install with: "
                "pip install paddleocr paddlepaddle"
            ) from err

    def process_image(self, image_path: Union[str, Path]) -> DocumentOCRResult:
        """Run OCR on one image file.

        Args:
            image_path: Path of the image to recognise.

        Returns:
            A ``DocumentOCRResult`` with one ``OCRResult`` per detected line,
            the mean confidence (0.0 when nothing was detected) and the
            structured data extracted from the joined text.

        Raises:
            ImportError: If PaddleOCR cannot be imported.
        """
        import time

        # perf_counter is monotonic, so the duration cannot go negative when
        # the system clock is adjusted.
        started = time.perf_counter()

        self._init_ocr()

        image_path = str(image_path)
        raw = self._ocr.ocr(image_path, cls=True)

        lines: List[OCRResult] = []
        # Guard against an empty result and an empty/None first entry.
        if raw and raw[0]:
            for bbox_points, (text, confidence) in raw[0]:
                # Collapse the corner points into an axis-aligned rectangle.
                xs = [point[0] for point in bbox_points]
                ys = [point[1] for point in bbox_points]
                lines.append(OCRResult(
                    text=text,
                    # Coerce to a plain float in case the engine hands back a
                    # non-builtin numeric type.
                    confidence=float(confidence),
                    bbox=(int(min(xs)), int(min(ys)), int(max(xs)), int(max(ys))),
                ))

        full_text = '\n'.join(line.text for line in lines)
        avg_confidence = (
            sum(line.confidence for line in lines) / len(lines) if lines else 0.0
        )

        structured = self.extract_structured_data(full_text)
        elapsed_ms = (time.perf_counter() - started) * 1000

        return DocumentOCRResult(
            full_text=full_text,
            lines=lines,
            average_confidence=avg_confidence,
            engine_used=OCREngine.PADDLE,
            source_file=image_path,
            processing_time_ms=elapsed_ms,
            detected_nips=structured['nips'],
            detected_amounts=structured['amounts'],
            detected_dates=structured['dates'],
            detected_invoice_numbers=structured['invoice_numbers'],
        )

    def process_pdf(self, pdf_path: Union[str, Path]) -> List[DocumentOCRResult]:
        """Render each PDF page to a PNG at 300 dpi and OCR it.

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            One ``DocumentOCRResult`` per page, in page order, with
            ``source_file`` set to ``"<pdf_path>#page=<n>"`` (1-based).

        Raises:
            ImportError: If ``pdf2image`` is not installed.
        """
        import tempfile

        try:
            import pdf2image
        except ImportError as err:
            raise ImportError(
                "pdf2image not installed. Install with: pip install pdf2image"
            ) from err

        pdf_path = str(pdf_path)
        images = pdf2image.convert_from_path(pdf_path, dpi=300)

        results = []
        # A temporary directory is removed even when OCR raises, and the page
        # file is written through a fresh handle rather than a still-open
        # NamedTemporaryFile (which cannot be reopened by name on Windows).
        with tempfile.TemporaryDirectory() as tmp_dir:
            page_path = os.path.join(tmp_dir, 'page.png')
            for page_no, image in enumerate(images, start=1):
                image.save(page_path, 'PNG')
                result = self.process_image(page_path)
                result.source_file = f"{pdf_path}#page={page_no}"
                results.append(result)

        return results
290
+
291
+
292
class TesseractOCRProcessor(BaseOCRProcessor):
    """OCR processor backed by Tesseract (via ``pytesseract``).

    Requires both the ``pytesseract`` package and the ``tesseract`` binary;
    availability is checked when the processor is constructed.
    """

    # Columns of pytesseract's word table that together identify one text
    # line.  ``line_num`` alone restarts in every paragraph/block, so using
    # it on its own merges unrelated lines.
    _LINE_KEY_COLUMNS = ('page_num', 'block_num', 'par_num', 'line_num')

    def __init__(
        self,
        lang: str = 'pol+eng',
        config: str = '--oem 3 --psm 6',
    ):
        """Store the Tesseract settings and verify Tesseract is usable.

        Args:
            lang: Language string passed to Tesseract.
            config: Extra command-line options passed to Tesseract.

        Raises:
            ImportError: If ``pytesseract`` or the Tesseract binary is missing.
        """
        self.lang = lang
        self.config = config
        self._check_tesseract()

    def _check_tesseract(self):
        """Raise ``ImportError`` unless pytesseract can query Tesseract's version."""
        try:
            import pytesseract
            pytesseract.get_tesseract_version()
        except Exception as err:
            # Any failure (missing package, missing binary, ...) is reported
            # as ImportError, which is what callers of this module catch; the
            # original error is kept as the cause.
            raise ImportError(
                "Tesseract not found. Install with:\n"
                "  Ubuntu: sudo apt install tesseract-ocr tesseract-ocr-pol\n"
                "  pip install pytesseract"
            ) from err

    @staticmethod
    def _merge_words(words: List[OCRResult]) -> OCRResult:
        """Combine the word results of one line into a single line result.

        The text is the words joined by spaces, the confidence is the mean
        word confidence rescaled from Tesseract's 0-100 to 0-1, and the bbox
        is the rectangle enclosing every word.
        """
        return OCRResult(
            text=' '.join(word.text for word in words),
            confidence=sum(word.confidence for word in words) / len(words) / 100,
            bbox=(
                min(word.bbox[0] for word in words),
                min(word.bbox[1] for word in words),
                max(word.bbox[2] for word in words),
                max(word.bbox[3] for word in words),
            ),
        )

    def process_image(self, image_path: Union[str, Path]) -> DocumentOCRResult:
        """Run OCR on one image file.

        Args:
            image_path: Path of the image to recognise.

        Returns:
            A ``DocumentOCRResult`` with one ``OCRResult`` per text line, the
            mean line confidence (0.0 when nothing was detected) and the
            structured data extracted from the joined text.
        """
        import time

        import pytesseract
        from PIL import Image

        started = time.perf_counter()

        # The context manager closes the underlying file handle.
        with Image.open(image_path) as image:
            data = pytesseract.image_to_data(
                image,
                lang=self.lang,
                config=self.config,
                output_type=pytesseract.Output.DICT,
            )

        key_columns = [
            data[name] for name in self._LINE_KEY_COLUMNS if name in data
        ]

        lines: List[OCRResult] = []
        words: List[OCRResult] = []
        current_key = None

        for i, text in enumerate(data['text']):
            if not text.strip():
                continue

            # Negative confidences are clamped to zero.
            conf = max(float(data['conf'][i]), 0.0)
            left, top = data['left'][i], data['top'][i]
            bbox = (left, top, left + data['width'][i], top + data['height'][i])

            key = tuple(column[i] for column in key_columns)
            if key != current_key:
                if words:
                    lines.append(self._merge_words(words))
                words = []
                current_key = key

            words.append(OCRResult(text=text, confidence=conf, bbox=bbox))

        # Flush the last line.
        if words:
            lines.append(self._merge_words(words))

        full_text = '\n'.join(line.text for line in lines)
        avg_confidence = (
            sum(line.confidence for line in lines) / len(lines) if lines else 0.0
        )

        structured = self.extract_structured_data(full_text)
        elapsed_ms = (time.perf_counter() - started) * 1000

        return DocumentOCRResult(
            full_text=full_text,
            lines=lines,
            average_confidence=avg_confidence,
            engine_used=OCREngine.TESSERACT,
            source_file=str(image_path),
            processing_time_ms=elapsed_ms,
            detected_nips=structured['nips'],
            detected_amounts=structured['amounts'],
            detected_dates=structured['dates'],
            detected_invoice_numbers=structured['invoice_numbers'],
        )

    def process_pdf(self, pdf_path: Union[str, Path]) -> List[DocumentOCRResult]:
        """Render each PDF page to a PNG at 300 dpi and OCR it.

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            One ``DocumentOCRResult`` per page, in page order, with
            ``source_file`` set to ``"<pdf_path>#page=<n>"`` (1-based).

        Raises:
            ImportError: If ``pdf2image`` is not installed.
        """
        import tempfile

        try:
            import pdf2image
        except ImportError as err:
            raise ImportError("pdf2image not installed") from err

        pdf_path = str(pdf_path)
        images = pdf2image.convert_from_path(pdf_path, dpi=300)

        results = []
        # A temporary directory is removed even when OCR raises, and the page
        # file is written through a fresh handle rather than a still-open
        # NamedTemporaryFile (which cannot be reopened by name on Windows).
        with tempfile.TemporaryDirectory() as tmp_dir:
            page_path = os.path.join(tmp_dir, 'page.png')
            for page_no, image in enumerate(images, start=1):
                image.save(page_path, 'PNG')
                result = self.process_image(page_path)
                result.source_file = f"{pdf_path}#page={page_no}"
                results.append(result)

        return results
431
+
432
+
433
class OCRProcessor:
    """Main OCR entry point with automatic engine selection.

    The preferred engine is tried first, then the fallback engine, then any
    remaining engine.  The first engine that can be created is kept for the
    lifetime of this object.
    """

    # Human-readable engine names used in the "Using ... engine" log line.
    _ENGINE_LABELS = {
        OCREngine.PADDLE: "PaddleOCR",
        OCREngine.TESSERACT: "Tesseract",
    }

    def __init__(
        self,
        preferred_engine: OCREngine = OCREngine.PADDLE,
        fallback_engine: OCREngine = OCREngine.TESSERACT,
        lang: str = 'pl',
        use_gpu: bool = False,
    ):
        """Store the selection preferences; no engine is created yet.

        Args:
            preferred_engine: Engine to try first.
            fallback_engine: Engine to try second.
            lang: Language code; ``'pl'`` becomes ``'pol+eng'`` for Tesseract.
            use_gpu: Forwarded to the PaddleOCR processor.
        """
        self.preferred_engine = preferred_engine
        self.fallback_engine = fallback_engine
        self.lang = lang
        self.use_gpu = use_gpu

        self._processor: Optional[BaseOCRProcessor] = None
        self._active_engine: Optional[OCREngine] = None

    def _create_processor(self, engine: OCREngine) -> BaseOCRProcessor:
        """Build a ready-to-use processor for *engine* or raise.

        Raises:
            ImportError: If the engine's dependencies are missing.
            NotImplementedError: If no processor exists for *engine*.
        """
        if engine == OCREngine.PADDLE:
            # Cheap availability probe before attempting the heavy import.
            import importlib.util
            if importlib.util.find_spec("paddleocr") is None:
                raise ImportError("paddleocr not installed")

            processor = PaddleOCRProcessor(
                lang=self.lang,
                use_gpu=self.use_gpu,
            )
            # Create the engine now: a present-but-broken PaddleOCR install
            # must fail here, inside the fallback loop, rather than later
            # during processing where no fallback is possible.
            processor._init_ocr()
            return processor

        if engine == OCREngine.TESSERACT:
            return TesseractOCRProcessor(
                lang='pol+eng' if self.lang == 'pl' else self.lang,
            )

        # Engines listed in OCREngine without a processor (e.g. EASYOCR) are
        # reported explicitly instead of being skipped silently.
        raise NotImplementedError(f"No processor implemented for engine {engine}")

    def _init_processor(self) -> BaseOCRProcessor:
        """Return the active processor, selecting an engine on first call.

        Raises:
            ImportError: If none of the engines can be initialised.
        """
        if self._processor is not None:
            return self._processor

        # Order: preferred, fallback (if different), then everything else.
        candidates = [self.preferred_engine]
        if self.fallback_engine != self.preferred_engine:
            candidates.append(self.fallback_engine)
        for engine in OCREngine:
            if engine not in candidates:
                candidates.append(engine)

        last_error = None
        for engine in candidates:
            try:
                processor = self._create_processor(engine)
            except Exception as err:
                logger.warning("Engine %s not available: %s", engine, err)
                last_error = err
                continue

            # Commit only after the engine was created successfully.
            self._processor = processor
            self._active_engine = engine
            logger.info("Using %s engine", self._ENGINE_LABELS.get(engine, engine.value))
            return processor

        raise ImportError(
            f"No OCR engine available. Last error: {last_error}. "
            "Install PaddleOCR or Tesseract.\n"
            "PaddleOCR: pip install paddleocr paddlepaddle\n"
            "Tesseract: apt install tesseract-ocr tesseract-ocr-pol && pip install pytesseract"
        ) from last_error

    @property
    def active_engine(self) -> Optional[OCREngine]:
        """Engine currently in use, or ``None`` before the first processing call."""
        return self._active_engine

    def process(
        self,
        file_path: Union[str, Path]
    ) -> Union[DocumentOCRResult, List[DocumentOCRResult]]:
        """Process an image or a PDF, chosen by file extension.

        Args:
            file_path: Path of the file; a ``.pdf`` suffix (case-insensitive)
                selects PDF processing, anything else is treated as an image.

        Returns:
            A single ``DocumentOCRResult`` for an image, or a list with one
            ``DocumentOCRResult`` per page for a PDF.
        """
        processor = self._init_processor()
        file_path = Path(file_path)

        if file_path.suffix.lower() == '.pdf':
            return processor.process_pdf(file_path)
        return processor.process_image(file_path)

    def process_image(self, image_path: Union[str, Path]) -> DocumentOCRResult:
        """Process a single image with the active engine."""
        return self._init_processor().process_image(image_path)

    def process_pdf(self, pdf_path: Union[str, Path]) -> List[DocumentOCRResult]:
        """Process a PDF with the active engine, one result per page."""
        return self._init_processor().process_pdf(pdf_path)
543
+
544
+
545
def preprocess_image_for_ocr(
    image_path: Union[str, Path],
    output_path: Optional[str] = None
) -> str:
    """Enhance an image for OCR and return the path of the processed copy.

    Steps applied, in order: conversion to grayscale (mode ``'L'``), contrast
    boost by a factor of 1.5, and a sharpening filter.  No denoising, deskew
    or binarisation is performed; OpenCV would be needed for those.

    Args:
        image_path: Path of the image to preprocess.
        output_path: Where to save the result.  When omitted (or empty) a new
            temporary ``.png`` file is created; the caller is responsible for
            deleting it.

    Returns:
        ``output_path`` when given, otherwise the path of the temporary file.
    """
    from PIL import Image, ImageEnhance, ImageFilter

    # The context manager closes the source file handle; every step below
    # produces a new in-memory image, so ``img`` stays valid afterwards.
    with Image.open(image_path) as source:
        # 1. Grayscale
        img = source if source.mode == 'L' else source.convert('L')

        # 2. Contrast boost
        img = ImageEnhance.Contrast(img).enhance(1.5)

        # 3. Sharpen
        img = img.filter(ImageFilter.SHARPEN)

    if output_path:
        img.save(output_path)
        return output_path

    import tempfile

    # mkstemp + close gives a reserved name with no open handle, so the image
    # can be written by name on every platform (an open NamedTemporaryFile
    # cannot be reopened on Windows).
    fd, tmp_path = tempfile.mkstemp(suffix='.png')
    os.close(fd)
    try:
        img.save(tmp_path)
    except Exception:
        # Do not leave an empty temporary file behind when saving fails.
        os.unlink(tmp_path)
        raise
    return tmp_path