docid 0.1.3__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docid-0.1.3 → docid-0.1.4}/PKG-INFO +1 -1
- {docid-0.1.3 → docid-0.1.4}/docid.egg-info/PKG-INFO +1 -1
- {docid-0.1.3 → docid-0.1.4}/docid.egg-info/SOURCES.txt +2 -1
- {docid-0.1.3 → docid-0.1.4}/exef_docid/__init__.py +3 -3
- {docid-0.1.3 → docid-0.1.4}/exef_docid/cli.py +5 -5
- {docid-0.1.3 → docid-0.1.4}/exef_docid/cli_universal.py +1 -1
- {docid-0.1.3 → docid-0.1.4}/exef_docid/document_id.py +10 -10
- {docid-0.1.3 → docid-0.1.4}/exef_docid/ocr_processor.py +3 -7
- {docid-0.1.3 → docid-0.1.4}/exef_docid/pipeline.py +8 -8
- {docid-0.1.3 → docid-0.1.4}/pyproject.toml +1 -1
- docid-0.1.4/tests/test_samples_id.py +277 -0
- {docid-0.1.3 → docid-0.1.4}/README.md +0 -0
- {docid-0.1.3 → docid-0.1.4}/docid.egg-info/dependency_links.txt +0 -0
- {docid-0.1.3 → docid-0.1.4}/docid.egg-info/entry_points.txt +0 -0
- {docid-0.1.3 → docid-0.1.4}/docid.egg-info/not-zip-safe +0 -0
- {docid-0.1.3 → docid-0.1.4}/docid.egg-info/requires.txt +0 -0
- {docid-0.1.3 → docid-0.1.4}/docid.egg-info/top_level.txt +0 -0
- {docid-0.1.3 → docid-0.1.4}/exef_docid/document_id_universal.py +0 -0
- {docid-0.1.3 → docid-0.1.4}/exef_docid/extractors/__init__.py +0 -0
- {docid-0.1.3 → docid-0.1.4}/exef_docid/extractors/base.py +0 -0
- {docid-0.1.3 → docid-0.1.4}/setup.cfg +0 -0
- {docid-0.1.3 → docid-0.1.4}/setup.py +0 -0
- {docid-0.1.3 → docid-0.1.4}/tests/test_document_id.py +0 -0
- {docid-0.1.3 → docid-0.1.4}/tests/test_extractors.py +0 -0
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
2
|
+
DOC Document ID Generator
|
|
3
3
|
|
|
4
4
|
Deterministyczny generator identyfikatorów dokumentów z OCR.
|
|
5
5
|
Generuje zawsze ten sam ID dla tego samego dokumentu,
|
|
@@ -10,14 +10,14 @@ Przykład użycia:
|
|
|
10
10
|
|
|
11
11
|
# Pełne przetwarzanie
|
|
12
12
|
result = process_document("faktura.pdf")
|
|
13
|
-
print(result.document_id) #
|
|
13
|
+
print(result.document_id) # DOC-FV-A7B3C9D2E1F04856
|
|
14
14
|
print(result.extraction.issuer_nip) # 5213017228
|
|
15
15
|
|
|
16
16
|
# Tylko ID
|
|
17
17
|
doc_id = get_document_id("paragon.jpg")
|
|
18
18
|
|
|
19
19
|
# Weryfikacja
|
|
20
|
-
is_same = verify_document_id("skan.png", "
|
|
20
|
+
is_same = verify_document_id("skan.png", "DOC-FV-A7B3C9D2E1F04856")
|
|
21
21
|
|
|
22
22
|
Wymagania:
|
|
23
23
|
pip install paddleocr paddlepaddle pdf2image pillow
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
"""
|
|
3
|
-
CLI dla
|
|
3
|
+
CLI dla DOC Document ID Generator.
|
|
4
4
|
|
|
5
5
|
Użycie:
|
|
6
6
|
# Przetwórz pojedynczy plik
|
|
@@ -13,7 +13,7 @@ Użycie:
|
|
|
13
13
|
docid batch ./dokumenty/ --output results.json
|
|
14
14
|
|
|
15
15
|
# Weryfikacja ID
|
|
16
|
-
docid verify faktura.pdf
|
|
16
|
+
docid verify faktura.pdf DOC-FV-A7B3C9D2E1F04856
|
|
17
17
|
|
|
18
18
|
# Tylko OCR (bez generowania ID)
|
|
19
19
|
docid ocr skan.jpg
|
|
@@ -267,7 +267,7 @@ def cmd_generate_id(args):
|
|
|
267
267
|
|
|
268
268
|
def main():
|
|
269
269
|
parser = argparse.ArgumentParser(
|
|
270
|
-
description='
|
|
270
|
+
description='DOC Document ID Generator - deterministyczne ID dokumentów z OCR',
|
|
271
271
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
272
272
|
)
|
|
273
273
|
parser.add_argument('--version', action='version', version='docid 0.1.0')
|
|
@@ -279,7 +279,7 @@ def main():
|
|
|
279
279
|
common.add_argument('--engine', choices=['paddle', 'tesseract'], default='paddle',
|
|
280
280
|
help='Silnik OCR (domyślnie: paddle)')
|
|
281
281
|
common.add_argument('--lang', default='pl', help='Język dokumentów')
|
|
282
|
-
common.add_argument('--prefix', default='
|
|
282
|
+
common.add_argument('--prefix', default='DOC', help='Prefiks ID')
|
|
283
283
|
common.add_argument('--gpu', action='store_true', help='Użyj GPU')
|
|
284
284
|
common.add_argument('-v', '--verbose', action='store_true', help='Więcej szczegółów')
|
|
285
285
|
|
|
@@ -324,7 +324,7 @@ def main():
|
|
|
324
324
|
p_gen.add_argument('--number', help='Numer dokumentu')
|
|
325
325
|
p_gen.add_argument('--date', help='Data (YYYY-MM-DD)')
|
|
326
326
|
p_gen.add_argument('--amount', help='Kwota brutto')
|
|
327
|
-
p_gen.add_argument('--prefix', default='
|
|
327
|
+
p_gen.add_argument('--prefix', default='DOC', help='Prefiks ID')
|
|
328
328
|
p_gen.set_defaults(func=cmd_generate_id)
|
|
329
329
|
|
|
330
330
|
args = parser.parse_args()
|
|
@@ -431,7 +431,7 @@ def main():
|
|
|
431
431
|
"""Main CLI entry point"""
|
|
432
432
|
parser = argparse.ArgumentParser(
|
|
433
433
|
prog='docid',
|
|
434
|
-
description='
|
|
434
|
+
description='DOC Document ID Generator - CLI'
|
|
435
435
|
)
|
|
436
436
|
|
|
437
437
|
parser.add_argument('--version', action='version', version='%(prog)s 0.1.0')
|
|
@@ -14,8 +14,8 @@ from decimal import ROUND_HALF_UP, Decimal
|
|
|
14
14
|
from enum import Enum
|
|
15
15
|
from typing import Optional, Union
|
|
16
16
|
|
|
17
|
-
# Namespace UUID dla
|
|
18
|
-
|
|
17
|
+
# Namespace UUID dla DOC (RFC 4122 UUID v5)
|
|
18
|
+
DOC_NAMESPACE = uuid.UUID('a1b2c3d4-e5f6-7890-abcd-ef1234567890')
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
class DocumentType(Enum):
|
|
@@ -207,10 +207,10 @@ class DocumentIDGenerator:
|
|
|
207
207
|
niezależnie od formatu źródłowego dokumentu.
|
|
208
208
|
"""
|
|
209
209
|
|
|
210
|
-
def __init__(self, prefix: str = "
|
|
210
|
+
def __init__(self, prefix: str = "DOC"):
|
|
211
211
|
"""
|
|
212
212
|
Args:
|
|
213
|
-
prefix: Prefiks identyfikatora (domyślnie
|
|
213
|
+
prefix: Prefiks identyfikatora (domyślnie DOC)
|
|
214
214
|
"""
|
|
215
215
|
self.prefix = prefix
|
|
216
216
|
|
|
@@ -229,7 +229,7 @@ class DocumentIDGenerator:
|
|
|
229
229
|
|
|
230
230
|
>>> gen = DocumentIDGenerator()
|
|
231
231
|
>>> gen.generate_invoice_id("5213017228", "FV/2025/00142", "2025-01-15", 1230.00)
|
|
232
|
-
'
|
|
232
|
+
'DOC-FV-A7B3C9D2E1F04856'
|
|
233
233
|
"""
|
|
234
234
|
canonical = CanonicalData(
|
|
235
235
|
document_type=DocumentType.INVOICE,
|
|
@@ -265,7 +265,7 @@ class DocumentIDGenerator:
|
|
|
265
265
|
|
|
266
266
|
>>> gen = DocumentIDGenerator()
|
|
267
267
|
>>> gen.generate_receipt_id("5213017228", "2025-01-15", 45.99)
|
|
268
|
-
'
|
|
268
|
+
'DOC-PAR-...'
|
|
269
269
|
"""
|
|
270
270
|
parts = [
|
|
271
271
|
NIPValidator.normalize(seller_nip),
|
|
@@ -649,7 +649,7 @@ class DocumentIDGenerator:
|
|
|
649
649
|
Generuje finalny identyfikator z danych kanonicznych.
|
|
650
650
|
|
|
651
651
|
Format: {PREFIX}-{TYPE}-{HASH16}
|
|
652
|
-
Przykład:
|
|
652
|
+
Przykład: DOC-FV-A7B3C9D2E1F04856
|
|
653
653
|
"""
|
|
654
654
|
# SHA256 z canonical string
|
|
655
655
|
hash_bytes = hashlib.sha256(canonical.canonical_string.encode('utf-8')).digest()
|
|
@@ -662,7 +662,7 @@ class DocumentIDGenerator:
|
|
|
662
662
|
Weryfikuje czy ID odpowiada danym kanonicznym.
|
|
663
663
|
|
|
664
664
|
>>> gen = DocumentIDGenerator()
|
|
665
|
-
>>> gen.verify_id("
|
|
665
|
+
>>> gen.verify_id("DOC-FV-A7B3C9D2E1F04856", "5213017228|FV/2025/00142|2025-01-15|1230.00")
|
|
666
666
|
True
|
|
667
667
|
"""
|
|
668
668
|
hash_bytes = hashlib.sha256(canonical_string.encode('utf-8')).digest()
|
|
@@ -679,8 +679,8 @@ class DocumentIDGenerator:
|
|
|
679
679
|
"""
|
|
680
680
|
Parsuje identyfikator dokumentu.
|
|
681
681
|
|
|
682
|
-
>>> DocumentIDGenerator.parse_id("
|
|
683
|
-
{'prefix': '
|
|
682
|
+
>>> DocumentIDGenerator.parse_id("DOC-FV-A7B3C9D2E1F04856")
|
|
683
|
+
{'prefix': 'DOC', 'type': 'FV', 'hash': 'A7B3C9D2E1F04856',
|
|
684
684
|
'document_type': <DocumentType.INVOICE>}
|
|
685
685
|
"""
|
|
686
686
|
parts = document_id.split('-')
|
|
@@ -195,12 +195,8 @@ class PaddleOCRProcessor(BaseOCRProcessor):
|
|
|
195
195
|
self._ocr = PaddleOCR(
|
|
196
196
|
use_angle_cls=True,
|
|
197
197
|
lang=lang,
|
|
198
|
-
use_gpu=self.use_gpu,
|
|
199
198
|
det_model_dir=self._det_model_dir,
|
|
200
199
|
rec_model_dir=self._rec_model_dir,
|
|
201
|
-
# Optymalizacje CPU
|
|
202
|
-
enable_mkldnn=True,
|
|
203
|
-
cpu_threads=4,
|
|
204
200
|
)
|
|
205
201
|
except ImportError:
|
|
206
202
|
raise ImportError(
|
|
@@ -216,7 +212,7 @@ class PaddleOCRProcessor(BaseOCRProcessor):
|
|
|
216
212
|
self._init_ocr()
|
|
217
213
|
|
|
218
214
|
image_path = str(image_path)
|
|
219
|
-
result = self._ocr.ocr(image_path
|
|
215
|
+
result = self._ocr.ocr(image_path)
|
|
220
216
|
|
|
221
217
|
lines = []
|
|
222
218
|
full_text_parts = []
|
|
@@ -439,8 +435,8 @@ class OCRProcessor:
|
|
|
439
435
|
|
|
440
436
|
def __init__(
|
|
441
437
|
self,
|
|
442
|
-
preferred_engine: OCREngine = OCREngine.
|
|
443
|
-
fallback_engine: OCREngine = OCREngine.
|
|
438
|
+
preferred_engine: OCREngine = OCREngine.TESSERACT,
|
|
439
|
+
fallback_engine: OCREngine = OCREngine.PADDLE,
|
|
444
440
|
lang: str = 'pl',
|
|
445
441
|
use_gpu: bool = False,
|
|
446
442
|
):
|
|
@@ -97,20 +97,20 @@ class DocumentPipeline:
|
|
|
97
97
|
Przykład użycia:
|
|
98
98
|
pipeline = DocumentPipeline()
|
|
99
99
|
result = pipeline.process("faktura.pdf")
|
|
100
|
-
print(result.document_id) #
|
|
100
|
+
print(result.document_id) # DOC-FV-A7B3C9D2E1F04856
|
|
101
101
|
"""
|
|
102
102
|
|
|
103
103
|
def __init__(
|
|
104
104
|
self,
|
|
105
|
-
ocr_engine: OCREngine = OCREngine.
|
|
106
|
-
id_prefix: str = "
|
|
105
|
+
ocr_engine: OCREngine = OCREngine.TESSERACT,
|
|
106
|
+
id_prefix: str = "DOC",
|
|
107
107
|
lang: str = "pl",
|
|
108
108
|
use_gpu: bool = False,
|
|
109
109
|
):
|
|
110
110
|
"""
|
|
111
111
|
Args:
|
|
112
112
|
ocr_engine: Silnik OCR (PADDLE lub TESSERACT)
|
|
113
|
-
id_prefix: Prefiks identyfikatorów (domyślnie
|
|
113
|
+
id_prefix: Prefiks identyfikatorów (domyślnie DOC)
|
|
114
114
|
lang: Język dokumentów (pl, en)
|
|
115
115
|
use_gpu: Czy używać GPU (domyślnie False dla CPU)
|
|
116
116
|
"""
|
|
@@ -390,7 +390,7 @@ class DocumentPipeline:
|
|
|
390
390
|
_default_pipeline: Optional[DocumentPipeline] = None
|
|
391
391
|
|
|
392
392
|
|
|
393
|
-
def get_pipeline(ocr_engine: OCREngine = OCREngine.PADDLE) -> DocumentPipeline:
|
|
393
|
+
def get_pipeline(ocr_engine: OCREngine = OCREngine.TESSERACT) -> DocumentPipeline:
|
|
394
394
|
"""Zwraca domyślny pipeline (lazy init)."""
|
|
395
395
|
global _default_pipeline
|
|
396
396
|
if _default_pipeline is None or _default_pipeline.ocr.preferred_engine != ocr_engine:
|
|
@@ -398,7 +398,7 @@ def get_pipeline(ocr_engine: OCREngine = OCREngine.PADDLE) -> DocumentPipeline:
|
|
|
398
398
|
return _default_pipeline
|
|
399
399
|
|
|
400
400
|
|
|
401
|
-
def process_document(file_path: Union[str, Path], ocr_engine: OCREngine = OCREngine.
|
|
401
|
+
def process_document(file_path: Union[str, Path], ocr_engine: OCREngine = OCREngine.TESSERACT, use_ocr: bool = True) -> ProcessedDocument:
|
|
402
402
|
"""
|
|
403
403
|
Przetwarza dokument i zwraca wynik z ID.
|
|
404
404
|
|
|
@@ -416,7 +416,7 @@ def get_document_id(file_path: Union[str, Path]) -> str:
|
|
|
416
416
|
|
|
417
417
|
Przykład:
|
|
418
418
|
doc_id = get_document_id("faktura.pdf")
|
|
419
|
-
print(doc_id) #
|
|
419
|
+
print(doc_id) # DOC-FV-A7B3C9D2E1F04856
|
|
420
420
|
"""
|
|
421
421
|
return get_pipeline().process(file_path).document_id
|
|
422
422
|
|
|
@@ -426,6 +426,6 @@ def verify_document_id(file_path: Union[str, Path], expected_id: str) -> bool:
|
|
|
426
426
|
Weryfikuje czy dokument ma oczekiwany ID.
|
|
427
427
|
|
|
428
428
|
Przykład:
|
|
429
|
-
is_valid = verify_document_id("skan.jpg", "
|
|
429
|
+
is_valid = verify_document_id("skan.jpg", "DOC-FV-A7B3C9D2E1F04856")
|
|
430
430
|
"""
|
|
431
431
|
return get_pipeline().verify_document(file_path, expected_id)
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Testy ID dla wszystkich próbek (samples).
|
|
3
|
+
|
|
4
|
+
Sprawdza generowanie ID dla wszystkich plików w katalogu samples,
|
|
5
|
+
w tym spójność ID między różnymi formatami tego samego dokumentu.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
import pytest
|
|
12
|
+
|
|
13
|
+
from exef_docid.document_id import DocumentIDGenerator, DocumentType
|
|
14
|
+
from exef_docid.pipeline import DocumentPipeline, process_document
|
|
15
|
+
from exef_docid.ocr_processor import OCREngine
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
SAMPLES_DIR = Path(__file__).parent.parent / "samples"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class TestSampleIDGeneration:
|
|
22
|
+
"""Testy generowania ID dla wszystkich próbek."""
|
|
23
|
+
|
|
24
|
+
@pytest.fixture
|
|
25
|
+
def pipeline(self):
|
|
26
|
+
"""Fixture tworzący pipeline z Tesseract OCR."""
|
|
27
|
+
return DocumentPipeline(ocr_engine=OCREngine.TESSERACT)
|
|
28
|
+
|
|
29
|
+
def get_sample_files(self, subdirectory: str) -> list[Path]:
|
|
30
|
+
"""Zwraca listę plików próbek z danego podkatalogu."""
|
|
31
|
+
sample_dir = SAMPLES_DIR / subdirectory
|
|
32
|
+
if not sample_dir.exists():
|
|
33
|
+
return []
|
|
34
|
+
|
|
35
|
+
files = []
|
|
36
|
+
for ext in ['*.pdf', '*.jpg', '*.jpeg', '*.png', '*.xml', '*.html', '*.htm', '*.txt']:
|
|
37
|
+
files.extend(sample_dir.glob(ext))
|
|
38
|
+
return sorted(files)
|
|
39
|
+
|
|
40
|
+
def test_invoice_samples_generate_id(self, pipeline):
|
|
41
|
+
"""Test generowania ID dla wszystkich próbek faktur."""
|
|
42
|
+
invoice_files = self.get_sample_files("invoices")
|
|
43
|
+
assert len(invoice_files) > 0, "Brak plików faktur w samples/invoices"
|
|
44
|
+
|
|
45
|
+
results = []
|
|
46
|
+
for file_path in invoice_files:
|
|
47
|
+
try:
|
|
48
|
+
result = pipeline.process(file_path)
|
|
49
|
+
results.append({
|
|
50
|
+
'file': file_path.name,
|
|
51
|
+
'id': result.document_id,
|
|
52
|
+
'type': result.document_type.value,
|
|
53
|
+
'canonical': result.canonical_string,
|
|
54
|
+
'confidence': result.ocr_confidence,
|
|
55
|
+
})
|
|
56
|
+
# Sprawdź czy ID zostało wygenerowane
|
|
57
|
+
assert result.document_id, f"Brak ID dla {file_path}"
|
|
58
|
+
assert result.document_id.startswith("EXEF-FV"), f"Nieprawidłowy prefix ID dla faktury: {result.document_id}"
|
|
59
|
+
except Exception as e:
|
|
60
|
+
pytest.fail(f"Błąd przetwarzania {file_path}: {e}")
|
|
61
|
+
|
|
62
|
+
# Wypisz podsumowanie
|
|
63
|
+
print(f"\n=== Faktury ({len(results)} plików) ===")
|
|
64
|
+
for r in results:
|
|
65
|
+
print(f" {r['file']}: {r['id']} (confidence: {r['confidence']:.2f})")
|
|
66
|
+
|
|
67
|
+
def test_receipt_samples_generate_id(self, pipeline):
|
|
68
|
+
"""Test generowania ID dla wszystkich próbek paragonów."""
|
|
69
|
+
receipt_files = self.get_sample_files("receipts")
|
|
70
|
+
assert len(receipt_files) > 0, "Brak plików paragonów w samples/receipts"
|
|
71
|
+
|
|
72
|
+
results = []
|
|
73
|
+
for file_path in receipt_files:
|
|
74
|
+
try:
|
|
75
|
+
result = pipeline.process(file_path)
|
|
76
|
+
results.append({
|
|
77
|
+
'file': file_path.name,
|
|
78
|
+
'id': result.document_id,
|
|
79
|
+
'type': result.document_type.value,
|
|
80
|
+
'confidence': result.ocr_confidence,
|
|
81
|
+
})
|
|
82
|
+
assert result.document_id, f"Brak ID dla {file_path}"
|
|
83
|
+
except Exception as e:
|
|
84
|
+
pytest.fail(f"Błąd przetwarzania {file_path}: {e}")
|
|
85
|
+
|
|
86
|
+
print(f"\n=== Paragony ({len(results)} plików) ===")
|
|
87
|
+
for r in results:
|
|
88
|
+
print(f" {r['file']}: {r['id']} (confidence: {r['confidence']:.2f})")
|
|
89
|
+
|
|
90
|
+
def test_contract_samples_generate_id(self, pipeline):
|
|
91
|
+
"""Test generowania ID dla wszystkich próbek umów."""
|
|
92
|
+
contract_files = self.get_sample_files("contracts")
|
|
93
|
+
assert len(contract_files) > 0, "Brak plików umów w samples/contracts"
|
|
94
|
+
|
|
95
|
+
results = []
|
|
96
|
+
for file_path in contract_files:
|
|
97
|
+
try:
|
|
98
|
+
result = pipeline.process(file_path)
|
|
99
|
+
results.append({
|
|
100
|
+
'file': file_path.name,
|
|
101
|
+
'id': result.document_id,
|
|
102
|
+
'type': result.document_type.value,
|
|
103
|
+
'confidence': result.ocr_confidence,
|
|
104
|
+
})
|
|
105
|
+
assert result.document_id, f"Brak ID dla {file_path}"
|
|
106
|
+
except Exception as e:
|
|
107
|
+
pytest.fail(f"Błąd przetwarzania {file_path}: {e}")
|
|
108
|
+
|
|
109
|
+
print(f"\n=== Umowy ({len(results)} plików) ===")
|
|
110
|
+
for r in results:
|
|
111
|
+
print(f" {r['file']}: {r['id']} (confidence: {r['confidence']:.2f})")
|
|
112
|
+
|
|
113
|
+
def test_invoice_cross_format_consistency(self, pipeline):
|
|
114
|
+
"""Test czy różne formaty tej samej faktury mają ten sam ID."""
|
|
115
|
+
invoice_dir = SAMPLES_DIR / "invoices"
|
|
116
|
+
|
|
117
|
+
# Znajdź wszystkie formaty faktury (faktura_full.*)
|
|
118
|
+
base_name = "faktura_full"
|
|
119
|
+
formats = []
|
|
120
|
+
|
|
121
|
+
for ext in ['.jpg', '.jpeg', '.png', '.pdf', '.xml', '.html', '.txt']:
|
|
122
|
+
file_path = invoice_dir / f"{base_name}{ext}"
|
|
123
|
+
if file_path.exists():
|
|
124
|
+
formats.append(file_path)
|
|
125
|
+
|
|
126
|
+
if len(formats) < 2:
|
|
127
|
+
pytest.skip(f"Za mało formatów faktury do testu (znaleziono: {len(formats)})")
|
|
128
|
+
|
|
129
|
+
# Przetwórz wszystkie formaty
|
|
130
|
+
ids = {}
|
|
131
|
+
for file_path in formats:
|
|
132
|
+
try:
|
|
133
|
+
result = pipeline.process(file_path)
|
|
134
|
+
ids[file_path.suffix] = {
|
|
135
|
+
'id': result.document_id,
|
|
136
|
+
'canonical': result.canonical_string,
|
|
137
|
+
'confidence': result.ocr_confidence,
|
|
138
|
+
}
|
|
139
|
+
except Exception as e:
|
|
140
|
+
pytest.fail(f"Błąd przetwarzania {file_path}: {e}")
|
|
141
|
+
|
|
142
|
+
# Sprawdź czy wszystkie formaty mają ten sam ID
|
|
143
|
+
unique_ids = set(r['id'] for r in ids.values())
|
|
144
|
+
|
|
145
|
+
print(f"\n=== Cross-format consistency dla {base_name} ===")
|
|
146
|
+
for ext, data in sorted(ids.items()):
|
|
147
|
+
print(f" {ext}: {data['id']} (confidence: {data['confidence']:.2f})")
|
|
148
|
+
|
|
149
|
+
assert len(unique_ids) == 1, (
|
|
150
|
+
f"Różne formaty {base_name} mają różne ID: {unique_ids}\n"
|
|
151
|
+
f"Szczegóły: {ids}"
|
|
152
|
+
)
|
|
153
|
+
|
|
154
|
+
def test_receipt_cross_format_consistency(self, pipeline):
|
|
155
|
+
"""Test czy różne formaty tego samego paragonu mają ten sam ID."""
|
|
156
|
+
receipt_dir = SAMPLES_DIR / "receipts"
|
|
157
|
+
|
|
158
|
+
base_name = "paragon_full"
|
|
159
|
+
formats = []
|
|
160
|
+
|
|
161
|
+
for ext in ['.jpg', '.jpeg', '.png', '.pdf', '.xml', '.html', '.txt']:
|
|
162
|
+
file_path = receipt_dir / f"{base_name}{ext}"
|
|
163
|
+
if file_path.exists():
|
|
164
|
+
formats.append(file_path)
|
|
165
|
+
|
|
166
|
+
if len(formats) < 2:
|
|
167
|
+
pytest.skip(f"Za mało formatów paragonu do testu (znaleziono: {len(formats)})")
|
|
168
|
+
|
|
169
|
+
ids = {}
|
|
170
|
+
for file_path in formats:
|
|
171
|
+
try:
|
|
172
|
+
result = pipeline.process(file_path)
|
|
173
|
+
ids[file_path.suffix] = {
|
|
174
|
+
'id': result.document_id,
|
|
175
|
+
'canonical': result.canonical_string,
|
|
176
|
+
'confidence': result.ocr_confidence,
|
|
177
|
+
}
|
|
178
|
+
except Exception as e:
|
|
179
|
+
pytest.fail(f"Błąd przetwarzania {file_path}: {e}")
|
|
180
|
+
|
|
181
|
+
unique_ids = set(r['id'] for r in ids.values())
|
|
182
|
+
|
|
183
|
+
print(f"\n=== Cross-format consistency dla {base_name} ===")
|
|
184
|
+
for ext, data in sorted(ids.items()):
|
|
185
|
+
print(f" {ext}: {data['id']} (confidence: {data['confidence']:.2f})")
|
|
186
|
+
|
|
187
|
+
assert len(unique_ids) == 1, (
|
|
188
|
+
f"Różne formaty {base_name} mają różne ID: {unique_ids}\n"
|
|
189
|
+
f"Szczegóły: {ids}"
|
|
190
|
+
)
|
|
191
|
+
|
|
192
|
+
def test_all_samples_summary(self, pipeline):
|
|
193
|
+
"""Podsumowanie wszystkich próbek."""
|
|
194
|
+
all_results = []
|
|
195
|
+
|
|
196
|
+
for subdir in ['invoices', 'receipts', 'contracts', 'universal']:
|
|
197
|
+
files = self.get_sample_files(subdir)
|
|
198
|
+
for file_path in files:
|
|
199
|
+
try:
|
|
200
|
+
result = pipeline.process(file_path)
|
|
201
|
+
all_results.append({
|
|
202
|
+
'subdir': subdir,
|
|
203
|
+
'file': file_path.name,
|
|
204
|
+
'id': result.document_id,
|
|
205
|
+
'type': result.document_type.value,
|
|
206
|
+
'confidence': result.ocr_confidence,
|
|
207
|
+
})
|
|
208
|
+
except Exception as e:
|
|
209
|
+
all_results.append({
|
|
210
|
+
'subdir': subdir,
|
|
211
|
+
'file': file_path.name,
|
|
212
|
+
'error': str(e),
|
|
213
|
+
})
|
|
214
|
+
|
|
215
|
+
print(f"\n{'='*60}")
|
|
216
|
+
print(f"PODSUMOWANIE WSZYSTKICH PRÓBEK ({len(all_results)} plików)")
|
|
217
|
+
print(f"{'='*60}")
|
|
218
|
+
|
|
219
|
+
success_count = sum(1 for r in all_results if 'error' not in r)
|
|
220
|
+
error_count = sum(1 for r in all_results if 'error' in r)
|
|
221
|
+
|
|
222
|
+
for r in sorted(all_results, key=lambda x: (x['subdir'], x['file'])):
|
|
223
|
+
if 'error' in r:
|
|
224
|
+
print(f" [ERR] {r['subdir']}/{r['file']}: {r['error']}")
|
|
225
|
+
else:
|
|
226
|
+
print(f" [OK] {r['subdir']}/{r['file']}: {r['id']}")
|
|
227
|
+
|
|
228
|
+
print(f"{'='*60}")
|
|
229
|
+
print(f"Sukcesy: {success_count}/{len(all_results)}, Błędy: {error_count}/{len(all_results)}")
|
|
230
|
+
print(f"{'='*60}")
|
|
231
|
+
|
|
232
|
+
# Nie failujemy testu przy błędach - to tylko podsumowanie
|
|
233
|
+
assert success_count > 0, "Żaden plik nie został przetworzony pomyślnie"
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
class TestSampleIDDeterminism:
|
|
237
|
+
"""Testy determinizmu ID - te same dane = ten sam ID."""
|
|
238
|
+
|
|
239
|
+
def test_invoice_deterministic_id(self):
|
|
240
|
+
"""Test czy faktura generuje ten sam ID przy każdym uruchomieniu."""
|
|
241
|
+
invoice_dir = SAMPLES_DIR / "invoices"
|
|
242
|
+
|
|
243
|
+
# Znajdź pierwszy dostępny plik faktury
|
|
244
|
+
invoice_file = None
|
|
245
|
+
for ext in ['.txt', '.xml', '.html']:
|
|
246
|
+
candidate = invoice_dir / f"faktura_full{ext}"
|
|
247
|
+
if candidate.exists():
|
|
248
|
+
invoice_file = candidate
|
|
249
|
+
break
|
|
250
|
+
|
|
251
|
+
if not invoice_file:
|
|
252
|
+
pytest.skip("Brak pliku faktury do testu determinizmu")
|
|
253
|
+
|
|
254
|
+
# Generuj ID dwa razy
|
|
255
|
+
id1 = process_document(invoice_file).document_id
|
|
256
|
+
id2 = process_document(invoice_file).document_id
|
|
257
|
+
|
|
258
|
+
assert id1 == id2, f"ID nie jest deterministyczne: {id1} != {id2}"
|
|
259
|
+
|
|
260
|
+
def test_receipt_deterministic_id(self):
|
|
261
|
+
"""Test czy paragon generuje ten sam ID przy każdym uruchomieniu."""
|
|
262
|
+
receipt_dir = SAMPLES_DIR / "receipts"
|
|
263
|
+
|
|
264
|
+
receipt_file = None
|
|
265
|
+
for ext in ['.txt', '.xml', '.html']:
|
|
266
|
+
candidate = receipt_dir / f"paragon_full{ext}"
|
|
267
|
+
if candidate.exists():
|
|
268
|
+
receipt_file = candidate
|
|
269
|
+
break
|
|
270
|
+
|
|
271
|
+
if not receipt_file:
|
|
272
|
+
pytest.skip("Brak pliku paragonu do testu determinizmu")
|
|
273
|
+
|
|
274
|
+
id1 = process_document(receipt_file).document_id
|
|
275
|
+
id2 = process_document(receipt_file).document_id
|
|
276
|
+
|
|
277
|
+
assert id1 == id2, f"ID nie jest deterministyczne: {id1} != {id2}"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|