docid 0.1.3__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docid
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Summary: Deterministyczny generator identyfikatorów dokumentów z OCR
5
5
  Home-page: https://github.com/softreck/docid
6
6
  Author: Softreck
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docid
3
- Version: 0.1.3
3
+ Version: 0.1.4
4
4
  Summary: Deterministyczny generator identyfikatorów dokumentów z OCR
5
5
  Home-page: https://github.com/softreck/docid
6
6
  Author: Softreck
@@ -18,4 +18,5 @@ exef_docid/pipeline.py
18
18
  exef_docid/extractors/__init__.py
19
19
  exef_docid/extractors/base.py
20
20
  tests/test_document_id.py
21
- tests/test_extractors.py
21
+ tests/test_extractors.py
22
+ tests/test_samples_id.py
@@ -1,5 +1,5 @@
1
1
  """
2
- EXEF Document ID Generator
2
+ DOC Document ID Generator
3
3
 
4
4
  Deterministyczny generator identyfikatorów dokumentów z OCR.
5
5
  Generuje zawsze ten sam ID dla tego samego dokumentu,
@@ -10,14 +10,14 @@ Przykład użycia:
10
10
 
11
11
  # Pełne przetwarzanie
12
12
  result = process_document("faktura.pdf")
13
- print(result.document_id) # EXEF-FV-A7B3C9D2E1F04856
13
+ print(result.document_id) # DOC-FV-A7B3C9D2E1F04856
14
14
  print(result.extraction.issuer_nip) # 5213017228
15
15
 
16
16
  # Tylko ID
17
17
  doc_id = get_document_id("paragon.jpg")
18
18
 
19
19
  # Weryfikacja
20
- is_same = verify_document_id("skan.png", "EXEF-FV-A7B3C9D2E1F04856")
20
+ is_same = verify_document_id("skan.png", "DOC-FV-A7B3C9D2E1F04856")
21
21
 
22
22
  Wymagania:
23
23
  pip install paddleocr paddlepaddle pdf2image pillow
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env python3
2
2
  """
3
- CLI dla EXEF Document ID Generator.
3
+ CLI dla DOC Document ID Generator.
4
4
 
5
5
  Użycie:
6
6
  # Przetwórz pojedynczy plik
@@ -13,7 +13,7 @@ Użycie:
13
13
  docid batch ./dokumenty/ --output results.json
14
14
 
15
15
  # Weryfikacja ID
16
- docid verify faktura.pdf EXEF-FV-A7B3C9D2E1F04856
16
+ docid verify faktura.pdf DOC-FV-A7B3C9D2E1F04856
17
17
 
18
18
  # Tylko OCR (bez generowania ID)
19
19
  docid ocr skan.jpg
@@ -267,7 +267,7 @@ def cmd_generate_id(args):
267
267
 
268
268
  def main():
269
269
  parser = argparse.ArgumentParser(
270
- description='EXEF Document ID Generator - deterministyczne ID dokumentów z OCR',
270
+ description='DOC Document ID Generator - deterministyczne ID dokumentów z OCR',
271
271
  formatter_class=argparse.RawDescriptionHelpFormatter,
272
272
  )
273
273
  parser.add_argument('--version', action='version', version='docid 0.1.0')
@@ -279,7 +279,7 @@ def main():
279
279
  common.add_argument('--engine', choices=['paddle', 'tesseract'], default='paddle',
280
280
  help='Silnik OCR (domyślnie: paddle)')
281
281
  common.add_argument('--lang', default='pl', help='Język dokumentów')
282
- common.add_argument('--prefix', default='EXEF', help='Prefiks ID')
282
+ common.add_argument('--prefix', default='DOC', help='Prefiks ID')
283
283
  common.add_argument('--gpu', action='store_true', help='Użyj GPU')
284
284
  common.add_argument('-v', '--verbose', action='store_true', help='Więcej szczegółów')
285
285
 
@@ -324,7 +324,7 @@ def main():
324
324
  p_gen.add_argument('--number', help='Numer dokumentu')
325
325
  p_gen.add_argument('--date', help='Data (YYYY-MM-DD)')
326
326
  p_gen.add_argument('--amount', help='Kwota brutto')
327
- p_gen.add_argument('--prefix', default='EXEF', help='Prefiks ID')
327
+ p_gen.add_argument('--prefix', default='DOC', help='Prefiks ID')
328
328
  p_gen.set_defaults(func=cmd_generate_id)
329
329
 
330
330
  args = parser.parse_args()
@@ -431,7 +431,7 @@ def main():
431
431
  """Main CLI entry point"""
432
432
  parser = argparse.ArgumentParser(
433
433
  prog='docid',
434
- description='EXEF Document ID Generator - CLI'
434
+ description='DOC Document ID Generator - CLI'
435
435
  )
436
436
 
437
437
  parser.add_argument('--version', action='version', version='%(prog)s 0.1.0')
@@ -14,8 +14,8 @@ from decimal import ROUND_HALF_UP, Decimal
14
14
  from enum import Enum
15
15
  from typing import Optional, Union
16
16
 
17
- # Namespace UUID dla EXEF (RFC 4122 UUID v5)
18
- EXEF_NAMESPACE = uuid.UUID('a1b2c3d4-e5f6-7890-abcd-ef1234567890')
17
+ # Namespace UUID dla DOC (RFC 4122 UUID v5)
18
+ DOC_NAMESPACE = uuid.UUID('a1b2c3d4-e5f6-7890-abcd-ef1234567890')
19
19
 
20
20
 
21
21
  class DocumentType(Enum):
@@ -207,10 +207,10 @@ class DocumentIDGenerator:
207
207
  niezależnie od formatu źródłowego dokumentu.
208
208
  """
209
209
 
210
- def __init__(self, prefix: str = "EXEF"):
210
+ def __init__(self, prefix: str = "DOC"):
211
211
  """
212
212
  Args:
213
- prefix: Prefiks identyfikatora (domyślnie EXEF)
213
+ prefix: Prefiks identyfikatora (domyślnie DOC)
214
214
  """
215
215
  self.prefix = prefix
216
216
 
@@ -229,7 +229,7 @@ class DocumentIDGenerator:
229
229
 
230
230
  >>> gen = DocumentIDGenerator()
231
231
  >>> gen.generate_invoice_id("5213017228", "FV/2025/00142", "2025-01-15", 1230.00)
232
- 'EXEF-FV-A7B3C9D2E1F04856'
232
+ 'DOC-FV-A7B3C9D2E1F04856'
233
233
  """
234
234
  canonical = CanonicalData(
235
235
  document_type=DocumentType.INVOICE,
@@ -265,7 +265,7 @@ class DocumentIDGenerator:
265
265
 
266
266
  >>> gen = DocumentIDGenerator()
267
267
  >>> gen.generate_receipt_id("5213017228", "2025-01-15", 45.99)
268
- 'EXEF-PAR-...'
268
+ 'DOC-PAR-...'
269
269
  """
270
270
  parts = [
271
271
  NIPValidator.normalize(seller_nip),
@@ -649,7 +649,7 @@ class DocumentIDGenerator:
649
649
  Generuje finalny identyfikator z danych kanonicznych.
650
650
 
651
651
  Format: {PREFIX}-{TYPE}-{HASH16}
652
- Przykład: EXEF-FV-A7B3C9D2E1F04856
652
+ Przykład: DOC-FV-A7B3C9D2E1F04856
653
653
  """
654
654
  # SHA256 z canonical string
655
655
  hash_bytes = hashlib.sha256(canonical.canonical_string.encode('utf-8')).digest()
@@ -662,7 +662,7 @@ class DocumentIDGenerator:
662
662
  Weryfikuje czy ID odpowiada danym kanonicznym.
663
663
 
664
664
  >>> gen = DocumentIDGenerator()
665
- >>> gen.verify_id("EXEF-FV-A7B3C9D2E1F04856", "5213017228|FV/2025/00142|2025-01-15|1230.00")
665
+ >>> gen.verify_id("DOC-FV-A7B3C9D2E1F04856", "5213017228|FV/2025/00142|2025-01-15|1230.00")
666
666
  True
667
667
  """
668
668
  hash_bytes = hashlib.sha256(canonical_string.encode('utf-8')).digest()
@@ -679,8 +679,8 @@ class DocumentIDGenerator:
679
679
  """
680
680
  Parsuje identyfikator dokumentu.
681
681
 
682
- >>> DocumentIDGenerator.parse_id("EXEF-FV-A7B3C9D2E1F04856")
683
- {'prefix': 'EXEF', 'type': 'FV', 'hash': 'A7B3C9D2E1F04856',
682
+ >>> DocumentIDGenerator.parse_id("DOC-FV-A7B3C9D2E1F04856")
683
+ {'prefix': 'DOC', 'type': 'FV', 'hash': 'A7B3C9D2E1F04856',
684
684
  'document_type': <DocumentType.INVOICE>}
685
685
  """
686
686
  parts = document_id.split('-')
@@ -195,12 +195,8 @@ class PaddleOCRProcessor(BaseOCRProcessor):
195
195
  self._ocr = PaddleOCR(
196
196
  use_angle_cls=True,
197
197
  lang=lang,
198
- use_gpu=self.use_gpu,
199
198
  det_model_dir=self._det_model_dir,
200
199
  rec_model_dir=self._rec_model_dir,
201
- # Optymalizacje CPU
202
- enable_mkldnn=True,
203
- cpu_threads=4,
204
200
  )
205
201
  except ImportError:
206
202
  raise ImportError(
@@ -216,7 +212,7 @@ class PaddleOCRProcessor(BaseOCRProcessor):
216
212
  self._init_ocr()
217
213
 
218
214
  image_path = str(image_path)
219
- result = self._ocr.ocr(image_path, cls=True)
215
+ result = self._ocr.ocr(image_path)
220
216
 
221
217
  lines = []
222
218
  full_text_parts = []
@@ -439,8 +435,8 @@ class OCRProcessor:
439
435
 
440
436
  def __init__(
441
437
  self,
442
- preferred_engine: OCREngine = OCREngine.PADDLE,
443
- fallback_engine: OCREngine = OCREngine.TESSERACT,
438
+ preferred_engine: OCREngine = OCREngine.TESSERACT,
439
+ fallback_engine: OCREngine = OCREngine.PADDLE,
444
440
  lang: str = 'pl',
445
441
  use_gpu: bool = False,
446
442
  ):
@@ -97,20 +97,20 @@ class DocumentPipeline:
97
97
  Przykład użycia:
98
98
  pipeline = DocumentPipeline()
99
99
  result = pipeline.process("faktura.pdf")
100
- print(result.document_id) # EXEF-FV-A7B3C9D2E1F04856
100
+ print(result.document_id) # DOC-FV-A7B3C9D2E1F04856
101
101
  """
102
102
 
103
103
  def __init__(
104
104
  self,
105
- ocr_engine: OCREngine = OCREngine.PADDLE,
106
- id_prefix: str = "EXEF",
105
+ ocr_engine: OCREngine = OCREngine.TESSERACT,
106
+ id_prefix: str = "DOC",
107
107
  lang: str = "pl",
108
108
  use_gpu: bool = False,
109
109
  ):
110
110
  """
111
111
  Args:
112
112
  ocr_engine: Silnik OCR (PADDLE lub TESSERACT)
113
- id_prefix: Prefiks identyfikatorów (domyślnie EXEF)
113
+ id_prefix: Prefiks identyfikatorów (domyślnie DOC)
114
114
  lang: Język dokumentów (pl, en)
115
115
  use_gpu: Czy używać GPU (domyślnie False dla CPU)
116
116
  """
@@ -390,7 +390,7 @@ class DocumentPipeline:
390
390
  _default_pipeline: Optional[DocumentPipeline] = None
391
391
 
392
392
 
393
- def get_pipeline(ocr_engine: OCREngine = OCREngine.PADDLE) -> DocumentPipeline:
393
+ def get_pipeline(ocr_engine: OCREngine = OCREngine.TESSERACT) -> DocumentPipeline:
394
394
  """Zwraca domyślny pipeline (lazy init)."""
395
395
  global _default_pipeline
396
396
  if _default_pipeline is None or _default_pipeline.ocr.preferred_engine != ocr_engine:
@@ -398,7 +398,7 @@ def get_pipeline(ocr_engine: OCREngine = OCREngine.PADDLE) -> DocumentPipeline:
398
398
  return _default_pipeline
399
399
 
400
400
 
401
- def process_document(file_path: Union[str, Path], ocr_engine: OCREngine = OCREngine.PADDLE, use_ocr: bool = True) -> ProcessedDocument:
401
+ def process_document(file_path: Union[str, Path], ocr_engine: OCREngine = OCREngine.TESSERACT, use_ocr: bool = True) -> ProcessedDocument:
402
402
  """
403
403
  Przetwarza dokument i zwraca wynik z ID.
404
404
 
@@ -416,7 +416,7 @@ def get_document_id(file_path: Union[str, Path]) -> str:
416
416
 
417
417
  Przykład:
418
418
  doc_id = get_document_id("faktura.pdf")
419
- print(doc_id) # EXEF-FV-A7B3C9D2E1F04856
419
+ print(doc_id) # DOC-FV-A7B3C9D2E1F04856
420
420
  """
421
421
  return get_pipeline().process(file_path).document_id
422
422
 
@@ -426,6 +426,6 @@ def verify_document_id(file_path: Union[str, Path], expected_id: str) -> bool:
426
426
  Weryfikuje czy dokument ma oczekiwany ID.
427
427
 
428
428
  Przykład:
429
- is_valid = verify_document_id("skan.jpg", "EXEF-FV-A7B3C9D2E1F04856")
429
+ is_valid = verify_document_id("skan.jpg", "DOC-FV-A7B3C9D2E1F04856")
430
430
  """
431
431
  return get_pipeline().verify_document(file_path, expected_id)
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docid"
7
- version = "0.1.3"
7
+ version = "0.1.4"
8
8
  description = "Deterministyczny generator identyfikatorów dokumentów z OCR"
9
9
  readme = "README.md"
10
10
  license = {text = "MIT"}
@@ -0,0 +1,277 @@
1
+ """
2
+ Testy ID dla wszystkich próbek (samples).
3
+
4
+ Sprawdza generowanie ID dla wszystkich plików w katalogu samples,
5
+ w tym spójność ID między różnymi formatami tego samego dokumentu.
6
+ """
7
+
8
+ import os
9
+ from pathlib import Path
10
+
11
+ import pytest
12
+
13
+ from exef_docid.document_id import DocumentIDGenerator, DocumentType
14
+ from exef_docid.pipeline import DocumentPipeline, process_document
15
+ from exef_docid.ocr_processor import OCREngine
16
+
17
+
18
+ SAMPLES_DIR = Path(__file__).parent.parent / "samples"
19
+
20
+
21
class TestSampleIDGeneration:
    """ID-generation tests that run the pipeline over every sample document."""

    # Prefix handed to the pipeline explicitly, so the prefix assertion in
    # test_invoice_samples_generate_id checks what this test configured
    # instead of whatever the library default happens to be.
    ID_PREFIX = "DOC"

    # Glob patterns of the sample formats collected by get_sample_files().
    SAMPLE_PATTERNS = ('*.pdf', '*.jpg', '*.jpeg', '*.png', '*.xml', '*.html', '*.htm', '*.txt')

    # File extensions probed by the cross-format consistency tests.
    CROSS_FORMAT_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.pdf', '.xml', '.html', '.txt')

    @pytest.fixture
    def pipeline(self):
        """Fixture building a pipeline that uses Tesseract OCR and ID_PREFIX."""
        return DocumentPipeline(ocr_engine=OCREngine.TESSERACT, id_prefix=self.ID_PREFIX)

    def get_sample_files(self, subdirectory: str) -> list[Path]:
        """Return the sorted sample files found in ``samples/<subdirectory>``.

        Returns an empty list when the subdirectory does not exist.
        """
        sample_dir = SAMPLES_DIR / subdirectory
        if not sample_dir.exists():
            return []

        files = []
        for pattern in self.SAMPLE_PATTERNS:
            files.extend(sample_dir.glob(pattern))
        return sorted(files)

    def _process_or_fail(self, pipeline, file_path):
        """Process one file, turning any pipeline error into a test failure."""
        # Only the pipeline call is guarded: assertions made by the callers
        # must surface with their own messages, not as "processing errors".
        try:
            return pipeline.process(file_path)
        except Exception as e:
            pytest.fail(f"Błąd przetwarzania {file_path}: {e}")

    def _collect_ids(self, pipeline, files):
        """Process ``files``, assert each one got an ID, return summary rows."""
        rows = []
        for file_path in files:
            result = self._process_or_fail(pipeline, file_path)
            assert result.document_id, f"Brak ID dla {file_path}"
            rows.append({
                'file': file_path.name,
                'id': result.document_id,
                'type': result.document_type.value,
                'confidence': result.ocr_confidence,
            })
        return rows

    @staticmethod
    def _print_summary(title, rows):
        """Print one line per processed file under a ``title`` header."""
        print(f"\n=== {title} ({len(rows)} plików) ===")
        for r in rows:
            print(f" {r['file']}: {r['id']} (confidence: {r['confidence']:.2f})")

    def _assert_cross_format_consistency(self, pipeline, subdirectory, base_name, skip_label):
        """Assert that every available format of ``base_name`` yields one ID.

        Skips the test when fewer than two formats of the document exist.
        ``skip_label`` is the document noun used in the skip message.
        """
        sample_dir = SAMPLES_DIR / subdirectory
        candidates = (sample_dir / f"{base_name}{ext}" for ext in self.CROSS_FORMAT_EXTENSIONS)
        formats = [path for path in candidates if path.exists()]

        if len(formats) < 2:
            pytest.skip(f"Za mało formatów {skip_label} do testu (znaleziono: {len(formats)})")

        # Keyed by file suffix so the failure message shows which format diverged.
        ids = {}
        for file_path in formats:
            result = self._process_or_fail(pipeline, file_path)
            ids[file_path.suffix] = {
                'id': result.document_id,
                'canonical': result.canonical_string,
                'confidence': result.ocr_confidence,
            }

        unique_ids = {entry['id'] for entry in ids.values()}

        print(f"\n=== Cross-format consistency dla {base_name} ===")
        for ext, data in sorted(ids.items()):
            print(f" {ext}: {data['id']} (confidence: {data['confidence']:.2f})")

        assert len(unique_ids) == 1, (
            f"Różne formaty {base_name} mają różne ID: {unique_ids}\n"
            f"Szczegóły: {ids}"
        )

    def test_invoice_samples_generate_id(self, pipeline):
        """Every invoice sample gets an ID carrying the invoice prefix."""
        invoice_files = self.get_sample_files("invoices")
        assert invoice_files, "Brak plików faktur w samples/invoices"

        results = self._collect_ids(pipeline, invoice_files)

        expected_prefix = f"{self.ID_PREFIX}-FV"
        for r in results:
            assert r['id'].startswith(expected_prefix), (
                f"Nieprawidłowy prefix ID dla faktury: {r['id']}"
            )

        self._print_summary("Faktury", results)

    def test_receipt_samples_generate_id(self, pipeline):
        """Every receipt sample gets a non-empty ID."""
        receipt_files = self.get_sample_files("receipts")
        assert receipt_files, "Brak plików paragonów w samples/receipts"

        self._print_summary("Paragony", self._collect_ids(pipeline, receipt_files))

    def test_contract_samples_generate_id(self, pipeline):
        """Every contract sample gets a non-empty ID."""
        contract_files = self.get_sample_files("contracts")
        assert contract_files, "Brak plików umów w samples/contracts"

        self._print_summary("Umowy", self._collect_ids(pipeline, contract_files))

    def test_invoice_cross_format_consistency(self, pipeline):
        """All available formats of ``faktura_full`` produce the same ID."""
        self._assert_cross_format_consistency(pipeline, "invoices", "faktura_full", "faktury")

    def test_receipt_cross_format_consistency(self, pipeline):
        """All available formats of ``paragon_full`` produce the same ID."""
        self._assert_cross_format_consistency(pipeline, "receipts", "paragon_full", "paragonu")

    def test_all_samples_summary(self, pipeline):
        """Print a summary over all sample directories.

        Individual failures are recorded, not raised; the test only fails
        when no file at all could be processed.
        """
        all_results = []

        for subdir in ['invoices', 'receipts', 'contracts', 'universal']:
            for file_path in self.get_sample_files(subdir):
                # Deliberately broad: this test is a report, so any error is
                # captured as a row instead of aborting the run.
                try:
                    result = pipeline.process(file_path)
                    all_results.append({
                        'subdir': subdir,
                        'file': file_path.name,
                        'id': result.document_id,
                        'type': result.document_type.value,
                        'confidence': result.ocr_confidence,
                    })
                except Exception as e:
                    all_results.append({
                        'subdir': subdir,
                        'file': file_path.name,
                        'error': str(e),
                    })

        print(f"\n{'='*60}")
        print(f"PODSUMOWANIE WSZYSTKICH PRÓBEK ({len(all_results)} plików)")
        print(f"{'='*60}")

        error_count = sum(1 for r in all_results if 'error' in r)
        success_count = len(all_results) - error_count

        for r in sorted(all_results, key=lambda x: (x['subdir'], x['file'])):
            if 'error' in r:
                print(f" [ERR] {r['subdir']}/{r['file']}: {r['error']}")
            else:
                print(f" [OK] {r['subdir']}/{r['file']}: {r['id']}")

        print(f"{'='*60}")
        print(f"Sukcesy: {success_count}/{len(all_results)}, Błędy: {error_count}/{len(all_results)}")
        print(f"{'='*60}")

        # Errors do not fail the test - this is only a summary.
        assert success_count > 0, "Żaden plik nie został przetworzony pomyślnie"
234
+
235
+
236
class TestSampleIDDeterminism:
    """Determinism checks: processing the same file twice must give the same ID."""

    def test_invoice_deterministic_id(self):
        """The invoice sample yields an identical ID on two consecutive runs."""
        # Take the first text-based variant of the invoice that exists on disk.
        candidates = (
            SAMPLES_DIR / "invoices" / f"faktura_full{suffix}"
            for suffix in ('.txt', '.xml', '.html')
        )
        invoice_file = next((path for path in candidates if path.exists()), None)

        if invoice_file is None:
            pytest.skip("Brak pliku faktury do testu determinizmu")

        # Generate the ID twice from the same file.
        first_id = process_document(invoice_file).document_id
        second_id = process_document(invoice_file).document_id

        assert first_id == second_id, f"ID nie jest deterministyczne: {first_id} != {second_id}"

    def test_receipt_deterministic_id(self):
        """The receipt sample yields an identical ID on two consecutive runs."""
        # Take the first text-based variant of the receipt that exists on disk.
        candidates = (
            SAMPLES_DIR / "receipts" / f"paragon_full{suffix}"
            for suffix in ('.txt', '.xml', '.html')
        )
        receipt_file = next((path for path in candidates if path.exists()), None)

        if receipt_file is None:
            pytest.skip("Brak pliku paragonu do testu determinizmu")

        # Generate the ID twice from the same file.
        first_id = process_document(receipt_file).document_id
        second_id = process_document(receipt_file).document_id

        assert first_id == second_id, f"ID nie jest deterministyczne: {first_id} != {second_id}"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes