docid 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
exef_docid/__init__.py ADDED
@@ -0,0 +1,129 @@
1
"""
EXEF Document ID Generator.

Deterministic generator of document identifiers based on OCR output.
The same document always yields the same ID, regardless of the source
format (scan, PDF, KSeF XML).

Example usage:
    from exef_docid import (
        get_document_id,
        process_document,
        verify_document_id,
    )

    # Full processing
    result = process_document("faktura.pdf")
    print(result.document_id)            # EXEF-FV-A7B3C9D2E1F04856
    print(result.extraction.issuer_nip)  # 5213017228

    # ID only
    doc_id = get_document_id("paragon.jpg")

    # Verification
    is_same = verify_document_id("skan.png", "EXEF-FV-A7B3C9D2E1F04856")

Requirements:
    pip install paddleocr paddlepaddle pdf2image pillow

Or, for Tesseract:
    apt install tesseract-ocr tesseract-ocr-pol
    pip install pytesseract pdf2image pillow
"""

# Must match the version of the released distribution (wheel 0.1.3).
__version__ = "0.1.3"
__author__ = "Softreck"

# ID generator (works without OCR)
from .document_id import (
    AmountNormalizer,
    DateNormalizer,
    DocumentIDGenerator,
    DocumentType,
    InvoiceNumberNormalizer,
    NIPValidator,
    generate_contract_id,
    generate_invoice_id,
    generate_receipt_id,
)

# Extractors
from .extractors import (
    DocumentCategory,
    DocumentExtractor,
    ExtractionResult,
)

# OCR
from .ocr_processor import (
    DocumentOCRResult,
    OCREngine,
    OCRProcessor,
    OCRResult,
    PaddleOCRProcessor,
    TesseractOCRProcessor,
    preprocess_image_for_ocr,
)

# Universal Document ID Generator
# Its DocumentType is re-exported under an alias so it does not shadow the
# DocumentType imported from .document_id above.
from .document_id_universal import (
    UniversalDocumentIDGenerator,
    UniversalDocumentFeatures,
    DocumentType as UniversalDocumentType,
    generate_universal_document_id,
    verify_universal_document_id,
    compare_universal_documents,
)

# Pipeline (main API)
from .pipeline import (
    DocumentPipeline,
    ProcessedDocument,
    get_document_id,
    get_pipeline,
    process_document,
    verify_document_id,
)

__all__ = [
    # Version
    '__version__',

    # Pipeline (main API)
    'DocumentPipeline',
    'ProcessedDocument',
    'process_document',
    'get_document_id',
    'verify_document_id',
    'get_pipeline',

    # ID generator
    'DocumentIDGenerator',
    'DocumentType',
    'generate_invoice_id',
    'generate_receipt_id',
    'generate_contract_id',

    # Normalizers
    'NIPValidator',
    'AmountNormalizer',
    'DateNormalizer',
    'InvoiceNumberNormalizer',

    # OCR
    'OCRProcessor',
    'OCREngine',
    'DocumentOCRResult',
    'OCRResult',
    'PaddleOCRProcessor',
    'TesseractOCRProcessor',
    'preprocess_image_for_ocr',

    # Extractors
    'DocumentExtractor',
    'ExtractionResult',
    'DocumentCategory',

    # Universal Document ID Generator
    'UniversalDocumentIDGenerator',
    'UniversalDocumentFeatures',
    'UniversalDocumentType',
    'generate_universal_document_id',
    'verify_universal_document_id',
    'compare_universal_documents',
]
exef_docid/cli.py ADDED
@@ -0,0 +1,340 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ CLI dla EXEF Document ID Generator.
4
+
5
+ Użycie:
6
+ # Przetwórz pojedynczy plik
7
+ docid process faktura.pdf
8
+
9
+ # Przetwórz wiele plików
10
+ docid process *.pdf *.jpg
11
+
12
+ # Batch z katalogu
13
+ docid batch ./dokumenty/ --output results.json
14
+
15
+ # Weryfikacja ID
16
+ docid verify faktura.pdf EXEF-FV-A7B3C9D2E1F04856
17
+
18
+ # Tylko OCR (bez generowania ID)
19
+ docid ocr skan.jpg
20
+
21
+ # Generuj ID bez OCR (z podanych danych)
22
+ docid generate-id --type invoice --nip 5213017228 \
23
+ --number FV/2025/001 --date 2025-01-15 --amount 1230.00
24
+ """
25
+
26
+ import argparse
27
+ import json
28
+ import logging
29
+ import sys
30
+ from pathlib import Path
31
+
32
+ # Konfiguracja logowania
33
+ logging.basicConfig(
34
+ level=logging.INFO,
35
+ format='%(asctime)s - %(levelname)s - %(message)s'
36
+ )
37
+ logger = logging.getLogger(__name__)
38
+
39
+
40
def _build_process_output(file_path, result, verbose):
    """Build the per-file summary dict that ``cmd_process`` reports."""
    summary = {
        'file': str(file_path),
        'document_id': result.document_id,
        'type': result.document_type.value,
        'confidence': round(result.ocr_confidence, 3),
        'is_duplicate': result.is_duplicate,
    }

    if verbose:
        extraction = result.extraction
        summary['extraction'] = {
            'category': extraction.category.value,
            'issuer_nip': extraction.issuer_nip,
            'document_date': extraction.document_date,
            'gross_amount': extraction.gross_amount,
            'invoice_number': extraction.invoice_number,
        }
        summary['canonical_string'] = result.canonical_string

    return summary


def cmd_process(args):
    """Process the given files and generate a document ID for each.

    Reads ``args.files``, ``args.engine``, ``args.prefix``, ``args.lang``,
    ``args.gpu``, ``args.verbose``, ``args.quiet`` and ``args.output``.
    Missing files and per-file processing errors are logged and skipped, so
    one bad file does not abort the run.

    Returns:
        A list with one summary dict per successfully processed file.
    """
    from . import DocumentPipeline, OCREngine

    engine = OCREngine.PADDLE if args.engine == 'paddle' else OCREngine.TESSERACT
    pipeline = DocumentPipeline(
        ocr_engine=engine,
        id_prefix=args.prefix,
        lang=args.lang,
        use_gpu=args.gpu,
    )

    results = []

    for raw_path in args.files:
        file_path = Path(raw_path)

        if not file_path.exists():
            logger.error("File not found: %s", file_path)
            continue

        try:
            result = pipeline.process(file_path)
            results.append(_build_process_output(file_path, result, args.verbose))

            if not args.quiet:
                print(f"{file_path}: {result.document_id}")
                if result.is_duplicate:
                    print(f" ⚠ Duplicate of: {result.duplicate_of}")

        except Exception as e:
            # CLI boundary: report the failure and carry on with the next file.
            logger.error("Error processing %s: %s", file_path, e)
            if args.verbose:
                import traceback
                traceback.print_exc()

    if args.output:
        with open(args.output, 'w', encoding='utf-8') as f:
            # default=str mirrors cmd_batch: a value that is not JSON-native
            # is stringified instead of raising after all OCR work is done.
            json.dump(results, f, indent=2, ensure_ascii=False, default=str)
        print(f"\nResults saved to: {args.output}")

    return results
101
+
102
+
103
def cmd_batch(args):
    """Process every supported file found recursively under a directory.

    Reads ``args.directory``, ``args.engine``, ``args.prefix``, ``args.lang``,
    ``args.gpu``, ``args.keep_duplicates`` and ``args.output``. Prints a
    summary report, optionally writes the results as JSON, and prints a
    per-document-type tally.

    Exits with status 1 when ``args.directory`` is not a directory and with
    status 0 when no supported files are found.
    """
    from collections import Counter

    from . import DocumentPipeline, OCREngine

    directory = Path(args.directory)
    if not directory.is_dir():
        logger.error("Not a directory: %s", directory)
        sys.exit(1)

    # Collect files. Only regular files are kept (a directory named
    # "scans.pdf" must not be handed to OCR), and the list is sorted so the
    # processing order does not depend on filesystem enumeration order.
    extensions = {'.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.bmp'}
    files = sorted(
        path for path in directory.rglob('*')
        if path.suffix.lower() in extensions and path.is_file()
    )

    if not files:
        logger.warning("No supported files found in %s", directory)
        sys.exit(0)

    logger.info("Found %d files to process", len(files))

    # Processing
    engine = OCREngine.PADDLE if args.engine == 'paddle' else OCREngine.TESSERACT
    pipeline = DocumentPipeline(
        ocr_engine=engine,
        id_prefix=args.prefix,
        lang=args.lang,
        use_gpu=args.gpu,
    )

    results = pipeline.process_batch(
        files,
        skip_duplicates=not args.keep_duplicates,
    )

    # Report
    print(f"\n{'='*60}")
    print(f"Processed: {len(results)} documents")
    print(f"Duplicates found: {sum(1 for r in results if r.is_duplicate)}")
    print(f"{'='*60}\n")

    # Save results
    if args.output:
        output_data = [r.to_dict() for r in results]
        with open(args.output, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False, default=str)
        print(f"Results saved to: {args.output}")

    # Summary by document type
    by_type = Counter(r.document_type.value for r in results)

    print("\nBy document type:")
    for type_name, count in sorted(by_type.items()):
        print(f" {type_name}: {count}")
161
+
162
+
163
def cmd_verify(args):
    """Check whether a document produces the expected ID.

    Processes ``args.file`` through the pipeline and compares the generated
    ID with ``args.expected_id``. Prints the outcome and exits with status 0
    on a match and status 1 on a mismatch.
    """
    from . import DocumentPipeline, OCREngine

    engine = OCREngine.PADDLE if args.engine == 'paddle' else OCREngine.TESSERACT
    pipeline = DocumentPipeline(
        ocr_engine=engine,
        id_prefix=args.prefix,
        lang=args.lang,
        # The verify sub-command accepts --gpu (shared parent parser), so
        # forward it like the other commands do instead of ignoring it.
        # getattr keeps hand-built namespaces without the attribute working.
        use_gpu=getattr(args, 'gpu', False),
    )

    result = pipeline.process(args.file)

    if result.document_id == args.expected_id:
        print(f"✓ MATCH: {result.document_id}")
        sys.exit(0)
    else:
        print("✗ MISMATCH:")
        print(f" Expected: {args.expected_id}")
        print(f" Got: {result.document_id}")
        sys.exit(1)
184
+
185
+
186
def cmd_ocr(args):
    """Run OCR only, without generating a document ID.

    Prints the recognised text of ``args.file``. When the processor returns a
    list, each element is printed as a separate page. With ``args.verbose``
    the confidence and the detected NIPs, amounts and dates are printed too;
    detected invoice numbers are printed only for a non-list result.
    """
    from . import OCREngine, OCRProcessor

    def show_details(ocr_result, with_invoice_numbers):
        # Verbose metadata shared by the per-page and single-result output.
        print(f"\nConfidence: {ocr_result.average_confidence:.2%}")
        print(f"NIPs: {ocr_result.detected_nips}")
        print(f"Amounts: {ocr_result.detected_amounts}")
        print(f"Dates: {ocr_result.detected_dates}")
        if with_invoice_numbers:
            print(f"Invoice numbers: {ocr_result.detected_invoice_numbers}")

    if args.engine == 'paddle':
        chosen_engine = OCREngine.PADDLE
    else:
        chosen_engine = OCREngine.TESSERACT

    ocr = OCRProcessor(
        preferred_engine=chosen_engine,
        lang=args.lang,
        use_gpu=args.gpu,
    )
    outcome = ocr.process(args.file)

    if not isinstance(outcome, list):
        print(outcome.full_text)
        if args.verbose:
            show_details(outcome, True)
        return

    # A list result is a multi-page PDF: one entry per page.
    for page_number, page in enumerate(outcome, start=1):
        print(f"\n--- Page {page_number} ---")
        print(page.full_text)
        if args.verbose:
            show_details(page, False)
217
+
218
+
219
def cmd_generate_id(args):
    """Generate a document ID from explicitly supplied data, without OCR.

    ``args.type`` selects the document kind ('invoice', 'receipt' or
    'contract'). The generated ID is printed to stdout. If a required field
    for the selected kind is missing, or the kind is unknown, an error is
    logged and the process exits with status 1.
    """
    from . import DocumentIDGenerator

    def abort(message):
        # Log the problem and terminate; sys.exit raises SystemExit.
        logger.error(message)
        sys.exit(1)

    id_generator = DocumentIDGenerator(prefix=args.prefix)
    doc_kind = args.type

    if doc_kind == 'invoice':
        if not all((args.nip, args.number, args.date, args.amount)):
            abort("Invoice requires: --nip, --number, --date, --amount")
        generated = id_generator.generate_invoice_id(
            seller_nip=args.nip,
            invoice_number=args.number,
            issue_date=args.date,
            gross_amount=args.amount,
        )
    elif doc_kind == 'receipt':
        if not all((args.nip, args.date, args.amount)):
            abort("Receipt requires: --nip, --date, --amount")
        generated = id_generator.generate_receipt_id(
            seller_nip=args.nip,
            receipt_date=args.date,
            gross_amount=args.amount,
            receipt_number=args.number,
        )
    elif doc_kind == 'contract':
        if not all((args.nip, args.nip2, args.date)):
            abort("Contract requires: --nip, --nip2, --date")
        generated = id_generator.generate_contract_id(
            party1_nip=args.nip,
            party2_nip=args.nip2,
            contract_date=args.date,
            contract_number=args.number,
        )
    else:
        abort(f"Unknown type: {args.type}")

    print(generated)
266
+
267
+
268
def main():
    """Parse command-line arguments and dispatch to the chosen sub-command.

    Registers the ``process``, ``batch``, ``verify``, ``ocr`` and
    ``generate-id`` sub-commands. When no sub-command is given, prints the
    help text and exits with status 1.
    """
    # Take the version from the package so --version cannot drift from
    # __version__. The relative import fails when this file is executed as a
    # plain script (no parent package), hence the fallback.
    try:
        from . import __version__ as package_version
    except ImportError:
        package_version = 'unknown'

    parser = argparse.ArgumentParser(
        description='EXEF Document ID Generator - deterministyczne ID dokumentów z OCR',
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--version', action='version',
                        version=f'docid {package_version}')

    subparsers = parser.add_subparsers(dest='command', help='Dostępne komendy')

    # Shared arguments, inherited by the OCR-based sub-commands
    common = argparse.ArgumentParser(add_help=False)
    common.add_argument('--engine', choices=['paddle', 'tesseract'], default='paddle',
                        help='Silnik OCR (domyślnie: paddle)')
    common.add_argument('--lang', default='pl', help='Język dokumentów')
    common.add_argument('--prefix', default='EXEF', help='Prefiks ID')
    common.add_argument('--gpu', action='store_true', help='Użyj GPU')
    common.add_argument('-v', '--verbose', action='store_true', help='Więcej szczegółów')

    # process
    p_process = subparsers.add_parser('process', parents=[common],
                                      help='Przetwórz pliki i wygeneruj ID')
    p_process.add_argument('files', nargs='+', help='Pliki do przetworzenia')
    p_process.add_argument('-o', '--output', help='Zapisz wyniki do JSON')
    p_process.add_argument('-q', '--quiet', action='store_true', help='Cichy tryb')
    p_process.set_defaults(func=cmd_process)

    # batch
    p_batch = subparsers.add_parser('batch', parents=[common],
                                    help='Przetwórz cały katalog')
    p_batch.add_argument('directory', help='Katalog z dokumentami')
    p_batch.add_argument('-o', '--output', help='Zapisz wyniki do JSON')
    p_batch.add_argument('--keep-duplicates', action='store_true',
                         help='Zachowaj duplikaty w wynikach')
    p_batch.set_defaults(func=cmd_batch)

    # verify
    p_verify = subparsers.add_parser('verify', parents=[common],
                                     help='Zweryfikuj ID dokumentu')
    p_verify.add_argument('file', help='Plik do weryfikacji')
    p_verify.add_argument('expected_id', help='Oczekiwany ID')
    p_verify.set_defaults(func=cmd_verify)

    # ocr
    p_ocr = subparsers.add_parser('ocr', parents=[common],
                                  help='Wykonaj tylko OCR')
    p_ocr.add_argument('file', help='Plik do OCR')
    p_ocr.set_defaults(func=cmd_ocr)

    # generate-id (does not inherit the shared OCR arguments)
    p_gen = subparsers.add_parser('generate-id',
                                  help='Wygeneruj ID z podanych danych (bez OCR)')
    p_gen.add_argument('--type', required=True,
                       choices=['invoice', 'receipt', 'contract'],
                       help='Typ dokumentu')
    p_gen.add_argument('--nip', help='NIP sprzedawcy/strony 1')
    p_gen.add_argument('--nip2', help='NIP nabywcy/strony 2')
    p_gen.add_argument('--number', help='Numer dokumentu')
    p_gen.add_argument('--date', help='Data (YYYY-MM-DD)')
    p_gen.add_argument('--amount', help='Kwota brutto')
    p_gen.add_argument('--prefix', default='EXEF', help='Prefiks ID')
    p_gen.set_defaults(func=cmd_generate_id)

    args = parser.parse_args()

    # Sub-commands are optional for argparse, so handle "none given" here.
    if not args.command:
        parser.print_help()
        sys.exit(1)

    args.func(args)


if __name__ == '__main__':
    main()