docid 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,517 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Universal Document ID CLI - Complete command-line interface
4
+ """
5
+
6
+ import argparse
7
+ import json
8
+ import sys
9
+ from pathlib import Path
10
+ from typing import Optional, Dict, Any, List
11
+ import os
12
+
13
+ from .document_id import (
14
+ generate_invoice_id,
15
+ generate_receipt_id,
16
+ generate_contract_id,
17
+ DocumentIDGenerator,
18
+ )
19
+ from .document_id_universal import (
20
+ UniversalDocumentIDGenerator,
21
+ generate_universal_document_id,
22
+ verify_universal_document_id,
23
+ compare_universal_documents
24
+ )
25
+ from .pipeline import process_document, get_pipeline, DocumentPipeline
26
+ from .ocr_processor import OCREngine
27
+
28
+
29
def cmd_generate_business_id(args):
    """Generate a business document ID from explicitly supplied fields.

    Dispatches on ``args.type`` ('invoice', 'receipt' or 'contract') to the
    matching generator function and prints the resulting ID on stdout.

    Args:
        args: Parsed CLI namespace providing ``type``, ``nip``, ``number``,
            ``date``, ``amount``, ``register`` and ``party2_nip``.

    Returns:
        0 when an ID was printed; 1 when the type is unknown or the
        generator raised.
    """
    try:
        if args.type == 'invoice':
            doc_id = generate_invoice_id(
                seller_nip=args.nip,
                invoice_number=args.number,
                issue_date=args.date,
                gross_amount=args.amount,
            )
        elif args.type == 'receipt':
            doc_id = generate_receipt_id(
                seller_nip=args.nip,
                receipt_date=args.date,
                gross_amount=args.amount,
                cash_register_number=args.register,
            )
        elif args.type == 'contract':
            doc_id = generate_contract_id(
                party1_nip=args.nip,
                party2_nip=args.party2_nip,
                contract_date=args.date,
                contract_number=args.number,
            )
        else:
            print(f"❌ Nieznany typ dokumentu: {args.type}", file=sys.stderr)
            return 1
    except Exception as e:
        # Several of the fields passed above may be None; report a failing
        # generator the same way the other commands do (message on stderr,
        # exit code 1) instead of letting a traceback escape.
        print(f"❌ Błąd generowania ID: {e}", file=sys.stderr)
        return 1

    print(doc_id)
    return 0
58
+
59
+
60
def cmd_generate_universal_id(args):
    """Print the universal document ID computed for ``args.file``.

    Returns:
        0 after the ID has been printed, or 1 (with the error reported on
        stderr) when generating or printing it raises.
    """
    try:
        print(generate_universal_document_id(args.file))
    except Exception as exc:
        print(f"❌ Błąd generowania ID: {exc}", file=sys.stderr)
        return 1
    return 0
69
+
70
+
71
def _ocr_text_preview(text, limit=500):
    """Return *text* cut to *limit* characters plus '...' when it is longer."""
    if text and len(text) > limit:
        return text[:limit] + '...'
    return text


def _print_extraction_summary(extraction):
    """Print the non-empty extracted fields in the human readable layout."""
    print("\n📊 Wyekstrahowane dane:")
    labelled_values = (
        (" NIP sprzedawcy", extraction.issuer_nip),
        (" NIP nabywcy", extraction.buyer_nip),
        (" Numer faktury", extraction.invoice_number),
        (" Data", extraction.document_date),
        (" Kwota brutto", extraction.gross_amount),
        (" Kasa fiskalna", extraction.cash_register_number),
        (" Numer umowy", extraction.contract_number),
    )
    for label, value in labelled_values:
        if value:
            print(f"{label}: {value}")
    # The contract parties line needs both NIPs to be present.
    if extraction.issuer_nip and extraction.party2_nip:
        print(f" Strony umowy: {extraction.issuer_nip} ↔ {extraction.party2_nip}")


def cmd_process_document(args):
    """Run the OCR/extraction pipeline on one file and print the result.

    Args:
        args: Parsed CLI namespace providing ``file``, ``ocr``, ``format``
            ('json' or anything else for text) and ``verbose``.

    Returns:
        0 on success; 1 when any step raises (the error goes to stderr).
    """
    try:
        # Every value other than 'tesseract' (including the default 'auto')
        # selects the Paddle engine.
        ocr_engine = OCREngine.TESSERACT if args.ocr == 'tesseract' else OCREngine.PADDLE

        pipeline = DocumentPipeline(ocr_engine=ocr_engine)
        result = pipeline.process(args.file)

        # Computed once and shared by both output formats.
        ocr_text = result.ocr_result.full_text if result.ocr_result else ""

        if args.format == 'json':
            output = {
                'document_id': result.document_id,
                'document_type': result.document_type.value if result.document_type else None,
                'confidence': result.ocr_confidence,
                'extraction': None,
                'ocr_text': _ocr_text_preview(ocr_text),
            }

            extraction = result.extraction
            if extraction:
                output['extraction'] = {
                    'issuer_nip': extraction.issuer_nip,
                    'buyer_nip': extraction.buyer_nip,
                    'invoice_number': extraction.invoice_number,
                    'issue_date': extraction.document_date,
                    'gross_amount': extraction.gross_amount,
                    'net_amount': extraction.net_amount,
                    'vat_amount': extraction.vat_amount,
                    'cash_register_number': extraction.cash_register_number,
                    'contract_number': extraction.contract_number,
                    # Contract fields reuse the issuer NIP and document date.
                    'party1_nip': extraction.issuer_nip,
                    'party2_nip': extraction.party2_nip,
                    'contract_date': extraction.document_date,
                }

            print(json.dumps(output, indent=2, ensure_ascii=False))
        else:
            # Human readable format
            print(f"📄 Dokument: {args.file}")
            print(f"🆔 ID: {result.document_id}")
            if result.document_type:
                print(f"📋 Typ: {result.document_type.value}")
            print(f"🎯 Pewność OCR: {result.ocr_confidence:.2%}")
            if args.verbose:
                print(f"🔗 Canonical: {result.canonical_string}")

            if result.extraction:
                _print_extraction_summary(result.extraction)

            if args.verbose and ocr_text:
                print(f"\n📝 Tekst OCR:\n{ocr_text}")

        return 0
    except Exception as e:
        print(f"❌ Błąd przetwarzania: {e}", file=sys.stderr)
        return 1
153
+
154
+
155
def cmd_verify_id(args):
    """Check whether ``args.id`` matches the ID derived from ``args.file``.

    With ``args.universal`` set the universal verifier is used; otherwise the
    file is run through the shared pipeline and its business ID is compared.

    Returns:
        0 when the ID matches, 1 when it does not or when verification raises.
    """
    try:
        if not args.universal:
            # Use pipeline to verify business ID
            processed = get_pipeline().process(args.file)
            matches = processed.document_id == args.id
        else:
            matches = verify_universal_document_id(args.file, args.id)

        if matches:
            print("✅ Poprawny")
        else:
            print("❌ Niepoprawny")
        return 0 if matches else 1
    except Exception as exc:
        print(f"❌ Błąd weryfikacji: {exc}", file=sys.stderr)
        return 1
171
+
172
+
173
def cmd_compare_documents(args):
    """Compare two documents by universal features and by business ID.

    The universal comparison dict is extended with the business IDs and
    canonical strings produced by the pipeline, then printed either as JSON
    or as a human readable report.

    Args:
        args: Parsed CLI namespace providing ``file1``, ``file2`` and
            ``format`` ('json' or anything else for text).

    Returns:
        0 when the comparison was printed (regardless of its outcome);
        1 when any step raises.
    """
    from itertools import zip_longest

    try:
        # 1. Universal comparison
        comparison = compare_universal_documents(args.file1, args.file2)

        # 2. Business ID comparison (using pipeline)
        pipeline = get_pipeline()
        res1 = pipeline.process(args.file1)
        res2 = pipeline.process(args.file2)

        same_business_id = res1.document_id == res2.document_id
        comparison['business_id1'] = res1.document_id
        comparison['business_id2'] = res2.document_id
        comparison['canonical1'] = res1.canonical_string
        comparison['canonical2'] = res2.canonical_string
        comparison['identical_business_ids'] = same_business_id

        if args.format == 'json':
            print(json.dumps(comparison, indent=2, ensure_ascii=False))
            return 0

        print(f"📄 Porównanie dokumentów:")
        print(f" Plik 1: {args.file1}")
        print(f" Plik 2: {args.file2}")

        print(f"\n✨ WYNIK: {'✅ DOKUMENTY IDENTYCZNE' if same_business_id else '❌ DOKUMENTY RÓŻNE'}")

        print(f"\n🏢 Identyfikatory Biznesowe (OCR - spójne między formatami):")
        print(f" Identyczne: {'✅' if same_business_id else '❌'}")
        print(f" ID1: {res1.document_id}")
        print(f" ID2: {res2.document_id}")

        if not same_business_id:
            print(f"\n🔍 Analiza różnic (Dane kanoniczne):")
            # A missing canonical string is shown as all-"BRAK" fields rather
            # than aborting the report half-way with an AttributeError.
            fields1 = res1.canonical_string.split('|') if res1.canonical_string is not None else []
            fields2 = res2.canonical_string.split('|') if res2.canonical_string is not None else []
            labels = ["NIP", "Numer", "Data", "Kwota", "Dodatkowe"]

            paired = zip_longest(fields1, fields2, fillvalue="BRAK")
            for index, (val1, val2) in enumerate(paired):
                label = labels[index] if index < len(labels) else f"Pole {index+1}"
                status = "✅" if val1 == val2 else "❌"
                print(f" {status} {label:10}: {val1} vs {val2}")

        if res1.document_type:
            print(f"\n📋 Typ: {res1.document_type.value}")

        print(f"\n🌍 Identyfikatory Uniwersalne (Cechy pliku - czułe na format):")
        print(f" Identyczne: {'✅' if comparison['identical_ids'] else '❌'}")
        print(f" ID1: {comparison['id1']}")
        print(f" ID2: {comparison['id2']}")

        print(f"\n📊 Szczegóły techniczne:")
        print(f" Ten sam typ pliku: {'✅' if comparison['same_type'] else '❌'}")
        print(f" Ten sam rozmiar: {'✅' if comparison['same_size'] else '❌'}")
        print(f" Ten sam hash treści: {'✅' if comparison['same_content_hash'] else '❌'}")
        # These two hashes are optional in the comparison result.
        if comparison.get('same_visual_hash') is not None:
            print(f" Ten sam hash wizualny: {'✅' if comparison['same_visual_hash'] else '❌'}")
        if comparison.get('same_text_hash') is not None:
            print(f" Ten sam hash tekstu: {'✅' if comparison['same_text_hash'] else '❌'}")

        return 0
    except Exception as e:
        print(f"❌ Błąd porównywania: {e}", file=sys.stderr)
        return 1
239
+
240
+
241
def cmd_batch_process(args):
    """Process every file in a directory and optionally report duplicates.

    Args:
        args: Parsed CLI namespace providing ``directory``, ``recursive``,
            ``ocr``, ``verbose``, ``continue_on_error``, ``duplicates`` and
            ``output`` (path of a JSON report, or a falsy value).

    Returns:
        0 when the batch finished; 1 when the directory yields no files, a
        file fails without ``continue_on_error``, or any other step raises.
    """
    try:
        # Get files
        root = Path(args.directory)
        candidates = root.rglob("*") if args.recursive else root.glob("*")
        # Sorted so the processing order and the saved report do not depend
        # on the order in which the filesystem lists entries.
        files = sorted(path for path in candidates if path.is_file())

        if not files:
            print(f"❌ Brak plików w folderze: {args.directory}", file=sys.stderr)
            return 1

        # Every value other than 'tesseract' (including the default 'auto')
        # selects the Paddle engine.
        ocr_engine = OCREngine.TESSERACT if args.ocr == 'tesseract' else OCREngine.PADDLE

        total = len(files)
        print(f"📁 Przetwarzanie {total} plików z {args.directory}")

        # Process files
        pipeline = DocumentPipeline(ocr_engine=ocr_engine)
        results = []

        for i, file_path in enumerate(files, 1):
            try:
                # No newline yet: the ID (or the error) completes this line.
                print(f"\n[{i}/{total}] 📄 {file_path.name}", end="")

                result = pipeline.process(str(file_path))
                results.append(result)

                print(f" → {result.document_id}")

                if args.verbose and result.extraction:
                    print(f" 📊 {result.document_type.value if result.document_type else 'Unknown'}")

            except Exception as e:
                print(f" ❌ Błąd: {e}")
                if not args.continue_on_error:
                    return 1

        # Summary
        print(f"\n✅ Przetworzono: {len(results)} plików")

        if args.duplicates:
            # Group source files by the ID they produced.
            sources_by_id = {}
            for result in results:
                sources_by_id.setdefault(result.document_id, []).append(result.source_file)

            duplicates = {
                doc_id: sources
                for doc_id, sources in sources_by_id.items()
                if len(sources) > 1
            }

            if duplicates:
                print(f"\n🔍 Znalezione duplikaty ({len(duplicates)} grup):")
                for doc_id, sources in duplicates.items():
                    print(f" {doc_id}:")
                    for source in sources:
                        print(f" - {source}")
            else:
                print(f"\n✅ Brak duplikatów")

        # Save results if requested
        if args.output:
            output_data = [
                {
                    'file': result.source_file,
                    'id': result.document_id,
                    'type': result.document_type.value if result.document_type else None,
                    'confidence': result.ocr_confidence,
                }
                for result in results
            ]

            with open(args.output, 'w', encoding='utf-8') as f:
                json.dump(output_data, f, indent=2, ensure_ascii=False)

            print(f"\n💾 Zapisano wyniki do: {args.output}")

        return 0
    except Exception as e:
        print(f"❌ Błąd przetwarzania wsadowego: {e}", file=sys.stderr)
        return 1
329
+
330
+
331
def cmd_analyze_file(args):
    """Print the universal feature set extracted from ``args.file``.

    With ``args.format == 'json'`` the features are dumped as a JSON object;
    otherwise a human readable summary is printed, skipping empty optional
    values.

    Returns:
        0 on success; 1 when feature extraction or printing raises.
    """
    try:
        features = UniversalDocumentIDGenerator().get_document_features(args.file)

        if args.format == 'json':
            report = {
                'file': str(args.file),
                'type': features.file_type,
                'size': features.file_size,
            }
            # The remaining keys carry the same name as the feature attribute.
            for name in (
                'content_hash',
                'visual_hash',
                'text_hash',
                'metadata_hash',
                'structure_hash',
                'color_profile_hash',
                'dimensions',
                'page_count',
                'creation_time',
                'modification_time',
            ):
                report[name] = getattr(features, name)
            print(json.dumps(report, indent=2, ensure_ascii=False))
            return 0

        print(f"📄 Analiza pliku: {args.file}")
        print(f"\n📊 Podstawowe informacje:")
        print(f" Typ: {features.file_type}")
        print(f" Rozmiar: {features.file_size}B")

        dims = features.dimensions
        if dims:
            print(f" Wymiary: {dims[0]} × {dims[1]}")

        pages = features.page_count
        if pages:
            print(f" Stron: {pages}")

        print(f"\n🔐 Hashy:")
        print(f" Treści: {features.content_hash}")
        # Optional hashes are listed only when they have a value.
        for label, attribute in (
            ("Wizualny", 'visual_hash'),
            ("Tekstu", 'text_hash'),
            ("Metadanych", 'metadata_hash'),
            ("Kolorów", 'color_profile_hash'),
        ):
            value = getattr(features, attribute)
            if value:
                print(f" {label}: {value}")

        return 0
    except Exception as exc:
        print(f"❌ Błąd analizy: {exc}", file=sys.stderr)
        return 1
381
+
382
+
383
def cmd_test_determinism(args):
    """Generate the ID of one file repeatedly and check all runs agree.

    Args:
        args: Parsed CLI namespace providing ``file``, ``iterations``,
            ``universal``, ``ocr`` and ``verbose``.

    Returns:
        0 when every iteration produced the same ID; 1 when the IDs differ,
        when ``iterations`` is smaller than 1, or when generation raises.
    """
    try:
        # Without at least one run there is no ID to compare or display.
        if args.iterations < 1:
            print("❌ Liczba iteracji musi być większa od zera", file=sys.stderr)
            return 1

        print(f"🧪 Testowanie determinizmu dla: {args.file}")
        print(f"🔄 Liczba iteracji: {args.iterations}")

        ocr_engine = None
        if not args.universal:
            # Resolved once; every value other than 'tesseract' (including
            # the default 'auto') selects the Paddle engine.
            ocr_engine = OCREngine.TESSERACT if args.ocr == 'tesseract' else OCREngine.PADDLE

        ids = []

        for i in range(args.iterations):
            if args.universal:
                doc_id = generate_universal_document_id(args.file)
            else:
                doc_id = process_document(args.file, ocr_engine=ocr_engine).document_id

            ids.append(doc_id)

            # Non-verbose runs show the first and last five results with a
            # single "..." marker in between.
            if args.verbose or i < 5 or i >= args.iterations - 5:
                print(f" {i+1:3d}. {doc_id}")
            elif i == 5:
                print(f" ...")

        # Check results
        unique_count = len(set(ids))
        all_same = unique_count == 1

        print(f"\n📊 Wyniki:")
        print(f" Wszystkie identyczne: {'✅' if all_same else '❌'}")
        print(f" Unikalnych ID: {unique_count}")
        print(f" ID: {ids[0]}")

        if all_same:
            print(f"\n✅ {args.file} jest 100% deterministyczny!")
        else:
            print(f"\n❌ {args.file} NIE jest deterministyczny!")

        return 0 if all_same else 1
    except Exception as e:
        print(f"❌ Błąd testu: {e}", file=sys.stderr)
        return 1
428
+
429
+
430
def _installed_version(default='0.1.0'):
    """Return the installed package version, or *default* when unavailable."""
    try:
        from importlib.metadata import PackageNotFoundError, version
    except ImportError:
        # importlib.metadata is not available on this interpreter.
        return default
    try:
        # NOTE(review): assumes the distribution is published as 'docid'
        # (same as the CLI program name) — confirm against packaging metadata.
        return version('docid')
    except PackageNotFoundError:
        # Running from a source checkout that has not been installed.
        return default


def _build_parser():
    """Create the top-level parser with one sub-parser per CLI command."""
    parser = argparse.ArgumentParser(
        prog='docid',
        description='EXEF Document ID Generator - CLI'
    )

    # Report the installed version instead of a hard-coded string that
    # drifts out of date with every release.
    parser.add_argument('--version', action='version', version=f'%(prog)s {_installed_version()}')

    subparsers = parser.add_subparsers(dest='command', help='Dostępne komendy')

    # Generate business ID
    parser_gen = subparsers.add_parser('generate', help='Generuj ID dokumentu biznesowego')
    parser_gen.add_argument('type', choices=['invoice', 'receipt', 'contract'], help='Typ dokumentu')
    parser_gen.add_argument('--nip', required=True, help='NIP sprzedawcy/strony')
    parser_gen.add_argument('--number', help='Numer dokumentu')
    parser_gen.add_argument('--date', required=True, help='Data dokumentu')
    parser_gen.add_argument('--amount', type=float, help='Kwota')
    parser_gen.add_argument('--register', help='Numer kasy fiskalnej (dla paragonów)')
    parser_gen.add_argument('--party2-nip', help='NIP drugiej strony (dla umów)')
    parser_gen.set_defaults(func=cmd_generate_business_id)

    # Generate universal ID
    parser_univ = subparsers.add_parser('universal', help='Generuj uniwersalne ID dokumentu')
    parser_univ.add_argument('file', help='Ścieżka do pliku')
    parser_univ.set_defaults(func=cmd_generate_universal_id)

    # Process document
    parser_proc = subparsers.add_parser('process', help='Przetwarzaj dokument z OCR')
    parser_proc.add_argument('file', help='Ścieżka do pliku')
    parser_proc.add_argument('--format', choices=['text', 'json'], default='text', help='Format wyjściowy')
    parser_proc.add_argument('--ocr', choices=['paddle', 'tesseract', 'auto'], default='auto', help='Silnik OCR (domyślnie auto)')
    parser_proc.add_argument('-v', '--verbose', action='store_true', help='Szczegółowe informacje')
    parser_proc.set_defaults(func=cmd_process_document)

    # Verify ID
    parser_verify = subparsers.add_parser('verify', help='Weryfikuj ID dokumentu')
    parser_verify.add_argument('file', help='Ścieżka do pliku')
    parser_verify.add_argument('id', help='ID do weryfikacji')
    parser_verify.add_argument('--universal', action='store_true', help='Uniwersalne ID')
    parser_verify.set_defaults(func=cmd_verify_id)

    # Compare documents
    parser_compare = subparsers.add_parser('compare', help='Porównaj dwa dokumenty')
    parser_compare.add_argument('file1', help='Pierwszy plik')
    parser_compare.add_argument('file2', help='Drugi plik')
    parser_compare.add_argument('--format', choices=['text', 'json'], default='text', help='Format wyjściowy')
    parser_compare.set_defaults(func=cmd_compare_documents)

    # Batch process
    parser_batch = subparsers.add_parser('batch', help='Przetwarzaj wsadowe dokumenty')
    parser_batch.add_argument('directory', help='Folder z dokumentami')
    parser_batch.add_argument('--output', '-o', help='Plik wyjściowy (JSON)')
    parser_batch.add_argument('--ocr', choices=['paddle', 'tesseract', 'auto'], default='auto', help='Silnik OCR (domyślnie auto)')
    parser_batch.add_argument('--recursive', '-r', action='store_true', help='Przetwarzaj rekurencyjnie')
    parser_batch.add_argument('--duplicates', '-d', action='store_true', help='Pokaż duplikaty')
    parser_batch.add_argument('--continue-on-error', action='store_true', help='Kontynuuj przy błędach')
    parser_batch.add_argument('-v', '--verbose', action='store_true', help='Szczegółowe informacje')
    parser_batch.set_defaults(func=cmd_batch_process)

    # Analyze file
    parser_analyze = subparsers.add_parser('analyze', help='Analizuj cechy pliku')
    parser_analyze.add_argument('file', help='Ścieżka do pliku')
    parser_analyze.add_argument('--format', choices=['text', 'json'], default='text', help='Format wyjściowy')
    parser_analyze.set_defaults(func=cmd_analyze_file)

    # Test determinism
    parser_test = subparsers.add_parser('test', help='Test determinizmu ID')
    parser_test.add_argument('file', help='Ścieżka do pliku')
    parser_test.add_argument('--iterations', '-n', type=int, default=10, help='Liczba iteracji')
    parser_test.add_argument('--universal', action='store_true', help='Uniwersalne ID')
    parser_test.add_argument('--ocr', choices=['paddle', 'tesseract', 'auto'], default='auto', help='Silnik OCR (domyślnie auto)')
    parser_test.add_argument('-v', '--verbose', action='store_true', help='Pokaż wszystkie iteracje')
    parser_test.set_defaults(func=cmd_test_determinism)

    return parser


def main(argv=None):
    """Main CLI entry point.

    Args:
        argv: Optional list of argument strings. When None (the default)
            argparse reads the process arguments, so existing callers that
            invoke ``main()`` without arguments are unaffected.

    Returns:
        The exit code of the selected command, or 1 (after printing the
        help text) when no command was given.
    """
    parser = _build_parser()

    # Parse arguments
    args = parser.parse_args(argv)

    if not args.command:
        parser.print_help()
        return 1

    # Execute command
    return args.func(args)
514
+
515
+
516
# Script entry point: propagate the command's return value as the exit status.
if __name__ == '__main__':
    raise SystemExit(main())