docid 0.1.4__tar.gz → 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docid-0.1.4 → docid-0.1.5}/PKG-INFO +44 -44
- {docid-0.1.4 → docid-0.1.5}/README.md +43 -43
- {docid-0.1.4/exef_docid → docid-0.1.5/docid}/__init__.py +2 -2
- {docid-0.1.4 → docid-0.1.5}/docid.egg-info/PKG-INFO +44 -44
- {docid-0.1.4 → docid-0.1.5}/docid.egg-info/SOURCES.txt +9 -9
- docid-0.1.5/docid.egg-info/entry_points.txt +3 -0
- docid-0.1.5/docid.egg-info/top_level.txt +1 -0
- {docid-0.1.4 → docid-0.1.5}/pyproject.toml +4 -4
- {docid-0.1.4 → docid-0.1.5}/setup.py +2 -2
- {docid-0.1.4 → docid-0.1.5}/tests/test_document_id.py +10 -10
- {docid-0.1.4 → docid-0.1.5}/tests/test_extractors.py +7 -7
- {docid-0.1.4 → docid-0.1.5}/tests/test_samples_id.py +4 -4
- docid-0.1.4/docid.egg-info/entry_points.txt +0 -3
- docid-0.1.4/docid.egg-info/top_level.txt +0 -1
- {docid-0.1.4/exef_docid → docid-0.1.5/docid}/cli.py +0 -0
- {docid-0.1.4/exef_docid → docid-0.1.5/docid}/cli_universal.py +0 -0
- {docid-0.1.4/exef_docid → docid-0.1.5/docid}/document_id.py +0 -0
- {docid-0.1.4/exef_docid → docid-0.1.5/docid}/document_id_universal.py +0 -0
- {docid-0.1.4/exef_docid → docid-0.1.5/docid}/extractors/__init__.py +0 -0
- {docid-0.1.4/exef_docid → docid-0.1.5/docid}/extractors/base.py +0 -0
- {docid-0.1.4/exef_docid → docid-0.1.5/docid}/ocr_processor.py +0 -0
- {docid-0.1.4/exef_docid → docid-0.1.5/docid}/pipeline.py +0 -0
- {docid-0.1.4 → docid-0.1.5}/docid.egg-info/dependency_links.txt +0 -0
- {docid-0.1.4 → docid-0.1.5}/docid.egg-info/not-zip-safe +0 -0
- {docid-0.1.4 → docid-0.1.5}/docid.egg-info/requires.txt +0 -0
- {docid-0.1.4 → docid-0.1.5}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docid
|
|
3
|
-
Version: 0.1.4
|
|
3
|
+
Version: 0.1.5
|
|
4
4
|
Summary: Deterministyczny generator identyfikatorów dokumentów z OCR
|
|
5
5
|
Home-page: https://github.com/softreck/docid
|
|
6
6
|
Author: Softreck
|
|
@@ -55,7 +55,7 @@ Dynamic: author
|
|
|
55
55
|
Dynamic: home-page
|
|
56
56
|
Dynamic: requires-python
|
|
57
57
|
|
|
58
|
-
#
|
|
58
|
+
# DOC Document ID Generator
|
|
59
59
|
|
|
60
60
|
Deterministyczny generator identyfikatorów dokumentów z OCR. Generuje **zawsze ten sam ID** dla tego samego dokumentu, niezależnie od formatu źródłowego (skan, PDF, KSeF XML, obrazy).
|
|
61
61
|
|
|
@@ -71,12 +71,12 @@ Jak uzyskać **ten sam identyfikator** dla wszystkich trzech?
|
|
|
71
71
|
## ✨ Rozwiązanie
|
|
72
72
|
|
|
73
73
|
```python
|
|
74
|
-
from
|
|
74
|
+
from docid import get_document_id
|
|
75
75
|
|
|
76
76
|
# Wszystkie trzy zwrócą TEN SAM ID!
|
|
77
|
-
get_document_id("faktura_skan.jpg") #
|
|
78
|
-
get_document_id("faktura.pdf") #
|
|
79
|
-
get_document_id("faktura_ksef.xml") # EXEF-FV-A7B3C9D2E1F04856
|
|
77
|
+
get_document_id("faktura_skan.jpg") # DOC-FV-A7B3C9D2E1F04856
|
|
78
|
+
get_document_id("faktura.pdf") # DOC-FV-A7B3C9D2E1F04856
|
|
79
|
+
get_document_id("faktura_ksef.xml") # DOC-FV-A7B3C9D2E1F04856
|
|
80
80
|
```
|
|
81
81
|
|
|
82
82
|
## 📦 Instalacja
|
|
@@ -85,8 +85,8 @@ get_document_id("faktura_ksef.xml") # EXEF-FV-A7B3C9D2E1F04856
|
|
|
85
85
|
|
|
86
86
|
```bash
|
|
87
87
|
# Klonuj repozytorium
|
|
88
|
-
git clone https://github.com/softreck/
|
|
89
|
-
cd
|
|
88
|
+
git clone https://github.com/softreck/doc-pl.git
|
|
89
|
+
cd doc-pl/app/docid
|
|
90
90
|
|
|
91
91
|
# Utwórz środowisko wirtualne
|
|
92
92
|
python3 -m venv venv
|
|
@@ -124,7 +124,7 @@ pip install docid[all]
|
|
|
124
124
|
### Generator ID dla dokumentów biznesowych
|
|
125
125
|
|
|
126
126
|
```python
|
|
127
|
-
from
|
|
127
|
+
from docid import generate_invoice_id, generate_receipt_id, generate_contract_id
|
|
128
128
|
|
|
129
129
|
# Faktura VAT
|
|
130
130
|
invoice_id = generate_invoice_id(
|
|
@@ -133,7 +133,7 @@ invoice_id = generate_invoice_id(
|
|
|
133
133
|
issue_date="2025-01-15",
|
|
134
134
|
gross_amount=1230.50
|
|
135
135
|
)
|
|
136
|
-
print(invoice_id) #
|
|
136
|
+
print(invoice_id) # DOC-FV-F0BE35240C77B2DB
|
|
137
137
|
|
|
138
138
|
# Paragon fiskalny
|
|
139
139
|
receipt_id = generate_receipt_id(
|
|
@@ -142,7 +142,7 @@ receipt_id = generate_receipt_id(
|
|
|
142
142
|
gross_amount=37.88,
|
|
143
143
|
cash_register_number="001"
|
|
144
144
|
)
|
|
145
|
-
print(receipt_id) #
|
|
145
|
+
print(receipt_id) # DOC-PAR-8142B3FC69D7778C
|
|
146
146
|
|
|
147
147
|
# Umowa
|
|
148
148
|
contract_id = generate_contract_id(
|
|
@@ -151,13 +151,13 @@ contract_id = generate_contract_id(
|
|
|
151
151
|
contract_date="2025-01-15",
|
|
152
152
|
contract_number="001/2025"
|
|
153
153
|
)
|
|
154
|
-
print(contract_id) #
|
|
154
|
+
print(contract_id) # DOC-UMO-C54CB968D1342642
|
|
155
155
|
```
|
|
156
156
|
|
|
157
157
|
### Uniwersalny generator ID (dowolne dokumenty)
|
|
158
158
|
|
|
159
159
|
```python
|
|
160
|
-
from
|
|
160
|
+
from docid import generate_universal_document_id
|
|
161
161
|
|
|
162
162
|
# Dowolny dokument
|
|
163
163
|
doc_id = generate_universal_document_id("dokument.pdf")
|
|
@@ -175,20 +175,20 @@ print(doc_id) # UNIV-IMG-E2E2131A335F0918
|
|
|
175
175
|
### Pełne przetwarzanie z OCR
|
|
176
176
|
|
|
177
177
|
```python
|
|
178
|
-
from
|
|
178
|
+
from docid import process_document, get_document_id
|
|
179
179
|
|
|
180
180
|
# Pełne przetwarzanie z ekstrakcją danych
|
|
181
181
|
result = process_document("faktura.pdf")
|
|
182
|
-
print(result.document_id) #
|
|
182
|
+
print(result.document_id) # DOC-FV-F0BE35240C77B2DB
|
|
183
183
|
print(result.extraction.issuer_nip) # 5213017228
|
|
184
184
|
print(result.extraction.invoice_number) # FV/2025/00142
|
|
185
185
|
|
|
186
186
|
# Tylko wygeneruj ID
|
|
187
187
|
doc_id = get_document_id("paragon.jpg")
|
|
188
|
-
print(doc_id) #
|
|
188
|
+
print(doc_id) # DOC-PAR-8142B3FC69D7778C
|
|
189
189
|
|
|
190
190
|
# Weryfikacja ID
|
|
191
|
-
is_valid = verify_document_id("skan.png", "
|
|
191
|
+
is_valid = verify_document_id("skan.png", "DOC-FV-F0BE35240C77B2DB")
|
|
192
192
|
print(is_valid) # True/False
|
|
193
193
|
```
|
|
194
194
|
|
|
@@ -250,7 +250,7 @@ curl -X POST -F "file=@faktura.pdf" http://localhost:8000/process
|
|
|
250
250
|
|
|
251
251
|
**2. Weryfikacja ID:**
|
|
252
252
|
```bash
|
|
253
|
-
curl -X POST -F "file=@skan.jpg" -F "document_id=
|
|
253
|
+
curl -X POST -F "file=@skan.jpg" -F "document_id=DOC-FV-F0BE35240C77B2DB" http://localhost:8000/verify
|
|
254
254
|
```
|
|
255
255
|
|
|
256
256
|
**3. Porównywanie plików:**
|
|
@@ -303,7 +303,7 @@ make run-web # Uruchom serwer API
|
|
|
303
303
|
### 1. Przetwarzanie faktur
|
|
304
304
|
|
|
305
305
|
```python
|
|
306
|
-
from
|
|
306
|
+
from docid import process_document
|
|
307
307
|
|
|
308
308
|
# Przetwarzanie faktury PDF
|
|
309
309
|
result = process_document("faktura.pdf")
|
|
@@ -319,7 +319,7 @@ print(f"ID: {result.document_id}")
|
|
|
319
319
|
### 2. Porównywanie dokumentów
|
|
320
320
|
|
|
321
321
|
```python
|
|
322
|
-
from
|
|
322
|
+
from docid import compare_universal_documents
|
|
323
323
|
|
|
324
324
|
# Porównaj dwa dokumenty
|
|
325
325
|
comparison = compare_universal_documents("dokument1.pdf", "dokument2.png")
|
|
@@ -331,10 +331,10 @@ print(f"Ten sam rozmiar: {comparison['same_size']}")
|
|
|
331
331
|
### 3. Weryfikacja ID
|
|
332
332
|
|
|
333
333
|
```python
|
|
334
|
-
from
|
|
334
|
+
from docid import verify_document_id, verify_universal_document_id
|
|
335
335
|
|
|
336
336
|
# Weryfikacja ID dokumentu biznesowego
|
|
337
|
-
is_valid = verify_document_id("faktura.pdf", "
|
|
337
|
+
is_valid = verify_document_id("faktura.pdf", "DOC-FV-F0BE35240C77B2DB")
|
|
338
338
|
|
|
339
339
|
# Weryfikacja uniwersalnego ID
|
|
340
340
|
is_valid = verify_universal_document_id("dowolny_plik.jpg", "UNIV-IMG-4225A473A725978D")
|
|
@@ -344,7 +344,7 @@ is_valid = verify_universal_document_id("dowolny_plik.jpg", "UNIV-IMG-4225A473A7
|
|
|
344
344
|
|
|
345
345
|
```python
|
|
346
346
|
from pathlib import Path
|
|
347
|
-
from
|
|
347
|
+
from docid import generate_universal_document_id
|
|
348
348
|
|
|
349
349
|
# Przetwarzaj wszystkie pliki w folderze
|
|
350
350
|
documents_dir = Path("dokumenty")
|
|
@@ -372,16 +372,16 @@ for file_path in documents_dir.glob("*"):
|
|
|
372
372
|
**TAK!** Formaty PNG i JPG są w pełni przetwarzane przez OCR:
|
|
373
373
|
|
|
374
374
|
```python
|
|
375
|
-
from
|
|
375
|
+
from docid import process_document
|
|
376
376
|
|
|
377
377
|
# Przetwarzanie skanu PNG z OCR
|
|
378
378
|
result = process_document("skan_faktury.png")
|
|
379
|
-
print(result.document_id) #
|
|
379
|
+
print(result.document_id) # DOC-FV-F0BE35240C77B2DB
|
|
380
380
|
print(result.extraction.issuer_nip) # 5213017228
|
|
381
381
|
|
|
382
382
|
# Przetwarzanie zdjęcia JPG z OCR
|
|
383
383
|
result = process_document("zdjecie_paragonu.jpg")
|
|
384
|
-
print(result.document_id) # EXEF-PAR-8142B3FC69D7778C
|
|
384
|
+
print(result.document_id) # DOC-PAR-8142B3FC69D7778C
|
|
385
385
|
```
|
|
386
386
|
|
|
387
387
|
#### Co jest ekstrahowane z PNG/JPG:
|
|
@@ -402,13 +402,13 @@ print(result.document_id) # EXEF-PAR-8142B3FC69D7778C
|
|
|
402
402
|
|
|
403
403
|
```python
|
|
404
404
|
# Ten sam dokument w różnych formatach - ten sam ID biznesowy
|
|
405
|
-
generate_invoice_id(...) # ->
|
|
405
|
+
generate_invoice_id(...) # -> DOC-FV-F0BE35240C77B2DB
|
|
406
406
|
|
|
407
407
|
# Przetwarzanie przez OCR daje ten sam wynik
|
|
408
|
-
process_document("faktura.pdf") # ->
|
|
409
|
-
process_document("faktura.png") # ->
|
|
410
|
-
process_document("faktura.jpg") # ->
|
|
411
|
-
process_document("faktura.xml") # ->
|
|
408
|
+
process_document("faktura.pdf") # -> DOC-FV-F0BE35240C77B2DB
|
|
409
|
+
process_document("faktura.png") # -> DOC-FV-F0BE35240C77B2DB
|
|
410
|
+
process_document("faktura.jpg") # -> DOC-FV-F0BE35240C77B2DB
|
|
411
|
+
process_document("faktura.xml") # -> DOC-FV-F0BE35240C77B2DB
|
|
412
412
|
|
|
413
413
|
# Różne ID uniwersalne dla różnych formatów
|
|
414
414
|
generate_universal_document_id("faktura.pdf") # -> UNIV-PDF-...
|
|
@@ -451,25 +451,25 @@ TEST WSZYSTKICH FORMATÓW - PDF, PNG, JPG, HTML, TXT, XML
|
|
|
451
451
|
================================================================================
|
|
452
452
|
|
|
453
453
|
FOLDER: FAKTURY (invoices/)
|
|
454
|
-
📄 faktura_full.pdf (.pdf ) [ 2242B] ->
|
|
455
|
-
📄 faktura_full.xml (.xml ) [ 2077B] ->
|
|
456
|
-
📄 faktura_full.html (.html) [ 3334B] ->
|
|
457
|
-
📄 faktura_full.jpg (.jpg ) [ 28182B] ->
|
|
458
|
-
📄 faktura_full.png (.png ) [ 32325B] ->
|
|
459
|
-
📄 faktura_full.txt (.txt ) [ 2839B] ->
|
|
454
|
+
📄 faktura_full.pdf (.pdf ) [ 2242B] -> DOC-FV-F0BE35240C77B2DB
|
|
455
|
+
📄 faktura_full.xml (.xml ) [ 2077B] -> DOC-FV-F0BE35240C77B2DB
|
|
456
|
+
📄 faktura_full.html (.html) [ 3334B] -> DOC-FV-F0BE35240C77B2DB
|
|
457
|
+
📄 faktura_full.jpg (.jpg ) [ 28182B] -> DOC-FV-F0BE35240C77B2DB
|
|
458
|
+
📄 faktura_full.png (.png ) [ 32325B] -> DOC-FV-F0BE35240C77B2DB
|
|
459
|
+
📄 faktura_full.txt (.txt ) [ 2839B] -> DOC-FV-F0BE35240C77B2DB
|
|
460
460
|
|
|
461
461
|
📊 Podsumowanie folderu invoices:
|
|
462
462
|
Plików przetworzonych: 6
|
|
463
463
|
Unikalnych ID: 1
|
|
464
464
|
Wszystkie identyczne: True
|
|
465
|
-
✅ ID:
|
|
465
|
+
✅ ID: DOC-FV-F0BE35240C77B2DB
|
|
466
466
|
```
|
|
467
467
|
|
|
468
468
|
## 📁 Struktura projektu
|
|
469
469
|
|
|
470
470
|
```
|
|
471
471
|
docid/
|
|
472
|
-
├──
|
|
472
|
+
├── docid/ # Główny pakiet
|
|
473
473
|
│ ├── __init__.py # Eksporty API
|
|
474
474
|
│ ├── document_id.py # Generator ID biznesowy
|
|
475
475
|
│ ├── document_id_universal.py # Generator ID uniwersalny
|
|
@@ -493,7 +493,7 @@ docid/
|
|
|
493
493
|
### Silniki OCR
|
|
494
494
|
|
|
495
495
|
```python
|
|
496
|
-
from
|
|
496
|
+
from docid import OCREngine, get_pipeline
|
|
497
497
|
|
|
498
498
|
# Użyj PaddleOCR (domyślnie)
|
|
499
499
|
pipeline = get_pipeline(ocr_engine=OCREngine.PADDLE)
|
|
@@ -505,7 +505,7 @@ pipeline = get_pipeline(ocr_engine=OCREngine.TESSERACT)
|
|
|
505
505
|
### Custom prefix
|
|
506
506
|
|
|
507
507
|
```python
|
|
508
|
-
from
|
|
508
|
+
from docid import UniversalDocumentIDGenerator
|
|
509
509
|
|
|
510
510
|
generator = UniversalDocumentIDGenerator(prefix="MOJA")
|
|
511
511
|
doc_id = generator.generate_universal_id("plik.pdf")
|
|
@@ -549,8 +549,8 @@ MIT License - zobacz [LICENSE](LICENSE) dla szczegółów.
|
|
|
549
549
|
## 🆘 Wsparcie
|
|
550
550
|
|
|
551
551
|
- 📧 Email: info@softreck.dev
|
|
552
|
-
- 🐛 Issues: [GitHub Issues](https://github.com/softreck/
|
|
553
|
-
- 📖 Dokumentacja: [GitHub Wiki](https://github.com/softreck/
|
|
552
|
+
- 🐛 Issues: [GitHub Issues](https://github.com/softreck/doc-pl/issues)
|
|
553
|
+
- 📖 Dokumentacja: [GitHub Wiki](https://github.com/softreck/doc-pl/wiki)
|
|
554
554
|
|
|
555
555
|
## 🗺️ Roadmap
|
|
556
556
|
|
|
@@ -563,4 +563,4 @@ MIT License - zobacz [LICENSE](LICENSE) dla szczegółów.
|
|
|
563
563
|
|
|
564
564
|
---
|
|
565
565
|
|
|
566
|
-
**
|
|
566
|
+
**DOC Document ID Generator** - Deterministyczne identyfikatory dla każdego dokumentu! 🚀
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
#
|
|
1
|
+
# DOC Document ID Generator
|
|
2
2
|
|
|
3
3
|
Deterministyczny generator identyfikatorów dokumentów z OCR. Generuje **zawsze ten sam ID** dla tego samego dokumentu, niezależnie od formatu źródłowego (skan, PDF, KSeF XML, obrazy).
|
|
4
4
|
|
|
@@ -14,12 +14,12 @@ Jak uzyskać **ten sam identyfikator** dla wszystkich trzech?
|
|
|
14
14
|
## ✨ Rozwiązanie
|
|
15
15
|
|
|
16
16
|
```python
|
|
17
|
-
from
|
|
17
|
+
from docid import get_document_id
|
|
18
18
|
|
|
19
19
|
# Wszystkie trzy zwrócą TEN SAM ID!
|
|
20
|
-
get_document_id("faktura_skan.jpg") #
|
|
21
|
-
get_document_id("faktura.pdf") #
|
|
22
|
-
get_document_id("faktura_ksef.xml") # EXEF-FV-A7B3C9D2E1F04856
|
|
20
|
+
get_document_id("faktura_skan.jpg") # DOC-FV-A7B3C9D2E1F04856
|
|
21
|
+
get_document_id("faktura.pdf") # DOC-FV-A7B3C9D2E1F04856
|
|
22
|
+
get_document_id("faktura_ksef.xml") # DOC-FV-A7B3C9D2E1F04856
|
|
23
23
|
```
|
|
24
24
|
|
|
25
25
|
## 📦 Instalacja
|
|
@@ -28,8 +28,8 @@ get_document_id("faktura_ksef.xml") # EXEF-FV-A7B3C9D2E1F04856
|
|
|
28
28
|
|
|
29
29
|
```bash
|
|
30
30
|
# Klonuj repozytorium
|
|
31
|
-
git clone https://github.com/softreck/
|
|
32
|
-
cd
|
|
31
|
+
git clone https://github.com/softreck/doc-pl.git
|
|
32
|
+
cd doc-pl/app/docid
|
|
33
33
|
|
|
34
34
|
# Utwórz środowisko wirtualne
|
|
35
35
|
python3 -m venv venv
|
|
@@ -67,7 +67,7 @@ pip install docid[all]
|
|
|
67
67
|
### Generator ID dla dokumentów biznesowych
|
|
68
68
|
|
|
69
69
|
```python
|
|
70
|
-
from
|
|
70
|
+
from docid import generate_invoice_id, generate_receipt_id, generate_contract_id
|
|
71
71
|
|
|
72
72
|
# Faktura VAT
|
|
73
73
|
invoice_id = generate_invoice_id(
|
|
@@ -76,7 +76,7 @@ invoice_id = generate_invoice_id(
|
|
|
76
76
|
issue_date="2025-01-15",
|
|
77
77
|
gross_amount=1230.50
|
|
78
78
|
)
|
|
79
|
-
print(invoice_id) #
|
|
79
|
+
print(invoice_id) # DOC-FV-F0BE35240C77B2DB
|
|
80
80
|
|
|
81
81
|
# Paragon fiskalny
|
|
82
82
|
receipt_id = generate_receipt_id(
|
|
@@ -85,7 +85,7 @@ receipt_id = generate_receipt_id(
|
|
|
85
85
|
gross_amount=37.88,
|
|
86
86
|
cash_register_number="001"
|
|
87
87
|
)
|
|
88
|
-
print(receipt_id) #
|
|
88
|
+
print(receipt_id) # DOC-PAR-8142B3FC69D7778C
|
|
89
89
|
|
|
90
90
|
# Umowa
|
|
91
91
|
contract_id = generate_contract_id(
|
|
@@ -94,13 +94,13 @@ contract_id = generate_contract_id(
|
|
|
94
94
|
contract_date="2025-01-15",
|
|
95
95
|
contract_number="001/2025"
|
|
96
96
|
)
|
|
97
|
-
print(contract_id) #
|
|
97
|
+
print(contract_id) # DOC-UMO-C54CB968D1342642
|
|
98
98
|
```
|
|
99
99
|
|
|
100
100
|
### Uniwersalny generator ID (dowolne dokumenty)
|
|
101
101
|
|
|
102
102
|
```python
|
|
103
|
-
from
|
|
103
|
+
from docid import generate_universal_document_id
|
|
104
104
|
|
|
105
105
|
# Dowolny dokument
|
|
106
106
|
doc_id = generate_universal_document_id("dokument.pdf")
|
|
@@ -118,20 +118,20 @@ print(doc_id) # UNIV-IMG-E2E2131A335F0918
|
|
|
118
118
|
### Pełne przetwarzanie z OCR
|
|
119
119
|
|
|
120
120
|
```python
|
|
121
|
-
from
|
|
121
|
+
from docid import process_document, get_document_id
|
|
122
122
|
|
|
123
123
|
# Pełne przetwarzanie z ekstrakcją danych
|
|
124
124
|
result = process_document("faktura.pdf")
|
|
125
|
-
print(result.document_id) #
|
|
125
|
+
print(result.document_id) # DOC-FV-F0BE35240C77B2DB
|
|
126
126
|
print(result.extraction.issuer_nip) # 5213017228
|
|
127
127
|
print(result.extraction.invoice_number) # FV/2025/00142
|
|
128
128
|
|
|
129
129
|
# Tylko wygeneruj ID
|
|
130
130
|
doc_id = get_document_id("paragon.jpg")
|
|
131
|
-
print(doc_id) #
|
|
131
|
+
print(doc_id) # DOC-PAR-8142B3FC69D7778C
|
|
132
132
|
|
|
133
133
|
# Weryfikacja ID
|
|
134
|
-
is_valid = verify_document_id("skan.png", "
|
|
134
|
+
is_valid = verify_document_id("skan.png", "DOC-FV-F0BE35240C77B2DB")
|
|
135
135
|
print(is_valid) # True/False
|
|
136
136
|
```
|
|
137
137
|
|
|
@@ -193,7 +193,7 @@ curl -X POST -F "file=@faktura.pdf" http://localhost:8000/process
|
|
|
193
193
|
|
|
194
194
|
**2. Weryfikacja ID:**
|
|
195
195
|
```bash
|
|
196
|
-
curl -X POST -F "file=@skan.jpg" -F "document_id=
|
|
196
|
+
curl -X POST -F "file=@skan.jpg" -F "document_id=DOC-FV-F0BE35240C77B2DB" http://localhost:8000/verify
|
|
197
197
|
```
|
|
198
198
|
|
|
199
199
|
**3. Porównywanie plików:**
|
|
@@ -246,7 +246,7 @@ make run-web # Uruchom serwer API
|
|
|
246
246
|
### 1. Przetwarzanie faktur
|
|
247
247
|
|
|
248
248
|
```python
|
|
249
|
-
from
|
|
249
|
+
from docid import process_document
|
|
250
250
|
|
|
251
251
|
# Przetwarzanie faktury PDF
|
|
252
252
|
result = process_document("faktura.pdf")
|
|
@@ -262,7 +262,7 @@ print(f"ID: {result.document_id}")
|
|
|
262
262
|
### 2. Porównywanie dokumentów
|
|
263
263
|
|
|
264
264
|
```python
|
|
265
|
-
from
|
|
265
|
+
from docid import compare_universal_documents
|
|
266
266
|
|
|
267
267
|
# Porównaj dwa dokumenty
|
|
268
268
|
comparison = compare_universal_documents("dokument1.pdf", "dokument2.png")
|
|
@@ -274,10 +274,10 @@ print(f"Ten sam rozmiar: {comparison['same_size']}")
|
|
|
274
274
|
### 3. Weryfikacja ID
|
|
275
275
|
|
|
276
276
|
```python
|
|
277
|
-
from
|
|
277
|
+
from docid import verify_document_id, verify_universal_document_id
|
|
278
278
|
|
|
279
279
|
# Weryfikacja ID dokumentu biznesowego
|
|
280
|
-
is_valid = verify_document_id("faktura.pdf", "
|
|
280
|
+
is_valid = verify_document_id("faktura.pdf", "DOC-FV-F0BE35240C77B2DB")
|
|
281
281
|
|
|
282
282
|
# Weryfikacja uniwersalnego ID
|
|
283
283
|
is_valid = verify_universal_document_id("dowolny_plik.jpg", "UNIV-IMG-4225A473A725978D")
|
|
@@ -287,7 +287,7 @@ is_valid = verify_universal_document_id("dowolny_plik.jpg", "UNIV-IMG-4225A473A7
|
|
|
287
287
|
|
|
288
288
|
```python
|
|
289
289
|
from pathlib import Path
|
|
290
|
-
from
|
|
290
|
+
from docid import generate_universal_document_id
|
|
291
291
|
|
|
292
292
|
# Przetwarzaj wszystkie pliki w folderze
|
|
293
293
|
documents_dir = Path("dokumenty")
|
|
@@ -315,16 +315,16 @@ for file_path in documents_dir.glob("*"):
|
|
|
315
315
|
**TAK!** Formaty PNG i JPG są w pełni przetwarzane przez OCR:
|
|
316
316
|
|
|
317
317
|
```python
|
|
318
|
-
from
|
|
318
|
+
from docid import process_document
|
|
319
319
|
|
|
320
320
|
# Przetwarzanie skanu PNG z OCR
|
|
321
321
|
result = process_document("skan_faktury.png")
|
|
322
|
-
print(result.document_id) #
|
|
322
|
+
print(result.document_id) # DOC-FV-F0BE35240C77B2DB
|
|
323
323
|
print(result.extraction.issuer_nip) # 5213017228
|
|
324
324
|
|
|
325
325
|
# Przetwarzanie zdjęcia JPG z OCR
|
|
326
326
|
result = process_document("zdjecie_paragonu.jpg")
|
|
327
|
-
print(result.document_id) # EXEF-PAR-8142B3FC69D7778C
|
|
327
|
+
print(result.document_id) # DOC-PAR-8142B3FC69D7778C
|
|
328
328
|
```
|
|
329
329
|
|
|
330
330
|
#### Co jest ekstrahowane z PNG/JPG:
|
|
@@ -345,13 +345,13 @@ print(result.document_id) # EXEF-PAR-8142B3FC69D7778C
|
|
|
345
345
|
|
|
346
346
|
```python
|
|
347
347
|
# Ten sam dokument w różnych formatach - ten sam ID biznesowy
|
|
348
|
-
generate_invoice_id(...) # ->
|
|
348
|
+
generate_invoice_id(...) # -> DOC-FV-F0BE35240C77B2DB
|
|
349
349
|
|
|
350
350
|
# Przetwarzanie przez OCR daje ten sam wynik
|
|
351
|
-
process_document("faktura.pdf") # ->
|
|
352
|
-
process_document("faktura.png") # ->
|
|
353
|
-
process_document("faktura.jpg") # ->
|
|
354
|
-
process_document("faktura.xml") # ->
|
|
351
|
+
process_document("faktura.pdf") # -> DOC-FV-F0BE35240C77B2DB
|
|
352
|
+
process_document("faktura.png") # -> DOC-FV-F0BE35240C77B2DB
|
|
353
|
+
process_document("faktura.jpg") # -> DOC-FV-F0BE35240C77B2DB
|
|
354
|
+
process_document("faktura.xml") # -> DOC-FV-F0BE35240C77B2DB
|
|
355
355
|
|
|
356
356
|
# Różne ID uniwersalne dla różnych formatów
|
|
357
357
|
generate_universal_document_id("faktura.pdf") # -> UNIV-PDF-...
|
|
@@ -394,25 +394,25 @@ TEST WSZYSTKICH FORMATÓW - PDF, PNG, JPG, HTML, TXT, XML
|
|
|
394
394
|
================================================================================
|
|
395
395
|
|
|
396
396
|
FOLDER: FAKTURY (invoices/)
|
|
397
|
-
📄 faktura_full.pdf (.pdf ) [ 2242B] ->
|
|
398
|
-
📄 faktura_full.xml (.xml ) [ 2077B] ->
|
|
399
|
-
📄 faktura_full.html (.html) [ 3334B] ->
|
|
400
|
-
📄 faktura_full.jpg (.jpg ) [ 28182B] ->
|
|
401
|
-
📄 faktura_full.png (.png ) [ 32325B] ->
|
|
402
|
-
📄 faktura_full.txt (.txt ) [ 2839B] ->
|
|
397
|
+
📄 faktura_full.pdf (.pdf ) [ 2242B] -> DOC-FV-F0BE35240C77B2DB
|
|
398
|
+
📄 faktura_full.xml (.xml ) [ 2077B] -> DOC-FV-F0BE35240C77B2DB
|
|
399
|
+
📄 faktura_full.html (.html) [ 3334B] -> DOC-FV-F0BE35240C77B2DB
|
|
400
|
+
📄 faktura_full.jpg (.jpg ) [ 28182B] -> DOC-FV-F0BE35240C77B2DB
|
|
401
|
+
📄 faktura_full.png (.png ) [ 32325B] -> DOC-FV-F0BE35240C77B2DB
|
|
402
|
+
📄 faktura_full.txt (.txt ) [ 2839B] -> DOC-FV-F0BE35240C77B2DB
|
|
403
403
|
|
|
404
404
|
📊 Podsumowanie folderu invoices:
|
|
405
405
|
Plików przetworzonych: 6
|
|
406
406
|
Unikalnych ID: 1
|
|
407
407
|
Wszystkie identyczne: True
|
|
408
|
-
✅ ID:
|
|
408
|
+
✅ ID: DOC-FV-F0BE35240C77B2DB
|
|
409
409
|
```
|
|
410
410
|
|
|
411
411
|
## 📁 Struktura projektu
|
|
412
412
|
|
|
413
413
|
```
|
|
414
414
|
docid/
|
|
415
|
-
├──
|
|
415
|
+
├── docid/ # Główny pakiet
|
|
416
416
|
│ ├── __init__.py # Eksporty API
|
|
417
417
|
│ ├── document_id.py # Generator ID biznesowy
|
|
418
418
|
│ ├── document_id_universal.py # Generator ID uniwersalny
|
|
@@ -436,7 +436,7 @@ docid/
|
|
|
436
436
|
### Silniki OCR
|
|
437
437
|
|
|
438
438
|
```python
|
|
439
|
-
from
|
|
439
|
+
from docid import OCREngine, get_pipeline
|
|
440
440
|
|
|
441
441
|
# Użyj PaddleOCR (domyślnie)
|
|
442
442
|
pipeline = get_pipeline(ocr_engine=OCREngine.PADDLE)
|
|
@@ -448,7 +448,7 @@ pipeline = get_pipeline(ocr_engine=OCREngine.TESSERACT)
|
|
|
448
448
|
### Custom prefix
|
|
449
449
|
|
|
450
450
|
```python
|
|
451
|
-
from
|
|
451
|
+
from docid import UniversalDocumentIDGenerator
|
|
452
452
|
|
|
453
453
|
generator = UniversalDocumentIDGenerator(prefix="MOJA")
|
|
454
454
|
doc_id = generator.generate_universal_id("plik.pdf")
|
|
@@ -492,8 +492,8 @@ MIT License - zobacz [LICENSE](LICENSE) dla szczegółów.
|
|
|
492
492
|
## 🆘 Wsparcie
|
|
493
493
|
|
|
494
494
|
- 📧 Email: info@softreck.dev
|
|
495
|
-
- 🐛 Issues: [GitHub Issues](https://github.com/softreck/
|
|
496
|
-
- 📖 Dokumentacja: [GitHub Wiki](https://github.com/softreck/
|
|
495
|
+
- 🐛 Issues: [GitHub Issues](https://github.com/softreck/doc-pl/issues)
|
|
496
|
+
- 📖 Dokumentacja: [GitHub Wiki](https://github.com/softreck/doc-pl/wiki)
|
|
497
497
|
|
|
498
498
|
## 🗺️ Roadmap
|
|
499
499
|
|
|
@@ -506,4 +506,4 @@ MIT License - zobacz [LICENSE](LICENSE) dla szczegółów.
|
|
|
506
506
|
|
|
507
507
|
---
|
|
508
508
|
|
|
509
|
-
**
|
|
509
|
+
**DOC Document ID Generator** - Deterministyczne identyfikatory dla każdego dokumentu! 🚀
|
|
@@ -6,7 +6,7 @@ Generuje zawsze ten sam ID dla tego samego dokumentu,
|
|
|
6
6
|
niezależnie od formatu źródłowego (skan, PDF, KSeF XML).
|
|
7
7
|
|
|
8
8
|
Przykład użycia:
|
|
9
|
-
from
|
|
9
|
+
from docid import process_document, get_document_id
|
|
10
10
|
|
|
11
11
|
# Pełne przetwarzanie
|
|
12
12
|
result = process_document("faktura.pdf")
|
|
@@ -27,7 +27,7 @@ Wymagania:
|
|
|
27
27
|
pip install pytesseract pdf2image pillow
|
|
28
28
|
"""
|
|
29
29
|
|
|
30
|
-
__version__ = "0.1.4"
|
|
30
|
+
__version__ = "0.1.5"
|
|
31
31
|
__author__ = "Softreck"
|
|
32
32
|
|
|
33
33
|
# Główne API
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docid
|
|
3
|
-
Version: 0.1.4
|
|
3
|
+
Version: 0.1.5
|
|
4
4
|
Summary: Deterministyczny generator identyfikatorów dokumentów z OCR
|
|
5
5
|
Home-page: https://github.com/softreck/docid
|
|
6
6
|
Author: Softreck
|
|
@@ -55,7 +55,7 @@ Dynamic: author
|
|
|
55
55
|
Dynamic: home-page
|
|
56
56
|
Dynamic: requires-python
|
|
57
57
|
|
|
58
|
-
#
|
|
58
|
+
# DOC Document ID Generator
|
|
59
59
|
|
|
60
60
|
Deterministyczny generator identyfikatorów dokumentów z OCR. Generuje **zawsze ten sam ID** dla tego samego dokumentu, niezależnie od formatu źródłowego (skan, PDF, KSeF XML, obrazy).
|
|
61
61
|
|
|
@@ -71,12 +71,12 @@ Jak uzyskać **ten sam identyfikator** dla wszystkich trzech?
|
|
|
71
71
|
## ✨ Rozwiązanie
|
|
72
72
|
|
|
73
73
|
```python
|
|
74
|
-
from
|
|
74
|
+
from docid import get_document_id
|
|
75
75
|
|
|
76
76
|
# Wszystkie trzy zwrócą TEN SAM ID!
|
|
77
|
-
get_document_id("faktura_skan.jpg") #
|
|
78
|
-
get_document_id("faktura.pdf") #
|
|
79
|
-
get_document_id("faktura_ksef.xml") # EXEF-FV-A7B3C9D2E1F04856
|
|
77
|
+
get_document_id("faktura_skan.jpg") # DOC-FV-A7B3C9D2E1F04856
|
|
78
|
+
get_document_id("faktura.pdf") # DOC-FV-A7B3C9D2E1F04856
|
|
79
|
+
get_document_id("faktura_ksef.xml") # DOC-FV-A7B3C9D2E1F04856
|
|
80
80
|
```
|
|
81
81
|
|
|
82
82
|
## 📦 Instalacja
|
|
@@ -85,8 +85,8 @@ get_document_id("faktura_ksef.xml") # EXEF-FV-A7B3C9D2E1F04856
|
|
|
85
85
|
|
|
86
86
|
```bash
|
|
87
87
|
# Klonuj repozytorium
|
|
88
|
-
git clone https://github.com/softreck/
|
|
89
|
-
cd
|
|
88
|
+
git clone https://github.com/softreck/doc-pl.git
|
|
89
|
+
cd doc-pl/app/docid
|
|
90
90
|
|
|
91
91
|
# Utwórz środowisko wirtualne
|
|
92
92
|
python3 -m venv venv
|
|
@@ -124,7 +124,7 @@ pip install docid[all]
|
|
|
124
124
|
### Generator ID dla dokumentów biznesowych
|
|
125
125
|
|
|
126
126
|
```python
|
|
127
|
-
from
|
|
127
|
+
from docid import generate_invoice_id, generate_receipt_id, generate_contract_id
|
|
128
128
|
|
|
129
129
|
# Faktura VAT
|
|
130
130
|
invoice_id = generate_invoice_id(
|
|
@@ -133,7 +133,7 @@ invoice_id = generate_invoice_id(
|
|
|
133
133
|
issue_date="2025-01-15",
|
|
134
134
|
gross_amount=1230.50
|
|
135
135
|
)
|
|
136
|
-
print(invoice_id) #
|
|
136
|
+
print(invoice_id) # DOC-FV-F0BE35240C77B2DB
|
|
137
137
|
|
|
138
138
|
# Paragon fiskalny
|
|
139
139
|
receipt_id = generate_receipt_id(
|
|
@@ -142,7 +142,7 @@ receipt_id = generate_receipt_id(
|
|
|
142
142
|
gross_amount=37.88,
|
|
143
143
|
cash_register_number="001"
|
|
144
144
|
)
|
|
145
|
-
print(receipt_id) #
|
|
145
|
+
print(receipt_id) # DOC-PAR-8142B3FC69D7778C
|
|
146
146
|
|
|
147
147
|
# Umowa
|
|
148
148
|
contract_id = generate_contract_id(
|
|
@@ -151,13 +151,13 @@ contract_id = generate_contract_id(
|
|
|
151
151
|
contract_date="2025-01-15",
|
|
152
152
|
contract_number="001/2025"
|
|
153
153
|
)
|
|
154
|
-
print(contract_id) #
|
|
154
|
+
print(contract_id) # DOC-UMO-C54CB968D1342642
|
|
155
155
|
```
|
|
156
156
|
|
|
157
157
|
### Uniwersalny generator ID (dowolne dokumenty)
|
|
158
158
|
|
|
159
159
|
```python
|
|
160
|
-
from
|
|
160
|
+
from docid import generate_universal_document_id
|
|
161
161
|
|
|
162
162
|
# Dowolny dokument
|
|
163
163
|
doc_id = generate_universal_document_id("dokument.pdf")
|
|
@@ -175,20 +175,20 @@ print(doc_id) # UNIV-IMG-E2E2131A335F0918
|
|
|
175
175
|
### Pełne przetwarzanie z OCR
|
|
176
176
|
|
|
177
177
|
```python
|
|
178
|
-
from
|
|
178
|
+
from docid import process_document, get_document_id
|
|
179
179
|
|
|
180
180
|
# Pełne przetwarzanie z ekstrakcją danych
|
|
181
181
|
result = process_document("faktura.pdf")
|
|
182
|
-
print(result.document_id) #
|
|
182
|
+
print(result.document_id) # DOC-FV-F0BE35240C77B2DB
|
|
183
183
|
print(result.extraction.issuer_nip) # 5213017228
|
|
184
184
|
print(result.extraction.invoice_number) # FV/2025/00142
|
|
185
185
|
|
|
186
186
|
# Tylko wygeneruj ID
|
|
187
187
|
doc_id = get_document_id("paragon.jpg")
|
|
188
|
-
print(doc_id) #
|
|
188
|
+
print(doc_id) # DOC-PAR-8142B3FC69D7778C
|
|
189
189
|
|
|
190
190
|
# Weryfikacja ID
|
|
191
|
-
is_valid = verify_document_id("skan.png", "
|
|
191
|
+
is_valid = verify_document_id("skan.png", "DOC-FV-F0BE35240C77B2DB")
|
|
192
192
|
print(is_valid) # True/False
|
|
193
193
|
```
|
|
194
194
|
|
|
@@ -250,7 +250,7 @@ curl -X POST -F "file=@faktura.pdf" http://localhost:8000/process
|
|
|
250
250
|
|
|
251
251
|
**2. Weryfikacja ID:**
|
|
252
252
|
```bash
|
|
253
|
-
curl -X POST -F "file=@skan.jpg" -F "document_id=
|
|
253
|
+
curl -X POST -F "file=@skan.jpg" -F "document_id=DOC-FV-F0BE35240C77B2DB" http://localhost:8000/verify
|
|
254
254
|
```
|
|
255
255
|
|
|
256
256
|
**3. Porównywanie plików:**
|
|
@@ -303,7 +303,7 @@ make run-web # Uruchom serwer API
|
|
|
303
303
|
### 1. Przetwarzanie faktur
|
|
304
304
|
|
|
305
305
|
```python
|
|
306
|
-
from
|
|
306
|
+
from docid import process_document
|
|
307
307
|
|
|
308
308
|
# Przetwarzanie faktury PDF
|
|
309
309
|
result = process_document("faktura.pdf")
|
|
@@ -319,7 +319,7 @@ print(f"ID: {result.document_id}")
|
|
|
319
319
|
### 2. Porównywanie dokumentów
|
|
320
320
|
|
|
321
321
|
```python
|
|
322
|
-
from
|
|
322
|
+
from docid import compare_universal_documents
|
|
323
323
|
|
|
324
324
|
# Porównaj dwa dokumenty
|
|
325
325
|
comparison = compare_universal_documents("dokument1.pdf", "dokument2.png")
|
|
@@ -331,10 +331,10 @@ print(f"Ten sam rozmiar: {comparison['same_size']}")
|
|
|
331
331
|
### 3. Weryfikacja ID
|
|
332
332
|
|
|
333
333
|
```python
|
|
334
|
-
from
|
|
334
|
+
from docid import verify_document_id, verify_universal_document_id
|
|
335
335
|
|
|
336
336
|
# Weryfikacja ID dokumentu biznesowego
|
|
337
|
-
is_valid = verify_document_id("faktura.pdf", "
|
|
337
|
+
is_valid = verify_document_id("faktura.pdf", "DOC-FV-F0BE35240C77B2DB")
|
|
338
338
|
|
|
339
339
|
# Weryfikacja uniwersalnego ID
|
|
340
340
|
is_valid = verify_universal_document_id("dowolny_plik.jpg", "UNIV-IMG-4225A473A725978D")
|
|
@@ -344,7 +344,7 @@ is_valid = verify_universal_document_id("dowolny_plik.jpg", "UNIV-IMG-4225A473A7
|
|
|
344
344
|
|
|
345
345
|
```python
|
|
346
346
|
from pathlib import Path
|
|
347
|
-
from
|
|
347
|
+
from docid import generate_universal_document_id
|
|
348
348
|
|
|
349
349
|
# Przetwarzaj wszystkie pliki w folderze
|
|
350
350
|
documents_dir = Path("dokumenty")
|
|
@@ -372,16 +372,16 @@ for file_path in documents_dir.glob("*"):
|
|
|
372
372
|
**TAK!** Formaty PNG i JPG są w pełni przetwarzane przez OCR:
|
|
373
373
|
|
|
374
374
|
```python
|
|
375
|
-
from
|
|
375
|
+
from docid import process_document
|
|
376
376
|
|
|
377
377
|
# Przetwarzanie skanu PNG z OCR
|
|
378
378
|
result = process_document("skan_faktury.png")
|
|
379
|
-
print(result.document_id) #
|
|
379
|
+
print(result.document_id) # DOC-FV-F0BE35240C77B2DB
|
|
380
380
|
print(result.extraction.issuer_nip) # 5213017228
|
|
381
381
|
|
|
382
382
|
# Przetwarzanie zdjęcia JPG z OCR
|
|
383
383
|
result = process_document("zdjecie_paragonu.jpg")
|
|
384
|
-
print(result.document_id) # EXEF-PAR-8142B3FC69D7778C
|
|
384
|
+
print(result.document_id) # DOC-PAR-8142B3FC69D7778C
|
|
385
385
|
```
|
|
386
386
|
|
|
387
387
|
#### Co jest ekstrahowane z PNG/JPG:
|
|
@@ -402,13 +402,13 @@ print(result.document_id) # EXEF-PAR-8142B3FC69D7778C
|
|
|
402
402
|
|
|
403
403
|
```python
|
|
404
404
|
# Ten sam dokument w różnych formatach - ten sam ID biznesowy
|
|
405
|
-
generate_invoice_id(...) # ->
|
|
405
|
+
generate_invoice_id(...) # -> DOC-FV-F0BE35240C77B2DB
|
|
406
406
|
|
|
407
407
|
# Przetwarzanie przez OCR daje ten sam wynik
|
|
408
|
-
process_document("faktura.pdf") # ->
|
|
409
|
-
process_document("faktura.png") # ->
|
|
410
|
-
process_document("faktura.jpg") # ->
|
|
411
|
-
process_document("faktura.xml") # ->
|
|
408
|
+
process_document("faktura.pdf") # -> DOC-FV-F0BE35240C77B2DB
|
|
409
|
+
process_document("faktura.png") # -> DOC-FV-F0BE35240C77B2DB
|
|
410
|
+
process_document("faktura.jpg") # -> DOC-FV-F0BE35240C77B2DB
|
|
411
|
+
process_document("faktura.xml") # -> DOC-FV-F0BE35240C77B2DB
|
|
412
412
|
|
|
413
413
|
# Różne ID uniwersalne dla różnych formatów
|
|
414
414
|
generate_universal_document_id("faktura.pdf") # -> UNIV-PDF-...
|
|
@@ -451,25 +451,25 @@ TEST WSZYSTKICH FORMATÓW - PDF, PNG, JPG, HTML, TXT, XML
|
|
|
451
451
|
================================================================================
|
|
452
452
|
|
|
453
453
|
FOLDER: FAKTURY (invoices/)
|
|
454
|
-
📄 faktura_full.pdf (.pdf ) [ 2242B] ->
|
|
455
|
-
📄 faktura_full.xml (.xml ) [ 2077B] ->
|
|
456
|
-
📄 faktura_full.html (.html) [ 3334B] ->
|
|
457
|
-
📄 faktura_full.jpg (.jpg ) [ 28182B] ->
|
|
458
|
-
📄 faktura_full.png (.png ) [ 32325B] ->
|
|
459
|
-
📄 faktura_full.txt (.txt ) [ 2839B] ->
|
|
454
|
+
📄 faktura_full.pdf (.pdf ) [ 2242B] -> DOC-FV-F0BE35240C77B2DB
|
|
455
|
+
📄 faktura_full.xml (.xml ) [ 2077B] -> DOC-FV-F0BE35240C77B2DB
|
|
456
|
+
📄 faktura_full.html (.html) [ 3334B] -> DOC-FV-F0BE35240C77B2DB
|
|
457
|
+
📄 faktura_full.jpg (.jpg ) [ 28182B] -> DOC-FV-F0BE35240C77B2DB
|
|
458
|
+
📄 faktura_full.png (.png ) [ 32325B] -> DOC-FV-F0BE35240C77B2DB
|
|
459
|
+
📄 faktura_full.txt (.txt ) [ 2839B] -> DOC-FV-F0BE35240C77B2DB
|
|
460
460
|
|
|
461
461
|
📊 Podsumowanie folderu invoices:
|
|
462
462
|
Plików przetworzonych: 6
|
|
463
463
|
Unikalnych ID: 1
|
|
464
464
|
Wszystkie identyczne: True
|
|
465
|
-
✅ ID:
|
|
465
|
+
✅ ID: DOC-FV-F0BE35240C77B2DB
|
|
466
466
|
```
|
|
467
467
|
|
|
468
468
|
## 📁 Struktura projektu
|
|
469
469
|
|
|
470
470
|
```
|
|
471
471
|
docid/
|
|
472
|
-
├──
|
|
472
|
+
├── docid/ # Główny pakiet
|
|
473
473
|
│ ├── __init__.py # Eksporty API
|
|
474
474
|
│ ├── document_id.py # Generator ID biznesowy
|
|
475
475
|
│ ├── document_id_universal.py # Generator ID uniwersalny
|
|
@@ -493,7 +493,7 @@ docid/
|
|
|
493
493
|
### Silniki OCR
|
|
494
494
|
|
|
495
495
|
```python
|
|
496
|
-
from
|
|
496
|
+
from docid import OCREngine, get_pipeline
|
|
497
497
|
|
|
498
498
|
# Użyj PaddleOCR (domyślnie)
|
|
499
499
|
pipeline = get_pipeline(ocr_engine=OCREngine.PADDLE)
|
|
@@ -505,7 +505,7 @@ pipeline = get_pipeline(ocr_engine=OCREngine.TESSERACT)
|
|
|
505
505
|
### Custom prefix
|
|
506
506
|
|
|
507
507
|
```python
|
|
508
|
-
from
|
|
508
|
+
from docid import UniversalDocumentIDGenerator
|
|
509
509
|
|
|
510
510
|
generator = UniversalDocumentIDGenerator(prefix="MOJA")
|
|
511
511
|
doc_id = generator.generate_universal_id("plik.pdf")
|
|
@@ -549,8 +549,8 @@ MIT License - zobacz [LICENSE](LICENSE) dla szczegółów.
|
|
|
549
549
|
## 🆘 Wsparcie
|
|
550
550
|
|
|
551
551
|
- 📧 Email: info@softreck.dev
|
|
552
|
-
- 🐛 Issues: [GitHub Issues](https://github.com/softreck/
|
|
553
|
-
- 📖 Dokumentacja: [GitHub Wiki](https://github.com/softreck/
|
|
552
|
+
- 🐛 Issues: [GitHub Issues](https://github.com/softreck/doc-pl/issues)
|
|
553
|
+
- 📖 Dokumentacja: [GitHub Wiki](https://github.com/softreck/doc-pl/wiki)
|
|
554
554
|
|
|
555
555
|
## 🗺️ Roadmap
|
|
556
556
|
|
|
@@ -563,4 +563,4 @@ MIT License - zobacz [LICENSE](LICENSE) dla szczegółów.
|
|
|
563
563
|
|
|
564
564
|
---
|
|
565
565
|
|
|
566
|
-
**
|
|
566
|
+
**DOC Document ID Generator** - Deterministyczne identyfikatory dla każdego dokumentu! 🚀
|
|
@@ -1,6 +1,13 @@
|
|
|
1
1
|
README.md
|
|
2
2
|
pyproject.toml
|
|
3
3
|
setup.py
|
|
4
|
+
docid/__init__.py
|
|
5
|
+
docid/cli.py
|
|
6
|
+
docid/cli_universal.py
|
|
7
|
+
docid/document_id.py
|
|
8
|
+
docid/document_id_universal.py
|
|
9
|
+
docid/ocr_processor.py
|
|
10
|
+
docid/pipeline.py
|
|
4
11
|
docid.egg-info/PKG-INFO
|
|
5
12
|
docid.egg-info/SOURCES.txt
|
|
6
13
|
docid.egg-info/dependency_links.txt
|
|
@@ -8,15 +15,8 @@ docid.egg-info/entry_points.txt
|
|
|
8
15
|
docid.egg-info/not-zip-safe
|
|
9
16
|
docid.egg-info/requires.txt
|
|
10
17
|
docid.egg-info/top_level.txt
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
exef_docid/cli_universal.py
|
|
14
|
-
exef_docid/document_id.py
|
|
15
|
-
exef_docid/document_id_universal.py
|
|
16
|
-
exef_docid/ocr_processor.py
|
|
17
|
-
exef_docid/pipeline.py
|
|
18
|
-
exef_docid/extractors/__init__.py
|
|
19
|
-
exef_docid/extractors/base.py
|
|
18
|
+
docid/extractors/__init__.py
|
|
19
|
+
docid/extractors/base.py
|
|
20
20
|
tests/test_document_id.py
|
|
21
21
|
tests/test_extractors.py
|
|
22
22
|
tests/test_samples_id.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
docid
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docid"
|
|
7
|
-
version = "0.1.
|
|
7
|
+
version = "0.1.5"
|
|
8
8
|
description = "Deterministyczny generator identyfikatorów dokumentów z OCR"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -63,8 +63,8 @@ dev = [
|
|
|
63
63
|
]
|
|
64
64
|
|
|
65
65
|
[project.scripts]
|
|
66
|
-
docid = "
|
|
67
|
-
docid-universal = "
|
|
66
|
+
docid = "docid.cli:main"
|
|
67
|
+
docid-universal = "docid.cli_universal:main"
|
|
68
68
|
|
|
69
69
|
[project.urls]
|
|
70
70
|
Homepage = "https://github.com/softreck/docid"
|
|
@@ -73,7 +73,7 @@ Repository = "https://github.com/softreck/docid"
|
|
|
73
73
|
Issues = "https://github.com/softreck/docid/issues"
|
|
74
74
|
|
|
75
75
|
[tool.setuptools.packages.find]
|
|
76
|
-
include = ["
|
|
76
|
+
include = ["docid*"]
|
|
77
77
|
|
|
78
78
|
[tool.black]
|
|
79
79
|
line-length = 100
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
|
-
"""Setup dla
|
|
2
|
+
"""Setup dla DOC Document ID Generator."""
|
|
3
3
|
|
|
4
4
|
from setuptools import setup, find_packages
|
|
5
5
|
from pathlib import Path
|
|
@@ -57,7 +57,7 @@ setup(
|
|
|
57
57
|
},
|
|
58
58
|
entry_points={
|
|
59
59
|
"console_scripts": [
|
|
60
|
-
"docid=
|
|
60
|
+
"docid=docid.cli:main",
|
|
61
61
|
],
|
|
62
62
|
},
|
|
63
63
|
include_package_data=True,
|
|
@@ -4,7 +4,7 @@ Testy dla generatora identyfikatorów dokumentów.
|
|
|
4
4
|
|
|
5
5
|
import pytest
|
|
6
6
|
|
|
7
|
-
from
|
|
7
|
+
from docid.document_id import (
|
|
8
8
|
AmountNormalizer,
|
|
9
9
|
DateNormalizer,
|
|
10
10
|
DocumentIDGenerator,
|
|
@@ -109,7 +109,7 @@ class TestDocumentIDGenerator:
|
|
|
109
109
|
|
|
110
110
|
@pytest.fixture
|
|
111
111
|
def generator(self):
|
|
112
|
-
return DocumentIDGenerator(prefix="
|
|
112
|
+
return DocumentIDGenerator(prefix="DOC")
|
|
113
113
|
|
|
114
114
|
def test_generate_invoice_id_deterministic(self, generator):
|
|
115
115
|
"""Ten sam input = ten sam output."""
|
|
@@ -152,10 +152,10 @@ class TestDocumentIDGenerator:
|
|
|
152
152
|
gross_amount=1230.50,
|
|
153
153
|
)
|
|
154
154
|
|
|
155
|
-
assert doc_id.startswith("
|
|
155
|
+
assert doc_id.startswith("DOC-FV-")
|
|
156
156
|
parts = doc_id.split("-")
|
|
157
157
|
assert len(parts) == 3
|
|
158
|
-
assert parts[0] == "
|
|
158
|
+
assert parts[0] == "DOC"
|
|
159
159
|
assert parts[1] == "FV"
|
|
160
160
|
assert len(parts[2]) == 16
|
|
161
161
|
assert all(c in "0123456789ABCDEF" for c in parts[2])
|
|
@@ -168,7 +168,7 @@ class TestDocumentIDGenerator:
|
|
|
168
168
|
gross_amount=45.99,
|
|
169
169
|
)
|
|
170
170
|
|
|
171
|
-
assert doc_id.startswith("
|
|
171
|
+
assert doc_id.startswith("DOC-PAR-")
|
|
172
172
|
|
|
173
173
|
def test_generate_receipt_id_with_extras(self, generator):
|
|
174
174
|
"""Paragon z numerem kasy daje inny ID."""
|
|
@@ -232,10 +232,10 @@ class TestDocumentIDGenerator:
|
|
|
232
232
|
|
|
233
233
|
def test_parse_id(self, generator):
|
|
234
234
|
"""Test parsowania ID."""
|
|
235
|
-
doc_id = "
|
|
235
|
+
doc_id = "DOC-FV-A7B3C9D2E1F04856"
|
|
236
236
|
parsed = DocumentIDGenerator.parse_id(doc_id)
|
|
237
237
|
|
|
238
|
-
assert parsed['prefix'] == "
|
|
238
|
+
assert parsed['prefix'] == "DOC"
|
|
239
239
|
assert parsed['type'] == "FV"
|
|
240
240
|
assert parsed['hash'] == "A7B3C9D2E1F04856"
|
|
241
241
|
assert parsed['document_type'] == DocumentType.INVOICE
|
|
@@ -263,7 +263,7 @@ class TestDocumentTypes:
|
|
|
263
263
|
gross_amount=-100.00,
|
|
264
264
|
)
|
|
265
265
|
|
|
266
|
-
assert doc_id.startswith("
|
|
266
|
+
assert doc_id.startswith("DOC-KOR-")
|
|
267
267
|
|
|
268
268
|
def test_bank_statement(self, generator):
|
|
269
269
|
"""Wyciąg bankowy."""
|
|
@@ -273,7 +273,7 @@ class TestDocumentTypes:
|
|
|
273
273
|
statement_number="001/2025",
|
|
274
274
|
)
|
|
275
275
|
|
|
276
|
-
assert doc_id.startswith("
|
|
276
|
+
assert doc_id.startswith("DOC-WB-")
|
|
277
277
|
|
|
278
278
|
def test_generic_document(self, generator):
|
|
279
279
|
"""Dokument generyczny."""
|
|
@@ -287,4 +287,4 @@ class TestDocumentTypes:
|
|
|
287
287
|
document_date="2025-01-15",
|
|
288
288
|
)
|
|
289
289
|
|
|
290
|
-
assert doc_id.startswith("
|
|
290
|
+
assert doc_id.startswith("DOC-DOC-")
|
|
@@ -4,14 +4,14 @@ Testy dla ekstraktorów danych z dokumentów.
|
|
|
4
4
|
|
|
5
5
|
import pytest
|
|
6
6
|
|
|
7
|
-
from
|
|
7
|
+
from docid.extractors import (
|
|
8
8
|
ContractExtractor,
|
|
9
9
|
DocumentCategory,
|
|
10
10
|
DocumentExtractor,
|
|
11
11
|
InvoiceExtractor,
|
|
12
12
|
ReceiptExtractor,
|
|
13
13
|
)
|
|
14
|
-
from
|
|
14
|
+
from docid.ocr_processor import DocumentOCRResult, OCREngine, OCRResult
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
def create_mock_ocr_result(
|
|
@@ -271,7 +271,7 @@ class TestNIPDetection:
|
|
|
271
271
|
"""Testy wykrywania NIP w tekście."""
|
|
272
272
|
|
|
273
273
|
def test_detect_nip_with_dashes(self):
|
|
274
|
-
from
|
|
274
|
+
from docid.ocr_processor import BaseOCRProcessor
|
|
275
275
|
|
|
276
276
|
class TestProcessor(BaseOCRProcessor):
|
|
277
277
|
def process_image(self, path): pass
|
|
@@ -283,7 +283,7 @@ class TestNIPDetection:
|
|
|
283
283
|
assert "5213017228" in nips
|
|
284
284
|
|
|
285
285
|
def test_detect_nip_without_dashes(self):
|
|
286
|
-
from
|
|
286
|
+
from docid.ocr_processor import BaseOCRProcessor
|
|
287
287
|
|
|
288
288
|
class TestProcessor(BaseOCRProcessor):
|
|
289
289
|
def process_image(self, path): pass
|
|
@@ -295,7 +295,7 @@ class TestNIPDetection:
|
|
|
295
295
|
assert "5213017228" in nips
|
|
296
296
|
|
|
297
297
|
def test_detect_multiple_nips(self):
|
|
298
|
-
from
|
|
298
|
+
from docid.ocr_processor import BaseOCRProcessor
|
|
299
299
|
|
|
300
300
|
class TestProcessor(BaseOCRProcessor):
|
|
301
301
|
def process_image(self, path): pass
|
|
@@ -316,7 +316,7 @@ class TestAmountDetection:
|
|
|
316
316
|
"""Testy wykrywania kwot."""
|
|
317
317
|
|
|
318
318
|
def test_detect_amount_with_currency(self):
|
|
319
|
-
from
|
|
319
|
+
from docid.ocr_processor import BaseOCRProcessor
|
|
320
320
|
|
|
321
321
|
class TestProcessor(BaseOCRProcessor):
|
|
322
322
|
def process_image(self, path): pass
|
|
@@ -328,7 +328,7 @@ class TestAmountDetection:
|
|
|
328
328
|
assert "1230.50" in amounts
|
|
329
329
|
|
|
330
330
|
def test_detect_amount_brutto(self):
|
|
331
|
-
from
|
|
331
|
+
from docid.ocr_processor import BaseOCRProcessor
|
|
332
332
|
|
|
333
333
|
class TestProcessor(BaseOCRProcessor):
|
|
334
334
|
def process_image(self, path): pass
|
|
@@ -10,9 +10,9 @@ from pathlib import Path
|
|
|
10
10
|
|
|
11
11
|
import pytest
|
|
12
12
|
|
|
13
|
-
from
|
|
14
|
-
from
|
|
15
|
-
from
|
|
13
|
+
from docid.document_id import DocumentIDGenerator, DocumentType
|
|
14
|
+
from docid.pipeline import DocumentPipeline, process_document
|
|
15
|
+
from docid.ocr_processor import OCREngine
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
SAMPLES_DIR = Path(__file__).parent.parent / "samples"
|
|
@@ -55,7 +55,7 @@ class TestSampleIDGeneration:
|
|
|
55
55
|
})
|
|
56
56
|
# Sprawdź czy ID zostało wygenerowane
|
|
57
57
|
assert result.document_id, f"Brak ID dla {file_path}"
|
|
58
|
-
assert result.document_id.startswith("
|
|
58
|
+
assert result.document_id.startswith("DOC-FV"), f"Nieprawidłowy prefix ID dla faktury: {result.document_id}"
|
|
59
59
|
except Exception as e:
|
|
60
60
|
pytest.fail(f"Błąd przetwarzania {file_path}: {e}")
|
|
61
61
|
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
exef_docid
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|