docid 0.1.3__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {exef_docid → docid}/__init__.py +5 -5
- {exef_docid → docid}/cli.py +5 -5
- {exef_docid → docid}/cli_universal.py +1 -1
- {exef_docid → docid}/document_id.py +10 -10
- {exef_docid → docid}/ocr_processor.py +3 -7
- {exef_docid → docid}/pipeline.py +8 -8
- {docid-0.1.3.dist-info → docid-0.1.5.dist-info}/METADATA +44 -44
- docid-0.1.5.dist-info/RECORD +14 -0
- docid-0.1.5.dist-info/entry_points.txt +3 -0
- docid-0.1.5.dist-info/top_level.txt +1 -0
- docid-0.1.3.dist-info/RECORD +0 -14
- docid-0.1.3.dist-info/entry_points.txt +0 -3
- docid-0.1.3.dist-info/top_level.txt +0 -1
- {exef_docid → docid}/document_id_universal.py +0 -0
- {exef_docid → docid}/extractors/__init__.py +0 -0
- {exef_docid → docid}/extractors/base.py +0 -0
- {docid-0.1.3.dist-info → docid-0.1.5.dist-info}/WHEEL +0 -0
{exef_docid → docid}/__init__.py
RENAMED
|
@@ -1,23 +1,23 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
2
|
+
DOC Document ID Generator
|
|
3
3
|
|
|
4
4
|
Deterministyczny generator identyfikatorów dokumentów z OCR.
|
|
5
5
|
Generuje zawsze ten sam ID dla tego samego dokumentu,
|
|
6
6
|
niezależnie od formatu źródłowego (skan, PDF, KSeF XML).
|
|
7
7
|
|
|
8
8
|
Przykład użycia:
|
|
9
|
-
from
|
|
9
|
+
from docid import process_document, get_document_id
|
|
10
10
|
|
|
11
11
|
# Pełne przetwarzanie
|
|
12
12
|
result = process_document("faktura.pdf")
|
|
13
|
-
print(result.document_id) #
|
|
13
|
+
print(result.document_id) # DOC-FV-A7B3C9D2E1F04856
|
|
14
14
|
print(result.extraction.issuer_nip) # 5213017228
|
|
15
15
|
|
|
16
16
|
# Tylko ID
|
|
17
17
|
doc_id = get_document_id("paragon.jpg")
|
|
18
18
|
|
|
19
19
|
# Weryfikacja
|
|
20
|
-
is_same = verify_document_id("skan.png", "
|
|
20
|
+
is_same = verify_document_id("skan.png", "DOC-FV-A7B3C9D2E1F04856")
|
|
21
21
|
|
|
22
22
|
Wymagania:
|
|
23
23
|
pip install paddleocr paddlepaddle pdf2image pillow
|
|
@@ -27,7 +27,7 @@ Wymagania:
|
|
|
27
27
|
pip install pytesseract pdf2image pillow
|
|
28
28
|
"""
|
|
29
29
|
|
|
30
|
-
__version__ = "0.1.3"
|
|
30
|
+
__version__ = "0.1.5"
|
|
31
31
|
__author__ = "Softreck"
|
|
32
32
|
|
|
33
33
|
# Główne API
|
{exef_docid → docid}/cli.py
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
2
|
"""
|
|
3
|
-
CLI dla
|
|
3
|
+
CLI dla DOC Document ID Generator.
|
|
4
4
|
|
|
5
5
|
Użycie:
|
|
6
6
|
# Przetwórz pojedynczy plik
|
|
@@ -13,7 +13,7 @@ Użycie:
|
|
|
13
13
|
docid batch ./dokumenty/ --output results.json
|
|
14
14
|
|
|
15
15
|
# Weryfikacja ID
|
|
16
|
-
docid verify faktura.pdf
|
|
16
|
+
docid verify faktura.pdf DOC-FV-A7B3C9D2E1F04856
|
|
17
17
|
|
|
18
18
|
# Tylko OCR (bez generowania ID)
|
|
19
19
|
docid ocr skan.jpg
|
|
@@ -267,7 +267,7 @@ def cmd_generate_id(args):
|
|
|
267
267
|
|
|
268
268
|
def main():
|
|
269
269
|
parser = argparse.ArgumentParser(
|
|
270
|
-
description='
|
|
270
|
+
description='DOC Document ID Generator - deterministyczne ID dokumentów z OCR',
|
|
271
271
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
272
272
|
)
|
|
273
273
|
parser.add_argument('--version', action='version', version='docid 0.1.0')
|
|
@@ -279,7 +279,7 @@ def main():
|
|
|
279
279
|
common.add_argument('--engine', choices=['paddle', 'tesseract'], default='paddle',
|
|
280
280
|
help='Silnik OCR (domyślnie: paddle)')
|
|
281
281
|
common.add_argument('--lang', default='pl', help='Język dokumentów')
|
|
282
|
-
common.add_argument('--prefix', default='
|
|
282
|
+
common.add_argument('--prefix', default='DOC', help='Prefiks ID')
|
|
283
283
|
common.add_argument('--gpu', action='store_true', help='Użyj GPU')
|
|
284
284
|
common.add_argument('-v', '--verbose', action='store_true', help='Więcej szczegółów')
|
|
285
285
|
|
|
@@ -324,7 +324,7 @@ def main():
|
|
|
324
324
|
p_gen.add_argument('--number', help='Numer dokumentu')
|
|
325
325
|
p_gen.add_argument('--date', help='Data (YYYY-MM-DD)')
|
|
326
326
|
p_gen.add_argument('--amount', help='Kwota brutto')
|
|
327
|
-
p_gen.add_argument('--prefix', default='
|
|
327
|
+
p_gen.add_argument('--prefix', default='DOC', help='Prefiks ID')
|
|
328
328
|
p_gen.set_defaults(func=cmd_generate_id)
|
|
329
329
|
|
|
330
330
|
args = parser.parse_args()
|
|
@@ -431,7 +431,7 @@ def main():
|
|
|
431
431
|
"""Main CLI entry point"""
|
|
432
432
|
parser = argparse.ArgumentParser(
|
|
433
433
|
prog='docid',
|
|
434
|
-
description='
|
|
434
|
+
description='DOC Document ID Generator - CLI'
|
|
435
435
|
)
|
|
436
436
|
|
|
437
437
|
parser.add_argument('--version', action='version', version='%(prog)s 0.1.0')
|
|
@@ -14,8 +14,8 @@ from decimal import ROUND_HALF_UP, Decimal
|
|
|
14
14
|
from enum import Enum
|
|
15
15
|
from typing import Optional, Union
|
|
16
16
|
|
|
17
|
-
# Namespace UUID dla
|
|
18
|
-
|
|
17
|
+
# Namespace UUID dla DOC (RFC 4122 UUID v5)
|
|
18
|
+
DOC_NAMESPACE = uuid.UUID('a1b2c3d4-e5f6-7890-abcd-ef1234567890')
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
class DocumentType(Enum):
|
|
@@ -207,10 +207,10 @@ class DocumentIDGenerator:
|
|
|
207
207
|
niezależnie od formatu źródłowego dokumentu.
|
|
208
208
|
"""
|
|
209
209
|
|
|
210
|
-
def __init__(self, prefix: str = "
|
|
210
|
+
def __init__(self, prefix: str = "DOC"):
|
|
211
211
|
"""
|
|
212
212
|
Args:
|
|
213
|
-
prefix: Prefiks identyfikatora (domyślnie
|
|
213
|
+
prefix: Prefiks identyfikatora (domyślnie DOC)
|
|
214
214
|
"""
|
|
215
215
|
self.prefix = prefix
|
|
216
216
|
|
|
@@ -229,7 +229,7 @@ class DocumentIDGenerator:
|
|
|
229
229
|
|
|
230
230
|
>>> gen = DocumentIDGenerator()
|
|
231
231
|
>>> gen.generate_invoice_id("5213017228", "FV/2025/00142", "2025-01-15", 1230.00)
|
|
232
|
-
'
|
|
232
|
+
'DOC-FV-A7B3C9D2E1F04856'
|
|
233
233
|
"""
|
|
234
234
|
canonical = CanonicalData(
|
|
235
235
|
document_type=DocumentType.INVOICE,
|
|
@@ -265,7 +265,7 @@ class DocumentIDGenerator:
|
|
|
265
265
|
|
|
266
266
|
>>> gen = DocumentIDGenerator()
|
|
267
267
|
>>> gen.generate_receipt_id("5213017228", "2025-01-15", 45.99)
|
|
268
|
-
'
|
|
268
|
+
'DOC-PAR-...'
|
|
269
269
|
"""
|
|
270
270
|
parts = [
|
|
271
271
|
NIPValidator.normalize(seller_nip),
|
|
@@ -649,7 +649,7 @@ class DocumentIDGenerator:
|
|
|
649
649
|
Generuje finalny identyfikator z danych kanonicznych.
|
|
650
650
|
|
|
651
651
|
Format: {PREFIX}-{TYPE}-{HASH16}
|
|
652
|
-
Przykład:
|
|
652
|
+
Przykład: DOC-FV-A7B3C9D2E1F04856
|
|
653
653
|
"""
|
|
654
654
|
# SHA256 z canonical string
|
|
655
655
|
hash_bytes = hashlib.sha256(canonical.canonical_string.encode('utf-8')).digest()
|
|
@@ -662,7 +662,7 @@ class DocumentIDGenerator:
|
|
|
662
662
|
Weryfikuje czy ID odpowiada danym kanonicznym.
|
|
663
663
|
|
|
664
664
|
>>> gen = DocumentIDGenerator()
|
|
665
|
-
>>> gen.verify_id("
|
|
665
|
+
>>> gen.verify_id("DOC-FV-A7B3C9D2E1F04856", "5213017228|FV/2025/00142|2025-01-15|1230.00")
|
|
666
666
|
True
|
|
667
667
|
"""
|
|
668
668
|
hash_bytes = hashlib.sha256(canonical_string.encode('utf-8')).digest()
|
|
@@ -679,8 +679,8 @@ class DocumentIDGenerator:
|
|
|
679
679
|
"""
|
|
680
680
|
Parsuje identyfikator dokumentu.
|
|
681
681
|
|
|
682
|
-
>>> DocumentIDGenerator.parse_id("
|
|
683
|
-
{'prefix': '
|
|
682
|
+
>>> DocumentIDGenerator.parse_id("DOC-FV-A7B3C9D2E1F04856")
|
|
683
|
+
{'prefix': 'DOC', 'type': 'FV', 'hash': 'A7B3C9D2E1F04856',
|
|
684
684
|
'document_type': <DocumentType.INVOICE>}
|
|
685
685
|
"""
|
|
686
686
|
parts = document_id.split('-')
|
|
@@ -195,12 +195,8 @@ class PaddleOCRProcessor(BaseOCRProcessor):
|
|
|
195
195
|
self._ocr = PaddleOCR(
|
|
196
196
|
use_angle_cls=True,
|
|
197
197
|
lang=lang,
|
|
198
|
-
use_gpu=self.use_gpu,
|
|
199
198
|
det_model_dir=self._det_model_dir,
|
|
200
199
|
rec_model_dir=self._rec_model_dir,
|
|
201
|
-
# Optymalizacje CPU
|
|
202
|
-
enable_mkldnn=True,
|
|
203
|
-
cpu_threads=4,
|
|
204
200
|
)
|
|
205
201
|
except ImportError:
|
|
206
202
|
raise ImportError(
|
|
@@ -216,7 +212,7 @@ class PaddleOCRProcessor(BaseOCRProcessor):
|
|
|
216
212
|
self._init_ocr()
|
|
217
213
|
|
|
218
214
|
image_path = str(image_path)
|
|
219
|
-
result = self._ocr.ocr(image_path
|
|
215
|
+
result = self._ocr.ocr(image_path)
|
|
220
216
|
|
|
221
217
|
lines = []
|
|
222
218
|
full_text_parts = []
|
|
@@ -439,8 +435,8 @@ class OCRProcessor:
|
|
|
439
435
|
|
|
440
436
|
def __init__(
|
|
441
437
|
self,
|
|
442
|
-
preferred_engine: OCREngine = OCREngine.
|
|
443
|
-
fallback_engine: OCREngine = OCREngine.
|
|
438
|
+
preferred_engine: OCREngine = OCREngine.TESSERACT,
|
|
439
|
+
fallback_engine: OCREngine = OCREngine.PADDLE,
|
|
444
440
|
lang: str = 'pl',
|
|
445
441
|
use_gpu: bool = False,
|
|
446
442
|
):
|
{exef_docid → docid}/pipeline.py
RENAMED
|
@@ -97,20 +97,20 @@ class DocumentPipeline:
|
|
|
97
97
|
Przykład użycia:
|
|
98
98
|
pipeline = DocumentPipeline()
|
|
99
99
|
result = pipeline.process("faktura.pdf")
|
|
100
|
-
print(result.document_id) #
|
|
100
|
+
print(result.document_id) # DOC-FV-A7B3C9D2E1F04856
|
|
101
101
|
"""
|
|
102
102
|
|
|
103
103
|
def __init__(
|
|
104
104
|
self,
|
|
105
|
-
ocr_engine: OCREngine = OCREngine.
|
|
106
|
-
id_prefix: str = "
|
|
105
|
+
ocr_engine: OCREngine = OCREngine.TESSERACT,
|
|
106
|
+
id_prefix: str = "DOC",
|
|
107
107
|
lang: str = "pl",
|
|
108
108
|
use_gpu: bool = False,
|
|
109
109
|
):
|
|
110
110
|
"""
|
|
111
111
|
Args:
|
|
112
112
|
ocr_engine: Silnik OCR (PADDLE lub TESSERACT)
|
|
113
|
-
id_prefix: Prefiks identyfikatorów (domyślnie
|
|
113
|
+
id_prefix: Prefiks identyfikatorów (domyślnie DOC)
|
|
114
114
|
lang: Język dokumentów (pl, en)
|
|
115
115
|
use_gpu: Czy używać GPU (domyślnie False dla CPU)
|
|
116
116
|
"""
|
|
@@ -390,7 +390,7 @@ class DocumentPipeline:
|
|
|
390
390
|
_default_pipeline: Optional[DocumentPipeline] = None
|
|
391
391
|
|
|
392
392
|
|
|
393
|
-
def get_pipeline(ocr_engine: OCREngine = OCREngine.PADDLE) -> DocumentPipeline:
|
|
393
|
+
def get_pipeline(ocr_engine: OCREngine = OCREngine.TESSERACT) -> DocumentPipeline:
|
|
394
394
|
"""Zwraca domyślny pipeline (lazy init)."""
|
|
395
395
|
global _default_pipeline
|
|
396
396
|
if _default_pipeline is None or _default_pipeline.ocr.preferred_engine != ocr_engine:
|
|
@@ -398,7 +398,7 @@ def get_pipeline(ocr_engine: OCREngine = OCREngine.PADDLE) -> DocumentPipeline:
|
|
|
398
398
|
return _default_pipeline
|
|
399
399
|
|
|
400
400
|
|
|
401
|
-
def process_document(file_path: Union[str, Path], ocr_engine: OCREngine = OCREngine.
|
|
401
|
+
def process_document(file_path: Union[str, Path], ocr_engine: OCREngine = OCREngine.TESSERACT, use_ocr: bool = True) -> ProcessedDocument:
|
|
402
402
|
"""
|
|
403
403
|
Przetwarza dokument i zwraca wynik z ID.
|
|
404
404
|
|
|
@@ -416,7 +416,7 @@ def get_document_id(file_path: Union[str, Path]) -> str:
|
|
|
416
416
|
|
|
417
417
|
Przykład:
|
|
418
418
|
doc_id = get_document_id("faktura.pdf")
|
|
419
|
-
print(doc_id) #
|
|
419
|
+
print(doc_id) # DOC-FV-A7B3C9D2E1F04856
|
|
420
420
|
"""
|
|
421
421
|
return get_pipeline().process(file_path).document_id
|
|
422
422
|
|
|
@@ -426,6 +426,6 @@ def verify_document_id(file_path: Union[str, Path], expected_id: str) -> bool:
|
|
|
426
426
|
Weryfikuje czy dokument ma oczekiwany ID.
|
|
427
427
|
|
|
428
428
|
Przykład:
|
|
429
|
-
is_valid = verify_document_id("skan.jpg", "
|
|
429
|
+
is_valid = verify_document_id("skan.jpg", "DOC-FV-A7B3C9D2E1F04856")
|
|
430
430
|
"""
|
|
431
431
|
return get_pipeline().verify_document(file_path, expected_id)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docid
|
|
3
|
-
Version: 0.1.3
|
|
3
|
+
Version: 0.1.5
|
|
4
4
|
Summary: Deterministyczny generator identyfikatorów dokumentów z OCR
|
|
5
5
|
Home-page: https://github.com/softreck/docid
|
|
6
6
|
Author: Softreck
|
|
@@ -55,7 +55,7 @@ Dynamic: author
|
|
|
55
55
|
Dynamic: home-page
|
|
56
56
|
Dynamic: requires-python
|
|
57
57
|
|
|
58
|
-
#
|
|
58
|
+
# DOC Document ID Generator
|
|
59
59
|
|
|
60
60
|
Deterministyczny generator identyfikatorów dokumentów z OCR. Generuje **zawsze ten sam ID** dla tego samego dokumentu, niezależnie od formatu źródłowego (skan, PDF, KSeF XML, obrazy).
|
|
61
61
|
|
|
@@ -71,12 +71,12 @@ Jak uzyskać **ten sam identyfikator** dla wszystkich trzech?
|
|
|
71
71
|
## ✨ Rozwiązanie
|
|
72
72
|
|
|
73
73
|
```python
|
|
74
|
-
from
|
|
74
|
+
from docid import get_document_id
|
|
75
75
|
|
|
76
76
|
# Wszystkie trzy zwrócą TEN SAM ID!
|
|
77
|
-
get_document_id("faktura_skan.jpg") # EXEF-FV-A7B3C9D2E1F04856
|
|
78
|
-
get_document_id("faktura.pdf") # EXEF-FV-A7B3C9D2E1F04856
|
|
79
|
-
get_document_id("faktura_ksef.xml") # EXEF-FV-A7B3C9D2E1F04856
|
|
77
|
+
get_document_id("faktura_skan.jpg") # DOC-FV-A7B3C9D2E1F04856
|
|
78
|
+
get_document_id("faktura.pdf") # DOC-FV-A7B3C9D2E1F04856
|
|
79
|
+
get_document_id("faktura_ksef.xml") # DOC-FV-A7B3C9D2E1F04856
|
|
80
80
|
```
|
|
81
81
|
|
|
82
82
|
## 📦 Instalacja
|
|
@@ -85,8 +85,8 @@ get_document_id("faktura_ksef.xml") # EXEF-FV-A7B3C9D2E1F04856
|
|
|
85
85
|
|
|
86
86
|
```bash
|
|
87
87
|
# Klonuj repozytorium
|
|
88
|
-
git clone https://github.com/softreck/
|
|
89
|
-
cd
|
|
88
|
+
git clone https://github.com/softreck/doc-pl.git
|
|
89
|
+
cd doc-pl/app/docid
|
|
90
90
|
|
|
91
91
|
# Utwórz środowisko wirtualne
|
|
92
92
|
python3 -m venv venv
|
|
@@ -124,7 +124,7 @@ pip install docid[all]
|
|
|
124
124
|
### Generator ID dla dokumentów biznesowych
|
|
125
125
|
|
|
126
126
|
```python
|
|
127
|
-
from
|
|
127
|
+
from docid import generate_invoice_id, generate_receipt_id, generate_contract_id
|
|
128
128
|
|
|
129
129
|
# Faktura VAT
|
|
130
130
|
invoice_id = generate_invoice_id(
|
|
@@ -133,7 +133,7 @@ invoice_id = generate_invoice_id(
|
|
|
133
133
|
issue_date="2025-01-15",
|
|
134
134
|
gross_amount=1230.50
|
|
135
135
|
)
|
|
136
|
-
print(invoice_id) #
|
|
136
|
+
print(invoice_id) # DOC-FV-F0BE35240C77B2DB
|
|
137
137
|
|
|
138
138
|
# Paragon fiskalny
|
|
139
139
|
receipt_id = generate_receipt_id(
|
|
@@ -142,7 +142,7 @@ receipt_id = generate_receipt_id(
|
|
|
142
142
|
gross_amount=37.88,
|
|
143
143
|
cash_register_number="001"
|
|
144
144
|
)
|
|
145
|
-
print(receipt_id) #
|
|
145
|
+
print(receipt_id) # DOC-PAR-8142B3FC69D7778C
|
|
146
146
|
|
|
147
147
|
# Umowa
|
|
148
148
|
contract_id = generate_contract_id(
|
|
@@ -151,13 +151,13 @@ contract_id = generate_contract_id(
|
|
|
151
151
|
contract_date="2025-01-15",
|
|
152
152
|
contract_number="001/2025"
|
|
153
153
|
)
|
|
154
|
-
print(contract_id) #
|
|
154
|
+
print(contract_id) # DOC-UMO-C54CB968D1342642
|
|
155
155
|
```
|
|
156
156
|
|
|
157
157
|
### Uniwersalny generator ID (dowolne dokumenty)
|
|
158
158
|
|
|
159
159
|
```python
|
|
160
|
-
from
|
|
160
|
+
from docid import generate_universal_document_id
|
|
161
161
|
|
|
162
162
|
# Dowolny dokument
|
|
163
163
|
doc_id = generate_universal_document_id("dokument.pdf")
|
|
@@ -175,20 +175,20 @@ print(doc_id) # UNIV-IMG-E2E2131A335F0918
|
|
|
175
175
|
### Pełne przetwarzanie z OCR
|
|
176
176
|
|
|
177
177
|
```python
|
|
178
|
-
from
|
|
178
|
+
from docid import process_document, get_document_id
|
|
179
179
|
|
|
180
180
|
# Pełne przetwarzanie z ekstrakcją danych
|
|
181
181
|
result = process_document("faktura.pdf")
|
|
182
|
-
print(result.document_id) #
|
|
182
|
+
print(result.document_id) # DOC-FV-F0BE35240C77B2DB
|
|
183
183
|
print(result.extraction.issuer_nip) # 5213017228
|
|
184
184
|
print(result.extraction.invoice_number) # FV/2025/00142
|
|
185
185
|
|
|
186
186
|
# Tylko wygeneruj ID
|
|
187
187
|
doc_id = get_document_id("paragon.jpg")
|
|
188
|
-
print(doc_id) #
|
|
188
|
+
print(doc_id) # DOC-PAR-8142B3FC69D7778C
|
|
189
189
|
|
|
190
190
|
# Weryfikacja ID
|
|
191
|
-
is_valid = verify_document_id("skan.png", "
|
|
191
|
+
is_valid = verify_document_id("skan.png", "DOC-FV-F0BE35240C77B2DB")
|
|
192
192
|
print(is_valid) # True/False
|
|
193
193
|
```
|
|
194
194
|
|
|
@@ -250,7 +250,7 @@ curl -X POST -F "file=@faktura.pdf" http://localhost:8000/process
|
|
|
250
250
|
|
|
251
251
|
**2. Weryfikacja ID:**
|
|
252
252
|
```bash
|
|
253
|
-
curl -X POST -F "file=@skan.jpg" -F "document_id=
|
|
253
|
+
curl -X POST -F "file=@skan.jpg" -F "document_id=DOC-FV-F0BE35240C77B2DB" http://localhost:8000/verify
|
|
254
254
|
```
|
|
255
255
|
|
|
256
256
|
**3. Porównywanie plików:**
|
|
@@ -303,7 +303,7 @@ make run-web # Uruchom serwer API
|
|
|
303
303
|
### 1. Przetwarzanie faktur
|
|
304
304
|
|
|
305
305
|
```python
|
|
306
|
-
from
|
|
306
|
+
from docid import process_document
|
|
307
307
|
|
|
308
308
|
# Przetwarzanie faktury PDF
|
|
309
309
|
result = process_document("faktura.pdf")
|
|
@@ -319,7 +319,7 @@ print(f"ID: {result.document_id}")
|
|
|
319
319
|
### 2. Porównywanie dokumentów
|
|
320
320
|
|
|
321
321
|
```python
|
|
322
|
-
from
|
|
322
|
+
from docid import compare_universal_documents
|
|
323
323
|
|
|
324
324
|
# Porównaj dwa dokumenty
|
|
325
325
|
comparison = compare_universal_documents("dokument1.pdf", "dokument2.png")
|
|
@@ -331,10 +331,10 @@ print(f"Ten sam rozmiar: {comparison['same_size']}")
|
|
|
331
331
|
### 3. Weryfikacja ID
|
|
332
332
|
|
|
333
333
|
```python
|
|
334
|
-
from
|
|
334
|
+
from docid import verify_document_id, verify_universal_document_id
|
|
335
335
|
|
|
336
336
|
# Weryfikacja ID dokumentu biznesowego
|
|
337
|
-
is_valid = verify_document_id("faktura.pdf", "
|
|
337
|
+
is_valid = verify_document_id("faktura.pdf", "DOC-FV-F0BE35240C77B2DB")
|
|
338
338
|
|
|
339
339
|
# Weryfikacja uniwersalnego ID
|
|
340
340
|
is_valid = verify_universal_document_id("dowolny_plik.jpg", "UNIV-IMG-4225A473A725978D")
|
|
@@ -344,7 +344,7 @@ is_valid = verify_universal_document_id("dowolny_plik.jpg", "UNIV-IMG-4225A473A7
|
|
|
344
344
|
|
|
345
345
|
```python
|
|
346
346
|
from pathlib import Path
|
|
347
|
-
from
|
|
347
|
+
from docid import generate_universal_document_id
|
|
348
348
|
|
|
349
349
|
# Przetwarzaj wszystkie pliki w folderze
|
|
350
350
|
documents_dir = Path("dokumenty")
|
|
@@ -372,16 +372,16 @@ for file_path in documents_dir.glob("*"):
|
|
|
372
372
|
**TAK!** Formaty PNG i JPG są w pełni przetwarzane przez OCR:
|
|
373
373
|
|
|
374
374
|
```python
|
|
375
|
-
from
|
|
375
|
+
from docid import process_document
|
|
376
376
|
|
|
377
377
|
# Przetwarzanie skanu PNG z OCR
|
|
378
378
|
result = process_document("skan_faktury.png")
|
|
379
|
-
print(result.document_id) #
|
|
379
|
+
print(result.document_id) # DOC-FV-F0BE35240C77B2DB
|
|
380
380
|
print(result.extraction.issuer_nip) # 5213017228
|
|
381
381
|
|
|
382
382
|
# Przetwarzanie zdjęcia JPG z OCR
|
|
383
383
|
result = process_document("zdjecie_paragonu.jpg")
|
|
384
|
-
print(result.document_id) # EXEF-PAR-8142B3FC69D7778C
|
|
384
|
+
print(result.document_id) # DOC-PAR-8142B3FC69D7778C
|
|
385
385
|
```
|
|
386
386
|
|
|
387
387
|
#### Co jest ekstrahowane z PNG/JPG:
|
|
@@ -402,13 +402,13 @@ print(result.document_id) # EXEF-PAR-8142B3FC69D7778C
|
|
|
402
402
|
|
|
403
403
|
```python
|
|
404
404
|
# Ten sam dokument w różnych formatach - ten sam ID biznesowy
|
|
405
|
-
generate_invoice_id(...) # ->
|
|
405
|
+
generate_invoice_id(...) # -> DOC-FV-F0BE35240C77B2DB
|
|
406
406
|
|
|
407
407
|
# Przetwarzanie przez OCR daje ten sam wynik
|
|
408
|
-
process_document("faktura.pdf") # ->
|
|
409
|
-
process_document("faktura.png") # ->
|
|
410
|
-
process_document("faktura.jpg") # ->
|
|
411
|
-
process_document("faktura.xml") # ->
|
|
408
|
+
process_document("faktura.pdf") # -> DOC-FV-F0BE35240C77B2DB
|
|
409
|
+
process_document("faktura.png") # -> DOC-FV-F0BE35240C77B2DB
|
|
410
|
+
process_document("faktura.jpg") # -> DOC-FV-F0BE35240C77B2DB
|
|
411
|
+
process_document("faktura.xml") # -> DOC-FV-F0BE35240C77B2DB
|
|
412
412
|
|
|
413
413
|
# Różne ID uniwersalne dla różnych formatów
|
|
414
414
|
generate_universal_document_id("faktura.pdf") # -> UNIV-PDF-...
|
|
@@ -451,25 +451,25 @@ TEST WSZYSTKICH FORMATÓW - PDF, PNG, JPG, HTML, TXT, XML
|
|
|
451
451
|
================================================================================
|
|
452
452
|
|
|
453
453
|
FOLDER: FAKTURY (invoices/)
|
|
454
|
-
📄 faktura_full.pdf (.pdf ) [ 2242B] ->
|
|
455
|
-
📄 faktura_full.xml (.xml ) [ 2077B] ->
|
|
456
|
-
📄 faktura_full.html (.html) [ 3334B] ->
|
|
457
|
-
📄 faktura_full.jpg (.jpg ) [ 28182B] ->
|
|
458
|
-
📄 faktura_full.png (.png ) [ 32325B] ->
|
|
459
|
-
📄 faktura_full.txt (.txt ) [ 2839B] ->
|
|
454
|
+
📄 faktura_full.pdf (.pdf ) [ 2242B] -> DOC-FV-F0BE35240C77B2DB
|
|
455
|
+
📄 faktura_full.xml (.xml ) [ 2077B] -> DOC-FV-F0BE35240C77B2DB
|
|
456
|
+
📄 faktura_full.html (.html) [ 3334B] -> DOC-FV-F0BE35240C77B2DB
|
|
457
|
+
📄 faktura_full.jpg (.jpg ) [ 28182B] -> DOC-FV-F0BE35240C77B2DB
|
|
458
|
+
📄 faktura_full.png (.png ) [ 32325B] -> DOC-FV-F0BE35240C77B2DB
|
|
459
|
+
📄 faktura_full.txt (.txt ) [ 2839B] -> DOC-FV-F0BE35240C77B2DB
|
|
460
460
|
|
|
461
461
|
📊 Podsumowanie folderu invoices:
|
|
462
462
|
Plików przetworzonych: 6
|
|
463
463
|
Unikalnych ID: 1
|
|
464
464
|
Wszystkie identyczne: True
|
|
465
|
-
✅ ID:
|
|
465
|
+
✅ ID: DOC-FV-F0BE35240C77B2DB
|
|
466
466
|
```
|
|
467
467
|
|
|
468
468
|
## 📁 Struktura projektu
|
|
469
469
|
|
|
470
470
|
```
|
|
471
471
|
docid/
|
|
472
|
-
├──
|
|
472
|
+
├── docid/ # Główny pakiet
|
|
473
473
|
│ ├── __init__.py # Eksporty API
|
|
474
474
|
│ ├── document_id.py # Generator ID biznesowy
|
|
475
475
|
│ ├── document_id_universal.py # Generator ID uniwersalny
|
|
@@ -493,7 +493,7 @@ docid/
|
|
|
493
493
|
### Silniki OCR
|
|
494
494
|
|
|
495
495
|
```python
|
|
496
|
-
from
|
|
496
|
+
from docid import OCREngine, get_pipeline
|
|
497
497
|
|
|
498
498
|
# Użyj PaddleOCR (domyślnie)
|
|
499
499
|
pipeline = get_pipeline(ocr_engine=OCREngine.PADDLE)
|
|
@@ -505,7 +505,7 @@ pipeline = get_pipeline(ocr_engine=OCREngine.TESSERACT)
|
|
|
505
505
|
### Custom prefix
|
|
506
506
|
|
|
507
507
|
```python
|
|
508
|
-
from
|
|
508
|
+
from docid import UniversalDocumentIDGenerator
|
|
509
509
|
|
|
510
510
|
generator = UniversalDocumentIDGenerator(prefix="MOJA")
|
|
511
511
|
doc_id = generator.generate_universal_id("plik.pdf")
|
|
@@ -549,8 +549,8 @@ MIT License - zobacz [LICENSE](LICENSE) dla szczegółów.
|
|
|
549
549
|
## 🆘 Wsparcie
|
|
550
550
|
|
|
551
551
|
- 📧 Email: info@softreck.dev
|
|
552
|
-
- 🐛 Issues: [GitHub Issues](https://github.com/softreck/
|
|
553
|
-
- 📖 Dokumentacja: [GitHub Wiki](https://github.com/softreck/
|
|
552
|
+
- 🐛 Issues: [GitHub Issues](https://github.com/softreck/doc-pl/issues)
|
|
553
|
+
- 📖 Dokumentacja: [GitHub Wiki](https://github.com/softreck/doc-pl/wiki)
|
|
554
554
|
|
|
555
555
|
## 🗺️ Roadmap
|
|
556
556
|
|
|
@@ -563,4 +563,4 @@ MIT License - zobacz [LICENSE](LICENSE) dla szczegółów.
|
|
|
563
563
|
|
|
564
564
|
---
|
|
565
565
|
|
|
566
|
-
**
|
|
566
|
+
**DOC Document ID Generator** - Deterministyczne identyfikatory dla każdego dokumentu! 🚀
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
docid/__init__.py,sha256=OQq5-zVwqX-TtbUnL3m2cAzbxDTJuiu3C_wivzBOcUY,2881
|
|
2
|
+
docid/cli.py,sha256=G8beRYj8ImzCvZPrHnIZkxORygCdsIE3O2_bHpJio5o,11109
|
|
3
|
+
docid/cli_universal.py,sha256=HlvLyYKtp3VBx0CEilSGoMDu912zUfo-1BGDzWyehJA,21649
|
|
4
|
+
docid/document_id.py,sha256=2sX4YTKGuazXdkNfBtj0buJMdwopcMk7UTSjr3oJOtQ,23490
|
|
5
|
+
docid/document_id_universal.py,sha256=M66ZjxtufVfxUQkQI4su2Ph3JGy0HEaO4U-8Osv_beQ,13873
|
|
6
|
+
docid/ocr_processor.py,sha256=eFD8m_LsKgUkFngGL_xAseeriBHhHj9ORVWScpxyP5A,19138
|
|
7
|
+
docid/pipeline.py,sha256=_UGjY9bImW__uJakUqLJztLSRPYi3IYZclR09dI3f4o,15435
|
|
8
|
+
docid/extractors/__init__.py,sha256=a2AS9aExd-EpOBp5eO3ZaUOmd0tP5sMSJ3QdVERrTAE,360
|
|
9
|
+
docid/extractors/base.py,sha256=l_8L2irgxOhm5MwM9URCA1IkKTzq0hl5pTSB8EWp_c0,17910
|
|
10
|
+
docid-0.1.5.dist-info/METADATA,sha256=7v6yXer0ztGR3jZcQqOs2TMqdZLzzRDdv7Pe8Z38e1A,16115
|
|
11
|
+
docid-0.1.5.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
12
|
+
docid-0.1.5.dist-info/entry_points.txt,sha256=HCEl6elisxSa8tftr3wYg5rtBdFoIhgQISLS-AMojF8,84
|
|
13
|
+
docid-0.1.5.dist-info/top_level.txt,sha256=rpBskDuRYygNthewc7JvtTt4f99WiZTyqgxOiedCBx8,6
|
|
14
|
+
docid-0.1.5.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
docid
|
docid-0.1.3.dist-info/RECORD
DELETED
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
exef_docid/__init__.py,sha256=fhG2gmFD4ZCOFNW6G_QZ-i-SQ5GDHKrTp8DWuMDBVOc,2889
|
|
2
|
-
exef_docid/cli.py,sha256=-04_ykTVBvlYN5-AnMH-OLVjoFADawJ5QzGWR_6NazI,11114
|
|
3
|
-
exef_docid/cli_universal.py,sha256=bcupHuuLG4ep_Rh9gTqQizuR9cQCAO7bNY6cWqzelg4,21650
|
|
4
|
-
exef_docid/document_id.py,sha256=lE9raTmmFa7xEnSfJPuNPM8yOlvixC5uF5gFO5S7GHU,23500
|
|
5
|
-
exef_docid/document_id_universal.py,sha256=M66ZjxtufVfxUQkQI4su2Ph3JGy0HEaO4U-8Osv_beQ,13873
|
|
6
|
-
exef_docid/ocr_processor.py,sha256=Ooy7X_EKd3wQUz4fNmD-oGIKYLauylrXAxQ9IfssmaI,19305
|
|
7
|
-
exef_docid/pipeline.py,sha256=-o-sPCqYEwzsA2R6llkKv61EbcQ8F2KhJVSuJus7tWU,15431
|
|
8
|
-
exef_docid/extractors/__init__.py,sha256=a2AS9aExd-EpOBp5eO3ZaUOmd0tP5sMSJ3QdVERrTAE,360
|
|
9
|
-
exef_docid/extractors/base.py,sha256=l_8L2irgxOhm5MwM9URCA1IkKTzq0hl5pTSB8EWp_c0,17910
|
|
10
|
-
docid-0.1.3.dist-info/METADATA,sha256=gtoHdagrVWN2Eu_Ii9DUWTgkRYrFQs_cxRWcf4w3t14,16206
|
|
11
|
-
docid-0.1.3.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
12
|
-
docid-0.1.3.dist-info/entry_points.txt,sha256=P85wntY_GMh6lwhXTMsWV2QwCLCgIRe1sbIpGoDxrQE,94
|
|
13
|
-
docid-0.1.3.dist-info/top_level.txt,sha256=wEXHg0mYQhhmZ0R3yymDasZhXfI7S0RpTxJ-hmdZ6Ww,11
|
|
14
|
-
docid-0.1.3.dist-info/RECORD,,
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
exef_docid
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|