docid-0.1.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docid-0.1.3.dist-info/METADATA +566 -0
- docid-0.1.3.dist-info/RECORD +14 -0
- docid-0.1.3.dist-info/WHEEL +5 -0
- docid-0.1.3.dist-info/entry_points.txt +3 -0
- docid-0.1.3.dist-info/top_level.txt +1 -0
- exef_docid/__init__.py +129 -0
- exef_docid/cli.py +340 -0
- exef_docid/cli_universal.py +517 -0
- exef_docid/document_id.py +720 -0
- exef_docid/document_id_universal.py +370 -0
- exef_docid/extractors/__init__.py +21 -0
- exef_docid/extractors/base.py +508 -0
- exef_docid/ocr_processor.py +579 -0
- exef_docid/pipeline.py +431 -0
@@ -0,0 +1,370 @@
+"""
+Universal Document ID Generator for any document type including images, graphics, vectors
+"""
+
+import hashlib
+import json
+import logging
+from pathlib import Path
+from typing import Dict, Any, Optional, Union
+from dataclasses import dataclass
+from enum import Enum
+import os
+import time
+
+try:
+    import fitz  # PyMuPDF
+    PYMUPDF_AVAILABLE = True
+except ImportError:
+    PYMUPDF_AVAILABLE = False
+
+try:
+    from PIL import Image, ImageChops, ImageStat
+    PIL_AVAILABLE = True
+except ImportError:
+    PIL_AVAILABLE = False
+
+try:
+    import numpy as np
+    NUMPY_AVAILABLE = True
+except ImportError:
+    NUMPY_AVAILABLE = False
+
+# Module logger (used by _calculate_visual_hash's error path)
+logger = logging.getLogger(__name__)
+
+class DocumentType(Enum):
+    """Universal document types"""
+    PDF = "PDF"
+    IMAGE = "IMG"
+    VECTOR = "VEC"
+    MIXED = "MIX"
+    UNKNOWN = "UNK"
+
+@dataclass
+class UniversalDocumentFeatures:
+    """Universal features extracted from any document"""
+    file_type: str
+    file_size: int
+    content_hash: str
+    visual_hash: Optional[str] = None
+    text_hash: Optional[str] = None
+    metadata_hash: Optional[str] = None
+    structure_hash: Optional[str] = None
+    color_profile_hash: Optional[str] = None
+    dimensions: Optional[tuple] = None
+    page_count: Optional[int] = None
+    creation_time: Optional[float] = None
+    modification_time: Optional[float] = None
+
+class UniversalDocumentIDGenerator:
+    """Universal document ID generator for any document format"""
+
+    def __init__(self, prefix: str = "UNIV"):
+        self.prefix = prefix
+
+    def _calculate_visual_hash(self, img: Any) -> Optional[str]:
+        """
+        Calculate robust visual hash for image consistency across formats.
+        Uses perceptual-like approach: grayscale -> resize -> normalize.
+        """
+        if not PIL_AVAILABLE:
+            return None
+
+        try:
+            from PIL import Image, ImageOps
+            # 1. Convert to grayscale
+            if img.mode != 'L':
+                img = img.convert('L')
+
+            # 2. Pad to square to handle different aspect ratios consistently
+            width, height = img.size
+            max_side = max(width, height)
+            img = ImageOps.pad(img, (max_side, max_side), color=255)  # White padding for grayscale
+
+            # 3. Resize to small fixed size (32x32)
+            img_small = img.resize((32, 32), Image.Resampling.LANCZOS)
+
+            # 4. Get pixel data and calculate average
+            pixels = list(img_small.getdata())
+            avg = sum(pixels) / len(pixels)
+
+            # 5. Generate bit string
+            bits = "".join(['1' if p >= avg else '0' for p in pixels])
+
+            # 6. Convert bits to hex
+            hex_hash = hex(int(bits, 2))[2:].zfill(len(bits) // 4)
+            return hashlib.sha256(hex_hash.encode()).hexdigest()[:16]
+        except Exception as e:
+            logger.debug(f"Visual hash calculation failed: {e}")
+            return None
+
+    def extract_pdf_features(self, file_path: Union[str, Path]) -> UniversalDocumentFeatures:
+        """Extract features from PDF documents"""
+        if not PYMUPDF_AVAILABLE:
+            raise ImportError("PyMuPDF (fitz) is required for PDF processing")
+
+        doc = fitz.open(str(file_path))
+
+        try:
+            # Basic file info
+            stat = Path(file_path).stat()
+            file_size = stat.st_size
+            creation_time = stat.st_ctime
+            modification_time = stat.st_mtime
+
+            # Content features
+            content_features = []
+            text_content = ""
+
+            for page_num in range(len(doc)):
+                page = doc[page_num]
+
+                # Extract text
+                page_text = page.get_text()
+                text_content += page_text
+                content_features.append(f"page_{page_num}_text_length:{len(page_text)}")
+
+                # Extract images info
+                image_list = page.get_images()
+                content_features.append(f"page_{page_num}_images:{len(image_list)}")
+
+                # Extract page dimensions
+                rect = page.rect
+                content_features.append(f"page_{page_num}_size:{rect.width:.2f}x{rect.height:.2f}")
+
+                # Extract drawings/vectors
+                drawings = page.get_drawings()
+                content_features.append(f"page_{page_num}_drawings:{len(drawings)}")
+
+                # Extract font information
+                font_info = page.get_fonts()
+                content_features.append(f"page_{page_num}_fonts:{len(font_info)}")
+
+            # Metadata
+            metadata = doc.metadata
+            metadata_str = json.dumps(metadata, sort_keys=True)
+
+            # Calculate hashes
+            content_hash = hashlib.sha256('\n'.join(content_features).encode()).hexdigest()[:16]
+            text_hash = hashlib.sha256(text_content.encode()).hexdigest()[:16] if text_content else None
+            metadata_hash = hashlib.sha256(metadata_str.encode()).hexdigest()[:16]
+
+            # Visual hash (first page rendered as image)
+            visual_hash = None
+            try:
+                first_page = doc[0]
+                # Render at fixed resolution (e.g. 72 DPI)
+                pix = first_page.get_pixmap(matrix=fitz.Matrix(1.0, 1.0))
+                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+                visual_hash = self._calculate_visual_hash(img)
+            except Exception:
+                pass
+
+            return UniversalDocumentFeatures(
+                file_type="PDF",
+                file_size=file_size,
+                content_hash=content_hash,
+                text_hash=text_hash,
+                metadata_hash=metadata_hash,
+                visual_hash=visual_hash,
+                dimensions=(doc[0].rect.width, doc[0].rect.height) if len(doc) > 0 else None,
+                page_count=len(doc),
+                creation_time=creation_time,
+                modification_time=modification_time
+            )
+
+        finally:
+            doc.close()
+
+    def extract_image_features(self, file_path: Union[str, Path]) -> UniversalDocumentFeatures:
+        """Extract features from image files"""
+        if not PIL_AVAILABLE:
+            raise ImportError("PIL (Pillow) is required for image processing")
+
+        with Image.open(file_path) as img:
+            stat = Path(file_path).stat()
+            file_size = stat.st_size
+            creation_time = stat.st_ctime
+            modification_time = stat.st_mtime
+
+            # Convert to RGB for consistent processing
+            if img.mode != 'RGB':
+                img = img.convert('RGB')
+
+            # Basic features
+            dimensions = img.size
+            mode = img.mode
+
+            # Visual hash
+            visual_hash = self._calculate_visual_hash(img)
+
+            # Color histogram hash
+            histogram = img.histogram()
+            color_hash = hashlib.sha256(str(histogram).encode()).hexdigest()[:16]
+
+            # Content hash based on multiple features
+            content_features = [
+                f"size:{dimensions[0]}x{dimensions[1]}",
+                f"mode:{mode}",
+                f"visual_hash:{visual_hash}",
+                f"color_hash:{color_hash}",
+                f"file_size:{file_size}"
+            ]
+            content_hash = hashlib.sha256('\n'.join(content_features).encode()).hexdigest()[:16]
+
+            return UniversalDocumentFeatures(
+                file_type="IMAGE",
+                file_size=file_size,
+                content_hash=content_hash,
+                visual_hash=visual_hash,
+                color_profile_hash=color_hash,
+                dimensions=dimensions,
+                creation_time=creation_time,
+                modification_time=modification_time
+            )
+
+    def extract_generic_features(self, file_path: Union[str, Path]) -> UniversalDocumentFeatures:
+        """Extract basic features from any file type"""
+        stat = Path(file_path).stat()
+        file_size = stat.st_size
+        creation_time = stat.st_ctime
+        modification_time = stat.st_mtime
+
+        # Basic file content hash
+        try:
+            with open(file_path, 'rb') as f:
+                content_hash = hashlib.sha256(f.read()).hexdigest()[:16]
+        except Exception:
+            content_hash = hashlib.sha256(str(file_size).encode()).hexdigest()[:16]
+
+        # File extension
+        file_ext = Path(file_path).suffix.lower()
+
+        return UniversalDocumentFeatures(
+            file_type=file_ext.upper().replace('.', ''),
+            file_size=file_size,
+            content_hash=content_hash,
+            creation_time=creation_time,
+            modification_time=modification_time
+        )
+
+    def get_document_features(self, file_path: Union[str, Path]) -> UniversalDocumentFeatures:
+        """Extract features based on file type"""
+        file_path = Path(file_path)
+
+        if not file_path.exists():
+            raise FileNotFoundError(f"File not found: {file_path}")
+
+        file_ext = file_path.suffix.lower()
+
+        if file_ext == '.pdf':
+            return self.extract_pdf_features(file_path)
+        elif file_ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp']:
+            return self.extract_image_features(file_path)
+        else:
+            return self.extract_generic_features(file_path)
+
+    def generate_universal_id(self, file_path: Union[str, Path]) -> str:
+        """Generate universal document ID"""
+        features = self.get_document_features(file_path)
+
+        # Create canonical data string
+        canonical_data = [
+            features.file_type,
+            str(features.file_size),
+            features.content_hash,
+            features.visual_hash or "",
+            features.text_hash or "",
+            features.metadata_hash or "",
+            features.structure_hash or "",
+            features.color_profile_hash or "",
+            str(features.dimensions) if features.dimensions else "",
+            str(features.page_count) if features.page_count else "",
+            str(int(features.creation_time)) if features.creation_time else "",
+            str(int(features.modification_time)) if features.modification_time else ""
+        ]
+
+        canonical_string = "|".join(canonical_data)
+
+        # Generate hash
+        hash_value = hashlib.sha256(canonical_string.encode()).hexdigest()[:16].upper()
+
+        # Determine document type code
+        type_codes = {
+            'PDF': 'PDF',
+            'IMAGE': 'IMG',
+            'JPG': 'IMG',
+            'JPEG': 'IMG',
+            'PNG': 'IMG',
+            'GIF': 'IMG',
+            'BMP': 'IMG',
+            'TIFF': 'IMG',
+            'WEBP': 'IMG',
+        }
+
+        type_code = type_codes.get(features.file_type, features.file_type[:3].upper())
+
+        return f"{self.prefix}-{type_code}-{hash_value}"
+
+    def verify_universal_id(self, file_path: Union[str, Path], document_id: str) -> bool:
+        """Verify universal document ID"""
+        try:
+            generated_id = self.generate_universal_id(file_path)
+            return generated_id == document_id
+        except Exception:
+            return False
+
+    def parse_universal_id(self, document_id: str) -> Dict[str, Any]:
+        """Parse universal document ID"""
+        parts = document_id.split('-')
+        if len(parts) != 3:
+            raise ValueError(f"Invalid universal document ID format: {document_id}")
+
+        prefix, type_code, hash_value = parts
+
+        return {
+            'prefix': prefix,
+            'type_code': type_code,
+            'hash': hash_value,
+            'document_type': type_code
+        }
+
+    def compare_documents(self, file_path1: Union[str, Path], file_path2: Union[str, Path]) -> Dict[str, Any]:
+        """Compare two documents"""
+        features1 = self.get_document_features(file_path1)
+        features2 = self.get_document_features(file_path2)
+
+        id1 = self.generate_universal_id(file_path1)
+        id2 = self.generate_universal_id(file_path2)
+
+        comparison = {
+            'identical_ids': id1 == id2,
+            'id1': id1,
+            'id2': id2,
+            'same_type': features1.file_type == features2.file_type,
+            'same_size': features1.file_size == features2.file_size,
+            'same_content_hash': features1.content_hash == features2.content_hash,
+            'same_visual_hash': None,
+            'same_text_hash': None
+        }
+
+        if features1.visual_hash and features2.visual_hash:
+            comparison['same_visual_hash'] = features1.visual_hash == features2.visual_hash
+
+        if features1.text_hash and features2.text_hash:
+            comparison['same_text_hash'] = features1.text_hash == features2.text_hash
+
+        return comparison
+
+# Convenience functions
+def generate_universal_document_id(file_path: Union[str, Path]) -> str:
+    """Generate universal document ID"""
+    generator = UniversalDocumentIDGenerator()
+    return generator.generate_universal_id(file_path)
+
+def verify_universal_document_id(file_path: Union[str, Path], document_id: str) -> bool:
+    """Verify universal document ID"""
+    generator = UniversalDocumentIDGenerator()
+    return generator.verify_universal_id(file_path, document_id)
+
+def compare_universal_documents(file_path1: Union[str, Path], file_path2: Union[str, Path]) -> Dict[str, Any]:
+    """Compare two universal documents"""
+    generator = UniversalDocumentIDGenerator()
+    return generator.compare_documents(file_path1, file_path2)
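To make the module above concrete, here is a minimal usage sketch. It assumes the wheel is installed (the code imports as exef_docid, per top_level.txt) and that Pillow and PyMuPDF are present; sample.pdf is a placeholder file name, not something shipped with the package.

# Usage sketch for document_id_universal (sample.pdf is a placeholder).
from exef_docid.document_id_universal import (
    UniversalDocumentIDGenerator,
    generate_universal_document_id,
    verify_universal_document_id,
)

doc_id = generate_universal_document_id("sample.pdf")
print(doc_id)  # e.g. UNIV-PDF-<16 uppercase hex chars>

# A freshly generated ID verifies against the same, unmodified file.
assert verify_universal_document_id("sample.pdf", doc_id)

# Split the ID back into its three dash-separated components.
parts = UniversalDocumentIDGenerator().parse_universal_id(doc_id)
print(parts['prefix'], parts['type_code'], parts['hash'])

Note that the canonical string folds in st_ctime and st_mtime, so a byte-identical copy of a file will generally receive a different ID than the original; compare_documents exposes the timestamp-independent content, visual, and text hash comparisons for exactly that case.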
@@ -0,0 +1,21 @@
+"""
+Data extractors for documents.
+"""
+
+from .base import (
+    ContractExtractor,
+    DocumentCategory,
+    DocumentExtractor,
+    ExtractionResult,
+    InvoiceExtractor,
+    ReceiptExtractor,
+)
+
+__all__ = [
+    'DocumentExtractor',
+    'InvoiceExtractor',
+    'ReceiptExtractor',
+    'ContractExtractor',
+    'ExtractionResult',
+    'DocumentCategory',
+]
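For orientation, the extractor classes re-exported here are defined in exef_docid/extractors/base.py (added in this same release, 508 lines); their constructors and methods are not visible in this hunk, so the sketch below deliberately stops at the confirmed import surface.

# Import sketch: only these names are confirmed by this __init__;
# the extractors' actual call signatures live in exef_docid/extractors/base.py.
from exef_docid.extractors import (
    DocumentCategory,
    DocumentExtractor,
    ExtractionResult,
    InvoiceExtractor,
    ReceiptExtractor,
)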