docid 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,370 @@
1
+ """
2
+ Universal Document ID Generator for any document type including images, graphics, vectors
3
+ """
4
+
5
import hashlib
import json
import logging
import os
import time
from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import Any, Dict, Optional, Union
13
+
14
+ try:
15
+ import fitz # PyMuPDF
16
+ PYMUPDF_AVAILABLE = True
17
+ except ImportError:
18
+ PYMUPDF_AVAILABLE = False
19
+
20
+ try:
21
+ from PIL import Image, ImageChops, ImageStat
22
+ PIL_AVAILABLE = True
23
+ except ImportError:
24
+ PIL_AVAILABLE = False
25
+
26
+ try:
27
+ import numpy as np
28
+ NUMPY_AVAILABLE = True
29
+ except ImportError:
30
+ NUMPY_AVAILABLE = False
31
+
32
class DocumentType(Enum):
    """Universal document types.

    Member values are the short type codes that also appear as the middle
    segment of generated IDs (e.g. the "PDF"/"IMG" part of "UNIV-PDF-<hash>").
    """
    # NOTE(review): the generator below uses its own string `type_codes`
    # mapping rather than this enum — confirm the intended relationship.
    PDF = "PDF"
    IMAGE = "IMG"
    VECTOR = "VEC"
    MIXED = "MIX"
    UNKNOWN = "UNK"
39
+
40
@dataclass
class UniversalDocumentFeatures:
    """Universal features extracted from any document.

    All ``*_hash`` fields are truncated (16 hex chars) SHA-256 digests.
    Optional fields remain ``None`` when the corresponding feature could not
    be extracted for the given format.
    """
    file_type: str                             # e.g. "PDF", "IMAGE", or upper-cased file extension
    file_size: int                             # size in bytes (stat().st_size)
    content_hash: str                          # hash over per-format structural features
    visual_hash: Optional[str] = None          # perceptual hash of rendered/raster content
    text_hash: Optional[str] = None            # hash of extracted text (set for PDFs only)
    metadata_hash: Optional[str] = None        # hash of embedded metadata (set for PDFs only)
    structure_hash: Optional[str] = None       # reserved; never assigned by the visible code
    color_profile_hash: Optional[str] = None   # hash of the color histogram (images only)
    dimensions: Optional[tuple] = None         # (width, height) — pixels for images, points for PDFs
    page_count: Optional[int] = None           # number of pages (PDFs only)
    creation_time: Optional[float] = None      # stat().st_ctime — platform-dependent meaning
    modification_time: Optional[float] = None  # stat().st_mtime
55
+
56
class UniversalDocumentIDGenerator:
    """Universal document ID generator for any document format.

    Generated IDs have the form ``{prefix}-{type_code}-{hash16}``, where the
    hash is a truncated SHA-256 over a canonical feature string, so the same
    file always yields the same ID.
    """

    def __init__(self, prefix: str = "UNIV"):
        # Leading segment of every generated ID.
        self.prefix = prefix

    def _calculate_visual_hash(self, img: Any) -> Optional[str]:
        """
        Calculate a robust visual hash for image consistency across formats.

        Perceptual-like approach: grayscale -> pad to square -> resize to a
        fixed 32x32 grid -> threshold each pixel against the mean, then
        SHA-256 the resulting bit pattern.

        Args:
            img: a PIL image in any mode.

        Returns:
            A 16-hex-char hash string, or ``None`` when PIL is unavailable
            or hashing fails.
        """
        if not PIL_AVAILABLE:
            return None

        try:
            from PIL import Image, ImageOps

            # 1. Grayscale so color-mode differences don't affect the hash.
            if img.mode != 'L':
                img = img.convert('L')

            # 2. Pad to a square so different aspect ratios hash consistently.
            width, height = img.size
            max_side = max(width, height)
            img = ImageOps.pad(img, (max_side, max_side), color=255)  # white padding for grayscale

            # 3. Downscale to a small fixed grid.
            img_small = img.resize((32, 32), Image.Resampling.LANCZOS)

            # 4. Threshold every pixel against the mean brightness.
            pixels = list(img_small.getdata())
            avg = sum(pixels) / len(pixels)
            bits = "".join('1' if p >= avg else '0' for p in pixels)

            # 5. Hash the hex rendering of the bit string.
            hex_hash = hex(int(bits, 2))[2:].zfill(len(bits) // 4)
            return hashlib.sha256(hex_hash.encode()).hexdigest()[:16]
        except Exception as e:
            # BUG FIX: the original referenced an undefined module-level
            # `logger`, so any failure here raised NameError instead of
            # degrading gracefully to None.
            logging.getLogger(__name__).debug("Visual hash calculation failed: %s", e)
            return None

    def extract_pdf_features(self, file_path: Union[str, Path]) -> UniversalDocumentFeatures:
        """Extract features from a PDF document.

        Raises:
            ImportError: when PyMuPDF (fitz) is not installed.
        """
        if not PYMUPDF_AVAILABLE:
            raise ImportError("PyMuPDF (fitz) is required for PDF processing")

        doc = fitz.open(str(file_path))

        try:
            # Basic filesystem-level info.
            stat = Path(file_path).stat()
            file_size = stat.st_size
            creation_time = stat.st_ctime
            modification_time = stat.st_mtime

            # Per-page structural features plus the concatenated page text.
            # BUG FIX: original built text_content with `+=` in the loop
            # (quadratic on large documents); collect parts and join once.
            content_features = []
            text_parts = []

            for page_num, page in enumerate(doc):
                page_text = page.get_text()
                text_parts.append(page_text)
                content_features.append(f"page_{page_num}_text_length:{len(page_text)}")

                # Embedded image count.
                content_features.append(f"page_{page_num}_images:{len(page.get_images())}")

                # Page dimensions in PDF points.
                rect = page.rect
                content_features.append(f"page_{page_num}_size:{rect.width:.2f}x{rect.height:.2f}")

                # Vector drawing and font counts.
                content_features.append(f"page_{page_num}_drawings:{len(page.get_drawings())}")
                content_features.append(f"page_{page_num}_fonts:{len(page.get_fonts())}")

            text_content = "".join(text_parts)

            # Document metadata, serialized deterministically for hashing.
            metadata_str = json.dumps(doc.metadata, sort_keys=True)

            content_hash = hashlib.sha256('\n'.join(content_features).encode()).hexdigest()[:16]
            text_hash = hashlib.sha256(text_content.encode()).hexdigest()[:16] if text_content else None
            metadata_hash = hashlib.sha256(metadata_str.encode()).hexdigest()[:16]

            # Visual hash of the first page rendered as an image; best-effort.
            # BUG FIX: `Image.frombytes` requires PIL, but the original did
            # not check PIL_AVAILABLE and relied on a bare `except:` (which
            # also swallowed SystemExit/KeyboardInterrupt) to hide the
            # NameError.
            visual_hash = None
            if PIL_AVAILABLE and len(doc) > 0:
                try:
                    # Identity matrix == native 72 DPI rendering.
                    pix = doc[0].get_pixmap(matrix=fitz.Matrix(1.0, 1.0))
                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    visual_hash = self._calculate_visual_hash(img)
                except Exception:
                    pass  # rendering is optional; the ID still works without it

            return UniversalDocumentFeatures(
                file_type="PDF",
                file_size=file_size,
                content_hash=content_hash,
                text_hash=text_hash,
                metadata_hash=metadata_hash,
                visual_hash=visual_hash,
                dimensions=(doc[0].rect.width, doc[0].rect.height) if len(doc) > 0 else None,
                page_count=len(doc),
                creation_time=creation_time,
                modification_time=modification_time
            )

        finally:
            doc.close()

    def extract_image_features(self, file_path: Union[str, Path]) -> UniversalDocumentFeatures:
        """Extract features from a raster image file.

        Raises:
            ImportError: when PIL (Pillow) is not installed.
        """
        if not PIL_AVAILABLE:
            raise ImportError("PIL (Pillow) is required for image processing")

        with Image.open(file_path) as img:
            stat = Path(file_path).stat()
            file_size = stat.st_size
            creation_time = stat.st_ctime
            modification_time = stat.st_mtime

            # Convert to RGB for consistent processing across source modes.
            if img.mode != 'RGB':
                img = img.convert('RGB')

            # Basic features.  NOTE: `mode` is read after the conversion, so
            # it is always "RGB" here — kept as-is so existing IDs stay stable.
            dimensions = img.size
            mode = img.mode

            # Perceptual hash of the pixel content.
            visual_hash = self._calculate_visual_hash(img)

            # Color histogram hash.
            histogram = img.histogram()
            color_hash = hashlib.sha256(str(histogram).encode()).hexdigest()[:16]

            # Content hash derived from the combined feature set.
            content_features = [
                f"size:{dimensions[0]}x{dimensions[1]}",
                f"mode:{mode}",
                f"visual_hash:{visual_hash}",
                f"color_hash:{color_hash}",
                f"file_size:{file_size}"
            ]
            content_hash = hashlib.sha256('\n'.join(content_features).encode()).hexdigest()[:16]

            return UniversalDocumentFeatures(
                file_type="IMAGE",
                file_size=file_size,
                content_hash=content_hash,
                visual_hash=visual_hash,
                color_profile_hash=color_hash,
                dimensions=dimensions,
                creation_time=creation_time,
                modification_time=modification_time
            )

    def extract_generic_features(self, file_path: Union[str, Path]) -> UniversalDocumentFeatures:
        """Extract basic features from any file type (raw byte hash + stat)."""
        stat = Path(file_path).stat()
        file_size = stat.st_size
        creation_time = stat.st_ctime
        modification_time = stat.st_mtime

        # Hash of the raw file bytes; fall back to a size-derived hash when
        # the file cannot be read.
        # BUG FIX: narrowed the bare `except:` to OSError — the only realistic
        # failure of open/read — so programming errors are no longer hidden.
        try:
            with open(file_path, 'rb') as f:
                content_hash = hashlib.sha256(f.read()).hexdigest()[:16]
        except OSError:
            content_hash = hashlib.sha256(str(file_size).encode()).hexdigest()[:16]

        # Use the upper-cased extension (without the dot) as the type label.
        file_ext = Path(file_path).suffix.lower()

        return UniversalDocumentFeatures(
            file_type=file_ext.upper().replace('.', ''),
            file_size=file_size,
            content_hash=content_hash,
            creation_time=creation_time,
            modification_time=modification_time
        )

    def get_document_features(self, file_path: Union[str, Path]) -> UniversalDocumentFeatures:
        """Dispatch feature extraction based on the file extension.

        Raises:
            FileNotFoundError: when *file_path* does not exist.
        """
        file_path = Path(file_path)

        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        file_ext = file_path.suffix.lower()

        if file_ext == '.pdf':
            return self.extract_pdf_features(file_path)
        elif file_ext in ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'):
            return self.extract_image_features(file_path)
        else:
            return self.extract_generic_features(file_path)

    def generate_universal_id(self, file_path: Union[str, Path]) -> str:
        """Generate a universal document ID of the form ``PREFIX-TYPE-HASH``."""
        features = self.get_document_features(file_path)

        # Canonical, order-stable feature string; missing features become "".
        # NOTE(review): including st_ctime/st_mtime means a byte-identical
        # copy of a file gets a different ID — confirm this is intended.
        canonical_data = [
            features.file_type,
            str(features.file_size),
            features.content_hash,
            features.visual_hash or "",
            features.text_hash or "",
            features.metadata_hash or "",
            features.structure_hash or "",
            features.color_profile_hash or "",
            str(features.dimensions) if features.dimensions else "",
            str(features.page_count) if features.page_count else "",
            str(int(features.creation_time)) if features.creation_time else "",
            str(int(features.modification_time)) if features.modification_time else ""
        ]

        canonical_string = "|".join(canonical_data)

        # 16 upper-case hex chars of the SHA-256 digest.
        hash_value = hashlib.sha256(canonical_string.encode()).hexdigest()[:16].upper()

        # Map extractor file types / raster extensions to short type codes;
        # anything else falls back to the first three chars of the file type.
        type_codes = {
            'PDF': 'PDF',
            'IMAGE': 'IMG',
            'JPG': 'IMG',
            'JPEG': 'IMG',
            'PNG': 'IMG',
            'GIF': 'IMG',
            'BMP': 'IMG',
            'TIFF': 'IMG',
            'WEBP': 'IMG',
        }

        type_code = type_codes.get(features.file_type, features.file_type[:3].upper())

        return f"{self.prefix}-{type_code}-{hash_value}"

    def verify_universal_id(self, file_path: Union[str, Path], document_id: str) -> bool:
        """Return True iff regenerating the ID for *file_path* matches *document_id*."""
        # BUG FIX: bare `except:` replaced with `except Exception` so
        # SystemExit/KeyboardInterrupt are no longer swallowed.
        try:
            return self.generate_universal_id(file_path) == document_id
        except Exception:
            # Missing / unreadable file counts as "does not verify".
            return False

    def parse_universal_id(self, document_id: str) -> Dict[str, Any]:
        """Split a universal ID into its prefix, type code and hash.

        Raises:
            ValueError: when *document_id* is not of the form ``P-T-H``.
        """
        parts = document_id.split('-')
        if len(parts) != 3:
            raise ValueError(f"Invalid universal document ID format: {document_id}")

        prefix, type_code, hash_value = parts

        return {
            'prefix': prefix,
            'type_code': type_code,
            'hash': hash_value,
            'document_type': type_code
        }

    def compare_documents(self, file_path1: Union[str, Path], file_path2: Union[str, Path]) -> Dict[str, Any]:
        """Compare two documents and report which features match.

        ``same_visual_hash`` / ``same_text_hash`` stay ``None`` when either
        document lacks the corresponding feature.
        """
        features1 = self.get_document_features(file_path1)
        features2 = self.get_document_features(file_path2)

        id1 = self.generate_universal_id(file_path1)
        id2 = self.generate_universal_id(file_path2)

        comparison = {
            'identical_ids': id1 == id2,
            'id1': id1,
            'id2': id2,
            'same_type': features1.file_type == features2.file_type,
            'same_size': features1.file_size == features2.file_size,
            'same_content_hash': features1.content_hash == features2.content_hash,
            'same_visual_hash': None,
            'same_text_hash': None
        }

        if features1.visual_hash and features2.visual_hash:
            comparison['same_visual_hash'] = features1.visual_hash == features2.visual_hash

        if features1.text_hash and features2.text_hash:
            comparison['same_text_hash'] = features1.text_hash == features2.text_hash

        return comparison
355
+
356
+ # Convenience functions
357
def generate_universal_document_id(file_path: Union[str, Path]) -> str:
    """Module-level convenience wrapper: build a default generator and
    return the universal ID for *file_path*."""
    return UniversalDocumentIDGenerator().generate_universal_id(file_path)
361
+
362
def verify_universal_document_id(file_path: Union[str, Path], document_id: str) -> bool:
    """Module-level convenience wrapper: check *document_id* against the ID
    regenerated from *file_path* using a default generator."""
    return UniversalDocumentIDGenerator().verify_universal_id(file_path, document_id)
366
+
367
def compare_universal_documents(file_path1: Union[str, Path], file_path2: Union[str, Path]) -> Dict[str, Any]:
    """Module-level convenience wrapper: compare two documents with a
    default generator and return its feature-by-feature report."""
    return UniversalDocumentIDGenerator().compare_documents(file_path1, file_path2)
@@ -0,0 +1,21 @@
1
+ """
2
+ Ekstraktory danych z dokumentów.
3
+ """
4
+
5
+ from .base import (
6
+ ContractExtractor,
7
+ DocumentCategory,
8
+ DocumentExtractor,
9
+ ExtractionResult,
10
+ InvoiceExtractor,
11
+ ReceiptExtractor,
12
+ )
13
+
14
+ __all__ = [
15
+ 'DocumentExtractor',
16
+ 'InvoiceExtractor',
17
+ 'ReceiptExtractor',
18
+ 'ContractExtractor',
19
+ 'ExtractionResult',
20
+ 'DocumentCategory',
21
+ ]