xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,649 @@
1
+ # xgen_doc2chunk/core/functions/img_processor.py
2
+ """
3
+ Image Processing Module
4
+
5
+ Provides functionality to save image data to various storage backends
6
+ and convert to tag format. Uses Strategy pattern for storage backends.
7
+
8
+ This is the BASE class for all image processors.
9
+ Format-specific processors (PDFImageProcessor, DOCXImageProcessor, etc.)
10
+ should inherit from ImageProcessor and override process_image() method.
11
+
12
+ Main Features:
13
+ - Base ImageProcessor class with pluggable storage backend
14
+ - Save image data to specified storage (Local, MinIO, S3, etc.)
15
+ - Return saved path in custom tag format
16
+ - Duplicate image detection and handling
17
+ - Support for various image formats
18
+ - Extensible for format-specific processing
19
+
20
+ Usage Example:
21
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
22
+ from xgen_doc2chunk.core.functions.storage_backend import (
23
+ LocalStorageBackend,
24
+ MinIOStorageBackend,
25
+ )
26
+
27
+ # Use with default settings (local storage)
28
+ processor = ImageProcessor()
29
+ tag = processor.save_image(image_bytes)
30
+ # Result: "[Image:temp/images/abc123.png]"
31
+
32
+ # Use with MinIO storage (when implemented)
33
+ minio_backend = MinIOStorageBackend(endpoint="localhost:9000", bucket="images")
34
+ processor = ImageProcessor(storage_backend=minio_backend)
35
+
36
+ # Custom tag format
37
+ processor = ImageProcessor(
38
+ directory_path="output/images",
39
+ tag_prefix="<img src='",
40
+ tag_suffix="'>"
41
+ )
42
+ tag = processor.save_image(image_bytes)
43
+ # Result: "<img src='output/images/abc123.png'>"
44
+
45
+ # Inherit for format-specific processing
46
+ class PDFImageProcessor(ImageProcessor):
47
+ def process_image(self, image_data: bytes, **kwargs) -> Optional[str]:
48
+ xref = kwargs.get('xref')
49
+ custom_name = f"pdf_xref_{xref}" if xref else None
50
+ return self.save_image(image_data, custom_name=custom_name)
51
+ """
52
+ import hashlib
53
+ import io
54
+ import logging
55
+ import os
56
+ import uuid
57
+ from dataclasses import dataclass, field
58
+ from enum import Enum
59
+ from pathlib import Path
60
+ from typing import Any, Dict, List, Optional, Set, Union
61
+
62
+ from xgen_doc2chunk.core.functions.storage_backend import (
63
+ BaseStorageBackend,
64
+ LocalStorageBackend,
65
+ StorageType,
66
+ get_default_backend,
67
+ )
68
+
69
+ logger = logging.getLogger("xgen_doc2chunk.image_processor")
70
+
71
+
72
class ImageFormat(Enum):
    """Image container formats the processor can name and detect.

    Each member's value is the lowercase file extension (without the dot)
    used when a filename is generated for that format.
    """

    # Raster formats recognised by magic-byte detection.
    PNG = "png"
    JPEG = "jpeg"
    JPG = "jpg"  # distinct member so a ".jpg" suffix is recognised too
    GIF = "gif"
    BMP = "bmp"
    WEBP = "webp"
    TIFF = "tiff"

    # Sentinel returned when the format cannot be determined.
    UNKNOWN = "unknown"
82
+
83
+
84
class NamingStrategy(Enum):
    """How a filename is derived for an image being saved."""

    # Name derived from a hash of the image bytes, so identical content
    # always maps to the same filename.
    HASH = "hash"
    # Name derived from a freshly generated UUID.
    UUID = "uuid"
    # Name derived from a per-processor running counter.
    SEQUENTIAL = "sequential"
    # Name derived from the current time.
    TIMESTAMP = "timestamp"
90
+
91
+
92
@dataclass
class ImageProcessorConfig:
    """
    ImageProcessor Configuration.

    Attributes:
        directory_path: Directory path or bucket prefix for saving images
        tag_prefix: Tag prefix (e.g., "[Image:")
        tag_suffix: Tag suffix (e.g., "]")
        naming_strategy: File naming strategy (enum member or its string value)
        default_format: Default image format (enum member or its string value)
        create_directory: Auto-create directory if not exists
        use_absolute_path: Use absolute path in tags
        hash_algorithm: Hash algorithm name passed to ``hashlib.new``
        max_filename_length: Maximum length of generated filenames

    Raises:
        ValueError: If ``naming_strategy`` or ``default_format`` is a string
            that does not match any enum value.
    """
    directory_path: str = "temp/images"
    tag_prefix: str = "[Image:"
    tag_suffix: str = "]"
    naming_strategy: NamingStrategy = NamingStrategy.HASH
    default_format: ImageFormat = ImageFormat.PNG
    create_directory: bool = True
    use_absolute_path: bool = False
    hash_algorithm: str = "sha256"
    max_filename_length: int = 64

    def __post_init__(self) -> None:
        """Coerce string values to their enum members.

        ImageProcessor accepts a strategy given as a string; doing the same
        here keeps a directly constructed config from carrying a raw string
        that would never compare equal to a NamingStrategy member (and would
        therefore be silently treated as the hash strategy).
        """
        if isinstance(self.naming_strategy, str):
            self.naming_strategy = NamingStrategy(self.naming_strategy.lower())
        if isinstance(self.default_format, str):
            self.default_format = ImageFormat(self.default_format.lower())
117
+
118
+
119
class ImageProcessor:
    """
    Base Image Processing Class.

    Saves image data using a pluggable storage backend and returns
    the saved path in the specified tag format.

    This is the BASE CLASS for all format-specific image processors.
    Subclasses should override process_image() for format-specific handling.

    Args:
        directory_path: Image save directory (default: "temp/images")
        tag_prefix: Tag prefix (default: "[Image:")
        tag_suffix: Tag suffix (default: "]")
        naming_strategy: File naming strategy (default: HASH)
        storage_backend: Storage backend instance (default: result of
            get_default_backend())
        config: ImageProcessorConfig object (takes precedence over the
            individual directory/tag/naming arguments)

    Examples:
        >>> # Default usage
        >>> processor = ImageProcessor()
        >>> tag = processor.save_image(image_bytes)
        "[Image:temp/images/a1b2c3d4.png]"

        >>> # Custom directory and tags
        >>> processor = ImageProcessor(
        ...     directory_path="images",
        ...     tag_prefix="![image](",
        ...     tag_suffix=")"
        ... )
        >>> tag = processor.save_image(image_bytes)
        "![image](images/a1b2c3d4.png)"

        >>> # Subclass for format-specific processing
        >>> class PDFImageProcessor(ImageProcessor):
        ...     def process_image(self, image_data, **kwargs):
        ...         xref = kwargs.get('xref')
        ...         return self.save_image(image_data, custom_name=f"pdf_{xref}")
    """

    # Image modes Pillow's JPEG encoder accepts; any other mode (RGBA, P,
    # LA, ...) has to be converted before saving as JPEG.
    _JPEG_SAFE_MODES = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    def __init__(
        self,
        directory_path: str = "temp/images",
        tag_prefix: str = "[Image:",
        tag_suffix: str = "]",
        naming_strategy: Union[NamingStrategy, str] = NamingStrategy.HASH,
        storage_backend: Optional[BaseStorageBackend] = None,
        config: Optional[ImageProcessorConfig] = None,
    ):
        # An explicit config wins over the individual keyword arguments.
        if config is not None:
            self.config = config
        else:
            if isinstance(naming_strategy, str):
                naming_strategy = NamingStrategy(naming_strategy.lower())

            self.config = ImageProcessorConfig(
                directory_path=directory_path,
                tag_prefix=tag_prefix,
                tag_suffix=tag_suffix,
                naming_strategy=naming_strategy,
            )

        # Fall back to the module's default backend when none is supplied.
        self._storage_backend = storage_backend or get_default_backend()

        # content hash -> saved file path (for duplicate prevention)
        self._processed_hashes: Dict[str, str] = {}

        # Running counter for the SEQUENTIAL naming strategy
        self._sequential_counter: int = 0

        # Last millisecond value used by the TIMESTAMP naming strategy; kept
        # so two saves in the same millisecond still get distinct names.
        self._last_timestamp_ms: int = 0

        self._logger = logging.getLogger("xgen_doc2chunk.image_processor.ImageProcessor")

        # Let the backend prepare the target location up front if requested.
        if self.config.create_directory:
            self._ensure_storage_ready()

    @property
    def storage_backend(self) -> BaseStorageBackend:
        """Get the current storage backend."""
        return self._storage_backend

    @storage_backend.setter
    def storage_backend(self, backend: BaseStorageBackend) -> None:
        """
        Set storage backend.

        The new backend is asked to prepare the configured directory when
        ``config.create_directory`` is enabled.

        Args:
            backend: New storage backend instance
        """
        self._storage_backend = backend
        if self.config.create_directory:
            self._ensure_storage_ready()

    @property
    def storage_type(self) -> StorageType:
        """Get the current storage type as reported by the backend."""
        return self._storage_backend.storage_type

    def _ensure_storage_ready(self) -> None:
        """Ask the backend to make the configured directory usable."""
        self._storage_backend.ensure_ready(self.config.directory_path)

    def _compute_hash(self, data: bytes) -> str:
        """Return the first 32 hex characters of the configured hash of *data*."""
        hasher = hashlib.new(self.config.hash_algorithm)
        hasher.update(data)
        return hasher.hexdigest()[:32]

    def _detect_format(self, data: bytes) -> ImageFormat:
        """Detect format from image data using magic bytes.

        Returns ImageFormat.UNKNOWN when fewer than 12 bytes are available
        (the WEBP check needs bytes 8-12) or no signature matches.
        """
        if len(data) < 12:
            return ImageFormat.UNKNOWN

        if data[:8] == b'\x89PNG\r\n\x1a\n':
            return ImageFormat.PNG
        if data[:2] == b'\xff\xd8':
            return ImageFormat.JPEG
        if data[:6] in (b'GIF87a', b'GIF89a'):
            return ImageFormat.GIF
        if data[:2] == b'BM':
            return ImageFormat.BMP
        if data[:4] == b'RIFF' and data[8:12] == b'WEBP':
            return ImageFormat.WEBP
        if data[:4] in (b'II*\x00', b'MM\x00*'):
            return ImageFormat.TIFF
        return ImageFormat.UNKNOWN

    @staticmethod
    def _sanitize_custom_name(custom_name: str) -> str:
        """Return *custom_name* as a safe relative path.

        Names may originate from inside a processed document, so they are
        treated as untrusted: backslashes are normalised to "/", any drive
        or UNC prefix is removed, and empty, "." and ".." components are
        dropped. Ordinary sub-directories in the name are preserved.

        Returns:
            The cleaned relative name, or "" if nothing usable remains.
        """
        import ntpath  # Windows path rules, available on every platform

        normalized = custom_name.replace("\\", "/")
        _, tail = ntpath.splitdrive(normalized)
        parts = [part for part in tail.split("/") if part not in ("", ".", "..")]
        return "/".join(parts)

    def _generate_filename(
        self,
        data: bytes,
        image_format: ImageFormat,
        custom_name: Optional[str] = None
    ) -> str:
        """Generate filename based on naming strategy.

        Args:
            data: Image binary data (hashed for the HASH strategy)
            image_format: Detected format; UNKNOWN falls back to
                ``config.default_format`` for the extension
            custom_name: Optional caller-supplied name. It is sanitised so
                it cannot point outside the configured directory; an image
                extension is appended only if the name does not already end
                with one.

        Returns:
            Filename (possibly with sub-directories when *custom_name*
            contained them), relative to the configured directory.
        """
        ext = (image_format.value if image_format != ImageFormat.UNKNOWN
               else self.config.default_format.value)

        if custom_name:
            safe_name = self._sanitize_custom_name(custom_name)
            if safe_name:
                known_suffixes = tuple(
                    f".{fmt.value}" for fmt in ImageFormat
                    if fmt != ImageFormat.UNKNOWN
                )
                if safe_name.lower().endswith(known_suffixes):
                    return safe_name
                return f"{safe_name}.{ext}"
            # Nothing usable was left (e.g. the name was only ".."), so
            # fall through to the configured naming strategy.
            self._logger.warning(
                "Unusable custom image name %r; using naming strategy instead",
                custom_name,
            )

        strategy = self.config.naming_strategy

        if strategy == NamingStrategy.UUID:
            base = str(uuid.uuid4())[:16]
        elif strategy == NamingStrategy.SEQUENTIAL:
            self._sequential_counter += 1
            base = f"image_{self._sequential_counter:06d}"
        elif strategy == NamingStrategy.TIMESTAMP:
            import time
            now_ms = int(time.time() * 1000)
            # Never reuse a value: two saves in the same millisecond would
            # otherwise get the same name and overwrite each other.
            self._last_timestamp_ms = max(now_ms, self._last_timestamp_ms + 1)
            base = f"img_{self._last_timestamp_ms}"
        else:
            # HASH, and any unrecognised strategy value, use the content hash.
            base = self._compute_hash(data)

        filename = f"{base}.{ext}"

        if len(filename) > self.config.max_filename_length:
            # Keep at least one character of the base so a very small
            # limit cannot turn into a negative slice index.
            max_base_len = max(1, self.config.max_filename_length - len(ext) - 1)
            filename = f"{base[:max_base_len]}.{ext}"

        return filename

    def _build_file_path(self, filename: str) -> str:
        """Build full file path from filename."""
        return os.path.join(self.config.directory_path, filename)

    def _build_tag(self, file_path: str) -> str:
        """Build tag from file path.

        Uses the absolute local path when ``config.use_absolute_path`` is
        set, otherwise the backend's URL for the path. Backslashes are
        converted to forward slashes before wrapping in prefix/suffix.
        """
        if self.config.use_absolute_path:
            path_str = str(Path(file_path).absolute())
        else:
            path_str = self._storage_backend.build_url(file_path)

        path_str = path_str.replace("\\", "/")
        return f"{self.config.tag_prefix}{path_str}{self.config.tag_suffix}"

    def save_image(
        self,
        image_data: bytes,
        custom_name: Optional[str] = None,
        processed_images: Optional[Set[str]] = None,
        skip_duplicate: bool = True,
    ) -> Optional[str]:
        """
        Save image data and return tag.

        Args:
            image_data: Image binary data
            custom_name: Custom filename (extension optional)
            processed_images: Set of processed image paths (for external
                duplicate tracking); updated in place on a successful save
            skip_duplicate: If True, content already saved by this processor
                is not saved again and the tag of the earlier file is returned

        Returns:
            Image tag string, or None on failure (empty data, backend save
            reporting failure, or any exception, which is logged)

        Examples:
            >>> processor = ImageProcessor()
            >>> tag = processor.save_image(png_bytes)
            "[Image:temp/images/abc123.png]"
        """
        if not image_data:
            self._logger.warning("Empty image data provided")
            return None

        try:
            image_format = self._detect_format(image_data)
            image_hash = self._compute_hash(image_data)

            # Internal duplicate check: same bytes already stored.
            if skip_duplicate and image_hash in self._processed_hashes:
                existing_path = self._processed_hashes[image_hash]
                self._logger.debug("Duplicate image detected: %s", existing_path)
                return self._build_tag(existing_path)

            filename = self._generate_filename(image_data, image_format, custom_name)
            file_path = self._build_file_path(filename)

            # External duplicate check: caller already knows this path.
            if processed_images is not None and file_path in processed_images:
                self._logger.debug("Image already processed: %s", file_path)
                return self._build_tag(file_path)

            self._ensure_storage_ready()

            if not self._storage_backend.save(image_data, file_path):
                return None

            self._logger.debug("Image saved: %s", file_path)

            # Record the save in both the internal and the caller's tracking.
            self._processed_hashes[image_hash] = file_path
            if processed_images is not None:
                processed_images.add(file_path)

            return self._build_tag(file_path)

        except Exception as e:
            # Deliberate best-effort boundary: callers rely on None, not raise.
            self._logger.error("Failed to save image: %s", e)
            return None

    def process_image(
        self,
        image_data: bytes,
        **kwargs
    ) -> Optional[str]:
        """
        Process and save image data.

        This is the main method for format-specific image processing.
        Subclasses should override this method to provide format-specific
        processing logic before saving.

        Default implementation simply saves the image, honouring only the
        ``custom_name`` keyword.

        Args:
            image_data: Raw image binary data
            **kwargs: Format-specific options (e.g., xref, page_num, sheet_name)

        Returns:
            Image tag string, or None on failure

        Examples:
            >>> processor = ImageProcessor()
            >>> tag = processor.process_image(png_bytes)
            "[Image:temp/images/abc123.png]"

            >>> # Subclass example
            >>> class PDFImageProcessor(ImageProcessor):
            ...     def process_image(self, image_data, **kwargs):
            ...         xref = kwargs.get('xref')
            ...         custom_name = f"pdf_xref_{xref}" if xref else None
            ...         return self.save_image(image_data, custom_name=custom_name)
        """
        custom_name = kwargs.get('custom_name')
        return self.save_image(image_data, custom_name=custom_name)

    def process_embedded_image(
        self,
        image_data: bytes,
        image_name: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process embedded image from document.

        Override in subclasses for format-specific embedded image handling.
        Default implementation just saves the image under *image_name*.

        Args:
            image_data: Image binary data
            image_name: Original image name in document
            **kwargs: Additional options (unused by the default implementation)

        Returns:
            Image tag string, or None on failure
        """
        return self.save_image(image_data, custom_name=image_name)

    def process_chart_image(
        self,
        chart_data: bytes,
        chart_name: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process chart as image.

        Override in subclasses for format-specific chart image handling.
        Default implementation just saves the image under *chart_name*.

        Args:
            chart_data: Chart image binary data
            chart_name: Chart name
            **kwargs: Additional options (unused by the default implementation)

        Returns:
            Image tag string, or None on failure
        """
        return self.save_image(chart_data, custom_name=chart_name)

    def save_image_from_pil(
        self,
        pil_image,
        image_format: Optional[ImageFormat] = None,
        custom_name: Optional[str] = None,
        processed_images: Optional[Set[str]] = None,
        quality: int = 95,
    ) -> Optional[str]:
        """
        Save PIL Image object and return tag.

        Args:
            pil_image: PIL Image object
            image_format: Image format to save (None means PNG; UNKNOWN
                means ``config.default_format``)
            custom_name: Custom filename
            processed_images: Set of processed image paths
            quality: JPEG quality (1-100)

        Returns:
            Image tag string, or None on failure
        """
        try:
            from PIL import Image

            if not isinstance(pil_image, Image.Image):
                self._logger.error("Invalid PIL Image object")
                return None

            fmt = image_format or ImageFormat.PNG
            if fmt == ImageFormat.UNKNOWN:
                fmt = self.config.default_format

            buffer = io.BytesIO()
            save_format = fmt.value.upper()
            if save_format == "JPG":
                # Pillow knows the encoder as "JPEG" only.
                save_format = "JPEG"

            save_kwargs = {}
            if save_format == "JPEG":
                save_kwargs["quality"] = quality
                # JPEG has no alpha/palette support; without this an RGBA
                # or P image would raise inside save() and yield None.
                if pil_image.mode not in self._JPEG_SAFE_MODES:
                    pil_image = pil_image.convert("RGB")
            elif save_format == "PNG":
                save_kwargs["compress_level"] = 6

            pil_image.save(buffer, format=save_format, **save_kwargs)
            image_data = buffer.getvalue()

            return self.save_image(image_data, custom_name, processed_images)

        except Exception as e:
            # Covers a missing Pillow install as well as encoder errors.
            self._logger.error("Failed to save PIL image: %s", e)
            return None

    def get_processed_count(self) -> int:
        """Return number of distinct images tracked by this processor."""
        return len(self._processed_hashes)

    def get_processed_paths(self) -> List[str]:
        """Return all processed image paths."""
        return list(self._processed_hashes.values())

    def clear_cache(self) -> None:
        """Clear internal duplicate tracking cache and reset the sequential counter."""
        self._processed_hashes.clear()
        self._sequential_counter = 0

    def cleanup(self, delete_files: bool = False) -> int:
        """
        Clean up resources.

        The internal tracking cache is always cleared.

        Args:
            delete_files: If True, ask the backend to delete every tracked file

        Returns:
            Number of files the backend reported as deleted
        """
        deleted = 0
        if delete_files:
            for path in self._processed_hashes.values():
                if self._storage_backend.delete(path):
                    deleted += 1
        self.clear_cache()
        return deleted

    def get_pattern_string(self) -> str:
        """
        Get regex pattern string for matching image tags.

        The path is captured in group 1. With an empty suffix the capture
        is a run of non-whitespace; otherwise it is a run of characters
        other than the suffix's first character.

        Returns:
            Regex pattern string
        """
        import re
        prefix = re.escape(self.config.tag_prefix)
        suffix = re.escape(self.config.tag_suffix)

        if not self.config.tag_suffix:
            capture = r'(\S+)'
        else:
            first_char = self.config.tag_suffix[0]
            capture = f'([^{re.escape(first_char)}]+)'

        return f'{prefix}{capture}{suffix}'
556
+
557
+
558
+ # ============================================================================
559
+ # Default Configuration
560
+ # ============================================================================
561
+
562
# Fallback keyword arguments used by create_image_processor() for any
# argument the caller leaves unset.
DEFAULT_IMAGE_CONFIG = dict(
    directory_path="temp/images",
    tag_prefix="[Image:",
    tag_suffix="]",
    naming_strategy=NamingStrategy.HASH,
)
568
+
569
+
570
+ # ============================================================================
571
+ # Factory Function
572
+ # ============================================================================
573
+
574
def create_image_processor(
    directory_path: Optional[str] = None,
    tag_prefix: Optional[str] = None,
    tag_suffix: Optional[str] = None,
    naming_strategy: Optional[Union[NamingStrategy, str]] = None,
    storage_backend: Optional[BaseStorageBackend] = None,
) -> ImageProcessor:
    """
    Create a new ImageProcessor instance.

    Arguments left as None take their value from DEFAULT_IMAGE_CONFIG.
    An explicit empty string for ``tag_prefix`` or ``tag_suffix`` is
    respected (tags without a prefix/suffix are supported by
    ImageProcessor.get_pattern_string), whereas an empty
    ``directory_path`` still falls back to the default directory.

    Args:
        directory_path: Image save directory
        tag_prefix: Tag prefix
        tag_suffix: Tag suffix
        naming_strategy: File naming strategy (enum member or its string value)
        storage_backend: Storage backend instance

    Returns:
        ImageProcessor instance

    Raises:
        ValueError: If ``naming_strategy`` is a string that matches no
            NamingStrategy value.
    """
    if isinstance(naming_strategy, str):
        naming_strategy = NamingStrategy(naming_strategy.lower())

    # Only None means "not provided": `or` would also discard a deliberate
    # empty prefix/suffix.
    if tag_prefix is None:
        tag_prefix = DEFAULT_IMAGE_CONFIG["tag_prefix"]
    if tag_suffix is None:
        tag_suffix = DEFAULT_IMAGE_CONFIG["tag_suffix"]
    if naming_strategy is None:
        naming_strategy = DEFAULT_IMAGE_CONFIG["naming_strategy"]

    return ImageProcessor(
        directory_path=directory_path or DEFAULT_IMAGE_CONFIG["directory_path"],
        tag_prefix=tag_prefix,
        tag_suffix=tag_suffix,
        naming_strategy=naming_strategy,
        storage_backend=storage_backend,
    )
604
+
605
+
606
def save_image_to_file(
    image_data: bytes,
    directory_path: str = "temp",
    tag_prefix: str = "[Image:",
    tag_suffix: str = "]",
    processed_images: Optional[Set[str]] = None,
) -> Optional[str]:
    """
    One-shot helper: save *image_data* and return its tag.

    A throwaway ImageProcessor is built for each call (with whatever
    storage backend ImageProcessor selects by default), so duplicate
    detection across calls only happens through *processed_images*.

    Args:
        image_data: Image binary data
        directory_path: Save directory
        tag_prefix: Tag prefix
        tag_suffix: Tag suffix
        processed_images: Set for duplicate tracking

    Returns:
        Image tag string, or None on failure
    """
    one_shot = ImageProcessor(
        tag_suffix=tag_suffix,
        tag_prefix=tag_prefix,
        directory_path=directory_path,
    )
    tag = one_shot.save_image(image_data, processed_images=processed_images)
    return tag
634
+
635
+
636
+ __all__ = [
637
+ # Main class
638
+ "ImageProcessor",
639
+ # Config
640
+ "ImageProcessorConfig",
641
+ # Enums
642
+ "ImageFormat",
643
+ "NamingStrategy",
644
+ # Factory function
645
+ "create_image_processor",
646
+ "DEFAULT_IMAGE_CONFIG",
647
+ # Convenience function
648
+ "save_image_to_file",
649
+ ]