xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,649 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/functions/img_processor.py
|
|
2
|
+
"""
|
|
3
|
+
Image Processing Module
|
|
4
|
+
|
|
5
|
+
Provides functionality to save image data to various storage backends
|
|
6
|
+
and convert to tag format. Uses Strategy pattern for storage backends.
|
|
7
|
+
|
|
8
|
+
This is the BASE class for all image processors.
|
|
9
|
+
Format-specific processors (PDFImageProcessor, DOCXImageProcessor, etc.)
|
|
10
|
+
should inherit from ImageProcessor and override process_image() method.
|
|
11
|
+
|
|
12
|
+
Main Features:
|
|
13
|
+
- Base ImageProcessor class with pluggable storage backend
|
|
14
|
+
- Save image data to specified storage (Local, MinIO, S3, etc.)
|
|
15
|
+
- Return saved path in custom tag format
|
|
16
|
+
- Duplicate image detection and handling
|
|
17
|
+
- Support for various image formats
|
|
18
|
+
- Extensible for format-specific processing
|
|
19
|
+
|
|
20
|
+
Usage Example:
|
|
21
|
+
from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
|
|
22
|
+
from xgen_doc2chunk.core.functions.storage_backend import (
|
|
23
|
+
LocalStorageBackend,
|
|
24
|
+
MinIOStorageBackend,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
# Use with default settings (local storage)
|
|
28
|
+
processor = ImageProcessor()
|
|
29
|
+
tag = processor.save_image(image_bytes)
|
|
30
|
+
# Result: "[Image:temp/images/abc123.png]"
|
|
31
|
+
|
|
32
|
+
# Use with MinIO storage (when implemented)
|
|
33
|
+
minio_backend = MinIOStorageBackend(endpoint="localhost:9000", bucket="images")
|
|
34
|
+
processor = ImageProcessor(storage_backend=minio_backend)
|
|
35
|
+
|
|
36
|
+
# Custom tag format
|
|
37
|
+
processor = ImageProcessor(
|
|
38
|
+
directory_path="output/images",
|
|
39
|
+
tag_prefix="<img src='",
|
|
40
|
+
tag_suffix="'>"
|
|
41
|
+
)
|
|
42
|
+
tag = processor.save_image(image_bytes)
|
|
43
|
+
# Result: "<img src='output/images/abc123.png'>"
|
|
44
|
+
|
|
45
|
+
# Inherit for format-specific processing
|
|
46
|
+
class PDFImageProcessor(ImageProcessor):
|
|
47
|
+
def process_image(self, image_data: bytes, **kwargs) -> Optional[str]:
|
|
48
|
+
xref = kwargs.get('xref')
|
|
49
|
+
custom_name = f"pdf_xref_{xref}" if xref else None
|
|
50
|
+
return self.save_image(image_data, custom_name=custom_name)
|
|
51
|
+
"""
|
|
52
|
+
import hashlib
|
|
53
|
+
import io
|
|
54
|
+
import logging
|
|
55
|
+
import os
|
|
56
|
+
import uuid
|
|
57
|
+
from dataclasses import dataclass, field
|
|
58
|
+
from enum import Enum
|
|
59
|
+
from pathlib import Path
|
|
60
|
+
from typing import Any, Dict, List, Optional, Set, Union
|
|
61
|
+
|
|
62
|
+
from xgen_doc2chunk.core.functions.storage_backend import (
|
|
63
|
+
BaseStorageBackend,
|
|
64
|
+
LocalStorageBackend,
|
|
65
|
+
StorageType,
|
|
66
|
+
get_default_backend,
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
# Module-level logger for this module, registered under the name below.
logger = logging.getLogger("xgen_doc2chunk.image_processor")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class ImageFormat(Enum):
    """
    Image formats known to the image processor.

    Each member's value is the lowercase file extension (without the dot)
    used when a filename is built for that format. ``UNKNOWN`` marks data
    whose format could not be determined.
    """

    # Formats recognised from magic bytes by ImageProcessor._detect_format
    PNG = "png"
    JPEG = "jpeg"
    JPG = "jpg"  # distinct member from JPEG (different extension spelling)
    GIF = "gif"
    BMP = "bmp"
    WEBP = "webp"
    TIFF = "tiff"

    # Fallback when no signature matches
    UNKNOWN = "unknown"
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class NamingStrategy(Enum):
    """
    Strategies for choosing the base name of a saved image file.

    The member values are the lowercase strings accepted wherever a
    strategy may be given as text (they are passed to ``NamingStrategy(...)``
    after lower-casing).
    """

    # Name derived from a hash of the image content (prevents duplicates)
    HASH = "hash"
    # Name derived from a freshly generated UUID
    UUID = "uuid"
    # Name derived from a running counter
    SEQUENTIAL = "sequential"
    # Name derived from the current time
    TIMESTAMP = "timestamp"
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
@dataclass
class ImageProcessorConfig:
    """
    Settings bundle consumed by ImageProcessor.

    Attributes:
        directory_path: Directory path or bucket prefix for saving images
        tag_prefix: Text placed before the path in a tag (e.g., "[Image:")
        tag_suffix: Text placed after the path in a tag (e.g., "]")
        naming_strategy: How generated filenames are derived
        default_format: Format used when the image format is unknown
        create_directory: Prepare the storage location automatically
        use_absolute_path: Put the absolute path in tags
        hash_algorithm: hashlib algorithm name used for content hashes
        max_filename_length: Upper bound for generated filename length
    """

    # Where images are written
    directory_path: str = "temp/images"

    # Tag wrapping: "<tag_prefix><path><tag_suffix>"
    tag_prefix: str = "[Image:"
    tag_suffix: str = "]"

    # Filename generation
    naming_strategy: NamingStrategy = NamingStrategy.HASH
    default_format: ImageFormat = ImageFormat.PNG

    # Storage / path behaviour
    create_directory: bool = True
    use_absolute_path: bool = False

    # Hashing and filename limits
    hash_algorithm: str = "sha256"
    max_filename_length: int = 64
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class ImageProcessor:
    """
    Base Image Processing Class.

    Saves image data using a pluggable storage backend and returns
    the saved path in the specified tag format.

    This is the BASE CLASS for all format-specific image processors.
    Subclasses should override process_image() for format-specific handling.

    Args:
        directory_path: Image save directory (default: "temp/images")
        tag_prefix: Tag prefix (default: "[Image:")
        tag_suffix: Tag suffix (default: "]")
        naming_strategy: File naming strategy (default: HASH)
        storage_backend: Storage backend instance (default: the backend
            returned by get_default_backend())
        config: ImageProcessorConfig object (takes precedence over the
            individual directory/tag/naming arguments)

    Examples:
        >>> # Default usage (local storage)
        >>> processor = ImageProcessor()
        >>> tag = processor.save_image(image_bytes)
        "[Image:temp/images/a1b2c3d4.png]"

        >>> # Custom directory and tags
        >>> processor = ImageProcessor(
        ...     directory_path="images",
        ...     tag_prefix="<img src='",
        ...     tag_suffix="'>"
        ... )
        >>> tag = processor.save_image(image_bytes)
        "<img src='images/abc123.png'>"

        >>> # Subclass for format-specific processing
        >>> class PDFImageProcessor(ImageProcessor):
        ...     def process_image(self, image_data, **kwargs):
        ...         xref = kwargs.get('xref')
        ...         return self.save_image(image_data, custom_name=f"pdf_{xref}")
    """

    # Pillow image modes the JPEG writer accepts as-is; any other mode
    # (e.g. RGBA, LA, P) is converted to RGB before encoding.
    _JPEG_WRITABLE_MODES = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    def __init__(
        self,
        directory_path: str = "temp/images",
        tag_prefix: str = "[Image:",
        tag_suffix: str = "]",
        naming_strategy: Union[NamingStrategy, str] = NamingStrategy.HASH,
        storage_backend: Optional[BaseStorageBackend] = None,
        config: Optional[ImageProcessorConfig] = None,
    ):
        # An explicit config wins; otherwise build one from the arguments.
        if config:
            self.config = config
        else:
            if isinstance(naming_strategy, str):
                naming_strategy = NamingStrategy(naming_strategy.lower())

            self.config = ImageProcessorConfig(
                directory_path=directory_path,
                tag_prefix=tag_prefix,
                tag_suffix=tag_suffix,
                naming_strategy=naming_strategy,
            )

        # Compare against None (not truthiness) so a backend object that
        # happens to be falsy is not silently replaced by the default.
        if storage_backend is not None:
            self._storage_backend = storage_backend
        else:
            self._storage_backend = get_default_backend()

        # Maps content hash -> saved file path (duplicate prevention)
        self._processed_hashes: Dict[str, str] = {}

        # Counter for the SEQUENTIAL naming strategy
        self._sequential_counter: int = 0

        # Last millisecond stamp handed out by the TIMESTAMP strategy
        self._last_timestamp_ms: int = 0

        # Logger
        self._logger = logging.getLogger("xgen_doc2chunk.image_processor.ImageProcessor")

        # Prepare the storage location up front when requested
        if self.config.create_directory:
            self._ensure_storage_ready()

    @property
    def storage_backend(self) -> BaseStorageBackend:
        """Get the current storage backend."""
        return self._storage_backend

    @storage_backend.setter
    def storage_backend(self, backend: BaseStorageBackend) -> None:
        """
        Set storage backend.

        The new backend is asked to get ready for the configured directory
        when ``config.create_directory`` is enabled.

        Args:
            backend: New storage backend instance
        """
        self._storage_backend = backend
        if self.config.create_directory:
            self._ensure_storage_ready()

    @property
    def storage_type(self) -> StorageType:
        """Get the current storage type (as reported by the backend)."""
        return self._storage_backend.storage_type

    def _ensure_storage_ready(self) -> None:
        """Ask the backend to make the configured directory ready."""
        self._storage_backend.ensure_ready(self.config.directory_path)

    def _compute_hash(self, data: bytes) -> str:
        """Return the first 32 hex characters of the configured hash of data."""
        hasher = hashlib.new(self.config.hash_algorithm)
        hasher.update(data)
        return hasher.hexdigest()[:32]

    def _detect_format(self, data: bytes) -> ImageFormat:
        """
        Detect format from image data using magic bytes.

        Data shorter than 12 bytes is reported as UNKNOWN because the WEBP
        check needs bytes 8..11.
        """
        if len(data) < 12:
            return ImageFormat.UNKNOWN

        if data[:8] == b'\x89PNG\r\n\x1a\n':
            return ImageFormat.PNG
        if data[:2] == b'\xff\xd8':
            return ImageFormat.JPEG
        if data[:6] in (b'GIF87a', b'GIF89a'):
            return ImageFormat.GIF
        if data[:2] == b'BM':
            return ImageFormat.BMP
        if data[:4] == b'RIFF' and data[8:12] == b'WEBP':
            return ImageFormat.WEBP
        if data[:4] in (b'II*\x00', b'MM\x00*'):
            return ImageFormat.TIFF
        return ImageFormat.UNKNOWN

    def _resolve_extension(self, image_format: ImageFormat) -> str:
        """Return the extension for image_format, or the default for UNKNOWN."""
        if image_format != ImageFormat.UNKNOWN:
            return image_format.value
        return self.config.default_format.value

    @staticmethod
    def _sanitize_custom_name(custom_name: str) -> str:
        """
        Keep a caller-supplied name from escaping the save directory.

        Names without a drive, a leading separator or a ".." component are
        returned unchanged. Anything else is reduced to its last real path
        component; an empty string is returned when nothing usable remains.
        """
        drive, tail = os.path.splitdrive(custom_name)
        normalized = tail.replace("\\", "/")
        segments = normalized.split("/")

        if not drive and not normalized.startswith("/") and ".." not in segments:
            return custom_name

        usable = [seg for seg in segments if seg not in ("", ".", "..")]
        return usable[-1] if usable else ""

    def _generate_filename(
        self,
        data: bytes,
        image_format: ImageFormat,
        custom_name: Optional[str] = None
    ) -> str:
        """
        Generate filename based on custom name or naming strategy.

        Args:
            data: Image binary data (hashed for the HASH strategy)
            image_format: Detected format; UNKNOWN falls back to
                ``config.default_format`` for the extension
            custom_name: Optional caller-supplied name. An extension is
                appended unless the name already ends with a known one.

        Returns:
            Filename including extension
        """
        ext = self._resolve_extension(image_format)

        if custom_name:
            # custom_name may originate from document content, so strip
            # anything that would place the file outside directory_path.
            safe_name = self._sanitize_custom_name(custom_name)
            if safe_name:
                known_suffixes = tuple(
                    f".{fmt.value}" for fmt in ImageFormat
                    if fmt != ImageFormat.UNKNOWN
                )
                if safe_name.lower().endswith(known_suffixes):
                    return safe_name
                return f"{safe_name}.{ext}"
            # Nothing usable was left: fall through to a generated name.

        strategy = self.config.naming_strategy

        if strategy == NamingStrategy.UUID:
            base = str(uuid.uuid4())[:16]
        elif strategy == NamingStrategy.SEQUENTIAL:
            self._sequential_counter += 1
            base = f"image_{self._sequential_counter:06d}"
        elif strategy == NamingStrategy.TIMESTAMP:
            import time
            stamp = int(time.time() * 1000)
            # Two saves within the same millisecond would otherwise get the
            # same name and overwrite each other; keep stamps increasing.
            previous = getattr(self, "_last_timestamp_ms", 0)
            if stamp <= previous:
                stamp = previous + 1
            self._last_timestamp_ms = stamp
            base = f"img_{stamp}"
        else:
            # HASH, and any unrecognised strategy, use the content hash.
            base = self._compute_hash(data)

        filename = f"{base}.{ext}"

        if len(filename) > self.config.max_filename_length:
            # Keep at least one character of the base even for tiny limits.
            max_base_len = max(1, self.config.max_filename_length - len(ext) - 1)
            filename = f"{base[:max_base_len]}.{ext}"

        return filename

    def _build_file_path(self, filename: str) -> str:
        """Build full file path from filename."""
        return os.path.join(self.config.directory_path, filename)

    def _build_tag(self, file_path: str) -> str:
        """
        Build tag from file path.

        Uses the absolute filesystem path when ``config.use_absolute_path``
        is set, otherwise whatever the backend's build_url() returns.
        Backslashes are replaced by forward slashes in the tag.
        """
        if self.config.use_absolute_path:
            path_str = str(Path(file_path).absolute())
        else:
            path_str = self._storage_backend.build_url(file_path)

        path_str = path_str.replace("\\", "/")
        return f"{self.config.tag_prefix}{path_str}{self.config.tag_suffix}"

    def save_image(
        self,
        image_data: bytes,
        custom_name: Optional[str] = None,
        processed_images: Optional[Set[str]] = None,
        skip_duplicate: bool = True,
    ) -> Optional[str]:
        """
        Save image data and return tag.

        Args:
            image_data: Image binary data
            custom_name: Custom filename (extension optional)
            processed_images: Set of processed image paths (for external
                duplicate tracking); the saved path is added to it
            skip_duplicate: If True, content already saved by this
                processor is not saved again and the existing tag is returned

        Returns:
            Image tag string, or None on failure (empty data, backend
            save failure, or any exception raised while saving)

        Examples:
            >>> processor = ImageProcessor()
            >>> tag = processor.save_image(png_bytes)
            "[Image:temp/images/abc123.png]"
        """
        if not image_data:
            self._logger.warning("Empty image data provided")
            return None

        try:
            image_format = self._detect_format(image_data)
            image_hash = self._compute_hash(image_data)

            # Internal duplicate tracking (by content hash)
            if skip_duplicate and image_hash in self._processed_hashes:
                existing_path = self._processed_hashes[image_hash]
                self._logger.debug("Duplicate image detected: %s", existing_path)
                return self._build_tag(existing_path)

            filename = self._generate_filename(image_data, image_format, custom_name)
            file_path = self._build_file_path(filename)

            # External duplicate tracking (by path)
            if processed_images is not None and file_path in processed_images:
                self._logger.debug("Image already processed: %s", file_path)
                return self._build_tag(file_path)

            self._ensure_storage_ready()

            if not self._storage_backend.save(image_data, file_path):
                return None

            self._logger.debug("Image saved: %s", file_path)

            self._processed_hashes[image_hash] = file_path
            if processed_images is not None:
                processed_images.add(file_path)

            return self._build_tag(file_path)

        except Exception as e:
            # Best-effort API: report the failure (with traceback) and
            # signal it through the None return value.
            self._logger.error("Failed to save image: %s", e, exc_info=True)
            return None

    def process_image(
        self,
        image_data: bytes,
        **kwargs
    ) -> Optional[str]:
        """
        Process and save image data.

        This is the main method for format-specific image processing.
        Subclasses should override this method to provide format-specific
        processing logic before saving.

        Default implementation simply saves the image, honouring only the
        ``custom_name`` keyword.

        Args:
            image_data: Raw image binary data
            **kwargs: Format-specific options (e.g., xref, page_num, sheet_name)

        Returns:
            Image tag string, or None on failure

        Examples:
            >>> processor = ImageProcessor()
            >>> tag = processor.process_image(png_bytes)
            "[Image:temp/images/abc123.png]"

            >>> # Subclass example
            >>> class PDFImageProcessor(ImageProcessor):
            ...     def process_image(self, image_data, **kwargs):
            ...         xref = kwargs.get('xref')
            ...         custom_name = f"pdf_xref_{xref}" if xref else None
            ...         return self.save_image(image_data, custom_name=custom_name)
        """
        custom_name = kwargs.get('custom_name')
        return self.save_image(image_data, custom_name=custom_name)

    def process_embedded_image(
        self,
        image_data: bytes,
        image_name: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process embedded image from document.

        Override in subclasses for format-specific embedded image handling.
        Default implementation just saves the image under image_name.

        Args:
            image_data: Image binary data
            image_name: Original image name in document
            **kwargs: Additional options (unused by the default implementation)

        Returns:
            Image tag string, or None on failure
        """
        return self.save_image(image_data, custom_name=image_name)

    def process_chart_image(
        self,
        chart_data: bytes,
        chart_name: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process chart as image.

        Override in subclasses for format-specific chart image handling.
        Default implementation just saves the image under chart_name.

        Args:
            chart_data: Chart image binary data
            chart_name: Chart name
            **kwargs: Additional options (unused by the default implementation)

        Returns:
            Image tag string, or None on failure
        """
        return self.save_image(chart_data, custom_name=chart_name)

    def save_image_from_pil(
        self,
        pil_image,
        image_format: Optional[ImageFormat] = None,
        custom_name: Optional[str] = None,
        processed_images: Optional[Set[str]] = None,
        quality: int = 95,
    ) -> Optional[str]:
        """
        Save PIL Image object and return tag.

        The image is encoded in memory and then handed to save_image().
        When encoding as JPEG, images in a mode the JPEG writer cannot
        handle (for example RGBA or P) are converted to RGB first.

        Args:
            pil_image: PIL Image object
            image_format: Image format to save (None -> PNG,
                UNKNOWN -> ``config.default_format``)
            custom_name: Custom filename
            processed_images: Set of processed image paths
            quality: JPEG quality (1-100)

        Returns:
            Image tag string, or None on failure
        """
        try:
            from PIL import Image

            if not isinstance(pil_image, Image.Image):
                self._logger.error("Invalid PIL Image object")
                return None

            fmt = image_format or ImageFormat.PNG
            if fmt == ImageFormat.UNKNOWN:
                fmt = self.config.default_format

            # Pillow knows the format as "JPEG", not "JPG".
            save_format = fmt.value.upper()
            if save_format == "JPG":
                save_format = "JPEG"

            save_kwargs = {}
            if save_format == "JPEG":
                save_kwargs["quality"] = quality
                if pil_image.mode not in self._JPEG_WRITABLE_MODES:
                    pil_image = pil_image.convert("RGB")
            elif save_format == "PNG":
                save_kwargs["compress_level"] = 6

            buffer = io.BytesIO()
            pil_image.save(buffer, format=save_format, **save_kwargs)
            image_data = buffer.getvalue()

            return self.save_image(image_data, custom_name, processed_images)

        except Exception as e:
            # Covers a missing Pillow install as well as encoding errors.
            self._logger.error("Failed to save PIL image: %s", e, exc_info=True)
            return None

    def get_processed_count(self) -> int:
        """Return number of processed images."""
        return len(self._processed_hashes)

    def get_processed_paths(self) -> List[str]:
        """Return all processed image paths."""
        return list(self._processed_hashes.values())

    def clear_cache(self) -> None:
        """Clear internal duplicate tracking cache and reset the sequential counter."""
        self._processed_hashes.clear()
        self._sequential_counter = 0

    def cleanup(self, delete_files: bool = False) -> int:
        """
        Clean up resources.

        Always clears the duplicate tracking cache.

        Args:
            delete_files: If True, ask the backend to delete every saved file

        Returns:
            Number of files the backend reported as deleted
        """
        deleted = 0
        if delete_files:
            for path in self._processed_hashes.values():
                if self._storage_backend.delete(path):
                    deleted += 1
        self.clear_cache()
        return deleted

    def get_pattern_string(self) -> str:
        """
        Get regex pattern string for matching image tags.

        The pattern has one capture group for the path. With an empty
        tag suffix the path is any run of non-whitespace; otherwise it is
        any run of characters other than the suffix's first character.

        Returns:
            Regex pattern string
        """
        import re
        prefix = re.escape(self.config.tag_prefix)
        suffix = re.escape(self.config.tag_suffix)

        if not self.config.tag_suffix:
            capture = r'(\S+)'
        else:
            first_char = self.config.tag_suffix[0]
            capture = f'([^{re.escape(first_char)}]+)'

        return f'{prefix}{capture}{suffix}'
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
# ============================================================================
|
|
559
|
+
# Default Configuration
|
|
560
|
+
# ============================================================================
|
|
561
|
+
|
|
562
|
+
# Fallback values used by create_image_processor() for any argument the
# caller leaves unset.
DEFAULT_IMAGE_CONFIG = {
    "directory_path": "temp/images",
    "tag_prefix": "[Image:",
    "tag_suffix": "]",
    "naming_strategy": NamingStrategy.HASH,
}
|
|
568
|
+
|
|
569
|
+
|
|
570
|
+
# ============================================================================
|
|
571
|
+
# Factory Function
|
|
572
|
+
# ============================================================================
|
|
573
|
+
|
|
574
|
+
def create_image_processor(
    directory_path: Optional[str] = None,
    tag_prefix: Optional[str] = None,
    tag_suffix: Optional[str] = None,
    naming_strategy: Optional[Union[NamingStrategy, str]] = None,
    storage_backend: Optional[BaseStorageBackend] = None,
) -> ImageProcessor:
    """
    Create a new ImageProcessor instance.

    Arguments left as None fall back to DEFAULT_IMAGE_CONFIG. An empty
    string is a valid tag prefix/suffix and is passed through unchanged;
    an empty directory_path still falls back to the default directory.

    Args:
        directory_path: Image save directory
        tag_prefix: Tag prefix
        tag_suffix: Tag suffix
        naming_strategy: File naming strategy (enum member or its name
            as a case-insensitive string)
        storage_backend: Storage backend instance

    Returns:
        ImageProcessor instance

    Raises:
        ValueError: If naming_strategy is a string that does not name a
            NamingStrategy member
    """
    if isinstance(naming_strategy, str):
        naming_strategy = NamingStrategy(naming_strategy.lower())

    # Only None means "not given" for the tag parts: "" is a legitimate
    # value (e.g. tags without a closing suffix) and must not be replaced.
    if tag_prefix is None:
        tag_prefix = DEFAULT_IMAGE_CONFIG["tag_prefix"]
    if tag_suffix is None:
        tag_suffix = DEFAULT_IMAGE_CONFIG["tag_suffix"]
    if naming_strategy is None:
        naming_strategy = DEFAULT_IMAGE_CONFIG["naming_strategy"]

    return ImageProcessor(
        directory_path=directory_path or DEFAULT_IMAGE_CONFIG["directory_path"],
        tag_prefix=tag_prefix,
        tag_suffix=tag_suffix,
        naming_strategy=naming_strategy,
        storage_backend=storage_backend,
    )
|
|
604
|
+
|
|
605
|
+
|
|
606
|
+
def save_image_to_file(
    image_data: bytes,
    directory_path: str = "temp",
    tag_prefix: str = "[Image:",
    tag_suffix: str = "]",
    processed_images: Optional[Set[str]] = None,
) -> Optional[str]:
    """
    One-shot helper: save image_data and return its tag.

    A throwaway ImageProcessor is built from the directory and tag
    arguments (with its default storage backend) and used for a single
    save_image() call.

    Args:
        image_data: Image binary data
        directory_path: Save directory
        tag_prefix: Tag prefix
        tag_suffix: Tag suffix
        processed_images: Set for duplicate tracking

    Returns:
        Image tag string, or None on failure
    """
    processor_options = {
        "directory_path": directory_path,
        "tag_prefix": tag_prefix,
        "tag_suffix": tag_suffix,
    }
    one_shot = ImageProcessor(**processor_options)
    tag = one_shot.save_image(image_data, processed_images=processed_images)
    return tag
|
|
634
|
+
|
|
635
|
+
|
|
636
|
+
# Names exported by `from ... import *`.
__all__ = [
    # Main class
    "ImageProcessor",
    # Config
    "ImageProcessorConfig",
    # Enums
    "ImageFormat",
    "NamingStrategy",
    # Factory function
    "create_image_processor",
    "DEFAULT_IMAGE_CONFIG",
    # Convenience function
    "save_image_to_file",
]
|