xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/docx_helper/docx_image.py
|
|
2
|
+
"""
|
|
3
|
+
DOCX image extraction utilities
|
|
4
|
+
|
|
5
|
+
Extracts images from DOCX documents and saves them locally.
|
|
6
|
+
- extract_image_from_drawing: extract an image from a Drawing element
|
|
7
|
+
- process_pict_element: process a legacy VML pict element
|
|
8
|
+
|
|
9
|
+
Note: these functions are wrappers that call the methods of DOCXImageProcessor.
|
|
10
|
+
The actual logic is consolidated in DOCXImageProcessor.
|
|
11
|
+
"""
|
|
12
|
+
import logging
|
|
13
|
+
from typing import Optional, Set, Tuple, TYPE_CHECKING
|
|
14
|
+
|
|
15
|
+
from docx import Document
|
|
16
|
+
|
|
17
|
+
from xgen_doc2chunk.core.processor.docx_helper.docx_constants import ElementType
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from xgen_doc2chunk.core.processor.docx_helper.docx_image_processor import DOCXImageProcessor
|
|
21
|
+
from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
|
|
22
|
+
|
|
23
|
+
logger = logging.getLogger("document-processor")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def extract_image_from_drawing(
    graphic_data,
    doc: Document,
    processed_images: Set[str],
    image_processor: "ImageProcessor"
) -> Tuple[str, Optional[ElementType]]:
    """
    Extract an image from a Drawing ``graphicData`` element.

    When ``image_processor`` exposes ``extract_from_drawing`` (as
    DOCXImageProcessor does), the work is delegated to it. Otherwise a
    fallback resolves the ``a:blip`` relationship on ``doc.part`` and stores
    the blob through ``image_processor.save_image``.

    Args:
        graphic_data: graphicData XML element.
        doc: python-docx Document object.
        processed_images: Set of already processed image paths (deduplication).
        image_processor: ImageProcessor instance (DOCXImageProcessor recommended).

    Returns:
        ``(content, element_type)`` tuple. ``("", None)`` when no image
        reference is found; ``("[Unknown Image]", ElementType.IMAGE)`` when a
        reference exists but the image could not be saved.
    """
    # Preferred path: the processor carries the consolidated extraction logic.
    if hasattr(image_processor, 'extract_from_drawing'):
        content, is_image = image_processor.extract_from_drawing(
            graphic_data, doc, processed_images
        )
        return (content, ElementType.IMAGE) if is_image else ("", None)

    # Fallback: legacy logic for a plain ImageProcessor base instance.
    from docx.oxml.ns import qn
    from xgen_doc2chunk.core.processor.docx_helper.docx_constants import NAMESPACES

    try:
        blip = graphic_data.find('.//a:blip', NAMESPACES)
        if blip is None:
            return "", None

        # An embedded relationship takes precedence over a linked one.
        rId = blip.get(qn('r:embed')) or blip.get(qn('r:link'))
        if not rId:
            return "", None

        try:
            rel = doc.part.rels.get(rId)
            if rel is None:
                return "", None

            if hasattr(rel, 'target_part') and hasattr(rel.target_part, 'blob'):
                image_data = rel.target_part.blob
                image_tag = image_processor.save_image(image_data, processed_images=processed_images)
                if image_tag:
                    return f"\n{image_tag}\n", ElementType.IMAGE

            # Same placeholder as DOCXImageProcessor.extract_from_drawing; the
            # previous literal here was mis-encoded text.
            return "[Unknown Image]", ElementType.IMAGE

        except Exception as e:
            logger.warning(f"Error extracting image from relationship: {e}")
            return "[Unknown Image]", ElementType.IMAGE

    except Exception as e:
        logger.warning(f"Error extracting image from drawing: {e}")
        return "", None
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def process_pict_element(
    pict_elem,
    doc: Document,
    processed_images: Set[str],
    image_processor: "ImageProcessor"
) -> str:
    """
    Process a legacy VML ``pict`` element.

    When ``image_processor`` exposes ``extract_from_pict`` (as
    DOCXImageProcessor does), the work is delegated to it. Otherwise a
    fallback looks up the VML ``imagedata`` relationship on ``doc.part`` and
    stores the blob through ``image_processor.save_image``.

    Args:
        pict_elem: pict XML element.
        doc: python-docx Document object.
        processed_images: Set of already processed image paths (deduplication).
        image_processor: ImageProcessor instance (DOCXImageProcessor recommended).

    Returns:
        The saved image tag wrapped in newlines, ``"[Unknown Image]"`` when the
        image cannot be resolved or saved, or ``""`` on an unexpected error.
    """
    # Preferred path: the processor carries the consolidated extraction logic.
    if hasattr(image_processor, 'extract_from_pict'):
        return image_processor.extract_from_pict(pict_elem, doc, processed_images)

    # Fallback: legacy logic for a plain ImageProcessor base instance.
    try:
        ns_v = 'urn:schemas-microsoft-com:vml'
        ns_r = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'

        imagedata = pict_elem.find('.//{%s}imagedata' % ns_v)
        if imagedata is None:
            return "[Unknown Image]"

        rId = imagedata.get('{%s}id' % ns_r)
        if not rId:
            return "[Unknown Image]"

        try:
            rel = doc.part.rels.get(rId)
            if rel and hasattr(rel, 'target_part') and hasattr(rel.target_part, 'blob'):
                image_data = rel.target_part.blob
                image_tag = image_processor.save_image(image_data, processed_images=processed_images)
                if image_tag:
                    return f"\n{image_tag}\n"
        except Exception as e:
            # Best effort: still fall through to the placeholder, but leave a
            # trace instead of discarding the failure silently.
            logger.warning(f"Error extracting image from relationship: {e}")

        # Same placeholder as DOCXImageProcessor.extract_from_pict; the
        # previous literal here was mis-encoded text.
        return "[Unknown Image]"

    except Exception as e:
        logger.warning(f"Error processing pict element: {e}")
        return ""
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
__all__ = [
|
|
142
|
+
'extract_image_from_drawing',
|
|
143
|
+
'process_pict_element',
|
|
144
|
+
]
|
|
145
|
+
|
|
@@ -0,0 +1,410 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py
|
|
2
|
+
"""
|
|
3
|
+
DOCX Image Processor
|
|
4
|
+
|
|
5
|
+
Provides DOCX-specific image processing that inherits from ImageProcessor.
|
|
6
|
+
Handles embedded images, drawing elements (image/diagram), and relationship-based images.
|
|
7
|
+
|
|
8
|
+
This class consolidates all DOCX image and drawing extraction logic including:
|
|
9
|
+
- Drawing/picture element extraction (blip)
|
|
10
|
+
- Diagram text extraction from drawings
|
|
11
|
+
- Legacy VML pict element processing
|
|
12
|
+
- Relationship-based image loading
|
|
13
|
+
"""
|
|
14
|
+
import logging
|
|
15
|
+
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TYPE_CHECKING
|
|
16
|
+
|
|
17
|
+
from docx.oxml.ns import qn
|
|
18
|
+
|
|
19
|
+
from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
|
|
20
|
+
from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend
|
|
21
|
+
from xgen_doc2chunk.core.processor.docx_helper.docx_constants import ElementType
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from docx import Document
|
|
25
|
+
from docx.opc.part import Part
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger("xgen_doc2chunk.image_processor.docx")
|
|
28
|
+
|
|
29
|
+
# DOCX XML namespaces
|
|
30
|
+
NAMESPACES = {
|
|
31
|
+
'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
|
|
32
|
+
'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
|
|
33
|
+
'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
|
|
34
|
+
'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
|
|
35
|
+
'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class DOCXImageProcessor(ImageProcessor):
    """
    DOCX-specific image processor.

    Inherits from ImageProcessor and provides DOCX-specific processing.

    Handles:
    - Embedded images via relationships
    - Drawing/picture elements
    - Legacy VML pict elements
    - Diagram text extraction

    Example:
        processor = DOCXImageProcessor()

        # Process relationship-based image
        tag = processor.process_image(image_data, rel_id="rId1")

        # Process from part
        tag = processor.process_image_part(image_part)
    """

    def __init__(
        self,
        directory_path: str = "temp/images",
        tag_prefix: str = "[Image:",
        tag_suffix: str = "]",
        storage_backend: Optional[BaseStorageBackend] = None,
    ):
        """
        Initialize DOCXImageProcessor.

        All arguments are passed unchanged to ``ImageProcessor.__init__``.

        Args:
            directory_path: Image save directory
            tag_prefix: Tag prefix for image references
            tag_suffix: Tag suffix for image references
            storage_backend: Storage backend for saving images
        """
        super().__init__(
            directory_path=directory_path,
            tag_prefix=tag_prefix,
            tag_suffix=tag_suffix,
            storage_backend=storage_backend,
        )

    def _save_docx_image(
        self,
        image_data: bytes,
        custom_name: Optional[str],
        options: Dict[str, Any],
    ) -> Optional[str]:
        """Call ``save_image``, forwarding ``processed_images`` from *options* when given."""
        processed_images = options.get("processed_images")
        if processed_images is None:
            return self.save_image(image_data, custom_name=custom_name)
        # Forward the caller's deduplication set; previously it was accepted
        # through **kwargs and then dropped.
        return self.save_image(
            image_data,
            custom_name=custom_name,
            processed_images=processed_images,
        )

    def process_image(
        self,
        image_data: bytes,
        rel_id: Optional[str] = None,
        image_name: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process and save DOCX image data.

        Args:
            image_data: Raw image binary data
            rel_id: Relationship ID; used as ``docx_<rel_id>`` for the saved
                name when ``image_name`` is not given
            image_name: Original image name
            **kwargs: Additional options. ``processed_images`` (a set used for
                deduplication) is forwarded to ``save_image`` when present.

        Returns:
            Whatever ``save_image`` returns (image tag string, or None on failure)
        """
        custom_name = image_name
        if custom_name is None and rel_id is not None:
            custom_name = f"docx_{rel_id}"

        return self._save_docx_image(image_data, custom_name, kwargs)

    def process_image_part(
        self,
        image_part: "Part",
        rel_id: Optional[str] = None,
    ) -> Optional[str]:
        """
        Process image from OOXML part.

        Args:
            image_part: OOXML Part containing image data
            rel_id: Relationship ID

        Returns:
            Image tag string, or None when the part has no data or processing fails
        """
        try:
            image_data = image_part.blob
            if not image_data:
                return None

            # Use the last path segment of the part name as the image name.
            image_name = None
            if hasattr(image_part, 'partname'):
                partname = str(image_part.partname)
                if '/' in partname:
                    image_name = partname.split('/')[-1]

            return self.process_image(
                image_data,
                rel_id=rel_id,
                image_name=image_name
            )

        except Exception as e:
            # NOTE(review): self._logger is presumably provided by
            # ImageProcessor; the other methods use the module logger - confirm.
            self._logger.warning(f"Failed to process image part: {e}")
            return None

    def process_embedded_image(
        self,
        image_data: bytes,
        image_name: Optional[str] = None,
        embed_id: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process embedded DOCX image.

        Args:
            image_data: Image binary data
            image_name: Original image filename
            embed_id: Embed relationship ID; used as ``docx_embed_<embed_id>``
                for the saved name when ``image_name`` is not given
            **kwargs: Additional options. ``processed_images`` is forwarded to
                ``save_image`` when present.

        Returns:
            Whatever ``save_image`` returns (image tag string, or None on failure)
        """
        custom_name = image_name
        if custom_name is None and embed_id is not None:
            custom_name = f"docx_embed_{embed_id}"

        return self._save_docx_image(image_data, custom_name, kwargs)

    def process_drawing_image(
        self,
        image_data: bytes,
        drawing_id: Optional[str] = None,
        description: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process DOCX drawing/picture element image.

        Args:
            image_data: Image binary data
            drawing_id: Drawing element ID; used as ``docx_drawing_<drawing_id>``
                for the saved name
            description: Image description/alt text (currently unused)
            **kwargs: Additional options. ``processed_images`` is forwarded to
                ``save_image`` when present.

        Returns:
            Whatever ``save_image`` returns (image tag string, or None on failure)
        """
        custom_name = None
        if drawing_id is not None:
            custom_name = f"docx_drawing_{drawing_id}"

        return self._save_docx_image(image_data, custom_name, kwargs)

    def extract_from_drawing(
        self,
        graphic_data,
        doc: "Document",
        processed_images: Set[str],
    ) -> Tuple[str, bool]:
        """
        Extract image from Drawing graphic data element.

        This is the core DOCX image extraction logic that was previously
        in docx_image.py extract_image_from_drawing() function.

        Args:
            graphic_data: graphicData XML element
            doc: python-docx Document object
            processed_images: Set of processed image paths (deduplication)

        Returns:
            (image_tag, is_image) tuple. image_tag is the tag string wrapped in
            newlines, "[Unknown Image]" when a reference exists but could not
            be saved, or empty. is_image indicates if an image was found.
        """
        try:
            # Find blip element (image reference)
            blip = graphic_data.find('.//a:blip', NAMESPACES)
            if blip is None:
                return "", False

            # Relationship ID: an embedded image takes precedence over a link.
            rId = blip.get(qn('r:embed')) or blip.get(qn('r:link'))
            if not rId:
                return "", False

            # Find image part from relationship
            try:
                rel = doc.part.rels.get(rId)
                if rel is None:
                    return "", False

                # Extract image data
                if hasattr(rel, 'target_part') and hasattr(rel.target_part, 'blob'):
                    image_data = rel.target_part.blob

                    # Save using process_image with rel_id
                    image_tag = self.process_image(
                        image_data,
                        rel_id=rId,
                        processed_images=processed_images
                    )

                    if image_tag:
                        return f"\n{image_tag}\n", True

                return "[Unknown Image]", True

            except Exception as e:
                logger.warning(f"Error extracting image from relationship: {e}")
                return "[Unknown Image]", True

        except Exception as e:
            logger.warning(f"Error extracting image from drawing: {e}")
            return "", False

    def extract_from_pict(
        self,
        pict_elem,
        doc: "Document",
        processed_images: Set[str],
    ) -> str:
        """
        Extract image from legacy VML pict element.

        This is the core DOCX VML image extraction logic that was previously
        in docx_image.py process_pict_element() function.

        Args:
            pict_elem: pict XML element
            doc: python-docx Document object
            processed_images: Set of processed image paths (deduplication)

        Returns:
            Image tag string wrapped in newlines, "[Unknown Image]" when the
            image cannot be resolved or saved, or "" on an unexpected error
        """
        try:
            # Find VML imagedata
            ns_v = 'urn:schemas-microsoft-com:vml'
            ns_r = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'

            imagedata = pict_elem.find('.//{%s}imagedata' % ns_v)
            if imagedata is None:
                return "[Unknown Image]"

            rId = imagedata.get('{%s}id' % ns_r)
            if not rId:
                return "[Unknown Image]"

            try:
                rel = doc.part.rels.get(rId)
                if rel and hasattr(rel, 'target_part') and hasattr(rel.target_part, 'blob'):
                    image_data = rel.target_part.blob
                    image_tag = self.process_image(
                        image_data,
                        rel_id=rId,
                        processed_images=processed_images
                    )
                    if image_tag:
                        return f"\n{image_tag}\n"
            except Exception as e:
                # Best effort: still fall through to the placeholder, but log
                # the failure as extract_from_drawing does.
                logger.warning(f"Error extracting image from relationship: {e}")

            return "[Unknown Image]"

        except Exception as e:
            logger.warning(f"Error processing pict element: {e}")
            return ""

    def process_drawing_element(
        self,
        drawing_elem,
        doc: "Document",
        processed_images: Set[str],
        chart_callback: Optional[Callable[[], str]] = None,
    ) -> Tuple[str, Optional[ElementType]]:
        """
        Process Drawing element (image, chart, diagram).

        Main entry point for handling all drawing elements in DOCX.
        Branches on the ``uri`` attribute of the ``a:graphicData`` element.

        Args:
            drawing_elem: drawing XML element
            doc: python-docx Document object
            processed_images: Set of processed image paths (deduplication)
            chart_callback: Callback function to get next chart content

        Returns:
            (content, element_type) tuple; ("", None) when the element holds
            no recognized graphic or an error occurs
        """
        try:
            # Check inline or anchor
            inline = drawing_elem.find('.//wp:inline', NAMESPACES)
            anchor = drawing_elem.find('.//wp:anchor', NAMESPACES)

            container = inline if inline is not None else anchor
            if container is None:
                return "", None

            # Check graphic data
            graphic = container.find('.//a:graphic', NAMESPACES)
            if graphic is None:
                return "", None

            graphic_data = graphic.find('a:graphicData', NAMESPACES)
            if graphic_data is None:
                return "", None

            # Lower-case once; every branch below matches case-insensitively.
            uri = graphic_data.get('uri', '').lower()

            # Image case
            if 'picture' in uri:
                content, is_image = self.extract_from_drawing(
                    graphic_data, doc, processed_images
                )
                return (content, ElementType.IMAGE) if is_image else ("", None)

            # Chart case - delegate to callback
            if 'chart' in uri:
                if chart_callback:
                    chart_content = chart_callback()
                    return chart_content, ElementType.CHART
                return "", ElementType.CHART

            # Diagram case
            if 'diagram' in uri:
                return self.extract_diagram(graphic_data)

            return "", None

        except Exception as e:
            logger.warning(f"Error processing drawing element: {e}")
            return "", None

    def extract_diagram(
        self,
        graphic_data,
    ) -> Tuple[str, Optional[ElementType]]:
        """
        Extract diagram information from Drawing.

        Collects the text of every DrawingML ``a:t`` descendant and joins the
        non-empty pieces with " / ".

        Args:
            graphic_data: graphicData XML element

        Returns:
            (content, element_type) tuple; content is "[Diagram: ...]" when
            text was found and "[Diagram]" otherwise
        """
        try:
            ns_a = 'http://schemas.openxmlformats.org/drawingml/2006/main'
            texts = []
            for t_elem in graphic_data.findall('.//{%s}t' % ns_a):
                # Skip nodes that are empty or whitespace-only so the joined
                # label does not contain blank segments.
                stripped = t_elem.text.strip() if t_elem.text else ""
                if stripped:
                    texts.append(stripped)

            if texts:
                return f"[Diagram: {' / '.join(texts)}]", ElementType.DIAGRAM

            return "[Diagram]", ElementType.DIAGRAM

        except Exception as e:
            logger.warning(f"Error extracting diagram: {e}")
            return "[Diagram]", ElementType.DIAGRAM
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
__all__ = ["DOCXImageProcessor"]
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py
|
|
2
|
+
"""
|
|
3
|
+
DOCX Metadata Extraction Module
|
|
4
|
+
|
|
5
|
+
Provides DOCXMetadataExtractor class for extracting metadata from DOCX documents
|
|
6
|
+
using python-docx core_properties. Implements BaseMetadataExtractor interface.
|
|
7
|
+
"""
|
|
8
|
+
import logging
|
|
9
|
+
from typing import Any, Optional
|
|
10
|
+
|
|
11
|
+
from docx import Document
|
|
12
|
+
|
|
13
|
+
from xgen_doc2chunk.core.functions.metadata_extractor import (
|
|
14
|
+
BaseMetadataExtractor,
|
|
15
|
+
DocumentMetadata,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger("document-processor")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class DOCXMetadataExtractor(BaseMetadataExtractor):
    """
    DOCX Metadata Extractor.

    Extracts metadata from python-docx Document objects.

    Supported fields:
    - title, subject, author, keywords, comments
    - last_saved_by, create_time, last_saved_time

    Usage:
        extractor = DOCXMetadataExtractor()
        metadata = extractor.extract(docx_document)
        text = extractor.format(metadata)
    """

    def extract(self, source: Document) -> DocumentMetadata:
        """
        Extract metadata from DOCX document.

        Reads ``source.core_properties``; text fields are stripped and the
        ``created`` / ``modified`` values are passed through unchanged.

        Args:
            source: python-docx Document object

        Returns:
            DocumentMetadata instance containing extracted metadata, or an
            empty DocumentMetadata if reading the properties fails.
        """
        try:
            props = source.core_properties

            return DocumentMetadata(
                title=self._get_stripped(props.title),
                subject=self._get_stripped(props.subject),
                author=self._get_stripped(props.author),
                keywords=self._get_stripped(props.keywords),
                comments=self._get_stripped(props.comments),
                last_saved_by=self._get_stripped(props.last_modified_by),
                create_time=props.created,
                last_saved_time=props.modified,
            )
        except Exception as e:
            self.logger.warning(f"Failed to extract DOCX metadata: {e}")
            return DocumentMetadata()

    def _get_stripped(self, value: Optional[str]) -> Optional[str]:
        """Return stripped string value, or None if empty or whitespace-only."""
        if not value:
            return None
        # A whitespace-only value strips down to "", which is also reported
        # as None so callers never receive an empty string.
        return value.strip() or None
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
__all__ = [
|
|
70
|
+
'DOCXMetadataExtractor',
|
|
71
|
+
]
|