xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,145 @@
1
+ # xgen_doc2chunk/core/processor/docx_helper/docx_image.py
2
+ """
3
+ DOCX image extraction utilities
4
+
5
+ Extracts images from DOCX documents and saves them locally.
6
+ - extract_image_from_drawing: extract an image from a Drawing element
7
+ - process_pict_element: process a legacy VML pict element
8
+
9
+ Note: these functions are wrappers that call DOCXImageProcessor methods.
10
+ The actual logic is consolidated in DOCXImageProcessor.
11
+ """
12
+ import logging
13
+ from typing import Optional, Set, Tuple, TYPE_CHECKING
14
+
15
+ from docx import Document
16
+
17
+ from xgen_doc2chunk.core.processor.docx_helper.docx_constants import ElementType
18
+
19
+ if TYPE_CHECKING:
20
+ from xgen_doc2chunk.core.processor.docx_helper.docx_image_processor import DOCXImageProcessor
21
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
22
+
23
+ logger = logging.getLogger("document-processor")
24
+
25
+
26
def extract_image_from_drawing(
    graphic_data,
    doc: Document,
    processed_images: Set[str],
    image_processor: "ImageProcessor"
) -> Tuple[str, Optional[ElementType]]:
    """
    Extract the image referenced by a Drawing's graphicData element.

    When ``image_processor`` exposes ``extract_from_drawing`` the call is
    delegated to it. Otherwise the legacy lookup is performed here: the
    ``a:blip`` relationship id is resolved through ``doc.part.rels`` and the
    blob is handed to ``image_processor.save_image``.

    Args:
        graphic_data: graphicData XML element
        doc: python-docx Document object
        processed_images: Set of already processed images (deduplication)
        image_processor: ImageProcessor instance (DOCXImageProcessor recommended)

    Returns:
        (content, element_type) tuple. ``("", None)`` when no image reference
        is found; a placeholder string with ``ElementType.IMAGE`` when an
        image is referenced but could not be saved.
    """
    # Consolidated path: the processor implements the extraction itself
    if hasattr(image_processor, 'extract_from_drawing'):
        content, is_image = image_processor.extract_from_drawing(
            graphic_data, doc, processed_images
        )
        if is_image:
            return content, ElementType.IMAGE
        return "", None

    # Fallback path for a plain ImageProcessor; imports are kept local
    from docx.oxml.ns import qn
    from xgen_doc2chunk.core.processor.docx_helper.docx_constants import NAMESPACES

    try:
        blip = graphic_data.find('.//a:blip', NAMESPACES)
        if blip is None:
            return "", None

        # An embedded image wins over a linked one
        rel_id = blip.get(qn('r:embed')) or blip.get(qn('r:link'))
        if not rel_id:
            return "", None

        try:
            relationship = doc.part.rels.get(rel_id)
            if relationship is None:
                return "", None

            has_blob = (
                hasattr(relationship, 'target_part')
                and hasattr(relationship.target_part, 'blob')
            )
            if has_blob:
                saved_tag = image_processor.save_image(
                    relationship.target_part.blob,
                    processed_images=processed_images,
                )
                if saved_tag:
                    return f"\n{saved_tag}\n", ElementType.IMAGE

            # Image is referenced but no tag could be produced
            return "[?��?지]", ElementType.IMAGE

        except Exception as rel_error:
            logger.warning(f"Error extracting image from relationship: {rel_error}")
            return "[?��?지]", ElementType.IMAGE

    except Exception as drawing_error:
        logger.warning(f"Error extracting image from drawing: {drawing_error}")
        return "", None
87
+
88
+
89
def process_pict_element(
    pict_elem,
    doc: Document,
    processed_images: Set[str],
    image_processor: "ImageProcessor"
) -> str:
    """
    Process a legacy VML ``pict`` element and return its image markup.

    When ``image_processor`` exposes ``extract_from_pict`` the call is
    delegated to it. Otherwise the VML ``imagedata`` relationship id is
    resolved through ``doc.part.rels`` and the blob is saved with
    ``image_processor.save_image``.

    Args:
        pict_elem: pict XML element
        doc: python-docx Document object
        processed_images: Set of already processed images (deduplication)
        image_processor: ImageProcessor instance (DOCXImageProcessor recommended)

    Returns:
        The saved image tag wrapped in newlines, a placeholder string when
        the image cannot be resolved or saved, or "" on an unexpected error.
    """
    # Consolidated path: the processor implements the extraction itself
    if hasattr(image_processor, 'extract_from_pict'):
        return image_processor.extract_from_pict(pict_elem, doc, processed_images)

    # Fallback path for a plain ImageProcessor
    # NOTE(review): the placeholder literal below looks mis-encoded; it is
    # kept unchanged here because callers may compare against it.
    try:
        ns_v = 'urn:schemas-microsoft-com:vml'
        ns_r = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'

        imagedata = pict_elem.find('.//{%s}imagedata' % ns_v)
        if imagedata is None:
            return "[?��?지]"

        rId = imagedata.get('{%s}id' % ns_r)
        if not rId:
            return "[?��?지]"

        try:
            rel = doc.part.rels.get(rId)
            if rel and hasattr(rel, 'target_part') and hasattr(rel.target_part, 'blob'):
                image_data = rel.target_part.blob
                image_tag = image_processor.save_image(image_data, processed_images=processed_images)
                if image_tag:
                    return f"\n{image_tag}\n"
        except Exception as e:
            # Still best-effort (falls through to the placeholder), but the
            # failure is logged like in extract_image_from_drawing instead of
            # being swallowed silently.
            logger.warning(f"Error extracting image from relationship: {e}")

        return "[?��?지]"

    except Exception as e:
        logger.warning(f"Error processing pict element: {e}")
        return ""
139
+
140
+
141
+ __all__ = [
142
+ 'extract_image_from_drawing',
143
+ 'process_pict_element',
144
+ ]
145
+
@@ -0,0 +1,410 @@
1
+ # xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py
2
+ """
3
+ DOCX Image Processor
4
+
5
+ Provides DOCX-specific image processing that inherits from ImageProcessor.
6
+ Handles embedded images, drawing elements (image/diagram), and relationship-based images.
7
+
8
+ This class consolidates all DOCX image and drawing extraction logic including:
9
+ - Drawing/picture element extraction (blip)
10
+ - Diagram text extraction from drawings
11
+ - Legacy VML pict element processing
12
+ - Relationship-based image loading
13
+ """
14
+ import logging
15
+ from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TYPE_CHECKING
16
+
17
+ from docx.oxml.ns import qn
18
+
19
+ from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
20
+ from xgen_doc2chunk.core.functions.storage_backend import BaseStorageBackend
21
+ from xgen_doc2chunk.core.processor.docx_helper.docx_constants import ElementType
22
+
23
+ if TYPE_CHECKING:
24
+ from docx import Document
25
+ from docx.opc.part import Part
26
+
27
+ logger = logging.getLogger("xgen_doc2chunk.image_processor.docx")
28
+
29
# DOCX XML namespaces
# Prefix -> namespace URI map. It is passed as the namespace argument to
# find() calls in this module so paths can use short prefixes such as
# './/a:blip' or './/wp:inline'.
NAMESPACES = {
    # WordprocessingML main document namespace
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    # DrawingML wordprocessing drawing namespace (wp:inline / wp:anchor)
    'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
    # DrawingML main namespace (a:graphic, a:graphicData, a:blip)
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
    # Office document relationships namespace
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    # DrawingML picture namespace
    'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
}
37
+
38
+
39
class DOCXImageProcessor(ImageProcessor):
    """
    DOCX-specific image processor.

    Inherits from ImageProcessor and provides DOCX-specific processing.

    Handles:
    - Embedded images via relationships
    - Drawing/picture elements
    - Inline images in runs
    - Shape images

    Example:
        processor = DOCXImageProcessor()

        # Process relationship-based image
        tag = processor.process_image(image_data, rel_id="rId1")

        # Process from part
        tag = processor.process_image_part(image_part)
    """

    def __init__(
        self,
        directory_path: str = "temp/images",
        tag_prefix: str = "[Image:",
        tag_suffix: str = "]",
        storage_backend: Optional[BaseStorageBackend] = None,
    ):
        """
        Initialize DOCXImageProcessor.

        Args:
            directory_path: Image save directory
            tag_prefix: Tag prefix for image references
            tag_suffix: Tag suffix for image references
            storage_backend: Storage backend for saving images
        """
        super().__init__(
            directory_path=directory_path,
            tag_prefix=tag_prefix,
            tag_suffix=tag_suffix,
            storage_backend=storage_backend,
        )

    def _save_with_options(
        self,
        image_data: bytes,
        custom_name: Optional[str],
        options: dict,
    ) -> Optional[str]:
        """
        Save image data, forwarding the optional deduplication set.

        ``extract_from_drawing`` and ``extract_from_pict`` pass
        ``processed_images`` through ``**kwargs``; previously it was dropped
        before reaching ``save_image``, so deduplication never happened.

        NOTE(review): assumes ImageProcessor.save_image accepts
        ``custom_name`` and ``processed_images`` together (both keywords are
        used separately elsewhere in this package) -- confirm against
        img_processor.py.

        Args:
            image_data: Raw image binary data
            custom_name: Name to save under, or None
            options: Keyword options received by the public method

        Returns:
            Whatever ``save_image`` returns (image tag string or None)
        """
        processed_images = options.get("processed_images")
        if processed_images is None:
            # No dedup set supplied: keep the original call shape
            return self.save_image(image_data, custom_name=custom_name)
        return self.save_image(
            image_data,
            custom_name=custom_name,
            processed_images=processed_images,
        )

    def process_image(
        self,
        image_data: bytes,
        rel_id: Optional[str] = None,
        image_name: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process and save DOCX image data.

        Args:
            image_data: Raw image binary data
            rel_id: Relationship ID (for naming)
            image_name: Original image name
            **kwargs: Additional options; ``processed_images`` (a set used
                for deduplication) is forwarded to ``save_image``

        Returns:
            Image tag string, or None on failure
        """
        custom_name = image_name
        if custom_name is None and rel_id is not None:
            custom_name = f"docx_{rel_id}"

        return self._save_with_options(image_data, custom_name, kwargs)

    def process_image_part(
        self,
        image_part: "Part",
        rel_id: Optional[str] = None,
    ) -> Optional[str]:
        """
        Process image from OOXML part.

        Args:
            image_part: OOXML Part containing image data
            rel_id: Relationship ID

        Returns:
            Image tag string, or None on failure
        """
        try:
            image_data = image_part.blob
            if not image_data:
                return None

            # Use the last path segment of the part name as the file name
            image_name = None
            if hasattr(image_part, 'partname'):
                partname = str(image_part.partname)
                if '/' in partname:
                    image_name = partname.split('/')[-1]

            return self.process_image(
                image_data,
                rel_id=rel_id,
                image_name=image_name
            )

        except Exception as e:
            # self._logger is not defined in this class; presumably provided
            # by ImageProcessor -- verify against the base class.
            self._logger.warning(f"Failed to process image part: {e}")
            return None

    def process_embedded_image(
        self,
        image_data: bytes,
        image_name: Optional[str] = None,
        embed_id: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process embedded DOCX image.

        Args:
            image_data: Image binary data
            image_name: Original image filename
            embed_id: Embed relationship ID
            **kwargs: Additional options; ``processed_images`` is forwarded
                to ``save_image`` when given

        Returns:
            Image tag string, or None on failure
        """
        custom_name = image_name
        if custom_name is None and embed_id is not None:
            custom_name = f"docx_embed_{embed_id}"

        return self._save_with_options(image_data, custom_name, kwargs)

    def process_drawing_image(
        self,
        image_data: bytes,
        drawing_id: Optional[str] = None,
        description: Optional[str] = None,
        **kwargs
    ) -> Optional[str]:
        """
        Process DOCX drawing/picture element image.

        Args:
            image_data: Image binary data
            drawing_id: Drawing element ID
            description: Image description/alt text (currently unused)
            **kwargs: Additional options; ``processed_images`` is forwarded
                to ``save_image`` when given

        Returns:
            Image tag string, or None on failure
        """
        custom_name = None
        if drawing_id is not None:
            custom_name = f"docx_drawing_{drawing_id}"

        return self._save_with_options(image_data, custom_name, kwargs)

    def extract_from_drawing(
        self,
        graphic_data,
        doc: "Document",
        processed_images: Set[str],
    ) -> Tuple[str, bool]:
        """
        Extract image from Drawing graphic data element.

        This is the core DOCX image extraction logic that was previously
        in docx_image.py extract_image_from_drawing() function.

        Args:
            graphic_data: graphicData XML element
            doc: python-docx Document object
            processed_images: Set of processed image paths (deduplication)

        Returns:
            (image_tag, is_image) tuple. image_tag is the tag string, a
            placeholder, or empty; is_image indicates if an image reference
            was found.
        """
        try:
            # Find blip element (image reference)
            blip = graphic_data.find('.//a:blip', NAMESPACES)
            if blip is None:
                return "", False

            # Get relationship ID; an embedded image wins over a linked one
            rId = blip.get(qn('r:embed')) or blip.get(qn('r:link'))
            if not rId:
                return "", False

            # Find image part from relationship
            try:
                rel = doc.part.rels.get(rId)
                if rel is None:
                    return "", False

                # Extract image data
                if hasattr(rel, 'target_part') and hasattr(rel.target_part, 'blob'):
                    image_data = rel.target_part.blob

                    # Save using process_image with rel_id
                    image_tag = self.process_image(
                        image_data,
                        rel_id=rId,
                        processed_images=processed_images
                    )

                    if image_tag:
                        return f"\n{image_tag}\n", True

                # Image is referenced but no tag could be produced
                return "[Unknown Image]", True

            except Exception as e:
                logger.warning(f"Error extracting image from relationship: {e}")
                return "[Unknown Image]", True

        except Exception as e:
            logger.warning(f"Error extracting image from drawing: {e}")
            return "", False

    def extract_from_pict(
        self,
        pict_elem,
        doc: "Document",
        processed_images: Set[str],
    ) -> str:
        """
        Extract image from legacy VML pict element.

        This is the core DOCX VML image extraction logic that was previously
        in docx_image.py process_pict_element() function.

        Args:
            pict_elem: pict XML element
            doc: python-docx Document object
            processed_images: Set of processed image paths (deduplication)

        Returns:
            Image tag string wrapped in newlines, "[Unknown Image]" when the
            image cannot be resolved or saved, or "" on an unexpected error
        """
        try:
            # Find VML imagedata
            ns_v = 'urn:schemas-microsoft-com:vml'
            ns_r = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'

            imagedata = pict_elem.find('.//{%s}imagedata' % ns_v)
            if imagedata is None:
                return "[Unknown Image]"

            rId = imagedata.get('{%s}id' % ns_r)
            if not rId:
                return "[Unknown Image]"

            try:
                rel = doc.part.rels.get(rId)
                if rel and hasattr(rel, 'target_part') and hasattr(rel.target_part, 'blob'):
                    image_data = rel.target_part.blob
                    image_tag = self.process_image(
                        image_data,
                        rel_id=rId,
                        processed_images=processed_images
                    )
                    if image_tag:
                        return f"\n{image_tag}\n"
            except Exception as e:
                # Still best-effort (falls through to the placeholder), but
                # log the failure like extract_from_drawing does instead of
                # swallowing it silently.
                logger.warning(f"Error extracting image from relationship: {e}")

            return "[Unknown Image]"

        except Exception as e:
            logger.warning(f"Error processing pict element: {e}")
            return ""

    def process_drawing_element(
        self,
        drawing_elem,
        doc: "Document",
        processed_images: Set[str],
        chart_callback: Optional[Callable[[], str]] = None,
    ) -> Tuple[str, Optional[ElementType]]:
        """
        Process Drawing element (image, chart, diagram).

        Main entry point for handling all drawing elements in DOCX.
        Branches to appropriate handler based on the graphicData ``uri``.

        Args:
            drawing_elem: drawing XML element
            doc: python-docx Document object
            processed_images: Set of processed image paths (deduplication)
            chart_callback: Callback function to get next chart content

        Returns:
            (content, element_type) tuple; ("", None) when the element is
            not recognised or an error occurs
        """
        try:
            # Check inline or anchor; inline takes precedence
            inline = drawing_elem.find('.//wp:inline', NAMESPACES)
            anchor = drawing_elem.find('.//wp:anchor', NAMESPACES)

            container = inline if inline is not None else anchor
            if container is None:
                return "", None

            # Check graphic data
            graphic = container.find('.//a:graphic', NAMESPACES)
            if graphic is None:
                return "", None

            graphic_data = graphic.find('a:graphicData', NAMESPACES)
            if graphic_data is None:
                return "", None

            # Lower-case once; the content kind is detected by substring
            uri = graphic_data.get('uri', '').lower()

            # Image case
            if 'picture' in uri:
                content, is_image = self.extract_from_drawing(
                    graphic_data, doc, processed_images
                )
                return (content, ElementType.IMAGE) if is_image else ("", None)

            # Chart case - delegate to callback
            if 'chart' in uri:
                if chart_callback:
                    chart_content = chart_callback()
                    return chart_content, ElementType.CHART
                return "", ElementType.CHART

            # Diagram case
            if 'diagram' in uri:
                return self.extract_diagram(graphic_data)

            return "", None

        except Exception as e:
            logger.warning(f"Error processing drawing element: {e}")
            return "", None

    def extract_diagram(
        self,
        graphic_data,
    ) -> Tuple[str, Optional[ElementType]]:
        """
        Extract diagram information from Drawing.

        Collects the text of every DrawingML ``t`` element below
        ``graphic_data`` and joins the non-empty pieces with " / ".

        Args:
            graphic_data: graphicData XML element

        Returns:
            (content, element_type) tuple; content is "[Diagram: ...]" when
            text was found, otherwise "[Diagram]"
        """
        try:
            ns_a = 'http://schemas.openxmlformats.org/drawingml/2006/main'
            texts = []
            for t_elem in graphic_data.findall('.//{%s}t' % ns_a):
                # Skip whitespace-only runs so they do not produce empty
                # segments such as "[Diagram:  / A]"
                text = (t_elem.text or "").strip()
                if text:
                    texts.append(text)

            if texts:
                return f"[Diagram: {' / '.join(texts)}]", ElementType.DIAGRAM

            return "[Diagram]", ElementType.DIAGRAM

        except Exception as e:
            logger.warning(f"Error extracting diagram: {e}")
            return "[Diagram]", ElementType.DIAGRAM
408
+
409
+
410
+ __all__ = ["DOCXImageProcessor"]
@@ -0,0 +1,71 @@
1
+ # xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py
2
+ """
3
+ DOCX Metadata Extraction Module
4
+
5
+ Provides DOCXMetadataExtractor class for extracting metadata from DOCX documents
6
+ using python-docx core_properties. Implements BaseMetadataExtractor interface.
7
+ """
8
+ import logging
9
+ from typing import Any, Optional
10
+
11
+ from docx import Document
12
+
13
+ from xgen_doc2chunk.core.functions.metadata_extractor import (
14
+ BaseMetadataExtractor,
15
+ DocumentMetadata,
16
+ )
17
+
18
+ logger = logging.getLogger("document-processor")
19
+
20
+
21
class DOCXMetadataExtractor(BaseMetadataExtractor):
    """
    DOCX Metadata Extractor.

    Extracts metadata from python-docx Document objects.

    Supported fields:
    - title, subject, author, keywords, comments
    - last_saved_by, create_time, last_saved_time

    Usage:
        extractor = DOCXMetadataExtractor()
        metadata = extractor.extract(docx_document)
        text = extractor.format(metadata)
    """

    def extract(self, source: Document) -> DocumentMetadata:
        """
        Extract metadata from DOCX document.

        Reads ``source.core_properties`` and maps its fields onto
        DocumentMetadata. Any failure is logged and an empty
        DocumentMetadata is returned instead of raising.

        Args:
            source: python-docx Document object

        Returns:
            DocumentMetadata instance containing extracted metadata.
        """
        try:
            props = source.core_properties

            return DocumentMetadata(
                title=self._get_stripped(props.title),
                subject=self._get_stripped(props.subject),
                author=self._get_stripped(props.author),
                keywords=self._get_stripped(props.keywords),
                comments=self._get_stripped(props.comments),
                last_saved_by=self._get_stripped(props.last_modified_by),
                create_time=props.created,
                last_saved_time=props.modified,
            )
        except Exception as e:
            self.logger.warning(f"Failed to extract DOCX metadata: {e}")
            return DocumentMetadata()

    def _get_stripped(self, value: Optional[str]) -> Optional[str]:
        """Return stripped string value, or None if empty or whitespace-only."""
        if not value:
            return None
        # A whitespace-only value strips to "", which must also become None
        # to honour the "None if empty" contract.
        return value.strip() or None
67
+
68
+
69
+ __all__ = [
70
+ 'DOCXMetadataExtractor',
71
+ ]