xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,542 @@
1
+ # xgen_doc2chunk/core/functions/metadata_extractor.py
2
+ """
3
+ Metadata Extractor Interface
4
+
5
+ Provides abstract base class and common utilities for document metadata extraction.
6
+ Each handler's helper module should implement a concrete extractor inheriting from
7
+ BaseMetadataExtractor.
8
+
9
+ This module defines:
10
+ - DocumentMetadata: Standardized metadata container dataclass
11
+ - MetadataField: Enum for standard metadata field names
12
+ - BaseMetadataExtractor: Abstract base class for metadata extractors
13
+ - MetadataFormatter: Shared formatter for consistent metadata output
14
+
15
+ Usage Example:
16
+ from xgen_doc2chunk.core.functions.metadata_extractor import (
17
+ BaseMetadataExtractor,
18
+ DocumentMetadata,
19
+ MetadataFormatter,
20
+ )
21
+
22
+ class PDFMetadataExtractor(BaseMetadataExtractor):
23
+ def extract(self, source: Any) -> DocumentMetadata:
24
+ # PDF-specific extraction logic
25
+ ...
26
+ """
27
+ import logging
28
+ from abc import ABC, abstractmethod
29
+ from dataclasses import dataclass, field
30
+ from datetime import datetime
31
+ from enum import Enum
32
+ from typing import Any, Dict, Optional
33
+
34
+ logger = logging.getLogger("xgen_doc2chunk.metadata")
35
+
36
+
37
class MetadataField(str, Enum):
    """
    Canonical metadata key names shared by every document handler.

    Each member mixes in ``str``, so a member compares equal to its plain
    string value (``MetadataField.TITLE == "title"``). The ``.value`` of a
    member is the key used in metadata dictionaries and label tables.
    """

    # --- Core document properties -------------------------------------
    TITLE = "title"
    SUBJECT = "subject"
    AUTHOR = "author"
    KEYWORDS = "keywords"
    COMMENTS = "comments"
    LAST_SAVED_BY = "last_saved_by"
    CREATE_TIME = "create_time"
    LAST_SAVED_TIME = "last_saved_time"

    # --- Extra properties exposed by some formats ---------------------
    VERSION = "version"
    CATEGORY = "category"
    COMPANY = "company"
    MANAGER = "manager"

    # --- File-level properties (CSV and similar) ----------------------
    FILE_NAME = "file_name"
    FILE_SIZE = "file_size"
    ENCODING = "encoding"
    ROW_COUNT = "row_count"
    COL_COUNT = "col_count"
65
+
66
+
67
@dataclass
class DocumentMetadata:
    """
    Standardized metadata container for all document types.

    Holds the eight standard metadata fields as attributes and keeps any
    format-specific extras in the ``custom`` dictionary.

    Attributes:
        title: Document title
        subject: Document subject
        author: Document author/creator
        keywords: Document keywords
        comments: Document comments/description
        last_saved_by: Last person who saved the document
        create_time: Document creation timestamp
        last_saved_time: Last modification timestamp
        custom: Dictionary for format-specific additional fields

    Example:
        >>> metadata = DocumentMetadata(
        ...     title="Annual Report",
        ...     author="John Doe",
        ...     create_time=datetime.now()
        ... )
        >>> metadata.to_dict()
        {'title': 'Annual Report', 'author': 'John Doe', ...}
    """
    title: Optional[str] = None
    subject: Optional[str] = None
    author: Optional[str] = None
    keywords: Optional[str] = None
    comments: Optional[str] = None
    last_saved_by: Optional[str] = None
    create_time: Optional[datetime] = None
    last_saved_time: Optional[datetime] = None
    custom: Dict[str, Any] = field(default_factory=dict)

    @staticmethod
    def _standard_keys() -> tuple:
        """
        Return the standard field keys in their canonical output order.

        Single source of truth for to_dict() and from_dict(). Each key is
        both the dictionary key and the name of the attribute on this class,
        so the two must stay identical.
        """
        return (
            MetadataField.TITLE.value,
            MetadataField.SUBJECT.value,
            MetadataField.AUTHOR.value,
            MetadataField.KEYWORDS.value,
            MetadataField.COMMENTS.value,
            MetadataField.LAST_SAVED_BY.value,
            MetadataField.CREATE_TIME.value,
            MetadataField.LAST_SAVED_TIME.value,
        )

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert metadata to dictionary.

        Standard fields are included only when their value is truthy, so
        both ``None`` and empty strings are left out. The entries of
        ``custom`` are then merged in unfiltered; a custom entry whose key
        matches a standard field replaces that field's value.

        Returns:
            Dictionary with the populated standard fields followed by the
            custom fields.
        """
        result: Dict[str, Any] = {}

        for key in self._standard_keys():
            value = getattr(self, key)
            if value:
                result[key] = value

        # Custom fields are merged last and therefore take precedence.
        result.update(self.custom)

        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DocumentMetadata":
        """
        Create DocumentMetadata from dictionary.

        Standard fields are extracted into their respective attributes
        (missing ones become ``None``), while every other key goes into the
        ``custom`` dictionary.

        Args:
            data: Dictionary containing metadata fields.

        Returns:
            DocumentMetadata instance.
        """
        standard_keys = cls._standard_keys()

        known = {key: data.get(key) for key in standard_keys}
        custom = {k: v for k, v in data.items() if k not in standard_keys}

        return cls(custom=custom, **known)

    def is_empty(self) -> bool:
        """
        Check if metadata is empty (no fields set).

        Returns:
            True if to_dict() yields no entries at all.
        """
        return not self.to_dict()

    def __bool__(self) -> bool:
        """Return True if metadata has any fields set."""
        return not self.is_empty()
188
+
189
+
190
class MetadataFormatter:
    """
    Shared formatter for consistent metadata output.

    Renders a DocumentMetadata object as a block of ``Label: value`` lines
    wrapped in an opening and a closing tag, for inclusion in extracted
    text output.

    Attributes:
        metadata_tag_prefix: Opening tag for metadata section (default: "<Document-Metadata>")
        metadata_tag_suffix: Closing tag for metadata section (default: "</Document-Metadata>")
        field_labels: Per-instance dictionary mapping field names to display labels
        date_format: Date/time format string
        language: Output language ('ko' for Korean, anything else for English)
        indent: String placed in front of every field line

    Example:
        >>> formatter = MetadataFormatter(language='en')
        >>> text = formatter.format(metadata)
        >>> print(text)
        <Document-Metadata>
         Title: Annual Report
         Author: John Doe
        </Document-Metadata>
    """

    # Field labels in Korean
    LABELS_KO = {
        MetadataField.TITLE.value: "제목",
        MetadataField.SUBJECT.value: "주제",
        MetadataField.AUTHOR.value: "작성자",
        MetadataField.KEYWORDS.value: "키워드",
        MetadataField.COMMENTS.value: "설명",
        MetadataField.LAST_SAVED_BY.value: "마지막 수정자",
        MetadataField.CREATE_TIME.value: "작성일",
        MetadataField.LAST_SAVED_TIME.value: "수정일",
        # Additional fields
        MetadataField.VERSION.value: "버전",
        MetadataField.CATEGORY.value: "범주",
        MetadataField.COMPANY.value: "회사",
        MetadataField.MANAGER.value: "관리자",
        MetadataField.FILE_NAME.value: "파일명",
        MetadataField.FILE_SIZE.value: "파일 크기",
        MetadataField.ENCODING.value: "인코딩",
        MetadataField.ROW_COUNT.value: "행 개수",
        MetadataField.COL_COUNT.value: "열 개수",
    }

    # Field labels in English
    LABELS_EN = {
        MetadataField.TITLE.value: "Title",
        MetadataField.SUBJECT.value: "Subject",
        MetadataField.AUTHOR.value: "Author",
        MetadataField.KEYWORDS.value: "Keywords",
        MetadataField.COMMENTS.value: "Comments",
        MetadataField.LAST_SAVED_BY.value: "Last Saved By",
        MetadataField.CREATE_TIME.value: "Created",
        MetadataField.LAST_SAVED_TIME.value: "Last Modified",
        # Additional fields
        MetadataField.VERSION.value: "Version",
        MetadataField.CATEGORY.value: "Category",
        MetadataField.COMPANY.value: "Company",
        MetadataField.MANAGER.value: "Manager",
        MetadataField.FILE_NAME.value: "File Name",
        MetadataField.FILE_SIZE.value: "File Size",
        MetadataField.ENCODING.value: "Encoding",
        MetadataField.ROW_COUNT.value: "Row Count",
        MetadataField.COL_COUNT.value: "Column Count",
    }

    # Standard field order for output
    FIELD_ORDER = [
        MetadataField.TITLE.value,
        MetadataField.SUBJECT.value,
        MetadataField.AUTHOR.value,
        MetadataField.KEYWORDS.value,
        MetadataField.COMMENTS.value,
        MetadataField.LAST_SAVED_BY.value,
        MetadataField.CREATE_TIME.value,
        MetadataField.LAST_SAVED_TIME.value,
    ]

    def __init__(
        self,
        metadata_tag_prefix: str = "<Document-Metadata>",
        metadata_tag_suffix: str = "</Document-Metadata>",
        date_format: str = "%Y-%m-%d %H:%M:%S",
        language: str = "ko",
        indent: str = " ",
    ):
        """
        Initialize MetadataFormatter.

        Args:
            metadata_tag_prefix: Opening tag for metadata section
            metadata_tag_suffix: Closing tag for metadata section
            date_format: strftime format for datetime values
            language: Output language; 'ko' selects the Korean labels, any
                other value selects the English labels
            indent: Indentation string for each field
        """
        self.metadata_tag_prefix = metadata_tag_prefix
        self.metadata_tag_suffix = metadata_tag_suffix
        self.date_format = date_format
        self.language = language
        self.indent = indent

        # Copy the selected table so that customising one formatter's labels
        # never leaks into the class-level constants shared by all instances.
        self.field_labels = dict(
            self.LABELS_KO if language == "ko" else self.LABELS_EN
        )

    def format(self, metadata: DocumentMetadata) -> str:
        """
        Format DocumentMetadata as a string.

        Standard fields come first, in FIELD_ORDER; all remaining (custom)
        fields follow in the order to_dict() yields them.

        Args:
            metadata: DocumentMetadata instance to format.

        Returns:
            Formatted metadata string, or empty string if metadata is empty
            or none of its fields produces an output line.
        """
        if not metadata:
            return ""

        data = metadata.to_dict()
        if not data:
            return ""

        field_lines = []

        # Output standard fields in order; pop them so only custom ones remain.
        for field_name in self.FIELD_ORDER:
            if field_name in data:
                formatted_line = self._format_field(field_name, data.pop(field_name))
                if formatted_line:
                    field_lines.append(formatted_line)

        # Output remaining custom fields
        for field_name, value in data.items():
            formatted_line = self._format_field(field_name, value)
            if formatted_line:
                field_lines.append(formatted_line)

        # Every value was skipped: do not emit a pair of tags with no content.
        if not field_lines:
            return ""

        return "\n".join(
            [self.metadata_tag_prefix, *field_lines, self.metadata_tag_suffix]
        )

    def format_dict(self, metadata_dict: Dict[str, Any]) -> str:
        """
        Format metadata dictionary as a string.

        Convenience method for formatting raw dictionaries without
        first converting to DocumentMetadata.

        Args:
            metadata_dict: Dictionary containing metadata fields.

        Returns:
            Formatted metadata string, or empty string for an empty/None dict.
        """
        if not metadata_dict:
            return ""

        return self.format(DocumentMetadata.from_dict(metadata_dict))

    def _format_field(self, field_name: str, value: Any) -> Optional[str]:
        """
        Format a single metadata field.

        Args:
            field_name: Field name
            value: Field value

        Returns:
            Formatted field string, or None if value is None or a blank
            string. Other falsy values such as ``0`` are still rendered.
        """
        if value is None:
            return None

        if isinstance(value, datetime):
            # Format datetime values
            value = value.strftime(self.date_format)
        elif isinstance(value, str) and not value.strip():
            # A blank string carries no information; skip the line entirely.
            return None

        return f"{self.indent}{self.get_label(field_name)}: {value}"

    def get_label(self, field_name: str) -> str:
        """
        Get display label for a field name.

        Args:
            field_name: Field name

        Returns:
            The configured label, or a title-cased version of the field name
            (underscores replaced by spaces) when no label is configured.
        """
        label = self.field_labels.get(field_name)
        if label is None:
            # Build the fallback only when needed; str() keeps non-string
            # custom keys from raising here.
            label = str(field_name).replace("_", " ").title()
        return label
386
+
387
+
388
class BaseMetadataExtractor(ABC):
    """
    Abstract base class for metadata extractors.

    Each document format should implement a concrete extractor
    that inherits from this class and provides format-specific
    extraction logic.

    Subclasses must implement:
        - extract(): Extract metadata from format-specific source object

    Subclasses may optionally override:
        - format(): Customize metadata formatting

    A custom formatter is supplied through the ``formatter`` constructor
    argument.

    Attributes:
        formatter: MetadataFormatter instance for output formatting
        logger: Logger instance for this extractor

    Example:
        class PDFMetadataExtractor(BaseMetadataExtractor):
            def extract(self, doc) -> DocumentMetadata:
                # Extract from PyMuPDF document object
                pdf_meta = doc.metadata
                return DocumentMetadata(
                    title=pdf_meta.get('title'),
                    author=pdf_meta.get('author'),
                    ...
                )
    """

    def __init__(
        self,
        formatter: Optional[MetadataFormatter] = None,
        language: str = "ko",
    ):
        """
        Initialize BaseMetadataExtractor.

        Args:
            formatter: Custom MetadataFormatter instance (optional)
            language: Language for the default formatter; only used when
                no formatter is passed in
        """
        self._formatter = formatter or MetadataFormatter(language=language)
        # One logger per concrete extractor class, under a common prefix.
        self._logger = logging.getLogger(
            f"xgen_doc2chunk.metadata.{self.__class__.__name__}"
        )

    @property
    def formatter(self) -> MetadataFormatter:
        """Get the metadata formatter instance."""
        return self._formatter

    @property
    def logger(self) -> logging.Logger:
        """Get the logger instance."""
        return self._logger

    @abstractmethod
    def extract(self, source: Any) -> DocumentMetadata:
        """
        Extract metadata from source object.

        This method must be implemented by subclasses to provide
        format-specific metadata extraction logic.

        Args:
            source: Format-specific source object (e.g., PyMuPDF doc,
                python-docx Document, openpyxl Workbook, etc.)

        Returns:
            DocumentMetadata instance containing extracted metadata.
        """
        pass

    def format(self, metadata: DocumentMetadata) -> str:
        """
        Format metadata as a string.

        Uses the formatter to convert DocumentMetadata to a string.
        Can be overridden by subclasses for custom formatting.

        Args:
            metadata: DocumentMetadata instance to format.

        Returns:
            Formatted metadata string.
        """
        return self._formatter.format(metadata)

    def extract_and_format(self, source: Any) -> str:
        """
        Extract metadata and format as string in one step.

        Convenience method that combines extract() and format(). It is
        best-effort: any exception raised while extracting or formatting is
        logged as a warning and an empty string is returned instead.

        Args:
            source: Format-specific source object.

        Returns:
            Formatted metadata string, or "" on failure.
        """
        try:
            metadata = self.extract(source)
            return self.format(metadata)
        except Exception as e:
            # Deliberately broad: a metadata failure must not abort the caller.
            self._logger.warning("Failed to extract metadata: %s", e)
            return ""

    def extract_to_dict(self, source: Any) -> Dict[str, Any]:
        """
        Extract metadata and return as dictionary.

        Convenience method that extracts metadata and converts to dict.
        It is best-effort: any exception is logged as a warning and an
        empty dictionary is returned instead.

        Args:
            source: Format-specific source object.

        Returns:
            Dictionary containing metadata fields; {} when extract()
            returns None or on failure.
        """
        try:
            metadata = self.extract(source)
            # Treat "nothing extracted" like empty metadata rather than an
            # error, matching how extract_and_format() behaves for None.
            return metadata.to_dict() if metadata is not None else {}
        except Exception as e:
            # Deliberately broad: a metadata failure must not abort the caller.
            self._logger.warning("Failed to extract metadata: %s", e)
            return {}
515
+
516
+
517
# Module-wide formatter used by format_metadata(); renders Korean labels.
_default_formatter = MetadataFormatter(language="ko")


def format_metadata(metadata: Dict[str, Any]) -> str:
    """
    Render a raw metadata dictionary with the module's default formatter.

    Kept as a function-style entry point for backward compatibility; it
    simply delegates to the default formatter's format_dict().

    Args:
        metadata: Dictionary containing metadata fields.

    Returns:
        Formatted metadata string.
    """
    rendered = _default_formatter.format_dict(metadata)
    return rendered
534
+
535
+
536
+ __all__ = [
537
+ "MetadataField",
538
+ "DocumentMetadata",
539
+ "MetadataFormatter",
540
+ "BaseMetadataExtractor",
541
+ "format_metadata",
542
+ ]