xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,542 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/functions/metadata_extractor.py
|
|
2
|
+
"""
|
|
3
|
+
Metadata Extractor Interface
|
|
4
|
+
|
|
5
|
+
Provides abstract base class and common utilities for document metadata extraction.
|
|
6
|
+
Each handler's helper module should implement a concrete extractor inheriting from
|
|
7
|
+
BaseMetadataExtractor.
|
|
8
|
+
|
|
9
|
+
This module defines:
|
|
10
|
+
- DocumentMetadata: Standardized metadata container dataclass
|
|
11
|
+
- MetadataField: Enum for standard metadata field names
|
|
12
|
+
- BaseMetadataExtractor: Abstract base class for metadata extractors
|
|
13
|
+
- MetadataFormatter: Shared formatter for consistent metadata output
|
|
14
|
+
|
|
15
|
+
Usage Example:
|
|
16
|
+
from xgen_doc2chunk.core.functions.metadata_extractor import (
|
|
17
|
+
BaseMetadataExtractor,
|
|
18
|
+
DocumentMetadata,
|
|
19
|
+
MetadataFormatter,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
class PDFMetadataExtractor(BaseMetadataExtractor):
|
|
23
|
+
def extract(self, source: Any) -> DocumentMetadata:
|
|
24
|
+
# PDF-specific extraction logic
|
|
25
|
+
...
|
|
26
|
+
"""
|
|
27
|
+
import logging
|
|
28
|
+
from abc import ABC, abstractmethod
|
|
29
|
+
from dataclasses import dataclass, field
|
|
30
|
+
from datetime import datetime
|
|
31
|
+
from enum import Enum
|
|
32
|
+
from typing import Any, Dict, Optional
|
|
33
|
+
|
|
34
|
+
# Module-level logger. The extractor base class in this file creates its own
# per-class loggers named "xgen_doc2chunk.metadata.<ClassName>", which sit
# under this name in the logging hierarchy.
logger = logging.getLogger("xgen_doc2chunk.metadata")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class MetadataField(str, Enum):
    """
    Canonical key names for document metadata.

    The enum mixes in ``str``, so every member compares equal to its plain
    string value (``MetadataField.TITLE == "title"``). Using these members
    instead of ad-hoc string literals keeps the keys identical across all
    document formats.
    """

    # --- Core document properties ---
    TITLE = "title"
    SUBJECT = "subject"
    AUTHOR = "author"
    KEYWORDS = "keywords"
    COMMENTS = "comments"
    LAST_SAVED_BY = "last_saved_by"
    CREATE_TIME = "create_time"
    LAST_SAVED_TIME = "last_saved_time"

    # --- Extra properties used by specific formats ---
    VERSION = "version"
    CATEGORY = "category"
    COMPANY = "company"
    MANAGER = "manager"

    # --- File-level properties (for CSV, etc.) ---
    FILE_NAME = "file_name"
    FILE_SIZE = "file_size"
    ENCODING = "encoding"
    ROW_COUNT = "row_count"
    COL_COUNT = "col_count"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@dataclass
class DocumentMetadata:
    """
    Format-independent container for document metadata.

    Eight common properties are stored as typed attributes; anything that is
    specific to one file format goes into the ``custom`` dictionary.

    Attributes:
        title: Document title
        subject: Document subject
        author: Document author/creator
        keywords: Document keywords
        comments: Document comments/description
        last_saved_by: Last person who saved the document
        create_time: Document creation timestamp
        last_saved_time: Last modification timestamp
        custom: Dictionary for format-specific additional fields

    Example:
        >>> metadata = DocumentMetadata(
        ...     title="Annual Report",
        ...     author="John Doe",
        ...     create_time=datetime.now()
        ... )
        >>> metadata.to_dict()
        {'title': 'Annual Report', 'author': 'John Doe', ...}
    """

    title: Optional[str] = None
    subject: Optional[str] = None
    author: Optional[str] = None
    keywords: Optional[str] = None
    comments: Optional[str] = None
    last_saved_by: Optional[str] = None
    create_time: Optional[datetime] = None
    last_saved_time: Optional[datetime] = None
    custom: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """
        Flatten the metadata into a single dictionary.

        A standard attribute is included only when its value is truthy, so
        ``None`` and empty strings are both left out. The ``custom`` entries
        are merged in last; a custom key equal to a standard key therefore
        replaces the standard value.

        Returns:
            New dictionary with the populated standard fields followed by
            the custom fields.
        """
        # (enum member name, current value) in output order.
        candidates = (
            ("TITLE", self.title),
            ("SUBJECT", self.subject),
            ("AUTHOR", self.author),
            ("KEYWORDS", self.keywords),
            ("COMMENTS", self.comments),
            ("LAST_SAVED_BY", self.last_saved_by),
            ("CREATE_TIME", self.create_time),
            ("LAST_SAVED_TIME", self.last_saved_time),
        )

        flattened: Dict[str, Any] = {}
        for member_name, attr_value in candidates:
            if attr_value:
                flattened[MetadataField[member_name].value] = attr_value

        flattened.update(self.custom)
        return flattened

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "DocumentMetadata":
        """
        Build a DocumentMetadata from a flat dictionary.

        Keys matching one of the eight standard field names populate the
        corresponding attribute (missing keys become ``None``); every other
        key is collected into ``custom``.

        Args:
            data: Dictionary containing metadata fields.

        Returns:
            DocumentMetadata instance.
        """
        # Dictionary key -> constructor argument it feeds.
        attr_by_key = {
            MetadataField.TITLE.value: "title",
            MetadataField.SUBJECT.value: "subject",
            MetadataField.AUTHOR.value: "author",
            MetadataField.KEYWORDS.value: "keywords",
            MetadataField.COMMENTS.value: "comments",
            MetadataField.LAST_SAVED_BY.value: "last_saved_by",
            MetadataField.CREATE_TIME.value: "create_time",
            MetadataField.LAST_SAVED_TIME.value: "last_saved_time",
        }

        standard_kwargs = {
            attr_name: data.get(key) for key, attr_name in attr_by_key.items()
        }
        extras = {
            key: value for key, value in data.items() if key not in attr_by_key
        }
        return cls(custom=extras, **standard_kwargs)

    def is_empty(self) -> bool:
        """
        Report whether the metadata carries no information.

        Returns:
            True when ``to_dict()`` produces an empty dictionary.
        """
        contents = self.to_dict()
        return not contents

    def __bool__(self) -> bool:
        """Return True if metadata has any fields set."""
        if self.is_empty():
            return False
        return True
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
class MetadataFormatter:
    """
    Shared formatter for consistent metadata output.

    Renders a DocumentMetadata object as a tagged, line-per-field text block
    for inclusion in extracted text output.

    Attributes:
        metadata_tag_prefix: Opening tag for metadata section (default: "<Document-Metadata>")
        metadata_tag_suffix: Closing tag for metadata section (default: "</Document-Metadata>")
        field_labels: Per-instance dictionary mapping field names to display
            labels (a copy of LABELS_KO or LABELS_EN, safe to customise)
        date_format: Date/time format string
        language: Output language ('ko' for Korean; any other value selects
            the English labels)
        indent: String placed in front of every field line

    Example:
        >>> formatter = MetadataFormatter(language='en')
        >>> text = formatter.format(metadata)
        >>> print(text)
        <Document-Metadata>
         Title: Annual Report
         Author: John Doe
        </Document-Metadata>
    """

    # Field labels in Korean
    LABELS_KO = {
        MetadataField.TITLE.value: "제목",
        MetadataField.SUBJECT.value: "주제",
        MetadataField.AUTHOR.value: "작성자",
        MetadataField.KEYWORDS.value: "키워드",
        MetadataField.COMMENTS.value: "설명",
        MetadataField.LAST_SAVED_BY.value: "마지막 수정자",
        MetadataField.CREATE_TIME.value: "작성일",
        MetadataField.LAST_SAVED_TIME.value: "수정일",
        # Additional fields
        MetadataField.VERSION.value: "버전",
        MetadataField.CATEGORY.value: "범주",
        MetadataField.COMPANY.value: "회사",
        MetadataField.MANAGER.value: "관리자",
        MetadataField.FILE_NAME.value: "파일명",
        MetadataField.FILE_SIZE.value: "파일 크기",
        MetadataField.ENCODING.value: "인코딩",
        MetadataField.ROW_COUNT.value: "행 개수",
        MetadataField.COL_COUNT.value: "열 개수",
    }

    # Field labels in English
    LABELS_EN = {
        MetadataField.TITLE.value: "Title",
        MetadataField.SUBJECT.value: "Subject",
        MetadataField.AUTHOR.value: "Author",
        MetadataField.KEYWORDS.value: "Keywords",
        MetadataField.COMMENTS.value: "Comments",
        MetadataField.LAST_SAVED_BY.value: "Last Saved By",
        MetadataField.CREATE_TIME.value: "Created",
        MetadataField.LAST_SAVED_TIME.value: "Last Modified",
        # Additional fields
        MetadataField.VERSION.value: "Version",
        MetadataField.CATEGORY.value: "Category",
        MetadataField.COMPANY.value: "Company",
        MetadataField.MANAGER.value: "Manager",
        MetadataField.FILE_NAME.value: "File Name",
        MetadataField.FILE_SIZE.value: "File Size",
        MetadataField.ENCODING.value: "Encoding",
        MetadataField.ROW_COUNT.value: "Row Count",
        MetadataField.COL_COUNT.value: "Column Count",
    }

    # Standard field order for output
    FIELD_ORDER = [
        MetadataField.TITLE.value,
        MetadataField.SUBJECT.value,
        MetadataField.AUTHOR.value,
        MetadataField.KEYWORDS.value,
        MetadataField.COMMENTS.value,
        MetadataField.LAST_SAVED_BY.value,
        MetadataField.CREATE_TIME.value,
        MetadataField.LAST_SAVED_TIME.value,
    ]

    def __init__(
        self,
        metadata_tag_prefix: str = "<Document-Metadata>",
        metadata_tag_suffix: str = "</Document-Metadata>",
        date_format: str = "%Y-%m-%d %H:%M:%S",
        language: str = "ko",
        indent: str = " ",
    ):
        """
        Initialize MetadataFormatter.

        Args:
            metadata_tag_prefix: Opening tag for metadata section
            metadata_tag_suffix: Closing tag for metadata section
            date_format: strftime format for datetime values
            language: Output language ('ko' selects Korean labels; any other
                value selects English labels)
            indent: Indentation string for each field
        """
        self.metadata_tag_prefix = metadata_tag_prefix
        self.metadata_tag_suffix = metadata_tag_suffix
        self.date_format = date_format
        self.language = language
        self.indent = indent

        # Copy the class-level table so that customising one formatter's
        # labels cannot leak into other instances that share the table.
        label_table = self.LABELS_KO if language == "ko" else self.LABELS_EN
        self.field_labels = dict(label_table)

    def format(self, metadata: DocumentMetadata) -> str:
        """
        Format DocumentMetadata as a string.

        Standard fields are written first, in FIELD_ORDER; any remaining
        (custom) fields follow in the order ``to_dict()`` yields them.

        Args:
            metadata: DocumentMetadata instance to format.

        Returns:
            Formatted metadata string wrapped in the prefix/suffix tags, or
            an empty string if the metadata is empty or no field produced an
            output line (for example when every value is None).
        """
        if not metadata:
            return ""

        data = metadata.to_dict()
        if not data:
            return ""

        ordered_names = [name for name in self.FIELD_ORDER if name in data]
        ordered_names.extend(
            name for name in data if name not in self.FIELD_ORDER
        )

        field_lines = []
        for name in ordered_names:
            formatted_line = self._format_field(name, data[name])
            if formatted_line:
                field_lines.append(formatted_line)

        # Do not emit a bare tag pair with nothing between the tags.
        if not field_lines:
            return ""

        return "\n".join(
            [self.metadata_tag_prefix, *field_lines, self.metadata_tag_suffix]
        )

    def format_dict(self, metadata_dict: Dict[str, Any]) -> str:
        """
        Format metadata dictionary as a string.

        Convenience method for formatting raw dictionaries without
        first converting to DocumentMetadata.

        Args:
            metadata_dict: Dictionary containing metadata fields.

        Returns:
            Formatted metadata string, or an empty string for an empty or
            missing dictionary.
        """
        if not metadata_dict:
            return ""

        return self.format(DocumentMetadata.from_dict(metadata_dict))

    def _format_field(self, field_name: str, value: Any) -> Optional[str]:
        """
        Format a single metadata field as ``"<indent><label>: <value>"``.

        Args:
            field_name: Field name
            value: Field value; datetime values are rendered with
                ``date_format``, everything else via normal f-string
                conversion.

        Returns:
            Formatted field string, or None if value is None.
        """
        if value is None:
            return None

        # Format datetime values
        if isinstance(value, datetime):
            value = value.strftime(self.date_format)

        # Single source of truth for the label and its fallback.
        label = self.get_label(field_name)

        return f"{self.indent}{label}: {value}"

    def get_label(self, field_name: str) -> str:
        """
        Get display label for a field name.

        Args:
            field_name: Field name

        Returns:
            The configured label, or, for unknown fields, the field name with
            underscores replaced by spaces and title-cased.
        """
        return self.field_labels.get(field_name, field_name.replace("_", " ").title())
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
class BaseMetadataExtractor(ABC):
    """
    Common base for format-specific metadata extractors.

    A concrete extractor supplies the format-specific ``extract()``; this
    base class adds formatting and best-effort convenience wrappers on top.

    Subclasses must implement:
        - extract(): Extract metadata from format-specific source object

    Subclasses may optionally override:
        - format(): Customize metadata formatting

    Attributes:
        formatter: MetadataFormatter instance for output formatting
        logger: Logger instance for this extractor

    Example:
        class PDFMetadataExtractor(BaseMetadataExtractor):
            def extract(self, doc) -> DocumentMetadata:
                # Extract from PyMuPDF document object
                pdf_meta = doc.metadata
                return DocumentMetadata(
                    title=pdf_meta.get('title'),
                    author=pdf_meta.get('author'),
                    ...
                )
    """

    def __init__(
        self,
        formatter: Optional[MetadataFormatter] = None,
        language: str = "ko",
    ):
        """
        Initialize BaseMetadataExtractor.

        Args:
            formatter: Custom MetadataFormatter instance (optional)
            language: Language passed to the MetadataFormatter that is
                created when no formatter is supplied
        """
        if formatter:
            self._formatter = formatter
        else:
            self._formatter = MetadataFormatter(language=language)

        # One logger per concrete extractor class.
        logger_name = f"xgen_doc2chunk.metadata.{self.__class__.__name__}"
        self._logger = logging.getLogger(logger_name)

    @property
    def formatter(self) -> MetadataFormatter:
        """Formatter used by ``format()``."""
        return self._formatter

    @property
    def logger(self) -> logging.Logger:
        """Logger dedicated to this extractor class."""
        return self._logger

    @abstractmethod
    def extract(self, source: Any) -> DocumentMetadata:
        """
        Extract metadata from source object.

        Must be implemented by subclasses with the format-specific
        extraction logic.

        Args:
            source: Format-specific source object (e.g., PyMuPDF doc,
                python-docx Document, openpyxl Workbook, etc.)

        Returns:
            DocumentMetadata instance containing extracted metadata.
        """

    def format(self, metadata: DocumentMetadata) -> str:
        """
        Render metadata as text using the configured formatter.

        Subclasses can override this for custom formatting.

        Args:
            metadata: DocumentMetadata instance to format.

        Returns:
            Formatted metadata string.
        """
        rendered = self._formatter.format(metadata)
        return rendered

    def extract_and_format(self, source: Any) -> str:
        """
        Run ``extract()`` and ``format()`` in one best-effort step.

        Any exception raised by either call is logged as a warning and
        swallowed.

        Args:
            source: Format-specific source object.

        Returns:
            Formatted metadata string, or an empty string on failure.
        """
        try:
            return self.format(self.extract(source))
        except Exception as exc:
            self._logger.warning(f"Failed to extract metadata: {exc}")
            return ""

    def extract_to_dict(self, source: Any) -> Dict[str, Any]:
        """
        Run ``extract()`` and return the result as a dictionary.

        Any exception raised while extracting or converting is logged as a
        warning and swallowed.

        Args:
            source: Format-specific source object.

        Returns:
            Dictionary containing metadata fields, or an empty dictionary on
            failure.
        """
        try:
            return self.extract(source).to_dict()
        except Exception as exc:
            self._logger.warning(f"Failed to extract metadata: {exc}")
            return {}
|
|
515
|
+
|
|
516
|
+
|
|
517
|
+
# Default formatter instance (Korean)
|
|
518
|
+
# Created once at import time and reused by format_metadata(); constructed
# with language="ko", which selects the Korean label set.
_default_formatter = MetadataFormatter(language="ko")
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
def format_metadata(metadata: Dict[str, Any]) -> str:
    """
    Render a metadata dictionary with the module's default formatter.

    Thin convenience wrapper, kept for backward compatibility, that
    delegates to ``_default_formatter.format_dict``.

    Args:
        metadata: Dictionary containing metadata fields.

    Returns:
        Formatted metadata string.
    """
    rendered = _default_formatter.format_dict(metadata)
    return rendered
|
|
534
|
+
|
|
535
|
+
|
|
536
|
+
# Names exported by ``from ... import *``.
__all__ = [
    "MetadataField",
    "DocumentMetadata",
    "MetadataFormatter",
    "BaseMetadataExtractor",
    "format_metadata",
]
|