xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,220 @@
1
+ # xgen_doc2chunk/core/functions/file_converter.py
2
+ """
3
+ BaseFileConverter - Abstract base class for file format conversion
4
+
5
+ Defines the interface for converting binary file data to a workable format.
6
+ Each handler can optionally implement a format-specific converter.
7
+
8
+ The converter's job is to transform raw binary data into a format-specific
9
+ object that the handler can work with (e.g., Document, Workbook, OLE file).
10
+
11
+ This is the FIRST step in the processing pipeline:
12
+ Binary Data → FileConverter → Workable Object → Handler Processing
13
+
14
+ Usage:
15
+ class PDFFileConverter(BaseFileConverter):
16
+ def convert(self, file_data: bytes, file_stream: BinaryIO) -> Any:
17
+ import fitz
18
+ return fitz.open(stream=file_data, filetype="pdf")
19
+
20
+ def get_format_name(self) -> str:
21
+ return "PDF Document"
22
+ """
23
+ from abc import ABC, abstractmethod
24
+ from io import BytesIO
25
+ from typing import Any, Optional, Union, BinaryIO
26
+
27
+
28
class BaseFileConverter(ABC):
    """
    Interface for turning raw file bytes into a handler-friendly object.

    A converter runs before any text extraction: it receives the binary
    payload and hands back whatever format-specific object the handler
    operates on.

    Required overrides:
        - convert(): build the workable object from the binary data
        - get_format_name(): report a human-readable format label

    Optional overrides:
        - validate(): format-specific acceptance check (accepts all by default)
        - close(): release the converted object (no-op by default)
    """

    @abstractmethod
    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        **kwargs
    ) -> Any:
        """
        Build the format-specific object from binary file data.

        Args:
            file_data: Raw binary file data
            file_stream: Optional stream over the same data, for libraries
                that prefer file-like input
            **kwargs: Additional format-specific options

        Returns:
            Format-specific object (Document, Workbook, OLE file, etc.)

        Raises:
            ConversionError: Named by the original contract for failed
                conversions. NOTE(review): that type is not defined in
                this module - confirm which exception concrete converters
                actually raise.
        """

    @abstractmethod
    def get_format_name(self) -> str:
        """
        Report the human-readable name of the handled format.

        Returns:
            Format name string (e.g., "PDF Document", "DOCX Document")
        """

    def validate(self, file_data: bytes) -> bool:
        """
        Tell whether this converter can handle the given data.

        The base implementation accepts everything; subclasses override
        it to add format-specific checks.

        Args:
            file_data: Raw binary file data

        Returns:
            True if file can be converted, False otherwise
        """
        return True

    def close(self, converted_object: Any) -> None:
        """
        Release the object produced by convert(), if it needs releasing.

        The base implementation is a no-op; subclasses override it when
        the converted object requires explicit cleanup.

        Args:
            converted_object: The object returned by convert()
        """
        return None
99
+
100
+
101
class NullFileConverter(BaseFileConverter):
    """
    Identity converter for handlers that need no conversion step.

    convert() hands back the very bytes it was given.
    """

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        **kwargs
    ) -> bytes:
        """
        Return ``file_data`` untouched.

        Args:
            file_data: Raw binary file data
            file_stream: Accepted for interface compatibility; not used
            **kwargs: Accepted for interface compatibility; not used

        Returns:
            The same ``file_data`` object that was passed in
        """
        return file_data

    def get_format_name(self) -> str:
        """Return the generic label used for unconverted data."""
        return "Raw Binary"
121
+
122
+
123
class PassThroughConverter(BaseFileConverter):
    """
    Converter that exposes the file as a binary stream.

    Intended for handlers that consume a file-like object directly.
    """

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        **kwargs
    ) -> BinaryIO:
        """
        Return a binary stream positioned at the start of the file.

        Args:
            file_data: Raw binary file data
            file_stream: Existing stream to reuse; when given it is
                rewound with seek(0) and returned as-is
            **kwargs: Accepted for interface compatibility; not used

        Returns:
            The rewound ``file_stream`` if one was supplied, otherwise a
            new BytesIO wrapping ``file_data``
        """
        # No stream supplied: wrap the raw bytes in a fresh in-memory stream.
        if file_stream is None:
            return BytesIO(file_data)

        # Reuse the caller's stream, rewound so consumers read from byte 0.
        file_stream.seek(0)
        return file_stream

    def get_format_name(self) -> str:
        """Return the label for stream-based pass-through."""
        return "Binary Stream"
145
+
146
+
147
class TextFileConverter(BaseFileConverter):
    """
    Converter for text-based files.

    Decodes binary data to a text string by trying a list of candidate
    encodings in order and keeping the first one that decodes cleanly.
    """

    DEFAULT_ENCODINGS = ['utf-8', 'utf-8-sig', 'cp949', 'euc-kr', 'latin-1', 'ascii']

    # UTF-8 byte-order mark. Plain 'utf-8' decodes BOM-prefixed data
    # successfully but leaves a U+FEFF character at the start of the text,
    # so BOM-prefixed input must be routed to 'utf-8-sig' explicitly.
    _UTF8_BOM = b'\xef\xbb\xbf'

    def __init__(self, encodings: Optional[list] = None):
        """
        Initialize TextFileConverter.

        Args:
            encodings: Encodings to try, in order. None or an empty
                sequence selects DEFAULT_ENCODINGS.
        """
        # Copy so later mutation of the caller's list (or of the class-level
        # default) cannot silently change this instance's behavior.
        self._encodings = list(encodings) if encodings else list(self.DEFAULT_ENCODINGS)
        self._detected_encoding: Optional[str] = None

    def convert(
        self,
        file_data: bytes,
        file_stream: Optional[BinaryIO] = None,
        encoding: Optional[str] = None,
        **kwargs
    ) -> str:
        """
        Convert binary data to a text string.

        Candidates are tried in this order: the explicit ``encoding`` (if
        given), then 'utf-8-sig' when the data starts with a UTF-8 BOM and
        'utf-8-sig' is one of the configured encodings, then the configured
        encodings in their listed order. A candidate is skipped when it
        cannot decode the data or when its name is not a known text codec.

        Args:
            file_data: Raw binary file data
            file_stream: Ignored for text conversion
            encoding: Specific encoding to try first (None for auto-detect)
            **kwargs: Additional options (unused)

        Returns:
            Decoded text string. If every candidate fails, the data is
            decoded as UTF-8 with undecodable bytes replaced by U+FFFD, so
            this method does not raise UnicodeDecodeError.
        """
        candidates = []
        if encoding:
            candidates.append(encoding)
        # Only prefer 'utf-8-sig' when the configuration asks for it, so a
        # caller-supplied encoding list is still honored as written.
        if file_data.startswith(self._UTF8_BOM) and 'utf-8-sig' in self._encodings:
            candidates.append('utf-8-sig')
        candidates.extend(self._encodings)

        for enc in candidates:
            try:
                text = file_data.decode(enc)
            except (UnicodeDecodeError, LookupError):
                # UnicodeDecodeError: bytes are invalid in this encoding.
                # LookupError: the codec name is unknown or not a text codec;
                # treat it like any other failed candidate and keep going.
                continue
            self._detected_encoding = enc
            return text

        # Last resort: lossy UTF-8 decode. With the default list this is not
        # reached because 'latin-1' accepts every byte sequence.
        self._detected_encoding = 'utf-8'
        return file_data.decode('utf-8', errors='replace')

    def get_format_name(self) -> str:
        """Return the format name, including the detected encoding if any."""
        if self._detected_encoding:
            return f"Text ({self._detected_encoding})"
        return "Text"

    @property
    def detected_encoding(self) -> Optional[str]:
        """Return the encoding used by the last conversion (None before any)."""
        return self._detected_encoding
220
+