xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,299 @@
1
+ # xgen_doc2chunk/core/functions/table_processor.py
2
+ """
3
+ Table Processor - Common Table Processing Module
4
+
5
+ Provides common table processing utilities for formatting tables.
6
+ This module handles HTML, Markdown, and Text conversion of TableData.
7
+
8
+ ================================================================================
9
+ TABLE PROCESSOR ARCHITECTURE
10
+ ================================================================================
11
+
12
+ Main Entry Point:
13
+ format_table(table: TableData) -> str
14
+
15
+ Internal Processing Functions (called from format_table):
16
+ - format_table_as_html() : HTML conversion (rowspan/colspan support)
17
+ - format_table_as_markdown() : Markdown conversion (simple table)
18
+ - format_table_as_text() : Text conversion (plain text)
19
+
20
+ Common Utility:
21
+ - _clean_cell_content() : Cell content cleaning (whitespace handling)
22
+
23
+ ================================================================================
24
+ PROCESSING FLOW
25
+ ================================================================================
26
+
27
+ format_table(table) -> Main Entry Point
28
+ |
29
+ +-- config.output_format == HTML?
30
+ | YES -> format_table_as_html(table)
31
+ | +-- colgroup generation (column width)
32
+ | +-- row/cell iteration
33
+ | | +-- rowspan/colspan handling
34
+ | | +-- nested_table recursive handling
35
+ | | +-- _clean_cell_content()
36
+ | +-- HTML string return
37
+ |
38
+ +-- config.output_format == MARKDOWN?
39
+ | YES -> format_table_as_markdown(table)
40
+ | +-- row/cell iteration
41
+ | | +-- _clean_cell_content()
42
+ | +-- header separator addition
43
+ | +-- Markdown string return
44
+ |
45
+ +-- config.output_format == TEXT?
46
+ YES -> format_table_as_text(table)
47
+ +-- row/cell iteration
48
+ | +-- _clean_cell_content()
49
+ +-- Text string return
50
+
51
+ ================================================================================
52
+ OUTPUT FORMAT COMPARISON
53
+ ================================================================================
54
+
55
+ | Format | Use Case | Merge Support | Structure |
56
+ |----------|-----------------------------|--------------:|:-----------|
57
+ | HTML | Web rendering, full convert | Full support | Complex |
58
+ | Markdown | GitHub, docs, simple render | Not supported | Simplified |
59
+ | Text | Search index, logs, debug | Not supported | Minimal |
60
+
61
+ ================================================================================
62
+ """
63
+ import logging
64
+ import re
65
+ from enum import Enum
66
+ from dataclasses import dataclass
67
+ from typing import Optional
68
+
69
+ from xgen_doc2chunk.core.functions.table_extractor import TableData, TableCell
70
+
71
+ logger = logging.getLogger("document-processor")
72
+
73
+
74
class TableOutputFormat(Enum):
    """Table output format options.

    TableProcessor.format_table() compares config.output_format against
    these members to pick the conversion method.
    """
    HTML = "html"          # handled by format_table_as_html()
    MARKDOWN = "markdown"  # handled by format_table_as_markdown()
    TEXT = "text"          # handled by format_table_as_text() (also the fallback branch)
79
+
80
+
81
@dataclass
class TableProcessorConfig:
    """Configuration for table processing."""
    # Selects which format_table_as_* method format_table() dispatches to.
    output_format: TableOutputFormat = TableOutputFormat.HTML
    # When True, _clean_cell_content() collapses whitespace runs to one space and strips the ends.
    clean_whitespace: bool = True
    # When True, the HTML output emits rowspan/colspan attributes for spans greater than 1.
    preserve_merged_cells: bool = True
87
+
88
+
89
class TableProcessor:
    """
    Format ``TableData`` objects as HTML, Markdown, or plain text.

    ============================================================================
    CLASS STRUCTURE
    ============================================================================

    Public Methods:
        format_table()             -> Main entry point (routes by config.output_format)
        format_table_as_html()     -> HTML conversion (colgroup, rowspan/colspan, nested tables)
        format_table_as_markdown() -> Markdown conversion (no merged-cell support)
        format_table_as_text()     -> Plain text conversion (tab-separated cells)

    Private Methods:
        _clean_cell_content()      -> Whitespace normalization shared by all formats
        _escape_markdown_cell()    -> Keeps cell text from breaking a Markdown row

    ============================================================================
    """

    def __init__(self, config: Optional[TableProcessorConfig] = None):
        """
        Args:
            config: Processing options; a default TableProcessorConfig is
                created when omitted.
        """
        self.config = config or TableProcessorConfig()
        self.logger = logging.getLogger("document-processor")

    # ==========================================================================
    # format_table() - Main Entry Point
    # ==========================================================================
    #
    #   config.output_format
    #     +-- HTML      -> format_table_as_html()
    #     |                 colgroup, rowspan/colspan, nested tables (recursive)
    #     +-- MARKDOWN  -> format_table_as_markdown()
    #     |                 one line per row, separator after the header row
    #     +-- otherwise -> format_table_as_text()
    #                       tab-separated cells, one line per row
    #
    # ==========================================================================

    def format_table(self, table: TableData) -> str:
        """
        Main entry point for table formatting.

        Routes to the appropriate format handler based on
        config.output_format. Any value other than HTML or MARKDOWN falls
        through to the plain-text handler.

        Args:
            table: TableData from table extractor

        Returns:
            Formatted string (HTML/Markdown/Text)
        """
        if self.config.output_format == TableOutputFormat.HTML:
            return self.format_table_as_html(table)
        elif self.config.output_format == TableOutputFormat.MARKDOWN:
            return self.format_table_as_markdown(table)
        else:
            return self.format_table_as_text(table)

    # ==========================================================================
    # format_table_as_html() - HTML conversion (called from format_table)
    # ==========================================================================

    def format_table_as_html(self, table: TableData) -> str:
        """
        Convert TableData to HTML string.

        Called from format_table() when output_format == HTML.

        Features:
        - colgroup for column widths (when table.col_widths_percent is set)
        - rowspan/colspan for merged cells (when config.preserve_merged_cells)
        - nested_table support (recursive)

        Returns:
            The HTML markup, or "" when the table has no rows.
        """
        if not table.rows:
            return ""

        html_parts = ["<table>"]

        if table.col_widths_percent:
            html_parts.append(" <colgroup>")
            html_parts.extend(
                f' <col style="width: {width_pct:.1f}%">'
                for width_pct in table.col_widths_percent
            )
            html_parts.append(" </colgroup>")

        for row in table.rows:
            html_parts.append(" <tr>")

            for cell in row:
                tag = "th" if cell.is_header else "td"
                attrs = []
                if self.config.preserve_merged_cells:
                    if cell.row_span > 1:
                        attrs.append(f'rowspan="{cell.row_span}"')
                    if cell.col_span > 1:
                        attrs.append(f'colspan="{cell.col_span}"')

                attr_str = " " + " ".join(attrs) if attrs else ""

                if cell.nested_table:
                    # A nested table replaces the cell's own text content.
                    inner = self.format_table_as_html(cell.nested_table)
                else:
                    # NOTE(review): content is inserted without HTML escaping;
                    # confirm whether upstream content may carry intentional markup
                    # before adding html.escape() here.
                    inner = self._clean_cell_content(cell.content)
                html_parts.append(f" <{tag}{attr_str}>{inner}</{tag}>")

            html_parts.append(" </tr>")

        html_parts.append("</table>")
        return "\n".join(html_parts)

    # ==========================================================================
    # format_table_as_markdown() - Markdown conversion (called from format_table)
    # ==========================================================================

    def format_table_as_markdown(self, table: TableData) -> str:
        """
        Convert TableData to Markdown string.

        Called from format_table() when output_format == MARKDOWN.

        Cell text is escaped so that a literal ``|`` or a line break inside a
        cell cannot split the row into extra columns or extra lines.

        Note: Markdown does NOT support rowspan/colspan.

        Returns:
            The Markdown table, or "" when the table has no rows.
        """
        if not table.rows:
            return ""

        lines = []
        for row_idx, row in enumerate(table.rows):
            cells = [
                self._escape_markdown_cell(self._clean_cell_content(cell.content))
                for cell in row
            ]
            lines.append("| " + " | ".join(cells) + " |")

            # The separator follows the first row only when it is a header row.
            if row_idx == 0 and table.has_header:
                lines.append("| " + " | ".join(["---"] * len(row)) + " |")

        return "\n".join(lines)

    # ==========================================================================
    # format_table_as_text() - Text conversion (called from format_table)
    # ==========================================================================

    def format_table_as_text(self, table: TableData) -> str:
        """
        Convert TableData to plain text string.

        Called from format_table() when output_format == TEXT.
        Cells are joined with tabs and rows with newlines.

        Note: No table structure preserved. Useful for search indexing.

        Returns:
            The plain-text rendering, or "" when the table has no rows.
        """
        if not table.rows:
            return ""

        return "\n".join(
            "\t".join(self._clean_cell_content(cell.content) for cell in row)
            for row in table.rows
        )

    # ==========================================================================
    # _clean_cell_content() - Common utility (called from all format functions)
    # ==========================================================================

    def _clean_cell_content(self, content: str) -> str:
        """
        Clean cell content (whitespace normalization).

        Called from all format_table_as_* methods. Falsy content (None, "")
        becomes "". When config.clean_whitespace is set, every whitespace run
        is collapsed to a single space and the ends are stripped.
        """
        if not content:
            return ""

        if self.config.clean_whitespace:
            content = re.sub(r'\s+', ' ', content)
            content = content.strip()

        return content

    @staticmethod
    def _escape_markdown_cell(content: str) -> str:
        """
        Make already-cleaned cell text safe inside a Markdown table row.

        Escapes pipes that are not already backslash-escaped and turns line
        breaks (possible when clean_whitespace is off) into ``<br>``.
        """
        if not content:
            return ""
        content = re.sub(r'(?<!\\)\|', r'\\|', content)
        return re.sub(r'\r\n|\r|\n', '<br>', content)
282
+
283
+
284
def create_table_processor(config: Optional[TableProcessorConfig] = None) -> TableProcessor:
    """
    Build a TableProcessor from an optional configuration.

    Args:
        config: Table processing configuration; passed straight through to
            the TableProcessor constructor (None is allowed).

    Returns:
        The newly constructed TableProcessor instance
    """
    processor = TableProcessor(config)
    return processor
295
+
296
+
297
# Default configuration: a TableProcessorConfig built with all field defaults.
# This is a single module-level instance created at import time.
DEFAULT_PROCESSOR_CONFIG = TableProcessorConfig()
299
+
@@ -0,0 +1,159 @@
1
+ # xgen_doc2chunk/core/functions/utils.py
2
+ """
3
+ Common utility module for document processing
4
+ """
5
+ import io
6
+ import os
7
+ import hashlib
8
+ import tempfile
9
+ import logging
10
+ import re
11
+ import bisect
12
+ from typing import Any, Dict, List, Optional, Set
13
+
14
+ from PIL import Image
15
+
16
def sanitize_text_for_json(text: Optional[str]) -> str:
    """
    Sanitizes text to be safely encodable in a UTF-8 JSON response.

    Removes or replaces the following characters:
    - Surrogates (U+D800-U+DFFF): a high surrogate immediately followed by a
      low surrogate is combined into the supplementary character it encodes;
      isolated high/low surrogates are removed
    - Private Use Area characters (U+E000-U+F8FF, U+F0000 and above): removed,
      whether they appear as real code points or as a surrogate pair
    - Non-character code points (U+FFFE, U+FFFF): removed
    - Problematic control characters below U+0020 (except tab, newline,
      carriage return): removed

    Args:
        text: Input text that may contain invalid characters

    Returns:
        Sanitized text safe for JSON encoding ("" for None or empty input)
    """
    if not text:
        return ""

    kept = []
    i = 0
    text_len = len(text)

    while i < text_len:
        code = ord(text[i])
        width = 1

        # A str holds code points, so a surrogate pair shows up as two separate
        # surrogate characters. Merge the pair into the code point it encodes;
        # leaving the halves in place would still fail UTF-8 encoding.
        if 0xD800 <= code <= 0xDBFF and i + 1 < text_len:
            low = ord(text[i + 1])
            if 0xDC00 <= low <= 0xDFFF:
                code = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00)
                width = 2
        i += width

        # Isolated high or low surrogate
        if 0xD800 <= code <= 0xDFFF:
            continue

        # Basic Private Use Area (U+E000 ~ U+F8FF) and
        # Supplementary Private Use Areas A/B (U+F0000 ~ U+10FFFF)
        if 0xE000 <= code <= 0xF8FF or code >= 0xF0000:
            continue

        # Control characters other than \t (9), \n (10), \r (13)
        if code < 32 and code not in (9, 10, 13):
            continue

        # Non-characters (U+FFFE, U+FFFF)
        if code in (0xFFFE, 0xFFFF):
            continue

        kept.append(chr(code))

    return ''.join(kept)
97
+
98
+
99
def clean_text(text: Optional[str]) -> str:
    """
    Collapse runs of blank lines and trim the text.

    Any sequence of three or more newlines (optionally separated by
    whitespace) becomes exactly one blank line; leading and trailing
    whitespace is stripped. None or empty input yields "".
    """
    if text:
        collapsed = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)
        return collapsed.strip()
    return ""
104
+
105
def clean_code_text(text: str) -> str:
    """
    Normalize a code snippet: drop trailing whitespace, then replace tabs.

    Leading whitespace is left untouched. Falsy input yields "".
    """
    if text:
        trimmed = text.rstrip()
        return trimmed.replace('\t', ' ')
    return ""
110
+
111
def is_text_quality_sufficient(text: Optional[str], min_chars: int = 500, min_word_ratio: float = 0.6) -> bool:
    """
    Decide whether extracted text looks usable.

    The text must be at least ``min_chars`` long, and the share of characters
    matching the word/Hangul character class must reach ``min_word_ratio``.

    Args:
        text: Text to evaluate (None or empty is never sufficient)
        min_chars: Minimum required length
        min_word_ratio: Minimum ratio of word characters to total characters

    Returns:
        True when both thresholds are met; False otherwise, including when
        any error occurs during evaluation.
    """
    try:
        if not text:
            return False
        total = len(text)
        if total < min_chars:
            return False
        hits = sum(
            1 for _ in re.finditer(r"[\w\u1100-\u11FF\u3130-\u318F\uAC00-\uD7AF]", text)
        )
        return hits / max(1, total) >= min_word_ratio
    except Exception:
        # Best-effort check: any failure is reported as insufficient quality.
        return False
120
+
121
def find_chunk_position(chunk: str, full_text: str, start_pos: int = 0) -> int:
    """
    Locate ``chunk`` inside ``full_text`` at or after ``start_pos``.

    Lookup strategy, in order:
    1. Exact match of the whole chunk.
    2. If the first line of the stripped chunk has at least 10 characters,
       find that line; from there try the chunk's first 50 characters and
       fall back to the line position when that fails.
    3. If the stripped chunk has at least 10 characters, find its first
       50 characters.

    Returns:
        The matching index, or -1 when nothing matches or an error occurs.
    """
    try:
        exact = full_text.find(chunk, start_pos)
        if exact != -1:
            return exact

        stripped = chunk.strip()
        head_line = stripped.split('\n')[0]

        if len(head_line) >= 10:
            anchor = full_text.find(head_line.strip(), start_pos)
            if anchor != -1:
                # Slicing already returns the whole chunk when it is 50 chars or shorter.
                refined = full_text.find(chunk[:50], anchor)
                return anchor if refined == -1 else refined

        if len(stripped) >= 10:
            return full_text.find(stripped[:50], start_pos)

        return -1
    except Exception:
        # Best-effort lookup: report "not found" instead of raising.
        return -1
141
+
142
def build_line_starts(text: str) -> List[int]:
    """
    Return the character offsets at which each line of ``text`` begins.

    Offset 0 is always included. A newline that is the very last character
    does not open a new line. On any error the result is ``[0]``.
    """
    try:
        last_index = len(text) - 1
        following = [
            idx + 1
            for idx, ch in enumerate(text)
            if ch == '\n' and idx < last_index
        ]
        return [0] + following
    except Exception:
        return [0]
151
+
152
def pos_to_line(pos: int, line_starts: List[int]) -> int:
    """
    Map a character offset to its 1-based line number.

    Args:
        pos: Character offset into the text
        line_starts: Sorted offsets of line beginnings, in the shape
            produced by build_line_starts()

    Returns:
        The 1-based line number; 1 for negative offsets, an empty
        ``line_starts``, or any error.
    """
    try:
        if pos < 0:
            return 1
        # The number of line starts at or before pos is the 1-based line number.
        return max(1, bisect.bisect_right(line_starts, pos))
    except Exception:
        return 1
@@ -0,0 +1,96 @@
1
+ # xgen_doc2chunk/core/processor/__init__.py
2
+ """
3
+ Processor - Document Type-specific Handler Module
4
+
5
+ Provides handlers for processing individual document formats.
6
+
7
+ Handler List:
8
+ - pdf_handler: PDF document processing (adaptive complexity-based)
9
+ - docx_handler: DOCX document processing
10
+ - doc_handler: DOC document processing (OLE, HTML, misnamed DOCX)
11
+ - rtf_handler: RTF document processing
12
+ - ppt_handler: PPT/PPTX document processing
13
+ - excel_handler: Excel (XLSX/XLS) document processing
14
+ - hwp_handler: HWP document processing
15
+ - hwpx_handler: HWPX document processing
16
+ - csv_handler: CSV file processing
17
+ - text_handler: Text file processing
18
+ - html_reprocessor: HTML reprocessing
19
+
20
+ Helper Modules (subdirectories):
21
+ - csv_helper/: CSV processing helper
22
+ - docx_helper/: DOCX processing helper
23
+ - doc_helpers/: DOC processing helper
24
+ - rtf_helper/: RTF processing helper
25
+ - excel_helper/: Excel processing helper
26
+ - hwp_helper/: HWP processing helper
27
+ - hwpx_helper/: HWPX processing helper
28
+ - pdf_helpers/: PDF processing helper
29
+ - ppt_helper/: PPT processing helper
30
+
31
+ Usage Example:
32
+ from xgen_doc2chunk.core.processor import PDFHandler
33
+ from xgen_doc2chunk.core.processor import DOCXHandler
34
+ from xgen_doc2chunk.core.processor import RTFHandler
35
+ from xgen_doc2chunk.core.processor.pdf_helpers import extract_pdf_metadata
36
+ """
37
+
38
+ # === PDF Handler ===
39
+ from xgen_doc2chunk.core.processor.pdf_handler import PDFHandler
40
+
41
+ # === Document Handlers ===
42
+ from xgen_doc2chunk.core.processor.docx_handler import DOCXHandler
43
+ from xgen_doc2chunk.core.processor.doc_handler import DOCHandler
44
+ from xgen_doc2chunk.core.processor.rtf_handler import RTFHandler
45
+ from xgen_doc2chunk.core.processor.ppt_handler import PPTHandler
46
+
47
+ # === Data Handlers ===
48
+ from xgen_doc2chunk.core.processor.excel_handler import ExcelHandler
49
+ from xgen_doc2chunk.core.processor.csv_handler import CSVHandler
50
+ from xgen_doc2chunk.core.processor.text_handler import TextHandler
51
+
52
+ # === HWP Handlers ===
53
+ from xgen_doc2chunk.core.processor.hwp_handler import HWPHandler
54
+ from xgen_doc2chunk.core.processor.hwpx_handler import HWPXHandler
55
+
56
+ # === Other Processors ===
57
+ # from xgen_doc2chunk.core.processor.html_reprocessor import ... # HTML reprocessing
58
+
59
+ # === Helper Modules (subpackages) ===
60
+ from xgen_doc2chunk.core.processor import csv_helper
61
+ from xgen_doc2chunk.core.processor import doc_helpers
62
+ from xgen_doc2chunk.core.processor import docx_helper
63
+ from xgen_doc2chunk.core.processor import excel_helper
64
+ from xgen_doc2chunk.core.processor import hwp_helper
65
+ from xgen_doc2chunk.core.processor import hwpx_helper
66
+ from xgen_doc2chunk.core.processor import pdf_helpers
67
+ from xgen_doc2chunk.core.processor import ppt_helper
68
+ from xgen_doc2chunk.core.processor import rtf_helper
69
+
70
# Names exported by `from xgen_doc2chunk.core.processor import *`:
# the handler classes imported above plus the helper subpackages.
# NOTE(review): html_reprocessor is intentionally commented out above and is
# not exported; confirm whether any other handler modules should be listed.
__all__ = [
    # PDF Handler
    "PDFHandler",
    # Document Handlers
    "DOCXHandler",
    "DOCHandler",
    "RTFHandler",
    "PPTHandler",
    # Data Handlers
    "ExcelHandler",
    "CSVHandler",
    "TextHandler",
    # HWP Handlers
    "HWPHandler",
    "HWPXHandler",
    # Helper subpackages
    "csv_helper",
    "doc_helpers",
    "docx_helper",
    "excel_helper",
    "hwp_helper",
    "hwpx_helper",
    "pdf_helpers",
    "ppt_helper",
    "rtf_helper",
]
96
+