xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,172 @@
1
+ # xgen_doc2chunk/chunking/table_parser.py
2
+ """
3
+ Table Parser - HTML table parsing functions
4
+
5
+ Main Features:
6
+ - HTML table parsing and structure analysis
7
+ - Cell span information extraction (rowspan, colspan)
8
+ - Table complexity analysis
9
+ """
10
+ import logging
11
+ import re
12
+ from typing import Dict, List, Optional, Tuple
13
+
14
+ from xgen_doc2chunk.chunking.constants import ParsedTable, TableRow
15
+
16
+ logger = logging.getLogger("document-processor")
17
+
18
+
19
def parse_html_table(table_html: str) -> Optional[ParsedTable]:
    """
    Parse an HTML table and extract structured information.

    Rows are located with a regex over ``<tr ...>...</tr>``. A row counts as a
    header row only when it contains ``<th>`` cells and no ``<td>`` cells, and
    only header rows that appear before the first data row are stored in
    ``header_rows``; every other row goes to ``data_rows``.

    Args:
        table_html: HTML table string

    Returns:
        ParsedTable object, or None if no rows are found or parsing fails
    """
    try:
        flags = re.DOTALL | re.IGNORECASE
        row_matches = re.findall(r'<tr[^>]*>(.*?)</tr>', table_html, flags)

        if not row_matches:
            logger.debug("No rows found in table")
            return None

        header_rows: List[TableRow] = []
        data_rows: List[TableRow] = []
        max_cols = 0

        for row_content in row_matches:
            th_cells = re.findall(r'<th[^>]*>(.*?)</th>', row_content, flags)
            td_cells = re.findall(r'<td[^>]*>(.*?)</td>', row_content, flags)

            is_header = len(th_cells) > 0 and len(td_cells) == 0

            # Count every cell in the row. A data row may mix a row-header
            # <th> with <td> cells; counting only the <td> cells would
            # under-report the row width and therefore total_cols.
            cell_count = len(th_cells) + len(td_cells)
            max_cols = max(max_cols, cell_count)

            # The row is rebuilt with a bare <tr>; attributes on the original
            # <tr> tag are not carried over.
            row_html = f"<tr>{row_content}</tr>"

            table_row = TableRow(
                html=row_html,
                is_header=is_header,
                cell_count=cell_count,
                char_length=len(row_html)
            )

            if is_header and not data_rows:
                # Header row before any data rows
                header_rows.append(table_row)
            else:
                data_rows.append(table_row)

        if header_rows:
            header_html = "\n".join(row.html for row in header_rows)
            # One extra character per header row is budgeted for newlines.
            header_size = sum(row.char_length for row in header_rows) + len(header_rows)
        else:
            header_html = ""
            header_size = 0

        return ParsedTable(
            header_rows=header_rows,
            data_rows=data_rows,
            total_cols=max_cols,
            original_html=table_html,
            header_html=header_html,
            header_size=header_size
        )

    except Exception as e:
        # Best-effort parser: any failure is logged and reported as None.
        logger.warning(f"Failed to parse HTML table: {e}")
        return None
88
+
89
+
90
def extract_cell_spans(row_html: str) -> List[Tuple[int, int]]:
    """
    Extract rowspan/colspan information from cells in a row.

    Only opening ``<th>`` and ``<td>`` tags are treated as cells. A missing
    ``rowspan``/``colspan`` attribute is reported as 1.

    Args:
        row_html: Row HTML

    Returns:
        [(rowspan, colspan), ...] list, one entry per cell in document order
    """
    # (?![\w-]) ends the tag name right after th/td, so look-alike tags such
    # as <thead> are not mistaken for cells.
    cell_pattern = r'<(?:th|td)(?![\w-])([^>]*)>'

    spans: List[Tuple[int, int]] = []
    for cell in re.finditer(cell_pattern, row_html, re.IGNORECASE):
        attrs = cell.group(1)

        values = []
        for attr_name in ("rowspan", "colspan"):
            # The lookbehind rejects prefixed attributes such as data-rowspan;
            # \s* tolerates whitespace around "=", which HTML allows.
            found = re.search(
                rf'(?<![\w-]){attr_name}\s*=\s*["\']?(\d+)',
                attrs,
                re.IGNORECASE,
            )
            values.append(int(found.group(1)) if found else 1)

        spans.append((values[0], values[1]))

    return spans
119
+
120
+
121
def extract_cell_spans_with_positions(row_html: str) -> Dict[int, int]:
    """
    Extract rowspan information by column position from a row (considering colspan).

    The column position of a cell is the sum of the colspans of the cells
    before it in the same row. Cells carried over from earlier rows are not
    taken into account here.

    Args:
        row_html: Row HTML

    Returns:
        {column_position: rowspan} dictionary (only cells with rowspan > 1)
    """
    # (?![\w-]) ends the tag name right after th/td, so look-alike tags such
    # as <thead> do not consume a column slot.
    cell_pattern = r'<(?:th|td)(?![\w-])([^>]*)>'

    def _span_value(attrs: str, attr_name: str) -> int:
        # The lookbehind rejects prefixed attributes such as data-rowspan;
        # \s* tolerates whitespace around "=", which HTML allows.
        found = re.search(
            rf'(?<![\w-]){attr_name}\s*=\s*["\']?(\d+)',
            attrs,
            re.IGNORECASE,
        )
        return int(found.group(1)) if found else 1

    spans: Dict[int, int] = {}
    current_col = 0
    for cell in re.finditer(cell_pattern, row_html, re.IGNORECASE):
        attrs = cell.group(1)
        rowspan = _span_value(attrs, "rowspan")
        colspan = _span_value(attrs, "colspan")

        if rowspan > 1:
            spans[current_col] = rowspan

        # A cell always occupies at least one column; colspan="0" is invalid
        # HTML and must not leave the next cell on the same column index.
        current_col += max(colspan, 1)

    return spans
152
+
153
+
154
def has_complex_spans(table_html: str) -> bool:
    """
    Report whether the table HTML contains any rowspan value greater than 1.

    Only rowspan is inspected; colspan is ignored because it does not affect
    splitting a table by rows.

    Args:
        table_html: Table HTML

    Returns:
        True if at least one ``rowspan=<n>`` with n > 1 occurs, else False
    """
    rowspan_values = re.findall(r'rowspan=["\']?(\d+)["\']?', table_html, re.IGNORECASE)
    return any(int(value) > 1 for value in rowspan_values)
@@ -0,0 +1,443 @@
1
+ # xgen_doc2chunk/chunking/text_chunker.py
2
+ """
3
+ Text Chunker - Text chunking functionality
4
+
5
+ Main Features:
6
+ - Plain text chunking
7
+ - Table-free text chunking
8
+ - Row-preserving chunking (for tables)
9
+ - Code text chunking
10
+ - Markdown table support with NO overlap
11
+ """
12
+ import logging
13
+ import re
14
+ from typing import Any, List, Optional, Tuple
15
+
16
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
17
+
18
+ from xgen_doc2chunk.chunking.constants import (
19
+ LANGCHAIN_CODE_LANGUAGE_MAP, HTML_TABLE_PATTERN, MARKDOWN_TABLE_PATTERN
20
+ )
21
+
22
+ logger = logging.getLogger("document-processor")
23
+
24
+
25
def chunk_plain_text(text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
    """
    Split plain text with a RecursiveCharacterTextSplitter.

    The splitter measures length with ``len`` and tries the separators
    paragraph break, line break, space, then the empty string, in that order.

    Args:
        text: Text to chunk
        chunk_size: Maximum chunk size
        chunk_overlap: Overlap size between chunks

    Returns:
        List of chunks; empty list when text is empty or whitespace-only
    """
    if text and text.strip():
        return RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""],
        ).split_text(text)

    return []
40
+
41
+
42
def chunk_text_without_tables(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
    metadata: Optional[str],
    prepend_metadata_func,
    page_tag_processor: Optional[Any] = None
) -> List[str]:
    """
    Chunk text that does not contain tables.

    Fenced HTML code blocks (```html ... ```) are emitted as single,
    unsplit chunks. The text around them is split with chunk_plain_text.
    The combined result is passed through clean_chunks and then through
    prepend_metadata_func.

    Args:
        text: Text to chunk
        chunk_size: Maximum chunk size
        chunk_overlap: Overlap size between chunks
        metadata: Metadata to prepend to chunks
        prepend_metadata_func: Function called as ``func(chunks, metadata)``
        page_tag_processor: PageTagProcessor instance (for custom tag patterns)

    Returns:
        List of chunks
    """
    if not (text and text.strip()):
        return []

    fenced_html = re.compile(r'```html\s*(.*?)\s*```', re.DOTALL)

    pieces: List[str] = []
    cursor = 0
    found_block = False

    for block in fenced_html.finditer(text):
        found_block = True
        begin, finish = block.span()

        # Text between the previous block (or the start) and this block.
        leading = text[cursor:begin].strip()
        if leading:
            pieces.extend(chunk_plain_text(leading, chunk_size, chunk_overlap))

        # The fenced block itself is kept verbatim, fences included.
        pieces.append(text[begin:finish])
        cursor = finish

    if found_block:
        trailing = text[cursor:].strip()
        if trailing:
            pieces.extend(chunk_plain_text(trailing, chunk_size, chunk_overlap))
    else:
        # No fenced block: the text is chunked as given, without stripping.
        pieces.extend(chunk_plain_text(text, chunk_size, chunk_overlap))

    return prepend_metadata_func(clean_chunks(pieces, page_tag_processor), metadata)
104
+
105
+
106
+ def _is_markdown_table(text: str) -> bool:
107
+ """
108
+ Check if text is a Markdown table.
109
+ """
110
+ lines = text.strip().split('\n')
111
+ if len(lines) < 2:
112
+ return False
113
+ has_pipe_rows = any(line.strip().startswith('|') for line in lines)
114
+ has_separator = any('---' in line and '|' in line for line in lines)
115
+ return has_pipe_rows and has_separator
116
+
117
+
118
def chunk_with_row_protection(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
    split_with_protected_regions_func,
    chunk_large_table_func
) -> List[str]:
    """
    Chunk with row-level protection when table protection is disabled.

    The text is cut into table and non-table segments. HTML tables go to
    chunk_large_table_func and Markdown tables go to
    chunk_large_markdown_table, both with an overlap of 0. The remaining text
    goes to chunk_with_row_protection_simple with the requested overlap.

    Args:
        text: Text to chunk
        chunk_size: Maximum chunk size
        chunk_overlap: Overlap size between chunks (NOT applied to tables)
        split_with_protected_regions_func: Protected region splitting function
        chunk_large_table_func: Large table chunking function (for HTML)

    Returns:
        List of chunks
    """
    if not (text and text.strip()):
        return []

    # Candidate table spans as (start, end, kind, content); HTML first so the
    # stable sort below keeps HTML ahead of Markdown on equal start offsets.
    candidates = [
        (found.start(), found.end(), 'html_table', found.group(0))
        for found in re.finditer(HTML_TABLE_PATTERN, text, re.DOTALL | re.IGNORECASE)
    ]
    for found in re.finditer(MARKDOWN_TABLE_PATTERN, text, re.MULTILINE):
        raw = found.group(0)
        # A match that begins with a newline starts one character later.
        begin = found.start() + 1 if raw.startswith('\n') else found.start()
        candidates.append((begin, found.end(), 'markdown_table', raw.strip()))

    candidates.sort(key=lambda item: item[0])

    # Drop any span that starts inside an already accepted span.
    kept = []
    cursor = 0
    for span in candidates:
        if span[0] < cursor:
            continue
        kept.append(span)
        cursor = span[1]

    if not kept:
        # No tables at all: the whole text takes the simple path.
        return chunk_with_row_protection_simple(
            text, chunk_size, chunk_overlap, split_with_protected_regions_func
        )

    # Interleave the surrounding text with the table spans.
    segments: List[Tuple[str, str]] = []
    cursor = 0
    for begin, finish, kind, content in kept:
        if begin > cursor:
            leading = text[cursor:begin].strip()
            if leading:
                segments.append(('text', leading))
        segments.append((kind, content))
        cursor = finish

    if cursor < len(text):
        trailing = text[cursor:].strip()
        if trailing:
            segments.append(('text', trailing))

    result: List[str] = []
    for kind, content in segments:
        if kind == 'html_table':
            # HTML table -> split by rows with NO overlap
            result.extend(chunk_large_table_func(content, chunk_size, 0, ""))
        elif kind == 'markdown_table':
            # Markdown table -> split by rows with NO overlap
            from .table_chunker import chunk_large_markdown_table
            result.extend(chunk_large_markdown_table(content, chunk_size, 0, ""))
        else:
            # Plain text -> chunk with Markdown row protection
            result.extend(
                chunk_with_row_protection_simple(
                    content, chunk_size, chunk_overlap, split_with_protected_regions_func
                )
            )

    return result
221
+
222
+
223
def chunk_with_row_protection_simple(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
    split_with_protected_regions_func
) -> List[str]:
    """
    Chunk text so that Markdown table rows are not split mid-row.

    If the whole text passes _is_markdown_table it is handed to
    chunk_large_markdown_table with an overlap of 0. Otherwise every
    ``|...|`` run on a single line becomes a protected region for
    split_with_protected_regions_func. Text without such rows is chunked
    with chunk_plain_text.

    Args:
        text: Text to chunk
        chunk_size: Maximum chunk size
        chunk_overlap: Overlap size between chunks (NOT applied to Markdown tables)
        split_with_protected_regions_func: Protected region splitting function

    Returns:
        List of chunks
    """
    if not (text and text.strip()):
        return []

    if _is_markdown_table(text):
        # Complete Markdown table -> row-level chunking with NO overlap
        from .table_chunker import chunk_large_markdown_table
        return chunk_large_markdown_table(text, chunk_size, 0, "")

    # Markdown table row (headers, data, separators)
    row_patterns = [r'\|[^\n]+\|']

    row_spans = sorted(
        (
            found.span()
            for pattern in row_patterns
            for found in re.finditer(pattern, text, re.DOTALL | re.IGNORECASE)
        ),
        key=lambda span: span[0],
    )

    # Fold spans that start inside the previous one into a single region.
    protected: List[Tuple[int, int]] = []
    for begin, finish in row_spans:
        if protected and begin < protected[-1][1]:
            prior_begin, prior_finish = protected[-1]
            protected[-1] = (prior_begin, max(prior_finish, finish))
        else:
            protected.append((begin, finish))

    if protected:
        return split_with_protected_regions_func(text, protected, chunk_size, chunk_overlap)

    # No rows to protect -> use plain chunking
    return chunk_plain_text(text, chunk_size, chunk_overlap)
285
+
286
+
287
def clean_chunks(
    chunks: List[str],
    page_tag_processor: Optional[Any] = None
) -> List[str]:
    """
    Drop chunks that are blank or consist solely of a page/slide marker.

    Marker patterns are built from ``page_tag_processor.config`` (tag_prefix,
    tag_suffix, slide_prefix, slide_suffix) when a processor is given.
    Otherwise the default ``[Page Number: N]`` / ``[Slide Number: N]``
    patterns are used. Surviving chunks are returned unmodified (not stripped).

    Args:
        chunks: List of chunks
        page_tag_processor: PageTagProcessor instance (for custom tag patterns)

    Returns:
        Cleaned list of chunks
    """
    # NOTE(review): "[+Ref]*" below is a character class (any run of '+',
    # 'R', 'e', 'f'), so it accepts more than the literal "+Ref" - confirm
    # whether that leniency is intended.
    if page_tag_processor is None:
        marker_patterns = [
            r"\[Page Number:\s*\d+(\s*\(OCR[+Ref]*\))?\]",
            r"\[Slide Number:\s*\d+(\s*\(OCR\))?\]",
        ]
    else:
        config = page_tag_processor.config
        # Prefixes/suffixes are escaped so they match literally.
        page_prefix = re.escape(config.tag_prefix)
        page_suffix = re.escape(config.tag_suffix)
        slide_prefix = re.escape(config.slide_prefix)
        slide_suffix = re.escape(config.slide_suffix)
        marker_patterns = [
            f"{page_prefix}\\d+(\\s*\\(OCR[+Ref]*\\))?{page_suffix}",
            f"{slide_prefix}\\d+(\\s*\\(OCR\\))?{slide_suffix}",
        ]

    def _is_marker_only(stripped: str) -> bool:
        # True when the whole stripped chunk is exactly one marker.
        return any(re.fullmatch(pattern, stripped) for pattern in marker_patterns)

    return [
        chunk
        for chunk in chunks
        if chunk.strip() and not _is_marker_only(chunk.strip())
    ]
338
+
339
+
340
def chunk_code_text(
    text: str,
    file_type: str,
    chunk_size: int = 1500,
    chunk_overlap: int = 300
) -> List[str]:
    """
    Split source code with a language-aware splitter when one is available.

    The lower-cased file_type is looked up in LANGCHAIN_CODE_LANGUAGE_MAP. A
    hit selects ``RecursiveCharacterTextSplitter.from_language``. A miss falls
    back to the generic separator list used for plain text.

    Args:
        text: Code text
        file_type: File extension (e.g., 'py', 'js')
        chunk_size: Maximum chunk size
        chunk_overlap: Overlap size between chunks

    Returns:
        List of chunks; a single empty string when text is empty or blank
    """
    if not (text and text.strip()):
        return [""]

    language = LANGCHAIN_CODE_LANGUAGE_MAP.get(file_type.lower())

    if not language:
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""],
        )
    else:
        splitter = RecursiveCharacterTextSplitter.from_language(
            language=language,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    chunks = splitter.split_text(text)
    logger.info(f"Code text split into {len(chunks)} chunks (size: {chunk_size}, overlap: {chunk_overlap})")

    return chunks
377
+
378
+
379
def reconstruct_text_from_chunks(chunks: List[str], chunk_overlap: int) -> str:
    """
    Stitch chunks back into one string, dropping duplicated overlap.

    For each consecutive pair, find_overlap_length determines how many leading
    characters of the later chunk repeat the tail of the earlier one. That
    prefix is skipped when appending.

    Args:
        chunks: List of chunks
        chunk_overlap: Overlap size between chunks (upper bound for the search)

    Returns:
        Reconstructed text; empty string for an empty list
    """
    if not chunks:
        return ""

    pieces = [chunks[0]]
    for previous, current in zip(chunks, chunks[1:]):
        shared = find_overlap_length(previous, current, chunk_overlap)
        pieces.append(current[shared:] if shared > 0 else current)

    return "".join(pieces)
404
+
405
+
406
def find_overlap_length(c1: str, c2: str, max_overlap: int) -> int:
    """
    Return the longest suffix of c1 that is also a prefix of c2.

    Candidate lengths are tried from largest to smallest, capped by
    max_overlap and by the length of either chunk.

    Args:
        c1: Previous chunk
        c2: Current chunk
        max_overlap: Maximum overlap size

    Returns:
        Actual overlap length (0 when nothing matches)
    """
    limit = min(len(c1), len(c2), max_overlap)
    return next(
        (size for size in range(limit, 0, -1) if c1.endswith(c2[:size])),
        0,
    )
423
+
424
+
425
def estimate_chunks_count(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> int:
    """
    Estimate the number of chunks when text is chunked.

    The estimate assumes every chunk after the first advances by
    ``chunk_size - chunk_overlap`` characters. If the overlap is not smaller
    than the chunk size, that stride would be zero or negative, so the
    estimate falls back to non-overlapping chunks instead of dividing by it.

    Args:
        text: Text
        chunk_size: Maximum chunk size
        chunk_overlap: Overlap size between chunks

    Returns:
        Estimated chunk count (0 for empty text, otherwise at least 1)
    """
    if not text:
        return 0

    length = len(text)
    if length <= chunk_size:
        return 1

    stride = chunk_size - chunk_overlap
    if stride <= 0:
        # Degenerate configuration: dividing by a zero stride would raise
        # ZeroDivisionError and a negative stride gives a meaningless count.
        stride = max(1, chunk_size)
        chunk_overlap = 0

    return max(1, (length - chunk_overlap) // stride + 1)
@@ -0,0 +1,64 @@
1
+ # xgen_doc2chunk/core/__init__.py
2
+ """
3
+ Core - Document Processing Core Module
4
+
5
+ This package provides core functionality for processing various document formats.
6
+
7
+ Module Structure:
8
+ - document_processor: Main DocumentProcessor class
9
+ - processor/: Individual document type handlers
10
+ - pdf_handler: PDF document processing
11
+ - docx_handler: DOCX document processing
12
+ - doc_handler: DOC document processing
13
+ - ppt_handler: PPT/PPTX document processing
14
+ - excel_handler: Excel document processing
15
+ - hwp_handler: HWP document processing
16
+ - hwpx_handler: HWPX document processing
17
+ - csv_handler: CSV file processing
18
+ - text_handler: Text file processing
19
+ - functions/: Utility functions
20
+ - utils: Text cleaning, code cleaning, and common utilities
21
+ - img_processor: Image processing and saving (ImageProcessor class)
22
+ - ppt2pdf: PPT to PDF conversion
23
+
24
+ Usage:
25
+ from xgen_doc2chunk import DocumentProcessor
26
+ from xgen_doc2chunk.core.processor import PDFHandler, DocxHandler
27
+ from xgen_doc2chunk.core.functions import clean_text, ImageProcessor
28
+ """
29
+
30
+ # === Main Class ===
31
+ from xgen_doc2chunk.core.document_processor import DocumentProcessor
32
+
33
+ # === Utility Functions ===
34
+ from xgen_doc2chunk.core.functions.utils import (
35
+ clean_text,
36
+ clean_code_text,
37
+ sanitize_text_for_json,
38
+ )
39
+
40
+ # === Image Processing ===
41
+ from xgen_doc2chunk.core.functions.img_processor import (
42
+ ImageProcessor,
43
+ save_image_to_file,
44
+ )
45
+
46
+ # === Explicit Subpackage Imports ===
47
+ from xgen_doc2chunk.core import processor
48
+ from xgen_doc2chunk.core import functions
49
+
50
+ __all__ = [
51
+ # Main Class
52
+ "DocumentProcessor",
53
+ # Utility Functions
54
+ "clean_text",
55
+ "clean_code_text",
56
+ "sanitize_text_for_json",
57
+ # Image Processing
58
+ "ImageProcessor",
59
+ "save_image_to_file",
60
+ # Subpackages
61
+ "processor",
62
+ "functions",
63
+ ]
64
+