xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
xgen_doc2chunk/chunking/table_parser.py
@@ -0,0 +1,172 @@
# chunking_helper/table_parser.py
"""
Table Parser - HTML table parsing functions

Main Features:
- HTML table parsing and structure analysis
- Cell span information extraction (rowspan, colspan)
- Table complexity analysis
"""
import logging
import re
from typing import Dict, List, Optional, Tuple

from xgen_doc2chunk.chunking.constants import ParsedTable, TableRow

logger = logging.getLogger("document-processor")


def parse_html_table(table_html: str) -> Optional[ParsedTable]:
    """
    Parse an HTML table and extract structured information.

    Args:
        table_html: HTML table string

    Returns:
        ParsedTable object or None (if parsing fails)
    """
    try:
        # Extract rows
        row_pattern = r'<tr[^>]*>(.*?)</tr>'
        row_matches = re.findall(row_pattern, table_html, re.DOTALL | re.IGNORECASE)

        if not row_matches:
            logger.debug("No rows found in table")
            return None

        header_rows: List[TableRow] = []
        data_rows: List[TableRow] = []
        max_cols = 0

        for row_content in row_matches:
            # Extract cells
            th_cells = re.findall(r'<th[^>]*>(.*?)</th>', row_content, re.DOTALL | re.IGNORECASE)
            td_cells = re.findall(r'<td[^>]*>(.*?)</td>', row_content, re.DOTALL | re.IGNORECASE)

            is_header = len(th_cells) > 0 and len(td_cells) == 0
            cell_count = len(th_cells) if is_header else len(td_cells)
            max_cols = max(max_cols, cell_count)

            # Reconstruct original row HTML
            row_html = f"<tr>{row_content}</tr>"
            row_length = len(row_html)

            table_row = TableRow(
                html=row_html,
                is_header=is_header,
                cell_count=cell_count,
                char_length=row_length
            )

            if is_header and not data_rows:
                # Header row before any data rows
                header_rows.append(table_row)
            else:
                data_rows.append(table_row)

        # Build header HTML
        if header_rows:
            header_html = "\n".join(row.html for row in header_rows)
            header_size = sum(row.char_length for row in header_rows) + len(header_rows)  # Including newlines
        else:
            header_html = ""
            header_size = 0

        return ParsedTable(
            header_rows=header_rows,
            data_rows=data_rows,
            total_cols=max_cols,
            original_html=table_html,
            header_html=header_html,
            header_size=header_size
        )

    except Exception as e:
        logger.warning(f"Failed to parse HTML table: {e}")
        return None


def extract_cell_spans(row_html: str) -> List[Tuple[int, int]]:
    """
    Extract rowspan/colspan information from cells in a row.

    Args:
        row_html: Row HTML

    Returns:
        [(rowspan, colspan), ...] list
    """
    spans = []

    # Find th and td cells
    cell_pattern = r'<(th|td)([^>]*)>'

    for match in re.finditer(cell_pattern, row_html, re.IGNORECASE):
        attrs = match.group(2)

        # Extract rowspan
        rowspan_match = re.search(r'rowspan=["\']?(\d+)["\']?', attrs, re.IGNORECASE)
        rowspan = int(rowspan_match.group(1)) if rowspan_match else 1

        # Extract colspan
        colspan_match = re.search(r'colspan=["\']?(\d+)["\']?', attrs, re.IGNORECASE)
        colspan = int(colspan_match.group(1)) if colspan_match else 1

        spans.append((rowspan, colspan))

    return spans


def extract_cell_spans_with_positions(row_html: str) -> Dict[int, int]:
    """
    Extract rowspan information by column position from a row (considering colspan).

    Args:
        row_html: Row HTML

    Returns:
        {column_position: rowspan} dictionary (only cells with rowspan > 1)
    """
    spans: Dict[int, int] = {}
    cell_pattern = r'<(th|td)([^>]*)>'

    current_col = 0
    for match in re.finditer(cell_pattern, row_html, re.IGNORECASE):
        attrs = match.group(2)

        # Extract rowspan
        rowspan_match = re.search(r'rowspan=["\']?(\d+)["\']?', attrs, re.IGNORECASE)
        rowspan = int(rowspan_match.group(1)) if rowspan_match else 1

        # Extract colspan
        colspan_match = re.search(r'colspan=["\']?(\d+)["\']?', attrs, re.IGNORECASE)
        colspan = int(colspan_match.group(1)) if colspan_match else 1

        if rowspan > 1:
            spans[current_col] = rowspan

        current_col += colspan

    return spans


def has_complex_spans(table_html: str) -> bool:
    """
    Check if a table has complex rowspan.
    (colspan does not affect row splitting, only rowspan is problematic)

    Args:
        table_html: Table HTML

    Returns:
        True if there are cells with rowspan > 1
    """
    rowspan_pattern = r'rowspan=["\']?(\d+)["\']?'
    matches = re.findall(rowspan_pattern, table_html, re.IGNORECASE)

    for val in matches:
        if int(val) > 1:
            return True

    return False
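For orientation, a minimal usage sketch of the new parser helpers follows. It is not part of the diff; the import path comes from the file listing above, and it assumes the ParsedTable object exposes the fields passed to its constructor (total_cols, data_rows) as attributes.

# Sketch only: exercising the table_parser helpers on a small table with a rowspan.
from xgen_doc2chunk.chunking.table_parser import (
    parse_html_table, extract_cell_spans_with_positions, has_complex_spans
)

table_html = (
    "<table>"
    "<tr><th>Name</th><th>Score</th></tr>"
    "<tr><td rowspan='2'>Alice</td><td>10</td></tr>"
    "<tr><td>12</td></tr>"
    "</table>"
)

parsed = parse_html_table(table_html)
if parsed is not None:
    print(parsed.total_cols)       # 2
    print(len(parsed.data_rows))   # 2 data rows after the header row

print(has_complex_spans(table_html))  # True, because of rowspan='2'

# Rowspan indexed by column position for the first data row:
print(extract_cell_spans_with_positions("<tr><td rowspan='2'>Alice</td><td>10</td></tr>"))  # {0: 2}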
xgen_doc2chunk/chunking/text_chunker.py
@@ -0,0 +1,443 @@
# chunking_helper/text_chunker.py
"""
Text Chunker - Text chunking functionality

Main Features:
- Plain text chunking
- Table-free text chunking
- Row-preserving chunking (for tables)
- Code text chunking
- Markdown table support with NO overlap
"""
import logging
import re
from typing import Any, List, Optional, Tuple

from langchain_text_splitters import RecursiveCharacterTextSplitter

from xgen_doc2chunk.chunking.constants import (
    LANGCHAIN_CODE_LANGUAGE_MAP, HTML_TABLE_PATTERN, MARKDOWN_TABLE_PATTERN
)

logger = logging.getLogger("document-processor")


def chunk_plain_text(text: str, chunk_size: int, chunk_overlap: int) -> List[str]:
    """
    Chunk plain text using RecursiveCharacterTextSplitter.
    """
    if not text or not text.strip():
        return []

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""]
    )

    return splitter.split_text(text)


def chunk_text_without_tables(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
    metadata: Optional[str],
    prepend_metadata_func,
    page_tag_processor: Optional[Any] = None
) -> List[str]:
    """
    Chunk text that does not contain tables.

    Args:
        text: Text to chunk
        chunk_size: Maximum chunk size
        chunk_overlap: Overlap size between chunks
        metadata: Metadata to prepend to chunks
        prepend_metadata_func: Function to prepend metadata
        page_tag_processor: PageTagProcessor instance (for custom tag patterns)

    Returns:
        List of chunks
    """
    if not text or not text.strip():
        return []

    # Handle HTML code blocks (```html ... ```) separately
    html_code_pattern = r'```html\s*(.*?)\s*```'

    html_chunks = []
    matches = list(re.finditer(html_code_pattern, text, re.DOTALL))

    if matches:
        current_pos = 0
        for m in matches:
            s, e = m.span()
            before = text[current_pos:s].strip()
            if before:
                html_chunks.append(('text', before))
            html_chunks.append(('html', text[s:e]))
            current_pos = e
        after = text[current_pos:].strip()
        if after:
            html_chunks.append(('text', after))
    else:
        html_chunks = [('text', text)]

    final_chunks: List[str] = []

    for kind, content in html_chunks:
        if kind == 'html':
            # Keep HTML code blocks as-is
            final_chunks.append(content)
            continue

        # Plain text uses RecursiveCharacterTextSplitter for chunking
        text_chunks = chunk_plain_text(content, chunk_size, chunk_overlap)
        final_chunks.extend(text_chunks)

    cleaned_chunks = clean_chunks(final_chunks, page_tag_processor)
    cleaned_chunks = prepend_metadata_func(cleaned_chunks, metadata)

    return cleaned_chunks


def _is_markdown_table(text: str) -> bool:
    """
    Check if text is a Markdown table.
    """
    lines = text.strip().split('\n')
    if len(lines) < 2:
        return False
    has_pipe_rows = any(line.strip().startswith('|') for line in lines)
    has_separator = any('---' in line and '|' in line for line in lines)
    return has_pipe_rows and has_separator


def chunk_with_row_protection(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
    split_with_protected_regions_func,
    chunk_large_table_func
) -> List[str]:
    """
    Chunk with row-level protection when table protection is disabled.

    HTML tables are processed with chunk_large_table_func to maintain structure.
    Markdown tables are processed with chunk_large_markdown_table for proper row-level splitting.
    Both table types have NO overlap applied.

    Args:
        text: Text to chunk
        chunk_size: Maximum chunk size
        chunk_overlap: Overlap size between chunks (NOT applied to tables)
        split_with_protected_regions_func: Protected region splitting function
        chunk_large_table_func: Large table chunking function (for HTML)

    Returns:
        List of chunks
    """
    if not text or not text.strip():
        return []

    # === Extract both HTML and Markdown tables for separate processing ===
    segments: List[Tuple[str, str]] = []  # [(type, content), ...]

    # Find all HTML tables
    html_matches = list(re.finditer(HTML_TABLE_PATTERN, text, re.DOTALL | re.IGNORECASE))

    # Find all Markdown tables
    markdown_matches = list(re.finditer(MARKDOWN_TABLE_PATTERN, text, re.MULTILINE))

    # Combine and sort by start position
    all_matches = []
    for match in html_matches:
        all_matches.append((match.start(), match.end(), 'html_table', match.group(0)))
    for match in markdown_matches:
        start = match.start()
        if match.group(0).startswith('\n'):
            start += 1
        all_matches.append((start, match.end(), 'markdown_table', match.group(0).strip()))

    # Sort by start position
    all_matches.sort(key=lambda x: x[0])

    # Remove overlapping matches (first non-overlapping match by position wins)
    filtered_matches = []
    last_end = 0
    for start, end, ttype, content in all_matches:
        if start >= last_end:
            filtered_matches.append((start, end, ttype, content))
            last_end = end

    # Build segments
    last_end = 0
    for start, end, ttype, content in filtered_matches:
        # Text before table
        if start > last_end:
            before_text = text[last_end:start].strip()
            if before_text:
                segments.append(('text', before_text))

        # Table
        segments.append((ttype, content))
        last_end = end

    # Text after last table
    if last_end < len(text):
        after_text = text[last_end:].strip()
        if after_text:
            segments.append(('text', after_text))

    # If no tables, use simple row protection
    if not any(seg_type in ('html_table', 'markdown_table') for seg_type, _ in segments):
        return chunk_with_row_protection_simple(
            text, chunk_size, chunk_overlap, split_with_protected_regions_func
        )

    # === Process each segment ===
    all_chunks: List[str] = []

    for seg_type, content in segments:
        if seg_type == 'html_table':
            # HTML table -> split efficiently by rows with NO overlap
            table_chunks = chunk_large_table_func(content, chunk_size, 0, "")
            all_chunks.extend(table_chunks)
        elif seg_type == 'markdown_table':
            # Markdown table -> split efficiently by rows with NO overlap
            from .table_chunker import chunk_large_markdown_table
            table_chunks = chunk_large_markdown_table(content, chunk_size, 0, "")
            all_chunks.extend(table_chunks)
        else:
            # Plain text -> chunk with Markdown row protection
            text_chunks = chunk_with_row_protection_simple(
                content, chunk_size, chunk_overlap, split_with_protected_regions_func
            )
            all_chunks.extend(text_chunks)

    return all_chunks


def chunk_with_row_protection_simple(
    text: str,
    chunk_size: int,
    chunk_overlap: int,
    split_with_protected_regions_func
) -> List[str]:
    """
    Chunk while protecting Markdown table rows from being split mid-row.
    Assumes HTML tables have already been separated.

    NOTE: If a complete Markdown table is found, it will be chunked with NO overlap
    using chunk_large_markdown_table. Only individual rows (not part of a complete table)
    are protected as regions.

    Args:
        text: Text to chunk
        chunk_size: Maximum chunk size
        chunk_overlap: Overlap size between chunks (NOT applied to Markdown tables)
        split_with_protected_regions_func: Protected region splitting function

    Returns:
        List of chunks
    """
    if not text or not text.strip():
        return []

    # Check if text contains a complete Markdown table
    if _is_markdown_table(text):
        # Process as a complete Markdown table with NO overlap
        from .table_chunker import chunk_large_markdown_table
        return chunk_large_markdown_table(text, chunk_size, 0, "")

    # Protect individual Markdown table rows (for mixed content)
    row_patterns = [
        r'\|[^\n]+\|',  # Markdown table row (headers, data, separators)
    ]

    # Find all row positions
    row_positions: List[Tuple[int, int]] = []
    for pattern in row_patterns:
        for match in re.finditer(pattern, text, re.DOTALL | re.IGNORECASE):
            row_positions.append((match.start(), match.end()))

    # Sort by position
    row_positions.sort(key=lambda x: x[0])

    # Merge overlapping regions
    merged_rows: List[Tuple[int, int]] = []
    for start, end in row_positions:
        if merged_rows and start < merged_rows[-1][1]:
            # Overlap -> merge
            prev_start, prev_end = merged_rows[-1]
            merged_rows[-1] = (prev_start, max(prev_end, end))
        else:
            merged_rows.append((start, end))

    if not merged_rows:
        # No rows to protect -> use plain chunking
        return chunk_plain_text(text, chunk_size, chunk_overlap)

    # Chunk while protecting rows
    return split_with_protected_regions_func(text, merged_rows, chunk_size, chunk_overlap)


def clean_chunks(
    chunks: List[str],
    page_tag_processor: Optional[Any] = None
) -> List[str]:
    """
    Clean chunks: remove empty chunks and chunks with only page markers.

    Args:
        chunks: List of chunks
        page_tag_processor: PageTagProcessor instance (for custom tag patterns)

    Returns:
        Cleaned list of chunks
    """
    cleaned_chunks = []

    # Build patterns from PageTagProcessor or use defaults
    if page_tag_processor is not None:
        config = page_tag_processor.config
        # Page pattern with optional OCR suffix
        page_prefix = re.escape(config.tag_prefix)
        page_suffix = re.escape(config.tag_suffix)
        slide_prefix = re.escape(config.slide_prefix)
        slide_suffix = re.escape(config.slide_suffix)

        page_marker_patterns = [
            f"{page_prefix}\\d+(\\s*\\(OCR[+Ref]*\\))?{page_suffix}",
            f"{slide_prefix}\\d+(\\s*\\(OCR\\))?{slide_suffix}",
        ]
    else:
        # Default patterns
        page_marker_patterns = [
            r"\[Page Number:\s*\d+(\s*\(OCR[+Ref]*\))?\]",
            r"\[Slide Number:\s*\d+(\s*\(OCR\))?\]",
        ]

    for chunk in chunks:
        if not chunk.strip():
            continue

        # Check if chunk contains only page marker
        is_page_marker_only = False
        for pattern in page_marker_patterns:
            if re.fullmatch(pattern, chunk.strip()):
                is_page_marker_only = True
                break

        if not is_page_marker_only:
            cleaned_chunks.append(chunk)

    return cleaned_chunks


def chunk_code_text(
    text: str,
    file_type: str,
    chunk_size: int = 1500,
    chunk_overlap: int = 300
) -> List[str]:
    """
    Chunk code text using language-specific splitter.

    Args:
        text: Code text
        file_type: File extension (e.g., 'py', 'js')
        chunk_size: Maximum chunk size
        chunk_overlap: Overlap size between chunks

    Returns:
        List of chunks
    """
    if not text or not text.strip():
        return [""]

    lang = LANGCHAIN_CODE_LANGUAGE_MAP.get(file_type.lower())

    if lang:
        splitter = RecursiveCharacterTextSplitter.from_language(
            language=lang, chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
    else:
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap,
            length_function=len, separators=["\n\n", "\n", " ", ""]
        )

    chunks = splitter.split_text(text)
    logger.info(f"Code text split into {len(chunks)} chunks (size: {chunk_size}, overlap: {chunk_overlap})")

    return chunks


def reconstruct_text_from_chunks(chunks: List[str], chunk_overlap: int) -> str:
    """
    Reconstruct original text from chunks.
    Removes overlap portions to avoid duplication.

    Args:
        chunks: List of chunks
        chunk_overlap: Overlap size between chunks

    Returns:
        Reconstructed text
    """
    if not chunks:
        return ""
    if len(chunks) == 1:
        return chunks[0]

    out = chunks[0]
    for i in range(1, len(chunks)):
        prev = chunks[i - 1]
        cur = chunks[i]
        ov = find_overlap_length(prev, cur, chunk_overlap)
        out += cur[ov:] if ov > 0 else cur

    return out


def find_overlap_length(c1: str, c2: str, max_overlap: int) -> int:
    """
    Find the actual overlap length between two chunks.

    Args:
        c1: Previous chunk
        c2: Current chunk
        max_overlap: Maximum overlap size

    Returns:
        Actual overlap length
    """
    max_check = min(len(c1), len(c2), max_overlap)
    for ov in range(max_check, 0, -1):
        if c1[-ov:] == c2[:ov]:
            return ov
    return 0


def estimate_chunks_count(text: str, chunk_size: int = 1000, chunk_overlap: int = 200) -> int:
    """
    Estimate the number of chunks when text is chunked.

    Args:
        text: Text
        chunk_size: Maximum chunk size
        chunk_overlap: Overlap size between chunks

    Returns:
        Estimated chunk count
    """
    if not text:
        return 0
    if len(text) <= chunk_size:
        return 1

    eff = chunk_size - chunk_overlap
    return max(1, (len(text) - chunk_overlap) // eff + 1)
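A small sketch of the overlap-aware helpers follows; it is not part of the diff. It sticks to the pure functions (reconstruct_text_from_chunks, find_overlap_length, estimate_chunks_count) so no splitter is needed, although importing the module does assume langchain_text_splitters is installed.

# Sketch only: chunks constructed by hand with a known 4-character overlap.
from xgen_doc2chunk.chunking.text_chunker import (
    reconstruct_text_from_chunks, find_overlap_length, estimate_chunks_count
)

chunks = ["abcdefgh", "efghijkl", "ijklmnop"]  # each adjacent pair shares "efgh" / "ijkl"

print(find_overlap_length(chunks[0], chunks[1], max_overlap=4))  # 4
print(reconstruct_text_from_chunks(chunks, chunk_overlap=4))     # "abcdefghijklmnop"

# 1600 chars at chunk_size=1000 with overlap=200 -> effective step 800 -> 2 chunks
print(estimate_chunks_count("x" * 1600, chunk_size=1000, chunk_overlap=200))  # 2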
xgen_doc2chunk/core/__init__.py
@@ -0,0 +1,64 @@
# xgen_doc2chunk/core/__init__.py
"""
Core - Document Processing Core Module

This package provides core functionality for processing various document formats.

Module Structure:
- document_processor: Main DocumentProcessor class
- processor/: Individual document type handlers
    - pdf_handler: PDF document processing
    - docx_handler: DOCX document processing
    - doc_handler: DOC document processing
    - ppt_handler: PPT/PPTX document processing
    - excel_handler: Excel document processing
    - hwp_handler: HWP document processing
    - hwpx_handler: HWPX document processing
    - csv_handler: CSV file processing
    - text_handler: Text file processing
- functions/: Utility functions
    - utils: Text cleaning, code cleaning, and common utilities
    - img_processor: Image processing and saving (ImageProcessor class)
    - ppt2pdf: PPT to PDF conversion

Usage:
    from xgen_doc2chunk import DocumentProcessor
    from xgen_doc2chunk.core.processor import PDFHandler, DocxHandler
    from xgen_doc2chunk.core.functions import clean_text, ImageProcessor
"""

# === Main Class ===
from xgen_doc2chunk.core.document_processor import DocumentProcessor

# === Utility Functions ===
from xgen_doc2chunk.core.functions.utils import (
    clean_text,
    clean_code_text,
    sanitize_text_for_json,
)

# === Image Processing ===
from xgen_doc2chunk.core.functions.img_processor import (
    ImageProcessor,
    save_image_to_file,
)

# === Explicit Subpackage Imports ===
from xgen_doc2chunk.core import processor
from xgen_doc2chunk.core import functions

__all__ = [
    # Main Class
    "DocumentProcessor",
    # Utility Functions
    "clean_text",
    "clean_code_text",
    "sanitize_text_for_json",
    # Image Processing
    "ImageProcessor",
    "save_image_to_file",
    # Subpackages
    "processor",
    "functions",
]
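These re-exports flatten the import surface of the core package. A minimal sketch, using only names listed in __all__ above; DocumentProcessor's constructor arguments are not shown in this diff, so it is only imported, not instantiated.

# Sketch only (assumes the wheel is installed): every name below is re-exported
# by xgen_doc2chunk/core/__init__.py, so deep module paths are not needed.
from xgen_doc2chunk.core import DocumentProcessor, clean_text, ImageProcessor

# Wildcard imports are governed by __all__, so the `processor` and `functions`
# subpackages are also exposed:
#   from xgen_doc2chunk.core import *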