xgen-doc2chunk 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {xgen_doc2chunk-0.1.0 → xgen_doc2chunk-0.1.1}/PKG-INFO +1 -1
- {xgen_doc2chunk-0.1.0 → xgen_doc2chunk-0.1.1}/pyproject.toml +3 -3
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk-0.1.1/xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0 → xgen_doc2chunk-0.1.1}/.gitignore +0 -0
- {xgen_doc2chunk-0.1.0 → xgen_doc2chunk-0.1.1}/LICENSE +0 -0
- {xgen_doc2chunk-0.1.0 → xgen_doc2chunk-0.1.1}/README.md +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xgen-doc2chunk
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking
|
|
5
5
|
Project-URL: Homepage, https://github.com/master0419/doc2chunk
|
|
6
6
|
Project-URL: Documentation, https://github.com/master0419/doc2chunk#readme
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "xgen-doc2chunk"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.1.1"
|
|
8
8
|
description = "Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.12"
|
|
@@ -86,11 +86,11 @@ Issues = "https://github.com/master0419/doc2chunk/issues"
|
|
|
86
86
|
Changelog = "https://github.com/master0419/doc2chunk/releases"
|
|
87
87
|
|
|
88
88
|
[tool.hatch.build.targets.wheel]
|
|
89
|
-
packages = ["
|
|
89
|
+
packages = ["xgen_doc2chunk"]
|
|
90
90
|
|
|
91
91
|
[tool.hatch.build.targets.sdist]
|
|
92
92
|
include = [
|
|
93
|
-
"
|
|
93
|
+
"xgen_doc2chunk/",
|
|
94
94
|
"README.md",
|
|
95
95
|
"LICENSE",
|
|
96
96
|
"pyproject.toml",
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# xgen_doc2chunk/__init__.py
|
|
2
|
+
"""
|
|
3
|
+
xgen_doc2chunk Library
|
|
4
|
+
|
|
5
|
+
A document processing and chunking library for AI applications.
|
|
6
|
+
|
|
7
|
+
Package Structure:
|
|
8
|
+
- core: Document processing core module
|
|
9
|
+
- DocumentProcessor: Main document processing class
|
|
10
|
+
- processor: Individual document type handlers (PDF, DOCX, PPT, Excel, HWP, etc.)
|
|
11
|
+
- functions: Utility functions
|
|
12
|
+
|
|
13
|
+
- chunking: Text chunking module
|
|
14
|
+
- Text splitting and chunking logic
|
|
15
|
+
- Table-preserving chunking
|
|
16
|
+
- Page-based chunking
|
|
17
|
+
|
|
18
|
+
Usage:
|
|
19
|
+
from xgen_doc2chunk import DocumentProcessor
|
|
20
|
+
|
|
21
|
+
processor = DocumentProcessor()
|
|
22
|
+
text = processor.extract_text("document.pdf")
|
|
23
|
+
result = processor.extract_chunks("document.pdf", chunk_size=1000)
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
__version__ = "0.1.0"
|
|
27
|
+
|
|
28
|
+
# Expose core classes at top level
|
|
29
|
+
from xgen_doc2chunk.core import DocumentProcessor
|
|
30
|
+
|
|
31
|
+
# Explicit subpackages
|
|
32
|
+
from xgen_doc2chunk import core
|
|
33
|
+
from xgen_doc2chunk import chunking
|
|
34
|
+
|
|
35
|
+
__all__ = [
|
|
36
|
+
"__version__",
|
|
37
|
+
# Core classes
|
|
38
|
+
"DocumentProcessor",
|
|
39
|
+
# Subpackages
|
|
40
|
+
"core",
|
|
41
|
+
"chunking",
|
|
42
|
+
]
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
# xgen_doc2chunk/chunking/__init__.py
|
|
2
|
+
"""
|
|
3
|
+
Chunking - Text Chunking Module
|
|
4
|
+
|
|
5
|
+
This package provides functionality to split document text into appropriately sized chunks.
|
|
6
|
+
|
|
7
|
+
Module Structure:
|
|
8
|
+
- chunking: Main chunking functions (split_text_preserving_html_blocks, etc.)
|
|
9
|
+
- constants: Constants, patterns, and data classes
|
|
10
|
+
- table_parser: HTML table parsing
|
|
11
|
+
- table_chunker: Table chunking core logic
|
|
12
|
+
- protected_regions: Protected region handling
|
|
13
|
+
- page_chunker: Page-based chunking
|
|
14
|
+
- text_chunker: Text chunking
|
|
15
|
+
- sheet_processor: Sheet and metadata processing
|
|
16
|
+
|
|
17
|
+
Usage:
|
|
18
|
+
from xgen_doc2chunk.chunking import split_text_preserving_html_blocks, chunk_plain_text
|
|
19
|
+
from xgen_doc2chunk.chunking import TableRow, ParsedTable
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
# === Main Chunking Functions (chunking.py) ===
|
|
23
|
+
from xgen_doc2chunk.chunking.chunking import (
|
|
24
|
+
create_chunks,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
# constants
|
|
28
|
+
from xgen_doc2chunk.chunking.constants import (
|
|
29
|
+
# Constants
|
|
30
|
+
LANGCHAIN_CODE_LANGUAGE_MAP,
|
|
31
|
+
HTML_TABLE_PATTERN,
|
|
32
|
+
CHART_BLOCK_PATTERN,
|
|
33
|
+
TEXTBOX_BLOCK_PATTERN,
|
|
34
|
+
IMAGE_TAG_PATTERN,
|
|
35
|
+
MARKDOWN_TABLE_PATTERN,
|
|
36
|
+
TABLE_WRAPPER_OVERHEAD,
|
|
37
|
+
CHUNK_INDEX_OVERHEAD,
|
|
38
|
+
TABLE_SIZE_THRESHOLD_MULTIPLIER,
|
|
39
|
+
TABLE_BASED_FILE_TYPES,
|
|
40
|
+
# Data classes
|
|
41
|
+
TableRow,
|
|
42
|
+
ParsedTable,
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
# table_parser
|
|
46
|
+
from xgen_doc2chunk.chunking.table_parser import (
|
|
47
|
+
parse_html_table,
|
|
48
|
+
extract_cell_spans,
|
|
49
|
+
extract_cell_spans_with_positions,
|
|
50
|
+
has_complex_spans,
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
# table_chunker
|
|
54
|
+
from xgen_doc2chunk.chunking.table_chunker import (
|
|
55
|
+
calculate_available_space,
|
|
56
|
+
adjust_rowspan_in_chunk,
|
|
57
|
+
build_table_chunk,
|
|
58
|
+
update_chunk_metadata,
|
|
59
|
+
split_table_into_chunks,
|
|
60
|
+
split_table_preserving_rowspan,
|
|
61
|
+
chunk_large_table,
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
# protected_regions
|
|
65
|
+
from xgen_doc2chunk.chunking.protected_regions import (
|
|
66
|
+
find_protected_regions,
|
|
67
|
+
get_protected_region_positions,
|
|
68
|
+
ensure_protected_region_integrity,
|
|
69
|
+
split_with_protected_regions,
|
|
70
|
+
split_large_chunk_with_protected_regions,
|
|
71
|
+
# Backward compatibility aliases
|
|
72
|
+
ensure_table_integrity,
|
|
73
|
+
split_large_chunk_with_table_protection,
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
# page_chunker
|
|
77
|
+
from xgen_doc2chunk.chunking.page_chunker import (
|
|
78
|
+
split_into_pages,
|
|
79
|
+
merge_pages,
|
|
80
|
+
get_overlap_content,
|
|
81
|
+
chunk_by_pages,
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
# text_chunker
|
|
85
|
+
from xgen_doc2chunk.chunking.text_chunker import (
|
|
86
|
+
chunk_plain_text,
|
|
87
|
+
chunk_text_without_tables,
|
|
88
|
+
chunk_with_row_protection,
|
|
89
|
+
chunk_with_row_protection_simple,
|
|
90
|
+
clean_chunks,
|
|
91
|
+
chunk_code_text,
|
|
92
|
+
reconstruct_text_from_chunks,
|
|
93
|
+
find_overlap_length,
|
|
94
|
+
estimate_chunks_count,
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
# sheet_processor
|
|
98
|
+
from xgen_doc2chunk.chunking.sheet_processor import (
|
|
99
|
+
extract_document_metadata,
|
|
100
|
+
prepend_metadata_to_chunks,
|
|
101
|
+
extract_sheet_sections,
|
|
102
|
+
extract_content_segments,
|
|
103
|
+
chunk_multi_sheet_content,
|
|
104
|
+
chunk_single_table_content,
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
__all__ = [
|
|
109
|
+
# === Primary API ===
|
|
110
|
+
"create_chunks",
|
|
111
|
+
# constants
|
|
112
|
+
"LANGCHAIN_CODE_LANGUAGE_MAP",
|
|
113
|
+
"HTML_TABLE_PATTERN",
|
|
114
|
+
"CHART_BLOCK_PATTERN",
|
|
115
|
+
"TEXTBOX_BLOCK_PATTERN",
|
|
116
|
+
"IMAGE_TAG_PATTERN",
|
|
117
|
+
"MARKDOWN_TABLE_PATTERN",
|
|
118
|
+
"TABLE_WRAPPER_OVERHEAD",
|
|
119
|
+
"CHUNK_INDEX_OVERHEAD",
|
|
120
|
+
"TABLE_SIZE_THRESHOLD_MULTIPLIER",
|
|
121
|
+
"TABLE_BASED_FILE_TYPES",
|
|
122
|
+
"TableRow",
|
|
123
|
+
"ParsedTable",
|
|
124
|
+
# table_parser
|
|
125
|
+
"parse_html_table",
|
|
126
|
+
"extract_cell_spans",
|
|
127
|
+
"extract_cell_spans_with_positions",
|
|
128
|
+
"has_complex_spans",
|
|
129
|
+
# table_chunker
|
|
130
|
+
"calculate_available_space",
|
|
131
|
+
"adjust_rowspan_in_chunk",
|
|
132
|
+
"build_table_chunk",
|
|
133
|
+
"update_chunk_metadata",
|
|
134
|
+
"split_table_into_chunks",
|
|
135
|
+
"split_table_preserving_rowspan",
|
|
136
|
+
"chunk_large_table",
|
|
137
|
+
# protected_regions
|
|
138
|
+
"find_protected_regions",
|
|
139
|
+
"get_protected_region_positions",
|
|
140
|
+
"ensure_protected_region_integrity",
|
|
141
|
+
"split_with_protected_regions",
|
|
142
|
+
"split_large_chunk_with_protected_regions",
|
|
143
|
+
"ensure_table_integrity",
|
|
144
|
+
"split_large_chunk_with_table_protection",
|
|
145
|
+
# page_chunker
|
|
146
|
+
"split_into_pages",
|
|
147
|
+
"merge_pages",
|
|
148
|
+
"get_overlap_content",
|
|
149
|
+
"chunk_by_pages",
|
|
150
|
+
# text_chunker
|
|
151
|
+
"chunk_plain_text",
|
|
152
|
+
"chunk_text_without_tables",
|
|
153
|
+
"chunk_with_row_protection",
|
|
154
|
+
"chunk_with_row_protection_simple",
|
|
155
|
+
"clean_chunks",
|
|
156
|
+
"chunk_code_text",
|
|
157
|
+
"reconstruct_text_from_chunks",
|
|
158
|
+
"find_overlap_length",
|
|
159
|
+
"estimate_chunks_count",
|
|
160
|
+
# sheet_processor
|
|
161
|
+
"extract_document_metadata",
|
|
162
|
+
"prepend_metadata_to_chunks",
|
|
163
|
+
"extract_sheet_sections",
|
|
164
|
+
"extract_content_segments",
|
|
165
|
+
"chunk_multi_sheet_content",
|
|
166
|
+
"chunk_single_table_content",
|
|
167
|
+
]
|
|
168
|
+
|