xgen-doc2chunk 0.1.0__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. {xgen_doc2chunk-0.1.0 → xgen_doc2chunk-0.1.1}/PKG-INFO +1 -1
  2. {xgen_doc2chunk-0.1.0 → xgen_doc2chunk-0.1.1}/pyproject.toml +3 -3
  3. xgen_doc2chunk-0.1.1/xgen_doc2chunk/__init__.py +42 -0
  4. xgen_doc2chunk-0.1.1/xgen_doc2chunk/chunking/__init__.py +168 -0
  5. xgen_doc2chunk-0.1.1/xgen_doc2chunk/chunking/chunking.py +786 -0
  6. xgen_doc2chunk-0.1.1/xgen_doc2chunk/chunking/constants.py +134 -0
  7. xgen_doc2chunk-0.1.1/xgen_doc2chunk/chunking/page_chunker.py +248 -0
  8. xgen_doc2chunk-0.1.1/xgen_doc2chunk/chunking/protected_regions.py +715 -0
  9. xgen_doc2chunk-0.1.1/xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  10. xgen_doc2chunk-0.1.1/xgen_doc2chunk/chunking/table_chunker.py +832 -0
  11. xgen_doc2chunk-0.1.1/xgen_doc2chunk/chunking/table_parser.py +172 -0
  12. xgen_doc2chunk-0.1.1/xgen_doc2chunk/chunking/text_chunker.py +443 -0
  13. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/__init__.py +64 -0
  14. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/document_processor.py +1307 -0
  15. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/__init__.py +85 -0
  16. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  17. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  18. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/file_converter.py +220 -0
  19. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/img_processor.py +649 -0
  20. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  21. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  22. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  23. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  24. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  25. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/table_processor.py +299 -0
  26. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/functions/utils.py +159 -0
  27. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/__init__.py +96 -0
  28. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/base_handler.py +544 -0
  29. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  30. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  31. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  32. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  33. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  34. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  35. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  36. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  37. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  38. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  39. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  40. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  41. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  42. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  43. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  44. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  45. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  46. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  47. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  48. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  49. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  50. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  51. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  52. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  53. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  54. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  55. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  56. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  57. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  58. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  59. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  60. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  61. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  62. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  63. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  64. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  65. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  66. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  67. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  68. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  69. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  70. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  71. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  72. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  73. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  74. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  75. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  76. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  77. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  78. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  79. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  80. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  81. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  82. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  83. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  84. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  85. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  86. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  87. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  88. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  89. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  90. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  91. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  92. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  93. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  94. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  95. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  96. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  97. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  98. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  99. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  100. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  101. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  102. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  103. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  104. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  105. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  106. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  107. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  108. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  109. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  110. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  111. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  112. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  113. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  114. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  115. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  116. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  117. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  118. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  119. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  120. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  121. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  122. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  123. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  124. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  125. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  126. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  127. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  128. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  129. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  130. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  131. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  132. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  133. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  134. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  135. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  136. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  137. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  138. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  139. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  140. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  141. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  142. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  143. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  144. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  145. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  146. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/text_handler.py +95 -0
  147. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  148. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  149. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  150. xgen_doc2chunk-0.1.1/xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  151. xgen_doc2chunk-0.1.1/xgen_doc2chunk/ocr/__init__.py +67 -0
  152. xgen_doc2chunk-0.1.1/xgen_doc2chunk/ocr/base.py +209 -0
  153. xgen_doc2chunk-0.1.1/xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  154. xgen_doc2chunk-0.1.1/xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  155. xgen_doc2chunk-0.1.1/xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  156. xgen_doc2chunk-0.1.1/xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  157. xgen_doc2chunk-0.1.1/xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  158. xgen_doc2chunk-0.1.1/xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  159. xgen_doc2chunk-0.1.1/xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  160. {xgen_doc2chunk-0.1.0 → xgen_doc2chunk-0.1.1}/.gitignore +0 -0
  161. {xgen_doc2chunk-0.1.0 → xgen_doc2chunk-0.1.1}/LICENSE +0 -0
  162. {xgen_doc2chunk-0.1.0 → xgen_doc2chunk-0.1.1}/README.md +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xgen-doc2chunk
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking
5
5
  Project-URL: Homepage, https://github.com/master0419/doc2chunk
6
6
  Project-URL: Documentation, https://github.com/master0419/doc2chunk#readme
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "xgen-doc2chunk"
7
- version = "0.1.0"
7
+ version = "0.1.1"
8
8
  description = "Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.12"
@@ -86,11 +86,11 @@ Issues = "https://github.com/master0419/doc2chunk/issues"
86
86
  Changelog = "https://github.com/master0419/doc2chunk/releases"
87
87
 
88
88
  [tool.hatch.build.targets.wheel]
89
- packages = ["xgen-doc2chunk"]
89
+ packages = ["xgen_doc2chunk"]
90
90
 
91
91
  [tool.hatch.build.targets.sdist]
92
92
  include = [
93
- "xgen-doc2chunk/",
93
+ "xgen_doc2chunk/",
94
94
  "README.md",
95
95
  "LICENSE",
96
96
  "pyproject.toml",
@@ -0,0 +1,42 @@
1
+ # xgen_doc2chunk/__init__.py
2
+ """
3
+ xgen_doc2chunk Library
4
+
5
+ A document processing and chunking library for AI applications.
6
+
7
+ Package Structure:
8
+ - core: Document processing core module
9
+ - DocumentProcessor: Main document processing class
10
+ - processor: Individual document type handlers (PDF, DOCX, PPT, Excel, HWP, etc.)
11
+ - functions: Utility functions
12
+
13
+ - chunking: Text chunking module
14
+ - Text splitting and chunking logic
15
+ - Table-preserving chunking
16
+ - Page-based chunking
17
+
18
+ Usage:
19
+ from xgen_doc2chunk import DocumentProcessor
20
+
21
+ processor = DocumentProcessor()
22
+ text = processor.extract_text("document.pdf")
23
+ result = processor.extract_chunks("document.pdf", chunk_size=1000)
24
+ """
25
+
26
+ __version__ = "0.1.0"
27
+
28
+ # Expose core classes at top level
29
+ from xgen_doc2chunk.core import DocumentProcessor
30
+
31
+ # Explicit subpackages
32
+ from xgen_doc2chunk import core
33
+ from xgen_doc2chunk import chunking
34
+
35
+ __all__ = [
36
+ "__version__",
37
+ # Core classes
38
+ "DocumentProcessor",
39
+ # Subpackages
40
+ "core",
41
+ "chunking",
42
+ ]
@@ -0,0 +1,168 @@
1
+ # xgen_doc2chunk/chunking/__init__.py
2
+ """
3
+ Chunking - Text Chunking Module
4
+
5
+ This package provides functionality to split document text into appropriately sized chunks.
6
+
7
+ Module Structure:
8
+ - chunking: Main chunking functions (create_chunks, etc.)
9
+ - constants: Constants, patterns, and data classes
10
+ - table_parser: HTML table parsing
11
+ - table_chunker: Table chunking core logic
12
+ - protected_regions: Protected region handling
13
+ - page_chunker: Page-based chunking
14
+ - text_chunker: Text chunking
15
+ - sheet_processor: Sheet and metadata processing
16
+
17
+ Usage:
18
+ from xgen_doc2chunk.chunking import create_chunks, chunk_plain_text
19
+ from xgen_doc2chunk.chunking import TableRow, ParsedTable
20
+ """
21
+
22
+ # === Main Chunking Functions (chunking.py) ===
23
+ from xgen_doc2chunk.chunking.chunking import (
24
+ create_chunks,
25
+ )
26
+
27
+ # constants
28
+ from xgen_doc2chunk.chunking.constants import (
29
+ # Constants
30
+ LANGCHAIN_CODE_LANGUAGE_MAP,
31
+ HTML_TABLE_PATTERN,
32
+ CHART_BLOCK_PATTERN,
33
+ TEXTBOX_BLOCK_PATTERN,
34
+ IMAGE_TAG_PATTERN,
35
+ MARKDOWN_TABLE_PATTERN,
36
+ TABLE_WRAPPER_OVERHEAD,
37
+ CHUNK_INDEX_OVERHEAD,
38
+ TABLE_SIZE_THRESHOLD_MULTIPLIER,
39
+ TABLE_BASED_FILE_TYPES,
40
+ # Data classes
41
+ TableRow,
42
+ ParsedTable,
43
+ )
44
+
45
+ # table_parser
46
+ from xgen_doc2chunk.chunking.table_parser import (
47
+ parse_html_table,
48
+ extract_cell_spans,
49
+ extract_cell_spans_with_positions,
50
+ has_complex_spans,
51
+ )
52
+
53
+ # table_chunker
54
+ from xgen_doc2chunk.chunking.table_chunker import (
55
+ calculate_available_space,
56
+ adjust_rowspan_in_chunk,
57
+ build_table_chunk,
58
+ update_chunk_metadata,
59
+ split_table_into_chunks,
60
+ split_table_preserving_rowspan,
61
+ chunk_large_table,
62
+ )
63
+
64
+ # protected_regions
65
+ from xgen_doc2chunk.chunking.protected_regions import (
66
+ find_protected_regions,
67
+ get_protected_region_positions,
68
+ ensure_protected_region_integrity,
69
+ split_with_protected_regions,
70
+ split_large_chunk_with_protected_regions,
71
+ # Backward compatibility aliases
72
+ ensure_table_integrity,
73
+ split_large_chunk_with_table_protection,
74
+ )
75
+
76
+ # page_chunker
77
+ from xgen_doc2chunk.chunking.page_chunker import (
78
+ split_into_pages,
79
+ merge_pages,
80
+ get_overlap_content,
81
+ chunk_by_pages,
82
+ )
83
+
84
+ # text_chunker
85
+ from xgen_doc2chunk.chunking.text_chunker import (
86
+ chunk_plain_text,
87
+ chunk_text_without_tables,
88
+ chunk_with_row_protection,
89
+ chunk_with_row_protection_simple,
90
+ clean_chunks,
91
+ chunk_code_text,
92
+ reconstruct_text_from_chunks,
93
+ find_overlap_length,
94
+ estimate_chunks_count,
95
+ )
96
+
97
+ # sheet_processor
98
+ from xgen_doc2chunk.chunking.sheet_processor import (
99
+ extract_document_metadata,
100
+ prepend_metadata_to_chunks,
101
+ extract_sheet_sections,
102
+ extract_content_segments,
103
+ chunk_multi_sheet_content,
104
+ chunk_single_table_content,
105
+ )
106
+
107
+
108
+ __all__ = [
109
+ # === Primary API ===
110
+ "create_chunks",
111
+ # constants
112
+ "LANGCHAIN_CODE_LANGUAGE_MAP",
113
+ "HTML_TABLE_PATTERN",
114
+ "CHART_BLOCK_PATTERN",
115
+ "TEXTBOX_BLOCK_PATTERN",
116
+ "IMAGE_TAG_PATTERN",
117
+ "MARKDOWN_TABLE_PATTERN",
118
+ "TABLE_WRAPPER_OVERHEAD",
119
+ "CHUNK_INDEX_OVERHEAD",
120
+ "TABLE_SIZE_THRESHOLD_MULTIPLIER",
121
+ "TABLE_BASED_FILE_TYPES",
122
+ "TableRow",
123
+ "ParsedTable",
124
+ # table_parser
125
+ "parse_html_table",
126
+ "extract_cell_spans",
127
+ "extract_cell_spans_with_positions",
128
+ "has_complex_spans",
129
+ # table_chunker
130
+ "calculate_available_space",
131
+ "adjust_rowspan_in_chunk",
132
+ "build_table_chunk",
133
+ "update_chunk_metadata",
134
+ "split_table_into_chunks",
135
+ "split_table_preserving_rowspan",
136
+ "chunk_large_table",
137
+ # protected_regions
138
+ "find_protected_regions",
139
+ "get_protected_region_positions",
140
+ "ensure_protected_region_integrity",
141
+ "split_with_protected_regions",
142
+ "split_large_chunk_with_protected_regions",
143
+ "ensure_table_integrity",
144
+ "split_large_chunk_with_table_protection",
145
+ # page_chunker
146
+ "split_into_pages",
147
+ "merge_pages",
148
+ "get_overlap_content",
149
+ "chunk_by_pages",
150
+ # text_chunker
151
+ "chunk_plain_text",
152
+ "chunk_text_without_tables",
153
+ "chunk_with_row_protection",
154
+ "chunk_with_row_protection_simple",
155
+ "clean_chunks",
156
+ "chunk_code_text",
157
+ "reconstruct_text_from_chunks",
158
+ "find_overlap_length",
159
+ "estimate_chunks_count",
160
+ # sheet_processor
161
+ "extract_document_metadata",
162
+ "prepend_metadata_to_chunks",
163
+ "extract_sheet_sections",
164
+ "extract_content_segments",
165
+ "chunk_multi_sheet_content",
166
+ "chunk_single_table_content",
167
+ ]
168
+