xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,229 @@
1
+ """
2
+ PDF Helpers Package
3
+
4
+ Contains helper modules for PDF processing.
5
+ """
6
+
7
+ # Metadata - class-based extractor
8
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_metadata import (
9
+ PDFMetadataExtractor,
10
+ parse_pdf_date,
11
+ )
12
+
13
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_utils import (
14
+ escape_html,
15
+ calculate_overlap_ratio,
16
+ is_inside_any_bbox,
17
+ find_image_position,
18
+ get_text_lines_with_positions,
19
+ bbox_overlaps,
20
+ )
21
+
22
+ # Image Processor (replaces pdf_image.py utility functions)
23
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_image_processor import (
24
+ PDFImageProcessor,
25
+ )
26
+
27
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_text_extractor import (
28
+ extract_text_blocks,
29
+ split_ocr_text_to_blocks,
30
+ )
31
+
32
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_page_analyzer import (
33
+ detect_page_border,
34
+ is_table_likely_border,
35
+ )
36
+
37
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_element_merger import (
38
+ merge_page_elements,
39
+ )
40
+
41
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_table_processor import (
42
+ TableInfo,
43
+ AnnotationInfo as TableAnnotationInfo,
44
+ extract_all_tables,
45
+ find_and_insert_annotations,
46
+ add_annotation_to_table,
47
+ merge_adjacent_tables,
48
+ should_merge_tables,
49
+ do_merge_tables,
50
+ process_table_continuity,
51
+ extract_last_category,
52
+ is_single_column_table,
53
+ convert_single_column_to_text,
54
+ convert_table_to_html,
55
+ generate_html_from_cells,
56
+ )
57
+
58
+ from xgen_doc2chunk.core.processor.pdf_helpers.types import (
59
+ LineThickness,
60
+ TableDetectionStrategy,
61
+ ElementType,
62
+ PDFConfig,
63
+ LineInfo,
64
+ GridInfo,
65
+ CellInfo,
66
+ AnnotationInfo,
67
+ VectorTextRegion,
68
+ GraphicRegionInfo,
69
+ TableCandidate,
70
+ PageElement,
71
+ PageBorderInfo,
72
+ )
73
+
74
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_vector_text_ocr import (
75
+ VectorTextConfig,
76
+ VectorTextOCREngine,
77
+ )
78
+
79
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_graphic_detector import (
80
+ GraphicRegionDetector,
81
+ )
82
+
83
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_table_validator import (
84
+ TableQualityValidator,
85
+ )
86
+
87
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_line_analysis import (
88
+ LineAnalysisEngine,
89
+ )
90
+
91
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_table_detection import (
92
+ TableDetectionEngine,
93
+ )
94
+
95
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_cell_analysis import (
96
+ CellAnalysisEngine,
97
+ )
98
+
99
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_complexity_analyzer import (
100
+ ComplexityLevel,
101
+ ProcessingStrategy,
102
+ RegionComplexity,
103
+ PageComplexity,
104
+ ComplexityConfig,
105
+ ComplexityAnalyzer,
106
+ )
107
+
108
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_block_image_engine import (
109
+ BlockStrategy,
110
+ BlockImageConfig,
111
+ BlockImageResult,
112
+ MultiBlockResult,
113
+ BlockImageEngine,
114
+ )
115
+
116
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_layout_block_detector import (
117
+ LayoutBlockType,
118
+ ContentElement,
119
+ LayoutBlock,
120
+ ColumnInfo,
121
+ LayoutAnalysisResult,
122
+ LayoutDetectorConfig,
123
+ LayoutBlockDetector,
124
+ )
125
+
126
+ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_table_quality_analyzer import (
127
+ TableQuality,
128
+ TableQualityResult,
129
+ TableQualityAnalyzer,
130
+ )
131
+
# Public API of the pdf_helpers package.
#
# Every entry must be a name bound by one of the imports above: a star-import
# (``from ...pdf_helpers import *``) raises AttributeError for any ``__all__``
# entry the module does not actually define. Each name is listed exactly once,
# grouped by the submodule it is imported from.
#
# NOTE(review): 'extract_pdf_metadata' and 'format_metadata' used to be listed
# here, but this module does not import them (the metadata import only binds
# PDFMetadataExtractor and parse_pdf_date), so they were dropped. If
# pdf_metadata still provides them, import them explicitly before re-adding.
__all__ = [
    # types
    'LineThickness',
    'TableDetectionStrategy',
    'ElementType',
    'PDFConfig',
    'LineInfo',
    'GridInfo',
    'CellInfo',
    'AnnotationInfo',
    'VectorTextRegion',
    'GraphicRegionInfo',
    'TableCandidate',
    'PageElement',
    'PageBorderInfo',
    # vector_text_ocr
    'VectorTextConfig',
    'VectorTextOCREngine',
    # graphic_detector
    'GraphicRegionDetector',
    # table_validator
    'TableQualityValidator',
    # line_analysis
    'LineAnalysisEngine',
    # table_detection
    'TableDetectionEngine',
    # cell_analysis
    'CellAnalysisEngine',
    # complexity_analyzer
    'ComplexityLevel',
    'ProcessingStrategy',
    'RegionComplexity',
    'PageComplexity',
    'ComplexityConfig',
    'ComplexityAnalyzer',
    # block_image_engine
    'BlockStrategy',
    'BlockImageConfig',
    'BlockImageResult',
    'MultiBlockResult',
    'BlockImageEngine',
    # layout_block_detector
    'LayoutBlockType',
    'ContentElement',
    'LayoutBlock',
    'ColumnInfo',
    'LayoutAnalysisResult',
    'LayoutDetectorConfig',
    'LayoutBlockDetector',
    # table_quality_analyzer
    'TableQuality',
    'TableQualityResult',
    'TableQualityAnalyzer',
    # pdf_metadata
    'PDFMetadataExtractor',
    'parse_pdf_date',
    # pdf_utils
    'escape_html',
    'calculate_overlap_ratio',
    'is_inside_any_bbox',
    'find_image_position',
    'get_text_lines_with_positions',
    'bbox_overlaps',
    # Image Processor
    'PDFImageProcessor',
    # pdf_text_extractor
    'extract_text_blocks',
    'split_ocr_text_to_blocks',
    # pdf_page_analyzer
    'detect_page_border',
    'is_table_likely_border',
    # pdf_element_merger
    'merge_page_elements',
    # pdf_table_processor
    # (TableAnnotationInfo is pdf_table_processor.AnnotationInfo, aliased on
    # import so it does not clash with types.AnnotationInfo)
    'TableInfo',
    'TableAnnotationInfo',
    'extract_all_tables',
    'find_and_insert_annotations',
    'add_annotation_to_table',
    'merge_adjacent_tables',
    'should_merge_tables',
    'do_merge_tables',
    'process_table_continuity',
    'extract_last_category',
    'is_single_column_table',
    'convert_single_column_to_text',
    'convert_table_to_html',
    'generate_html_from_cells',
]