xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PDF Helpers Package
|
|
3
|
+
|
|
4
|
+
Contains helper modules for PDF processing.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
# Metadata - class-based extractor
|
|
8
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_metadata import (
|
|
9
|
+
PDFMetadataExtractor,
|
|
10
|
+
parse_pdf_date,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_utils import (
|
|
14
|
+
escape_html,
|
|
15
|
+
calculate_overlap_ratio,
|
|
16
|
+
is_inside_any_bbox,
|
|
17
|
+
find_image_position,
|
|
18
|
+
get_text_lines_with_positions,
|
|
19
|
+
bbox_overlaps,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
# Image Processor (replaces pdf_image.py utility functions)
|
|
23
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_image_processor import (
|
|
24
|
+
PDFImageProcessor,
|
|
25
|
+
)
|
|
26
|
+
|
|
27
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_text_extractor import (
|
|
28
|
+
extract_text_blocks,
|
|
29
|
+
split_ocr_text_to_blocks,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_page_analyzer import (
|
|
33
|
+
detect_page_border,
|
|
34
|
+
is_table_likely_border,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_element_merger import (
|
|
38
|
+
merge_page_elements,
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_table_processor import (
|
|
42
|
+
TableInfo,
|
|
43
|
+
AnnotationInfo as TableAnnotationInfo,
|
|
44
|
+
extract_all_tables,
|
|
45
|
+
find_and_insert_annotations,
|
|
46
|
+
add_annotation_to_table,
|
|
47
|
+
merge_adjacent_tables,
|
|
48
|
+
should_merge_tables,
|
|
49
|
+
do_merge_tables,
|
|
50
|
+
process_table_continuity,
|
|
51
|
+
extract_last_category,
|
|
52
|
+
is_single_column_table,
|
|
53
|
+
convert_single_column_to_text,
|
|
54
|
+
convert_table_to_html,
|
|
55
|
+
generate_html_from_cells,
|
|
56
|
+
)
|
|
57
|
+
|
|
58
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.types import (
|
|
59
|
+
LineThickness,
|
|
60
|
+
TableDetectionStrategy,
|
|
61
|
+
ElementType,
|
|
62
|
+
PDFConfig,
|
|
63
|
+
LineInfo,
|
|
64
|
+
GridInfo,
|
|
65
|
+
CellInfo,
|
|
66
|
+
AnnotationInfo,
|
|
67
|
+
VectorTextRegion,
|
|
68
|
+
GraphicRegionInfo,
|
|
69
|
+
TableCandidate,
|
|
70
|
+
PageElement,
|
|
71
|
+
PageBorderInfo,
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_vector_text_ocr import (
|
|
75
|
+
VectorTextConfig,
|
|
76
|
+
VectorTextOCREngine,
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_graphic_detector import (
|
|
80
|
+
GraphicRegionDetector,
|
|
81
|
+
)
|
|
82
|
+
|
|
83
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_table_validator import (
|
|
84
|
+
TableQualityValidator,
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_line_analysis import (
|
|
88
|
+
LineAnalysisEngine,
|
|
89
|
+
)
|
|
90
|
+
|
|
91
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_table_detection import (
|
|
92
|
+
TableDetectionEngine,
|
|
93
|
+
)
|
|
94
|
+
|
|
95
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_cell_analysis import (
|
|
96
|
+
CellAnalysisEngine,
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_complexity_analyzer import (
|
|
100
|
+
ComplexityLevel,
|
|
101
|
+
ProcessingStrategy,
|
|
102
|
+
RegionComplexity,
|
|
103
|
+
PageComplexity,
|
|
104
|
+
ComplexityConfig,
|
|
105
|
+
ComplexityAnalyzer,
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_block_image_engine import (
|
|
109
|
+
BlockStrategy,
|
|
110
|
+
BlockImageConfig,
|
|
111
|
+
BlockImageResult,
|
|
112
|
+
MultiBlockResult,
|
|
113
|
+
BlockImageEngine,
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_layout_block_detector import (
|
|
117
|
+
LayoutBlockType,
|
|
118
|
+
ContentElement,
|
|
119
|
+
LayoutBlock,
|
|
120
|
+
ColumnInfo,
|
|
121
|
+
LayoutAnalysisResult,
|
|
122
|
+
LayoutDetectorConfig,
|
|
123
|
+
LayoutBlockDetector,
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.pdf_table_quality_analyzer import (
|
|
127
|
+
TableQuality,
|
|
128
|
+
TableQualityResult,
|
|
129
|
+
TableQualityAnalyzer,
|
|
130
|
+
)
|
|
131
|
+
|
|
132
|
+
# Public API of the pdf_helpers package.
#
# Every entry must be a name actually bound in this module by the imports
# above: ``from <package> import *`` raises AttributeError for any string in
# ``__all__`` that is not a module attribute. The former entries
# 'extract_pdf_metadata' and 'format_metadata' are therefore omitted (this
# module imports only PDFMetadataExtractor and parse_pdf_date from
# pdf_metadata), and each name is listed exactly once.
__all__ = [
    # types
    'LineThickness',
    'TableDetectionStrategy',
    'ElementType',
    'PDFConfig',
    'LineInfo',
    'GridInfo',
    'CellInfo',
    'AnnotationInfo',
    'VectorTextRegion',
    'GraphicRegionInfo',
    'TableCandidate',
    'PageElement',
    'PageBorderInfo',
    # vector_text_ocr
    'VectorTextConfig',
    'VectorTextOCREngine',
    # graphic_detector
    'GraphicRegionDetector',
    # table_validator
    'TableQualityValidator',
    # line_analysis
    'LineAnalysisEngine',
    # table_detection
    'TableDetectionEngine',
    # cell_analysis
    'CellAnalysisEngine',
    # complexity_analyzer
    'ComplexityLevel',
    'ProcessingStrategy',
    'RegionComplexity',
    'PageComplexity',
    'ComplexityConfig',
    'ComplexityAnalyzer',
    # block_image_engine
    'BlockStrategy',
    'BlockImageConfig',
    'BlockImageResult',
    'MultiBlockResult',
    'BlockImageEngine',
    # layout_block_detector
    'LayoutBlockType',
    'ContentElement',
    'LayoutBlock',
    'ColumnInfo',
    'LayoutAnalysisResult',
    'LayoutDetectorConfig',
    'LayoutBlockDetector',
    # table_quality_analyzer
    'TableQuality',
    'TableQualityResult',
    'TableQualityAnalyzer',
    # pdf_metadata (class-based extractor)
    'PDFMetadataExtractor',
    'parse_pdf_date',
    # pdf_utils
    'escape_html',
    'calculate_overlap_ratio',
    'is_inside_any_bbox',
    'find_image_position',
    'get_text_lines_with_positions',
    'bbox_overlaps',
    # Image Processor
    'PDFImageProcessor',
    # pdf_text_extractor
    'extract_text_blocks',
    'split_ocr_text_to_blocks',
    # pdf_page_analyzer
    'detect_page_border',
    'is_table_likely_border',
    # pdf_element_merger
    'merge_page_elements',
    # pdf_table_processor
    # (TableAnnotationInfo is pdf_table_processor.AnnotationInfo, aliased on
    # import so it does not collide with types.AnnotationInfo)
    'TableInfo',
    'TableAnnotationInfo',
    'extract_all_tables',
    'find_and_insert_annotations',
    'add_annotation_to_table',
    'merge_adjacent_tables',
    'should_merge_tables',
    'do_merge_tables',
    'process_table_continuity',
    'extract_last_category',
    'is_single_column_table',
    'convert_single_column_to_text',
    'convert_table_to_html',
    'generate_html_from_cells',
]
|