xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/__init__.py +42 -0
- xgen_doc2chunk/chunking/__init__.py +168 -0
- xgen_doc2chunk/chunking/chunking.py +786 -0
- xgen_doc2chunk/chunking/constants.py +134 -0
- xgen_doc2chunk/chunking/page_chunker.py +248 -0
- xgen_doc2chunk/chunking/protected_regions.py +715 -0
- xgen_doc2chunk/chunking/sheet_processor.py +406 -0
- xgen_doc2chunk/chunking/table_chunker.py +832 -0
- xgen_doc2chunk/chunking/table_parser.py +172 -0
- xgen_doc2chunk/chunking/text_chunker.py +443 -0
- xgen_doc2chunk/core/__init__.py +64 -0
- xgen_doc2chunk/core/document_processor.py +1307 -0
- xgen_doc2chunk/core/functions/__init__.py +85 -0
- xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
- xgen_doc2chunk/core/functions/chart_processor.py +534 -0
- xgen_doc2chunk/core/functions/file_converter.py +220 -0
- xgen_doc2chunk/core/functions/img_processor.py +649 -0
- xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
- xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
- xgen_doc2chunk/core/functions/preprocessor.py +162 -0
- xgen_doc2chunk/core/functions/storage_backend.py +381 -0
- xgen_doc2chunk/core/functions/table_extractor.py +468 -0
- xgen_doc2chunk/core/functions/table_processor.py +299 -0
- xgen_doc2chunk/core/functions/utils.py +159 -0
- xgen_doc2chunk/core/processor/__init__.py +96 -0
- xgen_doc2chunk/core/processor/base_handler.py +544 -0
- xgen_doc2chunk/core/processor/csv_handler.py +135 -0
- xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
- xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
- xgen_doc2chunk/core/processor/doc_handler.py +579 -0
- xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
- xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/docx_handler.py +376 -0
- xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
- xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/excel_handler.py +353 -0
- xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
- xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
- xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
- xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
- xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
- xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
- xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
- xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
- xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
- xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
- xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
- xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
- xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
- xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
- xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
- xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
- xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
- xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
- xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
- xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
- xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
- xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
- xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
- xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
- xgen_doc2chunk/core/processor/text_handler.py +95 -0
- xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
- xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
- xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
- xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
- xgen_doc2chunk/ocr/__init__.py +67 -0
- xgen_doc2chunk/ocr/base.py +209 -0
- xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
- xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
- xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
- xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
- xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
- xgen_doc2chunk/ocr/ocr_processor.py +387 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
- xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
- xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,420 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Line Analysis Engine for PDF Handler
|
|
3
|
+
|
|
4
|
+
Extracts and analyzes lines from PDF drawings.
|
|
5
|
+
- Thin line detection
|
|
6
|
+
- Double line merging
|
|
7
|
+
- Incomplete border reconstruction
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
import math
|
|
12
|
+
from typing import List, Optional, Tuple
|
|
13
|
+
|
|
14
|
+
import fitz
|
|
15
|
+
|
|
16
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.types import LineInfo, GridInfo, LineThickness, PDFConfig
|
|
17
|
+
|
|
18
|
+
# Module-level logger, named after this module's import path (__name__).
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# ============================================================================
|
|
22
|
+
# Line Analysis Engine
|
|
23
|
+
# ============================================================================
|
|
24
|
+
|
|
25
|
+
class LineAnalysisEngine:
    """
    Line Analysis Engine

    Extracts and analyzes lines from PDF drawings.
    - Thin line detection
    - Double line merging
    - Incomplete border reconstruction

    Call analyze() first: it fills ``h_lines`` / ``v_lines``, which
    build_grid() then reads.
    """

    # Configuration constants (from PDFConfig or default values)
    THIN_LINE_THRESHOLD = getattr(PDFConfig, 'THIN_LINE_THRESHOLD', 0.5)
    THICK_LINE_THRESHOLD = getattr(PDFConfig, 'THICK_LINE_THRESHOLD', 2.0)
    DOUBLE_LINE_GAP = getattr(PDFConfig, 'DOUBLE_LINE_GAP', 5.0)
    LINE_MERGE_TOLERANCE = getattr(PDFConfig, 'LINE_MERGE_TOLERANCE', 3.0)
    BORDER_EXTENSION_MARGIN = getattr(PDFConfig, 'BORDER_EXTENSION_MARGIN', 20.0)

    def __init__(self, page, page_width: float, page_height: float):
        """
        Args:
            page: PyMuPDF page object
            page_width: Page width
            page_height: Page height
        """
        self.page = page
        self.page_width = page_width
        self.page_height = page_height
        self.all_lines: List[LineInfo] = []
        self.h_lines: List[LineInfo] = []  # Horizontal lines
        self.v_lines: List[LineInfo] = []  # Vertical lines

    def analyze(self) -> Tuple[List[LineInfo], List[LineInfo]]:
        """
        Perform line analysis

        Extracts lines from the page drawings, splits them into horizontal
        and vertical lists and merges double lines.

        Returns:
            Tuple of (horizontal lines list, vertical lines list)
        """
        # Start from a clean slate: the extraction and classification steps
        # only append, so without this a second analyze() call would
        # accumulate duplicates of every line.
        self.all_lines = []
        self.h_lines = []
        self.v_lines = []

        self._extract_all_lines()
        self._classify_lines()
        self._merge_double_lines()
        return self.h_lines, self.v_lines

    def _extract_all_lines(self):
        """
        Extract all lines

        A drawing whose bounding rect is thin in one direction and longer
        than 10 units in the other is recorded as a single line. Otherwise
        its individual 'l' (line) items are inspected one by one.
        """
        drawings = self.page.get_drawings()
        if not drawings:
            return

        for drawing in drawings:
            rect = drawing.get('rect')
            if not rect:
                continue

            # Rect-based line analysis
            x0, y0, x1, y1 = rect.x0, rect.y0, rect.x1, rect.y1
            w = abs(x1 - x0)
            h = abs(y1 - y0)

            # Estimate line thickness (a missing or zero width falls back to 1.0)
            stroke_width = drawing.get('width', 1.0) or 1.0

            # Determine if it's a line (horizontal or vertical): the extent
            # across the line may be at most 3 units or twice the stroke
            # width, whichever is larger.
            max_cross_extent = max(3.0, stroke_width * 2)
            is_h_line = h <= max_cross_extent and w > 10
            is_v_line = w <= max_cross_extent and h > 10

            if not (is_h_line or is_v_line):
                # Try to extract 'l' (line) from items
                for item in drawing.get('items', []):
                    if item[0] == 'l':  # line
                        self._add_line_from_points(item[1], item[2], stroke_width)
                continue

            self.all_lines.append(
                LineInfo(
                    x0=x0,
                    y0=y0,
                    x1=x1,
                    y1=y1,
                    thickness=stroke_width,
                    thickness_class=self._classify_thickness(stroke_width),
                    is_horizontal=is_h_line,
                    is_vertical=is_v_line,
                )
            )

    def _add_line_from_points(self, p1, p2, stroke_width: float):
        """
        Create a line from two points

        The segment is stored (with normalized min/max coordinates) only if
        it is nearly axis-aligned and longer than 10 units; anything else is
        ignored.

        Args:
            p1: Start point (object with ``x`` / ``y`` attributes)
            p2: End point (object with ``x`` / ``y`` attributes)
            stroke_width: Stroke width of the drawing the segment belongs to
        """
        x0, y0 = p1.x, p1.y
        x1, y1 = p2.x, p2.y

        dx = abs(x1 - x0)
        dy = abs(y1 - y0)

        # Determine line direction (within tolerance)
        is_horizontal = dy < 3 and dx > 10
        is_vertical = dx < 3 and dy > 10

        if not (is_horizontal or is_vertical):
            return

        self.all_lines.append(
            LineInfo(
                x0=min(x0, x1),
                y0=min(y0, y1),
                x1=max(x0, x1),
                y1=max(y0, y1),
                thickness=stroke_width,
                thickness_class=self._classify_thickness(stroke_width),
                is_horizontal=is_horizontal,
                is_vertical=is_vertical,
            )
        )

    def _classify_thickness(self, thickness: float) -> LineThickness:
        """
        Classify line thickness

        Returns THIN below THIN_LINE_THRESHOLD, THICK above
        THICK_LINE_THRESHOLD and NORMAL otherwise (both bounds inclusive).
        """
        if thickness < self.THIN_LINE_THRESHOLD:
            return LineThickness.THIN
        if thickness > self.THICK_LINE_THRESHOLD:
            return LineThickness.THICK
        return LineThickness.NORMAL

    def _classify_lines(self):
        """
        Classify horizontal/vertical lines

        Appends every entry of ``all_lines`` to ``h_lines`` or ``v_lines``.
        A line flagged as both is treated as horizontal.
        """
        for line in self.all_lines:
            if line.is_horizontal:
                self.h_lines.append(line)
            elif line.is_vertical:
                self.v_lines.append(line)

    def _merge_double_lines(self):
        """Merge double lines (horizontal and vertical lists independently)"""
        self.h_lines = self._merge_parallel_lines(self.h_lines, is_horizontal=True)
        self.v_lines = self._merge_parallel_lines(self.v_lines, is_horizontal=False)

    def _merge_parallel_lines(self, lines: List[LineInfo], is_horizontal: bool) -> List[LineInfo]:
        """
        Merge parallel double lines

        Args:
            lines: Lines that all share the given orientation
            is_horizontal: True for horizontal lines, False for vertical

        Returns:
            New list in which each group of double lines is replaced by one
            merged line (the input list itself if it has fewer than 2 items)
        """
        if len(lines) < 2:
            return lines

        # Sort by position: primary key is the coordinate across the lines
        if is_horizontal:
            sorted_lines = sorted(lines, key=lambda l: (l.y0, l.x0))
            position_attr = 'y0'
        else:
            sorted_lines = sorted(lines, key=lambda l: (l.x0, l.y0))
            position_attr = 'x0'

        merged: List[LineInfo] = []
        used = set()  # indexes already absorbed into an earlier line

        for i, line1 in enumerate(sorted_lines):
            if i in used:
                continue

            merged_line = line1
            base_position = getattr(line1, position_attr)

            for j in range(i + 1, len(sorted_lines)):
                if j in used:
                    continue

                line2 = sorted_lines[j]

                # The list is sorted by position, so once one candidate is
                # farther away than DOUBLE_LINE_GAP every later one is too
                # (and _is_double_line would reject them all).
                if getattr(line2, position_attr) - base_position > self.DOUBLE_LINE_GAP:
                    break

                # Candidates are always compared against the original line1,
                # not against the progressively merged line.
                if self._is_double_line(line1, line2, is_horizontal):
                    # Merge two lines (middle position, maximum range)
                    merged_line = self._merge_two_lines(merged_line, line2, is_horizontal)
                    used.add(j)

            merged.append(merged_line)

        return merged

    def _is_double_line(self, line1: LineInfo, line2: LineInfo, is_horizontal: bool) -> bool:
        """
        Determine if two lines form a double line

        Two lines qualify when their positions differ by at most
        DOUBLE_LINE_GAP and their ranges along the line direction overlap by
        more than half the length of the shorter line.
        """
        if is_horizontal:
            # Y coordinate difference is small and X ranges overlap
            gap = abs(line1.y0 - line2.y0)
            overlap = min(line1.x1, line2.x1) - max(line1.x0, line2.x0)
        else:
            # X coordinate difference is small and Y ranges overlap
            gap = abs(line1.x0 - line2.x0)
            overlap = min(line1.y1, line2.y1) - max(line1.y0, line2.y0)

        if gap > self.DOUBLE_LINE_GAP:
            return False

        min_length = min(self._get_line_length(line1), self._get_line_length(line2))
        return overlap > min_length * 0.5

    def _get_line_length(self, line: LineInfo) -> float:
        """Calculate line length (Euclidean distance between the end points)"""
        return math.hypot(line.x1 - line.x0, line.y1 - line.y0)

    def _merge_two_lines(self, line1: LineInfo, line2: LineInfo, is_horizontal: bool) -> LineInfo:
        """
        Merge two lines

        The merged line sits at the mean position of the two inputs (based
        on y0 for horizontal, x0 for vertical lines), covers the union of
        their ranges and takes thickness and thickness class from the
        thicker one (line1 on a tie).
        """
        if line1.thickness >= line2.thickness:
            thickness_class = line1.thickness_class
        else:
            thickness_class = line2.thickness_class
        thickness = max(line1.thickness, line2.thickness)

        if is_horizontal:
            # Middle Y, maximum X range
            avg_y = (line1.y0 + line2.y0) / 2
            return LineInfo(
                x0=min(line1.x0, line2.x0),
                y0=avg_y,
                x1=max(line1.x1, line2.x1),
                y1=avg_y,
                thickness=thickness,
                thickness_class=thickness_class,
                is_horizontal=True,
                is_vertical=False,
            )

        # Middle X, maximum Y range
        avg_x = (line1.x0 + line2.x0) / 2
        return LineInfo(
            x0=avg_x,
            y0=min(line1.y0, line2.y0),
            x1=avg_x,
            y1=max(line1.y1, line2.y1),
            thickness=thickness,
            thickness_class=thickness_class,
            is_horizontal=False,
            is_vertical=True,
        )

    def build_grid(self, tolerance: Optional[float] = None) -> Optional[GridInfo]:
        """
        Build grid from lines

        Clusters the Y positions of the horizontal lines and the X positions
        of the vertical lines and returns them as a grid. Uses the lines
        currently stored in ``h_lines`` / ``v_lines`` (filled by analyze()).

        Args:
            tolerance: Position clustering tolerance
                (defaults to LINE_MERGE_TOLERANCE)

        Returns:
            GridInfo, or None when fewer than two distinct horizontal or
            vertical positions are available
        """
        if tolerance is None:
            tolerance = self.LINE_MERGE_TOLERANCE

        if not self.h_lines and not self.v_lines:
            return None

        # Collect Y coordinates (horizontal lines)
        h_positions = self._cluster_positions(
            [line.y0 for line in self.h_lines],
            tolerance,
        )

        # Collect X coordinates (vertical lines)
        v_positions = self._cluster_positions(
            [line.x0 for line in self.v_lines],
            tolerance,
        )

        if len(h_positions) < 2 or len(v_positions) < 2:
            return None

        # Check border completeness
        is_complete = self._check_border_completeness(h_positions, v_positions)

        return GridInfo(
            h_lines=sorted(h_positions),
            v_lines=sorted(v_positions),
            # bbox is spanned by the outermost clustered positions
            bbox=(min(v_positions), min(h_positions), max(v_positions), max(h_positions)),
            is_complete=is_complete,
            reconstructed=False,
        )

    def _cluster_positions(self, positions: List[float], tolerance: float) -> List[float]:
        """
        Cluster similar positions

        Sorted positions are chained into one cluster as long as each value
        lies within ``tolerance`` of the previous one.

        Returns:
            Mean value of each cluster, in ascending order
        """
        if not positions:
            return []

        sorted_pos = sorted(positions)
        clusters = [[sorted_pos[0]]]

        for pos in sorted_pos[1:]:
            # Compared with the last member of the cluster, not its mean
            if pos - clusters[-1][-1] <= tolerance:
                clusters[-1].append(pos)
            else:
                clusters.append([pos])

        # Return the mean value of each cluster
        return [sum(c) / len(c) for c in clusters]

    def _check_border_completeness(self, h_positions: List[float], v_positions: List[float]) -> bool:
        """
        Check border completeness

        Returns True when a horizontal line exists near both the smallest
        and largest Y position and a vertical line near both the smallest
        and largest X position (within LINE_MERGE_TOLERANCE).
        """
        if len(h_positions) < 2 or len(v_positions) < 2:
            return False

        y_min, y_max = min(h_positions), max(h_positions)
        x_min, x_max = min(v_positions), max(v_positions)
        tol = self.LINE_MERGE_TOLERANCE

        # NOTE(review): when the positions are the cluster means computed
        # from self.h_lines / self.v_lines (as build_grid does), each check
        # below is satisfied by the line with the extreme coordinate, so the
        # result appears to be always True. The checks also do not verify
        # that the outer lines span the grid's full extent - confirm intent.
        has_top = any(line.y0 <= y_min + tol for line in self.h_lines)
        has_bottom = any(line.y0 >= y_max - tol for line in self.h_lines)
        has_left = any(line.x0 <= x_min + tol for line in self.v_lines)
        has_right = any(line.x0 >= x_max - tol for line in self.v_lines)

        return has_top and has_bottom and has_left and has_right

    def reconstruct_incomplete_border(self, grid: GridInfo) -> GridInfo:
        """
        Reconstruct incomplete border

        For every side judged missing, adds an estimated outer line offset
        by BORDER_EXTENSION_MARGIN and returns a new GridInfo flagged as
        complete and reconstructed.

        Args:
            grid: Existing GridInfo

        Returns:
            Reconstructed GridInfo, or ``grid`` itself when it is already
            complete, has no lines on one axis, or nothing had to be added
        """
        if grid.is_complete:
            return grid

        h_lines = list(grid.h_lines)
        v_lines = list(grid.v_lines)

        # Nothing to extend from; min()/max() below would raise on empty input
        if not h_lines or not v_lines:
            return grid

        y_min, y_max = min(h_lines), max(h_lines)
        x_min, x_max = min(v_lines), max(v_lines)
        tol = self.LINE_MERGE_TOLERANCE
        margin = self.BORDER_EXTENSION_MARGIN

        reconstructed = False

        # NOTE(review): y_min / y_max / x_min / x_max are taken from the very
        # lists that are searched below, so with a positive
        # LINE_MERGE_TOLERANCE every has_* check succeeds and no border is
        # ever added. Confirm where the expected extremes should come from.

        # Check/add top horizontal line
        has_top = any(abs(y - y_min) < tol for y in h_lines)
        if not has_top and len(h_lines) >= 2:
            # Estimate top border
            h_lines.insert(0, y_min - margin)
            reconstructed = True

        # Check/add bottom horizontal line
        has_bottom = any(abs(y - y_max) < tol for y in h_lines)
        if not has_bottom and len(h_lines) >= 2:
            h_lines.append(y_max + margin)
            reconstructed = True

        # Check/add left vertical line
        has_left = any(abs(x - x_min) < tol for x in v_lines)
        if not has_left and len(v_lines) >= 2:
            v_lines.insert(0, x_min - margin)
            reconstructed = True

        # Check/add right vertical line
        has_right = any(abs(x - x_max) < tol for x in v_lines)
        if not has_right and len(v_lines) >= 2:
            v_lines.append(x_max + margin)
            reconstructed = True

        if not reconstructed:
            return grid

        return GridInfo(
            h_lines=sorted(h_lines),
            v_lines=sorted(v_lines),
            bbox=(min(v_lines), min(h_lines), max(v_lines), max(h_lines)),
            is_complete=True,
            reconstructed=True,
        )
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
# ============================================================================
|
|
415
|
+
# Export
|
|
416
|
+
# ============================================================================
|
|
417
|
+
|
|
418
|
+
# Public API of this module: only the engine class is exported.
__all__ = [
    'LineAnalysisEngine',
]
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py
|
|
2
|
+
"""
|
|
3
|
+
PDF Metadata Extraction Module
|
|
4
|
+
|
|
5
|
+
Provides PDFMetadataExtractor class for extracting and formatting PDF document metadata.
|
|
6
|
+
Implements BaseMetadataExtractor interface from xgen_doc2chunk.core.functions.
|
|
7
|
+
"""
|
|
8
|
+
import logging
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
from typing import Any, Dict, Optional
|
|
11
|
+
|
|
12
|
+
from xgen_doc2chunk.core.functions.metadata_extractor import (
|
|
13
|
+
BaseMetadataExtractor,
|
|
14
|
+
DocumentMetadata,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
# Module-level logger; uses the fixed "document-processor" name rather than __name__.
logger = logging.getLogger("document-processor")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class PDFMetadataExtractor(BaseMetadataExtractor):
    """
    PDF Metadata Extractor.

    Extracts metadata from PyMuPDF (fitz) document objects.

    Supported fields:
    - title, subject, author, keywords
    - create_time, last_saved_time

    Usage:
        extractor = PDFMetadataExtractor()
        metadata = extractor.extract(pdf_doc)
        text = extractor.format(metadata)
    """

    def extract(self, source: Any) -> DocumentMetadata:
        """
        Extract metadata from PDF document.

        Extraction is best-effort: any error is logged at debug level and an
        empty DocumentMetadata is returned instead of raising.

        Args:
            source: PyMuPDF document object (fitz.Document)

        Returns:
            DocumentMetadata instance containing extracted metadata.
        """
        try:
            pdf_meta = source.metadata
            if not pdf_meta:
                return DocumentMetadata()

            return DocumentMetadata(
                title=self._get_stripped(pdf_meta, 'title'),
                subject=self._get_stripped(pdf_meta, 'subject'),
                author=self._get_stripped(pdf_meta, 'author'),
                keywords=self._get_stripped(pdf_meta, 'keywords'),
                create_time=parse_pdf_date(pdf_meta.get('creationDate')),
                last_saved_time=parse_pdf_date(pdf_meta.get('modDate')),
            )
        except Exception as e:
            # self.logger is not defined in this class; presumably provided
            # by BaseMetadataExtractor - TODO confirm.
            self.logger.debug(f"[PDF] Error extracting metadata: {e}")
            return DocumentMetadata()

    def _get_stripped(self, meta: Dict[str, Any], key: str) -> Optional[str]:
        """
        Get stripped string value from metadata dict.

        Args:
            meta: Metadata mapping
            key: Field name to look up

        Returns:
            The value with surrounding whitespace removed, or None when the
            field is missing, falsy or contains only whitespace.
        """
        value = meta.get(key)
        if not value:
            return None

        # A single non-string value must not raise here: extract() would
        # catch the error and throw away every other field as well.
        if isinstance(value, bytes):
            value = value.decode('utf-8', errors='replace')
        elif not isinstance(value, str):
            value = str(value)

        # Whitespace-only values carry no information; report them as absent
        return value.strip() or None
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def parse_pdf_date(date_str: Optional[str]) -> Optional[datetime]:
    """
    Convert a PDF date string to datetime.

    Accepts the PDF layout ``D:YYYYMMDDHHmmSS`` with an optional ``D:``
    prefix and with the trailing time fields optional down to the day
    (``YYYYMMDDHHmm``, ``YYYYMMDDHH``, ``YYYYMMDD``). Anything following the
    recognised digits, such as a timezone offset, is ignored, so the result
    is always a naive datetime.

    Args:
        date_str: PDF date string (e.g., "D:20231215120000")

    Returns:
        datetime object or None
    """
    if not date_str:
        return None

    try:
        text = date_str.strip()
        if text.startswith("D:"):
            text = text[2:]
    except (AttributeError, TypeError) as e:
        # Non-string input (e.g. bytes): stay best-effort instead of raising
        logger.debug("[PDF] Error parsing date '%s': %s", date_str, e)
        return None

    # Most precise layout first. A shorter layout is tried when the string
    # is too short for a longer one or the longer slice does not parse
    # (for example a date directly followed by a timezone suffix).
    layouts = (
        (14, "%Y%m%d%H%M%S"),
        (12, "%Y%m%d%H%M"),
        (10, "%Y%m%d%H"),
        (8, "%Y%m%d"),
    )

    last_error = None
    for length, fmt in layouts:
        if len(text) < length:
            continue
        try:
            return datetime.strptime(text[:length], fmt)
        except ValueError as e:
            last_error = e

    if last_error is not None:
        logger.debug("[PDF] Error parsing date '%s': %s", text, last_error)

    return None
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# Public API of this module: the extractor class and the date helper.
__all__ = [
    "PDFMetadataExtractor",
    "parse_pdf_date",
]
|
|
101
|
+
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py
|
|
2
|
+
"""
|
|
3
|
+
PDF Page Analysis Module
|
|
4
|
+
|
|
5
|
+
Provides functions for analyzing PDF page structure including border detection.
|
|
6
|
+
"""
|
|
7
|
+
import logging
|
|
8
|
+
from typing import List, Tuple
|
|
9
|
+
|
|
10
|
+
from xgen_doc2chunk.core.processor.pdf_helpers.types import (
|
|
11
|
+
PDFConfig,
|
|
12
|
+
PageElement,
|
|
13
|
+
PageBorderInfo,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
# Module-level logger; uses the fixed "document-processor" name rather than __name__.
logger = logging.getLogger("document-processor")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def detect_page_border(page) -> PageBorderInfo:
    """
    Detects page borders (decorative).

    A drawing counts as a border side when its bounding rect is at most 10
    units thick, spans more than PDFConfig.PAGE_SPANNING_RATIO of the page in
    the other direction and lies inside the edge margin
    (PDFConfig.PAGE_BORDER_MARGIN times the shorter page side). The page is
    reported as bordered only when all four sides were found.

    Args:
        page: PyMuPDF page object

    Returns:
        PageBorderInfo object
    """
    info = PageBorderInfo()

    drawings = page.get_drawings()
    if not drawings:
        return info

    page_w = page.rect.width
    page_h = page.rect.height

    margin = min(page_w, page_h) * PDFConfig.PAGE_BORDER_MARGIN
    span_ratio = PDFConfig.PAGE_SPANNING_RATIO

    # One flag per page side, switched on as soon as a matching line is seen
    sides = dict.fromkeys(('top', 'bottom', 'left', 'right'), False)

    for drawing in drawings:
        rect = drawing.get('rect')
        if not rect:
            continue

        rect_w, rect_h = rect.width, rect.height

        # Horizontal line (small height, large width); thin lines included
        if rect_h <= 10 and rect_w > page_w * span_ratio:
            if rect.y0 < margin:
                sides['top'] = True
            elif rect.y1 > page_h - margin:
                sides['bottom'] = True

        # Vertical line (small width, large height)
        if rect_w <= 10 and rect_h > page_h * span_ratio:
            if rect.x0 < margin:
                sides['left'] = True
            elif rect.x1 > page_w - margin:
                sides['right'] = True

    # If all 4 sides present, it's a page border
    if all(sides.values()):
        info.has_border = True
        # The bbox is derived from the edge margin, not from the detected lines
        info.border_bbox = (margin, margin, page_w - margin, page_h - margin)
        info.border_lines = sides

    return info
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def is_table_likely_border(
    table_bbox: Tuple[float, float, float, float],
    border_info: PageBorderInfo,
    page
) -> bool:
    """
    Check if a table is likely a page border.

    Only applies when a page border was detected: the table is then treated
    as that border if it is wider than 85% of the page width and taller than
    85% of the page height.

    Args:
        table_bbox: Table bounding box (x0, y0, x1, y1)
        border_info: Page border information
        page: PyMuPDF page object

    Returns:
        True if table is likely a border, False otherwise
    """
    # Without a detected border (and its bbox) nothing can be mistaken for one
    if not (border_info.has_border and border_info.border_bbox):
        return False

    page_rect = page.rect
    width_limit = page_rect.width * 0.85
    height_limit = page_rect.height * 0.85

    spans_width = table_bbox[2] - table_bbox[0] > width_limit
    spans_height = table_bbox[3] - table_bbox[1] > height_limit

    return bool(spans_width and spans_height)
|
|
114
|
+
|