xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,420 @@
1
+ """
2
+ Line Analysis Engine for PDF Handler
3
+
4
+ Extracts and analyzes lines from PDF drawings.
5
+ - Thin line detection
6
+ - Double line merging
7
+ - Incomplete border reconstruction
8
+ """
9
+
10
+ import logging
11
+ import math
12
+ from typing import List, Optional, Tuple
13
+
14
+ import fitz
15
+
16
+ from xgen_doc2chunk.core.processor.pdf_helpers.types import LineInfo, GridInfo, LineThickness, PDFConfig
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ # ============================================================================
22
+ # Line Analysis Engine
23
+ # ============================================================================
24
+
25
class LineAnalysisEngine:
    """
    Line Analysis Engine

    Extracts and analyzes lines from PDF drawings.
    - Thin line detection
    - Double line merging
    - Incomplete border reconstruction

    Usage:
        engine = LineAnalysisEngine(page, page_width, page_height)
        h_lines, v_lines = engine.analyze()
        grid = engine.build_grid()
    """

    # Configuration constants (from PDFConfig or default values)
    # Stroke widths below this value are classified as THIN.
    THIN_LINE_THRESHOLD = getattr(PDFConfig, 'THIN_LINE_THRESHOLD', 0.5)
    # Stroke widths above this value are classified as THICK.
    THICK_LINE_THRESHOLD = getattr(PDFConfig, 'THICK_LINE_THRESHOLD', 2.0)
    # Maximum distance between two parallel lines for them to count as a double line.
    DOUBLE_LINE_GAP = getattr(PDFConfig, 'DOUBLE_LINE_GAP', 5.0)
    # Default clustering tolerance for grid positions and border checks.
    LINE_MERGE_TOLERANCE = getattr(PDFConfig, 'LINE_MERGE_TOLERANCE', 3.0)
    # Offset used when a missing border line is synthesized.
    BORDER_EXTENSION_MARGIN = getattr(PDFConfig, 'BORDER_EXTENSION_MARGIN', 20.0)

    def __init__(self, page, page_width: float, page_height: float):
        """
        Args:
            page: PyMuPDF page object
            page_width: Page width
            page_height: Page height
        """
        self.page = page
        self.page_width = page_width
        self.page_height = page_height
        self.all_lines: List[LineInfo] = []
        self.h_lines: List[LineInfo] = []  # Horizontal lines
        self.v_lines: List[LineInfo] = []  # Vertical lines

    def analyze(self) -> Tuple[List[LineInfo], List[LineInfo]]:
        """
        Perform line analysis

        Extracts lines from the page drawings, splits them into horizontal
        and vertical sets, and merges double lines. The results are also kept
        on ``self.h_lines`` / ``self.v_lines`` for ``build_grid()``.

        Returns:
            Tuple of (horizontal lines list, vertical lines list)
        """
        # Start from a clean slate: without this, calling analyze() again on
        # the same engine would re-extract every drawing and accumulate
        # duplicates on top of the previous results.
        self.all_lines = []
        self.h_lines = []
        self.v_lines = []

        self._extract_all_lines()
        self._classify_lines()
        self._merge_double_lines()
        return self.h_lines, self.v_lines

    def _extract_all_lines(self) -> None:
        """Extract all horizontal/vertical lines from the page drawings into ``self.all_lines``"""
        drawings = self.page.get_drawings()
        if not drawings:
            return

        for drawing in drawings:
            # Extract line information
            items = drawing.get('items', [])
            rect = drawing.get('rect')

            if not rect:
                continue

            # Rect-based line analysis
            x0, y0, x1, y1 = rect.x0, rect.y0, rect.x1, rect.y1
            w = abs(x1 - x0)
            h = abs(y1 - y0)

            # Estimate line thickness (a missing or zero width falls back to 1.0)
            stroke_width = drawing.get('width', 1.0) or 1.0

            # Determine if it's a line (horizontal or vertical): the bounding
            # rect must be flat in one direction and longer than 10 in the other
            max_extent = max(3.0, stroke_width * 2)
            is_h_line = h <= max_extent and w > 10
            is_v_line = w <= max_extent and h > 10

            if not (is_h_line or is_v_line):
                # The bounding rect is not line-shaped, so look at the
                # individual 'l' (line) items of the drawing instead
                for item in items:
                    if item[0] == 'l':  # line
                        p1, p2 = item[1], item[2]
                        self._add_line_from_points(p1, p2, stroke_width)
                continue

            # Classify thickness
            thickness_class = self._classify_thickness(stroke_width)

            # The whole bounding rect is stored as the line's extent
            line_info = LineInfo(
                x0=x0,
                y0=y0,
                x1=x1,
                y1=y1,
                thickness=stroke_width,
                thickness_class=thickness_class,
                is_horizontal=is_h_line,
                is_vertical=is_v_line
            )

            self.all_lines.append(line_info)

    def _add_line_from_points(self, p1, p2, stroke_width: float) -> None:
        """
        Create a line from two points

        The segment is appended to ``self.all_lines`` only when it is close to
        horizontal or vertical; anything else is ignored.

        Args:
            p1: Start point (object with ``x`` / ``y`` attributes)
            p2: End point (object with ``x`` / ``y`` attributes)
            stroke_width: Stroke width of the drawing the segment belongs to
        """
        x0, y0 = p1.x, p1.y
        x1, y1 = p2.x, p2.y

        dx = abs(x1 - x0)
        dy = abs(y1 - y0)

        # Determine line direction (within tolerance)
        is_horizontal = dy < 3 and dx > 10
        is_vertical = dx < 3 and dy > 10

        if not (is_horizontal or is_vertical):
            return

        thickness_class = self._classify_thickness(stroke_width)

        # Normalize so that (x0, y0) is the smaller corner
        line_info = LineInfo(
            x0=min(x0, x1),
            y0=min(y0, y1),
            x1=max(x0, x1),
            y1=max(y0, y1),
            thickness=stroke_width,
            thickness_class=thickness_class,
            is_horizontal=is_horizontal,
            is_vertical=is_vertical
        )

        self.all_lines.append(line_info)

    def _classify_thickness(self, thickness: float) -> LineThickness:
        """Classify line thickness as THIN, NORMAL or THICK using the class thresholds"""
        if thickness < self.THIN_LINE_THRESHOLD:
            return LineThickness.THIN
        elif thickness > self.THICK_LINE_THRESHOLD:
            return LineThickness.THICK
        return LineThickness.NORMAL

    def _classify_lines(self) -> None:
        """Classify horizontal/vertical lines"""
        for line in self.all_lines:
            # A line flagged as both is treated as horizontal
            if line.is_horizontal:
                self.h_lines.append(line)
            elif line.is_vertical:
                self.v_lines.append(line)

    def _merge_double_lines(self) -> None:
        """Merge double lines"""
        # Merge horizontal lines
        self.h_lines = self._merge_parallel_lines(self.h_lines, is_horizontal=True)
        # Merge vertical lines
        self.v_lines = self._merge_parallel_lines(self.v_lines, is_horizontal=False)

    def _merge_parallel_lines(self, lines: List[LineInfo], is_horizontal: bool) -> List[LineInfo]:
        """
        Merge parallel double lines

        Args:
            lines: Lines sharing one orientation
            is_horizontal: True if ``lines`` are horizontal, False if vertical

        Returns:
            List in which every group of double lines is replaced by one merged
            line. With fewer than two lines the input list itself is returned.
        """
        if len(lines) < 2:
            return lines

        merged = []
        used = set()

        # Sort by position
        if is_horizontal:
            sorted_lines = sorted(lines, key=lambda l: (l.y0, l.x0))
        else:
            sorted_lines = sorted(lines, key=lambda l: (l.x0, l.y0))

        for i, line1 in enumerate(sorted_lines):
            if i in used:
                continue

            merged_line = line1

            for j in range(i + 1, len(sorted_lines)):
                if j in used:
                    continue

                line2 = sorted_lines[j]

                # Check if double line. Candidates are compared against the
                # anchor line1, not against the accumulated merged_line.
                if self._is_double_line(line1, line2, is_horizontal):
                    # Merge two lines (middle position, maximum range)
                    merged_line = self._merge_two_lines(merged_line, line2, is_horizontal)
                    used.add(j)

            merged.append(merged_line)
            used.add(i)

        return merged

    def _is_double_line(self, line1: LineInfo, line2: LineInfo, is_horizontal: bool) -> bool:
        """
        Determine if two lines form a double line

        Two lines qualify when they are at most DOUBLE_LINE_GAP apart across
        their direction and overlap along their direction by more than half
        the length of the shorter line.
        """
        if is_horizontal:
            # Double line if Y coordinate difference is small and X ranges overlap
            y_gap = abs(line1.y0 - line2.y0)
            if y_gap > self.DOUBLE_LINE_GAP:
                return False

            # Check X range overlap
            x_overlap = min(line1.x1, line2.x1) - max(line1.x0, line2.x0)
            min_length = min(self._get_line_length(line1), self._get_line_length(line2))
            return x_overlap > min_length * 0.5
        else:
            # Double line if X coordinate difference is small and Y ranges overlap
            x_gap = abs(line1.x0 - line2.x0)
            if x_gap > self.DOUBLE_LINE_GAP:
                return False

            # Check Y range overlap
            y_overlap = min(line1.y1, line2.y1) - max(line1.y0, line2.y0)
            min_length = min(self._get_line_length(line1), self._get_line_length(line2))
            return y_overlap > min_length * 0.5

    def _get_line_length(self, line: LineInfo) -> float:
        """Calculate line length (distance between the two stored corners)"""
        return math.sqrt((line.x1 - line.x0) ** 2 + (line.y1 - line.y0) ** 2)

    def _merge_two_lines(self, line1: LineInfo, line2: LineInfo, is_horizontal: bool) -> LineInfo:
        """
        Merge two lines

        The merged line sits midway between the two inputs, spans their
        combined range, and takes thickness and thickness class from the
        thicker input (line1 on a tie).
        """
        if is_horizontal:
            # Middle Y, maximum X range
            avg_y = (line1.y0 + line2.y0) / 2
            return LineInfo(
                x0=min(line1.x0, line2.x0),
                y0=avg_y,
                x1=max(line1.x1, line2.x1),
                y1=avg_y,
                thickness=max(line1.thickness, line2.thickness),
                thickness_class=line1.thickness_class if line1.thickness >= line2.thickness else line2.thickness_class,
                is_horizontal=True,
                is_vertical=False
            )
        else:
            # Middle X, maximum Y range
            avg_x = (line1.x0 + line2.x0) / 2
            return LineInfo(
                x0=avg_x,
                y0=min(line1.y0, line2.y0),
                x1=avg_x,
                y1=max(line1.y1, line2.y1),
                thickness=max(line1.thickness, line2.thickness),
                thickness_class=line1.thickness_class if line1.thickness >= line2.thickness else line2.thickness_class,
                is_horizontal=False,
                is_vertical=True
            )

    def build_grid(self, tolerance: Optional[float] = None) -> Optional[GridInfo]:
        """
        Build grid from lines

        Clusters the Y positions of the horizontal lines and the X positions
        of the vertical lines found by ``analyze()`` and returns them as a
        grid structure.

        Args:
            tolerance: Position clustering tolerance
                (defaults to LINE_MERGE_TOLERANCE)

        Returns:
            GridInfo, or None when fewer than two distinct horizontal or
            vertical positions are available
        """
        if tolerance is None:
            tolerance = self.LINE_MERGE_TOLERANCE

        if not self.h_lines and not self.v_lines:
            return None

        # Collect Y coordinates (horizontal lines)
        h_positions = self._cluster_positions(
            [line.y0 for line in self.h_lines],
            tolerance
        )

        # Collect X coordinates (vertical lines)
        v_positions = self._cluster_positions(
            [line.x0 for line in self.v_lines],
            tolerance
        )

        if len(h_positions) < 2 or len(v_positions) < 2:
            return None

        # Calculate bbox
        x0 = min(v_positions)
        y0 = min(h_positions)
        x1 = max(v_positions)
        y1 = max(h_positions)

        # Check border completeness
        is_complete = self._check_border_completeness(h_positions, v_positions)

        return GridInfo(
            h_lines=sorted(h_positions),
            v_lines=sorted(v_positions),
            bbox=(x0, y0, x1, y1),
            is_complete=is_complete,
            reconstructed=False
        )

    def _cluster_positions(self, positions: List[float], tolerance: float) -> List[float]:
        """
        Cluster similar positions

        Sorted positions are chained: a value joins the current cluster when
        it is within ``tolerance`` of that cluster's last (largest) member.

        Returns:
            Mean value of each cluster, in ascending order
        """
        if not positions:
            return []

        sorted_pos = sorted(positions)
        clusters = [[sorted_pos[0]]]

        for pos in sorted_pos[1:]:
            if pos - clusters[-1][-1] <= tolerance:
                clusters[-1].append(pos)
            else:
                clusters.append([pos])

        # Return the mean value of each cluster
        return [sum(c) / len(c) for c in clusters]

    def _check_border_completeness(self, h_positions: List[float], v_positions: List[float]) -> bool:
        """
        Check border completeness

        Returns True when a line lies within LINE_MERGE_TOLERANCE of each of
        the four extreme grid positions.
        """
        if len(h_positions) < 2 or len(v_positions) < 2:
            return False

        y_min, y_max = min(h_positions), max(h_positions)
        x_min, x_max = min(v_positions), max(v_positions)

        # NOTE(review): when the positions are the clustered coordinates of
        # self.h_lines / self.v_lines (as in build_grid), the outermost line is
        # always within tolerance of the outermost cluster mean, so all four
        # checks look trivially satisfied - confirm the intended criterion.

        # Check if there are enough horizontal lines at top/bottom
        has_top = any(line.y0 <= y_min + self.LINE_MERGE_TOLERANCE for line in self.h_lines)
        has_bottom = any(line.y0 >= y_max - self.LINE_MERGE_TOLERANCE for line in self.h_lines)

        # Check if there are enough vertical lines at left/right
        has_left = any(line.x0 <= x_min + self.LINE_MERGE_TOLERANCE for line in self.v_lines)
        has_right = any(line.x0 >= x_max - self.LINE_MERGE_TOLERANCE for line in self.v_lines)

        return all([has_top, has_bottom, has_left, has_right])

    def reconstruct_incomplete_border(self, grid: GridInfo) -> GridInfo:
        """
        Reconstruct incomplete border

        Completes to 4 sides if 3 or more sides exist.

        Args:
            grid: Existing GridInfo

        Returns:
            Reconstructed GridInfo, or ``grid`` itself when it is already
            complete, has no line positions, or nothing had to be added
        """
        if grid.is_complete:
            return grid

        h_lines = list(grid.h_lines)
        v_lines = list(grid.v_lines)

        # min()/max() below would raise ValueError on an empty axis
        if not h_lines or not v_lines:
            return grid

        y_min, y_max = min(h_lines), max(h_lines)
        x_min, x_max = min(v_lines), max(v_lines)

        reconstructed = False

        # NOTE(review): y_min / y_max / x_min / x_max are taken from the very
        # lists searched below, so with a positive LINE_MERGE_TOLERANCE every
        # has_* check is satisfied and no border line is ever added. Detecting
        # a genuinely missing side probably needs the real line extents; the
        # logic is left unchanged until the intent is confirmed.

        # Check/add top horizontal line
        has_top = any(abs(y - y_min) < self.LINE_MERGE_TOLERANCE for y in h_lines)
        if not has_top and len(h_lines) >= 2:
            # Estimate top border
            h_lines.insert(0, y_min - self.BORDER_EXTENSION_MARGIN)
            reconstructed = True

        # Check/add bottom horizontal line
        has_bottom = any(abs(y - y_max) < self.LINE_MERGE_TOLERANCE for y in h_lines)
        if not has_bottom and len(h_lines) >= 2:
            h_lines.append(y_max + self.BORDER_EXTENSION_MARGIN)
            reconstructed = True

        # Check/add left vertical line
        has_left = any(abs(x - x_min) < self.LINE_MERGE_TOLERANCE for x in v_lines)
        if not has_left and len(v_lines) >= 2:
            v_lines.insert(0, x_min - self.BORDER_EXTENSION_MARGIN)
            reconstructed = True

        # Check/add right vertical line
        has_right = any(abs(x - x_max) < self.LINE_MERGE_TOLERANCE for x in v_lines)
        if not has_right and len(v_lines) >= 2:
            v_lines.append(x_max + self.BORDER_EXTENSION_MARGIN)
            reconstructed = True

        if not reconstructed:
            return grid

        new_x0 = min(v_lines)
        new_y0 = min(h_lines)
        new_x1 = max(v_lines)
        new_y1 = max(h_lines)

        return GridInfo(
            h_lines=sorted(h_lines),
            v_lines=sorted(v_lines),
            bbox=(new_x0, new_y0, new_x1, new_y1),
            is_complete=True,
            reconstructed=True
        )
413
+
414
+ # ============================================================================
415
+ # Export
416
+ # ============================================================================
417
+
418
+ __all__ = [
419
+ 'LineAnalysisEngine',
420
+ ]
@@ -0,0 +1,101 @@
1
+ # xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py
2
+ """
3
+ PDF Metadata Extraction Module
4
+
5
+ Provides PDFMetadataExtractor class for extracting and formatting PDF document metadata.
6
+ Implements BaseMetadataExtractor interface from xgen_doc2chunk.core.functions.
7
+ """
8
+ import logging
9
+ from datetime import datetime
10
+ from typing import Any, Dict, Optional
11
+
12
+ from xgen_doc2chunk.core.functions.metadata_extractor import (
13
+ BaseMetadataExtractor,
14
+ DocumentMetadata,
15
+ )
16
+
17
+ logger = logging.getLogger("document-processor")
18
+
19
+
20
class PDFMetadataExtractor(BaseMetadataExtractor):
    """
    PDF Metadata Extractor.

    Extracts metadata from PyMuPDF (fitz) document objects.

    Supported fields:
    - title, subject, author, keywords
    - create_time, last_saved_time

    Usage:
        extractor = PDFMetadataExtractor()
        metadata = extractor.extract(pdf_doc)
        text = extractor.format(metadata)
    """

    def extract(self, source: Any) -> DocumentMetadata:
        """
        Extract metadata from PDF document.

        Extraction is best-effort: any error is logged at debug level and an
        empty DocumentMetadata is returned instead of raising.

        Args:
            source: PyMuPDF document object (fitz.Document)

        Returns:
            DocumentMetadata instance containing extracted metadata.
        """
        try:
            pdf_meta = source.metadata
            if not pdf_meta:
                return DocumentMetadata()

            return DocumentMetadata(
                title=self._get_stripped(pdf_meta, 'title'),
                subject=self._get_stripped(pdf_meta, 'subject'),
                author=self._get_stripped(pdf_meta, 'author'),
                keywords=self._get_stripped(pdf_meta, 'keywords'),
                create_time=parse_pdf_date(pdf_meta.get('creationDate')),
                last_saved_time=parse_pdf_date(pdf_meta.get('modDate')),
            )
        except Exception as e:
            self.logger.debug(f"[PDF] Error extracting metadata: {e}")
            return DocumentMetadata()

    def _get_stripped(self, meta: Dict[str, Any], key: str) -> Optional[str]:
        """
        Get stripped string value from metadata dict.

        Args:
            meta: Metadata dictionary
            key: Field name to look up

        Returns:
            The value with surrounding whitespace removed, or None when the
            field is missing, not a string, or blank.
        """
        value = meta.get(key)
        # A non-string value used to raise on .strip(), which made extract()
        # discard every field; treat it as "not available" instead.
        if not isinstance(value, str):
            return None
        # Whitespace-only values are reported as None, like empty ones.
        return value.strip() or None
67
+
68
+
69
def parse_pdf_date(date_str: Optional[str]) -> Optional[datetime]:
    """
    Convert a PDF date string to datetime.

    PDF dates have the form ``D:YYYYMMDDHHmmSSOHH'mm'`` in which every field
    after the year is optional. The longest complete prefix of leading digits
    is parsed (year, year-month, date, and date plus hour / minute / second).
    Any trailing UTC offset is ignored, so the result is a naive datetime.

    Args:
        date_str: PDF date string (e.g., "D:20231215120000")

    Returns:
        datetime object or None
    """
    if not date_str:
        return None

    # Supported precisions, most specific first: (digit count, strptime format)
    formats = (
        (14, "%Y%m%d%H%M%S"),
        (12, "%Y%m%d%H%M"),
        (10, "%Y%m%d%H"),
        (8, "%Y%m%d"),
        (6, "%Y%m"),
        (4, "%Y"),
    )

    try:
        if date_str.startswith("D:"):
            date_str = date_str[2:]

        # Keep only the leading run of ASCII digits (at most 14), so a
        # timezone suffix or other trailing text cannot break the parse.
        digits = ""
        for ch in date_str[:14]:
            if ch not in "0123456789":
                break
            digits += ch

        for length, fmt in formats:
            if len(digits) >= length:
                return datetime.strptime(digits[:length], fmt)

    except (AttributeError, TypeError, ValueError) as e:
        # AttributeError/TypeError: input is not a str; ValueError: digits do
        # not form a valid date (e.g. month 13).
        logger.debug(f"[PDF] Error parsing date '{date_str}': {e}")

    return None
95
+
96
+
97
+ __all__ = [
98
+ "PDFMetadataExtractor",
99
+ "parse_pdf_date",
100
+ ]
101
+
@@ -0,0 +1,114 @@
1
+ # xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py
2
+ """
3
+ PDF Page Analysis Module
4
+
5
+ Provides functions for analyzing PDF page structure including border detection.
6
+ """
7
+ import logging
8
+ from typing import List, Tuple
9
+
10
+ from xgen_doc2chunk.core.processor.pdf_helpers.types import (
11
+ PDFConfig,
12
+ PageElement,
13
+ PageBorderInfo,
14
+ )
15
+
16
+ logger = logging.getLogger("document-processor")
17
+
18
+
19
def detect_page_border(page) -> PageBorderInfo:
    """
    Detects page borders (decorative).

    A drawing counts as a border side when its bounding rect is at most 10
    units thick, spans more than PDFConfig.PAGE_SPANNING_RATIO of the page in
    the other direction, and lies inside the edge margin of the page. A border
    is reported only when all four sides are found.

    Args:
        page: PyMuPDF page object

    Returns:
        PageBorderInfo object (left at its defaults when no border is found)
    """
    info = PageBorderInfo()

    drawings = page.get_drawings()
    if not drawings:
        return info

    page_w = page.rect.width
    page_h = page.rect.height

    # The edge margin scales with the shorter page side
    margin = min(page_w, page_h) * PDFConfig.PAGE_BORDER_MARGIN
    span_ratio = PDFConfig.PAGE_SPANNING_RATIO

    # Which of the four sides have been seen so far
    sides = dict.fromkeys(('top', 'bottom', 'left', 'right'), False)

    for drawing in drawings:
        rect = drawing.get('rect')
        if not rect:
            continue

        width = rect.width
        height = rect.height

        # Flat and wide: candidate for the top or bottom side
        if height <= 10 and width > page_w * span_ratio:
            if rect.y0 < margin:
                sides['top'] = True
            elif rect.y1 > page_h - margin:
                sides['bottom'] = True

        # Narrow and tall: candidate for the left or right side
        if width <= 10 and height > page_h * span_ratio:
            if rect.x0 < margin:
                sides['left'] = True
            elif rect.x1 > page_w - margin:
                sides['right'] = True

    # Only a frame with all 4 sides is treated as a page border
    if all(sides.values()):
        info.has_border = True
        info.border_bbox = (margin, margin, page_w - margin, page_h - margin)
        info.border_lines = sides

    return info
83
+
84
+
85
def is_table_likely_border(
    table_bbox: Tuple[float, float, float, float],
    border_info: PageBorderInfo,
    page
) -> bool:
    """
    Check if a table is likely a page border.

    The table is considered a border when a page border was detected and the
    table covers more than 85% of both the page width and the page height.
    ``border_info.border_bbox`` is only checked for presence; it is not
    compared against the table.

    Args:
        table_bbox: Table bounding box
        border_info: Page border information
        page: PyMuPDF page object

    Returns:
        True if table is likely a border, False otherwise
    """
    # Without a detected page border there is nothing the table could be
    if not (border_info.has_border and border_info.border_bbox):
        return False

    page_w = page.rect.width
    page_h = page.rect.height

    covers_width = (table_bbox[2] - table_bbox[0]) > page_w * 0.85
    covers_height = (table_bbox[3] - table_bbox[1]) > page_h * 0.85

    return bool(covers_width and covers_height)
114
+