xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.1.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,183 @@
1
+ # xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py
2
+ """
3
+ PDF Common Utilities Module
4
+
5
+ Provides common utility functions for PDF processing.
6
+ """
7
+ import logging
8
+ from typing import Dict, List, Optional, Tuple
9
+
10
+ logger = logging.getLogger("document-processor")
11
+
12
+
13
+ # ============================================================================
14
+ # HTML Escape
15
+ # ============================================================================
16
+
17
def escape_html(text: str) -> str:
    """
    Escape HTML special characters.

    The ampersand is replaced first so that the entities produced for the
    other characters are not escaped a second time.

    Args:
        text: Original text. Falsy values (``None`` or ``""``) yield ``""``.

    Returns:
        Text with ``&``, ``<``, ``>`` and ``"`` replaced by their HTML entities
    """
    if not text:
        return ""
    return (text
            .replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
            .replace('"', "&quot;"))
34
+
35
+
36
+ # ============================================================================
37
+ # Bounding Box Utilities
38
+ # ============================================================================
39
+
40
def calculate_overlap_ratio(
    bbox1: Tuple[float, float, float, float],
    bbox2: Tuple[float, float, float, float]
) -> float:
    """
    Compute how much of ``bbox1`` is covered by ``bbox2``.

    Args:
        bbox1: Reference bbox (x0, y0, x1, y1); the ratio is relative to its area
        bbox2: Other bbox (x0, y0, x1, y1)

    Returns:
        Intersection area divided by the area of bbox1; 0.0 when the boxes
        do not intersect or bbox1 has no positive area
    """
    # Corners of the intersection rectangle.
    left, top = max(bbox1[0], bbox2[0]), max(bbox1[1], bbox2[1])
    right, bottom = min(bbox1[2], bbox2[2]), min(bbox1[3], bbox2[3])

    # Empty or degenerate intersection.
    if right <= left or bottom <= top:
        return 0.0

    reference_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
    if reference_area <= 0:
        return 0.0

    intersection_area = (right - left) * (bottom - top)
    return intersection_area / reference_area
69
+
70
+
71
def is_inside_any_bbox(
    bbox: Tuple[float, float, float, float],
    bbox_list: List[Tuple[float, float, float, float]],
    threshold: float = 0.5
) -> bool:
    """
    Tell whether ``bbox`` is sufficiently covered by any bbox in a list.

    Args:
        bbox: Bounding box to check
        bbox_list: Candidate bounding boxes
        threshold: Overlap ratio (relative to ``bbox``) that must be exceeded

    Returns:
        True as soon as one candidate covers more than ``threshold`` of
        ``bbox``; False otherwise (including for an empty list)
    """
    return any(
        calculate_overlap_ratio(bbox, candidate) > threshold
        for candidate in bbox_list
    )
92
+
93
+
94
def bbox_overlaps(bbox1: Tuple, bbox2: Tuple) -> bool:
    """
    Tell whether two bboxes share a region of positive area.

    Boxes that merely touch along an edge or at a corner do not count as
    overlapping.

    Args:
        bbox1: First bbox (x0, y0, x1, y1)
        bbox2: Second bbox (x0, y0, x1, y1)

    Returns:
        True if overlapping, False otherwise
    """
    # Separated (or only touching) along the x axis.
    if bbox1[2] <= bbox2[0] or bbox1[0] >= bbox2[2]:
        return False
    # Separated (or only touching) along the y axis.
    if bbox1[3] <= bbox2[1] or bbox1[1] >= bbox2[3]:
        return False
    return True
111
+
112
+
113
+ # ============================================================================
114
+ # Image Position Detection
115
+ # ============================================================================
116
+
117
def find_image_position(page, xref: int) -> Optional[Tuple[float, float, float, float]]:
    """
    Look up where an image is placed on a page.

    Args:
        page: PyMuPDF page object
        xref: Image xref

    Returns:
        The bbox of the first image entry with a matching xref and a
        non-empty bbox, as a tuple; None if there is no such entry or the
        lookup raises
    """
    try:
        # Bboxes of every entry whose xref matches, in page order.
        matching_boxes = (
            info.get("bbox")
            for info in page.get_image_info(xrefs=True)
            if info.get("xref") == xref
        )
        located = next((box for box in matching_boxes if box), None)
        return tuple(located) if located else None

    except Exception as e:
        # Best-effort lookup: a failure is reported as "position unknown".
        logger.debug(f"[PDF] Error finding image position: {e}")
        return None
142
+
143
+
144
+ # ============================================================================
145
+ # Text Line Extraction
146
+ # ============================================================================
147
+
148
def get_text_lines_with_positions(page) -> List[Dict]:
    """
    Collect the non-empty text lines of a page together with their extents.

    Args:
        page: PyMuPDF page object

    Returns:
        One dict per non-blank line with keys 'text', 'y0', 'y1', 'x0', 'x1'.
        The text is the concatenation of the line's spans, stripped; the
        coordinates come from the line bbox ((0, 0, 0, 0) when absent).
    """
    collected: List[Dict] = []
    layout = page.get_text("dict", sort=True)

    # Only blocks of type 0 are processed; every other block type is skipped.
    text_blocks = (blk for blk in layout.get("blocks", []) if blk.get("type") == 0)

    for blk in text_blocks:
        for entry in blk.get("lines", []):
            extent = entry.get("bbox", (0, 0, 0, 0))
            content = "".join(
                piece.get("text", "") for piece in entry.get("spans", [])
            ).strip()

            if not content:
                continue

            collected.append({
                'text': content,
                'y0': extent[1],
                'y1': extent[3],
                'x0': extent[0],
                'x1': extent[2]
            })

    return collected
183
+
@@ -0,0 +1,302 @@
1
+ """
2
+ Vector Text OCR Engine for PDF Handler
3
+
4
+ Detects regions in PDFs where text is rendered as vector curves (Bézier curves)
5
+ rather than font glyphs, and extracts text using OCR.
6
+ """
7
+
8
+ import io
9
+ import logging
10
+ from typing import List, Dict, Tuple, Optional
11
+
12
+ import fitz
13
+ from PIL import Image
14
+ import pytesseract
15
+
16
+ from xgen_doc2chunk.core.processor.pdf_helpers.types import VectorTextRegion
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ # ============================================================================
22
+ # Configuration for Vector Text OCR
23
+ # ============================================================================
24
+
25
class VectorTextConfig:
    """Tunable settings for vector text detection and OCR."""

    # --- Detection ---
    # Minimum number of drawing items
    MIN_ITEMS = 5
    # Maximum height for vector text regions
    MAX_HEIGHT = 50.0

    # --- OCR ---
    # OCR language
    OCR_LANG = 'kor+eng'
    # Rendering scale factor for OCR
    OCR_SCALE = 3.0
31
+
32
+
33
+ # ============================================================================
34
+ # Vector Text OCR Engine
35
+ # ============================================================================
36
+
37
class VectorTextOCREngine:
    """
    Vector Text OCR Engine

    Detects regions in PDFs where text is rendered as vector curves (Bézier curves)
    rather than font glyphs, and extracts text using OCR.

    Why is this needed?
    - Some PDFs convert text to outlines to avoid font embedding issues
    - Design programs (Illustrator, InDesign, etc.) apply "Create Outlines"
    - In these cases, regular text extraction cannot retrieve the content
    """

    # Two drawings are on the same line when both their top and bottom edges
    # differ by less than this amount.
    _SAME_LINE_TOLERANCE = 5
    # Drawings on the same line are merged when the horizontal gap between
    # them is below this amount (20pt).
    _MAX_HORIZONTAL_GAP = 20
    # Margin added around a region before it is rendered for OCR.
    _OCR_PADDING = 5

    def __init__(self, page, page_num: int):
        """
        Args:
            page: PyMuPDF page object
            page_num: Page number (0-indexed)
        """
        self.page = page
        self.page_num = page_num
        self.page_width = page.rect.width
        self.page_height = page.rect.height
        self.vector_regions: "List[VectorTextRegion]" = []

    def detect_and_extract(self) -> "List[VectorTextRegion]":
        """
        Detect vector text regions and extract using OCR.

        Returns:
            List of VectorTextRegion whose OCR text is non-blank
        """
        # 1. Detect vector text regions
        self._detect_vector_text_regions()

        if not self.vector_regions:
            return []

        logger.info(f"[VectorTextOCR] Page {self.page_num + 1}: Found {len(self.vector_regions)} vector text regions")

        # 2. Perform OCR for each region
        for region in self.vector_regions:
            self._ocr_region(region)

        # 3. Return only regions with OCR results
        valid_regions = [r for r in self.vector_regions if r.ocr_text.strip()]
        logger.info(f"[VectorTextOCR] Page {self.page_num + 1}: Extracted text from {len(valid_regions)} regions")

        return valid_regions

    def _detect_vector_text_regions(self):
        """
        Detect vector text regions and store them in ``self.vector_regions``.

        Characteristics of vector text:
        1. Many items in drawings (each character stroke is a path)
        2. Relatively narrow height (text height level)
        3. No or very little actual text in that region
        """
        # Start from a clean slate so that running detection again on the
        # same engine does not accumulate duplicate regions.
        self.vector_regions = []

        drawings = self.page.get_drawings()
        if not drawings:
            return

        # Collect text span areas (for comparing vector text vs actual text)
        text_bboxes = []
        for block in self.page.get_text("dict").get("blocks", []):
            if block.get("type") != 0:  # Not a text block
                continue
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    text = span.get("text", "").strip()
                    if text and len(text) > 1:  # Meaningful text
                        text_bboxes.append((span.get("bbox"), text))

        # Candidate drawings, one entry per drawing that looks like text
        potential_regions: List[Dict] = []

        for drawing in drawings:
            rect = drawing.get("rect")
            items = drawing.get("items", [])

            if not rect or not items:
                continue

            x0, y0, x1, y1 = rect.x0, rect.y0, rect.x1, rect.y1
            height = y1 - y0
            width = x1 - x0
            item_count = len(items)

            # Vector text conditions:
            # 1. Height at text level (at most VectorTextConfig.MAX_HEIGHT)
            # 2. Many items (character strokes)
            # 3. Small height relative to width (text line shape)
            looks_like_text_line = (
                height <= VectorTextConfig.MAX_HEIGHT and
                item_count >= VectorTextConfig.MIN_ITEMS and
                width > height * 2
            )
            if not looks_like_text_line:
                continue

            # Skip regions where real (extractable) text already exists
            if self._has_text_in_region((x0, y0, x1, y1), text_bboxes):
                continue

            potential_regions.append({
                'bbox': (x0, y0, x1, y1),
                'item_count': item_count,
                # Items whose first element is 'c' are counted as curves
                'curve_count': sum(1 for item in items if item[0] == 'c'),
                'fill_count': 1 if drawing.get("fill") else 0
            })

        # Merge adjacent regions
        for region_data in self._merge_adjacent_regions(potential_regions):
            self.vector_regions.append(VectorTextRegion(
                bbox=region_data['bbox'],
                drawing_count=region_data.get('drawing_count', 1),
                curve_count=region_data.get('curve_count', 0),
                fill_count=region_data.get('fill_count', 0)
            ))

    def _has_text_in_region(self, bbox: Tuple[float, float, float, float],
                            text_bboxes: List[Tuple]) -> bool:
        """
        Check if actual text exists in the specified region.

        Args:
            bbox: Region to test (x0, y0, x1, y1)
            text_bboxes: (span bbox, span text) pairs; entries with an empty
                bbox are ignored

        Returns:
            True if a span of at least 3 characters intersects (or touches)
            the region
        """
        x0, y0, x1, y1 = bbox

        for text_bbox, text in text_bboxes:
            if not text_bbox:
                continue
            tx0, ty0, tx1, ty1 = text_bbox

            # Check region overlap
            if (x0 <= tx1 and x1 >= tx0 and y0 <= ty1 and y1 >= ty0):
                # True if there is sufficient text
                if len(text) >= 3:
                    return True

        return False

    @staticmethod
    def _new_group(region: Dict) -> Dict:
        """Start a merge group from a single candidate region."""
        return {
            'bbox': tuple(region['bbox']),
            'item_count': region['item_count'],
            'curve_count': region.get('curve_count', 0),
            'fill_count': region.get('fill_count', 0),
            'drawing_count': 1
        }

    @classmethod
    def _belongs_to_group(cls, group_bbox: Tuple, region_bbox: Tuple) -> bool:
        """Tell whether a region sits on the same line as, and close to, a group."""
        g_x0, g_y0, g_x1, g_y1 = group_bbox
        r_x0, r_y0, r_x1, r_y1 = region_bbox

        same_line = (abs(g_y0 - r_y0) < cls._SAME_LINE_TOLERANCE and
                     abs(g_y1 - r_y1) < cls._SAME_LINE_TOLERANCE)
        if not same_line:
            return False

        # Horizontal gap between the boxes regardless of which one is on the
        # left (negative when they overlap). Regions are ordered by y0 first,
        # so a region on the same line can lie to the LEFT of the group.
        gap = max(r_x0 - g_x1, g_x0 - r_x1)
        return gap < cls._MAX_HORIZONTAL_GAP

    def _merge_adjacent_regions(self, regions: List[Dict]) -> List[Dict]:
        """
        Merge adjacent vector text regions.

        Regions are visited in (y0, x0) order and each one is either absorbed
        into the group currently being built (same line and horizontally
        close) or starts a new group.

        Args:
            regions: Dicts with 'bbox' and 'item_count', and optionally
                'curve_count' / 'fill_count'

        Returns:
            One dict per merged group with keys 'bbox' (tuple), 'item_count',
            'curve_count', 'fill_count' and 'drawing_count'
        """
        if not regions:
            return []

        # Sort by Y coordinate, then X
        ordered = sorted(regions, key=lambda r: (r['bbox'][1], r['bbox'][0]))

        merged: List[Dict] = []
        current: Optional[Dict] = None

        for region in ordered:
            if current is not None and self._belongs_to_group(current['bbox'], region['bbox']):
                # Grow the group's bbox to the union and accumulate counters
                c_x0, c_y0, c_x1, c_y1 = current['bbox']
                r_x0, r_y0, r_x1, r_y1 = region['bbox']
                current['bbox'] = (
                    min(c_x0, r_x0),
                    min(c_y0, r_y0),
                    max(c_x1, r_x1),
                    max(c_y1, r_y1),
                )
                current['item_count'] += region['item_count']
                current['curve_count'] += region.get('curve_count', 0)
                current['fill_count'] += region.get('fill_count', 0)
                current['drawing_count'] += 1
                continue

            # Close the previous group (if any) and start a new one
            if current is not None:
                merged.append(current)
            current = self._new_group(region)

        if current is not None:
            merged.append(current)

        return merged

    @staticmethod
    def _is_korean(c: str) -> bool:
        """Tell whether a character is a Hangul syllable or compatibility jamo."""
        return '가' <= c <= '힣' or 'ㄱ' <= c <= 'ㅎ' or 'ㅏ' <= c <= 'ㅣ'

    def _ocr_region(self, region: "VectorTextRegion"):
        """
        Perform OCR on a specific region.

        Renders the (padded) region at ``VectorTextConfig.OCR_SCALE`` and
        stores the recognized text and a heuristic confidence on ``region``.
        Any failure is logged and leaves the region with empty text and
        zero confidence.
        """
        try:
            x0, y0, x1, y1 = region.bbox

            # Add slight padding, clamped to the page
            padding = self._OCR_PADDING
            clip = fitz.Rect(
                max(0, x0 - padding),
                max(0, y0 - padding),
                min(self.page_width, x1 + padding),
                min(self.page_height, y1 + padding)
            )

            # Render at high resolution
            mat = fitz.Matrix(VectorTextConfig.OCR_SCALE, VectorTextConfig.OCR_SCALE)
            pix = self.page.get_pixmap(matrix=mat, clip=clip)

            # Convert to PIL Image
            img_data = pix.tobytes("png")
            img = Image.open(io.BytesIO(img_data))

            # Perform OCR
            ocr_config = '--psm 7'  # Treat as single text line
            text = pytesseract.image_to_string(
                img,
                lang=VectorTextConfig.OCR_LANG,
                config=ocr_config
            )

            region.ocr_text = text.strip()

            # Calculate confidence (simple heuristic): share of characters
            # that are alphanumeric or Korean
            if region.ocr_text:
                valid_chars = sum(
                    1 for c in region.ocr_text if c.isalnum() or self._is_korean(c)
                )
                total_chars = len(region.ocr_text)
                region.confidence = valid_chars / total_chars if total_chars > 0 else 0.0

            logger.debug(f"[VectorTextOCR] Region {region.bbox}: OCR='{region.ocr_text[:50]}...' conf={region.confidence:.2f}")

        except Exception as e:
            logger.warning(f"[VectorTextOCR] OCR failed for region {region.bbox}: {e}")
            region.ocr_text = ""
            region.confidence = 0.0
293
+
294
+
295
+ # ============================================================================
296
+ # Export
297
+ # ============================================================================
298
+
299
+ __all__ = [
300
+ 'VectorTextConfig',
301
+ 'VectorTextOCREngine',
302
+ ]