xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. xgen_doc2chunk/__init__.py +42 -0
  2. xgen_doc2chunk/chunking/__init__.py +168 -0
  3. xgen_doc2chunk/chunking/chunking.py +786 -0
  4. xgen_doc2chunk/chunking/constants.py +134 -0
  5. xgen_doc2chunk/chunking/page_chunker.py +248 -0
  6. xgen_doc2chunk/chunking/protected_regions.py +715 -0
  7. xgen_doc2chunk/chunking/sheet_processor.py +406 -0
  8. xgen_doc2chunk/chunking/table_chunker.py +832 -0
  9. xgen_doc2chunk/chunking/table_parser.py +172 -0
  10. xgen_doc2chunk/chunking/text_chunker.py +443 -0
  11. xgen_doc2chunk/core/__init__.py +64 -0
  12. xgen_doc2chunk/core/document_processor.py +1307 -0
  13. xgen_doc2chunk/core/functions/__init__.py +85 -0
  14. xgen_doc2chunk/core/functions/chart_extractor.py +144 -0
  15. xgen_doc2chunk/core/functions/chart_processor.py +534 -0
  16. xgen_doc2chunk/core/functions/file_converter.py +220 -0
  17. xgen_doc2chunk/core/functions/img_processor.py +649 -0
  18. xgen_doc2chunk/core/functions/metadata_extractor.py +542 -0
  19. xgen_doc2chunk/core/functions/page_tag_processor.py +393 -0
  20. xgen_doc2chunk/core/functions/preprocessor.py +162 -0
  21. xgen_doc2chunk/core/functions/storage_backend.py +381 -0
  22. xgen_doc2chunk/core/functions/table_extractor.py +468 -0
  23. xgen_doc2chunk/core/functions/table_processor.py +299 -0
  24. xgen_doc2chunk/core/functions/utils.py +159 -0
  25. xgen_doc2chunk/core/processor/__init__.py +96 -0
  26. xgen_doc2chunk/core/processor/base_handler.py +544 -0
  27. xgen_doc2chunk/core/processor/csv_handler.py +135 -0
  28. xgen_doc2chunk/core/processor/csv_helper/__init__.py +89 -0
  29. xgen_doc2chunk/core/processor/csv_helper/csv_constants.py +63 -0
  30. xgen_doc2chunk/core/processor/csv_helper/csv_encoding.py +104 -0
  31. xgen_doc2chunk/core/processor/csv_helper/csv_file_converter.py +78 -0
  32. xgen_doc2chunk/core/processor/csv_helper/csv_image_processor.py +75 -0
  33. xgen_doc2chunk/core/processor/csv_helper/csv_metadata.py +168 -0
  34. xgen_doc2chunk/core/processor/csv_helper/csv_parser.py +225 -0
  35. xgen_doc2chunk/core/processor/csv_helper/csv_preprocessor.py +86 -0
  36. xgen_doc2chunk/core/processor/csv_helper/csv_table.py +266 -0
  37. xgen_doc2chunk/core/processor/doc_handler.py +579 -0
  38. xgen_doc2chunk/core/processor/doc_helpers/__init__.py +25 -0
  39. xgen_doc2chunk/core/processor/doc_helpers/doc_file_converter.py +160 -0
  40. xgen_doc2chunk/core/processor/doc_helpers/doc_image_processor.py +179 -0
  41. xgen_doc2chunk/core/processor/doc_helpers/doc_preprocessor.py +83 -0
  42. xgen_doc2chunk/core/processor/docx_handler.py +376 -0
  43. xgen_doc2chunk/core/processor/docx_helper/__init__.py +84 -0
  44. xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py +436 -0
  45. xgen_doc2chunk/core/processor/docx_helper/docx_constants.py +75 -0
  46. xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py +76 -0
  47. xgen_doc2chunk/core/processor/docx_helper/docx_image.py +145 -0
  48. xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py +410 -0
  49. xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py +71 -0
  50. xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py +126 -0
  51. xgen_doc2chunk/core/processor/docx_helper/docx_preprocessor.py +82 -0
  52. xgen_doc2chunk/core/processor/docx_helper/docx_table_extractor.py +527 -0
  53. xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py +220 -0
  54. xgen_doc2chunk/core/processor/excel_handler.py +353 -0
  55. xgen_doc2chunk/core/processor/excel_helper/__init__.py +97 -0
  56. xgen_doc2chunk/core/processor/excel_helper/excel_chart_extractor.py +498 -0
  57. xgen_doc2chunk/core/processor/excel_helper/excel_file_converter.py +157 -0
  58. xgen_doc2chunk/core/processor/excel_helper/excel_image_processor.py +316 -0
  59. xgen_doc2chunk/core/processor/excel_helper/excel_layout_detector.py +739 -0
  60. xgen_doc2chunk/core/processor/excel_helper/excel_metadata.py +145 -0
  61. xgen_doc2chunk/core/processor/excel_helper/excel_preprocessor.py +83 -0
  62. xgen_doc2chunk/core/processor/excel_helper/excel_table_xls.py +357 -0
  63. xgen_doc2chunk/core/processor/excel_helper/excel_table_xlsx.py +361 -0
  64. xgen_doc2chunk/core/processor/excel_helper/excel_textbox.py +266 -0
  65. xgen_doc2chunk/core/processor/html_helper/__init__.py +7 -0
  66. xgen_doc2chunk/core/processor/html_helper/html_file_converter.py +92 -0
  67. xgen_doc2chunk/core/processor/html_helper/html_preprocessor.py +74 -0
  68. xgen_doc2chunk/core/processor/html_reprocessor.py +140 -0
  69. xgen_doc2chunk/core/processor/hwp_handler.py +401 -0
  70. xgen_doc2chunk/core/processor/hwp_helper/__init__.py +120 -0
  71. xgen_doc2chunk/core/processor/hwp_helper/hwp_chart_extractor.py +373 -0
  72. xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py +78 -0
  73. xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py +106 -0
  74. xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py +174 -0
  75. xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py +60 -0
  76. xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +413 -0
  77. xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py +236 -0
  78. xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py +82 -0
  79. xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py +149 -0
  80. xgen_doc2chunk/core/processor/hwp_helper/hwp_recovery.py +217 -0
  81. xgen_doc2chunk/core/processor/hwp_helper/hwp_table.py +205 -0
  82. xgen_doc2chunk/core/processor/hwpx_handler.py +191 -0
  83. xgen_doc2chunk/core/processor/hwpx_helper/__init__.py +85 -0
  84. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_chart_extractor.py +464 -0
  85. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_constants.py +30 -0
  86. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_file_converter.py +70 -0
  87. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_image_processor.py +258 -0
  88. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_metadata.py +163 -0
  89. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_preprocessor.py +80 -0
  90. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_section.py +242 -0
  91. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_extractor.py +462 -0
  92. xgen_doc2chunk/core/processor/hwpx_helper/hwpx_table_processor.py +220 -0
  93. xgen_doc2chunk/core/processor/image_file_handler.py +212 -0
  94. xgen_doc2chunk/core/processor/image_file_helper/__init__.py +17 -0
  95. xgen_doc2chunk/core/processor/image_file_helper/image_file_converter.py +69 -0
  96. xgen_doc2chunk/core/processor/image_file_helper/image_file_image_processor.py +123 -0
  97. xgen_doc2chunk/core/processor/image_file_helper/image_file_preprocessor.py +84 -0
  98. xgen_doc2chunk/core/processor/pdf_handler.py +597 -0
  99. xgen_doc2chunk/core/processor/pdf_helpers/__init__.py +229 -0
  100. xgen_doc2chunk/core/processor/pdf_helpers/pdf_block_image_engine.py +667 -0
  101. xgen_doc2chunk/core/processor/pdf_helpers/pdf_cell_analysis.py +493 -0
  102. xgen_doc2chunk/core/processor/pdf_helpers/pdf_complexity_analyzer.py +598 -0
  103. xgen_doc2chunk/core/processor/pdf_helpers/pdf_element_merger.py +46 -0
  104. xgen_doc2chunk/core/processor/pdf_helpers/pdf_file_converter.py +72 -0
  105. xgen_doc2chunk/core/processor/pdf_helpers/pdf_graphic_detector.py +332 -0
  106. xgen_doc2chunk/core/processor/pdf_helpers/pdf_image_processor.py +321 -0
  107. xgen_doc2chunk/core/processor/pdf_helpers/pdf_layout_block_detector.py +1244 -0
  108. xgen_doc2chunk/core/processor/pdf_helpers/pdf_line_analysis.py +420 -0
  109. xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py +101 -0
  110. xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py +114 -0
  111. xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py +106 -0
  112. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py +1346 -0
  113. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +897 -0
  114. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py +750 -0
  115. xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +401 -0
  116. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +155 -0
  117. xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +655 -0
  118. xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py +183 -0
  119. xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py +302 -0
  120. xgen_doc2chunk/core/processor/pdf_helpers/types.py +278 -0
  121. xgen_doc2chunk/core/processor/ppt_handler.py +288 -0
  122. xgen_doc2chunk/core/processor/ppt_helper/__init__.py +96 -0
  123. xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py +332 -0
  124. xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py +182 -0
  125. xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py +119 -0
  126. xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py +55 -0
  127. xgen_doc2chunk/core/processor/ppt_helper/ppt_image_processor.py +196 -0
  128. xgen_doc2chunk/core/processor/ppt_helper/ppt_metadata.py +71 -0
  129. xgen_doc2chunk/core/processor/ppt_helper/ppt_preprocessor.py +77 -0
  130. xgen_doc2chunk/core/processor/ppt_helper/ppt_shape.py +189 -0
  131. xgen_doc2chunk/core/processor/ppt_helper/ppt_slide.py +69 -0
  132. xgen_doc2chunk/core/processor/ppt_helper/ppt_table.py +386 -0
  133. xgen_doc2chunk/core/processor/rtf_handler.py +290 -0
  134. xgen_doc2chunk/core/processor/rtf_helper/__init__.py +128 -0
  135. xgen_doc2chunk/core/processor/rtf_helper/rtf_constants.py +94 -0
  136. xgen_doc2chunk/core/processor/rtf_helper/rtf_content_extractor.py +211 -0
  137. xgen_doc2chunk/core/processor/rtf_helper/rtf_decoder.py +141 -0
  138. xgen_doc2chunk/core/processor/rtf_helper/rtf_file_converter.py +87 -0
  139. xgen_doc2chunk/core/processor/rtf_helper/rtf_metadata_extractor.py +179 -0
  140. xgen_doc2chunk/core/processor/rtf_helper/rtf_preprocessor.py +426 -0
  141. xgen_doc2chunk/core/processor/rtf_helper/rtf_region_finder.py +91 -0
  142. xgen_doc2chunk/core/processor/rtf_helper/rtf_table_extractor.py +482 -0
  143. xgen_doc2chunk/core/processor/rtf_helper/rtf_text_cleaner.py +389 -0
  144. xgen_doc2chunk/core/processor/text_handler.py +95 -0
  145. xgen_doc2chunk/core/processor/text_helper/__init__.py +17 -0
  146. xgen_doc2chunk/core/processor/text_helper/text_file_converter.py +28 -0
  147. xgen_doc2chunk/core/processor/text_helper/text_image_processor.py +75 -0
  148. xgen_doc2chunk/core/processor/text_helper/text_preprocessor.py +82 -0
  149. xgen_doc2chunk/ocr/__init__.py +67 -0
  150. xgen_doc2chunk/ocr/base.py +209 -0
  151. xgen_doc2chunk/ocr/ocr_engine/__init__.py +22 -0
  152. xgen_doc2chunk/ocr/ocr_engine/anthropic_ocr.py +91 -0
  153. xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py +172 -0
  154. xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py +91 -0
  155. xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py +100 -0
  156. xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py +116 -0
  157. xgen_doc2chunk/ocr/ocr_processor.py +387 -0
  158. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/METADATA +1 -1
  159. xgen_doc2chunk-0.1.2.dist-info/RECORD +161 -0
  160. xgen_doc2chunk-0.1.0.dist-info/RECORD +0 -4
  161. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/WHEEL +0 -0
  162. {xgen_doc2chunk-0.1.0.dist-info → xgen_doc2chunk-0.1.2.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,655 @@
1
+ """
2
+ Text Quality Analyzer for PDF Handler
3
+
4
+ Analyzes the quality of text extracted from PDF and detects broken text
5
+ (encoding issues, missing ToUnicode CMap, etc.) to determine whether OCR fallback is needed.
6
+
7
+ =============================================================================
8
+ Characteristics of Broken Text:
9
+ =============================================================================
10
+ 1. Contains many Private Use Area (PUA) characters: U+E000 ~ U+F8FF
11
+ 2. Replacement Character: U+FFFD (�)
12
+ 3. Invalid Korean character combinations (only consonants/vowels in sequence)
13
+ 4. Meaningless Korean syllable sequences (random combinations, not real words)
14
+ 5. Mixture of CJK characters with PUA/control characters
15
+
16
+ =============================================================================
17
+ Resolution Strategy:
18
+ =============================================================================
19
+ 1. Calculate text quality score (0.0 ~ 1.0)
20
+ 2. Perform OCR fallback if quality is below threshold
21
+ 3. Apply OCR to entire page or specific regions
22
+ """
23
+
24
+ import logging
25
+ import re
26
+ import unicodedata
27
+ from typing import List, Dict, Tuple, Optional, Set
28
+ from dataclasses import dataclass
29
+
30
+ import fitz
31
+ from PIL import Image
32
+ import pytesseract
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+
37
+ # ============================================================================
38
+ # Configuration
39
+ # ============================================================================
40
+
41
class TextQualityConfig:
    """Tunable settings for text-quality analysis and OCR fallback."""

    # --- Quality thresholds -------------------------------------------------
    # Pages scoring below this trigger an OCR fallback (raised from 0.5 to 0.7).
    QUALITY_THRESHOLD = 0.7
    # Texts shorter than this are not analyzed (treated as acceptable).
    MIN_TEXT_LENGTH = 10
    # A PUA-character ratio at or above this forces OCR regardless of score.
    PUA_RATIO_THRESHOLD = 0.1

    # --- Unicode ranges -----------------------------------------------------
    # Private Use Area ranges; glyphs mapped here typically indicate a font
    # without a usable ToUnicode CMap.
    PUA_RANGES = [
        (0xE000, 0xF8FF),      # BMP Private Use Area
        (0xF0000, 0xFFFFD),    # Supplementary PUA-A
        (0x100000, 0x10FFFD),  # Supplementary PUA-B
    ]
    # Control and special character ranges.
    CONTROL_RANGES = [
        (0x0000, 0x001F),  # C0 controls
        (0x007F, 0x009F),  # C1 controls
        (0xFFF0, 0xFFFF),  # Specials
    ]

    # --- OCR settings -------------------------------------------------------
    OCR_LANG = 'kor+eng'
    OCR_DPI = 300
    OCR_SCALE = 3.0

    # --- Hangul ranges used to recognise valid Korean characters ------------
    HANGUL_SYLLABLE_RANGE = (0xAC00, 0xD7A3)
    HANGUL_JAMO_RANGE = (0x1100, 0x11FF)
    HANGUL_COMPAT_JAMO_RANGE = (0x3130, 0x318F)

    # --- Scoring weights ----------------------------------------------------
    WEIGHT_PUA = 0.4          # penalty weight for PUA characters
    WEIGHT_REPLACEMENT = 0.3  # penalty weight for U+FFFD replacement chars
    WEIGHT_VALID_RATIO = 0.3  # weight of the valid-character ratio
79
+
80
+
81
+ # ============================================================================
82
+ # Data Classes
83
+ # ============================================================================
84
+
85
@dataclass
class TextQualityResult:
    """Outcome of analyzing one piece of extracted text."""
    quality_score: float    # overall score in [0.0, 1.0]; higher means cleaner text
    total_chars: int        # number of characters inspected
    pua_count: int          # characters falling in a Private Use Area range
    replacement_count: int  # occurrences of U+FFFD
    valid_chars: int        # recognised Korean/English/digit characters
    control_chars: int      # control characters encountered
    needs_ocr: bool         # True when OCR fallback is recommended
    details: Dict           # extra diagnostic ratios / reasons
96
+
97
+
98
@dataclass
class PageTextAnalysis:
    """Aggregated text-quality analysis for a single PDF page."""
    page_num: int                      # 0-indexed page number
    quality_result: TextQualityResult  # page-wide quality summary
    text_blocks: List[Dict]            # per-block info (bbox/text/quality)
    problem_regions: List[Tuple[float, float, float, float]]  # bboxes of low-quality blocks
    ocr_text: Optional[str] = None     # OCR output, populated only when OCR ran
106
+
107
+
108
+ # ============================================================================
109
+ # Text Quality Analyzer
110
+ # ============================================================================
111
+
112
class TextQualityAnalyzer:
    """
    Text quality analyzer.

    Scores the text extracted from a PDF page (0.0 ~ 1.0) and flags
    blocks/pages whose text looks broken (PUA glyphs, U+FFFD, control
    characters) so the caller can decide whether to fall back to OCR.
    """

    def __init__(self, page, page_num: int):
        """
        Args:
            page: PyMuPDF page object
            page_num: Page number (0-indexed)
        """
        self.page = page
        self.page_num = page_num
        self.page_width = page.rect.width
        self.page_height = page.rect.height

    def analyze_page(self) -> PageTextAnalysis:
        """
        Analyze text quality for the entire page.

        Returns:
            PageTextAnalysis object
        """
        layout = self.page.get_text("dict", sort=True)

        span_texts_all = []      # every span's text, in reading order
        block_infos = []         # per-block bbox/text/quality records
        low_quality_bboxes = []  # bboxes of blocks that need OCR

        for block in layout.get("blocks", []):
            # Type 0 == text block; skip images and other block kinds.
            if block.get("type") != 0:
                continue

            bbox = block.get("bbox", (0, 0, 0, 0))
            span_texts = [
                span.get("text", "")
                for line in block.get("lines", [])
                for span in line.get("spans", [])
                if span.get("text", "")
            ]
            if not span_texts:
                continue

            span_texts_all.extend(span_texts)
            joined = " ".join(span_texts)
            quality = self.analyze_text(joined)

            block_infos.append({
                'bbox': bbox,
                'text': joined,
                'quality': quality
            })
            # Remember where the broken text lives for region-level OCR.
            if quality.needs_ocr:
                low_quality_bboxes.append(bbox)

        # Page-wide score over the concatenation of all spans.
        overall = self.analyze_text(" ".join(span_texts_all))

        return PageTextAnalysis(
            page_num=self.page_num,
            quality_result=overall,
            text_blocks=block_infos,
            problem_regions=low_quality_bboxes
        )

    def analyze_text(self, text: str) -> TextQualityResult:
        """
        Analyze text quality.

        Args:
            text: Text to analyze

        Returns:
            TextQualityResult object
        """
        # Empty or very short text cannot be judged reliably; treat as OK.
        if not text or len(text) < TextQualityConfig.MIN_TEXT_LENGTH:
            return TextQualityResult(
                quality_score=1.0,
                total_chars=len(text),
                pua_count=0,
                replacement_count=0,
                valid_chars=len(text),
                control_chars=0,
                needs_ocr=False,
                details={'reason': 'text_too_short'}
            )

        total = len(text)
        pua = replacement = control = valid = 0

        # Classify each character. Order matters: U+FFFD lies inside the
        # "Specials" control range, so it must be counted as a replacement
        # character before the control-range check runs.
        for ch in text:
            cp = ord(ch)
            if self._is_pua(cp):
                pua += 1
            elif cp == 0xFFFD:
                replacement += 1
            elif self._is_control(cp):
                control += 1
            elif self._is_valid_char(ch, cp):
                valid += 1

        score = self._calculate_quality_score(
            total_chars=total,
            pua_count=pua,
            replacement_count=replacement,
            valid_chars=valid
        )

        # OCR is required either on a low score or on a high PUA share.
        pua_share = pua / total if total > 0 else 0
        ocr_needed = (
            score < TextQualityConfig.QUALITY_THRESHOLD or
            pua_share >= TextQualityConfig.PUA_RATIO_THRESHOLD
        )

        return TextQualityResult(
            quality_score=score,
            total_chars=total,
            pua_count=pua,
            replacement_count=replacement,
            valid_chars=valid,
            control_chars=control,
            needs_ocr=ocr_needed,
            details={
                'pua_ratio': pua / total if total > 0 else 0,
                'replacement_ratio': replacement / total if total > 0 else 0,
                'valid_ratio': valid / total if total > 0 else 0,
            }
        )

    def _is_pua(self, code: int) -> bool:
        """Check if character is in a Private Use Area range."""
        return any(lo <= code <= hi for lo, hi in TextQualityConfig.PUA_RANGES)

    def _is_control(self, code: int) -> bool:
        """Check if character is a control character."""
        return any(lo <= code <= hi for lo, hi in TextQualityConfig.CONTROL_RANGES)

    def _is_valid_char(self, char: str, code: int) -> bool:
        """Check if character is valid (Korean, English, digits, spaces, basic punctuation, CJK)."""
        if char.isspace():
            return True
        # ASCII alphanumerics only; non-ASCII alnum is handled by the ranges below.
        if char.isalnum() and code < 128:
            return True
        # Hangul syllables / Jamo / compatibility Jamo, CJK ideographs, kana.
        accepted_ranges = (
            TextQualityConfig.HANGUL_SYLLABLE_RANGE,
            TextQualityConfig.HANGUL_JAMO_RANGE,
            TextQualityConfig.HANGUL_COMPAT_JAMO_RANGE,
            (0x4E00, 0x9FFF),  # CJK Unified Ideographs
            (0x3040, 0x30FF),  # Japanese Hiragana / Katakana
        )
        if any(lo <= code <= hi for lo, hi in accepted_ranges):
            return True
        # Basic punctuation.
        return char in '.,!?;:\'"()[]{}-–—…·•'

    def _calculate_quality_score(
        self,
        total_chars: int,
        pua_count: int,
        replacement_count: int,
        valid_chars: int
    ) -> float:
        """Calculate quality score (0.0 ~ 1.0); higher is better."""
        if total_chars == 0:
            return 1.0

        pua_ratio = pua_count / total_chars
        replacement_ratio = replacement_count / total_chars
        valid_ratio = valid_chars / total_chars

        # Start perfect and subtract weighted penalties, then scale by the
        # valid-character share. The 2x / 3x multipliers make PUA and
        # replacement characters hurt more than their raw weights.
        score = 1.0
        score -= pua_ratio * TextQualityConfig.WEIGHT_PUA * 2
        score -= replacement_ratio * TextQualityConfig.WEIGHT_REPLACEMENT * 3
        score *= 0.5 + valid_ratio * 0.5

        return max(0.0, min(1.0, score))
346
+
347
+
348
+ # ============================================================================
349
+ # Page OCR Fallback Engine
350
+ # ============================================================================
351
+
352
class PageOCRFallbackEngine:
    """
    Page OCR fallback engine.

    Performs OCR on the entire page or on specific regions for pages
    whose extracted text quality is too low.
    """

    def __init__(self, page, page_num: int):
        """
        Args:
            page: PyMuPDF page object
            page_num: Page number (0-indexed)
        """
        self.page = page
        self.page_num = page_num
        self.page_width = page.rect.width
        self.page_height = page.rect.height

    def ocr_full_page(self) -> str:
        """
        Perform OCR on the entire page.

        Returns:
            Text extracted via OCR ("" on failure).
        """
        try:
            # Render page at high resolution for better OCR accuracy.
            mat = fitz.Matrix(TextQualityConfig.OCR_SCALE, TextQualityConfig.OCR_SCALE)
            pix = self.page.get_pixmap(matrix=mat)

            # Convert to PIL Image.
            import io
            img_data = pix.tobytes("png")
            img = Image.open(io.BytesIO(img_data))

            # --psm 3: automatic page segmentation; --oem 3: LSTM engine.
            ocr_config = '--psm 3 --oem 3'
            text = pytesseract.image_to_string(
                img,
                lang=TextQualityConfig.OCR_LANG,
                config=ocr_config
            )

            # OCR post-processing: noise removal.
            text = self._postprocess_ocr_text(text)

            logger.info(f"[PageOCR] Page {self.page_num + 1}: OCR extracted {len(text)} chars")
            return text.strip()

        except Exception as e:
            # Best-effort: OCR failure must not break the extraction pipeline.
            logger.error(f"[PageOCR] Page {self.page_num + 1} OCR failed: {e}")
            return ""

    def _postprocess_ocr_text(self, text: str) -> str:
        """
        Post-process OCR results.

        - Remove lines consisting only of special symbols
        - Remove known OCR noise patterns (background graphics, dotted lines)
        - Keep lines containing Korean unconditionally
        - Keep English-only lines only when they look meaningful

        Bug fix: the former noise pattern r'^[A-Za-z\\-—\\s]{3,}$' matched
        EVERY purely-alphabetic English line, which made the English
        retention logic below (uppercase abbreviations, regular lowercase
        text) unreachable for exactly the lines it was written to keep.
        That pattern is removed; meaningful English lines such as
        "Insight Report" now survive post-processing.
        """
        if not text:
            return ""

        # OCR noise patterns (text incorrectly recognized from background graphics).
        noise_patterns = [
            r'^[ri\-—maOANIUTLOG\s]+$',  # Noise from circular background graphics
            r'^[0-9"\'\[\]\(\)°\s]{1,5}$',  # Short number/symbol combinations
            r'^‥+\s*$',  # Only dotted lines
            r'^\s*[°·•○●□■◇◆△▲▽▼]+\s*$',  # Only symbols
        ]

        symbol_chars = '.,;:!?@#$%^&*()[]{}|\\/<>~`\'"-_+=°·•○●□■◇◆△▲▽▼'
        cleaned_lines = []

        for line in text.split('\n'):
            line = line.strip()

            # Skip empty lines.
            if not line:
                continue

            # Remove lines consisting only of special symbols.
            if all(c in symbol_chars or c.isspace() for c in line):
                continue

            # Remove lines matching known noise patterns.
            if any(re.match(p, line, re.IGNORECASE) for p in noise_patterns):
                continue

            # Prioritize keeping lines with Korean.
            if any('가' <= c <= '힣' for c in line):
                cleaned_lines.append(line)
                continue

            # For English-only lines, check if meaningful:
            # letters must make up >= 50% of non-space chars, min 3 letters.
            alpha_count = sum(1 for c in line if c.isalpha())
            total_len = len(line.replace(' ', ''))
            if total_len > 0 and alpha_count / total_len >= 0.5 and alpha_count >= 3:
                # Keep uppercase abbreviations (PLATEER, IDT, etc.)
                if line.isupper() or any(word.isupper() and len(word) >= 2 for word in line.split()):
                    cleaned_lines.append(line)
                # Regular English text (Insight Report, etc.)
                elif any(c.islower() for c in line):
                    cleaned_lines.append(line)

        return '\n'.join(cleaned_lines)

    def ocr_region(self, bbox: Tuple[float, float, float, float]) -> str:
        """
        Perform OCR on a specific region.

        Args:
            bbox: Region coordinates (x0, y0, x1, y1)

        Returns:
            Text extracted via OCR ("" on failure).
        """
        try:
            x0, y0, x1, y1 = bbox

            # Add a small margin of context, clamped to the page bounds.
            padding = 10
            clip = fitz.Rect(
                max(0, x0 - padding),
                max(0, y0 - padding),
                min(self.page_width, x1 + padding),
                min(self.page_height, y1 + padding)
            )

            # Render region at high resolution.
            mat = fitz.Matrix(TextQualityConfig.OCR_SCALE, TextQualityConfig.OCR_SCALE)
            pix = self.page.get_pixmap(matrix=mat, clip=clip)

            # Convert to PIL Image.
            import io
            img_data = pix.tobytes("png")
            img = Image.open(io.BytesIO(img_data))

            # --psm 6: assume a single uniform text block; --oem 3: LSTM engine.
            ocr_config = '--psm 6 --oem 3'
            text = pytesseract.image_to_string(
                img,
                lang=TextQualityConfig.OCR_LANG,
                config=ocr_config
            )

            # OCR post-processing.
            text = self._postprocess_ocr_text(text)

            return text.strip()

        except Exception as e:
            logger.warning(f"[PageOCR] Region OCR failed for {bbox}: {e}")
            return ""

    def ocr_problem_regions(
        self,
        problem_regions: List[Tuple[float, float, float, float]]
    ) -> Dict[Tuple, str]:
        """
        Perform OCR on problematic regions.

        Args:
            problem_regions: List of bounding boxes for problematic regions

        Returns:
            Dictionary mapping {bbox: ocr_text}; regions that yielded no
            text are omitted.
        """
        results = {}

        for bbox in problem_regions:
            text = self.ocr_region(bbox)
            if text:
                results[bbox] = text

        return results
544
+
545
+
546
+ # ============================================================================
547
+ # Integrated Text Extractor with Quality Check
548
+ # ============================================================================
549
+
550
class QualityAwareTextExtractor:
    """
    Quality-aware text extractor.

    Runs a text-quality analysis first and falls back to OCR
    (per-region or full-page) only when the extracted text looks broken,
    so that the caller always receives the best available text.
    """

    def __init__(self, page, page_num: int, quality_threshold: float = None):
        """
        Args:
            page: PyMuPDF page object
            page_num: Page number (0-indexed)
            quality_threshold: Quality threshold (default: TextQualityConfig.QUALITY_THRESHOLD)
        """
        self.page = page
        self.page_num = page_num
        self.quality_threshold = quality_threshold or TextQualityConfig.QUALITY_THRESHOLD
        self.analyzer = TextQualityAnalyzer(page, page_num)
        self.ocr_engine = PageOCRFallbackEngine(page, page_num)

    def extract(self) -> Tuple[str, PageTextAnalysis]:
        """
        Extract text with quality consideration.

        Returns:
            Tuple of (extracted text, analysis result)
        """
        # Step 1: score the page's extracted text.
        analysis = self.analyzer.analyze_page()

        logger.debug(
            f"[QualityAware] Page {self.page_num + 1}: "
            f"quality={analysis.quality_result.quality_score:.2f}, "
            f"pua={analysis.quality_result.pua_count}, "
            f"valid={analysis.quality_result.valid_chars}"
        )

        # Step 2: good quality -> plain extraction, no OCR needed.
        if not analysis.quality_result.needs_ocr:
            return self.page.get_text("text"), analysis

        # Step 3: quality too low -> OCR fallback.
        logger.info(
            f"[QualityAware] Page {self.page_num + 1}: "
            f"Quality too low ({analysis.quality_result.quality_score:.2f}), "
            f"falling back to OCR"
        )

        if 0 < len(analysis.problem_regions) <= 3:
            # Few localized problem areas: OCR just those regions and splice
            # the results into the healthy text blocks.
            ocr_results = self.ocr_engine.ocr_problem_regions(analysis.problem_regions)
            merged = self._merge_ocr_results(analysis, ocr_results)
            analysis.ocr_text = str(ocr_results)
            return merged, analysis

        # Widespread damage: OCR the whole page.
        page_text = self.ocr_engine.ocr_full_page()
        analysis.ocr_text = page_text
        return page_text, analysis

    def _merge_ocr_results(
        self,
        analysis: PageTextAnalysis,
        ocr_results: Dict[Tuple, str]
    ) -> str:
        """
        Merge existing text with OCR results.

        Healthy blocks keep their originally extracted text; blocks flagged
        as broken are replaced with their OCR output when available.
        """
        parts = []

        for block in analysis.text_blocks:
            key = tuple(block['bbox'])
            broken = block['quality'].needs_ocr
            if broken and key in ocr_results:
                parts.append(ocr_results[key])
            else:
                parts.append(block['text'])

        return "\n".join(parts)
642
+
643
+
644
+ # ============================================================================
645
+ # Export
646
+ # ============================================================================
647
+
648
# Explicit public API of this module.
__all__ = [
    'TextQualityConfig',
    'TextQualityResult',
    'PageTextAnalysis',
    'TextQualityAnalyzer',
    'PageOCRFallbackEngine',
    'QualityAwareTextExtractor',
]