xgen-doc2chunk 0.1.5__py3-none-any.whl → 0.1.52__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,6 +25,9 @@ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_utils import (
 )
 from xgen_doc2chunk.core.processor.pdf_helpers.pdf_table_detection import TableDetectionEngine
 from xgen_doc2chunk.core.processor.pdf_helpers.pdf_cell_analysis import CellAnalysisEngine
+from xgen_doc2chunk.core.processor.pdf_helpers.pdf_text_quality_analyzer import (
+    apply_cjk_compat_mapping,
+)
 
 logger = logging.getLogger("document-processor")
 
@@ -873,7 +876,12 @@ def generate_html_from_cells(
             content = ""
             if col_idx < len(row_data):
                 content = row_data[col_idx]
-            content = escape_html(str(content).strip() if content else "")
+
+            # Apply CJK Compatibility character mapping to fix broken characters
+            # (e.g., 㛳 → →, ㏙ → (, ㏚ → ) from Word→PDF conversion)
+            content = str(content).strip() if content else ""
+            content = apply_cjk_compat_mapping(content)
+            content = escape_html(content)
 
             # Get span info (default to 1 if not found)
             spans = span_map.get((row_idx, col_idx), {'rowspan': 1, 'colspan': 1})
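
Note the new ordering above: strip, then map, then escape. A minimal self-contained sketch of that order, assuming `escape_html` simply HTML-escapes (the real helper lives elsewhere in the package) and borrowing a few entries from `CJK_COMPAT_CHAR_MAP` defined later in this diff:

```python
import html

# Assumption: the package's escape_html behaves like html.escape.
def escape_html(s: str) -> str:
    return html.escape(s)

# Subset of CJK_COMPAT_CHAR_MAP (defined later in this diff).
CJK_COMPAT_CHAR_MAP = {'\u3689': '+', '\u36F3': '\u2192', '\u33D9': '(', '\u33DA': ')'}

def apply_cjk_compat_mapping(text: str) -> str:
    for bad, good in CJK_COMPAT_CHAR_MAP.items():
        text = text.replace(bad, good)
    return text

cell = '  Vector \u3689 Graph \u36F3 RAG \u33D9hybrid\u33DA  '
cell = str(cell).strip()               # 1. normalize and strip
cell = apply_cjk_compat_mapping(cell)  # 2. repair broken characters
cell = escape_html(cell)               # 3. escape last
print(cell)  # Vector + Graph → RAG (hybrid)
```

Mapping before escaping matters: if the map ever emits HTML-special characters such as quotes, they are still escaped afterwards.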
@@ -383,11 +383,11 @@ class TableQualityValidator:
         # if num_rows > 5 and col2_has_paragraphs >= 2:
         #     return False, f"col2_paragraphs({col2_has_paragraphs})"
 
-        # Pattern 3: If first column is short and second is long overall, likely body text not key-value
-        if num_rows > 10:
-            col1_short_ratio = (col1_empty_count + col1_short_count) / num_rows
-            if col1_short_ratio >= 0.8 and col2_long_count >= 5:
-                return False, f"asymmetric_cols(short1={col1_short_ratio:.0%}, long2={col2_long_count})"
+        # # Pattern 3: If first column is short and second is long overall, likely body text not key-value
+        # if num_rows > 10:
+        #     col1_short_ratio = (col1_empty_count + col1_short_count) / num_rows
+        #     if col1_short_ratio >= 0.8 and col2_long_count >= 5:
+        #         return False, f"asymmetric_cols(short1={col1_short_ratio:.0%}, long2={col2_long_count})"
 
         return True, "valid"
 
@@ -3,6 +3,9 @@
 PDF Text Extraction Module
 
 Provides functions for extracting text blocks from PDF pages.
+Includes support for:
+- Fragmented text reconstruction (Word->PDF conversion issues)
+- CJK Compatibility character mapping (broken character fixes)
 """
 import logging
 from typing import List, Tuple
@@ -17,6 +20,8 @@ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_text_quality_analyzer import
     TextQualityAnalyzer,
     QualityAwareTextExtractor,
     PageOCRFallbackEngine,
+    FragmentedTextReconstructor,
+    apply_cjk_compat_mapping,
 )
 
 logger = logging.getLogger("document-processor")
@@ -53,13 +58,76 @@ def extract_text_blocks(
     analyzer = TextQualityAnalyzer(page, page_num)
     page_analysis = analyzer.analyze_page()
 
-    # If quality is too low, use full page OCR fallback
+    # If quality is low, try text reconstruction first (before OCR)
     if page_analysis.quality_result.needs_ocr:
+        quality_result = page_analysis.quality_result
         logger.info(
-            f"[PDF] Page {page_num + 1}: Low text quality "
-            f"({page_analysis.quality_result.quality_score:.2f}), "
-            f"PUA={page_analysis.quality_result.pua_count}, "
-            f"using OCR fallback"
+            f"[PDF] Page {page_num + 1}: Low text quality detected - "
+            f"score={quality_result.quality_score:.2f}, "
+            f"PUA={quality_result.pua_count}, "
+            f"CJK_Compat={quality_result.cjk_compat_count}, "
+            f"fragmented={quality_result.is_fragmented}"
+        )
+
+        # Try reconstruction for fragmented text or CJK Compat issues
+        if quality_result.is_fragmented or quality_result.cjk_compat_count > 0:
+            logger.info(
+                f"[PDF] Page {page_num + 1}: Attempting text reconstruction "
+                f"(excluding {len(table_bboxes)} table regions)"
+            )
+
+            # Exclude table regions from reconstruction to avoid duplication
+            reconstructor = FragmentedTextReconstructor(
+                page, page_num, exclude_bboxes=table_bboxes
+            )
+
+            # Use section-based reconstruction for proper table positioning
+            if table_bboxes:
+                sections = reconstructor.reconstruct_with_sections()
+
+                if sections:
+                    result_elements = []
+                    for section in sections:
+                        # Apply CJK Compatibility character mapping
+                        cleaned_text = apply_cjk_compat_mapping(section['text'])
+
+                        if cleaned_text.strip():
+                            # Create element with proper Y position for sorting
+                            result_elements.append(PageElement(
+                                element_type=ElementType.TEXT,
+                                content=cleaned_text,
+                                bbox=(0, section['y_start'], page.rect.width, section['y_end']),
+                                page_num=page_num
+                            ))
+
+                    if result_elements:
+                        logger.info(
+                            f"[PDF] Page {page_num + 1}: Text reconstruction successful "
+                            f"({len(result_elements)} sections)"
+                        )
+                        return result_elements
+            else:
+                # No tables - use simple reconstruction
+                reconstructed_text = reconstructor.reconstruct()
+
+                if reconstructed_text:
+                    cleaned_text = apply_cjk_compat_mapping(reconstructed_text)
+
+                    logger.info(
+                        f"[PDF] Page {page_num + 1}: Text reconstruction successful "
+                        f"({len(cleaned_text)} chars)"
+                    )
+
+                    return [PageElement(
+                        element_type=ElementType.TEXT,
+                        content=cleaned_text,
+                        bbox=(0, 0, page.rect.width, page.rect.height),
+                        page_num=page_num
+                    )]
+
+        # Fall back to OCR if reconstruction not applicable
+        logger.info(
+            f"[PDF] Page {page_num + 1}: Using OCR fallback"
         )
 
         extractor = QualityAwareTextExtractor(page, page_num)
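
The section-based branch above consumes the `{'text', 'y_start', 'y_end'}` dicts that `reconstruct_with_sections()` returns (see the class later in this diff). A small sketch of how those sections become position-aware elements, with `PageElement` and `ElementType` mocked since the real types live in the package's types module:

```python
from dataclasses import dataclass
from enum import Enum, auto
from typing import Tuple

class ElementType(Enum):  # stand-in for the package's ElementType
    TEXT = auto()

@dataclass
class PageElement:        # stand-in for the package's PageElement
    element_type: ElementType
    content: str
    bbox: Tuple[float, float, float, float]
    page_num: int

page_width = 595.0  # assumed page width in points
sections = [        # example of the shape reconstruct_with_sections() returns
    {'text': 'Paragraph above the table', 'y_start': 72.0, 'y_end': 140.0},
    {'text': 'Paragraph below the table', 'y_start': 480.0, 'y_end': 560.0},
]

elements = [
    PageElement(ElementType.TEXT, s['text'],
                (0, s['y_start'], page_width, s['y_end']), page_num=0)
    for s in sections if s['text'].strip()
]
# Real Y ranges in bbox let downstream sorting interleave text and tables correctly.
print([e.bbox for e in elements])
```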
@@ -12,20 +12,23 @@ Characteristics of Broken Text:
 3. Invalid Korean character combinations (only consonants/vowels in sequence)
 4. Meaningless Korean syllable sequences (random combinations, not real words)
 5. Mixture of CJK characters with PUA/control characters
+6. CJK Compatibility characters used instead of normal punctuation
+7. Fragmented text where each character is on a separate line
 
 =============================================================================
 Resolution Strategy:
 =============================================================================
 1. Calculate text quality score (0.0 ~ 1.0)
-2. Perform OCR fallback if quality is below threshold
-3. Apply OCR to entire page or specific regions
+2. For fragmented text: Reconstruct using character position data
+3. For CJK Compatibility characters: Map to correct characters
+4. Perform OCR fallback only if reconstruction fails
 """
 
 import logging
 import re
 import unicodedata
 from typing import List, Dict, Tuple, Optional, Set
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
 import fitz
 from PIL import Image
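
Taken together, the amended strategy is a decision ladder. A compact sketch with stand-in types and callables (not the module's actual control flow, which is spread across the extractors below):

```python
from dataclasses import dataclass
from typing import Callable

@dataclass
class Quality:  # minimal stand-in for TextQualityResult
    needs_ocr: bool
    is_fragmented: bool
    cjk_compat_count: int

def resolve(q: Quality, raw_text: str,
            reconstruct: Callable[[], str],
            clean: Callable[[str], str],
            ocr: Callable[[], str]) -> str:
    if not q.needs_ocr:                            # 1. score is acceptable
        return raw_text
    if q.is_fragmented or q.cjk_compat_count > 0:  # 2. positional reconstruction
        rebuilt = reconstruct()
        if rebuilt:
            return clean(rebuilt)                  # 3. CJK compat mapping
    return ocr()                                   # 4. OCR only as a last resort

q = Quality(needs_ocr=True, is_fragmented=True, cjk_compat_count=0)
print(resolve(q, 'raw', lambda: '현재 시장', lambda t: t, lambda: 'ocr text'))
```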
@@ -35,9 +38,40 @@ logger = logging.getLogger(__name__)
 
 
 # ============================================================================
-# Configuration
+# CJK Compatibility Character Mapping
 # ============================================================================
 
+# Map CJK Compatibility characters to their intended characters
+# These occur when Word documents are converted to PDF with font issues
+CJK_COMPAT_CHAR_MAP = {
+    # Parentheses
+    '\u33D9': '(',  # ㏙ → (
+    '\u33DA': ')',  # ㏚ → )
+
+    # Brackets (section markers)
+    '\u33DB': '[',  # ㏛ → [ (or could be 【)
+    '\u33DC': ']',  # ㏜ → ] (or could be 】)
+    '\u33DD': '[',  # ㏝ → [ (section start)
+    '\u33DE': ']',  # ㏞ → ] (section end)
+
+    # Arrows and connectors
+    '\u3711': '→',  # 㜑 → arrow
+    '\u36A8': '/',  # 㚨 → / or +
+    '\u36F3': '→',  # 㛳 → arrow (Word→PDF conversion often maps arrows to this)
+    '\u3689': '+',  # 㚉 → + (plus sign, e.g., Vector + Graph)
+
+    # Range indicator
+    '\u33CA': '~',  # ㏊ → ~ (range, e.g., 2~6개월)
+
+    # Quotation marks
+    '\u3431': '"',  # 㐱 → opening quote
+    '\u3432': '"',  # 㐲 → closing quote
+    '\u3433': '"',  # 㐳 → opening quote
+    '\u3434': '"',  # 㐴 → closing quote
+    '\u3443': '"',  # 㑃 → quote
+}
+
+
 class TextQualityConfig:
     """Text quality analysis configuration."""
 
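
A quick sanity check of the table above (entries copied from `CJK_COMPAT_CHAR_MAP`; `str.translate` is shown as an equivalent single-pass alternative to the sequential `str.replace` loop the module itself uses):

```python
# Entries copied from CJK_COMPAT_CHAR_MAP above.
MAPPING = {'\u33D9': '(', '\u33DA': ')', '\u33CA': '~', '\u36F3': '\u2192'}

broken = 'RAG \u36F3 2\u33CA6\uac1c\uc6d4 \u33D9beta\u33DA'   # 개월 = "months"
fixed = broken.translate(str.maketrans(MAPPING))
print(fixed)  # RAG → 2~6개월 (beta)
```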
@@ -55,6 +89,15 @@ class TextQualityConfig:
         (0x100000, 0x10FFFD), # Supplementary PUA-B
     ]
 
+    # CJK Compatibility ranges (often indicate broken text from Word->PDF conversion)
+    # These are unit symbols that are rarely used in normal text but appear when
+    # character encoding is broken (e.g., parentheses becoming ㏙, ㏚, etc.)
+    CJK_COMPAT_RANGES = [
+        (0x3300, 0x33FF),  # CJK Compatibility (squared Katakana, units)
+        (0x3200, 0x32FF),  # Enclosed CJK Letters and Months
+        (0x3700, 0x37FF),  # Subset of CJK Extension A (rarely used Hanja)
+    ]
+
     # Control characters and special characters
     CONTROL_RANGES = [
         (0x0000, 0x001F), # C0 controls
@@ -76,6 +119,13 @@ class TextQualityConfig:
     WEIGHT_PUA = 0.4          # PUA character ratio weight
     WEIGHT_REPLACEMENT = 0.3  # Replacement character weight
     WEIGHT_VALID_RATIO = 0.3  # Valid character ratio weight
+    WEIGHT_CJK_COMPAT = 0.5   # CJK Compatibility character weight (broken text indicator)
+
+    # Fragmented text detection settings
+    # When most lines carry only a couple of characters, it indicates a conversion issue
+    FRAGMENTED_TEXT_THRESHOLD = 0.5     # If >=50% of lines have <=3 chars, text is fragmented
+    FRAGMENTED_LINE_CHAR_LIMIT = 3      # Lines with <= this many chars are considered fragmented
+    MIN_LINES_FOR_FRAGMENTED_CHECK = 5  # Minimum lines needed to check for fragmentation
 
 
 # ============================================================================
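
Worked through, the three constants combine like this (a standalone check mirroring `_is_fragmented_text`, defined later in this diff):

```python
text = '현\n재\n시\n장\n에\n대\n한\n이\n해'                 # 9 one-character lines
lines = text.split('\n')
assert len(lines) >= 5                                       # MIN_LINES_FOR_FRAGMENTED_CHECK
non_empty = [ln for ln in lines if ln.strip()]
short = sum(1 for ln in non_empty if len(ln.strip()) <= 3)   # FRAGMENTED_LINE_CHAR_LIMIT
ratio = short / len(non_empty)                               # 9/9 = 1.0
print(ratio >= 0.5)                                          # FRAGMENTED_TEXT_THRESHOLD -> True
```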
@@ -91,8 +141,14 @@ class TextQualityResult:
     replacement_count: int  # Replacement character count
     valid_chars: int        # Valid character count (Korean, English, digits)
     control_chars: int      # Control character count
-    needs_ocr: bool  # Whether OCR is needed
-    details: Dict  # Detailed information
+    cjk_compat_count: int = 0    # CJK Compatibility character count (broken text indicator)
+    is_fragmented: bool = False  # Whether text is fragmented (char-by-char line breaks)
+    needs_ocr: bool = False      # Whether OCR is needed
+    details: Dict = None         # Detailed information
+
+    def __post_init__(self):
+        if self.details is None:
+            self.details = {}
 
 
 @dataclass
@@ -143,6 +199,11 @@ class TextQualityAnalyzer:
         text_blocks = []
         problem_regions = []
 
+        # Count lines to detect the fragmented text pattern (a Word->PDF
+        # conversion issue where each character ends up on a separate line)
+        total_lines = 0
+        total_chars = 0
+
         for block in blocks:
             if block.get("type") != 0:  # Text blocks only
                 continue
@@ -151,9 +212,11 @@ class TextQualityAnalyzer:
             block_text = []
 
             for line in block.get("lines", []):
+                total_lines += 1
                 for span in line.get("spans", []):
                     text = span.get("text", "")
                     if text:
+                        total_chars += len(text.strip())
                         block_text.append(text)
                         all_text.append(text)
 
@@ -175,6 +238,37 @@ class TextQualityAnalyzer:
         full_text = " ".join(all_text)
         overall_quality = self.analyze_text(full_text)
 
+        # Detect fragmented text at page level
+        # If average chars per line is very low, text is likely fragmented
+        if total_lines > 0 and total_chars > 0:
+            avg_chars_per_line = total_chars / total_lines
+            # If average is less than 15 chars per line, text is fragmented
+            page_is_fragmented = avg_chars_per_line < 15 and total_lines >= TextQualityConfig.MIN_LINES_FOR_FRAGMENTED_CHECK
+
+            if page_is_fragmented:
+                logger.info(
+                    f"[QualityAnalyzer] Page {self.page_num + 1}: "
+                    f"Detected fragmented text (avg {avg_chars_per_line:.1f} chars/line, {total_lines} lines)"
+                )
+                # Update overall quality to reflect fragmented status
+                overall_quality = TextQualityResult(
+                    quality_score=max(0.0, overall_quality.quality_score - 0.5),
+                    total_chars=overall_quality.total_chars,
+                    pua_count=overall_quality.pua_count,
+                    replacement_count=overall_quality.replacement_count,
+                    valid_chars=overall_quality.valid_chars,
+                    control_chars=overall_quality.control_chars,
+                    cjk_compat_count=overall_quality.cjk_compat_count,
+                    is_fragmented=True,  # Mark as fragmented
+                    needs_ocr=True,      # Trigger reconstruction
+                    details={
+                        **overall_quality.details,
+                        'is_fragmented': True,
+                        'avg_chars_per_line': avg_chars_per_line,
+                        'total_lines': total_lines,
+                    }
+                )
+
         return PageTextAnalysis(
             page_num=self.page_num,
             quality_result=overall_quality,
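
The page-level check is a plain average. For example, a page whose 48 lines carry 120 characters in total averages 2.5 chars/line, well under the 15-char cutoff hardcoded above:

```python
total_chars, total_lines = 120, 48
avg_chars_per_line = total_chars / total_lines  # 2.5
page_is_fragmented = avg_chars_per_line < 15 and total_lines >= 5
print(avg_chars_per_line, page_is_fragmented)   # 2.5 True
# The page's quality score then drops by 0.5 and needs_ocr is forced True,
# routing the page into reconstruction instead of plain extraction.
```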
@@ -200,6 +294,8 @@ class TextQualityAnalyzer:
                 replacement_count=0,
                 valid_chars=len(text),
                 control_chars=0,
+                cjk_compat_count=0,
+                is_fragmented=False,
                 needs_ocr=False,
                 details={'reason': 'text_too_short'}
             )
@@ -208,6 +304,7 @@ class TextQualityAnalyzer:
         pua_count = 0
         replacement_count = 0
         control_count = 0
+        cjk_compat_count = 0  # CJK Compatibility character count
         valid_chars = 0  # Korean, English, digits, spaces, basic punctuation
 
         # Character-by-character analysis
@@ -219,6 +316,11 @@ class TextQualityAnalyzer:
                 pua_count += 1
                 continue
 
+            # CJK Compatibility check (broken text indicator)
+            if self._is_cjk_compat(code):
+                cjk_compat_count += 1
+                continue
+
             # Replacement character check
             if code == 0xFFFD:
                 replacement_count += 1
@@ -233,19 +335,27 @@ class TextQualityAnalyzer:
             if self._is_valid_char(char, code):
                 valid_chars += 1
 
+        # Check for fragmented text pattern (char-by-char line breaks)
+        is_fragmented = self._is_fragmented_text(text)
+
         # Calculate quality score
         quality_score = self._calculate_quality_score(
             total_chars=total_chars,
             pua_count=pua_count,
             replacement_count=replacement_count,
-            valid_chars=valid_chars
+            valid_chars=valid_chars,
+            cjk_compat_count=cjk_compat_count,
+            is_fragmented=is_fragmented
         )
 
         # Determine OCR necessity
         pua_ratio = pua_count / total_chars if total_chars > 0 else 0
+        cjk_compat_ratio = cjk_compat_count / total_chars if total_chars > 0 else 0
         needs_ocr = (
             quality_score < TextQualityConfig.QUALITY_THRESHOLD or
-            pua_ratio >= TextQualityConfig.PUA_RATIO_THRESHOLD
+            pua_ratio >= TextQualityConfig.PUA_RATIO_THRESHOLD or
+            cjk_compat_ratio >= 0.05 or  # 5% or more CJK compat chars triggers OCR
+            is_fragmented  # Fragmented text always needs OCR
         )
 
         return TextQualityResult(
@@ -255,11 +365,15 @@ class TextQualityAnalyzer:
             replacement_count=replacement_count,
             valid_chars=valid_chars,
             control_chars=control_count,
+            cjk_compat_count=cjk_compat_count,
+            is_fragmented=is_fragmented,
             needs_ocr=needs_ocr,
             details={
                 'pua_ratio': pua_count / total_chars if total_chars > 0 else 0,
                 'replacement_ratio': replacement_count / total_chars if total_chars > 0 else 0,
                 'valid_ratio': valid_chars / total_chars if total_chars > 0 else 0,
+                'cjk_compat_ratio': cjk_compat_count / total_chars if total_chars > 0 else 0,
+                'is_fragmented': is_fragmented,
             }
         )
 
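
To see the new 5% trigger in isolation: a 200-character page with 12 CJK Compatibility characters trips the fallback even if the quality score and PUA ratio would otherwise pass (the other threshold constants are defined outside this hunk):

```python
total_chars = 200
cjk_compat_count = 12
cjk_compat_ratio = cjk_compat_count / total_chars  # 0.06
is_fragmented = False
# Assume quality_score and pua_ratio are both within their thresholds here.
needs_ocr = cjk_compat_ratio >= 0.05 or is_fragmented
print(needs_ocr)  # True
```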
@@ -270,6 +384,57 @@ class TextQualityAnalyzer:
                 return True
         return False
 
+    def _is_cjk_compat(self, code: int) -> bool:
+        """
+        Check if character is in a CJK Compatibility range.
+
+        These characters often indicate broken text from Word->PDF conversion
+        where parentheses, brackets, and other symbols are incorrectly mapped
+        to CJK Compatibility characters (e.g., U+33D9 for '(', U+33DA for ')').
+        """
+        for start, end in TextQualityConfig.CJK_COMPAT_RANGES:
+            if start <= code <= end:
+                return True
+        return False
+
+    def _is_fragmented_text(self, text: str) -> bool:
+        """
+        Detect the fragmented text pattern where most lines hold only a few characters.
+
+        This pattern occurs when Word documents with special layouts
+        (text boxes, vertical text, etc.) are converted to PDF,
+        resulting in characters being stored as separate lines.
+
+        Example of fragmented text:
+            '현\n재\n시\n장\n에\n대\n한\n이\n해'
+        Should be: '현재 시장에 대한 이해'
+
+        Args:
+            text: Text to analyze
+
+        Returns:
+            True if text appears to be fragmented
+        """
+        lines = text.split('\n')
+
+        # Need minimum number of lines to detect pattern
+        if len(lines) < TextQualityConfig.MIN_LINES_FOR_FRAGMENTED_CHECK:
+            return False
+
+        # Count lines with few characters (excluding empty lines)
+        non_empty_lines = [line for line in lines if line.strip()]
+        if not non_empty_lines:
+            return False
+
+        short_line_count = sum(
+            1 for line in non_empty_lines
+            if len(line.strip()) <= TextQualityConfig.FRAGMENTED_LINE_CHAR_LIMIT
+        )
+
+        fragmented_ratio = short_line_count / len(non_empty_lines)
+
+        return fragmented_ratio >= TextQualityConfig.FRAGMENTED_TEXT_THRESHOLD
+
     def _is_control(self, code: int) -> bool:
         """Check if character is a control character."""
         for start, end in TextQualityConfig.CONTROL_RANGES:
@@ -318,7 +483,9 @@ class TextQualityAnalyzer:
         total_chars: int,
         pua_count: int,
         replacement_count: int,
-        valid_chars: int
+        valid_chars: int,
+        cjk_compat_count: int = 0,
+        is_fragmented: bool = False
     ) -> float:
         """Calculate quality score (0.0 ~ 1.0)."""
         if total_chars == 0:
@@ -328,6 +495,7 @@ class TextQualityAnalyzer:
         pua_ratio = pua_count / total_chars
         replacement_ratio = replacement_count / total_chars
         valid_ratio = valid_chars / total_chars
+        cjk_compat_ratio = cjk_compat_count / total_chars
 
         # Calculate weighted score
         # Score decreases with more PUA chars, more replacement chars, lower valid ratio
@@ -339,6 +507,13 @@ class TextQualityAnalyzer:
         # Replacement character penalty
         score -= replacement_ratio * TextQualityConfig.WEIGHT_REPLACEMENT * 3
 
+        # CJK Compatibility character penalty (broken text indicator)
+        score -= cjk_compat_ratio * TextQualityConfig.WEIGHT_CJK_COMPAT * 3
+
+        # Fragmented text penalty (severe quality issue)
+        if is_fragmented:
+            score -= 0.5  # Major penalty for fragmented text
+
         # Valid character ratio adjustment
         score = score * (0.5 + valid_ratio * 0.5)
 
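
A worked instance of the amended scoring, assuming the score starts at 1.0 before the penalty lines (the initialization sits outside this hunk):

```python
WEIGHT_PUA, WEIGHT_REPLACEMENT, WEIGHT_CJK_COMPAT = 0.4, 0.3, 0.5
pua_ratio, replacement_ratio, cjk_compat_ratio = 0.0, 0.0, 0.06
valid_ratio, is_fragmented = 0.85, True

score = 1.0                                          # assumed starting value
score -= pua_ratio * WEIGHT_PUA * 3                  # -0.00
score -= replacement_ratio * WEIGHT_REPLACEMENT * 3  # -0.00
score -= cjk_compat_ratio * WEIGHT_CJK_COMPAT * 3    # -0.09
if is_fragmented:
    score -= 0.5                                     # fragmented-text penalty
score *= 0.5 + valid_ratio * 0.5                     # 0.41 * 0.925
print(round(score, 3))  # 0.379
```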
@@ -592,7 +767,26 @@ class QualityAwareTextExtractor:
             text = self.page.get_text("text")
             return text, analysis
 
-        # 3. OCR fallback if quality is low
+        # 3. Try text reconstruction first (before OCR)
+        # This is more reliable than OCR for fragmented text from Word->PDF conversion
+        if analysis.quality_result.is_fragmented or analysis.quality_result.cjk_compat_count > 0:
+            logger.info(
+                f"[QualityAware] Page {self.page_num + 1}: "
+                f"Attempting text reconstruction "
+                f"(fragmented={analysis.quality_result.is_fragmented}, "
+                f"cjk_compat={analysis.quality_result.cjk_compat_count})"
+            )
+
+            reconstructor = FragmentedTextReconstructor(self.page, self.page_num)
+            reconstructed_text = reconstructor.reconstruct()
+
+            if reconstructed_text:
+                # Apply CJK Compatibility character mapping
+                cleaned_text = apply_cjk_compat_mapping(reconstructed_text)
+                analysis.ocr_text = f"[Reconstructed] {len(cleaned_text)} chars"
+                return cleaned_text, analysis
+
+        # 4. OCR fallback if reconstruction fails
         logger.info(
             f"[QualityAware] Page {self.page_num + 1}: "
             f"Quality too low ({analysis.quality_result.quality_score:.2f}), "
@@ -641,6 +835,348 @@ class QualityAwareTextExtractor:
         return "\n".join(merged_parts)
 
 
+# ============================================================================
+# Fragmented Text Reconstructor
+# ============================================================================
+
+class FragmentedTextReconstructor:
+    """
+    Reconstructs fragmented text from PDF pages.
+
+    When Word documents with special layouts (text boxes, vertical text, etc.)
+    are converted to PDF, characters may be stored as separate lines.
+    This class reconstructs the text by analyzing character positions.
+
+    Example:
+        Input:  '현\\n재\\n시\\n장\\n에\\n대\\n한\\n이\\n해'
+        Output: '현재 시장에 대한 이해'
+    """
+
+    def __init__(self, page, page_num: int, y_tolerance: float = 3.0,
+                 exclude_bboxes: List[Tuple[float, float, float, float]] = None):
+        """
+        Args:
+            page: PyMuPDF page object
+            page_num: Page number (0-indexed)
+            y_tolerance: Y coordinate tolerance for same-line detection
+            exclude_bboxes: List of bounding boxes to exclude (e.g., table regions)
+        """
+        self.page = page
+        self.page_num = page_num
+        self.y_tolerance = y_tolerance
+        self.exclude_bboxes = exclude_bboxes or []
+
+    def reconstruct(self) -> str:
+        """
+        Reconstruct fragmented text using character position data.
+
+        Returns:
+            Reconstructed text with proper line breaks
+        """
+        try:
+            # Extract character-level position data
+            raw_dict = self.page.get_text("rawdict")
+            all_chars = self._extract_chars(raw_dict)
+
+            if not all_chars:
+                logger.warning(f"[Reconstruct] Page {self.page_num + 1}: No characters found")
+                return ""
+
+            # Group characters by Y coordinate (same line)
+            lines_by_y = self._group_by_y(all_chars)
+
+            # Sort each line by X coordinate and build text
+            reconstructed_lines = self._build_lines(lines_by_y)
+
+            result = "\n".join(reconstructed_lines)
+
+            logger.info(
+                f"[Reconstruct] Page {self.page_num + 1}: "
+                f"Reconstructed {len(all_chars)} chars into {len(reconstructed_lines)} lines"
+            )
+
+            return result
+
+        except Exception as e:
+            logger.error(f"[Reconstruct] Page {self.page_num + 1} failed: {e}")
+            return ""
+
+    def reconstruct_with_sections(self) -> List[Dict]:
+        """
+        Reconstruct fragmented text, split into sections by table positions.
+
+        This method returns multiple text sections with their Y-coordinate ranges,
+        allowing proper positioning relative to tables.
+
+        Returns:
+            List of dicts: [{'text': str, 'y_start': float, 'y_end': float}, ...]
+        """
+        try:
+            raw_dict = self.page.get_text("rawdict")
+            all_chars = self._extract_chars(raw_dict)
+
+            if not all_chars:
+                logger.warning(f"[Reconstruct] Page {self.page_num + 1}: No characters found")
+                return []
+
+            # Group characters by Y coordinate
+            lines_by_y = self._group_by_y(all_chars)
+
+            if not lines_by_y:
+                return []
+
+            # Get sorted Y positions of tables (exclusion regions)
+            table_y_ranges = []
+            for bbox in self.exclude_bboxes:
+                table_y_ranges.append((bbox[1], bbox[3]))  # (y_start, y_end)
+            table_y_ranges.sort(key=lambda x: x[0])
+
+            if not table_y_ranges:
+                # No tables - return single section
+                section_text = self._build_section_text(list(lines_by_y.keys()), lines_by_y)
+                if section_text.strip():
+                    sorted_ys = sorted(lines_by_y.keys())
+                    return [{
+                        'text': section_text,
+                        'y_start': sorted_ys[0],
+                        'y_end': sorted_ys[-1]
+                    }]
+                return []
+
+            # Split lines into sections based on table positions
+            # Key insight: when we skip from a Y before a table to a Y after it,
+            # we need to split the section
+            sections = []
+            current_section_lines = []
+            current_y_start = None
+            current_y_end = None
+
+            sorted_ys = sorted(lines_by_y.keys())
+
+            for y in sorted_ys:
+                # Check if we're jumping over a table
+                should_split = False
+                if current_y_end is not None:
+                    for table_y_start, table_y_end in table_y_ranges:
+                        # If previous line was before table start AND current line is after table end
+                        # (meaning we jumped over the table)
+                        if current_y_end < table_y_start and y > table_y_end:
+                            should_split = True
+                            break
+
+                if should_split and current_section_lines:
+                    # Save current section (text BEFORE the table)
+                    section_text = self._build_section_text(current_section_lines, lines_by_y)
+                    if section_text.strip():
+                        sections.append({
+                            'text': section_text,
+                            'y_start': current_y_start,
+                            'y_end': current_y_end
+                        })
+                    current_section_lines = []
+                    current_y_start = None
+
+                # Add line to current section
+                current_section_lines.append(y)
+                if current_y_start is None:
+                    current_y_start = y
+                current_y_end = y
+
+            # Don't forget the last section (text AFTER the last table, or all text if no split)
+            if current_section_lines:
+                section_text = self._build_section_text(current_section_lines, lines_by_y)
+                if section_text.strip():
+                    sections.append({
+                        'text': section_text,
+                        'y_start': current_y_start,
+                        'y_end': current_y_end
+                    })
+
+            logger.info(
+                f"[Reconstruct] Page {self.page_num + 1}: "
+                f"Split into {len(sections)} sections around {len(table_y_ranges)} tables"
+            )
+
+            return sections
+
+        except Exception as e:
+            logger.error(f"[Reconstruct] Page {self.page_num + 1} sections failed: {e}")
+            return []
+
+    def _build_section_text(self, y_positions: List[float], lines_by_y: Dict) -> str:
+        """Build text from a list of Y positions."""
+        lines = []
+        for y in sorted(y_positions):
+            chars = lines_by_y.get(y, [])
+            chars_sorted = sorted(chars, key=lambda c: c['bbox'][0])
+
+            if not chars_sorted:
+                continue
+
+            line_text = ""
+            prev_x_end = None
+
+            for char_info in chars_sorted:
+                x_start = char_info['bbox'][0]
+                char = char_info['c']
+
+                if prev_x_end is not None:
+                    gap = x_start - prev_x_end
+                    avg_char_width = char_info['size'] * 0.5
+                    if gap > avg_char_width * 0.5:
+                        line_text += " "
+
+                line_text += char
+                prev_x_end = char_info['bbox'][2]
+
+            if line_text.strip():
+                lines.append(line_text)
+
+        return "\n".join(lines)
+
+    def _extract_chars(self, raw_dict: Dict) -> List[Dict]:
+        """Extract all characters with position info from rawdict.
+
+        Characters inside exclude_bboxes (e.g., table regions) are filtered out.
+        """
+        all_chars = []
+
+        for block in raw_dict.get('blocks', []):
+            if block.get('type') != 0:  # Text blocks only
+                continue
+
+            for line in block.get('lines', []):
+                for span in line.get('spans', []):
+                    font = span.get('font', '')
+                    size = span.get('size', 0)
+
+                    for char in span.get('chars', []):
+                        char_bbox = char.get('bbox', [0, 0, 0, 0])
+
+                        # Skip characters inside excluded regions (e.g., tables)
+                        if self._is_inside_excluded_bbox(char_bbox):
+                            continue
+
+                        char_info = {
+                            'c': char.get('c', ''),
+                            'bbox': char_bbox,
+                            'origin': char.get('origin', [0, 0]),
+                            'font': font,
+                            'size': size,
+                        }
+                        all_chars.append(char_info)
+
+        return all_chars
+
+    def _is_inside_excluded_bbox(self, char_bbox: List[float]) -> bool:
+        """Check if character is inside any excluded bbox.
+
+        Args:
+            char_bbox: Character bounding box [x0, y0, x1, y1]
+
+        Returns:
+            True if character center is inside any excluded region
+        """
+        if not self.exclude_bboxes:
+            return False
+
+        # Use character center point for check
+        char_center_x = (char_bbox[0] + char_bbox[2]) / 2
+        char_center_y = (char_bbox[1] + char_bbox[3]) / 2
+
+        for bbox in self.exclude_bboxes:
+            # bbox = (x0, y0, x1, y1)
+            if (bbox[0] <= char_center_x <= bbox[2] and
+                    bbox[1] <= char_center_y <= bbox[3]):
+                return True
+
+        return False
+
+    def _group_by_y(self, chars: List[Dict]) -> Dict[float, List[Dict]]:
+        """Group characters by Y coordinate with tolerance."""
+        lines_by_y = {}
+
+        for char_info in chars:
+            # Use origin Y if available, otherwise use bbox Y
+            y = char_info['origin'][1] if char_info['origin'] else char_info['bbox'][1]
+
+            # Find existing Y group within tolerance
+            found_y = None
+            for existing_y in lines_by_y.keys():
+                if abs(existing_y - y) <= self.y_tolerance:
+                    found_y = existing_y
+                    break
+
+            if found_y is None:
+                found_y = y
+                lines_by_y[found_y] = []
+
+            lines_by_y[found_y].append(char_info)
+
+        return lines_by_y
+
+    def _build_lines(self, lines_by_y: Dict[float, List[Dict]]) -> List[str]:
+        """Build text lines from character groups."""
+        reconstructed_lines = []
+
+        for y in sorted(lines_by_y.keys()):
+            chars = lines_by_y[y]
+            chars_sorted = sorted(chars, key=lambda c: c['bbox'][0])
+
+            if not chars_sorted:
+                continue
+
+            # Build line text with appropriate spacing
+            line_text = ""
+            prev_x_end = None
+
+            for char_info in chars_sorted:
+                x_start = char_info['bbox'][0]
+                char = char_info['c']
+
+                if prev_x_end is not None:
+                    gap = x_start - prev_x_end
+                    # Add space if gap is significant
+                    avg_char_width = char_info['size'] * 0.5
+                    if gap > avg_char_width * 0.5:
+                        line_text += " "
+
+                line_text += char
+                prev_x_end = char_info['bbox'][2]
+
+            if line_text.strip():
+                reconstructed_lines.append(line_text)
+
+        return reconstructed_lines
+
+
+# ============================================================================
+# CJK Compatibility Character Mapping Function
+# ============================================================================
+
+def apply_cjk_compat_mapping(text: str) -> str:
+    """
+    Replace CJK Compatibility characters with their intended characters.
+
+    These characters appear when Word documents are converted to PDF
+    and font encoding is not properly preserved.
+
+    Args:
+        text: Text containing CJK Compatibility characters
+
+    Returns:
+        Text with characters replaced
+    """
+    if not text:
+        return text
+
+    result = text
+    for cjk_char, replacement in CJK_COMPAT_CHAR_MAP.items():
+        result = result.replace(cjk_char, replacement)
+
+    return result
+
+
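
End to end, the two new exports combine like this (a sketch; 'sample.pdf' is a hypothetical Word-exported file and PyMuPDF must be installed):

```python
import fitz  # PyMuPDF

from xgen_doc2chunk.core.processor.pdf_helpers.pdf_text_quality_analyzer import (
    FragmentedTextReconstructor,
    apply_cjk_compat_mapping,
)

doc = fitz.open('sample.pdf')  # hypothetical Word-exported PDF
page = doc[0]

# Regroup per-character fragments by Y (lines), then X (reading order) ...
reconstructor = FragmentedTextReconstructor(page, page_num=0, y_tolerance=3.0)
text = reconstructor.reconstruct()
# ... then repair punctuation that was mis-mapped into CJK Compatibility chars.
print(apply_cjk_compat_mapping(text)[:200])
```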
 # ============================================================================
 # Export
 # ============================================================================
@@ -652,4 +1188,7 @@ __all__ = [
     'TextQualityAnalyzer',
     'PageOCRFallbackEngine',
     'QualityAwareTextExtractor',
+    'FragmentedTextReconstructor',
+    'apply_cjk_compat_mapping',
+    'CJK_COMPAT_CHAR_MAP',
 ]
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xgen-doc2chunk
-Version: 0.1.5
+Version: 0.1.52
 Summary: Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking
 Project-URL: Homepage, https://github.com/master0419/doc2chunk
 Project-URL: Documentation, https://github.com/master0419/doc2chunk#readme
@@ -113,11 +113,11 @@ xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py,sha256=7ZTeHXAfUqa_W9H
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py,sha256=4kpY8WY9hH-cfjd-Ai6vA4V7I8KwE5hSq8Yt4QXliqM,3009
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py,sha256=qPgtMTMbaTm7_QyU7kKwVDtGAldf_yV4rTyoGVVgkTU,3406
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py,sha256=bwD6MVUuZJVYe3bWDsD6BpK1UZKKPsVyKOG6oHeoumw,47042
-xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py,sha256=cqoMzSySnapXRkELtmOahpmWyBnc1TquXPz1IqRqDSk,28168
+xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py,sha256=H9bw3SybQJubvtjTqRrJNFviLFc2OMtWDv2HNTETxf0,28544
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py,sha256=v6VH-E6clI71-G2zJcT5754VFcPYqb1Qz4l3UcPeDeM,27863
-xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py,sha256=7qI_kcY-scGaLPChkAeCtkQD9GAsD_NryMQw1nNMUwU,16075
-xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py,sha256=wAnOCAQ3cTsVgMg0uVavodZHV2DAvrVkugqA0c4MhTY,4754
-xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py,sha256=8rCAnLvNRSVvIAbEiggXawrMOo-zWpMxwDc5Rrk19Co,22520
+xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py,sha256=rI5QAdqqJfiITZxu4bAf50pD7aIjVlhkYFsc2pt4i8c,16085
+xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py,sha256=go259muoxeIxpN1TEiPNdwVkdVb1_YX8BeGO7HS0-jE,8177
+xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py,sha256=_4IoDk15yIMvilcDlSxqiUlNLA9xUV1k69UmlzBq5aI,44641
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py,sha256=W72HOARz7LjSzwzFTLo4-XTDQWvwBTGlqdovFyPBU7M,4724
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py,sha256=KWkaj7LT5ih5Nkb2EDggA02JuHIsIy3Sbm7pVIhxWuE,11736
 xgen_doc2chunk/core/processor/pdf_helpers/types.py,sha256=IXV493hkpPa67DPZfH319m2rh6sIgL0R4nOd6pcd-to,9030
@@ -155,7 +155,7 @@ xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py,sha256=4kIPb8u2_GSJ435GHJFXiIeQavMv
 xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py,sha256=A4V_AcC0tySYB4q-lNW7Tuhg7aTq0atj_RhMrCftKsM,2972
 xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py,sha256=ZN-3Dq1BehFmwFvxTaYmiEAdFUqujviONNDiR8c5X4A,3194
 xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py,sha256=TeQOdPCPKQW8o4IyUb-4o6v6uTVzKupr4qh9NLjIj24,3672
-xgen_doc2chunk-0.1.5.dist-info/METADATA,sha256=qBfTY7YCh61_spWvm_TkEaN9zLeOKKz0LdzpMD_RKgM,7623
-xgen_doc2chunk-0.1.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-xgen_doc2chunk-0.1.5.dist-info/licenses/LICENSE,sha256=pokMTCMoEcrcnjBAJ8cb7UVADBMGce6GLFbbRfqJVJc,11346
-xgen_doc2chunk-0.1.5.dist-info/RECORD,,
+xgen_doc2chunk-0.1.52.dist-info/METADATA,sha256=M63N__jN6H7F3XFKtOM-Um0-TG0uTsknck9YnAZTQOk,7624
+xgen_doc2chunk-0.1.52.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+xgen_doc2chunk-0.1.52.dist-info/licenses/LICENSE,sha256=pokMTCMoEcrcnjBAJ8cb7UVADBMGce6GLFbbRfqJVJc,11346
+xgen_doc2chunk-0.1.52.dist-info/RECORD,,