xgen-doc2chunk 0.1.5__py3-none-any.whl → 0.1.52__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,6 +25,9 @@ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_utils import (
 )
 from xgen_doc2chunk.core.processor.pdf_helpers.pdf_table_detection import TableDetectionEngine
 from xgen_doc2chunk.core.processor.pdf_helpers.pdf_cell_analysis import CellAnalysisEngine
+from xgen_doc2chunk.core.processor.pdf_helpers.pdf_text_quality_analyzer import (
+    apply_cjk_compat_mapping,
+)
 
 logger = logging.getLogger("document-processor")
 
@@ -873,7 +876,12 @@ def generate_html_from_cells(
             content = ""
             if col_idx < len(row_data):
                 content = row_data[col_idx]
-            content = escape_html(str(content).strip() if content else "")
+
+            # Apply CJK Compatibility character mapping to fix broken characters
+            # (e.g., 㛳 → →, ㏙ → (, ㏚ → ) from Word→PDF conversion)
+            content = str(content).strip() if content else ""
+            content = apply_cjk_compat_mapping(content)
+            content = escape_html(content)
 
             # Get span info (default to 1 if not found)
             spans = span_map.get((row_idx, col_idx), {'rowspan': 1, 'colspan': 1})
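
Note the new ordering above: strip, then map, then escape. A minimal self-contained sketch of that order, assuming `escape_html` simply HTML-escapes (the real helper lives elsewhere in the package) and borrowing a few entries from `CJK_COMPAT_CHAR_MAP` defined later in this diff:

```python
import html

# Assumption: the package's escape_html behaves like html.escape.
def escape_html(s: str) -> str:
    return html.escape(s)

# Subset of CJK_COMPAT_CHAR_MAP (defined later in this diff).
CJK_COMPAT_CHAR_MAP = {'\u3689': '+', '\u36F3': '\u2192', '\u33D9': '(', '\u33DA': ')'}

def apply_cjk_compat_mapping(text: str) -> str:
    for bad, good in CJK_COMPAT_CHAR_MAP.items():
        text = text.replace(bad, good)
    return text

cell = '  Vector \u3689 Graph \u36F3 RAG \u33D9hybrid\u33DA  '
cell = str(cell).strip()               # 1. normalize and strip
cell = apply_cjk_compat_mapping(cell)  # 2. repair broken characters
cell = escape_html(cell)               # 3. escape last
print(cell)  # Vector + Graph → RAG (hybrid)
```

Mapping before escaping matters: if the map ever emits HTML-special characters such as quotes, they are still escaped afterwards.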
@@ -383,11 +383,11 @@ class TableQualityValidator:
         # if num_rows > 5 and col2_has_paragraphs >= 2:
         #     return False, f"col2_paragraphs({col2_has_paragraphs})"
 
-        # Pattern 3: If first column is short and second is long overall, likely body text not key-value
-        if num_rows > 10:
-            col1_short_ratio = (col1_empty_count + col1_short_count) / num_rows
-            if col1_short_ratio >= 0.8 and col2_long_count >= 5:
-                return False, f"asymmetric_cols(short1={col1_short_ratio:.0%}, long2={col2_long_count})"
+        # # Pattern 3: If first column is short and second is long overall, likely body text not key-value
+        # if num_rows > 10:
+        #     col1_short_ratio = (col1_empty_count + col1_short_count) / num_rows
+        #     if col1_short_ratio >= 0.8 and col2_long_count >= 5:
+        #         return False, f"asymmetric_cols(short1={col1_short_ratio:.0%}, long2={col2_long_count})"
 
         return True, "valid"
 
@@ -3,6 +3,9 @@
 PDF Text Extraction Module
 
 Provides functions for extracting text blocks from PDF pages.
+Includes support for:
+- Fragmented text reconstruction (Word->PDF conversion issues)
+- CJK Compatibility character mapping (broken character fixes)
 """
 import logging
 from typing import List, Tuple
@@ -17,6 +20,8 @@ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_text_quality_analyzer import
     TextQualityAnalyzer,
     QualityAwareTextExtractor,
     PageOCRFallbackEngine,
+    FragmentedTextReconstructor,
+    apply_cjk_compat_mapping,
 )
 
 logger = logging.getLogger("document-processor")
@@ -53,13 +58,76 @@ def extract_text_blocks(
     analyzer = TextQualityAnalyzer(page, page_num)
     page_analysis = analyzer.analyze_page()
 
-    # If quality is too low, use full page OCR fallback
+    # If quality is low, try text reconstruction first (before OCR)
     if page_analysis.quality_result.needs_ocr:
+        quality_result = page_analysis.quality_result
         logger.info(
-            f"[PDF] Page {page_num + 1}: Low text quality "
-            f"({page_analysis.quality_result.quality_score:.2f}), "
-            f"PUA={page_analysis.quality_result.pua_count}, "
-            f"using OCR fallback"
+            f"[PDF] Page {page_num + 1}: Low text quality detected - "
+            f"score={quality_result.quality_score:.2f}, "
+            f"PUA={quality_result.pua_count}, "
+            f"CJK_Compat={quality_result.cjk_compat_count}, "
+            f"fragmented={quality_result.is_fragmented}"
+        )
+
+        # Try reconstruction for fragmented text or CJK Compat issues
+        if quality_result.is_fragmented or quality_result.cjk_compat_count > 0:
+            logger.info(
+                f"[PDF] Page {page_num + 1}: Attempting text reconstruction "
+                f"(excluding {len(table_bboxes)} table regions)"
+            )
+
+            # Exclude table regions from reconstruction to avoid duplication
+            reconstructor = FragmentedTextReconstructor(
+                page, page_num, exclude_bboxes=table_bboxes
+            )
+
+            # Use section-based reconstruction for proper table positioning
+            if table_bboxes:
+                sections = reconstructor.reconstruct_with_sections()
+
+                if sections:
+                    result_elements = []
+                    for section in sections:
+                        # Apply CJK Compatibility character mapping
+                        cleaned_text = apply_cjk_compat_mapping(section['text'])
+
+                        if cleaned_text.strip():
+                            # Create element with proper Y position for sorting
+                            result_elements.append(PageElement(
+                                element_type=ElementType.TEXT,
+                                content=cleaned_text,
+                                bbox=(0, section['y_start'], page.rect.width, section['y_end']),
+                                page_num=page_num
+                            ))
+
+                    if result_elements:
+                        logger.info(
+                            f"[PDF] Page {page_num + 1}: Text reconstruction successful "
+                            f"({len(result_elements)} sections)"
+                        )
+                        return result_elements
+            else:
+                # No tables - use simple reconstruction
+                reconstructed_text = reconstructor.reconstruct()
+
+                if reconstructed_text:
+                    cleaned_text = apply_cjk_compat_mapping(reconstructed_text)
+
+                    logger.info(
+                        f"[PDF] Page {page_num + 1}: Text reconstruction successful "
+                        f"({len(cleaned_text)} chars)"
+                    )
+
+                    return [PageElement(
+                        element_type=ElementType.TEXT,
+                        content=cleaned_text,
+                        bbox=(0, 0, page.rect.width, page.rect.height),
+                        page_num=page_num
+                    )]
+
+        # Fall back to OCR if reconstruction not applicable
+        logger.info(
+            f"[PDF] Page {page_num + 1}: Using OCR fallback"
         )
 
         extractor = QualityAwareTextExtractor(page, page_num)
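
The section-based branch above consumes the `{'text', 'y_start', 'y_end'}` dicts that `reconstruct_with_sections()` returns (see the class later in this diff). A small sketch of how those sections become position-aware elements, with `PageElement` and `ElementType` mocked since the real types live in the package's types module:

```python
from dataclasses import dataclass
from enum import Enum, auto
from typing import Tuple

class ElementType(Enum):  # stand-in for the package's ElementType
    TEXT = auto()

@dataclass
class PageElement:        # stand-in for the package's PageElement
    element_type: ElementType
    content: str
    bbox: Tuple[float, float, float, float]
    page_num: int

page_width = 595.0  # assumed page width in points
sections = [        # example of the shape reconstruct_with_sections() returns
    {'text': 'Paragraph above the table', 'y_start': 72.0, 'y_end': 140.0},
    {'text': 'Paragraph below the table', 'y_start': 480.0, 'y_end': 560.0},
]

elements = [
    PageElement(ElementType.TEXT, s['text'],
                (0, s['y_start'], page_width, s['y_end']), page_num=0)
    for s in sections if s['text'].strip()
]
# Real Y ranges in bbox let downstream sorting interleave text and tables correctly.
print([e.bbox for e in elements])
```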
@@ -12,20 +12,23 @@ Characteristics of Broken Text:
 3. Invalid Korean character combinations (only consonants/vowels in sequence)
 4. Meaningless Korean syllable sequences (random combinations, not real words)
 5. Mixture of CJK characters with PUA/control characters
+6. CJK Compatibility characters used instead of normal punctuation
+7. Fragmented text where each character is on a separate line
 
 =============================================================================
 Resolution Strategy:
 =============================================================================
 1. Calculate text quality score (0.0 ~ 1.0)
-2. Perform OCR fallback if quality is below threshold
-3. Apply OCR to entire page or specific regions
+2. For fragmented text: Reconstruct using character position data
+3. For CJK Compatibility characters: Map to correct characters
+4. Perform OCR fallback only if reconstruction fails
 """
 
 import logging
 import re
 import unicodedata
 from typing import List, Dict, Tuple, Optional, Set
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
 import fitz
 from PIL import Image
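
Taken together, the amended strategy is a decision ladder. A compact sketch with stand-in types and callables (not the module's actual control flow, which is spread across the extractors below):

```python
from dataclasses import dataclass
from typing import Callable

@dataclass
class Quality:  # minimal stand-in for TextQualityResult
    needs_ocr: bool
    is_fragmented: bool
    cjk_compat_count: int

def resolve(q: Quality, raw_text: str,
            reconstruct: Callable[[], str],
            clean: Callable[[str], str],
            ocr: Callable[[], str]) -> str:
    if not q.needs_ocr:                            # 1. score is acceptable
        return raw_text
    if q.is_fragmented or q.cjk_compat_count > 0:  # 2. positional reconstruction
        rebuilt = reconstruct()
        if rebuilt:
            return clean(rebuilt)                  # 3. CJK compat mapping
    return ocr()                                   # 4. OCR only as a last resort

q = Quality(needs_ocr=True, is_fragmented=True, cjk_compat_count=0)
print(resolve(q, 'raw', lambda: '현재 시장', lambda t: t, lambda: 'ocr text'))
```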
@@ -35,9 +38,40 @@ logger = logging.getLogger(__name__)
 
 
 # ============================================================================
-# Configuration
+# CJK Compatibility Character Mapping
 # ============================================================================
 
+# Map CJK Compatibility characters to their intended characters
+# These occur when Word documents are converted to PDF with font issues
+CJK_COMPAT_CHAR_MAP = {
+    # Parentheses
+    '\u33D9': '(',  # ㏙ → (
+    '\u33DA': ')',  # ㏚ → )
+
+    # Brackets (section markers)
+    '\u33DB': '[',  # ㏛ → [ (or could be 【)
+    '\u33DC': ']',  # ㏜ → ] (or could be 】)
+    '\u33DD': '[',  # ㏝ → [ (section start)
+    '\u33DE': ']',  # ㏞ → ] (section end)
+
+    # Arrows and connectors
+    '\u3711': '→',  # 㜑 → arrow
+    '\u36A8': '/',  # 㚨 → / or +
+    '\u36F3': '→',  # 㛳 → arrow (Word→PDF conversion often maps arrows to this)
+    '\u3689': '+',  # 㚉 → + (plus sign, e.g., Vector + Graph)
+
+    # Range indicator
+    '\u33CA': '~',  # ㏊ → ~ (range, e.g., 2~6개월)
+
+    # Quotation marks
+    '\u3431': '"',  # 㐱 → opening quote
+    '\u3432': '"',  # 㐲 → closing quote
+    '\u3433': '"',  # 㐳 → opening quote
+    '\u3434': '"',  # 㐴 → closing quote
+    '\u3443': '"',  # 㑃 → quote
+}
+
+
 class TextQualityConfig:
     """Text quality analysis configuration."""
 
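
A quick sanity check of the table above (entries copied from `CJK_COMPAT_CHAR_MAP`; `str.translate` is shown as an equivalent single-pass alternative to the sequential `str.replace` loop the module itself uses):

```python
# Entries copied from CJK_COMPAT_CHAR_MAP above.
MAPPING = {'\u33D9': '(', '\u33DA': ')', '\u33CA': '~', '\u36F3': '\u2192'}

broken = 'RAG \u36F3 2\u33CA6\uac1c\uc6d4 \u33D9beta\u33DA'   # 개월 = "months"
fixed = broken.translate(str.maketrans(MAPPING))
print(fixed)  # RAG → 2~6개월 (beta)
```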
@@ -55,6 +89,15 @@ class TextQualityConfig:
         (0x100000, 0x10FFFD), # Supplementary PUA-B
     ]
 
+    # CJK Compatibility ranges (often indicate broken text from Word->PDF conversion)
+    # These are unit symbols that are rarely used in normal text but appear when
+    # character encoding is broken (e.g., parentheses becoming ㏙, ㏚, etc.)
+    CJK_COMPAT_RANGES = [
+        (0x3300, 0x33FF),  # CJK Compatibility (squared Katakana, units)
+        (0x3200, 0x32FF),  # Enclosed CJK Letters and Months
+        (0x3700, 0x37FF),  # Subset of CJK Extension A (rarely used Hanja)
+    ]
+
     # Control characters and special characters
     CONTROL_RANGES = [
         (0x0000, 0x001F), # C0 controls
@@ -76,6 +119,13 @@ class TextQualityConfig:
     WEIGHT_PUA = 0.4          # PUA character ratio weight
     WEIGHT_REPLACEMENT = 0.3  # Replacement character weight
     WEIGHT_VALID_RATIO = 0.3  # Valid character ratio weight
+    WEIGHT_CJK_COMPAT = 0.5   # CJK Compatibility character weight (broken text indicator)
+
+    # Fragmented text detection settings
+    # When most lines carry only a couple of characters, it indicates a conversion issue
+    FRAGMENTED_TEXT_THRESHOLD = 0.5     # If >=50% of lines have <=3 chars, text is fragmented
+    FRAGMENTED_LINE_CHAR_LIMIT = 3      # Lines with <= this many chars are considered fragmented
+    MIN_LINES_FOR_FRAGMENTED_CHECK = 5  # Minimum lines needed to check for fragmentation
 
 
 # ============================================================================
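
Worked through, the three constants combine like this (a standalone check mirroring `_is_fragmented_text`, defined later in this diff):

```python
text = '현\n재\n시\n장\n에\n대\n한\n이\n해'                 # 9 one-character lines
lines = text.split('\n')
assert len(lines) >= 5                                       # MIN_LINES_FOR_FRAGMENTED_CHECK
non_empty = [ln for ln in lines if ln.strip()]
short = sum(1 for ln in non_empty if len(ln.strip()) <= 3)   # FRAGMENTED_LINE_CHAR_LIMIT
ratio = short / len(non_empty)                               # 9/9 = 1.0
print(ratio >= 0.5)                                          # FRAGMENTED_TEXT_THRESHOLD -> True
```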
@@ -91,8 +141,14 @@ class TextQualityResult:
     replacement_count: int  # Replacement character count
     valid_chars: int        # Valid character count (Korean, English, digits)
     control_chars: int      # Control character count
-    needs_ocr: bool  # Whether OCR is needed
-    details: Dict  # Detailed information
+    cjk_compat_count: int = 0    # CJK Compatibility character count (broken text indicator)
+    is_fragmented: bool = False  # Whether text is fragmented (char-by-char line breaks)
+    needs_ocr: bool = False      # Whether OCR is needed
+    details: Dict = None         # Detailed information
+
+    def __post_init__(self):
+        if self.details is None:
+            self.details = {}
 
 
 @dataclass
@@ -143,6 +199,11 @@ class TextQualityAnalyzer:
         text_blocks = []
         problem_regions = []
 
+        # Count lines to detect the fragmented text pattern (a Word->PDF
+        # conversion issue where each character ends up on a separate line)
+        total_lines = 0
+        total_chars = 0
+
         for block in blocks:
             if block.get("type") != 0:  # Text blocks only
                 continue
@@ -151,9 +212,11 @@ class TextQualityAnalyzer:
             block_text = []
 
             for line in block.get("lines", []):
+                total_lines += 1
                 for span in line.get("spans", []):
                     text = span.get("text", "")
                     if text:
+                        total_chars += len(text.strip())
                         block_text.append(text)
                         all_text.append(text)
 
@@ -175,6 +238,37 @@ class TextQualityAnalyzer:
         full_text = " ".join(all_text)
         overall_quality = self.analyze_text(full_text)
 
+        # Detect fragmented text at page level
+        # If average chars per line is very low, text is likely fragmented
+        if total_lines > 0 and total_chars > 0:
+            avg_chars_per_line = total_chars / total_lines
+            # If average is less than 15 chars per line, text is fragmented
+            page_is_fragmented = avg_chars_per_line < 15 and total_lines >= TextQualityConfig.MIN_LINES_FOR_FRAGMENTED_CHECK
+
+            if page_is_fragmented:
+                logger.info(
+                    f"[QualityAnalyzer] Page {self.page_num + 1}: "
+                    f"Detected fragmented text (avg {avg_chars_per_line:.1f} chars/line, {total_lines} lines)"
+                )
+                # Update overall quality to reflect fragmented status
+                overall_quality = TextQualityResult(
+                    quality_score=max(0.0, overall_quality.quality_score - 0.5),
+                    total_chars=overall_quality.total_chars,
+                    pua_count=overall_quality.pua_count,
+                    replacement_count=overall_quality.replacement_count,
+                    valid_chars=overall_quality.valid_chars,
+                    control_chars=overall_quality.control_chars,
+                    cjk_compat_count=overall_quality.cjk_compat_count,
+                    is_fragmented=True,  # Mark as fragmented
+                    needs_ocr=True,      # Trigger reconstruction
+                    details={
+                        **overall_quality.details,
+                        'is_fragmented': True,
+                        'avg_chars_per_line': avg_chars_per_line,
+                        'total_lines': total_lines,
+                    }
+                )
+
         return PageTextAnalysis(
             page_num=self.page_num,
             quality_result=overall_quality,
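
The page-level check is a plain average. For example, a page whose 48 lines carry 120 characters in total averages 2.5 chars/line, well under the 15-char cutoff hardcoded above:

```python
total_chars, total_lines = 120, 48
avg_chars_per_line = total_chars / total_lines  # 2.5
page_is_fragmented = avg_chars_per_line < 15 and total_lines >= 5
print(avg_chars_per_line, page_is_fragmented)   # 2.5 True
# The page's quality score then drops by 0.5 and needs_ocr is forced True,
# routing the page into reconstruction instead of plain extraction.
```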
@@ -200,6 +294,8 @@ class TextQualityAnalyzer:
                 replacement_count=0,
                 valid_chars=len(text),
                 control_chars=0,
+                cjk_compat_count=0,
+                is_fragmented=False,
                 needs_ocr=False,
                 details={'reason': 'text_too_short'}
             )
@@ -208,6 +304,7 @@ class TextQualityAnalyzer:
         pua_count = 0
         replacement_count = 0
         control_count = 0
+        cjk_compat_count = 0  # CJK Compatibility character count
         valid_chars = 0  # Korean, English, digits, spaces, basic punctuation
 
         # Character-by-character analysis
@@ -219,6 +316,11 @@ class TextQualityAnalyzer:
                 pua_count += 1
                 continue
 
+            # CJK Compatibility check (broken text indicator)
+            if self._is_cjk_compat(code):
+                cjk_compat_count += 1
+                continue
+
             # Replacement character check
             if code == 0xFFFD:
                 replacement_count += 1
@@ -233,19 +335,27 @@ class TextQualityAnalyzer:
             if self._is_valid_char(char, code):
                 valid_chars += 1
 
+        # Check for fragmented text pattern (char-by-char line breaks)
+        is_fragmented = self._is_fragmented_text(text)
+
         # Calculate quality score
         quality_score = self._calculate_quality_score(
             total_chars=total_chars,
             pua_count=pua_count,
             replacement_count=replacement_count,
-            valid_chars=valid_chars
+            valid_chars=valid_chars,
+            cjk_compat_count=cjk_compat_count,
+            is_fragmented=is_fragmented
         )
 
         # Determine OCR necessity
         pua_ratio = pua_count / total_chars if total_chars > 0 else 0
+        cjk_compat_ratio = cjk_compat_count / total_chars if total_chars > 0 else 0
         needs_ocr = (
             quality_score < TextQualityConfig.QUALITY_THRESHOLD or
-            pua_ratio >= TextQualityConfig.PUA_RATIO_THRESHOLD
+            pua_ratio >= TextQualityConfig.PUA_RATIO_THRESHOLD or
+            cjk_compat_ratio >= 0.05 or  # 5% or more CJK compat chars triggers OCR
+            is_fragmented  # Fragmented text always needs OCR
         )
 
         return TextQualityResult(
@@ -255,11 +365,15 @@ class TextQualityAnalyzer:
             replacement_count=replacement_count,
             valid_chars=valid_chars,
             control_chars=control_count,
+            cjk_compat_count=cjk_compat_count,
+            is_fragmented=is_fragmented,
             needs_ocr=needs_ocr,
             details={
                 'pua_ratio': pua_count / total_chars if total_chars > 0 else 0,
                 'replacement_ratio': replacement_count / total_chars if total_chars > 0 else 0,
                 'valid_ratio': valid_chars / total_chars if total_chars > 0 else 0,
+                'cjk_compat_ratio': cjk_compat_count / total_chars if total_chars > 0 else 0,
+                'is_fragmented': is_fragmented,
             }
         )
 
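
To see the new 5% trigger in isolation: a 200-character page with 12 CJK Compatibility characters trips the fallback even if the quality score and PUA ratio would otherwise pass (the other threshold constants are defined outside this hunk):

```python
total_chars = 200
cjk_compat_count = 12
cjk_compat_ratio = cjk_compat_count / total_chars  # 0.06
is_fragmented = False
# Assume quality_score and pua_ratio are both within their thresholds here.
needs_ocr = cjk_compat_ratio >= 0.05 or is_fragmented
print(needs_ocr)  # True
```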
@@ -270,6 +384,57 @@ class TextQualityAnalyzer:
                 return True
         return False
 
+    def _is_cjk_compat(self, code: int) -> bool:
+        """
+        Check if character is in a CJK Compatibility range.
+
+        These characters often indicate broken text from Word->PDF conversion
+        where parentheses, brackets, and other symbols are incorrectly mapped
+        to CJK Compatibility characters (e.g., U+33D9 for '(', U+33DA for ')').
+        """
+        for start, end in TextQualityConfig.CJK_COMPAT_RANGES:
+            if start <= code <= end:
+                return True
+        return False
+
+    def _is_fragmented_text(self, text: str) -> bool:
+        """
+        Detect the fragmented text pattern where most lines hold only a few characters.
+
+        This pattern occurs when Word documents with special layouts
+        (text boxes, vertical text, etc.) are converted to PDF,
+        resulting in characters being stored as separate lines.
+
+        Example of fragmented text:
+            '현\n재\n시\n장\n에\n대\n한\n이\n해'
+        Should be: '현재 시장에 대한 이해'
+
+        Args:
+            text: Text to analyze
+
+        Returns:
+            True if text appears to be fragmented
+        """
+        lines = text.split('\n')
+
+        # Need minimum number of lines to detect pattern
+        if len(lines) < TextQualityConfig.MIN_LINES_FOR_FRAGMENTED_CHECK:
+            return False
+
+        # Count lines with few characters (excluding empty lines)
+        non_empty_lines = [line for line in lines if line.strip()]
+        if not non_empty_lines:
+            return False
+
+        short_line_count = sum(
+            1 for line in non_empty_lines
+            if len(line.strip()) <= TextQualityConfig.FRAGMENTED_LINE_CHAR_LIMIT
+        )
+
+        fragmented_ratio = short_line_count / len(non_empty_lines)
+
+        return fragmented_ratio >= TextQualityConfig.FRAGMENTED_TEXT_THRESHOLD
+
     def _is_control(self, code: int) -> bool:
         """Check if character is a control character."""
         for start, end in TextQualityConfig.CONTROL_RANGES:
@@ -318,7 +483,9 @@ class TextQualityAnalyzer:
         total_chars: int,
         pua_count: int,
         replacement_count: int,
-        valid_chars: int
+        valid_chars: int,
+        cjk_compat_count: int = 0,
+        is_fragmented: bool = False
     ) -> float:
         """Calculate quality score (0.0 ~ 1.0)."""
         if total_chars == 0:
@@ -328,6 +495,7 @@ class TextQualityAnalyzer:
         pua_ratio = pua_count / total_chars
         replacement_ratio = replacement_count / total_chars
         valid_ratio = valid_chars / total_chars
+        cjk_compat_ratio = cjk_compat_count / total_chars
 
         # Calculate weighted score
         # Score decreases with more PUA chars, more replacement chars, lower valid ratio
@@ -339,6 +507,13 @@ class TextQualityAnalyzer:
         # Replacement character penalty
         score -= replacement_ratio * TextQualityConfig.WEIGHT_REPLACEMENT * 3
 
+        # CJK Compatibility character penalty (broken text indicator)
+        score -= cjk_compat_ratio * TextQualityConfig.WEIGHT_CJK_COMPAT * 3
+
+        # Fragmented text penalty (severe quality issue)
+        if is_fragmented:
+            score -= 0.5  # Major penalty for fragmented text
+
         # Valid character ratio adjustment
         score = score * (0.5 + valid_ratio * 0.5)
 
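
A worked instance of the amended scoring, assuming the score starts at 1.0 before the penalty lines (the initialization sits outside this hunk):

```python
WEIGHT_PUA, WEIGHT_REPLACEMENT, WEIGHT_CJK_COMPAT = 0.4, 0.3, 0.5
pua_ratio, replacement_ratio, cjk_compat_ratio = 0.0, 0.0, 0.06
valid_ratio, is_fragmented = 0.85, True

score = 1.0                                          # assumed starting value
score -= pua_ratio * WEIGHT_PUA * 3                  # -0.00
score -= replacement_ratio * WEIGHT_REPLACEMENT * 3  # -0.00
score -= cjk_compat_ratio * WEIGHT_CJK_COMPAT * 3    # -0.09
if is_fragmented:
    score -= 0.5                                     # fragmented-text penalty
score *= 0.5 + valid_ratio * 0.5                     # 0.41 * 0.925
print(round(score, 3))  # 0.379
```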
@@ -592,7 +767,26 @@ class QualityAwareTextExtractor:
             text = self.page.get_text("text")
             return text, analysis
 
-        # 3. OCR fallback if quality is low
+        # 3. Try text reconstruction first (before OCR)
+        # This is more reliable than OCR for fragmented text from Word->PDF conversion
+        if analysis.quality_result.is_fragmented or analysis.quality_result.cjk_compat_count > 0:
+            logger.info(
+                f"[QualityAware] Page {self.page_num + 1}: "
+                f"Attempting text reconstruction "
+                f"(fragmented={analysis.quality_result.is_fragmented}, "
+                f"cjk_compat={analysis.quality_result.cjk_compat_count})"
+            )
+
+            reconstructor = FragmentedTextReconstructor(self.page, self.page_num)
+            reconstructed_text = reconstructor.reconstruct()
+
+            if reconstructed_text:
+                # Apply CJK Compatibility character mapping
+                cleaned_text = apply_cjk_compat_mapping(reconstructed_text)
+                analysis.ocr_text = f"[Reconstructed] {len(cleaned_text)} chars"
+                return cleaned_text, analysis
+
+        # 4. OCR fallback if reconstruction fails
         logger.info(
             f"[QualityAware] Page {self.page_num + 1}: "
             f"Quality too low ({analysis.quality_result.quality_score:.2f}), "
@@ -641,6 +835,348 @@ class QualityAwareTextExtractor:
         return "\n".join(merged_parts)
 
 
+# ============================================================================
+# Fragmented Text Reconstructor
+# ============================================================================
+
+class FragmentedTextReconstructor:
+    """
+    Reconstructs fragmented text from PDF pages.
+
+    When Word documents with special layouts (text boxes, vertical text, etc.)
+    are converted to PDF, characters may be stored as separate lines.
+    This class reconstructs the text by analyzing character positions.
+
+    Example:
+        Input:  '현\\n재\\n시\\n장\\n에\\n대\\n한\\n이\\n해'
+        Output: '현재 시장에 대한 이해'
+    """
+
+    def __init__(self, page, page_num: int, y_tolerance: float = 3.0,
+                 exclude_bboxes: List[Tuple[float, float, float, float]] = None):
+        """
+        Args:
+            page: PyMuPDF page object
+            page_num: Page number (0-indexed)
+            y_tolerance: Y coordinate tolerance for same-line detection
+            exclude_bboxes: List of bounding boxes to exclude (e.g., table regions)
+        """
+        self.page = page
+        self.page_num = page_num
+        self.y_tolerance = y_tolerance
+        self.exclude_bboxes = exclude_bboxes or []
+
+    def reconstruct(self) -> str:
+        """
+        Reconstruct fragmented text using character position data.
+
+        Returns:
+            Reconstructed text with proper line breaks
+        """
+        try:
+            # Extract character-level position data
+            raw_dict = self.page.get_text("rawdict")
+            all_chars = self._extract_chars(raw_dict)
+
+            if not all_chars:
+                logger.warning(f"[Reconstruct] Page {self.page_num + 1}: No characters found")
+                return ""
+
+            # Group characters by Y coordinate (same line)
+            lines_by_y = self._group_by_y(all_chars)
+
+            # Sort each line by X coordinate and build text
+            reconstructed_lines = self._build_lines(lines_by_y)
+
+            result = "\n".join(reconstructed_lines)
+
+            logger.info(
+                f"[Reconstruct] Page {self.page_num + 1}: "
+                f"Reconstructed {len(all_chars)} chars into {len(reconstructed_lines)} lines"
+            )
+
+            return result
+
+        except Exception as e:
+            logger.error(f"[Reconstruct] Page {self.page_num + 1} failed: {e}")
+            return ""
+
+    def reconstruct_with_sections(self) -> List[Dict]:
+        """
+        Reconstruct fragmented text, split into sections by table positions.
+
+        This method returns multiple text sections with their Y-coordinate ranges,
+        allowing proper positioning relative to tables.
+
+        Returns:
+            List of dicts: [{'text': str, 'y_start': float, 'y_end': float}, ...]
+        """
+        try:
+            raw_dict = self.page.get_text("rawdict")
+            all_chars = self._extract_chars(raw_dict)
+
+            if not all_chars:
+                logger.warning(f"[Reconstruct] Page {self.page_num + 1}: No characters found")
+                return []
+
+            # Group characters by Y coordinate
+            lines_by_y = self._group_by_y(all_chars)
+
+            if not lines_by_y:
+                return []
+
+            # Get sorted Y positions of tables (exclusion regions)
+            table_y_ranges = []
+            for bbox in self.exclude_bboxes:
+                table_y_ranges.append((bbox[1], bbox[3]))  # (y_start, y_end)
+            table_y_ranges.sort(key=lambda x: x[0])
+
+            if not table_y_ranges:
+                # No tables - return single section
+                section_text = self._build_section_text(list(lines_by_y.keys()), lines_by_y)
+                if section_text.strip():
+                    sorted_ys = sorted(lines_by_y.keys())
+                    return [{
+                        'text': section_text,
+                        'y_start': sorted_ys[0],
+                        'y_end': sorted_ys[-1]
+                    }]
+                return []
+
+            # Split lines into sections based on table positions
+            # Key insight: when we skip from a Y before a table to a Y after it,
+            # we need to split the section
+            sections = []
+            current_section_lines = []
+            current_y_start = None
+            current_y_end = None
+
+            sorted_ys = sorted(lines_by_y.keys())
+
+            for y in sorted_ys:
+                # Check if we're jumping over a table
+                should_split = False
+                if current_y_end is not None:
+                    for table_y_start, table_y_end in table_y_ranges:
+                        # If previous line was before table start AND current line is after table end
+                        # (meaning we jumped over the table)
+                        if current_y_end < table_y_start and y > table_y_end:
+                            should_split = True
+                            break
+
+                if should_split and current_section_lines:
+                    # Save current section (text BEFORE the table)
+                    section_text = self._build_section_text(current_section_lines, lines_by_y)
+                    if section_text.strip():
+                        sections.append({
+                            'text': section_text,
+                            'y_start': current_y_start,
+                            'y_end': current_y_end
+                        })
+                    current_section_lines = []
+                    current_y_start = None
+
+                # Add line to current section
+                current_section_lines.append(y)
+                if current_y_start is None:
+                    current_y_start = y
+                current_y_end = y
+
+            # Don't forget the last section (text AFTER the last table, or all text if no split)
+            if current_section_lines:
+                section_text = self._build_section_text(current_section_lines, lines_by_y)
+                if section_text.strip():
+                    sections.append({
+                        'text': section_text,
+                        'y_start': current_y_start,
+                        'y_end': current_y_end
+                    })
+
+            logger.info(
+                f"[Reconstruct] Page {self.page_num + 1}: "
+                f"Split into {len(sections)} sections around {len(table_y_ranges)} tables"
+            )
+
+            return sections
+
+        except Exception as e:
+            logger.error(f"[Reconstruct] Page {self.page_num + 1} sections failed: {e}")
+            return []
+
+    def _build_section_text(self, y_positions: List[float], lines_by_y: Dict) -> str:
+        """Build text from a list of Y positions."""
+        lines = []
+        for y in sorted(y_positions):
+            chars = lines_by_y.get(y, [])
+            chars_sorted = sorted(chars, key=lambda c: c['bbox'][0])
+
+            if not chars_sorted:
+                continue
+
+            line_text = ""
+            prev_x_end = None
+
+            for char_info in chars_sorted:
+                x_start = char_info['bbox'][0]
+                char = char_info['c']
+
+                if prev_x_end is not None:
+                    gap = x_start - prev_x_end
+                    avg_char_width = char_info['size'] * 0.5
+                    if gap > avg_char_width * 0.5:
+                        line_text += " "
+
+                line_text += char
+                prev_x_end = char_info['bbox'][2]
+
+            if line_text.strip():
+                lines.append(line_text)
+
+        return "\n".join(lines)
+
+    def _extract_chars(self, raw_dict: Dict) -> List[Dict]:
+        """Extract all characters with position info from rawdict.
+
+        Characters inside exclude_bboxes (e.g., table regions) are filtered out.
+        """
+        all_chars = []
+
+        for block in raw_dict.get('blocks', []):
+            if block.get('type') != 0:  # Text blocks only
+                continue
+
+            for line in block.get('lines', []):
+                for span in line.get('spans', []):
+                    font = span.get('font', '')
+                    size = span.get('size', 0)
+
+                    for char in span.get('chars', []):
+                        char_bbox = char.get('bbox', [0, 0, 0, 0])
+
+                        # Skip characters inside excluded regions (e.g., tables)
+                        if self._is_inside_excluded_bbox(char_bbox):
+                            continue
+
+                        char_info = {
+                            'c': char.get('c', ''),
+                            'bbox': char_bbox,
+                            'origin': char.get('origin', [0, 0]),
+                            'font': font,
+                            'size': size,
+                        }
+                        all_chars.append(char_info)
+
+        return all_chars
+
+    def _is_inside_excluded_bbox(self, char_bbox: List[float]) -> bool:
+        """Check if character is inside any excluded bbox.
+
+        Args:
+            char_bbox: Character bounding box [x0, y0, x1, y1]
+
+        Returns:
+            True if character center is inside any excluded region
+        """
+        if not self.exclude_bboxes:
+            return False
+
+        # Use character center point for check
+        char_center_x = (char_bbox[0] + char_bbox[2]) / 2
+        char_center_y = (char_bbox[1] + char_bbox[3]) / 2
+
+        for bbox in self.exclude_bboxes:
+            # bbox = (x0, y0, x1, y1)
+            if (bbox[0] <= char_center_x <= bbox[2] and
+                    bbox[1] <= char_center_y <= bbox[3]):
+                return True
+
+        return False
+
+    def _group_by_y(self, chars: List[Dict]) -> Dict[float, List[Dict]]:
+        """Group characters by Y coordinate with tolerance."""
+        lines_by_y = {}
+
+        for char_info in chars:
+            # Use origin Y if available, otherwise use bbox Y
+            y = char_info['origin'][1] if char_info['origin'] else char_info['bbox'][1]
+
+            # Find existing Y group within tolerance
+            found_y = None
+            for existing_y in lines_by_y.keys():
+                if abs(existing_y - y) <= self.y_tolerance:
+                    found_y = existing_y
+                    break
+
+            if found_y is None:
+                found_y = y
+                lines_by_y[found_y] = []
+
+            lines_by_y[found_y].append(char_info)
+
+        return lines_by_y
+
+    def _build_lines(self, lines_by_y: Dict[float, List[Dict]]) -> List[str]:
+        """Build text lines from character groups."""
+        reconstructed_lines = []
+
+        for y in sorted(lines_by_y.keys()):
+            chars = lines_by_y[y]
+            chars_sorted = sorted(chars, key=lambda c: c['bbox'][0])
+
+            if not chars_sorted:
+                continue
+
+            # Build line text with appropriate spacing
+            line_text = ""
+            prev_x_end = None
+
+            for char_info in chars_sorted:
+                x_start = char_info['bbox'][0]
+                char = char_info['c']
+
+                if prev_x_end is not None:
+                    gap = x_start - prev_x_end
+                    # Add space if gap is significant
+                    avg_char_width = char_info['size'] * 0.5
+                    if gap > avg_char_width * 0.5:
+                        line_text += " "
+
+                line_text += char
+                prev_x_end = char_info['bbox'][2]
+
+            if line_text.strip():
+                reconstructed_lines.append(line_text)
+
+        return reconstructed_lines
+
+
+# ============================================================================
+# CJK Compatibility Character Mapping Function
+# ============================================================================
+
+def apply_cjk_compat_mapping(text: str) -> str:
+    """
+    Replace CJK Compatibility characters with their intended characters.
+
+    These characters appear when Word documents are converted to PDF
+    and font encoding is not properly preserved.
+
+    Args:
+        text: Text containing CJK Compatibility characters
+
+    Returns:
+        Text with characters replaced
+    """
+    if not text:
+        return text
+
+    result = text
+    for cjk_char, replacement in CJK_COMPAT_CHAR_MAP.items():
+        result = result.replace(cjk_char, replacement)
+
+    return result
+
+
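
End to end, the two new exports combine like this (a sketch; 'sample.pdf' is a hypothetical Word-exported file and PyMuPDF must be installed):

```python
import fitz  # PyMuPDF

from xgen_doc2chunk.core.processor.pdf_helpers.pdf_text_quality_analyzer import (
    FragmentedTextReconstructor,
    apply_cjk_compat_mapping,
)

doc = fitz.open('sample.pdf')  # hypothetical Word-exported PDF
page = doc[0]

# Regroup per-character fragments by Y (lines), then X (reading order) ...
reconstructor = FragmentedTextReconstructor(page, page_num=0, y_tolerance=3.0)
text = reconstructor.reconstruct()
# ... then repair punctuation that was mis-mapped into CJK Compatibility chars.
print(apply_cjk_compat_mapping(text)[:200])
```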
 # ============================================================================
 # Export
 # ============================================================================
@@ -652,4 +1188,7 @@ __all__ = [
     'TextQualityAnalyzer',
     'PageOCRFallbackEngine',
     'QualityAwareTextExtractor',
+    'FragmentedTextReconstructor',
+    'apply_cjk_compat_mapping',
+    'CJK_COMPAT_CHAR_MAP',
 ]
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xgen-doc2chunk
-Version: 0.1.5
+Version: 0.1.52
 Summary: Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking
 Project-URL: Homepage, https://github.com/master0419/doc2chunk
 Project-URL: Documentation, https://github.com/master0419/doc2chunk#readme
@@ -113,11 +113,11 @@ xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py,sha256=7ZTeHXAfUqa_W9H
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py,sha256=4kpY8WY9hH-cfjd-Ai6vA4V7I8KwE5hSq8Yt4QXliqM,3009
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py,sha256=qPgtMTMbaTm7_QyU7kKwVDtGAldf_yV4rTyoGVVgkTU,3406
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py,sha256=bwD6MVUuZJVYe3bWDsD6BpK1UZKKPsVyKOG6oHeoumw,47042
-xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py,sha256=cqoMzSySnapXRkELtmOahpmWyBnc1TquXPz1IqRqDSk,28168
+xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py,sha256=H9bw3SybQJubvtjTqRrJNFviLFc2OMtWDv2HNTETxf0,28544
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py,sha256=v6VH-E6clI71-G2zJcT5754VFcPYqb1Qz4l3UcPeDeM,27863
-xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py,sha256=7qI_kcY-scGaLPChkAeCtkQD9GAsD_NryMQw1nNMUwU,16075
-xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py,sha256=wAnOCAQ3cTsVgMg0uVavodZHV2DAvrVkugqA0c4MhTY,4754
-xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py,sha256=8rCAnLvNRSVvIAbEiggXawrMOo-zWpMxwDc5Rrk19Co,22520
+xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py,sha256=rI5QAdqqJfiITZxu4bAf50pD7aIjVlhkYFsc2pt4i8c,16085
+xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py,sha256=go259muoxeIxpN1TEiPNdwVkdVb1_YX8BeGO7HS0-jE,8177
+xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py,sha256=_4IoDk15yIMvilcDlSxqiUlNLA9xUV1k69UmlzBq5aI,44641
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py,sha256=W72HOARz7LjSzwzFTLo4-XTDQWvwBTGlqdovFyPBU7M,4724
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py,sha256=KWkaj7LT5ih5Nkb2EDggA02JuHIsIy3Sbm7pVIhxWuE,11736
 xgen_doc2chunk/core/processor/pdf_helpers/types.py,sha256=IXV493hkpPa67DPZfH319m2rh6sIgL0R4nOd6pcd-to,9030
@@ -155,7 +155,7 @@ xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py,sha256=4kIPb8u2_GSJ435GHJFXiIeQavMv
 xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py,sha256=A4V_AcC0tySYB4q-lNW7Tuhg7aTq0atj_RhMrCftKsM,2972
 xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py,sha256=ZN-3Dq1BehFmwFvxTaYmiEAdFUqujviONNDiR8c5X4A,3194
 xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py,sha256=TeQOdPCPKQW8o4IyUb-4o6v6uTVzKupr4qh9NLjIj24,3672
-xgen_doc2chunk-0.1.5.dist-info/METADATA,sha256=qBfTY7YCh61_spWvm_TkEaN9zLeOKKz0LdzpMD_RKgM,7623
-xgen_doc2chunk-0.1.5.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-xgen_doc2chunk-0.1.5.dist-info/licenses/LICENSE,sha256=pokMTCMoEcrcnjBAJ8cb7UVADBMGce6GLFbbRfqJVJc,11346
-xgen_doc2chunk-0.1.5.dist-info/RECORD,,
+xgen_doc2chunk-0.1.52.dist-info/METADATA,sha256=M63N__jN6H7F3XFKtOM-Um0-TG0uTsknck9YnAZTQOk,7624
+xgen_doc2chunk-0.1.52.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+xgen_doc2chunk-0.1.52.dist-info/licenses/LICENSE,sha256=pokMTCMoEcrcnjBAJ8cb7UVADBMGce6GLFbbRfqJVJc,11346
+xgen_doc2chunk-0.1.52.dist-info/RECORD,,