PyPI - xgen-doc2chunk - Versions diffs - 0.1.5__tar.gz → 0.1.52__tar.gz - Mend

xgen-doc2chunk 0.1.5__tar.gz → 0.1.52__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (163) hide show

{xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xgen-doc2chunk
-Version: 0.1.5
+Version: 0.1.52
 Summary: Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking
 Project-URL: Homepage, https://github.com/master0419/doc2chunk
 Project-URL: Documentation, https://github.com/master0419/doc2chunk#readme

{xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "xgen-doc2chunk"
-version = "0.1.5"
+version = "0.1.52"
 description = "Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking"
 readme = "README.md"
 requires-python = ">=3.12"

{xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py RENAMED Viewed

@@ -25,6 +25,9 @@ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_utils import (
 )
 from xgen_doc2chunk.core.processor.pdf_helpers.pdf_table_detection import TableDetectionEngine
 from xgen_doc2chunk.core.processor.pdf_helpers.pdf_cell_analysis import CellAnalysisEngine
+from xgen_doc2chunk.core.processor.pdf_helpers.pdf_text_quality_analyzer import (
+    apply_cjk_compat_mapping,
+)
 logger = logging.getLogger("document-processor")
@@ -873,7 +876,12 @@ def generate_html_from_cells(
             content = ""
             if col_idx < len(row_data):
                 content = row_data[col_idx]
-            content = escape_html(str(content).strip() if content else "")
+            # Apply CJK Compatibility character mapping to fix broken characters
+            # (e.g., 㛳→→, ㏙→(, ㏚→) etc. from Word→PDF conversion)
+            content = str(content).strip() if content else ""
+            content = apply_cjk_compat_mapping(content)
+            content = escape_html(content)
             # Get span info (default to 1 if not found)
             spans = span_map.get((row_idx, col_idx), {'rowspan': 1, 'colspan': 1})

{xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py RENAMED Viewed

@@ -383,11 +383,11 @@ class TableQualityValidator:
         # if num_rows > 5 and col2_has_paragraphs >= 2:
         #     return False, f"col2_paragraphs({col2_has_paragraphs})"
-        # Pattern 3: If first column is short and second is long overall, likely body text not key-value
-        if num_rows > 10:
-            col1_short_ratio = (col1_empty_count + col1_short_count) / num_rows
-            if col1_short_ratio >= 0.8 and col2_long_count >= 5:
-                return False, f"asymmetric_cols(short1={col1_short_ratio:.0%}, long2={col2_long_count})"
+        # # Pattern 3: If first column is short and second is long overall, likely body text not key-value
+        # if num_rows > 10:
+        #     col1_short_ratio = (col1_empty_count + col1_short_count) / num_rows
+        #     if col1_short_ratio >= 0.8 and col2_long_count >= 5:
+        #         return False, f"asymmetric_cols(short1={col1_short_ratio:.0%}, long2={col2_long_count})"
         return True, "valid"

{xgen_doc2chunk-0.1.5 → xgen_doc2chunk-0.1.52}/xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py RENAMED Viewed

@@ -3,6 +3,9 @@
 PDF Text Extraction Module
 Provides functions for extracting text blocks from PDF pages.
+Includes support for:
+- Fragmented text reconstruction (Word->PDF conversion issues)
+- CJK Compatibility character mapping (broken character fixes)
 """
 import logging
 from typing import List, Tuple
@@ -17,6 +20,8 @@ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_text_quality_analyzer import
     TextQualityAnalyzer,
     QualityAwareTextExtractor,
     PageOCRFallbackEngine,
+    FragmentedTextReconstructor,
+    apply_cjk_compat_mapping,
 )
 logger = logging.getLogger("document-processor")
@@ -53,13 +58,76 @@ def extract_text_blocks(
         analyzer = TextQualityAnalyzer(page, page_num)
         page_analysis = analyzer.analyze_page()
-        # If quality is too low, use full page OCR fallback
+        # If quality is low, try text reconstruction first (before OCR)
         if page_analysis.quality_result.needs_ocr:
+            quality_result = page_analysis.quality_result
             logger.info(
-                f"[PDF] Page {page_num + 1}: Low text quality "
-                f"({page_analysis.quality_result.quality_score:.2f}), "
-                f"PUA={page_analysis.quality_result.pua_count}, "
-                f"using OCR fallback"
+                f"[PDF] Page {page_num + 1}: Low text quality detected - "
+                f"score={quality_result.quality_score:.2f}, "
+                f"PUA={quality_result.pua_count}, "
+                f"CJK_Compat={quality_result.cjk_compat_count}, "
+                f"fragmented={quality_result.is_fragmented}"
+            )
+            # Try reconstruction for fragmented text or CJK Compat issues
+            if quality_result.is_fragmented or quality_result.cjk_compat_count > 0:
+                logger.info(
+                    f"[PDF] Page {page_num + 1}: Attempting text reconstruction "
+                    f"(excluding {len(table_bboxes)} table regions)"
+                )
+                # Exclude table regions from reconstruction to avoid duplication
+                reconstructor = FragmentedTextReconstructor(
+                    page, page_num, exclude_bboxes=table_bboxes
+                )
+                # Use section-based reconstruction for proper table positioning
+                if table_bboxes:
+                    sections = reconstructor.reconstruct_with_sections()
+                    if sections:
+                        result_elements = []
+                        for section in sections:
+                            # Apply CJK Compatibility character mapping
+                            cleaned_text = apply_cjk_compat_mapping(section['text'])
+                            if cleaned_text.strip():
+                                # Create element with proper Y position for sorting
+                                result_elements.append(PageElement(
+                                    element_type=ElementType.TEXT,
+                                    content=cleaned_text,
+                                    bbox=(0, section['y_start'], page.rect.width, section['y_end']),
+                                    page_num=page_num
+                                ))
+                        if result_elements:
+                            logger.info(
+                                f"[PDF] Page {page_num + 1}: Text reconstruction successful "
+                                f"({len(result_elements)} sections)"
+                            )
+                            return result_elements
+                else:
+                    # No tables - use simple reconstruction
+                    reconstructed_text = reconstructor.reconstruct()
+                    if reconstructed_text:
+                        cleaned_text = apply_cjk_compat_mapping(reconstructed_text)
+                        logger.info(
+                            f"[PDF] Page {page_num + 1}: Text reconstruction successful "
+                            f"({len(cleaned_text)} chars)"
+                        )
+                        return [PageElement(
+                            element_type=ElementType.TEXT,
+                            content=cleaned_text,
+                            bbox=(0, 0, page.rect.width, page.rect.height),
+                            page_num=page_num
+                        )]
+            # Fall back to OCR if reconstruction not applicable
+            logger.info(
+                f"[PDF] Page {page_num + 1}: Using OCR fallback"
             )
             extractor = QualityAwareTextExtractor(page, page_num)

xgen-doc2chunk 0.1.5__tar.gz → 0.1.52__tar.gz

xgen-doc2chunk 0.1.5__tar.gz → 0.1.52__tar.gz