PyPI - xgen-doc2chunk - Versions diffs - 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl - Mend

xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (162) hide show

xgen_doc2chunk/core/processor/pdf_helpers/types.py ADDED Viewed

@@ -0,0 +1,278 @@
+"""
+PDF Handler Types and Configuration
+Defines all data classes and configuration values used by the PDF engine.
+"""
+from dataclasses import dataclass, field
+from enum import Enum, auto
+from typing import Optional, List, Dict, Tuple, Any
+# ============================================================================
+# Enums
+# ============================================================================
+class LineThickness(Enum):
+    """Line thickness classification.
+    Members use auto() values, so only member identity/name is meaningful.
+    The point ranges noted per member line up with PDFConfig.THIN_LINE_MAX (0.5)
+    and PDFConfig.NORMAL_LINE_MAX (1.5); the code that assigns a class to a
+    line is not part of this module.
+    """
+    THIN = auto()      # Table inner lines (0.3-0.5pt)
+    NORMAL = auto()    # Regular borders (0.5-1.5pt)
+    THICK = auto()     # Emphasis/header divider lines (1.5pt+)
+class TableDetectionStrategy(Enum):
+    """Table detection strategy.
+    Identifies which detector produced a table; stored on
+    TableCandidate.strategy and PageElement.detection_strategy.
+    Members use auto() values, so compare by member, not by number.
+    """
+    PYMUPDF_NATIVE = auto()        # PyMuPDF built-in table detection
+    PDFPLUMBER_LINES = auto()      # pdfplumber line-based detection
+    HYBRID_ANALYSIS = auto()       # Line analysis-based hybrid
+    BORDERLESS_HEURISTIC = auto()  # Borderless table heuristic
+class ElementType(Enum):
+    """Page element type.
+    Unlike the other enums in this module, members carry explicit lowercase
+    string values (available through ``.value``).
+    """
+    TEXT = "text"
+    TABLE = "table"
+    IMAGE = "image"
+    ANNOTATION = "annotation"
+# ============================================================================
+# Configuration Constants
+# ============================================================================
+class PDFConfig:
+    """PDF engine configuration constants.
+    A plain class used as a namespace: every setting is a class attribute, so
+    values are read as ``PDFConfig.NAME`` without creating an instance.
+    Lengths are expressed in PDF points (pt) where the comment says so; the
+    ``*_RATIO`` settings are unitless fractions. How each value is consumed
+    is defined by the PDF helper code, which is not part of this module.
+    """
+    # Line thickness thresholds (pt)
+    THIN_LINE_MAX = 0.5
+    NORMAL_LINE_MAX = 1.5
+    # Table detection settings
+    MIN_TABLE_ROWS = 2
+    MIN_TABLE_COLS = 2
+    TABLE_MERGE_TOLERANCE = 5.0  # Table merge tolerance (pt)
+    # Double line merge settings
+    DOUBLE_LINE_TOLERANCE = 3.0  # Double line detection distance (pt)
+    # Cell analysis settings
+    CELL_PADDING = 2.0
+    MIN_CELL_WIDTH = 10.0
+    MIN_CELL_HEIGHT = 8.0
+    # Text extraction settings
+    TEXT_BLOCK_TOLERANCE = 3.0
+    # Confidence threshold
+    CONFIDENCE_THRESHOLD = 0.5
+    # Page border detection settings
+    # NOTE: BORDER_MARGIN is an absolute distance, while PAGE_BORDER_MARGIN is
+    # a ratio of the page size - the two are not interchangeable.
+    BORDER_MARGIN = 30.0        # Maximum distance from page edge
+    BORDER_LENGTH_RATIO = 0.8   # Minimum border length ratio relative to page size
+    PAGE_BORDER_MARGIN = 0.1    # Page border margin ratio relative to page size
+    PAGE_SPANNING_RATIO = 0.85  # Ratio to determine if line spans the page
+    # Graphic region detection settings
+    GRAPHIC_CURVE_RATIO_THRESHOLD = 0.3   # Curve ratio threshold
+    GRAPHIC_MIN_CURVE_COUNT = 10          # Minimum curve count
+    GRAPHIC_FILL_RATIO_THRESHOLD = 0.2    # Fill ratio threshold
+    GRAPHIC_COLOR_VARIETY_THRESHOLD = 3   # Color variety threshold
+    # Table quality validation settings
+    TABLE_MIN_FILLED_CELL_RATIO = 0.15    # Minimum filled cell ratio
+    TABLE_MAX_EMPTY_ROW_RATIO = 0.7       # Maximum empty row ratio
+    TABLE_MIN_MEANINGFUL_CELLS = 2        # Minimum meaningful cell count
+    TABLE_MIN_VALID_ROWS = 2              # Minimum valid row count
+    TABLE_MIN_TEXT_DENSITY = 0.005        # Minimum text density
+    # Cell text length settings
+    TABLE_MAX_CELL_TEXT_LENGTH = 300      # Maximum text length per cell
+    TABLE_EXTREME_CELL_LENGTH = 800       # Extremely long cell threshold
+    TABLE_MAX_LONG_CELLS_RATIO = 0.4      # Maximum long cell ratio
+    # Annotation detection settings
+    ANNOTATION_Y_MARGIN = 30.0            # pt - Search range below table for annotations
+    # Footnote/annotation markers; the first two entries appear to be the
+    # Korean "note)" marker with and without a space - confirm with the
+    # annotation detector that consumes this list.
+    ANNOTATION_PATTERNS = ['주)', '주 )', '※', '*', '†', '‡', '¹', '²', '³']
+# ============================================================================
+# Data Classes - Basic Types
+# ============================================================================
+@dataclass
+class LineInfo:
+    """Line information"""
+    x0: float
+    y0: float
+    x1: float
+    y1: float
+    thickness: float = 1.0
+    thickness_class: LineThickness = LineThickness.NORMAL
+    is_horizontal: bool = False
+    is_vertical: bool = False
+    @property
+    def length(self) -> float:
+        """Line length"""
+        import math
+        return math.sqrt((self.x1 - self.x0) ** 2 + (self.y1 - self.y0) ** 2)
+    @property
+    def midpoint(self) -> Tuple[float, float]:
+        """Midpoint"""
+        return ((self.x0 + self.x1) / 2, (self.y0 + self.y1) / 2)
+@dataclass
+class GridInfo:
+    """Grid information"""
+    h_lines: List[float] = field(default_factory=list)  # Y coordinates
+    v_lines: List[float] = field(default_factory=list)  # X coordinates
+    cells: List['CellInfo'] = field(default_factory=list)
+    bbox: Tuple[float, float, float, float] = (0, 0, 0, 0)
+    is_complete: bool = False  # Whether border is complete
+    reconstructed: bool = False  # Whether border was reconstructed
+    @property
+    def row_count(self) -> int:
+        """Row count (number of regions between horizontal lines)"""
+        return max(0, len(self.h_lines) - 1)
+    @property
+    def col_count(self) -> int:
+        """Column count (number of regions between vertical lines)"""
+        return max(0, len(self.v_lines) - 1)
+@dataclass
+class CellInfo:
+    """Cell information.
+    One table cell: its grid position, bounding box, text and span.
+    ``row``, ``col`` and ``bbox`` are required; everything else defaults to
+    an empty, unmerged, non-header, left-aligned cell.
+    """
+    row: int
+    col: int
+    bbox: Tuple[float, float, float, float]
+    text: str = ""
+    rowspan: int = 1  # 1 means the cell is not merged vertically
+    colspan: int = 1  # 1 means the cell is not merged horizontally
+    is_header: bool = False
+    alignment: str = "left"
+@dataclass
+class AnnotationInfo:
+    """Annotation information.
+    ``type`` and ``bbox`` are required; ``content`` defaults to an empty
+    string and ``color`` to None (no color recorded).
+    """
+    type: str  # NOTE: field name shadows the builtin inside the class body; kept for API compatibility
+    bbox: Tuple[float, float, float, float]
+    content: str = ""
+    color: Optional[Tuple[float, float, float]] = None  # three float components when present
+# ============================================================================
+# Data Classes - Vector Text OCR
+# ============================================================================
+@dataclass
+class VectorTextRegion:
+    """
+    Vector text (Outlined/Path Text) region information.
+    Holds the bounding box and drawing statistics of a region, together with
+    the OCR outcome for it. The three counts are required; the OCR fields
+    start empty (``""``, 0.0, False) until some other code fills them in.
+    """
+    bbox: Tuple[float, float, float, float]
+    drawing_count: int              # Number of drawings contained
+    curve_count: int                # Curve count (c items)
+    fill_count: int                 # Filled path count
+    ocr_text: str = ""              # OCR result
+    confidence: float = 0.0         # Confidence score
+    is_vector_text: bool = False    # Whether this is vector text
+# ============================================================================
+# Data Classes - Graphic Region
+# ============================================================================
+@dataclass
+class GraphicRegionInfo:
+    """
+    Graphic region information (charts, diagrams, icons, etc.).
+    Only ``bbox`` is required. The shape/color counters default to 0 and the
+    verdict fields (``is_graphic``, ``confidence``, ``reason``) default to
+    "not a graphic" until set by the code that analyses the region.
+    """
+    bbox: Tuple[float, float, float, float]
+    curve_count: int = 0            # Curve count
+    line_count: int = 0             # Straight line count
+    rect_count: int = 0             # Rectangle count
+    fill_count: int = 0             # Filled shape count
+    color_count: int = 0            # Number of colors used
+    is_graphic: bool = False        # Whether this is a graphic region
+    confidence: float = 0.0         # Confidence score
+    reason: str = ""                # Reasoning for determination
+# ============================================================================
+# Data Classes - Table Detection
+# ============================================================================
+@dataclass
+class TableCandidate:
+    """Table candidate"""
+    strategy: TableDetectionStrategy
+    confidence: float
+    bbox: Tuple[float, float, float, float]
+    grid: Optional[GridInfo] = None
+    cells: List['CellInfo'] = field(default_factory=list)
+    data: List[List[Optional[str]]] = field(default_factory=list)
+    raw_table: Any = None  # Original table object
+    @property
+    def row_count(self) -> int:
+        """Row count"""
+        return len(self.data)
+    @property
+    def col_count(self) -> int:
+        """Column count"""
+        return max(len(row) for row in self.data) if self.data else 0
+@dataclass
+class PageElement:
+    """Page element.
+    One extracted item on a page, tagged with its ElementType, textual
+    content, bounding box and page number. The remaining fields are optional
+    extras that default to None (presumably only filled for table
+    elements - confirm with the code that builds these objects).
+    """
+    element_type: ElementType
+    content: str
+    bbox: Tuple[float, float, float, float]
+    page_num: int
+    table_data: Optional[List[List]] = None
+    cells_info: Optional[List[Dict]] = None
+    annotations: Optional[List[AnnotationInfo]] = None
+    detection_strategy: Optional[TableDetectionStrategy] = None
+    confidence: float = 1.0  # defaults to full confidence
+@dataclass
+class PageBorderInfo:
+    """Page border information"""
+    has_border: bool = False
+    border_bbox: Optional[Tuple[float, float, float, float]] = None
+    border_lines: Dict[str, bool] = field(default_factory=lambda: {
+        'top': False, 'bottom': False, 'left': False, 'right': False
+    })
+# ============================================================================
+# Export
+# ============================================================================
+# Public API of this module: the names exported by ``from ... import *``.
+# Every entry corresponds to a class defined above.
+__all__ = [
+    # Enums
+    'LineThickness',
+    'TableDetectionStrategy',
+    'ElementType',
+    # Config
+    'PDFConfig',
+    # Data Classes
+    'LineInfo',
+    'GridInfo',
+    'CellInfo',
+    'AnnotationInfo',
+    'VectorTextRegion',
+    'GraphicRegionInfo',
+    'TableCandidate',
+    'PageElement',
+    'PageBorderInfo',
+]

xgen_doc2chunk/core/processor/ppt_handler.py ADDED Viewed

@@ -0,0 +1,288 @@
+# xgen_doc2chunk/core/processor/ppt_handler.py
+"""
+PPT Handler - PPT/PPTX Document Processor
+Class-based handler for PPT/PPTX files inheriting from BaseHandler.
+"""
+import logging
+from typing import Any, Dict, List, Optional, Set, TYPE_CHECKING
+from xgen_doc2chunk.core.processor.base_handler import BaseHandler
+from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor
+from xgen_doc2chunk.core.processor.ppt_helper import (
+    ElementType,
+    SlideElement,
+    extract_text_with_bullets,
+    is_simple_table,
+    extract_simple_table_as_text,
+    convert_table_to_html,
+    extract_table_as_text,
+    get_shape_position,
+    is_picture_shape,
+    process_image_shape,
+    process_group_shape,
+    extract_slide_notes,
+    merge_slide_elements,
+)
+from xgen_doc2chunk.core.processor.ppt_helper.ppt_chart_extractor import PPTChartExtractor
+from xgen_doc2chunk.core.processor.ppt_helper.ppt_metadata import PPTMetadataExtractor
+from xgen_doc2chunk.core.processor.ppt_helper.ppt_image_processor import PPTImageProcessor
+if TYPE_CHECKING:
+    from xgen_doc2chunk.core.document_processor import CurrentFile
+    from xgen_doc2chunk.core.functions.chart_extractor import ChartData
+logger = logging.getLogger("document-processor")
+class PPTHandler(BaseHandler):
+    """PPT/PPTX File Processing Handler Class"""
+    def _create_file_converter(self):
+        """Create PPT-specific file converter."""
+        from xgen_doc2chunk.core.processor.ppt_helper.ppt_file_converter import PPTFileConverter
+        return PPTFileConverter()
+    def _create_preprocessor(self):
+        """Create PPT-specific preprocessor."""
+        from xgen_doc2chunk.core.processor.ppt_helper.ppt_preprocessor import PPTPreprocessor
+        return PPTPreprocessor()
+    def _create_chart_extractor(self) -> BaseChartExtractor:
+        """Create PPT-specific chart extractor."""
+        return PPTChartExtractor(self._chart_processor)
+    def _create_metadata_extractor(self):
+        """Create PPT-specific metadata extractor."""
+        return PPTMetadataExtractor()
+    def _create_format_image_processor(self):
+        """Create PPT-specific image processor."""
+        return PPTImageProcessor(
+            directory_path=self._image_processor.config.directory_path,
+            tag_prefix=self._image_processor.config.tag_prefix,
+            tag_suffix=self._image_processor.config.tag_suffix,
+            storage_backend=self._image_processor.storage_backend,
+        )
+    def extract_text(
+        self,
+        current_file: "CurrentFile",
+        extract_metadata: bool = True,
+        **kwargs
+    ) -> str:
+        """
+        Extract text from PPT/PPTX file.
+        Args:
+            current_file: CurrentFile dict containing file info and binary data
+            extract_metadata: Whether to extract metadata
+            **kwargs: Additional options
+        Returns:
+            Extracted text
+        """
+        file_path = current_file.get("file_path", "unknown")
+        self.logger.info(f"PPT processing: {file_path}")
+        return self._extract_ppt_enhanced(current_file, extract_metadata)
+    def _extract_ppt_enhanced(self, current_file: "CurrentFile", extract_metadata: bool = True) -> str:
+        """Enhanced PPT processing with pre-extracted charts."""
+        file_path = current_file.get("file_path", "unknown")
+        self.logger.info(f"Enhanced PPT processing: {file_path}")
+        try:
+            # Step 1: Convert to Presentation using file_converter
+            file_data = current_file.get("file_data", b"")
+            file_stream = self.get_file_stream(current_file)
+            prs = self.file_converter.convert(file_data, file_stream)
+            # Step 2: Preprocess - may transform prs in the future
+            preprocessed = self.preprocess(prs)
+            prs = preprocessed.clean_content  # TRUE SOURCE
+            result_parts = []
+            processed_images: Set[str] = set()
+            total_tables = 0
+            total_images = 0
+            total_charts = 0
+            # Pre-extract all charts using ChartExtractor
+            file_stream.seek(0)
+            chart_data_list = self.chart_extractor.extract_all_from_file(file_stream)
+            chart_idx = [0]  # Mutable container for closure
+            def get_next_chart() -> str:
+                """Callback to get the next pre-extracted chart content."""
+                if chart_idx[0] < len(chart_data_list):
+                    chart_data = chart_data_list[chart_idx[0]]
+                    chart_idx[0] += 1
+                    return self._format_chart_data(chart_data)
+                return ""
+            if extract_metadata:
+                metadata_text = self.extract_and_format_metadata(prs)
+                if metadata_text:
+                    result_parts.append(metadata_text)
+                    result_parts.append("")
+            for slide_idx, slide in enumerate(prs.slides):
+                slide_tag = self.create_slide_tag(slide_idx + 1)
+                result_parts.append(f"\n{slide_tag}\n")
+                elements: List[SlideElement] = []
+                for shape in slide.shapes:
+                    try:
+                        position = get_shape_position(shape)
+                        shape_id = shape.shape_id if hasattr(shape, 'shape_id') else id(shape)
+                        if shape.has_table:
+                            if is_simple_table(shape.table):
+                                simple_text = extract_simple_table_as_text(shape.table)
+                                if simple_text:
+                                    elements.append(SlideElement(
+                                        element_type=ElementType.TEXT,
+                                        content=simple_text,
+                                        position=position,
+                                        shape_id=shape_id
+                                    ))
+                            else:
+                                table_html = convert_table_to_html(shape.table)
+                                if table_html:
+                                    total_tables += 1
+                                    elements.append(SlideElement(
+                                        element_type=ElementType.TABLE,
+                                        content=table_html,
+                                        position=position,
+                                        shape_id=shape_id
+                                    ))
+                        elif is_picture_shape(shape):
+                            image_tag = process_image_shape(shape, processed_images, self.format_image_processor)
+                            if image_tag:
+                                total_images += 1
+                                elements.append(SlideElement(
+                                    element_type=ElementType.IMAGE,
+                                    content=image_tag,
+                                    position=position,
+                                    shape_id=shape_id
+                                ))
+                        elif shape.has_chart:
+                            # Use pre-extracted chart via callback
+                            chart_text = get_next_chart()
+                            if chart_text:
+                                total_charts += 1
+                                elements.append(SlideElement(
+                                    element_type=ElementType.CHART,
+                                    content=chart_text,
+                                    position=position,
+                                    shape_id=shape_id
+                                ))
+                        elif hasattr(shape, "text_frame") and shape.text_frame:
+                            text_content = extract_text_with_bullets(shape.text_frame)
+                            if text_content:
+                                elements.append(SlideElement(
+                                    element_type=ElementType.TEXT,
+                                    content=text_content,
+                                    position=position,
+                                    shape_id=shape_id
+                                ))
+                        elif hasattr(shape, "text") and shape.text.strip():
+                            elements.append(SlideElement(
+                                element_type=ElementType.TEXT,
+                                content=shape.text.strip(),
+                                position=position,
+                                shape_id=shape_id
+                            ))
+                        elif hasattr(shape, "shapes"):
+                            group_elements = process_group_shape(shape, processed_images, self.format_image_processor)
+                            elements.extend(group_elements)
+                    except Exception as shape_e:
+                        self.logger.warning(f"Error processing shape in slide {slide_idx + 1}: {shape_e}")
+                        continue
+                elements.sort(key=lambda e: e.sort_key)
+                slide_content = merge_slide_elements(elements)
+                if slide_content.strip():
+                    result_parts.append(slide_content)
+                else:
+                    result_parts.append("[Empty Slide]\n")
+                notes_text = extract_slide_notes(slide)
+                if notes_text:
+                    result_parts.append(f"\n[Slide Notes]\n{notes_text}\n")
+            result = "".join(result_parts)
+            self.logger.info(f"Enhanced PPT: {len(prs.slides)} slides, {total_tables} tables, "
+                           f"{total_images} images, {total_charts} charts")
+            return result
+        except Exception as e:
+            self.logger.error(f"Error in enhanced PPT processing: {e}")
+            import traceback
+            self.logger.debug(traceback.format_exc())
+            return self._extract_ppt_simple(current_file)
+    def _format_chart_data(self, chart_data: "ChartData") -> str:
+        """Format ChartData using ChartProcessor."""
+        from xgen_doc2chunk.core.functions.chart_extractor import ChartData
+        if not isinstance(chart_data, ChartData):
+            return ""
+        if chart_data.has_data():
+            return self.chart_processor.format_chart_data(
+                chart_type=chart_data.chart_type,
+                title=chart_data.title,
+                categories=chart_data.categories,
+                series=chart_data.series
+            )
+        else:
+            return self.chart_processor.format_chart_fallback(
+                chart_type=chart_data.chart_type,
+                title=chart_data.title
+            )
+    def _extract_ppt_simple(self, current_file: "CurrentFile") -> str:
+        """Simple text extraction (fallback)."""
+        try:
+            file_data = current_file.get("file_data", b"")
+            file_stream = self.get_file_stream(current_file)
+            prs = self.file_converter.convert(file_data, file_stream)
+            result_parts = []
+            for slide_idx, slide in enumerate(prs.slides):
+                slide_tag = self.create_slide_tag(slide_idx + 1)
+                result_parts.append(f"\n{slide_tag}\n")
+                slide_texts = []
+                for shape in slide.shapes:
+                    try:
+                        if hasattr(shape, "text") and shape.text.strip():
+                            slide_texts.append(shape.text.strip())
+                        elif hasattr(shape, "table"):
+                            table_text = extract_table_as_text(shape.table)
+                            if table_text:
+                                slide_texts.append(table_text)
+                    except:
+                        continue
+                if slide_texts:
+                    result_parts.append("\n".join(slide_texts) + "\n")
+                else:
+                    result_parts.append("[Empty Slide]\n")
+            return "".join(result_parts)
+        except Exception as e:
+            self.logger.error(f"Error in simple PPT extraction: {e}")
+            return f"[PPT file processing failed: {str(e)}]"

xgen_doc2chunk/core/processor/ppt_helper/__init__.py ADDED Viewed

@@ -0,0 +1,96 @@
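+"""
+PPT Helper module
+Collection of helper functions for processing PPT/PPTX documents.
+Module layout:
+- ppt_constants: constants, mapping tables, type definitions
+- ppt_metadata: metadata extraction/formatting
+- ppt_bullet: list (bullet/numbering) handling
+- ppt_table: table handling (HTML conversion, merging)
+- ppt_chart_extractor: chart data extraction (ChartExtractor)
+- ppt_shape: shape handling (position, images, groups)
+- ppt_slide: slide handling (notes, element merging)
+"""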
+# === Constants ===
+from xgen_doc2chunk.core.processor.ppt_helper.ppt_constants import (
+    WINGDINGS_MAPPING,
+    WINGDINGS_CHAR_MAPPING,
+    SYMBOL_MAPPING,
+    ElementType,
+    SlideElement,
+)
+# === Metadata ===
+from xgen_doc2chunk.core.processor.ppt_helper.ppt_metadata import (
+    PPTMetadataExtractor,
+)
+# === Bullet/Numbering ===
+from xgen_doc2chunk.core.processor.ppt_helper.ppt_bullet import (
+    extract_text_with_bullets,
+    extract_bullet_info,
+    convert_special_font_char,
+)
+# === Table ===
+from xgen_doc2chunk.core.processor.ppt_helper.ppt_table import (
+    is_simple_table,
+    extract_simple_table_as_text,
+    convert_table_to_html,
+    extract_table_as_text,
+    debug_table_structure,
+)
+# === Chart Extractor ===
+from xgen_doc2chunk.core.processor.ppt_helper.ppt_chart_extractor import (
+    PPTChartExtractor,
+)
+# === Shape ===
+from xgen_doc2chunk.core.processor.ppt_helper.ppt_shape import (
+    get_shape_position,
+    is_picture_shape,
+    process_image_shape,
+    process_group_shape,
+)
+# === Slide ===
+from xgen_doc2chunk.core.processor.ppt_helper.ppt_slide import (
+    extract_slide_notes,
+    merge_slide_elements,
+)
+__all__ = [
+    # Constants
+    "WINGDINGS_MAPPING",
+    "WINGDINGS_CHAR_MAPPING",
+    "SYMBOL_MAPPING",
+    "ElementType",
+    "SlideElement",
+    # Metadata
+    "extract_ppt_metadata",
+    "format_metadata",
+    # Bullet
+    "extract_text_with_bullets",
+    "extract_bullet_info",
+    "convert_special_font_char",
+    # Table
+    "is_simple_table",
+    "extract_simple_table_as_text",
+    "convert_table_to_html",
+    "extract_table_as_text",
+    "debug_table_structure",
+    # Chart Extractor
+    "PPTChartExtractor",
+    # Shape
+    "get_shape_position",
+    "is_picture_shape",
+    "process_image_shape",
+    "process_group_shape",
+    # Slide
+    "extract_slide_notes",
+    "merge_slide_elements",
+]

xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl