PyPI - xgen-doc2chunk - Versions diffs - 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl - Mend

xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (162) hide show

xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py ADDED Viewed

@@ -0,0 +1,220 @@
+# xgen_doc2chunk/core/processor/docx_helper/docx_table_processor.py
+"""
+DOCX Table Processor
+Formats TableData into HTML/Markdown/Text output for DOCX documents.
+Extends the base TableProcessor with DOCX-specific formatting options.
+Key Features:
+- HTML output with border attributes for backward compatibility
+- Special handling for 1x1 container tables
+- Special handling for single column tables
+- Post-processing for DOCX-specific requirements
+Usage:
+    from xgen_doc2chunk.core.processor.docx_helper.docx_table_processor import (
+        DOCXTableProcessor,
+        create_docx_table_processor,
+    )
+    processor = DOCXTableProcessor()
+    html = processor.format_table(table_data)
+"""
+import logging
+from dataclasses import dataclass
+from typing import Optional
+from xgen_doc2chunk.core.functions.table_extractor import TableData
+from xgen_doc2chunk.core.functions.table_processor import (
+    TableProcessor,
+    TableProcessorConfig,
+    TableOutputFormat,
+)
+logger = logging.getLogger("document-processor")
+@dataclass
+class DOCXTableProcessorConfig(TableProcessorConfig):
+    """Configuration for DOCX table processing.
+    Extends TableProcessorConfig with the DOCX-specific switches that
+    DOCXTableProcessor reads. All three switches are enabled by default.
+    Attributes:
+        add_border: Whether to add a border='1' attribute to generated HTML tables
+        collapse_single_cell: Whether a 1x1 table is emitted as its cell content
+            instead of as a table
+        collapse_single_column: Whether a single-column table is emitted as its
+            non-empty cell contents separated by blank lines instead of as a table
+    """
+    # Applied in DOCXTableProcessor.format_table_as_html after the base class renders the HTML.
+    add_border: bool = True
+    # Checked before collapse_single_column, so a 1x1 table takes this path first.
+    collapse_single_cell: bool = True
+    # Applies to any table with exactly one column, regardless of row count.
+    collapse_single_column: bool = True
+class DOCXTableProcessor(TableProcessor):
+    """DOCX-specific table processor.
+    Extends TableProcessor with DOCX-specific formatting:
+    - Adds border='1' to HTML tables for backward compatibility
+    - Collapses 1x1 container tables to plain text
+    - Collapses single-column tables to line-separated text
+    Usage:
+        processor = DOCXTableProcessor()
+        html = processor.format_table(table_data)
+    """
+    def __init__(self, config: Optional[DOCXTableProcessorConfig] = None):
+        """Initialize the DOCX table processor.
+        Args:
+            config: DOCX table processing configuration
+        """
+        if config is None:
+            config = DOCXTableProcessorConfig()
+        super().__init__(config)
+        self.docx_config = config
+    def format_table(self, table: TableData) -> str:
+        """Format a table with DOCX-specific handling.
+        Handles special cases before delegating to base class:
+        - 1x1 tables: Return cell content only (container tables)
+        - Single column tables: Return as line-separated text
+        Args:
+            table: TableData to format
+        Returns:
+            Formatted table string
+        """
+        if not table or not table.rows:
+            return ""
+        # Special case: 1x1 table (container table)
+        if (self.docx_config.collapse_single_cell and
+            table.num_rows == 1 and table.num_cols == 1):
+            if table.rows and table.rows[0]:
+                return table.rows[0][0].content
+            return ""
+        # Special case: Single column table
+        if (self.docx_config.collapse_single_column and
+            table.num_cols == 1):
+            text_items = []
+            for row in table.rows:
+                if row and row[0].content:
+                    text_items.append(row[0].content)
+            if text_items:
+                return "\n\n".join(text_items)
+            return ""
+        # Normal table processing
+        return super().format_table(table)
+    def format_table_as_html(self, table: TableData) -> str:
+        """Format table as HTML with DOCX-specific attributes.
+        Adds border='1' attribute for backward compatibility.
+        Args:
+            table: TableData to format
+        Returns:
+            HTML table string
+        """
+        # Check for special cases first
+        if not table or not table.rows:
+            return ""
+        # 1x1 table handling
+        if (self.docx_config.collapse_single_cell and
+            table.num_rows == 1 and table.num_cols == 1):
+            if table.rows and table.rows[0]:
+                return table.rows[0][0].content
+            return ""
+        # Single column table handling
+        if (self.docx_config.collapse_single_column and
+            table.num_cols == 1):
+            text_items = []
+            for row in table.rows:
+                if row and row[0].content:
+                    text_items.append(row[0].content)
+            if text_items:
+                return "\n\n".join(text_items)
+            return ""
+        # Generate HTML using base class
+        html = super().format_table_as_html(table)
+        # Post-process: Add border attribute
+        if self.docx_config.add_border:
+            html = html.replace("<table>", "<table border='1'>")
+        return html
+# Default configuration: HTML output with every DOCX-specific switch enabled.
+# output_format, clean_whitespace and preserve_merged_cells are not declared in
+# DOCXTableProcessorConfig, so they come from the TableProcessorConfig base class.
+DEFAULT_DOCX_PROCESSOR_CONFIG = DOCXTableProcessorConfig(
+    output_format=TableOutputFormat.HTML,
+    clean_whitespace=True,
+    preserve_merged_cells=True,
+    add_border=True,
+    collapse_single_cell=True,
+    collapse_single_column=True,
+)
+# Module-level default processor (lazy initialized)
+_default_processor: Optional[DOCXTableProcessor] = None
+def get_default_processor() -> DOCXTableProcessor:
+    """Get or create the default DOCX table processor.
+    Returns:
+        Configured DOCXTableProcessor instance
+    """
+    global _default_processor
+    if _default_processor is None:
+        _default_processor = DOCXTableProcessor(DEFAULT_DOCX_PROCESSOR_CONFIG)
+    return _default_processor
+def create_docx_table_processor(
+    config: Optional[DOCXTableProcessorConfig] = None
+) -> DOCXTableProcessor:
+    """Create a DOCX table processor instance.
+    Args:
+        config: DOCX table processing configuration
+    Returns:
+        Configured DOCXTableProcessor instance
+    """
+    return DOCXTableProcessor(config)
+def format_table_as_html(table: TableData) -> str:
+    """Convenience function to format a table as HTML.
+    Uses the default DOCX table processor.
+    Args:
+        table: TableData to format
+    Returns:
+        HTML table string
+    """
+    processor = get_default_processor()
+    return processor.format_table_as_html(table)
+# Public API of this module: the processor class, its config, the default
+# config instance, and the module-level factory / convenience functions.
+__all__ = [
+    'DOCXTableProcessor',
+    'DOCXTableProcessorConfig',
+    'DEFAULT_DOCX_PROCESSOR_CONFIG',
+    'create_docx_table_processor',
+    'get_default_processor',
+    'format_table_as_html',
+]

xgen_doc2chunk/core/processor/excel_handler.py ADDED Viewed

@@ -0,0 +1,353 @@
+# xgen_doc2chunk/core/processor/excel_handler.py
+"""
+Excel Handler - Excel Document Processor (XLSX/XLS)
+Main Features:
+- Metadata extraction (title, author, subject, keywords, creation date, modification date, etc.)
+- Text extraction (direct parsing via openpyxl/xlrd)
+- Table extraction (Markdown or HTML conversion based on merged cells)
+- Inline image extraction and local storage
+- Chart processing (convert to table)
+- Multi-sheet support
+Class-based Handler:
+- ExcelHandler class inherits from BaseHandler to manage config/image_processor
+"""
+from __future__ import annotations
+import logging
+import os
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set
+from xgen_doc2chunk.core.processor.base_handler import BaseHandler
+from xgen_doc2chunk.core.functions.img_processor import ImageProcessor
+from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor
+from xgen_doc2chunk.core.processor.excel_helper.excel_chart_extractor import ExcelChartExtractor
+if TYPE_CHECKING:
+    from openpyxl.workbook import Workbook
+    from openpyxl.worksheet.worksheet import Worksheet
+    from xgen_doc2chunk.core.document_processor import CurrentFile
+from xgen_doc2chunk.core.processor.excel_helper import (
+    # Textbox
+    extract_textboxes_from_xlsx,
+    # Table
+    convert_xlsx_sheet_to_table,
+    convert_xls_sheet_to_table,
+    # Object Detection
+    convert_xlsx_objects_to_tables,
+    convert_xls_objects_to_tables,
+)
+from xgen_doc2chunk.core.processor.excel_helper.excel_metadata import (
+    XLSXMetadataExtractor,
+    XLSMetadataExtractor,
+)
+from xgen_doc2chunk.core.processor.excel_helper.excel_image_processor import (
+    ExcelImageProcessor,
+)
+logger = logging.getLogger("document-processor")
+# ============================================================================
+# ExcelHandler Class
+# ============================================================================
+class ExcelHandler(BaseHandler):
+    """
+    Excel Document Handler (XLSX/XLS)
+    Inherits from BaseHandler to manage config and image_processor at instance level.
+    Usage:
+        handler = ExcelHandler(config=config, image_processor=image_processor)
+        text = handler.extract_text(current_file)
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self._xlsx_metadata_extractor = None
+        self._xls_metadata_extractor = None
+    def _create_file_converter(self):
+        """Create Excel-specific file converter."""
+        from xgen_doc2chunk.core.processor.excel_helper.excel_file_converter import ExcelFileConverter
+        return ExcelFileConverter()
+    def _create_preprocessor(self):
+        """Create Excel-specific preprocessor."""
+        from xgen_doc2chunk.core.processor.excel_helper.excel_preprocessor import ExcelPreprocessor
+        return ExcelPreprocessor()
+    def _create_chart_extractor(self) -> BaseChartExtractor:
+        """Create Excel-specific chart extractor."""
+        return ExcelChartExtractor(self._chart_processor)
+    def _create_metadata_extractor(self):
+        """Create XLSX-specific metadata extractor (default)."""
+        return XLSXMetadataExtractor()
+    def _create_format_image_processor(self):
+        """Create Excel-specific image processor."""
+        return ExcelImageProcessor(
+            directory_path=self._image_processor.config.directory_path,
+            tag_prefix=self._image_processor.config.tag_prefix,
+            tag_suffix=self._image_processor.config.tag_suffix,
+            storage_backend=self._image_processor.storage_backend,
+        )
+    def _get_xls_metadata_extractor(self):
+        """Get XLS-specific metadata extractor."""
+        if self._xls_metadata_extractor is None:
+            self._xls_metadata_extractor = XLSMetadataExtractor()
+        return self._xls_metadata_extractor
+    def extract_text(
+        self,
+        current_file: "CurrentFile",
+        extract_metadata: bool = True,
+        **kwargs
+    ) -> str:
+        """
+        Extract text from Excel file.
+        Args:
+            current_file: CurrentFile dict containing file info and binary data
+            extract_metadata: Whether to extract metadata
+            **kwargs: Additional options
+        Returns:
+            Extracted text
+        """
+        file_path = current_file.get("file_path", "unknown")
+        ext = current_file.get("file_extension", os.path.splitext(file_path)[1]).lower()
+        # Normalize extension (remove leading dot if present)
+        ext = ext.lstrip('.')
+        self.logger.info(f"Excel processing: {file_path}, ext: {ext}")
+        if ext == 'xlsx':
+            return self._extract_xlsx(current_file, extract_metadata)
+        elif ext == 'xls':
+            return self._extract_xls(current_file, extract_metadata)
+        else:
+            raise ValueError(f"Unsupported Excel format: {ext}")
+    def _extract_xlsx(
+        self,
+        current_file: "CurrentFile",
+        extract_metadata: bool = True
+    ) -> str:
+        """XLSX file processing."""
+        file_path = current_file.get("file_path", "unknown")
+        self.logger.info(f"XLSX processing: {file_path}")
+        try:
+            # Step 1: Convert to Workbook using file_converter
+            file_data = current_file.get("file_data", b"")
+            wb = self.file_converter.convert(file_data, extension='xlsx')
+            # Step 2: Preprocess - may transform wb in the future
+            preprocessed = self.preprocess(wb)
+            wb = preprocessed.clean_content  # TRUE SOURCE
+            preload = self._preload_xlsx_data(current_file, wb, extract_metadata)
+            result_parts = [preload["metadata_str"]] if preload["metadata_str"] else []
+            processed_images: Set[str] = set()
+            stats = {"charts": 0, "images": 0, "textboxes": 0}
+            for sheet_name in wb.sheetnames:
+                sheet_result = self._process_xlsx_sheet(
+                    wb[sheet_name], sheet_name, preload, processed_images, stats
+                )
+                result_parts.append(sheet_result)
+            remaining = self._process_remaining_charts(
+                preload["chart_data_list"], preload["chart_idx"], processed_images, stats
+            )
+            if remaining:
+                result_parts.append(remaining)
+            result = "".join(result_parts)
+            self.logger.info(
+                f"XLSX processing completed: {len(wb.sheetnames)} sheets, "
+                f"{stats['charts']} charts, {stats['images']} images"
+            )
+            return result
+        except Exception as e:
+            self.logger.error(f"Error in XLSX processing: {e}")
+            import traceback
+            self.logger.debug(traceback.format_exc())
+            raise
+    def _extract_xls(
+        self,
+        current_file: "CurrentFile",
+        extract_metadata: bool = True
+    ) -> str:
+        """XLS file processing."""
+        file_path = current_file.get("file_path", "unknown")
+        self.logger.info(f"XLS processing: {file_path}")
+        try:
+            # Step 1: Convert to Workbook using file_converter
+            file_data = current_file.get("file_data", b"")
+            wb = self.file_converter.convert(file_data, extension='xls')
+            # Step 2: Preprocess - may transform wb in the future
+            preprocessed = self.preprocess(wb)
+            wb = preprocessed.clean_content  # TRUE SOURCE
+            result_parts = []
+            if extract_metadata:
+                xls_extractor = self._get_xls_metadata_extractor()
+                metadata_str = xls_extractor.extract_and_format(wb)
+                if metadata_str:
+                    result_parts.append(metadata_str + "\n\n")
+            for sheet_idx in range(wb.nsheets):
+                ws = wb.sheet_by_index(sheet_idx)
+                sheet_tag = self.create_sheet_tag(ws.name)
+                result_parts.append(f"\n{sheet_tag}\n")
+                table_contents = convert_xls_objects_to_tables(ws, wb)
+                if table_contents:
+                    for i, table_content in enumerate(table_contents, 1):
+                        if len(table_contents) > 1:
+                            result_parts.append(f"\n[Table {i}]\n{table_content}\n")
+                        else:
+                            result_parts.append(f"\n{table_content}\n")
+            result = "".join(result_parts)
+            self.logger.info(f"XLS processing completed: {wb.nsheets} sheets")
+            return result
+        except Exception as e:
+            self.logger.error(f"Error in XLS processing: {e}")
+            import traceback
+            self.logger.debug(traceback.format_exc())
+            raise
+    def _preload_xlsx_data(
+        self, current_file: "CurrentFile", wb, extract_metadata: bool
+    ) -> Dict[str, Any]:
+        """Extract preprocessing data from XLSX file."""
+        file_path = current_file.get("file_path", "unknown")
+        file_stream = self.get_file_stream(current_file)
+        result = {
+            "metadata_str": "",
+            "chart_data_list": [],  # ChartData instances from extractor
+            "images_data": [],
+            "textboxes_by_sheet": {},
+            "chart_idx": 0,
+        }
+        if extract_metadata:
+            result["metadata_str"] = self.extract_and_format_metadata(wb)
+            if result["metadata_str"]:
+                result["metadata_str"] += "\n\n"
+        # Use ChartExtractor for chart extraction
+        result["chart_data_list"] = self.chart_extractor.extract_all_from_file(file_stream)
+        # Use format_image_processor directly for image extraction
+        image_processor = self.format_image_processor
+        if hasattr(image_processor, 'extract_images_from_xlsx'):
+            result["images_data"] = image_processor.extract_images_from_xlsx(file_path)
+        else:
+            result["images_data"] = {}
+        result["textboxes_by_sheet"] = extract_textboxes_from_xlsx(file_path)
+        return result
+    def _process_xlsx_sheet(
+        self, ws, sheet_name: str, preload: Dict[str, Any],
+        processed_images: Set[str], stats: Dict[str, int]
+    ) -> str:
+        """Process a single XLSX sheet."""
+        sheet_tag = self.create_sheet_tag(sheet_name)
+        parts = [f"\n{sheet_tag}\n"]
+        table_contents = convert_xlsx_objects_to_tables(ws)
+        if table_contents:
+            for i, table_content in enumerate(table_contents, 1):
+                if len(table_contents) > 1:
+                    parts.append(f"\n[Table {i}]\n{table_content}\n")
+                else:
+                    parts.append(f"\n{table_content}\n")
+        # Chart processing using ChartExtractor
+        if hasattr(ws, '_charts') and ws._charts:
+            chart_data_list = preload["chart_data_list"]
+            for chart in ws._charts:
+                if preload["chart_idx"] < len(chart_data_list):
+                    chart_data = chart_data_list[preload["chart_idx"]]
+                    # chart_data is already ChartData instance, format it
+                    chart_output = self._format_chart_data(chart_data)
+                    if chart_output:
+                        parts.append(f"\n{chart_output}\n")
+                        stats["charts"] += 1
+                    preload["chart_idx"] += 1
+        # Image processing - use format_image_processor directly
+        image_processor = self.format_image_processor
+        if hasattr(image_processor, 'get_sheet_images'):
+            sheet_images = image_processor.get_sheet_images(ws, preload["images_data"], "")
+        else:
+            sheet_images = []
+        for image_data, anchor in sheet_images:
+            if image_data:
+                image_tag = self.format_image_processor.save_image(image_data)
+                if image_tag:
+                    parts.append(f"\n{image_tag}\n")
+                    stats["images"] += 1
+        # Textbox processing
+        textboxes = preload["textboxes_by_sheet"].get(sheet_name, [])
+        for tb in textboxes:
+            if tb:
+                parts.append(f"\n[Textbox] {tb}\n")
+                stats["textboxes"] += 1
+        return "".join(parts)
+    def _format_chart_data(self, chart_data) -> str:
+        """Format ChartData using ChartProcessor."""
+        from xgen_doc2chunk.core.functions.chart_extractor import ChartData
+        if not isinstance(chart_data, ChartData):
+            return ""
+        if chart_data.has_data():
+            return self.chart_processor.format_chart_data(
+                chart_type=chart_data.chart_type,
+                title=chart_data.title,
+                categories=chart_data.categories,
+                series=chart_data.series
+            )
+        else:
+            return self.chart_processor.format_chart_fallback(
+                chart_type=chart_data.chart_type,
+                title=chart_data.title
+            )
+    def _process_remaining_charts(
+        self, chart_data_list: List, chart_idx: int,
+        processed_images: Set[str], stats: Dict[str, int]
+    ) -> str:
+        """Process remaining charts not associated with sheets."""
+        parts = []
+        while chart_idx < len(chart_data_list):
+            chart_data = chart_data_list[chart_idx]
+            chart_output = self._format_chart_data(chart_data)
+            if chart_output:
+                parts.append(f"\n{chart_output}\n")
+                stats["charts"] += 1
+            chart_idx += 1
+        return "".join(parts)
+# Only the handler class is exported from this module.
+__all__ = ["ExcelHandler"]

xgen_doc2chunk/core/processor/excel_helper/__init__.py ADDED Viewed

@@ -0,0 +1,97 @@
+"""
+Excel Helper Module
+Handles extraction of elements (textboxes, charts, images, tables, etc.) from XLSX/XLS files.
+Module Structure:
+- excel_chart_constants: Chart type mapping constants
+- excel_chart_extractor: Chart extraction (ChartExtractor)
+- excel_table_xlsx: XLSX table conversion
+- excel_table_xls: XLS table conversion
+- excel_textbox: Textbox extraction
+- excel_metadata: Metadata extraction
+- excel_image_processor: Image extraction (ExcelImageProcessor; replaces the excel_image utility functions)
+- excel_layout_detector: Layout detection
+"""
+# === Textbox ===
+from xgen_doc2chunk.core.processor.excel_helper.excel_textbox import extract_textboxes_from_xlsx
+# === Metadata ===
+from xgen_doc2chunk.core.processor.excel_helper.excel_metadata import (
+    ExcelMetadataExtractor,
+    XLSXMetadataExtractor,
+    XLSMetadataExtractor,
+)
+# === Chart Extractor ===
+from xgen_doc2chunk.core.processor.excel_helper.excel_chart_extractor import (
+    ExcelChartExtractor,
+    CHART_TYPE_MAP,
+)
+# === Image Processor (replaces excel_image.py utility functions) ===
+from xgen_doc2chunk.core.processor.excel_helper.excel_image_processor import (
+    ExcelImageProcessor,
+)
+# === Table XLSX ===
+from xgen_doc2chunk.core.processor.excel_helper.excel_table_xlsx import (
+    has_merged_cells_xlsx,
+    convert_xlsx_sheet_to_table,
+    convert_xlsx_sheet_to_markdown,
+    convert_xlsx_sheet_to_html,
+    convert_xlsx_objects_to_tables,
+)
+# === Table XLS ===
+from xgen_doc2chunk.core.processor.excel_helper.excel_table_xls import (
+    has_merged_cells_xls,
+    convert_xls_sheet_to_table,
+    convert_xls_sheet_to_markdown,
+    convert_xls_sheet_to_html,
+    convert_xls_objects_to_tables,
+)
+# === Layout Detector ===
+from xgen_doc2chunk.core.processor.excel_helper.excel_layout_detector import (
+    layout_detect_range_xlsx,
+    layout_detect_range_xls,
+    object_detect_xlsx,
+    object_detect_xls,
+    LayoutRange,
+)
+# Public API of the excel_helper package: every name listed here is
+# re-exported from one of the submodule imports above.
+__all__ = [
+    # Textbox
+    'extract_textboxes_from_xlsx',
+    # Metadata
+    'ExcelMetadataExtractor',
+    'XLSXMetadataExtractor',
+    'XLSMetadataExtractor',
+    # Chart Constants (imported via excel_chart_extractor)
+    'CHART_TYPE_MAP',
+    # Chart Extractor
+    'ExcelChartExtractor',
+    # Image Processor
+    'ExcelImageProcessor',
+    # Table XLSX
+    'has_merged_cells_xlsx',
+    'convert_xlsx_sheet_to_table',
+    'convert_xlsx_sheet_to_markdown',
+    'convert_xlsx_sheet_to_html',
+    'convert_xlsx_objects_to_tables',
+    # Table XLS
+    'has_merged_cells_xls',
+    'convert_xls_sheet_to_table',
+    'convert_xls_sheet_to_markdown',
+    'convert_xls_sheet_to_html',
+    'convert_xls_objects_to_tables',
+    # Layout Detector
+    'layout_detect_range_xlsx',
+    'layout_detect_range_xls',
+    'object_detect_xlsx',
+    'object_detect_xls',
+    'LayoutRange',
+]

xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl

xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.2__py3-none-any.whl