PyPI - xgen-doc2chunk - Versions diffs - 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl - Mend

xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (162) hide show

xgen_doc2chunk/core/processor/ppt_helper/ppt_bullet.py ADDED Viewed

@@ -0,0 +1,332 @@
+"""
+PPT 목록(Bullet/Numbering) 처리 모듈
+포함 함수:
+- extract_text_with_bullets(): TextFrame에서 목록 기호 포함 텍스트 추출
+- extract_bullet_info(): Paragraph에서 목록 정보 추출
+- convert_special_font_char(): 특수 폰트 문자 변환
+지원하는 목록 스타일:
+- Bullet: •, ○, ■, □, ✓, ➢ 등 모든 Unicode 문자
+- Numbering: 1., I., i., A., a., (1), 1) 등
+"""
+import logging
+from typing import Any, Dict
+from xgen_doc2chunk.core.processor.ppt_helper.ppt_constants import WINGDINGS_MAPPING, WINGDINGS_CHAR_MAPPING, SYMBOL_MAPPING
+logger = logging.getLogger("document-processor")
+def extract_text_with_bullets(text_frame) -> str:
+    """
+    Render a text frame as plain text while keeping list markers and nesting.
+    Each non-empty paragraph becomes one output line, indented two spaces per
+    paragraph level. Numbered paragraphs are prefixed with their formatted
+    number, bulleted paragraphs with their bullet character, and plain
+    paragraphs are emitted as-is. Empty paragraphs become empty lines.
+    Args:
+        text_frame: Object exposing ``paragraphs`` and ``text`` (the
+            text_frame of a shape).
+    Returns:
+        The rendered lines joined with newlines. "" for a falsy text_frame.
+        If anything fails while walking the paragraphs, the stripped
+        ``text_frame.text`` is returned instead (or "" when it is empty).
+    """
+    if not text_frame:
+        return ""
+    rendered = []
+    counters = {}  # running number per indent level
+    try:
+        for para in text_frame.paragraphs:
+            body = para.text.strip()
+            if not body:
+                # Blank paragraphs are preserved as empty lines.
+                rendered.append("")
+                continue
+            depth = getattr(para, 'level', 0)
+            pad = "  " * depth  # two spaces per level
+            info = extract_bullet_info(para)
+            kind = info['type']
+            if kind == 'numbered':
+                style = info['format']
+                ordinal = _get_or_increment_number(counters, depth, info)
+                marker = _format_number(ordinal, style)
+            elif kind == 'bulleted':
+                marker = info['char']
+            else:
+                # A plain paragraph ends every running numbered sequence.
+                counters.clear()
+                rendered.append(f"{pad}{body}")
+                continue
+            rendered.append(f"{pad}{marker} {body}")
+    except Exception as e:
+        logger.warning(f"Error extracting text with bullets: {e}")
+        # Fallback: plain text without list markers.
+        return text_frame.text.strip() if text_frame.text else ""
+    return "\n".join(rendered)
+def extract_bullet_info(paragraph) -> Dict[str, Any]:
+    """
+    Read the list (bullet / numbering) settings stored on a paragraph.
+    Only the paragraph's own ``pPr`` element is inspected. Bullet characters
+    that belong to a bullet font are passed through
+    convert_special_font_char() together with the lower-cased typeface.
+    Args:
+        paragraph: Object exposing ``_element.pPr`` (a python-pptx Paragraph).
+    Returns:
+        {
+            'type': 'none' | 'bulleted' | 'numbered',
+            'char': str,           # bullet character (type == 'bulleted')
+            'format': str,         # numbering format (type == 'numbered')
+            'start_at': int        # first number of the sequence
+        }
+        Any unexpected error is logged at debug level and the result built
+        so far is returned.
+    """
+    result = {
+        'type': 'none',
+        'char': None,
+        'format': None,
+        'start_at': 1
+    }
+    try:
+        pPr = paragraph._element.pPr
+        if pPr is None:
+            return result
+        ns = {'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'}
+        # buNone explicitly switches the list marker off.
+        if pPr.find('.//a:buNone', namespaces=ns) is not None:
+            return result
+        # The bullet font decides whether the character needs remapping.
+        buFont = pPr.find('.//a:buFont', namespaces=ns)
+        font_typeface = None
+        if buFont is not None:
+            font_typeface = buFont.get('typeface', '').lower()
+        buChar = pPr.find('.//a:buChar', namespaces=ns)
+        if buChar is not None:
+            result['type'] = 'bulleted'
+            raw_char = buChar.get('char', '•')
+            if font_typeface:
+                result['char'] = convert_special_font_char(raw_char, font_typeface)
+            else:
+                # An empty char attribute falls back to the default bullet,
+                # matching what the font-conversion path does.
+                result['char'] = raw_char or '•'
+            return result
+        buAutoNum = pPr.find('.//a:buAutoNum', namespaces=ns)
+        if buAutoNum is not None:
+            result['type'] = 'numbered'
+            result['format'] = buAutoNum.get('type', 'arabicPeriod')
+            try:
+                result['start_at'] = int(buAutoNum.get('startAt', '1'))
+            except (TypeError, ValueError):
+                # Malformed startAt: keep the default start of 1.
+                result['start_at'] = 1
+            return result
+        # A bullet font without buChar: use the default bullet.
+        if buFont is not None:
+            result['type'] = 'bulleted'
+            result['char'] = '•'
+            return result
+    except Exception as e:
+        logger.debug(f"Error extracting bullet info: {e}")
+    return result
+def convert_special_font_char(char: str, font_typeface: str) -> str:
+    """
+    Convert a bullet character from a symbol font into a regular Unicode one.
+    Symbol-encoded fonts are frequently serialized with the glyph byte shifted
+    into the Private Use Area (U+F000 + byte, e.g. U+F0B7 for byte 0xB7).
+    For Wingdings and Symbol typefaces such characters are folded back to the
+    byte value before the mapping tables are consulted, so both spellings of
+    the same glyph resolve identically.
+    Args:
+        char: Raw bullet character; only its first character is used for the
+            code-based lookups.
+        font_typeface: Typeface name of the bullet font (lower-cased).
+    Returns:
+        - Wingdings: the mapped character, or '•' when unmapped.
+        - Symbol: the mapped character; when unmapped, the original character,
+          or '•' if it came from the Private Use Area.
+        - Webdings: always '•' (no mapping table yet).
+        - Any other font: ``char`` unchanged.
+        '•' is also returned for empty input or on any error.
+    """
+    if not char:
+        return '•'
+    try:
+        is_wingdings = 'wingdings' in font_typeface
+        if is_wingdings or 'symbol' in font_typeface:
+            char_code = ord(char[0])
+            # Fold Private Use Area code points back to the font byte.
+            from_pua = 0xF000 <= char_code <= 0xF0FF
+            if from_pua:
+                char_code -= 0xF000
+                char = chr(char_code)
+            if is_wingdings:
+                # Character-based mapping first, then the code-based table.
+                if char in WINGDINGS_CHAR_MAPPING:
+                    return WINGDINGS_CHAR_MAPPING[char]
+                if char_code in WINGDINGS_MAPPING:
+                    return WINGDINGS_MAPPING[char_code]
+                # Logged to help extend the tables.
+                logger.debug(f"Unmapped Wingdings char: '{char}' (code: {char_code}, hex: 0x{char_code:02X})")
+                return '•'
+            if char_code in SYMBOL_MAPPING:
+                return SYMBOL_MAPPING[char_code]
+            # A raw Private Use Area code point has no portable glyph.
+            return '•' if from_pua else char
+        if 'webdings' in font_typeface:
+            return '•'
+        # Regular fonts need no conversion.
+        return char
+    except Exception as e:
+        logger.debug(f"Error converting special font char: {e}")
+        return '•'
+def _get_or_increment_number(numbering_state: Dict, level: int, bullet_info: Dict) -> int:
+    """
+    Advance the numbering counter of one indent level and return its value.
+    A level seen for the first time starts at ``bullet_info['start_at']``;
+    otherwise its counter is incremented by one. Counters of all deeper
+    levels are dropped so nested lists restart under the new item.
+    Args:
+        numbering_state: Mutable mapping of level -> current number
+            (updated in place).
+        level: Indent level of the current paragraph.
+        bullet_info: List info dict; only 'start_at' is read.
+    Returns:
+        The number to use for the current paragraph.
+    """
+    if level in numbering_state:
+        numbering_state[level] += 1
+    else:
+        numbering_state[level] = bullet_info['start_at']
+    # Collect first, then delete, to avoid mutating while iterating.
+    deeper_levels = [depth for depth in numbering_state if depth > level]
+    for depth in deeper_levels:
+        del numbering_state[depth]
+    return numbering_state[level]
+def _format_number(num: int, format_type: str) -> str:
+    """
+    Format a list number according to a numbering format name.
+    The digit style is chosen from the format name: names containing
+    'roman' use Roman numerals, names containing 'alpha' use letters, and
+    anything else uses decimal digits. Roman/alpha output is lower-cased
+    when the name contains 'Lc'. The decoration is chosen by the first
+    matching marker in the name:
+    - Period:    1.
+    - ParenBoth: (1)
+    - ParenR:    1)
+    - ParenL:    (1
+    - Plain:     1
+    - otherwise: 1.
+    Args:
+        num: Number to format.
+        format_type: Format name, e.g. 'arabicPeriod', 'romanLcParenR'.
+    Returns:
+        The formatted number string.
+    """
+    scheme = format_type.lower()
+    if 'roman' in scheme or 'alpha' in scheme:
+        to_text = _to_roman if 'roman' in scheme else _to_alpha
+        digits = to_text(num)
+        if 'Lc' in format_type:  # lower-case variant
+            digits = digits.lower()
+    else:
+        digits = str(num)
+    # (marker in the format name, text before, text after), checked in order.
+    decorations = (
+        ('Period', '', '.'),
+        ('ParenBoth', '(', ')'),
+        ('ParenR', '', ')'),
+        ('ParenL', '(', ''),
+        ('Plain', '', ''),
+    )
+    for marker, lead, trail in decorations:
+        if marker in format_type:
+            return f"{lead}{digits}{trail}"
+    # Unknown decoration: default to a trailing period.
+    return f"{digits}."
+def _to_roman(num: int) -> str:
+    """
+    Convert an integer to an upper-case Roman numeral.
+    Args:
+        num: Integer, intended range 1-3999.
+    Returns:
+        Roman numeral string (e.g. 1 -> I, 4 -> IV, 9 -> IX); "" for 0.
+    """
+    # Values in descending order, including the subtractive pairs.
+    numerals = (
+        (1000, 'M'), (900, 'CM'), (500, 'D'), (400, 'CD'),
+        (100, 'C'), (90, 'XC'), (50, 'L'), (40, 'XL'),
+        (10, 'X'), (9, 'IX'), (5, 'V'), (4, 'IV'), (1, 'I'),
+    )
+    text = ""
+    remaining = num
+    for value, glyph in numerals:
+        repeat = remaining // value
+        remaining = remaining % value
+        text += glyph * repeat
+    return text
+def _to_alpha(num: int) -> str:
+    """
+    Convert a positive integer to upper-case spreadsheet-style letters.
+    Args:
+        num: Positive integer.
+    Returns:
+        Letter string (e.g. 1 -> A, 2 -> B, 26 -> Z, 27 -> AA); "" when
+        num is 0 or negative.
+    """
+    letters = ""
+    remaining = num
+    while remaining > 0:
+        # Shift to 0-based so that 26 maps to 'Z' instead of rolling over.
+        remaining, offset = divmod(remaining - 1, 26)
+        letters = chr(65 + offset) + letters
+    return letters

xgen_doc2chunk/core/processor/ppt_helper/ppt_chart_extractor.py ADDED Viewed

@@ -0,0 +1,182 @@
+"""
+PPT Chart Extractor
+Extracts chart data from PowerPoint files (PPTX).
+Uses python-pptx Presentation and Chart objects.
+Provides:
+- extract(): Single chart extraction from python-pptx Chart object
+- extract_all_from_file(): Extract all charts from PPTX file in slide order
+"""
+import io
+import logging
+from typing import Any, BinaryIO, Dict, List, Optional, Union
+from pptx import Presentation
+from xgen_doc2chunk.core.functions.chart_extractor import BaseChartExtractor, ChartData
+logger = logging.getLogger("document-processor")
+class PPTChartExtractor(BaseChartExtractor):
+    """
+    Chart extractor for PowerPoint files.
+    Supports:
+    - Direct python-pptx Chart object extraction
+    - Full file extraction via extract_all_from_file(), including charts
+      inside (arbitrarily nested) group shapes
+    """
+    # ========================================================================
+    # Main Interface
+    # ========================================================================
+    def extract(self, chart_element: Any) -> ChartData:
+        """
+        Extract chart data from python-pptx Chart object.
+        Args:
+            chart_element: python-pptx Chart object
+        Returns:
+            ChartData with extracted information; an empty ChartData when
+            chart_element is falsy. Parts that cannot be read fall back to
+            their defaults instead of raising.
+        """
+        if not chart_element:
+            return ChartData()
+        title = self._extract_title(chart_element)
+        chart_type = self._extract_chart_type(chart_element)
+        categories = self._extract_categories(chart_element)
+        series = self._extract_series(chart_element)
+        return ChartData(
+            chart_type=chart_type,
+            title=title,
+            categories=categories,
+            series=series
+        )
+    def extract_all_from_file(
+        self,
+        file_source: Union[str, bytes, BinaryIO]
+    ) -> List[ChartData]:
+        """
+        Extract all charts from a PowerPoint file in slide order.
+        Args:
+            file_source: File path (str or path-like), raw bytes, or a
+                seekable binary file-like object (rewound to offset 0).
+        Returns:
+            List of ChartData for all charts in the file, in slide order and
+            shape order within a slide. On error the failure is logged and
+            the charts collected so far are returned.
+        """
+        charts = []
+        try:
+            # Prepare file-like object
+            if isinstance(file_source, (bytes, bytearray)):
+                file_obj = io.BytesIO(file_source)
+            elif hasattr(file_source, 'read'):
+                file_source.seek(0)
+                file_obj = file_source
+            else:
+                # Anything else is treated as a path (str or path-like);
+                # read it fully so the file handle is closed right away.
+                with open(file_source, 'rb') as f:
+                    file_obj = io.BytesIO(f.read())
+            # Open presentation
+            prs = Presentation(file_obj)
+            # Iterate slides in order
+            for slide in prs.slides:
+                for chart in self._iter_charts(slide.shapes):
+                    charts.append(self.extract(chart))
+            logger.info(f"Extracted {len(charts)} charts from PowerPoint file")
+        except Exception as e:
+            logger.error(f"Error extracting charts from PowerPoint: {e}")
+        return charts
+    # ========================================================================
+    # Private Methods
+    # ========================================================================
+    def _iter_charts(self, shapes):
+        """Yield every chart in ``shapes``, descending into nested groups."""
+        for shape in shapes:
+            if getattr(shape, 'has_chart', False):
+                yield shape.chart
+            # Group shapes expose their children through ``shapes``.
+            if hasattr(shape, 'shapes'):
+                yield from self._iter_charts(shape.shapes)
+    def _extract_title(self, chart) -> Optional[str]:
+        """Return the stripped chart title, or None when absent/unreadable."""
+        try:
+            if chart.has_title and chart.chart_title:
+                if chart.chart_title.has_text_frame:
+                    title_text = chart.chart_title.text_frame.text
+                    if title_text:
+                        return title_text.strip()
+        except Exception as e:
+            logger.debug("Could not read chart title: %s", e)
+        return None
+    def _extract_chart_type(self, chart) -> str:
+        """
+        Return a readable chart type name.
+        Takes ``str(chart.chart_type)``, keeps the part after the last '.'
+        and before the first space, replaces underscores with spaces and
+        title-cases it. Falls back to "Chart".
+        """
+        try:
+            if hasattr(chart, 'chart_type'):
+                type_str = str(chart.chart_type)
+                type_name = type_str.split('.')[-1].split(' ')[0]
+                return type_name.replace('_', ' ').title()
+        except Exception as e:
+            logger.debug("Could not read chart type: %s", e)
+        return "Chart"
+    def _extract_categories(self, chart) -> List[str]:
+        """Return the category labels of the first plot that has any."""
+        categories = []
+        try:
+            if hasattr(chart, 'plots') and chart.plots:
+                for plot in chart.plots:
+                    if hasattr(plot, 'categories') and plot.categories:
+                        categories = [str(c) for c in plot.categories]
+                        break
+        except Exception as e:
+            logger.debug("Could not read chart categories: %s", e)
+        return categories
+    def _extract_series(self, chart) -> List[Dict[str, Any]]:
+        """
+        Return one ``{'name': str, 'values': list}`` dict per chart series.
+        A series whose values cannot be read is kept with an empty list.
+        """
+        series_data = []
+        try:
+            for idx, series in enumerate(chart.series):
+                series_info = {
+                    'name': self._get_series_name(series, idx),
+                    'values': []
+                }
+                try:
+                    if hasattr(series, 'values') and series.values:
+                        series_info['values'] = list(series.values)
+                except Exception as e:
+                    logger.debug("Could not read values of series %d: %s", idx, e)
+                series_data.append(series_info)
+        except Exception as e:
+            logger.debug("Could not read chart series: %s", e)
+        return series_data
+    def _get_series_name(self, series, idx: int) -> str:
+        """Return the series name, or "Series <idx + 1>" when unavailable."""
+        try:
+            if hasattr(series, 'name') and series.name:
+                return str(series.name)
+        except Exception as e:
+            logger.debug("Could not read name of series %d: %s", idx, e)
+        return f"Series {idx + 1}"
+__all__ = ['PPTChartExtractor']

xgen_doc2chunk/core/processor/ppt_helper/ppt_constants.py ADDED Viewed

@@ -0,0 +1,119 @@
+"""
+PPT 상수 및 타입 정의 모듈
+포함 내용:
+- Wingdings/Symbol 폰트 매핑 테이블
+- ElementType Enum
+- SlideElement dataclass
+"""
+from dataclasses import dataclass
+from enum import Enum
+from typing import Tuple
+# === Wingdings/Symbol font mapping tables ===
+# Lookup tables that turn list markers drawn with special fonts
+# (Wingdings, Symbol, ...) in a PPT into proper Unicode characters.
+# WINGDINGS_MAPPING is keyed by the character code.
+WINGDINGS_MAPPING = {
+    # Basic shapes
+    0x6C: '●',   # 'l' -> filled circle
+    0x6D: '○',   # 'm' -> empty circle
+    0x6E: '■',   # 'n' -> filled square
+    0x6F: '□',   # 'o' -> empty square
+    0x70: '◆',   # 'p' -> filled diamond
+    0x71: '◇',   # 'q' -> empty diamond
+    0x75: '◆',   # 'u' -> diamond
+    0x76: '❖',   # 'v' -> diamond variant
+    # Check marks / X marks
+    0xFC: '✓',   # check mark
+    0xFB: '✓',   # check mark variant
+    0xFD: '✗',   # X mark
+    0xFE: '✘',   # heavy X
+    # Arrows
+    0xD8: '➢',   # Ø -> 3D arrowhead (most commonly used)
+    0xE0: '➢',   # right arrow
+    0xE1: '⬅',   # left arrow
+    0xE2: '⬆',   # up arrow
+    0xE3: '⬇',   # down arrow
+    0xE4: '⬌',   # two-way arrow
+    0xE8: '➢',   # arrow (è)
+    0xE9: '➣',   # arrow variant
+    0xEA: '➤',   # triangular arrow
+    0xF0: '➢',   # arrow
+    0xD0: '➢',   # arrow
+    # Pointing hands
+    0x46: '☞',   # 'F' -> right-pointing hand
+    0x47: '☜',   # 'G' -> left-pointing hand
+    # Stars / special symbols
+    0xAB: '★',   # filled star
+    0xAC: '☆',   # empty star
+    0xA7: '■',   # '§' -> filled square (same glyph WINGDINGS_CHAR_MAPPING uses for '§')
+    # Circled digits
+    # NOTE(review): the standard Wingdings layout appears to place circled
+    # digits at 0x80-0x8A rather than at the ASCII digits - confirm.
+    0x31: '①',   # '1'
+    0x32: '②',   # '2'
+    0x33: '③',   # '3'
+    0x34: '④',   # '4'
+    0x35: '⑤',   # '5'
+    0x36: '⑥',   # '6'
+    0x37: '⑦',   # '7'
+    0x38: '⑧',   # '8'
+    0x39: '⑨',   # '9'
+    0x30: '⓪',   # '0'
+}
+# Direct character -> Unicode mapping for Wingdings (keyed by the character
+# itself rather than by its code).
+WINGDINGS_CHAR_MAPPING = dict((
+    ('§', '■'),    # section sign -> filled square
+    ('Ø', '➢'),    # 3D arrowhead (most commonly used)
+    ('ü', '✓'),    # check mark
+    ('u', '◆'),    # diamond
+    ('n', '■'),    # filled square
+    ('l', '●'),    # filled circle
+    ('o', '□'),    # empty square
+    ('q', '◇'),    # empty diamond
+    ('v', '❖'),    # diamond variant
+    ('F', '☞'),    # right-pointing hand
+    ('ð', '➢'),    # arrow
+    ('Ð', '➢'),    # arrow
+    ('à', '➢'),    # arrow
+    ('è', '➢'),    # arrow (0xE8)
+    ('ê', '➤'),    # triangular arrow
+))
+# Symbol font: character code -> Unicode character.
+SYMBOL_MAPPING = {
+    0xB7: '•',   # bullet
+    0xB4: '×',   # multiplication sign
+    0xB8: '÷',   # division sign
+    0xD7: '⋅',   # dot operator (0xD7 is not the multiplication sign in Symbol)
+    # NOTE(review): in the Adobe Symbol encoding 0xF7 looks like a
+    # parenthesis-extension piece, not a division sign; entry kept as it
+    # was for backward compatibility - confirm before changing.
+    0xF7: '÷',
+    0xA5: '∞',   # infinity
+    0xB1: '±',   # plus-minus
+}
+# === Slide element type definitions ===
+class ElementType(Enum):
+    """Kind of element found on a slide; each member's value is its lower-case name."""
+    TEXT = "text"
+    IMAGE = "image"
+    TABLE = "table"
+    CHART = "chart"
+@dataclass
+class SlideElement:
+    """A single element of a slide together with its placement."""
+    element_type: ElementType
+    content: str
+    position: Tuple[int, int, int, int]  # (left, top, width, height) in EMU
+    shape_id: int
+    @property
+    def sort_key(self) -> Tuple[int, int]:
+        """Ordering key ``(top, left)``: top-to-bottom first, then left-to-right."""
+        top = self.position[1]
+        left = self.position[0]
+        return (top, left)

xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py ADDED Viewed

@@ -0,0 +1,55 @@
+# xgen_doc2chunk/core/processor/ppt_helper/ppt_file_converter.py
+"""
+PPTFileConverter - PPT/PPTX file format converter
+Converts binary PPT/PPTX data to python-pptx Presentation object.
+"""
+from io import BytesIO
+from typing import Any, Optional, BinaryIO
+from xgen_doc2chunk.core.functions.file_converter import BaseFileConverter
+class PPTFileConverter(BaseFileConverter):
+    """
+    File converter that loads presentation data with python-pptx.
+    Turns raw binary data (or an already opened stream) into a
+    Presentation object.
+    """
+    # Leading bytes of a ZIP archive (a PPTX file is a ZIP container)
+    ZIP_MAGIC = b'PK\x03\x04'
+    def convert(
+        self,
+        file_data: bytes,
+        file_stream: Optional[BinaryIO] = None,
+        **kwargs
+    ) -> Any:
+        """
+        Load the presentation and return it as a Presentation object.
+        Args:
+            file_data: Raw binary presentation data; used only when no
+                stream is given.
+            file_stream: Optional binary stream to read from instead; it is
+                rewound to offset 0 before loading.
+            **kwargs: Additional options (not used here).
+        Returns:
+            pptx.Presentation object
+        """
+        from pptx import Presentation
+        if file_stream is None:
+            source = BytesIO(file_data)
+        else:
+            source = file_stream
+        source.seek(0)
+        return Presentation(source)
+    def get_format_name(self) -> str:
+        """Return the human-readable name of the handled format."""
+        return "PPT/PPTX Presentation"
+    def validate(self, file_data: bytes) -> bool:
+        """Return True when the data is at least 4 bytes long and starts with the ZIP magic number."""
+        long_enough = bool(file_data) and len(file_data) >= 4
+        if long_enough:
+            return file_data[:4] == self.ZIP_MAGIC
+        return False

xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl

xgen-doc2chunk 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl