PyPI - xgen-doc2chunk - Versions diffs - 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl - Mend

xgen-doc2chunk 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

xgen_doc2chunk/core/processor/doc_handler.py CHANGED Viewed

@@ -115,8 +115,8 @@ class DOCHandler(BaseHandler):
         """
         Delegate RTF processing to RTFHandler.
-        DOC ?�일???�제로는 RTF ?�식??경우, RTFHandler???�임?�니??
-        RTFHandler.extract_text()??raw bytes�?받으므�?current_file??그�?�??�달?�니??
+        When a DOC file is actually in RTF format, delegate to RTFHandler.
+        RTFHandler.extract_text() receives raw bytes, so pass current_file as is.
         Args:
             rtf_doc: Pre-converted RTFDocument object (unused, for consistency)
@@ -135,7 +135,7 @@ class DOCHandler(BaseHandler):
             chart_processor=self._chart_processor
         )
-        # RTFHandler.extract_text()??current_file?�서 file_data�?직접 ?�어 처리
+        # RTFHandler.extract_text() reads file_data directly from current_file
         return rtf_handler.extract_text(current_file, extract_metadata=extract_metadata)
     def _extract_from_ole_obj(self, ole, current_file: "CurrentFile", extract_metadata: bool) -> str:
@@ -258,7 +258,7 @@ class DOCHandler(BaseHandler):
         return str(value).strip()
     def _extract_ole_images(self, ole: olefile.OleFileIO, processed_images: Set[str]) -> List[str]:
-        """OLE?�서 ?��?지 추출"""
+        """Extract images from OLE container."""
         images = []
         try:
             for entry in ole.listdir():
@@ -493,7 +493,7 @@ class DOCHandler(BaseHandler):
             # Text extraction attempt
             text_parts = []
-            # 1. Table ?�트림에???�스??조각 찾기 ?�도
+            # 1. Try to find text fragments in Table stream
             table_stream_name = None
             if ole.exists('1Table'):
                 table_stream_name = '1Table'
@@ -521,9 +521,9 @@ class DOCHandler(BaseHandler):
             # Find consecutive Unicode characters
             i = 0
             while i < len(data) - 1:
-                # ?�니코드 ?�스???�작??찾기 (printable 문자)
+                # Find start of Unicode text (printable characters)
                 if 0x20 <= data[i] <= 0x7E and data[i+1] == 0x00:
-                    # ?�니코드 문자???�집
+                    # Collect Unicode characters
                     unicode_bytes = []
                     j = i
                     while j < len(data) - 1:

xgen_doc2chunk/core/processor/docx_helper/docx_image.py CHANGED Viewed

@@ -1,13 +1,13 @@
 # xgen_doc2chunk/core/processor/docx_helper/docx_image.py
 """
-DOCX ?��?지 추출 ?�틸리티
+DOCX Image Extraction Utilities
-DOCX 문서?�서 ?��?지�?추출?�고 로컬???�?�합?�다.
-- extract_image_from_drawing: Drawing ?�소?�서 ?��?지 추출
-- process_pict_element: ?�거??VML pict ?�소 처리
+Extracts images from DOCX documents and saves them locally.
+- extract_image_from_drawing: Extract images from Drawing elements
+- process_pict_element: Process legacy VML pict elements
-Note: ???�수?��? DOCXImageProcessor??메서?��? ?�출?�는 wrapper?�니??
-      ?�제 로직?� DOCXImageProcessor???�합?�어 ?�습?�다.
+Note: These functions are wrappers that call DOCXImageProcessor methods.
+      The actual logic is consolidated in DOCXImageProcessor.
 """
 import logging
 from typing import Optional, Set, Tuple, TYPE_CHECKING
@@ -30,25 +30,25 @@ def extract_image_from_drawing(
     image_processor: "ImageProcessor"
 ) -> Tuple[str, Optional[ElementType]]:
     """
-    Drawing?�서 ?��?지�?추출?�니??
+    Extract image from Drawing element.
     Args:
-        graphic_data: graphicData XML ?�소
-        doc: python-docx Document 객체
-        processed_images: 처리???��?지 경로 집합 (중복 방�?)
-        image_processor: ImageProcessor ?�스?�스 (DOCXImageProcessor 권장)
+        graphic_data: graphicData XML element
+        doc: python-docx Document object
+        processed_images: Set of processed image paths (for deduplication)
+        image_processor: ImageProcessor instance (DOCXImageProcessor recommended)
     Returns:
-        (content, element_type) ?�플
+        (content, element_type) tuple
     """
-    # DOCXImageProcessor??경우 ?�합??메서???�용
+    # Use integrated method if DOCXImageProcessor
     if hasattr(image_processor, 'extract_from_drawing'):
         content, is_image = image_processor.extract_from_drawing(
             graphic_data, doc, processed_images
         )
         return (content, ElementType.IMAGE) if is_image else ("", None)
-    # Fallback: 기존 로직 (ImageProcessor 기본 ?�래?�인 경우)
+    # Fallback: Legacy logic (when using base ImageProcessor class)
     from docx.oxml.ns import qn
     from xgen_doc2chunk.core.processor.docx_helper.docx_constants import NAMESPACES
@@ -75,11 +75,11 @@ def extract_image_from_drawing(
                 if image_tag:
                     return f"\n{image_tag}\n", ElementType.IMAGE
-            return "[?��?지]", ElementType.IMAGE
+            return "[Image]", ElementType.IMAGE
         except Exception as e:
             logger.warning(f"Error extracting image from relationship: {e}")
-            return "[?��?지]", ElementType.IMAGE
+            return "[Image]", ElementType.IMAGE
     except Exception as e:
         logger.warning(f"Error extracting image from drawing: {e}")
@@ -93,33 +93,33 @@ def process_pict_element(
     image_processor: "ImageProcessor"
 ) -> str:
     """
-    ?�거??VML pict ?�소�?처리?�니??
+    Process legacy VML pict element.
     Args:
-        pict_elem: pict XML ?�소
-        doc: python-docx Document 객체
-        processed_images: 처리???��?지 경로 집합 (중복 방�?)
-        image_processor: ImageProcessor ?�스?�스 (DOCXImageProcessor 권장)
+        pict_elem: pict XML element
+        doc: python-docx Document object
+        processed_images: Set of processed image paths (for deduplication)
+        image_processor: ImageProcessor instance (DOCXImageProcessor recommended)
     Returns:
-        ?��?지 마크??문자??
+        Image marker string
     """
-    # DOCXImageProcessor??경우 ?�합??메서???�용
+    # Use integrated method if DOCXImageProcessor
     if hasattr(image_processor, 'extract_from_pict'):
         return image_processor.extract_from_pict(pict_elem, doc, processed_images)
-    # Fallback: 기존 로직 (ImageProcessor 기본 ?�래?�인 경우)
+    # Fallback: Legacy logic (when using base ImageProcessor class)
     try:
         ns_v = 'urn:schemas-microsoft-com:vml'
         ns_r = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
         imagedata = pict_elem.find('.//{%s}imagedata' % ns_v)
         if imagedata is None:
-            return "[?��?지]"
+            return "[Image]"
         rId = imagedata.get('{%s}id' % ns_r)
         if not rId:
-            return "[?��?지]"
+            return "[Image]"
         try:
             rel = doc.part.rels.get(rId)
@@ -131,7 +131,7 @@ def process_pict_element(
         except Exception:
             pass
-        return "[?��?지]"
+        return "[Image]"
     except Exception as e:
         logger.warning(f"Error processing pict element: {e}")

xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py CHANGED Viewed

@@ -295,7 +295,7 @@ class HWPImageProcessor(ImageProcessor):
         bindata_index = None
-        # Strategy 1: ?�프??79 (HWP 5.0.3.x+ ?�펙)
+        # Strategy 1: Offset 79 (HWP 5.0.3.x+ spec)
         if len(payload) >= 81:
             test_id = struct.unpack('<H', payload[79:81])[0]
             if 0 < test_id <= bin_data_list_len:
@@ -303,7 +303,7 @@ class HWPImageProcessor(ImageProcessor):
                 logger.debug(f"Found BinData index at offset 79: {bindata_index}")
                 return bindata_index
-        # Strategy 2: ?�프??8 (�?버전)
+        # Strategy 2: Offset 8 (older version)
         if len(payload) >= 10:
             test_id = struct.unpack('<H', payload[8:10])[0]
             if 0 < test_id <= bin_data_list_len:
@@ -311,7 +311,7 @@ class HWPImageProcessor(ImageProcessor):
                 logger.debug(f"Found BinData index at offset 8: {bindata_index}")
                 return bindata_index
-        # Strategy 3: ?�반?�인 ?�프???�캔
+        # Strategy 3: General offset scan
         for offset in [4, 6, 10, 12, 14, 16, 18, 20, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80]:
             if len(payload) >= offset + 2:
                 test_id = struct.unpack('<H', payload[offset:offset+2])[0]
@@ -320,7 +320,7 @@ class HWPImageProcessor(ImageProcessor):
                     logger.debug(f"Found potential BinData index at offset {offset}: {bindata_index}")
                     return bindata_index
-        # Strategy 4: 범위 ??�?번째 non-zero 2바이??�??�캔
+        # Strategy 4: Scan for first non-zero 2-byte value in range
         for i in range(0, min(len(payload) - 1, 100), 2):
             test_id = struct.unpack('<H', payload[i:i+2])[0]
             if 0 < test_id <= bin_data_list_len:

xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py CHANGED Viewed

@@ -212,7 +212,7 @@ def find_and_insert_annotations(doc, tables: List[TableInfo]) -> List[TableInfo]
     Detection patterns:
     1. Rows starting with "Note)" etc. right after table
     2. Subheader rows inside table (e.g., (A), (B))
-    3. Footnote/endnote markers (?? *, ?? ?? etc.)
+    3. Footnote/endnote markers (*, **, †, ‡ etc.)
     Args:
         doc: PyMuPDF document object

{xgen_doc2chunk-0.1.2.dist-info → xgen_doc2chunk-0.1.4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xgen-doc2chunk
-Version: 0.1.2
+Version: 0.1.4
 Summary: Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking
 Project-URL: Homepage, https://github.com/master0419/doc2chunk
 Project-URL: Documentation, https://github.com/master0419/doc2chunk#readme

{xgen_doc2chunk-0.1.2.dist-info → xgen_doc2chunk-0.1.4.dist-info}/RECORD RENAMED Viewed

@@ -25,7 +25,7 @@ xgen_doc2chunk/core/functions/utils.py,sha256=FXIamLR6qxO4QTX12QooGzqD8yQAYMIPBI
 xgen_doc2chunk/core/processor/__init__.py,sha256=RzlwESn8--LpB6N6WZ-i117TnIlh1EDZ_ImDaIjwYvA,3202
 xgen_doc2chunk/core/processor/base_handler.py,sha256=z1L29Nx7JBHv2N1rlcowzGXBgwtTXL8zhI7xWN1_J6o,19546
 xgen_doc2chunk/core/processor/csv_handler.py,sha256=SnAzRWycVivuRV4gjBLiI0HfLYxsGUpBK4Z4UyeyfWQ,4779
-xgen_doc2chunk/core/processor/doc_handler.py,sha256=TCk1pNKEqqJHqV-dXusAJiD2NUAaxwOq_Nn9BN5fVRE,23146
+xgen_doc2chunk/core/processor/doc_handler.py,sha256=MLw71dY0Qt08q0zDwAnEe6uEIC9gr4gEYHW5fQV1w0Q,23087
 xgen_doc2chunk/core/processor/docx_handler.py,sha256=gSuv4gB0sAVGCwsmnZsSK_ADSTh52NTntwKav_KYDc8,14650
 xgen_doc2chunk/core/processor/excel_handler.py,sha256=VmJsTkFWn9bhj_tvHhBEkzQIgm2LGk1DdkiBJyiN05c,13502
 xgen_doc2chunk/core/processor/html_reprocessor.py,sha256=yrufNBPKUCHu6tcWPS9sKHMCB6Vj_t1fJ3EgPHkTaBc,5076
@@ -53,7 +53,7 @@ xgen_doc2chunk/core/processor/docx_helper/__init__.py,sha256=D-JeAVe1PcJOKlq37RX
 xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py,sha256=-7ONbBeOKeyyAEDfVdkJh8DGdR3FYJESlD1doHiw0A8,16224
 xgen_doc2chunk/core/processor/docx_helper/docx_constants.py,sha256=2aERy2K0EpHEbrTWz1pzcBrdk2vJcCyBDXamyhHVlo8,2330
 xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py,sha256=w6QtsXlT59PSGJm7bYaqspCUlLGcnwYUTQxTJ3CkswM,2076
-xgen_doc2chunk/core/processor/docx_helper/docx_image.py,sha256=7L3_BGlI9KQ6A4ZPqawHRFMvzamxovGourMALEACz7o,4688
+xgen_doc2chunk/core/processor/docx_helper/docx_image.py,sha256=LcoL8maN2Mm-JVAbT7v0Ejf77QIU8FPIMdNnyJwNLQg,4671
 xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py,sha256=CFFd0ITAWeSP-IhSOvp1BzVQLmYDkmKN8BTQMWGl66c,13324
 xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py,sha256=-A1mJqyTPe5FGpbTq5m9tJRU675GJ28qxSQlRvihoZk,2196
 xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py,sha256=2rzi27HKgzcA9Uqn_SIyfctZYfdmc4IObtgPxPRZcyU,4262
@@ -79,7 +79,7 @@ xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py,sha256=tqxYKzBjX0ZFTey
 xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py,sha256=GM6PxtdSbvBQV1JFmkuGlNd3Yn0cQg-YU5h9KQaAOK0,2795
 xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py,sha256=RgK_kb0MzEnlPjiiQZdQLg69JZeJhPPahqYzs2E6CvI,6958
 xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py,sha256=FVwuO-6Bng7q0jJcX_pDiEOP4ZUPyngG1DKRD4UTTNk,1716
-xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py,sha256=I4D6AKp0VoTsAItkhfT1zJgWix753Xc0GM9yVRGcsqo,13546
+xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py,sha256=ZdOU5FIGc7T_XhE-NClX7Es78Xtfy_t-IsK0q0WtGzo,13545
 xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py,sha256=JVCsUEanpMYx7FJh3ymhr20w_hpJIU-JuMryZJUUb_s,8838
 xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py,sha256=p7Tvv2ABvtmLTe1sr4I4RU_DijV90LVRvWg2-u85Kz4,2601
 xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py,sha256=UZg3xyOyAbO9g6u_uSVRv2H9z3EMnX1JBPc7oA0WUJE,4858
@@ -113,7 +113,7 @@ xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py,sha256=7ZTeHXAfUqa_W9H
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py,sha256=4kpY8WY9hH-cfjd-Ai6vA4V7I8KwE5hSq8Yt4QXliqM,3009
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py,sha256=qPgtMTMbaTm7_QyU7kKwVDtGAldf_yV4rTyoGVVgkTU,3406
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py,sha256=bwD6MVUuZJVYe3bWDsD6BpK1UZKKPsVyKOG6oHeoumw,47042
-xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py,sha256=KQ0eGnf-uZbooIK_BTr-Q_O6pTQaUhh1OAtHvErT72s,28164
+xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py,sha256=cqoMzSySnapXRkELtmOahpmWyBnc1TquXPz1IqRqDSk,28168
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py,sha256=v6VH-E6clI71-G2zJcT5754VFcPYqb1Qz4l3UcPeDeM,27863
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py,sha256=HXHl0tukTUHFSIWxQUcrYs8lYJ8gZnYV12HtSezWIho,16069
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py,sha256=wAnOCAQ3cTsVgMg0uVavodZHV2DAvrVkugqA0c4MhTY,4754
@@ -155,7 +155,7 @@ xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py,sha256=4kIPb8u2_GSJ435GHJFXiIeQavMv
 xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py,sha256=A4V_AcC0tySYB4q-lNW7Tuhg7aTq0atj_RhMrCftKsM,2972
 xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py,sha256=ZN-3Dq1BehFmwFvxTaYmiEAdFUqujviONNDiR8c5X4A,3194
 xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py,sha256=TeQOdPCPKQW8o4IyUb-4o6v6uTVzKupr4qh9NLjIj24,3672
-xgen_doc2chunk-0.1.2.dist-info/METADATA,sha256=qjsyqG8HTkCZvAeNqxlXPsJMCtPokpVM6UAgIRoNe7M,7623
-xgen_doc2chunk-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-xgen_doc2chunk-0.1.2.dist-info/licenses/LICENSE,sha256=pokMTCMoEcrcnjBAJ8cb7UVADBMGce6GLFbbRfqJVJc,11346
-xgen_doc2chunk-0.1.2.dist-info/RECORD,,
+xgen_doc2chunk-0.1.4.dist-info/METADATA,sha256=IfRE6mjY7MVBaifP6sdZXm66AMZburo5IWqrdNne-wk,7623
+xgen_doc2chunk-0.1.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+xgen_doc2chunk-0.1.4.dist-info/licenses/LICENSE,sha256=pokMTCMoEcrcnjBAJ8cb7UVADBMGce6GLFbbRfqJVJc,11346
+xgen_doc2chunk-0.1.4.dist-info/RECORD,,

{xgen_doc2chunk-0.1.2.dist-info → xgen_doc2chunk-0.1.4.dist-info}/WHEEL RENAMED Viewed

File without changes

{xgen_doc2chunk-0.1.2.dist-info → xgen_doc2chunk-0.1.4.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

xgen-doc2chunk 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

xgen-doc2chunk 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl