xgen-doc2chunk 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -115,8 +115,8 @@ class DOCHandler(BaseHandler):
115
115
  """
116
116
  Delegate RTF processing to RTFHandler.
117
117
 
118
- DOC ?�일???�제로는 RTF ?�식??경우, RTFHandler???�임?�니??
119
- RTFHandler.extract_text()??raw bytes�?받으므�?current_file??그�?�??�달?�니??
118
+ When a DOC file is actually in RTF format, delegate to RTFHandler.
119
+ RTFHandler.extract_text() receives raw bytes, so pass current_file as is.
120
120
 
121
121
  Args:
122
122
  rtf_doc: Pre-converted RTFDocument object (unused, for consistency)
@@ -135,7 +135,7 @@ class DOCHandler(BaseHandler):
135
135
  chart_processor=self._chart_processor
136
136
  )
137
137
 
138
- # RTFHandler.extract_text()??current_file?�서 file_data�?직접 ?�어 처리
138
+ # RTFHandler.extract_text() reads file_data directly from current_file
139
139
  return rtf_handler.extract_text(current_file, extract_metadata=extract_metadata)
140
140
 
141
141
  def _extract_from_ole_obj(self, ole, current_file: "CurrentFile", extract_metadata: bool) -> str:
@@ -258,7 +258,7 @@ class DOCHandler(BaseHandler):
258
258
  return str(value).strip()
259
259
 
260
260
  def _extract_ole_images(self, ole: olefile.OleFileIO, processed_images: Set[str]) -> List[str]:
261
- """OLE?�서 ?��?지 추출"""
261
+ """Extract images from OLE container."""
262
262
  images = []
263
263
  try:
264
264
  for entry in ole.listdir():
@@ -493,7 +493,7 @@ class DOCHandler(BaseHandler):
493
493
  # Text extraction attempt
494
494
  text_parts = []
495
495
 
496
- # 1. Table ?�트림에???�스??조각 찾기 ?�도
496
+ # 1. Try to find text fragments in Table stream
497
497
  table_stream_name = None
498
498
  if ole.exists('1Table'):
499
499
  table_stream_name = '1Table'
@@ -521,9 +521,9 @@ class DOCHandler(BaseHandler):
521
521
  # Find consecutive Unicode characters
522
522
  i = 0
523
523
  while i < len(data) - 1:
524
- # ?�니코드 ?�스???�작??찾기 (printable 문자)
524
+ # Find start of Unicode text (printable characters)
525
525
  if 0x20 <= data[i] <= 0x7E and data[i+1] == 0x00:
526
- # ?�니코드 문자???�집
526
+ # Collect Unicode characters
527
527
  unicode_bytes = []
528
528
  j = i
529
529
  while j < len(data) - 1:
@@ -1,13 +1,13 @@
1
1
  # xgen_doc2chunk/core/processor/docx_helper/docx_image.py
2
2
  """
3
- DOCX ?��?지 추출 ?�틸리티
3
+ DOCX Image Extraction Utilities
4
4
 
5
- DOCX 문서?�서 ?��?지�?추출?�고 로컬???�?�합?�다.
6
- - extract_image_from_drawing: Drawing ?�소?�서 ?��?지 추출
7
- - process_pict_element: ?�거??VML pict ?�소 처리
5
+ Extracts images from DOCX documents and saves them locally.
6
+ - extract_image_from_drawing: Extract images from Drawing elements
7
+ - process_pict_element: Process legacy VML pict elements
8
8
 
9
- Note: ???�수?��? DOCXImageProcessor??메서?��? ?�출?�는 wrapper?�니??
10
- ?�제 로직?� DOCXImageProcessor???�합?�어 ?�습?�다.
9
+ Note: These functions are wrappers that call DOCXImageProcessor methods.
10
+ The actual logic is consolidated in DOCXImageProcessor.
11
11
  """
12
12
  import logging
13
13
  from typing import Optional, Set, Tuple, TYPE_CHECKING
@@ -30,25 +30,25 @@ def extract_image_from_drawing(
30
30
  image_processor: "ImageProcessor"
31
31
  ) -> Tuple[str, Optional[ElementType]]:
32
32
  """
33
- Drawing?�서 ?��?지�?추출?�니??
33
+ Extract image from Drawing element.
34
34
 
35
35
  Args:
36
- graphic_data: graphicData XML ?�소
37
- doc: python-docx Document 객체
38
- processed_images: 처리???��?지 경로 집합 (중복 방�?)
39
- image_processor: ImageProcessor ?�스?�스 (DOCXImageProcessor 권장)
36
+ graphic_data: graphicData XML element
37
+ doc: python-docx Document object
38
+ processed_images: Set of processed image paths (for deduplication)
39
+ image_processor: ImageProcessor instance (DOCXImageProcessor recommended)
40
40
 
41
41
  Returns:
42
- (content, element_type) ?�플
42
+ (content, element_type) tuple
43
43
  """
44
- # DOCXImageProcessor??경우 ?�합??메서???�용
44
+ # Use integrated method if DOCXImageProcessor
45
45
  if hasattr(image_processor, 'extract_from_drawing'):
46
46
  content, is_image = image_processor.extract_from_drawing(
47
47
  graphic_data, doc, processed_images
48
48
  )
49
49
  return (content, ElementType.IMAGE) if is_image else ("", None)
50
50
 
51
- # Fallback: 기존 로직 (ImageProcessor 기본 ?�래?�인 경우)
51
+ # Fallback: Legacy logic (when using base ImageProcessor class)
52
52
  from docx.oxml.ns import qn
53
53
  from xgen_doc2chunk.core.processor.docx_helper.docx_constants import NAMESPACES
54
54
 
@@ -75,11 +75,11 @@ def extract_image_from_drawing(
75
75
  if image_tag:
76
76
  return f"\n{image_tag}\n", ElementType.IMAGE
77
77
 
78
- return "[?��?지]", ElementType.IMAGE
78
+ return "[Image]", ElementType.IMAGE
79
79
 
80
80
  except Exception as e:
81
81
  logger.warning(f"Error extracting image from relationship: {e}")
82
- return "[?��?지]", ElementType.IMAGE
82
+ return "[Image]", ElementType.IMAGE
83
83
 
84
84
  except Exception as e:
85
85
  logger.warning(f"Error extracting image from drawing: {e}")
@@ -93,33 +93,33 @@ def process_pict_element(
93
93
  image_processor: "ImageProcessor"
94
94
  ) -> str:
95
95
  """
96
- ?�거??VML pict ?�소�?처리?�니??
96
+ Process legacy VML pict element.
97
97
 
98
98
  Args:
99
- pict_elem: pict XML ?�소
100
- doc: python-docx Document 객체
101
- processed_images: 처리???��?지 경로 집합 (중복 방�?)
102
- image_processor: ImageProcessor ?�스?�스 (DOCXImageProcessor 권장)
99
+ pict_elem: pict XML element
100
+ doc: python-docx Document object
101
+ processed_images: Set of processed image paths (for deduplication)
102
+ image_processor: ImageProcessor instance (DOCXImageProcessor recommended)
103
103
 
104
104
  Returns:
105
- ?��?지 마크??문자??
105
+ Image marker string
106
106
  """
107
- # DOCXImageProcessor??경우 ?�합??메서???�용
107
+ # Use integrated method if DOCXImageProcessor
108
108
  if hasattr(image_processor, 'extract_from_pict'):
109
109
  return image_processor.extract_from_pict(pict_elem, doc, processed_images)
110
110
 
111
- # Fallback: 기존 로직 (ImageProcessor 기본 ?�래?�인 경우)
111
+ # Fallback: Legacy logic (when using base ImageProcessor class)
112
112
  try:
113
113
  ns_v = 'urn:schemas-microsoft-com:vml'
114
114
  ns_r = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
115
115
 
116
116
  imagedata = pict_elem.find('.//{%s}imagedata' % ns_v)
117
117
  if imagedata is None:
118
- return "[?��?지]"
118
+ return "[Image]"
119
119
 
120
120
  rId = imagedata.get('{%s}id' % ns_r)
121
121
  if not rId:
122
- return "[?��?지]"
122
+ return "[Image]"
123
123
 
124
124
  try:
125
125
  rel = doc.part.rels.get(rId)
@@ -131,7 +131,7 @@ def process_pict_element(
131
131
  except Exception:
132
132
  pass
133
133
 
134
- return "[?��?지]"
134
+ return "[Image]"
135
135
 
136
136
  except Exception as e:
137
137
  logger.warning(f"Error processing pict element: {e}")
@@ -295,7 +295,7 @@ class HWPImageProcessor(ImageProcessor):
295
295
 
296
296
  bindata_index = None
297
297
 
298
- # Strategy 1: ?�프??79 (HWP 5.0.3.x+ ?�펙)
298
+ # Strategy 1: Offset 79 (HWP 5.0.3.x+ spec)
299
299
  if len(payload) >= 81:
300
300
  test_id = struct.unpack('<H', payload[79:81])[0]
301
301
  if 0 < test_id <= bin_data_list_len:
@@ -303,7 +303,7 @@ class HWPImageProcessor(ImageProcessor):
303
303
  logger.debug(f"Found BinData index at offset 79: {bindata_index}")
304
304
  return bindata_index
305
305
 
306
- # Strategy 2: ?�프??8 (�?버전)
306
+ # Strategy 2: Offset 8 (older version)
307
307
  if len(payload) >= 10:
308
308
  test_id = struct.unpack('<H', payload[8:10])[0]
309
309
  if 0 < test_id <= bin_data_list_len:
@@ -311,7 +311,7 @@ class HWPImageProcessor(ImageProcessor):
311
311
  logger.debug(f"Found BinData index at offset 8: {bindata_index}")
312
312
  return bindata_index
313
313
 
314
- # Strategy 3: ?�반?�인 ?�프???�캔
314
+ # Strategy 3: General offset scan
315
315
  for offset in [4, 6, 10, 12, 14, 16, 18, 20, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80]:
316
316
  if len(payload) >= offset + 2:
317
317
  test_id = struct.unpack('<H', payload[offset:offset+2])[0]
@@ -320,7 +320,7 @@ class HWPImageProcessor(ImageProcessor):
320
320
  logger.debug(f"Found potential BinData index at offset {offset}: {bindata_index}")
321
321
  return bindata_index
322
322
 
323
- # Strategy 4: 범위 ??�?번째 non-zero 2바이??�??�캔
323
+ # Strategy 4: Scan for first non-zero 2-byte value in range
324
324
  for i in range(0, min(len(payload) - 1, 100), 2):
325
325
  test_id = struct.unpack('<H', payload[i:i+2])[0]
326
326
  if 0 < test_id <= bin_data_list_len:
@@ -212,7 +212,7 @@ def find_and_insert_annotations(doc, tables: List[TableInfo]) -> List[TableInfo]
212
212
  Detection patterns:
213
213
  1. Rows starting with "Note)" etc. right after table
214
214
  2. Subheader rows inside table (e.g., (A), (B))
215
- 3. Footnote/endnote markers (?? *, ?? ?? etc.)
215
+ 3. Footnote/endnote markers (*, **, †, etc.)
216
216
 
217
217
  Args:
218
218
  doc: PyMuPDF document object
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: xgen-doc2chunk
3
- Version: 0.1.2
3
+ Version: 0.1.4
4
4
  Summary: Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking
5
5
  Project-URL: Homepage, https://github.com/master0419/doc2chunk
6
6
  Project-URL: Documentation, https://github.com/master0419/doc2chunk#readme
@@ -25,7 +25,7 @@ xgen_doc2chunk/core/functions/utils.py,sha256=FXIamLR6qxO4QTX12QooGzqD8yQAYMIPBI
25
25
  xgen_doc2chunk/core/processor/__init__.py,sha256=RzlwESn8--LpB6N6WZ-i117TnIlh1EDZ_ImDaIjwYvA,3202
26
26
  xgen_doc2chunk/core/processor/base_handler.py,sha256=z1L29Nx7JBHv2N1rlcowzGXBgwtTXL8zhI7xWN1_J6o,19546
27
27
  xgen_doc2chunk/core/processor/csv_handler.py,sha256=SnAzRWycVivuRV4gjBLiI0HfLYxsGUpBK4Z4UyeyfWQ,4779
28
- xgen_doc2chunk/core/processor/doc_handler.py,sha256=TCk1pNKEqqJHqV-dXusAJiD2NUAaxwOq_Nn9BN5fVRE,23146
28
+ xgen_doc2chunk/core/processor/doc_handler.py,sha256=MLw71dY0Qt08q0zDwAnEe6uEIC9gr4gEYHW5fQV1w0Q,23087
29
29
  xgen_doc2chunk/core/processor/docx_handler.py,sha256=gSuv4gB0sAVGCwsmnZsSK_ADSTh52NTntwKav_KYDc8,14650
30
30
  xgen_doc2chunk/core/processor/excel_handler.py,sha256=VmJsTkFWn9bhj_tvHhBEkzQIgm2LGk1DdkiBJyiN05c,13502
31
31
  xgen_doc2chunk/core/processor/html_reprocessor.py,sha256=yrufNBPKUCHu6tcWPS9sKHMCB6Vj_t1fJ3EgPHkTaBc,5076
@@ -53,7 +53,7 @@ xgen_doc2chunk/core/processor/docx_helper/__init__.py,sha256=D-JeAVe1PcJOKlq37RX
53
53
  xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py,sha256=-7ONbBeOKeyyAEDfVdkJh8DGdR3FYJESlD1doHiw0A8,16224
54
54
  xgen_doc2chunk/core/processor/docx_helper/docx_constants.py,sha256=2aERy2K0EpHEbrTWz1pzcBrdk2vJcCyBDXamyhHVlo8,2330
55
55
  xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py,sha256=w6QtsXlT59PSGJm7bYaqspCUlLGcnwYUTQxTJ3CkswM,2076
56
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py,sha256=7L3_BGlI9KQ6A4ZPqawHRFMvzamxovGourMALEACz7o,4688
56
+ xgen_doc2chunk/core/processor/docx_helper/docx_image.py,sha256=LcoL8maN2Mm-JVAbT7v0Ejf77QIU8FPIMdNnyJwNLQg,4671
57
57
  xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py,sha256=CFFd0ITAWeSP-IhSOvp1BzVQLmYDkmKN8BTQMWGl66c,13324
58
58
  xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py,sha256=-A1mJqyTPe5FGpbTq5m9tJRU675GJ28qxSQlRvihoZk,2196
59
59
  xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py,sha256=2rzi27HKgzcA9Uqn_SIyfctZYfdmc4IObtgPxPRZcyU,4262
@@ -79,7 +79,7 @@ xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py,sha256=tqxYKzBjX0ZFTey
79
79
  xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py,sha256=GM6PxtdSbvBQV1JFmkuGlNd3Yn0cQg-YU5h9KQaAOK0,2795
80
80
  xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py,sha256=RgK_kb0MzEnlPjiiQZdQLg69JZeJhPPahqYzs2E6CvI,6958
81
81
  xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py,sha256=FVwuO-6Bng7q0jJcX_pDiEOP4ZUPyngG1DKRD4UTTNk,1716
82
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py,sha256=I4D6AKp0VoTsAItkhfT1zJgWix753Xc0GM9yVRGcsqo,13546
82
+ xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py,sha256=ZdOU5FIGc7T_XhE-NClX7Es78Xtfy_t-IsK0q0WtGzo,13545
83
83
  xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py,sha256=JVCsUEanpMYx7FJh3ymhr20w_hpJIU-JuMryZJUUb_s,8838
84
84
  xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py,sha256=p7Tvv2ABvtmLTe1sr4I4RU_DijV90LVRvWg2-u85Kz4,2601
85
85
  xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py,sha256=UZg3xyOyAbO9g6u_uSVRv2H9z3EMnX1JBPc7oA0WUJE,4858
@@ -113,7 +113,7 @@ xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py,sha256=7ZTeHXAfUqa_W9H
113
113
  xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py,sha256=4kpY8WY9hH-cfjd-Ai6vA4V7I8KwE5hSq8Yt4QXliqM,3009
114
114
  xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py,sha256=qPgtMTMbaTm7_QyU7kKwVDtGAldf_yV4rTyoGVVgkTU,3406
115
115
  xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py,sha256=bwD6MVUuZJVYe3bWDsD6BpK1UZKKPsVyKOG6oHeoumw,47042
116
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py,sha256=KQ0eGnf-uZbooIK_BTr-Q_O6pTQaUhh1OAtHvErT72s,28164
116
+ xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py,sha256=cqoMzSySnapXRkELtmOahpmWyBnc1TquXPz1IqRqDSk,28168
117
117
  xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py,sha256=v6VH-E6clI71-G2zJcT5754VFcPYqb1Qz4l3UcPeDeM,27863
118
118
  xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py,sha256=HXHl0tukTUHFSIWxQUcrYs8lYJ8gZnYV12HtSezWIho,16069
119
119
  xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py,sha256=wAnOCAQ3cTsVgMg0uVavodZHV2DAvrVkugqA0c4MhTY,4754
@@ -155,7 +155,7 @@ xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py,sha256=4kIPb8u2_GSJ435GHJFXiIeQavMv
155
155
  xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py,sha256=A4V_AcC0tySYB4q-lNW7Tuhg7aTq0atj_RhMrCftKsM,2972
156
156
  xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py,sha256=ZN-3Dq1BehFmwFvxTaYmiEAdFUqujviONNDiR8c5X4A,3194
157
157
  xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py,sha256=TeQOdPCPKQW8o4IyUb-4o6v6uTVzKupr4qh9NLjIj24,3672
158
- xgen_doc2chunk-0.1.2.dist-info/METADATA,sha256=qjsyqG8HTkCZvAeNqxlXPsJMCtPokpVM6UAgIRoNe7M,7623
159
- xgen_doc2chunk-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
160
- xgen_doc2chunk-0.1.2.dist-info/licenses/LICENSE,sha256=pokMTCMoEcrcnjBAJ8cb7UVADBMGce6GLFbbRfqJVJc,11346
161
- xgen_doc2chunk-0.1.2.dist-info/RECORD,,
158
+ xgen_doc2chunk-0.1.4.dist-info/METADATA,sha256=IfRE6mjY7MVBaifP6sdZXm66AMZburo5IWqrdNne-wk,7623
159
+ xgen_doc2chunk-0.1.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
160
+ xgen_doc2chunk-0.1.4.dist-info/licenses/LICENSE,sha256=pokMTCMoEcrcnjBAJ8cb7UVADBMGce6GLFbbRfqJVJc,11346
161
+ xgen_doc2chunk-0.1.4.dist-info/RECORD,,