xgen-doc2chunk 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/core/processor/doc_handler.py +7 -7
- xgen_doc2chunk/core/processor/docx_helper/docx_image.py +27 -27
- xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py +4 -4
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +1 -1
- {xgen_doc2chunk-0.1.2.dist-info → xgen_doc2chunk-0.1.4.dist-info}/METADATA +1 -1
- {xgen_doc2chunk-0.1.2.dist-info → xgen_doc2chunk-0.1.4.dist-info}/RECORD +8 -8
- {xgen_doc2chunk-0.1.2.dist-info → xgen_doc2chunk-0.1.4.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.2.dist-info → xgen_doc2chunk-0.1.4.dist-info}/licenses/LICENSE +0 -0
|
@@ -115,8 +115,8 @@ class DOCHandler(BaseHandler):
|
|
|
115
115
|
"""
|
|
116
116
|
Delegate RTF processing to RTFHandler.
|
|
117
117
|
|
|
118
|
-
DOC
|
|
119
|
-
RTFHandler.extract_text()
|
|
118
|
+
When a DOC file is actually in RTF format, delegate to RTFHandler.
|
|
119
|
+
RTFHandler.extract_text() receives raw bytes, so pass current_file as is.
|
|
120
120
|
|
|
121
121
|
Args:
|
|
122
122
|
rtf_doc: Pre-converted RTFDocument object (unused, for consistency)
|
|
@@ -135,7 +135,7 @@ class DOCHandler(BaseHandler):
|
|
|
135
135
|
chart_processor=self._chart_processor
|
|
136
136
|
)
|
|
137
137
|
|
|
138
|
-
# RTFHandler.extract_text()
|
|
138
|
+
# RTFHandler.extract_text() reads file_data directly from current_file
|
|
139
139
|
return rtf_handler.extract_text(current_file, extract_metadata=extract_metadata)
|
|
140
140
|
|
|
141
141
|
def _extract_from_ole_obj(self, ole, current_file: "CurrentFile", extract_metadata: bool) -> str:
|
|
@@ -258,7 +258,7 @@ class DOCHandler(BaseHandler):
|
|
|
258
258
|
return str(value).strip()
|
|
259
259
|
|
|
260
260
|
def _extract_ole_images(self, ole: olefile.OleFileIO, processed_images: Set[str]) -> List[str]:
|
|
261
|
-
"""OLE
|
|
261
|
+
"""Extract images from OLE container."""
|
|
262
262
|
images = []
|
|
263
263
|
try:
|
|
264
264
|
for entry in ole.listdir():
|
|
@@ -493,7 +493,7 @@ class DOCHandler(BaseHandler):
|
|
|
493
493
|
# Text extraction attempt
|
|
494
494
|
text_parts = []
|
|
495
495
|
|
|
496
|
-
# 1.
|
|
496
|
+
# 1. Try to find text fragments in Table stream
|
|
497
497
|
table_stream_name = None
|
|
498
498
|
if ole.exists('1Table'):
|
|
499
499
|
table_stream_name = '1Table'
|
|
@@ -521,9 +521,9 @@ class DOCHandler(BaseHandler):
|
|
|
521
521
|
# Find consecutive Unicode characters
|
|
522
522
|
i = 0
|
|
523
523
|
while i < len(data) - 1:
|
|
524
|
-
#
|
|
524
|
+
# Find start of Unicode text (printable characters)
|
|
525
525
|
if 0x20 <= data[i] <= 0x7E and data[i+1] == 0x00:
|
|
526
|
-
#
|
|
526
|
+
# Collect Unicode characters
|
|
527
527
|
unicode_bytes = []
|
|
528
528
|
j = i
|
|
529
529
|
while j < len(data) - 1:
|
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
# xgen_doc2chunk/core/processor/docx_helper/docx_image.py
|
|
2
2
|
"""
|
|
3
|
-
DOCX
|
|
3
|
+
DOCX Image Extraction Utilities
|
|
4
4
|
|
|
5
|
-
DOCX
|
|
6
|
-
- extract_image_from_drawing:
|
|
7
|
-
- process_pict_element:
|
|
5
|
+
Extracts images from DOCX documents and saves them locally.
|
|
6
|
+
- extract_image_from_drawing: Extract images from Drawing elements
|
|
7
|
+
- process_pict_element: Process legacy VML pict elements
|
|
8
8
|
|
|
9
|
-
Note:
|
|
10
|
-
|
|
9
|
+
Note: These functions are wrappers that call DOCXImageProcessor methods.
|
|
10
|
+
The actual logic is consolidated in DOCXImageProcessor.
|
|
11
11
|
"""
|
|
12
12
|
import logging
|
|
13
13
|
from typing import Optional, Set, Tuple, TYPE_CHECKING
|
|
@@ -30,25 +30,25 @@ def extract_image_from_drawing(
|
|
|
30
30
|
image_processor: "ImageProcessor"
|
|
31
31
|
) -> Tuple[str, Optional[ElementType]]:
|
|
32
32
|
"""
|
|
33
|
-
Drawing
|
|
33
|
+
Extract image from Drawing element.
|
|
34
34
|
|
|
35
35
|
Args:
|
|
36
|
-
graphic_data: graphicData XML
|
|
37
|
-
doc: python-docx Document
|
|
38
|
-
processed_images:
|
|
39
|
-
image_processor: ImageProcessor
|
|
36
|
+
graphic_data: graphicData XML element
|
|
37
|
+
doc: python-docx Document object
|
|
38
|
+
processed_images: Set of processed image paths (for deduplication)
|
|
39
|
+
image_processor: ImageProcessor instance (DOCXImageProcessor recommended)
|
|
40
40
|
|
|
41
41
|
Returns:
|
|
42
|
-
(content, element_type)
|
|
42
|
+
(content, element_type) tuple
|
|
43
43
|
"""
|
|
44
|
-
# DOCXImageProcessor
|
|
44
|
+
# Use integrated method if DOCXImageProcessor
|
|
45
45
|
if hasattr(image_processor, 'extract_from_drawing'):
|
|
46
46
|
content, is_image = image_processor.extract_from_drawing(
|
|
47
47
|
graphic_data, doc, processed_images
|
|
48
48
|
)
|
|
49
49
|
return (content, ElementType.IMAGE) if is_image else ("", None)
|
|
50
50
|
|
|
51
|
-
# Fallback:
|
|
51
|
+
# Fallback: Legacy logic (when using base ImageProcessor class)
|
|
52
52
|
from docx.oxml.ns import qn
|
|
53
53
|
from xgen_doc2chunk.core.processor.docx_helper.docx_constants import NAMESPACES
|
|
54
54
|
|
|
@@ -75,11 +75,11 @@ def extract_image_from_drawing(
|
|
|
75
75
|
if image_tag:
|
|
76
76
|
return f"\n{image_tag}\n", ElementType.IMAGE
|
|
77
77
|
|
|
78
|
-
return "[
|
|
78
|
+
return "[Image]", ElementType.IMAGE
|
|
79
79
|
|
|
80
80
|
except Exception as e:
|
|
81
81
|
logger.warning(f"Error extracting image from relationship: {e}")
|
|
82
|
-
return "[
|
|
82
|
+
return "[Image]", ElementType.IMAGE
|
|
83
83
|
|
|
84
84
|
except Exception as e:
|
|
85
85
|
logger.warning(f"Error extracting image from drawing: {e}")
|
|
@@ -93,33 +93,33 @@ def process_pict_element(
|
|
|
93
93
|
image_processor: "ImageProcessor"
|
|
94
94
|
) -> str:
|
|
95
95
|
"""
|
|
96
|
-
|
|
96
|
+
Process legacy VML pict element.
|
|
97
97
|
|
|
98
98
|
Args:
|
|
99
|
-
pict_elem: pict XML
|
|
100
|
-
doc: python-docx Document
|
|
101
|
-
processed_images:
|
|
102
|
-
image_processor: ImageProcessor
|
|
99
|
+
pict_elem: pict XML element
|
|
100
|
+
doc: python-docx Document object
|
|
101
|
+
processed_images: Set of processed image paths (for deduplication)
|
|
102
|
+
image_processor: ImageProcessor instance (DOCXImageProcessor recommended)
|
|
103
103
|
|
|
104
104
|
Returns:
|
|
105
|
-
|
|
105
|
+
Image marker string
|
|
106
106
|
"""
|
|
107
|
-
# DOCXImageProcessor
|
|
107
|
+
# Use integrated method if DOCXImageProcessor
|
|
108
108
|
if hasattr(image_processor, 'extract_from_pict'):
|
|
109
109
|
return image_processor.extract_from_pict(pict_elem, doc, processed_images)
|
|
110
110
|
|
|
111
|
-
# Fallback:
|
|
111
|
+
# Fallback: Legacy logic (when using base ImageProcessor class)
|
|
112
112
|
try:
|
|
113
113
|
ns_v = 'urn:schemas-microsoft-com:vml'
|
|
114
114
|
ns_r = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships'
|
|
115
115
|
|
|
116
116
|
imagedata = pict_elem.find('.//{%s}imagedata' % ns_v)
|
|
117
117
|
if imagedata is None:
|
|
118
|
-
return "[
|
|
118
|
+
return "[Image]"
|
|
119
119
|
|
|
120
120
|
rId = imagedata.get('{%s}id' % ns_r)
|
|
121
121
|
if not rId:
|
|
122
|
-
return "[
|
|
122
|
+
return "[Image]"
|
|
123
123
|
|
|
124
124
|
try:
|
|
125
125
|
rel = doc.part.rels.get(rId)
|
|
@@ -131,7 +131,7 @@ def process_pict_element(
|
|
|
131
131
|
except Exception:
|
|
132
132
|
pass
|
|
133
133
|
|
|
134
|
-
return "[
|
|
134
|
+
return "[Image]"
|
|
135
135
|
|
|
136
136
|
except Exception as e:
|
|
137
137
|
logger.warning(f"Error processing pict element: {e}")
|
|
@@ -295,7 +295,7 @@ class HWPImageProcessor(ImageProcessor):
|
|
|
295
295
|
|
|
296
296
|
bindata_index = None
|
|
297
297
|
|
|
298
|
-
# Strategy 1:
|
|
298
|
+
# Strategy 1: Offset 79 (HWP 5.0.3.x+ spec)
|
|
299
299
|
if len(payload) >= 81:
|
|
300
300
|
test_id = struct.unpack('<H', payload[79:81])[0]
|
|
301
301
|
if 0 < test_id <= bin_data_list_len:
|
|
@@ -303,7 +303,7 @@ class HWPImageProcessor(ImageProcessor):
|
|
|
303
303
|
logger.debug(f"Found BinData index at offset 79: {bindata_index}")
|
|
304
304
|
return bindata_index
|
|
305
305
|
|
|
306
|
-
# Strategy 2:
|
|
306
|
+
# Strategy 2: Offset 8 (older version)
|
|
307
307
|
if len(payload) >= 10:
|
|
308
308
|
test_id = struct.unpack('<H', payload[8:10])[0]
|
|
309
309
|
if 0 < test_id <= bin_data_list_len:
|
|
@@ -311,7 +311,7 @@ class HWPImageProcessor(ImageProcessor):
|
|
|
311
311
|
logger.debug(f"Found BinData index at offset 8: {bindata_index}")
|
|
312
312
|
return bindata_index
|
|
313
313
|
|
|
314
|
-
# Strategy 3:
|
|
314
|
+
# Strategy 3: General offset scan
|
|
315
315
|
for offset in [4, 6, 10, 12, 14, 16, 18, 20, 40, 44, 48, 52, 56, 60, 64, 68, 72, 76, 80]:
|
|
316
316
|
if len(payload) >= offset + 2:
|
|
317
317
|
test_id = struct.unpack('<H', payload[offset:offset+2])[0]
|
|
@@ -320,7 +320,7 @@ class HWPImageProcessor(ImageProcessor):
|
|
|
320
320
|
logger.debug(f"Found potential BinData index at offset {offset}: {bindata_index}")
|
|
321
321
|
return bindata_index
|
|
322
322
|
|
|
323
|
-
# Strategy 4:
|
|
323
|
+
# Strategy 4: Scan for first non-zero 2-byte value in range
|
|
324
324
|
for i in range(0, min(len(payload) - 1, 100), 2):
|
|
325
325
|
test_id = struct.unpack('<H', payload[i:i+2])[0]
|
|
326
326
|
if 0 < test_id <= bin_data_list_len:
|
|
@@ -212,7 +212,7 @@ def find_and_insert_annotations(doc, tables: List[TableInfo]) -> List[TableInfo]
|
|
|
212
212
|
Detection patterns:
|
|
213
213
|
1. Rows starting with "Note)" etc. right after table
|
|
214
214
|
2. Subheader rows inside table (e.g., (A), (B))
|
|
215
|
-
3. Footnote/endnote markers (
|
|
215
|
+
3. Footnote/endnote markers (*, **, †, ‡ etc.)
|
|
216
216
|
|
|
217
217
|
Args:
|
|
218
218
|
doc: PyMuPDF document object
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: xgen-doc2chunk
|
|
3
|
-
Version: 0.1.2
|
|
3
|
+
Version: 0.1.4
|
|
4
4
|
Summary: Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking
|
|
5
5
|
Project-URL: Homepage, https://github.com/master0419/doc2chunk
|
|
6
6
|
Project-URL: Documentation, https://github.com/master0419/doc2chunk#readme
|
|
@@ -25,7 +25,7 @@ xgen_doc2chunk/core/functions/utils.py,sha256=FXIamLR6qxO4QTX12QooGzqD8yQAYMIPBI
|
|
|
25
25
|
xgen_doc2chunk/core/processor/__init__.py,sha256=RzlwESn8--LpB6N6WZ-i117TnIlh1EDZ_ImDaIjwYvA,3202
|
|
26
26
|
xgen_doc2chunk/core/processor/base_handler.py,sha256=z1L29Nx7JBHv2N1rlcowzGXBgwtTXL8zhI7xWN1_J6o,19546
|
|
27
27
|
xgen_doc2chunk/core/processor/csv_handler.py,sha256=SnAzRWycVivuRV4gjBLiI0HfLYxsGUpBK4Z4UyeyfWQ,4779
|
|
28
|
-
xgen_doc2chunk/core/processor/doc_handler.py,sha256=
|
|
28
|
+
xgen_doc2chunk/core/processor/doc_handler.py,sha256=MLw71dY0Qt08q0zDwAnEe6uEIC9gr4gEYHW5fQV1w0Q,23087
|
|
29
29
|
xgen_doc2chunk/core/processor/docx_handler.py,sha256=gSuv4gB0sAVGCwsmnZsSK_ADSTh52NTntwKav_KYDc8,14650
|
|
30
30
|
xgen_doc2chunk/core/processor/excel_handler.py,sha256=VmJsTkFWn9bhj_tvHhBEkzQIgm2LGk1DdkiBJyiN05c,13502
|
|
31
31
|
xgen_doc2chunk/core/processor/html_reprocessor.py,sha256=yrufNBPKUCHu6tcWPS9sKHMCB6Vj_t1fJ3EgPHkTaBc,5076
|
|
@@ -53,7 +53,7 @@ xgen_doc2chunk/core/processor/docx_helper/__init__.py,sha256=D-JeAVe1PcJOKlq37RX
|
|
|
53
53
|
xgen_doc2chunk/core/processor/docx_helper/docx_chart_extractor.py,sha256=-7ONbBeOKeyyAEDfVdkJh8DGdR3FYJESlD1doHiw0A8,16224
|
|
54
54
|
xgen_doc2chunk/core/processor/docx_helper/docx_constants.py,sha256=2aERy2K0EpHEbrTWz1pzcBrdk2vJcCyBDXamyhHVlo8,2330
|
|
55
55
|
xgen_doc2chunk/core/processor/docx_helper/docx_file_converter.py,sha256=w6QtsXlT59PSGJm7bYaqspCUlLGcnwYUTQxTJ3CkswM,2076
|
|
56
|
-
xgen_doc2chunk/core/processor/docx_helper/docx_image.py,sha256=
|
|
56
|
+
xgen_doc2chunk/core/processor/docx_helper/docx_image.py,sha256=LcoL8maN2Mm-JVAbT7v0Ejf77QIU8FPIMdNnyJwNLQg,4671
|
|
57
57
|
xgen_doc2chunk/core/processor/docx_helper/docx_image_processor.py,sha256=CFFd0ITAWeSP-IhSOvp1BzVQLmYDkmKN8BTQMWGl66c,13324
|
|
58
58
|
xgen_doc2chunk/core/processor/docx_helper/docx_metadata.py,sha256=-A1mJqyTPe5FGpbTq5m9tJRU675GJ28qxSQlRvihoZk,2196
|
|
59
59
|
xgen_doc2chunk/core/processor/docx_helper/docx_paragraph.py,sha256=2rzi27HKgzcA9Uqn_SIyfctZYfdmc4IObtgPxPRZcyU,4262
|
|
@@ -79,7 +79,7 @@ xgen_doc2chunk/core/processor/hwp_helper/hwp_constants.py,sha256=tqxYKzBjX0ZFTey
|
|
|
79
79
|
xgen_doc2chunk/core/processor/hwp_helper/hwp_decoder.py,sha256=GM6PxtdSbvBQV1JFmkuGlNd3Yn0cQg-YU5h9KQaAOK0,2795
|
|
80
80
|
xgen_doc2chunk/core/processor/hwp_helper/hwp_docinfo.py,sha256=RgK_kb0MzEnlPjiiQZdQLg69JZeJhPPahqYzs2E6CvI,6958
|
|
81
81
|
xgen_doc2chunk/core/processor/hwp_helper/hwp_file_converter.py,sha256=FVwuO-6Bng7q0jJcX_pDiEOP4ZUPyngG1DKRD4UTTNk,1716
|
|
82
|
-
xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py,sha256=
|
|
82
|
+
xgen_doc2chunk/core/processor/hwp_helper/hwp_image_processor.py,sha256=ZdOU5FIGc7T_XhE-NClX7Es78Xtfy_t-IsK0q0WtGzo,13545
|
|
83
83
|
xgen_doc2chunk/core/processor/hwp_helper/hwp_metadata.py,sha256=JVCsUEanpMYx7FJh3ymhr20w_hpJIU-JuMryZJUUb_s,8838
|
|
84
84
|
xgen_doc2chunk/core/processor/hwp_helper/hwp_preprocessor.py,sha256=p7Tvv2ABvtmLTe1sr4I4RU_DijV90LVRvWg2-u85Kz4,2601
|
|
85
85
|
xgen_doc2chunk/core/processor/hwp_helper/hwp_record.py,sha256=UZg3xyOyAbO9g6u_uSVRv2H9z3EMnX1JBPc7oA0WUJE,4858
|
|
@@ -113,7 +113,7 @@ xgen_doc2chunk/core/processor/pdf_helpers/pdf_metadata.py,sha256=7ZTeHXAfUqa_W9H
|
|
|
113
113
|
xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py,sha256=4kpY8WY9hH-cfjd-Ai6vA4V7I8KwE5hSq8Yt4QXliqM,3009
|
|
114
114
|
xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py,sha256=qPgtMTMbaTm7_QyU7kKwVDtGAldf_yV4rTyoGVVgkTU,3406
|
|
115
115
|
xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py,sha256=bwD6MVUuZJVYe3bWDsD6BpK1UZKKPsVyKOG6oHeoumw,47042
|
|
116
|
-
xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py,sha256=
|
|
116
|
+
xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py,sha256=cqoMzSySnapXRkELtmOahpmWyBnc1TquXPz1IqRqDSk,28168
|
|
117
117
|
xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py,sha256=v6VH-E6clI71-G2zJcT5754VFcPYqb1Qz4l3UcPeDeM,27863
|
|
118
118
|
xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py,sha256=HXHl0tukTUHFSIWxQUcrYs8lYJ8gZnYV12HtSezWIho,16069
|
|
119
119
|
xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py,sha256=wAnOCAQ3cTsVgMg0uVavodZHV2DAvrVkugqA0c4MhTY,4754
|
|
@@ -155,7 +155,7 @@ xgen_doc2chunk/ocr/ocr_engine/bedrock_ocr.py,sha256=4kIPb8u2_GSJ435GHJFXiIeQavMv
|
|
|
155
155
|
xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py,sha256=A4V_AcC0tySYB4q-lNW7Tuhg7aTq0atj_RhMrCftKsM,2972
|
|
156
156
|
xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py,sha256=ZN-3Dq1BehFmwFvxTaYmiEAdFUqujviONNDiR8c5X4A,3194
|
|
157
157
|
xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py,sha256=TeQOdPCPKQW8o4IyUb-4o6v6uTVzKupr4qh9NLjIj24,3672
|
|
158
|
-
xgen_doc2chunk-0.1.2.dist-info/METADATA,sha256=
|
|
159
|
-
xgen_doc2chunk-0.1.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
160
|
-
xgen_doc2chunk-0.1.2.dist-info/licenses/LICENSE,sha256=pokMTCMoEcrcnjBAJ8cb7UVADBMGce6GLFbbRfqJVJc,11346
|
|
161
|
-
xgen_doc2chunk-0.1.2.dist-info/RECORD,,
|
|
158
|
+
xgen_doc2chunk-0.1.4.dist-info/METADATA,sha256=IfRE6mjY7MVBaifP6sdZXm66AMZburo5IWqrdNne-wk,7623
|
|
159
|
+
xgen_doc2chunk-0.1.4.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
160
|
+
xgen_doc2chunk-0.1.4.dist-info/licenses/LICENSE,sha256=pokMTCMoEcrcnjBAJ8cb7UVADBMGce6GLFbbRfqJVJc,11346
|
|
161
|
+
xgen_doc2chunk-0.1.4.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|