xgen-doc2chunk 0.1.5__py3-none-any.whl → 0.1.52__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py +9 -1
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py +5 -5
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py +73 -5
- xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py +549 -10
- {xgen_doc2chunk-0.1.5.dist-info → xgen_doc2chunk-0.1.52.dist-info}/METADATA +1 -1
- {xgen_doc2chunk-0.1.5.dist-info → xgen_doc2chunk-0.1.52.dist-info}/RECORD +8 -8
- {xgen_doc2chunk-0.1.5.dist-info → xgen_doc2chunk-0.1.52.dist-info}/WHEEL +0 -0
- {xgen_doc2chunk-0.1.5.dist-info → xgen_doc2chunk-0.1.52.dist-info}/licenses/LICENSE +0 -0
xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py

@@ -25,6 +25,9 @@ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_utils import (
 )
 from xgen_doc2chunk.core.processor.pdf_helpers.pdf_table_detection import TableDetectionEngine
 from xgen_doc2chunk.core.processor.pdf_helpers.pdf_cell_analysis import CellAnalysisEngine
+from xgen_doc2chunk.core.processor.pdf_helpers.pdf_text_quality_analyzer import (
+    apply_cjk_compat_mapping,
+)
 
 logger = logging.getLogger("document-processor")
 
@@ -873,7 +876,12 @@ def generate_html_from_cells(
             content = ""
             if col_idx < len(row_data):
                 content = row_data[col_idx]
-
+
+            # Apply CJK Compatibility character mapping to fix broken characters
+            # (e.g., 㛳→→, ㏙→(, ㏚→) etc. from Word→PDF conversion)
+            content = str(content).strip() if content else ""
+            content = apply_cjk_compat_mapping(content)
+            content = escape_html(content)
 
             # Get span info (default to 1 if not found)
             spans = span_map.get((row_idx, col_idx), {'rowspan': 1, 'colspan': 1})
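The ordering of the new cell-cleanup pipeline matters: content is coerced and stripped, then remapped, then HTML-escaped, so anything the mapping produces still passes through escaping. A minimal sketch of that order (`escape_html` approximated here with the stdlib `html.escape`; the map excerpt is from the full table added later in this diff):

```python
import html

# Excerpt of CJK_COMPAT_CHAR_MAP (the full table appears later in this diff).
CJK_COMPAT_CHAR_MAP = {'\u33D9': '(', '\u33DA': ')'}

def apply_cjk_compat_mapping(text: str) -> str:
    for src, dst in CJK_COMPAT_CHAR_MAP.items():
        text = text.replace(src, dst)
    return text

def clean_cell(content) -> str:
    # Same order as the diff: coerce/strip -> remap -> escape.
    content = str(content).strip() if content else ""
    content = apply_cjk_compat_mapping(content)
    return html.escape(content)  # escape_html approximated with html.escape

print(clean_cell("㏙주㏚ A&B"))  # -> (주) A&amp;B
```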
xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py

@@ -383,11 +383,11 @@ class TableQualityValidator:
         # if num_rows > 5 and col2_has_paragraphs >= 2:
         #     return False, f"col2_paragraphs({col2_has_paragraphs})"
 
-        # Pattern 3: If first column is short and second is long overall, likely body text not key-value
-        if num_rows > 10:
-            col1_short_ratio = (col1_empty_count + col1_short_count) / num_rows
-            if col1_short_ratio >= 0.8 and col2_long_count >= 5:
-                return False, f"asymmetric_cols(short1={col1_short_ratio:.0%}, long2={col2_long_count})"
+        # # Pattern 3: If first column is short and second is long overall, likely body text not key-value
+        # if num_rows > 10:
+        #     col1_short_ratio = (col1_empty_count + col1_short_count) / num_rows
+        #     if col1_short_ratio >= 0.8 and col2_long_count >= 5:
+        #         return False, f"asymmetric_cols(short1={col1_short_ratio:.0%}, long2={col2_long_count})"
 
         return True, "valid"
 
xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py

@@ -3,6 +3,9 @@
 PDF Text Extraction Module
 
 Provides functions for extracting text blocks from PDF pages.
+Includes support for:
+- Fragmented text reconstruction (Word->PDF conversion issues)
+- CJK Compatibility character mapping (broken character fixes)
 """
 import logging
 from typing import List, Tuple
@@ -17,6 +20,8 @@ from xgen_doc2chunk.core.processor.pdf_helpers.pdf_text_quality_analyzer import
     TextQualityAnalyzer,
     QualityAwareTextExtractor,
     PageOCRFallbackEngine,
+    FragmentedTextReconstructor,
+    apply_cjk_compat_mapping,
 )
 
 logger = logging.getLogger("document-processor")
@@ -53,13 +58,76 @@ def extract_text_blocks(
     analyzer = TextQualityAnalyzer(page, page_num)
     page_analysis = analyzer.analyze_page()
 
-    # If quality is
+    # If quality is low, try text reconstruction first (before OCR)
     if page_analysis.quality_result.needs_ocr:
+        quality_result = page_analysis.quality_result
         logger.info(
-            f"[PDF] Page {page_num + 1}: Low text quality "
-            f"
-            f"PUA={
-            f"
+            f"[PDF] Page {page_num + 1}: Low text quality detected - "
+            f"score={quality_result.quality_score:.2f}, "
+            f"PUA={quality_result.pua_count}, "
+            f"CJK_Compat={quality_result.cjk_compat_count}, "
+            f"fragmented={quality_result.is_fragmented}"
+        )
+
+        # Try reconstruction for fragmented text or CJK Compat issues
+        if quality_result.is_fragmented or quality_result.cjk_compat_count > 0:
+            logger.info(
+                f"[PDF] Page {page_num + 1}: Attempting text reconstruction "
+                f"(excluding {len(table_bboxes)} table regions)"
+            )
+
+            # Exclude table regions from reconstruction to avoid duplication
+            reconstructor = FragmentedTextReconstructor(
+                page, page_num, exclude_bboxes=table_bboxes
+            )
+
+            # Use section-based reconstruction for proper table positioning
+            if table_bboxes:
+                sections = reconstructor.reconstruct_with_sections()
+
+                if sections:
+                    result_elements = []
+                    for section in sections:
+                        # Apply CJK Compatibility character mapping
+                        cleaned_text = apply_cjk_compat_mapping(section['text'])
+
+                        if cleaned_text.strip():
+                            # Create element with proper Y position for sorting
+                            result_elements.append(PageElement(
+                                element_type=ElementType.TEXT,
+                                content=cleaned_text,
+                                bbox=(0, section['y_start'], page.rect.width, section['y_end']),
+                                page_num=page_num
+                            ))
+
+                    if result_elements:
+                        logger.info(
+                            f"[PDF] Page {page_num + 1}: Text reconstruction successful "
+                            f"({len(result_elements)} sections)"
+                        )
+                        return result_elements
+            else:
+                # No tables - use simple reconstruction
+                reconstructed_text = reconstructor.reconstruct()
+
+                if reconstructed_text:
+                    cleaned_text = apply_cjk_compat_mapping(reconstructed_text)
+
+                    logger.info(
+                        f"[PDF] Page {page_num + 1}: Text reconstruction successful "
+                        f"({len(cleaned_text)} chars)"
+                    )
+
+                    return [PageElement(
+                        element_type=ElementType.TEXT,
+                        content=cleaned_text,
+                        bbox=(0, 0, page.rect.width, page.rect.height),
+                        page_num=page_num
+                    )]
+
+        # Fall back to OCR if reconstruction not applicable
+        logger.info(
+            f"[PDF] Page {page_num + 1}: Using OCR fallback"
         )
 
     extractor = QualityAwareTextExtractor(page, page_num)
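In outline, the new `extract_text_blocks` path tries position-based reconstruction before OCR whenever the analyzer flags fragmentation or CJK Compatibility characters. A condensed sketch of the branching (names simplified; the real code also falls through to OCR when sectioned reconstruction yields nothing):

```python
from dataclasses import dataclass

@dataclass
class Quality:  # stand-in for the relevant TextQualityResult fields
    needs_ocr: bool
    is_fragmented: bool
    cjk_compat_count: int

def choose_strategy(q: Quality, table_bboxes: list) -> str:
    """Condensed decision flow mirroring the hunk above."""
    if not q.needs_ocr:
        return "text_layer"                 # normal extraction is fine
    if q.is_fragmented or q.cjk_compat_count > 0:
        if table_bboxes:
            return "reconstruct_sections"   # split text around table regions
        return "reconstruct"                # whole-page reconstruction
    return "ocr"                            # last resort

print(choose_strategy(Quality(True, True, 0), []))               # reconstruct
print(choose_strategy(Quality(True, False, 4), [(0, 1, 2, 3)]))  # reconstruct_sections
```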
xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py

@@ -12,20 +12,23 @@ Characteristics of Broken Text:
 3. Invalid Korean character combinations (only consonants/vowels in sequence)
 4. Meaningless Korean syllable sequences (random combinations, not real words)
 5. Mixture of CJK characters with PUA/control characters
+6. CJK Compatibility characters used instead of normal punctuation
+7. Fragmented text where each character is on a separate line
 
 =============================================================================
 Resolution Strategy:
 =============================================================================
 1. Calculate text quality score (0.0 ~ 1.0)
-2.
-3.
+2. For fragmented text: Reconstruct using character position data
+3. For CJK Compatibility characters: Map to correct characters
+4. Perform OCR fallback only if reconstruction fails
 """
 
 import logging
 import re
 import unicodedata
 from typing import List, Dict, Tuple, Optional, Set
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 
 import fitz
 from PIL import Image
@@ -35,9 +38,40 @@ logger = logging.getLogger(__name__)
 
 
 # ============================================================================
-#
+# CJK Compatibility Character Mapping
 # ============================================================================
 
+# Map CJK Compatibility characters to their intended characters
+# These occur when Word documents are converted to PDF with font issues
+CJK_COMPAT_CHAR_MAP = {
+    # Parentheses
+    '\u33D9': '(',  # ㏙ → (
+    '\u33DA': ')',  # ㏚ → )
+
+    # Brackets (section markers)
+    '\u33DB': '[',  # ㏛ → [ (or could be 【)
+    '\u33DC': ']',  # ㏜ → ] (or could be 】)
+    '\u33DD': '[',  # ㏝ → [ (section start)
+    '\u33DE': ']',  # ㏞ → ] (section end)
+
+    # Arrows and connectors
+    '\u3711': '→',  # 㜑 → arrow
+    '\u36A8': '/',  # 㚨 → / or +
+    '\u36F3': '→',  # 㛳 → arrow (Word→PDF conversion often maps arrow to this)
+    '\u3689': '+',  # 㚉 → + (plus sign, e.g., Vector + Graph)
+
+    # Range indicator
+    '\u33CA': '~',  # ㏊ → ~ (range, e.g., 2~6개월)
+
+    # Quotation marks
+    '\u3431': '"',  # 㐱 → opening quote
+    '\u3432': '"',  # 㐲 → closing quote
+    '\u3433': '"',  # 㐳 → opening quote
+    '\u3434': '"',  # 㐴 → closing quote
+    '\u3443': '"',  # 㑃 → quote
+}
+
+
 class TextQualityConfig:
     """Text quality analysis configuration."""
 
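Since every key in CJK_COMPAT_CHAR_MAP is a single code point, the same mapping could be applied in one pass with `str.translate` instead of a chain of `str.replace` calls, which scales better as the table grows. A sketch of the equivalent approach (not the package's implementation):

```python
# Build a translate table once; str.translate then remaps all code
# points in a single pass over the text.
CJK_COMPAT_CHAR_MAP = {'\u33D9': '(', '\u33DA': ')', '\u33CA': '~'}  # excerpt
_CJK_COMPAT_TABLE = str.maketrans(CJK_COMPAT_CHAR_MAP)

def apply_cjk_compat_mapping_fast(text: str) -> str:
    return text.translate(_CJK_COMPAT_TABLE) if text else text

print(apply_cjk_compat_mapping_fast("기간㏙2㏊6개월㏚"))  # -> 기간(2~6개월)
```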
@@ -55,6 +89,15 @@ class TextQualityConfig:
         (0x100000, 0x10FFFD),  # Supplementary PUA-B
     ]
 
+    # CJK Compatibility ranges (often indicates broken text from Word->PDF conversion)
+    # These are unit symbols that are rarely used in normal text but appear when
+    # character encoding is broken (e.g., parentheses becoming ㏙, ㏚, etc.)
+    CJK_COMPAT_RANGES = [
+        (0x3300, 0x33FF),  # CJK Compatibility (squared Katakana, units)
+        (0x3200, 0x32FF),  # Enclosed CJK Letters and Months
+        (0x3700, 0x37FF),  # CJK Extension A (rarely used Hanja)
+    ]
+
     # Control characters and special characters
     CONTROL_RANGES = [
         (0x0000, 0x001F),  # C0 controls
@@ -76,6 +119,13 @@ class TextQualityConfig:
     WEIGHT_PUA = 0.4  # PUA character ratio weight
     WEIGHT_REPLACEMENT = 0.3  # Replacement character weight
     WEIGHT_VALID_RATIO = 0.3  # Valid character ratio weight
+    WEIGHT_CJK_COMPAT = 0.5  # CJK Compatibility character weight (broken text indicator)
+
+    # Fragmented text detection settings
+    # When each line has only 1-2 characters, it indicates conversion issue
+    FRAGMENTED_TEXT_THRESHOLD = 0.5  # If >50% of lines have <=2 chars, text is fragmented
+    FRAGMENTED_LINE_CHAR_LIMIT = 3  # Lines with <= this many chars are considered fragmented
+    MIN_LINES_FOR_FRAGMENTED_CHECK = 5  # Minimum lines needed to check for fragmentation
 
 
 # ============================================================================
@@ -91,8 +141,14 @@ class TextQualityResult:
     replacement_count: int  # Replacement character count
     valid_chars: int  # Valid character count (Korean, English, digits)
     control_chars: int  # Control character count
-    needs_ocr: bool  # Whether OCR is needed
-    details: Dict  # Detailed information
+    cjk_compat_count: int = 0  # CJK Compatibility character count (broken text indicator)
+    is_fragmented: bool = False  # Whether text is fragmented (char-by-char line breaks)
+    needs_ocr: bool = False  # Whether OCR is needed
+    details: Dict = None  # Detailed information
+
+    def __post_init__(self):
+        if self.details is None:
+            self.details = {}
 
 
 @dataclass
@@ -143,6 +199,11 @@ class TextQualityAnalyzer:
         text_blocks = []
         problem_regions = []
 
+        # Count lines to detect fragmented text pattern
+        # (Word->PDF conversion issue where each char is a separate line)
+        total_lines = 0
+        total_chars = 0
+
         for block in blocks:
             if block.get("type") != 0:  # Text blocks only
                 continue
@@ -151,9 +212,11 @@ class TextQualityAnalyzer:
             block_text = []
 
             for line in block.get("lines", []):
+                total_lines += 1
                 for span in line.get("spans", []):
                     text = span.get("text", "")
                     if text:
+                        total_chars += len(text.strip())
                         block_text.append(text)
                         all_text.append(text)
 
@@ -175,6 +238,37 @@ class TextQualityAnalyzer:
         full_text = " ".join(all_text)
         overall_quality = self.analyze_text(full_text)
 
+        # Detect fragmented text at page level
+        # If average chars per line is very low, text is likely fragmented
+        if total_lines > 0 and total_chars > 0:
+            avg_chars_per_line = total_chars / total_lines
+            # If average is less than 15 chars per line, text is fragmented
+            page_is_fragmented = avg_chars_per_line < 15 and total_lines >= TextQualityConfig.MIN_LINES_FOR_FRAGMENTED_CHECK
+
+            if page_is_fragmented:
+                logger.info(
+                    f"[QualityAnalyzer] Page {self.page_num + 1}: "
+                    f"Detected fragmented text (avg {avg_chars_per_line:.1f} chars/line, {total_lines} lines)"
+                )
+                # Update overall quality to reflect fragmented status
+                overall_quality = TextQualityResult(
+                    quality_score=max(0.0, overall_quality.quality_score - 0.5),
+                    total_chars=overall_quality.total_chars,
+                    pua_count=overall_quality.pua_count,
+                    replacement_count=overall_quality.replacement_count,
+                    valid_chars=overall_quality.valid_chars,
+                    control_chars=overall_quality.control_chars,
+                    cjk_compat_count=overall_quality.cjk_compat_count,
+                    is_fragmented=True,  # Mark as fragmented
+                    needs_ocr=True,  # Trigger reconstruction
+                    details={
+                        **overall_quality.details,
+                        'is_fragmented': True,
+                        'avg_chars_per_line': avg_chars_per_line,
+                        'total_lines': total_lines,
+                    }
+                )
+
         return PageTextAnalysis(
             page_num=self.page_num,
             quality_result=overall_quality,
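For intuition on this page-level trigger: nine one-syllable lines average 1.0 chars per line, far below the hard-coded 15-char cutoff, so the page is flagged fragmented and escalated (the minimum-line constant is the one added to TextQualityConfig above):

```python
# Worked example of the page-level heuristic; the 15-chars/line cutoff
# is hard-coded in the diff, MIN_LINES_FOR_FRAGMENTED_CHECK comes from config.
MIN_LINES_FOR_FRAGMENTED_CHECK = 5

lines = ["현", "재", "시", "장", "에", "대", "한", "이", "해"]  # 9 lines, 1 char each
total_lines = len(lines)
total_chars = sum(len(l) for l in lines)

avg_chars_per_line = total_chars / total_lines  # 1.0
page_is_fragmented = avg_chars_per_line < 15 and total_lines >= MIN_LINES_FOR_FRAGMENTED_CHECK
print(page_is_fragmented)  # True -> quality score docked 0.5, needs_ocr set
```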
@@ -200,6 +294,8 @@ class TextQualityAnalyzer:
             replacement_count=0,
             valid_chars=len(text),
             control_chars=0,
+            cjk_compat_count=0,
+            is_fragmented=False,
             needs_ocr=False,
             details={'reason': 'text_too_short'}
         )
@@ -208,6 +304,7 @@ class TextQualityAnalyzer:
         pua_count = 0
         replacement_count = 0
         control_count = 0
+        cjk_compat_count = 0  # CJK Compatibility character count
         valid_chars = 0  # Korean, English, digits, spaces, basic punctuation
 
         # Character-by-character analysis
@@ -219,6 +316,11 @@ class TextQualityAnalyzer:
                 pua_count += 1
                 continue
 
+            # CJK Compatibility check (broken text indicator)
+            if self._is_cjk_compat(code):
+                cjk_compat_count += 1
+                continue
+
             # Replacement character check
             if code == 0xFFFD:
                 replacement_count += 1
@@ -233,19 +335,27 @@ class TextQualityAnalyzer:
             if self._is_valid_char(char, code):
                 valid_chars += 1
 
+        # Check for fragmented text pattern (char-by-char line breaks)
+        is_fragmented = self._is_fragmented_text(text)
+
         # Calculate quality score
         quality_score = self._calculate_quality_score(
            total_chars=total_chars,
            pua_count=pua_count,
            replacement_count=replacement_count,
-           valid_chars=valid_chars
+           valid_chars=valid_chars,
+           cjk_compat_count=cjk_compat_count,
+           is_fragmented=is_fragmented
         )
 
         # Determine OCR necessity
         pua_ratio = pua_count / total_chars if total_chars > 0 else 0
+        cjk_compat_ratio = cjk_compat_count / total_chars if total_chars > 0 else 0
         needs_ocr = (
             quality_score < TextQualityConfig.QUALITY_THRESHOLD or
-            pua_ratio >= TextQualityConfig.PUA_RATIO_THRESHOLD
+            pua_ratio >= TextQualityConfig.PUA_RATIO_THRESHOLD or
+            cjk_compat_ratio >= 0.05 or  # 5% or more CJK compat chars triggers OCR
+            is_fragmented  # Fragmented text always needs OCR
         )
 
         return TextQualityResult(
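Put concretely, a span can pass the score gate and still be escalated by any single trigger. A hedged illustration of the OR-chain (the QUALITY_THRESHOLD and PUA_RATIO_THRESHOLD values below are assumptions; the diff does not show them):

```python
# Hypothetical threshold values, for illustration only.
QUALITY_THRESHOLD = 0.6
PUA_RATIO_THRESHOLD = 0.10

def needs_ocr(quality_score, pua_ratio, cjk_compat_ratio, is_fragmented):
    return (
        quality_score < QUALITY_THRESHOLD
        or pua_ratio >= PUA_RATIO_THRESHOLD
        or cjk_compat_ratio >= 0.05   # the 5% CJK-compat trigger from the diff
        or is_fragmented              # fragmented text always escalates
    )

# High score, no PUA, but 6% CJK-compat characters -> still escalated.
print(needs_ocr(0.9, 0.0, 0.06, False))  # True
```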
@@ -255,11 +365,15 @@ class TextQualityAnalyzer:
             replacement_count=replacement_count,
             valid_chars=valid_chars,
             control_chars=control_count,
+            cjk_compat_count=cjk_compat_count,
+            is_fragmented=is_fragmented,
             needs_ocr=needs_ocr,
             details={
                 'pua_ratio': pua_count / total_chars if total_chars > 0 else 0,
                 'replacement_ratio': replacement_count / total_chars if total_chars > 0 else 0,
                 'valid_ratio': valid_chars / total_chars if total_chars > 0 else 0,
+                'cjk_compat_ratio': cjk_compat_count / total_chars if total_chars > 0 else 0,
+                'is_fragmented': is_fragmented,
             }
         )
 
@@ -270,6 +384,57 @@ class TextQualityAnalyzer:
                 return True
         return False
 
+    def _is_cjk_compat(self, code: int) -> bool:
+        """
+        Check if character is in CJK Compatibility range.
+
+        These characters often indicate broken text from Word->PDF conversion
+        where parentheses, brackets, and other symbols are incorrectly mapped
+        to CJK Compatibility characters (e.g., U+3319 for '(', U+331A for ')').
+        """
+        for start, end in TextQualityConfig.CJK_COMPAT_RANGES:
+            if start <= code <= end:
+                return True
+        return False
+
+    def _is_fragmented_text(self, text: str) -> bool:
+        """
+        Detect fragmented text pattern where each line has only 1-2 characters.
+
+        This pattern occurs when Word documents with special layouts
+        (text boxes, vertical text, etc.) are converted to PDF,
+        resulting in characters being stored as separate lines.
+
+        Example of fragmented text:
+            '현\n재\n시\n장\n에\n대\n한\n이\n해'
+            Should be: '현재 시장에 대한 이해'
+
+        Args:
+            text: Text to analyze
+
+        Returns:
+            True if text appears to be fragmented
+        """
+        lines = text.split('\n')
+
+        # Need minimum number of lines to detect pattern
+        if len(lines) < TextQualityConfig.MIN_LINES_FOR_FRAGMENTED_CHECK:
+            return False
+
+        # Count lines with few characters (excluding empty lines)
+        non_empty_lines = [line for line in lines if line.strip()]
+        if not non_empty_lines:
+            return False
+
+        short_line_count = sum(
+            1 for line in non_empty_lines
+            if len(line.strip()) <= TextQualityConfig.FRAGMENTED_LINE_CHAR_LIMIT
+        )
+
+        fragmented_ratio = short_line_count / len(non_empty_lines)
+
+        return fragmented_ratio >= TextQualityConfig.FRAGMENTED_TEXT_THRESHOLD
+
     def _is_control(self, code: int) -> bool:
         """Check if character is a control character."""
         for start, end in TextQualityConfig.CONTROL_RANGES:
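The heuristic is easy to exercise in isolation. The same logic as `_is_fragmented_text`, with the config constants from this diff inlined so it runs standalone:

```python
FRAGMENTED_TEXT_THRESHOLD = 0.5
FRAGMENTED_LINE_CHAR_LIMIT = 3
MIN_LINES_FOR_FRAGMENTED_CHECK = 5

def is_fragmented_text(text: str) -> bool:
    lines = text.split('\n')
    if len(lines) < MIN_LINES_FOR_FRAGMENTED_CHECK:
        return False
    non_empty = [l for l in lines if l.strip()]
    if not non_empty:
        return False
    short = sum(1 for l in non_empty if len(l.strip()) <= FRAGMENTED_LINE_CHAR_LIMIT)
    return short / len(non_empty) >= FRAGMENTED_TEXT_THRESHOLD

print(is_fragmented_text('현\n재\n시\n장\n에\n대\n한\n이\n해'))           # True
print(is_fragmented_text('현재 시장에 대한 이해\n두 번째 문단입니다.'))  # False (too few lines)
```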
@@ -318,7 +483,9 @@ class TextQualityAnalyzer:
         total_chars: int,
         pua_count: int,
         replacement_count: int,
-        valid_chars: int
+        valid_chars: int,
+        cjk_compat_count: int = 0,
+        is_fragmented: bool = False
     ) -> float:
         """Calculate quality score (0.0 ~ 1.0)."""
         if total_chars == 0:
@@ -328,6 +495,7 @@ class TextQualityAnalyzer:
         pua_ratio = pua_count / total_chars
         replacement_ratio = replacement_count / total_chars
         valid_ratio = valid_chars / total_chars
+        cjk_compat_ratio = cjk_compat_count / total_chars
 
         # Calculate weighted score
         # Score decreases with more PUA chars, more replacement chars, lower valid ratio
@@ -339,6 +507,13 @@ class TextQualityAnalyzer:
         # Replacement character penalty
         score -= replacement_ratio * TextQualityConfig.WEIGHT_REPLACEMENT * 3
 
+        # CJK Compatibility character penalty (broken text indicator)
+        score -= cjk_compat_ratio * TextQualityConfig.WEIGHT_CJK_COMPAT * 3
+
+        # Fragmented text penalty (severe quality issue)
+        if is_fragmented:
+            score -= 0.5  # Major penalty for fragmented text
+
         # Valid character ratio adjustment
         score = score * (0.5 + valid_ratio * 0.5)
 
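A worked example of the new penalty: 100 chars, 10 of them CJK Compatibility, 90 valid, not fragmented. Assuming the score starts at 1.0 before penalties (the base-score line is not shown in this diff):

```python
# Assumed starting point; the diff shows only the penalty lines,
# not how `score` is initialized.
WEIGHT_CJK_COMPAT = 0.5

total_chars, cjk_compat_count, valid_chars = 100, 10, 90
cjk_compat_ratio = cjk_compat_count / total_chars   # 0.10
valid_ratio = valid_chars / total_chars             # 0.90

score = 1.0
score -= cjk_compat_ratio * WEIGHT_CJK_COMPAT * 3   # 1.0 - 0.15 = 0.85
score *= 0.5 + valid_ratio * 0.5                    # 0.85 * 0.95 = 0.8075
print(round(score, 4))  # 0.8075
```

Note that 0.8075 would still clear a 0.6-style quality threshold, which is why the separate 5% `cjk_compat_ratio` trigger in `needs_ocr` matters.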
@@ -592,7 +767,26 @@ class QualityAwareTextExtractor:
             text = self.page.get_text("text")
             return text, analysis
 
-        # 3.
+        # 3. Try text reconstruction first (before OCR)
+        # This is more reliable than OCR for fragmented text from Word->PDF conversion
+        if analysis.quality_result.is_fragmented or analysis.quality_result.cjk_compat_count > 0:
+            logger.info(
+                f"[QualityAware] Page {self.page_num + 1}: "
+                f"Attempting text reconstruction "
+                f"(fragmented={analysis.quality_result.is_fragmented}, "
+                f"cjk_compat={analysis.quality_result.cjk_compat_count})"
+            )
+
+            reconstructor = FragmentedTextReconstructor(self.page, self.page_num)
+            reconstructed_text = reconstructor.reconstruct()
+
+            if reconstructed_text:
+                # Apply CJK Compatibility character mapping
+                cleaned_text = apply_cjk_compat_mapping(reconstructed_text)
+                analysis.ocr_text = f"[Reconstructed] {len(cleaned_text)} chars"
+                return cleaned_text, analysis
+
+        # 4. OCR fallback if reconstruction fails
         logger.info(
             f"[QualityAware] Page {self.page_num + 1}: "
             f"Quality too low ({analysis.quality_result.quality_score:.2f}), "
@@ -641,6 +835,348 @@ class QualityAwareTextExtractor:
         return "\n".join(merged_parts)
 
 
+# ============================================================================
+# Fragmented Text Reconstructor
+# ============================================================================
+
+class FragmentedTextReconstructor:
+    """
+    Reconstructs fragmented text from PDF pages.
+
+    When Word documents with special layouts (text boxes, vertical text, etc.)
+    are converted to PDF, characters may be stored as separate lines.
+    This class reconstructs the text by analyzing character positions.
+
+    Example:
+        Input: '현\\n재\\n시\\n장\\n에\\n대\\n한\\n이\\n해'
+        Output: '현재 시장에 대한 이해'
+    """
+
+    def __init__(self, page, page_num: int, y_tolerance: float = 3.0,
+                 exclude_bboxes: List[Tuple[float, float, float, float]] = None):
+        """
+        Args:
+            page: PyMuPDF page object
+            page_num: Page number (0-indexed)
+            y_tolerance: Y coordinate tolerance for same-line detection
+            exclude_bboxes: List of bounding boxes to exclude (e.g., table regions)
+        """
+        self.page = page
+        self.page_num = page_num
+        self.y_tolerance = y_tolerance
+        self.exclude_bboxes = exclude_bboxes or []
+
+    def reconstruct(self) -> str:
+        """
+        Reconstruct fragmented text using character position data.
+
+        Returns:
+            Reconstructed text with proper line breaks
+        """
+        try:
+            # Extract character-level position data
+            raw_dict = self.page.get_text("rawdict")
+            all_chars = self._extract_chars(raw_dict)
+
+            if not all_chars:
+                logger.warning(f"[Reconstruct] Page {self.page_num + 1}: No characters found")
+                return ""
+
+            # Group characters by Y coordinate (same line)
+            lines_by_y = self._group_by_y(all_chars)
+
+            # Sort each line by X coordinate and build text
+            reconstructed_lines = self._build_lines(lines_by_y)
+
+            result = "\n".join(reconstructed_lines)
+
+            logger.info(
+                f"[Reconstruct] Page {self.page_num + 1}: "
+                f"Reconstructed {len(all_chars)} chars into {len(reconstructed_lines)} lines"
+            )
+
+            return result
+
+        except Exception as e:
+            logger.error(f"[Reconstruct] Page {self.page_num + 1} failed: {e}")
+            return ""
+
+    def reconstruct_with_sections(self) -> List[Dict]:
+        """
+        Reconstruct fragmented text, split into sections by table positions.
+
+        This method returns multiple text sections with their Y-coordinate ranges,
+        allowing proper positioning relative to tables.
+
+        Returns:
+            List of dicts: [{'text': str, 'y_start': float, 'y_end': float}, ...]
+        """
+        try:
+            raw_dict = self.page.get_text("rawdict")
+            all_chars = self._extract_chars(raw_dict)
+
+            if not all_chars:
+                logger.warning(f"[Reconstruct] Page {self.page_num + 1}: No characters found")
+                return []
+
+            # Group characters by Y coordinate
+            lines_by_y = self._group_by_y(all_chars)
+
+            if not lines_by_y:
+                return []
+
+            # Get sorted Y positions of tables (exclusion regions)
+            table_y_ranges = []
+            for bbox in self.exclude_bboxes:
+                table_y_ranges.append((bbox[1], bbox[3]))  # (y_start, y_end)
+            table_y_ranges.sort(key=lambda x: x[0])
+
+            if not table_y_ranges:
+                # No tables - return single section
+                section_text = self._build_section_text(list(lines_by_y.keys()), lines_by_y)
+                if section_text.strip():
+                    sorted_ys = sorted(lines_by_y.keys())
+                    return [{
+                        'text': section_text,
+                        'y_start': sorted_ys[0],
+                        'y_end': sorted_ys[-1]
+                    }]
+                return []
+
+            # Split lines into sections based on table positions
+            # Key insight: when we skip from a Y before table to a Y after table,
+            # we need to split the section
+            sections = []
+            current_section_lines = []
+            current_y_start = None
+            current_y_end = None
+
+            sorted_ys = sorted(lines_by_y.keys())
+
+            for y in sorted_ys:
+                # Check if we're jumping over a table
+                should_split = False
+                if current_y_end is not None:
+                    for table_y_start, table_y_end in table_y_ranges:
+                        # If previous line was before table start AND current line is after table end
+                        # (meaning we jumped over the table)
+                        if current_y_end < table_y_start and y > table_y_end:
+                            should_split = True
+                            break
+
+                if should_split and current_section_lines:
+                    # Save current section (text BEFORE the table)
+                    section_text = self._build_section_text(current_section_lines, lines_by_y)
+                    if section_text.strip():
+                        sections.append({
+                            'text': section_text,
+                            'y_start': current_y_start,
+                            'y_end': current_y_end
+                        })
+                    current_section_lines = []
+                    current_y_start = None
+
+                # Add line to current section
+                current_section_lines.append(y)
+                if current_y_start is None:
+                    current_y_start = y
+                current_y_end = y
+
+            # Don't forget the last section (text AFTER the last table or all text if no split)
+            if current_section_lines:
+                section_text = self._build_section_text(current_section_lines, lines_by_y)
+                if section_text.strip():
+                    sections.append({
+                        'text': section_text,
+                        'y_start': current_y_start,
+                        'y_end': current_y_end
+                    })
+
+            logger.info(
+                f"[Reconstruct] Page {self.page_num + 1}: "
+                f"Split into {len(sections)} sections around {len(table_y_ranges)} tables"
+            )
+
+            return sections
+
+        except Exception as e:
+            logger.error(f"[Reconstruct] Page {self.page_num + 1} sections failed: {e}")
+            return []
+
+    def _build_section_text(self, y_positions: List[float], lines_by_y: Dict) -> str:
+        """Build text from a list of Y positions."""
+        lines = []
+        for y in sorted(y_positions):
+            chars = lines_by_y.get(y, [])
+            chars_sorted = sorted(chars, key=lambda c: c['bbox'][0])
+
+            if not chars_sorted:
+                continue
+
+            line_text = ""
+            prev_x_end = None
+
+            for char_info in chars_sorted:
+                x_start = char_info['bbox'][0]
+                char = char_info['c']
+
+                if prev_x_end is not None:
+                    gap = x_start - prev_x_end
+                    avg_char_width = char_info['size'] * 0.5
+                    if gap > avg_char_width * 0.5:
+                        line_text += " "
+
+                line_text += char
+                prev_x_end = char_info['bbox'][2]
+
+            if line_text.strip():
+                lines.append(line_text)
+
+        return "\n".join(lines)
+
+    def _extract_chars(self, raw_dict: Dict) -> List[Dict]:
+        """Extract all characters with position info from rawdict.
+
+        Characters inside exclude_bboxes (e.g., table regions) are filtered out.
+        """
+        all_chars = []
+
+        for block in raw_dict.get('blocks', []):
+            if block.get('type') != 0:  # Text blocks only
+                continue
+
+            for line in block.get('lines', []):
+                for span in line.get('spans', []):
+                    font = span.get('font', '')
+                    size = span.get('size', 0)
+
+                    for char in span.get('chars', []):
+                        char_bbox = char.get('bbox', [0, 0, 0, 0])
+
+                        # Skip characters inside excluded regions (e.g., tables)
+                        if self._is_inside_excluded_bbox(char_bbox):
+                            continue
+
+                        char_info = {
+                            'c': char.get('c', ''),
+                            'bbox': char_bbox,
+                            'origin': char.get('origin', [0, 0]),
+                            'font': font,
+                            'size': size,
+                        }
+                        all_chars.append(char_info)
+
+        return all_chars
+
+    def _is_inside_excluded_bbox(self, char_bbox: List[float]) -> bool:
+        """Check if character is inside any excluded bbox.
+
+        Args:
+            char_bbox: Character bounding box [x0, y0, x1, y1]
+
+        Returns:
+            True if character center is inside any excluded region
+        """
+        if not self.exclude_bboxes:
+            return False
+
+        # Use character center point for check
+        char_center_x = (char_bbox[0] + char_bbox[2]) / 2
+        char_center_y = (char_bbox[1] + char_bbox[3]) / 2
+
+        for bbox in self.exclude_bboxes:
+            # bbox = (x0, y0, x1, y1)
+            if (bbox[0] <= char_center_x <= bbox[2] and
+                    bbox[1] <= char_center_y <= bbox[3]):
+                return True
+
+        return False
+
+    def _group_by_y(self, chars: List[Dict]) -> Dict[float, List[Dict]]:
+        """Group characters by Y coordinate with tolerance."""
+        lines_by_y = {}
+
+        for char_info in chars:
+            # Use origin Y if available, otherwise use bbox Y
+            y = char_info['origin'][1] if char_info['origin'] else char_info['bbox'][1]
+
+            # Find existing Y group within tolerance
+            found_y = None
+            for existing_y in lines_by_y.keys():
+                if abs(existing_y - y) <= self.y_tolerance:
+                    found_y = existing_y
+                    break
+
+            if found_y is None:
+                found_y = y
+                lines_by_y[found_y] = []
+
+            lines_by_y[found_y].append(char_info)
+
+        return lines_by_y
+
+    def _build_lines(self, lines_by_y: Dict[float, List[Dict]]) -> List[str]:
+        """Build text lines from character groups."""
+        reconstructed_lines = []
+
+        for y in sorted(lines_by_y.keys()):
+            chars = lines_by_y[y]
+            chars_sorted = sorted(chars, key=lambda c: c['bbox'][0])
+
+            if not chars_sorted:
+                continue
+
+            # Build line text with appropriate spacing
+            line_text = ""
+            prev_x_end = None
+
+            for char_info in chars_sorted:
+                x_start = char_info['bbox'][0]
+                char = char_info['c']
+
+                if prev_x_end is not None:
+                    gap = x_start - prev_x_end
+                    # Add space if gap is significant
+                    avg_char_width = char_info['size'] * 0.5
+                    if gap > avg_char_width * 0.5:
+                        line_text += " "
+
+                line_text += char
+                prev_x_end = char_info['bbox'][2]
+
+            if line_text.strip():
+                reconstructed_lines.append(line_text)
+
+        return reconstructed_lines
+
+
+# ============================================================================
+# CJK Compatibility Character Mapping Function
+# ============================================================================
+
+def apply_cjk_compat_mapping(text: str) -> str:
+    """
+    Replace CJK Compatibility characters with their intended characters.
+
+    These characters appear when Word documents are converted to PDF
+    and font encoding is not properly preserved.
+
+    Args:
+        text: Text containing CJK Compatibility characters
+
+    Returns:
+        Text with characters replaced
+    """
+    if not text:
+        return text
+
+    result = text
+    for cjk_char, replacement in CJK_COMPAT_CHAR_MAP.items():
+        result = result.replace(cjk_char, replacement)
+
+    return result
+
+
 # ============================================================================
 # Export
 # ============================================================================
@@ -652,4 +1188,7 @@ __all__ = [
     'TextQualityAnalyzer',
     'PageOCRFallbackEngine',
     'QualityAwareTextExtractor',
+    'FragmentedTextReconstructor',
+    'apply_cjk_compat_mapping',
+    'CJK_COMPAT_CHAR_MAP',
 ]
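End to end, the newly exported API can be driven directly with PyMuPDF. A minimal sketch, assuming a local `sample.pdf` (hypothetical file) whose first page has a fragmented text layer:

```python
import fitz  # PyMuPDF

from xgen_doc2chunk.core.processor.pdf_helpers.pdf_text_quality_analyzer import (
    FragmentedTextReconstructor,
    apply_cjk_compat_mapping,
)

doc = fitz.open("sample.pdf")  # hypothetical input file
page = doc[0]

# Table bboxes would normally come from TableDetectionEngine; empty here.
reconstructor = FragmentedTextReconstructor(page, page_num=0, exclude_bboxes=[])
text = reconstructor.reconstruct()     # regroup chars by Y, then sort by X
print(apply_cjk_compat_mapping(text))  # then repair ㏙/㏚-style punctuation
```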
{xgen_doc2chunk-0.1.5.dist-info → xgen_doc2chunk-0.1.52.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: xgen-doc2chunk
-Version: 0.1.5
+Version: 0.1.52
 Summary: Convert raw documents into AI-understandable context with intelligent text extraction, table detection, and semantic chunking
 Project-URL: Homepage, https://github.com/master0419/doc2chunk
 Project-URL: Documentation, https://github.com/master0419/doc2chunk#readme
{xgen_doc2chunk-0.1.5.dist-info → xgen_doc2chunk-0.1.52.dist-info}/RECORD

@@ -113,11 +113,11 @@
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_page_analyzer.py,sha256=4kpY8WY9hH-cfjd-Ai6vA4V7I8KwE5hSq8Yt4QXliqM,3009
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_preprocessor.py,sha256=qPgtMTMbaTm7_QyU7kKwVDtGAldf_yV4rTyoGVVgkTU,3406
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_detection.py,sha256=bwD6MVUuZJVYe3bWDsD6BpK1UZKKPsVyKOG6oHeoumw,47042
-xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py,sha256=
+xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_processor.py,sha256=H9bw3SybQJubvtjTqRrJNFviLFc2OMtWDv2HNTETxf0,28544
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_quality_analyzer.py,sha256=v6VH-E6clI71-G2zJcT5754VFcPYqb1Qz4l3UcPeDeM,27863
-xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py,sha256=
-xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py,sha256=
-xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py,sha256=
+xgen_doc2chunk/core/processor/pdf_helpers/pdf_table_validator.py,sha256=rI5QAdqqJfiITZxu4bAf50pD7aIjVlhkYFsc2pt4i8c,16085
+xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_extractor.py,sha256=go259muoxeIxpN1TEiPNdwVkdVb1_YX8BeGO7HS0-jE,8177
+xgen_doc2chunk/core/processor/pdf_helpers/pdf_text_quality_analyzer.py,sha256=_4IoDk15yIMvilcDlSxqiUlNLA9xUV1k69UmlzBq5aI,44641
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_utils.py,sha256=W72HOARz7LjSzwzFTLo4-XTDQWvwBTGlqdovFyPBU7M,4724
 xgen_doc2chunk/core/processor/pdf_helpers/pdf_vector_text_ocr.py,sha256=KWkaj7LT5ih5Nkb2EDggA02JuHIsIy3Sbm7pVIhxWuE,11736
 xgen_doc2chunk/core/processor/pdf_helpers/types.py,sha256=IXV493hkpPa67DPZfH319m2rh6sIgL0R4nOd6pcd-to,9030
@@ -155,7 +155,7 @@
 xgen_doc2chunk/ocr/ocr_engine/gemini_ocr.py,sha256=A4V_AcC0tySYB4q-lNW7Tuhg7aTq0atj_RhMrCftKsM,2972
 xgen_doc2chunk/ocr/ocr_engine/openai_ocr.py,sha256=ZN-3Dq1BehFmwFvxTaYmiEAdFUqujviONNDiR8c5X4A,3194
 xgen_doc2chunk/ocr/ocr_engine/vllm_ocr.py,sha256=TeQOdPCPKQW8o4IyUb-4o6v6uTVzKupr4qh9NLjIj24,3672
-xgen_doc2chunk-0.1.5.dist-info/METADATA,sha256=
-xgen_doc2chunk-0.1.5.dist-info/WHEEL,sha256=
-xgen_doc2chunk-0.1.5.dist-info/licenses/LICENSE,sha256=
-xgen_doc2chunk-0.1.5.dist-info/RECORD,,
+xgen_doc2chunk-0.1.52.dist-info/METADATA,sha256=M63N__jN6H7F3XFKtOM-Um0-TG0uTsknck9YnAZTQOk,7624
+xgen_doc2chunk-0.1.52.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+xgen_doc2chunk-0.1.52.dist-info/licenses/LICENSE,sha256=pokMTCMoEcrcnjBAJ8cb7UVADBMGce6GLFbbRfqJVJc,11346
+xgen_doc2chunk-0.1.52.dist-info/RECORD,,
{xgen_doc2chunk-0.1.5.dist-info → xgen_doc2chunk-0.1.52.dist-info}/WHEEL
File without changes

{xgen_doc2chunk-0.1.5.dist-info → xgen_doc2chunk-0.1.52.dist-info}/licenses/LICENSE
File without changes