kreuzberg 3.7.0__py3-none-any.whl → 3.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors/_base.py +40 -0
- kreuzberg/_extractors/_email.py +149 -0
- kreuzberg/_extractors/_html.py +15 -3
- kreuzberg/_extractors/_image.py +17 -18
- kreuzberg/_extractors/_pdf.py +68 -14
- kreuzberg/_extractors/_presentation.py +62 -10
- kreuzberg/_extractors/_spread_sheet.py +179 -4
- kreuzberg/_extractors/_structured.py +148 -0
- kreuzberg/_gmft.py +2 -2
- kreuzberg/_mime_types.py +27 -1
- kreuzberg/_multiprocessing/__init__.py +2 -3
- kreuzberg/_ocr/__init__.py +30 -0
- kreuzberg/{_multiprocessing/tesseract_pool.py → _ocr/_pool.py} +3 -5
- kreuzberg/_ocr/_sync.py +566 -0
- kreuzberg/_ocr/_tesseract.py +6 -2
- kreuzberg/_registry.py +4 -0
- kreuzberg/_types.py +131 -0
- kreuzberg/_utils/_cache.py +17 -2
- kreuzberg/_utils/_process_pool.py +178 -1
- kreuzberg/_utils/_quality.py +237 -0
- kreuzberg/_utils/_serialization.py +4 -2
- kreuzberg/_utils/_string.py +153 -10
- kreuzberg/_utils/_sync.py +5 -2
- kreuzberg/_utils/_table.py +261 -0
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/METADATA +66 -50
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/RECORD +29 -28
- kreuzberg/_multiprocessing/process_manager.py +0 -189
- kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
- kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
- kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_types.py
CHANGED
@@ -100,6 +100,96 @@ class Metadata(TypedDict, total=False):
|
|
100
100
|
width: NotRequired[int]
|
101
101
|
"""Width of the document page/slide/image, if applicable."""
|
102
102
|
|
103
|
+
# Email-specific fields
|
104
|
+
email_from: NotRequired[str]
|
105
|
+
"""Email sender (from field)."""
|
106
|
+
email_to: NotRequired[str]
|
107
|
+
"""Email recipient (to field)."""
|
108
|
+
email_cc: NotRequired[str]
|
109
|
+
"""Email carbon copy recipients."""
|
110
|
+
email_bcc: NotRequired[str]
|
111
|
+
"""Email blind carbon copy recipients."""
|
112
|
+
date: NotRequired[str]
|
113
|
+
"""Email date or document date."""
|
114
|
+
attachments: NotRequired[list[str]]
|
115
|
+
"""List of attachment names."""
|
116
|
+
|
117
|
+
# Additional metadata fields for various extractors
|
118
|
+
content: NotRequired[str]
|
119
|
+
"""Content metadata field."""
|
120
|
+
parse_error: NotRequired[str]
|
121
|
+
"""Parse error information."""
|
122
|
+
warning: NotRequired[str]
|
123
|
+
"""Warning messages."""
|
124
|
+
|
125
|
+
# Table extraction metadata
|
126
|
+
table_count: NotRequired[int]
|
127
|
+
"""Number of tables extracted from the document."""
|
128
|
+
tables_summary: NotRequired[str]
|
129
|
+
"""Summary of table extraction results."""
|
130
|
+
quality_score: NotRequired[float]
|
131
|
+
"""Quality score for extracted content (0.0-1.0)."""
|
132
|
+
|
133
|
+
|
134
|
+
# Cache valid metadata keys at module level for performance
|
135
|
+
_VALID_METADATA_KEYS = {
    # General document fields
    "authors",
    "categories",
    "citations",
    "comments",
    "content",
    "copyright",
    "created_at",
    "created_by",
    "description",
    "fonts",
    "height",
    "identifier",
    "keywords",
    "languages",
    "license",
    "modified_at",
    "modified_by",
    "organization",
    "parse_error",
    "publisher",
    "references",
    "status",
    "subject",
    "subtitle",
    "summary",
    "title",
    "version",
    "warning",
    "width",
    # Email fields
    "email_from",
    "email_to",
    "email_cc",
    "email_bcc",
    "date",
    "attachments",
    # Table extraction and quality fields
    "table_count",
    "tables_summary",
    "quality_score",
}


def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
    """Reduce an arbitrary mapping to the entries allowed in ``Metadata``.

    Args:
        data: Any dictionary of candidate metadata, or None.

    Returns:
        A new dictionary holding only the entries whose key is listed in
        ``_VALID_METADATA_KEYS`` and whose value is not None. An empty
        dictionary is returned when ``data`` is None or empty.
    """
    if not data:
        return {}

    kept: Metadata = {  # type: ignore[assignment]
        name: entry for name, entry in data.items() if entry is not None and name in _VALID_METADATA_KEYS
    }
    return kept
|
192
|
+
|
103
193
|
|
104
194
|
@dataclass(frozen=True)
|
105
195
|
class Entity:
|
@@ -140,6 +230,45 @@ class ExtractionResult:
|
|
140
230
|
"""Converts the ExtractionResult to a dictionary."""
|
141
231
|
return asdict(self)
|
142
232
|
|
233
|
+
def export_tables_to_csv(self) -> list[str]:
    """Render every extracted table as CSV text.

    Returns:
        One CSV string per entry of ``self.tables``, in the same order;
        an empty list when there are no tables.
    """
    if self.tables:
        # Imported lazily, only when there is at least one table to export.
        from kreuzberg._utils._table import export_table_to_csv

        csv_documents = []
        for table in self.tables:
            csv_documents.append(export_table_to_csv(table))
        return csv_documents

    return []
|
245
|
+
|
246
|
+
def export_tables_to_tsv(self) -> list[str]:
    """Render every extracted table as TSV text.

    Returns:
        One TSV string per entry of ``self.tables``, in the same order;
        an empty list when there are no tables.
    """
    if self.tables:
        # Imported lazily, only when there is at least one table to export.
        from kreuzberg._utils._table import export_table_to_tsv

        tsv_documents = []
        for table in self.tables:
            tsv_documents.append(export_table_to_tsv(table))
        return tsv_documents

    return []
|
258
|
+
|
259
|
+
def get_table_summaries(self) -> list[dict[str, Any]]:
    """Collect structural information for every extracted table.

    Returns:
        One structure dictionary per entry of ``self.tables``, in the same
        order; an empty list when there are no tables.
    """
    if self.tables:
        # Imported lazily, only when there is at least one table to describe.
        from kreuzberg._utils._table import extract_table_structure_info

        summaries = []
        for table in self.tables:
            summaries.append(extract_table_structure_info(table))
        return summaries

    return []
|
271
|
+
|
143
272
|
|
144
273
|
PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[ExtractionResult]]
|
145
274
|
ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]
|
@@ -193,6 +322,8 @@ class ExtractionConfig:
|
|
193
322
|
"""Configuration for language detection. If None, uses default settings."""
|
194
323
|
spacy_entity_extraction_config: SpacyEntityExtractionConfig | None = None
|
195
324
|
"""Configuration for spaCy entity extraction. If None, uses default settings."""
|
325
|
+
enable_quality_processing: bool = True
|
326
|
+
"""Whether to apply quality post-processing to improve extraction results."""
|
196
327
|
|
197
328
|
def __post_init__(self) -> None:
|
198
329
|
if self.custom_entity_patterns is not None and isinstance(self.custom_entity_patterns, dict):
|
kreuzberg/_utils/_cache.py
CHANGED
@@ -64,8 +64,23 @@ class KreuzbergCache(Generic[T]):
|
|
64
64
|
Returns:
|
65
65
|
Unique cache key string
|
66
66
|
"""
|
67
|
-
#
|
68
|
-
|
67
|
+
# Use more efficient string building for cache key
|
68
|
+
if not kwargs:
|
69
|
+
return "empty"
|
70
|
+
|
71
|
+
# Build key string efficiently
|
72
|
+
parts = []
|
73
|
+
for key in sorted(kwargs):
|
74
|
+
value = kwargs[key]
|
75
|
+
# Convert common types efficiently
|
76
|
+
if isinstance(value, (str, int, float, bool)):
|
77
|
+
parts.append(f"{key}={value}")
|
78
|
+
elif isinstance(value, bytes):
|
79
|
+
parts.append(f"{key}=bytes:{len(value)}")
|
80
|
+
else:
|
81
|
+
parts.append(f"{key}={type(value).__name__}:{value!s}")
|
82
|
+
|
83
|
+
cache_str = "&".join(parts)
|
69
84
|
return hashlib.sha256(cache_str.encode()).hexdigest()[:16]
|
70
85
|
|
71
86
|
def _get_cache_path(self, cache_key: str) -> Path:
|
@@ -7,7 +7,12 @@ from concurrent.futures import ProcessPoolExecutor
|
|
7
7
|
from contextlib import contextmanager
|
8
8
|
from typing import TYPE_CHECKING, Any, TypeVar
|
9
9
|
|
10
|
+
import anyio
|
11
|
+
import psutil
|
12
|
+
from typing_extensions import Self
|
13
|
+
|
10
14
|
if TYPE_CHECKING:
|
15
|
+
import types
|
11
16
|
from collections.abc import Callable, Generator
|
12
17
|
|
13
18
|
T = TypeVar("T")
|
@@ -62,7 +67,7 @@ def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
|
|
62
67
|
text_parts = []
|
63
68
|
for page in pdf:
|
64
69
|
text_page = page.get_textpage()
|
65
|
-
text = text_page.
|
70
|
+
text = text_page.get_text_bounded()
|
66
71
|
text_parts.append(text)
|
67
72
|
text_page.close()
|
68
73
|
page.close()
|
@@ -98,3 +103,175 @@ def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str,
|
|
98
103
|
finally:
|
99
104
|
if pdf:
|
100
105
|
pdf.close()
|
106
|
+
|
107
|
+
|
108
|
+
class ProcessPoolManager:
    """Resource-aware process pool manager for CPU-intensive tasks.

    The manager derives a worker budget from a process cap and a memory
    budget, and lazily owns a ``ProcessPoolExecutor`` sized to that budget.
    It can be used as a sync or async context manager; leaving the context
    shuts the executor down.
    """

    def __init__(
        self,
        max_processes: int | None = None,
        memory_limit_gb: float | None = None,
    ) -> None:
        """Initialize the process pool manager.

        Args:
            max_processes: Maximum number of processes. Defaults to CPU count.
            memory_limit_gb: Memory limit in GB. Defaults to 75% of available memory.
        """
        self.max_processes = max_processes or mp.cpu_count()

        if memory_limit_gb is None:
            available_memory = psutil.virtual_memory().available
            self.memory_limit_bytes = int(available_memory * 0.75)  # Use 75% of available  # ~keep
        else:
            self.memory_limit_bytes = int(memory_limit_gb * 1024**3)

        self._executor: ProcessPoolExecutor | None = None
        self._active_tasks = 0

    def get_optimal_workers(self, task_memory_mb: float = 100) -> int:
        """Calculate optimal number of workers based on memory constraints.

        Args:
            task_memory_mb: Estimated memory usage per task in MB. A
                non-positive estimate is treated as "no memory constraint".

        Returns:
            Optimal number of workers: at least 1 and at most ``max_processes``.
        """
        task_memory_bytes = task_memory_mb * 1024**2
        if task_memory_bytes <= 0:
            # Avoid dividing by zero; without a per-task cost only the process cap applies.
            return self.max_processes

        memory_based_limit = max(1, int(self.memory_limit_bytes / task_memory_bytes))
        return min(self.max_processes, memory_based_limit)

    def _ensure_executor(self, max_workers: int | None = None) -> ProcessPoolExecutor:
        """Return an executor with the requested size, creating it if needed.

        Args:
            max_workers: Desired worker count. Defaults to ``max_processes``.

        Returns:
            The current executor. An existing executor is reused when its
            size already matches; otherwise it is shut down (without waiting)
            and replaced.
        """
        # Resolve the default BEFORE comparing, so a call without an explicit
        # size does not needlessly tear down and rebuild a matching pool.
        workers = max_workers or self.max_processes

        executor = self._executor
        if executor is None or getattr(executor, "_max_workers", None) != workers:
            if executor is not None:
                executor.shutdown(wait=False)
            executor = self._executor = ProcessPoolExecutor(max_workers=workers)

        return executor

    async def submit_task(
        self,
        func: Callable[..., T],
        *args: Any,
        task_memory_mb: float = 100,
    ) -> T:
        """Run a single task and return its result.

        Args:
            func: Function to execute.
            *args: Positional arguments for the function.
            task_memory_mb: Estimated memory usage in MB.

        Returns:
            Result of the function execution.
        """
        workers = self.get_optimal_workers(task_memory_mb)
        self._ensure_executor(workers)

        self._active_tasks += 1

        try:
            # NOTE(review): the call is dispatched through anyio's thread
            # helper; the executor prepared above is not what runs ``func``.
            # Confirm whether running in the process pool was intended.
            return await anyio.to_thread.run_sync(func, *args)
        finally:
            self._active_tasks -= 1

    async def submit_batch(
        self,
        func: Callable[..., T],
        arg_batches: list[tuple[Any, ...]],
        task_memory_mb: float = 100,
        max_concurrent: int | None = None,
    ) -> list[T]:
        """Run one task per argument tuple with bounded concurrency.

        Args:
            func: Function to execute.
            arg_batches: List of argument tuples for each task.
            task_memory_mb: Estimated memory usage per task in MB.
            max_concurrent: Maximum concurrent tasks. Defaults to optimal workers.

        Returns:
            List of results in the same order as input.
        """
        if not arg_batches:
            return []

        workers = self.get_optimal_workers(task_memory_mb)
        max_concurrent = max_concurrent or workers

        self._ensure_executor(workers)

        limiter = anyio.CapacityLimiter(max_concurrent)

        async def submit_single(args: tuple[Any, ...]) -> T:
            async with limiter:
                self._active_tasks += 1
                try:
                    # NOTE(review): same dispatch path as ``submit_task``.
                    return await anyio.to_thread.run_sync(func, *args)
                finally:
                    self._active_tasks -= 1

        async with anyio.create_task_group() as tg:
            # Pre-sized so each task can store its result at its input index.
            results: list[T] = [None] * len(arg_batches)  # type: ignore[list-item]

            async def run_task(idx: int, args: tuple[Any, ...]) -> None:
                results[idx] = await submit_single(args)

            for idx, args in enumerate(arg_batches):
                tg.start_soon(run_task, idx, args)

        return results

    def get_system_info(self) -> dict[str, Any]:
        """Get current system resource information.

        Returns:
            A dictionary with CPU count and utilisation, memory totals, the
            number of active tasks and the configured limits.
        """
        memory = psutil.virtual_memory()
        # CPU utilisation is sampled with a one-second interval.
        cpu_percent = psutil.cpu_percent(interval=1)

        return {
            "cpu_count": mp.cpu_count(),
            "cpu_percent": cpu_percent,
            "memory_total": memory.total,
            "memory_available": memory.available,
            "memory_percent": memory.percent,
            "active_tasks": self._active_tasks,
            "max_processes": self.max_processes,
            "memory_limit": self.memory_limit_bytes,
        }

    def shutdown(self, wait: bool = True) -> None:
        """Shutdown the process pool and forget the executor.

        Args:
            wait: Forwarded to ``ProcessPoolExecutor.shutdown``.
        """
        if self._executor is not None:
            self._executor.shutdown(wait=wait)
            self._executor = None

    def __enter__(self) -> Self:
        """Context manager entry."""
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: types.TracebackType | None,
    ) -> None:
        """Context manager exit; shuts the pool down."""
        self.shutdown()

    async def __aenter__(self) -> Self:
        """Async context manager entry."""
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: types.TracebackType | None,
    ) -> None:
        """Async context manager exit; shuts the pool down."""
        self.shutdown()
|
@@ -0,0 +1,237 @@
|
|
1
|
+
"""Quality post-processing utilities for extracted text."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
import re
|
6
|
+
from typing import Any
|
7
|
+
|
8
|
+
# Pre-compiled patterns for performance
|
9
|
+
# Individual OCR-artifact detectors, keyed by artifact kind
_OCR_ARTIFACTS = {
    label: re.compile(source)
    for label, source in (
        ("scattered_chars", r"\b[a-zA-Z]\s{2,}[a-zA-Z]\s{2,}[a-zA-Z]\b"),
        ("repeated_punctuation", r"[.]{3,}|[-]{3,}|[_]{3,}"),
        ("isolated_punctuation", r"\s[.,;:!?]\s"),
        ("malformed_words", r"\b[a-zA-Z]+[0-9]+[a-zA-Z]+[a-zA-Z0-9]*\b"),
        ("excessive_whitespace", r"\s{3,}"),
        ("broken_sentences", r"[a-z]\s{3,}[A-Z][a-z]"),
    )
}

# The same detectors as one alternation of named groups, for single-pass scanning
_COMBINED_OCR_PATTERN = re.compile(
    "|".join(
        (
            r"(?P<scattered>\b[a-zA-Z]\s{2,}[a-zA-Z]\s{2,}[a-zA-Z]\b)",
            r"(?P<repeated>[.]{3,}|[-]{3,}|[_]{3,})",
            r"(?P<isolated>\s[.,;:!?]\s)",
            r"(?P<malformed>\b[a-zA-Z]+[0-9]+[a-zA-Z]+[a-zA-Z0-9]*\b)",
            r"(?P<whitespace>\s{3,})",
            r"(?P<broken>[a-z]\s{3,}[A-Z][a-z])",
        )
    )
)

# Text-normalization helpers: horizontal whitespace runs, runs of blank lines,
# sentence boundaries and sentence-ending punctuation
_WHITESPACE_NORMALIZE = re.compile(r"[ \t\f\v\r\xa0\u2000-\u200b\u2028\u2029\u3000]+")
_NEWLINE_NORMALIZE = re.compile(r"\n\s*\n\s*\n+")
_SENTENCE_DETECT = re.compile(r"[.!?]\s+[A-Z]")
_PUNCTUATION_DETECT = re.compile(r"[.!?]")

# JavaScript / CSS residue detectors (all case-insensitive; tag patterns span lines)
_SCRIPT_PATTERNS = {
    label: re.compile(source, flags)
    for label, source, flags in (
        ("js_functions", r"function\s+\w+\s*\([^)]*\)\s*\{[^}]*\}", re.IGNORECASE),
        ("css_rules", r"\.[a-zA-Z][\w-]*\s*\{[^}]*\}", re.IGNORECASE),
        ("script_tags", r"<script[^>]*>.*?</script>", re.DOTALL | re.IGNORECASE),
        ("style_tags", r"<style[^>]*>.*?</style>", re.DOTALL | re.IGNORECASE),
    )
}

# Web-page navigation residue detectors
_NAVIGATION_PATTERNS = {
    "nav_words": re.compile(
        r"\b(?:Skip to main content|Back to top|Main navigation|Site navigation)\b",
        re.IGNORECASE,
    ),
    "breadcrumbs": re.compile(r"(?:Home\s*[>»]\s*|[>»]\s*){2,}"),
    "pagination": re.compile(
        r"\b(?:Page \d+ of \d+|First page|Last page|Previous page|Next page|^\d+ of \d+$)\b",
        re.IGNORECASE,
    ),
}
|
50
|
+
|
51
|
+
|
52
|
+
def calculate_quality_score(text: str, metadata: dict[str, Any] | None = None) -> float:
    """Score the quality of extracted text on a 0.0-1.0 scale.

    The score starts at 1.0, loses weighted penalties for OCR artifacts
    (0.3), script/style residue (0.2) and navigation residue (0.1), gains a
    weighted bonus for text structure (0.2) and, when metadata is supplied,
    for metadata richness (0.1), and is finally clamped to [0.0, 1.0].

    Args:
        text: The extracted text content
        metadata: Optional metadata for additional scoring

    Returns:
        Quality score between 0.0 and 1.0; 0.0 for empty or blank text
    """
    if not text or not text.strip():
        return 0.0

    length = len(text)
    score = 1.0

    # Penalties
    score -= _calculate_ocr_penalty(text, length) * 0.3
    score -= _calculate_script_penalty(text, length) * 0.2
    score -= _calculate_navigation_penalty(text, length) * 0.1

    # Bonuses
    score += _calculate_structure_bonus(text) * 0.2
    if metadata:
        score += _calculate_metadata_bonus(metadata) * 0.1

    # Clamp into the documented range
    if score <= 0.0:
        return 0.0
    if score >= 1.0:
        return 1.0
    return score
|
91
|
+
|
92
|
+
|
93
|
+
def clean_extracted_text(text: str) -> str:
    """Strip extraction residue from text and normalize its whitespace.

    Processing order: script/style patterns are blanked out, OCR artifacts
    are cleaned, navigation elements are removed, then horizontal whitespace
    runs and runs of blank lines are collapsed and the result is stripped.

    Args:
        text: The raw extracted text

    Returns:
        Cleaned text with artifacts removed; falsy input is returned as is
    """
    if not text:
        return text

    cleaned = text
    for script_pattern in _SCRIPT_PATTERNS.values():
        cleaned = script_pattern.sub(" ", cleaned)

    cleaned = _clean_navigation_elements(_clean_ocr_artifacts(cleaned))

    collapsed = _WHITESPACE_NORMALIZE.sub(" ", cleaned)
    collapsed = _NEWLINE_NORMALIZE.sub("\n\n", collapsed)
    return collapsed.strip()
|
120
|
+
|
121
|
+
|
122
|
+
def _calculate_ocr_penalty(text: str, total_chars: int) -> float:
    """Return the share of ``total_chars`` covered by OCR-artifact matches.

    The combined pattern is scanned once; the lengths of all matches are
    summed, divided by ``total_chars`` and capped at 1.0. Returns 0.0 when
    ``total_chars`` is 0.
    """
    if total_chars == 0:
        return 0.0

    flagged = 0
    for hit in _COMBINED_OCR_PATTERN.finditer(text):
        begin, end = hit.span()
        flagged += end - begin

    return min(1.0, flagged / total_chars)
|
130
|
+
|
131
|
+
|
132
|
+
def _calculate_script_penalty(text: str, total_chars: int) -> float:
    """Return the share of ``total_chars`` covered by script/style matches.

    Match lengths are summed across every script pattern (a span matched by
    several patterns is counted once per pattern), divided by
    ``total_chars`` and capped at 1.0. Returns 0.0 when ``total_chars`` is 0.
    """
    if total_chars == 0:
        return 0.0

    flagged = sum(
        len(fragment)
        for script_pattern in _SCRIPT_PATTERNS.values()
        for fragment in script_pattern.findall(text)
    )
    return min(1.0, flagged / total_chars)
|
143
|
+
|
144
|
+
|
145
|
+
def _calculate_navigation_penalty(text: str, total_chars: int) -> float:
    """Return the share of ``total_chars`` covered by navigation matches.

    Match lengths are summed across every navigation pattern, divided by
    ``total_chars`` and capped at 1.0. Returns 0.0 when ``total_chars`` is 0.
    """
    if total_chars == 0:
        return 0.0

    per_pattern_totals = [
        sum(len(fragment) for fragment in nav_pattern.findall(text))
        for nav_pattern in _NAVIGATION_PATTERNS.values()
    ]
    return min(1.0, sum(per_pattern_totals) / total_chars)
|
156
|
+
|
157
|
+
|
158
|
+
def _calculate_structure_bonus(text: str) -> float:
    """Score how much the text looks like structured prose (0.0-1.0).

    Awards 0.3 when the average sentence length is 10-30 words, 0.3 when the
    average paragraph length is 50-300 words, 0.2 when there is more than
    one paragraph and 0.2 when any of ``.``, ``!`` or ``?`` occurs. Returns
    0.0 for empty text or text without words.
    """
    if not text:
        return 0.0

    word_total = len(text.split())
    if word_total == 0:
        return 0.0

    # Rough heuristics: sentence boundaries by regex, paragraphs by blank lines
    sentences = len(_SENTENCE_DETECT.findall(text))
    paragraphs = len(text.split("\n\n"))

    words_per_sentence = word_total / max(1, sentences)
    words_per_paragraph = word_total / max(1, paragraphs)

    bonus = 0.0
    bonus += 0.3 if 10 <= words_per_sentence <= 30 else 0.0
    bonus += 0.3 if 50 <= words_per_paragraph <= 300 else 0.0
    bonus += 0.2 if paragraphs > 1 else 0.0
    bonus += 0.2 if _PUNCTUATION_DETECT.search(text) else 0.0

    return min(1.0, bonus)
|
197
|
+
|
198
|
+
|
199
|
+
def _calculate_metadata_bonus(metadata: dict[str, Any]) -> float:
    """Calculate bonus for rich metadata.

    Five fields are considered important: title, authors, subject,
    description and keywords. The bonus is the fraction of those fields that
    have a truthy value.

    Args:
        metadata: Metadata dictionary to inspect.

    Returns:
        A value between 0.0 and 1.0; 0.0 for empty metadata.
    """
    if not metadata:
        return 0.0

    # Each entry lists the accepted key spellings for one important field.
    # The metadata schema stores authors under "authors"; the singular
    # "author" is still accepted so existing callers keep their score.
    important_fields = (
        ("title",),
        ("authors", "author"),
        ("subject",),
        ("description",),
        ("keywords",),
    )
    present_fields = sum(
        1 for aliases in important_fields if any(metadata.get(alias) for alias in aliases)
    )

    return present_fields / len(important_fields)
|
208
|
+
|
209
|
+
|
210
|
+
def _clean_ocr_artifacts(text: str) -> str:
    """Remove common OCR artifacts from text.

    Applied in order: space characters are removed inside scattered-letter
    matches, runs of repeated punctuation become ``...``, isolated
    punctuation and malformed letter/digit words become a single space, and
    runs of three or more whitespace characters collapse to one space.
    """

    def _squeeze_spaces(found: re.Match[str]) -> str:
        # Only the space character is dropped from the matched span.
        return found.group().replace(" ", "")

    joined = _OCR_ARTIFACTS["scattered_chars"].sub(_squeeze_spaces, text)
    ellipsized = _OCR_ARTIFACTS["repeated_punctuation"].sub("...", joined)
    without_stray_marks = _OCR_ARTIFACTS["isolated_punctuation"].sub(" ", ellipsized)
    without_malformed = _OCR_ARTIFACTS["malformed_words"].sub(" ", without_stray_marks)
    return _OCR_ARTIFACTS["excessive_whitespace"].sub(" ", without_malformed)
|
226
|
+
|
227
|
+
|
228
|
+
def _clean_navigation_elements(text: str) -> str:
    """Replace navigation residue in text with single spaces.

    The navigation-word, breadcrumb and pagination patterns are applied in
    that order; every match is replaced by one space.
    """
    cleaned = text
    for pattern_name in ("nav_words", "breadcrumbs", "pagination"):
        cleaned = _NAVIGATION_PATTERNS[pattern_name].sub(" ", cleaned)
    return cleaned
|
@@ -29,8 +29,10 @@ def encode_hook(obj: Any) -> Any:
|
|
29
29
|
"to_list",
|
30
30
|
"tolist",
|
31
31
|
):
|
32
|
-
if hasattr(obj, key)
|
33
|
-
|
32
|
+
if hasattr(obj, key):
|
33
|
+
method = getattr(obj, key) # Cache the attribute lookup
|
34
|
+
if callable(method):
|
35
|
+
return method()
|
34
36
|
|
35
37
|
if is_dataclass(obj) and not isinstance(obj, type):
|
36
38
|
return {k: v if not isinstance(v, Enum) else v.value for (k, v) in asdict(obj).items()}
|