natural-pdf 0.1.38__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +11 -6
- natural_pdf/analyzers/__init__.py +6 -1
- natural_pdf/analyzers/guides.py +354 -258
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +18 -4
- natural_pdf/analyzers/layout/paddle.py +11 -0
- natural_pdf/analyzers/layout/surya.py +2 -3
- natural_pdf/analyzers/shape_detection_mixin.py +25 -34
- natural_pdf/analyzers/text_structure.py +2 -2
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/collections/mixins.py +3 -2
- natural_pdf/core/highlighting_service.py +743 -32
- natural_pdf/core/page.py +252 -399
- natural_pdf/core/page_collection.py +1249 -0
- natural_pdf/core/pdf.py +231 -89
- natural_pdf/{collections → core}/pdf_collection.py +18 -11
- natural_pdf/core/render_spec.py +335 -0
- natural_pdf/describe/base.py +1 -1
- natural_pdf/elements/__init__.py +1 -0
- natural_pdf/elements/base.py +108 -83
- natural_pdf/elements/{collections.py → element_collection.py} +575 -1372
- natural_pdf/elements/line.py +0 -1
- natural_pdf/elements/rect.py +0 -1
- natural_pdf/elements/region.py +405 -280
- natural_pdf/elements/text.py +9 -7
- natural_pdf/exporters/base.py +2 -2
- natural_pdf/exporters/original_pdf.py +1 -1
- natural_pdf/exporters/paddleocr.py +2 -4
- natural_pdf/exporters/searchable_pdf.py +3 -2
- natural_pdf/extraction/mixin.py +1 -3
- natural_pdf/flows/collections.py +1 -69
- natural_pdf/flows/element.py +25 -0
- natural_pdf/flows/flow.py +1658 -19
- natural_pdf/flows/region.py +757 -263
- natural_pdf/ocr/ocr_options.py +0 -2
- natural_pdf/ocr/utils.py +2 -1
- natural_pdf/qa/document_qa.py +21 -5
- natural_pdf/search/search_service_protocol.py +1 -1
- natural_pdf/selectors/parser.py +35 -2
- natural_pdf/tables/result.py +35 -1
- natural_pdf/text_mixin.py +101 -0
- natural_pdf/utils/debug.py +2 -1
- natural_pdf/utils/highlighting.py +1 -0
- natural_pdf/utils/layout.py +2 -2
- natural_pdf/utils/packaging.py +4 -3
- natural_pdf/utils/text_extraction.py +15 -12
- natural_pdf/utils/visualization.py +385 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/METADATA +7 -3
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/RECORD +55 -52
- optimization/memory_comparison.py +1 -1
- optimization/pdf_analyzer.py +2 -2
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.38.dist-info → natural_pdf-0.2.0.dist-info}/top_level.txt +0 -0
natural_pdf/ocr/ocr_options.py
CHANGED
@@ -3,7 +3,6 @@ from dataclasses import dataclass, field
|
|
3
3
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
4
4
|
|
5
5
|
|
6
|
-
|
7
6
|
# --- Base Options ---
|
8
7
|
@dataclass
|
9
8
|
class BaseOCROptions:
|
@@ -54,7 +53,6 @@ class EasyOCROptions(BaseOCROptions):
|
|
54
53
|
output_format: str = "standard"
|
55
54
|
|
56
55
|
|
57
|
-
|
58
56
|
# --- PaddleOCR Specific Options ---
|
59
57
|
@dataclass
|
60
58
|
class PaddleOCROptions(BaseOCROptions):
|
natural_pdf/ocr/utils.py
CHANGED
@@ -90,7 +90,8 @@ def direct_ocr_llm(
|
|
90
90
|
buffered = io.BytesIO()
|
91
91
|
# Use the global PDF render lock when rendering images
|
92
92
|
with pdf_render_lock:
|
93
|
-
|
93
|
+
# Use render() for clean image without highlights
|
94
|
+
region_img = region.render(resolution=resolution)
|
94
95
|
|
95
96
|
# Handle cases where image creation might fail (e.g., zero-dim region)
|
96
97
|
if region_img is None:
|
natural_pdf/qa/document_qa.py
CHANGED
@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union
|
|
8
8
|
import numpy as np
|
9
9
|
from PIL import Image, ImageDraw
|
10
10
|
|
11
|
-
from natural_pdf.elements.collections import ElementCollection
|
11
|
+
from natural_pdf.elements.element_collection import ElementCollection
|
12
12
|
|
13
13
|
from .qa_result import QAResult
|
14
14
|
|
@@ -63,8 +63,22 @@ class DocumentQA:
|
|
63
63
|
|
64
64
|
logger.info(f"Initializing DocumentQA with model {model_name} on {device}")
|
65
65
|
|
66
|
-
#
|
67
|
-
|
66
|
+
# Try MPS, fallback to CPU if OOM
|
67
|
+
if device is None and torch.backends.mps.is_available():
|
68
|
+
try:
|
69
|
+
self.pipe = pipeline(
|
70
|
+
"document-question-answering", model=model_name, device="mps"
|
71
|
+
)
|
72
|
+
self.device = "mps"
|
73
|
+
except RuntimeError as e:
|
74
|
+
logger.warning(f"MPS OOM: {e}, falling back to CPU")
|
75
|
+
self.pipe = pipeline(
|
76
|
+
"document-question-answering", model=model_name, device="cpu"
|
77
|
+
)
|
78
|
+
self.device = "cpu"
|
79
|
+
else:
|
80
|
+
self.pipe = pipeline("document-question-answering", model=model_name, device=device)
|
81
|
+
self.device = device
|
68
82
|
|
69
83
|
self.model_name = model_name
|
70
84
|
self.device = device
|
@@ -356,7 +370,8 @@ class DocumentQA:
|
|
356
370
|
temp_path = temp_file.name
|
357
371
|
|
358
372
|
# Save a high resolution image (300 DPI)
|
359
|
-
|
373
|
+
# Use render() for clean image without highlights
|
374
|
+
page_image = page.render(resolution=300)
|
360
375
|
page_image.save(temp_path)
|
361
376
|
|
362
377
|
try:
|
@@ -470,7 +485,8 @@ class DocumentQA:
|
|
470
485
|
temp_path = temp_file.name
|
471
486
|
|
472
487
|
# Get page image at high resolution - this returns a PIL Image directly
|
473
|
-
|
488
|
+
# Use render() for clean image without highlights
|
489
|
+
page_image = region.page.render(resolution=300)
|
474
490
|
|
475
491
|
# Crop to region
|
476
492
|
x0, top, x1, bottom = int(region.x0), int(region.top), int(region.x1), int(region.bottom)
|
natural_pdf/search/search_service_protocol.py
CHANGED
@@ -49,7 +49,7 @@ class Indexable(Protocol):
|
|
49
49
|
"""
|
50
50
|
Return the primary content of this item.
|
51
51
|
The SearchService implementation will determine how to process this content
|
52
|
-
(e.g., call .extract_text(), .
|
52
|
+
(e.g., call .extract_text(), .render(), or handle directly).
|
53
53
|
"""
|
54
54
|
...
|
55
55
|
|
natural_pdf/selectors/parser.py
CHANGED
@@ -24,7 +24,7 @@ This enables powerful document navigation like:
|
|
24
24
|
- page.find('text[size>12]:bold:contains("Summary")')
|
25
25
|
- page.find_all('rect[color~="red"]:above(text:contains("Total"))')
|
26
26
|
- page.find('text:regex("[0-9]{4}-[0-9]{2}-[0-9]{2}")')
|
27
|
-
- page.find('text:regex("[\u2500-\
|
27
|
+
- page.find('text:regex("[\u2500-\u257f]")') # Box drawing characters
|
28
28
|
"""
|
29
29
|
|
30
30
|
import ast
|
@@ -101,6 +101,12 @@ def safe_parse_color(value_str: str) -> tuple:
|
|
101
101
|
"""
|
102
102
|
value_str = value_str.strip()
|
103
103
|
|
104
|
+
# Strip quotes first if it's a quoted string (same logic as safe_parse_value)
|
105
|
+
if (value_str.startswith('"') and value_str.endswith('"')) or (
|
106
|
+
value_str.startswith("'") and value_str.endswith("'")
|
107
|
+
):
|
108
|
+
value_str = value_str[1:-1]
|
109
|
+
|
104
110
|
# Try parsing as a Python literal (for RGB tuples)
|
105
111
|
try:
|
106
112
|
# If it's already a valid tuple or list, parse it
|
@@ -504,6 +510,21 @@ def _is_approximate_match(value1, value2) -> bool:
|
|
504
510
|
return value1 == value2
|
505
511
|
|
506
512
|
|
513
|
+
def _is_exact_color_match(value1, value2, tolerance: float = 2.0) -> bool:
    """Return True when two attribute values are considered an exact match.

    When both values are recognised as colors by ``_is_color_value``, they
    match if the distance reported by ``_color_distance`` does not exceed
    ``tolerance``. Any other pair of values is compared with plain ``==``.

    Parameters
    ----------
    value1, value2
        The element value and the selector value to compare.
    tolerance : float, default 2.0
        Largest color distance still treated as "the same color". The
        default keeps the strict threshold this function has always used
        (the original docstring describes the metric as Delta E).

    Returns
    -------
    bool
        ``True`` if the values match under the rules above.
    """
    # Colors get a small tolerance so tiny representation differences still match.
    if _is_color_value(value1) and _is_color_value(value2):
        return _color_distance(value1, value2) <= tolerance

    # Non-color values (or a color compared to a non-color) must be equal.
    return value1 == value2
|
526
|
+
|
527
|
+
|
507
528
|
PSEUDO_CLASS_FUNCTIONS = {
|
508
529
|
"bold": lambda el: hasattr(el, "bold") and el.bold,
|
509
530
|
"italic": lambda el: hasattr(el, "italic") and el.italic,
|
@@ -603,7 +624,19 @@ def _build_filter_list(selector: Dict[str, Any], **kwargs) -> List[Dict[str, Any
|
|
603
624
|
|
604
625
|
# Determine compare_func based on op (reuse existing logic)
|
605
626
|
if op == "=":
|
606
|
-
|
627
|
+
# For color attributes, use exact color matching with small tolerance
|
628
|
+
if name in [
|
629
|
+
"color",
|
630
|
+
"non_stroking_color",
|
631
|
+
"fill",
|
632
|
+
"stroke",
|
633
|
+
"strokeColor",
|
634
|
+
"fillColor",
|
635
|
+
]:
|
636
|
+
op_desc = f"= {value!r} (exact color)"
|
637
|
+
compare_func = lambda el_val, sel_val: _is_exact_color_match(el_val, sel_val)
|
638
|
+
else:
|
639
|
+
compare_func = lambda el_val, sel_val: el_val == sel_val
|
607
640
|
elif op == "!=":
|
608
641
|
compare_func = lambda el_val, sel_val: el_val != sel_val
|
609
642
|
elif op == "~=":
|
natural_pdf/tables/result.py
CHANGED
@@ -39,7 +39,13 @@ class TableResult(Sequence):
|
|
39
39
|
"""Quick property alias → calls :py:meth:`to_df` with default args."""
|
40
40
|
return self.to_df()
|
41
41
|
|
42
|
-
def to_df(
|
42
|
+
def to_df(
|
43
|
+
self,
|
44
|
+
header: Union[str, int, List[int], None] = "first",
|
45
|
+
index_col=None,
|
46
|
+
skip_repeating_headers=None,
|
47
|
+
**kwargs,
|
48
|
+
):
|
43
49
|
"""Convert to *pandas* DataFrame.
|
44
50
|
|
45
51
|
Parameters
|
@@ -47,6 +53,10 @@ class TableResult(Sequence):
|
|
47
53
|
header : "first" | int | list[int] | None, default "first"
|
48
54
|
• "first" – use row 0 as column names.\n • int – use that row index.\n • list[int] – multi-row header.\n • None/False– no header.
|
49
55
|
index_col : same semantics as pandas, forwarded.
|
56
|
+
skip_repeating_headers : bool, optional
|
57
|
+
Whether to remove body rows that exactly match the header row(s).
|
58
|
+
Defaults to True when header is truthy, False otherwise.
|
59
|
+
Useful for PDFs where headers repeat throughout the table body.
|
50
60
|
**kwargs : forwarded to :pyclass:`pandas.DataFrame`.
|
51
61
|
"""
|
52
62
|
try:
|
@@ -60,6 +70,10 @@ class TableResult(Sequence):
|
|
60
70
|
if not rows:
|
61
71
|
return pd.DataFrame()
|
62
72
|
|
73
|
+
# Determine default for skip_repeating_headers based on header parameter
|
74
|
+
if skip_repeating_headers is None:
|
75
|
+
skip_repeating_headers = header is not None and header is not False
|
76
|
+
|
63
77
|
# Determine header rows and body rows
|
64
78
|
body = rows
|
65
79
|
hdr = None
|
@@ -78,6 +92,26 @@ class TableResult(Sequence):
|
|
78
92
|
else:
|
79
93
|
raise ValueError("Invalid value for header parameter")
|
80
94
|
|
95
|
+
# Skip repeating headers in body if requested
|
96
|
+
if skip_repeating_headers and hdr is not None and body:
|
97
|
+
original_body_len = len(body)
|
98
|
+
if isinstance(hdr, list) and len(hdr) > 0 and not isinstance(hdr[0], list):
|
99
|
+
# Single header row (most common case)
|
100
|
+
body = [row for row in body if row != hdr]
|
101
|
+
elif isinstance(hdr, list) and len(hdr) > 0 and isinstance(hdr[0], list):
|
102
|
+
# Multi-row header (less common)
|
103
|
+
hdr_set = {tuple(h) if isinstance(h, list) else h for h in hdr}
|
104
|
+
body = [
|
105
|
+
row
|
106
|
+
for row in body
|
107
|
+
if (tuple(row) if isinstance(row, list) else row) not in hdr_set
|
108
|
+
]
|
109
|
+
|
110
|
+
skipped_count = original_body_len - len(body)
|
111
|
+
if skipped_count > 0:
|
112
|
+
# Could add logging here if desired
|
113
|
+
pass
|
114
|
+
|
81
115
|
df = pd.DataFrame(body, columns=hdr)
|
82
116
|
if index_col is not None and not df.empty:
|
83
117
|
df.set_index(
|
natural_pdf/text_mixin.py
ADDED
@@ -0,0 +1,101 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import logging
from typing import Any, Callable, Optional

logger = logging.getLogger(__name__)


class TextMixin:  # pylint: disable=too-few-public-methods
    """Mixin that adds general text-replacement capabilities.

    Two public entry points are exposed to any class that inherits this mix-in:

    1. ``update_text`` (preferred) – iterate over the elements selected via the
       ``selector`` argument (default: ``"text"``) and apply a *transform*
       callback which optionally returns replacement text. If the callback
       returns a string that differs from the current value, the element's
       ``text`` attribute is updated in-place.

    2. ``correct_ocr`` – legacy name kept for backward compatibility. It simply
       forwards to :py:meth:`update_text` with ``selector="text[source=ocr]"``
       as the default selector, so that the historic behaviour (acting only on
       OCR-generated elements) is preserved.
    """

    # ---------------------------------------------------------------------
    # Back-compat shim
    # ---------------------------------------------------------------------
    def correct_ocr(self, *args, selector: str = "text[source=ocr]", **kwargs):  # type: ignore[override]
        """Backward-compatibility wrapper that forwards to *update_text*.

        Parameters
        ----------
        *args, **kwargs
            Forwarded verbatim to :py:meth:`update_text`.
        selector : str, default ``"text[source=ocr]"``
            Selector passed on to :py:meth:`update_text`.

        Returns
        -------
        Whatever :py:meth:`update_text` returns.
        """
        # Delegate – subclasses may have overridden *update_text* with a richer
        # signature, so everything is passed through untouched.
        return self.update_text(*args, selector=selector, **kwargs)  # type: ignore[arg-type]

    # ------------------------------------------------------------------
    # Generic fallback implementation
    # ------------------------------------------------------------------
    def update_text(  # type: ignore[override]
        self,
        transform: Callable[[Any], Optional[str]],
        *,
        selector: str = "text",
        apply_exclusions: bool = False,
        **_,
    ):
        """Apply *transform* to every selected element and store changed text.

        Generic implementation that works for any object exposing *find_all*.
        Classes that require more sophisticated behaviour are expected to
        *override* this method while keeping the same public contract.

        Parameters
        ----------
        transform : callable
            Called once per element that has a ``text`` attribute. A returned
            ``str`` that differs from ``element.text`` replaces it; any other
            return value (including ``None``) leaves the element unchanged.
        selector : str, default ``"text"``
            Forwarded to ``self.find_all`` to choose the elements.
        apply_exclusions : bool, default False
            Forwarded to ``self.find_all``.
        **_
            Extra keyword arguments are accepted and ignored.

        Returns
        -------
        self, to allow call chaining.

        Raises
        ------
        TypeError
            If *transform* is not callable.
        NotImplementedError
            If the object has no ``find_all`` method.
        RuntimeError
            If ``find_all`` raises; the original exception is chained.
        """
        if not callable(transform):
            raise TypeError("transform must be callable")

        # We rely on the presence of *find_all* to obtain elements. If the
        # subclass does not implement it then it *must* override update_text.
        if not hasattr(self, "find_all"):
            raise NotImplementedError(
                f"{self.__class__.__name__} must implement `update_text` explicitly "
                "(no `find_all` method found)."
            )

        try:
            elements_collection = self.find_all(
                selector=selector, apply_exclusions=apply_exclusions
            )
        except Exception as exc:  # pragma: no cover – defensive
            raise RuntimeError(
                f"Failed to gather elements with selector '{selector}': {exc}"
            ) from exc

        # Use the ``elements`` attribute when the result has one; otherwise
        # iterate the returned object itself.
        elements_iter = getattr(elements_collection, "elements", elements_collection)

        # Count while iterating rather than calling len() afterwards: the
        # fallback object may be a generator or another unsized iterable.
        processed = 0
        updated = 0

        for element in elements_iter:
            processed += 1
            if not hasattr(element, "text"):
                continue

            new_text = transform(element)
            if isinstance(new_text, str) and new_text != element.text:
                element.text = new_text
                updated += 1

        logger.info(
            "%s.update_text – processed %d element(s); updated %d.",
            self.__class__.__name__,
            processed,
            updated,
        )

        return self
|
natural_pdf/utils/debug.py
CHANGED
@@ -24,7 +24,8 @@ def _get_page_image_base64(page: Page) -> str:
|
|
24
24
|
"""Generate a base64 encoded image of the page."""
|
25
25
|
# Create a clean image of the page without highlights for the base background
|
26
26
|
# Use a fixed scale consistent with the HTML/JS rendering logic
|
27
|
-
|
27
|
+
# Use render() for clean image without highlights
|
28
|
+
img = page.render(resolution=144)
|
28
29
|
if img is None:
|
29
30
|
raise ValueError(f"Failed to render image for page {page.number}")
|
30
31
|
|
natural_pdf/utils/layout.py
CHANGED
@@ -2,7 +2,7 @@ from typing import List, Optional, Tuple
|
|
2
2
|
|
3
3
|
|
4
4
|
def merge_bboxes(
|
5
|
-
bboxes: List[Optional[Tuple[float, float, float, float]]]
|
5
|
+
bboxes: List[Optional[Tuple[float, float, float, float]]],
|
6
6
|
) -> Optional[Tuple[float, float, float, float]]:
|
7
7
|
"""
|
8
8
|
Merge multiple bounding boxes into a single one that encompasses all of them.
|
@@ -23,4 +23,4 @@ def merge_bboxes(
|
|
23
23
|
|
24
24
|
x0s, tops, x1s, bottoms = zip(*valid_bboxes)
|
25
25
|
|
26
|
-
return (min(x0s), min(tops), max(x1s), max(bottoms))
|
26
|
+
return (min(x0s), min(tops), max(x1s), max(bottoms))
|
natural_pdf/utils/packaging.py
CHANGED
@@ -18,9 +18,9 @@ from natural_pdf.elements.text import TextElement
|
|
18
18
|
|
19
19
|
# Import the specific PDF/Page types if possible, otherwise use Any
|
20
20
|
if TYPE_CHECKING:
|
21
|
-
from natural_pdf.collections.pdf_collection import PDFCollection
|
22
21
|
from natural_pdf.core.page import Page
|
23
22
|
from natural_pdf.core.pdf import PDF
|
23
|
+
from natural_pdf.core.pdf_collection import PDFCollection
|
24
24
|
else:
|
25
25
|
PDF = Any
|
26
26
|
Page = Any
|
@@ -145,9 +145,10 @@ def create_correction_task_package(
|
|
145
145
|
image_filename = f"{pdf_short_id}_page_{page.index}.png"
|
146
146
|
image_save_path = os.path.join(images_dir, image_filename)
|
147
147
|
try:
|
148
|
-
|
148
|
+
# Use render() for clean image without highlights
|
149
|
+
img = page.render(resolution=resolution)
|
149
150
|
if img is None:
|
150
|
-
raise ValueError("page.
|
151
|
+
raise ValueError("page.render returned None")
|
151
152
|
img.save(image_save_path, "PNG")
|
152
153
|
except Exception as e:
|
153
154
|
logger.error(
|
natural_pdf/utils/text_extraction.py
CHANGED
@@ -175,28 +175,27 @@ def filter_chars_spatially(
|
|
175
175
|
|
176
176
|
|
177
177
|
def _apply_content_filter(
|
178
|
-
char_dicts: List[Dict[str, Any]],
|
179
|
-
content_filter: Union[str, Callable[[str], bool], List[str]]
|
178
|
+
char_dicts: List[Dict[str, Any]], content_filter: Union[str, Callable[[str], bool], List[str]]
|
180
179
|
) -> List[Dict[str, Any]]:
|
181
180
|
"""
|
182
181
|
Applies content filtering to character dictionaries based on their text content.
|
183
|
-
|
182
|
+
|
184
183
|
Args:
|
185
184
|
char_dicts: List of character dictionaries to filter.
|
186
185
|
content_filter: Can be:
|
187
186
|
- A regex pattern string (characters matching the pattern are EXCLUDED)
|
188
187
|
- A callable that takes text and returns True to KEEP the character
|
189
188
|
- A list of regex patterns (characters matching ANY pattern are EXCLUDED)
|
190
|
-
|
189
|
+
|
191
190
|
Returns:
|
192
191
|
Filtered list of character dictionaries.
|
193
192
|
"""
|
194
193
|
if not char_dicts or content_filter is None:
|
195
194
|
return char_dicts
|
196
|
-
|
195
|
+
|
197
196
|
initial_count = len(char_dicts)
|
198
197
|
filtered_chars = []
|
199
|
-
|
198
|
+
|
200
199
|
# Handle different filter types
|
201
200
|
if isinstance(content_filter, str):
|
202
201
|
# Single regex pattern - exclude matching characters
|
@@ -207,9 +206,11 @@ def _apply_content_filter(
|
|
207
206
|
if not pattern.search(text):
|
208
207
|
filtered_chars.append(char_dict)
|
209
208
|
except re.error as e:
|
210
|
-
logger.warning(
|
209
|
+
logger.warning(
|
210
|
+
f"Invalid regex pattern '{content_filter}': {e}. Skipping content filtering."
|
211
|
+
)
|
211
212
|
return char_dicts
|
212
|
-
|
213
|
+
|
213
214
|
elif isinstance(content_filter, list):
|
214
215
|
# List of regex patterns - exclude characters matching ANY pattern
|
215
216
|
try:
|
@@ -221,7 +222,7 @@ def _apply_content_filter(
|
|
221
222
|
except re.error as e:
|
222
223
|
logger.warning(f"Invalid regex pattern in list: {e}. Skipping content filtering.")
|
223
224
|
return char_dicts
|
224
|
-
|
225
|
+
|
225
226
|
elif callable(content_filter):
|
226
227
|
# Callable filter - keep characters where function returns True
|
227
228
|
try:
|
@@ -233,13 +234,15 @@ def _apply_content_filter(
|
|
233
234
|
logger.warning(f"Error in content filter function: {e}. Skipping content filtering.")
|
234
235
|
return char_dicts
|
235
236
|
else:
|
236
|
-
logger.warning(
|
237
|
+
logger.warning(
|
238
|
+
f"Unsupported content_filter type: {type(content_filter)}. Skipping content filtering."
|
239
|
+
)
|
237
240
|
return char_dicts
|
238
|
-
|
241
|
+
|
239
242
|
filtered_count = initial_count - len(filtered_chars)
|
240
243
|
if filtered_count > 0:
|
241
244
|
logger.debug(f"Content filter removed {filtered_count} characters.")
|
242
|
-
|
245
|
+
|
243
246
|
return filtered_chars
|
244
247
|
|
245
248
|
|