natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +230 -151
- natural_pdf/classification/mixin.py +49 -35
- natural_pdf/classification/results.py +64 -46
- natural_pdf/collections/mixins.py +68 -20
- natural_pdf/collections/pdf_collection.py +177 -64
- natural_pdf/core/element_manager.py +30 -14
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +423 -101
- natural_pdf/core/pdf.py +633 -190
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +503 -131
- natural_pdf/elements/region.py +659 -90
- natural_pdf/elements/text.py +1 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +4 -3
- natural_pdf/extraction/manager.py +50 -49
- natural_pdf/extraction/mixin.py +90 -57
- natural_pdf/extraction/result.py +9 -23
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +61 -25
- natural_pdf/ocr/ocr_options.py +70 -10
- natural_pdf/ocr/utils.py +6 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +219 -143
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +1 -1
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +24 -16
- natural_pdf/utils/tqdm_utils.py +18 -10
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +12 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/categorizing-documents/index.md +0 -168
- docs/data-extraction/index.md +0 -87
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -969
- docs/element-selection/index.md +0 -249
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -189
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -256
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -417
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -152
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -119
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -275
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -337
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -293
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -414
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -513
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2439
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -517
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -3712
- docs/tutorials/12-ocr-integration.md +0 -137
- docs/tutorials/13-semantic-search.ipynb +0 -1718
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.8.dist-info/RECORD +0 -156
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -3,21 +3,23 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Un
|
|
3
3
|
|
4
4
|
# Assuming PIL is installed as it's needed for vision
|
5
5
|
try:
|
6
|
-
|
6
|
+
from PIL import Image
|
7
7
|
except ImportError:
|
8
|
-
|
8
|
+
Image = None # type: ignore
|
9
9
|
|
10
10
|
# Import result classes
|
11
|
-
from .results import ClassificationResult
|
11
|
+
from .results import ClassificationResult # Assuming results.py is in the same dir
|
12
12
|
|
13
13
|
if TYPE_CHECKING:
|
14
14
|
# Avoid runtime import cycle
|
15
15
|
from natural_pdf.core.page import Page
|
16
16
|
from natural_pdf.elements.region import Region
|
17
|
+
|
17
18
|
from .manager import ClassificationManager
|
18
19
|
|
19
20
|
logger = logging.getLogger(__name__)
|
20
21
|
|
22
|
+
|
21
23
|
class ClassificationMixin:
|
22
24
|
"""
|
23
25
|
Mixin class providing classification capabilities to Page and Region objects.
|
@@ -38,18 +40,18 @@ class ClassificationMixin:
|
|
38
40
|
# Host class needs 'analyses' attribute initialized as Dict[str, Any]
|
39
41
|
# analyses: Dict[str, Any]
|
40
42
|
|
41
|
-
# --- End Abstract --- #
|
43
|
+
# --- End Abstract --- #
|
42
44
|
|
43
45
|
def classify(
|
44
46
|
self,
|
45
47
|
categories: List[str],
|
46
|
-
model: Optional[str] = None,
|
47
|
-
using: Optional[str] = None,
|
48
|
+
model: Optional[str] = None, # Default handled by manager
|
49
|
+
using: Optional[str] = None, # Renamed parameter
|
48
50
|
min_confidence: float = 0.0,
|
49
|
-
analysis_key: str =
|
51
|
+
analysis_key: str = "classification", # Default key
|
50
52
|
multi_label: bool = False,
|
51
|
-
**kwargs
|
52
|
-
) -> "ClassificationMixin":
|
53
|
+
**kwargs,
|
54
|
+
) -> "ClassificationMixin": # Return self for chaining
|
53
55
|
"""
|
54
56
|
Classifies this item (Page or Region) using the configured manager.
|
55
57
|
|
@@ -71,22 +73,30 @@ class ClassificationMixin:
|
|
71
73
|
Self for method chaining.
|
72
74
|
"""
|
73
75
|
# Ensure analyses dict exists
|
74
|
-
if not hasattr(self,
|
75
|
-
|
76
|
-
|
76
|
+
if not hasattr(self, "analyses") or self.analyses is None:
|
77
|
+
logger.warning("'analyses' attribute not found or is None. Initializing as empty dict.")
|
78
|
+
self.analyses = {}
|
77
79
|
|
78
80
|
try:
|
79
81
|
manager = self._get_classification_manager()
|
80
|
-
|
82
|
+
|
81
83
|
# Determine the effective model ID and engine type
|
82
84
|
effective_model_id = model
|
83
|
-
inferred_using = manager.infer_using(
|
85
|
+
inferred_using = manager.infer_using(
|
86
|
+
model if model else manager.DEFAULT_TEXT_MODEL, using
|
87
|
+
)
|
84
88
|
|
85
89
|
# If model was not provided, use the manager's default for the inferred engine type
|
86
90
|
if effective_model_id is None:
|
87
|
-
effective_model_id =
|
88
|
-
|
89
|
-
|
91
|
+
effective_model_id = (
|
92
|
+
manager.DEFAULT_TEXT_MODEL
|
93
|
+
if inferred_using == "text"
|
94
|
+
else manager.DEFAULT_VISION_MODEL
|
95
|
+
)
|
96
|
+
logger.debug(
|
97
|
+
f"No model provided, using default for mode '{inferred_using}': '{effective_model_id}'"
|
98
|
+
)
|
99
|
+
|
90
100
|
# Get content based on the *final* determined engine type
|
91
101
|
content = self._get_classification_content(model_type=inferred_using, **kwargs)
|
92
102
|
|
@@ -94,11 +104,11 @@ class ClassificationMixin:
|
|
94
104
|
result_obj: ClassificationResult = manager.classify_item(
|
95
105
|
item_content=content,
|
96
106
|
categories=categories,
|
97
|
-
model_id=effective_model_id,
|
98
|
-
using=inferred_using,
|
107
|
+
model_id=effective_model_id, # Pass the resolved model ID
|
108
|
+
using=inferred_using, # Pass renamed argument
|
99
109
|
min_confidence=min_confidence,
|
100
110
|
multi_label=multi_label,
|
101
|
-
**kwargs
|
111
|
+
**kwargs,
|
102
112
|
)
|
103
113
|
|
104
114
|
# Store the structured result object under the specified key
|
@@ -106,8 +116,8 @@ class ClassificationMixin:
|
|
106
116
|
logger.debug(f"Stored classification result under key '{analysis_key}': {result_obj}")
|
107
117
|
|
108
118
|
except NotImplementedError as nie:
|
109
|
-
|
110
|
-
|
119
|
+
logger.error(f"Classification cannot proceed: {nie}")
|
120
|
+
raise
|
111
121
|
except Exception as e:
|
112
122
|
logger.error(f"Classification failed: {e}", exc_info=True)
|
113
123
|
# Optionally re-raise or just log and return self
|
@@ -118,32 +128,36 @@ class ClassificationMixin:
|
|
118
128
|
@property
|
119
129
|
def classification_results(self) -> Optional[ClassificationResult]:
|
120
130
|
"""Returns the ClassificationResult from the *default* ('classification') key, or None."""
|
121
|
-
if not hasattr(self,
|
131
|
+
if not hasattr(self, "analyses") or self.analyses is None:
|
122
132
|
return None
|
123
133
|
# Return the result object directly from the default key
|
124
|
-
return self.analyses.get(
|
134
|
+
return self.analyses.get("classification")
|
125
135
|
|
126
136
|
@property
|
127
137
|
def category(self) -> Optional[str]:
|
128
138
|
"""Returns the top category label from the *default* ('classification') key, or None."""
|
129
|
-
result_obj = self.classification_results
|
139
|
+
result_obj = self.classification_results # Uses the property above
|
130
140
|
# Access the property on the result object
|
131
141
|
return result_obj.top_category if result_obj else None
|
132
142
|
|
133
143
|
@property
|
134
144
|
def category_confidence(self) -> Optional[float]:
|
135
145
|
"""Returns the top category confidence from the *default* ('classification') key, or None."""
|
136
|
-
result_obj = self.classification_results
|
146
|
+
result_obj = self.classification_results # Uses the property above
|
137
147
|
# Access the property on the result object
|
138
148
|
return result_obj.top_confidence if result_obj else None
|
139
149
|
|
140
150
|
# Maybe add a helper to get results by specific key?
|
141
|
-
def get_classification_result(
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
151
|
+
def get_classification_result(
|
152
|
+
self, analysis_key: str = "classification"
|
153
|
+
) -> Optional[ClassificationResult]:
|
154
|
+
"""Gets a classification result object stored under a specific key."""
|
155
|
+
if not hasattr(self, "analyses") or self.analyses is None:
|
156
|
+
return None
|
157
|
+
result = self.analyses.get(analysis_key)
|
158
|
+
if result is not None and not isinstance(result, ClassificationResult):
|
159
|
+
logger.warning(
|
160
|
+
f"Item found under key '{analysis_key}' is not a ClassificationResult (type: {type(result)}). Returning None."
|
161
|
+
)
|
162
|
+
return None
|
163
|
+
return result
|
@@ -1,62 +1,80 @@
|
|
1
1
|
# natural_pdf/classification/results.py
|
2
|
-
from typing import List, Optional, Dict, Any
|
3
|
-
from datetime import datetime
|
4
2
|
import logging
|
3
|
+
from dataclasses import dataclass
|
4
|
+
from datetime import datetime
|
5
|
+
from typing import Any, Dict, List, Optional
|
5
6
|
|
6
7
|
logger = logging.getLogger(__name__)
|
7
8
|
|
9
|
+
|
10
|
+
@dataclass
|
8
11
|
class CategoryScore:
|
9
|
-
"""Represents
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
# Fallback or raise? For now, allow but log.
|
18
|
-
# raise ValueError("Category label must be a non-empty string.")
|
19
|
-
if not isinstance(confidence, (float, int)) or not (0.0 <= confidence <= 1.0):
|
20
|
-
logger.warning(f"Initializing CategoryScore with invalid confidence: {confidence} for label '{label}'. Clamping to [0, 1].")
|
21
|
-
confidence = max(0.0, min(1.0, float(confidence)))
|
22
|
-
# raise ValueError("Category confidence must be a float between 0.0 and 1.0.")
|
23
|
-
|
24
|
-
self.label = str(label)
|
25
|
-
self.confidence = float(confidence)
|
26
|
-
|
27
|
-
def __repr__(self):
|
28
|
-
return f"<CategoryScore label='{self.label}' confidence={self.confidence:.3f}>"
|
12
|
+
"""Represents a category and its confidence score from classification."""
|
13
|
+
|
14
|
+
category: str
|
15
|
+
score: float
|
16
|
+
|
17
|
+
def to_dict(self) -> Dict[str, Any]:
|
18
|
+
"""Convert to dictionary for serialization."""
|
19
|
+
return {"category": self.category, "score": self.score}
|
29
20
|
|
21
|
+
|
22
|
+
@dataclass
|
30
23
|
class ClassificationResult:
|
31
|
-
"""
|
24
|
+
"""Results from a classification operation."""
|
25
|
+
|
26
|
+
category: str
|
27
|
+
score: float
|
28
|
+
scores: List[CategoryScore]
|
32
29
|
model_id: str
|
33
|
-
using: str # Renamed from engine_type ('text' or 'vision')
|
34
30
|
timestamp: datetime
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
def __init__(
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
31
|
+
using: str # 'text' or 'vision'
|
32
|
+
parameters: Optional[Dict[str, Any]] = None
|
33
|
+
|
34
|
+
def __init__(
|
35
|
+
self,
|
36
|
+
category: str,
|
37
|
+
score: float,
|
38
|
+
scores: List[CategoryScore],
|
39
|
+
model_id: str,
|
40
|
+
using: str,
|
41
|
+
parameters: Optional[Dict[str, Any]] = None,
|
42
|
+
timestamp: Optional[datetime] = None,
|
43
|
+
):
|
44
|
+
self.category = category
|
45
|
+
self.score = score
|
46
|
+
self.scores = scores
|
47
|
+
self.model_id = model_id
|
48
|
+
self.using = using
|
49
|
+
self.parameters = parameters or {}
|
50
|
+
self.timestamp = timestamp or datetime.now()
|
51
|
+
|
52
|
+
def to_dict(self) -> Dict[str, Any]:
|
53
|
+
"""
|
54
|
+
Convert the classification result to a dictionary for serialization.
|
55
|
+
|
56
|
+
Returns:
|
57
|
+
Dictionary representation of the classification result
|
58
|
+
"""
|
59
|
+
return {
|
60
|
+
"category": self.category,
|
61
|
+
"score": self.score,
|
62
|
+
"scores": [s.to_dict() for s in self.scores],
|
63
|
+
"model_id": self.model_id,
|
64
|
+
"using": self.using,
|
65
|
+
"parameters": self.parameters,
|
66
|
+
"timestamp": self.timestamp.isoformat(),
|
67
|
+
}
|
48
68
|
|
49
69
|
@property
|
50
|
-
def top_category(self) ->
|
51
|
-
"""Returns the
|
52
|
-
return self.
|
70
|
+
def top_category(self) -> str:
|
71
|
+
"""Returns the category with the highest score."""
|
72
|
+
return self.category
|
53
73
|
|
54
74
|
@property
|
55
|
-
def top_confidence(self) ->
|
75
|
+
def top_confidence(self) -> float:
|
56
76
|
"""Returns the confidence score of the top category."""
|
57
|
-
return self.
|
77
|
+
return self.score
|
58
78
|
|
59
|
-
def __repr__(self):
|
60
|
-
|
61
|
-
num_scores = len(self.scores)
|
62
|
-
return f"<ClassificationResult model='{self.model_id}' using='{self.using}' scores={num_scores}{top_cat}>"
|
79
|
+
def __repr__(self) -> str:
|
80
|
+
return f"<ClassificationResult category='{self.category}' score={self.score:.3f} model='{self.model_id}'>"
|
@@ -1,10 +1,38 @@
|
|
1
1
|
import logging
|
2
|
-
from typing import Callable, Iterable,
|
2
|
+
from typing import Any, Callable, Iterable, TypeVar
|
3
|
+
|
3
4
|
from tqdm.auto import tqdm
|
4
5
|
|
5
6
|
logger = logging.getLogger(__name__)
|
6
7
|
|
7
|
-
T = TypeVar("T")
|
8
|
+
T = TypeVar("T") # Generic type for items in the collection
|
9
|
+
|
10
|
+
|
11
|
+
class DirectionalCollectionMixin:
|
12
|
+
"""
|
13
|
+
Mixin providing directional methods for collections of elements/regions.
|
14
|
+
"""
|
15
|
+
|
16
|
+
def below(self, **kwargs) -> "ElementCollection":
|
17
|
+
"""Find regions below all elements in this collection."""
|
18
|
+
return self.apply(lambda element: element.below(**kwargs))
|
19
|
+
|
20
|
+
def above(self, **kwargs) -> "ElementCollection":
|
21
|
+
"""Find regions above all elements in this collection."""
|
22
|
+
return self.apply(lambda element: element.above(**kwargs))
|
23
|
+
|
24
|
+
def left(self, **kwargs) -> "ElementCollection":
|
25
|
+
"""Find regions to the left of all elements in this collection."""
|
26
|
+
return self.apply(lambda element: element.left(**kwargs))
|
27
|
+
|
28
|
+
def right(self, **kwargs) -> "ElementCollection":
|
29
|
+
"""Find regions to the right of all elements in this collection."""
|
30
|
+
return self.apply(lambda element: element.right(**kwargs))
|
31
|
+
|
32
|
+
def expand(self, **kwargs) -> "ElementCollection":
|
33
|
+
"""Expand all elements in this collection."""
|
34
|
+
return self.apply(lambda element: element.expand(**kwargs))
|
35
|
+
|
8
36
|
|
9
37
|
class ApplyMixin:
|
10
38
|
"""
|
@@ -13,6 +41,7 @@ class ApplyMixin:
|
|
13
41
|
Assumes the inheriting class implements `__iter__` and `__len__` appropriately
|
14
42
|
for the items to be processed by `apply`.
|
15
43
|
"""
|
44
|
+
|
16
45
|
def _get_items_for_apply(self) -> Iterable[Any]:
|
17
46
|
"""
|
18
47
|
Returns the iterable of items to apply the function to.
|
@@ -22,7 +51,7 @@ class ApplyMixin:
|
|
22
51
|
# Default to standard iteration over the collection itself
|
23
52
|
return iter(self)
|
24
53
|
|
25
|
-
def apply(self: Any, func: Callable[[Any, ...], Any], *args, **kwargs) ->
|
54
|
+
def apply(self: Any, func: Callable[[Any, ...], Any], *args, **kwargs) -> Iterable[Any]:
|
26
55
|
"""
|
27
56
|
Applies a function to each item in the collection.
|
28
57
|
|
@@ -34,7 +63,7 @@ class ApplyMixin:
|
|
34
63
|
A special keyword argument 'show_progress' (bool, default=False)
|
35
64
|
can be used to display a progress bar.
|
36
65
|
"""
|
37
|
-
show_progress = kwargs.pop(
|
66
|
+
show_progress = kwargs.pop("show_progress", False)
|
38
67
|
# Derive unit name from class name
|
39
68
|
unit_name = self.__class__.__name__.lower()
|
40
69
|
items_iterable = self._get_items_for_apply()
|
@@ -42,22 +71,41 @@ class ApplyMixin:
|
|
42
71
|
# Need total count for tqdm, assumes __len__ is implemented by the inheriting class
|
43
72
|
total_items = 0
|
44
73
|
try:
|
45
|
-
|
46
|
-
except TypeError:
|
47
|
-
|
74
|
+
total_items = len(self)
|
75
|
+
except TypeError: # Handle cases where __len__ might not be defined on self
|
76
|
+
logger.warning(f"Could not determine collection length for progress bar.")
|
48
77
|
|
49
78
|
if show_progress and total_items > 0:
|
50
|
-
items_iterable = tqdm(
|
79
|
+
items_iterable = tqdm(
|
80
|
+
items_iterable, total=total_items, desc=f"Applying {func.__name__}", unit=unit_name
|
81
|
+
)
|
51
82
|
elif show_progress:
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
83
|
+
logger.info(
|
84
|
+
f"Applying {func.__name__} (progress bar disabled for zero/unknown length)."
|
85
|
+
)
|
86
|
+
|
87
|
+
results = [func(item, *args, **kwargs) for item in items_iterable]
|
88
|
+
|
89
|
+
# If results is empty, return an empty list
|
90
|
+
if not results:
|
91
|
+
return []
|
92
|
+
|
93
|
+
# Import here to avoid circular imports
|
94
|
+
from natural_pdf import PDF, Page
|
95
|
+
from natural_pdf.collections.pdf_collection import PDFCollection
|
96
|
+
from natural_pdf.elements.base import Element
|
97
|
+
from natural_pdf.elements.collections import ElementCollection, PageCollection
|
98
|
+
from natural_pdf.elements.region import Region
|
99
|
+
|
100
|
+
first_non_none = next((r for r in results if r is not None), None)
|
101
|
+
first_type = type(first_non_none) if first_non_none is not None else None
|
102
|
+
|
103
|
+
# Return the appropriate collection based on result type (...generally)
|
104
|
+
if issubclass(first_type, Element) or issubclass(first_type, Region):
|
105
|
+
return ElementCollection(results)
|
106
|
+
elif first_type == PDF:
|
107
|
+
return PDFCollection(results)
|
108
|
+
elif first_type == Page:
|
109
|
+
return PageCollection(results)
|
110
|
+
|
111
|
+
return results
|