natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134)
  1. natural_pdf/__init__.py +1 -0
  2. natural_pdf/analyzers/layout/base.py +1 -5
  3. natural_pdf/analyzers/layout/gemini.py +61 -51
  4. natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
  5. natural_pdf/analyzers/layout/layout_manager.py +26 -84
  6. natural_pdf/analyzers/layout/layout_options.py +7 -0
  7. natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
  8. natural_pdf/analyzers/layout/surya.py +46 -123
  9. natural_pdf/analyzers/layout/tatr.py +51 -4
  10. natural_pdf/analyzers/text_structure.py +3 -5
  11. natural_pdf/analyzers/utils.py +3 -3
  12. natural_pdf/classification/manager.py +230 -151
  13. natural_pdf/classification/mixin.py +49 -35
  14. natural_pdf/classification/results.py +64 -46
  15. natural_pdf/collections/mixins.py +68 -20
  16. natural_pdf/collections/pdf_collection.py +177 -64
  17. natural_pdf/core/element_manager.py +30 -14
  18. natural_pdf/core/highlighting_service.py +13 -22
  19. natural_pdf/core/page.py +423 -101
  20. natural_pdf/core/pdf.py +633 -190
  21. natural_pdf/elements/base.py +134 -40
  22. natural_pdf/elements/collections.py +503 -131
  23. natural_pdf/elements/region.py +659 -90
  24. natural_pdf/elements/text.py +1 -1
  25. natural_pdf/export/mixin.py +137 -0
  26. natural_pdf/exporters/base.py +3 -3
  27. natural_pdf/exporters/paddleocr.py +4 -3
  28. natural_pdf/extraction/manager.py +50 -49
  29. natural_pdf/extraction/mixin.py +90 -57
  30. natural_pdf/extraction/result.py +9 -23
  31. natural_pdf/ocr/__init__.py +5 -5
  32. natural_pdf/ocr/engine_doctr.py +346 -0
  33. natural_pdf/ocr/ocr_factory.py +24 -4
  34. natural_pdf/ocr/ocr_manager.py +61 -25
  35. natural_pdf/ocr/ocr_options.py +70 -10
  36. natural_pdf/ocr/utils.py +6 -4
  37. natural_pdf/search/__init__.py +20 -34
  38. natural_pdf/search/haystack_search_service.py +309 -265
  39. natural_pdf/search/haystack_utils.py +99 -75
  40. natural_pdf/search/search_service_protocol.py +11 -12
  41. natural_pdf/selectors/parser.py +219 -143
  42. natural_pdf/utils/debug.py +3 -3
  43. natural_pdf/utils/identifiers.py +1 -1
  44. natural_pdf/utils/locks.py +1 -1
  45. natural_pdf/utils/packaging.py +8 -6
  46. natural_pdf/utils/text_extraction.py +24 -16
  47. natural_pdf/utils/tqdm_utils.py +18 -10
  48. natural_pdf/utils/visualization.py +18 -0
  49. natural_pdf/widgets/viewer.py +4 -25
  50. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +12 -3
  51. natural_pdf-0.1.9.dist-info/RECORD +80 -0
  52. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
  53. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
  54. docs/api/index.md +0 -386
  55. docs/assets/favicon.png +0 -3
  56. docs/assets/favicon.svg +0 -3
  57. docs/assets/javascripts/custom.js +0 -17
  58. docs/assets/logo.svg +0 -3
  59. docs/assets/sample-screen.png +0 -0
  60. docs/assets/social-preview.png +0 -17
  61. docs/assets/social-preview.svg +0 -17
  62. docs/assets/stylesheets/custom.css +0 -65
  63. docs/categorizing-documents/index.md +0 -168
  64. docs/data-extraction/index.md +0 -87
  65. docs/document-qa/index.ipynb +0 -435
  66. docs/document-qa/index.md +0 -79
  67. docs/element-selection/index.ipynb +0 -969
  68. docs/element-selection/index.md +0 -249
  69. docs/finetuning/index.md +0 -176
  70. docs/index.md +0 -189
  71. docs/installation/index.md +0 -69
  72. docs/interactive-widget/index.ipynb +0 -962
  73. docs/interactive-widget/index.md +0 -12
  74. docs/layout-analysis/index.ipynb +0 -818
  75. docs/layout-analysis/index.md +0 -185
  76. docs/ocr/index.md +0 -256
  77. docs/pdf-navigation/index.ipynb +0 -314
  78. docs/pdf-navigation/index.md +0 -97
  79. docs/regions/index.ipynb +0 -816
  80. docs/regions/index.md +0 -294
  81. docs/tables/index.ipynb +0 -658
  82. docs/tables/index.md +0 -144
  83. docs/text-analysis/index.ipynb +0 -370
  84. docs/text-analysis/index.md +0 -105
  85. docs/text-extraction/index.ipynb +0 -1478
  86. docs/text-extraction/index.md +0 -292
  87. docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
  88. docs/tutorials/01-loading-and-extraction.md +0 -95
  89. docs/tutorials/02-finding-elements.ipynb +0 -417
  90. docs/tutorials/02-finding-elements.md +0 -149
  91. docs/tutorials/03-extracting-blocks.ipynb +0 -152
  92. docs/tutorials/03-extracting-blocks.md +0 -48
  93. docs/tutorials/04-table-extraction.ipynb +0 -119
  94. docs/tutorials/04-table-extraction.md +0 -50
  95. docs/tutorials/05-excluding-content.ipynb +0 -275
  96. docs/tutorials/05-excluding-content.md +0 -109
  97. docs/tutorials/06-document-qa.ipynb +0 -337
  98. docs/tutorials/06-document-qa.md +0 -91
  99. docs/tutorials/07-layout-analysis.ipynb +0 -293
  100. docs/tutorials/07-layout-analysis.md +0 -66
  101. docs/tutorials/07-working-with-regions.ipynb +0 -414
  102. docs/tutorials/07-working-with-regions.md +0 -151
  103. docs/tutorials/08-spatial-navigation.ipynb +0 -513
  104. docs/tutorials/08-spatial-navigation.md +0 -190
  105. docs/tutorials/09-section-extraction.ipynb +0 -2439
  106. docs/tutorials/09-section-extraction.md +0 -256
  107. docs/tutorials/10-form-field-extraction.ipynb +0 -517
  108. docs/tutorials/10-form-field-extraction.md +0 -201
  109. docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
  110. docs/tutorials/11-enhanced-table-processing.md +0 -9
  111. docs/tutorials/12-ocr-integration.ipynb +0 -3712
  112. docs/tutorials/12-ocr-integration.md +0 -137
  113. docs/tutorials/13-semantic-search.ipynb +0 -1718
  114. docs/tutorials/13-semantic-search.md +0 -77
  115. docs/visual-debugging/index.ipynb +0 -2970
  116. docs/visual-debugging/index.md +0 -157
  117. docs/visual-debugging/region.png +0 -0
  118. natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
  119. natural_pdf/templates/spa/css/style.css +0 -334
  120. natural_pdf/templates/spa/index.html +0 -31
  121. natural_pdf/templates/spa/js/app.js +0 -472
  122. natural_pdf/templates/spa/words.txt +0 -235976
  123. natural_pdf/widgets/frontend/viewer.js +0 -88
  124. natural_pdf-0.1.8.dist-info/RECORD +0 -156
  125. notebooks/Examples.ipynb +0 -1293
  126. pdfs/.gitkeep +0 -0
  127. pdfs/01-practice.pdf +0 -543
  128. pdfs/0500000US42001.pdf +0 -0
  129. pdfs/0500000US42007.pdf +0 -0
  130. pdfs/2014 Statistics.pdf +0 -0
  131. pdfs/2019 Statistics.pdf +0 -0
  132. pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
  133. pdfs/needs-ocr.pdf +0 -0
  134. {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -3,21 +3,23 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Un
3
3
 
4
4
  # Assuming PIL is installed as it's needed for vision
5
5
  try:
6
- from PIL import Image
6
+ from PIL import Image
7
7
  except ImportError:
8
- Image = None # type: ignore
8
+ Image = None # type: ignore
9
9
 
10
10
  # Import result classes
11
- from .results import ClassificationResult # Assuming results.py is in the same dir
11
+ from .results import ClassificationResult # Assuming results.py is in the same dir
12
12
 
13
13
  if TYPE_CHECKING:
14
14
  # Avoid runtime import cycle
15
15
  from natural_pdf.core.page import Page
16
16
  from natural_pdf.elements.region import Region
17
+
17
18
  from .manager import ClassificationManager
18
19
 
19
20
  logger = logging.getLogger(__name__)
20
21
 
22
+
21
23
  class ClassificationMixin:
22
24
  """
23
25
  Mixin class providing classification capabilities to Page and Region objects.
@@ -38,18 +40,18 @@ class ClassificationMixin:
38
40
  # Host class needs 'analyses' attribute initialized as Dict[str, Any]
39
41
  # analyses: Dict[str, Any]
40
42
 
41
- # --- End Abstract --- #
43
+ # --- End Abstract --- #
42
44
 
43
45
  def classify(
44
46
  self,
45
47
  categories: List[str],
46
- model: Optional[str] = None, # Default handled by manager
47
- using: Optional[str] = None, # Renamed parameter
48
+ model: Optional[str] = None, # Default handled by manager
49
+ using: Optional[str] = None, # Renamed parameter
48
50
  min_confidence: float = 0.0,
49
- analysis_key: str = 'classification', # Default key
51
+ analysis_key: str = "classification", # Default key
50
52
  multi_label: bool = False,
51
- **kwargs
52
- ) -> "ClassificationMixin": # Return self for chaining
53
+ **kwargs,
54
+ ) -> "ClassificationMixin": # Return self for chaining
53
55
  """
54
56
  Classifies this item (Page or Region) using the configured manager.
55
57
 
@@ -71,22 +73,30 @@ class ClassificationMixin:
71
73
  Self for method chaining.
72
74
  """
73
75
  # Ensure analyses dict exists
74
- if not hasattr(self, 'analyses') or self.analyses is None:
75
- logger.warning("'analyses' attribute not found or is None. Initializing as empty dict.")
76
- self.analyses = {}
76
+ if not hasattr(self, "analyses") or self.analyses is None:
77
+ logger.warning("'analyses' attribute not found or is None. Initializing as empty dict.")
78
+ self.analyses = {}
77
79
 
78
80
  try:
79
81
  manager = self._get_classification_manager()
80
-
82
+
81
83
  # Determine the effective model ID and engine type
82
84
  effective_model_id = model
83
- inferred_using = manager.infer_using(model if model else manager.DEFAULT_TEXT_MODEL, using)
85
+ inferred_using = manager.infer_using(
86
+ model if model else manager.DEFAULT_TEXT_MODEL, using
87
+ )
84
88
 
85
89
  # If model was not provided, use the manager's default for the inferred engine type
86
90
  if effective_model_id is None:
87
- effective_model_id = manager.DEFAULT_TEXT_MODEL if inferred_using == 'text' else manager.DEFAULT_VISION_MODEL
88
- logger.debug(f"No model provided, using default for mode '{inferred_using}': '{effective_model_id}'")
89
-
91
+ effective_model_id = (
92
+ manager.DEFAULT_TEXT_MODEL
93
+ if inferred_using == "text"
94
+ else manager.DEFAULT_VISION_MODEL
95
+ )
96
+ logger.debug(
97
+ f"No model provided, using default for mode '{inferred_using}': '{effective_model_id}'"
98
+ )
99
+
90
100
  # Get content based on the *final* determined engine type
91
101
  content = self._get_classification_content(model_type=inferred_using, **kwargs)
92
102
 
@@ -94,11 +104,11 @@ class ClassificationMixin:
94
104
  result_obj: ClassificationResult = manager.classify_item(
95
105
  item_content=content,
96
106
  categories=categories,
97
- model_id=effective_model_id, # Pass the resolved model ID
98
- using=inferred_using, # Pass renamed argument
107
+ model_id=effective_model_id, # Pass the resolved model ID
108
+ using=inferred_using, # Pass renamed argument
99
109
  min_confidence=min_confidence,
100
110
  multi_label=multi_label,
101
- **kwargs
111
+ **kwargs,
102
112
  )
103
113
 
104
114
  # Store the structured result object under the specified key
@@ -106,8 +116,8 @@ class ClassificationMixin:
106
116
  logger.debug(f"Stored classification result under key '{analysis_key}': {result_obj}")
107
117
 
108
118
  except NotImplementedError as nie:
109
- logger.error(f"Classification cannot proceed: {nie}")
110
- raise
119
+ logger.error(f"Classification cannot proceed: {nie}")
120
+ raise
111
121
  except Exception as e:
112
122
  logger.error(f"Classification failed: {e}", exc_info=True)
113
123
  # Optionally re-raise or just log and return self
@@ -118,32 +128,36 @@ class ClassificationMixin:
118
128
  @property
119
129
  def classification_results(self) -> Optional[ClassificationResult]:
120
130
  """Returns the ClassificationResult from the *default* ('classification') key, or None."""
121
- if not hasattr(self, 'analyses') or self.analyses is None:
131
+ if not hasattr(self, "analyses") or self.analyses is None:
122
132
  return None
123
133
  # Return the result object directly from the default key
124
- return self.analyses.get('classification')
134
+ return self.analyses.get("classification")
125
135
 
126
136
  @property
127
137
  def category(self) -> Optional[str]:
128
138
  """Returns the top category label from the *default* ('classification') key, or None."""
129
- result_obj = self.classification_results # Uses the property above
139
+ result_obj = self.classification_results # Uses the property above
130
140
  # Access the property on the result object
131
141
  return result_obj.top_category if result_obj else None
132
142
 
133
143
  @property
134
144
  def category_confidence(self) -> Optional[float]:
135
145
  """Returns the top category confidence from the *default* ('classification') key, or None."""
136
- result_obj = self.classification_results # Uses the property above
146
+ result_obj = self.classification_results # Uses the property above
137
147
  # Access the property on the result object
138
148
  return result_obj.top_confidence if result_obj else None
139
149
 
140
150
  # Maybe add a helper to get results by specific key?
141
- def get_classification_result(self, analysis_key: str = 'classification') -> Optional[ClassificationResult]:
142
- """Gets a classification result object stored under a specific key."""
143
- if not hasattr(self, 'analyses') or self.analyses is None:
144
- return None
145
- result = self.analyses.get(analysis_key)
146
- if result is not None and not isinstance(result, ClassificationResult):
147
- logger.warning(f"Item found under key '{analysis_key}' is not a ClassificationResult (type: {type(result)}). Returning None.")
148
- return None
149
- return result
151
+ def get_classification_result(
152
+ self, analysis_key: str = "classification"
153
+ ) -> Optional[ClassificationResult]:
154
+ """Gets a classification result object stored under a specific key."""
155
+ if not hasattr(self, "analyses") or self.analyses is None:
156
+ return None
157
+ result = self.analyses.get(analysis_key)
158
+ if result is not None and not isinstance(result, ClassificationResult):
159
+ logger.warning(
160
+ f"Item found under key '{analysis_key}' is not a ClassificationResult (type: {type(result)}). Returning None."
161
+ )
162
+ return None
163
+ return result
@@ -1,62 +1,80 @@
1
1
  # natural_pdf/classification/results.py
2
- from typing import List, Optional, Dict, Any
3
- from datetime import datetime
4
2
  import logging
3
+ from dataclasses import dataclass
4
+ from datetime import datetime
5
+ from typing import Any, Dict, List, Optional
5
6
 
6
7
  logger = logging.getLogger(__name__)
7
8
 
9
+
10
+ @dataclass
8
11
  class CategoryScore:
9
- """Represents the score for a single category."""
10
- label: str
11
- confidence: float # Score between 0.0 and 1.0
12
-
13
- def __init__(self, label: str, confidence: float):
14
- # Basic validation
15
- if not isinstance(label, str) or not label:
16
- logger.warning(f"Initializing CategoryScore with invalid label: {label}")
17
- # Fallback or raise? For now, allow but log.
18
- # raise ValueError("Category label must be a non-empty string.")
19
- if not isinstance(confidence, (float, int)) or not (0.0 <= confidence <= 1.0):
20
- logger.warning(f"Initializing CategoryScore with invalid confidence: {confidence} for label '{label}'. Clamping to [0, 1].")
21
- confidence = max(0.0, min(1.0, float(confidence)))
22
- # raise ValueError("Category confidence must be a float between 0.0 and 1.0.")
23
-
24
- self.label = str(label)
25
- self.confidence = float(confidence)
26
-
27
- def __repr__(self):
28
- return f"<CategoryScore label='{self.label}' confidence={self.confidence:.3f}>"
12
+ """Represents a category and its confidence score from classification."""
13
+
14
+ category: str
15
+ score: float
16
+
17
+ def to_dict(self) -> Dict[str, Any]:
18
+ """Convert to dictionary for serialization."""
19
+ return {"category": self.category, "score": self.score}
29
20
 
21
+
22
+ @dataclass
30
23
  class ClassificationResult:
31
- """Holds the structured results of a classification task."""
24
+ """Results from a classification operation."""
25
+
26
+ category: str
27
+ score: float
28
+ scores: List[CategoryScore]
32
29
  model_id: str
33
- using: str # Renamed from engine_type ('text' or 'vision')
34
30
  timestamp: datetime
35
- parameters: Dict[str, Any] # e.g., {'categories': [...], 'min_confidence': 0.1}
36
- scores: List[CategoryScore] # List of scores above threshold, sorted by confidence
37
-
38
- def __init__(self, model_id: str, using: str, timestamp: datetime, parameters: Dict[str, Any], scores: List[CategoryScore]):
39
- if not isinstance(scores, list) or not all(isinstance(s, CategoryScore) for s in scores):
40
- raise TypeError("Scores must be a list of CategoryScore objects.")
41
-
42
- self.model_id = str(model_id)
43
- self.using = str(using) # Renamed from engine_type
44
- self.timestamp = timestamp
45
- self.parameters = parameters if parameters is not None else {}
46
- # Ensure scores are sorted descending by confidence
47
- self.scores = sorted(scores, key=lambda s: s.confidence, reverse=True)
31
+ using: str # 'text' or 'vision'
32
+ parameters: Optional[Dict[str, Any]] = None
33
+
34
+ def __init__(
35
+ self,
36
+ category: str,
37
+ score: float,
38
+ scores: List[CategoryScore],
39
+ model_id: str,
40
+ using: str,
41
+ parameters: Optional[Dict[str, Any]] = None,
42
+ timestamp: Optional[datetime] = None,
43
+ ):
44
+ self.category = category
45
+ self.score = score
46
+ self.scores = scores
47
+ self.model_id = model_id
48
+ self.using = using
49
+ self.parameters = parameters or {}
50
+ self.timestamp = timestamp or datetime.now()
51
+
52
+ def to_dict(self) -> Dict[str, Any]:
53
+ """
54
+ Convert the classification result to a dictionary for serialization.
55
+
56
+ Returns:
57
+ Dictionary representation of the classification result
58
+ """
59
+ return {
60
+ "category": self.category,
61
+ "score": self.score,
62
+ "scores": [s.to_dict() for s in self.scores],
63
+ "model_id": self.model_id,
64
+ "using": self.using,
65
+ "parameters": self.parameters,
66
+ "timestamp": self.timestamp.isoformat(),
67
+ }
48
68
 
49
69
  @property
50
- def top_category(self) -> Optional[str]:
51
- """Returns the label of the category with the highest confidence."""
52
- return self.scores[0].label if self.scores else None
70
+ def top_category(self) -> str:
71
+ """Returns the category with the highest score."""
72
+ return self.category
53
73
 
54
74
  @property
55
- def top_confidence(self) -> Optional[float]:
75
+ def top_confidence(self) -> float:
56
76
  """Returns the confidence score of the top category."""
57
- return self.scores[0].confidence if self.scores else None
77
+ return self.score
58
78
 
59
- def __repr__(self):
60
- top_cat = f" top='{self.top_category}' ({self.top_confidence:.2f})" if self.scores else ""
61
- num_scores = len(self.scores)
62
- return f"<ClassificationResult model='{self.model_id}' using='{self.using}' scores={num_scores}{top_cat}>"
79
+ def __repr__(self) -> str:
80
+ return f"<ClassificationResult category='{self.category}' score={self.score:.3f} model='{self.model_id}'>"
@@ -1,10 +1,38 @@
1
1
  import logging
2
- from typing import Callable, Iterable, Any, TypeVar
2
+ from typing import Any, Callable, Iterable, TypeVar
3
+
3
4
  from tqdm.auto import tqdm
4
5
 
5
6
  logger = logging.getLogger(__name__)
6
7
 
7
- T = TypeVar("T") # Generic type for items in the collection
8
+ T = TypeVar("T") # Generic type for items in the collection
9
+
10
+
11
+ class DirectionalCollectionMixin:
12
+ """
13
+ Mixin providing directional methods for collections of elements/regions.
14
+ """
15
+
16
+ def below(self, **kwargs) -> "ElementCollection":
17
+ """Find regions below all elements in this collection."""
18
+ return self.apply(lambda element: element.below(**kwargs))
19
+
20
+ def above(self, **kwargs) -> "ElementCollection":
21
+ """Find regions above all elements in this collection."""
22
+ return self.apply(lambda element: element.above(**kwargs))
23
+
24
+ def left(self, **kwargs) -> "ElementCollection":
25
+ """Find regions to the left of all elements in this collection."""
26
+ return self.apply(lambda element: element.left(**kwargs))
27
+
28
+ def right(self, **kwargs) -> "ElementCollection":
29
+ """Find regions to the right of all elements in this collection."""
30
+ return self.apply(lambda element: element.right(**kwargs))
31
+
32
+ def expand(self, **kwargs) -> "ElementCollection":
33
+ """Expand all elements in this collection."""
34
+ return self.apply(lambda element: element.expand(**kwargs))
35
+
8
36
 
9
37
  class ApplyMixin:
10
38
  """
@@ -13,6 +41,7 @@ class ApplyMixin:
13
41
  Assumes the inheriting class implements `__iter__` and `__len__` appropriately
14
42
  for the items to be processed by `apply`.
15
43
  """
44
+
16
45
  def _get_items_for_apply(self) -> Iterable[Any]:
17
46
  """
18
47
  Returns the iterable of items to apply the function to.
@@ -22,7 +51,7 @@ class ApplyMixin:
22
51
  # Default to standard iteration over the collection itself
23
52
  return iter(self)
24
53
 
25
- def apply(self: Any, func: Callable[[Any, ...], Any], *args, **kwargs) -> None:
54
+ def apply(self: Any, func: Callable[[Any, ...], Any], *args, **kwargs) -> Iterable[Any]:
26
55
  """
27
56
  Applies a function to each item in the collection.
28
57
 
@@ -34,7 +63,7 @@ class ApplyMixin:
34
63
  A special keyword argument 'show_progress' (bool, default=False)
35
64
  can be used to display a progress bar.
36
65
  """
37
- show_progress = kwargs.pop('show_progress', False)
66
+ show_progress = kwargs.pop("show_progress", False)
38
67
  # Derive unit name from class name
39
68
  unit_name = self.__class__.__name__.lower()
40
69
  items_iterable = self._get_items_for_apply()
@@ -42,22 +71,41 @@ class ApplyMixin:
42
71
  # Need total count for tqdm, assumes __len__ is implemented by the inheriting class
43
72
  total_items = 0
44
73
  try:
45
- total_items = len(self)
46
- except TypeError: # Handle cases where __len__ might not be defined on self
47
- logger.warning(f"Could not determine collection length for progress bar.")
74
+ total_items = len(self)
75
+ except TypeError: # Handle cases where __len__ might not be defined on self
76
+ logger.warning(f"Could not determine collection length for progress bar.")
48
77
 
49
78
  if show_progress and total_items > 0:
50
- items_iterable = tqdm(items_iterable, total=total_items, desc=f"Applying {func.__name__}", unit=unit_name)
79
+ items_iterable = tqdm(
80
+ items_iterable, total=total_items, desc=f"Applying {func.__name__}", unit=unit_name
81
+ )
51
82
  elif show_progress:
52
- logger.info(f"Applying {func.__name__} (progress bar disabled for zero/unknown length).")
53
-
54
- for item in items_iterable:
55
- try:
56
- # Apply the function with the item and any extra args/kwargs
57
- func(item, *args, **kwargs)
58
- except Exception as e:
59
- # Log and continue for batch operations
60
- logger.error(f"Error applying {func.__name__} to {item}: {e}", exc_info=True)
61
- # Optionally add a mechanism to collect errors
62
-
63
- # Returns None, primarily used for side effects.
83
+ logger.info(
84
+ f"Applying {func.__name__} (progress bar disabled for zero/unknown length)."
85
+ )
86
+
87
+ results = [func(item, *args, **kwargs) for item in items_iterable]
88
+
89
+ # If results is empty, return an empty list
90
+ if not results:
91
+ return []
92
+
93
+ # Import here to avoid circular imports
94
+ from natural_pdf import PDF, Page
95
+ from natural_pdf.collections.pdf_collection import PDFCollection
96
+ from natural_pdf.elements.base import Element
97
+ from natural_pdf.elements.collections import ElementCollection, PageCollection
98
+ from natural_pdf.elements.region import Region
99
+
100
+ first_non_none = next((r for r in results if r is not None), None)
101
+ first_type = type(first_non_none) if first_non_none is not None else None
102
+
103
+ # Return the appropriate collection based on result type (...generally)
104
+ if issubclass(first_type, Element) or issubclass(first_type, Region):
105
+ return ElementCollection(results)
106
+ elif first_type == PDF:
107
+ return PDFCollection(results)
108
+ elif first_type == Page:
109
+ return PageCollection(results)
110
+
111
+ return results