natural-pdf 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. natural_pdf/__init__.py +29 -40
  2. natural_pdf/analyzers/text_options.py +9 -1
  3. natural_pdf/analyzers/text_structure.py +371 -58
  4. natural_pdf/classification/manager.py +1 -1
  5. natural_pdf/core/element_manager.py +11 -1
  6. natural_pdf/core/highlighting_service.py +120 -40
  7. natural_pdf/core/page.py +20 -18
  8. natural_pdf/core/pdf.py +146 -13
  9. natural_pdf/elements/base.py +17 -0
  10. natural_pdf/elements/collections.py +374 -30
  11. natural_pdf/elements/region.py +45 -14
  12. natural_pdf/exporters/data/__init__.py +0 -0
  13. natural_pdf/exporters/data/pdf.ttf +0 -0
  14. natural_pdf/exporters/data/sRGB.icc +0 -0
  15. natural_pdf/exporters/hocr.py +519 -0
  16. natural_pdf/exporters/hocr_font.py +136 -0
  17. natural_pdf/exporters/original_pdf.py +127 -0
  18. natural_pdf/exporters/searchable_pdf.py +2 -12
  19. natural_pdf/ocr/engine_surya.py +1 -1
  20. natural_pdf/search/__init__.py +65 -52
  21. natural_pdf/search/lancedb_search_service.py +325 -0
  22. natural_pdf/search/numpy_search_service.py +255 -0
  23. natural_pdf/search/searchable_mixin.py +25 -71
  24. natural_pdf/widgets/viewer.py +22 -31
  25. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/METADATA +54 -50
  26. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/RECORD +29 -23
  27. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/WHEEL +1 -1
  28. natural_pdf/search/haystack_search_service.py +0 -687
  29. natural_pdf/search/haystack_utils.py +0 -474
  30. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/licenses/LICENSE +0 -0
  31. {natural_pdf-0.1.10.dist-info → natural_pdf-0.1.12.dist-info}/top_level.txt +0 -0
natural_pdf/search/numpy_search_service.py

@@ -0,0 +1,255 @@
+ import logging
+ import numpy as np
+ import json
+ from pathlib import Path
+ from typing import Any, Dict, Iterable, List, Optional, Union
+
+ from sentence_transformers import SentenceTransformer
+
+ from .search_options import BaseSearchOptions
+ from .search_service_protocol import (
+     Indexable,
+     IndexConfigurationError,
+     SearchServiceProtocol,
+ )
+
+ logger = logging.getLogger(__name__)
+
+ DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+
+ class NumpySearchService(SearchServiceProtocol):
+     """Basic in-memory vector search implementation using NumPy."""
+
+     collection_name: str
+
+     def __init__(
+         self,
+         collection_name: str,
+         persist: bool = False,
+         uri: Optional[str] = None,
+         embedding_model_name: str = DEFAULT_EMBEDDING_MODEL,
+     ):
+         if persist:
+             raise RuntimeError(
+                 "Persistence requested but LanceDB is not installed. "
+                 "For persistent vector search, install LanceDB: pip install lancedb"
+             )
+
+         self.collection_name = collection_name
+         self._embedding_model_name = embedding_model_name
+         self.embedding_model = SentenceTransformer(self._embedding_model_name)
+         self._embedding_dims = len(self.embedding_model.encode("test"))
+
+         # Simple in-memory storage
+         self._vectors = []
+         self._documents = []
+         self._metadata = []
+         self._ids = []
+
+         logger.info(f"NumpySearchService initialized for collection '{collection_name}' with model '{embedding_model_name}'")
+
+     def index(
+         self,
+         documents: Iterable[Indexable],
+         embedder_device: Optional[str] = None,
+         force_reindex: bool = False,
+     ) -> None:
+         if force_reindex:
+             logger.info(f"Force reindex requested for collection '{self.collection_name}'. Clearing in-memory vectors.")
+             self._vectors = []
+             self._documents = []
+             self._metadata = []
+             self._ids = []
+
+         items = list(documents)
+         logger.info(f"Indexing {len(items)} documents for collection '{self.collection_name}'")
+
+         if not items:
+             logger.warning("No documents provided for indexing. Skipping.")
+             return
+
+         texts_to_embed = []
+         items_info = []
+
+         for item in items:
+             doc_id = item.get_id()
+             metadata = item.get_metadata().copy()
+             content_obj = item.get_content()
+             content_text = ""
+
+             if isinstance(content_obj, str):
+                 content_text = content_obj
+             elif hasattr(content_obj, "extract_text") and callable(getattr(content_obj, "extract_text")):
+                 content_text = content_obj.extract_text()
+                 if not isinstance(content_text, str):
+                     content_text = str(content_obj)
+             else:
+                 content_text = str(content_obj)
+
+             # Try to add content hash to metadata
+             try:
+                 content_hash = item.get_content_hash()
+                 if content_hash:
+                     metadata["content_hash"] = content_hash
+             except (AttributeError, NotImplementedError):
+                 pass
+             except Exception as e:
+                 logger.warning(f"Error getting content_hash for item ID '{doc_id}': {e}")
+
+             texts_to_embed.append(content_text)
+             items_info.append({
+                 "id": doc_id,
+                 "metadata": metadata,
+                 "text": content_text
+             })
+
+         if not texts_to_embed:
+             logger.warning("No text content to embed. Skipping.")
+             return
+
+         logger.info(f"Embedding {len(texts_to_embed)} documents using '{self._embedding_model_name}'...")
+         generated_embeddings = self.embedding_model.encode(
+             texts_to_embed, device=embedder_device, show_progress_bar=len(texts_to_embed) > 10
+         )
+
+         for i, item_info in enumerate(items_info):
+             self._vectors.append(generated_embeddings[i])
+             self._documents.append(item_info["text"])
+             self._metadata.append(item_info["metadata"])
+             self._ids.append(item_info["id"])
+
+         logger.info(f"Successfully indexed {len(texts_to_embed)} documents. Total count: {len(self._vectors)}")
+
+     def search(
+         self,
+         query: Any,
+         options: BaseSearchOptions,
+     ) -> List[Dict[str, Any]]:
+         if not self._vectors:
+             logger.debug("No vectors in index. Returning empty results.")
+             return []
+
+         # Process query to text
+         query_text = ""
+         if isinstance(query, (str, Path)):
+             query_text = str(query)
+         elif hasattr(query, "extract_text") and callable(getattr(query, "extract_text")):
+             query_text = query.extract_text()
+             if not query_text or not query_text.strip():
+                 return []
+         else:
+             raise TypeError(f"Unsupported query type: {type(query)}")
+
+         logger.info(f"Search request for collection '{self.collection_name}' with query type {type(query).__name__}")
+
+         # Encode query and perform similarity search
+         query_vector = self.embedding_model.encode(query_text)
+
+         # Convert list to numpy array for batch operations
+         vectors_array = np.array(self._vectors)
+
+         # Normalize vectors for cosine similarity
+         query_norm = np.linalg.norm(query_vector)
+         if query_norm > 0:
+             query_vector = query_vector / query_norm
+
+         # Normalize all vectors (avoid division by zero)
+         vector_norms = np.linalg.norm(vectors_array, axis=1, keepdims=True)
+         valid_indices = vector_norms.flatten() > 0
+         vectors_array[valid_indices] = vectors_array[valid_indices] / vector_norms[valid_indices]
+
+         # Calculate cosine similarities
+         similarities = np.dot(vectors_array, query_vector)
+
+         # Apply filters if present
+         filtered_indices = np.arange(len(similarities))
+         if options.filters:
+             # Simple filtering for metadata fields
+             # This is a basic implementation and doesn't support complex filters like LanceDB
+             if isinstance(options.filters, dict):
+                 for field, value in options.filters.items():
+                     new_filtered = []
+                     for i in filtered_indices:
+                         metadata = self._metadata[i]
+                         if field in metadata and metadata[field] == value:
+                             new_filtered.append(i)
+                     filtered_indices = np.array(new_filtered)
+             else:
+                 logger.warning(f"Complex filter expressions not supported in NumPy backend: {options.filters}")
+
+         # Apply filtering and sort by similarity
+         if len(filtered_indices) > 0:
+             filtered_similarities = similarities[filtered_indices]
+             top_k = min(options.top_k, len(filtered_similarities))
+             if top_k == 0:
+                 return []
+
+             top_indices_within_filtered = np.argsort(filtered_similarities)[-top_k:][::-1]
+             top_indices = filtered_indices[top_indices_within_filtered]
+         else:
+             top_k = min(options.top_k, len(similarities))
+             if top_k == 0:
+                 return []
+
+             top_indices = np.argsort(similarities)[-top_k:][::-1]
+
+         # Format results
+         results = []
+         for idx in top_indices:
+             metadata = self._metadata[idx]
+             results.append({
+                 "id": self._ids[idx],
+                 "content_snippet": self._documents[idx][:200] if self._documents[idx] else "",
+                 "score": float(similarities[idx]),
+                 "page_number": metadata.get("page_number"),
+                 "pdf_path": metadata.get("pdf_path"),
+                 "metadata": metadata,
+             })
+
+         logger.info(f"Search returned {len(results)} results from collection '{self.collection_name}'")
+         return results
+
+     def index_exists(self) -> bool:
+         return len(self._vectors) > 0
+
+     def delete_index(self) -> bool:
+         logger.warning(f"Deleting in-memory index for collection '{self.collection_name}'")
+         self._vectors = []
+         self._documents = []
+         self._metadata = []
+         self._ids = []
+         return True
+
+     def list_documents(self, include_metadata: bool = False, **kwargs) -> List[Dict]:
+         logger.debug(f"Listing documents for NumPy collection '{self.collection_name}' (include_metadata={include_metadata})...")
+
+         results = []
+         for i, doc_id in enumerate(self._ids):
+             doc_info = {"id": doc_id}
+             if include_metadata:
+                 doc_info["meta"] = self._metadata[i]
+             results.append(doc_info)
+
+         logger.info(f"Retrieved {len(results)} documents from NumPy collection '{self.collection_name}'")
+         return results
+
+     def delete_documents(self, ids: List[str]) -> None:
+         if not ids:
+             logger.debug("No document IDs provided for deletion. Skipping.")
+             return
+
+         logger.warning(f"Request to delete {len(ids)} documents from NumPy collection '{self.collection_name}'")
+
+         # Find indices to remove
+         keep_indices = []
+         for i, doc_id in enumerate(self._ids):
+             if doc_id not in ids:
+                 keep_indices.append(i)
+
+         # Create new filtered lists
+         self._ids = [self._ids[i] for i in keep_indices]
+         self._vectors = [self._vectors[i] for i in keep_indices]
+         self._documents = [self._documents[i] for i in keep_indices]
+         self._metadata = [self._metadata[i] for i in keep_indices]
+
+         logger.info(f"Deleted documents. Collection now contains {len(self._ids)} documents.")
natural_pdf/search/searchable_mixin.py

@@ -4,7 +4,6 @@ from abc import ABC, abstractmethod
  from typing import TYPE_CHECKING, Any, Dict, Generator, Iterable, List, Optional, Type, Union

  # Now import the flag from the canonical source - this import should always work
- from .haystack_utils import HAS_HAYSTACK_EXTRAS

  DEFAULT_SEARCH_COLLECTION_NAME = "default_collection"

@@ -108,7 +107,6 @@ class SearchableMixin(ABC):
              logger.info(
                  f"Attaching provided SearchService instance (Collection: '{getattr(service, 'collection_name', '<Unknown>')}')."
              )
-             # TODO: Add stricter type check? isinstance(service, SearchServiceProtocol) requires runtime_checkable
              self._search_service = service
          else:
              # Create new service
@@ -125,28 +123,17 @@
              logger.info(
                  f"Creating new SearchService: name='{effective_collection_name}', persist={effective_persist}, model={embedding_model or 'default'}"
              )
-             try:
-                 service_args = {
-                     "collection_name": effective_collection_name,
-                     "persist": effective_persist,
-                     **kwargs,
-                 }
-                 if embedding_model:
-                     service_args["embedding_model"] = embedding_model
-                 self._search_service = get_search_service(**service_args)
-             except ImportError as ie: # Catch the specific ImportError first
-                 logger.error(f"Failed to create SearchService due to missing dependency: {ie}")
-                 raise ie # Re-raise the original ImportError
-             except Exception as e:
-                 logger.error(
-                     f"Failed to create SearchService due to unexpected error: {e}", exc_info=True
-                 )
-                 # Keep the RuntimeError for other unexpected creation errors
-                 raise RuntimeError(
-                     "Could not create SearchService instance due to an unexpected error."
-                 ) from e
+
+             # Direct creation without try/except
+             service_args = {
+                 "collection_name": effective_collection_name,
+                 "persist": effective_persist,
+                 **kwargs,
+             }
+             if embedding_model:
+                 service_args["embedding_model"] = embedding_model
+             self._search_service = get_search_service(**service_args)

-         # --- Optional Immediate Indexing (with safety check for persistent) ---
          if index:
              if not self._search_service: # Should not happen if logic above is correct
                  raise RuntimeError(
@@ -176,8 +163,6 @@
                  logger.warning(
                      f"Proceeding with index=True and force_reindex=True for persistent index '{collection_name}'. Existing data will be deleted."
                  )
-             # else: # Not persistent, safe to proceed without existence check
-             #     logger.debug("Proceeding with index=True for non-persistent index.")

              # Proceed with indexing if checks passed or not applicable
              logger.info(
@@ -197,12 +182,8 @@
              f"Starting internal indexing process into SearchService collection '{collection_name}'..."
          )

-         # Use the abstract method to get items
-         try:
-             indexable_items = list(self.get_indexable_items()) # Consume iterator
-         except Exception as e:
-             logger.error(f"Error calling get_indexable_items: {e}", exc_info=True)
-             raise RuntimeError("Failed to retrieve indexable items for indexing.") from e
+         # Get indexable items without try/except
+         indexable_items = list(self.get_indexable_items()) # Consume iterator

          if not indexable_items:
              logger.warning(
@@ -211,27 +192,19 @@
              return

          logger.info(f"Prepared {len(indexable_items)} indexable items for indexing.")
-         try:
-             logger.debug(
-                 f"Calling index() on SearchService for collection '{collection_name}' (force_reindex={force_reindex})."
-             )
-             self._search_service.index(
-                 documents=indexable_items,
-                 embedder_device=embedder_device,
-                 force_reindex=force_reindex,
-             )
-             logger.info(
-                 f"Successfully completed indexing into SearchService collection '{collection_name}'."
-             )
-         except IndexConfigurationError as ice:
-             logger.error(
-                 f"Indexing failed due to configuration error in collection '{collection_name}': {ice}",
-                 exc_info=True,
-             )
-             raise # Re-raise specific error
-         except Exception as e: # Catch other indexing errors from the service
-             logger.error(f"Indexing failed for collection '{collection_name}': {e}", exc_info=True)
-             raise RuntimeError(f"Indexing failed for collection '{collection_name}'.") from e
+         logger.debug(
+             f"Calling index() on SearchService for collection '{collection_name}' (force_reindex={force_reindex})."
+         )
+
+         # Call index without try/except
+         self._search_service.index(
+             documents=indexable_items,
+             embedder_device=embedder_device,
+             force_reindex=force_reindex,
+         )
+         logger.info(
+             f"Successfully completed indexing into SearchService collection '{collection_name}'."
+         )

      def index_for_search(
          self,
@@ -254,14 +227,12 @@
          Returns:
              Self for method chaining.
          """
-         # --- Ensure Service is Initialized (Use Default if Needed) ---
          if not self._search_service:
              logger.info(
                  "Search service not initialized prior to index_for_search. Initializing default in-memory service."
              )
              self.init_search() # Call init with defaults

-         # --- Perform Indexing ---
          self._perform_indexing(force_reindex=force_reindex, embedder_device=embedder_device)
          return self

@@ -289,7 +260,6 @@
              RuntimeError: If no search service is configured or provided, or if search fails.
              FileNotFoundError: If the collection managed by the service does not exist.
          """
-         # --- Determine which Search Service to use ---
          effective_service = search_service or self._search_service
          if not effective_service:
              raise RuntimeError(
@@ -302,21 +272,9 @@
              f"Searching collection '{collection_name}' via {type(effective_service).__name__}..."
          )

-         # --- Prepare Query and Options ---
          query_input = query
-         # Example: Handle Region query - maybe move this logic into HaystackSearchService.search?
-         # If we keep it here, it makes the mixin less generic.
-         # Let's assume the SearchService handles the query type appropriately for now.
-         # if isinstance(query, Region):
-         #     logger.debug("Query is a Region object. Extracting text.")
-         #     query_input = query.extract_text()
-         #     if not query_input or query_input.isspace():
-         #         logger.warning("Region provided for query has no extractable text.")
-         #         return []
-
          effective_options = options if options is not None else TextSearchOptions()

-         # --- Call SearchService Search Method ---
          try:
              results = effective_service.search(
                  query=query_input,
@@ -336,7 +294,6 @@
              # Consider wrapping in a SearchError?
              raise RuntimeError(f"Search failed in collection '{collection_name}'.") from e

-     # --- NEW Sync Method ---
      def sync_index(
          self,
          strategy: str = "full", # 'full' (add/update/delete) or 'upsert_only'
@@ -378,7 +335,6 @@
          )
          summary = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}

-         # --- Check Service Capabilities for 'full' sync ---
          if strategy == "full":
              required_methods = ["list_documents", "delete_documents"]
              missing_methods = [m for m in required_methods if not hasattr(self._search_service, m)]
@@ -388,7 +344,6 @@
                  f"is missing required methods for 'full' sync strategy: {', '.join(missing_methods)}"
              )

-         # --- 1. Get Desired State (from current collection) ---
          desired_state: Dict[str, Indexable] = {} # {id: item}
          desired_hashes: Dict[str, Optional[str]] = {} # {id: hash or None}
          try:
@@ -426,7 +381,6 @@

          logger.info(f"Desired state contains {len(desired_state)} indexable items.")

-         # --- 2. Handle Different Strategies ---
          if strategy == "upsert_only":
              # Simple case: just index everything, let the service handle upserts
              items_to_index = list(desired_state.values())
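
One behavioral consequence of the removed try/except blocks, sketched below (not part of the diff): errors from service creation and indexing now propagate to the caller unwrapped. The import path is an assumption; get_search_service appears in the hunks above, but its home module is not shown here.

    from natural_pdf.search import get_search_service  # assumed export location

    # With persist=True and LanceDB not installed, the NumPy backend's
    # RuntimeError (raised in NumpySearchService.__init__ above) now reaches
    # the caller directly instead of being wrapped by the mixin.
    try:
        service = get_search_service(collection_name="reports", persist=True)
    except (ImportError, RuntimeError) as err:
        print(f"Could not create a persistent search service: {err}")
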
natural_pdf/widgets/viewer.py

@@ -31,20 +31,6 @@ try:
      from PIL import Image
      from traitlets import Dict, List, Unicode, observe

-     # --- Read JS code from file (only needed if widgets are defined) --- #
-     _MODULE_DIR = os.path.dirname(__file__)
-     _FRONTEND_JS_PATH = os.path.join(_MODULE_DIR, "frontend", "viewer.js")
-     try:
-         with open(_FRONTEND_JS_PATH, "r", encoding="utf-8") as f:
-             _FRONTEND_JS_CODE = f.read()
-         logger.debug(f"Successfully read frontend JS from: {_FRONTEND_JS_PATH}")
-     except FileNotFoundError:
-         logger.error(f"Frontend JS file not found at {_FRONTEND_JS_PATH}. Widget will likely fail.")
-         _FRONTEND_JS_CODE = "console.error('Frontend JS file not found! Widget cannot load.');"
-     except Exception as e:
-         logger.error(f"Error reading frontend JS file {_FRONTEND_JS_PATH}: {e}")
-         _FRONTEND_JS_CODE = f"console.error('Error reading frontend JS file: {e}');"
-
      # --- Define Widget Classes ONLY if ipywidgets is available ---
      class SimpleInteractiveViewerWidget(widgets.DOMWidget):
          def __init__(self, pdf_data=None, **kwargs):
@@ -631,7 +617,7 @@

              # Filter out 'char' elements
              filtered_page_elements = [
-                 el for el in page_elements if getattr(el, "type", "").lower() != "char"
+                 el for el in page_elements if str(getattr(el, "type", "")).lower() != "char"
              ]
              logger.debug(
                  f"Filtered out char elements, keeping {len(filtered_page_elements)} elements."
@@ -659,19 +645,21 @@

              for i, element in enumerate(filtered_page_elements):
                  # Get original coordinates and calculated width/height (always present via base class)
+                 # Assuming 'element' is always an object with these attributes now
                  original_x0 = element.x0
                  original_y0 = element.top
                  original_x1 = element.x1
                  original_y1 = element.bottom
                  width = element.width
                  height = element.height
+                 current_element_type = element.type # Direct attribute access
                  scale = 1.0

                  # Base element dict with required info
                  elem_dict = {
                      "id": i,
                      # Use the standardized .type property
-                     "type": element.type,
+                     "type": current_element_type,
                      # Scaled coordinates for positioning in HTML/SVG
                      "x0": original_x0 * scale,
                      "y0": original_y0 * scale,
@@ -684,21 +672,24 @@
                  # --- Get Default Attributes --- #
                  attributes_found = set()
                  for attr_name in default_attributes_to_get:
+                     # Assuming 'element' is always an object
                      if hasattr(element, attr_name):
                          try:
-                             value = getattr(element, attr_name)
+                             value_to_process = getattr(element, attr_name)
                              # Convert non-JSON serializable types to string
-                             processed_value = value
+                             processed_value = value_to_process
                              if (
-                                 not isinstance(value, (str, int, float, bool, list, dict, tuple))
-                                 and value is not None
+                                 not isinstance(
+                                     value_to_process, (str, int, float, bool, list, dict, tuple)
+                                 )
+                                 and value_to_process is not None
                              ):
-                                 processed_value = str(value)
+                                 processed_value = str(value_to_process)
                              elem_dict[attr_name] = processed_value
                              attributes_found.add(attr_name)
                          except Exception as e:
                              logger.warning(
-                                 f"Could not get or process default attribute '{attr_name}' for element {i} ({element.type}): {e}"
+                                 f"Could not get or process default attribute '{attr_name}' for element {i} ({current_element_type}): {e}"
                              )

                  # --- Get User-Requested Attributes (if any) --- #
@@ -707,23 +698,23 @@
                      # Only process if not already added and exists
                      if attr_name not in attributes_found and hasattr(element, attr_name):
                          try:
-                             value = getattr(element, attr_name)
-                             processed_value = value
+                             value_to_process = getattr(element, attr_name)
+                             processed_value = value_to_process
                              if (
                                  not isinstance(
-                                     value, (str, int, float, bool, list, dict, tuple)
+                                     value_to_process, (str, int, float, bool, list, dict, tuple)
                                  )
-                                 and value is not None
+                                 and value_to_process is not None
                              ):
-                                 processed_value = str(value)
+                                 processed_value = str(value_to_process)
                              elem_dict[attr_name] = processed_value
                          except Exception as e:
                              logger.warning(
-                                 f"Could not get or process requested attribute '{attr_name}' for element {i} ({element.type}): {e}"
+                                 f"Could not get or process requested attribute '{attr_name}' for element {i} ({current_element_type}): {e}"
                              )
-                 for attr_name in elem_dict:
-                     if isinstance(elem_dict[attr_name], float):
-                         elem_dict[attr_name] = round(elem_dict[attr_name], 2)
+                 for attr_name_val in elem_dict: # Renamed to avoid conflict
+                     if isinstance(elem_dict[attr_name_val], float):
+                         elem_dict[attr_name_val] = round(elem_dict[attr_name_val], 2)
                  elements.append(elem_dict)

                  logger.debug(
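
The viewer hunks above repeatedly apply one guard: element attribute values are coerced to JSON-native types before being handed to the frontend, and floats are rounded to two decimals afterwards. A standalone sketch of that pattern (function and variable names are illustrative, not from the diff):

    import json

    JSON_NATIVE = (str, int, float, bool, list, dict, tuple)

    def to_jsonable(value):
        # Mirror the widget's guard: stringify anything non-JSON-native,
        # then round floats to two decimal places.
        if not isinstance(value, JSON_NATIVE) and value is not None:
            value = str(value)
        if isinstance(value, float):
            value = round(value, 2)
        return value

    elem = {"x0": 12.3456, "font": object(), "text": "Total"}
    print(json.dumps({k: to_jsonable(v) for k, v in elem.items()}))
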