natural-pdf 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +7 -2
- natural_pdf/analyzers/shape_detection_mixin.py +1092 -0
- natural_pdf/analyzers/text_options.py +9 -1
- natural_pdf/analyzers/text_structure.py +371 -58
- natural_pdf/classification/manager.py +3 -4
- natural_pdf/collections/pdf_collection.py +19 -39
- natural_pdf/core/element_manager.py +11 -1
- natural_pdf/core/highlighting_service.py +146 -75
- natural_pdf/core/page.py +287 -188
- natural_pdf/core/pdf.py +57 -42
- natural_pdf/elements/base.py +51 -0
- natural_pdf/elements/collections.py +362 -67
- natural_pdf/elements/line.py +5 -0
- natural_pdf/elements/region.py +396 -23
- natural_pdf/exporters/data/__init__.py +0 -0
- natural_pdf/exporters/data/pdf.ttf +0 -0
- natural_pdf/exporters/data/sRGB.icc +0 -0
- natural_pdf/exporters/hocr.py +40 -61
- natural_pdf/exporters/hocr_font.py +7 -13
- natural_pdf/exporters/original_pdf.py +10 -13
- natural_pdf/exporters/paddleocr.py +51 -11
- natural_pdf/exporters/searchable_pdf.py +0 -10
- natural_pdf/flows/__init__.py +12 -0
- natural_pdf/flows/collections.py +533 -0
- natural_pdf/flows/element.py +382 -0
- natural_pdf/flows/flow.py +216 -0
- natural_pdf/flows/region.py +458 -0
- natural_pdf/search/__init__.py +65 -52
- natural_pdf/search/lancedb_search_service.py +325 -0
- natural_pdf/search/numpy_search_service.py +255 -0
- natural_pdf/search/searchable_mixin.py +25 -71
- natural_pdf/selectors/parser.py +163 -8
- natural_pdf/widgets/viewer.py +22 -31
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/METADATA +55 -49
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/RECORD +38 -30
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/WHEEL +1 -1
- natural_pdf/search/haystack_search_service.py +0 -687
- natural_pdf/search/haystack_utils.py +0 -474
- natural_pdf/utils/tqdm_utils.py +0 -51
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.11.dist-info → natural_pdf-0.1.13.dist-info}/top_level.txt +0 -0
natural_pdf/search/searchable_mixin.py
CHANGED
@@ -4,7 +4,6 @@ from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, Any, Dict, Generator, Iterable, List, Optional, Type, Union
 
 # Now import the flag from the canonical source - this import should always work
-from .haystack_utils import HAS_HAYSTACK_EXTRAS
 
 DEFAULT_SEARCH_COLLECTION_NAME = "default_collection"
 
@@ -108,7 +107,6 @@ class SearchableMixin(ABC):
             logger.info(
                 f"Attaching provided SearchService instance (Collection: '{getattr(service, 'collection_name', '<Unknown>')}')."
             )
-            # TODO: Add stricter type check? isinstance(service, SearchServiceProtocol) requires runtime_checkable
             self._search_service = service
         else:
             # Create new service
@@ -125,28 +123,17 @@ class SearchableMixin(ABC):
             logger.info(
                 f"Creating new SearchService: name='{effective_collection_name}', persist={effective_persist}, model={embedding_model or 'default'}"
             )
-
-
-
-
-
-
-
-
-
-
-                logger.error(f"Failed to create SearchService due to missing dependency: {ie}")
-                raise ie # Re-raise the original ImportError
-            except Exception as e:
-                logger.error(
-                    f"Failed to create SearchService due to unexpected error: {e}", exc_info=True
-                )
-                # Keep the RuntimeError for other unexpected creation errors
-                raise RuntimeError(
-                    "Could not create SearchService instance due to an unexpected error."
-                ) from e
+
+            # Direct creation without try/except
+            service_args = {
+                "collection_name": effective_collection_name,
+                "persist": effective_persist,
+                **kwargs,
+            }
+            if embedding_model:
+                service_args["embedding_model"] = embedding_model
+            self._search_service = get_search_service(**service_args)
 
-        # --- Optional Immediate Indexing (with safety check for persistent) ---
         if index:
             if not self._search_service: # Should not happen if logic above is correct
                 raise RuntimeError(
@@ -176,8 +163,6 @@ class SearchableMixin(ABC):
                 logger.warning(
                     f"Proceeding with index=True and force_reindex=True for persistent index '{collection_name}'. Existing data will be deleted."
                 )
-            # else: # Not persistent, safe to proceed without existence check
-            #     logger.debug("Proceeding with index=True for non-persistent index.")
 
         # Proceed with indexing if checks passed or not applicable
         logger.info(
@@ -197,12 +182,8 @@ class SearchableMixin(ABC):
             f"Starting internal indexing process into SearchService collection '{collection_name}'..."
         )
 
-        #
-
-            indexable_items = list(self.get_indexable_items()) # Consume iterator
-        except Exception as e:
-            logger.error(f"Error calling get_indexable_items: {e}", exc_info=True)
-            raise RuntimeError("Failed to retrieve indexable items for indexing.") from e
+        # Get indexable items without try/except
+        indexable_items = list(self.get_indexable_items()) # Consume iterator
 
         if not indexable_items:
             logger.warning(
@@ -211,27 +192,19 @@ class SearchableMixin(ABC):
             return
 
         logger.info(f"Prepared {len(indexable_items)} indexable items for indexing.")
-
-
-
-
-
-
-
-
-
-
-
-
-
-            logger.error(
-                f"Indexing failed due to configuration error in collection '{collection_name}': {ice}",
-                exc_info=True,
-            )
-            raise # Re-raise specific error
-        except Exception as e: # Catch other indexing errors from the service
-            logger.error(f"Indexing failed for collection '{collection_name}': {e}", exc_info=True)
-            raise RuntimeError(f"Indexing failed for collection '{collection_name}'.") from e
+        logger.debug(
+            f"Calling index() on SearchService for collection '{collection_name}' (force_reindex={force_reindex})."
+        )
+
+        # Call index without try/except
+        self._search_service.index(
+            documents=indexable_items,
+            embedder_device=embedder_device,
+            force_reindex=force_reindex,
+        )
+        logger.info(
+            f"Successfully completed indexing into SearchService collection '{collection_name}'."
+        )
 
     def index_for_search(
         self,
@@ -254,14 +227,12 @@ class SearchableMixin(ABC):
         Returns:
             Self for method chaining.
         """
-        # --- Ensure Service is Initialized (Use Default if Needed) ---
         if not self._search_service:
             logger.info(
                 "Search service not initialized prior to index_for_search. Initializing default in-memory service."
            )
             self.init_search() # Call init with defaults
 
-        # --- Perform Indexing ---
         self._perform_indexing(force_reindex=force_reindex, embedder_device=embedder_device)
         return self
 
@@ -289,7 +260,6 @@ class SearchableMixin(ABC):
            RuntimeError: If no search service is configured or provided, or if search fails.
            FileNotFoundError: If the collection managed by the service does not exist.
        """
-        # --- Determine which Search Service to use ---
        effective_service = search_service or self._search_service
        if not effective_service:
            raise RuntimeError(
@@ -302,21 +272,9 @@ class SearchableMixin(ABC):
            f"Searching collection '{collection_name}' via {type(effective_service).__name__}..."
        )
 
-        # --- Prepare Query and Options ---
        query_input = query
-        # Example: Handle Region query - maybe move this logic into HaystackSearchService.search?
-        # If we keep it here, it makes the mixin less generic.
-        # Let's assume the SearchService handles the query type appropriately for now.
-        # if isinstance(query, Region):
-        #     logger.debug("Query is a Region object. Extracting text.")
-        #     query_input = query.extract_text()
-        #     if not query_input or query_input.isspace():
-        #         logger.warning("Region provided for query has no extractable text.")
-        #         return []
-
        effective_options = options if options is not None else TextSearchOptions()
 
-        # --- Call SearchService Search Method ---
        try:
            results = effective_service.search(
                query=query_input,
@@ -336,7 +294,6 @@ class SearchableMixin(ABC):
            # Consider wrapping in a SearchError?
            raise RuntimeError(f"Search failed in collection '{collection_name}'.") from e
 
-    # --- NEW Sync Method ---
    def sync_index(
        self,
        strategy: str = "full", # 'full' (add/update/delete) or 'upsert_only'
@@ -378,7 +335,6 @@ class SearchableMixin(ABC):
        )
        summary = {"added": 0, "updated": 0, "deleted": 0, "skipped": 0}
 
-        # --- Check Service Capabilities for 'full' sync ---
        if strategy == "full":
            required_methods = ["list_documents", "delete_documents"]
            missing_methods = [m for m in required_methods if not hasattr(self._search_service, m)]
@@ -388,7 +344,6 @@ class SearchableMixin(ABC):
                f"is missing required methods for 'full' sync strategy: {', '.join(missing_methods)}"
            )
 
-        # --- 1. Get Desired State (from current collection) ---
        desired_state: Dict[str, Indexable] = {} # {id: item}
        desired_hashes: Dict[str, Optional[str]] = {} # {id: hash or None}
        try:
@@ -426,7 +381,6 @@ class SearchableMixin(ABC):
 
        logger.info(f"Desired state contains {len(desired_state)} indexable items.")
 
-        # --- 2. Handle Different Strategies ---
        if strategy == "upsert_only":
            # Simple case: just index everything, let the service handle upserts
            items_to_index = list(desired_state.values())
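Net effect of the hunks above: the Haystack import is gone (the file list shows the Haystack service removed and LanceDB/NumPy services added), and service creation and indexing are no longer wrapped in try/except, so a missing dependency or an indexing failure now propagates to the caller as the original exception. A minimal sketch of what that looks like from calling code, assuming a PDF object that mixes in SearchableMixin; the file path, collection name, and keyword-argument names are inferred from this diff and are illustrative only:

    from natural_pdf import PDF

    pdf = PDF("document.pdf")  # illustrative path; PDF assumed to mix in SearchableMixin
    try:
        # No internal try/except anymore: a missing backend dependency
        # surfaces here as the original ImportError.
        pdf.init_search(collection_name="my_docs", persist=False)
        pdf.index_for_search(force_reindex=True)
    except ImportError as err:
        print(f"Search backend dependency missing: {err}")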
natural_pdf/selectors/parser.py
CHANGED
@@ -71,6 +71,91 @@ def safe_parse_color(value_str: str) -> tuple:
     return (0, 0, 0)
 
 
+def _split_top_level_or(selector: str) -> List[str]:
+    """
+    Split a selector string on top-level OR operators (| or ,) only.
+
+    Respects parsing contexts and does not split when | or , appear inside:
+    - Quoted strings (both single and double quotes)
+    - Parentheses (for pseudo-class arguments like :not(...))
+    - Square brackets (for attribute selectors like [attr="value"])
+
+    Args:
+        selector: The selector string to split
+
+    Returns:
+        List of selector parts. If no top-level OR operators found, returns [selector].
+
+    Examples:
+        >>> _split_top_level_or('text:contains("a|b")|text:bold')
+        ['text:contains("a|b")', 'text:bold']
+
+        >>> _split_top_level_or('text:contains("hello,world")')
+        ['text:contains("hello,world")']
+    """
+    if not selector or not isinstance(selector, str):
+        return [selector] if selector else []
+
+    parts = []
+    current_part = ""
+    i = 0
+
+    # Parsing state
+    in_double_quotes = False
+    in_single_quotes = False
+    paren_depth = 0
+    bracket_depth = 0
+
+    while i < len(selector):
+        char = selector[i]
+
+        # Handle escape sequences in quotes
+        if i > 0 and selector[i-1] == '\\':
+            current_part += char
+            i += 1
+            continue
+
+        # Handle quote state changes
+        if char == '"' and not in_single_quotes:
+            in_double_quotes = not in_double_quotes
+        elif char == "'" and not in_double_quotes:
+            in_single_quotes = not in_single_quotes
+
+        # Handle parentheses and brackets only when not in quotes
+        elif not in_double_quotes and not in_single_quotes:
+            if char == '(':
+                paren_depth += 1
+            elif char == ')':
+                paren_depth -= 1
+            elif char == '[':
+                bracket_depth += 1
+            elif char == ']':
+                bracket_depth -= 1
+
+            # Check for top-level OR operators
+            elif (char == '|' or char == ',') and paren_depth == 0 and bracket_depth == 0:
+                # Found a top-level OR operator
+                part = current_part.strip()
+                if part: # Only add non-empty parts
+                    parts.append(part)
+                current_part = ""
+                i += 1
+                continue
+
+        # Add character to current part
+        current_part += char
+        i += 1
+
+    # Add the final part
+    final_part = current_part.strip()
+    if final_part:
+        parts.append(final_part)
+
+    # If we only found one part, return it as a single-element list
+    # If we found multiple parts, those are the OR-separated parts
+    return parts if parts else [selector]
+
+
 def parse_selector(selector: str) -> Dict[str, Any]:
     """
     Parse a CSS-like selector string into a structured selector object.
@@ -80,12 +165,28 @@ def parse_selector(selector: str) -> Dict[str, Any]:
     - Attribute presence (e.g., '[data-id]')
     - Attribute value checks with various operators (e.g., '[count=5]', '[name*="bold"]'')
     - Pseudo-classes (e.g., ':contains("Total")', ':empty', ':not(...)')
+    - OR operators (e.g., 'text:contains("A")|text:bold', 'sel1,sel2')
 
     Args:
         selector: CSS-like selector string
 
     Returns:
-        Dict representing the parsed selector
+        Dict representing the parsed selector, or compound selector with OR logic
+
+    Examples:
+        >>> parse_selector('text:contains("hello")') # Single selector
+        {'type': 'text', 'pseudo_classes': [{'name': 'contains', 'args': 'hello'}], ...}
+
+        >>> parse_selector('text:contains("A")|text:bold') # OR with pipe
+        {'type': 'or', 'selectors': [...]}
+
+        >>> parse_selector('text:contains("A"),line[width>5]') # OR with comma
+        {'type': 'or', 'selectors': [...]}
+
+    Note:
+        OR operators work with all selector types except spatial pseudo-classes
+        (:above, :below, :near, :left-of, :right-of) which require page context.
+        Spatial relationships within OR selectors are not currently supported.
     """
     result = {
         "type": "any",
@@ -100,6 +201,36 @@ def parse_selector(selector: str) -> Dict[str, Any]:
 
     selector = selector.strip()
 
+    # --- Handle OR operators first (| or ,) ---
+    # Check if selector contains OR operators at the top level only
+    # (not inside quotes, parentheses, or brackets)
+    or_parts = _split_top_level_or(selector)
+
+    # If we found OR parts, parse each one recursively and return compound selector
+    if len(or_parts) > 1:
+        parsed_selectors = []
+        for part in or_parts:
+            try:
+                parsed_selectors.append(parse_selector(part))
+            except (ValueError, TypeError) as e:
+                logger.warning(f"Skipping invalid OR selector part '{part}': {e}")
+                continue
+
+        if len(parsed_selectors) > 1:
+            return {
+                "type": "or",
+                "selectors": parsed_selectors
+            }
+        elif len(parsed_selectors) == 1:
+            # Only one valid part, return it directly
+            return parsed_selectors[0]
+        else:
+            # No valid parts, return default
+            logger.warning(f"No valid parts found in OR selector '{original_selector_for_error}', returning default selector")
+            return result
+
+    # --- Continue with single selector parsing (existing logic) ---
+
     # --- Handle wildcard selector explicitly ---
     if selector == "*":
         # Wildcard matches any type, already the default.
@@ -109,12 +240,6 @@ def parse_selector(selector: str) -> Dict[str, Any]:
 
     # 1. Extract type (optional, at the beginning)
     # Only run if selector wasn't '*'
-    if selector:
-        type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
-        if type_match:
-            result["type"] = type_match.group(1).lower()
-            selector = selector[len(type_match.group(0)) :].strip()
-    # Only run if selector wasn't '*'
     if selector:
         type_match = re.match(r"^([a-zA-Z_\-]+)", selector)
         if type_match:
@@ -597,12 +722,42 @@ def selector_to_filter_func(selector: Dict[str, Any], **kwargs) -> Callable[[Any
     To inspect the individual filters, call `_build_filter_list` directly.
 
     Args:
-        selector: Parsed selector dictionary
+        selector: Parsed selector dictionary (single or compound OR selector)
         **kwargs: Additional filter parameters (e.g., regex, case).
 
     Returns:
         Function that takes an element and returns True if it matches the selector.
     """
+    # Handle compound OR selectors
+    if selector.get("type") == "or":
+        sub_selectors = selector.get("selectors", [])
+        if not sub_selectors:
+            # Empty OR selector, return a function that never matches
+            return lambda element: False
+
+        # Create filter functions for each sub-selector
+        sub_filter_funcs = []
+        for sub_selector in sub_selectors:
+            sub_filter_funcs.append(selector_to_filter_func(sub_selector, **kwargs))
+
+        if logger.isEnabledFor(logging.DEBUG):
+            logger.debug(f"Creating OR filter with {len(sub_filter_funcs)} sub-selectors")
+
+        # Return OR combination - element matches if ANY sub-selector matches
+        def or_filter(element):
+            for func in sub_filter_funcs:
+                try:
+                    if func(element):
+                        return True
+                except Exception as e:
+                    logger.error(f"Error applying OR sub-filter to element: {e}", exc_info=True)
+                    # Continue to next sub-filter on error
+                    continue
            return False
+
+        return or_filter
+
+    # Handle single selectors (existing logic)
     filter_list = _build_filter_list(selector, **kwargs)
 
     if logger.isEnabledFor(logging.DEBUG):
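Taken together, these hunks give the selector language a top-level OR: _split_top_level_or breaks the string apart, parse_selector wraps the parts in a {'type': 'or', 'selectors': [...]} dict, and selector_to_filter_func turns that dict into a predicate that matches when any sub-selector matches. A short sketch using only the functions changed above; the elements list is a stand-in for whatever element objects are being filtered:

    from natural_pdf.selectors.parser import parse_selector, selector_to_filter_func

    parsed = parse_selector('text:contains("Total")|text:bold')
    # parsed -> {'type': 'or', 'selectors': [<text:contains(...)>, <text:bold>]}

    matches = selector_to_filter_func(parsed)
    hits = [el for el in elements if matches(el)]  # 'elements': any iterable of element objects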
natural_pdf/widgets/viewer.py
CHANGED
@@ -31,20 +31,6 @@ try:
     from PIL import Image
     from traitlets import Dict, List, Unicode, observe
 
-    # --- Read JS code from file (only needed if widgets are defined) --- #
-    _MODULE_DIR = os.path.dirname(__file__)
-    _FRONTEND_JS_PATH = os.path.join(_MODULE_DIR, "frontend", "viewer.js")
-    try:
-        with open(_FRONTEND_JS_PATH, "r", encoding="utf-8") as f:
-            _FRONTEND_JS_CODE = f.read()
-        logger.debug(f"Successfully read frontend JS from: {_FRONTEND_JS_PATH}")
-    except FileNotFoundError:
-        logger.error(f"Frontend JS file not found at {_FRONTEND_JS_PATH}. Widget will likely fail.")
-        _FRONTEND_JS_CODE = "console.error('Frontend JS file not found! Widget cannot load.');"
-    except Exception as e:
-        logger.error(f"Error reading frontend JS file {_FRONTEND_JS_PATH}: {e}")
-        _FRONTEND_JS_CODE = f"console.error('Error reading frontend JS file: {e}');"
-
     # --- Define Widget Classes ONLY if ipywidgets is available ---
     class SimpleInteractiveViewerWidget(widgets.DOMWidget):
         def __init__(self, pdf_data=None, **kwargs):
@@ -631,7 +617,7 @@ try:
 
            # Filter out 'char' elements
            filtered_page_elements = [
-                el for el in page_elements if getattr(el, "type", "").lower() != "char"
+                el for el in page_elements if str(getattr(el, "type", "")).lower() != "char"
            ]
            logger.debug(
                f"Filtered out char elements, keeping {len(filtered_page_elements)} elements."
@@ -659,19 +645,21 @@ try:
 
            for i, element in enumerate(filtered_page_elements):
                # Get original coordinates and calculated width/height (always present via base class)
+                # Assuming 'element' is always an object with these attributes now
                original_x0 = element.x0
                original_y0 = element.top
                original_x1 = element.x1
                original_y1 = element.bottom
                width = element.width
                height = element.height
+                current_element_type = element.type # Direct attribute access
                scale = 1.0
 
                # Base element dict with required info
                elem_dict = {
                    "id": i,
                    # Use the standardized .type property
-                    "type":
+                    "type": current_element_type,
                    # Scaled coordinates for positioning in HTML/SVG
                    "x0": original_x0 * scale,
                    "y0": original_y0 * scale,
@@ -684,21 +672,24 @@ try:
                # --- Get Default Attributes --- #
                attributes_found = set()
                for attr_name in default_attributes_to_get:
+                    # Assuming 'element' is always an object
                    if hasattr(element, attr_name):
                        try:
-
+                            value_to_process = getattr(element, attr_name)
                            # Convert non-JSON serializable types to string
-                            processed_value =
+                            processed_value = value_to_process
                            if (
-                                not isinstance(
-
+                                not isinstance(
+                                    value_to_process, (str, int, float, bool, list, dict, tuple)
+                                )
+                                and value_to_process is not None
                            ):
-                                processed_value = str(
+                                processed_value = str(value_to_process)
                            elem_dict[attr_name] = processed_value
                            attributes_found.add(attr_name)
                        except Exception as e:
                            logger.warning(
-                                f"Could not get or process default attribute '{attr_name}' for element {i} ({
+                                f"Could not get or process default attribute '{attr_name}' for element {i} ({current_element_type}): {e}"
                            )
 
                # --- Get User-Requested Attributes (if any) --- #
@@ -707,23 +698,23 @@ try:
                    # Only process if not already added and exists
                    if attr_name not in attributes_found and hasattr(element, attr_name):
                        try:
-
-                            processed_value =
+                            value_to_process = getattr(element, attr_name)
+                            processed_value = value_to_process
                            if (
                                not isinstance(
-
+                                    value_to_process, (str, int, float, bool, list, dict, tuple)
                                )
-                                and
+                                and value_to_process is not None
                            ):
-                                processed_value = str(
+                                processed_value = str(value_to_process)
                            elem_dict[attr_name] = processed_value
                        except Exception as e:
                            logger.warning(
-                                f"Could not get or process requested attribute '{attr_name}' for element {i} ({
+                                f"Could not get or process requested attribute '{attr_name}' for element {i} ({current_element_type}): {e}"
                            )
-                    for
-                    if isinstance(elem_dict[
-                        elem_dict[
+                for attr_name_val in elem_dict: # Renamed to avoid conflict
+                    if isinstance(elem_dict[attr_name_val], float):
+                        elem_dict[attr_name_val] = round(elem_dict[attr_name_val], 2)
                elements.append(elem_dict)
 
            logger.debug(
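The widget hunks rework attribute processing: values read from an element that are not JSON-serializable (and not None) are stringified, and every float in elem_dict is rounded to two decimals before the element list is handed to the frontend. A standalone sketch of that rule; the helper name is illustrative and not part of the widget:

    def to_widget_value(value):
        # Non-serializable, non-None values fall back to their string form.
        if not isinstance(value, (str, int, float, bool, list, dict, tuple)) and value is not None:
            value = str(value)
        # Floats are rounded to two decimals, matching the elem_dict post-processing above.
        if isinstance(value, float):
            value = round(value, 2)
        return value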