natural-pdf 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +31 -0
- natural_pdf/analyzers/layout/gemini.py +137 -162
- natural_pdf/analyzers/layout/layout_manager.py +9 -5
- natural_pdf/analyzers/layout/layout_options.py +77 -7
- natural_pdf/analyzers/layout/paddle.py +318 -165
- natural_pdf/analyzers/layout/table_structure_utils.py +78 -0
- natural_pdf/analyzers/shape_detection_mixin.py +770 -405
- natural_pdf/classification/mixin.py +2 -8
- natural_pdf/collections/pdf_collection.py +25 -30
- natural_pdf/core/highlighting_service.py +47 -32
- natural_pdf/core/page.py +226 -70
- natural_pdf/core/pdf.py +19 -22
- natural_pdf/elements/base.py +9 -9
- natural_pdf/elements/collections.py +105 -50
- natural_pdf/elements/region.py +320 -113
- natural_pdf/exporters/paddleocr.py +38 -13
- natural_pdf/flows/__init__.py +3 -3
- natural_pdf/flows/collections.py +303 -132
- natural_pdf/flows/element.py +277 -132
- natural_pdf/flows/flow.py +33 -16
- natural_pdf/flows/region.py +142 -79
- natural_pdf/ocr/engine_doctr.py +37 -4
- natural_pdf/ocr/engine_easyocr.py +23 -3
- natural_pdf/ocr/engine_paddle.py +281 -30
- natural_pdf/ocr/engine_surya.py +8 -3
- natural_pdf/ocr/ocr_manager.py +75 -76
- natural_pdf/ocr/ocr_options.py +52 -87
- natural_pdf/search/__init__.py +25 -12
- natural_pdf/search/lancedb_search_service.py +91 -54
- natural_pdf/search/numpy_search_service.py +86 -65
- natural_pdf/search/searchable_mixin.py +2 -2
- natural_pdf/selectors/parser.py +125 -81
- natural_pdf/widgets/__init__.py +1 -1
- natural_pdf/widgets/viewer.py +205 -449
- {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/METADATA +27 -45
- {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/RECORD +39 -38
- {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.14.dist-info → natural_pdf-0.1.16.dist-info}/top_level.txt +0 -0
natural_pdf/widgets/viewer.py
CHANGED
@@ -10,7 +10,6 @@ logger = logging.getLogger(__name__)
|
|
10
10
|
# Initialize flag and module/class variables to None
|
11
11
|
_IPYWIDGETS_AVAILABLE = False
|
12
12
|
widgets = None
|
13
|
-
SimpleInteractiveViewerWidget = None
|
14
13
|
InteractiveViewerWidget = None
|
15
14
|
|
16
15
|
try:
|
@@ -29,13 +28,12 @@ try:
|
|
29
28
|
|
30
29
|
from IPython.display import HTML, Javascript, display
|
31
30
|
from PIL import Image
|
32
|
-
from traitlets import Dict, List, Unicode, observe
|
33
31
|
|
34
|
-
# --- Define Widget
|
35
|
-
class
|
32
|
+
# --- Define Widget Class ---
|
33
|
+
class InteractiveViewerWidget(widgets.DOMWidget):
|
36
34
|
def __init__(self, pdf_data=None, **kwargs):
|
37
35
|
"""
|
38
|
-
Create
|
36
|
+
Create an interactive PDF viewer widget.
|
39
37
|
|
40
38
|
Args:
|
41
39
|
pdf_data (dict, optional): Dictionary containing 'page_image', 'elements', etc.
|
@@ -56,7 +54,7 @@ try:
|
|
56
54
|
self.pdf_data = {"page_image": image_source, "elements": kwargs.get("elements", [])}
|
57
55
|
|
58
56
|
# Log for debugging
|
59
|
-
logger.debug(f"
|
57
|
+
logger.debug(f"InteractiveViewerWidget initialized with widget_id={id(self)}")
|
60
58
|
logger.debug(
|
61
59
|
f"Image source provided: {self.pdf_data.get('page_image', 'None')[:30]}..."
|
62
60
|
)
|
@@ -248,133 +246,53 @@ try:
|
|
248
246
|
|
249
247
|
function handleMouseDown(event) {
|
250
248
|
// Prevent default only if needed (e.g., text selection on image)
|
251
|
-
|
249
|
+
if (event.target.tagName !== 'BUTTON') {
|
252
250
|
event.preventDefault();
|
253
|
-
|
254
|
-
|
255
|
-
|
251
|
+
}
|
252
|
+
|
253
|
+
viewerData.isDragging = true;
|
256
254
|
viewerData.startX = event.clientX;
|
257
255
|
viewerData.startY = event.clientY;
|
258
|
-
// Store initial translate values to calculate relative movement
|
259
256
|
viewerData.startTranslateX = viewerData.translateX;
|
260
257
|
viewerData.startTranslateY = viewerData.translateY;
|
261
|
-
|
262
|
-
|
258
|
+
viewerData.justDragged = false; // Reset drag flag
|
259
|
+
zoomPanContainer.style.cursor = 'grabbing';
|
263
260
|
}
|
264
|
-
|
261
|
+
|
265
262
|
function handleMouseMove(event) {
|
266
|
-
|
267
|
-
if (event.buttons !== 1) {
|
268
|
-
if (viewerData.isDragging) {
|
269
|
-
// Force drag end if button is released unexpectedly
|
270
|
-
handleMouseUp(event);
|
271
|
-
}
|
272
|
-
return;
|
273
|
-
}
|
274
|
-
|
275
|
-
const currentX = event.clientX;
|
276
|
-
const currentY = event.clientY;
|
277
|
-
const deltaX = currentX - viewerData.startX;
|
278
|
-
const deltaY = currentY - viewerData.startY;
|
279
|
-
|
280
|
-
// If not already dragging, check if threshold is exceeded
|
281
|
-
if (!viewerData.isDragging) {
|
282
|
-
const movedDistance = Math.hypot(deltaX, deltaY);
|
283
|
-
if (movedDistance > dragThreshold) {
|
284
|
-
viewerData.isDragging = true;
|
285
|
-
zoomPanContainer.style.cursor = 'grabbing';
|
286
|
-
// Now disable pointer events on elements since a drag has started
|
287
|
-
elements.forEach(el => el.style.pointerEvents = 'none');
|
288
|
-
}
|
289
|
-
}
|
263
|
+
if (!viewerData.isDragging) return;
|
290
264
|
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
applyTransform();
|
265
|
+
const dx = event.clientX - viewerData.startX;
|
266
|
+
const dy = event.clientY - viewerData.startY;
|
267
|
+
|
268
|
+
// If we've moved past the threshold, it's a drag
|
269
|
+
if (Math.abs(dx) > dragThreshold || Math.abs(dy) > dragThreshold) {
|
270
|
+
viewerData.justDragged = true;
|
298
271
|
}
|
272
|
+
|
273
|
+
viewerData.translateX = viewerData.startTranslateX + dx;
|
274
|
+
viewerData.translateY = viewerData.startTranslateY + dy;
|
275
|
+
applyTransform();
|
299
276
|
}
|
300
|
-
|
301
|
-
function handleMouseUp(event) {
|
302
|
-
const wasDragging = viewerData.isDragging;
|
303
|
-
|
304
|
-
// Always reset cursor on mouse up
|
305
|
-
zoomPanContainer.style.cursor = 'grab';
|
306
277
|
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
elements.forEach(el => el.style.pointerEvents = 'auto');
|
311
|
-
|
312
|
-
// Set flag to indicate a drag just finished
|
313
|
-
viewerData.justDragged = true;
|
314
|
-
// Reset the flag after a minimal delay, allowing the click event to be ignored
|
315
|
-
setTimeout(() => { viewerData.justDragged = false; }, 0);
|
316
|
-
|
317
|
-
// IMPORTANT: Prevent this mouseup from triggering other default actions
|
318
|
-
event.preventDefault();
|
319
|
-
// Stop propagation might not be needed here if the click listener checks justDragged
|
320
|
-
// event.stopPropagation();
|
321
|
-
} else {
|
322
|
-
// If it wasn't a drag, do nothing here.
|
323
|
-
// The browser should naturally fire a 'click' event on the target element
|
324
|
-
// which will be handled by the element's specific click listener
|
325
|
-
// or the outerContainer's listener if it was on the background.
|
326
|
-
}
|
278
|
+
function handleMouseUp() {
|
279
|
+
viewerData.isDragging = false;
|
280
|
+
zoomPanContainer.style.cursor = 'grab';
|
327
281
|
}
|
328
282
|
|
329
|
-
|
330
|
-
|
331
|
-
|
283
|
+
zoomPanContainer.addEventListener('mousedown', handleMouseDown);
|
284
|
+
document.addEventListener('mousemove', handleMouseMove);
|
285
|
+
document.addEventListener('mouseup', handleMouseUp);
|
332
286
|
|
333
|
-
//
|
334
|
-
// Attach to window or document for smoother dragging even if mouse leaves outerContainer
|
335
|
-
// Using outerContainer for now, might need adjustment if dragging feels jerky near edges
|
336
|
-
outerContainer.addEventListener('mousemove', handleMouseMove);
|
337
|
-
|
338
|
-
// Mouseup ends the drag *or* allows a click to proceed
|
339
|
-
// Attach to window or document to ensure drag ends even if mouse released outside
|
340
|
-
// Using outerContainer for now
|
341
|
-
outerContainer.addEventListener('mouseup', handleMouseUp);
|
342
|
-
|
343
|
-
// Stop dragging if mouse leaves the outer container entirely (optional but good practice)
|
344
|
-
outerContainer.addEventListener('mouseleave', (event) => {
|
345
|
-
// Only act if the primary mouse button is NOT pressed anymore when leaving
|
346
|
-
if (viewerData.isDragging && event.buttons !== 1) {
|
347
|
-
handleMouseUp(event);
|
348
|
-
}
|
349
|
-
});
|
350
|
-
|
351
|
-
// --- Button Listeners ---
|
287
|
+
// --- Button Controls ---
|
352
288
|
zoomInButton.addEventListener('click', () => {
|
353
|
-
|
354
|
-
|
355
|
-
const centerY = centerRect.height / 2;
|
356
|
-
const zoomFactor = 1.2;
|
357
|
-
const newScale = Math.min(5, viewerData.scale * zoomFactor);
|
358
|
-
const pointX = (centerX - viewerData.translateX) / viewerData.scale;
|
359
|
-
const pointY = (centerY - viewerData.translateY) / viewerData.scale;
|
360
|
-
viewerData.scale = newScale;
|
361
|
-
viewerData.translateX = centerX - pointX * viewerData.scale;
|
362
|
-
viewerData.translateY = centerY - pointY * viewerData.scale;
|
363
|
-
applyTransform();
|
289
|
+
viewerData.scale = Math.min(5, viewerData.scale * 1.2);
|
290
|
+
applyTransform();
|
364
291
|
});
|
365
292
|
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
const centerY = centerRect.height / 2;
|
370
|
-
const zoomFactor = 1 / 1.2;
|
371
|
-
const newScale = Math.max(0.5, viewerData.scale * zoomFactor);
|
372
|
-
const pointX = (centerX - viewerData.translateX) / viewerData.scale;
|
373
|
-
const pointY = (centerY - viewerData.translateY) / viewerData.scale;
|
374
|
-
viewerData.scale = newScale;
|
375
|
-
viewerData.translateX = centerX - pointX * viewerData.scale;
|
376
|
-
viewerData.translateY = centerY - pointY * viewerData.scale;
|
377
|
-
applyTransform();
|
293
|
+
zoomOutButton.addEventListener('click', () => {
|
294
|
+
viewerData.scale = Math.max(0.5, viewerData.scale / 1.2);
|
295
|
+
applyTransform();
|
378
296
|
});
|
379
297
|
|
380
298
|
resetButton.addEventListener('click', () => {
|
@@ -382,385 +300,223 @@ try:
|
|
382
300
|
viewerData.translateX = 0;
|
383
301
|
viewerData.translateY = 0;
|
384
302
|
applyTransform();
|
385
|
-
// Also reset selection on zoom reset
|
386
|
-
if (viewerData.selectedElement !== null) {
|
387
|
-
resetElementStyle(viewerData.selectedElement);
|
388
|
-
viewerData.selectedElement = null;
|
389
|
-
// Optionally clear info panel
|
390
|
-
// const elementData = document.getElementById(widgetId + "-element-data");
|
391
|
-
// if (elementData) elementData.textContent = '';
|
392
|
-
}
|
393
303
|
});
|
394
|
-
|
395
|
-
// --- Helper function to reset element style ---
|
396
|
-
function resetElementStyle(elementIdx) {
|
397
|
-
const el = zoomPanContainer.querySelector(`.pdf-element[data-element-id='${elementIdx}']`);
|
398
|
-
const svgRect = document.querySelector(`#${widgetId} .svg-layer svg rect[data-element-id='${elementIdx}']`);
|
399
|
-
if (!el) return;
|
400
|
-
|
401
|
-
const viewer = window.pdfViewerRegistry[widgetId];
|
402
|
-
const eType = viewer.initialData.elements[elementIdx].type || 'unknown';
|
403
|
-
|
404
|
-
if (eType === 'text') {
|
405
|
-
el.style.backgroundColor = "rgba(255, 255, 0, 0.3)";
|
406
|
-
} else if (eType === 'image') {
|
407
|
-
el.style.backgroundColor = "rgba(0, 128, 255, 0.3)";
|
408
|
-
} else if (eType === 'figure') {
|
409
|
-
el.style.backgroundColor = "rgba(255, 0, 255, 0.3)";
|
410
|
-
} else if (eType === 'table') {
|
411
|
-
el.style.backgroundColor = "rgba(0, 255, 0, 0.3)";
|
412
|
-
} else {
|
413
|
-
el.style.backgroundColor = "rgba(200, 200, 200, 0.3)";
|
414
|
-
}
|
415
|
-
el.style.border = "1px dashed transparent";
|
416
304
|
|
417
|
-
|
418
|
-
|
419
|
-
|
305
|
+
// --- Element Interaction ---
|
306
|
+
function highlightElement(elementId) {
|
307
|
+
// Remove previous highlights on SVG rects
|
308
|
+
const allRects = zoomPanContainer.querySelectorAll('svg rect');
|
309
|
+
allRects.forEach(rect => {
|
310
|
+
rect.style.stroke = 'rgba(255, 165, 0, 0.85)';
|
311
|
+
rect.style.strokeWidth = '1.5';
|
312
|
+
});
|
313
|
+
|
314
|
+
// Highlight the new one
|
315
|
+
const targetRect = zoomPanContainer.querySelector(`svg rect[data-element-id='${elementId}']`);
|
316
|
+
if (targetRect) {
|
317
|
+
targetRect.style.stroke = 'red';
|
318
|
+
targetRect.style.strokeWidth = '3';
|
420
319
|
}
|
421
320
|
}
|
422
|
-
|
423
|
-
// --- Helper function to set element style (selected/hover) ---
|
424
|
-
function setElementHighlightStyle(elementIdx) {
|
425
|
-
const el = zoomPanContainer.querySelector(`.pdf-element[data-element-id='${elementIdx}']`);
|
426
|
-
const svgRect = document.querySelector(`#${widgetId} .svg-layer svg rect[data-element-id='${elementIdx}']`);
|
427
|
-
if (!el) return;
|
428
|
-
|
429
|
-
el.style.backgroundColor = "rgba(64, 158, 255, 0.15)";
|
430
|
-
el.style.border = "2px solid rgba(64, 158, 255, 0.6)";
|
431
|
-
|
432
|
-
if (svgRect) {
|
433
|
-
svgRect.setAttribute("stroke", "rgba(64, 158, 255, 0.9)");
|
434
|
-
svgRect.setAttribute("stroke-width", "2.5");
|
435
|
-
}
|
436
|
-
}
|
437
321
|
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
// If an element is selected, deselect it
|
455
|
-
if (viewerData.selectedElement !== null) {
|
456
|
-
resetElementStyle(viewerData.selectedElement);
|
457
|
-
viewerData.selectedElement = null;
|
458
|
-
|
459
|
-
// Optionally clear the info panel
|
460
|
-
const infoPanel = document.getElementById(widgetId + "-info-panel");
|
461
|
-
const elementData = document.getElementById(widgetId + "-element-data");
|
462
|
-
if (infoPanel && elementData) {
|
463
|
-
// infoPanel.style.display = "none"; // Or hide it
|
464
|
-
elementData.textContent = ""; // Clear content
|
465
|
-
}
|
322
|
+
function updateInfoPanel(element) {
|
323
|
+
const infoPanel = document.getElementById(`${widgetId}-element-data`);
|
324
|
+
if (infoPanel) {
|
325
|
+
// Pretty print the JSON
|
326
|
+
let displayData = {};
|
327
|
+
for (const [key, value] of Object.entries(element)) {
|
328
|
+
if (key !== 'bbox') { // Exclude raw bbox
|
329
|
+
if (typeof value === 'number') {
|
330
|
+
displayData[key] = parseFloat(value.toFixed(2));
|
331
|
+
} else {
|
332
|
+
displayData[key] = value;
|
333
|
+
}
|
334
|
+
}
|
335
|
+
}
|
336
|
+
infoPanel.textContent = JSON.stringify(displayData, null, 2);
|
466
337
|
}
|
467
|
-
}
|
468
|
-
|
469
|
-
// Add click handlers to elements
|
470
|
-
elements.forEach(function(el) {
|
471
|
-
el.addEventListener("click", function(event) {
|
472
|
-
// Stop propagation to prevent the background click handler from immediately deselecting.
|
473
|
-
event.stopPropagation();
|
474
|
-
|
475
|
-
const elementIdx = parseInt(this.dataset.elementId);
|
476
|
-
const viewer = window.pdfViewerRegistry[widgetId];
|
477
|
-
|
478
|
-
// If there was a previously selected element, reset its style
|
479
|
-
if (viewer.selectedElement !== null && viewer.selectedElement !== elementIdx) {
|
480
|
-
resetElementStyle(viewer.selectedElement);
|
481
|
-
}
|
338
|
+
}
|
482
339
|
|
483
|
-
|
484
|
-
|
485
|
-
if (
|
486
|
-
|
487
|
-
|
488
|
-
|
489
|
-
const elementData = document.getElementById(widgetId + "-element-data");
|
490
|
-
if (elementData) elementData.textContent = '';
|
491
|
-
return; // Stop further processing
|
340
|
+
elements.forEach(el => {
|
341
|
+
el.addEventListener('click', function(event) {
|
342
|
+
if (viewerData.justDragged) {
|
343
|
+
// If a drag just ended, prevent the click action
|
344
|
+
viewerData.justDragged = false;
|
345
|
+
return;
|
492
346
|
}
|
493
|
-
*/
|
494
|
-
|
495
|
-
// Store newly selected element
|
496
|
-
viewer.selectedElement = elementIdx;
|
497
|
-
|
498
|
-
// Highlight newly selected element
|
499
|
-
setElementHighlightStyle(elementIdx);
|
500
347
|
|
501
|
-
//
|
502
|
-
const
|
503
|
-
const elementData =
|
348
|
+
event.stopPropagation(); // Stop click from propagating to the container
|
349
|
+
const elementId = this.getAttribute('data-element-id');
|
350
|
+
const elementData = viewerData.initialData.elements[elementId];
|
504
351
|
|
505
|
-
|
506
|
-
|
507
|
-
if (!element) { /* console.error(`[${widgetId}] Element data not found for index ${elementIdx}!`); */ return; }
|
508
|
-
infoPanel.style.display = "block";
|
509
|
-
elementData.textContent = JSON.stringify(element, null, 2);
|
510
|
-
} else {
|
511
|
-
/* console.error(`[${widgetId}] Info panel or element data container not found via getElementById on click!`); */
|
512
|
-
}
|
513
|
-
});
|
514
|
-
|
515
|
-
// Add hover effects
|
516
|
-
el.addEventListener("mouseenter", function() {
|
517
|
-
// *** Only apply hover if NOTHING is selected ***
|
518
|
-
const viewer = window.pdfViewerRegistry[widgetId];
|
519
|
-
if (viewer.selectedElement !== null) {
|
520
|
-
return; // Do nothing if an element is selected
|
521
|
-
}
|
522
|
-
// Avoid hover effect while dragging
|
523
|
-
if (viewer.isDragging) {
|
524
|
-
return;
|
525
|
-
}
|
352
|
+
console.log('Clicked element:', elementData);
|
353
|
+
viewerData.selectedElement = elementData;
|
526
354
|
|
527
|
-
|
528
|
-
|
529
|
-
|
530
|
-
setElementHighlightStyle(elementIdx);
|
531
|
-
|
532
|
-
// Show element info on hover (only if nothing selected)
|
533
|
-
const infoPanel = document.getElementById(widgetId + "-info-panel");
|
534
|
-
const elementData = document.getElementById(widgetId + "-element-data");
|
355
|
+
// Update UI
|
356
|
+
updateInfoPanel(elementData);
|
357
|
+
highlightElement(elementId);
|
535
358
|
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
} else {
|
542
|
-
// Don't spam console on hover if it's not found initially
|
543
|
-
// console.error(`[${widgetId}] Info panel or element data container not found via getElementById on hover!`);
|
544
|
-
}
|
545
|
-
});
|
546
|
-
|
547
|
-
el.addEventListener("mouseleave", function() {
|
548
|
-
// *** Only reset hover if NOTHING is selected ***
|
549
|
-
const viewer = window.pdfViewerRegistry[widgetId];
|
550
|
-
if (viewer.selectedElement !== null) {
|
551
|
-
return; // Do nothing if an element is selected
|
359
|
+
// Example of sending data back to Python kernel
|
360
|
+
if (window.IPython && window.IPython.notebook && window.IPython.notebook.kernel) {
|
361
|
+
const command = `import json; from natural_pdf.widgets.viewer import InteractiveViewerWidget; InteractiveViewerWidget._handle_element_click(json.loads('${JSON.stringify(elementData)}'))`;
|
362
|
+
console.log("Executing command:", command);
|
363
|
+
// window.IPython.notebook.kernel.execute(command);
|
552
364
|
}
|
553
|
-
// Avoid hover effect while dragging
|
554
|
-
if (viewer.isDragging) {
|
555
|
-
return;
|
556
|
-
}
|
557
|
-
|
558
|
-
const elementIdx = parseInt(this.dataset.elementId);
|
559
|
-
|
560
|
-
// Reset styling
|
561
|
-
resetElementStyle(elementIdx);
|
562
|
-
|
563
|
-
// Optionally hide/clear the info panel on mouse leave when nothing is selected
|
564
|
-
// const infoPanel = document.getElementById(widgetId + "-info-panel");
|
565
|
-
// const elementData = document.getElementById(widgetId + "-element-data");
|
566
|
-
// if (infoPanel && elementData) {
|
567
|
-
// elementData.textContent = '';
|
568
|
-
// }
|
569
365
|
});
|
570
366
|
});
|
571
|
-
|
572
367
|
})();
|
573
368
|
""" % (
|
574
369
|
self.widget_id,
|
575
370
|
json.dumps(self.pdf_data),
|
576
371
|
)
|
577
|
-
|
578
|
-
# Add the JavaScript
|
372
|
+
# Display the JavaScript
|
579
373
|
display(Javascript(js_code))
|
580
374
|
|
375
|
+
def _get_element_json(self):
|
376
|
+
"""Returns the elements as a JSON string."""
|
377
|
+
# We don't need to do anything special here as the coords are already scaled
|
378
|
+
return json.dumps(self.pdf_data.get("elements", []))
|
379
|
+
|
581
380
|
def _repr_html_(self):
|
582
|
-
"""
|
583
|
-
|
381
|
+
"""Called by Jupyter to display the widget."""
|
382
|
+
# The __init__ method already calls display(), so nothing more is needed here
|
383
|
+
return None
|
584
384
|
|
585
385
|
@classmethod
|
586
386
|
def from_page(cls, page, on_element_click=None, include_attributes=None):
|
587
387
|
"""
|
588
|
-
|
388
|
+
Factory method to create a viewer from a Page object.
|
589
389
|
|
590
390
|
Args:
|
591
|
-
page:
|
592
|
-
on_element_click:
|
593
|
-
include_attributes:
|
594
|
-
A default set of common/useful attributes is always included.
|
391
|
+
page (Page): The Page object to display.
|
392
|
+
on_element_click (callable, optional): Callback function when an element is clicked.
|
393
|
+
include_attributes (list, optional): List of element attributes to include.
|
595
394
|
|
596
395
|
Returns:
|
597
|
-
|
396
|
+
An instance of InteractiveViewerWidget.
|
598
397
|
"""
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
614
|
-
|
615
|
-
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
398
|
+
if not _IPYWIDGETS_AVAILABLE:
|
399
|
+
logger.warning(
|
400
|
+
"Optional dependency 'ipywidgets' not found. Cannot create interactive viewer."
|
401
|
+
)
|
402
|
+
return None
|
403
|
+
|
404
|
+
try:
|
405
|
+
# --- This logic is restored from the original SimpleInteractiveViewerWidget ---
|
406
|
+
|
407
|
+
resolution = 150 # Define resolution to calculate scale
|
408
|
+
scale = resolution / 72.0 # PDF standard DPI is 72
|
409
|
+
|
410
|
+
# Get the page image, rendered at the higher resolution
|
411
|
+
img = render_plain_page(page, resolution=resolution)
|
412
|
+
|
413
|
+
buffered = BytesIO()
|
414
|
+
img.save(buffered, format="PNG")
|
415
|
+
img_str = base64.b64encode(buffered.getvalue()).decode()
|
416
|
+
image_uri = f"data:image/png;base64,{img_str}"
|
417
|
+
|
418
|
+
# Convert elements to dict format
|
419
|
+
elements = []
|
420
|
+
# Use page.elements directly if available, otherwise fallback to find_all
|
421
|
+
page_elements = getattr(page, "elements", page.find_all("*"))
|
422
|
+
|
423
|
+
# Filter out 'char' elements which are too noisy for the viewer
|
424
|
+
filtered_page_elements = [
|
425
|
+
el for el in page_elements if str(getattr(el, "type", "")).lower() != "char"
|
426
|
+
]
|
427
|
+
|
428
|
+
# Define a list of common/useful attributes to check for
|
429
|
+
default_attributes_to_get = [
|
430
|
+
"text",
|
431
|
+
"fontname",
|
432
|
+
"size",
|
433
|
+
"bold",
|
434
|
+
"italic",
|
435
|
+
"color",
|
436
|
+
"linewidth",
|
437
|
+
"is_horizontal",
|
438
|
+
"is_vertical",
|
439
|
+
"source",
|
440
|
+
"confidence",
|
441
|
+
"label",
|
442
|
+
"model",
|
443
|
+
"upright",
|
444
|
+
"direction",
|
445
|
+
]
|
446
|
+
|
447
|
+
for i, element in enumerate(filtered_page_elements):
|
448
|
+
elem_dict = {
|
449
|
+
"id": i,
|
450
|
+
"type": element.type,
|
451
|
+
# Apply scaling to all coordinates and dimensions
|
452
|
+
"x0": element.x0 * scale,
|
453
|
+
"y0": element.top * scale,
|
454
|
+
"x1": element.x1 * scale,
|
455
|
+
"y1": element.bottom * scale,
|
456
|
+
"width": element.width * scale,
|
457
|
+
"height": element.height * scale,
|
458
|
+
}
|
625
459
|
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
"fontname",
|
630
|
-
"size",
|
631
|
-
"bold",
|
632
|
-
"italic",
|
633
|
-
"color",
|
634
|
-
"linewidth", # For lines (pdfplumber uses 'linewidth')
|
635
|
-
"is_horizontal",
|
636
|
-
"is_vertical", # For lines
|
637
|
-
"source",
|
638
|
-
"confidence", # For text/OCR
|
639
|
-
"label", # Common for layout elements
|
640
|
-
"model", # Add the model name (engine)
|
641
|
-
# Add any other common properties you expect from your elements
|
642
|
-
"upright",
|
643
|
-
"direction", # from pdfplumber chars/words
|
644
|
-
]
|
645
|
-
|
646
|
-
for i, element in enumerate(filtered_page_elements):
|
647
|
-
# Get original coordinates and calculated width/height (always present via base class)
|
648
|
-
# Assuming 'element' is always an object with these attributes now
|
649
|
-
original_x0 = element.x0
|
650
|
-
original_y0 = element.top
|
651
|
-
original_x1 = element.x1
|
652
|
-
original_y1 = element.bottom
|
653
|
-
width = element.width
|
654
|
-
height = element.height
|
655
|
-
current_element_type = element.type # Direct attribute access
|
656
|
-
scale = 1.0
|
657
|
-
|
658
|
-
# Base element dict with required info
|
659
|
-
elem_dict = {
|
660
|
-
"id": i,
|
661
|
-
# Use the standardized .type property
|
662
|
-
"type": current_element_type,
|
663
|
-
# Scaled coordinates for positioning in HTML/SVG
|
664
|
-
"x0": original_x0 * scale,
|
665
|
-
"y0": original_y0 * scale,
|
666
|
-
"x1": original_x1 * scale,
|
667
|
-
"y1": original_y1 * scale,
|
668
|
-
"width": width * scale,
|
669
|
-
"height": height * scale,
|
670
|
-
}
|
460
|
+
# Get Default and User-Requested Attributes
|
461
|
+
attributes_found = set()
|
462
|
+
all_attrs_to_check = default_attributes_to_get + (include_attributes or [])
|
671
463
|
|
672
|
-
|
673
|
-
attributes_found = set()
|
674
|
-
for attr_name in default_attributes_to_get:
|
675
|
-
# Assuming 'element' is always an object
|
676
|
-
if hasattr(element, attr_name):
|
677
|
-
try:
|
678
|
-
value_to_process = getattr(element, attr_name)
|
679
|
-
# Convert non-JSON serializable types to string
|
680
|
-
processed_value = value_to_process
|
681
|
-
if (
|
682
|
-
not isinstance(
|
683
|
-
value_to_process, (str, int, float, bool, list, dict, tuple)
|
684
|
-
)
|
685
|
-
and value_to_process is not None
|
686
|
-
):
|
687
|
-
processed_value = str(value_to_process)
|
688
|
-
elem_dict[attr_name] = processed_value
|
689
|
-
attributes_found.add(attr_name)
|
690
|
-
except Exception as e:
|
691
|
-
logger.warning(
|
692
|
-
f"Could not get or process default attribute '{attr_name}' for element {i} ({current_element_type}): {e}"
|
693
|
-
)
|
694
|
-
|
695
|
-
# --- Get User-Requested Attributes (if any) --- #
|
696
|
-
if include_attributes:
|
697
|
-
for attr_name in include_attributes:
|
698
|
-
# Only process if not already added and exists
|
464
|
+
for attr_name in all_attrs_to_check:
|
699
465
|
if attr_name not in attributes_found and hasattr(element, attr_name):
|
700
466
|
try:
|
701
|
-
|
702
|
-
|
703
|
-
if (
|
704
|
-
|
705
|
-
value_to_process, (str, int, float, bool, list, dict, tuple)
|
706
|
-
)
|
707
|
-
and value_to_process is not None
|
467
|
+
value = getattr(element, attr_name)
|
468
|
+
# Ensure value is JSON serializable
|
469
|
+
if not isinstance(
|
470
|
+
value, (str, int, float, bool, list, dict, type(None))
|
708
471
|
):
|
709
|
-
|
710
|
-
elem_dict[attr_name] =
|
472
|
+
value = str(value)
|
473
|
+
elem_dict[attr_name] = value
|
474
|
+
attributes_found.add(attr_name)
|
711
475
|
except Exception as e:
|
712
476
|
logger.warning(
|
713
|
-
f"Could not get
|
477
|
+
f"Could not get attribute '{attr_name}' for element {i}: {e}"
|
714
478
|
)
|
715
|
-
for attr_name_val in elem_dict: # Renamed to avoid conflict
|
716
|
-
if isinstance(elem_dict[attr_name_val], float):
|
717
|
-
elem_dict[attr_name_val] = round(elem_dict[attr_name_val], 2)
|
718
|
-
elements.append(elem_dict)
|
719
479
|
|
720
|
-
|
721
|
-
|
722
|
-
|
480
|
+
# Round float values for cleaner display
|
481
|
+
for key, val in elem_dict.items():
|
482
|
+
if isinstance(val, float):
|
483
|
+
elem_dict[key] = round(val, 2)
|
723
484
|
|
724
|
-
|
725
|
-
# The actual JSON conversion happens when the data is sent to the frontend
|
726
|
-
return cls(image_uri=image_uri, elements=elements)
|
485
|
+
elements.append(elem_dict)
|
727
486
|
|
728
|
-
|
729
|
-
|
730
|
-
|
731
|
-
|
732
|
-
|
733
|
-
|
734
|
-
|
735
|
-
|
736
|
-
|
737
|
-
|
738
|
-
|
739
|
-
|
740
|
-
|
741
|
-
|
742
|
-
|
743
|
-
|
744
|
-
|
745
|
-
|
746
|
-
|
747
|
-
|
748
|
-
|
749
|
-
|
750
|
-
|
751
|
-
|
752
|
-
# Can add Python-side logic here if needed when elements change
|
753
|
-
# print(f"Python: Elements traitlet changed. New count: {len(change['new'])}")
|
754
|
-
pass
|
487
|
+
viewer_data = {"page_image": image_uri, "elements": elements}
|
488
|
+
# --- End of restored logic ---
|
489
|
+
|
490
|
+
# Set the callback if provided
|
491
|
+
if on_element_click:
|
492
|
+
cls._on_element_click_callback = on_element_click
|
493
|
+
|
494
|
+
return cls(pdf_data=viewer_data)
|
495
|
+
|
496
|
+
except Exception as e:
|
497
|
+
logger.error(f"Failed to create viewer from page: {e}", exc_info=True)
|
498
|
+
return None
|
499
|
+
|
500
|
+
# Static callback storage and handler
|
501
|
+
_on_element_click_callback = None
|
502
|
+
|
503
|
+
@staticmethod
|
504
|
+
def _handle_element_click(element_data):
|
505
|
+
"""Static method to handle element click events from JavaScript."""
|
506
|
+
if InteractiveViewerWidget._on_element_click_callback:
|
507
|
+
try:
|
508
|
+
InteractiveViewerWidget._on_element_click_callback(element_data)
|
509
|
+
except Exception as e:
|
510
|
+
logger.error(f"Error in element click callback: {e}", exc_info=True)
|
755
511
|
|
756
512
|
except ImportError:
|
513
|
+
# This block runs if 'ipywidgets' is not installed
|
757
514
|
logger.info(
|
758
515
|
"Optional dependency 'ipywidgets' not found. Interactive viewer widgets will not be defined."
|
759
516
|
)
|
760
|
-
# Ensure
|
761
|
-
|
762
|
-
|
517
|
+
# Ensure flag is False if the import fails for any reason
|
518
|
+
_IPYWIDGETS_AVAILABLE = False
|
519
|
+
except Exception as e:
|
520
|
+
# Catch other potential errors during widget definition
|
521
|
+
logger.error(f"An unexpected error occurred while defining viewer widgets: {e}", exc_info=True)
|
763
522
|
_IPYWIDGETS_AVAILABLE = False # Explicitly set flag to False here too
|
764
|
-
|
765
|
-
# Example usage - kept outside the try/except as comments
|
766
|
-
# ... (existing example usage comments) ...
|