natural-pdf 0.1.8__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +230 -151
- natural_pdf/classification/mixin.py +49 -35
- natural_pdf/classification/results.py +64 -46
- natural_pdf/collections/mixins.py +68 -20
- natural_pdf/collections/pdf_collection.py +177 -64
- natural_pdf/core/element_manager.py +30 -14
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +423 -101
- natural_pdf/core/pdf.py +633 -190
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +503 -131
- natural_pdf/elements/region.py +659 -90
- natural_pdf/elements/text.py +1 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +4 -3
- natural_pdf/extraction/manager.py +50 -49
- natural_pdf/extraction/mixin.py +90 -57
- natural_pdf/extraction/result.py +9 -23
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +61 -25
- natural_pdf/ocr/ocr_options.py +70 -10
- natural_pdf/ocr/utils.py +6 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +219 -143
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +1 -1
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +24 -16
- natural_pdf/utils/tqdm_utils.py +18 -10
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +12 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/categorizing-documents/index.md +0 -168
- docs/data-extraction/index.md +0 -87
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -969
- docs/element-selection/index.md +0 -249
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -189
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -256
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -417
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -152
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -119
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -275
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -337
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -293
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -414
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -513
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2439
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -517
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -3712
- docs/tutorials/12-ocr-integration.md +0 -137
- docs/tutorials/13-semantic-search.ipynb +0 -1718
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.8.dist-info/RECORD +0 -156
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -1,13 +1,28 @@
|
|
1
|
+
import concurrent.futures # Import concurrent.futures
|
1
2
|
import copy # Added for copying options
|
2
3
|
import glob as py_glob
|
3
4
|
import logging
|
4
5
|
import os
|
5
6
|
import re # Added for safe path generation
|
7
|
+
import threading # Import threading for logging thread information
|
8
|
+
import time # Import time for logging timestamps
|
6
9
|
from pathlib import Path
|
7
|
-
from typing import
|
8
|
-
|
9
|
-
|
10
|
-
|
10
|
+
from typing import (
|
11
|
+
TYPE_CHECKING,
|
12
|
+
Any,
|
13
|
+
Callable,
|
14
|
+
Dict,
|
15
|
+
Generic,
|
16
|
+
Iterable,
|
17
|
+
Iterator,
|
18
|
+
List,
|
19
|
+
Optional,
|
20
|
+
Set,
|
21
|
+
Type,
|
22
|
+
TypeVar,
|
23
|
+
Union,
|
24
|
+
overload,
|
25
|
+
)
|
11
26
|
|
12
27
|
from PIL import Image
|
13
28
|
from tqdm import tqdm
|
@@ -26,6 +41,7 @@ logger = logging.getLogger(__name__)
|
|
26
41
|
|
27
42
|
from natural_pdf.core.pdf import PDF
|
28
43
|
from natural_pdf.elements.region import Region
|
44
|
+
from natural_pdf.export.mixin import ExportMixin
|
29
45
|
|
30
46
|
# --- Search Imports ---
|
31
47
|
try:
|
@@ -47,12 +63,12 @@ except ImportError as e:
|
|
47
63
|
|
48
64
|
SearchServiceProtocol, SearchOptions, Indexable = object, object, object
|
49
65
|
|
50
|
-
from natural_pdf.search.searchable_mixin import SearchableMixin # Import the new mixin
|
51
66
|
# Import the ApplyMixin
|
52
67
|
from natural_pdf.collections.mixins import ApplyMixin
|
68
|
+
from natural_pdf.search.searchable_mixin import SearchableMixin # Import the new mixin
|
53
69
|
|
54
70
|
|
55
|
-
class PDFCollection(SearchableMixin, ApplyMixin): #
|
71
|
+
class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin): # Add ExportMixin
|
56
72
|
def __init__(
|
57
73
|
self,
|
58
74
|
source: Union[str, Iterable[Union[str, "PDF"]]],
|
@@ -252,54 +268,83 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
|
|
252
268
|
def __repr__(self) -> str:
|
253
269
|
# Removed search status
|
254
270
|
return f"<PDFCollection(count={len(self._pdfs)})>"
|
271
|
+
return f"<PDFCollection(count={len(self._pdfs)})>"
|
255
272
|
|
256
273
|
@property
|
257
274
|
def pdfs(self) -> List["PDF"]:
|
258
275
|
"""Returns the list of PDF objects held by the collection."""
|
259
276
|
return self._pdfs
|
260
277
|
|
278
|
+
@overload
|
279
|
+
def find_all(
|
280
|
+
self,
|
281
|
+
*,
|
282
|
+
text: str,
|
283
|
+
apply_exclusions: bool = True,
|
284
|
+
regex: bool = False,
|
285
|
+
case: bool = True,
|
286
|
+
**kwargs,
|
287
|
+
) -> "ElementCollection": ...
|
288
|
+
|
289
|
+
@overload
|
290
|
+
def find_all(
|
291
|
+
self,
|
292
|
+
selector: str,
|
293
|
+
*,
|
294
|
+
apply_exclusions: bool = True,
|
295
|
+
regex: bool = False,
|
296
|
+
case: bool = True,
|
297
|
+
**kwargs,
|
298
|
+
) -> "ElementCollection": ...
|
299
|
+
|
261
300
|
def find_all(
|
262
|
-
self,
|
263
|
-
selector: str,
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
301
|
+
self,
|
302
|
+
selector: Optional[str] = None, # Now optional
|
303
|
+
*,
|
304
|
+
text: Optional[str] = None, # New text parameter
|
305
|
+
apply_exclusions: bool = True,
|
306
|
+
regex: bool = False,
|
307
|
+
case: bool = True,
|
308
|
+
**kwargs,
|
268
309
|
) -> "ElementCollection":
|
269
310
|
"""
|
270
|
-
Find all elements matching the selector across all PDFs in the collection.
|
271
|
-
|
311
|
+
Find all elements matching the selector OR text across all PDFs in the collection.
|
312
|
+
|
313
|
+
Provide EITHER `selector` OR `text`, but not both.
|
314
|
+
|
272
315
|
This creates an ElementCollection that can span multiple PDFs. Note that
|
273
316
|
some ElementCollection methods have limitations when spanning PDFs.
|
274
|
-
|
317
|
+
|
275
318
|
Args:
|
276
|
-
selector: CSS-like selector string to query elements
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
319
|
+
selector: CSS-like selector string to query elements.
|
320
|
+
text: Text content to search for (equivalent to 'text:contains(...)').
|
321
|
+
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
322
|
+
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
323
|
+
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
324
|
+
**kwargs: Additional keyword arguments passed to the find_all method of each PDF.
|
325
|
+
|
282
326
|
Returns:
|
283
|
-
ElementCollection containing all matching elements across all PDFs
|
327
|
+
ElementCollection containing all matching elements across all PDFs.
|
284
328
|
"""
|
285
|
-
|
286
|
-
|
329
|
+
# Validation happens within pdf.find_all
|
330
|
+
|
287
331
|
# Collect elements from all PDFs
|
288
332
|
all_elements = []
|
289
333
|
for pdf in self._pdfs:
|
290
334
|
try:
|
291
|
-
#
|
335
|
+
# Pass the relevant arguments down to each PDF's find_all
|
292
336
|
elements = pdf.find_all(
|
293
|
-
selector,
|
337
|
+
selector=selector,
|
338
|
+
text=text,
|
294
339
|
apply_exclusions=apply_exclusions,
|
295
340
|
regex=regex,
|
296
341
|
case=case,
|
297
|
-
**kwargs
|
342
|
+
**kwargs,
|
298
343
|
)
|
299
344
|
all_elements.extend(elements.elements)
|
300
345
|
except Exception as e:
|
301
346
|
logger.error(f"Error finding elements in {pdf.path}: {e}", exc_info=True)
|
302
|
-
|
347
|
+
|
303
348
|
return ElementCollection(all_elements)
|
304
349
|
|
305
350
|
def apply_ocr(
|
@@ -330,24 +375,26 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
|
|
330
375
|
replace: If True, replace existing OCR elements
|
331
376
|
options: Engine-specific options
|
332
377
|
pages: Specific pages to process (None for all pages)
|
333
|
-
max_workers: Maximum number of threads to process PDFs concurrently.
|
378
|
+
max_workers: Maximum number of threads to process PDFs concurrently.
|
334
379
|
If None or 1, processing is sequential. (default: None)
|
335
380
|
|
336
381
|
Returns:
|
337
382
|
Self for method chaining
|
338
383
|
"""
|
339
384
|
PDF = self._get_pdf_class()
|
340
|
-
logger.info(
|
385
|
+
logger.info(
|
386
|
+
f"Applying OCR to {len(self._pdfs)} PDFs in collection (max_workers={max_workers})..."
|
387
|
+
)
|
341
388
|
|
342
389
|
# Worker function takes PDF object again
|
343
390
|
def _process_pdf(pdf: PDF):
|
344
391
|
"""Helper function to apply OCR to a single PDF, handling errors."""
|
345
|
-
thread_id = threading.current_thread().name
|
346
|
-
pdf_path = pdf.path
|
392
|
+
thread_id = threading.current_thread().name # Get thread name for logging
|
393
|
+
pdf_path = pdf.path # Get path for logging
|
347
394
|
logger.debug(f"[{thread_id}] Starting OCR process for: {pdf_path}")
|
348
395
|
start_time = time.monotonic()
|
349
396
|
try:
|
350
|
-
pdf.apply_ocr(
|
397
|
+
pdf.apply_ocr( # Call apply_ocr on the original PDF object
|
351
398
|
pages=pages,
|
352
399
|
engine=engine,
|
353
400
|
languages=languages,
|
@@ -362,17 +409,24 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
|
|
362
409
|
# For now, PDF.apply_ocr doesn't have it.
|
363
410
|
)
|
364
411
|
end_time = time.monotonic()
|
365
|
-
logger.debug(
|
412
|
+
logger.debug(
|
413
|
+
f"[{thread_id}] Finished OCR process for: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
|
414
|
+
)
|
366
415
|
return pdf_path, None
|
367
416
|
except Exception as e:
|
368
417
|
end_time = time.monotonic()
|
369
|
-
logger.error(
|
370
|
-
|
418
|
+
logger.error(
|
419
|
+
f"[{thread_id}] Failed OCR process for {pdf_path} after {end_time - start_time:.2f}s: {e}",
|
420
|
+
exc_info=False,
|
421
|
+
)
|
422
|
+
return pdf_path, e # Return path and error
|
371
423
|
|
372
424
|
# Use ThreadPoolExecutor for parallel processing if max_workers > 1
|
373
425
|
if max_workers is not None and max_workers > 1:
|
374
426
|
futures = []
|
375
|
-
with concurrent.futures.ThreadPoolExecutor(
|
427
|
+
with concurrent.futures.ThreadPoolExecutor(
|
428
|
+
max_workers=max_workers, thread_name_prefix="OCRWorker"
|
429
|
+
) as executor:
|
376
430
|
for pdf in self._pdfs:
|
377
431
|
# Submit the PDF object to the worker function
|
378
432
|
futures.append(executor.submit(_process_pdf, pdf))
|
@@ -382,22 +436,22 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
|
|
382
436
|
concurrent.futures.as_completed(futures),
|
383
437
|
total=len(self._pdfs),
|
384
438
|
desc="Applying OCR (Parallel)",
|
385
|
-
unit="pdf"
|
439
|
+
unit="pdf",
|
386
440
|
)
|
387
|
-
|
441
|
+
|
388
442
|
for future in progress_bar:
|
389
|
-
pdf_path, error = future.result()
|
443
|
+
pdf_path, error = future.result() # Get result (or exception)
|
390
444
|
if error:
|
391
445
|
progress_bar.set_postfix_str(f"Error: {pdf_path}", refresh=True)
|
392
446
|
# Progress is updated automatically by tqdm
|
393
447
|
|
394
|
-
else:
|
448
|
+
else: # Sequential processing (max_workers is None or 1)
|
395
449
|
logger.info("Applying OCR sequentially...")
|
396
450
|
# Use the selected tqdm class for sequential too for consistency
|
397
451
|
# Iterate over PDF objects directly for sequential
|
398
452
|
for pdf in tqdm(self._pdfs, desc="Applying OCR (Sequential)", unit="pdf"):
|
399
|
-
_process_pdf(pdf)
|
400
|
-
|
453
|
+
_process_pdf(pdf) # Call helper directly with PDF object
|
454
|
+
|
401
455
|
logger.info("Finished applying OCR across the collection.")
|
402
456
|
return self
|
403
457
|
|
@@ -421,7 +475,7 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
|
|
421
475
|
Returns:
|
422
476
|
Self for method chaining.
|
423
477
|
"""
|
424
|
-
PDF = self._get_pdf_class()
|
478
|
+
PDF = self._get_pdf_class() # Ensure PDF class is available
|
425
479
|
if not callable(correction_callback):
|
426
480
|
raise TypeError("`correction_callback` must be a callable function.")
|
427
481
|
|
@@ -436,7 +490,9 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
|
|
436
490
|
return self
|
437
491
|
|
438
492
|
total_elements = len(all_ocr_elements)
|
439
|
-
logger.info(
|
493
|
+
logger.info(
|
494
|
+
f"Found {total_elements} OCR elements across the collection. Starting correction process..."
|
495
|
+
)
|
440
496
|
|
441
497
|
# 2. Initialize the progress bar
|
442
498
|
progress_bar = tqdm(total=total_elements, desc="Correcting OCR Elements", unit="element")
|
@@ -450,11 +506,14 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
|
|
450
506
|
pdf.correct_ocr(
|
451
507
|
correction_callback=correction_callback,
|
452
508
|
max_workers=max_workers,
|
453
|
-
progress_callback=progress_bar.update
|
509
|
+
progress_callback=progress_bar.update, # Pass the bar's update method
|
454
510
|
)
|
455
511
|
except Exception as e:
|
456
|
-
|
457
|
-
|
512
|
+
logger.error(
|
513
|
+
f"Error occurred during correction process for PDF {pdf.path}: {e}",
|
514
|
+
exc_info=True,
|
515
|
+
)
|
516
|
+
# Decide if we should stop or continue? For now, continue.
|
458
517
|
|
459
518
|
progress_bar.close()
|
460
519
|
|
@@ -544,7 +603,9 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
|
|
544
603
|
if not categories:
|
545
604
|
raise ValueError("Categories list cannot be empty.")
|
546
605
|
|
547
|
-
logger.info(
|
606
|
+
logger.info(
|
607
|
+
f"Starting classification for {len(self._pdfs)} PDFs in collection (model: '{model}')..."
|
608
|
+
)
|
548
609
|
|
549
610
|
# Calculate total pages for the progress bar
|
550
611
|
total_pages = sum(len(pdf.pages) for pdf in self._pdfs if pdf.pages)
|
@@ -553,9 +614,7 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
|
|
553
614
|
return self
|
554
615
|
|
555
616
|
progress_bar = tqdm(
|
556
|
-
total=total_pages,
|
557
|
-
desc=f"Classifying Pages (model: {model})",
|
558
|
-
unit="page"
|
617
|
+
total=total_pages, desc=f"Classifying Pages (model: {model})", unit="page"
|
559
618
|
)
|
560
619
|
|
561
620
|
# Worker function
|
@@ -570,15 +629,20 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
|
|
570
629
|
categories=categories,
|
571
630
|
model=model,
|
572
631
|
progress_callback=progress_bar.update,
|
573
|
-
**kwargs
|
632
|
+
**kwargs,
|
574
633
|
)
|
575
634
|
end_time = time.monotonic()
|
576
|
-
logger.debug(
|
577
|
-
|
635
|
+
logger.debug(
|
636
|
+
f"[{thread_id}] Finished classification for: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
|
637
|
+
)
|
638
|
+
return pdf_path, None # Return path and no error
|
578
639
|
except Exception as e:
|
579
640
|
end_time = time.monotonic()
|
580
641
|
# Error is logged within classify_pages, but log summary here
|
581
|
-
logger.error(
|
642
|
+
logger.error(
|
643
|
+
f"[{thread_id}] Failed classification process for {pdf_path} after {end_time - start_time:.2f}s: {e}",
|
644
|
+
exc_info=False,
|
645
|
+
)
|
582
646
|
# Close progress bar immediately on error to avoid hanging
|
583
647
|
progress_bar.close()
|
584
648
|
# Re-raise the exception to stop the entire collection processing
|
@@ -589,16 +653,18 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
|
|
589
653
|
if max_workers is not None and max_workers > 1:
|
590
654
|
logger.info(f"Classifying PDFs in parallel with {max_workers} workers.")
|
591
655
|
futures = []
|
592
|
-
with concurrent.futures.ThreadPoolExecutor(
|
656
|
+
with concurrent.futures.ThreadPoolExecutor(
|
657
|
+
max_workers=max_workers, thread_name_prefix="ClassifyWorker"
|
658
|
+
) as executor:
|
593
659
|
for pdf in self._pdfs:
|
594
660
|
futures.append(executor.submit(_process_pdf_classification, pdf))
|
595
661
|
|
596
662
|
# Wait for all futures to complete (progress updated by callback)
|
597
663
|
# Exceptions are raised by future.result() if worker failed
|
598
664
|
for future in concurrent.futures.as_completed(futures):
|
599
|
-
|
665
|
+
future.result() # Raise exception if worker failed
|
600
666
|
|
601
|
-
else:
|
667
|
+
else: # Sequential processing
|
602
668
|
logger.info("Classifying PDFs sequentially.")
|
603
669
|
for pdf in self._pdfs:
|
604
670
|
_process_pdf_classification(pdf)
|
@@ -606,12 +672,59 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
|
|
606
672
|
logger.info("Finished classification across the collection.")
|
607
673
|
|
608
674
|
finally:
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
675
|
+
# Ensure progress bar is closed even if errors occurred elsewhere
|
676
|
+
if not progress_bar.disable and progress_bar.n < progress_bar.total:
|
677
|
+
progress_bar.close()
|
678
|
+
elif progress_bar.disable is False:
|
679
|
+
progress_bar.close()
|
614
680
|
|
615
681
|
return self
|
616
682
|
|
617
683
|
# --- End Classification Method --- #
|
684
|
+
|
685
|
+
def _gather_analysis_data(
|
686
|
+
self,
|
687
|
+
analysis_keys: List[str],
|
688
|
+
include_content: bool,
|
689
|
+
include_images: bool,
|
690
|
+
image_dir: Optional[Path],
|
691
|
+
image_format: str,
|
692
|
+
image_resolution: int,
|
693
|
+
) -> List[Dict[str, Any]]:
|
694
|
+
"""
|
695
|
+
Gather analysis data from all PDFs in the collection.
|
696
|
+
|
697
|
+
Args:
|
698
|
+
analysis_keys: Keys in the analyses dictionary to export
|
699
|
+
include_content: Whether to include extracted text
|
700
|
+
include_images: Whether to export images
|
701
|
+
image_dir: Directory to save images
|
702
|
+
image_format: Format to save images
|
703
|
+
image_resolution: Resolution for exported images
|
704
|
+
|
705
|
+
Returns:
|
706
|
+
List of dictionaries containing analysis data
|
707
|
+
"""
|
708
|
+
if not self._pdfs:
|
709
|
+
logger.warning("No PDFs found in collection")
|
710
|
+
return []
|
711
|
+
|
712
|
+
all_data = []
|
713
|
+
|
714
|
+
for pdf in tqdm(self._pdfs, desc="Gathering PDF data", leave=False):
|
715
|
+
# PDF level data
|
716
|
+
pdf_data = {
|
717
|
+
"pdf_path": pdf.path,
|
718
|
+
"pdf_filename": Path(pdf.path).name,
|
719
|
+
"total_pages": len(pdf.pages) if hasattr(pdf, "pages") else 0,
|
720
|
+
}
|
721
|
+
|
722
|
+
# Add metadata if available
|
723
|
+
if hasattr(pdf, "metadata") and pdf.metadata:
|
724
|
+
for k, v in pdf.metadata.items():
|
725
|
+
if v: # Only add non-empty metadata
|
726
|
+
pdf_data[f"metadata.{k}"] = str(v)
|
727
|
+
|
728
|
+
all_data.append(pdf_data)
|
729
|
+
|
730
|
+
return all_data
|
@@ -544,56 +544,56 @@ class ElementManager:
|
|
544
544
|
"""
|
545
545
|
Remove all elements with source="ocr" from the elements dictionary.
|
546
546
|
This should be called before adding new OCR elements if replacement is desired.
|
547
|
-
|
547
|
+
|
548
548
|
Returns:
|
549
549
|
int: Number of OCR elements removed
|
550
550
|
"""
|
551
551
|
# Load elements if not already loaded
|
552
552
|
self.load_elements()
|
553
|
-
|
553
|
+
|
554
554
|
removed_count = 0
|
555
|
-
|
555
|
+
|
556
556
|
# Filter out OCR elements from words
|
557
557
|
if "words" in self._elements:
|
558
558
|
original_len = len(self._elements["words"])
|
559
559
|
self._elements["words"] = [
|
560
|
-
word for word in self._elements["words"]
|
561
|
-
if getattr(word, "source", None) != "ocr"
|
560
|
+
word for word in self._elements["words"] if getattr(word, "source", None) != "ocr"
|
562
561
|
]
|
563
562
|
removed_count += original_len - len(self._elements["words"])
|
564
|
-
|
563
|
+
|
565
564
|
# Filter out OCR elements from chars
|
566
565
|
if "chars" in self._elements:
|
567
566
|
original_len = len(self._elements["chars"])
|
568
567
|
self._elements["chars"] = [
|
569
|
-
char
|
570
|
-
|
571
|
-
|
568
|
+
char
|
569
|
+
for char in self._elements["chars"]
|
570
|
+
if (isinstance(char, dict) and char.get("source") != "ocr")
|
571
|
+
or (not isinstance(char, dict) and getattr(char, "source", None) != "ocr")
|
572
572
|
]
|
573
573
|
removed_count += original_len - len(self._elements["chars"])
|
574
|
-
|
574
|
+
|
575
575
|
logger.info(f"Page {self._page.number}: Removed {removed_count} OCR elements.")
|
576
576
|
return removed_count
|
577
577
|
|
578
578
|
def remove_element(self, element, element_type="words"):
|
579
579
|
"""
|
580
580
|
Remove a specific element from the managed elements.
|
581
|
-
|
581
|
+
|
582
582
|
Args:
|
583
583
|
element: The element to remove
|
584
584
|
element_type: The type of element ('words', 'chars', etc.)
|
585
|
-
|
585
|
+
|
586
586
|
Returns:
|
587
587
|
bool: True if removed successfully, False otherwise
|
588
588
|
"""
|
589
589
|
# Load elements if not already loaded
|
590
590
|
self.load_elements()
|
591
|
-
|
591
|
+
|
592
592
|
# Check if the collection exists
|
593
593
|
if element_type not in self._elements:
|
594
594
|
logger.warning(f"Cannot remove element: collection '{element_type}' does not exist")
|
595
595
|
return False
|
596
|
-
|
596
|
+
|
597
597
|
# Try to remove the element
|
598
598
|
try:
|
599
599
|
if element in self._elements[element_type]:
|
@@ -606,3 +606,19 @@ class ElementManager:
|
|
606
606
|
except Exception as e:
|
607
607
|
logger.error(f"Error removing element from {element_type}: {e}", exc_info=True)
|
608
608
|
return False
|
609
|
+
|
610
|
+
def has_elements(self) -> bool:
|
611
|
+
"""
|
612
|
+
Check if any significant elements (words, rects, lines, regions)
|
613
|
+
have been loaded or added.
|
614
|
+
|
615
|
+
Returns:
|
616
|
+
True if any elements exist, False otherwise.
|
617
|
+
"""
|
618
|
+
self.load_elements()
|
619
|
+
|
620
|
+
for key in ["words", "rects", "lines", "regions"]:
|
621
|
+
if self._elements.get(key):
|
622
|
+
return True
|
623
|
+
|
624
|
+
return False
|
@@ -18,7 +18,12 @@ except ImportError:
|
|
18
18
|
Page = Any # Fallback if circular import issue arises during type checking
|
19
19
|
|
20
20
|
# Import ColorManager and related utils
|
21
|
-
from natural_pdf.utils.visualization import
|
21
|
+
from natural_pdf.utils.visualization import (
|
22
|
+
ColorManager,
|
23
|
+
create_legend,
|
24
|
+
merge_images_with_legend,
|
25
|
+
render_plain_page,
|
26
|
+
)
|
22
27
|
|
23
28
|
# Constants for drawing (Can be potentially moved to ColorManager/Renderer if desired)
|
24
29
|
BORDER_ALPHA = 180 # Default alpha for highlight border
|
@@ -622,28 +627,14 @@ class HighlightingService:
|
|
622
627
|
return None
|
623
628
|
|
624
629
|
page = self._pdf[page_index]
|
625
|
-
highlights_on_page = self.get_highlights_for_page(
|
626
|
-
page_index
|
627
|
-
) # This list will be empty if clear_page was called
|
630
|
+
highlights_on_page = self.get_highlights_for_page(page_index)
|
628
631
|
|
629
|
-
|
630
|
-
|
631
|
-
|
632
|
-
|
633
|
-
|
634
|
-
|
635
|
-
png_data = img_object._repr_png_()
|
636
|
-
if png_data:
|
637
|
-
base_image = Image.open(io.BytesIO(png_data)).convert("RGB")
|
638
|
-
else:
|
639
|
-
raise ValueError("Could not extract base PIL image from pdfplumber.")
|
640
|
-
base_image = base_image.convert("RGBA")
|
641
|
-
logger.debug(
|
642
|
-
f"Base image for page {page_index} rendered with resolution {render_resolution}."
|
643
|
-
)
|
644
|
-
except Exception as e:
|
645
|
-
logger.error(f"Failed to render base image for page {page_index}: {e}", exc_info=True)
|
646
|
-
return None
|
632
|
+
render_resolution = resolution if resolution is not None else scale * 72
|
633
|
+
base_image = render_plain_page(page, render_resolution)
|
634
|
+
base_image = base_image.convert("RGBA")
|
635
|
+
logger.debug(
|
636
|
+
f"Base image for page {page_index} rendered with resolution {render_resolution}."
|
637
|
+
)
|
647
638
|
|
648
639
|
# --- Render Highlights ---
|
649
640
|
rendered_image: Image.Image
|