natural-pdf 0.1.7__py3-none-any.whl → 0.1.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +3 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +422 -0
- natural_pdf/classification/mixin.py +163 -0
- natural_pdf/classification/results.py +80 -0
- natural_pdf/collections/mixins.py +111 -0
- natural_pdf/collections/pdf_collection.py +434 -15
- natural_pdf/core/element_manager.py +83 -0
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +578 -93
- natural_pdf/core/pdf.py +912 -460
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +712 -109
- natural_pdf/elements/region.py +722 -69
- natural_pdf/elements/text.py +4 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +5 -4
- natural_pdf/extraction/manager.py +135 -0
- natural_pdf/extraction/mixin.py +279 -0
- natural_pdf/extraction/result.py +23 -0
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/engine_easyocr.py +6 -3
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +122 -26
- natural_pdf/ocr/ocr_options.py +94 -11
- natural_pdf/ocr/utils.py +19 -6
- natural_pdf/qa/document_qa.py +0 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +431 -230
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +8 -0
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +60 -1
- natural_pdf/utils/tqdm_utils.py +51 -0
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/METADATA +17 -3
- natural_pdf-0.1.9.dist-info/RECORD +80 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -915
- docs/element-selection/index.md +0 -229
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -170
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -209
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -194
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -340
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -147
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -114
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -270
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -332
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -288
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -413
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -508
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2434
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -512
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -54
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -604
- docs/tutorials/12-ocr-integration.md +0 -175
- docs/tutorials/13-semantic-search.ipynb +0 -1328
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -415
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.7.dist-info/RECORD +0 -145
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.7.dist-info → natural_pdf-0.1.9.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,111 @@
|
|
1
|
+
import logging
|
2
|
+
from typing import Any, Callable, Iterable, TypeVar
|
3
|
+
|
4
|
+
from tqdm.auto import tqdm
|
5
|
+
|
6
|
+
logger = logging.getLogger(__name__)
|
7
|
+
|
8
|
+
T = TypeVar("T") # Generic type for items in the collection
|
9
|
+
|
10
|
+
|
11
|
+
class DirectionalCollectionMixin:
    """
    Mixin that fans directional calls out to every member of a collection.

    Every method forwards its keyword arguments to the same-named method of
    each element through ``self.apply`` and returns whatever ``apply``
    returns. The host class must therefore provide an ``apply`` method.
    """

    def below(self, **kwargs) -> "ElementCollection":
        """Call ``element.below(**kwargs)`` for each element in the collection."""

        def _below(element):
            return element.below(**kwargs)

        return self.apply(_below)

    def above(self, **kwargs) -> "ElementCollection":
        """Call ``element.above(**kwargs)`` for each element in the collection."""

        def _above(element):
            return element.above(**kwargs)

        return self.apply(_above)

    def left(self, **kwargs) -> "ElementCollection":
        """Call ``element.left(**kwargs)`` for each element in the collection."""

        def _left(element):
            return element.left(**kwargs)

        return self.apply(_left)

    def right(self, **kwargs) -> "ElementCollection":
        """Call ``element.right(**kwargs)`` for each element in the collection."""

        def _right(element):
            return element.right(**kwargs)

        return self.apply(_right)

    def expand(self, **kwargs) -> "ElementCollection":
        """Call ``element.expand(**kwargs)`` for each element in the collection."""

        def _expand(element):
            return element.expand(**kwargs)

        return self.apply(_expand)
|
35
|
+
|
36
|
+
|
37
|
+
class ApplyMixin:
    """
    Mixin class providing an `.apply()` method for collections.

    Assumes the inheriting class implements `__iter__` (used by the default
    `_get_items_for_apply`) and, when a progress bar is requested, `__len__`.
    """

    def _get_items_for_apply(self) -> Iterable[Any]:
        """
        Return the iterable of items that `apply` should process.

        Defaults to iterating over `self`. Subclasses should override this
        if the default iteration is not suitable for the apply operation.
        """
        return iter(self)

    def apply(self: Any, func: Callable[..., Any], *args, **kwargs) -> Iterable[Any]:
        """
        Apply a function to each item in the collection.

        Args:
            func: The function to apply to each item. The item itself
                  will be passed as the first argument to the function.
            *args: Additional positional arguments to pass to func.
            **kwargs: Additional keyword arguments to pass to func.
                      A special keyword argument 'show_progress' (bool, default=False)
                      can be used to display a progress bar; it is consumed here
                      and not forwarded to func.

        Returns:
            An ElementCollection, PDFCollection or PageCollection when the first
            non-None result is an Element/Region, PDF or Page respectively;
            otherwise the plain list of results (including when the list is
            empty or contains only None).
        """
        show_progress = kwargs.pop("show_progress", False)
        items_iterable = self._get_items_for_apply()

        if show_progress:
            # Not every callable has __name__ (e.g. functools.partial objects).
            func_name = getattr(func, "__name__", repr(func))
            try:
                total_items = len(self)
            except TypeError:  # __len__ might not be defined on self
                logger.warning("Could not determine collection length for progress bar.")
                total_items = 0

            if total_items > 0:
                items_iterable = tqdm(
                    items_iterable,
                    total=total_items,
                    desc=f"Applying {func_name}",
                    # Unit name is derived from the collection's class name.
                    unit=self.__class__.__name__.lower(),
                )
            else:
                logger.info(
                    f"Applying {func_name} (progress bar disabled for zero/unknown length)."
                )

        results = [func(item, *args, **kwargs) for item in items_iterable]

        # Nothing to infer a collection type from when there are no results or
        # every result is None; calling issubclass() with None would raise.
        first_non_none = next((r for r in results if r is not None), None)
        if first_non_none is None:
            return results

        # Import here to avoid circular imports
        from natural_pdf import PDF, Page
        from natural_pdf.collections.pdf_collection import PDFCollection
        from natural_pdf.elements.base import Element
        from natural_pdf.elements.collections import ElementCollection, PageCollection
        from natural_pdf.elements.region import Region

        first_type = type(first_non_none)

        # Return the appropriate collection based on result type (...generally)
        if issubclass(first_type, (Element, Region)):
            return ElementCollection(results)
        if first_type is PDF:
            return PDFCollection(results)
        if first_type is Page:
            return PageCollection(results)

        return results
|
@@ -1,19 +1,47 @@
|
|
1
|
+
import concurrent.futures # Import concurrent.futures
|
1
2
|
import copy # Added for copying options
|
2
3
|
import glob as py_glob
|
3
4
|
import logging
|
4
5
|
import os
|
5
6
|
import re # Added for safe path generation
|
7
|
+
import threading # Import threading for logging thread information
|
8
|
+
import time # Import time for logging timestamps
|
6
9
|
from pathlib import Path
|
7
|
-
from typing import
|
10
|
+
from typing import (
|
11
|
+
TYPE_CHECKING,
|
12
|
+
Any,
|
13
|
+
Callable,
|
14
|
+
Dict,
|
15
|
+
Generic,
|
16
|
+
Iterable,
|
17
|
+
Iterator,
|
18
|
+
List,
|
19
|
+
Optional,
|
20
|
+
Set,
|
21
|
+
Type,
|
22
|
+
TypeVar,
|
23
|
+
Union,
|
24
|
+
overload,
|
25
|
+
)
|
8
26
|
|
9
27
|
from PIL import Image
|
10
28
|
from tqdm import tqdm
|
29
|
+
from tqdm.auto import tqdm as auto_tqdm
|
30
|
+
from tqdm.notebook import tqdm as notebook_tqdm
|
31
|
+
|
32
|
+
from natural_pdf.utils.tqdm_utils import get_tqdm
|
33
|
+
|
34
|
+
# Get the appropriate tqdm class once
|
35
|
+
tqdm = get_tqdm()
|
11
36
|
|
12
37
|
# Set up logger early
|
38
|
+
# Configure logging to include thread information
|
39
|
+
# logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(threadName)s - %(name)s - %(levelname)s - %(message)s')
|
13
40
|
logger = logging.getLogger(__name__)
|
14
41
|
|
15
42
|
from natural_pdf.core.pdf import PDF
|
16
43
|
from natural_pdf.elements.region import Region
|
44
|
+
from natural_pdf.export.mixin import ExportMixin
|
17
45
|
|
18
46
|
# --- Search Imports ---
|
19
47
|
try:
|
@@ -35,10 +63,12 @@ except ImportError as e:
|
|
35
63
|
|
36
64
|
SearchServiceProtocol, SearchOptions, Indexable = object, object, object
|
37
65
|
|
66
|
+
# Import the ApplyMixin
|
67
|
+
from natural_pdf.collections.mixins import ApplyMixin
|
38
68
|
from natural_pdf.search.searchable_mixin import SearchableMixin # Import the new mixin
|
39
69
|
|
40
70
|
|
41
|
-
class PDFCollection(SearchableMixin): #
|
71
|
+
class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin): # Add ExportMixin
|
42
72
|
def __init__(
|
43
73
|
self,
|
44
74
|
source: Union[str, Iterable[Union[str, "PDF"]]],
|
@@ -237,30 +267,257 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
|
|
237
267
|
|
238
268
|
def __repr__(self) -> str:
|
239
269
|
# Removed search status
|
240
|
-
return f"<PDFCollection(count={len(self)})>"
|
270
|
+
return f"<PDFCollection(count={len(self._pdfs)})>"
|
271
|
+
return f"<PDFCollection(count={len(self._pdfs)})>"
|
241
272
|
|
242
273
|
@property
def pdfs(self) -> List["PDF"]:
    """Expose the collection's internal list of PDF objects (the list itself, not a copy)."""
    return self._pdfs
|
246
277
|
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
278
|
+
@overload
def find_all(
    self,
    *,
    text: str,
    apply_exclusions: bool = True,
    regex: bool = False,
    case: bool = True,
    **kwargs,
) -> "ElementCollection": ...

@overload
def find_all(
    self,
    selector: str,
    *,
    apply_exclusions: bool = True,
    regex: bool = False,
    case: bool = True,
    **kwargs,
) -> "ElementCollection": ...

def find_all(
    self,
    selector: Optional[str] = None,
    *,
    text: Optional[str] = None,
    apply_exclusions: bool = True,
    regex: bool = False,
    case: bool = True,
    **kwargs,
) -> "ElementCollection":
    """
    Query every PDF in the collection and merge the matches into one collection.

    Supply EITHER `selector` OR `text`, not both; the arguments are handed to
    each PDF's own `find_all` unchanged, so validation is left to that call.

    The returned ElementCollection may span several PDFs. Note that some
    ElementCollection methods have limitations when spanning PDFs.

    Args:
        selector: CSS-like selector string to query elements.
        text: Text content to search for (equivalent to 'text:contains(...)').
        apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
        regex: Whether to use regex for text search (`selector` or `text`) (default: False).
        case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
        **kwargs: Additional keyword arguments passed to the find_all method of each PDF.

    Returns:
        ElementCollection containing all matching elements across all PDFs.
        A PDF whose lookup raises is logged and contributes no elements.
    """
    # The same argument set is forwarded to every PDF.
    query = {
        "selector": selector,
        "text": text,
        "apply_exclusions": apply_exclusions,
        "regex": regex,
        "case": case,
        **kwargs,
    }

    matched = []
    for document in self._pdfs:
        try:
            found = document.find_all(**query)
            matched.extend(found.elements)
        except Exception as e:
            logger.error(f"Error finding elements in {document.path}: {e}", exc_info=True)

    return ElementCollection(matched)
|
349
|
+
|
350
|
+
def apply_ocr(
    self,
    engine: Optional[str] = None,
    languages: Optional[List[str]] = None,
    min_confidence: Optional[float] = None,
    device: Optional[str] = None,
    resolution: Optional[int] = None,
    apply_exclusions: bool = True,
    detect_only: bool = False,
    replace: bool = True,
    options: Optional[Any] = None,
    pages: Optional[Union[slice, List[int]]] = None,
    max_workers: Optional[int] = None,
) -> "PDFCollection":
    """
    Run OCR on every PDF in the collection, optionally using a thread pool.

    Each PDF is processed by calling its own `apply_ocr` with the arguments
    below. A failure on one PDF is logged and does not stop the others.

    Args:
        engine: OCR engine to use (e.g., 'easyocr', 'paddleocr', 'surya')
        languages: List of language codes for OCR
        min_confidence: Minimum confidence threshold for text detection
        device: Device to use for OCR (e.g., 'cpu', 'cuda')
        resolution: DPI resolution for page rendering
        apply_exclusions: Whether to apply exclusion regions
        detect_only: If True, only detect text regions without extracting text
        replace: If True, replace existing OCR elements
        options: Engine-specific options
        pages: Specific pages to process (None for all pages)
        max_workers: Maximum number of threads to process PDFs concurrently.
                     If None or 1, processing is sequential. (default: None)

    Returns:
        Self for method chaining
    """
    PDF = self._get_pdf_class()
    logger.info(
        f"Applying OCR to {len(self._pdfs)} PDFs in collection (max_workers={max_workers})..."
    )

    # Arguments forwarded verbatim to every PDF.apply_ocr call.
    # Note: PDF.apply_ocr is not given a max_workers of its own here.
    ocr_kwargs = dict(
        pages=pages,
        engine=engine,
        languages=languages,
        min_confidence=min_confidence,
        device=device,
        resolution=resolution,
        apply_exclusions=apply_exclusions,
        detect_only=detect_only,
        replace=replace,
        options=options,
    )

    def _process_pdf(pdf: PDF):
        """OCR one PDF; return (path, None) on success or (path, exception) on failure."""
        worker_name = threading.current_thread().name
        path = pdf.path
        logger.debug(f"[{worker_name}] Starting OCR process for: {path}")
        started = time.monotonic()
        try:
            pdf.apply_ocr(**ocr_kwargs)
            finished = time.monotonic()
            logger.debug(
                f"[{worker_name}] Finished OCR process for: {path} (Duration: {finished - started:.2f}s)"
            )
            return path, None
        except Exception as e:
            finished = time.monotonic()
            logger.error(
                f"[{worker_name}] Failed OCR process for {path} after {finished - started:.2f}s: {e}",
                exc_info=False,
            )
            return path, e

    run_in_parallel = max_workers is not None and max_workers > 1

    if not run_in_parallel:
        # Sequential processing (max_workers is None or 1)
        logger.info("Applying OCR sequentially...")
        for document in tqdm(self._pdfs, desc="Applying OCR (Sequential)", unit="pdf"):
            _process_pdf(document)
    else:
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=max_workers, thread_name_prefix="OCRWorker"
        ) as executor:
            futures = [executor.submit(_process_pdf, document) for document in self._pdfs]

            # as_completed yields futures as they finish, which drives the bar.
            progress_bar = tqdm(
                concurrent.futures.as_completed(futures),
                total=len(self._pdfs),
                desc="Applying OCR (Parallel)",
                unit="pdf",
            )

            for future in progress_bar:
                pdf_path, error = future.result()
                if error:
                    progress_bar.set_postfix_str(f"Error: {pdf_path}", refresh=True)

    logger.info("Finished applying OCR across the collection.")
    return self
|
261
457
|
|
262
|
-
|
263
|
-
|
458
|
+
def correct_ocr(
    self,
    correction_callback: Callable[[Any], Optional[str]],
    max_workers: Optional[int] = None,
    progress_callback: Optional[Callable[[], None]] = None,
) -> "PDFCollection":
    """
    Apply OCR correction to all relevant elements across all pages and PDFs
    in the collection using a single progress bar.

    Args:
        correction_callback: Function to apply to each OCR element.
                             It receives the element and should return
                             the corrected text (str) or None.
        max_workers: Max threads to use for parallel execution within each page.
        progress_callback: Optional callback function to call after processing each element.
                           It is invoked (with no arguments) in addition to
                           advancing the collection-wide progress bar.

    Returns:
        Self for method chaining.

    Raises:
        TypeError: If `correction_callback` is not callable.
    """
    self._get_pdf_class()  # Ensure PDF class is available
    if not callable(correction_callback):
        raise TypeError("`correction_callback` must be a callable function.")

    logger.info(f"Gathering OCR elements from {len(self._pdfs)} PDFs for correction...")

    # 1. Gather all target elements using the collection's find_all
    #    Crucially, set apply_exclusions=False to include elements in headers/footers etc.
    all_ocr_elements = self.find_all("text[source=ocr]", apply_exclusions=False).elements

    if not all_ocr_elements:
        logger.info("No OCR elements found in the collection to correct.")
        return self

    total_elements = len(all_ocr_elements)
    logger.info(
        f"Found {total_elements} OCR elements across the collection. Starting correction process..."
    )

    # 2. Initialize the progress bar
    progress_bar = tqdm(total=total_elements, desc="Correcting OCR Elements", unit="element")

    def _advance(*cb_args, **cb_kwargs):
        """Advance the shared bar, then notify the caller-supplied callback."""
        progress_bar.update(*cb_args, **cb_kwargs)
        progress_callback()

    # Without a user callback, hand over the bar's update method directly.
    on_progress = progress_bar.update if progress_callback is None else _advance

    # 3. Iterate through PDFs and delegate to PDF.correct_ocr, which receives
    #    the progress callback.
    try:
        for pdf in self._pdfs:
            if not pdf.pages:
                continue
            try:
                pdf.correct_ocr(
                    correction_callback=correction_callback,
                    max_workers=max_workers,
                    progress_callback=on_progress,
                )
            except Exception as e:
                # A failure in one PDF is logged; remaining PDFs are still processed.
                logger.error(
                    f"Error occurred during correction process for PDF {pdf.path}: {e}",
                    exc_info=True,
                )
    finally:
        # Close the bar even if something non-Exception (e.g. KeyboardInterrupt) escapes.
        progress_bar.close()

    return self
|
264
521
|
|
265
522
|
def categorize(self, categories: List[str], **kwargs):
|
266
523
|
"""Categorizes PDFs in the collection based on content or features."""
|
@@ -309,3 +566,165 @@ class PDFCollection(SearchableMixin): # Inherit from the mixin
|
|
309
566
|
# logger.debug(f"Skipping empty page {page.page_number} from PDF '{pdf.path}'.")
|
310
567
|
# continue
|
311
568
|
yield page
|
569
|
+
|
570
|
+
# --- Classification Method --- #
|
571
|
+
def classify_all(
    self,
    categories: List[str],
    model: str = "text",
    max_workers: Optional[int] = None,
    **kwargs,
) -> "PDFCollection":
    """
    Classify all pages across all PDFs in the collection, potentially in parallel.

    Page classification is delegated to each PDF's `classify_pages` method.
    A single progress bar sized to the total page count is advanced through
    the `progress_callback` handed to `classify_pages`.

    Args:
        categories: A list of string category names.
        model: Model identifier ('text', 'vision', or specific HF ID).
        max_workers: Maximum number of threads to process PDFs concurrently.
                     If None or 1, processing is sequential.
        **kwargs: Additional arguments passed down to `pdf.classify_pages`
                  (e.g., device, confidence_threshold, resolution).

    Returns:
        Self for method chaining.

    Raises:
        ValueError: If categories list is empty.
        Exception: Any error raised by `pdf.classify_pages` is logged and
            re-raised, which stops processing of the collection.
    """
    PDF = self._get_pdf_class()
    if not categories:
        raise ValueError("Categories list cannot be empty.")

    logger.info(
        f"Starting classification for {len(self._pdfs)} PDFs in collection (model: '{model}')..."
    )

    # Calculate total pages for the progress bar
    total_pages = sum(len(pdf.pages) for pdf in self._pdfs if pdf.pages)
    if total_pages == 0:
        logger.warning("No pages found in the PDF collection to classify.")
        return self

    progress_bar = tqdm(
        total=total_pages, desc=f"Classifying Pages (model: {model})", unit="page"
    )

    def _process_pdf_classification(pdf: PDF):
        """Classify one PDF's pages; log and re-raise on failure."""
        thread_id = threading.current_thread().name
        pdf_path = pdf.path
        logger.debug(f"[{thread_id}] Starting classification process for: {pdf_path}")
        start_time = time.monotonic()
        try:
            pdf.classify_pages(
                categories=categories,
                model=model,
                progress_callback=progress_bar.update,
                **kwargs,
            )
            end_time = time.monotonic()
            logger.debug(
                f"[{thread_id}] Finished classification for: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
            )
            return pdf_path, None  # Return path and no error
        except Exception as e:
            end_time = time.monotonic()
            logger.error(
                f"[{thread_id}] Failed classification process for {pdf_path} after {end_time - start_time:.2f}s: {e}",
                exc_info=False,
            )
            # Close progress bar immediately on error to avoid hanging
            progress_bar.close()
            # Re-raise the exception to stop the entire collection processing
            raise

    try:
        if max_workers is not None and max_workers > 1:
            logger.info(f"Classifying PDFs in parallel with {max_workers} workers.")
            with concurrent.futures.ThreadPoolExecutor(
                max_workers=max_workers, thread_name_prefix="ClassifyWorker"
            ) as executor:
                futures = [
                    executor.submit(_process_pdf_classification, pdf) for pdf in self._pdfs
                ]

                # Progress is updated by the callback; result() re-raises
                # any exception from a failed worker.
                for future in concurrent.futures.as_completed(futures):
                    future.result()
        else:  # Sequential processing
            logger.info("Classifying PDFs sequentially.")
            for pdf in self._pdfs:
                _process_pdf_classification(pdf)

        logger.info("Finished classification across the collection.")
    finally:
        # The original two branches both reduced to "close the bar"; a single
        # call suffices on success and on error.
        # NOTE(review): relies on tqdm's close() being a no-op on an already
        # closed or disabled bar — confirm for the class get_tqdm() returns.
        progress_bar.close()

    return self
|
682
|
+
|
683
|
+
# --- End Classification Method --- #
|
684
|
+
|
685
|
+
def _gather_analysis_data(
    self,
    analysis_keys: List[str],
    include_content: bool,
    include_images: bool,
    image_dir: Optional[Path],
    image_format: str,
    image_resolution: int,
) -> List[Dict[str, Any]]:
    """
    Build one summary record per PDF in the collection.

    Each record holds the PDF's path, file name and page count, plus a
    ``metadata.<key>`` entry (stringified) for every truthy metadata value.

    Args:
        analysis_keys: Keys in the analyses dictionary to export
        include_content: Whether to include extracted text
        include_images: Whether to export images
        image_dir: Directory to save images
        image_format: Format to save images
        image_resolution: Resolution for exported images

    Note:
        None of the arguments above are read by this implementation; only
        PDF-level data is gathered.

    Returns:
        List of dictionaries containing analysis data (empty if the
        collection holds no PDFs).
    """
    if not self._pdfs:
        logger.warning("No PDFs found in collection")
        return []

    records = []

    for document in tqdm(self._pdfs, desc="Gathering PDF data", leave=False):
        # PDF level data
        record = {
            "pdf_path": document.path,
            "pdf_filename": Path(document.path).name,
            "total_pages": len(document.pages) if hasattr(document, "pages") else 0,
        }

        # Only non-empty metadata values are carried over.
        if hasattr(document, "metadata") and document.metadata:
            record.update(
                {
                    f"metadata.{key}": str(value)
                    for key, value in document.metadata.items()
                    if value
                }
            )

        records.append(record)

    return records
|
@@ -539,3 +539,86 @@ class ElementManager:
|
|
539
539
|
"""Get all region elements."""
|
540
540
|
self.load_elements()
|
541
541
|
return self._elements.get("regions", [])
|
542
|
+
|
543
|
+
def remove_ocr_elements(self):
    """
    Drop every element whose source is "ocr" from the "words" and "chars" lists.

    Words are checked through their ``source`` attribute. Chars may be either
    dicts (checked through the ``"source"`` key) or objects (checked through
    the ``source`` attribute). Intended to be called before adding new OCR
    elements when replacement is desired.

    Returns:
        int: Number of OCR elements removed
    """
    # Load elements if not already loaded
    self.load_elements()

    def _keep_word(word):
        return getattr(word, "source", None) != "ocr"

    def _keep_char(char):
        if isinstance(char, dict):
            return char.get("source") != "ocr"
        return getattr(char, "source", None) != "ocr"

    removed_count = 0

    for key, keep in (("words", _keep_word), ("chars", _keep_char)):
        if key not in self._elements:
            continue
        size_before = len(self._elements[key])
        self._elements[key] = [item for item in self._elements[key] if keep(item)]
        removed_count += size_before - len(self._elements[key])

    logger.info(f"Page {self._page.number}: Removed {removed_count} OCR elements.")
    return removed_count
|
577
|
+
|
578
|
+
def remove_element(self, element, element_type="words"):
    """
    Remove one element from the list stored under ``element_type``.

    Args:
        element: The element to remove
        element_type: The type of element ('words', 'chars', etc.)

    Returns:
        bool: True if the element was found and removed; False if the list
        does not exist, the element is not in it, or an error occurred
        (errors are logged, not raised).
    """
    # Load elements if not already loaded
    self.load_elements()

    # Guard: nothing to do when the requested collection is missing.
    if element_type not in self._elements:
        logger.warning(f"Cannot remove element: collection '{element_type}' does not exist")
        return False

    try:
        bucket = self._elements[element_type]
        if element not in bucket:
            logger.debug(f"Element not found in {element_type}: {element}")
            return False

        bucket.remove(element)
        logger.debug(f"Removed element from {element_type}: {element}")
        return True
    except Exception as e:
        logger.error(f"Error removing element from {element_type}: {e}", exc_info=True)
        return False
|
609
|
+
|
610
|
+
def has_elements(self) -> bool:
    """
    Report whether any of the "words", "rects", "lines" or "regions" lists is non-empty.

    Elements are loaded first via ``load_elements``. Other keys (such as
    "chars") are not considered.

    Returns:
        True if any elements exist, False otherwise.
    """
    self.load_elements()

    significant_keys = ("words", "rects", "lines", "regions")
    return any(self._elements.get(key) for key in significant_keys)
|