natural-pdf 0.1.8__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/__init__.py +1 -0
- natural_pdf/analyzers/layout/base.py +1 -5
- natural_pdf/analyzers/layout/gemini.py +61 -51
- natural_pdf/analyzers/layout/layout_analyzer.py +40 -11
- natural_pdf/analyzers/layout/layout_manager.py +26 -84
- natural_pdf/analyzers/layout/layout_options.py +7 -0
- natural_pdf/analyzers/layout/pdfplumber_table_finder.py +142 -0
- natural_pdf/analyzers/layout/surya.py +46 -123
- natural_pdf/analyzers/layout/tatr.py +51 -4
- natural_pdf/analyzers/text_structure.py +3 -5
- natural_pdf/analyzers/utils.py +3 -3
- natural_pdf/classification/manager.py +241 -158
- natural_pdf/classification/mixin.py +52 -38
- natural_pdf/classification/results.py +71 -45
- natural_pdf/collections/mixins.py +85 -20
- natural_pdf/collections/pdf_collection.py +245 -100
- natural_pdf/core/element_manager.py +30 -14
- natural_pdf/core/highlighting_service.py +13 -22
- natural_pdf/core/page.py +423 -101
- natural_pdf/core/pdf.py +694 -195
- natural_pdf/elements/base.py +134 -40
- natural_pdf/elements/collections.py +610 -134
- natural_pdf/elements/region.py +659 -90
- natural_pdf/elements/text.py +1 -1
- natural_pdf/export/mixin.py +137 -0
- natural_pdf/exporters/base.py +3 -3
- natural_pdf/exporters/paddleocr.py +4 -3
- natural_pdf/extraction/manager.py +50 -49
- natural_pdf/extraction/mixin.py +90 -57
- natural_pdf/extraction/result.py +9 -23
- natural_pdf/ocr/__init__.py +5 -5
- natural_pdf/ocr/engine_doctr.py +346 -0
- natural_pdf/ocr/ocr_factory.py +24 -4
- natural_pdf/ocr/ocr_manager.py +61 -25
- natural_pdf/ocr/ocr_options.py +70 -10
- natural_pdf/ocr/utils.py +6 -4
- natural_pdf/search/__init__.py +20 -34
- natural_pdf/search/haystack_search_service.py +309 -265
- natural_pdf/search/haystack_utils.py +99 -75
- natural_pdf/search/search_service_protocol.py +11 -12
- natural_pdf/selectors/parser.py +219 -143
- natural_pdf/utils/debug.py +3 -3
- natural_pdf/utils/identifiers.py +1 -1
- natural_pdf/utils/locks.py +1 -1
- natural_pdf/utils/packaging.py +8 -6
- natural_pdf/utils/text_extraction.py +24 -16
- natural_pdf/utils/tqdm_utils.py +18 -10
- natural_pdf/utils/visualization.py +18 -0
- natural_pdf/widgets/viewer.py +4 -25
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/METADATA +12 -3
- natural_pdf-0.1.10.dist-info/RECORD +80 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/WHEEL +1 -1
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/top_level.txt +0 -2
- docs/api/index.md +0 -386
- docs/assets/favicon.png +0 -3
- docs/assets/favicon.svg +0 -3
- docs/assets/javascripts/custom.js +0 -17
- docs/assets/logo.svg +0 -3
- docs/assets/sample-screen.png +0 -0
- docs/assets/social-preview.png +0 -17
- docs/assets/social-preview.svg +0 -17
- docs/assets/stylesheets/custom.css +0 -65
- docs/categorizing-documents/index.md +0 -168
- docs/data-extraction/index.md +0 -87
- docs/document-qa/index.ipynb +0 -435
- docs/document-qa/index.md +0 -79
- docs/element-selection/index.ipynb +0 -969
- docs/element-selection/index.md +0 -249
- docs/finetuning/index.md +0 -176
- docs/index.md +0 -189
- docs/installation/index.md +0 -69
- docs/interactive-widget/index.ipynb +0 -962
- docs/interactive-widget/index.md +0 -12
- docs/layout-analysis/index.ipynb +0 -818
- docs/layout-analysis/index.md +0 -185
- docs/ocr/index.md +0 -256
- docs/pdf-navigation/index.ipynb +0 -314
- docs/pdf-navigation/index.md +0 -97
- docs/regions/index.ipynb +0 -816
- docs/regions/index.md +0 -294
- docs/tables/index.ipynb +0 -658
- docs/tables/index.md +0 -144
- docs/text-analysis/index.ipynb +0 -370
- docs/text-analysis/index.md +0 -105
- docs/text-extraction/index.ipynb +0 -1478
- docs/text-extraction/index.md +0 -292
- docs/tutorials/01-loading-and-extraction.ipynb +0 -1873
- docs/tutorials/01-loading-and-extraction.md +0 -95
- docs/tutorials/02-finding-elements.ipynb +0 -417
- docs/tutorials/02-finding-elements.md +0 -149
- docs/tutorials/03-extracting-blocks.ipynb +0 -152
- docs/tutorials/03-extracting-blocks.md +0 -48
- docs/tutorials/04-table-extraction.ipynb +0 -119
- docs/tutorials/04-table-extraction.md +0 -50
- docs/tutorials/05-excluding-content.ipynb +0 -275
- docs/tutorials/05-excluding-content.md +0 -109
- docs/tutorials/06-document-qa.ipynb +0 -337
- docs/tutorials/06-document-qa.md +0 -91
- docs/tutorials/07-layout-analysis.ipynb +0 -293
- docs/tutorials/07-layout-analysis.md +0 -66
- docs/tutorials/07-working-with-regions.ipynb +0 -414
- docs/tutorials/07-working-with-regions.md +0 -151
- docs/tutorials/08-spatial-navigation.ipynb +0 -513
- docs/tutorials/08-spatial-navigation.md +0 -190
- docs/tutorials/09-section-extraction.ipynb +0 -2439
- docs/tutorials/09-section-extraction.md +0 -256
- docs/tutorials/10-form-field-extraction.ipynb +0 -517
- docs/tutorials/10-form-field-extraction.md +0 -201
- docs/tutorials/11-enhanced-table-processing.ipynb +0 -59
- docs/tutorials/11-enhanced-table-processing.md +0 -9
- docs/tutorials/12-ocr-integration.ipynb +0 -3712
- docs/tutorials/12-ocr-integration.md +0 -137
- docs/tutorials/13-semantic-search.ipynb +0 -1718
- docs/tutorials/13-semantic-search.md +0 -77
- docs/visual-debugging/index.ipynb +0 -2970
- docs/visual-debugging/index.md +0 -157
- docs/visual-debugging/region.png +0 -0
- natural_pdf/templates/finetune/fine_tune_paddleocr.md +0 -420
- natural_pdf/templates/spa/css/style.css +0 -334
- natural_pdf/templates/spa/index.html +0 -31
- natural_pdf/templates/spa/js/app.js +0 -472
- natural_pdf/templates/spa/words.txt +0 -235976
- natural_pdf/widgets/frontend/viewer.js +0 -88
- natural_pdf-0.1.8.dist-info/RECORD +0 -156
- notebooks/Examples.ipynb +0 -1293
- pdfs/.gitkeep +0 -0
- pdfs/01-practice.pdf +0 -543
- pdfs/0500000US42001.pdf +0 -0
- pdfs/0500000US42007.pdf +0 -0
- pdfs/2014 Statistics.pdf +0 -0
- pdfs/2019 Statistics.pdf +0 -0
- pdfs/Atlanta_Public_Schools_GA_sample.pdf +0 -0
- pdfs/needs-ocr.pdf +0 -0
- {natural_pdf-0.1.8.dist-info → natural_pdf-0.1.10.dist-info}/licenses/LICENSE +0 -0
@@ -1,13 +1,28 @@
|
|
1
|
+
import concurrent.futures # Import concurrent.futures
|
1
2
|
import copy # Added for copying options
|
2
3
|
import glob as py_glob
|
3
4
|
import logging
|
4
5
|
import os
|
5
6
|
import re # Added for safe path generation
|
7
|
+
import threading # Import threading for logging thread information
|
8
|
+
import time # Import time for logging timestamps
|
6
9
|
from pathlib import Path
|
7
|
-
from typing import
|
8
|
-
|
9
|
-
|
10
|
-
|
10
|
+
from typing import (
|
11
|
+
TYPE_CHECKING,
|
12
|
+
Any,
|
13
|
+
Callable,
|
14
|
+
Dict,
|
15
|
+
Generic,
|
16
|
+
Iterable,
|
17
|
+
Iterator,
|
18
|
+
List,
|
19
|
+
Optional,
|
20
|
+
Set,
|
21
|
+
Type,
|
22
|
+
TypeVar,
|
23
|
+
Union,
|
24
|
+
overload,
|
25
|
+
)
|
11
26
|
|
12
27
|
from PIL import Image
|
13
28
|
from tqdm import tqdm
|
@@ -26,6 +41,7 @@ logger = logging.getLogger(__name__)
|
|
26
41
|
|
27
42
|
from natural_pdf.core.pdf import PDF
|
28
43
|
from natural_pdf.elements.region import Region
|
44
|
+
from natural_pdf.export.mixin import ExportMixin
|
29
45
|
|
30
46
|
# --- Search Imports ---
|
31
47
|
try:
|
@@ -47,12 +63,12 @@ except ImportError as e:
|
|
47
63
|
|
48
64
|
SearchServiceProtocol, SearchOptions, Indexable = object, object, object
|
49
65
|
|
50
|
-
from natural_pdf.search.searchable_mixin import SearchableMixin # Import the new mixin
|
51
66
|
# Import the ApplyMixin
|
52
67
|
from natural_pdf.collections.mixins import ApplyMixin
|
68
|
+
from natural_pdf.search.searchable_mixin import SearchableMixin # Import the new mixin
|
53
69
|
|
54
70
|
|
55
|
-
class PDFCollection(SearchableMixin, ApplyMixin): #
|
71
|
+
class PDFCollection(SearchableMixin, ApplyMixin, ExportMixin): # Add ExportMixin
|
56
72
|
def __init__(
|
57
73
|
self,
|
58
74
|
source: Union[str, Iterable[Union[str, "PDF"]]],
|
@@ -252,54 +268,83 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
|
|
252
268
|
def __repr__(self) -> str:
|
253
269
|
# Removed search status
|
254
270
|
return f"<PDFCollection(count={len(self._pdfs)})>"
|
271
|
+
return f"<PDFCollection(count={len(self._pdfs)})>"
|
255
272
|
|
256
273
|
@property
|
257
274
|
def pdfs(self) -> List["PDF"]:
|
258
275
|
"""Returns the list of PDF objects held by the collection."""
|
259
276
|
return self._pdfs
|
260
277
|
|
278
|
+
@overload
|
279
|
+
def find_all(
|
280
|
+
self,
|
281
|
+
*,
|
282
|
+
text: str,
|
283
|
+
apply_exclusions: bool = True,
|
284
|
+
regex: bool = False,
|
285
|
+
case: bool = True,
|
286
|
+
**kwargs,
|
287
|
+
) -> "ElementCollection": ...
|
288
|
+
|
289
|
+
@overload
|
261
290
|
def find_all(
|
262
|
-
self,
|
263
|
-
selector: str,
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
291
|
+
self,
|
292
|
+
selector: str,
|
293
|
+
*,
|
294
|
+
apply_exclusions: bool = True,
|
295
|
+
regex: bool = False,
|
296
|
+
case: bool = True,
|
297
|
+
**kwargs,
|
298
|
+
) -> "ElementCollection": ...
|
299
|
+
|
300
|
+
def find_all(
|
301
|
+
self,
|
302
|
+
selector: Optional[str] = None, # Now optional
|
303
|
+
*,
|
304
|
+
text: Optional[str] = None, # New text parameter
|
305
|
+
apply_exclusions: bool = True,
|
306
|
+
regex: bool = False,
|
307
|
+
case: bool = True,
|
308
|
+
**kwargs,
|
268
309
|
) -> "ElementCollection":
|
269
310
|
"""
|
270
|
-
Find all elements matching the selector across all PDFs in the collection.
|
271
|
-
|
311
|
+
Find all elements matching the selector OR text across all PDFs in the collection.
|
312
|
+
|
313
|
+
Provide EITHER `selector` OR `text`, but not both.
|
314
|
+
|
272
315
|
This creates an ElementCollection that can span multiple PDFs. Note that
|
273
316
|
some ElementCollection methods have limitations when spanning PDFs.
|
274
|
-
|
317
|
+
|
275
318
|
Args:
|
276
|
-
selector: CSS-like selector string to query elements
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
319
|
+
selector: CSS-like selector string to query elements.
|
320
|
+
text: Text content to search for (equivalent to 'text:contains(...)').
|
321
|
+
apply_exclusions: Whether to exclude elements in exclusion regions (default: True).
|
322
|
+
regex: Whether to use regex for text search (`selector` or `text`) (default: False).
|
323
|
+
case: Whether to do case-sensitive text search (`selector` or `text`) (default: True).
|
324
|
+
**kwargs: Additional keyword arguments passed to the find_all method of each PDF.
|
325
|
+
|
282
326
|
Returns:
|
283
|
-
ElementCollection containing all matching elements across all PDFs
|
327
|
+
ElementCollection containing all matching elements across all PDFs.
|
284
328
|
"""
|
285
|
-
|
286
|
-
|
329
|
+
# Validation happens within pdf.find_all
|
330
|
+
|
287
331
|
# Collect elements from all PDFs
|
288
332
|
all_elements = []
|
289
333
|
for pdf in self._pdfs:
|
290
334
|
try:
|
291
|
-
#
|
335
|
+
# Pass the relevant arguments down to each PDF's find_all
|
292
336
|
elements = pdf.find_all(
|
293
|
-
selector,
|
337
|
+
selector=selector,
|
338
|
+
text=text,
|
294
339
|
apply_exclusions=apply_exclusions,
|
295
340
|
regex=regex,
|
296
341
|
case=case,
|
297
|
-
**kwargs
|
342
|
+
**kwargs,
|
298
343
|
)
|
299
344
|
all_elements.extend(elements.elements)
|
300
345
|
except Exception as e:
|
301
346
|
logger.error(f"Error finding elements in {pdf.path}: {e}", exc_info=True)
|
302
|
-
|
347
|
+
|
303
348
|
return ElementCollection(all_elements)
|
304
349
|
|
305
350
|
def apply_ocr(
|
@@ -330,24 +375,26 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
|
|
330
375
|
replace: If True, replace existing OCR elements
|
331
376
|
options: Engine-specific options
|
332
377
|
pages: Specific pages to process (None for all pages)
|
333
|
-
max_workers: Maximum number of threads to process PDFs concurrently.
|
378
|
+
max_workers: Maximum number of threads to process PDFs concurrently.
|
334
379
|
If None or 1, processing is sequential. (default: None)
|
335
380
|
|
336
381
|
Returns:
|
337
382
|
Self for method chaining
|
338
383
|
"""
|
339
384
|
PDF = self._get_pdf_class()
|
340
|
-
logger.info(
|
385
|
+
logger.info(
|
386
|
+
f"Applying OCR to {len(self._pdfs)} PDFs in collection (max_workers={max_workers})..."
|
387
|
+
)
|
341
388
|
|
342
389
|
# Worker function takes PDF object again
|
343
390
|
def _process_pdf(pdf: PDF):
|
344
391
|
"""Helper function to apply OCR to a single PDF, handling errors."""
|
345
|
-
thread_id = threading.current_thread().name
|
346
|
-
pdf_path = pdf.path
|
392
|
+
thread_id = threading.current_thread().name # Get thread name for logging
|
393
|
+
pdf_path = pdf.path # Get path for logging
|
347
394
|
logger.debug(f"[{thread_id}] Starting OCR process for: {pdf_path}")
|
348
395
|
start_time = time.monotonic()
|
349
396
|
try:
|
350
|
-
pdf.apply_ocr(
|
397
|
+
pdf.apply_ocr( # Call apply_ocr on the original PDF object
|
351
398
|
pages=pages,
|
352
399
|
engine=engine,
|
353
400
|
languages=languages,
|
@@ -362,17 +409,24 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
|
|
362
409
|
# For now, PDF.apply_ocr doesn't have it.
|
363
410
|
)
|
364
411
|
end_time = time.monotonic()
|
365
|
-
logger.debug(
|
412
|
+
logger.debug(
|
413
|
+
f"[{thread_id}] Finished OCR process for: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
|
414
|
+
)
|
366
415
|
return pdf_path, None
|
367
416
|
except Exception as e:
|
368
417
|
end_time = time.monotonic()
|
369
|
-
logger.error(
|
370
|
-
|
418
|
+
logger.error(
|
419
|
+
f"[{thread_id}] Failed OCR process for {pdf_path} after {end_time - start_time:.2f}s: {e}",
|
420
|
+
exc_info=False,
|
421
|
+
)
|
422
|
+
return pdf_path, e # Return path and error
|
371
423
|
|
372
424
|
# Use ThreadPoolExecutor for parallel processing if max_workers > 1
|
373
425
|
if max_workers is not None and max_workers > 1:
|
374
426
|
futures = []
|
375
|
-
with concurrent.futures.ThreadPoolExecutor(
|
427
|
+
with concurrent.futures.ThreadPoolExecutor(
|
428
|
+
max_workers=max_workers, thread_name_prefix="OCRWorker"
|
429
|
+
) as executor:
|
376
430
|
for pdf in self._pdfs:
|
377
431
|
# Submit the PDF object to the worker function
|
378
432
|
futures.append(executor.submit(_process_pdf, pdf))
|
@@ -382,22 +436,22 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
|
|
382
436
|
concurrent.futures.as_completed(futures),
|
383
437
|
total=len(self._pdfs),
|
384
438
|
desc="Applying OCR (Parallel)",
|
385
|
-
unit="pdf"
|
439
|
+
unit="pdf",
|
386
440
|
)
|
387
|
-
|
441
|
+
|
388
442
|
for future in progress_bar:
|
389
|
-
pdf_path, error = future.result()
|
443
|
+
pdf_path, error = future.result() # Get result (or exception)
|
390
444
|
if error:
|
391
445
|
progress_bar.set_postfix_str(f"Error: {pdf_path}", refresh=True)
|
392
446
|
# Progress is updated automatically by tqdm
|
393
447
|
|
394
|
-
else:
|
448
|
+
else: # Sequential processing (max_workers is None or 1)
|
395
449
|
logger.info("Applying OCR sequentially...")
|
396
450
|
# Use the selected tqdm class for sequential too for consistency
|
397
451
|
# Iterate over PDF objects directly for sequential
|
398
452
|
for pdf in tqdm(self._pdfs, desc="Applying OCR (Sequential)", unit="pdf"):
|
399
|
-
_process_pdf(pdf)
|
400
|
-
|
453
|
+
_process_pdf(pdf) # Call helper directly with PDF object
|
454
|
+
|
401
455
|
logger.info("Finished applying OCR across the collection.")
|
402
456
|
return self
|
403
457
|
|
@@ -421,7 +475,7 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
|
|
421
475
|
Returns:
|
422
476
|
Self for method chaining.
|
423
477
|
"""
|
424
|
-
PDF = self._get_pdf_class()
|
478
|
+
PDF = self._get_pdf_class() # Ensure PDF class is available
|
425
479
|
if not callable(correction_callback):
|
426
480
|
raise TypeError("`correction_callback` must be a callable function.")
|
427
481
|
|
@@ -436,7 +490,9 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
|
|
436
490
|
return self
|
437
491
|
|
438
492
|
total_elements = len(all_ocr_elements)
|
439
|
-
logger.info(
|
493
|
+
logger.info(
|
494
|
+
f"Found {total_elements} OCR elements across the collection. Starting correction process..."
|
495
|
+
)
|
440
496
|
|
441
497
|
# 2. Initialize the progress bar
|
442
498
|
progress_bar = tqdm(total=total_elements, desc="Correcting OCR Elements", unit="element")
|
@@ -450,17 +506,20 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
|
|
450
506
|
pdf.correct_ocr(
|
451
507
|
correction_callback=correction_callback,
|
452
508
|
max_workers=max_workers,
|
453
|
-
progress_callback=progress_bar.update
|
509
|
+
progress_callback=progress_bar.update, # Pass the bar's update method
|
454
510
|
)
|
455
511
|
except Exception as e:
|
456
|
-
|
457
|
-
|
512
|
+
logger.error(
|
513
|
+
f"Error occurred during correction process for PDF {pdf.path}: {e}",
|
514
|
+
exc_info=True,
|
515
|
+
)
|
516
|
+
# Decide if we should stop or continue? For now, continue.
|
458
517
|
|
459
518
|
progress_bar.close()
|
460
519
|
|
461
520
|
return self
|
462
521
|
|
463
|
-
def categorize(self,
|
522
|
+
def categorize(self, labels: List[str], **kwargs):
|
464
523
|
"""Categorizes PDFs in the collection based on content or features."""
|
465
524
|
# Implementation requires integrating with classification models or logic
|
466
525
|
raise NotImplementedError("categorize requires classification implementation.")
|
@@ -511,107 +570,193 @@ class PDFCollection(SearchableMixin, ApplyMixin): # Inherit from ApplyMixin
|
|
511
570
|
# --- Classification Method --- #
|
512
571
|
def classify_all(
|
513
572
|
self,
|
514
|
-
|
515
|
-
|
573
|
+
labels: List[str],
|
574
|
+
using: Optional[str] = None, # Default handled by PDF.classify -> manager
|
575
|
+
model: Optional[str] = None, # Optional model ID
|
516
576
|
max_workers: Optional[int] = None,
|
577
|
+
analysis_key: str = "classification", # Key for storing result in PDF.analyses
|
517
578
|
**kwargs,
|
518
579
|
) -> "PDFCollection":
|
519
580
|
"""
|
520
|
-
Classify
|
581
|
+
Classify each PDF document in the collection, potentially in parallel.
|
521
582
|
|
522
|
-
This method
|
523
|
-
|
524
|
-
|
583
|
+
This method delegates classification to each PDF object's `classify` method.
|
584
|
+
By default, uses the full extracted text of the PDF.
|
585
|
+
If `using='vision'`, it classifies the first page's image, but ONLY if
|
586
|
+
the PDF has a single page (raises ValueError otherwise).
|
525
587
|
|
526
588
|
Args:
|
527
|
-
|
528
|
-
|
589
|
+
labels: A list of string category names.
|
590
|
+
using: Processing mode ('text', 'vision'). If None, manager infers (defaulting to text).
|
591
|
+
model: Optional specific model identifier (e.g., HF ID). If None, manager uses default for 'using' mode.
|
529
592
|
max_workers: Maximum number of threads to process PDFs concurrently.
|
530
593
|
If None or 1, processing is sequential.
|
531
|
-
|
532
|
-
|
533
|
-
|
594
|
+
analysis_key: Key under which to store the ClassificationResult in each PDF's `analyses` dict.
|
595
|
+
**kwargs: Additional arguments passed down to `pdf.classify` (e.g., device,
|
596
|
+
min_confidence, multi_label, text extraction options).
|
534
597
|
|
535
598
|
Returns:
|
536
599
|
Self for method chaining.
|
537
600
|
|
538
601
|
Raises:
|
539
|
-
ValueError: If
|
540
|
-
ClassificationError: If classification fails for any
|
602
|
+
ValueError: If labels list is empty, or if using='vision' on a multi-page PDF.
|
603
|
+
ClassificationError: If classification fails for any PDF (will stop processing).
|
541
604
|
ImportError: If classification dependencies are missing.
|
542
605
|
"""
|
543
606
|
PDF = self._get_pdf_class()
|
544
|
-
if not
|
545
|
-
raise ValueError("
|
546
|
-
|
547
|
-
logger.info(f"Starting classification for {len(self._pdfs)} PDFs in collection (model: '{model}')...")
|
607
|
+
if not labels:
|
608
|
+
raise ValueError("Labels list cannot be empty.")
|
548
609
|
|
549
|
-
|
550
|
-
|
551
|
-
if total_pages == 0:
|
552
|
-
logger.warning("No pages found in the PDF collection to classify.")
|
610
|
+
if not self._pdfs:
|
611
|
+
logger.warning("PDFCollection is empty, skipping classification.")
|
553
612
|
return self
|
554
613
|
|
614
|
+
mode_desc = f"using='{using}'" if using else f"model='{model}'" if model else "default text"
|
615
|
+
logger.info(
|
616
|
+
f"Starting classification for {len(self._pdfs)} PDFs in collection ({mode_desc})..."
|
617
|
+
)
|
618
|
+
|
555
619
|
progress_bar = tqdm(
|
556
|
-
total=
|
557
|
-
desc=f"Classifying Pages (model: {model})",
|
558
|
-
unit="page"
|
620
|
+
total=len(self._pdfs), desc=f"Classifying PDFs ({mode_desc})", unit="pdf"
|
559
621
|
)
|
560
622
|
|
561
623
|
# Worker function
|
562
624
|
def _process_pdf_classification(pdf: PDF):
|
563
625
|
thread_id = threading.current_thread().name
|
564
626
|
pdf_path = pdf.path
|
565
|
-
logger.debug(f"[{thread_id}] Starting classification process for: {pdf_path}")
|
627
|
+
logger.debug(f"[{thread_id}] Starting classification process for PDF: {pdf_path}")
|
566
628
|
start_time = time.monotonic()
|
567
629
|
try:
|
568
|
-
# Call
|
569
|
-
pdf.
|
570
|
-
|
630
|
+
# Call classify directly on the PDF object
|
631
|
+
pdf.classify(
|
632
|
+
labels=labels,
|
633
|
+
using=using,
|
571
634
|
model=model,
|
572
|
-
|
573
|
-
**kwargs
|
635
|
+
analysis_key=analysis_key,
|
636
|
+
**kwargs, # Pass other relevant args like min_confidence, multi_label
|
637
|
+
)
|
638
|
+
end_time = time.monotonic()
|
639
|
+
logger.debug(
|
640
|
+
f"[{thread_id}] Finished classification for PDF: {pdf_path} (Duration: {end_time - start_time:.2f}s)"
|
574
641
|
)
|
642
|
+
progress_bar.update(1) # Update progress bar upon success
|
643
|
+
return pdf_path, None # Return path and no error
|
644
|
+
except ValueError as ve:
|
645
|
+
# Catch specific error for vision on multi-page PDF
|
575
646
|
end_time = time.monotonic()
|
576
|
-
logger.
|
577
|
-
|
647
|
+
logger.error(
|
648
|
+
f"[{thread_id}] Skipped classification for {pdf_path} after {end_time - start_time:.2f}s: {ve}",
|
649
|
+
exc_info=False,
|
650
|
+
)
|
651
|
+
progress_bar.update(1) # Still update progress bar
|
652
|
+
return pdf_path, ve # Return the specific ValueError
|
578
653
|
except Exception as e:
|
579
654
|
end_time = time.monotonic()
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
655
|
+
logger.error(
|
656
|
+
f"[{thread_id}] Failed classification process for PDF {pdf_path} after {end_time - start_time:.2f}s: {e}",
|
657
|
+
exc_info=True, # Log full traceback for unexpected errors
|
658
|
+
)
|
659
|
+
# Close progress bar immediately on critical error to avoid hanging
|
660
|
+
if not progress_bar.disable:
|
661
|
+
progress_bar.close()
|
584
662
|
# Re-raise the exception to stop the entire collection processing
|
585
|
-
raise
|
663
|
+
raise ClassificationError(f"Classification failed for {pdf_path}: {e}") from e
|
586
664
|
|
587
665
|
# Use ThreadPoolExecutor for parallel processing if max_workers > 1
|
666
|
+
processed_count = 0
|
667
|
+
skipped_count = 0
|
588
668
|
try:
|
589
669
|
if max_workers is not None and max_workers > 1:
|
590
670
|
logger.info(f"Classifying PDFs in parallel with {max_workers} workers.")
|
591
671
|
futures = []
|
592
|
-
with concurrent.futures.ThreadPoolExecutor(
|
672
|
+
with concurrent.futures.ThreadPoolExecutor(
|
673
|
+
max_workers=max_workers, thread_name_prefix="ClassifyWorker"
|
674
|
+
) as executor:
|
593
675
|
for pdf in self._pdfs:
|
594
676
|
futures.append(executor.submit(_process_pdf_classification, pdf))
|
595
677
|
|
596
|
-
# Wait for all futures to complete
|
597
|
-
#
|
678
|
+
# Wait for all futures to complete
|
679
|
+
# Progress updated within worker
|
598
680
|
for future in concurrent.futures.as_completed(futures):
|
599
|
-
|
600
|
-
|
601
|
-
|
681
|
+
processed_count += 1
|
682
|
+
pdf_path, error = (
|
683
|
+
future.result()
|
684
|
+
) # Raise ClassificationError if worker failed critically
|
685
|
+
if isinstance(error, ValueError):
|
686
|
+
# Logged in worker, just count as skipped
|
687
|
+
skipped_count += 1
|
688
|
+
|
689
|
+
else: # Sequential processing
|
602
690
|
logger.info("Classifying PDFs sequentially.")
|
603
691
|
for pdf in self._pdfs:
|
604
|
-
|
605
|
-
|
606
|
-
|
692
|
+
processed_count += 1
|
693
|
+
pdf_path, error = _process_pdf_classification(
|
694
|
+
pdf
|
695
|
+
) # Raise ClassificationError if worker failed critically
|
696
|
+
if isinstance(error, ValueError):
|
697
|
+
skipped_count += 1
|
698
|
+
|
699
|
+
final_message = (
|
700
|
+
f"Finished classification across the collection. Processed: {processed_count}"
|
701
|
+
)
|
702
|
+
if skipped_count > 0:
|
703
|
+
final_message += f", Skipped (e.g., vision on multi-page): {skipped_count}"
|
704
|
+
logger.info(final_message + ".")
|
607
705
|
|
608
706
|
finally:
|
609
|
-
|
610
|
-
|
611
|
-
|
612
|
-
|
613
|
-
|
707
|
+
# Ensure progress bar is closed properly
|
708
|
+
if not progress_bar.disable and progress_bar.n < progress_bar.total:
|
709
|
+
progress_bar.n = progress_bar.total # Ensure it reaches 100%
|
710
|
+
if not progress_bar.disable:
|
711
|
+
progress_bar.close()
|
614
712
|
|
615
713
|
return self
|
616
714
|
|
617
715
|
# --- End Classification Method --- #
|
716
|
+
|
717
|
+
def _gather_analysis_data(
|
718
|
+
self,
|
719
|
+
analysis_keys: List[str],
|
720
|
+
include_content: bool,
|
721
|
+
include_images: bool,
|
722
|
+
image_dir: Optional[Path],
|
723
|
+
image_format: str,
|
724
|
+
image_resolution: int,
|
725
|
+
) -> List[Dict[str, Any]]:
|
726
|
+
"""
|
727
|
+
Gather analysis data from all PDFs in the collection.
|
728
|
+
|
729
|
+
Args:
|
730
|
+
analysis_keys: Keys in the analyses dictionary to export
|
731
|
+
include_content: Whether to include extracted text
|
732
|
+
include_images: Whether to export images
|
733
|
+
image_dir: Directory to save images
|
734
|
+
image_format: Format to save images
|
735
|
+
image_resolution: Resolution for exported images
|
736
|
+
|
737
|
+
Returns:
|
738
|
+
List of dictionaries containing analysis data
|
739
|
+
"""
|
740
|
+
if not self._pdfs:
|
741
|
+
logger.warning("No PDFs found in collection")
|
742
|
+
return []
|
743
|
+
|
744
|
+
all_data = []
|
745
|
+
|
746
|
+
for pdf in tqdm(self._pdfs, desc="Gathering PDF data", leave=False):
|
747
|
+
# PDF level data
|
748
|
+
pdf_data = {
|
749
|
+
"pdf_path": pdf.path,
|
750
|
+
"pdf_filename": Path(pdf.path).name,
|
751
|
+
"total_pages": len(pdf.pages) if hasattr(pdf, "pages") else 0,
|
752
|
+
}
|
753
|
+
|
754
|
+
# Add metadata if available
|
755
|
+
if hasattr(pdf, "metadata") and pdf.metadata:
|
756
|
+
for k, v in pdf.metadata.items():
|
757
|
+
if v: # Only add non-empty metadata
|
758
|
+
pdf_data[f"metadata.{k}"] = str(v)
|
759
|
+
|
760
|
+
all_data.append(pdf_data)
|
761
|
+
|
762
|
+
return all_data
|
@@ -544,56 +544,56 @@ class ElementManager:
|
|
544
544
|
"""
|
545
545
|
Remove all elements with source="ocr" from the elements dictionary.
|
546
546
|
This should be called before adding new OCR elements if replacement is desired.
|
547
|
-
|
547
|
+
|
548
548
|
Returns:
|
549
549
|
int: Number of OCR elements removed
|
550
550
|
"""
|
551
551
|
# Load elements if not already loaded
|
552
552
|
self.load_elements()
|
553
|
-
|
553
|
+
|
554
554
|
removed_count = 0
|
555
|
-
|
555
|
+
|
556
556
|
# Filter out OCR elements from words
|
557
557
|
if "words" in self._elements:
|
558
558
|
original_len = len(self._elements["words"])
|
559
559
|
self._elements["words"] = [
|
560
|
-
word for word in self._elements["words"]
|
561
|
-
if getattr(word, "source", None) != "ocr"
|
560
|
+
word for word in self._elements["words"] if getattr(word, "source", None) != "ocr"
|
562
561
|
]
|
563
562
|
removed_count += original_len - len(self._elements["words"])
|
564
|
-
|
563
|
+
|
565
564
|
# Filter out OCR elements from chars
|
566
565
|
if "chars" in self._elements:
|
567
566
|
original_len = len(self._elements["chars"])
|
568
567
|
self._elements["chars"] = [
|
569
|
-
char
|
570
|
-
|
571
|
-
|
568
|
+
char
|
569
|
+
for char in self._elements["chars"]
|
570
|
+
if (isinstance(char, dict) and char.get("source") != "ocr")
|
571
|
+
or (not isinstance(char, dict) and getattr(char, "source", None) != "ocr")
|
572
572
|
]
|
573
573
|
removed_count += original_len - len(self._elements["chars"])
|
574
|
-
|
574
|
+
|
575
575
|
logger.info(f"Page {self._page.number}: Removed {removed_count} OCR elements.")
|
576
576
|
return removed_count
|
577
577
|
|
578
578
|
def remove_element(self, element, element_type="words"):
|
579
579
|
"""
|
580
580
|
Remove a specific element from the managed elements.
|
581
|
-
|
581
|
+
|
582
582
|
Args:
|
583
583
|
element: The element to remove
|
584
584
|
element_type: The type of element ('words', 'chars', etc.)
|
585
|
-
|
585
|
+
|
586
586
|
Returns:
|
587
587
|
bool: True if removed successfully, False otherwise
|
588
588
|
"""
|
589
589
|
# Load elements if not already loaded
|
590
590
|
self.load_elements()
|
591
|
-
|
591
|
+
|
592
592
|
# Check if the collection exists
|
593
593
|
if element_type not in self._elements:
|
594
594
|
logger.warning(f"Cannot remove element: collection '{element_type}' does not exist")
|
595
595
|
return False
|
596
|
-
|
596
|
+
|
597
597
|
# Try to remove the element
|
598
598
|
try:
|
599
599
|
if element in self._elements[element_type]:
|
@@ -606,3 +606,19 @@ class ElementManager:
|
|
606
606
|
except Exception as e:
|
607
607
|
logger.error(f"Error removing element from {element_type}: {e}", exc_info=True)
|
608
608
|
return False
|
609
|
+
|
610
|
+
def has_elements(self) -> bool:
|
611
|
+
"""
|
612
|
+
Check if any significant elements (words, rects, lines, regions)
|
613
|
+
have been loaded or added.
|
614
|
+
|
615
|
+
Returns:
|
616
|
+
True if any elements exist, False otherwise.
|
617
|
+
"""
|
618
|
+
self.load_elements()
|
619
|
+
|
620
|
+
for key in ["words", "rects", "lines", "regions"]:
|
621
|
+
if self._elements.get(key):
|
622
|
+
return True
|
623
|
+
|
624
|
+
return False
|