natural-pdf 0.1.21__py3-none-any.whl → 0.1.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/shape_detection_mixin.py +3 -3
- natural_pdf/classification/manager.py +1 -1
- natural_pdf/classification/mixin.py +35 -14
- natural_pdf/classification/results.py +16 -1
- natural_pdf/cli.py +9 -27
- natural_pdf/core/highlighting_service.py +23 -0
- natural_pdf/core/page.py +16 -0
- natural_pdf/core/pdf.py +55 -49
- natural_pdf/describe/base.py +2 -2
- natural_pdf/describe/elements.py +1 -1
- natural_pdf/elements/base.py +79 -1
- natural_pdf/elements/collections.py +23 -1
- natural_pdf/elements/region.py +54 -148
- natural_pdf/exporters/paddleocr.py +1 -1
- natural_pdf/extraction/manager.py +2 -2
- natural_pdf/extraction/mixin.py +295 -11
- natural_pdf/extraction/result.py +28 -1
- natural_pdf/flows/region.py +1 -1
- natural_pdf/ocr/engine_surya.py +25 -5
- natural_pdf/qa/__init__.py +2 -1
- natural_pdf/qa/document_qa.py +33 -37
- natural_pdf/qa/qa_result.py +55 -0
- natural_pdf/selectors/parser.py +22 -0
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.23.dist-info}/METADATA +21 -14
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.23.dist-info}/RECORD +29 -28
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.23.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.23.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.23.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.21.dist-info → natural_pdf-0.1.23.dist-info}/top_level.txt +0 -0
natural_pdf/extraction/mixin.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
import logging
|
2
2
|
from abc import ABC, abstractmethod
|
3
|
-
from typing import TYPE_CHECKING, Any, Optional, Type
|
3
|
+
from typing import TYPE_CHECKING, Any, Optional, Type, Sequence
|
4
4
|
|
5
|
-
from pydantic import BaseModel
|
5
|
+
from pydantic import BaseModel, Field, create_model
|
6
6
|
|
7
7
|
# Avoid circular import
|
8
8
|
if TYPE_CHECKING:
|
@@ -65,12 +65,13 @@ class ExtractionMixin(ABC):
|
|
65
65
|
def extract(
|
66
66
|
self: Any,
|
67
67
|
schema: Type[BaseModel],
|
68
|
-
client: Any,
|
68
|
+
client: Any = None,
|
69
69
|
analysis_key: str = DEFAULT_STRUCTURED_KEY, # Default key
|
70
70
|
prompt: Optional[str] = None,
|
71
71
|
using: str = "text",
|
72
72
|
model: Optional[str] = None,
|
73
|
-
|
73
|
+
engine: Optional[str] = None, # NEW: choose between 'llm' and 'doc_qa'
|
74
|
+
overwrite: bool = True, # Overwrite by default
|
74
75
|
**kwargs,
|
75
76
|
) -> Any:
|
76
77
|
"""
|
@@ -79,18 +80,67 @@ class ExtractionMixin(ABC):
|
|
79
80
|
Results are stored in the element's `analyses` dictionary.
|
80
81
|
|
81
82
|
Args:
|
82
|
-
schema: Pydantic model class defining the desired structure
|
83
|
-
|
83
|
+
schema: Either a Pydantic model class defining the desired structure, or an
|
84
|
+
iterable (e.g. list) of field names. When iterable is supplied a
|
85
|
+
temporary Pydantic model of string fields is created automatically.
|
86
|
+
client: Initialized LLM client (required for LLM engine only)
|
84
87
|
analysis_key: Key to store the result under in `analyses`. Defaults to "default-structured".
|
85
88
|
prompt: Optional user-provided prompt for the LLM
|
86
89
|
using: Modality ('text' or 'vision')
|
87
90
|
model: Optional specific LLM model identifier
|
88
|
-
|
91
|
+
engine: Extraction engine to use ("llm" or "doc_qa"). If None, auto-determined.
|
92
|
+
overwrite: Whether to overwrite an existing result stored at `analysis_key`. Defaults to True.
|
89
93
|
**kwargs: Additional parameters for extraction
|
90
94
|
|
91
95
|
Returns:
|
92
96
|
Self for method chaining
|
93
97
|
"""
|
98
|
+
# ------------------------------------------------------------------
|
99
|
+
# If the user supplied a plain list/tuple of field names, dynamically
|
100
|
+
# build a simple Pydantic model (all `str` fields) so the rest of the
|
101
|
+
# pipeline can work unmodified.
|
102
|
+
# ------------------------------------------------------------------
|
103
|
+
if not isinstance(schema, type): # not already a class
|
104
|
+
if isinstance(schema, Sequence):
|
105
|
+
field_names = list(schema)
|
106
|
+
if not field_names:
|
107
|
+
raise ValueError("Schema list cannot be empty")
|
108
|
+
|
109
|
+
import re
|
110
|
+
|
111
|
+
field_defs = {}
|
112
|
+
for orig_name in field_names:
|
113
|
+
safe_name = re.sub(r"[^0-9a-zA-Z_]", "_", orig_name)
|
114
|
+
if safe_name and safe_name[0].isdigit():
|
115
|
+
safe_name = f"_{safe_name}"
|
116
|
+
|
117
|
+
field_defs[safe_name] = (
|
118
|
+
str,
|
119
|
+
Field(
|
120
|
+
None,
|
121
|
+
description=f"{orig_name}",
|
122
|
+
alias=orig_name, # allow access via original name
|
123
|
+
),
|
124
|
+
)
|
125
|
+
|
126
|
+
schema = create_model("DynamicExtractSchema", **field_defs) # type: ignore[arg-type]
|
127
|
+
else:
|
128
|
+
raise TypeError(
|
129
|
+
"schema must be a Pydantic model class or a sequence of field names"
|
130
|
+
)
|
131
|
+
|
132
|
+
# ------------------------------------------------------------------
|
133
|
+
# Resolve which engine to use
|
134
|
+
# ------------------------------------------------------------------
|
135
|
+
if engine not in (None, "llm", "doc_qa"):
|
136
|
+
raise ValueError("engine must be either 'llm', 'doc_qa', or None")
|
137
|
+
|
138
|
+
# Auto-select: LLM when client provided, else Document-QA
|
139
|
+
if engine is None:
|
140
|
+
engine = "llm" if client is not None else "doc_qa"
|
141
|
+
|
142
|
+
logger.info(f"Extraction engine resolved to '{engine}'")
|
143
|
+
|
94
144
|
if not analysis_key:
|
95
145
|
raise ValueError("analysis_key cannot be empty for extract operation")
|
96
146
|
|
@@ -99,12 +149,48 @@ class ExtractionMixin(ABC):
|
|
99
149
|
self.analyses = {}
|
100
150
|
|
101
151
|
if analysis_key in self.analyses and not overwrite:
|
102
|
-
|
103
|
-
f"
|
104
|
-
|
152
|
+
logger.info(
|
153
|
+
f"Extraction for key '{analysis_key}' already exists; returning cached result. "
|
154
|
+
"Pass overwrite=True to force re-extraction."
|
105
155
|
)
|
156
|
+
return self
|
106
157
|
# --- End Overwrite Check --- #
|
107
158
|
|
159
|
+
# ------------------------------------------------------------------
|
160
|
+
# Delegate to engine-specific helpers and return early
|
161
|
+
# ------------------------------------------------------------------
|
162
|
+
if engine == "doc_qa":
|
163
|
+
self._perform_docqa_extraction(
|
164
|
+
schema=schema,
|
165
|
+
analysis_key=analysis_key,
|
166
|
+
model=model,
|
167
|
+
overwrite=overwrite,
|
168
|
+
**kwargs,
|
169
|
+
)
|
170
|
+
return self
|
171
|
+
|
172
|
+
if engine == "llm":
|
173
|
+
if client is None:
|
174
|
+
raise ValueError("LLM engine selected but no 'client' was provided.")
|
175
|
+
|
176
|
+
self._perform_llm_extraction(
|
177
|
+
schema=schema,
|
178
|
+
client=client,
|
179
|
+
analysis_key=analysis_key,
|
180
|
+
prompt=prompt,
|
181
|
+
using=using,
|
182
|
+
model=model,
|
183
|
+
overwrite=overwrite,
|
184
|
+
**kwargs,
|
185
|
+
)
|
186
|
+
return self
|
187
|
+
|
188
|
+
# ------------------------------------------------------------------
|
189
|
+
# LLM ENGINE (existing behaviour)
|
190
|
+
# ------------------------------------------------------------------
|
191
|
+
if engine == "llm" and client is None:
|
192
|
+
raise ValueError("LLM engine selected but no 'client' was provided.")
|
193
|
+
|
108
194
|
# Determine PDF instance to get manager
|
109
195
|
pdf_instance = None
|
110
196
|
|
@@ -162,7 +248,7 @@ class ExtractionMixin(ABC):
|
|
162
248
|
data=None,
|
163
249
|
success=False,
|
164
250
|
error_message=f"No content available for extraction (using='{using}')",
|
165
|
-
|
251
|
+
model_used=model, # Use model requested, even if failed
|
166
252
|
)
|
167
253
|
else:
|
168
254
|
result = manager.extract(
|
@@ -277,3 +363,201 @@ class ExtractionMixin(ABC):
|
|
277
363
|
raise TypeError(
|
278
364
|
f"Could not access field/attribute '{field_name}' on extracted data for key '{target_key}' (type: {type(result.data).__name__}). Error: {e}"
|
279
365
|
) from e
|
366
|
+
|
367
|
+
# ------------------------------------------------------------------
|
368
|
+
# Internal helper: Document-QA powered extraction
|
369
|
+
# ------------------------------------------------------------------
|
370
|
+
def _perform_docqa_extraction(
|
371
|
+
self,
|
372
|
+
*,
|
373
|
+
schema: Type[BaseModel],
|
374
|
+
analysis_key: str,
|
375
|
+
model: Optional[str] = None,
|
376
|
+
overwrite: bool = True,
|
377
|
+
min_confidence: float = 0.1,
|
378
|
+
debug: bool = False,
|
379
|
+
question_map: Optional[dict] = None,
|
380
|
+
**kwargs,
|
381
|
+
) -> None:
|
382
|
+
"""Run extraction using the local Document-QA engine.
|
383
|
+
|
384
|
+
Mutates ``self.analyses[analysis_key]`` with a StructuredDataResult.
|
385
|
+
"""
|
386
|
+
question_map = question_map or {}
|
387
|
+
|
388
|
+
try:
|
389
|
+
from natural_pdf.qa.document_qa import get_qa_engine
|
390
|
+
from natural_pdf.extraction.result import StructuredDataResult
|
391
|
+
from pydantic import Field as _Field, create_model
|
392
|
+
import re
|
393
|
+
except ImportError as exc:
|
394
|
+
raise RuntimeError(
|
395
|
+
"Document-QA dependencies missing. Install with `pip install natural-pdf[ai]`."
|
396
|
+
) from exc
|
397
|
+
|
398
|
+
qa_engine = get_qa_engine(model_name=model) if model else get_qa_engine()
|
399
|
+
|
400
|
+
# Iterate over schema fields
|
401
|
+
if hasattr(schema, "__fields__"):
|
402
|
+
fields_iter = schema.__fields__.items() # Pydantic v1
|
403
|
+
else:
|
404
|
+
fields_iter = schema.model_fields.items() # Pydantic v2
|
405
|
+
|
406
|
+
answers: dict = {}
|
407
|
+
confidences: dict = {}
|
408
|
+
errors: list[str] = []
|
409
|
+
|
410
|
+
# Ensure we can call QA on this object type
|
411
|
+
from natural_pdf.core.page import Page as _Page
|
412
|
+
from natural_pdf.elements.region import Region as _Region
|
413
|
+
|
414
|
+
if not isinstance(self, (_Page, _Region)):
|
415
|
+
raise NotImplementedError(
|
416
|
+
"Document-QA extraction is only supported on Page or Region objects."
|
417
|
+
)
|
418
|
+
|
419
|
+
for field_name, field_obj in fields_iter:
|
420
|
+
display_name = getattr(field_obj, "alias", field_name)
|
421
|
+
|
422
|
+
# Compose question text
|
423
|
+
if display_name in question_map:
|
424
|
+
question = question_map[display_name]
|
425
|
+
else:
|
426
|
+
description = None
|
427
|
+
if hasattr(field_obj, "field_info") and hasattr(field_obj.field_info, "description"):
|
428
|
+
description = field_obj.field_info.description
|
429
|
+
elif hasattr(field_obj, "description"):
|
430
|
+
description = field_obj.description
|
431
|
+
|
432
|
+
question = description or f"What is the {display_name.replace('_', ' ')}?"
|
433
|
+
|
434
|
+
try:
|
435
|
+
# Ask via appropriate helper
|
436
|
+
if isinstance(self, _Page):
|
437
|
+
qa_resp = qa_engine.ask_pdf_page(
|
438
|
+
self,
|
439
|
+
question,
|
440
|
+
min_confidence=min_confidence,
|
441
|
+
debug=debug,
|
442
|
+
)
|
443
|
+
else: # Region
|
444
|
+
qa_resp = qa_engine.ask_pdf_region(
|
445
|
+
self,
|
446
|
+
question,
|
447
|
+
min_confidence=min_confidence,
|
448
|
+
debug=debug,
|
449
|
+
)
|
450
|
+
|
451
|
+
confidence_val = qa_resp.get("confidence") if qa_resp else None
|
452
|
+
answer_val = qa_resp.get("answer") if qa_resp else None
|
453
|
+
|
454
|
+
if confidence_val is not None and confidence_val < min_confidence:
|
455
|
+
answer_val = None
|
456
|
+
|
457
|
+
answers[display_name] = answer_val
|
458
|
+
confidences[f"{display_name}_confidence"] = confidence_val
|
459
|
+
|
460
|
+
except Exception as e: # noqa: BLE001
|
461
|
+
logger.error("Doc-QA failed for field '%s': %s", field_name, e)
|
462
|
+
errors.append(str(e))
|
463
|
+
answers[display_name] = None
|
464
|
+
confidences[f"{display_name}_confidence"] = None
|
465
|
+
|
466
|
+
combined = {**answers, **confidences}
|
467
|
+
|
468
|
+
# Build extended model that includes confidence fields
|
469
|
+
field_defs_ext = {}
|
470
|
+
for orig_key, val in combined.items():
|
471
|
+
safe_key = re.sub(r"[^0-9a-zA-Z_]", "_", orig_key)
|
472
|
+
if safe_key and safe_key[0].isdigit():
|
473
|
+
safe_key = f"_{safe_key}"
|
474
|
+
|
475
|
+
if orig_key.endswith("_confidence"):
|
476
|
+
field_defs_ext[safe_key] = (
|
477
|
+
Optional[float],
|
478
|
+
_Field(None, description=f"Confidence for {orig_key}", alias=orig_key),
|
479
|
+
)
|
480
|
+
else:
|
481
|
+
field_defs_ext[safe_key] = (
|
482
|
+
Optional[type(val) if val is not None else str],
|
483
|
+
_Field(None, alias=orig_key),
|
484
|
+
)
|
485
|
+
|
486
|
+
ExtendedSchema = create_model(f"{schema.__name__}WithConf", **field_defs_ext)
|
487
|
+
|
488
|
+
try:
|
489
|
+
structured_instance = ExtendedSchema(**combined)
|
490
|
+
success_flag = not errors
|
491
|
+
err_msg = None if not errors else "; ".join(errors)
|
492
|
+
except Exception as exc: # noqa: BLE001
|
493
|
+
structured_instance = None
|
494
|
+
success_flag = False
|
495
|
+
err_msg = str(exc)
|
496
|
+
|
497
|
+
result = StructuredDataResult(
|
498
|
+
data=structured_instance if structured_instance is not None else combined,
|
499
|
+
success=success_flag,
|
500
|
+
error_message=err_msg,
|
501
|
+
model_used=getattr(qa_engine, "model_name", None),
|
502
|
+
)
|
503
|
+
|
504
|
+
self.analyses[analysis_key] = result
|
505
|
+
|
506
|
+
# ------------------------------------------------------------------
|
507
|
+
# Internal helper: LLM powered extraction (existing behaviour)
|
508
|
+
# ------------------------------------------------------------------
|
509
|
+
def _perform_llm_extraction(
|
510
|
+
self,
|
511
|
+
*,
|
512
|
+
schema: Type[BaseModel],
|
513
|
+
client: Any,
|
514
|
+
analysis_key: str,
|
515
|
+
prompt: Optional[str] = None,
|
516
|
+
using: str = "text",
|
517
|
+
model: Optional[str] = None,
|
518
|
+
overwrite: bool = True,
|
519
|
+
**kwargs,
|
520
|
+
) -> None:
|
521
|
+
"""Run extraction via the StructuredDataManager (LLM)."""
|
522
|
+
|
523
|
+
from natural_pdf.extraction.result import StructuredDataResult
|
524
|
+
|
525
|
+
# Determine PDF instance to obtain StructuredDataManager
|
526
|
+
pdf_instance = None
|
527
|
+
|
528
|
+
if hasattr(self, "get_manager") and callable(self.get_manager):
|
529
|
+
pdf_instance = self
|
530
|
+
elif hasattr(self, "pdf") and hasattr(self.pdf, "get_manager"):
|
531
|
+
pdf_instance = self.pdf
|
532
|
+
elif hasattr(self, "page") and hasattr(self.page, "pdf") and hasattr(self.page.pdf, "get_manager"):
|
533
|
+
pdf_instance = self.page.pdf
|
534
|
+
else:
|
535
|
+
raise RuntimeError("Cannot access PDF manager to perform LLM extraction.")
|
536
|
+
|
537
|
+
manager = pdf_instance.get_manager("structured_data")
|
538
|
+
if not manager or not manager.is_available():
|
539
|
+
raise RuntimeError("StructuredDataManager is not available")
|
540
|
+
|
541
|
+
# Content preparation
|
542
|
+
layout_for_text = kwargs.pop("layout", True)
|
543
|
+
content = self._get_extraction_content(using=using, layout=layout_for_text, **kwargs)
|
544
|
+
|
545
|
+
if content is None or (using == "text" and isinstance(content, str) and not content.strip()):
|
546
|
+
result = StructuredDataResult(
|
547
|
+
data=None,
|
548
|
+
success=False,
|
549
|
+
error_message=f"No content available for extraction (using='{using}')",
|
550
|
+
model_used=model,
|
551
|
+
)
|
552
|
+
else:
|
553
|
+
result = manager.extract(
|
554
|
+
content=content,
|
555
|
+
schema=schema,
|
556
|
+
client=client,
|
557
|
+
prompt=prompt,
|
558
|
+
using=using,
|
559
|
+
model=model,
|
560
|
+
**kwargs,
|
561
|
+
)
|
562
|
+
|
563
|
+
self.analyses[analysis_key] = result
|
natural_pdf/extraction/result.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
from typing import Any, Generic, Optional, TypeVar
|
2
|
+
from collections.abc import Mapping
|
2
3
|
|
3
4
|
from pydantic import BaseModel, Field
|
4
5
|
|
@@ -6,7 +7,7 @@ from pydantic import BaseModel, Field
|
|
6
7
|
T_Schema = TypeVar("T_Schema", bound=BaseModel)
|
7
8
|
|
8
9
|
|
9
|
-
class StructuredDataResult(BaseModel, Generic[T_Schema]):
|
10
|
+
class StructuredDataResult(BaseModel, Generic[T_Schema], Mapping):
|
10
11
|
"""
|
11
12
|
Represents the result of a structured data extraction operation.
|
12
13
|
|
@@ -21,3 +22,29 @@ class StructuredDataResult(BaseModel, Generic[T_Schema]):
|
|
21
22
|
|
22
23
|
class Config:
|
23
24
|
arbitrary_types_allowed = True
|
25
|
+
|
26
|
+
# ---------------------------------------------------------------------
|
27
|
+
# Mapping interface implementation
|
28
|
+
# ---------------------------------------------------------------------
|
29
|
+
|
30
|
+
def _as_dict(self) -> dict:
|
31
|
+
"""Return the underlying data as a plain dict (Pydantic v1 & v2 safe)."""
|
32
|
+
if hasattr(self, "model_dump"):
|
33
|
+
# Pydantic v2
|
34
|
+
return self.model_dump()
|
35
|
+
else:
|
36
|
+
# Pydantic v1
|
37
|
+
return self.dict()
|
38
|
+
|
39
|
+
def __iter__(self):
|
40
|
+
"""Iterate over keys, preserving insertion order guaranteed in Py≥3.7."""
|
41
|
+
return iter(self._as_dict())
|
42
|
+
|
43
|
+
def __getitem__(self, key):
|
44
|
+
try:
|
45
|
+
return self._as_dict()[key]
|
46
|
+
except KeyError as exc:
|
47
|
+
raise KeyError(key) from exc
|
48
|
+
|
49
|
+
def __len__(self):
|
50
|
+
return len(self._as_dict())
|
natural_pdf/flows/region.py
CHANGED
@@ -426,7 +426,7 @@ class FlowRegion:
|
|
426
426
|
for region_part in self.constituent_regions:
|
427
427
|
try:
|
428
428
|
img = region_part.to_image(
|
429
|
-
resolution=resolution,
|
429
|
+
resolution=resolution, crop=True, include_highlights=False, **kwargs
|
430
430
|
)
|
431
431
|
if img:
|
432
432
|
cropped_images.append(img)
|
natural_pdf/ocr/engine_surya.py
CHANGED
@@ -72,11 +72,31 @@ class SuryaOCREngine(OCREngine):
|
|
72
72
|
if detect_only:
|
73
73
|
results = self._detection_predictor(images=[image])
|
74
74
|
else:
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
75
|
+
# Some Surya versions require 'langs' parameter in the __call__ while
|
76
|
+
# others assume the predictor was initialized with languages already.
|
77
|
+
# Inspect the callable signature to decide what to pass.
|
78
|
+
import inspect
|
79
|
+
|
80
|
+
recog_callable = self._recognition_predictor
|
81
|
+
try:
|
82
|
+
sig = inspect.signature(recog_callable)
|
83
|
+
has_langs_param = "langs" in sig.parameters
|
84
|
+
except (TypeError, ValueError):
|
85
|
+
# Fallback: assume langs not required if signature cannot be inspected
|
86
|
+
has_langs_param = False
|
87
|
+
|
88
|
+
if has_langs_param:
|
89
|
+
results = recog_callable(
|
90
|
+
langs=langs,
|
91
|
+
images=[image],
|
92
|
+
det_predictor=self._detection_predictor,
|
93
|
+
)
|
94
|
+
else:
|
95
|
+
# Older/newer Surya versions that omit 'langs'
|
96
|
+
results = recog_callable(
|
97
|
+
images=[image],
|
98
|
+
det_predictor=self._detection_predictor,
|
99
|
+
)
|
80
100
|
|
81
101
|
# Surya may return a list with one result per image or a single result object
|
82
102
|
# Return the result as-is and handle the extraction in _standardize_results
|
natural_pdf/qa/__init__.py
CHANGED
natural_pdf/qa/document_qa.py
CHANGED
@@ -8,6 +8,7 @@ import numpy as np
|
|
8
8
|
from PIL import Image, ImageDraw
|
9
9
|
|
10
10
|
from natural_pdf.elements.collections import ElementCollection
|
11
|
+
from .qa_result import QAResult
|
11
12
|
|
12
13
|
logger = logging.getLogger("natural_pdf.qa.document_qa")
|
13
14
|
|
@@ -123,7 +124,7 @@ class DocumentQA:
|
|
123
124
|
min_confidence: float = 0.1,
|
124
125
|
debug: bool = False,
|
125
126
|
debug_output_dir: str = "output",
|
126
|
-
) ->
|
127
|
+
) -> QAResult:
|
127
128
|
"""
|
128
129
|
Ask a question about document content.
|
129
130
|
|
@@ -136,12 +137,7 @@ class DocumentQA:
|
|
136
137
|
debug_output_dir: Directory to save debug files
|
137
138
|
|
138
139
|
Returns:
|
139
|
-
|
140
|
-
"answer": extracted text,
|
141
|
-
"confidence": confidence score,
|
142
|
-
"start": start word index,
|
143
|
-
"end": end word index
|
144
|
-
}
|
140
|
+
QAResult instance with answer details
|
145
141
|
"""
|
146
142
|
if not self._is_initialized:
|
147
143
|
raise RuntimeError("DocumentQA is not properly initialized")
|
@@ -225,25 +221,25 @@ class DocumentQA:
|
|
225
221
|
# Check confidence against threshold
|
226
222
|
if result["score"] < min_confidence:
|
227
223
|
logger.info(f"Answer confidence {result['score']:.4f} below threshold {min_confidence}")
|
228
|
-
return
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
return
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
224
|
+
return QAResult(
|
225
|
+
answer="",
|
226
|
+
confidence=result["score"],
|
227
|
+
start=result.get("start", -1),
|
228
|
+
end=result.get("end", -1),
|
229
|
+
found=False,
|
230
|
+
)
|
231
|
+
|
232
|
+
return QAResult(
|
233
|
+
answer=result["answer"],
|
234
|
+
confidence=result["score"],
|
235
|
+
start=result.get("start", 0),
|
236
|
+
end=result.get("end", 0),
|
237
|
+
found=True,
|
238
|
+
)
|
243
239
|
|
244
240
|
def ask_pdf_page(
|
245
241
|
self, page, question: str, min_confidence: float = 0.1, debug: bool = False
|
246
|
-
) ->
|
242
|
+
) -> QAResult:
|
247
243
|
"""
|
248
244
|
Ask a question about a specific PDF page.
|
249
245
|
|
@@ -253,7 +249,7 @@ class DocumentQA:
|
|
253
249
|
min_confidence: Minimum confidence threshold for answers
|
254
250
|
|
255
251
|
Returns:
|
256
|
-
|
252
|
+
QAResult instance with answer details
|
257
253
|
"""
|
258
254
|
# Ensure we have text elements on the page
|
259
255
|
if not page.find_all("text"):
|
@@ -284,12 +280,12 @@ class DocumentQA:
|
|
284
280
|
)
|
285
281
|
|
286
282
|
# Add page reference to the result
|
287
|
-
result
|
283
|
+
result.page_num = page.index
|
288
284
|
|
289
285
|
# Add element references if possible
|
290
|
-
if result.
|
291
|
-
start_idx = result
|
292
|
-
end_idx = result
|
286
|
+
if result.found and "start" in result and "end" in result:
|
287
|
+
start_idx = result.start
|
288
|
+
end_idx = result.end
|
293
289
|
|
294
290
|
# Make sure we have valid indices and elements to work with
|
295
291
|
if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
|
@@ -308,7 +304,7 @@ class DocumentQA:
|
|
308
304
|
if element.text in matched_texts:
|
309
305
|
matched_texts.remove(element.text)
|
310
306
|
|
311
|
-
result
|
307
|
+
result.source_elements = ElementCollection(source_elements)
|
312
308
|
|
313
309
|
return result
|
314
310
|
|
@@ -319,7 +315,7 @@ class DocumentQA:
|
|
319
315
|
|
320
316
|
def ask_pdf_region(
|
321
317
|
self, region, question: str, min_confidence: float = 0.1, debug: bool = False
|
322
|
-
) ->
|
318
|
+
) -> QAResult:
|
323
319
|
"""
|
324
320
|
Ask a question about a specific region of a PDF page.
|
325
321
|
|
@@ -329,7 +325,7 @@ class DocumentQA:
|
|
329
325
|
min_confidence: Minimum confidence threshold for answers
|
330
326
|
|
331
327
|
Returns:
|
332
|
-
|
328
|
+
QAResult instance with answer details
|
333
329
|
"""
|
334
330
|
# Get all text elements within the region
|
335
331
|
elements = region.find_all("text")
|
@@ -366,13 +362,13 @@ class DocumentQA:
|
|
366
362
|
)
|
367
363
|
|
368
364
|
# Add region reference to the result
|
369
|
-
result
|
370
|
-
result
|
365
|
+
result.region = region
|
366
|
+
result.page_num = region.page.index
|
371
367
|
|
372
368
|
# Add element references if possible
|
373
|
-
if result.
|
374
|
-
start_idx = result
|
375
|
-
end_idx = result
|
369
|
+
if result.found and "start" in result and "end" in result:
|
370
|
+
start_idx = result.start
|
371
|
+
end_idx = result.end
|
376
372
|
|
377
373
|
# Make sure we have valid indices and elements to work with
|
378
374
|
if elements and 0 <= start_idx < len(word_boxes) and 0 <= end_idx < len(word_boxes):
|
@@ -391,7 +387,7 @@ class DocumentQA:
|
|
391
387
|
if element.text in matched_texts:
|
392
388
|
matched_texts.remove(element.text)
|
393
389
|
|
394
|
-
result
|
390
|
+
result.source_elements = ElementCollection(source_elements)
|
395
391
|
|
396
392
|
return result
|
397
393
|
|
@@ -0,0 +1,55 @@
|
|
1
|
+
class QAResult(dict):
|
2
|
+
"""Dictionary-like container for Document QA results with a convenient ``show`` method.
|
3
|
+
|
4
|
+
This class behaves exactly like a regular ``dict`` so existing code that
|
5
|
+
expects a mapping will continue to work. In addition it exposes:
|
6
|
+
|
7
|
+
• ``show()`` – delegates to the underlying ``source_elements.show`` if those
|
8
|
+
elements are present (added automatically by ``ask_pdf_page`` and
|
9
|
+
``ask_pdf_region``). This provides a quick way to visualise where an
|
10
|
+
answer was found in the document.
|
11
|
+
|
12
|
+
• Attribute access (e.g. ``result.answer``) as sugar for the usual
|
13
|
+
``result["answer"]``.
|
14
|
+
"""
|
15
|
+
|
16
|
+
# ---------------------------------------------------------------------
|
17
|
+
# Convenience helpers
|
18
|
+
# ---------------------------------------------------------------------
|
19
|
+
def show(self, *args, **kwargs):
|
20
|
+
"""Display the answer region by delegating to ``source_elements.show``.
|
21
|
+
|
22
|
+
Any positional or keyword arguments are forwarded to
|
23
|
+
``ElementCollection.show``.
|
24
|
+
"""
|
25
|
+
source = self.get("source_elements")
|
26
|
+
if source is None:
|
27
|
+
raise AttributeError(
|
28
|
+
"QAResult does not contain 'source_elements'; nothing to show()."
|
29
|
+
)
|
30
|
+
if not hasattr(source, "show"):
|
31
|
+
raise AttributeError(
|
32
|
+
"'source_elements' object has no 'show' method; cannot visualise."
|
33
|
+
)
|
34
|
+
return source.show(*args, **kwargs)
|
35
|
+
|
36
|
+
# ------------------------------------------------------------------
|
37
|
+
# Attribute <-> key delegation so ``result.answer`` works
|
38
|
+
# ------------------------------------------------------------------
|
39
|
+
def __getattr__(self, item):
|
40
|
+
try:
|
41
|
+
return self[item]
|
42
|
+
except KeyError as exc:
|
43
|
+
raise AttributeError(item) from exc
|
44
|
+
|
45
|
+
def __setattr__(self, key, value):
|
46
|
+
# Store all non-dunder attributes in the underlying mapping so that
|
47
|
+
# they remain serialisable.
|
48
|
+
if key.startswith("__") and key.endswith("__"):
|
49
|
+
super().__setattr__(key, value)
|
50
|
+
else:
|
51
|
+
self[key] = value
|
52
|
+
|
53
|
+
# Ensure ``copy`` keeps the subclass type
|
54
|
+
def copy(self):
|
55
|
+
return QAResult(self)
|