nfield 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nfield/__init__.py +102 -0
- nfield/_version.py +17 -0
- nfield/assembly/__init__.py +49 -0
- nfield/assembly/_blackboard.py +488 -0
- nfield/assembly/_quality.py +254 -0
- nfield/assembly/_trie.py +376 -0
- nfield/cli/__init__.py +22 -0
- nfield/cli/_app.py +235 -0
- nfield/config.py +207 -0
- nfield/engine/__init__.py +39 -0
- nfield/engine/_async.py +610 -0
- nfield/engine/_sync.py +248 -0
- nfield/exceptions.py +251 -0
- nfield/export.py +145 -0
- nfield/extraction/__init__.py +55 -0
- nfield/extraction/_papt.py +335 -0
- nfield/extraction/_prompt.py +461 -0
- nfield/extraction/_sfep.py +572 -0
- nfield/io.py +122 -0
- nfield/pipeline/__init__.py +62 -0
- nfield/pipeline/_coverage.py +88 -0
- nfield/pipeline/_state.py +135 -0
- nfield/pipeline/_structure.py +653 -0
- nfield/pipeline/s0_resources.py +63 -0
- nfield/pipeline/s1_schema.py +97 -0
- nfield/pipeline/s2a_structure.py +54 -0
- nfield/pipeline/s2b_prepass.py +342 -0
- nfield/pipeline/s2c_packing.py +898 -0
- nfield/pipeline/s3_excerpt.py +130 -0
- nfield/pipeline/s4_extract.py +348 -0
- nfield/pipeline/s5_validate.py +132 -0
- nfield/pipeline/s5b_recover.py +418 -0
- nfield/pipeline/s6_assemble.py +157 -0
- nfield/providers/__init__.py +45 -0
- nfield/providers/_base.py +240 -0
- nfield/providers/_protocol.py +79 -0
- nfield/providers/_reasoning.py +108 -0
- nfield/providers/_registry.py +171 -0
- nfield/providers/_token_budget.py +77 -0
- nfield/providers/groq/__init__.py +10 -0
- nfield/providers/groq/_provider.py +287 -0
- nfield/providers/openai/__init__.py +10 -0
- nfield/providers/openai/_provider.py +300 -0
- nfield/py.typed +0 -0
- nfield/retrieval/__init__.py +41 -0
- nfield/retrieval/_bmx.py +213 -0
- nfield/retrieval/_chunker.py +495 -0
- nfield/retrieval/_glean.py +470 -0
- nfield/retrieval/_morphology.py +272 -0
- nfield/retrieval/_retarget.py +145 -0
- nfield/retrieval/_tokenize.py +62 -0
- nfield/schema/__init__.py +49 -0
- nfield/schema/_deps.py +154 -0
- nfield/schema/_difficulty.py +193 -0
- nfield/schema/_flatten.py +492 -0
- nfield/schema/_preflight.py +393 -0
- nfield/schema/_tau.py +173 -0
- nfield/schema/_types.py +282 -0
- nfield/types.py +223 -0
- nfield/validation/__init__.py +71 -0
- nfield/validation/_grounding.py +294 -0
- nfield/validation/_normalize.py +132 -0
- nfield/validation/_retry.py +576 -0
- nfield/validation/_type_check.py +334 -0
- nfield-0.1.0.dist-info/METADATA +201 -0
- nfield-0.1.0.dist-info/RECORD +69 -0
- nfield-0.1.0.dist-info/WHEEL +4 -0
- nfield-0.1.0.dist-info/entry_points.txt +2 -0
- nfield-0.1.0.dist-info/licenses/LICENSE +192 -0
nfield/__init__.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""nfield - N-field structured extraction from documents with LLMs.
|
|
2
|
+
|
|
3
|
+
Extract hundreds of structured fields from any document without the format tax.
|
|
4
|
+
|
|
5
|
+
Quickstart:
|
|
6
|
+
>>> from nfield import nfield
|
|
7
|
+
>>> # result = nfield(document, MySchema, "groq/llama-3.1-8b")
|
|
8
|
+
>>> # result.data, result.metadata, result.status
|
|
9
|
+
|
|
10
|
+
Every public name is imported lazily, so ``import nfield`` stays fast and
|
|
11
|
+
never fails because an optional provider SDK (e.g. groq) is not installed.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import importlib
|
|
17
|
+
from typing import TYPE_CHECKING
|
|
18
|
+
|
|
19
|
+
from ._version import __version__
|
|
20
|
+
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from .config import ExtractionConfig
|
|
23
|
+
from .engine import AsyncNField, NField, nfield, nfield_async
|
|
24
|
+
from .exceptions import (
|
|
25
|
+
AssemblyError,
|
|
26
|
+
ExtractionError,
|
|
27
|
+
NFieldError,
|
|
28
|
+
ProviderError,
|
|
29
|
+
SchemaError,
|
|
30
|
+
ValidationError,
|
|
31
|
+
)
|
|
32
|
+
from .export import result_to_dataframe, results_to_csv, results_to_dataframe
|
|
33
|
+
from .io import load_document, load_results, load_schema, save_results
|
|
34
|
+
from .providers import from_model
|
|
35
|
+
from .types import ExtractionResult, ExtractionStatus, FieldResult, Metadata
|
|
36
|
+
|
|
37
|
+
__all__ = [
|
|
38
|
+
"AssemblyError",
|
|
39
|
+
"AsyncNField",
|
|
40
|
+
"ExtractionConfig",
|
|
41
|
+
"ExtractionError",
|
|
42
|
+
"ExtractionResult",
|
|
43
|
+
"ExtractionStatus",
|
|
44
|
+
"FieldResult",
|
|
45
|
+
"Metadata",
|
|
46
|
+
"NField",
|
|
47
|
+
"NFieldError",
|
|
48
|
+
"ProviderError",
|
|
49
|
+
"SchemaError",
|
|
50
|
+
"ValidationError",
|
|
51
|
+
"__version__",
|
|
52
|
+
"from_model",
|
|
53
|
+
"load_document",
|
|
54
|
+
"load_results",
|
|
55
|
+
"load_schema",
|
|
56
|
+
"nfield",
|
|
57
|
+
"nfield_async",
|
|
58
|
+
"result_to_dataframe",
|
|
59
|
+
"results_to_csv",
|
|
60
|
+
"results_to_dataframe",
|
|
61
|
+
"save_results",
|
|
62
|
+
]
|
|
63
|
+
|
|
64
|
+
# Lazy-export table: public name -> relative submodule that defines it.
# ``__getattr__`` below consults this mapping the first time a name is accessed.
_dynamic_imports: dict[str, str] = {
    # Entry-point functions and engine classes
    "nfield": ".engine",
    "nfield_async": ".engine",
    "NField": ".engine",
    "AsyncNField": ".engine",
    # Filesystem helpers (load inputs, persist results)
    "load_document": ".io",
    "load_schema": ".io",
    "save_results": ".io",
    "load_results": ".io",
    # Tabular export (optional pandas dependency)
    "results_to_dataframe": ".export",
    "result_to_dataframe": ".export",
    "results_to_csv": ".export",
    # Provider factory
    "from_model": ".providers",
    # Config
    "ExtractionConfig": ".config",
    # Types
    "ExtractionResult": ".types",
    "FieldResult": ".types",
    "Metadata": ".types",
    "ExtractionStatus": ".types",
    # Exceptions
    "NFieldError": ".exceptions",
    "SchemaError": ".exceptions",
    "ProviderError": ".exceptions",
    "ExtractionError": ".exceptions",
    "ValidationError": ".exceptions",
    "AssemblyError": ".exceptions",
}


def __getattr__(name: str) -> object:
    """Resolve a lazily exported public name on first access (PEP 562).

    Looks *name* up in ``_dynamic_imports``, imports the submodule that
    defines it, and returns the attribute of the same name from that module.
    The resolved object is stored in this module's globals, so later accesses
    are ordinary attribute lookups and do not call this hook again.

    Args:
        name: Attribute requested on the package.

    Returns:
        The object named *name* from its defining submodule.

    Raises:
        AttributeError: If *name* is not one of the lazily exported names.
    """
    module_path = _dynamic_imports.get(name)
    if module_path is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    module = importlib.import_module(module_path, package=__name__)
    value = getattr(module, name)
    # Cache the result: module __getattr__ only runs when normal lookup fails,
    # so once the name is in globals() this function is bypassed for it.
    globals()[name] = value
    return value
|
nfield/_version.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Single source of version truth.
|
|
2
|
+
|
|
3
|
+
hatch-vcs writes the real version at build time from git tags.
|
|
4
|
+
At development time, falls back to "0.0.0+unknown".
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
# Start from the placeholder; it is replaced only if the installed
# distribution's metadata can actually be read.
__version__ = "0.0.0+unknown"

try:
    from importlib.metadata import PackageNotFoundError, version

    try:
        # Version string recorded in the installed "nfield" distribution.
        __version__ = version("nfield")
    except PackageNotFoundError:
        # No installed distribution metadata: keep the placeholder.
        pass
except ImportError:
    # importlib.metadata could not be imported: keep the placeholder.
    pass
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""nfield assembly module - JSON assembly, blackboard, and quality scoring.
|
|
2
|
+
|
|
3
|
+
Public surface
|
|
4
|
+
--------------
|
|
5
|
+
* :func:`assemble_json` - assemble flat SFEP results into a nested JSON dict.
|
|
6
|
+
* :func:`parse_path_segments` - parse dot-notation path into segment list.
|
|
7
|
+
* :class:`RadixTrie` - low-level trie for custom assembly workflows.
|
|
8
|
+
* :class:`Blackboard` - per-field state machine for extraction state tracking.
|
|
9
|
+
* :class:`FieldState` - enum of the 6 blackboard field states.
|
|
10
|
+
* :func:`compute_quality_score` - compute quality metrics from blackboard state.
|
|
11
|
+
* :class:`QualityReport` - immutable quality metrics dataclass.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import importlib
|
|
17
|
+
from typing import TYPE_CHECKING
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
from ._blackboard import Blackboard, FieldState
|
|
21
|
+
from ._quality import QualityReport, compute_quality_score
|
|
22
|
+
from ._trie import RadixTrie, assemble_json, parse_path_segments
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"Blackboard",
|
|
26
|
+
"FieldState",
|
|
27
|
+
"QualityReport",
|
|
28
|
+
"RadixTrie",
|
|
29
|
+
"assemble_json",
|
|
30
|
+
"compute_quality_score",
|
|
31
|
+
"parse_path_segments",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
# Lazy-export table: public name -> relative submodule that defines it.
# ``__getattr__`` below consults this mapping the first time a name is accessed.
_dynamic_imports: dict[str, str] = {
    "Blackboard": "._blackboard",
    "FieldState": "._blackboard",
    "QualityReport": "._quality",
    "compute_quality_score": "._quality",
    "RadixTrie": "._trie",
    "assemble_json": "._trie",
    "parse_path_segments": "._trie",
}


def __getattr__(name: str) -> object:
    """Resolve a lazily exported public name on first access (PEP 562).

    Looks *name* up in ``_dynamic_imports``, imports the submodule that
    defines it, and returns the attribute of the same name from that module.
    The resolved object is stored in this module's globals, so later accesses
    are ordinary attribute lookups and do not call this hook again.

    Args:
        name: Attribute requested on the package.

    Returns:
        The object named *name* from its defining submodule.

    Raises:
        AttributeError: If *name* is not one of the lazily exported names.
    """
    module_path = _dynamic_imports.get(name)
    if module_path is None:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    module = importlib.import_module(module_path, package=__name__)
    value = getattr(module, name)
    # Cache the result: module __getattr__ only runs when normal lookup fails,
    # so once the name is in globals() this function is bypassed for it.
    globals()[name] = value
    return value
|
|
@@ -0,0 +1,488 @@
|
|
|
1
|
+
"""Blackboard state machine for per-field extraction state tracking.
|
|
2
|
+
|
|
3
|
+
The Blackboard is the shared data structure that accumulates extraction
|
|
4
|
+
results across all capacity leaves and retry rounds. It tracks the state
|
|
5
|
+
of each field throughout the pipeline using a finite state machine with
|
|
6
|
+
6 states per field.
|
|
7
|
+
|
|
8
|
+
State transitions
|
|
9
|
+
-----------------
|
|
10
|
+
|
|
11
|
+
EMPTY ──mark_pending()──► PENDING ──write()──► FILLED
|
|
12
|
+
│
|
|
13
|
+
mark_failed() ─────┼───► FAILED
|
|
14
|
+
mark_needs_revalidation() ► NEEDS_REVALIDATION
|
|
15
|
+
write() (conflict) ─────► CONFLICT
|
|
16
|
+
|
|
17
|
+
Notes
|
|
18
|
+
-----
|
|
19
|
+
* ``write_raw()`` is the dep-change-safe variant: it does NOT transition
|
|
20
|
+
a FILLED field back to PENDING. Used when updating a dependency whose
|
|
21
|
+
change may invalidate a dependent field.
|
|
22
|
+
* Cross-leaf conflict detection: if two leaves extract different non-None
|
|
23
|
+
values for the same field, the state transitions to ``CONFLICT`` and
|
|
24
|
+
both values are stored for reporting.
|
|
25
|
+
* A ``CONFLICT`` field is never moved back to ``FILLED`` by ``write()``; a
|
|
26
|
+
``FAILED`` field is (retry recovery). ``reopen_for_retry()`` returns either to ``PENDING``.
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
from __future__ import annotations
|
|
30
|
+
|
|
31
|
+
from enum import Enum
|
|
32
|
+
from typing import Any
|
|
33
|
+
|
|
34
|
+
from nfield.exceptions import AssemblyError
|
|
35
|
+
|
|
36
|
+
__all__ = [
|
|
37
|
+
"Blackboard",
|
|
38
|
+
"FieldState",
|
|
39
|
+
]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# ---------------------------------------------------------------------------
|
|
43
|
+
# FieldState enum
|
|
44
|
+
# ---------------------------------------------------------------------------
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class FieldState(Enum):
    """Lifecycle state of one field on the extraction blackboard.

    Each member's value is the lowercase string form of its name. Members are
    declared in lifecycle order, which is also the iteration order of the enum.

    Example:
        >>> FieldState.FILLED.value
        'filled'
    """

    # No extraction output has mentioned this field yet.
    EMPTY = "empty"
    # Extraction is in flight / the field was reopened, but nothing is confirmed.
    PENDING = "pending"
    # The field holds a validated (at least type-valid) value.
    FILLED = "filled"
    # Extraction failed and retry did not recover the field.
    FAILED = "failed"
    # Two or more leaves produced different values for the field.
    CONFLICT = "conflict"
    # The field is flagged for human or semantic review.
    NEEDS_REVALIDATION = "needs_revalidation"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
# ---------------------------------------------------------------------------
|
|
72
|
+
# Blackboard
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
#
|
|
75
|
+
# State transitions are enforced inline in each write/mark_* method below, not via
|
|
76
|
+
# a transition table: the real rules are value-dependent (a same-value re-write is a
|
|
77
|
+
# no-op, a different value escalates to CONFLICT, a transient flag tags a FAILED),
|
|
78
|
+
# which a flat state->states table cannot express. The legal moves, for reference:
|
|
79
|
+
# EMPTY -> PENDING | FILLED | FAILED
|
|
80
|
+
# PENDING -> FILLED | FAILED | CONFLICT
|
|
81
|
+
# FILLED -> CONFLICT | FAILED | NEEDS_REVALIDATION
|
|
82
|
+
# FAILED -> FILLED | NEEDS_REVALIDATION (+ reopen_for_retry -> PENDING)
|
|
83
|
+
# CONFLICT -> NEEDS_REVALIDATION (+ reopen_for_retry -> PENDING)
|
|
84
|
+
# NEEDS_REVALIDATION -> (terminal) (+ reopen_for_retry -> PENDING)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class Blackboard:
    """Per-field state machine tracking extraction results across all leaves.

    The Blackboard is initialised with the complete list of field paths
    from Stage 1 and accumulates values written by Stage 4 (extraction)
    and Stage 5 (validation + retry).

    Attributes:
        _states: Mapping of field path to current FieldState.
        _values: Mapping of field path to current typed value.
        _errors: Mapping of failed field paths to error messages.
        _conflict_values: Mapping of conflicted field paths to all seen values.
        _call_failed: Paths whose failure was flagged as a transient call/API error.

    Example:
        >>> bb = Blackboard(["name", "age"])
        >>> bb.write("name", "Alice")
        >>> bb.get_filled()
        {'name': 'Alice'}
        >>> bb.get_missing()
        ['age']
    """

    def __init__(self, paths: list[str]) -> None:
        """Initialise a Blackboard for the given field paths.

        Args:
            paths: All field paths from the flattened schema (Stage 1 output).
                All paths start in ``EMPTY`` state.

        Raises:
            ValueError: If *paths* contains duplicates. The message lists each
                duplicated path once, in order of first repetition.
        """
        if len(paths) != len(set(paths)):
            # Single pass; a dict keeps first-repetition order and de-duplicates
            # the report (a path repeated three times is listed once).
            seen: set[str] = set()
            repeated: dict[str, None] = {}
            for p in paths:
                if p in seen:
                    repeated[p] = None
                else:
                    seen.add(p)
            duplicates = list(repeated)
            raise ValueError(f"Blackboard paths must be unique; duplicates: {duplicates}")
        self._states: dict[str, FieldState] = dict.fromkeys(paths, FieldState.EMPTY)
        self._values: dict[str, Any] = {}
        self._errors: dict[str, str] = {}
        self._conflict_values: dict[str, list[Any]] = {}
        # Paths whose FAILED state is a transient API/call failure (the call never
        # returned), tracked apart from a genuine "absent in document" failure.
        self._call_failed: set[str] = set()

    # ------------------------------------------------------------------
    # Write operations
    # ------------------------------------------------------------------

    def write(self, path: str, value: Any) -> None:
        """Write a value for a field, transitioning its state.

        Transitions:
            * ``EMPTY`` / ``PENDING`` → ``FILLED``
            * ``FILLED`` with an equal value → no-op
            * ``FILLED`` with a different value → ``CONFLICT`` (both values kept)
            * ``CONFLICT`` → stays ``CONFLICT``; the value is added to the
              recorded conflicting values if not already present
            * ``FAILED`` → ``FILLED`` (retry recovered this field); the stored
              error message and transient-failure flag are cleared
            * ``NEEDS_REVALIDATION`` → ignored

        Args:
            path: Dot-notation field path.
            value: Typed Python value from the SFEP parser.

        Raises:
            AssemblyError: If the path is not registered in this blackboard.
        """
        self._require_path(path)
        state = self._states[path]

        if state == FieldState.FILLED:
            existing = self._values.get(path)
            if existing == value:
                return  # Same value from a second leaf - no conflict
            # Different value from a second leaf - record both and escalate.
            # The stored value in _values is left as the first one written.
            self._conflict_values.setdefault(path, [existing])
            if value not in self._conflict_values[path]:
                self._conflict_values[path].append(value)
            self._states[path] = FieldState.CONFLICT
            return

        if state == FieldState.CONFLICT:
            # Already conflicted - accumulate additional distinct values
            if value not in self._conflict_values.get(path, []):
                self._conflict_values.setdefault(path, []).append(value)
            return

        if state == FieldState.NEEDS_REVALIDATION:
            # Terminal state - no further writes
            return

        # EMPTY / PENDING / FAILED → FILLED
        self._values[path] = value
        self._states[path] = FieldState.FILLED
        # A recovered field must not keep reporting its old failure: drop the
        # stale error message along with the transient-failure flag.
        self._errors.pop(path, None)
        self._call_failed.discard(path)

    def write_raw(self, path: str, value: Any) -> None:
        """Dependency-change-safe write: does not overwrite a FILLED field.

        Used when updating dependency values that may propagate to dependent
        fields. Prevents overwriting a valid extracted value with a stale
        dependency update.

        If the field is ``EMPTY`` or ``PENDING``, the value is stored and the
        field becomes ``FILLED``. If the field is already ``FILLED``, the write
        is silently discarded and the field is flagged ``NEEDS_REVALIDATION``
        (since its dependency changed, its value may be stale). Fields in
        ``FAILED``, ``CONFLICT`` or ``NEEDS_REVALIDATION`` are left untouched.

        Args:
            path: Dot-notation field path.
            value: New typed Python value.

        Raises:
            AssemblyError: If the path is not registered in this blackboard.
        """
        self._require_path(path)
        state = self._states[path]

        if state == FieldState.FILLED:
            # Dependency changed while this field already has a value -
            # flag for revalidation without overwriting
            self._states[path] = FieldState.NEEDS_REVALIDATION
            return

        if state in (FieldState.FAILED, FieldState.CONFLICT, FieldState.NEEDS_REVALIDATION):
            return  # Cannot update terminal/conflict states

        # EMPTY / PENDING → write normally
        self._values[path] = value
        self._states[path] = FieldState.FILLED

    # ------------------------------------------------------------------
    # State transitions
    # ------------------------------------------------------------------

    def mark_failed(self, path: str, error: str, *, transient: bool = False) -> None:
        """Transition a field to ``FAILED`` state with an error message.

        Only fields in ``EMPTY``, ``PENDING``, ``FILLED`` or ``FAILED`` are
        affected; ``CONFLICT`` and ``NEEDS_REVALIDATION`` fields are left as-is.
        Any previously stored value is kept (see :meth:`get_value`).

        Args:
            path: Dot-notation field path.
            error: Human-readable description of the failure.
            transient: ``True`` when the failure is a call/API error (the request
                never returned) rather than the field being absent from the
                document. Tracked separately so reporting and recovery can tell a
                network blip from genuinely missing data.

        Raises:
            AssemblyError: If the path is not registered.
        """
        self._require_path(path)
        state = self._states[path]
        if state not in (
            FieldState.EMPTY,
            FieldState.PENDING,
            FieldState.FILLED,
            FieldState.FAILED,
        ):
            return  # Cannot transition from CONFLICT or NEEDS_REVALIDATION to FAILED
        self._states[path] = FieldState.FAILED
        self._errors[path] = error
        # The transient flag always reflects the most recent failure.
        if transient:
            self._call_failed.add(path)
        else:
            self._call_failed.discard(path)

    def mark_needs_revalidation(self, path: str) -> None:
        """Transition a field to ``NEEDS_REVALIDATION`` state.

        The transition is unconditional: it applies from any current state.

        Args:
            path: Dot-notation field path.

        Raises:
            AssemblyError: If the path is not registered.
        """
        self._require_path(path)
        self._states[path] = FieldState.NEEDS_REVALIDATION

    def mark_pending(self, path: str) -> None:
        """Transition a field from ``EMPTY`` to ``PENDING`` state.

        Used to indicate that extraction for this field is in-flight. Fields
        in any state other than ``EMPTY`` are left unchanged.

        Args:
            path: Dot-notation field path.

        Raises:
            AssemblyError: If the path is not registered.
        """
        self._require_path(path)
        if self._states[path] == FieldState.EMPTY:
            self._states[path] = FieldState.PENDING

    def reopen_for_retry(self, path: str) -> bool:
        """Reopen a FAILED / CONFLICT / NEEDS_REVALIDATION field for re-extraction.

        A controlled escape hatch for the retry orchestrator: it moves a field
        that the normal FSM treats as settled (or terminal) back to ``PENDING`` so
        a subsequent :meth:`write` can record a fresh value. Clears the field's
        prior error and any stored conflicting values, since the retry supersedes
        them. Fields in ``EMPTY``/``PENDING``/``FILLED`` are left unchanged.

        Args:
            path: Dot-notation field path.

        Returns:
            ``True`` if the field was reopened, ``False`` if its state was not
            eligible (so the caller knows whether a retry will be applied).

        Raises:
            AssemblyError: If the path is not registered.
        """
        self._require_path(path)
        if self._states[path] in (
            FieldState.FAILED,
            FieldState.CONFLICT,
            FieldState.NEEDS_REVALIDATION,
        ):
            self._states[path] = FieldState.PENDING
            self._errors.pop(path, None)
            self._conflict_values.pop(path, None)
            return True
        return False

    # ------------------------------------------------------------------
    # Read operations
    # ------------------------------------------------------------------

    def get_missing(self) -> list[str]:
        """Return paths of fields still in ``EMPTY`` state after extraction.

        Returns:
            Sorted list of dot-notation paths that were never extracted.

        Example:
            >>> bb = Blackboard(["a", "b"])
            >>> bb.write("a", 1)
            >>> bb.get_missing()
            ['b']
        """
        return sorted(p for p, s in self._states.items() if s == FieldState.EMPTY)

    def get_conflicts(self) -> list[str]:
        """Return paths of fields in ``CONFLICT`` state.

        Returns:
            Sorted list of dot-notation paths with conflicting values.

        Example:
            >>> bb = Blackboard(["x"])
            >>> bb.write("x", 1)
            >>> bb.write("x", 2)
            >>> bb.get_conflicts()
            ['x']
        """
        return sorted(p for p, s in self._states.items() if s == FieldState.CONFLICT)

    def get_needs_revalidation(self) -> list[str]:
        """Return paths of fields flagged for revalidation.

        Returns:
            Sorted list of dot-notation paths in NEEDS_REVALIDATION state.
        """
        return sorted(p for p, s in self._states.items() if s == FieldState.NEEDS_REVALIDATION)

    def get_failed(self) -> list[str]:
        """Return paths of fields in ``FAILED`` state.

        Returns:
            Sorted list of dot-notation paths that failed extraction.
        """
        return sorted(p for p, s in self._states.items() if s == FieldState.FAILED)

    def get_call_failed(self) -> list[str]:
        """Return paths whose ``FAILED`` state is a transient call/API failure.

        These are fields the model never got a chance to answer (the request
        failed), as opposed to fields it answered ``NULL`` (absent from the
        document). Used to report API failures distinctly from missing data.

        Returns:
            Sorted list of dot-notation paths still FAILED due to a call error.
        """
        # Filter on the current state: a flagged path that has since left FAILED
        # (for example via reopen_for_retry) must not be reported.
        return sorted(p for p in self._call_failed if self._states.get(p) == FieldState.FAILED)

    def get_filled(self) -> dict[str, Any]:
        """Return fields that hold a real (non-``None``) extracted value.

        ``None`` is excluded on purpose: the recovery pass marks tree-backtracked
        "confirmed absent" fields ``FILLED`` with ``None`` (:meth:`write_raw`), but
        such a field has no value - it was confirmed missing, not extracted. Counting
        it as filled would overstate the extraction rate, so it is omitted here and
        therefore counted as missing by the quality metrics.

        Returns:
            Dict of ``{path: value}`` for ``FILLED`` fields whose value is not ``None``.

        Example:
            >>> bb = Blackboard(["name", "nickname"])
            >>> bb.write("name", "Alice")
            >>> bb.write_raw("nickname", None)  # confirmed absent
            >>> bb.get_filled()
            {'name': 'Alice'}
        """
        return {
            p: self._values[p]
            for p, s in self._states.items()
            if s == FieldState.FILLED and self._values.get(p) is not None
        }

    def get_value(self, path: str) -> Any:
        """Return the last value written for a field, regardless of its state.

        Unlike :meth:`get_filled`, this returns the stored value even for a ``FAILED``
        or ``NEEDS_REVALIDATION`` field, so a retry can show the model the exact value
        its previous attempt produced. Returns ``None`` if nothing was ever written.
        Unregistered paths also yield ``None``; no error is raised.

        Args:
            path: Dot-notation field path.

        Returns:
            The stored value, or ``None`` if the field has no recorded value.
        """
        return self._values.get(path)

    def get_conflict_values(self, path: str) -> list[Any]:
        """Return all conflicting values seen for a field.

        Args:
            path: Dot-notation field path.

        Returns:
            A new list of all values written to a CONFLICT field (empty if the
            path has no recorded conflict).
        """
        # Copy so callers cannot mutate the internal record.
        return list(self._conflict_values.get(path, []))

    def get_state(self, path: str) -> FieldState:
        """Return the current state of a field.

        Args:
            path: Dot-notation field path.

        Returns:
            Current :class:`FieldState` for the path.

        Raises:
            AssemblyError: If the path is not registered.
        """
        self._require_path(path)
        return self._states[path]

    def get_error(self, path: str) -> str | None:
        """Return the error message for a failed field.

        Args:
            path: Dot-notation field path.

        Returns:
            Error message string, or ``None`` if no error is recorded for the path.
        """
        return self._errors.get(path)

    def all_paths(self) -> list[str]:
        """Return all registered field paths in sorted order.

        Returns:
            Sorted list of all paths registered at construction.
        """
        return sorted(self._states)

    def summary(self) -> dict[str, int]:
        """Return a count of fields in each state.

        Returns:
            Dict mapping state name to field count. Every state appears as a
            key, including those with a count of zero.

        Example:
            >>> bb = Blackboard(["a", "b", "c"])
            >>> bb.write("a", 1)
            >>> bb.mark_failed("b", "parse error")
            >>> bb.summary()
            {'empty': 1, 'pending': 0, 'filled': 1, 'failed': 1, 'conflict': 0, 'needs_revalidation': 0}
        """
        counts: dict[str, int] = {s.value: 0 for s in FieldState}
        for state in self._states.values():
            counts[state.value] += 1
        return counts

    # ------------------------------------------------------------------
    # Private
    # ------------------------------------------------------------------

    def _require_path(self, path: str) -> None:
        """Assert that *path* is registered in this blackboard.

        Args:
            path: Path to check.

        Raises:
            AssemblyError: If the path was not registered at construction.
        """
        if path not in self._states:
            raise AssemblyError(
                f"Unknown field path {path!r} - "
                "path must be registered at Blackboard construction",
                path=path,
            )
|