crprotocol 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crp/__init__.py +126 -0
- crp/__main__.py +8 -0
- crp/_typing.py +27 -0
- crp/_version.py +5 -0
- crp/adapters.py +31 -0
- crp/advanced/__init__.py +40 -0
- crp/advanced/auto_ingest.py +400 -0
- crp/advanced/cqs.py +235 -0
- crp/advanced/cross_window.py +477 -0
- crp/advanced/curator.py +265 -0
- crp/advanced/feedback.py +146 -0
- crp/advanced/hierarchical.py +211 -0
- crp/advanced/meta_learning.py +401 -0
- crp/advanced/parallel.py +98 -0
- crp/advanced/review_cycle.py +329 -0
- crp/advanced/scale_mode.py +129 -0
- crp/advanced/source_grounding.py +207 -0
- crp/ckf/__init__.py +35 -0
- crp/ckf/community.py +377 -0
- crp/ckf/fabric.py +445 -0
- crp/ckf/gc.py +175 -0
- crp/ckf/graph_walk.py +87 -0
- crp/ckf/merge.py +133 -0
- crp/ckf/pattern_query.py +122 -0
- crp/ckf/pubsub.py +128 -0
- crp/ckf/semantic.py +207 -0
- crp/cli/__init__.py +7 -0
- crp/cli/main.py +329 -0
- crp/cli/sidecar.py +929 -0
- crp/cli/startup.py +272 -0
- crp/continuation/__init__.py +103 -0
- crp/continuation/completion.py +348 -0
- crp/continuation/degradation.py +157 -0
- crp/continuation/document_map.py +160 -0
- crp/continuation/flow.py +109 -0
- crp/continuation/gap.py +419 -0
- crp/continuation/manager.py +484 -0
- crp/continuation/quality_monitor.py +179 -0
- crp/continuation/stitch.py +419 -0
- crp/continuation/trigger.py +142 -0
- crp/continuation/voice.py +157 -0
- crp/core/__init__.py +69 -0
- crp/core/batch.py +77 -0
- crp/core/circuit_breaker.py +116 -0
- crp/core/config.py +377 -0
- crp/core/context_tools.py +540 -0
- crp/core/dispatch_router.py +3977 -0
- crp/core/errors.py +128 -0
- crp/core/extraction_facade.py +384 -0
- crp/core/facilitator.py +713 -0
- crp/core/idempotency.py +215 -0
- crp/core/orchestrator.py +1435 -0
- crp/core/relay_strategies.py +613 -0
- crp/core/security_manager.py +140 -0
- crp/core/session.py +134 -0
- crp/core/task_intent.py +36 -0
- crp/core/window.py +363 -0
- crp/envelope/__init__.py +30 -0
- crp/envelope/builder.py +288 -0
- crp/envelope/decomposer.py +236 -0
- crp/envelope/formatter.py +168 -0
- crp/envelope/packer.py +211 -0
- crp/envelope/reranker.py +209 -0
- crp/envelope/scoring.py +310 -0
- crp/extraction/__init__.py +45 -0
- crp/extraction/complexity.py +96 -0
- crp/extraction/contradiction.py +132 -0
- crp/extraction/pipeline.py +360 -0
- crp/extraction/quality_gate.py +237 -0
- crp/extraction/stage1_regex.py +173 -0
- crp/extraction/stage2_statistical.py +244 -0
- crp/extraction/stage3_gliner.py +210 -0
- crp/extraction/stage4_uie.py +183 -0
- crp/extraction/stage5_discourse.py +175 -0
- crp/extraction/stage6_llm.py +178 -0
- crp/extraction/structured_output.py +219 -0
- crp/extraction/types.py +299 -0
- crp/license_guard.py +722 -0
- crp/observability/__init__.py +30 -0
- crp/observability/audit.py +118 -0
- crp/observability/events.py +233 -0
- crp/observability/metrics.py +264 -0
- crp/observability/quality.py +135 -0
- crp/observability/structured_logging.py +81 -0
- crp/observability/telemetry.py +117 -0
- crp/provenance/__init__.py +314 -0
- crp/provenance/_embeddings.py +97 -0
- crp/provenance/_types.py +378 -0
- crp/provenance/attribution_scorer.py +252 -0
- crp/provenance/claim_detector.py +229 -0
- crp/provenance/contradiction_detector.py +243 -0
- crp/provenance/distortion_detector.py +397 -0
- crp/provenance/entailment_verifier.py +358 -0
- crp/provenance/fabrication_detector.py +203 -0
- crp/provenance/hallucination_scorer.py +320 -0
- crp/provenance/omission_analyzer.py +106 -0
- crp/provenance/provenance_chain.py +205 -0
- crp/provenance/report_generator.py +440 -0
- crp/providers/__init__.py +43 -0
- crp/providers/anthropic.py +270 -0
- crp/providers/base.py +135 -0
- crp/providers/custom.py +63 -0
- crp/providers/diagnostic.py +251 -0
- crp/providers/llamacpp.py +224 -0
- crp/providers/manager.py +139 -0
- crp/providers/ollama.py +243 -0
- crp/providers/openai.py +628 -0
- crp/providers/tokenizers.py +48 -0
- crp/py.typed +0 -0
- crp/resources/__init__.py +53 -0
- crp/resources/adaptive_allocator.py +525 -0
- crp/resources/cost_model.py +388 -0
- crp/resources/overhead_manager.py +217 -0
- crp/resources/resource_manager.py +262 -0
- crp/schemas/__init__.py +20 -0
- crp/schemas/cost-estimate.json +33 -0
- crp/schemas/crp-error.json +43 -0
- crp/schemas/envelope-preview.json +40 -0
- crp/schemas/persisted-state-header.json +27 -0
- crp/schemas/quality-report.json +94 -0
- crp/schemas/session-handle.json +33 -0
- crp/schemas/session-status.json +57 -0
- crp/schemas/stream-event.json +18 -0
- crp/schemas/task-intent.json +42 -0
- crp/security/__init__.py +93 -0
- crp/security/audit_trail.py +392 -0
- crp/security/binding.py +192 -0
- crp/security/compliance.py +813 -0
- crp/security/consent.py +593 -0
- crp/security/embedding_defense.py +161 -0
- crp/security/encryption.py +202 -0
- crp/security/injection.py +335 -0
- crp/security/integrity.py +267 -0
- crp/security/privacy.py +662 -0
- crp/security/quarantine.py +249 -0
- crp/security/rbac.py +221 -0
- crp/security/validation.py +164 -0
- crp/state/__init__.py +31 -0
- crp/state/cold_storage.py +258 -0
- crp/state/compaction.py +263 -0
- crp/state/critical_state.py +104 -0
- crp/state/event_log.py +313 -0
- crp/state/fact.py +189 -0
- crp/state/serialization.py +189 -0
- crp/state/session_cleanup.py +77 -0
- crp/state/snapshot.py +290 -0
- crp/state/warm_store.py +346 -0
- crprotocol-2.0.0.dist-info/METADATA +1295 -0
- crprotocol-2.0.0.dist-info/RECORD +153 -0
- crprotocol-2.0.0.dist-info/WHEEL +4 -0
- crprotocol-2.0.0.dist-info/entry_points.txt +2 -0
- crprotocol-2.0.0.dist-info/licenses/LICENSE.md +170 -0
- crprotocol-2.0.0.dist-info/licenses/NOTICE +18 -0
|
@@ -0,0 +1,219 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""Structured output handling — schema/grammar enforcement (§06 §6.9, 2J).
|
|
4
|
+
|
|
5
|
+
Supports: Outlines FSM, GBNF grammar, logit masking, fallback JSON repair.
|
|
6
|
+
All integrations are optional — graceful fallback if libraries unavailable.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import logging
|
|
13
|
+
import re
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# ---------------------------------------------------------------------------
|
|
20
|
+
# JSON repair (always available — no external deps)
|
|
21
|
+
# ---------------------------------------------------------------------------
|
|
22
|
+
|
|
23
|
+
def repair_json(raw: str) -> str | None:
    """Best-effort repair of malformed JSON text.

    Repairs are attempted from least to most invasive, and every candidate
    is validated with ``json.loads`` before being returned:

    1. strip Markdown code fences;
    2. naive single-quote -> double-quote conversion;
    3. removal of trailing commas before ``}`` or ``]``;
    4. completion of truncated output (unterminated string, unclosed
       objects/arrays), closing containers innermost-first.

    Steps 3 and 4 are tried on the quote-converted text first and then on
    the unconverted text, so a document whose only quoting "problem" is an
    apostrophe inside a double-quoted string can still be repaired.

    Unquoted keys are NOT repaired.

    Args:
        raw: Text that is expected to contain a single JSON document.

    Returns:
        A string that parses as JSON, or ``None`` if no repair succeeded.
    """
    # Strip markdown fences
    cleaned = re.sub(r"```(?:json)?\s*", "", raw).strip()
    cleaned = re.sub(r"```\s*$", "", cleaned).strip()

    if _is_valid_json(cleaned):
        return cleaned

    # Naive quote conversion corrupts apostrophes inside strings, so the
    # unconverted text is kept as a second base for the structural repairs.
    quoted = cleaned.replace("'", '"')
    bases = [quoted] if quoted == cleaned else [quoted, cleaned]

    if _is_valid_json(quoted):
        return quoted

    trailing_comma = r",\s*([}\]])"

    for base in bases:
        candidate = re.sub(trailing_comma, r"\1", base)
        if _is_valid_json(candidate):
            return candidate

    for base in bases:
        closed = _close_truncated(base)
        if closed == base:
            # Nothing was open, so truncation is not the problem.
            continue
        # Strip commas after closing: truncation right after a comma leaves
        # e.g. '{"a": 1,' which becomes '{"a": 1,}' once closed.
        candidate = re.sub(trailing_comma, r"\1", closed)
        if _is_valid_json(candidate):
            return candidate

    return None


def _is_valid_json(candidate: str) -> bool:
    """Return True when *candidate* parses as JSON."""
    try:
        json.loads(candidate)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return False
    return True


def _close_truncated(text: str) -> str:
    """Terminate an unfinished string and close open containers innermost-first.

    Braces and brackets inside double-quoted strings are ignored, so they do
    not distort the nesting bookkeeping.
    """
    closers: list[str] = []
    in_string = False
    escaped = False
    for ch in text:
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == "{":
            closers.append("}")
        elif ch == "[":
            closers.append("]")
        elif closers and ch == closers[-1]:
            closers.pop()
    if in_string:
        if escaped:
            # A dangling backslash would escape the quote we are about to add.
            text = text[:-1]
        text += '"'
    return text + "".join(reversed(closers))
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def validate_json_schema(data: Any, schema: dict[str, Any]) -> list[str]:
    """Check *data* against *schema* using a Draft 7 JSON Schema validator.

    ``jsonschema`` is an optional dependency. When it cannot be imported the
    check is skipped and an empty list is returned, so an empty result means
    "no violations found", not necessarily "validated".

    Returns:
        The ``message`` of every validation error, in the order the
        validator reports them.
    """
    messages: list[str] = []
    try:
        import jsonschema  # type: ignore[import-untyped]

        checker = jsonschema.Draft7Validator(schema)
        for problem in checker.iter_errors(data):
            messages.append(problem.message)
    except ImportError:
        # jsonschema not installed — skip validation
        return []
    return messages
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# ---------------------------------------------------------------------------
|
|
84
|
+
# Outlines FSM integration (optional)
|
|
85
|
+
# ---------------------------------------------------------------------------
|
|
86
|
+
|
|
87
|
+
class OutlinesFSMHandler:
    """Optional Outlines integration for constrained (FSM-guided) generation.

    The ``outlines`` package is imported lazily; the result of the first
    import attempt is cached on the instance.
    """

    def __init__(self) -> None:
        # None = not probed yet; True/False = cached result of the import probe.
        self._available: bool | None = None

    @property
    def is_available(self) -> bool:
        """Whether ``outlines`` can be imported (probed once, then cached)."""
        if self._available is None:
            try:
                import outlines  # type: ignore[import-untyped]  # noqa: F401
            except ImportError:
                self._available = False
            else:
                self._available = True
        return self._available

    def build_guide(self, schema: dict[str, Any]) -> Any:
        """Build an Outlines JSON guide from a JSON Schema.

        Returns:
            Whatever ``outlines.generate.json(schema)`` returns, or ``None``
            when Outlines is unavailable or the call fails for any reason.
        """
        if not self.is_available:
            return None
        try:
            from outlines.generate import json as outlines_json  # type: ignore[import-untyped]

            # NOTE(review): some Outlines releases expect a model as the first
            # argument of generate.json — confirm this single-argument call
            # against the pinned Outlines version.
            return outlines_json(schema)
        except Exception:
            # Deliberately best-effort (callers fall back to other strategies),
            # but keep the traceback so the failure can be diagnosed.
            logger.warning("Failed to build Outlines guide", exc_info=True)
            return None
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# ---------------------------------------------------------------------------
|
|
118
|
+
# GBNF grammar support (for llama.cpp providers)
|
|
119
|
+
# ---------------------------------------------------------------------------
|
|
120
|
+
|
|
121
|
+
def json_schema_to_gbnf(schema: dict[str, Any]) -> str | None:
    """Convert a simple JSON Schema to a GBNF grammar string.

    Handles flat object schemas with string/number/integer/boolean/array
    properties. Every property is emitted as a required member, in schema
    order. Complex nested schemas require the full llama.cpp grammar
    converter; unsupported property types fall back to ``string``.

    The grammar is emitted with the whole ``root`` rule on one line (GBNF
    ends a rule at a newline outside parentheses) and with each property key
    as a properly escaped literal that includes its JSON double quotes.

    Args:
        schema: JSON Schema object; only its ``properties`` mapping is read.

    Returns:
        The grammar text (one rule per line), or ``None`` when the schema
        declares no properties.
    """
    props = schema.get("properties", {})
    if not props:
        return None

    members: list[str] = []
    for name, prop_schema in props.items():
        # Boolean sub-schemas (``true``/``false``) carry no type information.
        ptype = prop_schema.get("type", "string") if isinstance(prop_schema, dict) else "string"
        if isinstance(ptype, list):
            # Union such as ["string", "null"]: use the first concrete type.
            ptype = next((t for t in ptype if t != "null"), "string")
        key_literal = _gbnf_literal(json.dumps(str(name), ensure_ascii=False))
        members.append(f'{key_literal} ws ":" {_type_to_gbnf(ptype, name)}')

    rules: list[str] = [
        'root ::= "{" ws ' + ' ws "," ws '.join(members) + ' ws "}"',
        'ws ::= [ \\t\\n]*',
        'string ::= "\\"" [^"\\\\]* "\\""',
        'number ::= "-"? [0-9]+ ("." [0-9]+)?',
        'boolean ::= "true" | "false"',
        # Arrays are limited to flat lists of scalar values.
        'value ::= string | number | boolean',
        'array ::= "[" ws (value (ws "," ws value)*)? ws "]"',
    ]

    return "\n".join(rules)


def _gbnf_literal(text: str) -> str:
    """Quote *text* as a GBNF string literal (escaping ``\\`` and ``"``)."""
    return '"' + text.replace("\\", "\\\\").replace('"', '\\"') + '"'


def _type_to_gbnf(json_type: str, _name: str) -> str:
    """Map a JSON Schema type name to a grammar fragment (``ws`` + rule name).

    Unknown types (including nested ``object``) fall back to ``string``.
    ``_name`` is accepted for signature compatibility and is unused.
    """
    mapping = {
        "string": "ws string",
        "number": "ws number",
        "integer": "ws number",
        "boolean": "ws boolean",
        "array": "ws array",
    }
    return mapping.get(json_type, "ws string")
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
# ---------------------------------------------------------------------------
|
|
161
|
+
# Composite handler
|
|
162
|
+
# ---------------------------------------------------------------------------
|
|
163
|
+
|
|
164
|
+
class StructuredOutputHandler:
    """Facade over the structured-output strategies of this module.

    Documented priority order:
    1. Outlines FSM (if available and provider supports it)
    2. GBNF grammar (if provider is llama.cpp compatible)
    3. Logit masking (if provider supports token-level constraints)
    4. Fallback: post-hoc JSON repair + validation

    This class itself only builds Outlines guides and GBNF grammars and
    implements the post-hoc fallback (``enforce``); choosing between the
    strategies is left to the caller.
    """

    def __init__(self) -> None:
        self._outlines = OutlinesFSMHandler()

    @property
    def outlines_available(self) -> bool:
        """True when the optional Outlines integration can be used."""
        return self._outlines.is_available

    def enforce(
        self,
        raw_output: str,
        schema: dict[str, Any] | None = None,
    ) -> tuple[Any | None, list[str]]:
        """Parse *raw_output* as JSON and, if given, validate it against *schema*.

        Returns ``(parsed_data, errors)``:

        * unparseable and unrepairable -> ``(None, ["json_parse_failed"])``;
        * no schema, parsed directly -> ``(data, [])``;
        * no schema, parsed after repair -> ``(data, ["json_repaired"])``;
        * schema given -> ``(data, <schema validation messages>)``; a repair
          is not reported in this case.
        """
        needed_repair = False
        try:
            parsed = json.loads(raw_output)
        except (json.JSONDecodeError, ValueError):
            fixed = repair_json(raw_output)
            if fixed is None:
                return None, ["json_parse_failed"]
            parsed = json.loads(fixed)
            needed_repair = True

        if schema is not None:
            return parsed, validate_json_schema(parsed, schema)

        # Without a schema the only thing to report is whether repair was needed.
        return parsed, (["json_repaired"] if needed_repair else [])

    def build_gbnf(self, schema: dict[str, Any]) -> str | None:
        """Return a GBNF grammar for *schema* (for llama.cpp providers)."""
        return json_schema_to_gbnf(schema)

    def build_outlines_guide(self, schema: dict[str, Any]) -> Any:
        """Return an Outlines FSM guide for *schema*, or None if unavailable."""
        return self._outlines.build_guide(schema)
|
crp/extraction/types.py
ADDED
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""Extraction pipeline data types — Fact, FactEdge, FactGraph, ExtractionResult."""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import time
|
|
8
|
+
import uuid
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from enum import Enum
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ContentType(str, Enum):
    """Content complexity classification for pipeline routing.

    Members also subclass ``str``, so each one compares equal to its own
    name as a plain string.
    """

    ENTITY_RICH = "ENTITY_RICH"
    REASONING_DENSE = "REASONING_DENSE"
    NARRATIVE = "NARRATIVE"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class RelationType(str, Enum):
    """Semantic relation types for FactEdge records.

    Members also subclass ``str``, so each one compares equal to its own
    name as a plain string. ``RELATED`` is the default relation of a
    ``FactEdge``.
    """

    CONDITION_FOR = "CONDITION_FOR"
    CAUSE_EFFECT = "CAUSE_EFFECT"
    CONTRAST = "CONTRAST"
    CONCESSION = "CONCESSION"
    CONSEQUENCE = "CONSEQUENCE"
    ELABORATION = "ELABORATION"
    SEQUENCE = "SEQUENCE"
    RELATED = "RELATED"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class Fact:
    """Single extracted fact from the extraction pipeline.

    Lightweight record — embedding is computed lazily in Phase 4 (state layer).

    Metadata limits are enforced only when ``validate_metadata`` or
    ``set_metadata`` is called; writing to ``metadata`` directly bypasses them.
    """

    # Metadata size limits (§audit M4).
    # Deliberately NOT annotated: inside a @dataclass an annotated class
    # attribute becomes an instance field, which would turn these limits into
    # the first three positional __init__ parameters and leak them into
    # repr/eq/asdict. Unannotated, they stay plain class-level constants.
    MAX_METADATA_KEYS = 64
    MAX_KEY_LENGTH = 128
    MAX_VALUE_SIZE = 4096

    id: str = field(default_factory=lambda: str(uuid.uuid4()))
    text: str = ""
    category: str = ""
    source_window_id: str = ""
    confidence: float = 0.0
    extraction_stage: int = 0
    created_at: float = field(default_factory=time.time)
    metadata: dict[str, Any] = field(default_factory=dict)

    # Quality gate flags (set by post-extraction validation)
    flagged_confidence: bool = False
    confidence_flag_reason: str = ""

    # Supersession (set by contradiction detection)
    superseded_by: str | None = None
    supersession_confidence: float = 0.0

    def validate_metadata(self) -> None:
        """Enforce metadata size limits (§audit M4).

        Key and value sizes are measured on their ``str()`` representation.

        Raises:
            ValueError: If there are more than ``MAX_METADATA_KEYS`` entries,
                a key is longer than ``MAX_KEY_LENGTH`` characters, or a
                value is longer than ``MAX_VALUE_SIZE`` characters.
        """
        if len(self.metadata) > self.MAX_METADATA_KEYS:
            raise ValueError(
                f"Fact metadata exceeds {self.MAX_METADATA_KEYS} keys "
                f"(got {len(self.metadata)})"
            )
        for key, value in self.metadata.items():
            if len(str(key)) > self.MAX_KEY_LENGTH:
                raise ValueError(
                    f"Metadata key exceeds {self.MAX_KEY_LENGTH} chars: {str(key)[:50]}..."
                )
            val_str = str(value)
            if len(val_str) > self.MAX_VALUE_SIZE:
                raise ValueError(
                    f"Metadata value for '{key}' exceeds {self.MAX_VALUE_SIZE} chars "
                    f"(got {len(val_str)})"
                )

    def set_metadata(self, key: str, value: Any) -> None:
        """Set a metadata key with size validation.

        Overwriting an existing key is always allowed with respect to the
        key-count limit; only adding a new key can exceed it.

        Raises:
            ValueError: If the key or value is too long, or adding a new key
                would exceed ``MAX_METADATA_KEYS``. The mapping is left
                unchanged in that case.
        """
        if len(str(key)) > self.MAX_KEY_LENGTH:
            raise ValueError(f"Metadata key exceeds {self.MAX_KEY_LENGTH} chars")
        if len(str(value)) > self.MAX_VALUE_SIZE:
            raise ValueError(f"Metadata value exceeds {self.MAX_VALUE_SIZE} chars")
        if key not in self.metadata and len(self.metadata) >= self.MAX_METADATA_KEYS:
            raise ValueError(f"Metadata exceeds {self.MAX_METADATA_KEYS} keys limit")
        self.metadata[key] = value
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@dataclass
class FactEdge:
    """Directed relation between two facts or text spans.

    The edge runs from ``source_id`` to ``target_id``.
    """

    # Unique edge identifier; defaults to a fresh random UUID4 string.
    id: str = field(default_factory=lambda: str(uuid.uuid4()))
    # Identifier of the edge's origin.
    source_id: str = ""
    # Identifier of the edge's destination.
    target_id: str = ""
    # Either a RelationType member or a free-form string label.
    relation_type: RelationType | str = RelationType.RELATED
    # Confidence score for the relation; defaults to 0.0.
    # NOTE(review): presumably in [0, 1] — the range is not enforced here.
    confidence: float = 0.0
    # Presumably the number of the extraction stage that produced the edge
    # — TODO confirm against the pipeline.
    source_stage: int = 0
    # Free-form extra data; a new dict per instance.
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@dataclass
class FactGraph:
    """In-memory directed graph: facts are nodes, ``FactEdge`` records are edges.

    ``edges`` is the authoritative edge list. ``_edges_from`` and
    ``_edges_to`` are per-node lookup tables maintained by ``add_edge`` and
    ``remove_fact``; edges passed straight to the constructor are not indexed.
    """

    nodes: dict[str, Fact] = field(default_factory=dict)
    edges: list[FactEdge] = field(default_factory=list)
    # Edge indices keyed by source id / target id (§audit L4)
    _edges_from: dict[str, list[FactEdge]] = field(default_factory=dict)
    _edges_to: dict[str, list[FactEdge]] = field(default_factory=dict)

    def add_fact(self, fact: Fact) -> None:
        """Insert *fact*, replacing any node stored under the same id."""
        self.nodes[fact.id] = fact

    def remove_fact(self, fact_id: str) -> None:
        """Drop a fact together with every edge touching it (§audit2 STATE-H5)."""

        def survives(edge: FactEdge) -> bool:
            return not (edge.source_id == fact_id or edge.target_id == fact_id)

        self.nodes.pop(fact_id, None)
        self.edges = list(filter(survives, self.edges))

        for index in (self._edges_from, self._edges_to):
            # The removed fact's own bucket goes first ...
            index.pop(fact_id, None)
            # ... then every other bucket is purged of edges that touched it.
            for node_id in tuple(index):
                remaining = list(filter(survives, index[node_id]))
                if remaining:
                    index[node_id] = remaining
                else:
                    del index[node_id]

    def add_edge(self, edge: FactEdge) -> None:
        """Register *edge*; silently ignored unless both endpoints are known nodes (§audit G7)."""
        known = self.nodes
        if edge.source_id in known and edge.target_id in known:
            self.edges.append(edge)
            # Keep both lookup tables in step with the edge list (§audit L4)
            self._edges_from.setdefault(edge.source_id, []).append(edge)
            self._edges_to.setdefault(edge.target_id, []).append(edge)

    def edges_from(self, fact_id: str) -> list[FactEdge]:
        """Return a copy of the edges whose source is *fact_id*."""
        outgoing = self._edges_from.get(fact_id, [])
        return list(outgoing)

    def edges_to(self, fact_id: str) -> list[FactEdge]:
        """Return a copy of the edges whose target is *fact_id*."""
        incoming = self._edges_to.get(fact_id, [])
        return list(incoming)

    def subgraph_for(self, fact_ids: set[str], max_hops: int = 1) -> FactGraph:
        """Return the subgraph of *fact_ids* plus neighbours within *max_hops*.

        Edges are followed in both directions. Only edges with both endpoints
        inside the reached set are kept, and the returned graph's lookup
        tables are populated for them.
        """
        reached: set[str] = set(fact_ids)
        wave = set(fact_ids)
        hops_left = max_hops
        while hops_left > 0:
            hops_left -= 1
            upcoming: set[str] = set()
            for node_id in wave:
                upcoming.update(
                    e.target_id
                    for e in self._edges_from.get(node_id, [])
                    if e.target_id not in reached
                )
                upcoming.update(
                    e.source_id
                    for e in self._edges_to.get(node_id, [])
                    if e.source_id not in reached
                )
            reached |= upcoming
            wave = upcoming

        kept_nodes = {nid: self.nodes[nid] for nid in reached if nid in self.nodes}
        kept_edges = [
            e for e in self.edges if e.source_id in reached and e.target_id in reached
        ]
        result = FactGraph(nodes=kept_nodes, edges=kept_edges)
        # The constructor does not index edges, so fill the tables here
        # (§audit2 STATE-H4)
        for e in kept_edges:
            result._edges_from.setdefault(e.source_id, []).append(e)
            result._edges_to.setdefault(e.target_id, []).append(e)
        return result

    def serialize_for_envelope(self) -> str:
        """Render the graph as plain text for envelope packing.

        Each fact becomes a ``- text`` line, followed by one ``↳`` line per
        outgoing edge whose target is still a node of this graph.
        """
        rendered: list[str] = []
        for node_id, fact in self.nodes.items():
            rendered.append(f"- {fact.text}")
            for edge in self._edges_from.get(node_id, []):
                target = self.nodes.get(edge.target_id)
                if not target:
                    continue
                label = edge.relation_type
                if isinstance(label, RelationType):
                    label = label.value
                rendered.append(f" ↳ [{label}] {target.text}")
        return "\n".join(rendered)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
class ValidationSeverity(str, Enum):
    """Severity level attached to a ``ValidationIssue``.

    Members also subclass ``str`` and compare equal to their lowercase values.
    """

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
@dataclass
class ValidationIssue:
    """Single issue found by the quality gate."""

    # Free-form issue category label; empty by default.
    type: str = ""
    # How serious the issue is; defaults to LOW.
    severity: ValidationSeverity = ValidationSeverity.LOW
    # Human-readable description of the issue.
    detail: str = ""
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
@dataclass
class ValidationResult:
    """Result from one quality-gate tier."""

    # Number of the quality-gate tier that produced this result.
    tier: int = 0
    # Defaults to True, i.e. a tier passes unless explicitly marked otherwise.
    passed: bool = True
    # Issues recorded by the tier; a new list per instance.
    issues: list[ValidationIssue] = field(default_factory=list)
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
@dataclass
class Contradiction:
    """A detected contradiction between two facts."""

    # The two facts involved; both default to None until populated.
    fact_a: Fact | None = None
    fact_b: Fact | None = None
    # Similarity score between the two facts.
    # NOTE(review): metric and range are not visible here — confirm with the detector.
    similarity: float = 0.0
    # Score for how much the two facts' content differs — TODO confirm definition.
    content_diff: float = 0.0
    # Confidence that the pair really is a contradiction.
    confidence: float = 0.0
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
@dataclass
class FactEvent:
    """Immutable audit-log entry for fact lifecycle events.

    NOTE(review): the dataclass is not declared ``frozen``, so immutability
    is a convention here rather than something the class enforces.
    """

    # Numeric event identifier; defaults to 0.
    event_id: int = 0
    # Creation time from ``time.time()`` (seconds since the epoch).
    timestamp: float = field(default_factory=time.time)
    # Identifier of the window associated with the event.
    window_id: str = ""
    event_type: str = ""  # "created" | "superseded" | "compacted" | "archived" | "restored"
    # Identifier of the fact the event refers to.
    fact_id: str = ""
    # Free-form event data; a new dict per instance.
    payload: dict[str, Any] = field(default_factory=dict)
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
@dataclass
class ExtractionResult:
    """Complete extraction result from the graduated pipeline.

    Producers fill ``facts`` and ``edges`` and then call ``finalize`` to
    derive the aggregate metrics and rebuild ``fact_graph``.
    """

    extraction_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    source_window_id: str = ""
    timestamp: float = field(default_factory=time.time)

    # Extracted data
    facts: list[Fact] = field(default_factory=list)
    edges: list[FactEdge] = field(default_factory=list)
    fact_graph: FactGraph = field(default_factory=FactGraph)

    # Pipeline execution
    stages_run: list[int] = field(default_factory=list)
    stages_skipped: list[int] = field(default_factory=list)
    total_extraction_latency_ms: float = 0.0
    per_stage_latency: dict[int, float] = field(default_factory=dict)

    # Quality metrics (entity_density is not computed by finalize())
    total_facts: int = 0
    total_edges: int = 0
    average_confidence: float = 0.0
    entity_density: float = 0.0
    relation_density: float = 0.0

    # Content classification
    content_type: ContentType = ContentType.NARRATIVE
    discourse_markers_found: int = 0

    # Pipeline state (for self-calibration)
    stage_yields: dict[int, int] = field(default_factory=dict)
    escalation_triggers: list[str] = field(default_factory=list)

    # Quality gate
    quality_gate_passed: bool = True
    quality_issues: list[str] = field(default_factory=list)

    # Normalization
    facts_after_normalization: int = 0

    @property
    def success(self) -> bool:
        """True when the quality gate passed."""
        return self.quality_gate_passed

    def finalize(self) -> None:
        """Compute aggregate metrics from the facts/edges lists and rebuild the graph.

        Sets ``total_facts``, ``total_edges``, ``average_confidence``,
        ``relation_density`` and ``facts_after_normalization``, then replaces
        ``fact_graph`` with a graph built from ``facts`` and ``edges``.
        Safe to call more than once: every derived value is recomputed.
        """
        self.total_facts = len(self.facts)
        self.total_edges = len(self.edges)
        if self.facts:
            self.average_confidence = (
                sum(f.confidence for f in self.facts) / self.total_facts
            )
        else:
            # Reset rather than keep a stale average from an earlier
            # finalize() call made while facts were still present.
            self.average_confidence = 0.0
        # max(..., 1) avoids division by zero for an empty result.
        self.relation_density = self.total_edges / max(self.total_facts, 1)
        self.facts_after_normalization = self.total_facts
        # Build graph (facts first: add_edge skips edges with unknown endpoints)
        self.fact_graph = FactGraph()
        for f in self.facts:
            self.fact_graph.add_fact(f)
        for e in self.edges:
            self.fact_graph.add_edge(e)
|