crprotocol 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crp/__init__.py +126 -0
- crp/__main__.py +8 -0
- crp/_typing.py +27 -0
- crp/_version.py +5 -0
- crp/adapters.py +31 -0
- crp/advanced/__init__.py +40 -0
- crp/advanced/auto_ingest.py +400 -0
- crp/advanced/cqs.py +235 -0
- crp/advanced/cross_window.py +477 -0
- crp/advanced/curator.py +265 -0
- crp/advanced/feedback.py +146 -0
- crp/advanced/hierarchical.py +211 -0
- crp/advanced/meta_learning.py +401 -0
- crp/advanced/parallel.py +98 -0
- crp/advanced/review_cycle.py +329 -0
- crp/advanced/scale_mode.py +129 -0
- crp/advanced/source_grounding.py +207 -0
- crp/ckf/__init__.py +35 -0
- crp/ckf/community.py +377 -0
- crp/ckf/fabric.py +445 -0
- crp/ckf/gc.py +175 -0
- crp/ckf/graph_walk.py +87 -0
- crp/ckf/merge.py +133 -0
- crp/ckf/pattern_query.py +122 -0
- crp/ckf/pubsub.py +128 -0
- crp/ckf/semantic.py +207 -0
- crp/cli/__init__.py +7 -0
- crp/cli/main.py +329 -0
- crp/cli/sidecar.py +929 -0
- crp/cli/startup.py +272 -0
- crp/continuation/__init__.py +103 -0
- crp/continuation/completion.py +348 -0
- crp/continuation/degradation.py +157 -0
- crp/continuation/document_map.py +160 -0
- crp/continuation/flow.py +109 -0
- crp/continuation/gap.py +419 -0
- crp/continuation/manager.py +484 -0
- crp/continuation/quality_monitor.py +179 -0
- crp/continuation/stitch.py +419 -0
- crp/continuation/trigger.py +142 -0
- crp/continuation/voice.py +157 -0
- crp/core/__init__.py +69 -0
- crp/core/batch.py +77 -0
- crp/core/circuit_breaker.py +116 -0
- crp/core/config.py +377 -0
- crp/core/context_tools.py +540 -0
- crp/core/dispatch_router.py +3977 -0
- crp/core/errors.py +128 -0
- crp/core/extraction_facade.py +384 -0
- crp/core/facilitator.py +713 -0
- crp/core/idempotency.py +215 -0
- crp/core/orchestrator.py +1435 -0
- crp/core/relay_strategies.py +613 -0
- crp/core/security_manager.py +140 -0
- crp/core/session.py +134 -0
- crp/core/task_intent.py +36 -0
- crp/core/window.py +363 -0
- crp/envelope/__init__.py +30 -0
- crp/envelope/builder.py +288 -0
- crp/envelope/decomposer.py +236 -0
- crp/envelope/formatter.py +168 -0
- crp/envelope/packer.py +211 -0
- crp/envelope/reranker.py +209 -0
- crp/envelope/scoring.py +310 -0
- crp/extraction/__init__.py +45 -0
- crp/extraction/complexity.py +96 -0
- crp/extraction/contradiction.py +132 -0
- crp/extraction/pipeline.py +360 -0
- crp/extraction/quality_gate.py +237 -0
- crp/extraction/stage1_regex.py +173 -0
- crp/extraction/stage2_statistical.py +244 -0
- crp/extraction/stage3_gliner.py +210 -0
- crp/extraction/stage4_uie.py +183 -0
- crp/extraction/stage5_discourse.py +175 -0
- crp/extraction/stage6_llm.py +178 -0
- crp/extraction/structured_output.py +219 -0
- crp/extraction/types.py +299 -0
- crp/license_guard.py +722 -0
- crp/observability/__init__.py +30 -0
- crp/observability/audit.py +118 -0
- crp/observability/events.py +233 -0
- crp/observability/metrics.py +264 -0
- crp/observability/quality.py +135 -0
- crp/observability/structured_logging.py +81 -0
- crp/observability/telemetry.py +117 -0
- crp/provenance/__init__.py +314 -0
- crp/provenance/_embeddings.py +97 -0
- crp/provenance/_types.py +378 -0
- crp/provenance/attribution_scorer.py +252 -0
- crp/provenance/claim_detector.py +229 -0
- crp/provenance/contradiction_detector.py +243 -0
- crp/provenance/distortion_detector.py +397 -0
- crp/provenance/entailment_verifier.py +358 -0
- crp/provenance/fabrication_detector.py +203 -0
- crp/provenance/hallucination_scorer.py +320 -0
- crp/provenance/omission_analyzer.py +106 -0
- crp/provenance/provenance_chain.py +205 -0
- crp/provenance/report_generator.py +440 -0
- crp/providers/__init__.py +43 -0
- crp/providers/anthropic.py +270 -0
- crp/providers/base.py +135 -0
- crp/providers/custom.py +63 -0
- crp/providers/diagnostic.py +251 -0
- crp/providers/llamacpp.py +224 -0
- crp/providers/manager.py +139 -0
- crp/providers/ollama.py +243 -0
- crp/providers/openai.py +628 -0
- crp/providers/tokenizers.py +48 -0
- crp/py.typed +0 -0
- crp/resources/__init__.py +53 -0
- crp/resources/adaptive_allocator.py +525 -0
- crp/resources/cost_model.py +388 -0
- crp/resources/overhead_manager.py +217 -0
- crp/resources/resource_manager.py +262 -0
- crp/schemas/__init__.py +20 -0
- crp/schemas/cost-estimate.json +33 -0
- crp/schemas/crp-error.json +43 -0
- crp/schemas/envelope-preview.json +40 -0
- crp/schemas/persisted-state-header.json +27 -0
- crp/schemas/quality-report.json +94 -0
- crp/schemas/session-handle.json +33 -0
- crp/schemas/session-status.json +57 -0
- crp/schemas/stream-event.json +18 -0
- crp/schemas/task-intent.json +42 -0
- crp/security/__init__.py +93 -0
- crp/security/audit_trail.py +392 -0
- crp/security/binding.py +192 -0
- crp/security/compliance.py +813 -0
- crp/security/consent.py +593 -0
- crp/security/embedding_defense.py +161 -0
- crp/security/encryption.py +202 -0
- crp/security/injection.py +335 -0
- crp/security/integrity.py +267 -0
- crp/security/privacy.py +662 -0
- crp/security/quarantine.py +249 -0
- crp/security/rbac.py +221 -0
- crp/security/validation.py +164 -0
- crp/state/__init__.py +31 -0
- crp/state/cold_storage.py +258 -0
- crp/state/compaction.py +263 -0
- crp/state/critical_state.py +104 -0
- crp/state/event_log.py +313 -0
- crp/state/fact.py +189 -0
- crp/state/serialization.py +189 -0
- crp/state/session_cleanup.py +77 -0
- crp/state/snapshot.py +290 -0
- crp/state/warm_store.py +346 -0
- crprotocol-2.0.0.dist-info/METADATA +1295 -0
- crprotocol-2.0.0.dist-info/RECORD +153 -0
- crprotocol-2.0.0.dist-info/WHEEL +4 -0
- crprotocol-2.0.0.dist-info/entry_points.txt +2 -0
- crprotocol-2.0.0.dist-info/licenses/LICENSE.md +170 -0
- crprotocol-2.0.0.dist-info/licenses/NOTICE +18 -0
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""Stage 4 — UIE relational extraction (SHOULD, ~100ms, lazy model load).
|
|
4
|
+
|
|
5
|
+
Extracts (subject, predicate, object) triples and converts them to FactEdge
|
|
6
|
+
records. Trigger: Stage 3 relation yield < 0.1 per sentence.
|
|
7
|
+
Model: UIE / universal IE (~400MB), loaded lazily.
|
|
8
|
+
Graceful fallback: returns empty if unavailable.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import logging
|
|
14
|
+
from typing import Any, Protocol, runtime_checkable
|
|
15
|
+
|
|
16
|
+
from crp.extraction.types import Fact, FactEdge, RelationType
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# ---------------------------------------------------------------------------
|
|
22
|
+
# UIE model protocol
|
|
23
|
+
# ---------------------------------------------------------------------------
|
|
24
|
+
|
|
25
|
+
@runtime_checkable
class UIEModel(Protocol):
    """Structural type for a Universal Information Extraction backend.

    Any object exposing an ``extract_triples`` method satisfies this
    protocol; no inheritance is required.
    """

    def extract_triples(self, text: str) -> list[dict[str, Any]]:
        """Extract relational triples from *text*.

        Each returned dict carries the keys ``subject``, ``predicate``,
        ``object`` and ``confidence``.
        """
        ...
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
# Triple → FactEdge mapping
|
|
38
|
+
# ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
# Lower-case predicate phrases mapped to the RelationType they signal.
# ``_map_predicate`` walks this table in insertion order and returns the
# relation of the first key contained in the predicate text, so the order of
# the entries is the match priority.
_PREDICATE_TO_RELATION: dict[str, RelationType] = {
    # Causal
    "causes": RelationType.CAUSE_EFFECT,
    "caused by": RelationType.CAUSE_EFFECT,
    # Consequence
    "leads to": RelationType.CONSEQUENCE,
    "results in": RelationType.CONSEQUENCE,
    # Preconditions / dependencies
    "depends on": RelationType.CONDITION_FOR,
    "requires": RelationType.CONDITION_FOR,
    # Contrast and concession
    "contrasts with": RelationType.CONTRAST,
    "despite": RelationType.CONCESSION,
    # Elaboration
    "elaborates": RelationType.ELABORATION,
    "extends": RelationType.ELABORATION,
    # Ordering (both directions share one relation type)
    "follows": RelationType.SEQUENCE,
    "precedes": RelationType.SEQUENCE,
}
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _map_predicate(predicate: str) -> RelationType:
    """Translate a free-text predicate into a RelationType (best effort).

    The predicate is trimmed and lower-cased, then compared against the keys
    of ``_PREDICATE_TO_RELATION`` in table order; the first key that occurs
    anywhere inside the predicate decides the result.  Predicates matching no
    key fall back to ``RelationType.RELATED``.
    """
    normalized = predicate.strip().lower()
    return next(
        (
            relation
            for phrase, relation in _PREDICATE_TO_RELATION.items()
            if phrase in normalized
        ),
        RelationType.RELATED,
    )
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# ---------------------------------------------------------------------------
|
|
66
|
+
# Stage 4 Extractor
|
|
67
|
+
# ---------------------------------------------------------------------------
|
|
68
|
+
|
|
69
|
+
class UIEExtractor:
    """Stage 4 — UIE triple extraction (lazy, optional).

    Loads the model on first use. Returns ``(facts, edges)`` where *facts*
    are the subject/object entities and *edges* are the relations.
    If the UIE library is unavailable, all calls return ``([], [])``.
    """

    # Confidence values are clamped into this band. The lower bound is also
    # the default used when a triple carries no usable confidence value.
    _MIN_CONFIDENCE = 0.70
    _MAX_CONFIDENCE = 0.85

    def __init__(self) -> None:
        self._model: UIEModel | None = None
        # Tri-state: None = not probed yet, True = loaded once, False = load failed.
        self._available: bool | None = None

    # -- Lifecycle ----------------------------------------------------------

    def _ensure_model(self) -> UIEModel | None:
        """Return the loaded model, importing and instantiating it on first use.

        A failed load is remembered (``_available = False``) so it is not
        retried on every call.  Returns ``None`` when the model cannot be used.
        """
        if self._available is False:
            return None
        if self._model is not None:
            return self._model
        try:
            # Try to import a UIE implementation.
            # The spec is model-agnostic; accept any class exposing extract_triples().
            from uie import UIE  # type: ignore[import-untyped]

            self._model = UIE()  # type: ignore[assignment]
        except ImportError:
            # Optional dependency simply not installed: expected, no traceback.
            self._available = False
            logger.warning("UIE not available — Stage 4 will be skipped")
            return None
        except Exception:
            # Installed but failed to initialise: keep the cause in the log.
            self._available = False
            logger.warning("UIE not available — Stage 4 will be skipped", exc_info=True)
            return None
        self._available = True
        logger.info("UIE model loaded successfully")
        return self._model

    def unload(self) -> None:
        """Drop the model reference; a later call loads it again."""
        self._model = None

    @property
    def is_available(self) -> bool:
        """Whether the model could be loaded (triggers a load attempt if unknown)."""
        if self._available is None:
            self._ensure_model()
        return self._available is True

    # -- Helpers ------------------------------------------------------------

    @staticmethod
    def _clean_text(value: Any) -> str:
        """Return *value* as stripped text; a missing (``None``) value is empty.

        Without the ``None`` check ``str(None)`` would yield the literal
        entity text ``"None"``.
        """
        return "" if value is None else str(value).strip()

    @classmethod
    def _clamp_confidence(cls, raw: Any) -> float:
        """Coerce *raw* to a float inside the allowed confidence band.

        Values that cannot be converted (``None``, non-numeric strings, ...)
        fall back to the lower bound instead of raising.
        """
        try:
            score = float(raw)
        except (TypeError, ValueError):
            score = cls._MIN_CONFIDENCE
        return min(cls._MAX_CONFIDENCE, max(cls._MIN_CONFIDENCE, score))

    # -- Extraction ---------------------------------------------------------

    def extract(
        self,
        text: str,
        source_window_id: str = "",
    ) -> tuple[list[Fact], list[FactEdge]]:
        """Extract relational triples from *text*.

        Returns ``(facts, edges)`` — each triple yields up to two Fact items
        (subject, object; entities with identical text are shared) and one
        FactEdge.  Triples with an empty or missing subject/object are skipped.
        Returns ``([], [])`` on failure or if model unavailable.
        """
        model = self._ensure_model()
        if model is None:
            return [], []

        try:
            triples = model.extract_triples(text)
        except Exception:
            logger.exception("UIE extraction failed")
            return [], []

        facts: list[Fact] = []
        edges: list[FactEdge] = []
        seen_texts: dict[str, str] = {}  # text → fact_id (dedup entities)

        def _entity_id(entity_text: str, role: str, confidence: float) -> str:
            """Return the fact id for *entity_text*, creating the Fact once."""
            if entity_text not in seen_texts:
                entity = Fact(
                    text=entity_text,
                    category="uie_entity",
                    source_window_id=source_window_id,
                    confidence=confidence,
                    extraction_stage=4,
                    metadata={"role": role},
                )
                facts.append(entity)
                seen_texts[entity_text] = entity.id
            return seen_texts[entity_text]

        # ``or ()`` tolerates a backend that returns None instead of a list.
        for triple in triples or ():
            subj_text = self._clean_text(triple.get("subject"))
            obj_text = self._clean_text(triple.get("object"))
            if not subj_text or not obj_text:
                continue

            predicate = self._clean_text(triple.get("predicate"))
            conf = self._clamp_confidence(triple.get("confidence", self._MIN_CONFIDENCE))

            # Subject first, then object, so fact order follows triple order.
            subj_id = _entity_id(subj_text, "subject", conf)
            obj_id = _entity_id(obj_text, "object", conf)

            edges.append(FactEdge(
                source_id=subj_id,
                target_id=obj_id,
                relation_type=_map_predicate(predicate),
                confidence=conf,
                source_stage=4,
                metadata={"predicate": predicate},
            ))

        return facts, edges
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""Stage 5 — Discourse structure extraction (SHOULD, ~150ms, CPU-only).
|
|
4
|
+
|
|
5
|
+
Detects discourse markers and maps them to semantic relation types (RST-inspired).
|
|
6
|
+
Trigger: content_type in {REASONING_DENSE, NARRATIVE}.
|
|
7
|
+
No ML model — pure pattern matching over sentences.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
|
|
14
|
+
from crp.extraction.types import Fact, FactEdge, RelationType
|
|
15
|
+
|
|
16
|
+
# ---------------------------------------------------------------------------
|
|
17
|
+
# Discourse marker → relation-type mapping
|
|
18
|
+
# ---------------------------------------------------------------------------
|
|
19
|
+
|
|
20
|
+
# Each entry pairs a RelationType with the discourse markers that signal it.
# The list order is the order in which the groups are compiled and later
# tried against every sentence; the order of markers inside a group does not
# matter because they are re-sorted (longest first) when compiled.
_MARKER_GROUPS: list[tuple[RelationType, list[str]]] = [
    (RelationType.CONDITION_FOR, [
        "if", "unless", "provided", "provided that", "assuming",
        "given that", "in case", "on condition that",
    ]),
    (RelationType.CAUSE_EFFECT, [
        "because", "since", "due to", "owing to", "causes",
        "caused by", "as a result of",
    ]),
    (RelationType.CONTRAST, [
        "however", "but", "yet", "on the other hand",
        "in contrast", "conversely", "nevertheless",
    ]),
    (RelationType.CONCESSION, [
        "although", "despite", "even though", "in spite of",
        "notwithstanding", "regardless",
    ]),
    (RelationType.CONSEQUENCE, [
        "therefore", "thus", "hence", "so", "consequently",
        "as a result", "accordingly",
    ]),
    (RelationType.ELABORATION, [
        "as", "given", "for example", "for instance",
        "in particular", "specifically", "namely",
        "that is", "i.e.", "e.g.",
    ]),
    (RelationType.SEQUENCE, [
        "and then", "subsequently", "next", "afterwards",
        "following", "before", "after", "finally",
        "first", "second", "third", "lastly",
    ]),
]
|
|
52
|
+
|
|
53
|
+
# One compiled, case-insensitive regex per marker group.
#
# * Alternatives are ordered longest first so "provided that" wins over
#   "provided".
# * Boundaries use (?<!\w) / (?!\w) rather than \b.  For markers that start
#   and end with a letter the two are equivalent, but \b can never match after
#   the trailing "." of "i.e." / "e.g." when a space or comma follows, which
#   made those markers undetectable.
# * Built with a comprehension so no loop variables leak into the module
#   namespace.
_COMPILED_MARKERS: list[tuple[RelationType, re.Pattern[str]]] = [
    (
        _rel,
        re.compile(
            r"(?<!\w)(?:"
            + "|".join(re.escape(_m) for _m in sorted(_markers, key=len, reverse=True))
            + r")(?!\w)",
            re.IGNORECASE,
        ),
    )
    for _rel, _markers in _MARKER_GROUPS
]
|
|
63
|
+
|
|
64
|
+
# Flat set of every marker string across all groups, used for quick counting.
# A comprehension keeps the iteration variables out of the module namespace
# (the previous loop left ``_`` and the public-looking ``markers`` behind).
_ALL_MARKERS_FLAT: set[str] = {
    _marker for _relation, _group in _MARKER_GROUPS for _marker in _group
}
|
|
68
|
+
|
|
69
|
+
# ---------------------------------------------------------------------------
|
|
70
|
+
# Sentence splitting (reuse from stage2 would be nice, but keep self-contained)
|
|
71
|
+
# ---------------------------------------------------------------------------
|
|
72
|
+
|
|
73
|
+
_SENT_RE = re.compile(r"(?<=[.!?])\s+(?=[A-Z])")
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _split_sentences(text: str) -> list[str]:
|
|
77
|
+
raw = _SENT_RE.split(text.strip())
|
|
78
|
+
return [s.strip() for s in raw if len(s.strip()) > 5]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# ---------------------------------------------------------------------------
|
|
82
|
+
# Public helpers (used by complexity detector)
|
|
83
|
+
# ---------------------------------------------------------------------------
|
|
84
|
+
|
|
85
|
+
def count_discourse_markers(text: str) -> int:
    """Count discourse-marker occurrences in *text* (fast, approximate).

    The text is lower-cased and every marker in ``_ALL_MARKERS_FLAT`` is
    located by plain substring search.  A hit counts only when the characters
    directly before and after it are not alphanumeric (or the hit touches the
    start/end of the text).  Markers are counted independently of each other,
    so a phrase such as "provided that" also contributes a hit for "provided".
    """
    haystack = text.lower()
    size = len(haystack)

    def _stands_alone(begin: int, end: int) -> bool:
        # Cheap stand-in for a regex word boundary on both sides of the hit.
        left_free = begin == 0 or not haystack[begin - 1].isalnum()
        right_free = end >= size or not haystack[end].isalnum()
        return left_free and right_free

    hits = 0
    for marker in _ALL_MARKERS_FLAT:
        width = len(marker)
        position = haystack.find(marker)
        while position != -1:
            if _stands_alone(position, position + width):
                hits += 1
            # Advance by one character so overlapping occurrences are seen too.
            position = haystack.find(marker, position + 1)
    return hits
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
# ---------------------------------------------------------------------------
|
|
108
|
+
# Stage 5 Extractor
|
|
109
|
+
# ---------------------------------------------------------------------------
|
|
110
|
+
|
|
111
|
+
class DiscourseExtractor:
    """Stage 5 — discourse-structure extraction (CPU-only)."""

    def extract(
        self,
        text: str,
        source_window_id: str = "",
    ) -> tuple[list[Fact], list[FactEdge]]:
        """Turn discourse markers into relations between neighbouring sentences.

        The text is split into sentences.  For every sentence after the first,
        each marker group whose pattern occurs in the sentence produces one
        FactEdge from the preceding sentence to the current one.  Facts are
        created lazily, one per sentence that takes part in at least one edge.

        Returns ``(facts, edges)``; both are empty when the text yields no
        sentences.
        """
        units = _split_sentences(text)
        if not units:
            return [], []

        unit_facts: list[Fact] = []
        links: list[FactEdge] = []
        fact_id_by_index: dict[int, str] = {}  # sentence index → fact id

        def _fact_id_for(position: int) -> str:
            """Return the fact id of sentence *position*, creating the Fact once."""
            try:
                return fact_id_by_index[position]
            except KeyError:
                pass
            unit = Fact(
                text=units[position],
                category="discourse_unit",
                source_window_id=source_window_id,
                confidence=0.70,
                extraction_stage=5,
                metadata={"sentence_index": position},
            )
            unit_facts.append(unit)
            fact_id_by_index[position] = unit.id
            return unit.id

        for position, sentence in enumerate(units):
            # The first sentence has no predecessor; linking it would be a self-loop.
            if position == 0:
                continue

            for relation, marker_pattern in _COMPILED_MARKERS:
                hit = marker_pattern.search(sentence)
                if hit is None:
                    continue

                # Predecessor first, so facts appear in reading order.
                previous_id = _fact_id_for(position - 1)
                current_id = _fact_id_for(position)

                links.append(FactEdge(
                    source_id=previous_id,
                    target_id=current_id,
                    relation_type=relation,
                    confidence=0.70,
                    source_stage=5,
                    metadata={"marker": hit.group(0), "sentence_index": position},
                ))

        return unit_facts, links
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
# Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
|
|
2
|
+
# Licensed under Elastic License 2.0 — see LICENSE.md for details.
|
|
3
|
+
"""Stage 6 — LLM-assisted relational extraction (MAY, expensive).
|
|
4
|
+
|
|
5
|
+
Dispatches a dedicated extraction window to a small LLM to extract logical
|
|
6
|
+
relationships from reasoning-dense content. Trigger: content_type == REASONING_DENSE
|
|
7
|
+
AND Stage 5 edge_yield < 0.1 edges/sentence.
|
|
8
|
+
|
|
9
|
+
This stage is **user-configurable** (can be disabled via config flag).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import logging
|
|
16
|
+
import re
|
|
17
|
+
from collections.abc import Callable
|
|
18
|
+
|
|
19
|
+
from crp.extraction.types import Fact, FactEdge, RelationType
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
# ---------------------------------------------------------------------------
|
|
24
|
+
# Extraction prompt
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
|
|
27
|
+
# Instruction text handed to the dispatch callback as its first argument by
# ``LLMExtractor.extract``.  It asks for a JSON array of objects with the keys
# "subject", "predicate" and "object" and includes a one-item example.
_EXTRACTION_PROMPT = (
    "Extract ALL logical relationships, conditions, dependencies, "
    "and reasoning chains from this text. Output as a JSON array of objects "
    'with keys: "subject", "predicate", "object". Example:\n'
    '[{"subject": "A", "predicate": "causes", "object": "B"}]'
)
|
|
33
|
+
|
|
34
|
+
# Lower-case predicate keywords mapped to the RelationType they signal.
# This module's ``_map_predicate`` checks the entries in insertion order and
# the first hit wins.
_RELATION_MAP: dict[str, RelationType] = {
    "causes": RelationType.CAUSE_EFFECT,
    "caused by": RelationType.CAUSE_EFFECT,
    "leads to": RelationType.CONSEQUENCE,
    "results in": RelationType.CONSEQUENCE,
    "depends on": RelationType.CONDITION_FOR,
    "requires": RelationType.CONDITION_FOR,
    "if": RelationType.CONDITION_FOR,
    "contrasts": RelationType.CONTRAST,
    "despite": RelationType.CONCESSION,
    "elaborates": RelationType.ELABORATION,
    "follows": RelationType.SEQUENCE,
}
|
|
47
|
+
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
# Dispatch callback type
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
|
|
52
|
+
# Type alias for the LLM dispatch function the pipeline will inject.
# Signature: dispatch(system_prompt, task_input, max_output_tokens) → str
# All three arguments are passed positionally; the returned string is the raw
# model response that is subsequently parsed for a JSON array.
DispatchFn = Callable[[str, str, int], str]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# ---------------------------------------------------------------------------
|
|
58
|
+
# Response parser
|
|
59
|
+
# ---------------------------------------------------------------------------
|
|
60
|
+
|
|
61
|
+
# Greedy and DOTALL: matches from the first "[" to the LAST "]" in the text,
# across newlines.
_JSON_ARRAY_RE = re.compile(r"\[.*\]", re.DOTALL)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _parse_extraction_response(raw: str) -> list[dict[str, str]]:
|
|
65
|
+
"""Best-effort JSON parse from LLM output."""
|
|
66
|
+
m = _JSON_ARRAY_RE.search(raw)
|
|
67
|
+
if not m:
|
|
68
|
+
logger.debug("Stage 6: no JSON array found in LLM response (%d chars)", len(raw))
|
|
69
|
+
return []
|
|
70
|
+
try:
|
|
71
|
+
data = json.loads(m.group(0))
|
|
72
|
+
if isinstance(data, list):
|
|
73
|
+
return [d for d in data if isinstance(d, dict)]
|
|
74
|
+
except (json.JSONDecodeError, ValueError):
|
|
75
|
+
logger.debug("Stage 6: JSON parse failed for extraction response")
|
|
76
|
+
return []
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _map_predicate(predicate: str) -> RelationType:
    """Translate a free-text predicate into a RelationType (best effort).

    The predicate is trimmed and lower-cased, then the keys of
    ``_RELATION_MAP`` are tried in table order; the first key that occurs as a
    whole word or phrase decides the result.  Plain substring matching is not
    used because short keys such as "if" would otherwise fire inside unrelated
    words ("modifies", "specifies", "differs from").  Predicates matching no
    key fall back to ``RelationType.RELATED``.
    """
    pred = predicate.strip().lower()
    for key, rel in _RELATION_MAP.items():
        # [^\W_] means "a letter or digit".  The lookarounds require that the
        # key is not glued to another alphanumeric character, while "_" and
        # "-" still count as separators (e.g. "requires_x").
        if re.search(r"(?<![^\W_])" + re.escape(key) + r"(?![^\W_])", pred):
            return rel
    return RelationType.RELATED
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# ---------------------------------------------------------------------------
|
|
88
|
+
# Stage 6 Extractor
|
|
89
|
+
# ---------------------------------------------------------------------------
|
|
90
|
+
|
|
91
|
+
class LLMExtractor:
    """Stage 6 — LLM-assisted extraction (optional, expensive).

    Requires a *dispatch_fn* to be injected by the pipeline. If not set,
    ``extract()`` returns empty.
    """

    # Fixed confidence assigned to every entity and edge produced by this stage.
    _CONFIDENCE = 0.75

    def __init__(self, dispatch_fn: DispatchFn | None = None) -> None:
        self._dispatch = dispatch_fn

    @property
    def is_available(self) -> bool:
        """Whether a dispatch function has been provided."""
        return self._dispatch is not None

    def set_dispatch(self, fn: DispatchFn) -> None:
        """Install (or replace) the dispatch function."""
        self._dispatch = fn

    @staticmethod
    def _as_text(value: object) -> str:
        """Return a JSON scalar as stripped text; unusable values become ``""``.

        JSON ``null`` would otherwise turn into the literal entity "None", and
        nested arrays/objects into their Python repr.
        """
        if value is None or isinstance(value, (dict, list)):
            return ""
        return str(value).strip()

    def extract(
        self,
        text: str,
        source_window_id: str = "",
        max_input_chars: int = 8000,
        max_output_tokens: int = 1024,
    ) -> tuple[list[Fact], list[FactEdge]]:
        """Dispatch extraction window and parse results.

        Only the first *max_input_chars* characters of *text* are sent to the
        model.  Each parsed triple yields up to two Fact items (entities with
        identical text are shared) and one FactEdge; triples with an empty or
        missing subject/object are skipped.

        Returns ``(facts, edges)`` or ``([], [])`` if dispatch is unavailable,
        the dispatch call raises, or nothing usable can be parsed.
        """
        if self._dispatch is None:
            return [], []

        # Truncate (not chunk): anything beyond the limit is not processed.
        chunk = text[:max_input_chars]

        try:
            raw = self._dispatch(_EXTRACTION_PROMPT, chunk, max_output_tokens)
        except Exception:
            logger.exception("Stage 6 LLM dispatch failed")
            return [], []

        if not isinstance(raw, str):
            # A misbehaving dispatch function must not crash the parser.
            logger.debug("Stage 6: dispatch returned %s instead of str", type(raw).__name__)
            return [], []

        triples = _parse_extraction_response(raw)
        if not triples:
            return [], []

        facts: list[Fact] = []
        edges: list[FactEdge] = []
        seen: dict[str, str] = {}  # entity text → fact id

        def _entity_id(entity_text: str, role: str) -> str:
            """Return the fact id for *entity_text*, creating the Fact once."""
            if entity_text not in seen:
                entity = Fact(
                    text=entity_text,
                    category="llm_entity",
                    source_window_id=source_window_id,
                    confidence=self._CONFIDENCE,
                    extraction_stage=6,
                    metadata={"role": role},
                )
                facts.append(entity)
                seen[entity_text] = entity.id
            return seen[entity_text]

        for triple in triples:
            subj = self._as_text(triple.get("subject"))
            obj = self._as_text(triple.get("object"))
            pred = self._as_text(triple.get("predicate"))
            if not subj or not obj:
                continue

            # Subject first, then object, so fact order follows triple order.
            subj_id = _entity_id(subj, "subject")
            obj_id = _entity_id(obj, "object")

            edges.append(FactEdge(
                source_id=subj_id,
                target_id=obj_id,
                relation_type=_map_predicate(pred),
                confidence=self._CONFIDENCE,
                source_stage=6,
                metadata={"predicate": pred},
            ))

        return facts, edges
|