crprotocol 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. crp/__init__.py +126 -0
  2. crp/__main__.py +8 -0
  3. crp/_typing.py +27 -0
  4. crp/_version.py +5 -0
  5. crp/adapters.py +31 -0
  6. crp/advanced/__init__.py +40 -0
  7. crp/advanced/auto_ingest.py +400 -0
  8. crp/advanced/cqs.py +235 -0
  9. crp/advanced/cross_window.py +477 -0
  10. crp/advanced/curator.py +265 -0
  11. crp/advanced/feedback.py +146 -0
  12. crp/advanced/hierarchical.py +211 -0
  13. crp/advanced/meta_learning.py +401 -0
  14. crp/advanced/parallel.py +98 -0
  15. crp/advanced/review_cycle.py +329 -0
  16. crp/advanced/scale_mode.py +129 -0
  17. crp/advanced/source_grounding.py +207 -0
  18. crp/ckf/__init__.py +35 -0
  19. crp/ckf/community.py +377 -0
  20. crp/ckf/fabric.py +445 -0
  21. crp/ckf/gc.py +175 -0
  22. crp/ckf/graph_walk.py +87 -0
  23. crp/ckf/merge.py +133 -0
  24. crp/ckf/pattern_query.py +122 -0
  25. crp/ckf/pubsub.py +128 -0
  26. crp/ckf/semantic.py +207 -0
  27. crp/cli/__init__.py +7 -0
  28. crp/cli/main.py +329 -0
  29. crp/cli/sidecar.py +929 -0
  30. crp/cli/startup.py +272 -0
  31. crp/continuation/__init__.py +103 -0
  32. crp/continuation/completion.py +348 -0
  33. crp/continuation/degradation.py +157 -0
  34. crp/continuation/document_map.py +160 -0
  35. crp/continuation/flow.py +109 -0
  36. crp/continuation/gap.py +419 -0
  37. crp/continuation/manager.py +484 -0
  38. crp/continuation/quality_monitor.py +179 -0
  39. crp/continuation/stitch.py +419 -0
  40. crp/continuation/trigger.py +142 -0
  41. crp/continuation/voice.py +157 -0
  42. crp/core/__init__.py +69 -0
  43. crp/core/batch.py +77 -0
  44. crp/core/circuit_breaker.py +116 -0
  45. crp/core/config.py +377 -0
  46. crp/core/context_tools.py +540 -0
  47. crp/core/dispatch_router.py +3977 -0
  48. crp/core/errors.py +128 -0
  49. crp/core/extraction_facade.py +384 -0
  50. crp/core/facilitator.py +713 -0
  51. crp/core/idempotency.py +215 -0
  52. crp/core/orchestrator.py +1435 -0
  53. crp/core/relay_strategies.py +613 -0
  54. crp/core/security_manager.py +140 -0
  55. crp/core/session.py +134 -0
  56. crp/core/task_intent.py +36 -0
  57. crp/core/window.py +363 -0
  58. crp/envelope/__init__.py +30 -0
  59. crp/envelope/builder.py +288 -0
  60. crp/envelope/decomposer.py +236 -0
  61. crp/envelope/formatter.py +168 -0
  62. crp/envelope/packer.py +211 -0
  63. crp/envelope/reranker.py +209 -0
  64. crp/envelope/scoring.py +310 -0
  65. crp/extraction/__init__.py +45 -0
  66. crp/extraction/complexity.py +96 -0
  67. crp/extraction/contradiction.py +132 -0
  68. crp/extraction/pipeline.py +360 -0
  69. crp/extraction/quality_gate.py +237 -0
  70. crp/extraction/stage1_regex.py +173 -0
  71. crp/extraction/stage2_statistical.py +244 -0
  72. crp/extraction/stage3_gliner.py +210 -0
  73. crp/extraction/stage4_uie.py +183 -0
  74. crp/extraction/stage5_discourse.py +175 -0
  75. crp/extraction/stage6_llm.py +178 -0
  76. crp/extraction/structured_output.py +219 -0
  77. crp/extraction/types.py +299 -0
  78. crp/license_guard.py +722 -0
  79. crp/observability/__init__.py +30 -0
  80. crp/observability/audit.py +118 -0
  81. crp/observability/events.py +233 -0
  82. crp/observability/metrics.py +264 -0
  83. crp/observability/quality.py +135 -0
  84. crp/observability/structured_logging.py +81 -0
  85. crp/observability/telemetry.py +117 -0
  86. crp/provenance/__init__.py +314 -0
  87. crp/provenance/_embeddings.py +97 -0
  88. crp/provenance/_types.py +378 -0
  89. crp/provenance/attribution_scorer.py +252 -0
  90. crp/provenance/claim_detector.py +229 -0
  91. crp/provenance/contradiction_detector.py +243 -0
  92. crp/provenance/distortion_detector.py +397 -0
  93. crp/provenance/entailment_verifier.py +358 -0
  94. crp/provenance/fabrication_detector.py +203 -0
  95. crp/provenance/hallucination_scorer.py +320 -0
  96. crp/provenance/omission_analyzer.py +106 -0
  97. crp/provenance/provenance_chain.py +205 -0
  98. crp/provenance/report_generator.py +440 -0
  99. crp/providers/__init__.py +43 -0
  100. crp/providers/anthropic.py +270 -0
  101. crp/providers/base.py +135 -0
  102. crp/providers/custom.py +63 -0
  103. crp/providers/diagnostic.py +251 -0
  104. crp/providers/llamacpp.py +224 -0
  105. crp/providers/manager.py +139 -0
  106. crp/providers/ollama.py +243 -0
  107. crp/providers/openai.py +628 -0
  108. crp/providers/tokenizers.py +48 -0
  109. crp/py.typed +0 -0
  110. crp/resources/__init__.py +53 -0
  111. crp/resources/adaptive_allocator.py +525 -0
  112. crp/resources/cost_model.py +388 -0
  113. crp/resources/overhead_manager.py +217 -0
  114. crp/resources/resource_manager.py +262 -0
  115. crp/schemas/__init__.py +20 -0
  116. crp/schemas/cost-estimate.json +33 -0
  117. crp/schemas/crp-error.json +43 -0
  118. crp/schemas/envelope-preview.json +40 -0
  119. crp/schemas/persisted-state-header.json +27 -0
  120. crp/schemas/quality-report.json +94 -0
  121. crp/schemas/session-handle.json +33 -0
  122. crp/schemas/session-status.json +57 -0
  123. crp/schemas/stream-event.json +18 -0
  124. crp/schemas/task-intent.json +42 -0
  125. crp/security/__init__.py +93 -0
  126. crp/security/audit_trail.py +392 -0
  127. crp/security/binding.py +192 -0
  128. crp/security/compliance.py +813 -0
  129. crp/security/consent.py +593 -0
  130. crp/security/embedding_defense.py +161 -0
  131. crp/security/encryption.py +202 -0
  132. crp/security/injection.py +335 -0
  133. crp/security/integrity.py +267 -0
  134. crp/security/privacy.py +662 -0
  135. crp/security/quarantine.py +249 -0
  136. crp/security/rbac.py +221 -0
  137. crp/security/validation.py +164 -0
  138. crp/state/__init__.py +31 -0
  139. crp/state/cold_storage.py +258 -0
  140. crp/state/compaction.py +263 -0
  141. crp/state/critical_state.py +104 -0
  142. crp/state/event_log.py +313 -0
  143. crp/state/fact.py +189 -0
  144. crp/state/serialization.py +189 -0
  145. crp/state/session_cleanup.py +77 -0
  146. crp/state/snapshot.py +290 -0
  147. crp/state/warm_store.py +346 -0
  148. crprotocol-2.0.0.dist-info/METADATA +1295 -0
  149. crprotocol-2.0.0.dist-info/RECORD +153 -0
  150. crprotocol-2.0.0.dist-info/WHEEL +4 -0
  151. crprotocol-2.0.0.dist-info/entry_points.txt +2 -0
  152. crprotocol-2.0.0.dist-info/licenses/LICENSE.md +170 -0
  153. crprotocol-2.0.0.dist-info/licenses/NOTICE +18 -0
@@ -0,0 +1,183 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """Stage 4 — UIE relational extraction (SHOULD, ~100ms, lazy model load).
4
+
5
+ Extracts (subject, predicate, object) triples and converts them to FactEdge
6
+ records. Trigger: Stage 3 relation yield < 0.1 per sentence.
7
+ Model: UIE / universal IE (~400MB), loaded lazily.
8
+ Graceful fallback: returns empty if unavailable.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import logging
14
+ from typing import Any, Protocol, runtime_checkable
15
+
16
+ from crp.extraction.types import Fact, FactEdge, RelationType
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ # ---------------------------------------------------------------------------
22
+ # UIE model protocol
23
+ # ---------------------------------------------------------------------------
24
+
25
@runtime_checkable
class UIEModel(Protocol):
    """Structural interface for a Universal Information Extraction backend.

    Any object that provides an ``extract_triples`` method satisfies this
    protocol; the ``runtime_checkable`` decorator lets ``isinstance`` test
    for that method's presence.
    """

    def extract_triples(self, text: str) -> list[dict[str, Any]]:
        """Return list of dicts with keys: subject, predicate, object, confidence."""
34
+
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # Triple → FactEdge mapping
38
+ # ---------------------------------------------------------------------------
39
+
40
# Known predicate phrases and the relation type each one denotes.
# Entry order is kept stable because lookups that scan this table stop at
# the first phrase they find.
_PREDICATE_TO_RELATION: dict[str, RelationType] = dict(
    (
        # causal
        ("causes", RelationType.CAUSE_EFFECT),
        ("caused by", RelationType.CAUSE_EFFECT),
        # consequence
        ("leads to", RelationType.CONSEQUENCE),
        ("results in", RelationType.CONSEQUENCE),
        # conditions / dependencies
        ("depends on", RelationType.CONDITION_FOR),
        ("requires", RelationType.CONDITION_FOR),
        # contrast and concession
        ("contrasts with", RelationType.CONTRAST),
        ("despite", RelationType.CONCESSION),
        # elaboration
        ("elaborates", RelationType.ELABORATION),
        ("extends", RelationType.ELABORATION),
        # ordering
        ("follows", RelationType.SEQUENCE),
        ("precedes", RelationType.SEQUENCE),
    )
)
54
+
55
+
56
def _map_predicate(predicate: str) -> RelationType:
    """Translate a free-text predicate into a RelationType (best effort).

    The predicate is trimmed and lower-cased, then each known phrase in
    ``_PREDICATE_TO_RELATION`` is tested by substring containment; the first
    phrase found decides the result.  Predicates that contain no known
    phrase map to ``RelationType.RELATED``.
    """
    normalized = predicate.strip().lower()
    return next(
        (
            relation
            for phrase, relation in _PREDICATE_TO_RELATION.items()
            if phrase in normalized
        ),
        RelationType.RELATED,
    )
63
+
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Stage 4 Extractor
67
+ # ---------------------------------------------------------------------------
68
+
69
class UIEExtractor:
    """Stage 4 — UIE triple extraction (lazy, optional).

    Loads the model on first use. Returns ``(facts, edges)`` where *facts*
    are the subject/object entities and *edges* are the relations.
    If the UIE library is unavailable, all calls return ``([], [])``.
    """

    # Every confidence emitted by this stage is clamped into this range.
    _MIN_CONFIDENCE = 0.70
    _MAX_CONFIDENCE = 0.85

    def __init__(self) -> None:
        self._model: UIEModel | None = None
        # Tri-state: None = not probed yet, True = loaded, False = unavailable.
        self._available: bool | None = None

    # -- Lifecycle ----------------------------------------------------------

    def _ensure_model(self) -> UIEModel | None:
        """Return the model, importing and constructing it on first use.

        A failed load is remembered (``_available = False``) so the import is
        not retried on every call.  Returns ``None`` when unavailable.
        """
        if self._available is False:
            return None
        if self._model is not None:
            return self._model
        try:
            # Try to import a UIE implementation.
            # The spec is model-agnostic; accept any class exposing extract_triples().
            from uie import UIE  # type: ignore[import-untyped]

            self._model = UIE()  # type: ignore[assignment]
        except Exception:
            self._available = False
            logger.warning("UIE not available — Stage 4 will be skipped")
            # Keep the reason reachable for diagnosis without raising the
            # noise level of the normal "optional dependency missing" case.
            logger.debug("UIE load failure details", exc_info=True)
            return None
        self._available = True
        logger.info("UIE model loaded successfully")
        return self._model

    def unload(self) -> None:
        """Drop the model reference; the next use loads it again."""
        self._model = None

    @property
    def is_available(self) -> bool:
        """Whether the model could be loaded (probes on first access)."""
        if self._available is None:
            self._ensure_model()
        return self._available is True

    # -- Helpers ------------------------------------------------------------

    @staticmethod
    def _clean_text(value: Any) -> str:
        """Return *value* as stripped text; ``None`` becomes ``""``.

        Without the ``None`` check, ``str(None)`` would produce the literal
        entity text ``"None"``.
        """
        return "" if value is None else str(value).strip()

    @classmethod
    def _clamp_confidence(cls, value: Any) -> float:
        """Coerce *value* to float and clamp it into the stage's range.

        Values that cannot be converted fall back to the minimum.
        """
        try:
            conf = float(value)
        except (TypeError, ValueError, OverflowError):
            conf = cls._MIN_CONFIDENCE
        return min(cls._MAX_CONFIDENCE, max(cls._MIN_CONFIDENCE, conf))

    # -- Extraction ---------------------------------------------------------

    def extract(
        self,
        text: str,
        source_window_id: str = "",
    ) -> tuple[list[Fact], list[FactEdge]]:
        """Extract relational triples from *text*.

        Returns ``(facts, edges)`` — each triple yields up to two Fact items
        (subject, object; entities with the same text are created once) and
        one FactEdge.  Malformed triples (not dict-like, or with a missing /
        empty subject or object) are skipped.
        Returns ``([], [])`` on failure or if model unavailable.
        """
        model = self._ensure_model()
        if model is None:
            return [], []

        try:
            triples = model.extract_triples(text)
        except Exception:
            logger.exception("UIE extraction failed")
            return [], []
        if triples is None:
            return [], []

        facts: list[Fact] = []
        edges: list[FactEdge] = []
        seen_texts: dict[str, str] = {}  # text → fact_id (dedup entities)

        def _entity_id(entity_text: str, role: str, confidence: float) -> str:
            """Return the fact id for *entity_text*, creating the Fact once."""
            if entity_text not in seen_texts:
                fact = Fact(
                    text=entity_text,
                    category="uie_entity",
                    source_window_id=source_window_id,
                    confidence=confidence,
                    extraction_stage=4,
                    metadata={"role": role},
                )
                facts.append(fact)
                seen_texts[entity_text] = fact.id
            return seen_texts[entity_text]

        for triple in triples:
            # Tolerate backends that emit junk entries instead of mappings.
            if not callable(getattr(triple, "get", None)):
                continue

            subj_text = self._clean_text(triple.get("subject"))
            obj_text = self._clean_text(triple.get("object"))
            if not subj_text or not obj_text:
                continue

            predicate = self._clean_text(triple.get("predicate"))
            conf = self._clamp_confidence(
                triple.get("confidence", self._MIN_CONFIDENCE)
            )

            subj_id = _entity_id(subj_text, "subject", conf)
            obj_id = _entity_id(obj_text, "object", conf)

            edges.append(FactEdge(
                source_id=subj_id,
                target_id=obj_id,
                relation_type=_map_predicate(predicate),
                confidence=conf,
                source_stage=4,
                metadata={"predicate": predicate},
            ))

        return facts, edges
@@ -0,0 +1,175 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """Stage 5 — Discourse structure extraction (SHOULD, ~150ms, CPU-only).
4
+
5
+ Detects discourse markers and maps them to semantic relation types (RST-inspired).
6
+ Trigger: content_type in {REASONING_DENSE, NARRATIVE}.
7
+ No ML model — pure pattern matching over sentences.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import re
13
+
14
+ from crp.extraction.types import Fact, FactEdge, RelationType
15
+
16
+ # ---------------------------------------------------------------------------
17
+ # Discourse marker → relation-type mapping
18
+ # ---------------------------------------------------------------------------
19
+
20
# One entry per relation type: (relation, marker phrases that signal it).
# Both the group order and the phrase order are kept as-is, since the
# structures derived from this table are built by iterating it in order.
_MARKER_GROUPS: list[tuple[RelationType, list[str]]] = [
    (
        RelationType.CONDITION_FOR,
        [
            "if", "unless",
            "provided", "provided that",
            "assuming", "given that",
            "in case", "on condition that",
        ],
    ),
    (
        RelationType.CAUSE_EFFECT,
        [
            "because", "since",
            "due to", "owing to",
            "causes", "caused by",
            "as a result of",
        ],
    ),
    (
        RelationType.CONTRAST,
        [
            "however", "but", "yet",
            "on the other hand", "in contrast",
            "conversely", "nevertheless",
        ],
    ),
    (
        RelationType.CONCESSION,
        [
            "although", "despite",
            "even though", "in spite of",
            "notwithstanding", "regardless",
        ],
    ),
    (
        RelationType.CONSEQUENCE,
        [
            "therefore", "thus", "hence", "so",
            "consequently", "as a result", "accordingly",
        ],
    ),
    (
        RelationType.ELABORATION,
        [
            "as", "given",
            "for example", "for instance",
            "in particular", "specifically", "namely",
            "that is", "i.e.", "e.g.",
        ],
    ),
    (
        RelationType.SEQUENCE,
        [
            "and then", "subsequently", "next", "afterwards",
            "following", "before", "after", "finally",
            "first", "second", "third", "lastly",
        ],
    ),
]
52
+
53
# Build one compiled, case-insensitive regex per relation group.
#
# Markers are delimited with ``(?<!\w)`` / ``(?!\w)`` instead of ``\b``.
# For markers that begin and end with a word character the two forms are
# equivalent, but ``\b`` placed after a marker ending in punctuation
# ("i.e.", "e.g.") only matches when a word character follows, so ordinary
# text such as "e.g. foo" would never be detected.
_COMPILED_MARKERS: list[tuple[RelationType, re.Pattern[str]]] = []
for _rel, _markers in _MARKER_GROUPS:
    # Sort longest first so "provided that" matches before "provided"
    _sorted = sorted(_markers, key=len, reverse=True)
    escaped = [re.escape(m) for m in _sorted]
    _COMPILED_MARKERS.append((
        _rel,
        re.compile(
            r"(?<!\w)(?:" + "|".join(escaped) + r")(?!\w)",
            re.IGNORECASE,
        ),
    ))

# Flat set of every marker phrase, used for quick counting
_ALL_MARKERS_FLAT: set[str] = set()
for _, markers in _MARKER_GROUPS:
    _ALL_MARKERS_FLAT.update(markers)
68
+
69
+ # ---------------------------------------------------------------------------
70
+ # Sentence splitting (reuse from stage2 would be nice, but keep self-contained)
71
+ # ---------------------------------------------------------------------------
72
+
73
+ _SENT_RE = re.compile(r"(?<=[.!?])\s+(?=[A-Z])")
74
+
75
+
76
+ def _split_sentences(text: str) -> list[str]:
77
+ raw = _SENT_RE.split(text.strip())
78
+ return [s.strip() for s in raw if len(s.strip()) > 5]
79
+
80
+
81
+ # ---------------------------------------------------------------------------
82
+ # Public helpers (used by complexity detector)
83
+ # ---------------------------------------------------------------------------
84
+
85
def count_discourse_markers(text: str) -> int:
    """Return how many discourse-marker occurrences appear in *text*.

    Matching is case-insensitive.  Each marker in ``_ALL_MARKERS_FLAT`` is
    located with plain substring search, and a hit counts only when the
    characters immediately before and after it are not alphanumeric (or the
    hit touches the start/end of the text).  Markers are counted
    independently of one another.
    """
    haystack = text.lower()
    size = len(haystack)
    hits = 0
    for marker in _ALL_MARKERS_FLAT:
        width = len(marker)
        pos = haystack.find(marker)
        while pos != -1:
            end = pos + width
            # Cheap stand-in for a regex word-boundary test.
            left_clear = pos == 0 or not haystack[pos - 1].isalnum()
            right_clear = end >= size or not haystack[end].isalnum()
            if left_clear and right_clear:
                hits += 1
            # Advance by one so overlapping occurrences are still examined.
            pos = haystack.find(marker, pos + 1)
    return hits
105
+
106
+
107
+ # ---------------------------------------------------------------------------
108
+ # Stage 5 Extractor
109
+ # ---------------------------------------------------------------------------
110
+
111
class DiscourseExtractor:
    """Stage 5 — discourse-structure extraction (CPU-only)."""

    def extract(
        self,
        text: str,
        source_window_id: str = "",
    ) -> tuple[list[Fact], list[FactEdge]]:
        """Link neighbouring sentences wherever a discourse marker appears.

        The text is split into sentences; for every sentence after the first,
        each relation group whose pattern matches the sentence produces one
        FactEdge from the preceding sentence to this one.  A Fact is created
        for a sentence only the first time an edge refers to it.

        Returns ``(marker_facts, edges)``; both are empty when the text
        yields no sentences or no markers.
        """
        sentences = _split_sentences(text)
        facts: list[Fact] = []
        edges: list[FactEdge] = []
        if not sentences:
            return facts, edges

        fact_id_by_sentence: dict[int, str] = {}  # sentence position → fact id

        def _fact_id(position: int) -> str:
            """Return the fact id for the sentence at *position*, creating it lazily."""
            try:
                return fact_id_by_sentence[position]
            except KeyError:
                pass
            unit = Fact(
                text=sentences[position],
                category="discourse_unit",
                source_window_id=source_window_id,
                confidence=0.70,
                extraction_stage=5,
                metadata={"sentence_index": position},
            )
            facts.append(unit)
            fact_id_by_sentence[position] = unit.id
            return unit.id

        # The first sentence has no predecessor, so linking it would only
        # create a self-loop; start from the second sentence.
        for position in range(1, len(sentences)):
            sentence = sentences[position]
            for relation, pattern in _COMPILED_MARKERS:
                hit = pattern.search(sentence)
                if hit is None:
                    continue

                previous_id = _fact_id(position - 1)
                current_id = _fact_id(position)
                edges.append(
                    FactEdge(
                        source_id=previous_id,
                        target_id=current_id,
                        relation_type=relation,
                        confidence=0.70,
                        source_stage=5,
                        metadata={"marker": hit.group(0), "sentence_index": position},
                    )
                )

        return facts, edges
@@ -0,0 +1,178 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """Stage 6 — LLM-assisted relational extraction (MAY, expensive).
4
+
5
+ Dispatches a dedicated extraction window to a small LLM to extract logical
6
+ relationships from reasoning-dense content. Trigger: content_type == REASONING_DENSE
7
+ AND Stage 5 edge_yield < 0.1 edges/sentence.
8
+
9
+ This stage is **user-configurable** (can be disabled via config flag).
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import json
15
+ import logging
16
+ import re
17
+ from collections.abc import Callable
18
+
19
+ from crp.extraction.types import Fact, FactEdge, RelationType
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+ # ---------------------------------------------------------------------------
24
+ # Extraction prompt
25
+ # ---------------------------------------------------------------------------
26
+
27
+ _EXTRACTION_PROMPT = (
28
+ "Extract ALL logical relationships, conditions, dependencies, "
29
+ "and reasoning chains from this text. Output as a JSON array of objects "
30
+ 'with keys: "subject", "predicate", "object". Example:\n'
31
+ '[{"subject": "A", "predicate": "causes", "object": "B"}]'
32
+ )
33
+
34
# Predicate keyword → relation type.  Entry order is kept stable because
# lookups that scan this table stop at the first keyword they find.
_RELATION_MAP: dict[str, RelationType] = dict(
    (
        # causal
        ("causes", RelationType.CAUSE_EFFECT),
        ("caused by", RelationType.CAUSE_EFFECT),
        # consequence
        ("leads to", RelationType.CONSEQUENCE),
        ("results in", RelationType.CONSEQUENCE),
        # conditions / dependencies
        ("depends on", RelationType.CONDITION_FOR),
        ("requires", RelationType.CONDITION_FOR),
        ("if", RelationType.CONDITION_FOR),
        # contrast and concession
        ("contrasts", RelationType.CONTRAST),
        ("despite", RelationType.CONCESSION),
        # elaboration and ordering
        ("elaborates", RelationType.ELABORATION),
        ("follows", RelationType.SEQUENCE),
    )
)
47
+
48
+ # ---------------------------------------------------------------------------
49
+ # Dispatch callback type
50
+ # ---------------------------------------------------------------------------
51
+
52
# Callable the pipeline injects to reach the LLM.
# Called as dispatch(system_prompt, task_input, max_output_tokens) and
# expected to return the model's raw text response.
DispatchFn = Callable[[str, str, int], str]
55
+
56
+
57
+ # ---------------------------------------------------------------------------
58
+ # Response parser
59
+ # ---------------------------------------------------------------------------
60
+
61
+ _JSON_ARRAY_RE = re.compile(r"\[.*\]", re.DOTALL)
62
+
63
+
64
+ def _parse_extraction_response(raw: str) -> list[dict[str, str]]:
65
+ """Best-effort JSON parse from LLM output."""
66
+ m = _JSON_ARRAY_RE.search(raw)
67
+ if not m:
68
+ logger.debug("Stage 6: no JSON array found in LLM response (%d chars)", len(raw))
69
+ return []
70
+ try:
71
+ data = json.loads(m.group(0))
72
+ if isinstance(data, list):
73
+ return [d for d in data if isinstance(d, dict)]
74
+ except (json.JSONDecodeError, ValueError):
75
+ logger.debug("Stage 6: JSON parse failed for extraction response")
76
+ return []
77
+
78
+
79
def _map_predicate(predicate: str) -> RelationType:
    """Map a free-text predicate to a RelationType (best effort).

    The predicate is trimmed and lower-cased, then each keyword in
    ``_RELATION_MAP`` is looked for in table order; the first keyword found
    decides the result.  A keyword only counts when it begins at the start
    of the predicate or right after a non-alphanumeric character, so short
    keywords such as "if" no longer fire inside unrelated words
    ("modifies", "differs", "specifies").  Falls back to
    ``RelationType.RELATED``.
    """
    pred = predicate.strip().lower()
    for key, rel in _RELATION_MAP.items():
        start = pred.find(key)
        while start != -1:
            if start == 0 or not pred[start - 1].isalnum():
                return rel
            # This occurrence sits mid-word; look for a later one.
            start = pred.find(key, start + 1)
    return RelationType.RELATED
85
+
86
+
87
+ # ---------------------------------------------------------------------------
88
+ # Stage 6 Extractor
89
+ # ---------------------------------------------------------------------------
90
+
91
class LLMExtractor:
    """Stage 6 — LLM-assisted extraction (optional, expensive).

    Requires a *dispatch_fn* to be injected by the pipeline. If not set,
    ``extract()`` returns empty.
    """

    def __init__(self, dispatch_fn: DispatchFn | None = None) -> None:
        self._dispatch = dispatch_fn

    @property
    def is_available(self) -> bool:
        """Whether a dispatch function has been provided."""
        return self._dispatch is not None

    def set_dispatch(self, fn: DispatchFn) -> None:
        """Install (or replace) the dispatch function used by ``extract``."""
        self._dispatch = fn

    @staticmethod
    def _field(triple: dict, key: str) -> str:
        """Return ``triple[key]`` as stripped text; missing or null becomes ``""``.

        JSON ``null`` decodes to ``None``; passing that through ``str()``
        would create an entity literally named ``"None"``.
        """
        value = triple.get(key)
        return "" if value is None else str(value).strip()

    def extract(
        self,
        text: str,
        source_window_id: str = "",
        max_input_chars: int = 8000,
        max_output_tokens: int = 1024,
    ) -> tuple[list[Fact], list[FactEdge]]:
        """Dispatch extraction window and parse results.

        Only the first *max_input_chars* characters of *text* are sent; the
        remainder is not processed.  Each parsed triple with a non-empty
        subject and object yields up to two Fact items (entities with the
        same text are created once) and one FactEdge.

        Returns ``(facts, edges)``, or ``([], [])`` if dispatch is
        unavailable, the dispatch call raises, its result is not a string,
        or no triples can be parsed from it.
        """
        if self._dispatch is None:
            return [], []

        # Truncate (not chunk): anything past the limit is dropped.
        chunk = text[:max_input_chars]
        if len(chunk) < len(text):
            logger.debug(
                "Stage 6: input truncated from %d to %d chars",
                len(text),
                len(chunk),
            )

        try:
            raw = self._dispatch(_EXTRACTION_PROMPT, chunk, max_output_tokens)
        except Exception:
            logger.exception("Stage 6 LLM dispatch failed")
            return [], []

        if not isinstance(raw, str):
            logger.warning(
                "Stage 6: dispatch returned %s instead of str", type(raw).__name__
            )
            return [], []

        triples = _parse_extraction_response(raw)
        if not triples:
            return [], []

        facts: list[Fact] = []
        edges: list[FactEdge] = []
        seen: dict[str, str] = {}  # entity text → fact id

        def _entity_id(entity_text: str, role: str) -> str:
            """Return the fact id for *entity_text*, creating the Fact once."""
            if entity_text not in seen:
                fact = Fact(
                    text=entity_text,
                    category="llm_entity",
                    source_window_id=source_window_id,
                    confidence=0.75,
                    extraction_stage=6,
                    metadata={"role": role},
                )
                facts.append(fact)
                seen[entity_text] = fact.id
            return seen[entity_text]

        for triple in triples:
            subj = self._field(triple, "subject")
            obj = self._field(triple, "object")
            pred = self._field(triple, "predicate")
            if not subj or not obj:
                continue

            subj_id = _entity_id(subj, "subject")
            obj_id = _entity_id(obj, "object")

            edges.append(FactEdge(
                source_id=subj_id,
                target_id=obj_id,
                relation_type=_map_predicate(pred),
                confidence=0.75,
                source_stage=6,
                metadata={"predicate": pred},
            ))

        return facts, edges