crprotocol 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153) hide show
  1. crp/__init__.py +126 -0
  2. crp/__main__.py +8 -0
  3. crp/_typing.py +27 -0
  4. crp/_version.py +5 -0
  5. crp/adapters.py +31 -0
  6. crp/advanced/__init__.py +40 -0
  7. crp/advanced/auto_ingest.py +400 -0
  8. crp/advanced/cqs.py +235 -0
  9. crp/advanced/cross_window.py +477 -0
  10. crp/advanced/curator.py +265 -0
  11. crp/advanced/feedback.py +146 -0
  12. crp/advanced/hierarchical.py +211 -0
  13. crp/advanced/meta_learning.py +401 -0
  14. crp/advanced/parallel.py +98 -0
  15. crp/advanced/review_cycle.py +329 -0
  16. crp/advanced/scale_mode.py +129 -0
  17. crp/advanced/source_grounding.py +207 -0
  18. crp/ckf/__init__.py +35 -0
  19. crp/ckf/community.py +377 -0
  20. crp/ckf/fabric.py +445 -0
  21. crp/ckf/gc.py +175 -0
  22. crp/ckf/graph_walk.py +87 -0
  23. crp/ckf/merge.py +133 -0
  24. crp/ckf/pattern_query.py +122 -0
  25. crp/ckf/pubsub.py +128 -0
  26. crp/ckf/semantic.py +207 -0
  27. crp/cli/__init__.py +7 -0
  28. crp/cli/main.py +329 -0
  29. crp/cli/sidecar.py +929 -0
  30. crp/cli/startup.py +272 -0
  31. crp/continuation/__init__.py +103 -0
  32. crp/continuation/completion.py +348 -0
  33. crp/continuation/degradation.py +157 -0
  34. crp/continuation/document_map.py +160 -0
  35. crp/continuation/flow.py +109 -0
  36. crp/continuation/gap.py +419 -0
  37. crp/continuation/manager.py +484 -0
  38. crp/continuation/quality_monitor.py +179 -0
  39. crp/continuation/stitch.py +419 -0
  40. crp/continuation/trigger.py +142 -0
  41. crp/continuation/voice.py +157 -0
  42. crp/core/__init__.py +69 -0
  43. crp/core/batch.py +77 -0
  44. crp/core/circuit_breaker.py +116 -0
  45. crp/core/config.py +377 -0
  46. crp/core/context_tools.py +540 -0
  47. crp/core/dispatch_router.py +3977 -0
  48. crp/core/errors.py +128 -0
  49. crp/core/extraction_facade.py +384 -0
  50. crp/core/facilitator.py +713 -0
  51. crp/core/idempotency.py +215 -0
  52. crp/core/orchestrator.py +1435 -0
  53. crp/core/relay_strategies.py +613 -0
  54. crp/core/security_manager.py +140 -0
  55. crp/core/session.py +134 -0
  56. crp/core/task_intent.py +36 -0
  57. crp/core/window.py +363 -0
  58. crp/envelope/__init__.py +30 -0
  59. crp/envelope/builder.py +288 -0
  60. crp/envelope/decomposer.py +236 -0
  61. crp/envelope/formatter.py +168 -0
  62. crp/envelope/packer.py +211 -0
  63. crp/envelope/reranker.py +209 -0
  64. crp/envelope/scoring.py +310 -0
  65. crp/extraction/__init__.py +45 -0
  66. crp/extraction/complexity.py +96 -0
  67. crp/extraction/contradiction.py +132 -0
  68. crp/extraction/pipeline.py +360 -0
  69. crp/extraction/quality_gate.py +237 -0
  70. crp/extraction/stage1_regex.py +173 -0
  71. crp/extraction/stage2_statistical.py +244 -0
  72. crp/extraction/stage3_gliner.py +210 -0
  73. crp/extraction/stage4_uie.py +183 -0
  74. crp/extraction/stage5_discourse.py +175 -0
  75. crp/extraction/stage6_llm.py +178 -0
  76. crp/extraction/structured_output.py +219 -0
  77. crp/extraction/types.py +299 -0
  78. crp/license_guard.py +722 -0
  79. crp/observability/__init__.py +30 -0
  80. crp/observability/audit.py +118 -0
  81. crp/observability/events.py +233 -0
  82. crp/observability/metrics.py +264 -0
  83. crp/observability/quality.py +135 -0
  84. crp/observability/structured_logging.py +81 -0
  85. crp/observability/telemetry.py +117 -0
  86. crp/provenance/__init__.py +314 -0
  87. crp/provenance/_embeddings.py +97 -0
  88. crp/provenance/_types.py +378 -0
  89. crp/provenance/attribution_scorer.py +252 -0
  90. crp/provenance/claim_detector.py +229 -0
  91. crp/provenance/contradiction_detector.py +243 -0
  92. crp/provenance/distortion_detector.py +397 -0
  93. crp/provenance/entailment_verifier.py +358 -0
  94. crp/provenance/fabrication_detector.py +203 -0
  95. crp/provenance/hallucination_scorer.py +320 -0
  96. crp/provenance/omission_analyzer.py +106 -0
  97. crp/provenance/provenance_chain.py +205 -0
  98. crp/provenance/report_generator.py +440 -0
  99. crp/providers/__init__.py +43 -0
  100. crp/providers/anthropic.py +270 -0
  101. crp/providers/base.py +135 -0
  102. crp/providers/custom.py +63 -0
  103. crp/providers/diagnostic.py +251 -0
  104. crp/providers/llamacpp.py +224 -0
  105. crp/providers/manager.py +139 -0
  106. crp/providers/ollama.py +243 -0
  107. crp/providers/openai.py +628 -0
  108. crp/providers/tokenizers.py +48 -0
  109. crp/py.typed +0 -0
  110. crp/resources/__init__.py +53 -0
  111. crp/resources/adaptive_allocator.py +525 -0
  112. crp/resources/cost_model.py +388 -0
  113. crp/resources/overhead_manager.py +217 -0
  114. crp/resources/resource_manager.py +262 -0
  115. crp/schemas/__init__.py +20 -0
  116. crp/schemas/cost-estimate.json +33 -0
  117. crp/schemas/crp-error.json +43 -0
  118. crp/schemas/envelope-preview.json +40 -0
  119. crp/schemas/persisted-state-header.json +27 -0
  120. crp/schemas/quality-report.json +94 -0
  121. crp/schemas/session-handle.json +33 -0
  122. crp/schemas/session-status.json +57 -0
  123. crp/schemas/stream-event.json +18 -0
  124. crp/schemas/task-intent.json +42 -0
  125. crp/security/__init__.py +93 -0
  126. crp/security/audit_trail.py +392 -0
  127. crp/security/binding.py +192 -0
  128. crp/security/compliance.py +813 -0
  129. crp/security/consent.py +593 -0
  130. crp/security/embedding_defense.py +161 -0
  131. crp/security/encryption.py +202 -0
  132. crp/security/injection.py +335 -0
  133. crp/security/integrity.py +267 -0
  134. crp/security/privacy.py +662 -0
  135. crp/security/quarantine.py +249 -0
  136. crp/security/rbac.py +221 -0
  137. crp/security/validation.py +164 -0
  138. crp/state/__init__.py +31 -0
  139. crp/state/cold_storage.py +258 -0
  140. crp/state/compaction.py +263 -0
  141. crp/state/critical_state.py +104 -0
  142. crp/state/event_log.py +313 -0
  143. crp/state/fact.py +189 -0
  144. crp/state/serialization.py +189 -0
  145. crp/state/session_cleanup.py +77 -0
  146. crp/state/snapshot.py +290 -0
  147. crp/state/warm_store.py +346 -0
  148. crprotocol-2.0.0.dist-info/METADATA +1295 -0
  149. crprotocol-2.0.0.dist-info/RECORD +153 -0
  150. crprotocol-2.0.0.dist-info/WHEEL +4 -0
  151. crprotocol-2.0.0.dist-info/entry_points.txt +2 -0
  152. crprotocol-2.0.0.dist-info/licenses/LICENSE.md +170 -0
  153. crprotocol-2.0.0.dist-info/licenses/NOTICE +18 -0
@@ -0,0 +1,358 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """Semantic Entailment Verifier — ML-powered claim↔fact verification (§7.14.3).
4
+
5
+ **THE PROBLEM rule-based fidelity cannot solve:**
6
+
7
+ Fact: "The treatment significantly reduced patient mortality."
8
+ Claim: "The treatment showed some positive outcomes for patients."
9
+
10
+ Lexically similar. Zero number distortions. No negation flip.
11
+ But the claim *lost critical specificity* — a regulator reading the
12
+ claim would make a DIFFERENT decision than one reading the fact.
13
+
14
+ Rule-based detectors catch surface edits: 10→25, "safe"→"not safe".
15
+ They *cannot* detect:
16
+ - Specificity loss ("reduced mortality" → "positive outcomes")
17
+ - Causation inflation ("correlation observed" → "X causes Y")
18
+ - Scope generalisation ("in clinical settings" → "broadly")
19
+ - Hedging removal ("might reduce" → "reduces")
20
+
21
+ **THE SOLUTION: Natural Language Inference (NLI).**
22
+
23
+ A lightweight cross-encoder NLI model (~80 MB, CPU-only, <50 ms/pair)
24
+ classifies each (premise=fact, hypothesis=claim) pair as:
25
+ - ENTAILED — claim logically follows from the fact ✅
26
+ - NEUTRAL — claim is unrelated to the fact ⚠️
27
+ - CONTRADICTION — claim conflicts with the fact ❌
28
+
29
+ This gives CRP a **semantic fidelity layer** that sits above the lexical
30
+ layer, catching meaning-level distortions no regex can reach.
31
+
32
+ When the NLI model is unavailable (not installed, resource-constrained),
33
+ the verifier degrades gracefully to a heuristic based on bag-of-words
34
+ similarity — still better than nothing, while flagging that the result
35
+ is heuristic-only.
36
+ """
37
+
38
+ from __future__ import annotations
39
+
40
+ import logging
41
+ import re
42
+ from collections.abc import Sequence
43
+ from typing import Any
44
+
45
+ from crp.envelope.packer import PackedFact
46
+
47
+ from ._embeddings import cosine_similarity as _emb_cosine
48
+ from ._embeddings import encode_texts as _encode_texts
49
+ from ._types import (
50
+ AttributionType,
51
+ ClaimAttribution,
52
+ ClaimType,
53
+ EntailmentLabel,
54
+ EntailmentResult,
55
+ ProvenanceConfig,
56
+ )
57
+
58
+ logger = logging.getLogger(__name__)
59
+
60
+ # ---------------------------------------------------------------------------
61
+ # Module-level model cache (lazy singleton)
62
+ # ---------------------------------------------------------------------------
63
+
64
+ _nli_model: Any = None
65
+ _nli_model_name: str = ""
66
+ _nli_load_failed: bool = False
67
+
68
+
69
def _get_nli_model(model_name: str) -> Any:
    """Return the NLI cross-encoder for *model_name*, loading it on first use.

    The module keeps a single cached model.  Asking for the cached name
    returns it directly; asking for another name loads that model and
    replaces the cache.

    A failed load is remembered for the name that failed, so repeated
    requests for that same model return ``None`` without retrying.  The
    failure does not block requests for a different model name.

    Args:
        model_name: Identifier passed to ``CrossEncoder``.

    Returns:
        The loaded model, or ``None`` when it cannot be loaded.
    """
    global _nli_model, _nli_model_name, _nli_load_failed  # noqa: PLW0603

    is_cached_name = _nli_model_name == model_name

    # The failure latch only applies to the model that actually failed.
    if _nli_load_failed and is_cached_name:
        return None

    if _nli_model is not None and is_cached_name:
        return _nli_model

    try:
        from sentence_transformers import CrossEncoder  # type: ignore[import-untyped]

        logger.info("Loading NLI model: %s", model_name)
        loaded = CrossEncoder(model_name)
    except Exception as exc:
        # Deliberate best-effort: any import or load problem selects the heuristic path.
        logger.warning("NLI model unavailable (%s), using heuristic: %s", model_name, exc)
        # Record which name failed; this evicts any previously cached model.
        _nli_model = None
        _nli_model_name = model_name
        _nli_load_failed = True
        return None

    _nli_model = loaded
    _nli_model_name = model_name
    _nli_load_failed = False
    return loaded
92
+
93
+
94
def reset_model_cache() -> None:
    """Clear the cached NLI model, its name and the load-failure flag (for testing)."""
    global _nli_model, _nli_model_name, _nli_load_failed  # noqa: PLW0603
    _nli_model, _nli_model_name, _nli_load_failed = None, "", False
100
+
101
+
102
+ # ---------------------------------------------------------------------------
103
+ # Heuristic fallback (when NLI model unavailable)
104
+ # ---------------------------------------------------------------------------
105
+
106
+ _STOP = frozenset({
107
+ "a", "an", "the", "and", "or", "but", "in", "on", "at", "to",
108
+ "for", "of", "is", "it", "are", "was", "were", "be", "been",
109
+ "being", "have", "has", "had", "this", "that", "these", "those",
110
+ "with", "from", "by", "as", "will", "would", "can", "could",
111
+ })
112
+
113
+ _NEGATION_WORDS = frozenset({
114
+ "not", "no", "never", "neither", "nor", "none",
115
+ "doesn't", "don't", "didn't", "isn't", "aren't", "wasn't",
116
+ "weren't", "won't", "wouldn't", "shouldn't", "couldn't",
117
+ "can't", "cannot", "hasn't", "haven't", "hadn't",
118
+ })
119
+
120
+
121
def _content_words(text: str) -> set[str]:
    """Return the distinct content tokens of *text*.

    Tokens are runs of ``a``-``z`` taken from the lowercased text.  Stop
    words and tokens of one or two letters are discarded.
    """
    kept: set[str] = set()
    for token in re.findall(r"[a-z]+", text.lower()):
        if len(token) <= 2 or token in _STOP:
            continue
        kept.add(token)
    return kept
126
+
127
+
128
def _has_negation(text: str) -> bool:
    """Return True when *text* contains at least one negation word.

    Matching is case-insensitive and token-based.  Typographic apostrophes
    (U+2019) are folded to ``'`` first, so "doesn’t" is recognised like
    "doesn't".  Apostrophes wrapping a token (as in ``'not'``) are stripped
    so a quoted negation word still counts.
    """
    folded = text.lower().replace("\u2019", "'")
    tokens = {token.strip("'") for token in re.findall(r"[a-z']+", folded)}
    return not tokens.isdisjoint(_NEGATION_WORDS)
131
+
132
+
133
def _heuristic_entailment(
    claim: str,
    fact: str,
) -> tuple[float, float, float]:
    """Approximate NLI scores for a claim/fact pair without the cross-encoder.

    Two strategies are tried in order:
      1. Cosine similarity of the embeddings returned by ``_encode_texts``,
         clamped to [0, 1] and mapped onto fixed score bands.
      2. Jaccard overlap of content words, used when no usable pair of
         embeddings comes back.

    In both strategies, a negation word present in exactly one of the two
    texts, combined with enough similarity, yields a contradiction-heavy
    result.

    Returns (entailment_score, contradiction_score, neutral_score).
    """
    # Nothing to compare: report fully neutral.
    if not (claim.strip() and fact.strip()):
        return 0.0, 0.0, 1.0

    # --- Strategy 1: dense embedding similarity ---
    vectors = _encode_texts([claim, fact])
    if vectors is not None and len(vectors) == 2:
        similarity = max(0.0, min(1.0, _emb_cosine(vectors[0], vectors[1])))

        polarity_differs = _has_negation(claim) != _has_negation(fact)
        if polarity_differs and similarity > 0.40:
            return 0.05, 0.75, 0.20

        # (exclusive lower bound, scores) pairs, checked from the highest band down.
        bands = (
            (0.75, (0.80, 0.03, 0.17)),
            (0.55, (0.55, 0.08, 0.37)),
            (0.35, (0.25, 0.10, 0.65)),
        )
        for floor, scores in bands:
            if similarity > floor:
                return scores
        return 0.08, 0.07, 0.85

    # --- Strategy 2: bag-of-words overlap ---
    claim_vocab = _content_words(claim)
    fact_vocab = _content_words(fact)
    if not (claim_vocab and fact_vocab):
        return 0.0, 0.0, 1.0

    overlap = len(claim_vocab & fact_vocab) / len(claim_vocab | fact_vocab)

    if _has_negation(claim) != _has_negation(fact) and overlap > 0.25:
        # Shared vocabulary plus a negation mismatch is a contradiction signal.
        return 0.05, 0.80, 0.15
    if overlap > 0.50:
        return 0.70, 0.05, 0.25
    if overlap > 0.30:
        return 0.40, 0.10, 0.50
    return 0.10, 0.05, 0.85
198
+
199
+
200
+ # ---------------------------------------------------------------------------
201
+ # Public API
202
+ # ---------------------------------------------------------------------------
203
+
204
+
205
def _nli_probabilities(scores: Any) -> tuple[float, float, float]:
    """Turn one raw model output into (entailment, contradiction, neutral) probabilities.

    A three-value row is read as ``[contradiction, entailment, neutral]``.
    NOTE(review): label order differs between NLI checkpoints; confirm this
    matches the configured model.  Any other shape, including a bare scalar,
    is treated as a single entailment score with ``neutral = 1 - entailment``.
    The three values are then normalised with a softmax.
    """
    import math  # function-scope import: ``math`` is not imported at module level

    try:
        row = [float(value) for value in scores]
    except TypeError:
        # A single-output model yields a bare scalar, which is not iterable.
        row = [float(scores)]

    if len(row) == 3:
        contradiction_s, entailment_s, neutral_s = row
    else:
        entailment_s = row[0] if row else 0.0
        contradiction_s = 0.0
        neutral_s = 1.0 - entailment_s

    logits = (entailment_s, contradiction_s, neutral_s)
    peak = max(logits)  # subtract the max for numerical stability
    weights = [math.exp(value - peak) for value in logits]
    total = sum(weights)
    return weights[0] / total, weights[1] / total, weights[2] / total


def _build_entailment_result(
    attr: ClaimAttribution,
    fact_id: str,
    fact_text: str,
    probs: tuple[float, float, float],
    *,
    used_model: bool,
) -> EntailmentResult:
    """Classify one (entailment, contradiction, neutral) triple and package it as a result."""
    ent_p, con_p, neu_p = probs
    label, confidence = _classify_from_probs(ent_p, con_p, neu_p)
    return EntailmentResult(
        claim_index=attr.claim_index,
        claim_text=attr.claim_text[:200],
        fact_id=fact_id,
        fact_text_preview=fact_text[:120],
        label=label,
        confidence=round(confidence, 4),
        entailment_score=round(ent_p, 4),
        contradiction_score=round(con_p, 4),
        neutral_score=round(neu_p, 4),
        used_model=used_model,
    )


def verify_entailment(
    attributions: list[ClaimAttribution],
    packed_facts: Sequence[PackedFact],
    *,
    config: ProvenanceConfig | None = None,
    _model_override: Any = None,
) -> list[EntailmentResult]:
    """Verify semantic entailment between grounded claims and their source facts.

    Only attributions typed CONTEXT_GROUNDED or MIXED, whose claim type is
    FACTUAL_CLAIM or HEDGE and which have at least one attributed fact, are
    checked.  Each is compared against its first attributed fact, with
    premise=fact and hypothesis=claim.

    When a model is available, all pairs are scored in one ``predict`` call.
    If no model is available, or inference or score handling raises, every
    pair is scored by ``_heuristic_entailment`` instead.  The result set is
    always produced entirely by one method; ``used_model`` on each result
    says which.

    Args:
        attributions: Scored claim attributions from the DPE pipeline.
        packed_facts: All envelope facts (for full-text lookup by fact id).
        config: ProvenanceConfig; supplies ``entailment_enabled`` and
            ``entailment_model``.  A default instance is used when omitted.
        _model_override: Override NLI model for testing (internal).

    Returns:
        One EntailmentResult per checked claim-fact pair, in attribution
        order; empty when entailment is disabled or nothing qualifies.
    """
    cfg = config or ProvenanceConfig()
    if not cfg.entailment_enabled:
        return []

    fact_lookup: dict[str, str] = {pf.fact_id: pf.text for pf in packed_facts}

    checked_attribution_types = (
        AttributionType.CONTEXT_GROUNDED,
        AttributionType.MIXED,
    )
    checked_claim_types = (ClaimType.FACTUAL_CLAIM, ClaimType.HEDGE)

    # Collect the (attribution, fact id, fact text) triples that need checking.
    pairs_to_check: list[tuple[ClaimAttribution, str, str]] = []
    for attr in attributions:
        if attr.attribution_type not in checked_attribution_types:
            continue
        if attr.claim_type not in checked_claim_types:
            continue
        if not attr.attributed_facts:
            continue

        top_fact = attr.attributed_facts[0]
        # Prefer the full envelope text; fall back to the stored preview.
        fact_text = fact_lookup.get(top_fact.fact_id, top_fact.fact_text_preview)
        if not fact_text.strip():
            continue

        pairs_to_check.append((attr, top_fact.fact_id, fact_text))

    if not pairs_to_check:
        return []

    # Resolve the model only once there is work to do, so an empty run never
    # triggers a model load.  ``is not None`` keeps a falsy override object.
    if _model_override is not None:
        model = _model_override
    else:
        model = _get_nli_model(cfg.entailment_model)

    if model is not None:
        nli_inputs = [
            (fact_text, attr.claim_text)  # premise=fact, hypothesis=claim
            for attr, _, fact_text in pairs_to_check
        ]
        try:
            raw_scores = model.predict(nli_inputs)
            # Build into a separate list: a failure part-way through must not
            # leave model results mixed with the heuristic ones below.
            model_results = [
                _build_entailment_result(
                    attr,
                    fact_id,
                    fact_text,
                    _nli_probabilities(raw_scores[i]),
                    used_model=True,
                )
                for i, (attr, fact_id, fact_text) in enumerate(pairs_to_check)
            ]
        except Exception as exc:
            logger.warning("NLI inference failed, falling back to heuristic: %s", exc)
        else:
            return model_results

    # Heuristic fallback
    return [
        _build_entailment_result(
            attr,
            fact_id,
            fact_text,
            _heuristic_entailment(attr.claim_text, fact_text),
            used_model=False,
        )
        for attr, fact_id, fact_text in pairs_to_check
    ]
344
+
345
+
346
def _classify_from_probs(
    ent_p: float,
    con_p: float,
    neu_p: float,
) -> tuple[EntailmentLabel, float]:
    """Pick the label with the highest probability and return it with that probability.

    Ties resolve in the order CONTRADICTION, ENTAILED, NEUTRAL.
    """
    top = max(ent_p, con_p, neu_p)
    ranked = (
        (EntailmentLabel.CONTRADICTION, con_p),
        (EntailmentLabel.ENTAILED, ent_p),
    )
    for label, prob in ranked:
        if prob == top:
            return label, prob
    return EntailmentLabel.NEUTRAL, neu_p
@@ -0,0 +1,203 @@
1
+ # Copyright © 2025 Constantinos Vidiniotis. All rights reserved.
2
+ # Licensed under Elastic License 2.0 — see LICENSE.md for details.
3
+ """Fabrication Detector — catch invented entities not in any source fact.
4
+
5
+ The model outputs "According to the 2024 Johnson report, revenue grew
6
+ 23%." The envelope contains no entity "Johnson", no year "2024", no
7
+ number "23". The model fabricated a citation to sound authoritative.
8
+
9
+ This module extracts specific entities from claims (numbers, percentages,
10
+ dates, proper nouns, citations) and cross-references them against ALL
11
+ envelope facts. Entities found in no source are flagged as fabrications.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import re
17
+ from collections.abc import Sequence
18
+
19
+ from crp.envelope.packer import PackedFact
20
+
21
+ from ._types import (
22
+ ClaimAttribution,
23
+ ClaimType,
24
+ FabricationResult,
25
+ FabricationType,
26
+ )
27
+
28
+
29
+ # ---------------------------------------------------------------------------
30
+ # Entity extraction patterns
31
+ # ---------------------------------------------------------------------------
32
+
33
+ # Percentages: "15%", "3.2%", "0.5 %"
34
+ _PCT_RE = re.compile(r"\b(\d+(?:\.\d+)?)\s*%")
35
+
36
+ # Numbers with context (skip very small/common numbers 0-9):
37
+ # "$1.2M", "1,234", "45.6", but not "a", "the", single digits
38
+ _NUM_RE = re.compile(r"(?<![a-zA-Z])(\d[\d,]*(?:\.\d+)?)")
39
+
40
+ # Dates: "2024", "2023-01-15", "January 2024", "Q3 2023"
41
+ _DATE_RE = re.compile(
42
+ r"\b("
43
+ r"(?:19|20)\d{2}(?:-\d{2}(?:-\d{2})?)?" # 2024, 2024-01-15
44
+ r"|(?:January|February|March|April|May|June"
45
+ r"|July|August|September|October|November|December)"
46
+ r"\s+(?:19|20)\d{2}" # January 2024
47
+ r"|Q[1-4]\s+(?:19|20)\d{2}" # Q3 2023
48
+ r")\b"
49
+ )
50
+
51
+ # Proper nouns: 2+ capitalized words in sequence
52
+ _PROPER_RE = re.compile(r"\b([A-Z][a-z]+(?:\s+[A-Z][a-z]+)+)\b")
53
+
54
+ # Citation-like: "according to X", "X et al.", "the X report/study/paper"
55
+ _CITATION_RE = re.compile(
56
+ r"(?:"
57
+ r"according\s+to\s+([A-Z][\w\s]+?)(?:\s*,|\s+\(|\s+report)"
58
+ r"|\b([A-Z][a-z]+\s+et\s+al\.?)"
59
+ r"|the\s+([A-Z][\w\s]+?)\s+(?:report|study|paper|analysis|survey)"
60
+ r")",
61
+ re.IGNORECASE,
62
+ )
63
+
64
+ # Trivial numbers to skip (too common to be meaningful)
65
+ _TRIVIAL_NUMBERS = frozenset({
66
+ "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
67
+ "100", "1000",
68
+ })
69
+
70
+
71
+ # ---------------------------------------------------------------------------
72
+ # Internal helpers
73
+ # ---------------------------------------------------------------------------
74
+
75
+
76
+ def _entity_in_facts(entity: str, fact_texts: Sequence[str]) -> bool:
77
+ """Check if entity appears in any fact using word-boundary matching.
78
+
79
+ Uses per-fact matching (not a single concatenated corpus) to avoid
80
+ false negatives where entity fragments span fact boundaries.
81
+ Uses regex word-boundary matching to avoid substring false positives
82
+ (e.g., "23" matching inside "1234").
83
+ """
84
+ # Escape the entity for safe regex use, then match with word boundaries
85
+ pattern = re.compile(
86
+ r"(?<!\d)" + re.escape(entity.lower()) + r"(?!\d)"
87
+ if entity.strip().replace(".", "").replace(",", "").isdigit()
88
+ else r"\b" + re.escape(entity.lower()) + r"\b",
89
+ re.IGNORECASE,
90
+ )
91
+ return any(pattern.search(fact) for fact in fact_texts)
92
+
93
+
94
+ # ---------------------------------------------------------------------------
95
+ # Public API
96
+ # ---------------------------------------------------------------------------
97
+
98
+
99
def _unsourced_entity(
    attr: ClaimAttribution,
    entity: str,
    entity_type: FabricationType,
    severity: float,
    label: str,
) -> FabricationResult:
    """Build the FabricationResult for one entity that no source fact contains."""
    return FabricationResult(
        claim_index=attr.claim_index,
        claim_text=attr.claim_text[:200],
        fabricated_entity=entity,
        entity_type=entity_type,
        severity=severity,
        detail=f"{label} '{entity}' not found in any source fact",
    )


def detect_fabrications(
    attributions: list[ClaimAttribution],
    packed_facts: Sequence[PackedFact],
) -> list[FabricationResult]:
    """Detect fabricated entities in claims that appear in no source fact.

    Only FACTUAL_CLAIM and HEDGE claims are examined.  For each claim the
    checks run in this order, and results are appended in the same order:
    percentages, numbers, dates, citations, proper nouns.

    A numeric token is reported once only:
      * A number followed by optional whitespace and ``%`` is left to the
        percentage check.
      * A number lying inside a matched date (``"2024"``, or the parts of
        ``"2023-01-15"``) is left to the date check.
      * Values listed in ``_TRIVIAL_NUMBERS`` are never reported as numbers.

    Args:
        attributions: Scored claim attributions.
        packed_facts: All envelope facts.

    Returns:
        List of FabricationResult, one per fabricated entity found.  Empty
        when there are no facts to compare against.
    """
    if not packed_facts:
        # Without reference material nothing can be meaningfully flagged.
        return []

    fact_texts = [pf.text.lower() for pf in packed_facts]
    results: list[FabricationResult] = []

    for attr in attributions:
        if attr.claim_type not in (ClaimType.FACTUAL_CLAIM, ClaimType.HEDGE):
            continue

        claim = attr.claim_text
        # Needed by both the number check (overlap test) and the date check.
        date_matches = list(_DATE_RE.finditer(claim))

        # --- Check 1: Percentages ---
        for m in _PCT_RE.finditer(claim):
            # Look up the bare number, but report the full "15%" text.
            if not _entity_in_facts(m.group(1), fact_texts):
                results.append(_unsourced_entity(
                    attr, m.group(0), FabricationType.PERCENTAGE, 0.80, "Percentage",
                ))

        # --- Check 2: Non-trivial numbers ---
        for m in _NUM_RE.finditer(claim):
            # The pattern may swallow a trailing comma ("2024,"); ignore it
            # when working out where the digits end.
            token = m.group(1).rstrip(",")
            num_raw = token.replace(",", "")
            if num_raw in _TRIVIAL_NUMBERS:
                continue
            # Already handled as a percentage; whitespace before "%" is
            # allowed, as it is in _PCT_RE.
            if claim[m.end():].lstrip().startswith("%"):
                continue
            start = m.start(1)
            end = start + len(token)
            # Part of a date: the date check reports it.
            if any(d.start() <= start and end <= d.end() for d in date_matches):
                continue
            if not _entity_in_facts(num_raw, fact_texts):
                results.append(_unsourced_entity(
                    attr, num_raw, FabricationType.NUMBER, 0.70, "Number",
                ))

        # --- Check 3: Dates ---
        for m in date_matches:
            date_str = m.group(0)
            if not _entity_in_facts(date_str, fact_texts):
                results.append(_unsourced_entity(
                    attr, date_str, FabricationType.DATE, 0.75, "Date",
                ))

        # --- Check 4: Citations ---
        for m in _CITATION_RE.finditer(claim):
            # Exactly one of the three alternatives captures the cited source.
            citation = (m.group(1) or m.group(2) or m.group(3) or "").strip()
            if citation and not _entity_in_facts(citation, fact_texts):
                results.append(_unsourced_entity(
                    attr, citation, FabricationType.CITATION, 0.90, "Citation",
                ))

        # --- Check 5: Proper nouns ---
        for m in _PROPER_RE.finditer(claim):
            name = m.group(0)
            if not _entity_in_facts(name, fact_texts):
                results.append(_unsourced_entity(
                    attr, name, FabricationType.PROPER_NOUN, 0.65, "Name",
                ))

    return results