mcp-agentic-pipelines 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +93 -0
- package/README.md +258 -0
- package/package.json +70 -0
- package/packages/clinical/package.json +22 -0
- package/packages/clinical/src/index.ts +262 -0
- package/packages/clinical/tsconfig.json +13 -0
- package/packages/core/package.json +21 -0
- package/packages/core/src/config.ts +138 -0
- package/packages/core/src/errors.ts +100 -0
- package/packages/core/src/index.ts +104 -0
- package/packages/core/src/llm-config.ts +213 -0
- package/packages/core/src/logging.ts +66 -0
- package/packages/core/src/python-bridge.ts +384 -0
- package/packages/core/src/rate-limiter.ts +136 -0
- package/packages/core/src/types.ts +203 -0
- package/packages/core/src/validation.ts +101 -0
- package/packages/core/tsconfig.json +10 -0
- package/packages/deeppipe/package.json +21 -0
- package/packages/deeppipe/src/index.ts +424 -0
- package/packages/deeppipe/tsconfig.json +13 -0
- package/packages/piste/package.json +20 -0
- package/packages/piste/src/index.ts +48 -0
- package/packages/piste/tsconfig.json +13 -0
- package/packages/precis/package.json +20 -0
- package/packages/precis/src/index.ts +67 -0
- package/packages/precis/tsconfig.json +13 -0
- package/packages/server/package.json +31 -0
- package/packages/server/src/index.ts +427 -0
- package/packages/server/tsconfig.json +17 -0
- package/setup.mjs +141 -0
- package/test.mjs +337 -0
- package/vendors/clinical-intake/pipeline.mjs +349 -0
- package/vendors/clinical-intake/questions/en.txt +9 -0
- package/vendors/clinical-intake/questions/fr.txt +9 -0
- package/vendors/piste/.env.example +73 -0
- package/vendors/piste/app/core/__init__.py +4 -0
- package/vendors/piste/app/core/config.py +83 -0
- package/vendors/piste/app/core/debuglog.py +16 -0
- package/vendors/piste/app/core/middleware.py +40 -0
- package/vendors/piste/bridge_piste.py +301 -0
- package/vendors/piste/pipeline/__init__.py +4 -0
- package/vendors/piste/pipeline/compiler.py +68 -0
- package/vendors/piste/pipeline/offline/__init__.py +28 -0
- package/vendors/piste/pipeline/offline/verifaid_pipeline.py +247 -0
- package/vendors/piste/pipeline/replay.py +15 -0
- package/vendors/piste/pipeline/replay_engine.py +249 -0
- package/vendors/piste/pipeline/signatures/__init__.py +4 -0
- package/vendors/piste/pipeline/signatures/signatures.py +136 -0
- package/vendors/piste/pipeline/stage1/__init__.py +21 -0
- package/vendors/piste/pipeline/stage1/atomic_decomposer.py +61 -0
- package/vendors/piste/pipeline/stage1/check_worthiness.py +100 -0
- package/vendors/piste/pipeline/stage1/orchestrator.py +175 -0
- package/vendors/piste/pipeline/stage1/test_stage1.py +162 -0
- package/vendors/piste/pipeline/stage2/__init__.py +34 -0
- package/vendors/piste/pipeline/stage2/blind_retriever.py +303 -0
- package/vendors/piste/pipeline/stage2/canonical_mapper.py +124 -0
- package/vendors/piste/pipeline/stage2/credibility_scorer.py +85 -0
- package/vendors/piste/pipeline/stage2/orchestrator.py +311 -0
- package/vendors/piste/pipeline/stage2/query_refiner.py +88 -0
- package/vendors/piste/pipeline/stage2/search_decision.py +69 -0
- package/vendors/piste/pipeline/stage2/test_stage2.py +265 -0
- package/vendors/piste/pipeline/stage3/__init__.py +20 -0
- package/vendors/piste/pipeline/stage3/classifier.py +79 -0
- package/vendors/piste/pipeline/stage3/orchestrator.py +225 -0
- package/vendors/piste/pipeline/stage3/test_stage3.py +101 -0
- package/vendors/piste/pipeline/stage4/__init__.py +33 -0
- package/vendors/piste/pipeline/stage4/criticality_gate.py +177 -0
- package/vendors/piste/pipeline/stage4/orchestrator.py +269 -0
- package/vendors/piste/pipeline/stage4/test_stage4.py +192 -0
- package/vendors/piste/pipeline/stage4/verdict_aggregator.py +157 -0
- package/vendors/piste/requirements.txt +53 -0
- package/vendors/precis/backend/__init__.py +6 -0
- package/vendors/precis/backend/agents/__init__.py +3 -0
- package/vendors/precis/backend/agents/data_synthesis.py +105 -0
- package/vendors/precis/backend/agents/dist_free_synth.py +97 -0
- package/vendors/precis/backend/agents/exact_hash_retriever.py +327 -0
- package/vendors/precis/backend/agents/fusion_ranker.py +64 -0
- package/vendors/precis/backend/agents/guardrail.py +175 -0
- package/vendors/precis/backend/agents/query_expander.py +89 -0
- package/vendors/precis/backend/agents/radial_interpol.py +99 -0
- package/vendors/precis/backend/agents/report_generator.py +92 -0
- package/vendors/precis/backend/agents/semantic_reranker.py +135 -0
- package/vendors/precis/backend/agents/stat_anomaly.py +93 -0
- package/vendors/precis/backend/agents/vector_index.py +123 -0
- package/vendors/precis/backend/agents/veri_score.py +341 -0
- package/vendors/precis/backend/agents/work_order_extractor.py +205 -0
- package/vendors/precis/backend/api/__init__.py +3 -0
- package/vendors/precis/backend/api/routes/__init__.py +3 -0
- package/vendors/precis/backend/config.py +88 -0
- package/vendors/precis/backend/core/__init__.py +13 -0
- package/vendors/precis/backend/core/hashing.py +22 -0
- package/vendors/precis/backend/core/metrics.py +77 -0
- package/vendors/precis/backend/core/multitoken.py +166 -0
- package/vendors/precis/backend/core/pmi.py +54 -0
- package/vendors/precis/backend/core/stemming.py +74 -0
- package/vendors/precis/backend/core/tracing.py +150 -0
- package/vendors/precis/backend/data/__init__.py +3 -0
- package/vendors/precis/backend/data/chunker.py +57 -0
- package/vendors/precis/backend/data/pdf_parser.py +42 -0
- package/vendors/precis/backend/db/__init__.py +3 -0
- package/vendors/precis/backend/db/models.py +173 -0
- package/vendors/precis/backend/db/repository.py +269 -0
- package/vendors/precis/backend/llm/__init__.py +3 -0
- package/vendors/precis/backend/llm/anthropic_provider.py +39 -0
- package/vendors/precis/backend/llm/base.py +147 -0
- package/vendors/precis/backend/llm/deepseek_provider.py +43 -0
- package/vendors/precis/backend/llm/factory.py +60 -0
- package/vendors/precis/backend/llm/google_provider.py +39 -0
- package/vendors/precis/backend/llm/ollama_provider.py +54 -0
- package/vendors/precis/backend/llm/openai_provider.py +50 -0
- package/vendors/precis/backend/main.py +677 -0
- package/vendors/precis/backend/orchestrator/__init__.py +3 -0
- package/vendors/precis/backend/orchestrator/planner.py +81 -0
- package/vendors/precis/backend/orchestrator/router.py +319 -0
- package/vendors/precis/backend/orchestrator/types.py +58 -0
- package/vendors/precis/bridge_precis.py +185 -0
- package/vendors/precis/data/sample_reports/README.md +8 -0
- package/vendors/precis/data/seed_data.py +115 -0
- package/vendors/precis/requirements.txt +19 -0
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
# Copyright (c) 2026 Jinan Kordab
|
|
2
|
+
# SPDX-License-Identifier: MIT
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Stage 4b: Verdict Aggregator [J5]
|
|
6
|
+
====================================
|
|
7
|
+
DSPy-powered synthesis of per-source classifications into a
|
|
8
|
+
7-way PolitiFact-aligned verdict.
|
|
9
|
+
|
|
10
|
+
Jewel [J5] — DSPy typed Signatures with auto-optimization:
|
|
11
|
+
- Weighted by source credibility scores
|
|
12
|
+
- Generates natural language explanation with citations
|
|
13
|
+
- Returns probability distribution over all 7 verdict labels
|
|
14
|
+
- Model-agnostic (swap LLMs without changing code)
|
|
15
|
+
|
|
16
|
+
Verdict labels:
|
|
17
|
+
TRUE, MOSTLY_TRUE, HALF_TRUE, MOSTLY_FALSE, FALSE, PANTS_ON_FIRE, UNVERIFIABLE
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
import json
|
|
21
|
+
import dspy
|
|
22
|
+
from typing import Tuple, Dict
|
|
23
|
+
from pipeline.signatures.signatures import VerdictAggregationSignature
|
|
24
|
+
from pipeline.stage3.orchestrator import ClassificationResult
|
|
25
|
+
|
|
26
|
+
# The seven PolitiFact-aligned verdict labels accepted by the aggregator.
# Order matters only for display: graded truthfulness first, UNVERIFIABLE last.
VERDICT_LABELS = [
    "TRUE", "MOSTLY_TRUE", "HALF_TRUE",
    "MOSTLY_FALSE", "FALSE", "PANTS_ON_FIRE",
    "UNVERIFIABLE",
]
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class VerdictAggregator(dspy.Module):
    """
    DSPy module that synthesizes per-source classifications into a
    final 7-way verdict with explanation and probability distribution.

    The module wraps ``dspy.ChainOfThought(VerdictAggregationSignature)``
    and post-processes the raw prediction so that callers always receive:

    - a verdict that is one of ``VERDICT_LABELS``,
    - a confidence that is a float in ``[0.0, 1.0]``,
    - a distribution dict that contains every label in ``VERDICT_LABELS``.
    """

    # Lookup keyed by the verdict text with everything except letters and
    # digits removed, so "MOSTLY TRUE", "mostly-true", "MOSTLY_TRUE" and
    # "Mostly True." all resolve to the same label.
    _VERDICT_ALIASES = {
        "TRUE": "TRUE",
        "MOSTLYTRUE": "MOSTLY_TRUE",
        "HALFTRUE": "HALF_TRUE",
        "MOSTLYFALSE": "MOSTLY_FALSE",
        "FALSE": "FALSE",
        "PANTSONFIRE": "PANTS_ON_FIRE",
        "UNVERIFIABLE": "UNVERIFIABLE",
        "NOTENOUGHINFORMATION": "UNVERIFIABLE",
        "NEI": "UNVERIFIABLE",
    }

    def __init__(self):
        super().__init__()
        self.aggregate = dspy.ChainOfThought(VerdictAggregationSignature)

    def forward(
        self,
        claim: str,
        classifications: list[ClassificationResult],
        locale: str = "en",
    ) -> Tuple[str, float, str, Dict[str, float]]:
        """
        Aggregate per-source classifications into final verdict.

        Args:
            claim: The atomic claim being verified.
            classifications: Per-source classification results from Stage 3.
            locale: Language locale passed through to the signature (en, fr).

        Returns:
            verdict: One of the 7 verdict labels (unrecognised model output
                is mapped to ``UNVERIFIABLE``).
            confidence: Overall confidence clamped to 0.0-1.0; 0.0 when the
                model output cannot be parsed as a number.
            explanation: The explanation text returned by the model.
            distribution: Weight for each verdict label; labels the model
                omitted are filled with 0.0.
        """
        classifications_json = self._build_classifications_payload(
            claim, classifications
        )

        result = self.aggregate(
            claim=claim,
            classifications_json=classifications_json,
            locale=locale,
        )

        # A missing/None verdict is treated like any other unrecognised text.
        verdict = self._normalize_verdict(str(result.verdict or ""))

        try:
            parsed = json.loads(result.distribution_json)
        except (json.JSONDecodeError, TypeError):
            parsed = None
        distribution = self._coerce_distribution(parsed, verdict)

        return (
            verdict,
            self._parse_confidence(result.confidence),
            result.explanation,
            distribution,
        )

    def _build_classifications_payload(
        self,
        claim: str,
        classifications: list[ClassificationResult],
    ) -> str:
        """Build JSON payload summarizing per-source classifications.

        ``claim`` is accepted for interface compatibility; it is not part of
        the payload because it is passed to the signature separately.
        """
        summary = [
            {
                "source_index": index,
                "source_domain": item.source_domain,
                "credibility_score": item.credibility_score,
                "label": item.label,
                "confidence": item.confidence,
                "rationale": item.rationale,
            }
            for index, item in enumerate(classifications)
        ]
        return json.dumps(summary, indent=2)

    def _normalize_verdict(self, raw: str) -> str:
        """Normalize LLM output to a valid verdict label.

        Matching is exact after stripping case, whitespace, punctuation and
        underscores; there is deliberately no substring matching, so text
        such as "NOT TRUE" falls through to ``UNVERIFIABLE`` instead of
        being misread as ``TRUE``.
        """
        compact = "".join(ch for ch in str(raw).upper() if ch.isalnum())
        return self._VERDICT_ALIASES.get(compact, "UNVERIFIABLE")

    def _coerce_distribution(self, parsed, verdict: str) -> Dict[str, float]:
        """Turn decoded ``distribution_json`` into a complete label->float dict.

        Anything that is not a JSON object (list, string, number, None) is
        replaced by the one-hot default for ``verdict``. Values that cannot
        be converted to float become 0.0. Keys supplied by the model are
        kept as-is; every label in ``VERDICT_LABELS`` is guaranteed present.
        """
        if not isinstance(parsed, dict):
            return self._default_distribution(verdict)

        distribution: Dict[str, float] = {}
        for key, value in parsed.items():
            try:
                distribution[key] = float(value)
            except (TypeError, ValueError):
                distribution[key] = 0.0
        for label in VERDICT_LABELS:
            distribution.setdefault(label, 0.0)
        return distribution

    @staticmethod
    def _parse_confidence(raw) -> float:
        """Convert the model's confidence to a float clamped to [0.0, 1.0]."""
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return 0.0
        if value != value:  # NaN is the only float not equal to itself
            return 0.0
        return min(1.0, max(0.0, value))

    def _default_distribution(self, verdict: str) -> Dict[str, float]:
        """Create a default distribution centered on the given verdict."""
        dist = {label: 0.0 for label in VERDICT_LABELS}
        dist[verdict] = 1.0
        return dist
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
# Singleton
|
|
157
|
+
# Module-level instance, constructed when this module is first imported.
verdict_aggregator = VerdictAggregator()
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# ============================================================
|
|
2
|
+
# Piste — Backend Dependencies
|
|
3
|
+
# FastAPI 0.115 + DSPy 2.6 + LiteLLM + PostgreSQL + Redis
|
|
4
|
+
# ============================================================
|
|
5
|
+
|
|
6
|
+
# --- Web Framework ---
|
|
7
|
+
fastapi==0.115.6
|
|
8
|
+
uvicorn[standard]==0.34.0
|
|
9
|
+
sse-starlette==2.2.1
|
|
10
|
+
|
|
11
|
+
# --- Validation ---
|
|
12
|
+
pydantic==2.10.5
|
|
13
|
+
pydantic-settings==2.7.1
|
|
14
|
+
|
|
15
|
+
# --- Database ---
|
|
16
|
+
sqlalchemy[asyncio]==2.0.36
|
|
17
|
+
asyncpg==0.30.0
|
|
18
|
+
alembic==1.14.1
|
|
19
|
+
psycopg2-binary==2.9.10
|
|
20
|
+
|
|
21
|
+
# --- Cache ---
|
|
22
|
+
redis==5.2.1
|
|
23
|
+
hiredis==3.0.0
|
|
24
|
+
|
|
25
|
+
# --- AI / ML Pipeline ---
|
|
26
|
+
dspy-ai
|
|
27
|
+
litellm
|
|
28
|
+
faiss-cpu
|
|
29
|
+
numpy
|
|
30
|
+
|
|
31
|
+
# --- HTTP / SSE / Web ---
|
|
32
|
+
httpx
|
|
33
|
+
aiohttp
|
|
34
|
+
|
|
35
|
+
# --- Observability ---
|
|
36
|
+
prometheus-client==0.21.1
|
|
37
|
+
opentelemetry-api==1.29.0
|
|
38
|
+
opentelemetry-sdk==1.29.0
|
|
39
|
+
opentelemetry-instrumentation-fastapi==0.50b0
|
|
40
|
+
|
|
41
|
+
# --- Utilities ---
|
|
42
|
+
python-dotenv==1.0.1
|
|
43
|
+
python-multipart==0.0.19
|
|
44
|
+
tenacity==9.0.0
|
|
45
|
+
orjson==3.10.13
|
|
46
|
+
|
|
47
|
+
# --- Auth ---
|
|
48
|
+
python-jose[cryptography]==3.3.0
|
|
49
|
+
passlib[bcrypt]==1.7.4
|
|
50
|
+
|
|
51
|
+
# --- Testing ---
|
|
52
|
+
pytest==8.3.4
|
|
53
|
+
pytest-asyncio==0.25.0
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
# =============================================================================
|
|
2
|
+
# © JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT
|
|
3
|
+
# =============================================================================
|
|
4
|
+
# Package initialization. Import key classes here for convenient access:
|
|
5
|
+
# from backend import PrecisOrchestrator, ExactHashRetriever, ...
|
|
6
|
+
# =============================================================================
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, List, Optional
|
|
4
|
+
|
|
5
|
+
from backend.orchestrator.types import AgentResult
|
|
6
|
+
from backend.llm.base import LLMProvider
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DataSynthesisAgent:
    """Combines results from upstream agents into a coherent synthesis using LLM reasoning.

    The *llm* parameter on __init__ serves as a default; individual calls to
    synthesize() may override it.  If no provider is available at call time
    the agent falls back to returning raw fragments — this is intentional so
    the pipeline never breaks, and a warning is logged when it happens.

    The class attributes below are the limits used by synthesize(); override
    them on a subclass or instance to tune behaviour.
    """

    MAX_FRAGMENTS = 50             # fragments included in the prompt context
    MAX_TOKENS = 250               # max_tokens passed to provider.generate()
    LLM_TIMEOUT_SECONDS = 30       # timeout for the provider call
    ERROR_CONTEXT_CHARS = 500      # context excerpt length on timeout/error
    FALLBACK_CONTEXT_CHARS = 1000  # context excerpt length when no provider

    def __init__(self, llm: Optional[LLMProvider] = None) -> None:
        self.llm = llm

    async def synthesize(
        self,
        query: str,
        upstream_results: List[AgentResult],
        llm: Optional[LLMProvider] = None,
    ) -> AgentResult:
        """Synthesize results from multiple upstream agents into a single answer.

        Parameters
        ----------
        query : str
            The original user query.
        upstream_results : List[AgentResult]
            Results from upstream retrieval / analysis agents.
        llm : Optional[LLMProvider]
            Per-call override for the LLM provider.  Falls back to self.llm.

        Returns
        -------
        AgentResult
            Always ``success=True``.  ``data["synthesis"]`` holds the LLM
            answer, or a fallback message plus a context excerpt when the
            provider is missing, times out, or raises.
            ``data["source_fragments"]`` is the number of fragments collected.
        """
        # Function-scope imports keep this block self-contained.
        import asyncio
        import logging

        log = logging.getLogger(__name__)

        # Explicit None check: do not depend on the provider's truthiness.
        provider = llm if llm is not None else self.llm

        fragments = self._collect_fragments(upstream_results)
        if fragments:
            combined_context = "\n".join(fragments[: self.MAX_FRAGMENTS])
        else:
            combined_context = "(no data retrieved)"

        # ── LLM synthesis (or graceful fallback) ─────────────────
        if provider is not None and fragments:
            prompt = (
                "You are a precise data synthesis agent. Synthesize the following retrieved\n"
                "information into a concise answer to the user's query.\n\n"
                f"USER QUERY: {query}\n\n"
                f"RETRIEVED DATA:\n{combined_context}\n\n"
                "SYNTHESIS: Answer the query using ONLY the retrieved data above. "
                "If the data is insufficient, state what's missing. "
                "Be specific with numbers, percentages, and entity names found in the data."
            )
            try:
                synthesis_text = await asyncio.wait_for(
                    provider.generate(prompt, max_tokens=self.MAX_TOKENS),
                    timeout=self.LLM_TIMEOUT_SECONDS,
                )
            except asyncio.TimeoutError:
                log.warning(
                    "LLM synthesis timed out after %ss; returning raw context excerpt",
                    self.LLM_TIMEOUT_SECONDS,
                )
                synthesis_text = (
                    f"(LLM synthesis timed out. Retrieved {len(fragments)} fragments.)\n"
                    f"{combined_context[:self.ERROR_CONTEXT_CHARS]}"
                )
            except Exception as e:
                # Deliberate best-effort: any provider failure degrades to a
                # raw excerpt instead of breaking the pipeline.
                log.warning("LLM synthesis failed: %s", e, exc_info=True)
                synthesis_text = (
                    f"(LLM synthesis error: {e})\n"
                    f"{combined_context[:self.ERROR_CONTEXT_CHARS]}"
                )
        elif fragments:
            log.warning("No LLM provider available; returning raw fragments")
            synthesis_text = (
                f"Retrieved {len(fragments)} fragments:\n"
                f"{combined_context[:self.FALLBACK_CONTEXT_CHARS]}"
            )
        else:
            synthesis_text = "(No data retrieved — unable to synthesize a response.)"

        return AgentResult(
            subtask_id="synthesis",
            agent_name="DataSynthesis",
            success=True,
            data={
                "synthesis": synthesis_text,
                "source_fragments": len(fragments),
            },
            citations=[],
        )

    @staticmethod
    def _collect_fragments(upstream_results) -> List[str]:
        """Extract non-empty text fragments from successful upstream results.

        String payloads are taken verbatim.  Dict payloads contribute the
        ``text`` of each dict item under ``results`` (list/tuple texts are
        space-joined) followed by any ``synthesis`` value.  Failed results,
        empty payloads and other payload types are skipped.
        """
        fragments: List[str] = []
        for res in upstream_results or []:
            if not res.success or not res.data:
                continue
            payload = res.data
            if isinstance(payload, str):
                fragments.append(payload)
                continue
            if not isinstance(payload, dict):
                continue
            # "results" may be present but None; treat that as no results.
            for item in payload.get("results") or []:
                if not isinstance(item, dict):
                    continue
                text = item.get("text", "")
                if isinstance(text, (list, tuple)):
                    text = " ".join(str(part) for part in text)
                if text:
                    fragments.append(str(text))
            # Also capture any pre-existing synthesis
            prior = payload.get("synthesis", "")
            if prior:
                fragments.append(str(prior))
        return fragments
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class SyntheticDataset:
    """Result of one DistFreeSynth.generate() call."""

    dataframe: pd.DataFrame        # generated rows, one column per fitted feature
    n_rows: int                    # len(dataframe)
    n_features: int                # number of fitted features
    generation_mode: str           # the `mode` argument passed to generate()
    correlation_preserved: bool    # the `correlation_preserve` argument, as passed
    hellinger_distance: float      # distance between fitted and generated bin counts
    metadata: Dict = field(default_factory=dict)


class DistFreeSynth:
    """Bin-based, distribution-free synthesizer for tabular numeric data.

    fit() cuts every feature into quantile bins and counts how many
    observations fall into each multivariate bin (the combination of
    per-feature bin indices).  generate() then draws rows uniformly inside
    those multivariate bins.

    Bin keys in ``bin_counts`` / ``bin_obs`` are the ``str()`` of the list of
    per-feature bin indices, e.g. ``"[0, 3]"``.
    """

    def __init__(self, bins_per_feature: Optional[List[int]] = None) -> None:
        # Remember the constructor value separately: `bins_per_feature` is
        # overwritten by fit() with the bins actually used.
        self._configured_bins = (
            list(bins_per_feature) if bins_per_feature is not None else None
        )
        self.bins_per_feature = bins_per_feature
        self.pc_table: List[np.ndarray] = []
        self.bin_counts: Dict[str, int] = {}
        self.bin_obs: Dict[str, List[np.ndarray]] = {}
        self.features: List[str] = []
        self.n_features: int = 0
        self.n_original: int = 0

    def fit(self, df: pd.DataFrame, bins_per_feature: Optional[List[int]] = None) -> None:
        """Learn quantile bin edges and multivariate bin counts from *df*.

        Bin counts are resolved in this order: the *bins_per_feature*
        argument, the value given to the constructor, then a default of
        ``max(5, int(sqrt(n_rows)))`` bins for every feature.

        Raises
        ------
        ValueError
            If an explicit bins list has fewer entries than *df* has columns
            or contains a value below 1.
        """
        self.features = list(df.columns)
        self.n_features = len(self.features)
        self.n_original = len(df)

        if bins_per_feature is None:
            # getattr: tolerate instances created without __init__.
            bins_per_feature = getattr(self, "_configured_bins", None)
        if bins_per_feature is None:
            n = max(len(df), 1)
            bins = [max(5, int(np.sqrt(n)))] * self.n_features
        else:
            bins = [int(b) for b in bins_per_feature]
            if len(bins) < self.n_features:
                raise ValueError(
                    f"bins_per_feature has {len(bins)} entries but the data "
                    f"has {self.n_features} features"
                )
            if any(b < 1 for b in bins):
                raise ValueError("bins_per_feature values must be >= 1")
        self.bins_per_feature = bins

        self.pc_table = []
        self.bin_counts, self.bin_obs = {}, {}
        if self.n_original == 0:
            # Nothing to bin; leave the model empty instead of asking numpy
            # for quantiles of an empty column.
            return

        npdata = df.to_numpy()
        bin_index = np.empty(npdata.shape, dtype=int)
        for k in range(self.n_features):
            n_bins = bins[k]
            # n_bins + 1 evenly spaced quantile levels -> n_bins + 1 edges.
            edges = np.quantile(npdata[:, k], np.linspace(0.0, 1.0, n_bins + 1))
            self.pc_table.append(edges)
            # side="right" puts a value equal to an edge into the bin that
            # starts there; clipping keeps the maximum in the last bin.
            raw = np.searchsorted(edges, npdata[:, k], side="right") - 1
            bin_index[:, k] = np.clip(raw, 0, n_bins - 1)

        for obs, row_bins in zip(npdata, bin_index.tolist()):
            skey = str(row_bins)
            self.bin_counts[skey] = self.bin_counts.get(skey, 0) + 1
            self.bin_obs.setdefault(skey, []).append(obs)

    def generate(self, n_synth: int, mode: str = "random_counts",
                 correlation_preserve: bool = True, seed: Optional[int] = None) -> SyntheticDataset:
        """Generate synthetic rows from the fitted bins.

        Parameters
        ----------
        n_synth : int
            Number of rows to draw when ``mode == "random_counts"``.  For any
            other mode it is ignored and each bin is reproduced with its
            fitted count.
        mode : str
            ``"random_counts"`` samples bins in proportion to their fitted
            counts; any other value reuses the fitted counts unchanged.
        correlation_preserve : bool
            Recorded on the returned dataset; not otherwise used here.
        seed : Optional[int]
            When given, draws come from a private ``RandomState(seed)`` so
            the global NumPy RNG state is left untouched.  When omitted the
            global ``np.random`` functions are used.

        Raises
        ------
        ValueError
            In ``"random_counts"`` mode when no bins have been fitted.
        """
        import ast  # local import: only needed to decode the string bin keys

        rng = np.random.RandomState(seed) if seed is not None else np.random

        bin_keys = list(self.bin_counts)
        if mode == "random_counts":
            if not bin_keys:
                raise ValueError(
                    "DistFreeSynth has no fitted bins; call fit() with "
                    "non-empty data before generate()"
                )
            probs = np.array([self.bin_counts[k] for k in bin_keys], dtype=float)
            probs /= probs.sum()
            sampled = rng.choice(len(bin_keys), size=n_synth, p=probs)
            tallies = np.bincount(sampled, minlength=len(bin_keys))
            key_counts: Dict[str, int] = {
                k: int(c) for k, c in zip(bin_keys, tallies)
            }
        else:
            key_counts = dict(self.bin_counts)

        blocks = []
        for skey, count in key_counts.items():
            if count <= 0:
                continue
            # Keys are str() of a list of ints; literal_eval parses them
            # without executing arbitrary code.
            key = ast.literal_eval(skey)
            lower = np.array(
                [self.pc_table[k][key[k]] for k in range(self.n_features)], dtype=float
            )
            upper = np.array(
                [self.pc_table[k][key[k] + 1] for k in range(self.n_features)], dtype=float
            )
            # One call draws all rows of this bin, each feature within its edges.
            blocks.append(rng.uniform(lower, upper, size=(count, self.n_features)))

        synth = np.vstack(blocks) if blocks else np.empty((0, self.n_features))
        result = pd.DataFrame(synth, columns=self.features)
        hd = self._compute_hellinger(self.bin_counts, key_counts)
        return SyntheticDataset(dataframe=result, n_rows=len(result), n_features=self.n_features,
                                generation_mode=mode, correlation_preserved=correlation_preserve,
                                hellinger_distance=hd)

    def _compute_hellinger(self, real: Dict[str, int], synth: Dict[str, int]) -> float:
        """Return the Hellinger distance between two bin-count histograms.

        Counts are normalised to proportions (an all-zero histogram is
        treated as having total 1 to avoid division by zero).
        """
        keys = list(set(real) | set(synth))
        r_total = max(1, sum(real.values()))
        s_total = max(1, sum(synth.values()))
        p = np.array([real.get(k, 0) for k in keys], dtype=float) / r_total
        q = np.array([synth.get(k, 0) for k in keys], dtype=float) / s_total
        return float(np.sqrt(0.5 * np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)))
|