mcp-agentic-pipelines 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. package/.env.example +93 -0
  2. package/README.md +258 -0
  3. package/package.json +70 -0
  4. package/packages/clinical/package.json +22 -0
  5. package/packages/clinical/src/index.ts +262 -0
  6. package/packages/clinical/tsconfig.json +13 -0
  7. package/packages/core/package.json +21 -0
  8. package/packages/core/src/config.ts +138 -0
  9. package/packages/core/src/errors.ts +100 -0
  10. package/packages/core/src/index.ts +104 -0
  11. package/packages/core/src/llm-config.ts +213 -0
  12. package/packages/core/src/logging.ts +66 -0
  13. package/packages/core/src/python-bridge.ts +384 -0
  14. package/packages/core/src/rate-limiter.ts +136 -0
  15. package/packages/core/src/types.ts +203 -0
  16. package/packages/core/src/validation.ts +101 -0
  17. package/packages/core/tsconfig.json +10 -0
  18. package/packages/deeppipe/package.json +21 -0
  19. package/packages/deeppipe/src/index.ts +424 -0
  20. package/packages/deeppipe/tsconfig.json +13 -0
  21. package/packages/piste/package.json +20 -0
  22. package/packages/piste/src/index.ts +48 -0
  23. package/packages/piste/tsconfig.json +13 -0
  24. package/packages/precis/package.json +20 -0
  25. package/packages/precis/src/index.ts +67 -0
  26. package/packages/precis/tsconfig.json +13 -0
  27. package/packages/server/package.json +31 -0
  28. package/packages/server/src/index.ts +427 -0
  29. package/packages/server/tsconfig.json +17 -0
  30. package/setup.mjs +141 -0
  31. package/test.mjs +337 -0
  32. package/vendors/clinical-intake/pipeline.mjs +349 -0
  33. package/vendors/clinical-intake/questions/en.txt +9 -0
  34. package/vendors/clinical-intake/questions/fr.txt +9 -0
  35. package/vendors/piste/.env.example +73 -0
  36. package/vendors/piste/app/core/__init__.py +4 -0
  37. package/vendors/piste/app/core/config.py +83 -0
  38. package/vendors/piste/app/core/debuglog.py +16 -0
  39. package/vendors/piste/app/core/middleware.py +40 -0
  40. package/vendors/piste/bridge_piste.py +301 -0
  41. package/vendors/piste/pipeline/__init__.py +4 -0
  42. package/vendors/piste/pipeline/compiler.py +68 -0
  43. package/vendors/piste/pipeline/offline/__init__.py +28 -0
  44. package/vendors/piste/pipeline/offline/verifaid_pipeline.py +247 -0
  45. package/vendors/piste/pipeline/replay.py +15 -0
  46. package/vendors/piste/pipeline/replay_engine.py +249 -0
  47. package/vendors/piste/pipeline/signatures/__init__.py +4 -0
  48. package/vendors/piste/pipeline/signatures/signatures.py +136 -0
  49. package/vendors/piste/pipeline/stage1/__init__.py +21 -0
  50. package/vendors/piste/pipeline/stage1/atomic_decomposer.py +61 -0
  51. package/vendors/piste/pipeline/stage1/check_worthiness.py +100 -0
  52. package/vendors/piste/pipeline/stage1/orchestrator.py +175 -0
  53. package/vendors/piste/pipeline/stage1/test_stage1.py +162 -0
  54. package/vendors/piste/pipeline/stage2/__init__.py +34 -0
  55. package/vendors/piste/pipeline/stage2/blind_retriever.py +303 -0
  56. package/vendors/piste/pipeline/stage2/canonical_mapper.py +124 -0
  57. package/vendors/piste/pipeline/stage2/credibility_scorer.py +85 -0
  58. package/vendors/piste/pipeline/stage2/orchestrator.py +311 -0
  59. package/vendors/piste/pipeline/stage2/query_refiner.py +88 -0
  60. package/vendors/piste/pipeline/stage2/search_decision.py +69 -0
  61. package/vendors/piste/pipeline/stage2/test_stage2.py +265 -0
  62. package/vendors/piste/pipeline/stage3/__init__.py +20 -0
  63. package/vendors/piste/pipeline/stage3/classifier.py +79 -0
  64. package/vendors/piste/pipeline/stage3/orchestrator.py +225 -0
  65. package/vendors/piste/pipeline/stage3/test_stage3.py +101 -0
  66. package/vendors/piste/pipeline/stage4/__init__.py +33 -0
  67. package/vendors/piste/pipeline/stage4/criticality_gate.py +177 -0
  68. package/vendors/piste/pipeline/stage4/orchestrator.py +269 -0
  69. package/vendors/piste/pipeline/stage4/test_stage4.py +192 -0
  70. package/vendors/piste/pipeline/stage4/verdict_aggregator.py +157 -0
  71. package/vendors/piste/requirements.txt +53 -0
  72. package/vendors/precis/backend/__init__.py +6 -0
  73. package/vendors/precis/backend/agents/__init__.py +3 -0
  74. package/vendors/precis/backend/agents/data_synthesis.py +105 -0
  75. package/vendors/precis/backend/agents/dist_free_synth.py +97 -0
  76. package/vendors/precis/backend/agents/exact_hash_retriever.py +327 -0
  77. package/vendors/precis/backend/agents/fusion_ranker.py +64 -0
  78. package/vendors/precis/backend/agents/guardrail.py +175 -0
  79. package/vendors/precis/backend/agents/query_expander.py +89 -0
  80. package/vendors/precis/backend/agents/radial_interpol.py +99 -0
  81. package/vendors/precis/backend/agents/report_generator.py +92 -0
  82. package/vendors/precis/backend/agents/semantic_reranker.py +135 -0
  83. package/vendors/precis/backend/agents/stat_anomaly.py +93 -0
  84. package/vendors/precis/backend/agents/vector_index.py +123 -0
  85. package/vendors/precis/backend/agents/veri_score.py +341 -0
  86. package/vendors/precis/backend/agents/work_order_extractor.py +205 -0
  87. package/vendors/precis/backend/api/__init__.py +3 -0
  88. package/vendors/precis/backend/api/routes/__init__.py +3 -0
  89. package/vendors/precis/backend/config.py +88 -0
  90. package/vendors/precis/backend/core/__init__.py +13 -0
  91. package/vendors/precis/backend/core/hashing.py +22 -0
  92. package/vendors/precis/backend/core/metrics.py +77 -0
  93. package/vendors/precis/backend/core/multitoken.py +166 -0
  94. package/vendors/precis/backend/core/pmi.py +54 -0
  95. package/vendors/precis/backend/core/stemming.py +74 -0
  96. package/vendors/precis/backend/core/tracing.py +150 -0
  97. package/vendors/precis/backend/data/__init__.py +3 -0
  98. package/vendors/precis/backend/data/chunker.py +57 -0
  99. package/vendors/precis/backend/data/pdf_parser.py +42 -0
  100. package/vendors/precis/backend/db/__init__.py +3 -0
  101. package/vendors/precis/backend/db/models.py +173 -0
  102. package/vendors/precis/backend/db/repository.py +269 -0
  103. package/vendors/precis/backend/llm/__init__.py +3 -0
  104. package/vendors/precis/backend/llm/anthropic_provider.py +39 -0
  105. package/vendors/precis/backend/llm/base.py +147 -0
  106. package/vendors/precis/backend/llm/deepseek_provider.py +43 -0
  107. package/vendors/precis/backend/llm/factory.py +60 -0
  108. package/vendors/precis/backend/llm/google_provider.py +39 -0
  109. package/vendors/precis/backend/llm/ollama_provider.py +54 -0
  110. package/vendors/precis/backend/llm/openai_provider.py +50 -0
  111. package/vendors/precis/backend/main.py +677 -0
  112. package/vendors/precis/backend/orchestrator/__init__.py +3 -0
  113. package/vendors/precis/backend/orchestrator/planner.py +81 -0
  114. package/vendors/precis/backend/orchestrator/router.py +319 -0
  115. package/vendors/precis/backend/orchestrator/types.py +58 -0
  116. package/vendors/precis/bridge_precis.py +185 -0
  117. package/vendors/precis/data/sample_reports/README.md +8 -0
  118. package/vendors/precis/data/seed_data.py +115 -0
  119. package/vendors/precis/requirements.txt +19 -0
@@ -0,0 +1,157 @@
1
+ # Copyright (c) 2026 Jinan Kordab
2
+ # SPDX-License-Identifier: MIT
3
+
4
+ """
5
+ Stage 4b: Verdict Aggregator [J5]
6
+ ====================================
7
+ DSPy-powered synthesis of per-source classifications into a
8
+ 7-way PolitiFact-aligned verdict.
9
+
10
+ Jewel [J5] — DSPy typed Signatures with auto-optimization:
11
+ - Weighted by source credibility scores
12
+ - Generates natural language explanation with citations
13
+ - Returns probability distribution over all 7 verdict labels
14
+ - Model-agnostic (swap LLMs without changing code)
15
+
16
+ Verdict labels:
17
+ TRUE, MOSTLY_TRUE, HALF_TRUE, MOSTLY_FALSE, FALSE, PANTS_ON_FIRE, UNVERIFIABLE
18
+ """
19
+
20
+ import json
21
+ import dspy
22
+ from typing import Tuple, Dict
23
+ from pipeline.signatures.signatures import VerdictAggregationSignature
24
+ from pipeline.stage3.orchestrator import ClassificationResult
25
+
26
# The seven PolitiFact-aligned verdict labels. The list order is significant
# to anything that iterates it, so it is kept exactly as originally declared.
VERDICT_LABELS = [
    "TRUE", "MOSTLY_TRUE", "HALF_TRUE",
    "MOSTLY_FALSE", "FALSE", "PANTS_ON_FIRE",
    "UNVERIFIABLE",
]
36
+
37
+
38
class VerdictAggregator(dspy.Module):
    """
    DSPy module that synthesizes per-source classifications into a
    final 7-way verdict with explanation and probability distribution.

    The LLM call is delegated to ``dspy.ChainOfThought`` over
    ``VerdictAggregationSignature``; everything else in this class is
    defensive post-processing, because the model's output fields are free
    text and may not match the expected label / number / JSON shapes.
    """

    def __init__(self):
        super().__init__()
        self.aggregate = dspy.ChainOfThought(VerdictAggregationSignature)

    def forward(
        self,
        claim: str,
        classifications: list[ClassificationResult],
        locale: str = "en",
    ) -> Tuple[str, float, str, Dict[str, float]]:
        """
        Aggregate per-source classifications into a final verdict.

        Args:
            claim: The atomic claim being verified.
            classifications: Per-source classification results from Stage 3.
            locale: Language locale passed through to the signature (en, fr).

        Returns:
            A 4-tuple ``(verdict, confidence, explanation, distribution)``:
            verdict: One of ``VERDICT_LABELS`` (unrecognised model output
                maps to ``"UNVERIFIABLE"``).
            confidence: Float clamped to 0.0-1.0; 0.0 if the model returned
                something that is not a number.
            explanation: The model's explanation, passed through unchanged.
            distribution: Weight per verdict label; every label in
                ``VERDICT_LABELS`` is guaranteed to be present.
        """
        classifications_json = self._build_classifications_payload(
            claim, classifications
        )

        result = self.aggregate(
            claim=claim,
            classifications_json=classifications_json,
            locale=locale,
        )

        verdict = self._normalize_verdict(result.verdict)
        distribution = self._parse_distribution(result.distribution_json, verdict)

        return (
            verdict,
            self._parse_confidence(result.confidence),
            result.explanation,
            distribution,
        )

    def _build_classifications_payload(
        self,
        claim: str,
        classifications: list[ClassificationResult],
    ) -> str:
        """Build the JSON payload summarizing per-source classifications.

        ``claim`` is accepted for interface compatibility but is not part of
        the payload; the claim is passed to the signature separately.
        """
        summary = [
            {
                "source_index": index,
                "source_domain": item.source_domain,
                "credibility_score": item.credibility_score,
                "label": item.label,
                "confidence": item.confidence,
                "rationale": item.rationale,
            }
            for index, item in enumerate(classifications)
        ]
        return json.dumps(summary, indent=2)

    def _normalize_verdict(self, raw: str) -> str:
        """Map free-form LLM output onto a valid verdict label.

        Matching ignores case and every non-alphanumeric character, so
        ``"mostly true"``, ``"MOSTLY-TRUE"``, ``"Mostly_True."`` and
        ``"MOSTLYTRUE"`` all resolve to ``"MOSTLY_TRUE"``. Anything that is
        not recognised (including ``None`` or an empty string) becomes
        ``"UNVERIFIABLE"``.
        """
        text = "" if raw is None else str(raw)
        compact = "".join(ch for ch in text.upper() if ch.isalnum())

        aliases = {label.replace("_", ""): label for label in VERDICT_LABELS}
        # Common "no verdict" spellings used by fact-checking datasets.
        aliases["NOTENOUGHINFORMATION"] = "UNVERIFIABLE"
        aliases["NEI"] = "UNVERIFIABLE"

        return aliases.get(compact, "UNVERIFIABLE")

    def _parse_confidence(self, raw) -> float:
        """Coerce the model's confidence field to a float in [0.0, 1.0].

        Non-numeric or NaN output yields 0.0 instead of raising, so one
        malformed field does not abort the whole aggregation.
        """
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return 0.0
        if value != value:  # NaN is the only value not equal to itself
            return 0.0
        return min(1.0, max(0.0, value))

    def _parse_distribution(self, raw, verdict: str) -> Dict[str, float]:
        """Decode the model's distribution field into a complete label map.

        Accepts either a JSON string or an already-decoded dict. Falls back
        to ``_default_distribution(verdict)`` when the payload is not valid
        JSON, is not a JSON object, or contains non-numeric weights. Labels
        missing from the payload are filled in with 0.0; keys the model
        added beyond ``VERDICT_LABELS`` are kept as-is.
        """
        if isinstance(raw, dict):
            parsed = raw
        else:
            try:
                parsed = json.loads(raw)
            except (json.JSONDecodeError, TypeError):
                parsed = None

        if not isinstance(parsed, dict):
            return self._default_distribution(verdict)

        try:
            distribution = {str(key): float(value) for key, value in parsed.items()}
        except (TypeError, ValueError):
            return self._default_distribution(verdict)

        for label in VERDICT_LABELS:
            distribution.setdefault(label, 0.0)
        return distribution

    def _default_distribution(self, verdict: str) -> Dict[str, float]:
        """Create a distribution with all weight on the given verdict."""
        dist = {label: 0.0 for label in VERDICT_LABELS}
        dist[verdict] = 1.0
        return dist


# Singleton: constructed once when this module is imported.
verdict_aggregator = VerdictAggregator()
@@ -0,0 +1,53 @@
1
+ # ============================================================
2
+ # Piste — Backend Dependencies
3
+ # FastAPI 0.115 + DSPy 2.6 + LiteLLM + PostgreSQL + Redis
4
+ # ============================================================
5
+
6
+ # --- Web Framework ---
7
+ fastapi==0.115.6
8
+ uvicorn[standard]==0.34.0
9
+ sse-starlette==2.2.1
10
+
11
+ # --- Validation ---
12
+ pydantic==2.10.5
13
+ pydantic-settings==2.7.1
14
+
15
+ # --- Database ---
16
+ sqlalchemy[asyncio]==2.0.36
17
+ asyncpg==0.30.0
18
+ alembic==1.14.1
19
+ psycopg2-binary==2.9.10
20
+
21
+ # --- Cache ---
22
+ redis==5.2.1
23
+ hiredis==3.0.0
24
+
25
+ # --- AI / ML Pipeline ---
26
+ dspy-ai
27
+ litellm
28
+ faiss-cpu
29
+ numpy
30
+
31
+ # --- HTTP / SSE / Web ---
32
+ httpx
33
+ aiohttp
34
+
35
+ # --- Observability ---
36
+ prometheus-client==0.21.1
37
+ opentelemetry-api==1.29.0
38
+ opentelemetry-sdk==1.29.0
39
+ opentelemetry-instrumentation-fastapi==0.50b0
40
+
41
+ # --- Utilities ---
42
+ python-dotenv==1.0.1
43
+ python-multipart==0.0.19
44
+ tenacity==9.0.0
45
+ orjson==3.10.13
46
+
47
+ # --- Auth ---
48
+ python-jose[cryptography]==3.3.0
49
+ passlib[bcrypt]==1.7.4
50
+
51
+ # --- Testing ---
52
+ pytest==8.3.4
53
+ pytest-asyncio==0.25.0
@@ -0,0 +1,6 @@
1
+ # =============================================================================
2
+ # © JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT
3
+ # =============================================================================
4
+ # Package initialization. Import key classes here for convenient access:
5
+ # from backend import PrecisOrchestrator, ExactHashRetriever, ...
6
+ # =============================================================================
@@ -0,0 +1,3 @@
1
+ # =============================================================================
2
+ # © JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT
3
+ # =============================================================================
@@ -0,0 +1,105 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
2
+
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ from backend.orchestrator.types import AgentResult
6
+ from backend.llm.base import LLMProvider
7
+
8
+
9
class DataSynthesisAgent:
    """Combines results from upstream agents into a coherent synthesis using LLM reasoning.

    The *llm* parameter on __init__ serves as a default; individual calls to
    synthesize() may override it. If no provider is available at call time,
    or the provider times out or raises, the agent falls back to returning
    raw fragments. This is intentional so the pipeline never breaks, and a
    warning is logged each time a fallback is taken.
    """

    def __init__(self, llm: Optional[LLMProvider] = None) -> None:
        self.llm = llm

    @staticmethod
    def _collect_fragments(upstream_results: List[AgentResult]) -> List[str]:
        """Extract the non-empty text fragments carried by successful upstream results."""
        fragments: List[str] = []
        for result in upstream_results:
            if not result.success or not result.data:
                continue
            payload = result.data
            if isinstance(payload, str):
                fragments.append(payload)
                continue
            if not isinstance(payload, dict):
                continue
            for item in payload.get("results", []):
                if not isinstance(item, dict):
                    continue
                text = item.get("text", "")
                if isinstance(text, (list, tuple)):
                    # An empty sequence joins to "", which is not a real
                    # fragment and must not count towards source_fragments.
                    joined = " ".join(str(part) for part in text)
                    if joined:
                        fragments.append(joined)
                elif text:
                    fragments.append(str(text))
            # Also capture any pre-existing synthesis
            synth = payload.get("synthesis", "")
            if synth:
                fragments.append(str(synth))
        return fragments

    async def synthesize(
        self,
        query: str,
        upstream_results: List[AgentResult],
        llm: Optional[LLMProvider] = None,
    ) -> AgentResult:
        """Synthesize results from multiple upstream agents into a single answer.

        Parameters
        ----------
        query : str
            The original user query.
        upstream_results : List[AgentResult]
            Results from upstream retrieval / analysis agents.
        llm : Optional[LLMProvider]
            Per-call override for the LLM provider. Falls back to self.llm.

        Returns
        -------
        AgentResult
            Always ``success=True``. ``data["synthesis"]`` holds the LLM
            answer or a fallback text, and ``data["source_fragments"]`` the
            number of fragments collected (only the first 50 are sent to
            the LLM).
        """
        import asyncio
        import logging

        logger = logging.getLogger(__name__)
        provider = llm or self.llm

        fragments = self._collect_fragments(upstream_results)
        combined_context = "\n".join(fragments[:50]) if fragments else "(no data retrieved)"

        # ── LLM synthesis (or graceful fallback) ─────────────────
        if provider and fragments:
            prompt = (
                "You are a precise data synthesis agent. Synthesize the following retrieved\n"
                "information into a concise answer to the user's query.\n\n"
                f"USER QUERY: {query}\n\n"
                f"RETRIEVED DATA:\n{combined_context}\n\n"
                "SYNTHESIS: Answer the query using ONLY the retrieved data above. "
                "If the data is insufficient, state what's missing. "
                "Be specific with numbers, percentages, and entity names found in the data."
            )
            try:
                synthesis_text = await asyncio.wait_for(
                    provider.generate(prompt, max_tokens=250),
                    timeout=30,
                )
            except asyncio.TimeoutError:
                logger.warning(
                    "DataSynthesis: LLM timed out; returning %d raw fragments",
                    len(fragments),
                )
                synthesis_text = (
                    f"(LLM synthesis timed out. Retrieved {len(fragments)} fragments.)\n"
                    f"{combined_context[:500]}"
                )
            except Exception as e:
                # Deliberately broad: a provider failure must degrade the
                # answer, not break the pipeline.
                logger.warning(
                    "DataSynthesis: LLM call failed; returning raw fragments",
                    exc_info=True,
                )
                synthesis_text = (
                    f"(LLM synthesis error: {e})\n{combined_context[:500]}"
                )
        elif fragments:
            logger.warning(
                "DataSynthesis: no LLM provider available; returning %d raw fragments",
                len(fragments),
            )
            synthesis_text = (
                f"Retrieved {len(fragments)} fragments:\n{combined_context[:1000]}"
            )
        else:
            synthesis_text = "(No data retrieved — unable to synthesize a response.)"

        return AgentResult(
            subtask_id="synthesis",
            agent_name="DataSynthesis",
            success=True,
            data={
                "synthesis": synthesis_text,
                "source_fragments": len(fragments),
            },
            citations=[],
        )
@@ -0,0 +1,97 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ from dataclasses import dataclass, field
6
+ from typing import Dict, List, Optional
7
+
8
+
9
@dataclass
class SyntheticDataset:
    """Result of one ``DistFreeSynth.generate`` call."""

    # Generated rows, one column per fitted feature.
    dataframe: pd.DataFrame
    n_rows: int
    n_features: int
    # The ``mode`` argument that was passed to ``generate``.
    generation_mode: str
    # Echo of the ``correlation_preserve`` argument passed to ``generate``.
    correlation_preserved: bool
    # Hellinger distance between the fitted and the synthetic per-bin counts.
    hellinger_distance: float
    metadata: Dict = field(default_factory=dict)


class DistFreeSynth:
    """Bin-based, distribution-free synthesizer for tabular numeric data.

    ``fit`` cuts every feature into quantile bins and counts how many
    observations land in each multi-dimensional bin. ``generate`` then draws
    values uniformly inside the occupied bins.

    Bins are identified by the string form of their index list, for example
    ``"[0, 3]"``; these strings are the keys of ``bin_counts`` and ``bin_obs``.
    """

    def __init__(self, bins_per_feature: Optional[List[int]] = None) -> None:
        # Kept separately because ``self.bins_per_feature`` is overwritten by
        # every fit() with the bins actually used for that fit.
        self._configured_bins = bins_per_feature
        self.bins_per_feature = bins_per_feature
        self.pc_table: List[np.ndarray] = []
        self.bin_counts: Dict[str, int] = {}
        self.bin_obs: Dict[str, List[np.ndarray]] = {}
        self.features: List[str] = []
        self.n_features: int = 0
        self.n_original: int = 0

    def fit(self, df: pd.DataFrame, bins_per_feature: Optional[List[int]] = None) -> None:
        """Learn quantile bin edges and per-bin counts from ``df``.

        Bin counts are resolved in this order: the ``bins_per_feature``
        argument, the value given to the constructor, and finally
        ``max(5, int(sqrt(n_rows)))`` for every feature.
        """
        self.features = list(df.columns)
        self.n_features = len(self.features)
        self.n_original = len(df)

        if bins_per_feature is None:
            bins_per_feature = getattr(self, "_configured_bins", None)
        if bins_per_feature is None:
            n = max(len(df), 1)
            bins_per_feature = [max(5, int(np.sqrt(n))) for _ in range(self.n_features)]
        self.bins_per_feature = bins_per_feature

        npdata = df.to_numpy()
        self.pc_table = []
        bin_index = np.zeros((len(npdata), self.n_features), dtype=int)
        for k in range(self.n_features):
            incr = 1.0 / bins_per_feature[k]
            # bins + 1 quantile levels from 0 to 1; the half-step keeps the
            # end point in, and clip guards against float overshoot past 1.
            pc = np.arange(0, 1 + incr / 2, incr)
            edges = np.quantile(npdata[:, k], np.clip(pc, 0, 1))
            self.pc_table.append(edges)
            # One vectorised lookup per feature instead of one per cell.
            idx = np.searchsorted(edges, npdata[:, k], side="right") - 1
            bin_index[:, k] = np.clip(idx, 0, bins_per_feature[k] - 1)

        self.bin_counts, self.bin_obs = {}, {}
        for key, obs in zip(bin_index.tolist(), npdata):
            skey = str(key)
            self.bin_counts[skey] = self.bin_counts.get(skey, 0) + 1
            self.bin_obs.setdefault(skey, []).append(obs)

    @staticmethod
    def _decode_key(skey: str) -> List[int]:
        """Turn a bin key such as ``"[0, 3]"`` back into ``[0, 3]`` without ``eval``."""
        return [int(token) for token in skey.strip("[]").split(",") if token.strip()]

    def generate(self, n_synth: int, mode: str = "random_counts",
                 correlation_preserve: bool = True, seed: Optional[int] = None) -> SyntheticDataset:
        """Draw a synthetic dataset from the fitted bins.

        Args:
            n_synth: Number of rows to draw when ``mode == "random_counts"``.
                Ignored for any other mode.
            mode: ``"random_counts"`` samples bins in proportion to their
                fitted counts. Any other value reuses the fitted counts
                unchanged, so the output has as many rows as the fit data.
            correlation_preserve: Only recorded on the result; it does not
                alter generation in this implementation.
            seed: When given, a private ``RandomState(seed)`` is used, so
                results are reproducible and the process-wide NumPy RNG is
                left untouched. When ``None``, the global NumPy RNG is used.

        Raises:
            ValueError: If rows are requested in ``"random_counts"`` mode
                before any data has been fitted.
        """
        rng = np.random if seed is None else np.random.RandomState(seed)
        bin_keys = list(self.bin_counts.keys())

        if mode == "random_counts":
            if not bin_keys:
                if n_synth > 0:
                    raise ValueError(
                        "DistFreeSynth.generate() needs fitted data; call fit() first"
                    )
                key_counts: Dict[str, int] = {}
            else:
                probs = np.array([self.bin_counts[k] for k in bin_keys], dtype=float)
                probs /= probs.sum()
                sampled = rng.choice(len(bin_keys), size=n_synth, p=probs)
                tallies = np.bincount(np.asarray(sampled, dtype=int), minlength=len(bin_keys))
                key_counts = {k: int(c) for k, c in zip(bin_keys, tallies)}
        else:
            key_counts = dict(self.bin_counts)

        synth_data = []
        for skey, count in key_counts.items():
            if count <= 0:
                continue
            key = self._decode_key(skey)
            lower = [self.pc_table[k][key[k]] for k in range(self.n_features)]
            upper = [self.pc_table[k][key[k] + 1] for k in range(self.n_features)]
            for _ in range(count):
                obs = np.array([rng.uniform(lower[k], upper[k]) for k in range(self.n_features)])
                synth_data.append(obs)

        result = pd.DataFrame(synth_data, columns=self.features)
        hd = self._compute_hellinger(self.bin_counts, key_counts)
        return SyntheticDataset(dataframe=result, n_rows=len(result), n_features=self.n_features,
                                generation_mode=mode, correlation_preserved=correlation_preserve,
                                hellinger_distance=hd)

    def _compute_hellinger(self, real: Dict[str, int], synth: Dict[str, int]) -> float:
        """Return the Hellinger distance between two per-bin count tables.

        Counts are normalised to proportions first; an empty table is
        treated as having total 1 to avoid dividing by zero.
        """
        all_keys = set(real) | set(synth)
        r_total = max(1, sum(real.values()))
        s_total = max(1, sum(synth.values()))
        sum_sq = 0.0
        for k in all_keys:
            p = real.get(k, 0) / r_total
            q = synth.get(k, 0) / s_total
            sum_sq += (np.sqrt(p) - np.sqrt(q)) ** 2
        return float(np.sqrt(0.5 * sum_sq))