mcp-agentic-pipelines 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +93 -0
- package/README.md +258 -0
- package/package.json +70 -0
- package/packages/clinical/package.json +22 -0
- package/packages/clinical/src/index.ts +262 -0
- package/packages/clinical/tsconfig.json +13 -0
- package/packages/core/package.json +21 -0
- package/packages/core/src/config.ts +138 -0
- package/packages/core/src/errors.ts +100 -0
- package/packages/core/src/index.ts +104 -0
- package/packages/core/src/llm-config.ts +213 -0
- package/packages/core/src/logging.ts +66 -0
- package/packages/core/src/python-bridge.ts +384 -0
- package/packages/core/src/rate-limiter.ts +136 -0
- package/packages/core/src/types.ts +203 -0
- package/packages/core/src/validation.ts +101 -0
- package/packages/core/tsconfig.json +10 -0
- package/packages/deeppipe/package.json +21 -0
- package/packages/deeppipe/src/index.ts +424 -0
- package/packages/deeppipe/tsconfig.json +13 -0
- package/packages/piste/package.json +20 -0
- package/packages/piste/src/index.ts +48 -0
- package/packages/piste/tsconfig.json +13 -0
- package/packages/precis/package.json +20 -0
- package/packages/precis/src/index.ts +67 -0
- package/packages/precis/tsconfig.json +13 -0
- package/packages/server/package.json +31 -0
- package/packages/server/src/index.ts +427 -0
- package/packages/server/tsconfig.json +17 -0
- package/setup.mjs +141 -0
- package/test.mjs +337 -0
- package/vendors/clinical-intake/pipeline.mjs +349 -0
- package/vendors/clinical-intake/questions/en.txt +9 -0
- package/vendors/clinical-intake/questions/fr.txt +9 -0
- package/vendors/piste/.env.example +73 -0
- package/vendors/piste/app/core/__init__.py +4 -0
- package/vendors/piste/app/core/config.py +83 -0
- package/vendors/piste/app/core/debuglog.py +16 -0
- package/vendors/piste/app/core/middleware.py +40 -0
- package/vendors/piste/bridge_piste.py +301 -0
- package/vendors/piste/pipeline/__init__.py +4 -0
- package/vendors/piste/pipeline/compiler.py +68 -0
- package/vendors/piste/pipeline/offline/__init__.py +28 -0
- package/vendors/piste/pipeline/offline/verifaid_pipeline.py +247 -0
- package/vendors/piste/pipeline/replay.py +15 -0
- package/vendors/piste/pipeline/replay_engine.py +249 -0
- package/vendors/piste/pipeline/signatures/__init__.py +4 -0
- package/vendors/piste/pipeline/signatures/signatures.py +136 -0
- package/vendors/piste/pipeline/stage1/__init__.py +21 -0
- package/vendors/piste/pipeline/stage1/atomic_decomposer.py +61 -0
- package/vendors/piste/pipeline/stage1/check_worthiness.py +100 -0
- package/vendors/piste/pipeline/stage1/orchestrator.py +175 -0
- package/vendors/piste/pipeline/stage1/test_stage1.py +162 -0
- package/vendors/piste/pipeline/stage2/__init__.py +34 -0
- package/vendors/piste/pipeline/stage2/blind_retriever.py +303 -0
- package/vendors/piste/pipeline/stage2/canonical_mapper.py +124 -0
- package/vendors/piste/pipeline/stage2/credibility_scorer.py +85 -0
- package/vendors/piste/pipeline/stage2/orchestrator.py +311 -0
- package/vendors/piste/pipeline/stage2/query_refiner.py +88 -0
- package/vendors/piste/pipeline/stage2/search_decision.py +69 -0
- package/vendors/piste/pipeline/stage2/test_stage2.py +265 -0
- package/vendors/piste/pipeline/stage3/__init__.py +20 -0
- package/vendors/piste/pipeline/stage3/classifier.py +79 -0
- package/vendors/piste/pipeline/stage3/orchestrator.py +225 -0
- package/vendors/piste/pipeline/stage3/test_stage3.py +101 -0
- package/vendors/piste/pipeline/stage4/__init__.py +33 -0
- package/vendors/piste/pipeline/stage4/criticality_gate.py +177 -0
- package/vendors/piste/pipeline/stage4/orchestrator.py +269 -0
- package/vendors/piste/pipeline/stage4/test_stage4.py +192 -0
- package/vendors/piste/pipeline/stage4/verdict_aggregator.py +157 -0
- package/vendors/piste/requirements.txt +53 -0
- package/vendors/precis/backend/__init__.py +6 -0
- package/vendors/precis/backend/agents/__init__.py +3 -0
- package/vendors/precis/backend/agents/data_synthesis.py +105 -0
- package/vendors/precis/backend/agents/dist_free_synth.py +97 -0
- package/vendors/precis/backend/agents/exact_hash_retriever.py +327 -0
- package/vendors/precis/backend/agents/fusion_ranker.py +64 -0
- package/vendors/precis/backend/agents/guardrail.py +175 -0
- package/vendors/precis/backend/agents/query_expander.py +89 -0
- package/vendors/precis/backend/agents/radial_interpol.py +99 -0
- package/vendors/precis/backend/agents/report_generator.py +92 -0
- package/vendors/precis/backend/agents/semantic_reranker.py +135 -0
- package/vendors/precis/backend/agents/stat_anomaly.py +93 -0
- package/vendors/precis/backend/agents/vector_index.py +123 -0
- package/vendors/precis/backend/agents/veri_score.py +341 -0
- package/vendors/precis/backend/agents/work_order_extractor.py +205 -0
- package/vendors/precis/backend/api/__init__.py +3 -0
- package/vendors/precis/backend/api/routes/__init__.py +3 -0
- package/vendors/precis/backend/config.py +88 -0
- package/vendors/precis/backend/core/__init__.py +13 -0
- package/vendors/precis/backend/core/hashing.py +22 -0
- package/vendors/precis/backend/core/metrics.py +77 -0
- package/vendors/precis/backend/core/multitoken.py +166 -0
- package/vendors/precis/backend/core/pmi.py +54 -0
- package/vendors/precis/backend/core/stemming.py +74 -0
- package/vendors/precis/backend/core/tracing.py +150 -0
- package/vendors/precis/backend/data/__init__.py +3 -0
- package/vendors/precis/backend/data/chunker.py +57 -0
- package/vendors/precis/backend/data/pdf_parser.py +42 -0
- package/vendors/precis/backend/db/__init__.py +3 -0
- package/vendors/precis/backend/db/models.py +173 -0
- package/vendors/precis/backend/db/repository.py +269 -0
- package/vendors/precis/backend/llm/__init__.py +3 -0
- package/vendors/precis/backend/llm/anthropic_provider.py +39 -0
- package/vendors/precis/backend/llm/base.py +147 -0
- package/vendors/precis/backend/llm/deepseek_provider.py +43 -0
- package/vendors/precis/backend/llm/factory.py +60 -0
- package/vendors/precis/backend/llm/google_provider.py +39 -0
- package/vendors/precis/backend/llm/ollama_provider.py +54 -0
- package/vendors/precis/backend/llm/openai_provider.py +50 -0
- package/vendors/precis/backend/main.py +677 -0
- package/vendors/precis/backend/orchestrator/__init__.py +3 -0
- package/vendors/precis/backend/orchestrator/planner.py +81 -0
- package/vendors/precis/backend/orchestrator/router.py +319 -0
- package/vendors/precis/backend/orchestrator/types.py +58 -0
- package/vendors/precis/bridge_precis.py +185 -0
- package/vendors/precis/data/sample_reports/README.md +8 -0
- package/vendors/precis/data/seed_data.py +115 -0
- package/vendors/precis/requirements.txt +19 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
from typing import Tuple
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def make_query_hash(query_tokens: Tuple[str, ...]) -> str:
    """Return a short deterministic key for an ordered sequence of query tokens.

    The tokens are joined with ``|`` (order matters), encoded as UTF-8 and
    hashed with SHA-256. Only the first 16 hexadecimal characters of the
    digest are returned.
    """
    hasher = hashlib.sha256()
    hasher.update("|".join(query_tokens).encode("utf-8"))
    # Truncated digest: 16 hex chars = 64 bits of the SHA-256 output.
    return hasher.hexdigest()[:16]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def make_document_hash(source_doc: str, source_page: int, source_position: int) -> str:
    """Return a short deterministic key for one location inside a document.

    The key is the first 16 hex characters of the SHA-256 digest of
    ``"<source_doc>|<source_page>|<source_position>"`` (UTF-8 encoded), so
    the same file/page/position triple always maps to the same key.
    """
    location = "|".join(f"{part}" for part in (source_doc, source_page, source_position))
    hasher = hashlib.sha256()
    hasher.update(location.encode("utf-8"))
    return hasher.hexdigest()[:16]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def make_session_key(session_id: str) -> str:
    """Return a normalized, hashed key for a session identifier.

    Surrounding whitespace is removed and the identifier is lowercased
    before hashing, so ``" AbC "`` and ``"abc"`` produce the same key. The
    key is the first 16 hex characters of the SHA-256 digest.
    """
    canonical = session_id.strip().lower().encode("utf-8")
    hasher = hashlib.sha256()
    hasher.update(canonical)
    return hasher.hexdigest()[:16]
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
import numpy as np
|
|
5
|
+
from typing import List, Dict, Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def compute_relevancy(query_tokens: List[str], chunk_tokens_list: List[List[str]],
                      pmi_scorer=None) -> float:
    """Return the mean Jaccard similarity between the query and each chunk.

    Each chunk's score is ``|query ∩ chunk| / |query ∪ chunk|`` over the
    de-duplicated token sets (0.0 when both sets are empty). The result is
    the arithmetic mean of those scores, or 0.0 when no chunks are given.

    ``pmi_scorer`` is accepted as part of the signature but is not read by
    this function.
    """
    if not chunk_tokens_list:
        return 0.0

    wanted = set(query_tokens)

    def jaccard(tokens: List[str]) -> float:
        candidate = set(tokens)
        combined = wanted | candidate
        if not combined:
            # Empty query and empty chunk: define the similarity as zero.
            return 0.0
        return len(wanted & candidate) / len(combined)

    per_chunk = [jaccard(tokens) for tokens in chunk_tokens_list]
    return sum(per_chunk) / len(per_chunk)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def compute_trustworthiness(source_metadata: List[Dict[str, Any]]) -> float:
    """Return the mean heuristic reliability score of the given sources.

    Every source starts at 0.5 and earns fixed bonuses read from its
    metadata dict: +0.2 when ``is_title`` is truthy, +0.1 when
    ``font_size`` exceeds 14 (a missing value counts as 0), and +0.15 when
    ``token_type`` equals ``"contextual"``. Each per-source score is capped
    at 1.0. With no sources the neutral value 0.5 is returned.
    """
    if not source_metadata:
        return 0.5

    def rate(meta: Dict[str, Any]) -> float:
        # Bonuses are applied in this fixed order so the floating-point
        # accumulation is the same on every call.
        bonuses = (
            (0.2, bool(meta.get("is_title"))),
            (0.1, meta.get("font_size", 0) > 14),
            (0.15, meta.get("token_type") == "contextual"),
        )
        score = 0.5
        for bonus, earned in bonuses:
            if earned:
                score += bonus
        return min(score, 1.0)

    ratings = [rate(meta) for meta in source_metadata]
    return sum(ratings) / len(ratings)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def compute_hallucination_rate(generated_claims: List[str],
                               source_evidence: List[Dict[str, Any]]) -> float:
    """Return the fraction of claims that no evidence entry supports.

    A claim counts as supported when at least one of its whitespace-separated
    words longer than three characters occurs (case-insensitively, as a
    substring) in the ``"text"`` of any evidence dict. The result is
    ``1 - supported / total``; an empty claim list yields 0.0.
    """
    if not generated_claims:
        return 0.0

    def is_supported(claim: str) -> bool:
        keywords = [word for word in claim.lower().split() if len(word) > 3]
        for evidence in source_evidence:
            haystack = evidence.get("text", "").lower()
            for keyword in keywords:
                if keyword in haystack:
                    # One matching evidence entry is enough for this claim.
                    return True
        return False

    supported_total = sum(1 for claim in generated_claims if is_supported(claim))
    return 1.0 - (supported_total / len(generated_claims))
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def compute_r_squared(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Return the coefficient of determination (R²) of ``y_pred`` against ``y_true``.

    1.0 means a perfect prediction; values can be negative when the
    prediction is worse than always predicting the mean of ``y_true``.

    Both inputs are coerced to float arrays first, so plain Python sequences
    are accepted and small/unsigned integer arrays cannot wrap around in the
    subtraction or squaring steps.

    When ``y_true`` has zero variance the ratio is undefined; in that case
    1.0 is returned if the residuals are also zero, otherwise 0.0.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    ss_res = float(np.sum((actual - predicted) ** 2))
    ss_tot = float(np.sum((actual - np.mean(actual)) ** 2))
    if ss_tot == 0:
        return 1.0 if ss_res == 0 else 0.0
    return 1.0 - (ss_res / ss_tot)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def compute_hellinger_distance(p_counts: Dict[str, int],
                               q_counts: Dict[str, int]) -> float:
    """Return the Hellinger distance between two categorical count tables.

    Counts are normalized to probabilities by each table's total (a total
    of zero is treated as 1 to avoid division by zero); categories missing
    from a table count as 0. The result is
    ``sqrt(0.5 * Σ (sqrt(p_k) - sqrt(q_k))²)``; identical distributions give 0.
    """
    p_norm = max(1, sum(p_counts.values()))
    q_norm = max(1, sum(q_counts.values()))

    squared_gap = 0.0
    for category in set(p_counts) | set(q_counts):
        root_p = math.sqrt(p_counts.get(category, 0) / p_norm)
        root_q = math.sqrt(q_counts.get(category, 0) / q_norm)
        squared_gap += (root_p - root_q) ** 2
    return math.sqrt(0.5 * squared_gap)
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT
|
|
2
|
+
|
|
3
|
+
Optimised: pre-stems all words once (O(N) stemming) then slides an n-gram window
|
|
4
|
+
over the pre-stemmed tokens — a ~12× speedup vs. stemming each n-gram independently.
|
|
5
|
+
Element-level extraction can run in parallel via index_document_async().
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
from typing import List, Tuple
|
|
10
|
+
|
|
11
|
+
from backend.agents.exact_hash_retriever import MultiToken, NestedHashIndex
|
|
12
|
+
from backend.core.stemming import PrecisStemmer
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class MultiTokenExtractor:
    """Extracts variable-length multi-tokens (n-grams of stemmed words) from document text.

    Parameters
    ----------
    max_token_length : int
        Maximum n-gram length (inclusive).
    min_token_length : int
        Minimum n-gram length (inclusive).
    """

    def __init__(self, max_token_length: int = 7, min_token_length: int = 2) -> None:
        self.max_token_length = max_token_length
        self.min_token_length = min_token_length
        self.stemmer = PrecisStemmer()

    # ── Core extraction (pre-stemmed, single-element) ──────────────

    def extract(self, text: str, source_doc: str, source_page: int,
                font_size: float = 12.0, is_title: bool = False,
                is_header: bool = False,
                global_position: int = 0) -> List[MultiToken]:
        """Extract every n-gram of stemmed words from one text segment.

        Each word is stemmed exactly once; words the stemmer filters out
        (it returns an empty string for them) are dropped. N-grams of
        ``min_token_length`` to ``max_token_length`` stems are then formed
        from the surviving stems at every start offset.

        ``metadata["original_words"]`` holds the surface words whose stems
        make up the n-gram. Surface words and stems are kept as pairs so the
        two stay aligned even when filtered words sat between them.

        ``global_position`` is stored unchanged as ``source_position`` on
        every multi-token; the indexing methods of this class pass the
        element's running line number.

        Returns an empty list when fewer than ``min_token_length`` words
        (before or after filtering) are available.
        """
        words = text.strip().split()
        if len(words) < self.min_token_length:
            return []

        # Stem once per word and remember which surface word produced which
        # stem; slicing the unfiltered word list with indices from the
        # filtered stem list would pair n-grams with the wrong words.
        surviving = [(word, stem) for word in words
                     if (stem := self.stemmer.stem(word))]
        if len(surviving) < self.min_token_length:
            return []
        surface_words = [word for word, _ in surviving]
        stemmed_words = [stem for _, stem in surviving]

        token_type = "contextual" if (is_title or is_header) else "standard"
        multitokens: List[MultiToken] = []

        n = len(stemmed_words)
        for start in range(n):
            longest = min(self.max_token_length, n - start)
            for length in range(self.min_token_length, longest + 1):
                end = start + length
                multitokens.append(MultiToken(
                    tokens=tuple(stemmed_words[start:end]),
                    token_type=token_type,
                    source_doc=source_doc,
                    source_page=source_page,
                    source_position=global_position,  # line number, not word index
                    font_size=font_size,
                    is_title=is_title,
                    is_header=is_header,
                    metadata={"original_words": surface_words[start:end]},
                ))
        return multitokens

    def _extract_element(self, element: dict, doc_path: str, page_num: int,
                         line: int) -> List[MultiToken]:
        """Run extract() on one parsed element, reading its layout hints with defaults."""
        return self.extract(
            element.get("text", ""), doc_path, page_num,
            font_size=element.get("font_size", 12.0),
            is_title=element.get("is_title", False),
            is_header=element.get("is_header", False),
            global_position=line,
        )

    # ── Document indexing (sequential) ────────────────────────────

    def index_document(self, doc_path: str, parsed_content: List[dict],
                       index: NestedHashIndex) -> int:
        """Extract and insert the multi-tokens of every element, page by page.

        Elements are numbered consecutively across the whole document (one
        "line" per element, starting at 0) and that number is recorded as
        each multi-token's ``source_position``.

        Returns the number of multi-tokens inserted into ``index``.
        """
        total = 0
        global_line = 0  # running element number across all pages
        for page in parsed_content:
            page_num = page.get("page_number", 0)
            for element in page.get("elements", []):
                mts = self._extract_element(element, doc_path, page_num, global_line)
                for mt in mts:
                    index.insert(mt)
                total += len(mts)
                global_line += 1
        return total

    # ── Document indexing (async / parallel pages) ──────────────

    async def index_document_async(self, doc_path: str,
                                   parsed_content: List[dict],
                                   index: NestedHashIndex,
                                   max_tasks: int = 32) -> int:
        """Like index_document(), but extracts **pages** concurrently.

        One worker-thread task is created per page (not per element); each
        task walks its page's elements sequentially. Insertion into
        ``index`` happens afterwards on the calling task, in page order.

        Line numbers are assigned up front: every page gets the running
        element count of all preceding pages as its first line. Worker
        threads therefore never share a counter, and the recorded
        ``source_position`` values match index_document() regardless of
        the order in which pages finish.

        Parameters
        ----------
        max_tasks : int
            Upper bound on page-extraction tasks in flight at once. Values
            below 1 are treated as 1.

        Returns the number of multi-tokens inserted into ``index``.
        """
        # ── Plan: (page, first line number) for every page with elements ──
        jobs: List[Tuple[dict, int]] = []
        next_line = 0
        for page in parsed_content:
            elements = page.get("elements") or []
            if elements:
                jobs.append((page, next_line))
            next_line += len(elements)
        if not jobs:
            return 0

        def extract_page(page: dict, first_line: int) -> List[MultiToken]:
            page_num = page.get("page_number", 0)
            found: List[MultiToken] = []
            for line, element in enumerate(page["elements"], start=first_line):
                # Blank elements still consume a line number (via enumerate)
                # but produce no multi-tokens.
                if not element.get("text", "").strip():
                    continue
                found.extend(self._extract_element(element, doc_path, page_num, line))
            return found

        # ── Throttle: at most max_tasks in flight at once ─────
        sem = asyncio.Semaphore(max(1, max_tasks))

        async def bounded_extract(page: dict, first_line: int) -> List[MultiToken]:
            async with sem:
                return await asyncio.to_thread(extract_page, page, first_line)

        # gather() returns results in submission order, i.e. page order.
        results: List[List[MultiToken]] = await asyncio.gather(
            *(bounded_extract(page, first_line) for page, first_line in jobs)
        )

        # ── Batch-insert all MultiTokens ──────────────────────
        total = 0
        for mts in results:
            for mt in mts:
                index.insert(mt)
            total += len(mts)
        return total
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
|
|
2
|
+
|
|
3
|
+
import math
|
|
4
|
+
from typing import Dict, List, Tuple
|
|
5
|
+
from collections import defaultdict
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class PMIScorer:
    """Scores token-sequence relatedness with pointwise mutual information (PMI).

    Statistics are accumulated incrementally through ingest_tokens():
    per-token counts, unordered pair co-occurrence counts, and the total
    number of tokens seen.
    """

    def __init__(self) -> None:
        self.token_count: Dict[str, int] = defaultdict(int)
        self.pair_count: Dict[Tuple[str, str], int] = defaultdict(int)
        self.total_tokens: int = 0

    @staticmethod
    def _pair_key(left: str, right: str) -> Tuple[str, str]:
        """Return the two tokens as a tuple in sorted order (order-insensitive key)."""
        return (left, right) if left <= right else (right, left)

    def ingest_tokens(self, tokens: List[str]) -> None:
        """Add one token sequence to the corpus statistics.

        Every token increments its own count and the corpus total; every
        pair of positions ``i < j`` in the sequence increments the count of
        the unordered pair ``(tokens[i], tokens[j])``.
        """
        for position, token in enumerate(tokens):
            self.token_count[token] += 1
            self.total_tokens += 1
            for later in tokens[position + 1:]:
                self.pair_count[self._pair_key(token, later)] += 1

    def score(self, query_tokens: List[str], chunk_tokens: List[str]) -> float:
        """Return the mean PMI (base 2) over co-occurring (query, chunk) token pairs.

        Pairs that were never ingested together are skipped rather than
        counted as zero. Returns 0.0 when either sequence is empty or when
        no pair has been observed.
        """
        if not (query_tokens and chunk_tokens):
            return 0.0

        corpus_size = max(1, self.total_tokens)
        observed: List[float] = []
        for q_tok in query_tokens:
            for c_tok in chunk_tokens:
                joint = self.pair_count.get(self._pair_key(q_tok, c_tok), 0)
                if joint == 0:
                    continue
                p_q = self.token_count.get(q_tok, 0) / corpus_size
                p_c = self.token_count.get(c_tok, 0) / corpus_size
                p_joint = joint / corpus_size
                if p_q > 0 and p_c > 0:
                    observed.append(math.log2(p_joint / (p_q * p_c)))

        if not observed:
            return 0.0
        return sum(observed) / len(observed)

    def normalize_score(self, raw_score: float, max_observed: float = 10.0) -> float:
        """Clamp ``raw_score`` to ``[0, max_observed]`` and rescale it to ``[0, 1]``."""
        clamped = min(max(raw_score, 0.0), max_observed)
        return clamped / max_observed
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# Module-level scorer instance, created once at import time. Every caller of
# get_pmi_scorer() receives this same object, so statistics ingested through
# it accumulate in one place.
_pmi_scorer: PMIScorer = PMIScorer()


def get_pmi_scorer() -> PMIScorer:
    """Return the shared module-level PMIScorer instance."""
    return _pmi_scorer
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
|
|
2
|
+
|
|
3
|
+
from typing import List, Set
|
|
4
|
+
from nltk.stem import PorterStemmer
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# ── Lazy-loaded NLTK stopwords, minus content-bearing words ────
|
|
8
|
+
# Words like "other", "more", "same" are stopwords in general NLP
|
|
9
|
+
# but ARE content in document section titles and technical text.
|
|
10
|
+
_CONTENT_WORDS_TO_KEEP: Set[str] = {
|
|
11
|
+
"other", "more", "most", "some", "such", "only", "own", "same",
|
|
12
|
+
"very", "just", "both", "few", "each", "every", "any", "all",
|
|
13
|
+
"no", "not", "nor", # negation is semantically important
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
def _load_nltk_stopwords() -> Set[str]:
|
|
17
|
+
"""Return NLTK stopwords minus content-bearing words."""
|
|
18
|
+
try:
|
|
19
|
+
from nltk.corpus import stopwords
|
|
20
|
+
return set(stopwords.words("english")) - _CONTENT_WORDS_TO_KEEP
|
|
21
|
+
except (ImportError, LookupError, OSError):
|
|
22
|
+
pass
|
|
23
|
+
# Minimal fallback
|
|
24
|
+
return {"i", "me", "my", "we", "our", "you", "your", "he", "him",
|
|
25
|
+
"his", "she", "her", "it", "its", "they", "them", "their",
|
|
26
|
+
"this", "that", "these", "those", "am", "is", "are", "was",
|
|
27
|
+
"were", "be", "been", "being", "have", "has", "had", "do",
|
|
28
|
+
"does", "did", "a", "an", "the", "and", "but", "if", "or",
|
|
29
|
+
"because", "as", "of", "at", "by", "for", "with", "about",
|
|
30
|
+
"between", "into", "through", "during", "before", "after",
|
|
31
|
+
"to", "from", "in", "on", "off", "over", "under",
|
|
32
|
+
"can", "will", "should", "now", "don", "doesn", "didn",
|
|
33
|
+
"won", "wouldn", "couldn", "shouldn", "isn", "aren"}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class PrecisStemmer:
    """Porter stemmer wrapper with stopword filtering and acronym preservation.

    Words are lowercased before any check. Known acronyms are returned
    lowercased but otherwise unstemmed; stopwords are mapped to the empty
    string; everything else goes through NLTK's PorterStemmer.
    """

    # Lowercased acronyms that must bypass stemming.
    _KNOWN_ACRONYMS: Set[str] = {
        "kyc", "aml", "esg", "gaap", "ifrs", "sec", "fdic", "finra",
        "soc", "iso", "hipaa", "gdpr", "ccpa", "sox", "cfpb",
    }

    # ── Precis-specific additions (query-structure words) ─────
    _PRECIS_STOPWORDS: Set[str] = {
        "summarize", "summary", "summarise", "explain", "describe",
        "list", "identify", "compare", "contrast", "discuss", "analyze",
        "key", "finding", "findings", "detail", "details", "overview",
        "section", "chapter", "paragraph", "figure", "table", "page",
        "get", "make", "made", "see", "show", "shown", "find", "found",
    }

    # ── Merged set: NLTK standard + Precis custom ─────────────
    # Evaluated once, when the class body executes.
    _STOPWORDS: Set[str] = _load_nltk_stopwords() | _PRECIS_STOPWORDS

    def __init__(self) -> None:
        self._stemmer = PorterStemmer()

    def stem(self, word: str) -> str:
        """Return the stem of ``word``, or ``""`` if it should be dropped.

        The word is stripped and lowercased first. Known acronyms come back
        as the lowercased word itself; blank input and stopwords come back
        as the empty string.
        """
        normalized = word.strip().lower()
        if normalized in self._KNOWN_ACRONYMS:
            # Acronym check precedes the stopword check, so a registered
            # acronym is kept even if it is also a stopword.
            return normalized
        if not normalized or normalized in self._STOPWORDS:
            return ""
        return self._stemmer.stem(normalized)

    def stem_tokens(self, tokens: List[str]) -> List[str]:
        """Stem each token in order, omitting those that stem to the empty string."""
        kept: List[str] = []
        for token in tokens:
            stemmed = self.stem(token)
            if stemmed:
                kept.append(stemmed)
        return kept

    def add_acronym(self, acronym: str) -> None:
        """Register an acronym (stored stripped and lowercased) to exempt from stemming.

        The acronym set is a class attribute, so the addition is visible to
        every PrecisStemmer instance, not only this one.
        """
        self._KNOWN_ACRONYMS.add(acronym.strip().lower())
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import time
|
|
5
|
+
import uuid
|
|
6
|
+
from dataclasses import dataclass, field, asdict
|
|
7
|
+
from datetime import datetime, timezone
|
|
8
|
+
from enum import Enum
|
|
9
|
+
from typing import Callable, Dict, List, Optional, Any
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class TraceEventType(str, Enum):
    """Closed set of event kinds a trace can record.

    Members are also ``str`` instances; each value is a dotted
    ``"<category>.<action>"`` name.
    """

    # Query lifecycle
    QUERY_STARTED = "query.started"
    QUERY_COMPLETED = "query.completed"
    QUERY_FAILED = "query.failed"

    # Planning
    PLAN_CREATED = "plan.created"

    # Agent lifecycle
    AGENT_STARTED = "agent.started"
    AGENT_COMPLETED = "agent.completed"
    AGENT_FAILED = "agent.failed"

    # Decisions
    DECISION_SEARCH_TYPE = "decision.search_type"
    DECISION_THRESHOLD = "decision.threshold"
    DECISION_PREDICTION = "decision.prediction"

    # LLM calls
    LLM_CALL_STARTED = "llm.call_started"
    LLM_CALL_COMPLETED = "llm.call_completed"
    LLM_TOKEN_USAGE = "llm.token_usage"

    # Evaluation, guardrails and results
    EVALUATION_COMPLETED = "evaluation.completed"
    GUARDRAIL_ACTION = "guardrail.action"
    RESULT_FOUND = "result.found"
    ANOMALY_FLAGGED = "anomaly.flagged"
    CITATION_ADDED = "citation.added"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class TraceEvent:
    """A single timestamped occurrence recorded in a trace."""

    # First 8 characters of a random UUID4 string.
    event_id: str = field(default_factory=lambda: str(uuid.uuid4())[:8])
    event_type: TraceEventType = TraceEventType.QUERY_STARTED
    agent_name: str = ""
    # Id of the span that was open when the event was recorded, if any.
    span_id: Optional[str] = None
    message: str = ""
    data: Dict[str, Any] = field(default_factory=dict)
    # ISO-8601 creation time in UTC.
    timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    duration_ms: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Return the event as a plain dict, with the event type reduced to its string value."""
        return dict(
            event_id=self.event_id,
            event_type=self.event_type.value,
            agent_name=self.agent_name,
            span_id=self.span_id,
            message=self.message,
            data=self.data,
            timestamp=self.timestamp,
            duration_ms=self.duration_ms,
        )
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@dataclass
class TraceSpan:
    """A timed unit of work in a trace; spans nest through ``child_spans``."""

    # First 8 characters of a random UUID4 string.
    span_id: str = field(default_factory=lambda: str(uuid.uuid4())[:8])
    parent_span_id: Optional[str] = None
    agent_name: str = ""
    operation: str = ""
    # Start/end timestamps in seconds; 0.0 means "not set".
    start_time: float = 0.0
    end_time: float = 0.0
    events: List[TraceEvent] = field(default_factory=list)
    child_spans: List["TraceSpan"] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def duration_ms(self) -> float:
        """Elapsed time in milliseconds, or 0.0 while either timestamp is unset."""
        if not (self.end_time and self.start_time):
            return 0.0
        return (self.end_time - self.start_time) * 1000

    def to_dict(self) -> Dict[str, Any]:
        """Return the span, its events and its child spans (recursively) as plain dicts."""
        serialized_events = [event.to_dict() for event in self.events]
        serialized_children = [child.to_dict() for child in self.child_spans]
        return dict(
            span_id=self.span_id,
            parent_span_id=self.parent_span_id,
            agent_name=self.agent_name,
            operation=self.operation,
            duration_ms=self.duration_ms,
            events=serialized_events,
            child_spans=serialized_children,
            metadata=self.metadata,
        )
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
class TraceCollector:
    """Collects spans and events while one query executes.

    Spans are kept as a stack: span_start() opens a span nested under the
    currently open one (or as a new root), span_end() closes the innermost
    open span. Events are attached to the innermost open span and also kept
    in a flat list. Each event is additionally passed, as a dict, to an
    optional stream callback. to_dict()/to_json() give a serializable
    snapshot of everything recorded so far.
    """

    def __init__(self, query_id: str, session_id: Optional[str] = None) -> None:
        self.trace_id = str(uuid.uuid4())
        self.query_id = query_id
        self.session_id = session_id
        self._span_stack: List[TraceSpan] = []
        self._root_spans: List[TraceSpan] = []
        self._events: List[TraceEvent] = []
        self._query_start_time = time.time()
        # Set by complete(); None while the query is still running.
        self._query_end_time: Optional[float] = None
        self._status = "running"
        self._stream_callback: Optional[Callable[[Dict[str, Any]], None]] = None

    def span_start(self, agent_name: str, operation: str,
                   metadata: Optional[dict] = None) -> str:
        """Open a new span under the current one and return its id.

        ``metadata`` is copied, so later updates to the span (see
        span_end()) never modify the caller's dict.
        """
        span = TraceSpan(agent_name=agent_name, operation=operation,
                         start_time=time.time(),
                         metadata=dict(metadata) if metadata else {})
        if self._span_stack:
            parent = self._span_stack[-1]
            span.parent_span_id = parent.span_id
            parent.child_spans.append(span)
        else:
            self._root_spans.append(span)
        self._span_stack.append(span)
        return span.span_id

    def span_end(self, metadata: Optional[dict] = None) -> str:
        """Close the innermost open span and return its id (``""`` if none is open).

        Any ``metadata`` given is merged into the span's metadata.
        """
        if not self._span_stack:
            return ""
        span = self._span_stack.pop()
        span.end_time = time.time()
        if metadata:
            span.metadata.update(metadata)
        return span.span_id

    def event(self, event_type: TraceEventType, agent_name: str = "",
              message: str = "", data: Optional[Dict[str, Any]] = None,
              duration_ms: float = 0.0) -> TraceEvent:
        """Record an event, attach it to the open span (if any) and stream it.

        Returns the created TraceEvent. Exceptions raised by the stream
        callback are not caught here.
        """
        evt = TraceEvent(event_type=event_type, agent_name=agent_name,
                         message=message, data=data or {}, duration_ms=duration_ms)
        if self._span_stack:
            current = self._span_stack[-1]
            evt.span_id = current.span_id
            current.events.append(evt)
        self._events.append(evt)
        if self._stream_callback:
            self._stream_callback(evt.to_dict())
        return evt

    def set_stream_callback(self, callback: Callable[[Dict[str, Any]], None]) -> None:
        """Install the callable that receives every subsequent event as a dict."""
        self._stream_callback = callback

    def complete(self, status: str = "success") -> None:
        """Close all open spans, freeze the total duration and emit the final event.

        ``status == "success"`` emits QUERY_COMPLETED; any other status
        emits QUERY_FAILED.
        """
        while self._span_stack:
            self.span_end()
        self._status = status
        # Freeze the end time before emitting the final event so the event
        # payload and every later to_dict() report the same duration.
        self._query_end_time = time.time()
        final_type = (TraceEventType.QUERY_COMPLETED if status == "success"
                      else TraceEventType.QUERY_FAILED)
        self.event(final_type, message=f"Query {status}",
                   data={"duration_ms": self.get_total_duration_ms()})

    def to_dict(self) -> Dict[str, Any]:
        """Return a serializable snapshot of the trace.

        ``agent_count`` is the number of root spans. ``created_at`` is the
        UTC time at which this snapshot is produced.
        """
        return {"trace_id": self.trace_id, "query_id": self.query_id,
                "session_id": self.session_id, "status": self._status,
                "duration_ms": self.get_total_duration_ms(),
                "agent_count": len(self._root_spans),
                "event_count": len(self._events),
                "root_spans": [s.to_dict() for s in self._root_spans],
                "events": [e.to_dict() for e in self._events],
                "created_at": datetime.now(timezone.utc).isoformat()}

    def to_json(self) -> str:
        """Return to_dict() as JSON; values JSON cannot encode are passed through ``str``."""
        return json.dumps(self.to_dict(), default=str)

    def get_event_count(self) -> int:
        """Return the number of events recorded so far."""
        return len(self._events)

    def get_total_duration_ms(self) -> float:
        """Return the query duration in milliseconds.

        While the query is running this is the time elapsed since the
        collector was created; once complete() has run it is the fixed
        duration up to completion.
        """
        end_time = self._query_end_time
        if end_time is None:
            end_time = time.time()
        return (end_time - self._query_start_time) * 1000
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Any, Dict, List
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class Chunk:
    """One piece of page text produced by ``HierarchicalChunker``."""

    # Identifier of the form "<source_doc>_p<page>_para<i>" with an optional
    # "_sent<j>" suffix for sentence-level chunks.
    chunk_id: str
    text: str
    source_doc: str
    source_page: int
    # Character offsets of the chunk inside the page's joined text.
    start_position: int
    end_position: int
    level: str = "paragraph"
    context_type: str = "body"
    font_size: float = 12.0
    is_title: bool = False
    is_header: bool = False


class HierarchicalChunker:
    """Split parsed pages into paragraph chunks and nested sentence chunks.

    Every paragraph chunk is followed immediately by the sentence chunks
    cut from it. ``overlap_ratio`` is stored on the instance but is not
    applied by the chunking methods in this class.
    """

    def __init__(self, overlap_ratio: float = 0.2) -> None:
        self.overlap_ratio = overlap_ratio

    def chunk_page(self, page_number: int, elements: List[Dict[str, Any]],
                   source_doc: str) -> List[Chunk]:
        """Chunk one page into paragraph and sentence chunks.

        Args:
            page_number: Page number recorded on every chunk and in its id.
            elements: Element dictionaries; only the ``"text"`` value is
                used. Missing or ``None`` text is treated as empty.
            source_doc: Document name recorded on every chunk and in its id.

        Returns:
            Chunks in reading order. ``start_position``/``end_position`` are
            offsets into the element texts joined with single spaces.
        """
        chunks: List[Chunk] = []
        full_text = " ".join(e.get("text") or "" for e in elements)
        if not full_text.strip():
            return chunks

        para_index = 0
        segment_start = 0
        for segment in full_text.split("\n\n"):
            para = segment.strip()
            # Real offset of the stripped paragraph inside full_text.
            para_start = segment_start + (len(segment) - len(segment.lstrip()))
            segment_start += len(segment) + 2  # skip the "\n\n" separator
            if not para:
                continue
            chunks.append(Chunk(
                chunk_id=f"{source_doc}_p{page_number}_para{para_index}",
                text=para, source_doc=source_doc, source_page=page_number,
                start_position=para_start, end_position=para_start + len(para),
                level="paragraph"))
            chunks.extend(self._sentence_chunks(
                para, para_start, para_index, page_number, source_doc))
            para_index += 1
        return chunks

    def _sentence_chunks(self, para: str, para_start: int, para_index: int,
                         page_number: int, source_doc: str) -> List[Chunk]:
        """Cut one paragraph into sentence chunks positioned within the page text."""
        # Replacing "\n" with " " keeps the length, so offsets stay valid.
        pieces = para.replace("\n", " ").split(". ")
        last = len(pieces) - 1
        chunks: List[Chunk] = []
        offset = 0
        sent_index = 0
        for k, piece in enumerate(pieces):
            piece_offset = offset
            offset += len(piece) + 2  # skip the ". " separator
            body = piece.strip()
            if not body:
                continue
            start = para_start + piece_offset + (len(piece) - len(piece.lstrip()))
            if k < last:
                # The split consumed this sentence's period; restore it.
                text = body + "."
                end = para_start + piece_offset + len(piece) + 1
            else:
                # The final piece keeps its own punctuation; add a period
                # only when it has none, so it is never doubled.
                text = body if body.endswith((".", "!", "?")) else body + "."
                end = start + len(body)
            if len(text) > 10:
                chunks.append(Chunk(
                    chunk_id=f"{source_doc}_p{page_number}_para{para_index}_sent{sent_index}",
                    text=text, source_doc=source_doc, source_page=page_number,
                    start_position=start, end_position=end, level="sentence"))
            # Short sentences are skipped but still consume an index.
            sent_index += 1
        return chunks

    def chunk_document(self, parsed_pages: List[Dict[str, Any]], source_doc: str) -> List[Chunk]:
        """Chunk every page of a parsed document.

        Args:
            parsed_pages: Page dictionaries with ``"page_number"`` (default
                0) and ``"elements"`` (default empty list).
            source_doc: Document name recorded on every chunk.

        Returns:
            All chunks from all pages, in page order.
        """
        all_chunks: List[Chunk] = []
        for page in parsed_pages:
            all_chunks.extend(self.chunk_page(page.get("page_number", 0),
                                              page.get("elements", []), source_doc))
        return all_chunks
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, List
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class PDFParser:
    """Extracts text with font metadata for contextual multi-token scoring."""

    # Span sizes within [HEADER_MIN_FONT_SIZE, HEADER_MAX_FONT_SIZE] are
    # flagged as headers; anything strictly larger is flagged as a title.
    HEADER_MIN_FONT_SIZE = 14
    HEADER_MAX_FONT_SIZE = 16

    def parse(self, file_path: str) -> List[Dict[str, Any]]:
        """Parse a file into per-page lists of text elements.

        Uses PyMuPDF (``fitz``) when it can be imported. Otherwise the file
        is read as UTF-8 text (undecodable bytes ignored) and returned as a
        single page with one element.

        Args:
            file_path: Path of the file to parse.

        Returns:
            A list of ``{"page_number": int, "elements": [...]}`` dicts with
            1-based page numbers. Each element has ``text``, ``font_size``,
            ``is_title`` and ``is_header`` keys.
        """
        # Only the import is guarded, so an ImportError raised while parsing
        # cannot silently switch to the plain-text fallback.
        try:
            import fitz  # pymupdf
        except ImportError:
            return self._parse_as_plain_text(file_path)

        doc = fitz.open(file_path)
        try:
            return [
                {"page_number": page_num + 1, "elements": self._page_elements(page)}
                for page_num, page in enumerate(doc)
            ]
        finally:
            # Close the document even when a page fails to parse.
            doc.close()

    def _page_elements(self, page: Any) -> List[Dict[str, Any]]:
        """Collect the non-empty text spans of one PyMuPDF page."""
        elements: List[Dict[str, Any]] = []
        for block in page.get_text("dict")["blocks"]:
            if "lines" not in block:
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    font_size = span.get("size", 12.0)
                    text = span.get("text", "").strip()
                    if not text:
                        continue
                    elements.append({
                        "text": text,
                        "font_size": font_size,
                        "is_title": font_size > self.HEADER_MAX_FONT_SIZE,
                        "is_header": (self.HEADER_MIN_FONT_SIZE <= font_size
                                      <= self.HEADER_MAX_FONT_SIZE),
                    })
        return elements

    def _parse_as_plain_text(self, file_path: str) -> List[Dict[str, Any]]:
        """Fallback: return the file's text as one page with one body element."""
        with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
            text = handle.read()
        return [{"page_number": 1, "elements": [{"text": text, "font_size": 12.0,
                                                 "is_title": False, "is_header": False}]}]

    def get_sections(self, parsed_content: List[Dict]) -> List[Dict]:
        """Flatten parsed pages into one list of element records.

        Args:
            parsed_content: Page dictionaries as returned by ``parse``.
                Pages without an ``"elements"`` key contribute nothing.

        Returns:
            One ``{"text", "page", "is_title", "is_header"}`` dict per
            element, in page and element order.
        """
        return [
            {"text": el["text"], "page": page["page_number"],
             "is_title": el["is_title"], "is_header": el["is_header"]}
            for page in parsed_content
            for el in page.get("elements", [])
        ]
|