mcp-agentic-pipelines 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. package/.env.example +93 -0
  2. package/README.md +258 -0
  3. package/package.json +70 -0
  4. package/packages/clinical/package.json +22 -0
  5. package/packages/clinical/src/index.ts +262 -0
  6. package/packages/clinical/tsconfig.json +13 -0
  7. package/packages/core/package.json +21 -0
  8. package/packages/core/src/config.ts +138 -0
  9. package/packages/core/src/errors.ts +100 -0
  10. package/packages/core/src/index.ts +104 -0
  11. package/packages/core/src/llm-config.ts +213 -0
  12. package/packages/core/src/logging.ts +66 -0
  13. package/packages/core/src/python-bridge.ts +384 -0
  14. package/packages/core/src/rate-limiter.ts +136 -0
  15. package/packages/core/src/types.ts +203 -0
  16. package/packages/core/src/validation.ts +101 -0
  17. package/packages/core/tsconfig.json +10 -0
  18. package/packages/deeppipe/package.json +21 -0
  19. package/packages/deeppipe/src/index.ts +424 -0
  20. package/packages/deeppipe/tsconfig.json +13 -0
  21. package/packages/piste/package.json +20 -0
  22. package/packages/piste/src/index.ts +48 -0
  23. package/packages/piste/tsconfig.json +13 -0
  24. package/packages/precis/package.json +20 -0
  25. package/packages/precis/src/index.ts +67 -0
  26. package/packages/precis/tsconfig.json +13 -0
  27. package/packages/server/package.json +31 -0
  28. package/packages/server/src/index.ts +427 -0
  29. package/packages/server/tsconfig.json +17 -0
  30. package/setup.mjs +141 -0
  31. package/test.mjs +337 -0
  32. package/vendors/clinical-intake/pipeline.mjs +349 -0
  33. package/vendors/clinical-intake/questions/en.txt +9 -0
  34. package/vendors/clinical-intake/questions/fr.txt +9 -0
  35. package/vendors/piste/.env.example +73 -0
  36. package/vendors/piste/app/core/__init__.py +4 -0
  37. package/vendors/piste/app/core/config.py +83 -0
  38. package/vendors/piste/app/core/debuglog.py +16 -0
  39. package/vendors/piste/app/core/middleware.py +40 -0
  40. package/vendors/piste/bridge_piste.py +301 -0
  41. package/vendors/piste/pipeline/__init__.py +4 -0
  42. package/vendors/piste/pipeline/compiler.py +68 -0
  43. package/vendors/piste/pipeline/offline/__init__.py +28 -0
  44. package/vendors/piste/pipeline/offline/verifaid_pipeline.py +247 -0
  45. package/vendors/piste/pipeline/replay.py +15 -0
  46. package/vendors/piste/pipeline/replay_engine.py +249 -0
  47. package/vendors/piste/pipeline/signatures/__init__.py +4 -0
  48. package/vendors/piste/pipeline/signatures/signatures.py +136 -0
  49. package/vendors/piste/pipeline/stage1/__init__.py +21 -0
  50. package/vendors/piste/pipeline/stage1/atomic_decomposer.py +61 -0
  51. package/vendors/piste/pipeline/stage1/check_worthiness.py +100 -0
  52. package/vendors/piste/pipeline/stage1/orchestrator.py +175 -0
  53. package/vendors/piste/pipeline/stage1/test_stage1.py +162 -0
  54. package/vendors/piste/pipeline/stage2/__init__.py +34 -0
  55. package/vendors/piste/pipeline/stage2/blind_retriever.py +303 -0
  56. package/vendors/piste/pipeline/stage2/canonical_mapper.py +124 -0
  57. package/vendors/piste/pipeline/stage2/credibility_scorer.py +85 -0
  58. package/vendors/piste/pipeline/stage2/orchestrator.py +311 -0
  59. package/vendors/piste/pipeline/stage2/query_refiner.py +88 -0
  60. package/vendors/piste/pipeline/stage2/search_decision.py +69 -0
  61. package/vendors/piste/pipeline/stage2/test_stage2.py +265 -0
  62. package/vendors/piste/pipeline/stage3/__init__.py +20 -0
  63. package/vendors/piste/pipeline/stage3/classifier.py +79 -0
  64. package/vendors/piste/pipeline/stage3/orchestrator.py +225 -0
  65. package/vendors/piste/pipeline/stage3/test_stage3.py +101 -0
  66. package/vendors/piste/pipeline/stage4/__init__.py +33 -0
  67. package/vendors/piste/pipeline/stage4/criticality_gate.py +177 -0
  68. package/vendors/piste/pipeline/stage4/orchestrator.py +269 -0
  69. package/vendors/piste/pipeline/stage4/test_stage4.py +192 -0
  70. package/vendors/piste/pipeline/stage4/verdict_aggregator.py +157 -0
  71. package/vendors/piste/requirements.txt +53 -0
  72. package/vendors/precis/backend/__init__.py +6 -0
  73. package/vendors/precis/backend/agents/__init__.py +3 -0
  74. package/vendors/precis/backend/agents/data_synthesis.py +105 -0
  75. package/vendors/precis/backend/agents/dist_free_synth.py +97 -0
  76. package/vendors/precis/backend/agents/exact_hash_retriever.py +327 -0
  77. package/vendors/precis/backend/agents/fusion_ranker.py +64 -0
  78. package/vendors/precis/backend/agents/guardrail.py +175 -0
  79. package/vendors/precis/backend/agents/query_expander.py +89 -0
  80. package/vendors/precis/backend/agents/radial_interpol.py +99 -0
  81. package/vendors/precis/backend/agents/report_generator.py +92 -0
  82. package/vendors/precis/backend/agents/semantic_reranker.py +135 -0
  83. package/vendors/precis/backend/agents/stat_anomaly.py +93 -0
  84. package/vendors/precis/backend/agents/vector_index.py +123 -0
  85. package/vendors/precis/backend/agents/veri_score.py +341 -0
  86. package/vendors/precis/backend/agents/work_order_extractor.py +205 -0
  87. package/vendors/precis/backend/api/__init__.py +3 -0
  88. package/vendors/precis/backend/api/routes/__init__.py +3 -0
  89. package/vendors/precis/backend/config.py +88 -0
  90. package/vendors/precis/backend/core/__init__.py +13 -0
  91. package/vendors/precis/backend/core/hashing.py +22 -0
  92. package/vendors/precis/backend/core/metrics.py +77 -0
  93. package/vendors/precis/backend/core/multitoken.py +166 -0
  94. package/vendors/precis/backend/core/pmi.py +54 -0
  95. package/vendors/precis/backend/core/stemming.py +74 -0
  96. package/vendors/precis/backend/core/tracing.py +150 -0
  97. package/vendors/precis/backend/data/__init__.py +3 -0
  98. package/vendors/precis/backend/data/chunker.py +57 -0
  99. package/vendors/precis/backend/data/pdf_parser.py +42 -0
  100. package/vendors/precis/backend/db/__init__.py +3 -0
  101. package/vendors/precis/backend/db/models.py +173 -0
  102. package/vendors/precis/backend/db/repository.py +269 -0
  103. package/vendors/precis/backend/llm/__init__.py +3 -0
  104. package/vendors/precis/backend/llm/anthropic_provider.py +39 -0
  105. package/vendors/precis/backend/llm/base.py +147 -0
  106. package/vendors/precis/backend/llm/deepseek_provider.py +43 -0
  107. package/vendors/precis/backend/llm/factory.py +60 -0
  108. package/vendors/precis/backend/llm/google_provider.py +39 -0
  109. package/vendors/precis/backend/llm/ollama_provider.py +54 -0
  110. package/vendors/precis/backend/llm/openai_provider.py +50 -0
  111. package/vendors/precis/backend/main.py +677 -0
  112. package/vendors/precis/backend/orchestrator/__init__.py +3 -0
  113. package/vendors/precis/backend/orchestrator/planner.py +81 -0
  114. package/vendors/precis/backend/orchestrator/router.py +319 -0
  115. package/vendors/precis/backend/orchestrator/types.py +58 -0
  116. package/vendors/precis/bridge_precis.py +185 -0
  117. package/vendors/precis/data/sample_reports/README.md +8 -0
  118. package/vendors/precis/data/seed_data.py +115 -0
  119. package/vendors/precis/requirements.txt +19 -0
@@ -0,0 +1,22 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
2
+
3
+ import hashlib
4
+ from typing import Tuple
5
+
6
+
7
def make_query_hash(query_tokens: Tuple[str, ...]) -> str:
    """Return a short, deterministic cache key for a tokenised query.

    The tokens are joined with ``|``, encoded as UTF-8 and hashed with
    SHA-256; only the first 16 hex characters of the digest are returned.
    """
    digest = hashlib.sha256()
    digest.update("|".join(query_tokens).encode("utf-8"))
    return digest.hexdigest()[:16]
11
+
12
+
13
def make_document_hash(source_doc: str, source_page: int, source_position: int) -> str:
    """Return a 16-hex-character SHA-256 prefix identifying a document location.

    The key is built from ``source_doc|source_page|source_position`` so the
    same file, page and position always map to the same hash.
    """
    location = "|".join((f"{source_doc}", f"{source_page}", f"{source_position}"))
    full_digest = hashlib.sha256(location.encode("utf-8")).hexdigest()
    return full_digest[:16]
17
+
18
+
19
def make_session_key(session_id: str) -> str:
    """Return a normalised session key.

    Surrounding whitespace is removed and the id is lower-cased before it is
    hashed with SHA-256; the first 16 hex characters of the digest are returned.
    """
    canonical_id = session_id.strip().lower().encode("utf-8")
    hasher = hashlib.sha256(canonical_id)
    return hasher.hexdigest()[:16]
@@ -0,0 +1,77 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
2
+
3
+ import math
4
+ import numpy as np
5
+ from typing import List, Dict, Any
6
+
7
+
8
def compute_relevancy(query_tokens: List[str], chunk_tokens_list: List[List[str]],
                      pmi_scorer=None) -> float:
    """Mean Jaccard similarity between the query tokens and each chunk's tokens.

    Returns 0.0 when there are no chunks. A chunk whose union with the query
    is empty contributes 0.0. ``pmi_scorer`` is accepted as a parameter but is
    not read by this function.
    """
    if not chunk_tokens_list:
        return 0.0

    query_vocab = set(query_tokens)

    def jaccard(tokens: List[str]) -> float:
        chunk_vocab = set(tokens)
        combined = query_vocab | chunk_vocab
        if not combined:
            return 0.0
        return len(query_vocab & chunk_vocab) / len(combined)

    similarities = [jaccard(tokens) for tokens in chunk_tokens_list]
    return sum(similarities) / len(similarities)
22
+
23
+
24
def compute_trustworthiness(source_metadata: List[Dict[str, Any]]) -> float:
    """Average a per-source score derived from metadata flags.

    Each source starts at 0.5 and gains 0.2 when ``is_title`` is truthy,
    0.1 when ``font_size`` exceeds 14, and 0.15 when ``token_type`` equals
    ``"contextual"``; the per-source score is capped at 1.0. Returns 0.5 when
    no metadata is supplied.
    """
    if not source_metadata:
        return 0.5

    per_source: List[float] = []
    for meta in source_metadata:
        # Bonuses are applied in a fixed order on top of the 0.5 baseline.
        bonuses = (
            (bool(meta.get("is_title")), 0.2),
            (meta.get("font_size", 0) > 14, 0.1),
            (meta.get("token_type") == "contextual", 0.15),
        )
        score = 0.5
        for applies, bonus in bonuses:
            if applies:
                score += bonus
        per_source.append(min(score, 1.0))
    return sum(per_source) / len(per_source)
39
+
40
+
41
def compute_hallucination_rate(generated_claims: List[str],
                               source_evidence: List[Dict[str, Any]]) -> float:
    """Proportion of claims not supported by source evidence.

    A claim counts as supported when at least one of its whitespace-separated
    words longer than three characters occurs (case-insensitively, as a
    substring) in the ``"text"`` of any evidence entry.

    Returns 0.0 when there are no claims. Evidence entries whose ``"text"`` is
    missing or ``None`` are treated as empty text.
    """
    if not generated_claims:
        return 0.0

    # Lower-case every evidence text once instead of once per claim.
    evidence_texts = [str(evidence.get("text") or "").lower()
                      for evidence in source_evidence]

    supported = 0
    for claim in generated_claims:
        keywords = [word for word in claim.lower().split() if len(word) > 3]
        if any(word in text for text in evidence_texts for word in keywords):
            supported += 1
    return 1.0 - (supported / len(generated_claims))
55
+
56
+
57
def compute_r_squared(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Coefficient of determination. 1.0 = perfect prediction.

    Inputs are coerced to float arrays, so plain Python sequences are accepted
    as well as NumPy arrays. When ``y_true`` has zero variance the result is
    1.0 if the predictions match exactly and 0.0 otherwise.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    ss_res = float(np.sum((actual - predicted) ** 2))
    ss_tot = float(np.sum((actual - np.mean(actual)) ** 2))
    if ss_tot == 0:
        # Constant target: R² is undefined, so fall back to exact-match semantics.
        return 1.0 if ss_res == 0 else 0.0
    return 1.0 - (ss_res / ss_tot)
64
+
65
+
66
def compute_hellinger_distance(p_counts: Dict[str, int],
                               q_counts: Dict[str, int]) -> float:
    """Hellinger distance between two categorical count tables. 0 = identical.

    Counts are normalised by their totals (a total of zero is treated as 1 to
    avoid division by zero); categories missing from one table count as 0.
    """
    p_norm = max(1, sum(p_counts.values()))
    q_norm = max(1, sum(q_counts.values()))

    squared_gap = 0.0
    for category in set(p_counts) | set(q_counts):
        root_p = math.sqrt(p_counts.get(category, 0) / p_norm)
        root_q = math.sqrt(q_counts.get(category, 0) / q_norm)
        squared_gap += (root_p - root_q) ** 2
    return math.sqrt(0.5 * squared_gap)
@@ -0,0 +1,166 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT
2
+
3
+ Optimised: pre-stems all words once (O(N) stemming) then slides an n-gram window
4
+ over the pre-stemmed tokens — a ~12× speedup vs. stemming each n-gram independently.
5
+ Element-level extraction can run in parallel via index_document_async().
6
+ """
7
+
8
+ import asyncio
9
+ from typing import List, Tuple
10
+
11
+ from backend.agents.exact_hash_retriever import MultiToken, NestedHashIndex
12
+ from backend.core.stemming import PrecisStemmer
13
+
14
+
15
class MultiTokenExtractor:
    """Extracts variable-length multi-tokens (n-grams of stemmed words) from document text.

    Parameters
    ----------
    max_token_length : int
        Maximum n-gram length (inclusive).
    min_token_length : int
        Minimum n-gram length (inclusive).
    """

    def __init__(self, max_token_length: int = 7, min_token_length: int = 2) -> None:
        self.max_token_length = max_token_length
        self.min_token_length = min_token_length
        self.stemmer = PrecisStemmer()

    # ── Core extraction (pre-stemmed, single-element) ──────────────

    def extract(self, text: str, source_doc: str, source_page: int,
                font_size: float = 12.0, is_title: bool = False,
                is_header: bool = False,
                global_position: int = 0) -> List[MultiToken]:
        """Extract all valid multi-tokens from a text segment.

        Each word is stemmed once; words whose stem is empty (filtered by the
        stemmer) are dropped, and n-grams of ``min_token_length`` to
        ``max_token_length`` stems are formed from what remains.

        ``metadata["original_words"]`` holds the surface words that produced
        the n-gram's stems, so both sequences always have the same length and
        refer to the same words even when filtered words sat between them.

        ``global_position`` is stored unchanged as ``source_position`` on every
        multi-token produced from this segment; the indexing methods of this
        class pass the element's running index within the document.
        """
        words = text.strip().split()
        if len(words) < self.min_token_length:
            return []

        # Stem once per word and keep (surface, stem) together so that the
        # surface words stay aligned with the stems after filtering.
        kept = [(word, stem) for word in words if (stem := self.stemmer.stem(word))]
        if len(kept) < self.min_token_length:
            return []

        surface_words = [word for word, _ in kept]
        stemmed_words = [stem for _, stem in kept]

        token_type = "contextual" if (is_title or is_header) else "standard"
        multitokens: List[MultiToken] = []

        n = len(stemmed_words)
        for start in range(n):
            longest = min(self.max_token_length, n - start)
            for length in range(self.min_token_length, longest + 1):
                end = start + length
                multitokens.append(MultiToken(
                    tokens=tuple(stemmed_words[start:end]),
                    token_type=token_type,
                    source_doc=source_doc,
                    source_page=source_page,
                    source_position=global_position,  # element index, not word index
                    font_size=font_size,
                    is_title=is_title,
                    is_header=is_header,
                    metadata={"original_words": surface_words[start:end]},
                ))
        return multitokens

    # ── Document indexing (sequential) ────────────────────────────

    def index_document(self, doc_path: str, parsed_content: List[dict],
                       index: NestedHashIndex) -> int:
        """Extract multi-tokens from every element and insert them into ``index``.

        Elements are numbered consecutively across pages (starting at 0) and
        that number is passed to ``extract`` as ``global_position``.
        Returns the number of multi-tokens inserted.
        """
        total = 0
        global_line = 0  # running element index across the whole document
        for page in parsed_content:
            page_num = page.get("page_number", 0)
            for element in page.get("elements", []):
                mts = self.extract(
                    element.get("text", ""), doc_path, page_num,
                    font_size=element.get("font_size", 12.0),
                    is_title=element.get("is_title", False),
                    is_header=element.get("is_header", False),
                    global_position=global_line,
                )
                for mt in mts:
                    index.insert(mt)
                total += len(mts)
                global_line += 1
        return total

    # ── Document indexing (async / parallel pages) ──────────────

    async def index_document_async(self, doc_path: str,
                                   parsed_content: List[dict],
                                   index: NestedHashIndex,
                                   max_tasks: int = 32) -> int:
        """Like index_document(), but extracts **pages** concurrently.

        One task is created per page that has elements; each task extracts all
        of that page's elements sequentially in a worker thread via
        ``asyncio.to_thread``. Insertion into ``index`` happens afterwards on
        the calling task, in page order.

        Every page's first element index is computed up front, so the
        ``global_position`` given to each element is the same value
        ``index_document`` would assign, regardless of the order in which the
        worker threads run.

        Parameters
        ----------
        max_tasks : int
            Upper bound on page-extraction tasks in flight at once.
        """
        # Assign each page its starting element index before any thread runs;
        # a counter mutated from the workers would depend on scheduling.
        jobs: List[Tuple[dict, int]] = []
        next_line = 0
        for page in parsed_content:
            elements = page.get("elements") or []
            if elements:
                jobs.append((page, next_line))
            next_line += len(elements)

        if not jobs:
            return 0

        def extract_page(page: dict, first_line: int) -> List[MultiToken]:
            page_num = page.get("page_number", 0)
            all_mts: List[MultiToken] = []
            for offset, element in enumerate(page.get("elements") or []):
                text = element.get("text", "")
                if not text.strip():
                    continue  # blank element: still occupies an index, yields nothing
                all_mts.extend(self.extract(
                    text, doc_path, page_num,
                    font_size=element.get("font_size", 12.0),
                    is_title=element.get("is_title", False),
                    is_header=element.get("is_header", False),
                    global_position=first_line + offset,
                ))
            return all_mts

        # ── Throttle: at most max_tasks in flight at once ─────
        sem = asyncio.Semaphore(max_tasks)

        async def bounded_extract(page: dict, first_line: int) -> List[MultiToken]:
            async with sem:
                return await asyncio.to_thread(extract_page, page, first_line)

        results: List[List[MultiToken]] = await asyncio.gather(
            *(bounded_extract(page, first_line) for page, first_line in jobs)
        )

        # ── Batch-insert all MultiTokens ──────────────────────
        total = 0
        for mts in results:
            for mt in mts:
                index.insert(mt)
            total += len(mts)
        return total
@@ -0,0 +1,54 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
2
+
3
+ import math
4
+ from typing import Dict, List, Tuple
5
+ from collections import defaultdict
6
+
7
+
8
class PMIScorer:
    """Accumulates token and token-pair counts and scores token lists by average PMI."""

    def __init__(self) -> None:
        self.token_count: Dict[str, int] = defaultdict(int)
        self.pair_count: Dict[Tuple[str, str], int] = defaultdict(int)
        self.total_tokens: int = 0

    @staticmethod
    def _pair_key(first: str, second: str) -> Tuple[str, str]:
        """Return the two tokens as an order-independent (smaller, larger) key."""
        return (first, second) if first <= second else (second, first)

    def ingest_tokens(self, tokens: List[str]) -> None:
        """Add one token sequence to the statistics.

        Every token increments its own count and the running total; every
        pair of positions (i < j) in the sequence increments that pair's count.
        """
        for position, token in enumerate(tokens):
            self.token_count[token] += 1
            self.total_tokens += 1
            for later_token in tokens[position + 1:]:
                self.pair_count[self._pair_key(token, later_token)] += 1

    def score(self, query_tokens: List[str], chunk_tokens: List[str]) -> float:
        """Average PMI over the (query, chunk) token pairs that have been seen together.

        Pairs with a zero joint count are skipped. Returns 0.0 when either
        list is empty or no pair contributes.
        """
        if not (query_tokens and chunk_tokens):
            return 0.0

        denominator = max(1, self.total_tokens)
        observed: List[float] = []
        for query_token in query_tokens:
            p_q = self.token_count.get(query_token, 0) / denominator
            for chunk_token in chunk_tokens:
                joint = self.pair_count.get(self._pair_key(query_token, chunk_token), 0)
                if joint == 0:
                    continue
                p_c = self.token_count.get(chunk_token, 0) / denominator
                p_joint = joint / denominator
                if p_q > 0 and p_c > 0:
                    observed.append(math.log2(p_joint / (p_q * p_c)))

        if not observed:
            return 0.0
        return sum(observed) / len(observed)

    def normalize_score(self, raw_score: float, max_observed: float = 10.0) -> float:
        """Clamp ``raw_score`` to [0, max_observed] and scale it to [0, 1]."""
        floored = max(raw_score, 0.0)
        clipped = min(floored, max_observed)
        return clipped / max_observed
47
+
48
+
49
# Global singleton — shared across index, search, and upload.
# Created once when this module is first imported.
_pmi_scorer: PMIScorer = PMIScorer()


def get_pmi_scorer() -> PMIScorer:
    """Return the module-level PMIScorer instance (the same object on every call)."""
    return _pmi_scorer
@@ -0,0 +1,74 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
2
+
3
+ from typing import List, Set
4
+ from nltk.stem import PorterStemmer
5
+
6
+
7
+ # ── Lazy-loaded NLTK stopwords, minus content-bearing words ────
8
+ # Words like "other", "more", "same" are stopwords in general NLP
9
+ # but ARE content in document section titles and technical text.
10
+ _CONTENT_WORDS_TO_KEEP: Set[str] = {
11
+ "other", "more", "most", "some", "such", "only", "own", "same",
12
+ "very", "just", "both", "few", "each", "every", "any", "all",
13
+ "no", "not", "nor", # negation is semantically important
14
+ }
15
+
16
+ def _load_nltk_stopwords() -> Set[str]:
17
+ """Return NLTK stopwords minus content-bearing words."""
18
+ try:
19
+ from nltk.corpus import stopwords
20
+ return set(stopwords.words("english")) - _CONTENT_WORDS_TO_KEEP
21
+ except (ImportError, LookupError, OSError):
22
+ pass
23
+ # Minimal fallback
24
+ return {"i", "me", "my", "we", "our", "you", "your", "he", "him",
25
+ "his", "she", "her", "it", "its", "they", "them", "their",
26
+ "this", "that", "these", "those", "am", "is", "are", "was",
27
+ "were", "be", "been", "being", "have", "has", "had", "do",
28
+ "does", "did", "a", "an", "the", "and", "but", "if", "or",
29
+ "because", "as", "of", "at", "by", "for", "with", "about",
30
+ "between", "into", "through", "during", "before", "after",
31
+ "to", "from", "in", "on", "off", "over", "under",
32
+ "can", "will", "should", "now", "don", "doesn", "didn",
33
+ "won", "wouldn", "couldn", "shouldn", "isn", "aren"}
34
+
35
+
36
class PrecisStemmer:
    """Porter stemmer wrapper that preserves known acronyms and drops stopwords.

    ``_KNOWN_ACRONYMS`` and ``_STOPWORDS`` are class-level sets, so they are
    shared by every instance.
    """

    _KNOWN_ACRONYMS: Set[str] = {
        "kyc", "aml", "esg", "gaap", "ifrs", "sec", "fdic", "finra",
        "soc", "iso", "hipaa", "gdpr", "ccpa", "sox", "cfpb",
    }

    # ── Precis-specific additions (query-structure words) ─────
    _PRECIS_STOPWORDS: Set[str] = {
        "summarize", "summary", "summarise", "explain", "describe",
        "list", "identify", "compare", "contrast", "discuss", "analyze",
        "key", "finding", "findings", "detail", "details", "overview",
        "section", "chapter", "paragraph", "figure", "table", "page",
        "get", "make", "made", "see", "show", "shown", "find", "found",
    }

    # ── Merged set: NLTK standard + Precis custom (built when the class is defined) ──
    _STOPWORDS: Set[str] = _load_nltk_stopwords() | _PRECIS_STOPWORDS

    def __init__(self) -> None:
        self._stemmer = PorterStemmer()

    def stem(self, word: str) -> str:
        """Return the stem of ``word``, or ``""`` when the word is filtered out.

        The word is stripped and lower-cased first. Known acronyms are returned
        unstemmed (and take precedence over the stopword list); blank input and
        stopwords yield the empty string.
        """
        normalized = word.strip().lower()
        if normalized in self._KNOWN_ACRONYMS:
            return normalized
        if not normalized or normalized in self._STOPWORDS:
            return ""
        return self._stemmer.stem(normalized)

    def stem_tokens(self, tokens: List[str]) -> List[str]:
        """Stem each token in order, omitting those that stem to the empty string."""
        stems = (self.stem(token) for token in tokens)
        return [stem for stem in stems if stem]

    def add_acronym(self, acronym: str) -> None:
        """Register an acronym (stripped, lower-cased) to preserve during stemming.

        The acronym is added to the class-level set, so it takes effect for
        all PrecisStemmer instances.
        """
        self._KNOWN_ACRONYMS.add(acronym.strip().lower())
@@ -0,0 +1,150 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
2
+
3
+ import json
4
+ import time
5
+ import uuid
6
+ from dataclasses import dataclass, field, asdict
7
+ from datetime import datetime, timezone
8
+ from enum import Enum
9
+ from typing import Callable, Dict, List, Optional, Any
10
+
11
+
12
class TraceEventType(str, Enum):
    """Dotted string identifiers for the kinds of events a trace can record.

    Members are also ``str`` instances; ``.value`` is the dotted name that
    ``TraceEvent.to_dict`` writes out.
    """

    # Query lifecycle
    QUERY_STARTED = "query.started"
    QUERY_COMPLETED = "query.completed"
    QUERY_FAILED = "query.failed"
    # Planning
    PLAN_CREATED = "plan.created"
    # Agent lifecycle
    AGENT_STARTED = "agent.started"
    AGENT_COMPLETED = "agent.completed"
    AGENT_FAILED = "agent.failed"
    # Decisions
    DECISION_SEARCH_TYPE = "decision.search_type"
    DECISION_THRESHOLD = "decision.threshold"
    DECISION_PREDICTION = "decision.prediction"
    # LLM calls
    LLM_CALL_STARTED = "llm.call_started"
    LLM_CALL_COMPLETED = "llm.call_completed"
    LLM_TOKEN_USAGE = "llm.token_usage"
    # Evaluation, guardrails and results
    EVALUATION_COMPLETED = "evaluation.completed"
    GUARDRAIL_ACTION = "guardrail.action"
    RESULT_FOUND = "result.found"
    ANOMALY_FLAGGED = "anomaly.flagged"
    CITATION_ADDED = "citation.added"
31
+
32
+
33
@dataclass
class TraceEvent:
    """A single timestamped trace event, optionally attached to a span."""

    # First 8 characters of a random UUID4.
    event_id: str = field(default_factory=lambda: str(uuid.uuid4())[:8])
    event_type: TraceEventType = TraceEventType.QUERY_STARTED
    agent_name: str = ""
    span_id: Optional[str] = None
    message: str = ""
    data: Dict[str, Any] = field(default_factory=dict)
    # ISO-8601 UTC timestamp taken when the event object is created.
    timestamp: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())
    duration_ms: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Return the event as a plain dict; ``event_type`` is emitted as its string value."""
        exported = ("event_id", "event_type", "agent_name", "span_id",
                    "message", "data", "timestamp", "duration_ms")
        serialized: Dict[str, Any] = {name: getattr(self, name) for name in exported}
        serialized["event_type"] = self.event_type.value
        return serialized
49
+
50
+
51
@dataclass
class TraceSpan:
    """A timed unit of work that holds its own events and nested child spans."""

    # First 8 characters of a random UUID4.
    span_id: str = field(default_factory=lambda: str(uuid.uuid4())[:8])
    parent_span_id: Optional[str] = None
    agent_name: str = ""
    operation: str = ""
    start_time: float = 0.0
    end_time: float = 0.0
    events: List[TraceEvent] = field(default_factory=list)
    child_spans: List["TraceSpan"] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def duration_ms(self) -> float:
        """Elapsed milliseconds, or 0.0 while either timestamp is still unset (falsy)."""
        if not (self.end_time and self.start_time):
            return 0.0
        return (self.end_time - self.start_time) * 1000

    def to_dict(self) -> Dict[str, Any]:
        """Return the span, its events and its child spans (recursively) as plain dicts."""
        serialized: Dict[str, Any] = {
            "span_id": self.span_id,
            "parent_span_id": self.parent_span_id,
            "agent_name": self.agent_name,
            "operation": self.operation,
            "duration_ms": self.duration_ms,
        }
        serialized["events"] = [event.to_dict() for event in self.events]
        serialized["child_spans"] = [child.to_dict() for child in self.child_spans]
        serialized["metadata"] = self.metadata
        return serialized
74
+
75
+
76
class TraceCollector:
    """Collects spans and events for one query and serialises them.

    Spans are tracked on a stack: a span started while another is open becomes
    its child. Events are attached to the innermost open span (if any), kept
    in a flat list, and forwarded to an optional stream callback.
    """

    def __init__(self, query_id: str, session_id: Optional[str] = None) -> None:
        self.trace_id = str(uuid.uuid4())
        self.query_id = query_id
        self.session_id = session_id
        self._span_stack: List[TraceSpan] = []
        self._root_spans: List[TraceSpan] = []
        self._events: List[TraceEvent] = []
        self._query_start_time = time.time()
        # Set by complete(); while None the trace is still running.
        self._query_end_time: Optional[float] = None
        self._status = "running"
        self._stream_callback: Optional[Callable[[Dict[str, Any]], None]] = None

    def span_start(self, agent_name: str, operation: str,
                   metadata: Optional[dict] = None) -> str:
        """Open a span nested under the current innermost span and return its id."""
        span = TraceSpan(agent_name=agent_name, operation=operation,
                         start_time=time.time(), metadata=metadata or {})
        if self._span_stack:
            parent = self._span_stack[-1]
            span.parent_span_id = parent.span_id
            parent.child_spans.append(span)
        else:
            self._root_spans.append(span)
        self._span_stack.append(span)
        return span.span_id

    def span_end(self, metadata: Optional[dict] = None) -> str:
        """Close the innermost open span and return its id ("" if none is open).

        ``metadata``, when given, is merged into the span's metadata.
        """
        if not self._span_stack:
            return ""
        span = self._span_stack.pop()
        span.end_time = time.time()
        if metadata:
            span.metadata.update(metadata)
        return span.span_id

    def event(self, event_type: TraceEventType, agent_name: str = "",
              message: str = "", data: Optional[Dict[str, Any]] = None,
              duration_ms: float = 0.0) -> TraceEvent:
        """Record an event, attach it to the innermost open span, and stream it."""
        evt = TraceEvent(event_type=event_type, agent_name=agent_name,
                         message=message, data=data or {}, duration_ms=duration_ms)
        if self._span_stack:
            evt.span_id = self._span_stack[-1].span_id
            self._span_stack[-1].events.append(evt)
        self._events.append(evt)
        if self._stream_callback:
            self._stream_callback(evt.to_dict())
        return evt

    def set_stream_callback(self, callback: Callable[[Dict[str, Any]], None]) -> None:
        """Register a callable that receives each event's dict as it is recorded."""
        self._stream_callback = callback

    def complete(self, status: str = "success") -> None:
        """Close all open spans, freeze the total duration, and emit the final event.

        Emits QUERY_COMPLETED when ``status`` is "success", otherwise QUERY_FAILED.
        """
        while self._span_stack:
            self.span_end()
        self._status = status
        # Freeze the end time first so the final event and any later
        # to_dict() call report the same duration.
        self._query_end_time = time.time()
        final_type = (TraceEventType.QUERY_COMPLETED if status == "success"
                      else TraceEventType.QUERY_FAILED)
        self.event(final_type, message=f"Query {status}",
                   data={"duration_ms": self.get_total_duration_ms()})

    def to_dict(self) -> Dict[str, Any]:
        """Return the whole trace as a plain dict.

        ``created_at`` is the UTC time at which this collector was created, so
        repeated calls return the same value.
        """
        created_at = datetime.fromtimestamp(self._query_start_time, timezone.utc)
        return {"trace_id": self.trace_id, "query_id": self.query_id,
                "session_id": self.session_id, "status": self._status,
                "duration_ms": self.get_total_duration_ms(),
                "agent_count": len(self._root_spans),
                "event_count": len(self._events),
                "root_spans": [s.to_dict() for s in self._root_spans],
                "events": [e.to_dict() for e in self._events],
                "created_at": created_at.isoformat()}

    def to_json(self) -> str:
        """Serialise to_dict() as JSON; non-JSON values are converted with ``str``."""
        return json.dumps(self.to_dict(), default=str)

    def get_event_count(self) -> int:
        """Return the number of events recorded so far."""
        return len(self._events)

    def get_total_duration_ms(self) -> float:
        """Milliseconds from creation to completion, or to now while still running."""
        end_time = self._query_end_time if self._query_end_time is not None else time.time()
        return (end_time - self._query_start_time) * 1000
@@ -0,0 +1,3 @@
1
+ # =============================================================================
2
+ # © JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT
3
+ # =============================================================================
@@ -0,0 +1,57 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any, Dict, List
5
+
6
+
7
@dataclass
class Chunk:
    """A piece of page text plus where it came from."""

    chunk_id: str
    text: str
    source_doc: str
    source_page: int
    # Character offsets; see HierarchicalChunker.chunk_page for the coordinate system.
    start_position: int
    end_position: int
    level: str = "paragraph"
    context_type: str = "body"
    font_size: float = 12.0
    is_title: bool = False
    is_header: bool = False


class HierarchicalChunker:
    """Two-level chunking of page text: paragraphs, then sentences within each.

    ``overlap_ratio`` is stored on the instance but is not applied by the
    chunking methods in this class.
    """

    # A final sentence already ending in one of these is not given an extra ".".
    _SENTENCE_END = (".", "!", "?")

    def __init__(self, overlap_ratio: float = 0.2) -> None:
        self.overlap_ratio = overlap_ratio

    def chunk_page(self, page_number: int, elements: List[Dict[str, Any]],
                   source_doc: str) -> List[Chunk]:
        """Split one page into paragraph chunks, each followed by its sentence chunks.

        Element texts are joined with single spaces and split into paragraphs
        on blank lines (``"\\n\\n"``). Paragraph offsets count from 0 at the
        first paragraph and assume a two-character separator between
        paragraphs. Sentence offsets are the paragraph's offset plus the
        sentence's position inside that paragraph.

        Returns an empty list when the page has no non-whitespace text.
        """
        chunks: List[Chunk] = []
        full_text = " ".join(e.get("text", "") for e in elements)
        if not full_text.strip():
            return chunks

        paragraphs = [p.strip() for p in full_text.split("\n\n") if p.strip()]
        pos = 0
        for i, para in enumerate(paragraphs):
            para_start = pos
            para_id = f"{source_doc}_p{page_number}_para{i}"
            chunks.append(Chunk(
                chunk_id=para_id,
                text=para, source_doc=source_doc, source_page=page_number,
                start_position=para_start, end_position=para_start + len(para),
                level="paragraph"))
            pos += len(para) + 2
            chunks.extend(self._sentence_chunks(
                para, para_start, para_id, source_doc, page_number))
        return chunks

    def _sentence_chunks(self, para: str, para_start: int, para_id: str,
                         source_doc: str, page_number: int) -> List[Chunk]:
        """Build sentence-level chunks for one paragraph starting at ``para_start``."""
        # Replacing "\n" with " " keeps the length, so offsets still match ``para``.
        pieces = para.replace("\n", " ").split(". ")
        last_index = len(pieces) - 1

        sentence_chunks: List[Chunk] = []
        offset = 0          # offset of the current piece inside the paragraph
        sentence_index = 0  # counts non-empty pieces; used in the chunk id
        for piece_index, piece in enumerate(pieces):
            piece_offset = offset
            offset += len(piece) + 2  # skip the ". " consumed by split()

            body = piece.strip()
            if not body:
                continue
            j = sentence_index
            sentence_index += 1

            if piece_index == last_index and body.endswith(self._SENTENCE_END):
                # The last piece keeps its own punctuation; do not double it.
                sent = body
            else:
                # Restore the period removed by split() (or terminate the last piece).
                sent = body + "."
            if len(sent) <= 10:
                continue

            leading_ws = len(piece) - len(piece.lstrip())
            start = para_start + piece_offset + leading_ws
            sentence_chunks.append(Chunk(
                chunk_id=f"{para_id}_sent{j}",
                text=sent, source_doc=source_doc, source_page=page_number,
                start_position=start, end_position=start + len(sent),
                level="sentence"))
        return sentence_chunks

    def chunk_document(self, parsed_pages: List[Dict[str, Any]], source_doc: str) -> List[Chunk]:
        """Chunk every page in order and return all chunks as one flat list."""
        all_chunks: List[Chunk] = []
        for page in parsed_pages:
            all_chunks.extend(self.chunk_page(page.get("page_number", 0),
                                              page.get("elements", []), source_doc))
        return all_chunks
@@ -0,0 +1,42 @@
1
+ """© JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT"""
2
+
3
+ from typing import Any, Dict, List
4
+
5
+
6
class PDFParser:
    """Extracts text with font metadata for contextual multi-token scoring."""

    def parse(self, file_path: str) -> List[Dict[str, Any]]:
        """Parse ``file_path`` into a list of pages.

        Each page is ``{"page_number": int, "elements": [...]}`` where every
        element has ``text``, ``font_size``, ``is_title`` (font size > 16) and
        ``is_header`` (font size between 14 and 16 inclusive). Page numbers
        start at 1.

        When PyMuPDF (``fitz``) cannot be imported, the file is read as UTF-8
        text (undecodable bytes ignored) and returned as a single page with a
        single element.
        """
        try:
            import fitz  # pymupdf
        except ImportError:
            return self._parse_plain_text(file_path)

        pages: List[Dict[str, Any]] = []
        doc = fitz.open(file_path)
        try:
            for page_num, page in enumerate(doc):
                elements: List[Dict[str, Any]] = []
                for block in page.get_text("dict")["blocks"]:
                    if "lines" not in block:
                        continue  # block without a "lines" entry carries no spans
                    for line in block["lines"]:
                        for span in line["spans"]:
                            font_size = span.get("size", 12.0)
                            text = span.get("text", "").strip()
                            if text:
                                elements.append({"text": text, "font_size": font_size,
                                                 "is_title": font_size > 16,
                                                 "is_header": 14 <= font_size <= 16})
                pages.append({"page_number": page_num + 1, "elements": elements})
        finally:
            # Release the document even if extraction raised part-way through.
            doc.close()
        return pages

    @staticmethod
    def _parse_plain_text(file_path: str) -> List[Dict[str, Any]]:
        """Fallback: return the whole file as one body-text element on page 1."""
        with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
            text = handle.read()
        return [{"page_number": 1, "elements": [{"text": text, "font_size": 12.0,
                                                 "is_title": False, "is_header": False}]}]

    def get_sections(self, parsed_content: List[Dict]) -> List[Dict]:
        """Flatten parsed pages into one list of ``{text, page, is_title, is_header}`` dicts."""
        sections: List[Dict] = []
        for page in parsed_content:
            for el in page.get("elements", []):
                sections.append({"text": el["text"], "page": page["page_number"],
                                 "is_title": el["is_title"], "is_header": el["is_header"]})
        return sections
@@ -0,0 +1,3 @@
1
+ # =============================================================================
2
+ # © JINAN KORDAB — 2026 AI HYBRID AGENTIC RETRIEVAL-AUGMENTED GENERATION RAG PIPELINE - PERSONAL PROJECT
3
+ # =============================================================================