parishad 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. parishad/__init__.py +70 -0
  2. parishad/__main__.py +10 -0
  3. parishad/checker/__init__.py +25 -0
  4. parishad/checker/deterministic.py +644 -0
  5. parishad/checker/ensemble.py +496 -0
  6. parishad/checker/retrieval.py +546 -0
  7. parishad/cli/__init__.py +6 -0
  8. parishad/cli/code.py +3254 -0
  9. parishad/cli/main.py +1158 -0
  10. parishad/cli/prarambh.py +99 -0
  11. parishad/cli/sthapana.py +368 -0
  12. parishad/config/modes.py +139 -0
  13. parishad/config/pipeline.core.yaml +128 -0
  14. parishad/config/pipeline.extended.yaml +172 -0
  15. parishad/config/pipeline.fast.yaml +89 -0
  16. parishad/config/user_config.py +115 -0
  17. parishad/data/catalog.py +118 -0
  18. parishad/data/models.json +108 -0
  19. parishad/memory/__init__.py +79 -0
  20. parishad/models/__init__.py +181 -0
  21. parishad/models/backends/__init__.py +247 -0
  22. parishad/models/backends/base.py +211 -0
  23. parishad/models/backends/huggingface.py +318 -0
  24. parishad/models/backends/llama_cpp.py +239 -0
  25. parishad/models/backends/mlx_lm.py +141 -0
  26. parishad/models/backends/ollama.py +253 -0
  27. parishad/models/backends/openai_api.py +193 -0
  28. parishad/models/backends/transformers_hf.py +198 -0
  29. parishad/models/costs.py +385 -0
  30. parishad/models/downloader.py +1557 -0
  31. parishad/models/optimizations.py +871 -0
  32. parishad/models/profiles.py +610 -0
  33. parishad/models/reliability.py +876 -0
  34. parishad/models/runner.py +651 -0
  35. parishad/models/tokenization.py +287 -0
  36. parishad/orchestrator/__init__.py +24 -0
  37. parishad/orchestrator/config_loader.py +210 -0
  38. parishad/orchestrator/engine.py +1113 -0
  39. parishad/orchestrator/exceptions.py +14 -0
  40. parishad/roles/__init__.py +71 -0
  41. parishad/roles/base.py +712 -0
  42. parishad/roles/dandadhyaksha.py +163 -0
  43. parishad/roles/darbari.py +246 -0
  44. parishad/roles/majumdar.py +274 -0
  45. parishad/roles/pantapradhan.py +150 -0
  46. parishad/roles/prerak.py +357 -0
  47. parishad/roles/raja.py +345 -0
  48. parishad/roles/sacheev.py +203 -0
  49. parishad/roles/sainik.py +427 -0
  50. parishad/roles/sar_senapati.py +164 -0
  51. parishad/roles/vidushak.py +69 -0
  52. parishad/tools/__init__.py +7 -0
  53. parishad/tools/base.py +57 -0
  54. parishad/tools/fs.py +110 -0
  55. parishad/tools/perception.py +96 -0
  56. parishad/tools/retrieval.py +74 -0
  57. parishad/tools/shell.py +103 -0
  58. parishad/utils/__init__.py +7 -0
  59. parishad/utils/hardware.py +122 -0
  60. parishad/utils/logging.py +79 -0
  61. parishad/utils/scanner.py +164 -0
  62. parishad/utils/text.py +61 -0
  63. parishad/utils/tracing.py +133 -0
  64. parishad-0.1.0.dist-info/METADATA +256 -0
  65. parishad-0.1.0.dist-info/RECORD +68 -0
  66. parishad-0.1.0.dist-info/WHEEL +4 -0
  67. parishad-0.1.0.dist-info/entry_points.txt +2 -0
  68. parishad-0.1.0.dist-info/licenses/LICENSE +21 -0

parishad/models/tokenization.py
@@ -0,0 +1,287 @@
+ """
+ Tokenization utilities for Parishad.
+
+ Provides token estimation for different backends and models.
+ This is used for:
+ - Budget enforcement (tracking token usage)
+ - Cost estimation
+ - Context length management
+ """
+
+ from __future__ import annotations
+
+ import re
+ from functools import lru_cache
+ from typing import Callable
+
+
+ # =============================================================================
+ # Heuristic Token Estimators
+ # =============================================================================
+
+
+ def estimate_tokens_simple(text: str) -> int:
+     """
+     Simple heuristic token estimation using word count.
+
+     Uses ~1.3 tokens per word as a rough approximation for English text.
+     This is fast but not accurate for code or non-English text.
+
+     Args:
+         text: Input text
+
+     Returns:
+         Estimated token count
+     """
+     if not text:
+         return 0
+     words = len(text.split())
+     return int(words * 1.3)
+
+
+ def estimate_tokens_chars(text: str) -> int:
+     """
+     Character-based token estimation.
+
+     Uses ~4 characters per token as a rough approximation.
+     Better for code and mixed content.
+
+     Args:
+         text: Input text
+
+     Returns:
+         Estimated token count
+     """
+     if not text:
+         return 0
+     return max(1, len(text) // 4)
+
+
+ def estimate_tokens_hybrid(text: str) -> int:
+     """
+     Hybrid token estimation combining word and character counts.
+
+     Uses a weighted combination for better accuracy across
+     different content types (prose vs code).
+
+     Args:
+         text: Input text
+
+     Returns:
+         Estimated token count
+     """
+     if not text:
+         return 0
+
+     # Count words and characters
+     words = len(text.split())
+     chars = len(text)
+
+     # Count code-like patterns (more tokens per character in code)
+     code_patterns = len(re.findall(r'[{}()\[\];:,<>=!&|+\-*/]', text))
+
+     # Base estimate from words
+     word_estimate = int(words * 1.3)
+
+     # Character-based estimate
+     char_estimate = chars // 4
+
+     # If lots of code patterns, weight towards character estimate
+     if code_patterns > words * 0.3:
+         # Code-heavy: use character estimate
+         return max(1, int(char_estimate * 1.1))
+     else:
+         # Prose-heavy: average of both
+         return max(1, (word_estimate + char_estimate) // 2)
+
+
+ # =============================================================================
+ # Tokenizer Registry
+ # =============================================================================
+
+
+ # Map of backend/model to tokenizer function
+ _TOKENIZER_REGISTRY: dict[str, Callable[[str], int]] = {}
+
+
+ def register_tokenizer(
+     name: str,
+     tokenizer_fn: Callable[[str], int],
+ ) -> None:
+     """
+     Register a tokenizer function for a backend or model.
+
+     Args:
+         name: Backend name or model ID
+         tokenizer_fn: Function that takes text and returns token count
+     """
+     _TOKENIZER_REGISTRY[name] = tokenizer_fn
+
+
+ def get_tokenizer(backend: str, model_id: str = "") -> Callable[[str], int]:
+     """
+     Get the best tokenizer for a backend/model.
+
+     Looks up in order:
+     1. Exact model_id match
+     2. Backend name match
+     3. Default hybrid estimator
+
+     Args:
+         backend: Backend name (e.g., 'openai', 'llama_cpp')
+         model_id: Optional model identifier
+
+     Returns:
+         Tokenizer function
+     """
+     # Try model-specific tokenizer
+     if model_id and model_id in _TOKENIZER_REGISTRY:
+         return _TOKENIZER_REGISTRY[model_id]
+
+     # Try backend tokenizer
+     if backend in _TOKENIZER_REGISTRY:
+         return _TOKENIZER_REGISTRY[backend]
+
+     # Default
+     return estimate_tokens_hybrid
+
+
+ # =============================================================================
+ # Tiktoken Integration (for OpenAI models)
+ # =============================================================================
+
+
+ _tiktoken = None
+
+
+ def _get_tiktoken():
+     """Lazy import of tiktoken."""
+     global _tiktoken
+     if _tiktoken is None:
+         try:
+             import tiktoken
+             _tiktoken = tiktoken
+         except ImportError:
+             return None
+     return _tiktoken
+
+
+ @lru_cache(maxsize=8)
+ def _get_tiktoken_encoding(model: str):
+     """Get tiktoken encoding for a model (cached)."""
+     tiktoken = _get_tiktoken()
+     if tiktoken is None:
+         return None
+
+     try:
+         return tiktoken.encoding_for_model(model)
+     except KeyError:
+         # Fall back to cl100k_base for unknown models
+         try:
+             return tiktoken.get_encoding("cl100k_base")
+         except Exception:
+             return None
+
+
+ def count_tokens_tiktoken(text: str, model: str = "gpt-4") -> int:
+     """
+     Count tokens using tiktoken (for OpenAI models).
+
+     Falls back to heuristic if tiktoken unavailable.
+
+     Args:
+         text: Input text
+         model: OpenAI model name
+
+     Returns:
+         Token count
+     """
+     if not text:
+         return 0
+
+     encoding = _get_tiktoken_encoding(model)
+     if encoding is None:
+         return estimate_tokens_hybrid(text)
+
+     return len(encoding.encode(text))
+
+
+ def is_tiktoken_available() -> bool:
+     """Check if tiktoken is available."""
+     return _get_tiktoken() is not None
+
+
+ # =============================================================================
+ # Register Default Tokenizers
+ # =============================================================================
+
+
+ # OpenAI models use tiktoken when available
+ def _openai_tokenizer(text: str) -> int:
+     return count_tokens_tiktoken(text, "gpt-4")
+
+
+ register_tokenizer("openai", _openai_tokenizer)
+
+ # Other backends use hybrid by default
+ register_tokenizer("llama_cpp", estimate_tokens_hybrid)
+ register_tokenizer("transformers", estimate_tokens_hybrid)
+
+
+ # =============================================================================
+ # Convenience Functions
+ # =============================================================================
+
+
+ def estimate_tokens(
+     text: str,
+     backend: str = "",
+     model_id: str = "",
+ ) -> int:
+     """
+     Estimate token count for text.
+
+     Uses the best available tokenizer for the backend/model.
+
+     Args:
+         text: Input text
+         backend: Optional backend name
+         model_id: Optional model identifier
+
+     Returns:
+         Estimated token count
+     """
+     tokenizer = get_tokenizer(backend, model_id)
+     return tokenizer(text)
+
+
+ def estimate_prompt_tokens(
+     system_prompt: str,
+     user_message: str,
+     backend: str = "",
+     model_id: str = "",
+ ) -> int:
+     """
+     Estimate tokens for a full prompt (system + user).
+
+     Accounts for message formatting overhead.
+
+     Args:
+         system_prompt: System prompt text
+         user_message: User message text
+         backend: Optional backend name
+         model_id: Optional model identifier
+
+     Returns:
+         Estimated token count including overhead
+     """
+     tokenizer = get_tokenizer(backend, model_id)
+
+     # Count tokens in each part
+     system_tokens = tokenizer(system_prompt)
+     user_tokens = tokenizer(user_message)
+
+     # Add overhead for message formatting (~4 tokens per message)
+     overhead = 8  # system + user messages
+
+     return system_tokens + user_tokens + overhead
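
For orientation, a minimal usage sketch of the estimators above, assuming the module is importable as parishad.models.tokenization per the file list (the model ID and word-split tokenizer below are illustrative, not part of the package):

    from parishad.models.tokenization import (
        estimate_tokens,
        is_tiktoken_available,
        register_tokenizer,
    )

    # Heuristic estimate; the hybrid estimator is used whenever neither
    # the backend nor the model_id has a registered tokenizer.
    print(estimate_tokens("def f(x): return x + 1", backend="llama_cpp"))

    # Exact counts for the "openai" backend when tiktoken is installed.
    if is_tiktoken_available():
        print(estimate_tokens("Hello, council!", backend="openai"))

    # A model-specific tokenizer takes priority over the backend default.
    register_tokenizer("my-org/my-model", lambda text: len(text.split()))
    print(estimate_tokens("one two three",
                          backend="llama_cpp",
                          model_id="my-org/my-model"))  # -> 3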

parishad/orchestrator/__init__.py
@@ -0,0 +1,24 @@
+ """Orchestrator for Parishad council pipeline."""
+
+ from .engine import (
+     ParishadEngine,
+     Parishad,
+     PipelineConfig,
+     BudgetConfig,
+     RetryConfig,
+     DifficultyRouting,
+     ExecutionContext,
+     ROLE_REGISTRY,
+ )
+
+
+ __all__ = [
+     "ParishadEngine",
+     "Parishad",
+     "PipelineConfig",
+     "BudgetConfig",
+     "RetryConfig",
+     "DifficultyRouting",
+     "ExecutionContext",
+     "ROLE_REGISTRY",
+ ]
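
These re-exports let callers pull the engine surface from the subpackage root. The import below is grounded in the __all__ above; anything past it (constructor signatures, engine methods) lives in engine.py and is outside this diff:

    from parishad.orchestrator import ParishadEngine, PipelineConfig, ROLE_REGISTRY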

parishad/orchestrator/config_loader.py
@@ -0,0 +1,210 @@
+ """
+ Configuration loader for pipeline definitions.
+
+ Prepares for Phase 2 config-driven pipelines (Core vs Extended).
+ """
+
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Any, Optional
+ import yaml
+ import logging
+
+ from .exceptions import InvalidPipelineConfigError
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class RoleSpec:
+     """Specification for a single role in the pipeline."""
+     name: str
+     class_name: str
+     slot: str
+     version: str = "0.1.0"
+     budget_tokens: int = 1000
+     dependencies: list[str] = field(default_factory=list)
+     max_tokens: Optional[int] = None
+     temperature: Optional[float] = None
+     extra_config: dict = field(default_factory=dict)
+
+     def to_dict(self) -> dict:
+         """Convert to dictionary."""
+         return {
+             "name": self.name,
+             "class_name": self.class_name,
+             "slot": self.slot,
+             "version": self.version,
+             "budget_tokens": self.budget_tokens,
+             "dependencies": self.dependencies,
+             "max_tokens": self.max_tokens,
+             "temperature": self.temperature,
+             "extra_config": self.extra_config,
+         }
+
+
+ def load_pipeline_config(name: str, config_dir: Optional[Path] = None) -> list[RoleSpec]:
+     """
+     Load a pipeline configuration from a YAML file.
+
+     Args:
+         name: Pipeline name ("core" or "extended")
+         config_dir: Optional directory containing config files
+
+     Returns:
+         List of RoleSpec objects defining the pipeline
+
+     Raises:
+         FileNotFoundError: If the config file doesn't exist
+         ValueError: If the config is empty or has no pipeline order
+         InvalidPipelineConfigError: If the loaded roles fail validation
+     """
+     # Resolve config directory
+     if config_dir is None:
+         # Default to package config directory
+         package_dir = Path(__file__).parent.parent
+         config_dir = package_dir / "config"
+
+     config_path = config_dir / f"pipeline.{name}.yaml"
+
+     if not config_path.exists():
+         raise FileNotFoundError(
+             f"Pipeline config not found: {config_path}. "
+             f"Expected one of: pipeline.core.yaml, pipeline.extended.yaml"
+         )
+
+     # Load YAML
+     logger.debug(f"Loading pipeline config from {config_path}")
+
+     with open(config_path) as f:
+         data = yaml.safe_load(f)
+
+     if not data:
+         raise ValueError(f"Empty pipeline config: {config_path}")
+
+     # Parse roles
+     roles_data = data.get("roles", {})
+     pipeline_order = data.get("pipeline", [])
+
+     if not pipeline_order:
+         raise ValueError(f"No pipeline order specified in {config_path}")
+
+     # Build RoleSpec list in pipeline order
+     role_specs = []
+
+     for role_name in pipeline_order:
+         role_config = roles_data.get(role_name, {})
+
+         if not role_config:
+             logger.warning(f"No configuration for role '{role_name}', using defaults")
+             role_config = {}
+
+         # Extract known fields to avoid duplication in extra_config
+         known_fields = {
+             "name", "class", "slot", "version", "budget_tokens",
+             "dependencies", "max_tokens", "temperature",
+         }
+         extra_config = {k: v for k, v in role_config.items() if k not in known_fields}
+
+         # Extract role spec
+         spec = RoleSpec(
+             name=role_name.lower(),  # Always store as lowercase for consistent lookups
+             class_name=role_config.get("class", role_name.capitalize()),
+             slot=role_config.get("slot", "mid"),
+             version=role_config.get("version", "0.1.0"),
+             budget_tokens=role_config.get("budget_tokens", 1000),
+             dependencies=role_config.get("dependencies", []),
+             max_tokens=role_config.get("max_tokens"),
+             temperature=role_config.get("temperature"),
+             extra_config=extra_config,
+         )
+
+         role_specs.append(spec)
+         logger.debug(f"Loaded role spec: {role_name} ({spec.class_name}, slot={spec.slot})")
+
+     logger.info(f"Loaded pipeline '{name}' with {len(role_specs)} roles: {pipeline_order}")
+
+     # Validate the loaded configuration
+     validation_result = validate_pipeline_config(role_specs)
+     if not validation_result["valid"]:
+         raise InvalidPipelineConfigError(validation_result["errors"])
+
+     return role_specs
+
+
+ def validate_pipeline_config(role_specs: list[RoleSpec]) -> dict[str, Any]:
+     """
+     Validate a loaded pipeline configuration.
+
+     Args:
+         role_specs: List of role specifications
+
+     Returns:
+         Validation result dict with 'valid' (bool) and 'errors' (list) keys
+     """
+     errors = []
+
+     # Check for empty pipeline
+     if not role_specs:
+         errors.append("Pipeline is empty")
+         return {"valid": False, "errors": errors}
+
+     # Check for duplicate role names
+     role_names = [spec.name for spec in role_specs]
+     duplicates = [name for name in role_names if role_names.count(name) > 1]
+     if duplicates:
+         errors.append(f"Duplicate role names: {set(duplicates)}")
+
+     # Check for valid slots
+     valid_slots = {"small", "mid", "big"}
+     for spec in role_specs:
+         if spec.slot not in valid_slots:
+             errors.append(f"Invalid slot '{spec.slot}' for role '{spec.name}'")
+
+     # Check dependencies: unknown references and self-dependencies
+     for spec in role_specs:
+         for dep in spec.dependencies:
+             if dep not in role_names:
+                 errors.append(f"Role '{spec.name}' depends on unknown role '{dep}'")
+             if dep == spec.name:
+                 errors.append(f"Role '{spec.name}' has circular self-dependency")
+
+     # Check budget sanity
+     for spec in role_specs:
+         if spec.budget_tokens < 0:
+             errors.append(f"Negative budget for role '{spec.name}': {spec.budget_tokens}")
+
+     return {
+         "valid": len(errors) == 0,
+         "errors": errors,
+     }
+
+
+ def get_available_pipelines(config_dir: Optional[Path] = None) -> list[str]:
+     """
+     List all available pipeline configurations.
+
+     Args:
+         config_dir: Optional directory containing config files
+
+     Returns:
+         List of pipeline names (without the .yaml extension)
+     """
+     if config_dir is None:
+         package_dir = Path(__file__).parent.parent
+         config_dir = package_dir / "config"
+
+     if not config_dir.exists():
+         return []
+
+     # Find all pipeline.*.yaml files
+     pipeline_files = config_dir.glob("pipeline.*.yaml")
+
+     # Extract names
+     names = []
+     for path in pipeline_files:
+         # Extract the name between "pipeline." and ".yaml"
+         name = path.stem.replace("pipeline.", "")
+         if name != "pipeline":  # Exclude a bare "pipeline.yaml"
+             names.append(name)
+
+     return sorted(names)
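
To make the expected config shape concrete: load_pipeline_config() reads a top-level "pipeline" list (execution order) plus a "roles" mapping whose per-role keys mirror RoleSpec. A throwaway sketch follows; the role names and values are illustrative and do not reproduce the packaged pipeline.core.yaml (whose contents are not shown in this diff):

    import tempfile
    from pathlib import Path

    from parishad.orchestrator.config_loader import load_pipeline_config

    DEMO_YAML = """
    pipeline:
      - raja
      - sainik

    roles:
      raja:
        slot: big             # one of: small, mid, big
        budget_tokens: 2000
      sainik:
        slot: small
        dependencies: [raja]  # must name another role in the pipeline
        temperature: 0.2
    """

    with tempfile.TemporaryDirectory() as tmp:
        cfg_dir = Path(tmp)
        (cfg_dir / "pipeline.demo.yaml").write_text(DEMO_YAML)
        for spec in load_pipeline_config("demo", config_dir=cfg_dir):
            print(spec.name, spec.class_name, spec.slot, spec.budget_tokens)
    # raja Raja big 2000     (class name defaults to the capitalized role name)
    # sainik Sainik small 1000  (budget_tokens defaults to 1000)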