parishad 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. parishad/__init__.py +70 -0
  2. parishad/__main__.py +10 -0
  3. parishad/checker/__init__.py +25 -0
  4. parishad/checker/deterministic.py +644 -0
  5. parishad/checker/ensemble.py +496 -0
  6. parishad/checker/retrieval.py +546 -0
  7. parishad/cli/__init__.py +6 -0
  8. parishad/cli/code.py +3254 -0
  9. parishad/cli/main.py +1158 -0
  10. parishad/cli/prarambh.py +99 -0
  11. parishad/cli/sthapana.py +368 -0
  12. parishad/config/modes.py +139 -0
  13. parishad/config/pipeline.core.yaml +128 -0
  14. parishad/config/pipeline.extended.yaml +172 -0
  15. parishad/config/pipeline.fast.yaml +89 -0
  16. parishad/config/user_config.py +115 -0
  17. parishad/data/catalog.py +118 -0
  18. parishad/data/models.json +108 -0
  19. parishad/memory/__init__.py +79 -0
  20. parishad/models/__init__.py +181 -0
  21. parishad/models/backends/__init__.py +247 -0
  22. parishad/models/backends/base.py +211 -0
  23. parishad/models/backends/huggingface.py +318 -0
  24. parishad/models/backends/llama_cpp.py +239 -0
  25. parishad/models/backends/mlx_lm.py +141 -0
  26. parishad/models/backends/ollama.py +253 -0
  27. parishad/models/backends/openai_api.py +193 -0
  28. parishad/models/backends/transformers_hf.py +198 -0
  29. parishad/models/costs.py +385 -0
  30. parishad/models/downloader.py +1557 -0
  31. parishad/models/optimizations.py +871 -0
  32. parishad/models/profiles.py +610 -0
  33. parishad/models/reliability.py +876 -0
  34. parishad/models/runner.py +651 -0
  35. parishad/models/tokenization.py +287 -0
  36. parishad/orchestrator/__init__.py +24 -0
  37. parishad/orchestrator/config_loader.py +210 -0
  38. parishad/orchestrator/engine.py +1113 -0
  39. parishad/orchestrator/exceptions.py +14 -0
  40. parishad/roles/__init__.py +71 -0
  41. parishad/roles/base.py +712 -0
  42. parishad/roles/dandadhyaksha.py +163 -0
  43. parishad/roles/darbari.py +246 -0
  44. parishad/roles/majumdar.py +274 -0
  45. parishad/roles/pantapradhan.py +150 -0
  46. parishad/roles/prerak.py +357 -0
  47. parishad/roles/raja.py +345 -0
  48. parishad/roles/sacheev.py +203 -0
  49. parishad/roles/sainik.py +427 -0
  50. parishad/roles/sar_senapati.py +164 -0
  51. parishad/roles/vidushak.py +69 -0
  52. parishad/tools/__init__.py +7 -0
  53. parishad/tools/base.py +57 -0
  54. parishad/tools/fs.py +110 -0
  55. parishad/tools/perception.py +96 -0
  56. parishad/tools/retrieval.py +74 -0
  57. parishad/tools/shell.py +103 -0
  58. parishad/utils/__init__.py +7 -0
  59. parishad/utils/hardware.py +122 -0
  60. parishad/utils/logging.py +79 -0
  61. parishad/utils/scanner.py +164 -0
  62. parishad/utils/text.py +61 -0
  63. parishad/utils/tracing.py +133 -0
  64. parishad-0.1.0.dist-info/METADATA +256 -0
  65. parishad-0.1.0.dist-info/RECORD +68 -0
  66. parishad-0.1.0.dist-info/WHEEL +4 -0
  67. parishad-0.1.0.dist-info/entry_points.txt +2 -0
  68. parishad-0.1.0.dist-info/licenses/LICENSE +21 -0

parishad/models/tokenization.py
@@ -0,0 +1,287 @@
+ """
+ Tokenization utilities for Parishad.
+
+ Provides token estimation for different backends and models.
+ This is used for:
+ - Budget enforcement (tracking token usage)
+ - Cost estimation
+ - Context length management
+ """
+
+ from __future__ import annotations
+
+ import re
+ from functools import lru_cache
+ from typing import Callable
+
+
+ # =============================================================================
+ # Heuristic Token Estimators
+ # =============================================================================
+
+
+ def estimate_tokens_simple(text: str) -> int:
+     """
+     Simple heuristic token estimation using word count.
+
+     Uses ~1.3 tokens per word as a rough approximation for English text.
+     This is fast but not accurate for code or non-English text.
+
+     Args:
+         text: Input text
+
+     Returns:
+         Estimated token count
+     """
+     if not text:
+         return 0
+     words = len(text.split())
+     return int(words * 1.3)
+
+
+ def estimate_tokens_chars(text: str) -> int:
+     """
+     Character-based token estimation.
+
+     Uses ~4 characters per token as a rough approximation.
+     Better for code and mixed content.
+
+     Args:
+         text: Input text
+
+     Returns:
+         Estimated token count
+     """
+     if not text:
+         return 0
+     return max(1, len(text) // 4)
+
+
+ def estimate_tokens_hybrid(text: str) -> int:
+     """
+     Hybrid token estimation combining word and character counts.
+
+     Uses a weighted combination for better accuracy across
+     different content types (prose vs code).
+
+     Args:
+         text: Input text
+
+     Returns:
+         Estimated token count
+     """
+     if not text:
+         return 0
+
+     # Count words and characters
+     words = len(text.split())
+     chars = len(text)
+
+     # Count code-like patterns (more tokens per character in code)
+     code_patterns = len(re.findall(r'[{}()\[\];:,<>=!&|+\-*/]', text))
+
+     # Base estimate from words
+     word_estimate = int(words * 1.3)
+
+     # Character-based estimate
+     char_estimate = chars // 4
+
+     # If lots of code patterns, weight towards character estimate
+     if code_patterns > words * 0.3:
+         # Code-heavy: use character estimate
+         return max(1, int(char_estimate * 1.1))
+     else:
+         # Prose-heavy: average of both
+         return max(1, (word_estimate + char_estimate) // 2)
+
+
+ # =============================================================================
+ # Tokenizer Registry
+ # =============================================================================
+
+
+ # Map of backend/model to tokenizer function
+ _TOKENIZER_REGISTRY: dict[str, Callable[[str], int]] = {}
+
+
+ def register_tokenizer(
+     name: str,
+     tokenizer_fn: Callable[[str], int],
+ ) -> None:
+     """
+     Register a tokenizer function for a backend or model.
+
+     Args:
+         name: Backend name or model ID
+         tokenizer_fn: Function that takes text and returns token count
+     """
+     _TOKENIZER_REGISTRY[name] = tokenizer_fn
+
+
+ def get_tokenizer(backend: str, model_id: str = "") -> Callable[[str], int]:
+     """
+     Get the best tokenizer for a backend/model.
+
+     Looks up in order:
+     1. Exact model_id match
+     2. Backend name match
+     3. Default hybrid estimator
+
+     Args:
+         backend: Backend name (e.g., 'openai', 'llama_cpp')
+         model_id: Optional model identifier
+
+     Returns:
+         Tokenizer function
+     """
+     # Try model-specific tokenizer
+     if model_id and model_id in _TOKENIZER_REGISTRY:
+         return _TOKENIZER_REGISTRY[model_id]
+
+     # Try backend tokenizer
+     if backend in _TOKENIZER_REGISTRY:
+         return _TOKENIZER_REGISTRY[backend]
+
+     # Default
+     return estimate_tokens_hybrid
+
+
+ # =============================================================================
+ # Tiktoken Integration (for OpenAI models)
+ # =============================================================================
+
+
+ _tiktoken = None
+
+
+ def _get_tiktoken():
+     """Lazy import of tiktoken."""
+     global _tiktoken
+     if _tiktoken is None:
+         try:
+             import tiktoken
+             _tiktoken = tiktoken
+         except ImportError:
+             return None
+     return _tiktoken
+
+
+ @lru_cache(maxsize=8)
+ def _get_tiktoken_encoding(model: str):
+     """Get tiktoken encoding for a model (cached)."""
+     tiktoken = _get_tiktoken()
+     if tiktoken is None:
+         return None
+
+     try:
+         return tiktoken.encoding_for_model(model)
+     except KeyError:
+         # Fall back to cl100k_base for unknown models
+         try:
+             return tiktoken.get_encoding("cl100k_base")
+         except Exception:
+             return None
+
+
+ def count_tokens_tiktoken(text: str, model: str = "gpt-4") -> int:
+     """
+     Count tokens using tiktoken (for OpenAI models).
+
+     Falls back to heuristic if tiktoken unavailable.
+
+     Args:
+         text: Input text
+         model: OpenAI model name
+
+     Returns:
+         Token count
+     """
+     if not text:
+         return 0
+
+     encoding = _get_tiktoken_encoding(model)
+     if encoding is None:
+         return estimate_tokens_hybrid(text)
+
+     return len(encoding.encode(text))
+
+
+ def is_tiktoken_available() -> bool:
+     """Check if tiktoken is available."""
+     return _get_tiktoken() is not None
+
+
+ # =============================================================================
+ # Register Default Tokenizers
+ # =============================================================================
+
+
+ # OpenAI models use tiktoken when available
+ def _openai_tokenizer(text: str) -> int:
+     return count_tokens_tiktoken(text, "gpt-4")
+
+
+ register_tokenizer("openai", _openai_tokenizer)
+
+ # Other backends use hybrid by default
+ register_tokenizer("llama_cpp", estimate_tokens_hybrid)
+ register_tokenizer("transformers", estimate_tokens_hybrid)
+
+
+ # =============================================================================
+ # Convenience Functions
+ # =============================================================================
+
+
+ def estimate_tokens(
+     text: str,
+     backend: str = "",
+     model_id: str = "",
+ ) -> int:
+     """
+     Estimate token count for text.
+
+     Uses the best available tokenizer for the backend/model.
+
+     Args:
+         text: Input text
+         backend: Optional backend name
+         model_id: Optional model identifier
+
+     Returns:
+         Estimated token count
+     """
+     tokenizer = get_tokenizer(backend, model_id)
+     return tokenizer(text)
+
+
+ def estimate_prompt_tokens(
+     system_prompt: str,
+     user_message: str,
+     backend: str = "",
+     model_id: str = "",
+ ) -> int:
+     """
+     Estimate tokens for a full prompt (system + user).
+
+     Accounts for message formatting overhead.
+
+     Args:
+         system_prompt: System prompt text
+         user_message: User message text
+         backend: Optional backend name
+         model_id: Optional model identifier
+
+     Returns:
+         Estimated token count including overhead
+     """
+     tokenizer = get_tokenizer(backend, model_id)
+
+     # Count tokens in each part
+     system_tokens = tokenizer(system_prompt)
+     user_tokens = tokenizer(user_message)
+
+     # Add overhead for message formatting (~4 tokens per message)
+     overhead = 8  # system + user messages
+
+     return system_tokens + user_tokens + overhead
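
For orientation, a minimal usage sketch of the estimators above, assuming the module is importable as parishad.models.tokenization per the file list (the model ID and word-split tokenizer below are illustrative, not part of the package):

    from parishad.models.tokenization import (
        estimate_tokens,
        is_tiktoken_available,
        register_tokenizer,
    )

    # Heuristic estimate; the hybrid estimator is used whenever neither
    # the backend nor the model_id has a registered tokenizer.
    print(estimate_tokens("def f(x): return x + 1", backend="llama_cpp"))

    # Exact counts for the "openai" backend when tiktoken is installed.
    if is_tiktoken_available():
        print(estimate_tokens("Hello, council!", backend="openai"))

    # A model-specific tokenizer takes priority over the backend default.
    register_tokenizer("my-org/my-model", lambda text: len(text.split()))
    print(estimate_tokens("one two three",
                          backend="llama_cpp",
                          model_id="my-org/my-model"))  # -> 3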

parishad/orchestrator/__init__.py
@@ -0,0 +1,24 @@
+ """Orchestrator for Parishad council pipeline."""
+
+ from .engine import (
+     ParishadEngine,
+     Parishad,
+     PipelineConfig,
+     BudgetConfig,
+     RetryConfig,
+     DifficultyRouting,
+     ExecutionContext,
+     ROLE_REGISTRY,
+ )
+
+
+ __all__ = [
+     "ParishadEngine",
+     "Parishad",
+     "PipelineConfig",
+     "BudgetConfig",
+     "RetryConfig",
+     "DifficultyRouting",
+     "ExecutionContext",
+     "ROLE_REGISTRY",
+ ]
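
These re-exports let callers pull the engine surface from the subpackage root. The import below is grounded in the __all__ above; anything past it (constructor signatures, engine methods) lives in engine.py and is outside this diff:

    from parishad.orchestrator import ParishadEngine, PipelineConfig, ROLE_REGISTRY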

parishad/orchestrator/config_loader.py
@@ -0,0 +1,210 @@
+ """
+ Configuration loader for pipeline definitions.
+
+ Prepares for Phase 2 config-driven pipelines (Core vs Extended).
+ """
+
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Any, Optional
+ import yaml
+ import logging
+
+ from .exceptions import InvalidPipelineConfigError
+
+ logger = logging.getLogger(__name__)
+
+
+ @dataclass
+ class RoleSpec:
+     """Specification for a single role in the pipeline."""
+     name: str
+     class_name: str
+     slot: str
+     version: str = "0.1.0"
+     budget_tokens: int = 1000
+     dependencies: list[str] = field(default_factory=list)
+     max_tokens: Optional[int] = None
+     temperature: Optional[float] = None
+     extra_config: dict = field(default_factory=dict)
+
+     def to_dict(self) -> dict:
+         """Convert to dictionary."""
+         return {
+             "name": self.name,
+             "class_name": self.class_name,
+             "slot": self.slot,
+             "version": self.version,
+             "budget_tokens": self.budget_tokens,
+             "dependencies": self.dependencies,
+             "max_tokens": self.max_tokens,
+             "temperature": self.temperature,
+             "extra_config": self.extra_config,
+         }
+
+
+ def load_pipeline_config(name: str, config_dir: Optional[Path] = None) -> list[RoleSpec]:
+     """
+     Load a pipeline configuration from a YAML file.
+
+     Args:
+         name: Pipeline name ("core" or "extended")
+         config_dir: Optional directory containing config files
+
+     Returns:
+         List of RoleSpec objects defining the pipeline
+
+     Raises:
+         FileNotFoundError: If the config file doesn't exist
+         ValueError: If the config is empty or has no pipeline order
+         InvalidPipelineConfigError: If the loaded roles fail validation
+     """
+     # Resolve config directory
+     if config_dir is None:
+         # Default to package config directory
+         package_dir = Path(__file__).parent.parent
+         config_dir = package_dir / "config"
+
+     config_path = config_dir / f"pipeline.{name}.yaml"
+
+     if not config_path.exists():
+         raise FileNotFoundError(
+             f"Pipeline config not found: {config_path}. "
+             f"Expected one of: pipeline.core.yaml, pipeline.extended.yaml"
+         )
+
+     # Load YAML
+     logger.debug(f"Loading pipeline config from {config_path}")
+
+     with open(config_path) as f:
+         data = yaml.safe_load(f)
+
+     if not data:
+         raise ValueError(f"Empty pipeline config: {config_path}")
+
+     # Parse roles
+     roles_data = data.get("roles", {})
+     pipeline_order = data.get("pipeline", [])
+
+     if not pipeline_order:
+         raise ValueError(f"No pipeline order specified in {config_path}")
+
+     # Build RoleSpec list in pipeline order
+     role_specs = []
+
+     for role_name in pipeline_order:
+         role_config = roles_data.get(role_name, {})
+
+         if not role_config:
+             logger.warning(f"No configuration for role '{role_name}', using defaults")
+             role_config = {}
+
+         # Extract known fields to avoid duplication in extra_config
+         known_fields = {
+             "name", "class", "slot", "version", "budget_tokens",
+             "dependencies", "max_tokens", "temperature",
+         }
+         extra_config = {k: v for k, v in role_config.items() if k not in known_fields}
+
+         # Extract role spec
+         spec = RoleSpec(
+             name=role_name.lower(),  # Always store as lowercase for consistent lookups
+             class_name=role_config.get("class", role_name.capitalize()),
+             slot=role_config.get("slot", "mid"),
+             version=role_config.get("version", "0.1.0"),
+             budget_tokens=role_config.get("budget_tokens", 1000),
+             dependencies=role_config.get("dependencies", []),
+             max_tokens=role_config.get("max_tokens"),
+             temperature=role_config.get("temperature"),
+             extra_config=extra_config,
+         )
+
+         role_specs.append(spec)
+         logger.debug(f"Loaded role spec: {role_name} ({spec.class_name}, slot={spec.slot})")
+
+     logger.info(f"Loaded pipeline '{name}' with {len(role_specs)} roles: {pipeline_order}")
+
+     # Validate the loaded configuration
+     validation_result = validate_pipeline_config(role_specs)
+     if not validation_result["valid"]:
+         raise InvalidPipelineConfigError(validation_result["errors"])
+
+     return role_specs
+
+
+ def validate_pipeline_config(role_specs: list[RoleSpec]) -> dict[str, Any]:
+     """
+     Validate a loaded pipeline configuration.
+
+     Args:
+         role_specs: List of role specifications
+
+     Returns:
+         Validation result dict with 'valid' (bool) and 'errors' (list) keys
+     """
+     errors = []
+
+     # Check for empty pipeline
+     if not role_specs:
+         errors.append("Pipeline is empty")
+         return {"valid": False, "errors": errors}
+
+     # Check for duplicate role names
+     role_names = [spec.name for spec in role_specs]
+     duplicates = [name for name in role_names if role_names.count(name) > 1]
+     if duplicates:
+         errors.append(f"Duplicate role names: {set(duplicates)}")
+
+     # Check for valid slots
+     valid_slots = {"small", "mid", "big"}
+     for spec in role_specs:
+         if spec.slot not in valid_slots:
+             errors.append(f"Invalid slot '{spec.slot}' for role '{spec.name}'")
+
+     # Check dependencies: unknown references and self-dependencies
+     for spec in role_specs:
+         for dep in spec.dependencies:
+             if dep not in role_names:
+                 errors.append(f"Role '{spec.name}' depends on unknown role '{dep}'")
+             if dep == spec.name:
+                 errors.append(f"Role '{spec.name}' has circular self-dependency")
+
+     # Check budget sanity
+     for spec in role_specs:
+         if spec.budget_tokens < 0:
+             errors.append(f"Negative budget for role '{spec.name}': {spec.budget_tokens}")
+
+     return {
+         "valid": len(errors) == 0,
+         "errors": errors,
+     }
+
+
+ def get_available_pipelines(config_dir: Optional[Path] = None) -> list[str]:
+     """
+     List all available pipeline configurations.
+
+     Args:
+         config_dir: Optional directory containing config files
+
+     Returns:
+         List of pipeline names (without the .yaml extension)
+     """
+     if config_dir is None:
+         package_dir = Path(__file__).parent.parent
+         config_dir = package_dir / "config"
+
+     if not config_dir.exists():
+         return []
+
+     # Find all pipeline.*.yaml files
+     pipeline_files = config_dir.glob("pipeline.*.yaml")
+
+     # Extract names
+     names = []
+     for path in pipeline_files:
+         # Extract the name between "pipeline." and ".yaml"
+         name = path.stem.replace("pipeline.", "")
+         if name != "pipeline":  # Exclude a bare "pipeline.yaml"
+             names.append(name)
+
+     return sorted(names)
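
To make the expected config shape concrete: load_pipeline_config() reads a top-level "pipeline" list (execution order) plus a "roles" mapping whose per-role keys mirror RoleSpec. A throwaway sketch follows; the role names and values are illustrative and do not reproduce the packaged pipeline.core.yaml (whose contents are not shown in this diff):

    import tempfile
    from pathlib import Path

    from parishad.orchestrator.config_loader import load_pipeline_config

    DEMO_YAML = """
    pipeline:
      - raja
      - sainik

    roles:
      raja:
        slot: big             # one of: small, mid, big
        budget_tokens: 2000
      sainik:
        slot: small
        dependencies: [raja]  # must name another role in the pipeline
        temperature: 0.2
    """

    with tempfile.TemporaryDirectory() as tmp:
        cfg_dir = Path(tmp)
        (cfg_dir / "pipeline.demo.yaml").write_text(DEMO_YAML)
        for spec in load_pipeline_config("demo", config_dir=cfg_dir):
            print(spec.name, spec.class_name, spec.slot, spec.budget_tokens)
    # raja Raja big 2000     (class name defaults to the capitalized role name)
    # sainik Sainik small 1000  (budget_tokens defaults to 1000)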