PyPI - causaliq-knowledge - Versions diffs - 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

causaliq-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

causaliq_knowledge/__init__.py +6 -3
causaliq_knowledge/action.py +480 -0
causaliq_knowledge/cache/__init__.py +18 -0
causaliq_knowledge/cache/encoders/__init__.py +13 -0
causaliq_knowledge/cache/encoders/base.py +90 -0
causaliq_knowledge/cache/encoders/json_encoder.py +430 -0
causaliq_knowledge/cache/token_cache.py +666 -0
causaliq_knowledge/cli/__init__.py +15 -0
causaliq_knowledge/cli/cache.py +478 -0
causaliq_knowledge/cli/generate.py +410 -0
causaliq_knowledge/cli/main.py +172 -0
causaliq_knowledge/cli/models.py +309 -0
causaliq_knowledge/graph/__init__.py +78 -0
causaliq_knowledge/graph/generator.py +457 -0
causaliq_knowledge/graph/loader.py +222 -0
causaliq_knowledge/graph/models.py +426 -0
causaliq_knowledge/graph/params.py +175 -0
causaliq_knowledge/graph/prompts.py +445 -0
causaliq_knowledge/graph/response.py +392 -0
causaliq_knowledge/graph/view_filter.py +154 -0
causaliq_knowledge/llm/base_client.py +147 -1
causaliq_knowledge/llm/cache.py +443 -0
causaliq_knowledge/py.typed +0 -0
{causaliq_knowledge-0.2.0.dist-info → causaliq_knowledge-0.4.0.dist-info}/METADATA +10 -6
causaliq_knowledge-0.4.0.dist-info/RECORD +42 -0
{causaliq_knowledge-0.2.0.dist-info → causaliq_knowledge-0.4.0.dist-info}/WHEEL +1 -1
{causaliq_knowledge-0.2.0.dist-info → causaliq_knowledge-0.4.0.dist-info}/entry_points.txt +3 -0
causaliq_knowledge/cli.py +0 -414
causaliq_knowledge-0.2.0.dist-info/RECORD +0 -22
{causaliq_knowledge-0.2.0.dist-info → causaliq_knowledge-0.4.0.dist-info}/licenses/LICENSE +0 -0
{causaliq_knowledge-0.2.0.dist-info → causaliq_knowledge-0.4.0.dist-info}/top_level.txt +0 -0

causaliq_knowledge/graph/models.py ADDED Viewed

@@ -0,0 +1,426 @@
+"""Pydantic models for model specification schemas.
+This module defines the data models for loading and validating
+causal model specifications from JSON files.
+"""
+from __future__ import annotations
+from enum import Enum
+from typing import Any, Optional
+from pydantic import BaseModel, Field, field_validator, model_validator
+class VariableType(str, Enum):
+    """Type of variable in the model.
+    Members also subclass ``str``, so each member compares equal to its
+    lowercase string value (e.g. ``VariableType.BINARY == "binary"``).
+    """
+    BINARY = "binary"
+    CATEGORICAL = "categorical"
+    ORDINAL = "ordinal"
+    CONTINUOUS = "continuous"
+class VariableRole(str, Enum):
+    """Role of variable in the causal structure.
+    Members also subclass ``str``, so each member compares equal to its
+    lowercase string value (e.g. ``VariableRole.LATENT == "latent"``).
+    """
+    EXOGENOUS = "exogenous"  # No parents (root cause)
+    ENDOGENOUS = "endogenous"  # Has parents (caused by other variables)
+    LATENT = "latent"  # Unobserved variable
+class VariableSpec(BaseModel):
+    """Specification for a single variable in the causal model.
+    This model captures all metadata about a variable that can be used
+    to provide context to LLMs for graph generation.
+    Attributes:
+        name: Benchmark/literature name used for ground truth and reporting.
+        llm_name: Name used when querying LLMs (prevents memorisation).
+            Defaults to name if not specified.
+        display_name: Human-readable name for display.
+        aliases: Alternative names for the variable.
+        type: Variable type (binary, categorical, ordinal, continuous).
+        states: Possible values/states for discrete variables.
+        role: Causal role (exogenous, endogenous, latent).
+        category: Domain-specific category (e.g., "environmental_exposure").
+        short_description: Brief description of the variable.
+        extended_description: Detailed description with domain context.
+        base_rate: Prior probabilities for each state.
+        conditional_rates: Conditional probabilities given parent states.
+        sensitivity_hints: Hints about causal relationships.
+        related_domain_knowledge: Domain knowledge statements.
+        references: Literature references.
+    Example:
+        >>> var = VariableSpec(
+        ...     name="smoke",
+        ...     llm_name="tobacco_history",
+        ...     type="binary",
+        ...     states=["never", "ever"],
+        ...     role="exogenous",
+        ...     short_description="Patient has history of tobacco smoking."
+        ... )
+    """
+    name: str = Field(
+        ..., description="Benchmark/literature name for ground truth"
+    )
+    llm_name: str = Field(
+        default="",
+        description="Name used for LLM queries (defaults to name)",
+    )
+    display_name: Optional[str] = Field(
+        default=None, description="Human-readable display name"
+    )
+    aliases: list[str] = Field(
+        default_factory=list, description="Alternative names"
+    )
+    type: VariableType = Field(..., description="Variable type")
+    states: list[str] = Field(
+        default_factory=list,
+        description="Possible states for discrete variables",
+    )
+    role: Optional[VariableRole] = Field(
+        default=None, description="Causal role in the structure"
+    )
+    @model_validator(mode="after")
+    def set_llm_name_default(self) -> "VariableSpec":
+        """Set llm_name to name if not specified or empty."""
+        if not self.llm_name:
+            # Use object.__setattr__ since Pydantic models may be frozen
+            object.__setattr__(self, "llm_name", self.name)
+        return self
+    category: Optional[str] = Field(
+        default=None, description="Domain-specific category"
+    )
+    short_description: Optional[str] = Field(
+        default=None, description="Brief description"
+    )
+    extended_description: Optional[str] = Field(
+        default=None, description="Detailed description with domain context"
+    )
+    base_rate: Optional[dict[str, float]] = Field(
+        default=None, description="Prior probabilities for each state"
+    )
+    conditional_rates: Optional[dict[str, Any]] = Field(
+        default=None, description="Conditional probabilities"
+    )
+    sensitivity_hints: Optional[str] = Field(
+        default=None, description="Hints about causal relationships"
+    )
+    related_domain_knowledge: list[str] = Field(
+        default_factory=list, description="Domain knowledge statements"
+    )
+    references: list[str] = Field(
+        default_factory=list, description="Literature references"
+    )
+    @field_validator("type", mode="before")
+    @classmethod
+    def validate_type(cls, v: str | VariableType) -> VariableType:
+        """Convert string type to VariableType enum."""
+        if isinstance(v, VariableType):
+            return v
+        return VariableType(v.lower())
+    @field_validator("role", mode="before")
+    @classmethod
+    def validate_role(
+        cls, v: str | VariableRole | None
+    ) -> VariableRole | None:
+        """Convert string role to VariableRole enum."""
+        if v is None:
+            return None
+        if isinstance(v, VariableRole):
+            return v
+        return VariableRole(v.lower())
+class Provenance(BaseModel):
+    """Provenance information for the model specification.
+    Attributes:
+        source_network: Name of the source benchmark network.
+        source_reference: Citation for the original source.
+        source_url: URL to the source data.
+        disguise_strategy: Strategy used for variable name disguising.
+        memorization_risk: Risk level for LLM memorization.
+        notes: Additional notes about the source.
+    """
+    source_network: Optional[str] = Field(
+        default=None, description="Source benchmark network name"
+    )
+    source_reference: Optional[str] = Field(
+        default=None, description="Citation for original source"
+    )
+    source_url: Optional[str] = Field(
+        default=None, description="URL to source data"
+    )
+    disguise_strategy: Optional[str] = Field(
+        default=None, description="Variable name disguising strategy"
+    )
+    memorization_risk: Optional[str] = Field(
+        default=None, description="LLM memorization risk level"
+    )
+    notes: Optional[str] = Field(default=None, description="Additional notes")
+class LLMGuidance(BaseModel):
+    """Guidance for LLMs when processing the model.
+    Attributes:
+        usage_notes: Notes about how to use the model.
+        do_not_provide: Information that should not be given to LLMs.
+    """
+    usage_notes: list[str] = Field(
+        default_factory=list, description="Usage guidance for LLMs"
+    )
+    do_not_provide: list[str] = Field(
+        default_factory=list, description="Information to withhold from LLMs"
+    )
+class ViewDefinition(BaseModel):
+    """Definition of a view (minimal, standard, rich).
+    Attributes:
+        description: Description of what this view includes.
+        include_fields: List of VariableSpec fields to include in this view.
+    """
+    description: Optional[str] = Field(
+        default=None, description="Description of this view"
+    )
+    include_fields: list[str] = Field(
+        default_factory=list, description="Fields to include in this view"
+    )
+class PromptDetails(BaseModel):
+    """Collection of prompt detail definitions.
+    Attributes:
+        minimal: Minimal view (typically just variable names).
+        standard: Standard view (names, types, descriptions, states).
+        rich: Rich view (all available metadata).
+    """
+    minimal: ViewDefinition = Field(
+        default_factory=lambda: ViewDefinition(include_fields=["name"]),
+        description="Minimal context view",
+    )
+    standard: ViewDefinition = Field(
+        default_factory=lambda: ViewDefinition(
+            include_fields=["name", "type", "short_description", "states"]
+        ),
+        description="Standard context view",
+    )
+    rich: ViewDefinition = Field(
+        default_factory=lambda: ViewDefinition(
+            include_fields=[
+                "name",
+                "display_name",
+                "type",
+                "role",
+                "category",
+                "short_description",
+                "extended_description",
+                "states",
+                "base_rate",
+                "conditional_rates",
+                "sensitivity_hints",
+                "related_domain_knowledge",
+                "references",
+            ]
+        ),
+        description="Rich context view",
+    )
+class Constraints(BaseModel):
+    """Structural constraints for the causal model.
+    Attributes:
+        forbidden_edges: Pairs of variables that cannot have direct edges.
+        partial_order: Pairs indicating causal ordering (a must precede b).
+        tiers: Grouping of variables into causal tiers.
+        notes: Additional notes about constraints.
+    """
+    forbidden_edges: list[list[str]] = Field(
+        default_factory=list,
+        description="Variable pairs that cannot have edges",
+    )
+    partial_order: list[list[str]] = Field(
+        default_factory=list, description="Causal ordering constraints"
+    )
+    tiers: dict[str, list[str]] = Field(
+        default_factory=dict, description="Variable tier groupings"
+    )
+    notes: Optional[str] = Field(
+        default=None, description="Notes about constraints"
+    )
+class CausalPrinciple(BaseModel):
+    """A causal principle that applies to the domain.
+    Attributes:
+        id: Unique identifier for the principle.
+        statement: The causal principle statement.
+        references: Supporting literature references.
+    """
+    id: str = Field(..., description="Principle identifier")
+    statement: str = Field(..., description="The causal principle")
+    references: list[str] = Field(
+        default_factory=list, description="Literature references"
+    )
+class GroundTruth(BaseModel):
+    """Ground truth structure for evaluation.
+    Note: This should NOT be provided to LLMs during generation.
+    Attributes:
+        edges: Ground truth edges using benchmark variable names.
+        v_structures: V-structure definitions.
+        adjacency_matrix: Adjacency matrix representation.
+    """
+    edges: list[list[str]] = Field(
+        default_factory=list, description="Edges with benchmark variable names"
+    )
+    v_structures: list[dict[str, Any]] = Field(
+        default_factory=list, description="V-structure definitions"
+    )
+    adjacency_matrix: Optional[dict[str, Any]] = Field(
+        default=None, description="Adjacency matrix representation"
+    )
+class ModelSpec(BaseModel):
+    """Complete specification for a causal model.
+    This is the top-level model that represents an entire model
+    specification JSON file.
+    Attributes:
+        schema_version: Version of the specification schema.
+        dataset_id: Unique identifier for the dataset.
+        domain: Domain of the causal model (e.g., "pulmonary_oncology").
+        purpose: Purpose of the model specification.
+        provenance: Source and provenance information.
+        llm_guidance: Guidance for LLM usage.
+    views: View definitions (minimal, standard, rich).
+        variables: List of variable specifications.
+        constraints: Structural constraints.
+        causal_principles: Domain causal principles.
+        ground_truth: Ground truth for evaluation (not for LLMs).
+    Example:
+        >>> spec = ModelSpec(
+        ...     schema_version="2.0",
+        ...     dataset_id="cancer",
+        ...     domain="pulmonary_oncology",
+        ...     variables=[
+        ...         VariableSpec(
+        ...             name="smoking", llm_name="tobacco_use", type="binary"
+        ...         ),
+        ...         VariableSpec(
+        ...             name="cancer", llm_name="malignancy", type="binary"
+        ...         ),
+        ...     ]
+        ... )
+    """
+    schema_version: str = Field(default="2.0", description="Schema version")
+    dataset_id: str = Field(..., description="Dataset identifier")
+    domain: str = Field(..., description="Domain of the causal model")
+    purpose: Optional[str] = Field(
+        default=None, description="Purpose of this specification"
+    )
+    provenance: Optional[Provenance] = Field(
+        default=None, description="Source and provenance information"
+    )
+    llm_guidance: Optional[LLMGuidance] = Field(
+        default=None, description="Guidance for LLM usage"
+    )
+    prompt_details: PromptDetails = Field(
+        default_factory=PromptDetails,
+        description="Prompt detail definitions",
+        alias="prompt_details",
+    )
+    variables: list[VariableSpec] = Field(
+        default_factory=list, description="Variable specifications"
+    )
+    constraints: Optional[Constraints] = Field(
+        default=None, description="Structural constraints"
+    )
+    causal_principles: list[CausalPrinciple] = Field(
+        default_factory=list, description="Domain causal principles"
+    )
+    ground_truth: Optional[GroundTruth] = Field(
+        default=None, description="Ground truth for evaluation"
+    )
+    def get_variable(self, name: str) -> VariableSpec | None:
+        """Get a variable by name.
+        Args:
+            name: Variable name to look up.
+        Returns:
+            VariableSpec if found, None otherwise.
+        """
+        for var in self.variables:
+            if var.name == name:
+                return var
+        return None
+    def get_variable_names(self) -> list[str]:
+        """Get list of all benchmark variable names.
+        Returns:
+            List of variable names.
+        """
+        return [var.name for var in self.variables]
+    def get_llm_names(self) -> list[str]:
+        """Get list of all LLM variable names.
+        Returns:
+            List of llm_name values.
+        """
+        return [var.llm_name for var in self.variables]
+    def get_llm_to_name_mapping(self) -> dict[str, str]:
+        """Get mapping from LLM names to benchmark names.
+        Returns:
+            Dict mapping llm_name -> name.
+        """
+        return {var.llm_name: var.name for var in self.variables}
+    def get_name_to_llm_mapping(self) -> dict[str, str]:
+        """Get mapping from benchmark names to LLM names.
+        Returns:
+            Dict mapping name -> llm_name.
+        """
+        return {var.name: var.llm_name for var in self.variables}
+    def uses_distinct_llm_names(self) -> bool:
+        """Check if any variable has a different llm_name from name.
+        Returns:
+            True if at least one variable has llm_name != name.
+        """
+        return any(var.llm_name != var.name for var in self.variables)

causaliq_knowledge/graph/params.py ADDED Viewed

@@ -0,0 +1,175 @@
+"""Shared parameter models for graph generation.
+This module provides Pydantic models for validating graph generation
+parameters, shared between CLI commands and workflow actions.
+"""
+from __future__ import annotations
+from pathlib import Path
+from typing import Any, Optional
+from pydantic import BaseModel, Field, field_validator
+from causaliq_knowledge.graph.view_filter import PromptDetail
+class GenerateGraphParams(BaseModel):
+    """Parameters for graph generation - shared by CLI and Action.
+    This model provides validation for all graph generation parameters,
+    ensuring consistent behaviour between CLI invocation and workflow
+    action execution.
+    Attributes:
+        model_spec: Path to model specification JSON file.
+        prompt_detail: Detail level for variable information in prompts.
+        use_benchmark_names: Use benchmark names instead of LLM names.
+        llm_model: LLM model identifier with provider prefix.
+        output: Output destination - .json file path or "none" for stdout.
+        llm_cache: Path to cache database file (.db) or "none" to disable.
+        llm_temperature: LLM sampling temperature.
+    Example:
+        >>> params = GenerateGraphParams(
+        ...     model_spec=Path("model.json"),
+        ...     prompt_detail=PromptDetail.STANDARD,
+        ...     llm_model="groq/llama-3.1-8b-instant",
+        ...     output="none",
+        ...     llm_cache="cache.db",
+        ... )
+    """
+    model_spec: Path = Field(
+        ...,
+        description="Path to model specification JSON file",
+    )
+    prompt_detail: PromptDetail = Field(
+        default=PromptDetail.STANDARD,
+        description="Detail level for variable information in prompts",
+    )
+    use_benchmark_names: bool = Field(
+        default=False,
+        description="Use benchmark names instead of LLM names",
+    )
+    llm_model: str = Field(
+        default="groq/llama-3.1-8b-instant",
+        description="LLM model identifier with provider prefix",
+    )
+    output: str = Field(
+        ...,
+        description="Output destination: .json file path or 'none' for stdout",
+    )
+    llm_cache: str = Field(
+        ...,
+        description="Path to cache database file (.db) or 'none' to disable",
+    )
+    llm_temperature: float = Field(
+        default=0.1,
+        ge=0.0,
+        le=2.0,
+        description="LLM sampling temperature (0.0-2.0)",
+    )
+    model_config = {"arbitrary_types_allowed": True}
+    @field_validator("llm_model")
+    @classmethod
+    def validate_llm_model_format(cls, v: str) -> str:
+        """Validate LLM model identifier has provider prefix."""
+        valid_prefixes = (
+            "anthropic/",
+            "deepseek/",
+            "gemini/",
+            "groq/",
+            "mistral/",
+            "ollama/",
+            "openai/",
+        )
+        if not v.startswith(valid_prefixes):
+            raise ValueError(
+                f"LLM model must start with provider prefix. "
+                f"Valid prefixes: {', '.join(valid_prefixes)}. Got: {v}"
+            )
+        return v
+    @field_validator("llm_cache")
+    @classmethod
+    def validate_llm_cache_format(cls, v: str) -> str:
+        """Validate llm_cache is 'none' or a path ending with .db."""
+        if v.lower() == "none":
+            return "none"
+        if not v.endswith(".db"):
+            raise ValueError(
+                f"llm_cache must be 'none' or a path ending with .db. "
+                f"Got: {v}"
+            )
+        return v
+    @field_validator("output")
+    @classmethod
+    def validate_output_format(cls, v: str) -> str:
+        """Validate output is 'none' or a path ending with .json."""
+        if v.lower() == "none":
+            return "none"
+        if not v.endswith(".json"):
+            raise ValueError(
+                f"output must be 'none' or a path ending with .json. "
+                f"Got: {v}"
+            )
+        return v
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "GenerateGraphParams":
+        """Create params from dictionary with string-to-enum conversion.
+        This method handles conversion of string values to enum types,
+        useful when receiving parameters from workflow inputs.
+        Args:
+            data: Dictionary of parameter values.
+        Returns:
+            Validated GenerateGraphParams instance.
+        Raises:
+            ValueError: If validation fails.
+        """
+        # Convert string values to enums where needed
+        processed = dict(data)
+        # Convert prompt_detail string to PromptDetail enum
+        if "prompt_detail" in processed and isinstance(
+            processed["prompt_detail"], str
+        ):
+            processed["prompt_detail"] = PromptDetail(
+                processed["prompt_detail"].lower()
+            )
+        # Convert model_spec string to Path
+        if "model_spec" in processed and isinstance(
+            processed["model_spec"], str
+        ):
+            processed["model_spec"] = Path(processed["model_spec"])
+        return cls(**processed)
+    def get_effective_cache_path(self) -> Optional[Path]:
+        """Get the effective cache path.
+        Returns:
+            Path to cache database, or None if caching is disabled.
+        """
+        if self.llm_cache.lower() == "none":
+            return None
+        return Path(self.llm_cache)
+    def get_effective_output_path(self) -> Optional[Path]:
+        """Get the effective output path.
+        Returns:
+            Path to output JSON file, or None for stdout.
+        """
+        if self.output.lower() == "none":
+            return None
+        return Path(self.output)

causaliq-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

causaliq-knowledge 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl