causaliq-knowledge 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/causaliq_knowledge/__init__.py
+++ b/causaliq_knowledge/__init__.py
@@ -2,10 +2,11 @@
 causaliq-knowledge: LLM and human knowledge for causal discovery.
 """
 
+from causaliq_knowledge.action import CausalIQAction
 from causaliq_knowledge.base import KnowledgeProvider
 from causaliq_knowledge.models import EdgeDirection, EdgeKnowledge
 
-__version__ = "0.3.0"
+__version__ = "0.4.0"
 __author__ = "CausalIQ"
 __email__ = "info@causaliq.com"
 
@@ -17,7 +18,7 @@ __url__ = "https://github.com/causaliq/causaliq-knowledge"
 __license__ = "MIT"
 
 # Version tuple for programmatic access (major, minor, patch)
-VERSION = (0, 3, 0)
+VERSION = (0, 4, 0)
 
 __all__ = [
     "__version__",
@@ -29,5 +30,7 @@ __all__ = [
     "EdgeDirection",
     # Abstract interface
     "KnowledgeProvider",
+    # Workflow action (auto-discovered by causaliq-workflow)
+    "CausalIQAction",
     # Note: Import LLMKnowledge from causaliq_knowledge.llm
 ]
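
The practical effect of the `__init__.py` change is that the workflow action is importable from the package root. A minimal sketch of what the new export exposes, using only names visible in this diff (instantiating with no arguments is an assumption):

```python
# Sketch based on the diff above: CausalIQAction is re-exported at the
# package root and aliases GenerateGraphAction (defined in action.py below).
from causaliq_knowledge import CausalIQAction

action = CausalIQAction()
print(action.name)            # "causaliq-knowledge"
print(action.version)         # "0.4.0"
print(sorted(action.inputs))  # action, llm_cache, llm_model, ...
```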
--- /dev/null
+++ b/causaliq_knowledge/action.py
@@ -0,0 +1,480 @@
+"""CausalIQ workflow action for graph generation.
+
+This module provides the workflow action integration for causaliq-knowledge,
+allowing graph generation to be used as a step in CausalIQ workflows.
+
+The action is auto-discovered by causaliq-workflow when this package is
+imported, using the convention of exporting a class named 'CausalIQAction'.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Dict, Optional
+
+from causaliq_workflow.action import (
+    ActionExecutionError,
+    ActionInput,
+    ActionValidationError,
+)
+from causaliq_workflow.action import CausalIQAction as BaseCausalIQAction
+from causaliq_workflow.logger import WorkflowLogger
+from causaliq_workflow.registry import WorkflowContext
+from pydantic import ValidationError
+
+from causaliq_knowledge.graph.params import GenerateGraphParams
+
+if TYPE_CHECKING:  # pragma: no cover
+    from causaliq_knowledge.graph.response import GeneratedGraph
+
+logger = logging.getLogger(__name__)
+
+# Re-export for convenience (unused imports are intentional for API surface)
+__all__ = [
+    "ActionExecutionError",
+    "ActionInput",
+    "ActionValidationError",
+    "BaseCausalIQAction",
+    "WorkflowContext",
+    "WorkflowLogger",
+    "GenerateGraphAction",
+    "CausalIQAction",
+    "SUPPORTED_ACTIONS",
+]
+
+
+# Supported actions within this package
+SUPPORTED_ACTIONS = {"generate_graph"}
+
+
+def _create_action_inputs() -> Dict[str, Any]:
+    """Create action input specifications.
+
+    Returns:
+        Dictionary of ActionInput specifications.
+    """
+    return {
+        "action": ActionInput(
+            name="action",
+            description="Action to perform (e.g., 'generate_graph')",
+            required=True,
+            type_hint="str",
+        ),
+        "model_spec": ActionInput(
+            name="model_spec",
+            description="Path to model specification JSON file",
+            required=True,
+            type_hint="str",
+        ),
+        "prompt_detail": ActionInput(
+            name="prompt_detail",
+            description="Detail level for prompts: minimal, standard, or rich",
+            required=False,
+            default="standard",
+            type_hint="str",
+        ),
+        "use_benchmark_names": ActionInput(
+            name="use_benchmark_names",
+            description="Use benchmark names instead of LLM names",
+            required=False,
+            default=False,
+            type_hint="bool",
+        ),
+        "llm_model": ActionInput(
+            name="llm_model",
+            description="LLM model identifier (e.g., groq/llama-3.1-8b)",
+            required=False,
+            default="groq/llama-3.1-8b-instant",
+            type_hint="str",
+        ),
+        "output": ActionInput(
+            name="output",
+            description="Output: .json file path or 'none' for stdout",
+            required=True,
+            type_hint="str",
+        ),
+        "llm_cache": ActionInput(
+            name="llm_cache",
+            description="Path to cache database (.db) or 'none' to disable",
+            required=True,
+            type_hint="str",
+        ),
+        "llm_temperature": ActionInput(
+            name="llm_temperature",
+            description="LLM sampling temperature (0.0-2.0)",
+            required=False,
+            default=0.1,
+            type_hint="float",
+        ),
+    }
+
+
+class GenerateGraphAction(BaseCausalIQAction):
+    """Workflow action for generating causal graphs from model specifications.
+
+    This action integrates causaliq-knowledge graph generation into
+    CausalIQ workflows, allowing LLM-based graph generation to be used
+    as workflow steps.
+
+    The action supports the 'generate_graph' operation, which:
+    - Loads a model specification from a JSON file
+    - Queries an LLM to propose causal relationships
+    - Returns the generated graph structure
+
+    Attributes:
+        name: Action identifier for workflow 'uses' field.
+        version: Action version.
+        description: Human-readable description.
+        inputs: Input parameter specifications.
+
+    Example workflow step:
+        ```yaml
+        steps:
+          - name: Generate causal graph
+            uses: causaliq-knowledge
+            with:
+              action: generate_graph
+              model_spec: "{{data_dir}}/cancer.json"
+              llm_cache: "{{data_dir}}/cancer_llm.db"
+              prompt_detail: standard
+              llm_model: groq/llama-3.1-8b-instant
+        ```
+    """
+
+    name: str = "causaliq-knowledge"
+    version: str = "0.4.0"
+    description: str = "Generate causal graphs using LLM knowledge"
+    author: str = "CausalIQ"
+
+    inputs: Dict[str, Any] = _create_action_inputs()
+    outputs: Dict[str, str] = {
+        "graph": "Generated graph structure as JSON",
+        "edge_count": "Number of edges in generated graph",
+        "variable_count": "Number of variables in the model",
+        "model_used": "LLM model used for generation",
+        "cached": "Whether the result was retrieved from cache",
+    }
+
+    def validate_inputs(self, inputs: Dict[str, Any]) -> bool:
+        """Validate input values against specifications.
+
+        Args:
+            inputs: Dictionary of input values to validate.
+
+        Returns:
+            True if all inputs are valid.
+
+        Raises:
+            ActionValidationError: If validation fails.
+        """
+        # Check required 'action' parameter
+        if "action" not in inputs:
+            raise ActionValidationError(
+                "Missing required input: 'action'. "
+                f"Supported actions: {SUPPORTED_ACTIONS}"
+            )
+
+        action = inputs["action"]
+        if action not in SUPPORTED_ACTIONS:
+            raise ActionValidationError(
+                f"Unknown action: '{action}'. "
+                f"Supported actions: {SUPPORTED_ACTIONS}"
+            )
+
+        # For generate_graph, validate using GenerateGraphParams
+        if action == "generate_graph":
+            # Check required model_spec
+            if "model_spec" not in inputs:
+                raise ActionValidationError(
+                    "Missing required input: 'model_spec' for generate_graph"
+                )
+
+            # Build params dict (excluding 'action' which isn't a param)
+            params_dict = {k: v for k, v in inputs.items() if k != "action"}
+
+            try:
+                # Validate using Pydantic model
+                GenerateGraphParams.from_dict(params_dict)
+            except (ValidationError, ValueError) as e:
+                raise ActionValidationError(
+                    f"Invalid parameters for generate_graph: {e}"
+                )
+
+        return True
+
+    def run(
+        self,
+        inputs: Dict[str, Any],
+        mode: str = "dry-run",
+        context: Optional[Any] = None,
+        logger: Optional[Any] = None,
+    ) -> Dict[str, Any]:
+        """Execute the action with validated inputs.
+
+        Args:
+            inputs: Dictionary of input values keyed by input name.
+            mode: Execution mode ('dry-run', 'run', 'compare').
+            context: Workflow context for optimisation.
+            logger: Optional logger for task execution reporting.
+
+        Returns:
+            Dictionary containing:
+            - status: 'success' or 'skipped' (for dry-run)
+            - graph: Generated graph as JSON (if run mode)
+            - edge_count: Number of edges
+            - variable_count: Number of variables
+            - model_used: LLM model identifier
+            - cached: Whether result was from cache
+
+        Raises:
+            ActionExecutionError: If action execution fails.
+        """
+        # Validate inputs first
+        self.validate_inputs(inputs)
+
+        action = inputs["action"]
+
+        if action == "generate_graph":
+            return self._run_generate_graph(inputs, mode, context, logger)
+        else:  # pragma: no cover
+            # This shouldn't happen after validate_inputs
+            raise ActionExecutionError(f"Unknown action: {action}")
+
+    def _run_generate_graph(
+        self,
+        inputs: Dict[str, Any],
+        mode: str,
+        context: Optional[Any],
+        logger: Optional[Any],
+    ) -> Dict[str, Any]:
+        """Execute the generate_graph action.
+
+        Args:
+            inputs: Validated input parameters.
+            mode: Execution mode.
+            context: Workflow context.
+            logger: Optional workflow logger.
+
+        Returns:
+            Action result dictionary.
+        """
+        # Build params (excluding 'action')
+        params_dict = {k: v for k, v in inputs.items() if k != "action"}
+
+        try:
+            params = GenerateGraphParams.from_dict(params_dict)
+        except (ValidationError, ValueError) as e:
+            raise ActionExecutionError(f"Parameter validation failed: {e}")
+
+        # Check model_spec exists
+        if not params.model_spec.exists():
+            raise ActionExecutionError(
+                f"Model specification not found: {params.model_spec}"
+            )
+
+        # Dry-run mode: validate only, don't execute
+        if mode == "dry-run":
+            return self._dry_run_result(params)
+
+        # Run mode: execute graph generation
+        return self._execute_generate_graph(params)
+
+    def _dry_run_result(self, params: GenerateGraphParams) -> Dict[str, Any]:
+        """Return dry-run result without executing.
+
+        Args:
+            params: Validated parameters.
+
+        Returns:
+            Dry-run result dictionary.
+        """
+        return {
+            "status": "skipped",
+            "message": "Dry-run mode: would generate graph",
+            "model_spec": str(params.model_spec),
+            "llm_model": params.llm_model,
+            "prompt_detail": params.prompt_detail.value,
+            "output": params.output,
+        }
+
+    def _execute_generate_graph(
+        self, params: GenerateGraphParams
+    ) -> Dict[str, Any]:
+        """Execute graph generation.
+
+        Args:
+            params: Validated parameters.
+
+        Returns:
+            Result dictionary with generated graph.
+        """
+        # Import here to avoid slow startup and circular imports
+        from causaliq_knowledge.cache import TokenCache
+        from causaliq_knowledge.graph import ModelLoader
+        from causaliq_knowledge.graph.generator import (
+            GraphGenerator,
+            GraphGeneratorConfig,
+        )
+
+        try:
+            # Load model specification
+            spec = ModelLoader.load(params.model_spec)
+            logger.info(
+                f"Loaded model specification: {spec.dataset_id} "
+                f"({len(spec.variables)} variables)"
+            )
+        except Exception as e:
+            raise ActionExecutionError(
+                f"Failed to load model specification: {e}"
+            )
+
+        # Track mapping for name conversion
+        llm_to_benchmark_mapping: Dict[str, str] = {}
+
+        # Determine naming mode
+        use_llm_names = not params.use_benchmark_names
+        if use_llm_names and spec.uses_distinct_llm_names():
+            llm_to_benchmark_mapping = spec.get_llm_to_name_mapping()
+
+        # Set up cache
+        cache: Optional[TokenCache] = None
+        cache_path = params.get_effective_cache_path()
+        if cache_path is not None:
+            try:
+                cache = TokenCache(str(cache_path))
+                cache.open()
+            except Exception as e:
+                raise ActionExecutionError(f"Failed to open cache: {e}")
+
+        try:
+            # Import OutputFormat for generator config
+            from causaliq_knowledge.graph.prompts import OutputFormat
+
+            # Create generator - always use edge_list format
+            # Derive request_id from output filename stem
+            if params.output.lower() == "none":
+                request_id = "none"
+            else:
+                request_id = Path(params.output).stem
+
+            config = GraphGeneratorConfig(
+                temperature=params.llm_temperature,
+                output_format=OutputFormat.EDGE_LIST,
+                prompt_detail=params.prompt_detail,
+                use_llm_names=use_llm_names,
+                request_id=request_id,
+            )
+            generator = GraphGenerator(
+                model=params.llm_model, config=config, cache=cache
+            )
+
+            # Generate graph
+            graph = generator.generate_from_spec(
+                spec, level=params.prompt_detail
+            )
+
+            # Map LLM names back to benchmark names
+            if llm_to_benchmark_mapping:
+                graph = self._map_graph_names(graph, llm_to_benchmark_mapping)
+
+            # Get stats
+            stats = generator.get_stats()
+
+            # Build result
+            result = {
+                "status": "success",
+                "graph": self._graph_to_dict(graph),
+                "edge_count": len(graph.edges),
+                "variable_count": len(graph.variables),
+                "model_used": params.llm_model,
+                "cached": stats.get("cache_hits", 0) > 0,
+                "outputs": {
+                    "graph": self._graph_to_dict(graph),
+                    "edge_count": len(graph.edges),
+                    "variable_count": len(graph.variables),
+                    "model_used": params.llm_model,
+                    "cached": stats.get("cache_hits", 0) > 0,
+                },
+            }
+
+            # Write output file if specified
+            output_path = params.get_effective_output_path()
+            if output_path:
+                output_path.parent.mkdir(parents=True, exist_ok=True)
+                output_path.write_text(
+                    json.dumps(result["graph"], indent=2),
+                    encoding="utf-8",
+                )
+                result["output_file"] = str(output_path)
+
+            return result
+
+        except Exception as e:
+            raise ActionExecutionError(f"Graph generation failed: {e}")
+        finally:
+            if cache:
+                cache.close()
+
+    def _graph_to_dict(self, graph: "GeneratedGraph") -> Dict[str, Any]:
+        """Convert GeneratedGraph to dictionary.
+
+        Args:
+            graph: Generated graph object.
+
+        Returns:
+            Dictionary representation of the graph.
+        """
+        return {
+            "edges": [
+                {
+                    "source": edge.source,
+                    "target": edge.target,
+                    "confidence": edge.confidence,
+                }
+                for edge in graph.edges
+            ],
+            "variables": graph.variables,
+            "reasoning": graph.reasoning,
+        }
+
+    def _map_graph_names(
+        self, graph: "GeneratedGraph", mapping: Dict[str, str]
+    ) -> "GeneratedGraph":
+        """Map variable names in a graph using a mapping dictionary.
+
+        Args:
+            graph: The generated graph with edges to map.
+            mapping: Dictionary mapping old names to new names.
+
+        Returns:
+            New GeneratedGraph with mapped variable names.
+        """
+        from causaliq_knowledge.graph.response import (
+            GeneratedGraph,
+            ProposedEdge,
+        )
+
+        new_edges = []
+        for edge in graph.edges:
+            new_edge = ProposedEdge(
+                source=mapping.get(edge.source, edge.source),
+                target=mapping.get(edge.target, edge.target),
+                confidence=edge.confidence,
+            )
+            new_edges.append(new_edge)
+
+        new_variables = [mapping.get(v, v) for v in graph.variables]
+
+        return GeneratedGraph(
+            edges=new_edges,
+            variables=new_variables,
+            reasoning=graph.reasoning,
+            metadata=graph.metadata,
+        )
+
+
+# Export as CausalIQAction for auto-discovery by causaliq-workflow
+# This name is required by the auto-discovery convention
+CausalIQAction = GenerateGraphAction
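
Since `run()` defaults to `mode="dry-run"`, the action can be exercised without an LLM call: inputs are validated via `GenerateGraphParams`, the spec file's existence is checked, and a `{"status": "skipped", ...}` summary is returned. A minimal sketch of driving the action directly (the file paths are hypothetical):

```python
from causaliq_knowledge import CausalIQAction

action = CausalIQAction()
inputs = {
    "action": "generate_graph",             # must be in SUPPORTED_ACTIONS
    "model_spec": "data/cancer.json",       # hypothetical spec path
    "output": "results/cancer_graph.json",  # .json path, or "none" for stdout
    "llm_cache": "data/cancer_llm.db",      # .db path, or "none" to disable
}

# Dry-run: validates and reports what would happen; no LLM query is made.
result = action.run(inputs, mode="dry-run")
print(result["status"])  # "skipped"

# mode="run" would execute _execute_generate_graph and return the graph.
```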
@@ -173,7 +173,8 @@ class JsonEncoder(EntryEncoder):
         """Encode a string value with tokenisation.
 
         Strings are split into tokens (words/punctuation) with special
-        markers for string start/end.
+        markers for string start/end. Double quotes within the string
+        are encoded as a '\\"' token to distinguish them from the delimiters.
 
         Args:
             value: String to encode.
@@ -184,7 +185,11 @@ class JsonEncoder(EntryEncoder):
         # Split on whitespace and punctuation, keeping delimiters
         tokens = self._tokenise_string(value)
         for token in tokens:
-            self._encode_token(token, token_cache, result)
+            # Escape embedded quotes to distinguish from string delimiter
+            if token == '"':
+                self._encode_token('\\"', token_cache, result)
+            else:
+                self._encode_token(token, token_cache, result)
         self._encode_token('"', token_cache, result)
 
     def _encode_list(
@@ -296,6 +301,9 @@ class JsonEncoder(EntryEncoder):
     ) -> tuple[str, int]:
         """Decode a string value (after opening quote consumed).
 
+        Handles escaped quotes (the '\\"' token), which represent literal
+        double quotes within the string content.
+
         Args:
             blob: Binary data to decode.
             offset: Current position (after opening quote).
@@ -317,7 +325,11 @@ class JsonEncoder(EntryEncoder):
             if token == '"':
                 # End of string
                 return "".join(parts), offset
-            parts.append(token)
+            elif token == '\\"':
+                # Escaped quote - append literal quote character
+                parts.append('"')
+            else:
+                parts.append(token)
         raise ValueError("Unterminated string")
 
     def _decode_list(
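
The encoder change is easiest to see as a round trip: an embedded `"` is emitted as a `\"` token so the decoder can tell it apart from the closing delimiter. A self-contained sketch of the scheme (not the real `JsonEncoder` API, which streams token IDs through a `TokenCache`; this just illustrates the escaping logic the hunks add):

```python
# Standalone illustration of the quote-escaping scheme in the hunks above.
def encode_tokens(tokens: list[str]) -> list[str]:
    out = []
    for token in tokens:
        out.append('\\"' if token == '"' else token)  # escape embedded quote
    out.append('"')  # closing string delimiter
    return out

def decode_tokens(encoded: list[str]) -> str:
    parts: list[str] = []
    for token in encoded:
        if token == '"':          # unescaped quote ends the string
            return "".join(parts)
        parts.append('"' if token == '\\"' else token)
    raise ValueError("Unterminated string")

tokens = ['say ', '"', 'hi', '"']  # a string containing literal quotes
assert decode_tokens(encode_tokens(tokens)) == 'say "hi"'
```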
@@ -51,6 +51,8 @@ class TokenCache:
                 data BLOB NOT NULL,
                 created_at TEXT NOT NULL,
                 metadata BLOB,
+                hit_count INTEGER DEFAULT 0,
+                last_accessed_at TEXT,
                 PRIMARY KEY (hash, entry_type)
             );
 
@@ -239,6 +241,28 @@ class TokenCache:
         row = cursor.fetchone()
         return int(row[0]) if row else 0
 
+    def total_hits(self, entry_type: str | None = None) -> int:
+        """Get total cache hits across all entries.
+
+        Args:
+            entry_type: If provided, count only hits for this entry type.
+
+        Returns:
+            Total hit count.
+        """
+        if entry_type is None:
+            cursor = self.conn.execute(
+                "SELECT COALESCE(SUM(hit_count), 0) FROM cache_entries"
+            )
+        else:
+            cursor = self.conn.execute(
+                "SELECT COALESCE(SUM(hit_count), 0) FROM cache_entries "
+                "WHERE entry_type = ?",
+                (entry_type,),
+            )
+        row = cursor.fetchone()
+        return int(row[0]) if row else 0
+
     def get_or_create_token(self, token: str) -> int:
         """Get token ID, creating a new entry if needed.
 
@@ -319,7 +343,7 @@ class TokenCache:
         self.conn.commit()
 
     def get(self, hash: str, entry_type: str) -> bytes | None:
-        """Retrieve a cache entry.
+        """Retrieve a cache entry and increment hit count.
 
         Args:
             hash: Unique identifier for the entry.
@@ -334,7 +358,17 @@ class TokenCache:
             (hash, entry_type),
         )
         row = cursor.fetchone()
-        return row[0] if row else None
+        if row:
+            # Increment hit count and update last accessed time
+            self.conn.execute(
+                "UPDATE cache_entries SET hit_count = hit_count + 1, "
+                "last_accessed_at = ? WHERE hash = ? AND entry_type = ?",
+                (self._utcnow_iso(), hash, entry_type),
+            )
+            self.conn.commit()
+            result: bytes = row[0]
+            return result
+        return None
 
     def get_with_metadata(
         self, hash: str, entry_type: str
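
Taken together, the schema and `get()` changes mean every cache hit is now recorded, and `total_hits()` aggregates the counts. A minimal sketch using only calls visible in this diff (the hash and entry type values are hypothetical):

```python
from causaliq_knowledge.cache import TokenCache

cache = TokenCache("cancer_llm.db")  # constructed as in action.py above
cache.open()
try:
    # Each successful get() bumps hit_count and last_accessed_at.
    data = cache.get("some-hash", "llm_response")  # hypothetical keys
    if data is not None:
        print(f"hit: {len(data)} bytes")
    print("hits for this entry type:", cache.total_hits("llm_response"))
    print("hits across all entries: ", cache.total_hits())
finally:
    cache.close()
```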
--- /dev/null
+++ b/causaliq_knowledge/cli/__init__.py
@@ -0,0 +1,15 @@
+"""Command-line interface for causaliq-knowledge.
+
+This package provides the CLI implementation split into logical modules:
+
+- main: Core CLI entry point and query command
+- cache: Cache management commands (stats, export, import)
+- generate: Graph generation commands
+- models: Model listing command
+"""
+
+from __future__ import annotations
+
+from causaliq_knowledge.cli.main import cli, main
+
+__all__ = ["cli", "main"]
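
The new package re-exports `cli` and `main` at `causaliq_knowledge.cli`, so callers need not import the submodules directly. A hedged sketch of a console-script shim (only the import line is confirmed by the diff; that `main()` returns a process exit status is an assumption):

```python
# Hypothetical shim showing how an entry point might call the re-exported
# main(). The import path is the one established by the __init__.py above.
from causaliq_knowledge.cli import main

if __name__ == "__main__":
    raise SystemExit(main())
```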