PyPI - khora - Versions diffs - 0.0.1__py3-none-any.whl - Mend

khora 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

khora/__init__.py +6 -0
khora/__main__.py +101 -0
khora/agents/__init__.py +6 -0
khora/agents/data_fetcher.py +158 -0
khora/agents/pipeline_builder.py +217 -0
khora/pipelines/__init__.py +6 -0
khora/pipelines/data_pipeline.py +131 -0
khora/pipelines/definitions.py +14 -0
khora/tools/__init__.py +7 -0
khora/tools/api_tool.py +81 -0
khora/tools/google_docs_tool.py +169 -0
khora/tools/web_scraper_tool.py +197 -0
khora/utils/__init__.py +6 -0
khora/utils/config.py +54 -0
khora/utils/data_models.py +57 -0
khora-0.0.1.dist-info/METADATA +309 -0
khora-0.0.1.dist-info/RECORD +19 -0
khora-0.0.1.dist-info/WHEEL +4 -0
khora-0.0.1.dist-info/licenses/LICENSE +21 -0

khora/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""
+Khora - Ad-hoc Dagster pipelines for data fetching using AI/LLM prompts.
+"""
+# Package version string.
+__version__ = "0.0.1"
+# Names listed here are the sub-packages pulled in by ``from khora import *``.
+__all__ = ["agents", "pipelines", "tools", "utils"]

khora/__main__.py ADDED Viewed

@@ -0,0 +1,101 @@
+"""Main entry point for Khora CLI."""
+import asyncio
+import json
+import sys
+from pathlib import Path
+from khora.agents import DataFetcherAgent, PipelineBuilderAgent
+from khora.utils.config import load_config
+from khora.utils.data_models import DataRequest, DataSourceType
+def _print_usage() -> None:
+    """Print the top-level usage help for the CLI."""
+    print("\nUsage:")
+    print("  python -m khora fetch <source_type> <prompt>")
+    print("  python -m khora build <pipeline_description>")
+    print("\nExamples:")
+    print("  python -m khora fetch api 'Get weather data for NYC'")
+    print(
+        "  python -m khora build 'Create pipeline to fetch crypto prices and news'"
+    )
+async def main() -> None:
+    """Run the Khora command-line interface.
+    Commands are read from ``sys.argv``:
+    ``fetch <source_type> <prompt>`` runs one request through
+    ``DataFetcherAgent`` and prints the fetched data or the error message.
+    ``build <pipeline_description>`` asks ``PipelineBuilderAgent`` for a
+    pipeline configuration and writes the generated code to
+    ``<name>_pipeline.py`` in the current directory.
+    With no command the usage help is printed.
+    Exits with status 1 when the OpenAI API key is not configured, when
+    required arguments are missing, when the source type is not a valid
+    ``DataSourceType`` value, or when the command is not recognised.
+    """
+    config = load_config()
+    api_key = config.get("openai_api_key")
+    if not api_key:
+        print("Error: OPENAI_API_KEY not set in environment")
+        sys.exit(1)
+    model = config.get("openai_model", "gpt-4-turbo-preview")
+    print("Khora - AI-powered Data Pipeline Builder")
+    print("=" * 40)
+    args = sys.argv[1:]
+    if not args:
+        _print_usage()
+        return
+    command = args[0]
+    if command == "fetch":
+        if len(args) < 3:
+            print("Usage: python -m khora fetch <source_type> <prompt>")
+            sys.exit(1)
+        source_type = args[1]
+        prompt = " ".join(args[2:])
+        # Only the enum conversion is guarded: a ValueError raised later while
+        # fetching (json.JSONDecodeError is one) must not be reported as a bad
+        # source type.
+        try:
+            source = DataSourceType(source_type)
+        except ValueError:
+            print(
+                "Error: Invalid source type. Valid types: api, web_scraper, google_docs, spreadsheet"
+            )
+            sys.exit(1)
+        fetcher = DataFetcherAgent(openai_api_key=api_key, model=model)
+        request = DataRequest(source_type=source, prompt=prompt)
+        print(f"Fetching data from {source_type}...")
+        response = await fetcher.fetch_data(request)
+        if response.status == "success":
+            print("Success! Data fetched:")
+            print(json.dumps(response.data, indent=2))
+        else:
+            print(f"Error: {response.error_message}")
+    elif command == "build":
+        if len(args) < 2:
+            print("Usage: python -m khora build <pipeline_description>")
+            sys.exit(1)
+        description = " ".join(args[1:])
+        builder = PipelineBuilderAgent(openai_api_key=api_key, model=model)
+        print("Analyzing pipeline request...")
+        pipeline_config = builder.analyze_pipeline_request(description)
+        print(f"\nGenerated Pipeline: {pipeline_config.name}")
+        print(f"Description: {pipeline_config.description}")
+        print(f"Number of data sources: {len(pipeline_config.requests)}")
+        # Generate code
+        code = builder.generate_pipeline_code(pipeline_config)
+        # Save to file; the encoding is pinned so prompts containing non-ASCII
+        # text are written the same way on every platform.
+        output_file = Path(f"{pipeline_config.name}_pipeline.py")
+        output_file.write_text(code, encoding="utf-8")
+        print(f"\nPipeline code saved to: {output_file}")
+        print("\nTo run the pipeline:")
+        print(f"  dagster dev -f {output_file}")
+    else:
+        # Previously an unrecognised command printed nothing and exited 0.
+        print(f"Error: Unknown command '{command}'")
+        _print_usage()
+        sys.exit(1)
+# Run the async CLI only when executed as a script / ``python -m khora``.
+if __name__ == "__main__":
+    asyncio.run(main())

khora/agents/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""Agents module for AI-powered data fetching."""
+from .data_fetcher import DataFetcherAgent
+from .pipeline_builder import PipelineBuilderAgent
+__all__ = ["DataFetcherAgent", "PipelineBuilderAgent"]

khora/agents/data_fetcher.py ADDED Viewed

@@ -0,0 +1,158 @@
+"""Data fetcher agent using LangGraph for orchestration."""
+import json
+from typing import Any, Dict, List, Optional, TypedDict
+from langchain.schema import BaseMessage, HumanMessage, SystemMessage
+from langchain_openai import ChatOpenAI
+from langgraph.graph import END, StateGraph
+from pydantic import SecretStr
+from khora.tools import APITool, GoogleDocsTool, WebScraperTool
+from khora.utils.data_models import DataRequest, DataResponse, DataSourceType
+class AgentState(TypedDict):
+    """State for the data fetcher agent."""
+    # Prompt messages sent to the LLM plus its reply (set by _analyze_request).
+    messages: List[BaseMessage]
+    # The data request being served; supplied by fetch_data.
+    request: DataRequest
+    # Final result, filled in by _process_response; None until then.
+    response: Optional[DataResponse]
+    # Planned tool invocations, each a dict with "tool" and "parameters" keys.
+    tool_calls: List[Dict[str, Any]]
+    # JSON-encoded tool result written by _execute_tool; None until then.
+    final_answer: Optional[str]
+class DataFetcherAgent:
+    """Agent for fetching data based on AI prompts using LangGraph.
+    The compiled graph runs three nodes in sequence: ``analyze_request`` asks
+    the LLM for tool parameters, ``execute_tool`` runs the tool registered for
+    the request's source type, and ``process_response`` turns the tool result
+    into a ``DataResponse``.
+    """
+    def __init__(self, openai_api_key: str, model: str = "gpt-4-turbo-preview"):
+        """Create the LLM client, the tool registry and the compiled graph.
+        Args:
+            openai_api_key: API key passed to ``ChatOpenAI``.
+            model: Chat model name.
+        """
+        self.llm = ChatOpenAI(
+            api_key=SecretStr(openai_api_key), model=model, temperature=0
+        )
+        # One tool per source type; spreadsheets share the Google Docs tool.
+        self.tools = {
+            DataSourceType.API: APITool(),
+            DataSourceType.WEB_SCRAPER: WebScraperTool(),
+            DataSourceType.GOOGLE_DOCS: GoogleDocsTool(),
+            DataSourceType.SPREADSHEET: GoogleDocsTool(),
+        }
+        # Build the graph
+        self.graph = self._build_graph()
+    def _build_graph(self):  # type: ignore
+        """Build and compile the linear LangGraph state graph."""
+        workflow = StateGraph(AgentState)
+        # Add nodes
+        workflow.add_node("analyze_request", self._analyze_request)
+        workflow.add_node("execute_tool", self._execute_tool)
+        workflow.add_node("process_response", self._process_response)
+        # Add edges
+        workflow.set_entry_point("analyze_request")
+        workflow.add_edge("analyze_request", "execute_tool")
+        workflow.add_edge("execute_tool", "process_response")
+        workflow.add_edge("process_response", END)
+        return workflow.compile()
+    @staticmethod
+    def _parse_json_object(content: str) -> Optional[Dict[str, Any]]:
+        """Parse *content* as a JSON object, tolerating a Markdown code fence.
+        Returns:
+            The decoded dict, or ``None`` when the text is not valid JSON or
+            decodes to something other than an object.
+        """
+        text = content.strip()
+        if text.startswith("```"):
+            # Drop the opening fence line (``` or ```json) and the closing fence.
+            text = text.split("\n", 1)[1] if "\n" in text else ""
+            text = text.rsplit("```", 1)[0]
+        try:
+            parsed = json.loads(text)
+        except json.JSONDecodeError:
+            return None
+        return parsed if isinstance(parsed, dict) else None
+    def _analyze_request(self, state: AgentState) -> AgentState:
+        """Ask the LLM for tool parameters and record the planned tool call.
+        Falls back to ``request.source_config`` when the reply is not a JSON
+        object, because the parameters are later expanded with ``**``.
+        """
+        request = state["request"]
+        system_prompt = f"""
+        You are a data fetching assistant. Analyze the user's request and determine
+        how to fetch the data using the {request.source_type} tool.
+        Based on the prompt: "{request.prompt}"
+        And the source configuration: {json.dumps(request.source_config)}
+        Determine the exact parameters needed for the tool invocation.
+        Respond with a JSON object containing the tool parameters.
+        """
+        messages = [
+            SystemMessage(content=system_prompt),
+            HumanMessage(content=request.prompt),
+        ]
+        response = self.llm.invoke(messages)
+        content = (
+            response.content
+            if isinstance(response.content, str)
+            else str(response.content)
+        )
+        tool_params = self._parse_json_object(content)
+        if tool_params is None:
+            # Fallback to basic parameters
+            tool_params = request.source_config
+        state["tool_calls"] = [{"tool": request.source_type, "parameters": tool_params}]
+        state["messages"] = messages + [response]
+        return state
+    def _execute_tool(self, state: AgentState) -> AgentState:
+        """Run the planned tool call and store its JSON-encoded result.
+        A failure inside the tool, or a result that cannot be JSON-encoded, is
+        recorded as an error result so callers receive an error
+        ``DataResponse`` instead of an exception escaping the graph.
+        """
+        tool_call = state["tool_calls"][0]
+        tool = self.tools[DataSourceType(tool_call["tool"])]
+        try:
+            # Execute tool directly
+            result = tool._run(**tool_call["parameters"])  # type: ignore
+            state["final_answer"] = json.dumps(result)
+        except Exception as exc:  # boundary: converted into an error result
+            state["final_answer"] = json.dumps(
+                {"status": "error", "error": f"{type(exc).__name__}: {exc}"}
+            )
+        return state
+    def _process_response(self, state: AgentState) -> AgentState:
+        """Convert the stored tool result into the final ``DataResponse``."""
+        request = state["request"]
+        final_answer = state["final_answer"] or "{}"
+        tool_result = json.loads(final_answer)
+        if not isinstance(tool_result, dict):
+            # The fields below are read with .get(); anything but an object
+            # cannot carry status/data/error.
+            tool_result = {
+                "status": "error",
+                "error": "Tool returned a result that is not a JSON object",
+            }
+        # Create response
+        response = DataResponse(
+            request_id=f"{request.source_type}_{id(request)}",
+            status=tool_result.get("status", "error"),
+            data=tool_result.get("data"),
+            error_message=tool_result.get("error"),
+            source_type=request.source_type,
+            metadata={
+                "tool_parameters": state["tool_calls"][0]["parameters"],
+                "request_metadata": request.metadata,
+            },
+        )
+        state["response"] = response
+        return state
+    async def fetch_data(self, request: DataRequest) -> DataResponse:
+        """
+        Fetch data based on the request.
+        Args:
+            request: Data request with prompt and configuration
+        Returns:
+            DataResponse with fetched data or error
+        """
+        initial_state: AgentState = {
+            "messages": [],
+            "request": request,
+            "response": None,
+            "tool_calls": [],
+            "final_answer": None,
+        }
+        # Run the graph
+        final_state = await self.graph.ainvoke(initial_state)
+        return final_state["response"]

khora/agents/pipeline_builder.py ADDED Viewed

@@ -0,0 +1,217 @@
+"""Pipeline builder agent for creating Dagster pipelines dynamically."""
+import json
+from typing import Any, Dict
+from dagster import AssetExecutionContext, asset, define_asset_job
+from langchain.schema import HumanMessage, SystemMessage
+from langchain_openai import ChatOpenAI
+from pydantic import SecretStr
+from khora.agents.data_fetcher import DataFetcherAgent
+from khora.utils.data_models import DataRequest, PipelineConfig
+class PipelineBuilderAgent:
+    """Agent for building Dagster pipelines based on natural language descriptions.
+    It can turn a description into a ``PipelineConfig`` via the LLM, build
+    in-memory Dagster assets and a job from such a config, or render the
+    config as the source code of a standalone Dagster module.
+    """
+    def __init__(self, openai_api_key: str, model: str = "gpt-4-turbo-preview"):
+        """Create the LLM client and the data fetcher shared by built assets.
+        Args:
+            openai_api_key: API key passed to ``ChatOpenAI`` and the fetcher.
+            model: Chat model name.
+        """
+        self.llm = ChatOpenAI(
+            api_key=SecretStr(openai_api_key), model=model, temperature=0
+        )
+        self.data_fetcher = DataFetcherAgent(openai_api_key, model)
+    @staticmethod
+    def _strip_code_fence(content: str) -> str:
+        """Return *content* without a surrounding Markdown code fence, if any."""
+        text = content.strip()
+        if text.startswith("```"):
+            # Drop the opening fence line (``` or ```json) and the closing fence.
+            text = text.split("\n", 1)[1] if "\n" in text else ""
+            text = text.rsplit("```", 1)[0]
+        return text
+    @staticmethod
+    def _source_label(source_type: Any) -> str:
+        """Return the plain string value of a source type.
+        Works for both an enum member and an already-plain string; formatting
+        an enum member directly can render as ``ClassName.MEMBER`` on newer
+        Python versions, which is not usable in asset or function names.
+        """
+        return str(getattr(source_type, "value", source_type))
+    @staticmethod
+    def _identifier(name: str) -> str:
+        """Turn *name* into a valid Python identifier for generated code."""
+        import re
+        cleaned = re.sub(r"\W", "_", name)
+        return f"_{cleaned}" if cleaned[:1].isdigit() else cleaned
+    def analyze_pipeline_request(self, description: str) -> PipelineConfig:
+        """
+        Analyze a natural language pipeline description and create PipelineConfig.
+        Args:
+            description: Natural language description of the pipeline
+        Returns:
+            PipelineConfig with structured pipeline definition. When the LLM
+            reply cannot be turned into one, a fallback config named
+            "custom_pipeline" with no requests is returned.
+        """
+        system_prompt = """
+        You are a pipeline configuration assistant. Analyze the user's description
+        and create a structured pipeline configuration.
+        Identify:
+        1. Data sources to fetch from (API, web scraping, Google Docs/Sheets)
+        2. The sequence of operations
+        3. Any transformations or processing needed
+        4. Output format requirements
+        Respond with a JSON object that matches the PipelineConfig schema:
+        {
+            "name": "pipeline_name",
+            "description": "pipeline description",
+            "requests": [
+                {
+                    "source_type": "api|web_scraper|google_docs|spreadsheet",
+                    "prompt": "what data to fetch",
+                    "source_config": {},
+                    "filters": {},
+                    "metadata": {}
+                }
+            ],
+            "parallel_execution": true/false,
+            "output_format": "json"
+        }
+        """
+        messages = [
+            SystemMessage(content=system_prompt),
+            HumanMessage(content=description),
+        ]
+        response = self.llm.invoke(messages)
+        content = (
+            response.content
+            if isinstance(response.content, str)
+            else str(response.content)
+        )
+        try:
+            config_dict = json.loads(self._strip_code_fence(content))
+            if not isinstance(config_dict, dict):
+                raise ValueError("pipeline configuration must be a JSON object")
+            # Convert to PipelineConfig
+            config_dict["requests"] = [
+                DataRequest(**req) for req in config_dict.get("requests", [])
+            ]
+            return PipelineConfig(**config_dict)
+        except (ValueError, TypeError):
+            # ValueError covers json.JSONDecodeError; TypeError covers a
+            # "requests" value that is null or holds non-object entries.
+            # Fallback to a simple configuration
+            return PipelineConfig(
+                name="custom_pipeline",
+                description=description,
+                requests=[],
+                parallel_execution=True,
+            )
+    def _make_fetch_asset(self, asset_name: str, request: DataRequest):  # type: ignore
+        """Build one Dagster asset that fetches *request*.
+        The request and fetcher are captured by closure rather than passed as
+        default arguments: Dagster interprets every non-context parameter of
+        an asset function as an upstream asset input.
+        """
+        fetcher = self.data_fetcher
+        source = self._source_label(request.source_type)
+        @asset(
+            name=asset_name,
+            description=f"Fetch data: {request.prompt}",
+            metadata={"source_type": source, "prompt": request.prompt},
+        )
+        async def fetch_data_asset(context: AssetExecutionContext) -> Dict[str, Any]:
+            """Asset for fetching data based on request."""
+            context.log.info(f"Fetching data from {source}")
+            response = await fetcher.fetch_data(request)
+            if response.status == "error":
+                context.log.error(f"Error fetching data: {response.error_message}")
+                raise Exception(response.error_message)
+            return response.data or {}
+        return fetch_data_asset
+    def build_pipeline(self, config: PipelineConfig) -> Dict[str, Any]:
+        """
+        Build a Dagster pipeline from configuration.
+        Args:
+            config: Pipeline configuration
+        Returns:
+            Dictionary containing Dagster assets and jobs
+        """
+        # Create assets for each data request
+        assets = [
+            self._make_fetch_asset(
+                f"{config.name}_{self._source_label(request.source_type)}_{i}",
+                request,
+            )
+            for i, request in enumerate(config.requests)
+        ]
+        # Create a job that runs all assets
+        job = define_asset_job(
+            name=f"{config.name}_job",
+            selection=[asset_def.key for asset_def in assets],
+            description=config.description or f"Job for {config.name}",
+        )
+        return {"assets": assets, "jobs": [job], "config": config}
+    def generate_pipeline_code(self, config: PipelineConfig) -> str:
+        """
+        Generate Python code for a Dagster pipeline.
+        User-controlled text (prompts, descriptions, config dicts) is embedded
+        with ``repr`` so quotes and booleans/None yield valid Python literals.
+        The job selects assets by their asset names, while ``Definitions``
+        receives the asset function objects.
+        Args:
+            config: Pipeline configuration
+        Returns:
+            Python code as string
+        """
+        code_template = '''
+"""Auto-generated Dagster pipeline: {name}"""
+from dagster import AssetExecutionContext, asset, define_asset_job, Definitions
+from khora.agents import DataFetcherAgent
+from khora.utils.data_models import DataRequest, DataSourceType
+from khora.utils.config import load_config
+# Load configuration
+config = load_config()
+data_fetcher = DataFetcherAgent(
+    openai_api_key=config["openai_api_key"],
+    model=config.get("openai_model", "gpt-4-turbo-preview"),
+)
+# Define assets
+'''
+        code = code_template.format(name=config.name)
+        asset_keys = []
+        func_names = []
+        # Generate asset code for each request
+        for i, request in enumerate(config.requests):
+            source = self._source_label(request.source_type)
+            asset_key = f"{config.name}_{source}_{i}"
+            func_name = self._identifier(f"fetch_{source}_{i}")
+            description = f"Fetch: {request.prompt}"
+            asset_keys.append(asset_key)
+            func_names.append(func_name)
+            asset_code = f'''
+@asset(
+    name={asset_key!r},
+    description={description!r},
+)
+async def {func_name}(context: AssetExecutionContext):
+    """Fetch data from {source}."""
+    request = DataRequest(
+        source_type=DataSourceType({source!r}),
+        prompt={request.prompt!r},
+        source_config={request.source_config!r},
+        filters={request.filters!r},
+        metadata={request.metadata!r},
+    )
+    response = await data_fetcher.fetch_data(request)
+    if response.status == "error":
+        raise Exception(f"Failed to fetch data: {{response.error_message}}")
+    return response.data
+'''
+            code += asset_code
+        # Generate job definition
+        job_name = f"{config.name}_job"
+        job_var = self._identifier(job_name)
+        job_description = config.description or "Auto-generated job"
+        assets_expr = "[" + ", ".join(func_names) + "]"
+        job_code = f"""
+# Define job
+{job_var} = define_asset_job(
+    name={job_name!r},
+    selection={asset_keys!r},
+    description={job_description!r},
+)
+# Define Dagster definitions
+defs = Definitions(
+    assets={assets_expr},
+    jobs=[{job_var}],
+)
+"""
+        code += job_code
+        return code

khora/pipelines/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""Dagster pipelines for data fetching operations."""
+from .data_pipeline import create_data_pipeline
+from .definitions import defs
+__all__ = ["create_data_pipeline", "defs"]

khora/pipelines/data_pipeline.py ADDED Viewed

@@ -0,0 +1,131 @@
+"""Example data pipeline implementation."""
+from typing import Any, Dict, List
+from dagster import (
+    AssetExecutionContext,
+    AssetsDefinition,
+    Config,
+    asset,
+    define_asset_job,
+)
+from khora.agents import DataFetcherAgent
+from khora.utils.config import load_config
+from khora.utils.data_models import DataRequest, DataSourceType
+# NOTE(review): shares its name with khora.utils.data_models.PipelineConfig,
+# which is a different class; nothing in this hunk references this one.
+class PipelineConfig(Config):
+    """Configuration for pipeline execution."""
+    # OpenAI API key; empty by default.
+    openai_api_key: str = ""
+    # Chat model name; same default the agents use.
+    openai_model: str = "gpt-4-turbo-preview"
+def create_data_pipeline(
+    name: str, requests: List[DataRequest], config: Dict[str, Any]
+) -> List[AssetsDefinition]:
+    """
+    Create a Dagster pipeline dynamically.
+    One asset named ``<name>_<source_type>_<index>`` is created per request;
+    all assets share a single ``DataFetcherAgent``.
+    Args:
+        name: Pipeline name
+        requests: List of data requests
+        config: Configuration dictionary; "openai_api_key" and "openai_model"
+            are read, with defaults of "" and "gpt-4-turbo-preview"
+    Returns:
+        List of Dagster assets
+    """
+    data_fetcher = DataFetcherAgent(
+        openai_api_key=config.get("openai_api_key", ""),
+        model=config.get("openai_model", "gpt-4-turbo-preview"),
+    )
+    def source_label(request: DataRequest) -> str:
+        """Return the source type as a plain string (enum member or str)."""
+        return str(getattr(request.source_type, "value", request.source_type))
+    def build_asset(asset_name: str, request: DataRequest) -> AssetsDefinition:
+        """Build the asset for one request.
+        The request and fetcher are captured by closure, not default
+        arguments: Dagster treats every non-context parameter of an asset
+        function as an upstream asset input.
+        """
+        source = source_label(request)
+        @asset(
+            name=asset_name,
+            description=f"Fetch: {request.prompt}",
+            metadata={"source_type": source, "prompt": request.prompt},
+        )
+        async def fetch_data(context: AssetExecutionContext) -> Dict[str, Any]:
+            """Fetch data based on request."""
+            context.log.info(f"Fetching data from {source}")
+            response = await data_fetcher.fetch_data(request)
+            if response.status == "error":
+                context.log.error(f"Error: {response.error_message}")
+                raise Exception(response.error_message)
+            return response.data or {}
+        return fetch_data
+    return [
+        build_asset(f"{name}_{source_label(request)}_{i}", request)
+        for i, request in enumerate(requests)
+    ]
+async def _fetch_example(
+    context: AssetExecutionContext, request: DataRequest
+) -> Dict[str, Any]:
+    """Run one example request; return its data, or ``{}`` when it fails.
+    Configuration is loaded when the asset executes, not at import time.
+    Failures stay best-effort (no exception) but are logged so an empty
+    result can be told apart from a failed fetch.
+    """
+    config = load_config()
+    fetcher = DataFetcherAgent(
+        openai_api_key=config.get("openai_api_key", ""),
+        model=config.get("openai_model", "gpt-4-turbo-preview"),
+    )
+    response = await fetcher.fetch_data(request)
+    if response.status != "success":
+        context.log.warning(f"Example fetch failed: {response.error_message}")
+        return {}
+    return response.data or {}
+def create_example_assets() -> List[AssetsDefinition]:
+    """Create example assets for demonstration.
+    Returns:
+        Two assets, ``example_api_data`` and ``example_web_data``.
+    """
+    @asset(name="example_api_data", description="Example API data fetching")
+    async def example_api_data(context: AssetExecutionContext) -> Dict[str, Any]:
+        """Fetch example data from an API."""
+        context.log.info("Fetching example API data")
+        request = DataRequest(
+            source_type=DataSourceType.API,
+            prompt="Fetch weather data for San Francisco",
+            source_config={
+                "url": "https://api.weather.com/v1/weather",
+                "params": {"city": "San Francisco"},
+            },
+        )
+        return await _fetch_example(context, request)
+    @asset(name="example_web_data", description="Example web scraping")
+    async def example_web_data(context: AssetExecutionContext) -> Dict[str, Any]:
+        """Scrape example data from a website."""
+        context.log.info("Scraping example web data")
+        request = DataRequest(
+            source_type=DataSourceType.WEB_SCRAPER,
+            prompt="Extract article titles from a news website",
+            source_config={
+                "url": "https://example.com/news",
+                "selectors": {"titles": "h2.article-title"},
+            },
+        )
+        return await _fetch_example(context, request)
+    return [example_api_data, example_web_data]
+# Define example job. The selection lists the asset names given to the two
+# assets returned by create_example_assets().
+example_job = define_asset_job(
+    name="example_data_pipeline",
+    selection=["example_api_data", "example_web_data"],
+    description="Example pipeline for fetching data from multiple sources",
+)

khora/pipelines/definitions.py ADDED Viewed

@@ -0,0 +1,14 @@
+"""Dagster definitions for Khora pipelines."""
+from dagster import Definitions
+from khora.pipelines.data_pipeline import create_example_assets, example_job
+from khora.utils.config import load_config
+# Load configuration (runs at import time of this module)
+config = load_config()
+# Create definitions: the example assets and job, with the loaded
+# configuration exposed under the "config" resource key.
+defs = Definitions(
+    assets=create_example_assets(), jobs=[example_job], resources={"config": config}
+)

khora/tools/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+"""Tools for various data source integrations."""
+from .api_tool import APITool
+from .google_docs_tool import GoogleDocsTool
+from .web_scraper_tool import WebScraperTool
+__all__ = ["APITool", "WebScraperTool", "GoogleDocsTool"]