khora 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
khora/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """
2
+ Khora - Ad-hoc Dagster pipelines for data fetching using AI/LLM prompts.
3
+ """
4
+
5
+ __version__ = "0.0.1"
6
+ __all__ = ["agents", "pipelines", "tools", "utils"]
khora/__main__.py ADDED
@@ -0,0 +1,101 @@
1
+ """Main entry point for Khora CLI."""
2
+
3
+ import asyncio
4
+ import json
5
+ import sys
6
+ from pathlib import Path
7
+
8
+ from khora.agents import DataFetcherAgent, PipelineBuilderAgent
9
+ from khora.utils.config import load_config
10
+ from khora.utils.data_models import DataRequest, DataSourceType
11
+
12
+
13
def _print_usage() -> None:
    """Print the supported commands together with example invocations."""
    print("\nUsage:")
    print(" python -m khora fetch <source_type> <prompt>")
    print(" python -m khora build <pipeline_description>")
    print("\nExamples:")
    print(" python -m khora fetch api 'Get weather data for NYC'")
    print(
        " python -m khora build 'Create pipeline to fetch crypto prices and news'"
    )


async def _run_fetch(config, source_type: str, prompt: str) -> None:
    """Fetch data for a single request and print the outcome.

    Args:
        config: Mapping returned by ``load_config()``; ``openai_api_key`` is
            read by index and ``openai_model`` via ``get`` with a default.
        source_type: Raw source type string taken from the command line.
        prompt: Natural language description of the data to fetch.

    Exits with status 1 when the source type is invalid or when the fetch
    reports a non-success status.
    """
    # Validate the source type on its own so that a ValueError raised later
    # (for example while fetching or decoding data) is not misreported as an
    # invalid source type.
    try:
        resolved_type = DataSourceType(source_type)
    except ValueError:
        print(
            "Error: Invalid source type. Valid types: api, web_scraper, google_docs, spreadsheet"
        )
        sys.exit(1)

    fetcher = DataFetcherAgent(
        openai_api_key=config["openai_api_key"],
        model=config.get("openai_model", "gpt-4-turbo-preview"),
    )
    request = DataRequest(source_type=resolved_type, prompt=prompt)

    print(f"Fetching data from {source_type}...")
    response = await fetcher.fetch_data(request)

    if response.status != "success":
        print(f"Error: {response.error_message}")
        # Signal the failure to the calling shell instead of exiting with 0.
        sys.exit(1)

    print("Success! Data fetched:")
    print(json.dumps(response.data, indent=2))


def _run_build(config, description: str) -> None:
    """Generate Dagster pipeline code from a description and save it to disk.

    Args:
        config: Mapping returned by ``load_config()``.
        description: Natural language description of the desired pipeline.
    """
    builder = PipelineBuilderAgent(
        openai_api_key=config["openai_api_key"],
        model=config.get("openai_model", "gpt-4-turbo-preview"),
    )

    print("Analyzing pipeline request...")
    pipeline_config = builder.analyze_pipeline_request(description)

    print(f"\nGenerated Pipeline: {pipeline_config.name}")
    print(f"Description: {pipeline_config.description}")
    print(f"Number of data sources: {len(pipeline_config.requests)}")

    code = builder.generate_pipeline_code(pipeline_config)

    # The pipeline name is not chosen by this function, so keep only
    # filename-safe characters: the output path must stay inside the current
    # directory and must not contain path separators.
    safe_name = "".join(
        ch if ch.isalnum() or ch == "_" else "_" for ch in str(pipeline_config.name)
    )
    output_file = Path(f"{safe_name or 'pipeline'}_pipeline.py")
    output_file.write_text(code, encoding="utf-8")

    print(f"\nPipeline code saved to: {output_file}")
    print("\nTo run the pipeline:")
    print(f" dagster dev -f {output_file}")


async def main() -> None:
    """Run the Khora command line interface.

    Supported commands (read from ``sys.argv``):

    * ``fetch <source_type> <prompt>`` - fetch data once and print it.
    * ``build <pipeline_description>`` - generate a Dagster pipeline file.

    The usage summary is printed when no command or an unknown command is
    given. The process exits with status 1 when the OpenAI API key is not
    configured or when a command is missing required arguments.
    """
    config = load_config()

    if not config.get("openai_api_key"):
        print("Error: OPENAI_API_KEY not set in environment")
        sys.exit(1)

    print("Khora - AI-powered Data Pipeline Builder")
    print("=" * 40)

    args = sys.argv[1:]
    command = args[0] if args else None

    if command == "fetch":
        if len(args) < 3:
            print("Usage: python -m khora fetch <source_type> <prompt>")
            sys.exit(1)
        await _run_fetch(config, args[1], " ".join(args[2:]))
    elif command == "build":
        if len(args) < 2:
            print("Usage: python -m khora build <pipeline_description>")
            sys.exit(1)
        _run_build(config, " ".join(args[1:]))
    else:
        # Covers both "no command" and "unknown command".
        _print_usage()


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,6 @@
1
+ """Agents module for AI-powered data fetching."""
2
+
3
+ from .data_fetcher import DataFetcherAgent
4
+ from .pipeline_builder import PipelineBuilderAgent
5
+
6
+ __all__ = ["DataFetcherAgent", "PipelineBuilderAgent"]
@@ -0,0 +1,158 @@
1
+ """Data fetcher agent using LangGraph for orchestration."""
2
+
3
+ import json
4
+ from typing import Any, Dict, List, Optional, TypedDict
5
+
6
+ from langchain.schema import BaseMessage, HumanMessage, SystemMessage
7
+ from langchain_openai import ChatOpenAI
8
+ from langgraph.graph import END, StateGraph
9
+ from pydantic import SecretStr
10
+
11
+ from khora.tools import APITool, GoogleDocsTool, WebScraperTool
12
+ from khora.utils.data_models import DataRequest, DataResponse, DataSourceType
13
+
14
+
15
class AgentState(TypedDict):
    """Shared state handed from node to node in the data fetcher graph."""

    # Messages exchanged with the LLM; filled by the analyze_request node.
    messages: List[BaseMessage]
    # The data request currently being served.
    request: DataRequest
    # Final response; stays None until the process_response node sets it.
    response: Optional[DataResponse]
    # Planned tool invocations, each a dict with "tool" and "parameters" keys.
    tool_calls: List[Dict[str, Any]]
    # JSON-encoded tool result; stays None until the execute_tool node runs.
    final_answer: Optional[str]
23
+
24
+
25
class DataFetcherAgent:
    """Agent for fetching data based on AI prompts using LangGraph.

    A request flows through three graph nodes in a fixed order:
    ``analyze_request`` (ask the LLM for tool parameters), ``execute_tool``
    (run the tool registered for the request's source type) and
    ``process_response`` (wrap the tool result in a ``DataResponse``).
    """

    def __init__(self, openai_api_key: str, model: str = "gpt-4-turbo-preview"):
        """Initialize the LLM client, the tool registry and the state graph.

        Args:
            openai_api_key: API key handed to ``ChatOpenAI`` as a ``SecretStr``.
            model: Name of the chat model to use.
        """
        self.llm = ChatOpenAI(
            api_key=SecretStr(openai_api_key), model=model, temperature=0
        )

        # One tool per source type; spreadsheets share the Google Docs tool.
        self.tools = {
            DataSourceType.API: APITool(),
            DataSourceType.WEB_SCRAPER: WebScraperTool(),
            DataSourceType.GOOGLE_DOCS: GoogleDocsTool(),
            DataSourceType.SPREADSHEET: GoogleDocsTool(),
        }

        self.graph = self._build_graph()

    def _build_graph(self):  # type: ignore
        """Build and compile the linear LangGraph state graph."""
        workflow = StateGraph(AgentState)

        workflow.add_node("analyze_request", self._analyze_request)
        workflow.add_node("execute_tool", self._execute_tool)
        workflow.add_node("process_response", self._process_response)

        workflow.set_entry_point("analyze_request")
        workflow.add_edge("analyze_request", "execute_tool")
        workflow.add_edge("execute_tool", "process_response")
        workflow.add_edge("process_response", END)

        return workflow.compile()

    @staticmethod
    def _parse_tool_params(content: str, fallback: Dict[str, Any]) -> Dict[str, Any]:
        """Extract a JSON object of tool parameters from an LLM reply.

        A surrounding Markdown code fence (with or without a language tag) is
        removed before decoding.

        Args:
            content: Raw text returned by the LLM.
            fallback: Value returned when ``content`` is not valid JSON or
                does not decode to a JSON object.

        Returns:
            The decoded dict, or ``fallback``.
        """
        text = content.strip()
        if text.startswith("```"):
            # Drop the opening fence line, then everything from the closing fence.
            text = text.split("\n", 1)[1] if "\n" in text else ""
            text = text.rsplit("```", 1)[0]
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            return fallback
        # The parameters are later expanded with ``**``, so only a dict is usable.
        return parsed if isinstance(parsed, dict) else fallback

    def _analyze_request(self, state: "AgentState") -> "AgentState":
        """Ask the LLM for tool parameters and record the planned tool call.

        Falls back to ``request.source_config`` when the reply cannot be
        decoded into a JSON object.
        """
        request = state["request"]

        system_prompt = f"""
        You are a data fetching assistant. Analyze the user's request and determine
        how to fetch the data using the {request.source_type} tool.

        Based on the prompt: "{request.prompt}"
        And the source configuration: {json.dumps(request.source_config)}

        Determine the exact parameters needed for the tool invocation.
        Respond with a JSON object containing the tool parameters.
        """

        messages = [
            SystemMessage(content=system_prompt),
            HumanMessage(content=request.prompt),
        ]

        response = self.llm.invoke(messages)

        content = (
            response.content
            if isinstance(response.content, str)
            else str(response.content)
        )
        tool_params = self._parse_tool_params(content, request.source_config)

        state["tool_calls"] = [{"tool": request.source_type, "parameters": tool_params}]
        state["messages"] = messages + [response]

        return state

    def _execute_tool(self, state: "AgentState") -> "AgentState":
        """Run the planned tool and store its JSON-encoded result in the state.

        Any exception raised while running the tool or encoding its result is
        converted into an error result, so the graph still ends with a
        ``DataResponse`` that callers can inspect via ``status``.
        """
        tool_call = state["tool_calls"][0]
        tool = self.tools[DataSourceType(tool_call["tool"])]

        try:
            result = tool._run(**tool_call["parameters"])  # type: ignore
            state["final_answer"] = json.dumps(result)
        except Exception as exc:  # boundary: report the failure as data
            state["final_answer"] = json.dumps(
                {"status": "error", "error": f"{type(exc).__name__}: {exc}"}
            )

        return state

    def _process_response(self, state: "AgentState") -> "AgentState":
        """Turn the stored tool result into the final ``DataResponse``."""
        request = state["request"]
        tool_result = json.loads(state["final_answer"] or "{}")

        if not isinstance(tool_result, dict):
            # The fields below are read with ``.get``; anything that is not a
            # JSON object is reported as an error instead of crashing.
            tool_result = {
                "status": "error",
                "error": f"Unexpected tool result type: {type(tool_result).__name__}",
            }

        state["response"] = DataResponse(
            request_id=f"{request.source_type}_{id(request)}",
            status=tool_result.get("status", "error"),
            data=tool_result.get("data"),
            error_message=tool_result.get("error"),
            source_type=request.source_type,
            metadata={
                "tool_parameters": state["tool_calls"][0]["parameters"],
                "request_metadata": request.metadata,
            },
        )

        return state

    async def fetch_data(self, request: "DataRequest") -> "DataResponse":
        """
        Fetch data based on the request.

        Args:
            request: Data request with prompt and configuration

        Returns:
            DataResponse with fetched data or error
        """
        initial_state: AgentState = {
            "messages": [],
            "request": request,
            "response": None,
            "tool_calls": [],
            "final_answer": None,
        }

        final_state = await self.graph.ainvoke(initial_state)

        return final_state["response"]
@@ -0,0 +1,217 @@
1
+ """Pipeline builder agent for creating Dagster pipelines dynamically."""
2
+
3
+ import json
4
+ from typing import Any, Dict
5
+
6
+ from dagster import AssetExecutionContext, asset, define_asset_job
7
+ from langchain.schema import HumanMessage, SystemMessage
8
+ from langchain_openai import ChatOpenAI
9
+ from pydantic import SecretStr
10
+
11
+ from khora.agents.data_fetcher import DataFetcherAgent
12
+ from khora.utils.data_models import DataRequest, PipelineConfig
13
+
14
+
15
class PipelineBuilderAgent:
    """Agent for building Dagster pipelines based on natural language descriptions."""

    def __init__(self, openai_api_key: str, model: str = "gpt-4-turbo-preview"):
        """Initialize the LLM client and the data fetcher used by built assets.

        Args:
            openai_api_key: API key handed to ``ChatOpenAI`` as a ``SecretStr``.
            model: Name of the chat model to use.
        """
        self.llm = ChatOpenAI(
            api_key=SecretStr(openai_api_key), model=model, temperature=0
        )
        self.data_fetcher = DataFetcherAgent(openai_api_key, model)

    @staticmethod
    def _strip_code_fence(content: str) -> str:
        """Remove a surrounding Markdown code fence from an LLM reply, if any."""
        text = content.strip()
        if text.startswith("```"):
            text = text.split("\n", 1)[1] if "\n" in text else ""
            text = text.rsplit("```", 1)[0]
        return text

    @staticmethod
    def _source_type_name(source_type: Any) -> str:
        """Return the plain string for a source type.

        Uses ``.value`` when present so the result does not depend on how an
        enum member happens to be rendered by string formatting.
        """
        return str(getattr(source_type, "value", source_type))

    @staticmethod
    def _to_identifier(name: Any, default: str = "pipeline") -> str:
        """Map ``name`` to a string made only of ASCII letters, digits and ``_``.

        Every other character becomes ``_``; an empty result becomes
        ``default``; a leading digit gets a ``_`` prefix. Names that already
        satisfy these rules are returned unchanged.
        """
        cleaned = "".join(
            ch if ch == "_" or (ch.isascii() and ch.isalnum()) else "_"
            for ch in str(name)
        )
        if not cleaned:
            return default
        return f"_{cleaned}" if cleaned[0].isdigit() else cleaned

    def _asset_name(self, config: "PipelineConfig", request: "DataRequest", index: int) -> str:
        """Return the asset name shared by built and generated pipelines."""
        source = self._source_type_name(request.source_type)
        return self._to_identifier(f"{config.name}_{source}_{index}")

    def analyze_pipeline_request(self, description: str) -> "PipelineConfig":
        """
        Analyze a natural language pipeline description and create PipelineConfig.

        Args:
            description: Natural language description of the pipeline

        Returns:
            PipelineConfig with structured pipeline definition. When the LLM
            reply cannot be turned into a configuration, a fallback named
            "custom_pipeline" with no requests is returned.
        """
        system_prompt = """
        You are a pipeline configuration assistant. Analyze the user's description
        and create a structured pipeline configuration.

        Identify:
        1. Data sources to fetch from (API, web scraping, Google Docs/Sheets)
        2. The sequence of operations
        3. Any transformations or processing needed
        4. Output format requirements

        Respond with a JSON object that matches the PipelineConfig schema:
        {
            "name": "pipeline_name",
            "description": "pipeline description",
            "requests": [
                {
                    "source_type": "api|web_scraper|google_docs|spreadsheet",
                    "prompt": "what data to fetch",
                    "source_config": {},
                    "filters": {},
                    "metadata": {}
                }
            ],
            "parallel_execution": true/false,
            "output_format": "json"
        }
        """

        messages = [
            SystemMessage(content=system_prompt),
            HumanMessage(content=description),
        ]

        response = self.llm.invoke(messages)
        content = (
            response.content
            if isinstance(response.content, str)
            else str(response.content)
        )

        try:
            config_dict = json.loads(self._strip_code_fence(content))
            config_dict["requests"] = [
                DataRequest(**req) for req in config_dict.get("requests", [])
            ]
            return PipelineConfig(**config_dict)
        except (ValueError, TypeError, AttributeError):
            # ValueError covers json.JSONDecodeError; TypeError/AttributeError
            # cover replies that decode to something other than the expected
            # object-of-objects shape.
            return PipelineConfig(
                name="custom_pipeline",
                description=description,
                requests=[],
                parallel_execution=True,
            )

    def _make_fetch_asset(self, asset_name: str, request: "DataRequest"):
        """Create one Dagster asset that fetches the data for ``request``.

        The request and the fetcher are captured by closure instead of being
        declared as defaulted function parameters: Dagster treats asset
        function parameters other than ``context`` as upstream asset inputs.
        """
        fetcher = self.data_fetcher
        source = self._source_type_name(request.source_type)

        @asset(
            name=asset_name,
            description=f"Fetch data: {request.prompt}",
            metadata={"source_type": source, "prompt": request.prompt},
        )
        async def fetch_data_asset(context: AssetExecutionContext) -> Dict[str, Any]:
            """Asset for fetching data based on request."""
            context.log.info(f"Fetching data from {source}")
            response = await fetcher.fetch_data(request)

            if response.status == "error":
                context.log.error(f"Error fetching data: {response.error_message}")
                raise Exception(response.error_message)

            return response.data or {}

        return fetch_data_asset

    def build_pipeline(self, config: "PipelineConfig") -> Dict[str, Any]:
        """
        Build a Dagster pipeline from configuration.

        Args:
            config: Pipeline configuration

        Returns:
            Dictionary with keys "assets" (one asset per request), "jobs"
            (a single job selecting all of them) and "config".
        """
        assets = [
            self._make_fetch_asset(self._asset_name(config, request, i), request)
            for i, request in enumerate(config.requests)
        ]

        job = define_asset_job(
            name=f"{self._to_identifier(config.name)}_job",
            selection=[asset_def.key for asset_def in assets],
            description=config.description or f"Job for {config.name}",
        )

        return {"assets": assets, "jobs": [job], "config": config}

    def generate_pipeline_code(self, config: "PipelineConfig") -> str:
        """
        Generate Python code for a Dagster pipeline.

        All values taken from ``config`` are embedded with ``repr`` so that
        quotes, newlines and booleans yield valid Python literals, and names
        are normalized so that they are valid identifiers.

        Args:
            config: Pipeline configuration

        Returns:
            Python code as string
        """
        pipeline_name = self._to_identifier(config.name)

        sections = [
            f'"""Auto-generated Dagster pipeline: {pipeline_name}"""\n'
            "\n"
            "from dagster import AssetExecutionContext, asset, define_asset_job, Definitions\n"
            "from khora.agents import DataFetcherAgent\n"
            "from khora.utils.data_models import DataRequest, DataSourceType\n"
            "from khora.utils.config import load_config\n"
            "\n"
            "# Load configuration\n"
            "config = load_config()\n"
            "data_fetcher = DataFetcherAgent(\n"
            '    openai_api_key=config["openai_api_key"],\n'
            '    model=config.get("openai_model", "gpt-4-turbo-preview"),\n'
            ")\n"
            "\n"
            "# Define assets\n"
        ]

        func_names = []
        asset_keys = []
        for i, request in enumerate(config.requests):
            source = self._source_type_name(request.source_type)
            func_name = self._to_identifier(f"fetch_{source}_{i}")
            asset_key = self._asset_name(config, request, i)
            description = f"Fetch: {request.prompt}"
            func_names.append(func_name)
            asset_keys.append(asset_key)

            sections.append(
                "\n"
                "@asset(\n"
                f"    name={asset_key!r},\n"
                f"    description={description!r},\n"
                ")\n"
                f"async def {func_name}(context: AssetExecutionContext):\n"
                f'    """Fetch data from {self._to_identifier(source)}."""\n'
                "    request = DataRequest(\n"
                # Look the member up by value, as the CLI does.
                f"        source_type=DataSourceType({source!r}),\n"
                f"        prompt={request.prompt!r},\n"
                f"        source_config={request.source_config!r},\n"
                f"        filters={request.filters!r},\n"
                f"        metadata={request.metadata!r},\n"
                "    )\n"
                "\n"
                "    response = await data_fetcher.fetch_data(request)\n"
                "\n"
                '    if response.status == "error":\n'
                '        raise Exception(f"Failed to fetch data: {response.error_message}")\n'
                "\n"
                "    return response.data\n"
                "\n"
            )

        job_name = f"{pipeline_name}_job"
        job_description = config.description or "Auto-generated job"
        # Definitions needs the asset objects themselves, so emit bare
        # identifiers; the job selection refers to assets by their key names.
        asset_refs = ", ".join(func_names)

        sections.append(
            "\n"
            "# Define job\n"
            f"{job_name} = define_asset_job(\n"
            f"    name={job_name!r},\n"
            f"    selection={asset_keys!r},\n"
            f"    description={job_description!r},\n"
            ")\n"
            "\n"
            "# Define Dagster definitions\n"
            "defs = Definitions(\n"
            f"    assets=[{asset_refs}],\n"
            f"    jobs=[{job_name}],\n"
            ")\n"
        )

        return "".join(sections)
@@ -0,0 +1,6 @@
1
+ """Dagster pipelines for data fetching operations."""
2
+
3
+ from .data_pipeline import create_data_pipeline
4
+ from .definitions import defs
5
+
6
+ __all__ = ["create_data_pipeline", "defs"]
@@ -0,0 +1,131 @@
1
+ """Example data pipeline implementation."""
2
+
3
+ from typing import Any, Dict, List
4
+
5
+ from dagster import (
6
+ AssetExecutionContext,
7
+ AssetsDefinition,
8
+ Config,
9
+ asset,
10
+ define_asset_job,
11
+ )
12
+
13
+ from khora.agents import DataFetcherAgent
14
+ from khora.utils.config import load_config
15
+ from khora.utils.data_models import DataRequest, DataSourceType
16
+
17
+
18
class PipelineConfig(Config):
    """Dagster config schema holding the OpenAI settings for a pipeline run."""

    # OpenAI API key; defaults to an empty string.
    openai_api_key: str = ""
    # Chat model name; defaults to "gpt-4-turbo-preview".
    openai_model: str = "gpt-4-turbo-preview"
23
+
24
+
25
def create_data_pipeline(
    name: str, requests: List[DataRequest], config: Dict[str, Any]
) -> List[AssetsDefinition]:
    """
    Create a Dagster pipeline dynamically.

    One asset is created per request, named ``{name}_{source_type}_{index}``.
    All assets share a single ``DataFetcherAgent`` built from ``config``.

    Args:
        name: Pipeline name
        requests: List of data requests
        config: Configuration dictionary; ``openai_api_key`` and
            ``openai_model`` are read with defaults

    Returns:
        List of Dagster assets
    """
    data_fetcher = DataFetcherAgent(
        openai_api_key=config.get("openai_api_key", ""),
        model=config.get("openai_model", "gpt-4-turbo-preview"),
    )

    def build_asset(index: int, request: DataRequest) -> AssetsDefinition:
        """Create the asset for one request.

        The request and fetcher are bound by closure rather than passed as
        defaulted parameters of the asset function, because Dagster treats
        asset function parameters other than ``context`` as upstream inputs.
        """

        @asset(
            name=f"{name}_{request.source_type}_{index}",
            description=f"Fetch: {request.prompt}",
            metadata={"source_type": request.source_type, "prompt": request.prompt},
        )
        async def fetch_data(context: AssetExecutionContext) -> Dict[str, Any]:
            """Fetch data based on request."""
            context.log.info(f"Fetching data from {request.source_type}")
            response = await data_fetcher.fetch_data(request)

            if response.status == "error":
                context.log.error(f"Error: {response.error_message}")
                raise Exception(response.error_message)

            return response.data or {}

        return fetch_data

    return [build_asset(i, request) for i, request in enumerate(requests)]
71
+
72
+
73
def create_example_assets() -> List[AssetsDefinition]:
    """Create example assets for demonstration.

    Returns:
        Two assets, ``example_api_data`` and ``example_web_data``. Each one
        returns the fetched data, or an empty dict when the fetch does not
        succeed (the failure is logged as a warning).
    """

    async def run_request(
        context: AssetExecutionContext, request: DataRequest
    ) -> Dict[str, Any]:
        """Fetch ``request`` with a freshly configured agent.

        Configuration is loaded when the asset runs, not when it is defined.
        """
        config = load_config()
        fetcher = DataFetcherAgent(
            openai_api_key=config.get("openai_api_key", ""),
            model=config.get("openai_model", "gpt-4-turbo-preview"),
        )

        response = await fetcher.fetch_data(request)
        if response.status != "success":
            # Stay best-effort for the examples, but leave a trace of the failure.
            context.log.warning(f"Example fetch failed: {response.error_message}")
            return {}
        return response.data or {}

    @asset(name="example_api_data", description="Example API data fetching")
    async def example_api_data(context: AssetExecutionContext) -> Dict[str, Any]:
        """Fetch example data from an API."""
        context.log.info("Fetching example API data")

        request = DataRequest(
            source_type=DataSourceType.API,
            prompt="Fetch weather data for San Francisco",
            source_config={
                "url": "https://api.weather.com/v1/weather",
                "params": {"city": "San Francisco"},
            },
        )
        return await run_request(context, request)

    @asset(name="example_web_data", description="Example web scraping")
    async def example_web_data(context: AssetExecutionContext) -> Dict[str, Any]:
        """Scrape example data from a website."""
        context.log.info("Scraping example web data")

        request = DataRequest(
            source_type=DataSourceType.WEB_SCRAPER,
            prompt="Extract article titles from a news website",
            source_config={
                "url": "https://example.com/news",
                "selectors": {"titles": "h2.article-title"},
            },
        )
        return await run_request(context, request)

    return [example_api_data, example_web_data]
124
+
125
+
126
# Job selecting both example assets by their asset names.
example_job = define_asset_job(
    description="Example pipeline for fetching data from multiple sources",
    selection=["example_api_data", "example_web_data"],
    name="example_data_pipeline",
)
@@ -0,0 +1,14 @@
1
+ """Dagster definitions for Khora pipelines."""
2
+
3
+ from dagster import Definitions
4
+
5
+ from khora.pipelines.data_pipeline import create_example_assets, example_job
6
+ from khora.utils.config import load_config
7
+
8
# Configuration is loaded once, when this module is imported.
config = load_config()

# Definitions bundling the example assets, the example job and the loaded
# configuration (registered under the "config" resource key).
defs = Definitions(
    assets=create_example_assets(),
    jobs=[example_job],
    resources={"config": config},
)
@@ -0,0 +1,7 @@
1
+ """Tools for various data source integrations."""
2
+
3
+ from .api_tool import APITool
4
+ from .google_docs_tool import GoogleDocsTool
5
+ from .web_scraper_tool import WebScraperTool
6
+
7
+ __all__ = ["APITool", "WebScraperTool", "GoogleDocsTool"]