khora-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,81 @@
+ """API tool for fetching data from REST APIs."""
+
+ import json
+ from typing import Any, Dict, Optional
+
+ import httpx
+ from langchain.tools import BaseTool
+ from pydantic import Field
+
+
+ class APITool(BaseTool):
+     """Tool for making API requests based on AI-generated specifications."""
+
+     name: str = "api_fetcher"
+     description: str = (
+         "Fetch data from APIs. The tool accepts a URL, HTTP method, "
+         "headers, and optional body/params based on the AI prompt analysis."
+     )
+
+     timeout: int = Field(default=30, description="Request timeout in seconds")
+
+     def _run(
+         self,
+         url: str,
+         method: str = "GET",
+         headers: Optional[Dict[str, str]] = None,
+         params: Optional[Dict[str, Any]] = None,
+         json_body: Optional[Dict[str, Any]] = None,
+         **kwargs: Any,
+     ) -> Dict[str, Any]:
+         """
+         Execute API request.
+
+         Args:
+             url: API endpoint URL
+             method: HTTP method (GET, POST, PUT, DELETE, etc.)
+             headers: Optional HTTP headers
+             params: Optional query parameters
+             json_body: Optional JSON body for POST/PUT requests
+
+         Returns:
+             Response data as dictionary
+         """
+         try:
+             with httpx.Client(timeout=self.timeout) as client:
+                 response = client.request(
+                     method=method.upper(),
+                     url=url,
+                     headers=headers,
+                     params=params,
+                     json=json_body,
+                 )
+
+                 response.raise_for_status()
+
+                 # Try to parse JSON response
+                 try:
+                     data = response.json()
+                 except json.JSONDecodeError:
+                     data = {"text": response.text}
+
+                 return {
+                     "status": "success",
+                     "status_code": response.status_code,
+                     "data": data,
+                     "headers": dict(response.headers),
+                 }
+
+         except httpx.HTTPStatusError as e:
+             return {
+                 "status": "error",
+                 "status_code": e.response.status_code,
+                 "error": str(e),
+                 "response_text": e.response.text,
+             }
+         except Exception as e:
+             return {"status": "error", "error": str(e), "error_type": type(e).__name__}
+
+     async def _arun(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
+         """Async version of the API tool."""
+         raise NotImplementedError("Async execution not implemented yet")
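For reference, a minimal usage sketch of APITool. The import path is an assumption (this diff does not show the file's location inside the package), and the endpoint URL is a placeholder; LangChain would normally dispatch through BaseTool.run, but calling _run directly shows the shape of the returned dictionary.

from khora.tools.api_tool import APITool  # assumed module path

tool = APITool(timeout=10)
result = tool._run(
    url="https://api.example.com/items",  # placeholder endpoint
    method="GET",
    params={"limit": 5},
)
if result["status"] == "success":
    print(result["status_code"], result["data"])
else:
    print("request failed:", result.get("error"))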
@@ -0,0 +1,169 @@
+ """Google Docs and Sheets tool for extracting data."""
+
+ from typing import Any, Dict, List, Optional
+
+ from google.oauth2 import service_account
+ from googleapiclient.discovery import build
+ from langchain.tools import BaseTool
+ from pydantic import Field
+
+
+ class GoogleDocsTool(BaseTool):
+     """Tool for extracting data from Google Docs and Sheets."""
+
+     name: str = "google_docs_fetcher"
+     description: str = (
+         "Extract data from Google Docs and Google Sheets. "
+         "Requires document/sheet ID and appropriate permissions."
+     )
+
+     credentials_path: Optional[str] = Field(
+         default=None, description="Path to Google service account credentials JSON"
+     )
+     scopes: List[str] = Field(
+         default_factory=lambda: [
+             "https://www.googleapis.com/auth/documents.readonly",
+             "https://www.googleapis.com/auth/spreadsheets.readonly",
+             "https://www.googleapis.com/auth/drive.readonly",
+         ]
+     )
+
+     def _run(
+         self,
+         document_id: str,
+         document_type: str = "sheet",
+         sheet_range: Optional[str] = None,
+         **kwargs: Any,
+     ) -> Dict[str, Any]:
+         """
+         Extract data from Google Docs or Sheets.
+
+         Args:
+             document_id: Google document or sheet ID
+             document_type: Type of document ("doc" or "sheet")
+             sheet_range: For sheets, the A1 notation range (e.g., "Sheet1!A1:D10")
+
+         Returns:
+             Extracted data as dictionary
+         """
+         try:
+             # Initialize credentials
+             if self.credentials_path:
+                 credentials = service_account.Credentials.from_service_account_file(  # type: ignore
+                     self.credentials_path, scopes=self.scopes
+                 )
+             else:
+                 # Use default credentials if available
+                 credentials = None
+
+             if document_type.lower() == "sheet":
+                 return self._extract_sheet_data(document_id, sheet_range, credentials)
+             elif document_type.lower() == "doc":
+                 return self._extract_doc_data(document_id, credentials)
+             else:
+                 return {
+                     "status": "error",
+                     "error": f"Unsupported document type: {document_type}",
+                 }
+
+         except Exception as e:
+             return {"status": "error", "error": str(e), "error_type": type(e).__name__}
+
+     def _extract_sheet_data(
+         self, sheet_id: str, sheet_range: Optional[str], credentials: Any
+     ) -> Dict[str, Any]:
+         """Extract data from Google Sheets."""
+         service = build("sheets", "v4", credentials=credentials)
+
+         # Get sheet metadata
+         sheet_metadata = service.spreadsheets().get(spreadsheetId=sheet_id).execute()
+
+         sheets = sheet_metadata.get("sheets", [])
+
+         result = {
+             "title": sheet_metadata.get("properties", {}).get("title"),
+             "sheets": [s["properties"]["title"] for s in sheets],
+             "data": {},
+         }
+
+         # If no range specified, get all sheets
+         if not sheet_range:
+             for sheet in sheets:
+                 sheet_name = sheet["properties"]["title"]
+                 range_name = f"{sheet_name}!A:Z"
+                 try:
+                     sheet_data = (
+                         service.spreadsheets()
+                         .values()
+                         .get(spreadsheetId=sheet_id, range=range_name)
+                         .execute()
+                     )
+
+                     values = sheet_data.get("values", [])
+                     if values:
+                         # Convert to list of dicts using first row as headers
+                         headers = values[0] if values else []
+                         rows = []
+                         for row in values[1:]:
+                             row_dict = {}
+                             for i, header in enumerate(headers):
+                                 row_dict[header] = row[i] if i < len(row) else ""
+                             rows.append(row_dict)
+                         result["data"][sheet_name] = rows
+                 except Exception:
+                     result["data"][sheet_name] = []
+         else:
+             # Get specific range
+             sheet_data = (
+                 service.spreadsheets()
+                 .values()
+                 .get(spreadsheetId=sheet_id, range=sheet_range)
+                 .execute()
+             )
+
+             values = sheet_data.get("values", [])
+             if values:
+                 headers = values[0] if values else []
+                 rows = []
+                 for row in values[1:]:
+                     row_dict = {}
+                     for i, header in enumerate(headers):
+                         row_dict[header] = row[i] if i < len(row) else ""
+                     rows.append(row_dict)
+                 result["data"]["requested_range"] = rows
+
+         return {"status": "success", "data": result}
+
+     def _extract_doc_data(self, doc_id: str, credentials: Any) -> Dict[str, Any]:
+         """Extract data from Google Docs."""
+         service = build("docs", "v1", credentials=credentials)
+
+         # Get document
+         document = service.documents().get(documentId=doc_id).execute()
+
+         title = document.get("title")
+         content = []
+
+         # Extract text content
+         for element in document.get("body", {}).get("content", []):
+             if "paragraph" in element:
+                 paragraph = element["paragraph"]
+                 text_elements = []
+                 for elem in paragraph.get("elements", []):
+                     if "textRun" in elem:
+                         text_elements.append(elem["textRun"]["content"])
+                 if text_elements:
+                     content.append("".join(text_elements))
+
+         return {
+             "status": "success",
+             "data": {
+                 "title": title,
+                 "content": "\n".join(content),
+                 "document_id": doc_id,
+             },
+         }
+
+     async def _arun(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
+         """Async version of the Google Docs tool."""
+         raise NotImplementedError("Async execution not implemented yet")
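A similar sketch for GoogleDocsTool; the import path, credentials file, and spreadsheet ID are placeholders, and the service account must have read access to the document.

from khora.tools.google_docs import GoogleDocsTool  # assumed module path

tool = GoogleDocsTool(credentials_path="/path/to/service-account.json")  # placeholder path
result = tool._run(
    document_id="your-spreadsheet-id",  # placeholder ID
    document_type="sheet",
    sheet_range="Sheet1!A1:D10",
)
if result["status"] == "success":
    # Rows from the requested range, keyed by the header row
    print(result["data"]["data"].get("requested_range", []))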
@@ -0,0 +1,197 @@
+ """Web scraper tool for extracting data from websites using Playwright."""
+
+ import asyncio
+ from typing import Any, Dict, Optional
+
+ from langchain.tools import BaseTool
+ from playwright.async_api import async_playwright
+ from pydantic import Field
+
+
+ class WebScraperTool(BaseTool):
+     """Tool for scraping data from websites using Playwright."""
+
+     name: str = "web_scraper"
+     description: str = (
+         "Extract data from websites using Playwright. Can handle JavaScript-rendered "
+         "content, interact with pages, and extract complex data structures."
+     )
+
+     timeout: int = Field(default=30000, description="Page timeout in milliseconds")
+     headless: bool = Field(default=True, description="Run browser in headless mode")
+
+     def _run(
+         self,
+         url: str,
+         wait_for: Optional[str] = None,
+         selectors: Optional[Dict[str, str]] = None,
+         extract_all_text: bool = False,
+         extract_links: bool = False,
+         extract_tables: bool = False,
+         screenshot: bool = False,
+         execute_script: Optional[str] = None,
+         **kwargs: Any,
+     ) -> Dict[str, Any]:
+         """
+         Scrape web page and extract data using Playwright.
+
+         Args:
+             url: Website URL to scrape
+             wait_for: CSS selector or state to wait for before extraction
+             selectors: CSS selectors for specific elements
+             extract_all_text: Extract all text content
+             extract_links: Extract all links
+             extract_tables: Extract tables as structured data
+             screenshot: Take a screenshot of the page
+             execute_script: Custom JavaScript to execute on the page
+
+         Returns:
+             Extracted data as dictionary
+         """
+         # Run async function in sync context
+         return asyncio.run(
+             self._async_run(
+                 url=url,
+                 wait_for=wait_for,
+                 selectors=selectors,
+                 extract_all_text=extract_all_text,
+                 extract_links=extract_links,
+                 extract_tables=extract_tables,
+                 screenshot=screenshot,
+                 execute_script=execute_script,
+                 **kwargs,
+             )
+         )
+
+     async def _async_run(
+         self,
+         url: str,
+         wait_for: Optional[str] = None,
+         selectors: Optional[Dict[str, str]] = None,
+         extract_all_text: bool = False,
+         extract_links: bool = False,
+         extract_tables: bool = False,
+         screenshot: bool = False,
+         execute_script: Optional[str] = None,
+         **kwargs: Any,
+     ) -> Dict[str, Any]:
+         """Async implementation of web scraping."""
+         try:
+             async with async_playwright() as p:
+                 # Launch browser
+                 browser = await p.chromium.launch(headless=self.headless)
+                 context = await browser.new_context(
+                     viewport={"width": 1920, "height": 1080},
+                     user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
+                 )
+                 page = await context.new_page()
+
+                 # Navigate to URL
+                 await page.goto(url, timeout=self.timeout)
+
+                 # Wait for specific element or state if specified
+                 if wait_for:
+                     if wait_for in ["load", "domcontentloaded", "networkidle"]:
+                         await page.wait_for_load_state(wait_for)  # type: ignore
+                     else:
+                         await page.wait_for_selector(wait_for, timeout=self.timeout)
+                 else:
+                     # Default: wait for network to be idle
+                     await page.wait_for_load_state("networkidle", timeout=self.timeout)
+
+                 result: Dict[str, Any] = {"url": page.url, "title": await page.title()}
+
+                 # Execute custom JavaScript if provided
+                 if execute_script:
+                     script_result = await page.evaluate(execute_script)
+                     result["script_result"] = script_result
+
+                 # Extract based on selectors
+                 if selectors:
+                     extracted_data: Dict[str, list[str]] = {}
+                     for key, selector in selectors.items():
+                         elements = await page.query_selector_all(selector)
+                         extracted_data[key] = []
+                         for elem in elements:
+                             text = await elem.text_content()
+                             if text:
+                                 extracted_data[key].append(text.strip())
+                     result["selected_data"] = extracted_data
+
+                 # Extract all text
+                 if extract_all_text:
+                     result["text"] = await page.inner_text("body")
+
+                 # Extract links
+                 if extract_links:
+                     links = await page.evaluate(
+                         """
+                         () => Array.from(document.querySelectorAll('a[href]')).map(a => ({
+                             text: a.textContent.trim(),
+                             href: a.href,
+                             title: a.title || null
+                         }))
+                         """
+                     )
+                     result["links"] = links
+
+                 # Extract tables
+                 if extract_tables:
+                     tables = await page.evaluate(
+                         """
+                         () => Array.from(document.querySelectorAll('table')).map(table => {
+                             const headers = Array.from(table.querySelectorAll('th')).map(th => th.textContent.trim());
+                             const rows = Array.from(table.querySelectorAll('tr')).slice(1).map(row => {
+                                 const cells = Array.from(row.querySelectorAll('td, th'));
+                                 const rowData = {};
+                                 cells.forEach((cell, i) => {
+                                     const key = headers[i] || `column_${i}`;
+                                     rowData[key] = cell.textContent.trim();
+                                 });
+                                 return rowData;
+                             });
+                             return rows;
+                         })
+                         """
+                     )
+                     result["tables"] = tables
+
+                 # Take screenshot if requested
+                 if screenshot:
+                     screenshot_data = await page.screenshot(full_page=True)
+                     result["screenshot"] = {
+                         "size": len(screenshot_data),
+                         "note": "Screenshot data available as bytes",
+                     }
+
+                 await browser.close()
+
+                 return {"status": "success", "data": result}
+
+         except Exception as e:
+             return {"status": "error", "error": str(e), "error_type": type(e).__name__}
+
+     async def _arun(
+         self,
+         url: str,
+         wait_for: Optional[str] = None,
+         selectors: Optional[Dict[str, str]] = None,
+         extract_all_text: bool = False,
+         extract_links: bool = False,
+         extract_tables: bool = False,
+         screenshot: bool = False,
+         execute_script: Optional[str] = None,
+         **kwargs: Any,
+     ) -> Dict[str, Any]:
+         """Async version of the web scraper tool."""
+         return await self._async_run(
+             url=url,
+             wait_for=wait_for,
+             selectors=selectors,
+             extract_all_text=extract_all_text,
+             extract_links=extract_links,
+             extract_tables=extract_tables,
+             screenshot=screenshot,
+             execute_script=execute_script,
+             **kwargs,
+         )
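A usage sketch for WebScraperTool; the URL, selectors, and import path are assumptions. Because _run wraps the coroutine in asyncio.run, it must be called from synchronous code; inside an existing event loop, await _arun instead.

from khora.tools.web_scraper import WebScraperTool  # assumed module path

scraper = WebScraperTool(headless=True, timeout=15000)
result = scraper._run(
    url="https://example.com",         # placeholder URL
    wait_for="networkidle",            # wait for the network-idle load state
    selectors={"headings": "h1, h2"},  # collect text of matching elements
    extract_links=True,
)
if result["status"] == "success":
    print(result["data"]["title"])
    print(result["data"].get("selected_data", {}))
    print(len(result["data"].get("links", [])), "links found")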
@@ -0,0 +1,6 @@
+ """Utility functions and helpers."""
+
+ from .config import load_config
+ from .data_models import DataRequest, DataResponse
+
+ __all__ = ["load_config", "DataRequest", "DataResponse"]
khora/utils/config.py ADDED
@@ -0,0 +1,54 @@
+ """Configuration management for Khora."""
+
+ import os
+ from pathlib import Path
+ from typing import Any, Dict, Optional
+
+ from dotenv import load_dotenv
+
+
+ def load_config(config_path: Optional[Path] = None) -> Dict[str, Any]:
+     """
+     Load configuration from environment variables and optional config file.
+
+     Args:
+         config_path: Optional path to configuration file (currently unused)
+
+     Returns:
+         Configuration dictionary
+     """
+     # Load environment variables
+     load_dotenv()
+
+     config = {
+         # OpenAI Configuration
+         "openai_api_key": os.getenv("OPENAI_API_KEY"),
+         "openai_model": os.getenv("OPENAI_MODEL", "gpt-4-turbo-preview"),
+         # Google Configuration
+         "google_credentials_path": os.getenv("GOOGLE_CREDENTIALS_PATH"),
+         "google_scopes": [
+             "https://www.googleapis.com/auth/documents.readonly",
+             "https://www.googleapis.com/auth/spreadsheets.readonly",
+             "https://www.googleapis.com/auth/drive.readonly",
+         ],
+         # Dagster Configuration
+         "dagster_home": os.getenv("DAGSTER_HOME", "/tmp/dagster"),
+         "dagster_storage": {
+             "postgres": {
+                 "postgres_db": os.getenv("DAGSTER_PG_DB", "dagster"),
+                 "postgres_host": os.getenv("DAGSTER_PG_HOST", "localhost"),
+                 "postgres_port": int(os.getenv("DAGSTER_PG_PORT", "5432")),
+                 "postgres_user": os.getenv("DAGSTER_PG_USER", "dagster"),
+                 "postgres_password": os.getenv("DAGSTER_PG_PASSWORD", ""),
+             }
+         },
+         # General Configuration
+         "log_level": os.getenv("LOG_LEVEL", "INFO"),
+         "cache_enabled": os.getenv("CACHE_ENABLED", "true").lower() == "true",
+         "cache_ttl": int(os.getenv("CACHE_TTL", "3600")),
+     }
+
+     # Remove None values
+     config = {k: v for k, v in config.items() if v is not None}
+
+     return config
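A short sketch of how load_config is consumed; it assumes the relevant variables are set in the environment or a .env file.

from khora.utils.config import load_config

config = load_config()
print(config["openai_model"])  # "gpt-4-turbo-preview" unless OPENAI_MODEL overrides it
print(config["dagster_storage"]["postgres"]["postgres_host"])
# Keys whose environment variable is unset (e.g. "openai_api_key") are removed
# by the None-filter above, so use .get() for values that may be missing:
api_key = config.get("openai_api_key")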
@@ -0,0 +1,57 @@
+ """Data models for Khora pipeline operations."""
+
+ from datetime import datetime, timezone
+ from enum import Enum
+ from typing import Any, Dict, List, Optional
+
+ from pydantic import BaseModel, ConfigDict, Field
+
+
+ class DataSourceType(str, Enum):
+     """Supported data source types."""
+
+     API = "api"
+     WEB_SCRAPER = "web_scraper"
+     GOOGLE_DOCS = "google_docs"
+     SPREADSHEET = "spreadsheet"
+
+
+ class DataRequest(BaseModel):
+     """Model for data fetching requests."""
+
+     source_type: DataSourceType
+     prompt: str = Field(..., description="AI prompt describing what data to fetch")
+     source_config: Dict[str, Any] = Field(
+         default_factory=dict, description="Configuration specific to the data source"
+     )
+     filters: Optional[Dict[str, Any]] = None
+     metadata: Dict[str, Any] = Field(default_factory=dict)
+
+     model_config = ConfigDict(use_enum_values=True)
+
+
+ class DataResponse(BaseModel):
+     """Model for data fetching responses."""
+
+     request_id: str
+     status: str = Field(..., description="success, error, or partial")
+     data: Optional[Any] = None
+     error_message: Optional[str] = None
+     metadata: Dict[str, Any] = Field(default_factory=dict)
+     timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
+     source_type: DataSourceType
+
+     model_config = ConfigDict(use_enum_values=True)
+
+
+ class PipelineConfig(BaseModel):
+     """Configuration for pipeline execution."""
+
+     name: str
+     description: Optional[str] = None
+     requests: List[DataRequest]
+     parallel_execution: bool = True
+     retry_config: Dict[str, Any] = Field(
+         default_factory=lambda: {"max_retries": 3, "retry_delay": 5}
+     )
+     output_format: str = "json"
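Finally, a sketch of constructing these models; the prompt and source_config values are placeholders. PipelineConfig is imported from the module directly, since khora/utils/__init__.py only re-exports DataRequest and DataResponse.

from khora.utils import DataRequest
from khora.utils.data_models import DataSourceType, PipelineConfig

request = DataRequest(
    source_type=DataSourceType.API,
    prompt="Fetch the five most recent items from the example API",  # placeholder prompt
    source_config={"url": "https://api.example.com/items", "method": "GET"},
)
pipeline = PipelineConfig(name="example_pipeline", requests=[request])

# use_enum_values=True stores the enum's string value on validated models
assert request.source_type == "api"
print(pipeline.model_dump_json(indent=2))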