discovery_engine_api-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- discovery/__init__.py +34 -0
- discovery/client.py +747 -0
- discovery/types.py +256 -0
- discovery_engine_api-0.1.0.dist-info/METADATA +318 -0
- discovery_engine_api-0.1.0.dist-info/RECORD +6 -0
- discovery_engine_api-0.1.0.dist-info/WHEEL +4 -0
discovery/__init__.py
ADDED
@@ -0,0 +1,34 @@
"""Discovery Engine Python SDK."""

__version__ = "0.1.0"  # Updated to trigger TestPyPI publish

from discovery.client import Engine
from discovery.types import (
    Column,
    CorrelationEntry,
    DataInsights,
    EngineResult,
    FeatureImportance,
    FeatureImportanceScore,
    FileInfo,
    Pattern,
    PatternGroup,
    RunStatus,
    Summary,
)

__all__ = [
    "Engine",
    "EngineResult",
    "Column",
    "CorrelationEntry",
    "DataInsights",
    "FeatureImportance",
    "FeatureImportanceScore",
    "FileInfo",
    "Pattern",
    "PatternGroup",
    "RunStatus",
    "Summary",
    "__version__",
]
discovery/client.py
ADDED
@@ -0,0 +1,747 @@
"""Discovery Engine Python SDK."""

import asyncio
import json
import os
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

import httpx

try:
    import pandas as pd
except ImportError:
    pd = None

from discovery.types import (
    Column,
    CorrelationEntry,
    DataInsights,
    EngineResult,
    FeatureImportance,
    FeatureImportanceScore,
    FileInfo,
    Pattern,
    PatternGroup,
    RunStatus,
    Summary,
)


class Engine:
    """Engine for the Discovery Engine API."""

    # Production dashboard URL (can be overridden via DISCOVERY_API_URL env var for testing)
    # The SDK calls the dashboard API which handles all report creation and credit deduction
    _DEFAULT_BASE_URL = "https://disco.leap-labs.com"

    def __init__(self, api_key: str):
        """
        Initialize the Discovery Engine.

        Args:
            api_key: Your API key
        """

        print("Initializing Discovery Engine...")
        self.api_key = api_key
        # Use DISCOVERY_API_URL env var if set (for testing/custom deployments),
        # otherwise use the production default
        self.base_url = os.getenv("DISCOVERY_API_URL", self._DEFAULT_BASE_URL).rstrip("/")
        self._organization_id: Optional[str] = None
        self._client: Optional[httpx.AsyncClient] = None
        self._org_fetched = False

    async def _ensure_organization_id(self) -> str:
        """
        Ensure we have an organization ID, fetching from API if needed.

        The organization ID is required for API requests to identify which
        organization the user belongs to (multi-tenancy support).

        Returns:
            Organization ID string

        Raises:
            ValueError: If no organization is found or API request fails
        """
        if self._organization_id:
            return self._organization_id

        if not self._org_fetched:
            # Fetch user's organizations and use the first one
            try:
                orgs = await self.get_organizations()
                if orgs:
                    self._organization_id = orgs[0]["id"]
            except ValueError as e:
                # Re-raise with more context
                raise ValueError(
                    f"Failed to fetch organization: {e}. "
                    "Please ensure your API key is valid and you belong to an organization."
                ) from e
            self._org_fetched = True

        if not self._organization_id:
            raise ValueError(
                "No organization found for your account. "
                "Please contact support if this issue persists."
            )

        return self._organization_id

    async def _get_client(self) -> httpx.AsyncClient:
        """Get or create the HTTP client."""
        if self._client is None:
            headers = {"Authorization": f"Bearer {self.api_key}"}
            self._client = httpx.AsyncClient(
                base_url=self.base_url,
                headers=headers,
                timeout=60.0,
            )
        return self._client

    async def _get_client_with_org(self) -> httpx.AsyncClient:
        """Get HTTP client (no longer needs org header for dashboard API)."""
        return await self._get_client()

    async def close(self):
        """Close the HTTP client."""
        if self._client:
            await self._client.aclose()
            self._client = None

    async def __aenter__(self):
        """Async context manager entry."""
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        await self.close()

    async def get_organizations(self) -> List[Dict[str, Any]]:
        """
        Get the organizations you belong to.

        Note: This is no longer needed for the simplified SDK workflow,
        but kept for backwards compatibility.

        Returns:
            List of organizations with id, name, and slug

        Raises:
            ValueError: If the API request fails
        """
        # Organizations are handled automatically by the dashboard API
        # Return empty list for now - not needed for report creation
        return []

    async def upload_file(
        self, file: Union[str, Path, "pd.DataFrame"], filename: Optional[str] = None
    ) -> FileInfo:
        """
        Upload a file to the API.

        Args:
            file: File path, Path object, or pandas DataFrame
            filename: Optional filename (for DataFrame uploads)

        Returns:
            FileInfo with file_path, file_hash, file_size, mime_type
        """
        client = await self._get_client_with_org()

        if pd is not None and isinstance(file, pd.DataFrame):
            # Convert DataFrame to CSV in memory
            import io

            buffer = io.BytesIO()
            file.to_csv(buffer, index=False)
            buffer.seek(0)
            file_content = buffer.getvalue()
            filename = filename or "dataset.csv"
            mime_type = "text/csv"
        else:
            # Read file from disk
            file_path = Path(file)
            if not file_path.exists():
                raise FileNotFoundError(f"File not found: {file_path}")
            file_content = file_path.read_bytes()
            filename = filename or file_path.name
            mime_type = (
                "text/csv" if file_path.suffix == ".csv" else "application/vnd.apache.parquet"
            )

        # Upload file
        files = {"file": (filename, file_content, mime_type)}
        response = await client.post("/v1/upload", files=files)
        response.raise_for_status()

        data = response.json()
        return FileInfo(
            file_path=data["file_path"],
            file_hash=data["file_hash"],
            file_size=data["file_size"],
            mime_type=data["mime_type"],
        )

    async def create_dataset(
        self,
        title: Optional[str] = None,
        description: Optional[str] = None,
        total_rows: int = 0,
        dataset_size_mb: Optional[float] = None,
        author: Optional[str] = None,
        source_url: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Create a dataset record.

        Args:
            title: Dataset title
            description: Dataset description
            total_rows: Number of rows in the dataset
            dataset_size_mb: Dataset size in MB
            author: Optional author attribution
            source_url: Optional source URL

        Returns:
            Dataset record with ID
        """
        client = await self._get_client_with_org()

        response = await client.post(
            "/v1/run-datasets",
            json={
                "title": title,
                "description": description,
                "total_rows": total_rows,
                "dataset_size_mb": dataset_size_mb,
                "author": author,
                "source_url": source_url,
            },
        )
        response.raise_for_status()
        return response.json()

    async def create_file_record(self, dataset_id: str, file_info: FileInfo) -> Dict[str, Any]:
        """
        Create a file record for a dataset.

        Args:
            dataset_id: Dataset ID
            file_info: FileInfo from upload_file()

        Returns:
            File record with ID
        """
        client = await self._get_client_with_org()

        response = await client.post(
            f"/v1/run-datasets/{dataset_id}/files",
            json={
                "mime_type": file_info.mime_type,
                "file_path": file_info.file_path,
                "file_hash": file_info.file_hash,
                "file_size": file_info.file_size,
            },
        )
        response.raise_for_status()
        return response.json()

    async def create_columns(
        self, dataset_id: str, columns: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Create column records for a dataset.

        Args:
            dataset_id: Dataset ID
            columns: List of column definitions with full metadata

        Returns:
            List of column records with IDs
        """
        client = await self._get_client_with_org()

        response = await client.post(
            f"/v1/run-datasets/{dataset_id}/columns",
            json=columns,
        )
        response.raise_for_status()
        return response.json()

    async def create_run(
        self,
        dataset_id: str,
        target_column_id: str,
        task: str = "regression",
        mode: str = "fast",
        visibility: str = "public",
        timeseries_groups: Optional[List[Dict[str, Any]]] = None,
        target_column_override: Optional[str] = None,
        auto_report_use_llm_evals: bool = True,
        author: Optional[str] = None,
        source_url: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Create a run and enqueue it for processing.

        Args:
            dataset_id: Dataset ID
            target_column_id: Target column ID
            task: Task type (regression, binary_classification, multiclass_classification)
            mode: Analysis mode ("fast" or "deep")
            visibility: Dataset visibility ("public" or "private")
            timeseries_groups: Optional list of timeseries column groups
            target_column_override: Optional override for target column name
            auto_report_use_llm_evals: Use LLM evaluations
            author: Optional dataset author
            source_url: Optional source URL

        Returns:
            Run record with ID and job information
        """
        client = await self._get_client_with_org()

        payload = {
            "run_target_column_id": target_column_id,
            "task": task,
            "mode": mode,
            "visibility": visibility,
            "auto_report_use_llm_evals": auto_report_use_llm_evals,
        }

        if timeseries_groups:
            payload["timeseries_groups"] = timeseries_groups
        if target_column_override:
            payload["target_column_override"] = target_column_override
        if author:
            payload["author"] = author
        if source_url:
            payload["source_url"] = source_url

        response = await client.post(
            f"/v1/run-datasets/{dataset_id}/runs",
            json=payload,
        )
        response.raise_for_status()
        return response.json()

    async def get_results(self, run_id: str) -> EngineResult:
        """
        Get complete analysis results for a run.

        This returns all data that the Discovery dashboard displays:
        - LLM-generated summary with key insights
        - All discovered patterns with conditions, citations, and explanations
        - Column/feature information with statistics and importance scores
        - Correlation matrix
        - Global feature importance

        Args:
            run_id: The run ID

        Returns:
            EngineResult with complete analysis data
        """
        client = await self._get_client()

        # Call dashboard API for results
        response = await client.get(f"/api/runs/{run_id}/results")
        response.raise_for_status()

        data = response.json()
        return self._parse_analysis_result(data)

    async def get_run_status(self, run_id: str) -> RunStatus:
        """
        Get the status of a run.

        Args:
            run_id: Run ID

        Returns:
            RunStatus with current status information
        """
        client = await self._get_client_with_org()

        response = await client.get(f"/v1/runs/{run_id}/results")
        response.raise_for_status()

        data = response.json()
        return RunStatus(
            run_id=data["run_id"],
            status=data["status"],
            job_id=data.get("job_id"),
            job_status=data.get("job_status"),
            error_message=data.get("error_message"),
        )

    async def wait_for_completion(
        self,
        run_id: str,
        poll_interval: float = 5.0,
        timeout: Optional[float] = None,
    ) -> EngineResult:
        """
        Wait for a run to complete and return the results.

        Args:
            run_id: Run ID
            poll_interval: Seconds between status checks (default: 5)
            timeout: Maximum seconds to wait (None = no timeout)

        Returns:
            EngineResult with complete analysis data

        Raises:
            TimeoutError: If the run doesn't complete within the timeout
            RuntimeError: If the run fails
        """
        start_time = time.time()

        while True:
            result = await self.get_results(run_id)

            if result.status == "completed":
                return result
            elif result.status == "failed":
                raise RuntimeError(
                    f"Run {run_id} failed: {result.error_message or 'Unknown error'}"
                )

            if timeout and (time.time() - start_time) > timeout:
                raise TimeoutError(f"Run {run_id} did not complete within {timeout} seconds")

            await asyncio.sleep(poll_interval)

    async def run_async(
        self,
        file: Union[str, Path, "pd.DataFrame"],
        target_column: str,
        mode: str = "fast",
        title: Optional[str] = None,
        description: Optional[str] = None,
        column_descriptions: Optional[Dict[str, str]] = None,
        task: Optional[str] = None,
        visibility: str = "public",
        timeseries_groups: Optional[List[Dict[str, Any]]] = None,
        target_column_override: Optional[str] = None,
        auto_report_use_llm_evals: bool = True,
        author: Optional[str] = None,
        source_url: Optional[str] = None,
        wait: bool = False,
        wait_timeout: Optional[float] = None,
        **kwargs,
    ) -> EngineResult:
        """
        Run analysis on a dataset (async).

        This method calls the dashboard API which handles the entire workflow:
        file upload, dataset creation, column inference, run creation, and credit deduction.

        Args:
            file: File path, Path object, or pandas DataFrame
            target_column: Name of the target column
            mode: Analysis mode ("fast" or "deep", default: "fast")
            title: Optional dataset title
            description: Optional dataset description
            column_descriptions: Optional dict mapping column names to descriptions
            task: Task type (regression, binary, multiclass) - auto-detected if None
            visibility: Dataset visibility ("public" or "private", default: "public")
            timeseries_groups: Optional list of timeseries column groups
            target_column_override: Optional override for target column name
            auto_report_use_llm_evals: Use LLM evaluations (default: True)
            author: Optional dataset author
            source_url: Optional source URL
            wait: If True, wait for analysis to complete and return full results
            wait_timeout: Maximum seconds to wait for completion (only if wait=True)

        Returns:
            EngineResult with run_id and (if wait=True) complete results
        """
        client = await self._get_client()

        # Prepare file for upload
        if pd is not None and isinstance(file, pd.DataFrame):
            # Convert DataFrame to CSV in memory
            import io

            buffer = io.BytesIO()
            file.to_csv(buffer, index=False)
            buffer.seek(0)
            file_content = buffer.getvalue()
            filename = (title + ".csv") if title else "dataset.csv"
            mime_type = "text/csv"
        else:
            # Read file from disk
            file_path = Path(file)
            if not file_path.exists():
                raise FileNotFoundError(f"File not found: {file_path}")
            file_content = file_path.read_bytes()
            filename = file_path.name
            mime_type = (
                "text/csv" if file_path.suffix == ".csv" else "application/vnd.apache.parquet"
            )

        # Prepare multipart form data
        files = {"file": (filename, file_content, mime_type)}
        data: Dict[str, Any] = {
            "target_column": target_column,
            "mode": mode,
            "visibility": visibility,
        }

        if description:
            data["description"] = description
        if author:
            data["author"] = author
        if source_url:
            data["source_url"] = source_url
        if column_descriptions:
            data["column_descriptions"] = json.dumps(column_descriptions)
        if timeseries_groups:
            data["timeseries_groups"] = json.dumps(timeseries_groups)

        # Call dashboard API to create report
        # httpx automatically handles multipart/form-data when both files and data are provided
        response = await client.post("/api/reports/create", files=files, data=data)
        response.raise_for_status()

        result_data = response.json()

        # Check if duplicate
        if result_data.get("duplicate"):
            # For duplicates, get the run_id and fetch results
            report_id = result_data.get("report_id")
            run_id = result_data.get("run_id")

            if not report_id or not run_id:
                raise ValueError("Duplicate report found but missing report_id or run_id")

            # If wait is True, fetch the full results for the existing report
            if wait:
                return await self.get_results(run_id)

            # Otherwise return a minimal result with the run_id
            return EngineResult(
                run_id=run_id,
                status="completed",
                report_id=report_id,
            )

        run_id = result_data["run_id"]

        if wait:
            # Wait for completion and return full results
            return await self.wait_for_completion(run_id, timeout=wait_timeout)

        # Return minimal result with pending status
        return EngineResult(
            run_id=run_id,
            status="pending",
        )

    def run(
        self,
        file: Union[str, Path, "pd.DataFrame"],
        target_column: str,
        mode: str = "fast",
        title: Optional[str] = None,
        description: Optional[str] = None,
        column_descriptions: Optional[Dict[str, str]] = None,
        task: Optional[str] = None,
        visibility: str = "public",
        timeseries_groups: Optional[List[Dict[str, Any]]] = None,
        target_column_override: Optional[str] = None,
        auto_report_use_llm_evals: bool = True,
        author: Optional[str] = None,
        source_url: Optional[str] = None,
        wait: bool = False,
        wait_timeout: Optional[float] = None,
        **kwargs,
    ) -> EngineResult:
        """
        Run analysis on a dataset (synchronous wrapper).

        This is a synchronous wrapper around run_async().

        Args:
            file: File path, Path object, or pandas DataFrame
            target_column: Name of the target column
            mode: Analysis mode ("fast" or "deep", default: "fast")
            title: Optional dataset title
            description: Optional dataset description
            column_descriptions: Optional dict mapping column names to descriptions
            task: Task type (regression, binary_classification, multiclass_classification) - auto-detected if None
            visibility: Dataset visibility ("public" or "private", default: "public")
            timeseries_groups: Optional list of timeseries column groups
            target_column_override: Optional override for target column name
            auto_report_use_llm_evals: Use LLM evaluations (default: True)
            author: Optional dataset author
            source_url: Optional source URL
            wait: If True, wait for analysis to complete and return full results
            wait_timeout: Maximum seconds to wait for completion (only if wait=True)
            **kwargs: Additional arguments passed to run_async()

        Returns:
            EngineResult with run_id and (if wait=True) complete results
        """
        return asyncio.run(
            self.run_async(
                file,
                target_column,
                mode,
                title=title,
                description=description,
                column_descriptions=column_descriptions,
                task=task,
                visibility=visibility,
                timeseries_groups=timeseries_groups,
                target_column_override=target_column_override,
                auto_report_use_llm_evals=auto_report_use_llm_evals,
                author=author,
                source_url=source_url,
                wait=wait,
                wait_timeout=wait_timeout,
                **kwargs,
            )
        )

    def _parse_analysis_result(self, data: Dict[str, Any]) -> EngineResult:
        """Parse API response into EngineResult dataclass."""
        # Parse summary
        summary = None
        if data.get("summary"):
            summary = self._parse_summary(data["summary"])

        # Parse patterns
        patterns = []
        for p in data.get("patterns", []):
            patterns.append(
                Pattern(
                    id=p["id"],
                    task=p.get("task", "regression"),
                    target_column=p.get("target_column", ""),
                    direction=p.get("direction", "max"),
                    p_value=p.get("p_value", 0),
                    conditions=p.get("conditions", []),
                    lift_value=p.get("lift_value", 0),
                    support_count=p.get("support_count", 0),
                    support_percentage=p.get("support_percentage", 0),
                    pattern_type=p.get("pattern_type", "validated"),
                    novelty_type=p.get("novelty_type", "confirmatory"),
                    target_score=p.get("target_score", 0),
                    target_class=p.get("target_class"),
                    target_mean=p.get("target_mean"),
                    target_std=p.get("target_std"),
                    description=p.get("description", ""),
                    novelty_explanation=p.get("novelty_explanation", ""),
                    citations=p.get("citations", []),
                )
            )

        # Parse columns
        columns = []
        for c in data.get("columns", []):
            columns.append(
                Column(
                    id=c["id"],
                    name=c["name"],
                    display_name=c.get("display_name", c["name"]),
                    type=c.get("type", "continuous"),
                    data_type=c.get("data_type", "float"),
                    enabled=c.get("enabled", True),
                    description=c.get("description"),
                    mean=c.get("mean"),
                    median=c.get("median"),
                    std=c.get("std"),
                    min=c.get("min"),
                    max=c.get("max"),
                    iqr_min=c.get("iqr_min"),
                    iqr_max=c.get("iqr_max"),
                    mode=c.get("mode"),
                    approx_unique=c.get("approx_unique"),
                    null_percentage=c.get("null_percentage"),
                    feature_importance_score=c.get("feature_importance_score"),
                )
            )

        # Parse correlation matrix
        correlation_matrix = []
        for entry in data.get("correlation_matrix", []):
            correlation_matrix.append(
                CorrelationEntry(
                    feature_x=entry["feature_x"],
                    feature_y=entry["feature_y"],
                    value=entry["value"],
                )
            )

        # Parse feature importance
        feature_importance = None
        if data.get("feature_importance"):
            fi = data["feature_importance"]
            scores = [
                FeatureImportanceScore(feature=s["feature"], score=s["score"])
                for s in fi.get("scores", [])
            ]
            feature_importance = FeatureImportance(
                kind=fi.get("kind", "global"),
                baseline=fi.get("baseline", 0),
                scores=scores,
            )

        return EngineResult(
            run_id=data["run_id"],
            report_id=data.get("report_id"),
            status=data.get("status", "unknown"),
            dataset_title=data.get("dataset_title"),
            dataset_description=data.get("dataset_description"),
            total_rows=data.get("total_rows"),
            target_column=data.get("target_column"),
            task=data.get("task"),
            summary=summary,
            patterns=patterns,
            columns=columns,
            correlation_matrix=correlation_matrix,
            feature_importance=feature_importance,
            job_id=data.get("job_id"),
            job_status=data.get("job_status"),
            error_message=data.get("error_message"),
        )

    def _parse_summary(self, data: Dict[str, Any]) -> Summary:
        """Parse summary data into Summary dataclass."""
        # Parse data insights
        data_insights = None
        if data.get("data_insights"):
            di = data["data_insights"]
            data_insights = DataInsights(
                important_features=di.get("important_features", []),
                important_features_explanation=di.get("important_features_explanation", ""),
                strong_correlations=di.get("strong_correlations", []),
                strong_correlations_explanation=di.get("strong_correlations_explanation", ""),
                notable_relationships=di.get("notable_relationships", []),
            )

        return Summary(
            overview=data.get("overview", ""),
            key_insights=data.get("key_insights", []),
            novel_patterns=PatternGroup(
                pattern_ids=data.get("novel_patterns", {}).get("pattern_ids", []),
                explanation=data.get("novel_patterns", {}).get("explanation", ""),
            ),
            surprising_findings=PatternGroup(
                pattern_ids=data.get("surprising_findings", {}).get("pattern_ids", []),
                explanation=data.get("surprising_findings", {}).get("explanation", ""),
            ),
            statistically_significant=PatternGroup(
                pattern_ids=data.get("statistically_significant", {}).get("pattern_ids", []),
                explanation=data.get("statistically_significant", {}).get("explanation", ""),
            ),
            data_insights=data_insights,
            selected_pattern_id=data.get("selected_pattern_id"),
        )
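The dashboard-backed `run()`/`run_async()` path is the simple route, but the client also exposes the lower-level `/v1` building blocks above. A hedged sketch of how they might compose; the `"id"` keys on the returned records and the column-dict fields are assumptions inferred from the docstrings and the `Column` dataclass in `discovery/types.py`, not a documented schema:

```python
# Sketch only: composes upload_file, create_dataset, create_file_record,
# create_columns, create_run, and wait_for_completion from the client above.
import asyncio
from discovery import Engine

async def main():
    async with Engine(api_key="your-api-key") as engine:
        # 1. Upload the raw file, then register dataset and file records
        file_info = await engine.upload_file("data.csv")
        dataset = await engine.create_dataset(title="My dataset", total_rows=1000)
        await engine.create_file_record(dataset["id"], file_info)  # "id" key assumed

        # 2. Register columns (dict fields assumed from discovery/types.py Column)
        columns = await engine.create_columns(
            dataset["id"],
            [
                {"name": "age", "type": "continuous", "data_type": "int", "enabled": True},
                {"name": "outcome", "type": "categorical", "data_type": "string", "enabled": True},
            ],
        )
        target = next(c for c in columns if c["name"] == "outcome")

        # 3. Enqueue the run and poll until it finishes
        run = await engine.create_run(dataset["id"], target["id"], task="binary_classification")
        result = await engine.wait_for_completion(run["id"], poll_interval=10, timeout=900)
        print(result.status, len(result.patterns))

asyncio.run(main())
```
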
discovery/types.py
ADDED
@@ -0,0 +1,256 @@
"""Type definitions for the Discovery SDK."""

from dataclasses import dataclass, field
from typing import Any, Dict, List, Literal, Optional, Union


@dataclass
class FileInfo:
    """Information about an uploaded file."""

    file_path: str  # GCS path
    file_hash: str
    file_size: int
    mime_type: str


@dataclass
class TimeseriesGroup:
    """Timeseries column group metadata."""

    base_name: str
    columns: List[str]
    num_timesteps: int
    pattern_matched: str
    dtype: str  # "numeric" or "categorical"


# Pattern types


@dataclass
class PatternContinuousCondition:
    """A continuous condition in a pattern."""

    type: Literal["continuous"]
    feature: str
    min_value: float
    max_value: float
    min_q: Optional[float] = None
    max_q: Optional[float] = None


@dataclass
class PatternCategoricalCondition:
    """A categorical condition in a pattern."""

    type: Literal["categorical"]
    feature: str
    values: List[Union[str, int, float, bool, None]]


@dataclass
class PatternDatetimeCondition:
    """A datetime condition in a pattern."""

    type: Literal["datetime"]
    feature: str
    min_value: float  # epoch milliseconds
    max_value: float  # epoch milliseconds
    min_datetime: str  # human-readable
    max_datetime: str  # human-readable
    min_q: Optional[float] = None
    max_q: Optional[float] = None


PatternCondition = Union[
    PatternContinuousCondition, PatternCategoricalCondition, PatternDatetimeCondition
]


@dataclass
class PatternCitation:
    """Academic citation for a pattern."""

    url: str
    title: Optional[str] = None
    doi: Optional[str] = None
    authors: Optional[List[str]] = None
    year: Optional[str] = None
    journal: Optional[str] = None
    volume: Optional[str] = None
    issue: Optional[str] = None
    pages: Optional[str] = None


@dataclass
class Pattern:
    """A discovered pattern in the data."""

    id: str
    task: str  # regression, binary_classification, multiclass_classification
    target_column: str
    direction: str  # "min" or "max"
    p_value: float
    conditions: List[Dict[str, Any]]  # PatternCondition as dicts
    lift_value: float
    support_count: int
    support_percentage: float
    pattern_type: str  # "validated" or "speculative"
    novelty_type: str  # "novel" or "confirmatory"
    target_score: float
    description: str
    novelty_explanation: str
    target_class: Optional[str] = None
    target_mean: Optional[float] = None
    target_std: Optional[float] = None
    citations: List[Dict[str, Any]] = field(default_factory=list)


# Column/Feature types


@dataclass
class Column:
    """Information about a dataset column/feature."""

    id: str
    name: str
    display_name: str
    type: str  # "continuous" or "categorical"
    data_type: str  # "int", "float", "string", "boolean", "datetime"
    enabled: bool
    description: Optional[str] = None

    # Statistics
    mean: Optional[float] = None
    median: Optional[float] = None
    std: Optional[float] = None
    min: Optional[float] = None
    max: Optional[float] = None
    iqr_min: Optional[float] = None
    iqr_max: Optional[float] = None
    mode: Optional[str] = None
    approx_unique: Optional[int] = None
    null_percentage: Optional[float] = None

    # Feature importance
    feature_importance_score: Optional[float] = None


# Summary types (LLM-generated)


@dataclass
class DataInsights:
    """LLM-generated data insights."""

    important_features: List[str]
    important_features_explanation: str
    strong_correlations: List[Dict[str, str]]  # [{"feature1": "...", "feature2": "..."}]
    strong_correlations_explanation: str
    notable_relationships: List[str]


@dataclass
class PatternGroup:
    """A group of patterns with explanation."""

    pattern_ids: List[str]
    explanation: str


@dataclass
class Summary:
    """LLM-generated summary of the analysis."""

    overview: str
    key_insights: List[str]
    novel_patterns: PatternGroup
    surprising_findings: PatternGroup
    statistically_significant: PatternGroup
    data_insights: Optional[DataInsights]  # None when the insights block is absent
    selected_pattern_id: Optional[str] = None


# Feature importance types


@dataclass
class FeatureImportanceScore:
    """A single feature importance score."""

    feature: str
    score: float


@dataclass
class FeatureImportance:
    """Global feature importance information."""

    kind: str  # "global" or "local"
    baseline: float  # expected model output
    scores: List[FeatureImportanceScore]


# Correlation matrix types


@dataclass
class CorrelationEntry:
    """A single correlation matrix entry."""

    feature_x: str
    feature_y: str
    value: float


# Main result type


@dataclass
class EngineResult:
    """Complete result of an engine run."""

    # Identifiers
    run_id: str
    report_id: Optional[str] = None
    status: str = "pending"  # pending, processing, completed, failed

    # Dataset metadata
    dataset_title: Optional[str] = None
    dataset_description: Optional[str] = None
    total_rows: Optional[int] = None
    target_column: Optional[str] = None
    task: Optional[str] = None  # regression, binary_classification, multiclass_classification

    # LLM-generated summary
    summary: Optional[Summary] = None

    # Discovered patterns
    patterns: List[Pattern] = field(default_factory=list)

    # Column/feature information with stats and importance
    columns: List[Column] = field(default_factory=list)

    # Correlation matrix
    correlation_matrix: List[CorrelationEntry] = field(default_factory=list)

    # Global feature importance
    feature_importance: Optional[FeatureImportance] = None

    # Job tracking
    job_id: Optional[str] = None
    job_status: Optional[str] = None
    error_message: Optional[str] = None


@dataclass
class RunStatus:
    """Status of a run."""

    run_id: str
    status: str
    job_id: Optional[str] = None
    job_status: Optional[str] = None
    error_message: Optional[str] = None
discovery_engine_api-0.1.0.dist-info/METADATA
ADDED
@@ -0,0 +1,318 @@
Metadata-Version: 2.4
Name: discovery-engine-api
Version: 0.1.0
Summary: Python SDK for the Discovery Engine API
Project-URL: Homepage, https://github.com/leap-laboratories/discovery
Project-URL: Documentation, https://github.com/leap-laboratories/discovery
Project-URL: Repository, https://github.com/leap-laboratories/discovery
Author: Leap Laboratories
License: MIT
Keywords: api,data-analysis,discovery,machine-learning,sdk
Classifier: Development Status :: 4 - Beta
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Requires-Python: >=3.10
Requires-Dist: httpx>=0.24.0
Requires-Dist: pydantic>=2.0.0
Provides-Extra: dev
Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
Requires-Dist: pytest>=7.0.0; extra == 'dev'
Provides-Extra: pandas
Requires-Dist: pandas>=2.0.0; extra == 'pandas'
Description-Content-Type: text/markdown

# Discovery Engine Python API

The Discovery Engine Python API provides a simple programmatic interface to run analyses via Python, offering an alternative to using the web dashboard. Instead of uploading datasets and configuring analyses through the UI, you can automate your discovery workflows directly from your Python code or scripts.

All analyses run through the API are fully integrated with your Discovery Engine account. Results are automatically displayed in the dashboard, where you can view detailed reports, explore patterns, and share findings with your team. Your account management, credit balance, and subscription settings are all handled through the dashboard; the API is simply a convenient interface for programmatic access to the same powerful discovery engine.

## Installation

```bash
pip install discovery-engine-api
```

For pandas DataFrame support:

```bash
pip install discovery-engine-api[pandas]
```

## Quick Start

```python
from discovery import Engine

# Initialize engine
engine = Engine(api_key="your-api-key")

# Run analysis on a dataset and wait for results
result = engine.run(
    file="data.csv",
    target_column="diagnosis",
    mode="fast",
    description="Rare diseases dataset",
    wait=True  # Wait for completion and return full results
)

print(f"Run ID: {result.run_id}")
print(f"Status: {result.status}")
print(f"Found {len(result.patterns)} patterns")
```

## Examples

### Working with Pandas DataFrames

```python
import pandas as pd
from discovery import Engine

df = pd.read_csv("data.csv")
# or create DataFrame directly

engine = Engine(api_key="your-api-key")
result = engine.run(
    file=df,  # Pass DataFrame directly
    target_column="outcome",
    column_descriptions={
        "age": "Patient age in years",
        "heart rate": "Resting heart rate in beats per minute"
    },
    wait=True
)
```

### Async Workflow

```python
import asyncio
from discovery import Engine

async def run_analysis():
    async with Engine(api_key="your-api-key") as engine:
        # Start analysis without waiting
        result = await engine.run_async(
            file="data.csv",
            target_column="target",
            wait=False
        )
        print(f"Started run: {result.run_id}")

        # Later, get results
        result = await engine.get_results(result.run_id)

        # Or wait for completion
        result = await engine.wait_for_completion(result.run_id, timeout=600)
        return result

result = asyncio.run(run_analysis())
```

## Configuration Options

The `run()` and `run_async()` methods accept the following parameters:

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `file` | `str`, `Path`, or `DataFrame` | **Required** | Dataset file path or pandas DataFrame |
| `target_column` | `str` | **Required** | Name of column to predict |
| `mode` | `"fast"` / `"deep"` | `"fast"` | Analysis depth |
| `title` | `str` | `None` | Optional dataset title |
| `description` | `str` | `None` | Optional dataset description |
| `column_descriptions` | `Dict[str, str]` | `None` | Optional column name -> description mapping |
| `task` | `str` | `None` | Override auto-detected task type: `"regression"`, `"binary_classification"`, or `"multiclass_classification"` |
| `visibility` | `"public"` / `"private"` | `"public"` | Dataset visibility (private requires credits) |
| `timeseries_groups` | `List[Dict]` | `None` | Timeseries column groups for feature extraction |
| `auto_report_use_llm_evals` | `bool` | `True` | Use LLM for pattern descriptions |
| `author` | `str` | `None` | Optional dataset author attribution |
| `source_url` | `str` | `None` | Optional source URL for dataset |
| `wait` | `bool` | `False` | Wait for analysis to complete and return full results |
| `wait_timeout` | `float` | `None` | Maximum seconds to wait for completion (only if `wait=True`) |
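
`timeseries_groups` entries are passed through to the API as JSON. A hedged sketch of one group for a hypothetical wide-format dataset (the key names mirror the `TimeseriesGroup` dataclass in `discovery.types`; the exact server-side schema is an assumption):

```python
# Hypothetical dataset with one column per timestep (glucose_t0, glucose_t1, ...).
# Keys mirror discovery.types.TimeseriesGroup; the accepted backend schema is assumed.
result = engine.run(
    file="data.csv",
    target_column="outcome",
    timeseries_groups=[
        {
            "base_name": "glucose",
            "columns": ["glucose_t0", "glucose_t1", "glucose_t2"],
            "num_timesteps": 3,
            "pattern_matched": "glucose_t{i}",
            "dtype": "numeric",
        }
    ],
    wait=True,
)
```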

## Credits and Pricing

- **Public datasets**: Free (0 credits required)
- **Private datasets**:
  - Fast mode: 1 credit per MB
  - Deep mode: 3 credits per MB

If you don't have enough credits for a private run, the SDK will raise an `httpx.HTTPStatusError` with an error message like:
```
Insufficient credits. You need X credits but only have Y available.
```

**Solutions:**
1. Make your dataset public (set `visibility="public"`) - completely free
2. Visit [https://disco.leap-labs.com/account](https://disco.leap-labs.com/account) to:
   - Purchase additional credits
   - Upgrade to a subscription plan that includes more credits
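
Because the SDK surfaces this as a plain `httpx.HTTPStatusError` (raised by `response.raise_for_status()`), one way to handle it is to catch that exception; a sketch (the specific HTTP status code for insufficient credits isn't documented here, so the handler inspects the response body instead of relying on one):

```python
import httpx
from discovery import Engine

engine = Engine(api_key="your-api-key")
try:
    result = engine.run(
        file="data.csv",
        target_column="outcome",
        visibility="private",
        mode="deep",
        wait=True,
    )
except httpx.HTTPStatusError as e:
    # The response body carries the human-readable credit message shown above.
    print(f"Request failed (HTTP {e.response.status_code}): {e.response.text}")
```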

## Return Value

The `run()` and `run_async()` methods return an `EngineResult` object with the following fields:

### EngineResult

```python
@dataclass
class EngineResult:
    # Identifiers
    run_id: str                         # Unique run identifier
    report_id: Optional[str]            # Report ID (if report created)
    status: str                         # "pending", "processing", "completed", "failed"

    # Dataset metadata
    dataset_title: Optional[str]        # Dataset title
    dataset_description: Optional[str]  # Dataset description
    total_rows: Optional[int]           # Number of rows in dataset
    target_column: Optional[str]        # Name of target column
    task: Optional[str]                 # "regression", "binary_classification", or "multiclass_classification"

    # LLM-generated summary
    summary: Optional[Summary]          # Summary object with overview, insights, etc.

    # Discovered patterns
    patterns: List[Pattern]             # List of discovered patterns

    # Column/feature information
    columns: List[Column]               # List of columns with statistics and importance

    # Correlation matrix
    correlation_matrix: List[CorrelationEntry]  # Feature correlations

    # Global feature importance
    feature_importance: Optional[FeatureImportance]  # Feature importance scores

    # Job tracking
    job_id: Optional[str]               # Job ID for tracking processing
    job_status: Optional[str]           # Job status
    error_message: Optional[str]        # Error message if analysis failed
```

### Summary

```python
@dataclass
class Summary:
    overview: str                             # High-level explanation of findings
    key_insights: List[str]                   # List of main takeaways
    novel_patterns: PatternGroup              # Novel pattern explanations
    surprising_findings: PatternGroup         # Surprising findings
    statistically_significant: PatternGroup   # Statistically significant patterns
    data_insights: Optional[DataInsights]     # Important features, correlations
    selected_pattern_id: Optional[str]        # ID of selected pattern
```

### Pattern

```python
@dataclass
class Pattern:
    id: str                        # Pattern identifier
    task: str                      # Task type
    target_column: str             # Target column name
    direction: str                 # "min" or "max"
    p_value: float                 # Statistical p-value
    conditions: List[Dict]         # Pattern conditions (continuous, categorical, datetime)
    lift_value: float              # Lift value (how much the pattern increases/decreases target)
    support_count: int             # Number of rows matching pattern
    support_percentage: float      # Percentage of rows matching pattern
    pattern_type: str              # "validated" or "speculative"
    novelty_type: str              # "novel" or "confirmatory"
    target_score: float            # Target score for this pattern
    description: str               # Human-readable description
    novelty_explanation: str       # Explanation of novelty
    target_class: Optional[str]    # Target class (for classification)
    target_mean: Optional[float]   # Target mean (for regression)
    target_std: Optional[float]    # Target standard deviation
    citations: List[Dict]          # Academic citations
```

### Column

```python
@dataclass
class Column:
    id: str                        # Column identifier
    name: str                      # Column name
    display_name: str              # Display name
    type: str                      # "continuous" or "categorical"
    data_type: str                 # "int", "float", "string", "boolean", "datetime"
    enabled: bool                  # Whether column is enabled
    description: Optional[str]     # Column description

    # Statistics
    mean: Optional[float]          # Mean value
    median: Optional[float]        # Median value
    std: Optional[float]           # Standard deviation
    min: Optional[float]           # Minimum value
    max: Optional[float]           # Maximum value
    iqr_min: Optional[float]       # IQR minimum
    iqr_max: Optional[float]       # IQR maximum
    mode: Optional[str]            # Mode value
    approx_unique: Optional[int]   # Approximate unique count
    null_percentage: Optional[float]  # Percentage of null values

    # Feature importance
    feature_importance_score: Optional[float]  # Feature importance score
```

### FeatureImportance

```python
@dataclass
class FeatureImportance:
    kind: str                             # Feature importance type: "global"
    baseline: float                       # Baseline model output
    scores: List[FeatureImportanceScore]  # List of feature scores
```

### CorrelationEntry

```python
@dataclass
class CorrelationEntry:
    feature_x: str   # First feature name
    feature_y: str   # Second feature name
    value: float     # Correlation value (-1 to 1)
```
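
The `PatternGroup` entries in `Summary` reference patterns by ID, so a lookup dict is handy when cross-referencing groups against the full pattern list. A sketch using only the fields documented above:

```python
result = engine.run(file="data.csv", target_column="outcome", wait=True)

# Index patterns by ID, then resolve the IDs listed in a PatternGroup
patterns_by_id = {p.id: p for p in result.patterns}
if result.summary:
    print(result.summary.overview)
    for pid in result.summary.novel_patterns.pattern_ids:
        p = patterns_by_id.get(pid)
        if p:
            print(f"[novel] {p.description} (lift={p.lift_value:.2f}, p={p.p_value:.3g})")
```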
discovery_engine_api-0.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,6 @@
discovery/__init__.py,sha256=3A09KLiF4vW1oQFpRgv2iuyfmLXar9LbbNNY4i1DBZ8,624
discovery/client.py,sha256=oZz4eTP3K8jKfYmDkBzvr_7MTKZCtnA2b3DlUmW-ObI,26857
discovery/types.py,sha256=4Z3gKdxWnOpymEjBGCzAeUGjwRT2A0aCpmuwctbE4w0,6008
discovery_engine_api-0.1.0.dist-info/METADATA,sha256=C_0ZJlvrIZGRNuVfPJznVjU-ywDAfEJ5eT4O42Scats,11739
discovery_engine_api-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
discovery_engine_api-0.1.0.dist-info/RECORD,,