discovery-engine-api 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- discovery_engine_api-0.1.0/.gitignore +56 -0
- discovery_engine_api-0.1.0/PKG-INFO +318 -0
- discovery_engine_api-0.1.0/README.md +289 -0
- discovery_engine_api-0.1.0/TESTING.md +190 -0
- discovery_engine_api-0.1.0/discovery/__init__.py +34 -0
- discovery_engine_api-0.1.0/discovery/client.py +747 -0
- discovery_engine_api-0.1.0/discovery/types.py +256 -0
- discovery_engine_api-0.1.0/publish.sh +77 -0
- discovery_engine_api-0.1.0/pyproject.toml +47 -0
- discovery_engine_api-0.1.0/tests/__init__.py +1 -0
- discovery_engine_api-0.1.0/tests/test_client.py +941 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
# trained models and generated prototypes
|
|
2
|
+
/models
|
|
3
|
+
/results
|
|
4
|
+
/train_results
|
|
5
|
+
research/automl/trained_model
|
|
6
|
+
research/automl/leap_files
|
|
7
|
+
*.ckpt
|
|
8
|
+
*/train_results/*
|
|
9
|
+
|
|
10
|
+
# DS_STORE
|
|
11
|
+
.DS_STORE
|
|
12
|
+
|
|
13
|
+
# python
|
|
14
|
+
__pycache__
|
|
15
|
+
|
|
16
|
+
# virtualenv
|
|
17
|
+
.venv
|
|
18
|
+
venv
|
|
19
|
+
|
|
20
|
+
# pytest
|
|
21
|
+
.pytest_cache
|
|
22
|
+
|
|
23
|
+
# coverage
|
|
24
|
+
htmlcov/
|
|
25
|
+
.coverage
|
|
26
|
+
.coverage.*
|
|
27
|
+
*.cover
|
|
28
|
+
|
|
29
|
+
# ruff
|
|
30
|
+
.ruff_cache
|
|
31
|
+
|
|
32
|
+
# dotenv
|
|
33
|
+
*.env
|
|
34
|
+
|
|
35
|
+
# pickle files
|
|
36
|
+
*.pkl
|
|
37
|
+
*.pickle
|
|
38
|
+
|
|
39
|
+
# leap_ie
|
|
40
|
+
leap_files
|
|
41
|
+
**/leap_files/
|
|
42
|
+
|
|
43
|
+
# packages/data
|
|
44
|
+
/datasets
|
|
45
|
+
|
|
46
|
+
# packages/reporting
|
|
47
|
+
discovery_report
|
|
48
|
+
|
|
49
|
+
# packages/training
|
|
50
|
+
wandb
|
|
51
|
+
|
|
52
|
+
# logs
|
|
53
|
+
lightning_logs
|
|
54
|
+
.deepeval
|
|
55
|
+
logs/*
|
|
56
|
+
logs/memory_monitor.log
|
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: discovery-engine-api
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python SDK for the Discovery Engine API
|
|
5
|
+
Project-URL: Homepage, https://github.com/leap-laboratories/discovery
|
|
6
|
+
Project-URL: Documentation, https://github.com/leap-laboratories/discovery
|
|
7
|
+
Project-URL: Repository, https://github.com/leap-laboratories/discovery
|
|
8
|
+
Author: Leap Laboratories
|
|
9
|
+
License: MIT
|
|
10
|
+
Keywords: api,data-analysis,discovery,machine-learning,sdk
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Requires-Dist: httpx>=0.24.0
|
|
22
|
+
Requires-Dist: pydantic>=2.0.0
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
|
|
25
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
26
|
+
Provides-Extra: pandas
|
|
27
|
+
Requires-Dist: pandas>=2.0.0; extra == 'pandas'
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# Discovery Engine Python API
|
|
31
|
+
|
|
32
|
+
The Discovery Engine Python API provides a simple programmatic interface to run analyses via Python, offering an alternative to using the web dashboard. Instead of uploading datasets and configuring analyses through the UI, you can automate your discovery workflows directly from your Python code or scripts.
|
|
33
|
+
|
|
34
|
+
All analyses run through the API are fully integrated with your Discovery Engine account. Results are automatically displayed in the dashboard, where you can view detailed reports, explore patterns, and share findings with your team. Your account management, credit balance, and subscription settings are all handled through the dashboard—the API is simply a convenient interface for programmatic access to the same powerful discovery engine.
|
|
35
|
+
|
|
36
|
+
## Installation
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install discovery-engine-api
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
For pandas DataFrame support:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install "discovery-engine-api[pandas]"
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
## Quick Start
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from discovery import Engine
|
|
53
|
+
|
|
54
|
+
# Initialize engine
|
|
55
|
+
engine = Engine(api_key="your-api-key")
|
|
56
|
+
|
|
57
|
+
# Run analysis on a dataset and wait for results
|
|
58
|
+
result = engine.run(
|
|
59
|
+
file="data.csv",
|
|
60
|
+
target_column="diagnosis",
|
|
61
|
+
mode="fast",
|
|
62
|
+
description="Rare diseases dataset",
|
|
63
|
+
wait=True # Wait for completion and return full results
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
print(f"Run ID: {result.run_id}")
|
|
67
|
+
print(f"Status: {result.status}")
|
|
68
|
+
print(f"Found {len(result.patterns)} patterns")
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
## Examples
|
|
73
|
+
|
|
74
|
+
### Working with Pandas DataFrames
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
import pandas as pd
|
|
78
|
+
from discovery import Engine
|
|
79
|
+
|
|
80
|
+
df = pd.read_csv("data.csv")
|
|
81
|
+
# or create DataFrame directly
|
|
82
|
+
|
|
83
|
+
engine = Engine(api_key="your-api-key")
|
|
84
|
+
result = engine.run(
|
|
85
|
+
file=df, # Pass DataFrame directly
|
|
86
|
+
target_column="outcome",
|
|
87
|
+
column_descriptions={
|
|
88
|
+
"age": "Patient age in years",
|
|
89
|
+
"heart rate": None
|
|
90
|
+
},
|
|
91
|
+
wait=True
|
|
92
|
+
)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
### Async Workflow
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
import asyncio
|
|
100
|
+
from discovery import Engine
|
|
101
|
+
|
|
102
|
+
async def run_analysis():
|
|
103
|
+
async with Engine(api_key="your-api-key") as engine:
|
|
104
|
+
# Start analysis without waiting
|
|
105
|
+
result = await engine.run_async(
|
|
106
|
+
file="data.csv",
|
|
107
|
+
target_column="target",
|
|
108
|
+
wait=False
|
|
109
|
+
)
|
|
110
|
+
print(f"Started run: {result.run_id}")
|
|
111
|
+
|
|
112
|
+
# Later, get results
|
|
113
|
+
result = await engine.get_results(result.run_id)
|
|
114
|
+
|
|
115
|
+
# Or wait for completion
|
|
116
|
+
result = await engine.wait_for_completion(result.run_id, timeout=600)
|
|
117
|
+
return result
|
|
118
|
+
|
|
119
|
+
result = asyncio.run(run_analysis())
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
## Configuration Options
|
|
124
|
+
|
|
125
|
+
The `run()` and `run_async()` methods accept the following parameters:
|
|
126
|
+
|
|
127
|
+
| Parameter | Type | Default | Description |
|
|
128
|
+
|-----------|------|---------|-------------|
|
|
129
|
+
| `file` | `str`, `Path`, or `DataFrame` | **Required** | Dataset file path or pandas DataFrame |
|
|
130
|
+
| `target_column` | `str` | **Required** | Name of column to predict |
|
|
131
|
+
| `mode` | `"fast"` / `"deep"` | `"fast"` | Analysis depth |
|
|
132
|
+
| `title` | `str` | `None` | Optional dataset title |
|
|
133
|
+
| `description` | `str` | `None` | Optional dataset description |
|
|
134
|
+
| `column_descriptions` | `Dict[str, str]` | `None` | Optional column name -> description mapping |
|
|
135
|
+
| `task` | `str` | `None` | Override auto-detected task type: `"regression"`, `"binary_classification"`, or `"multiclass_classification"` |
|
|
136
|
+
| `visibility` | `"public"` / `"private"` | `"public"` | Dataset visibility (private requires credits) |
|
|
137
|
+
| `timeseries_groups` | `List[Dict]` | `None` | Timeseries column groups for feature extraction |
|
|
138
|
+
| `auto_report_use_llm_evals` | `bool` | `True` | Use LLM for pattern descriptions |
|
|
139
|
+
| `author` | `str` | `None` | Optional dataset author attribution |
|
|
140
|
+
| `source_url` | `str` | `None` | Optional source URL for dataset |
|
|
141
|
+
| `wait` | `bool` | `False` | Wait for analysis to complete and return full results |
|
|
142
|
+
| `wait_timeout` | `float` | `None` | Maximum seconds to wait for completion (only if `wait=True`) |
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
## Credits and Pricing
|
|
146
|
+
|
|
147
|
+
- **Public datasets**: Free (0 credits required)
|
|
148
|
+
- **Private datasets**:
|
|
149
|
+
- Fast mode: 1 credit per MB
|
|
150
|
+
- Deep mode: 3 credits per MB
|
|
151
|
+
|
|
152
|
+
If you don't have enough credits for a private run, the SDK will raise an `httpx.HTTPStatusError` with an error message like:
|
|
153
|
+
```
|
|
154
|
+
Insufficient credits. You need X credits but only have Y available.
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
**Solutions:**
|
|
158
|
+
1. Make your dataset public (set `visibility="public"`) - completely free
|
|
159
|
+
2. Visit [https://disco.leap-labs.com/account](https://disco.leap-labs.com/account) to:
|
|
160
|
+
- Purchase additional credits
|
|
161
|
+
- Upgrade to a subscription plan that includes more credits
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
## Return Value
|
|
165
|
+
|
|
166
|
+
The `run()` and `run_async()` methods return an `EngineResult` object with the following fields:
|
|
167
|
+
|
|
168
|
+
### EngineResult
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
@dataclass
|
|
172
|
+
class EngineResult:
|
|
173
|
+
# Identifiers
|
|
174
|
+
run_id: str # Unique run identifier
|
|
175
|
+
report_id: Optional[str] # Report ID (if report created)
|
|
176
|
+
status: str # "pending", "processing", "completed", "failed"
|
|
177
|
+
|
|
178
|
+
# Dataset metadata
|
|
179
|
+
dataset_title: Optional[str] # Dataset title
|
|
180
|
+
dataset_description: Optional[str] # Dataset description
|
|
181
|
+
total_rows: Optional[int] # Number of rows in dataset
|
|
182
|
+
target_column: Optional[str] # Name of target column
|
|
183
|
+
task: Optional[str] # "regression", "binary_classification", or "multiclass_classification"
|
|
184
|
+
|
|
185
|
+
# LLM-generated summary
|
|
186
|
+
summary: Optional[Summary] # Summary object with overview, insights, etc.
|
|
187
|
+
|
|
188
|
+
# Discovered patterns
|
|
189
|
+
patterns: List[Pattern] # List of discovered patterns
|
|
190
|
+
|
|
191
|
+
# Column/feature information
|
|
192
|
+
columns: List[Column] # List of columns with statistics and importance
|
|
193
|
+
|
|
194
|
+
# Correlation matrix
|
|
195
|
+
correlation_matrix: List[CorrelationEntry] # Feature correlations
|
|
196
|
+
|
|
197
|
+
# Global feature importance
|
|
198
|
+
feature_importance: Optional[FeatureImportance] # Feature importance scores
|
|
199
|
+
|
|
200
|
+
# Job tracking
|
|
201
|
+
job_id: Optional[str] # Job ID for tracking processing
|
|
202
|
+
job_status: Optional[str] # Job status
|
|
203
|
+
error_message: Optional[str] # Error message if analysis failed
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
### Summary
|
|
207
|
+
|
|
208
|
+
```python
|
|
209
|
+
@dataclass
|
|
210
|
+
class Summary:
|
|
211
|
+
overview: str # High-level explanation of findings
|
|
212
|
+
key_insights: List[str] # List of main takeaways
|
|
213
|
+
novel_patterns: PatternGroup # Novel pattern explanations
|
|
214
|
+
surprising_findings: PatternGroup # Surprising findings
|
|
215
|
+
statistically_significant: PatternGroup # Statistically significant patterns
|
|
216
|
+
data_insights: Optional[DataInsights] # Important features, correlations
|
|
217
|
+
selected_pattern_id: Optional[str] # ID of selected pattern
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
### Pattern
|
|
221
|
+
|
|
222
|
+
```python
|
|
223
|
+
@dataclass
|
|
224
|
+
class Pattern:
|
|
225
|
+
id: str # Pattern identifier
|
|
226
|
+
task: str # Task type
|
|
227
|
+
target_column: str # Target column name
|
|
228
|
+
direction: str # "min" or "max"
|
|
229
|
+
p_value: float # Statistical p-value
|
|
230
|
+
conditions: List[Dict] # Pattern conditions (continuous, categorical, datetime)
|
|
231
|
+
lift_value: float # Lift value (how much the pattern increases/decreases target)
|
|
232
|
+
support_count: int # Number of rows matching pattern
|
|
233
|
+
support_percentage: float # Percentage of rows matching pattern
|
|
234
|
+
pattern_type: str # "validated" or "speculative"
|
|
235
|
+
novelty_type: str # "novel" or "confirmatory"
|
|
236
|
+
target_score: float # Target score for this pattern
|
|
237
|
+
description: str # Human-readable description
|
|
238
|
+
novelty_explanation: str # Explanation of novelty
|
|
239
|
+
target_class: Optional[str] # Target class (for classification)
|
|
240
|
+
target_mean: Optional[float] # Target mean (for regression)
|
|
241
|
+
target_std: Optional[float] # Target standard deviation
|
|
242
|
+
citations: List[Dict] # Academic citations
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
### Column
|
|
246
|
+
|
|
247
|
+
```python
|
|
248
|
+
@dataclass
|
|
249
|
+
class Column:
|
|
250
|
+
id: str # Column identifier
|
|
251
|
+
name: str # Column name
|
|
252
|
+
display_name: str # Display name
|
|
253
|
+
type: str # "continuous" or "categorical"
|
|
254
|
+
data_type: str # "int", "float", "string", "boolean", "datetime"
|
|
255
|
+
enabled: bool # Whether column is enabled
|
|
256
|
+
description: Optional[str] # Column description
|
|
257
|
+
|
|
258
|
+
# Statistics
|
|
259
|
+
mean: Optional[float] # Mean value
|
|
260
|
+
median: Optional[float] # Median value
|
|
261
|
+
std: Optional[float] # Standard deviation
|
|
262
|
+
min: Optional[float] # Minimum value
|
|
263
|
+
max: Optional[float] # Maximum value
|
|
264
|
+
iqr_min: Optional[float] # IQR minimum
|
|
265
|
+
iqr_max: Optional[float] # IQR maximum
|
|
266
|
+
mode: Optional[str] # Mode value
|
|
267
|
+
approx_unique: Optional[int] # Approximate unique count
|
|
268
|
+
null_percentage: Optional[float] # Percentage of null values
|
|
269
|
+
|
|
270
|
+
# Feature importance
|
|
271
|
+
feature_importance_score: Optional[float] # Feature importance score
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
### FeatureImportance
|
|
275
|
+
|
|
276
|
+
```python
|
|
277
|
+
@dataclass
|
|
278
|
+
class FeatureImportance:
|
|
279
|
+
kind: str # Feature importance type: "global"
|
|
280
|
+
baseline: float # Baseline model output
|
|
281
|
+
scores: List[FeatureImportanceScore] # List of feature scores
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
### CorrelationEntry
|
|
285
|
+
|
|
286
|
+
```python
|
|
287
|
+
@dataclass
|
|
288
|
+
class CorrelationEntry:
|
|
289
|
+
feature_x: str # First feature name
|
|
290
|
+
feature_y: str # Second feature name
|
|
291
|
+
value: float # Correlation value (-1 to 1)
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
### Pattern
|
|
295
|
+
|
|
296
|
+
```python
|
|
297
|
+
@dataclass
|
|
298
|
+
class Pattern:
|
|
299
|
+
id: str
|
|
300
|
+
task: str
|
|
301
|
+
target_column: str
|
|
302
|
+
direction: str # "min" or "max"
|
|
303
|
+
p_value: float
|
|
304
|
+
conditions: List[Dict] # Continuous, categorical, or datetime conditions
|
|
305
|
+
lift_value: float
|
|
306
|
+
support_count: int
|
|
307
|
+
support_percentage: float
|
|
308
|
+
pattern_type: str # "validated" or "speculative"
|
|
309
|
+
novelty_type: str # "novel" or "confirmatory"
|
|
310
|
+
target_score: float
|
|
311
|
+
description: str
|
|
312
|
+
novelty_explanation: str
|
|
313
|
+
target_class: Optional[str]
|
|
314
|
+
target_mean: Optional[float]
|
|
315
|
+
target_std: Optional[float]
|
|
316
|
+
citations: List[Dict]
|
|
317
|
+
```
|
|
318
|
+
|
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
# Discovery Engine Python API
|
|
2
|
+
|
|
3
|
+
The Discovery Engine Python API provides a simple programmatic interface to run analyses via Python, offering an alternative to using the web dashboard. Instead of uploading datasets and configuring analyses through the UI, you can automate your discovery workflows directly from your Python code or scripts.
|
|
4
|
+
|
|
5
|
+
All analyses run through the API are fully integrated with your Discovery Engine account. Results are automatically displayed in the dashboard, where you can view detailed reports, explore patterns, and share findings with your team. Your account management, credit balance, and subscription settings are all handled through the dashboard—the API is simply a convenient interface for programmatic access to the same powerful discovery engine.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install discovery-engine-api
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
For pandas DataFrame support:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install "discovery-engine-api[pandas]"
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
## Quick Start
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
from discovery import Engine
|
|
24
|
+
|
|
25
|
+
# Initialize engine
|
|
26
|
+
engine = Engine(api_key="your-api-key")
|
|
27
|
+
|
|
28
|
+
# Run analysis on a dataset and wait for results
|
|
29
|
+
result = engine.run(
|
|
30
|
+
file="data.csv",
|
|
31
|
+
target_column="diagnosis",
|
|
32
|
+
mode="fast",
|
|
33
|
+
description="Rare diseases dataset",
|
|
34
|
+
wait=True # Wait for completion and return full results
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
print(f"Run ID: {result.run_id}")
|
|
38
|
+
print(f"Status: {result.status}")
|
|
39
|
+
print(f"Found {len(result.patterns)} patterns")
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
## Examples
|
|
44
|
+
|
|
45
|
+
### Working with Pandas DataFrames
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
import pandas as pd
|
|
49
|
+
from discovery import Engine
|
|
50
|
+
|
|
51
|
+
df = pd.read_csv("data.csv")
|
|
52
|
+
# or create DataFrame directly
|
|
53
|
+
|
|
54
|
+
engine = Engine(api_key="your-api-key")
|
|
55
|
+
result = engine.run(
|
|
56
|
+
file=df, # Pass DataFrame directly
|
|
57
|
+
target_column="outcome",
|
|
58
|
+
column_descriptions={
|
|
59
|
+
"age": "Patient age in years",
|
|
60
|
+
"heart rate": None
|
|
61
|
+
},
|
|
62
|
+
wait=True
|
|
63
|
+
)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
### Async Workflow
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
import asyncio
|
|
71
|
+
from discovery import Engine
|
|
72
|
+
|
|
73
|
+
async def run_analysis():
|
|
74
|
+
async with Engine(api_key="your-api-key") as engine:
|
|
75
|
+
# Start analysis without waiting
|
|
76
|
+
result = await engine.run_async(
|
|
77
|
+
file="data.csv",
|
|
78
|
+
target_column="target",
|
|
79
|
+
wait=False
|
|
80
|
+
)
|
|
81
|
+
print(f"Started run: {result.run_id}")
|
|
82
|
+
|
|
83
|
+
# Later, get results
|
|
84
|
+
result = await engine.get_results(result.run_id)
|
|
85
|
+
|
|
86
|
+
# Or wait for completion
|
|
87
|
+
result = await engine.wait_for_completion(result.run_id, timeout=600)
|
|
88
|
+
return result
|
|
89
|
+
|
|
90
|
+
result = asyncio.run(run_analysis())
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
## Configuration Options
|
|
95
|
+
|
|
96
|
+
The `run()` and `run_async()` methods accept the following parameters:
|
|
97
|
+
|
|
98
|
+
| Parameter | Type | Default | Description |
|
|
99
|
+
|-----------|------|---------|-------------|
|
|
100
|
+
| `file` | `str`, `Path`, or `DataFrame` | **Required** | Dataset file path or pandas DataFrame |
|
|
101
|
+
| `target_column` | `str` | **Required** | Name of column to predict |
|
|
102
|
+
| `mode` | `"fast"` / `"deep"` | `"fast"` | Analysis depth |
|
|
103
|
+
| `title` | `str` | `None` | Optional dataset title |
|
|
104
|
+
| `description` | `str` | `None` | Optional dataset description |
|
|
105
|
+
| `column_descriptions` | `Dict[str, str]` | `None` | Optional column name -> description mapping |
|
|
106
|
+
| `task` | `str` | `None` | Override auto-detected task type: `"regression"`, `"binary_classification"`, or `"multiclass_classification"` |
|
|
107
|
+
| `visibility` | `"public"` / `"private"` | `"public"` | Dataset visibility (private requires credits) |
|
|
108
|
+
| `timeseries_groups` | `List[Dict]` | `None` | Timeseries column groups for feature extraction |
|
|
109
|
+
| `auto_report_use_llm_evals` | `bool` | `True` | Use LLM for pattern descriptions |
|
|
110
|
+
| `author` | `str` | `None` | Optional dataset author attribution |
|
|
111
|
+
| `source_url` | `str` | `None` | Optional source URL for dataset |
|
|
112
|
+
| `wait` | `bool` | `False` | Wait for analysis to complete and return full results |
|
|
113
|
+
| `wait_timeout` | `float` | `None` | Maximum seconds to wait for completion (only if `wait=True`) |
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
## Credits and Pricing
|
|
117
|
+
|
|
118
|
+
- **Public datasets**: Free (0 credits required)
|
|
119
|
+
- **Private datasets**:
|
|
120
|
+
- Fast mode: 1 credit per MB
|
|
121
|
+
- Deep mode: 3 credits per MB
|
|
122
|
+
|
|
123
|
+
If you don't have enough credits for a private run, the SDK will raise an `httpx.HTTPStatusError` with an error message like:
|
|
124
|
+
```
|
|
125
|
+
Insufficient credits. You need X credits but only have Y available.
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
**Solutions:**
|
|
129
|
+
1. Make your dataset public (set `visibility="public"`) - completely free
|
|
130
|
+
2. Visit [https://disco.leap-labs.com/account](https://disco.leap-labs.com/account) to:
|
|
131
|
+
- Purchase additional credits
|
|
132
|
+
- Upgrade to a subscription plan that includes more credits
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
## Return Value
|
|
136
|
+
|
|
137
|
+
The `run()` and `run_async()` methods return an `EngineResult` object with the following fields:
|
|
138
|
+
|
|
139
|
+
### EngineResult
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
@dataclass
|
|
143
|
+
class EngineResult:
|
|
144
|
+
# Identifiers
|
|
145
|
+
run_id: str # Unique run identifier
|
|
146
|
+
report_id: Optional[str] # Report ID (if report created)
|
|
147
|
+
status: str # "pending", "processing", "completed", "failed"
|
|
148
|
+
|
|
149
|
+
# Dataset metadata
|
|
150
|
+
dataset_title: Optional[str] # Dataset title
|
|
151
|
+
dataset_description: Optional[str] # Dataset description
|
|
152
|
+
total_rows: Optional[int] # Number of rows in dataset
|
|
153
|
+
target_column: Optional[str] # Name of target column
|
|
154
|
+
task: Optional[str] # "regression", "binary_classification", or "multiclass_classification"
|
|
155
|
+
|
|
156
|
+
# LLM-generated summary
|
|
157
|
+
summary: Optional[Summary] # Summary object with overview, insights, etc.
|
|
158
|
+
|
|
159
|
+
# Discovered patterns
|
|
160
|
+
patterns: List[Pattern] # List of discovered patterns
|
|
161
|
+
|
|
162
|
+
# Column/feature information
|
|
163
|
+
columns: List[Column] # List of columns with statistics and importance
|
|
164
|
+
|
|
165
|
+
# Correlation matrix
|
|
166
|
+
correlation_matrix: List[CorrelationEntry] # Feature correlations
|
|
167
|
+
|
|
168
|
+
# Global feature importance
|
|
169
|
+
feature_importance: Optional[FeatureImportance] # Feature importance scores
|
|
170
|
+
|
|
171
|
+
# Job tracking
|
|
172
|
+
job_id: Optional[str] # Job ID for tracking processing
|
|
173
|
+
job_status: Optional[str] # Job status
|
|
174
|
+
error_message: Optional[str] # Error message if analysis failed
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### Summary
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
@dataclass
|
|
181
|
+
class Summary:
|
|
182
|
+
overview: str # High-level explanation of findings
|
|
183
|
+
key_insights: List[str] # List of main takeaways
|
|
184
|
+
novel_patterns: PatternGroup # Novel pattern explanations
|
|
185
|
+
surprising_findings: PatternGroup # Surprising findings
|
|
186
|
+
statistically_significant: PatternGroup # Statistically significant patterns
|
|
187
|
+
data_insights: Optional[DataInsights] # Important features, correlations
|
|
188
|
+
selected_pattern_id: Optional[str] # ID of selected pattern
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
### Pattern
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
@dataclass
|
|
195
|
+
class Pattern:
|
|
196
|
+
id: str # Pattern identifier
|
|
197
|
+
task: str # Task type
|
|
198
|
+
target_column: str # Target column name
|
|
199
|
+
direction: str # "min" or "max"
|
|
200
|
+
p_value: float # Statistical p-value
|
|
201
|
+
conditions: List[Dict] # Pattern conditions (continuous, categorical, datetime)
|
|
202
|
+
lift_value: float # Lift value (how much the pattern increases/decreases target)
|
|
203
|
+
support_count: int # Number of rows matching pattern
|
|
204
|
+
support_percentage: float # Percentage of rows matching pattern
|
|
205
|
+
pattern_type: str # "validated" or "speculative"
|
|
206
|
+
novelty_type: str # "novel" or "confirmatory"
|
|
207
|
+
target_score: float # Target score for this pattern
|
|
208
|
+
description: str # Human-readable description
|
|
209
|
+
novelty_explanation: str # Explanation of novelty
|
|
210
|
+
target_class: Optional[str] # Target class (for classification)
|
|
211
|
+
target_mean: Optional[float] # Target mean (for regression)
|
|
212
|
+
target_std: Optional[float] # Target standard deviation
|
|
213
|
+
citations: List[Dict] # Academic citations
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
### Column
|
|
217
|
+
|
|
218
|
+
```python
|
|
219
|
+
@dataclass
|
|
220
|
+
class Column:
|
|
221
|
+
id: str # Column identifier
|
|
222
|
+
name: str # Column name
|
|
223
|
+
display_name: str # Display name
|
|
224
|
+
type: str # "continuous" or "categorical"
|
|
225
|
+
data_type: str # "int", "float", "string", "boolean", "datetime"
|
|
226
|
+
enabled: bool # Whether column is enabled
|
|
227
|
+
description: Optional[str] # Column description
|
|
228
|
+
|
|
229
|
+
# Statistics
|
|
230
|
+
mean: Optional[float] # Mean value
|
|
231
|
+
median: Optional[float] # Median value
|
|
232
|
+
std: Optional[float] # Standard deviation
|
|
233
|
+
min: Optional[float] # Minimum value
|
|
234
|
+
max: Optional[float] # Maximum value
|
|
235
|
+
iqr_min: Optional[float] # IQR minimum
|
|
236
|
+
iqr_max: Optional[float] # IQR maximum
|
|
237
|
+
mode: Optional[str] # Mode value
|
|
238
|
+
approx_unique: Optional[int] # Approximate unique count
|
|
239
|
+
null_percentage: Optional[float] # Percentage of null values
|
|
240
|
+
|
|
241
|
+
# Feature importance
|
|
242
|
+
feature_importance_score: Optional[float] # Feature importance score
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
### FeatureImportance
|
|
246
|
+
|
|
247
|
+
```python
|
|
248
|
+
@dataclass
|
|
249
|
+
class FeatureImportance:
|
|
250
|
+
kind: str # Feature importance type: "global"
|
|
251
|
+
baseline: float # Baseline model output
|
|
252
|
+
scores: List[FeatureImportanceScore] # List of feature scores
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
### CorrelationEntry
|
|
256
|
+
|
|
257
|
+
```python
|
|
258
|
+
@dataclass
|
|
259
|
+
class CorrelationEntry:
|
|
260
|
+
feature_x: str # First feature name
|
|
261
|
+
feature_y: str # Second feature name
|
|
262
|
+
value: float # Correlation value (-1 to 1)
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
### Pattern
|
|
266
|
+
|
|
267
|
+
```python
|
|
268
|
+
@dataclass
|
|
269
|
+
class Pattern:
|
|
270
|
+
id: str
|
|
271
|
+
task: str
|
|
272
|
+
target_column: str
|
|
273
|
+
direction: str # "min" or "max"
|
|
274
|
+
p_value: float
|
|
275
|
+
conditions: List[Dict] # Continuous, categorical, or datetime conditions
|
|
276
|
+
lift_value: float
|
|
277
|
+
support_count: int
|
|
278
|
+
support_percentage: float
|
|
279
|
+
pattern_type: str # "validated" or "speculative"
|
|
280
|
+
novelty_type: str # "novel" or "confirmatory"
|
|
281
|
+
target_score: float
|
|
282
|
+
description: str
|
|
283
|
+
novelty_explanation: str
|
|
284
|
+
target_class: Optional[str]
|
|
285
|
+
target_mean: Optional[float]
|
|
286
|
+
target_std: Optional[float]
|
|
287
|
+
citations: List[Dict]
|
|
288
|
+
```
|
|
289
|
+
|