discovery-engine-api 0.2.93__tar.gz → 0.2.94__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {discovery_engine_api-0.2.93 → discovery_engine_api-0.2.94}/PKG-INFO +102 -36
- {discovery_engine_api-0.2.93 → discovery_engine_api-0.2.94}/README.md +101 -35
- {discovery_engine_api-0.2.93 → discovery_engine_api-0.2.94}/discovery/__init__.py +1 -1
- discovery_engine_api-0.2.94/discovery/integrations/crewai.py +118 -0
- discovery_engine_api-0.2.94/discovery/integrations/langchain.py +122 -0
- {discovery_engine_api-0.2.93 → discovery_engine_api-0.2.94}/pyproject.toml +1 -1
- discovery_engine_api-0.2.93/discovery/integrations/crewai.py +0 -96
- discovery_engine_api-0.2.93/discovery/integrations/langchain.py +0 -105
- {discovery_engine_api-0.2.93 → discovery_engine_api-0.2.94}/.gitignore +0 -0
- {discovery_engine_api-0.2.93 → discovery_engine_api-0.2.94}/discovery/client.py +0 -0
- {discovery_engine_api-0.2.93 → discovery_engine_api-0.2.94}/discovery/errors.py +0 -0
- {discovery_engine_api-0.2.93 → discovery_engine_api-0.2.94}/discovery/integrations/__init__.py +0 -0
- {discovery_engine_api-0.2.93 → discovery_engine_api-0.2.94}/discovery/types.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: discovery-engine-api
|
|
3
|
-
Version: 0.2.93
|
|
3
|
+
Version: 0.2.94
|
|
4
4
|
Summary: Python SDK for Disco API
|
|
5
5
|
Project-URL: Homepage, https://www.leap-labs.com
|
|
6
6
|
Project-URL: Documentation, https://disco.leap-labs.com/llms-full.txt
|
|
@@ -78,19 +78,26 @@ Get your API key from the [Developers page](https://disco.leap-labs.com/develope
|
|
|
78
78
|
await engine.discover(
|
|
79
79
|
file: str | Path | pd.DataFrame, # Dataset to analyze
|
|
80
80
|
target_column: str, # Column to predict/analyze
|
|
81
|
-
analysis_depth: int = 2,
|
|
81
|
+
analysis_depth: int = 2, # 2=default, higher=deeper analysis
|
|
82
82
|
visibility: str = "public", # "public" (free) or "private" (credits)
|
|
83
83
|
title: str | None = None, # Dataset title
|
|
84
84
|
description: str | None = None, # Dataset description
|
|
85
85
|
column_descriptions: dict[str, str] | None = None, # Improves pattern explanations
|
|
86
|
-
excluded_columns: list[str] | None = None, # Columns to exclude
|
|
86
|
+
excluded_columns: list[str] | None = None, # Columns to exclude — see below
|
|
87
|
+
use_llms: bool = False, # True = LLM explanations (costs more) — see below
|
|
87
88
|
timeout: float = 1800, # Max seconds to wait
|
|
89
|
+
# Additional kwargs forwarded to run_async():
|
|
90
|
+
# task, author, source_url, timeseries_groups, ...
|
|
88
91
|
)
|
|
89
92
|
```
|
|
90
93
|
|
|
91
94
|
> **Tip:** Providing `column_descriptions` significantly improves pattern explanations. If your columns have non-obvious names, always describe them.
|
|
92
95
|
|
|
93
|
-
>
|
|
96
|
+
> **`use_llms`:** Default `False`. Enabling it is slower and more expensive, but you get smarter pre-processing, literature context, and novelty assessment. Set to `True` if you want Disco-generated pattern descriptions, novelty assessment with citations, and report summaries. **Public runs always use LLMs regardless of this setting.** What changes when `False`: pattern descriptions fall back to generic text, novelty is not assessed (all patterns marked confirmatory, no citations), report summaries are omitted, integer columns with few unique values (e.g. "month" 1-12, "hour" 0-23) may be misclassified as categorical instead of continuous, and high-cardinality text columns get generic cluster names instead of descriptive ones. Use `engine.estimate()` to check credit cost before running.
|
|
97
|
+
|
|
98
|
+
> **Visibility:** `"public"` runs are free but results are published, and analysis depth is locked to 2. `"private"` runs keep results confidential and consume credits.
|
|
99
|
+
|
|
100
|
+
> **`excluded_columns`:** Always exclude identifiers (row IDs, UUIDs), data leakage (target renamed/reformatted), and tautological columns (alternative encodings of the same construct as the target). For example, if your target is `serious`, exclude `serious_outcome`, `not_serious`, `death` — they're part of the same classification system.
|
|
94
101
|
|
|
95
102
|
|
|
96
103
|
## Examples
|
|
@@ -111,32 +118,25 @@ result = await engine.discover(
|
|
|
111
118
|
"age": "Patient age in years",
|
|
112
119
|
"bmi": "Body mass index",
|
|
113
120
|
},
|
|
114
|
-
excluded_columns=["patient_id", "timestamp"],
|
|
121
|
+
excluded_columns=["patient_id", "timestamp", "outcome_text"], # IDs + tautological
|
|
115
122
|
)
|
|
116
123
|
```
|
|
117
124
|
|
|
118
|
-
###
|
|
119
|
-
|
|
120
|
-
If you need to see the dataset's columns before choosing a target column, upload first and inspect:
|
|
125
|
+
### Running in the Background
|
|
121
126
|
|
|
122
|
-
|
|
123
|
-
# Upload once and get the server's parsed column list
|
|
124
|
-
upload = await engine.upload_file(file="data.csv", title="My dataset")
|
|
125
|
-
print(upload["columns"]) # [{"name": "col1", "type": "continuous", ...}, ...]
|
|
126
|
-
print(upload["rowCount"]) # e.g., 5000
|
|
127
|
+
Runs take 3–15 minutes. While waiting, the SDK logs progress automatically:
|
|
127
128
|
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
)
|
|
129
|
+
```
|
|
130
|
+
Waiting for run abc123 to complete...
|
|
131
|
+
Status: waiting (position 2 in queue) | Est. wait: ~8 min | Upgrade at disco.leap-labs.com/account for priority processing
|
|
132
|
+
Status: processing (preprocessing — Processing data...) | Elapsed: 34.2s | ETA: ~6 min
|
|
133
|
+
Status: processing (training — Modelling data...) | Elapsed: 98.7s | ETA: ~4 min
|
|
134
|
+
Status: processing (interpreting — Extracting patterns...) | Elapsed: 284.1s | ETA: ~2 min
|
|
135
|
+
Status: processing (reporting — Building report...) | Elapsed: 412.3s | ETA: ~1 min
|
|
136
|
+
Run completed in 467.8s
|
|
135
137
|
```
|
|
136
138
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
Runs take 3–15 minutes. If you need to do other work while Disco runs:
|
|
139
|
+
If you need to do other work while Disco runs:
|
|
140
140
|
|
|
141
141
|
```python
|
|
142
142
|
import asyncio
|
|
@@ -161,6 +161,29 @@ async def main():
|
|
|
161
161
|
result = asyncio.run(main())
|
|
162
162
|
```
|
|
163
163
|
|
|
164
|
+
### Inspecting Columns Before Running
|
|
165
|
+
|
|
166
|
+
If you need to see the dataset's columns before choosing a target column — e.g., when column names are not obvious — upload first, inspect, then run without re-uploading:
|
|
167
|
+
|
|
168
|
+
```python
|
|
169
|
+
# Upload once and get the server's parsed column list
|
|
170
|
+
upload = await engine.upload_file(file="data.csv", title="My dataset")
|
|
171
|
+
# upload["file"] -> {"key": "uploads/abc123.csv", "name": "data.csv",
|
|
172
|
+
# "size": 1048576, "fileHash": "sha256:..."}
|
|
173
|
+
# upload["columns"] -> [{"name": "col1", "type": "continuous", ...}, ...]
|
|
174
|
+
# upload["rowCount"] -> 5000
|
|
175
|
+
print(upload["columns"])
|
|
176
|
+
print(upload["rowCount"])
|
|
177
|
+
|
|
178
|
+
# Pass the result to avoid re-uploading
|
|
179
|
+
result = await engine.run_async(
|
|
180
|
+
file="data.csv",
|
|
181
|
+
target_column="col1",
|
|
182
|
+
wait=True,
|
|
183
|
+
upload_result=upload, # skips the upload step
|
|
184
|
+
)
|
|
185
|
+
```
|
|
186
|
+
|
|
164
187
|
### Synchronous Usage
|
|
165
188
|
|
|
166
189
|
For scripts and Jupyter notebooks:
|
|
@@ -212,7 +235,7 @@ print(f"Explore: {result.report_url}")
|
|
|
212
235
|
|
|
213
236
|
## Credits and Pricing
|
|
214
237
|
|
|
215
|
-
- **Public runs**: Free. Results published to public gallery.
|
|
238
|
+
- **Public runs**: Free. Results published to public gallery. Locked to depth=2.
|
|
216
239
|
- **Private runs**: Credits scale with file size, depth, and run configuration. $0.10 per credit. Use `engine.estimate()` to check cost before running.
|
|
217
240
|
|
|
218
241
|
```python
|
|
@@ -223,13 +246,27 @@ estimate = await engine.estimate(
|
|
|
223
246
|
analysis_depth=2,
|
|
224
247
|
visibility="private",
|
|
225
248
|
)
|
|
226
|
-
# estimate["cost"]["credits"]
|
|
227
|
-
# estimate["
|
|
249
|
+
# estimate["cost"]["credits"] -> 55
|
|
250
|
+
# estimate["cost"]["price_usd"] -> 5.5
|
|
251
|
+
# estimate["time_estimate"]["estimated_seconds"] -> 360
|
|
252
|
+
# estimate["account"]["sufficient"] -> True/False
|
|
253
|
+
# estimate["limits"]["max_analysis_depth"] -> 23 (num_columns - 2)
|
|
228
254
|
```
|
|
229
255
|
|
|
230
256
|
Manage credits and plans at [disco.leap-labs.com/account](https://disco.leap-labs.com/account).
|
|
231
257
|
|
|
232
258
|
|
|
259
|
+
## Expected Data Format
|
|
260
|
+
|
|
261
|
+
Disco expects a **flat table** — columns for features, rows for samples.
|
|
262
|
+
|
|
263
|
+
- **One row per observation** — a patient, a sample, a transaction, a measurement, etc.
|
|
264
|
+
- **One column per feature** — numeric, categorical, datetime, or free text are all fine
|
|
265
|
+
- **One target column** — the outcome to analyze. Must have at least 2 distinct values.
|
|
266
|
+
- **Missing values are OK** — Disco handles them automatically. Don't drop rows or impute beforehand.
|
|
267
|
+
|
|
268
|
+
Not supported: images, raw text documents, nested/hierarchical JSON, multi-sheet Excel (use the first sheet or export to CSV).
|
|
269
|
+
|
|
233
270
|
## File Size Limits
|
|
234
271
|
|
|
235
272
|
Uploads up to **5 GB**. Files are uploaded directly to cloud storage using presigned URLs.
|
|
@@ -245,16 +282,30 @@ Supported formats: **CSV**, **TSV**, **Excel (.xlsx)**, **JSON**, **Parquet**, *
|
|
|
245
282
|
@dataclass
|
|
246
283
|
class EngineResult:
|
|
247
284
|
run_id: str
|
|
285
|
+
report_id: str | None # Report UUID (used in report_url)
|
|
248
286
|
status: str # "pending", "processing", "completed", "failed"
|
|
287
|
+
dataset_title: str | None # Title of the dataset
|
|
288
|
+
dataset_description: str | None # Description of the dataset
|
|
289
|
+
total_rows: int | None
|
|
290
|
+
target_column: str | None # Column being predicted/analyzed
|
|
291
|
+
task: str | None # "regression", "binary_classification", "multiclass_classification"
|
|
249
292
|
summary: Summary | None # LLM-generated insights
|
|
250
293
|
patterns: list[Pattern] # Discovered patterns (the core output)
|
|
251
294
|
columns: list[Column] # Feature info and statistics
|
|
252
|
-
feature_importance: FeatureImportance | None # Global importance scores
|
|
253
295
|
correlation_matrix: list[CorrelationEntry] # Feature correlations
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
296
|
+
feature_importance: FeatureImportance | None # Global importance scores
|
|
297
|
+
job_id: str | None # Job ID for tracking
|
|
298
|
+
job_status: str | None # Job queue status
|
|
299
|
+
queue_position: int | None # Position in queue when pending (1 = next up)
|
|
300
|
+
current_step: str | None # Active pipeline step (preprocessing, training, interpreting, reporting)
|
|
301
|
+
current_step_message: str | None # Human-readable description of the current step
|
|
302
|
+
estimated_seconds: int | None # Estimated total processing time in seconds
|
|
303
|
+
estimated_wait_seconds: int | None # Estimated queue wait time in seconds (pending only)
|
|
257
304
|
error_message: str | None
|
|
305
|
+
report_url: str | None # Shareable link to interactive web report
|
|
306
|
+
hints: list[str] # Upgrade hints (non-empty for free-tier users with hidden patterns)
|
|
307
|
+
hidden_deep_count: int # Patterns hidden for free-tier accounts (upgrade to see all)
|
|
308
|
+
hidden_deep_novel_count: int # Novel patterns hidden for free-tier accounts
|
|
258
309
|
```
|
|
259
310
|
|
|
260
311
|
### Pattern
|
|
@@ -263,6 +314,8 @@ class EngineResult:
|
|
|
263
314
|
@dataclass
|
|
264
315
|
class Pattern:
|
|
265
316
|
id: str
|
|
317
|
+
task: str # "regression", "binary_classification", "multiclass_classification"
|
|
318
|
+
target_column: str # Column being analyzed
|
|
266
319
|
description: str # Human-readable description
|
|
267
320
|
conditions: list[dict] # Conditions defining the pattern
|
|
268
321
|
p_value: float # FDR-adjusted p-value
|
|
@@ -272,8 +325,10 @@ class Pattern:
|
|
|
272
325
|
citations: list[dict] # Academic citations
|
|
273
326
|
target_change_direction: str # "max" (increases target) or "min" (decreases)
|
|
274
327
|
abs_target_change: float # Magnitude of effect
|
|
328
|
+
target_score: float # Mean target value (regression) or class fraction (classification) in the subgroup
|
|
275
329
|
support_count: int # Rows matching this pattern
|
|
276
330
|
support_percentage: float # Percentage of dataset
|
|
331
|
+
target_class: str | None # For classification tasks
|
|
277
332
|
target_mean: float | None # For regression tasks
|
|
278
333
|
target_std: float | None
|
|
279
334
|
```
|
|
@@ -323,6 +378,7 @@ class Summary:
|
|
|
323
378
|
overview: str # High-level summary of findings
|
|
324
379
|
key_insights: list[str] # Main takeaways
|
|
325
380
|
novel_patterns: PatternGroup # Novel pattern IDs and explanation
|
|
381
|
+
selected_pattern_id: str | None # ID of the highlighted/featured pattern
|
|
326
382
|
```
|
|
327
383
|
|
|
328
384
|
### Column
|
|
@@ -342,17 +398,22 @@ class Column:
|
|
|
342
398
|
std: float | None
|
|
343
399
|
min: float | None
|
|
344
400
|
max: float | None
|
|
401
|
+
iqr_min: float | None # 25th percentile
|
|
402
|
+
iqr_max: float | None # 75th percentile
|
|
403
|
+
mode: str | None # Most common value (categorical columns)
|
|
404
|
+
approx_unique: int | None # Approximate distinct value count
|
|
405
|
+
null_percentage: float | None
|
|
345
406
|
feature_importance_score: float | None # Signed importance score
|
|
346
407
|
```
|
|
347
408
|
|
|
348
409
|
### FeatureImportance
|
|
349
410
|
|
|
350
|
-
|
|
411
|
+
Scores are **signed** — positive means the feature increases the prediction, negative means it decreases it.
|
|
351
412
|
|
|
352
413
|
```python
|
|
353
414
|
@dataclass
|
|
354
415
|
class FeatureImportance:
|
|
355
|
-
kind: str # "global"
|
|
416
|
+
kind: str # "global" | "local"
|
|
356
417
|
baseline: float # Baseline model output
|
|
357
418
|
scores: list[FeatureImportanceScore]
|
|
358
419
|
|
|
@@ -366,12 +427,13 @@ class FeatureImportanceScore:
|
|
|
366
427
|
## Error Handling
|
|
367
428
|
|
|
368
429
|
```python
|
|
369
|
-
from discovery import
|
|
370
|
-
|
|
430
|
+
from discovery import Engine
|
|
431
|
+
from discovery.errors import (
|
|
371
432
|
AuthenticationError,
|
|
372
433
|
InsufficientCreditsError,
|
|
373
434
|
RateLimitError,
|
|
374
435
|
RunFailedError,
|
|
436
|
+
RunNotFoundError,
|
|
375
437
|
PaymentRequiredError,
|
|
376
438
|
)
|
|
377
439
|
|
|
@@ -381,11 +443,15 @@ except AuthenticationError as e:
|
|
|
381
443
|
print(e.suggestion) # "Check your API key at https://disco.leap-labs.com/developers"
|
|
382
444
|
except InsufficientCreditsError as e:
|
|
383
445
|
print(f"Need {e.credits_required}, have {e.credits_available}")
|
|
384
|
-
print(e.suggestion) # "
|
|
446
|
+
print(e.suggestion) # "Run with visibility='public' (free, results published) or purchase credits with engine.purchase_credits()."
|
|
385
447
|
except RateLimitError as e:
|
|
386
448
|
print(f"Retry after {e.retry_after} seconds")
|
|
387
449
|
except RunFailedError as e:
|
|
388
450
|
print(f"Run {e.run_id} failed: {e}")
|
|
451
|
+
except RunNotFoundError as e:
|
|
452
|
+
print(f"Run {e.run_id} not found — may have been cleaned up")
|
|
453
|
+
except PaymentRequiredError as e:
|
|
454
|
+
print(e.suggestion) # "Attach a payment method with engine.add_payment_method(...)"
|
|
389
455
|
except TimeoutError:
|
|
390
456
|
pass # Retrieve later with engine.wait_for_completion(run_id)
|
|
391
457
|
```
|
|
@@ -395,7 +461,7 @@ All errors include a `suggestion` field with actionable instructions.
|
|
|
395
461
|
|
|
396
462
|
## MCP Server
|
|
397
463
|
|
|
398
|
-
Disco is available as an [MCP server](https://disco.leap-labs.com/.well-known/mcp.json) with tools for the full discovery lifecycle — estimate, analyze, check status, get results, manage account.
|
|
464
|
+
Disco is available as an [MCP server](https://disco.leap-labs.com/.well-known/mcp.json) with tools for the full discovery lifecycle — estimate, analyze, check status, get results, manage account. To subscribe or purchase credits via MCP, call `discovery_add_payment_method` first to attach a Stripe payment method.
|
|
399
465
|
|
|
400
466
|
```json
|
|
401
467
|
{
|
|
@@ -41,19 +41,26 @@ Get your API key from the [Developers page](https://disco.leap-labs.com/develope
|
|
|
41
41
|
await engine.discover(
|
|
42
42
|
file: str | Path | pd.DataFrame, # Dataset to analyze
|
|
43
43
|
target_column: str, # Column to predict/analyze
|
|
44
|
-
analysis_depth: int = 2,
|
|
44
|
+
analysis_depth: int = 2, # 2=default, higher=deeper analysis
|
|
45
45
|
visibility: str = "public", # "public" (free) or "private" (credits)
|
|
46
46
|
title: str | None = None, # Dataset title
|
|
47
47
|
description: str | None = None, # Dataset description
|
|
48
48
|
column_descriptions: dict[str, str] | None = None, # Improves pattern explanations
|
|
49
|
-
excluded_columns: list[str] | None = None, # Columns to exclude
|
|
49
|
+
excluded_columns: list[str] | None = None, # Columns to exclude — see below
|
|
50
|
+
use_llms: bool = False, # True = LLM explanations (costs more) — see below
|
|
50
51
|
timeout: float = 1800, # Max seconds to wait
|
|
52
|
+
# Additional kwargs forwarded to run_async():
|
|
53
|
+
# task, author, source_url, timeseries_groups, ...
|
|
51
54
|
)
|
|
52
55
|
```
|
|
53
56
|
|
|
54
57
|
> **Tip:** Providing `column_descriptions` significantly improves pattern explanations. If your columns have non-obvious names, always describe them.
|
|
55
58
|
|
|
56
|
-
>
|
|
59
|
+
> **`use_llms`:** Default `False`. Enabling it is slower and more expensive, but you get smarter pre-processing, literature context, and novelty assessment. Set to `True` if you want Disco-generated pattern descriptions, novelty assessment with citations, and report summaries. **Public runs always use LLMs regardless of this setting.** What changes when `False`: pattern descriptions fall back to generic text, novelty is not assessed (all patterns marked confirmatory, no citations), report summaries are omitted, integer columns with few unique values (e.g. "month" 1-12, "hour" 0-23) may be misclassified as categorical instead of continuous, and high-cardinality text columns get generic cluster names instead of descriptive ones. Use `engine.estimate()` to check credit cost before running.
|
|
60
|
+
|
|
61
|
+
> **Visibility:** `"public"` runs are free but results are published, and analysis depth is locked to 2. `"private"` runs keep results confidential and consume credits.
|
|
62
|
+
|
|
63
|
+
> **`excluded_columns`:** Always exclude identifiers (row IDs, UUIDs), data leakage (target renamed/reformatted), and tautological columns (alternative encodings of the same construct as the target). For example, if your target is `serious`, exclude `serious_outcome`, `not_serious`, `death` — they're part of the same classification system.
|
|
57
64
|
|
|
58
65
|
|
|
59
66
|
## Examples
|
|
@@ -74,32 +81,25 @@ result = await engine.discover(
|
|
|
74
81
|
"age": "Patient age in years",
|
|
75
82
|
"bmi": "Body mass index",
|
|
76
83
|
},
|
|
77
|
-
excluded_columns=["patient_id", "timestamp"],
|
|
84
|
+
excluded_columns=["patient_id", "timestamp", "outcome_text"], # IDs + tautological
|
|
78
85
|
)
|
|
79
86
|
```
|
|
80
87
|
|
|
81
|
-
###
|
|
82
|
-
|
|
83
|
-
If you need to see the dataset's columns before choosing a target column, upload first and inspect:
|
|
88
|
+
### Running in the Background
|
|
84
89
|
|
|
85
|
-
|
|
86
|
-
# Upload once and get the server's parsed column list
|
|
87
|
-
upload = await engine.upload_file(file="data.csv", title="My dataset")
|
|
88
|
-
print(upload["columns"]) # [{"name": "col1", "type": "continuous", ...}, ...]
|
|
89
|
-
print(upload["rowCount"]) # e.g., 5000
|
|
90
|
+
Runs take 3–15 minutes. While waiting, the SDK logs progress automatically:
|
|
90
91
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
)
|
|
92
|
+
```
|
|
93
|
+
Waiting for run abc123 to complete...
|
|
94
|
+
Status: waiting (position 2 in queue) | Est. wait: ~8 min | Upgrade at disco.leap-labs.com/account for priority processing
|
|
95
|
+
Status: processing (preprocessing — Processing data...) | Elapsed: 34.2s | ETA: ~6 min
|
|
96
|
+
Status: processing (training — Modelling data...) | Elapsed: 98.7s | ETA: ~4 min
|
|
97
|
+
Status: processing (interpreting — Extracting patterns...) | Elapsed: 284.1s | ETA: ~2 min
|
|
98
|
+
Status: processing (reporting — Building report...) | Elapsed: 412.3s | ETA: ~1 min
|
|
99
|
+
Run completed in 467.8s
|
|
98
100
|
```
|
|
99
101
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
Runs take 3–15 minutes. If you need to do other work while Disco runs:
|
|
102
|
+
If you need to do other work while Disco runs:
|
|
103
103
|
|
|
104
104
|
```python
|
|
105
105
|
import asyncio
|
|
@@ -124,6 +124,29 @@ async def main():
|
|
|
124
124
|
result = asyncio.run(main())
|
|
125
125
|
```
|
|
126
126
|
|
|
127
|
+
### Inspecting Columns Before Running
|
|
128
|
+
|
|
129
|
+
If you need to see the dataset's columns before choosing a target column — e.g., when column names are not obvious — upload first, inspect, then run without re-uploading:
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
# Upload once and get the server's parsed column list
|
|
133
|
+
upload = await engine.upload_file(file="data.csv", title="My dataset")
|
|
134
|
+
# upload["file"] -> {"key": "uploads/abc123.csv", "name": "data.csv",
|
|
135
|
+
# "size": 1048576, "fileHash": "sha256:..."}
|
|
136
|
+
# upload["columns"] -> [{"name": "col1", "type": "continuous", ...}, ...]
|
|
137
|
+
# upload["rowCount"] -> 5000
|
|
138
|
+
print(upload["columns"])
|
|
139
|
+
print(upload["rowCount"])
|
|
140
|
+
|
|
141
|
+
# Pass the result to avoid re-uploading
|
|
142
|
+
result = await engine.run_async(
|
|
143
|
+
file="data.csv",
|
|
144
|
+
target_column="col1",
|
|
145
|
+
wait=True,
|
|
146
|
+
upload_result=upload, # skips the upload step
|
|
147
|
+
)
|
|
148
|
+
```
|
|
149
|
+
|
|
127
150
|
### Synchronous Usage
|
|
128
151
|
|
|
129
152
|
For scripts and Jupyter notebooks:
|
|
@@ -175,7 +198,7 @@ print(f"Explore: {result.report_url}")
|
|
|
175
198
|
|
|
176
199
|
## Credits and Pricing
|
|
177
200
|
|
|
178
|
-
- **Public runs**: Free. Results published to public gallery.
|
|
201
|
+
- **Public runs**: Free. Results published to public gallery. Locked to depth=2.
|
|
179
202
|
- **Private runs**: Credits scale with file size, depth, and run configuration. $0.10 per credit. Use `engine.estimate()` to check cost before running.
|
|
180
203
|
|
|
181
204
|
```python
|
|
@@ -186,13 +209,27 @@ estimate = await engine.estimate(
|
|
|
186
209
|
analysis_depth=2,
|
|
187
210
|
visibility="private",
|
|
188
211
|
)
|
|
189
|
-
# estimate["cost"]["credits"]
|
|
190
|
-
# estimate["
|
|
212
|
+
# estimate["cost"]["credits"] -> 55
|
|
213
|
+
# estimate["cost"]["price_usd"] -> 5.5
|
|
214
|
+
# estimate["time_estimate"]["estimated_seconds"] -> 360
|
|
215
|
+
# estimate["account"]["sufficient"] -> True/False
|
|
216
|
+
# estimate["limits"]["max_analysis_depth"] -> 23 (num_columns - 2)
|
|
191
217
|
```
|
|
192
218
|
|
|
193
219
|
Manage credits and plans at [disco.leap-labs.com/account](https://disco.leap-labs.com/account).
|
|
194
220
|
|
|
195
221
|
|
|
222
|
+
## Expected Data Format
|
|
223
|
+
|
|
224
|
+
Disco expects a **flat table** — columns for features, rows for samples.
|
|
225
|
+
|
|
226
|
+
- **One row per observation** — a patient, a sample, a transaction, a measurement, etc.
|
|
227
|
+
- **One column per feature** — numeric, categorical, datetime, or free text are all fine
|
|
228
|
+
- **One target column** — the outcome to analyze. Must have at least 2 distinct values.
|
|
229
|
+
- **Missing values are OK** — Disco handles them automatically. Don't drop rows or impute beforehand.
|
|
230
|
+
|
|
231
|
+
Not supported: images, raw text documents, nested/hierarchical JSON, multi-sheet Excel (use the first sheet or export to CSV).
|
|
232
|
+
|
|
196
233
|
## File Size Limits
|
|
197
234
|
|
|
198
235
|
Uploads up to **5 GB**. Files are uploaded directly to cloud storage using presigned URLs.
|
|
@@ -208,16 +245,30 @@ Supported formats: **CSV**, **TSV**, **Excel (.xlsx)**, **JSON**, **Parquet**, *
|
|
|
208
245
|
@dataclass
|
|
209
246
|
class EngineResult:
|
|
210
247
|
run_id: str
|
|
248
|
+
report_id: str | None # Report UUID (used in report_url)
|
|
211
249
|
status: str # "pending", "processing", "completed", "failed"
|
|
250
|
+
dataset_title: str | None # Title of the dataset
|
|
251
|
+
dataset_description: str | None # Description of the dataset
|
|
252
|
+
total_rows: int | None
|
|
253
|
+
target_column: str | None # Column being predicted/analyzed
|
|
254
|
+
task: str | None # "regression", "binary_classification", "multiclass_classification"
|
|
212
255
|
summary: Summary | None # LLM-generated insights
|
|
213
256
|
patterns: list[Pattern] # Discovered patterns (the core output)
|
|
214
257
|
columns: list[Column] # Feature info and statistics
|
|
215
|
-
feature_importance: FeatureImportance | None # Global importance scores
|
|
216
258
|
correlation_matrix: list[CorrelationEntry] # Feature correlations
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
259
|
+
feature_importance: FeatureImportance | None # Global importance scores
|
|
260
|
+
job_id: str | None # Job ID for tracking
|
|
261
|
+
job_status: str | None # Job queue status
|
|
262
|
+
queue_position: int | None # Position in queue when pending (1 = next up)
|
|
263
|
+
current_step: str | None # Active pipeline step (preprocessing, training, interpreting, reporting)
|
|
264
|
+
current_step_message: str | None # Human-readable description of the current step
|
|
265
|
+
estimated_seconds: int | None # Estimated total processing time in seconds
|
|
266
|
+
estimated_wait_seconds: int | None # Estimated queue wait time in seconds (pending only)
|
|
220
267
|
error_message: str | None
|
|
268
|
+
report_url: str | None # Shareable link to interactive web report
|
|
269
|
+
hints: list[str] # Upgrade hints (non-empty for free-tier users with hidden patterns)
|
|
270
|
+
hidden_deep_count: int # Patterns hidden for free-tier accounts (upgrade to see all)
|
|
271
|
+
hidden_deep_novel_count: int # Novel patterns hidden for free-tier accounts
|
|
221
272
|
```
|
|
222
273
|
|
|
223
274
|
### Pattern
|
|
@@ -226,6 +277,8 @@ class EngineResult:
|
|
|
226
277
|
@dataclass
|
|
227
278
|
class Pattern:
|
|
228
279
|
id: str
|
|
280
|
+
task: str # "regression", "binary_classification", "multiclass_classification"
|
|
281
|
+
target_column: str # Column being analyzed
|
|
229
282
|
description: str # Human-readable description
|
|
230
283
|
conditions: list[dict] # Conditions defining the pattern
|
|
231
284
|
p_value: float # FDR-adjusted p-value
|
|
@@ -235,8 +288,10 @@ class Pattern:
|
|
|
235
288
|
citations: list[dict] # Academic citations
|
|
236
289
|
target_change_direction: str # "max" (increases target) or "min" (decreases)
|
|
237
290
|
abs_target_change: float # Magnitude of effect
|
|
291
|
+
target_score: float # Mean target value (regression) or class fraction (classification) in the subgroup
|
|
238
292
|
support_count: int # Rows matching this pattern
|
|
239
293
|
support_percentage: float # Percentage of dataset
|
|
294
|
+
target_class: str | None # For classification tasks
|
|
240
295
|
target_mean: float | None # For regression tasks
|
|
241
296
|
target_std: float | None
|
|
242
297
|
```
|
|
@@ -286,6 +341,7 @@ class Summary:
|
|
|
286
341
|
overview: str # High-level summary of findings
|
|
287
342
|
key_insights: list[str] # Main takeaways
|
|
288
343
|
novel_patterns: PatternGroup # Novel pattern IDs and explanation
|
|
344
|
+
selected_pattern_id: str | None # ID of the highlighted/featured pattern
|
|
289
345
|
```
|
|
290
346
|
|
|
291
347
|
### Column
|
|
@@ -305,17 +361,22 @@ class Column:
|
|
|
305
361
|
std: float | None
|
|
306
362
|
min: float | None
|
|
307
363
|
max: float | None
|
|
364
|
+
iqr_min: float | None # 25th percentile
|
|
365
|
+
iqr_max: float | None # 75th percentile
|
|
366
|
+
mode: str | None # Most common value (categorical columns)
|
|
367
|
+
approx_unique: int | None # Approximate distinct value count
|
|
368
|
+
null_percentage: float | None
|
|
308
369
|
feature_importance_score: float | None # Signed importance score
|
|
309
370
|
```
|
|
310
371
|
|
|
311
372
|
### FeatureImportance
|
|
312
373
|
|
|
313
|
-
|
|
374
|
+
Scores are **signed** — positive means the feature increases the prediction, negative means it decreases it.
|
|
314
375
|
|
|
315
376
|
```python
|
|
316
377
|
@dataclass
|
|
317
378
|
class FeatureImportance:
|
|
318
|
-
kind: str # "global"
|
|
379
|
+
kind: str # "global" | "local"
|
|
319
380
|
baseline: float # Baseline model output
|
|
320
381
|
scores: list[FeatureImportanceScore]
|
|
321
382
|
|
|
@@ -329,12 +390,13 @@ class FeatureImportanceScore:
|
|
|
329
390
|
## Error Handling
|
|
330
391
|
|
|
331
392
|
```python
|
|
332
|
-
from discovery import
|
|
333
|
-
|
|
393
|
+
from discovery import Engine
|
|
394
|
+
from discovery.errors import (
|
|
334
395
|
AuthenticationError,
|
|
335
396
|
InsufficientCreditsError,
|
|
336
397
|
RateLimitError,
|
|
337
398
|
RunFailedError,
|
|
399
|
+
RunNotFoundError,
|
|
338
400
|
PaymentRequiredError,
|
|
339
401
|
)
|
|
340
402
|
|
|
@@ -344,11 +406,15 @@ except AuthenticationError as e:
|
|
|
344
406
|
print(e.suggestion) # "Check your API key at https://disco.leap-labs.com/developers"
|
|
345
407
|
except InsufficientCreditsError as e:
|
|
346
408
|
print(f"Need {e.credits_required}, have {e.credits_available}")
|
|
347
|
-
print(e.suggestion) # "
|
|
409
|
+
print(e.suggestion) # "Run with visibility='public' (free, results published) or purchase credits with engine.purchase_credits()."
|
|
348
410
|
except RateLimitError as e:
|
|
349
411
|
print(f"Retry after {e.retry_after} seconds")
|
|
350
412
|
except RunFailedError as e:
|
|
351
413
|
print(f"Run {e.run_id} failed: {e}")
|
|
414
|
+
except RunNotFoundError as e:
|
|
415
|
+
print(f"Run {e.run_id} not found — may have been cleaned up")
|
|
416
|
+
except PaymentRequiredError as e:
|
|
417
|
+
print(e.suggestion) # "Attach a payment method with engine.add_payment_method(...)"
|
|
352
418
|
except TimeoutError:
|
|
353
419
|
pass # Retrieve later with engine.wait_for_completion(run_id)
|
|
354
420
|
```
|
|
@@ -358,7 +424,7 @@ All errors include a `suggestion` field with actionable instructions.
|
|
|
358
424
|
|
|
359
425
|
## MCP Server
|
|
360
426
|
|
|
361
|
-
Disco is available as an [MCP server](https://disco.leap-labs.com/.well-known/mcp.json) with tools for the full discovery lifecycle — estimate, analyze, check status, get results, manage account.
|
|
427
|
+
Disco is available as an [MCP server](https://disco.leap-labs.com/.well-known/mcp.json) with tools for the full discovery lifecycle — estimate, analyze, check status, get results, manage account. To subscribe or purchase credits via MCP, call `discovery_add_payment_method` first to attach a Stripe payment method.
|
|
362
428
|
|
|
363
429
|
```json
|
|
364
430
|
{
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""CrewAI tool wrapper for Disco (Discovery Engine).
|
|
2
|
+
|
|
3
|
+
Install: pip install discovery-engine-api crewai
|
|
4
|
+
Usage:
|
|
5
|
+
from discovery.integrations.crewai import DiscoTool
|
|
6
|
+
tool = DiscoTool(api_key="disco_...")
|
|
7
|
+
agent = Agent(tools=[tool], ...)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from crewai.tools import BaseTool
|
|
16
|
+
from pydantic import BaseModel, Field
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DiscoInput(BaseModel):
    """Input for the Disco discovery tool."""

    # NOTE: the class docstring and every `description` below are surfaced in
    # the generated argument schema, so their text is kept exactly as-is; only
    # the source layout (implicit string concatenation) differs.

    # --- required arguments -------------------------------------------------
    file_url: str = Field(
        description=(
            "URL of the tabular dataset to analyse "
            "(CSV, Excel, Parquet, JSON, etc.)"
        ),
    )
    target_column: str = Field(
        description=(
            "The column to predict/explain — "
            "the outcome you want to understand"
        ),
    )

    # --- optional arguments (defaults mirror DiscoTool._run) ----------------
    visibility: str = Field(
        default="public",
        description=(
            "'public' (free, results published) or "
            "'private' (costs credits, results private)"
        ),
    )
    analysis_depth: int = Field(
        default=2,
        description=(
            "Analysis depth — higher means deeper analysis but more credits. "
            "Default 2."
        ),
    )
    excluded_columns: list[str] = Field(
        # A factory gives every validated input its own fresh list.
        default_factory=list,
        description="Columns to exclude (IDs, data leakage, tautological columns)",
    )
    use_llms: bool = Field(
        default=False,
        description=(
            "If True, enables LLM-powered summaries, literature context, and "
            "novelty assessment. Slower and more expensive. "
            "Public runs always use LLMs."
        ),
    )
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class DiscoTool(BaseTool):
    """Superhuman exploratory data analysis.

    Disco finds novel, statistically validated patterns in tabular data — the
    feature interactions, subgroup effects, and conditional relationships that
    correlation analysis, LLMs, and manual exploration miss. Every finding comes
    with p-values, effect sizes, and academic literature citations.

    Free for public data. No ML expertise required.
    """

    name: str = "disco"
    description: str = (
        "Automated scientific discovery from tabular data. "
        "Use when you need to find patterns, interactions, or subgroup effects "
        "in a dataset — especially when you don't know what to look for. "
        "Returns statistically validated patterns with p-values, effect sizes, "
        "and literature citations. "
        "Free for public data."
    )
    args_schema: type[BaseModel] = DiscoInput
    # Disco API key; handed to discovery.Engine on every run.
    api_key: str = ""

    def __init__(self, api_key: str, **kwargs: Any):
        """Build the tool with a required API key; other kwargs go to BaseTool."""
        super().__init__(api_key=api_key, **kwargs)

    def _run(
        self,
        file_url: str,
        target_column: str,
        visibility: str = "public",
        analysis_depth: int = 2,
        excluded_columns: list[str] | None = None,
        use_llms: bool = False,
    ) -> str:
        """Run one synchronous discovery and return the findings as JSON text.

        The returned document has the keys ``report_url``, ``pattern_count``
        and ``patterns``, plus ``summary`` when the result carries a truthy
        summary. Exceptions raised by the engine are not caught here.
        """
        # Imported at call time, so importing this module does not itself
        # import the `discovery` package.
        from discovery import Engine

        run_result = Engine(api_key=self.api_key).discover_sync(
            file=file_url,
            target_column=target_column,
            visibility=visibility,
            analysis_depth=analysis_depth,
            # The engine is always given a list, never None.
            excluded_columns=excluded_columns or [],
            use_llms=use_llms,
        )

        findings = [
            {
                "description": pattern.description,
                "conditions": pattern.conditions,
                "p_value": pattern.p_value,
                "effect_size": pattern.abs_target_change,
                "direction": pattern.target_change_direction,
                "support_count": pattern.support_count,
                "support_percentage": pattern.support_percentage,
                "novelty": pattern.novelty_type,
                "novelty_explanation": pattern.novelty_explanation,
                "citations": pattern.citations,
            }
            for pattern in run_result.patterns
        ]

        payload = {
            "report_url": run_result.report_url,
            "pattern_count": len(findings),
            "patterns": findings,
        }

        # The summary is optional: skip it when the attribute is missing or falsy.
        summary = getattr(run_result, "summary", None)
        if summary:
            payload["summary"] = summary.overview

        # default=str stringifies any value json cannot encode natively.
        return json.dumps(payload, indent=2, default=str)
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
"""LangChain tool wrapper for Disco (Discovery Engine).
|
|
2
|
+
|
|
3
|
+
Install: pip install discovery-engine-api langchain-core
|
|
4
|
+
Usage:
|
|
5
|
+
from discovery.integrations.langchain import DiscoTool
|
|
6
|
+
tool = DiscoTool(api_key="disco_...")
|
|
7
|
+
result = tool.invoke({"file_url": "https://example.com/data.csv", "target_column": "outcome"})
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import asyncio
|
|
13
|
+
import json
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
from langchain_core.tools import BaseTool
|
|
17
|
+
from pydantic import BaseModel, Field
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class DiscoInput(BaseModel):
    """Input for the Disco discovery tool."""

    # NOTE: docstring and `description` texts feed the generated argument
    # schema, so they are reproduced verbatim; only the layout is different.

    # Required: where the data lives and which column is the outcome.
    file_url: str = Field(
        description=(
            "URL of the tabular dataset to analyse "
            "(CSV, Excel, Parquet, JSON, etc.)"
        ),
    )
    target_column: str = Field(
        description=(
            "The column to predict/explain — "
            "the outcome you want to understand"
        ),
    )

    # Optional knobs; the defaults match those of DiscoTool._arun.
    visibility: str = Field(
        default="public",
        description=(
            "'public' (free, results published) or "
            "'private' (costs credits, results private)"
        ),
    )
    analysis_depth: int = Field(
        default=2,
        description=(
            "Analysis depth — higher means deeper analysis but more credits. "
            "Default 2."
        ),
    )
    excluded_columns: list[str] = Field(
        # default_factory avoids sharing one list object between inputs.
        default_factory=list,
        description="Columns to exclude (IDs, data leakage, tautological columns)",
    )
    use_llms: bool = Field(
        default=False,
        description=(
            "If True, enables LLM-powered summaries, literature context, and "
            "novelty assessment. Slower and more expensive. "
            "Public runs always use LLMs."
        ),
    )
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class DiscoTool(BaseTool):
    """Superhuman exploratory data analysis.

    Disco finds novel, statistically validated patterns in tabular data — the
    feature interactions, subgroup effects, and conditional relationships that
    correlation analysis, LLMs, and manual exploration miss. Every finding comes
    with p-values, effect sizes, and academic literature citations.

    Free for public data. No ML expertise required.
    """

    name: str = "disco"
    description: str = (
        "Automated scientific discovery from tabular data. Use when you need to find "
        "patterns, interactions, or subgroup effects in a dataset — especially when you "
        "don't know what to look for. Returns statistically validated patterns with "
        "p-values, effect sizes, and literature citations. Free for public data."
    )
    args_schema: type[BaseModel] = DiscoInput
    # Disco API key; handed to discovery.Engine on every run.
    api_key: str = ""

    def __init__(self, api_key: str, **kwargs: Any):
        """Build the tool with a required API key; other kwargs go to BaseTool."""
        super().__init__(api_key=api_key, **kwargs)

    def _run(self, **kwargs: Any) -> str:
        """Synchronous entry point: drive ``_arun`` to completion.

        Args:
            **kwargs: The same keyword arguments ``_arun`` accepts.

        Returns:
            The JSON string produced by ``_arun``.

        Raises:
            Whatever ``_arun`` raises; exceptions are propagated unchanged.
        """
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No event loop is running in this thread: asyncio.run is safe.
            return asyncio.run(self._arun(**kwargs))

        # A loop is already running in this thread (notebook, or an async
        # application calling the sync API). asyncio.run() would raise
        # RuntimeError here, so execute the coroutine on its own loop in a
        # worker thread. This call still blocks until the run finishes, as
        # any synchronous invocation does.
        from concurrent.futures import ThreadPoolExecutor

        with ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(asyncio.run, self._arun(**kwargs)).result()

    async def _arun(
        self,
        file_url: str,
        target_column: str,
        visibility: str = "public",
        analysis_depth: int = 2,
        excluded_columns: list[str] | None = None,
        use_llms: bool = False,
    ) -> str:
        """Run one discovery and return the findings as JSON text.

        Args:
            file_url: Passed to the engine as ``file``.
            target_column: Column whose behaviour should be explained.
            visibility: Forwarded unchanged to the engine.
            analysis_depth: Forwarded unchanged to the engine.
            excluded_columns: Columns to leave out; ``None`` is sent as ``[]``.
            use_llms: Forwarded unchanged to the engine.

        Returns:
            A JSON document with ``report_url``, ``pattern_count`` and
            ``patterns``, plus ``summary`` when the result has a truthy one.
        """
        # Imported at call time, so importing this module does not itself
        # import the `discovery` package.
        from discovery import Engine

        engine = Engine(api_key=self.api_key)

        result = await engine.discover(
            file=file_url,
            target_column=target_column,
            visibility=visibility,
            analysis_depth=analysis_depth,
            excluded_columns=excluded_columns or [],
            use_llms=use_llms,
        )

        patterns = [
            {
                "description": p.description,
                "conditions": p.conditions,
                "p_value": p.p_value,
                "effect_size": p.abs_target_change,
                "direction": p.target_change_direction,
                "support_count": p.support_count,
                "support_percentage": p.support_percentage,
                "novelty": p.novelty_type,
                "novelty_explanation": p.novelty_explanation,
                "citations": p.citations,
            }
            for p in result.patterns
        ]

        output = {
            "report_url": result.report_url,
            "pattern_count": len(patterns),
            "patterns": patterns,
        }

        # The summary is optional: skip it when the attribute is missing or falsy.
        summary = getattr(result, "summary", None)
        if summary:
            output["summary"] = summary.overview

        # default=str stringifies any value json cannot encode natively.
        return json.dumps(output, indent=2, default=str)
|
|
@@ -1,96 +0,0 @@
|
|
|
1
|
-
"""CrewAI tool wrapper for Disco.
|
|
2
|
-
|
|
3
|
-
Usage:
|
|
4
|
-
from discovery.integrations.crewai import DiscoveryEngineTool
|
|
5
|
-
|
|
6
|
-
tool = DiscoveryEngineTool(api_key="disco_...")
|
|
7
|
-
# Add to your CrewAI agent
|
|
8
|
-
agent = Agent(tools=[tool], ...)
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
from __future__ import annotations
|
|
12
|
-
|
|
13
|
-
import json
|
|
14
|
-
from typing import Any
|
|
15
|
-
|
|
16
|
-
from crewai.tools import BaseTool
|
|
17
|
-
from pydantic import Field
|
|
18
|
-
|
|
19
|
-
from discovery import Engine
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class DiscoveryEngineTool(BaseTool):
    """CrewAI tool that runs Disco on tabular data.

    Finds novel, statistically validated patterns — feature interactions,
    subgroup effects, and conditional relationships — that correlation analysis,
    LLMs, and hypothesis-driven approaches miss.
    """

    name: str = "Disco"
    description: str = (
        "Run Disco on tabular data to find novel, statistically validated "
        "patterns that you cannot find with pandas, SQL, or by prompting an LLM "
        "to analyze data. "
        "Use when you need to go beyond correlation. "
        "Input: JSON with 'file' (path), 'target_column' (column to analyze). "
        "Optional: 'visibility' (public/private), 'analysis_depth' (search depth). "
        "Returns patterns with conditions, p-values, novelty scores, citations."
    )
    api_key: str = Field(description="Disco API key (disco_...)")
    quiet: bool = Field(default=True, description="Suppress progress output")

    def _run(self, query: str) -> str:
        """Parse the JSON query, run a discovery, and return JSON text.

        Problems are reported as a JSON object with an ``error`` key instead
        of being raised: unparsable input, missing required keys, and any
        exception thrown by the engine call.
        """
        try:
            request = json.loads(query)
        except json.JSONDecodeError:
            malformed = {"error": "Input must be JSON with 'file' and 'target_column' keys."}
            return json.dumps(malformed)

        dataset = request.get("file")
        target = request.get("target_column")
        if not (dataset and target):
            incomplete = {"error": "Missing required keys: 'file' and 'target_column'."}
            return json.dumps(incomplete)

        # Optional settings fall back to depth 2 and public visibility.
        depth = request.get("analysis_depth", 2)
        visibility = request.get("visibility", "public")

        engine = Engine(api_key=self.api_key, quiet=self.quiet)

        try:
            result = engine.discover_sync(
                file=dataset,
                target_column=target,
                analysis_depth=depth,
                visibility=visibility,
            )
        except Exception as exc:
            # Hand the failure back as data, including the error's
            # `suggestion` attribute when it has one (None otherwise).
            failure = {"error": str(exc), "suggestion": getattr(exc, "suggestion", None)}
            return json.dumps(failure)

        return _format_result(result)
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
def _format_result(result: Any) -> str:
|
|
70
|
-
"""Format EngineResult as a JSON string."""
|
|
71
|
-
patterns = []
|
|
72
|
-
for p in result.patterns:
|
|
73
|
-
patterns.append(
|
|
74
|
-
{
|
|
75
|
-
"description": p.description,
|
|
76
|
-
"conditions": p.conditions,
|
|
77
|
-
"p_value": p.p_value,
|
|
78
|
-
"novelty_type": p.novelty_type,
|
|
79
|
-
"novelty_explanation": p.novelty_explanation,
|
|
80
|
-
"effect_size": p.abs_target_change,
|
|
81
|
-
"direction": p.target_change_direction,
|
|
82
|
-
"support_percentage": p.support_percentage,
|
|
83
|
-
}
|
|
84
|
-
)
|
|
85
|
-
|
|
86
|
-
output: dict[str, Any] = {
|
|
87
|
-
"status": result.status,
|
|
88
|
-
"patterns": patterns,
|
|
89
|
-
"report_url": result.report_url,
|
|
90
|
-
"dashboard_urls": result.dashboard_urls,
|
|
91
|
-
}
|
|
92
|
-
if result.summary:
|
|
93
|
-
output["summary"] = result.summary.overview
|
|
94
|
-
output["key_insights"] = result.summary.key_insights
|
|
95
|
-
|
|
96
|
-
return json.dumps(output, indent=2)
|
|
@@ -1,105 +0,0 @@
|
|
|
1
|
-
"""LangChain tool wrapper for Disco.
|
|
2
|
-
|
|
3
|
-
Usage:
|
|
4
|
-
from discovery.integrations.langchain import DiscoveryEngineTool
|
|
5
|
-
|
|
6
|
-
tool = DiscoveryEngineTool(api_key="disco_...")
|
|
7
|
-
# Add to your agent's tool list
|
|
8
|
-
agent = initialize_agent(tools=[tool], llm=llm)
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
from __future__ import annotations
|
|
12
|
-
|
|
13
|
-
import asyncio
|
|
14
|
-
import json
|
|
15
|
-
from typing import Any
|
|
16
|
-
|
|
17
|
-
from langchain.tools import BaseTool
|
|
18
|
-
from pydantic import Field
|
|
19
|
-
|
|
20
|
-
from discovery import Engine
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
class DiscoveryEngineTool(BaseTool):
    """LangChain tool that runs Disco on tabular data.

    Finds novel, statistically validated patterns — feature interactions,
    subgroup effects, and conditional relationships — that correlation analysis,
    LLMs, and hypothesis-driven approaches miss.
    """

    name: str = "discovery_engine"
    description: str = (
        "Run Disco on tabular data to find novel, statistically validated "
        "patterns that you cannot find with pandas, SQL, or by prompting an LLM "
        "to analyze data. "
        "Use this when you need to go beyond correlation and find things nobody "
        "thought to look for. "
        'Input should be a JSON string with keys: "file" (path to CSV/Excel/Parquet), '
        '"target_column" (column to analyze). '
        'Optional: "visibility" (public/private), "analysis_depth" (search depth). '
        "Returns structured patterns with conditions, p-values, novelty scores, "
        "and citations."
    )
    api_key: str = Field(description="Disco API key (disco_...)")
    quiet: bool = Field(default=True, description="Suppress progress output")

    def _run(self, query: str) -> str:
        """Blocking variant: run ``_arun`` on the current event loop."""
        loop = asyncio.get_event_loop()
        return loop.run_until_complete(self._arun(query))

    async def _arun(self, query: str) -> str:
        """Parse the JSON query, await a discovery, and return JSON text.

        Problems are reported as a JSON object with an ``error`` key instead
        of being raised: unparsable input, missing required keys, and any
        exception thrown by the engine call.
        """
        try:
            request = json.loads(query)
        except json.JSONDecodeError:
            # Non-JSON input is rejected outright; no attempt is made to
            # interpret it as a bare file path.
            malformed = {"error": "Input must be JSON with 'file' and 'target_column' keys."}
            return json.dumps(malformed)

        dataset = request.get("file")
        target = request.get("target_column")
        if not (dataset and target):
            incomplete = {"error": "Missing required keys: 'file' and 'target_column'."}
            return json.dumps(incomplete)

        # Optional settings fall back to depth 2 and public visibility.
        depth = request.get("analysis_depth", 2)
        visibility = request.get("visibility", "public")

        engine = Engine(api_key=self.api_key, quiet=self.quiet)

        try:
            result = await engine.discover(
                file=dataset,
                target_column=target,
                analysis_depth=depth,
                visibility=visibility,
            )
        except Exception as exc:
            # Hand the failure back as data, including the error's
            # `suggestion` attribute when it has one (None otherwise).
            failure = {"error": str(exc), "suggestion": getattr(exc, "suggestion", None)}
            return json.dumps(failure)

        return _format_result(result)
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
def _format_result(result: Any) -> str:
|
|
79
|
-
"""Format EngineResult as a JSON string for the LLM."""
|
|
80
|
-
patterns = []
|
|
81
|
-
for p in result.patterns:
|
|
82
|
-
patterns.append(
|
|
83
|
-
{
|
|
84
|
-
"description": p.description,
|
|
85
|
-
"conditions": p.conditions,
|
|
86
|
-
"p_value": p.p_value,
|
|
87
|
-
"novelty_type": p.novelty_type,
|
|
88
|
-
"novelty_explanation": p.novelty_explanation,
|
|
89
|
-
"effect_size": p.abs_target_change,
|
|
90
|
-
"direction": p.target_change_direction,
|
|
91
|
-
"support_percentage": p.support_percentage,
|
|
92
|
-
}
|
|
93
|
-
)
|
|
94
|
-
|
|
95
|
-
output: dict[str, Any] = {
|
|
96
|
-
"status": result.status,
|
|
97
|
-
"patterns": patterns,
|
|
98
|
-
"report_url": result.report_url,
|
|
99
|
-
"dashboard_urls": result.dashboard_urls,
|
|
100
|
-
}
|
|
101
|
-
if result.summary:
|
|
102
|
-
output["summary"] = result.summary.overview
|
|
103
|
-
output["key_insights"] = result.summary.key_insights
|
|
104
|
-
|
|
105
|
-
return json.dumps(output, indent=2)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{discovery_engine_api-0.2.93 → discovery_engine_api-0.2.94}/discovery/integrations/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|