discovery-engine-api 0.2.93__tar.gz → 0.2.94__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: discovery-engine-api
3
- Version: 0.2.93
3
+ Version: 0.2.94
4
4
  Summary: Python SDK for Disco API
5
5
  Project-URL: Homepage, https://www.leap-labs.com
6
6
  Project-URL: Documentation, https://disco.leap-labs.com/llms-full.txt
@@ -78,19 +78,26 @@ Get your API key from the [Developers page](https://disco.leap-labs.com/develope
78
78
  await engine.discover(
79
79
  file: str | Path | pd.DataFrame, # Dataset to analyze
80
80
  target_column: str, # Column to predict/analyze
81
- analysis_depth: int = 2, # 1=fast, higher=deeper search
81
+ analysis_depth: int = 2, # 2=default, higher=deeper analysis
82
82
  visibility: str = "public", # "public" (free) or "private" (credits)
83
83
  title: str | None = None, # Dataset title
84
84
  description: str | None = None, # Dataset description
85
85
  column_descriptions: dict[str, str] | None = None, # Improves pattern explanations
86
- excluded_columns: list[str] | None = None, # Columns to exclude (e.g., IDs)
86
+ excluded_columns: list[str] | None = None, # Columns to exclude — see below
87
+ use_llms: bool = False, # True = LLM explanations (costs more) — see below
87
88
  timeout: float = 1800, # Max seconds to wait
89
+ # Additional kwargs forwarded to run_async():
90
+ # task, author, source_url, timeseries_groups, ...
88
91
  )
89
92
  ```
90
93
 
91
94
  > **Tip:** Providing `column_descriptions` significantly improves pattern explanations. If your columns have non-obvious names, always describe them.
92
95
 
93
- > **Depth and visibility:** Public runs are free; results are published to the public gallery. Private runs consume credits based on file size × depth.
96
+ > **`use_llms`:** Default `False`. Setting it to `True` is slower and more expensive, but you get smarter pre-processing, literature context, and novelty assessment. Set to `True` if you want Disco-generated pattern descriptions, novelty assessment with citations, and report summaries. **Public runs always use LLMs regardless of this setting.** What changes when `False`: pattern descriptions fall back to generic text, novelty is not assessed (all patterns marked confirmatory, no citations), report summaries are omitted, integer columns with few unique values (e.g. "month" 1-12, "hour" 0-23) may be misclassified as categorical instead of continuous, and high-cardinality text columns get generic cluster names instead of descriptive ones. Use `engine.estimate()` to check credit cost before running.
97
+
98
+ > **Visibility:** `"public"` runs are free but results are published, and analysis depth is locked to 2. `"private"` runs keep results confidential and consume credits.
99
+
100
+ > **`excluded_columns`:** Always exclude identifiers (row IDs, UUIDs), data leakage (target renamed/reformatted), and tautological columns (alternative encodings of the same construct as the target). For example, if your target is `serious`, exclude `serious_outcome`, `not_serious`, `death` — they're part of the same classification system.
94
101
 
95
102
 
96
103
  ## Examples
@@ -111,32 +118,25 @@ result = await engine.discover(
111
118
  "age": "Patient age in years",
112
119
  "bmi": "Body mass index",
113
120
  },
114
- excluded_columns=["patient_id", "timestamp"],
121
+ excluded_columns=["patient_id", "timestamp", "outcome_text"], # IDs + tautological
115
122
  )
116
123
  ```
117
124
 
118
- ### Inspecting Columns Before Running
119
-
120
- If you need to see the dataset's columns before choosing a target column, upload first and inspect:
125
+ ### Running in the Background
121
126
 
122
- ```python
123
- # Upload once and get the server's parsed column list
124
- upload = await engine.upload_file(file="data.csv", title="My dataset")
125
- print(upload["columns"]) # [{"name": "col1", "type": "continuous", ...}, ...]
126
- print(upload["rowCount"]) # e.g., 5000
127
+ Runs take 3–15 minutes. While waiting, the SDK logs progress automatically:
127
128
 
128
- # Pass the result to avoid re-uploading
129
- result = await engine.run_async(
130
- file="data.csv",
131
- target_column="col1",
132
- wait=True,
133
- upload_result=upload, # skips the upload step
134
- )
129
+ ```
130
+ Waiting for run abc123 to complete...
131
+ Status: waiting (position 2 in queue) | Est. wait: ~8 min | Upgrade at disco.leap-labs.com/account for priority processing
132
+ Status: processing (preprocessing — Processing data...) | Elapsed: 34.2s | ETA: ~6 min
133
+ Status: processing (training — Modelling data...) | Elapsed: 98.7s | ETA: ~4 min
134
+ Status: processing (interpreting — Extracting patterns...) | Elapsed: 284.1s | ETA: ~2 min
135
+ Status: processing (reporting — Building report...) | Elapsed: 412.3s | ETA: ~1 min
136
+ Run completed in 467.8s
135
137
  ```
136
138
 
137
- ### Running in the Background
138
-
139
- Runs take 3–15 minutes. If you need to do other work while Disco runs:
139
+ If you need to do other work while Disco runs:
140
140
 
141
141
  ```python
142
142
  import asyncio
@@ -161,6 +161,29 @@ async def main():
161
161
  result = asyncio.run(main())
162
162
  ```
163
163
 
164
+ ### Inspecting Columns Before Running
165
+
166
+ If you need to see the dataset's columns before choosing a target column — e.g., when column names are not obvious — upload first, inspect, then run without re-uploading:
167
+
168
+ ```python
169
+ # Upload once and get the server's parsed column list
170
+ upload = await engine.upload_file(file="data.csv", title="My dataset")
171
+ # upload["file"] -> {"key": "uploads/abc123.csv", "name": "data.csv",
172
+ # "size": 1048576, "fileHash": "sha256:..."}
173
+ # upload["columns"] -> [{"name": "col1", "type": "continuous", ...}, ...]
174
+ # upload["rowCount"] -> 5000
175
+ print(upload["columns"])
176
+ print(upload["rowCount"])
177
+
178
+ # Pass the result to avoid re-uploading
179
+ result = await engine.run_async(
180
+ file="data.csv",
181
+ target_column="col1",
182
+ wait=True,
183
+ upload_result=upload, # skips the upload step
184
+ )
185
+ ```
186
+
164
187
  ### Synchronous Usage
165
188
 
166
189
  For scripts and Jupyter notebooks:
@@ -212,7 +235,7 @@ print(f"Explore: {result.report_url}")
212
235
 
213
236
  ## Credits and Pricing
214
237
 
215
- - **Public runs**: Free. Results published to public gallery.
238
+ - **Public runs**: Free. Results published to public gallery. Locked to depth=2.
216
239
  - **Private runs**: Credits scale with file size, depth, and run configuration. $0.10 per credit. Use `engine.estimate()` to check cost before running.
217
240
 
218
241
  ```python
@@ -223,13 +246,27 @@ estimate = await engine.estimate(
223
246
  analysis_depth=2,
224
247
  visibility="private",
225
248
  )
226
- # estimate["cost"]["credits"] -> 21
227
- # estimate["account"]["sufficient"] -> True/False
249
+ # estimate["cost"]["credits"] -> 55
250
+ # estimate["cost"]["price_usd"] -> 5.5
251
+ # estimate["time_estimate"]["estimated_seconds"] -> 360
252
+ # estimate["account"]["sufficient"] -> True/False
253
+ # estimate["limits"]["max_analysis_depth"] -> 23 (num_columns - 2)
228
254
  ```
229
255
 
230
256
  Manage credits and plans at [disco.leap-labs.com/account](https://disco.leap-labs.com/account).
231
257
 
232
258
 
259
+ ## Expected Data Format
260
+
261
+ Disco expects a **flat table** — columns for features, rows for samples.
262
+
263
+ - **One row per observation** — a patient, a sample, a transaction, a measurement, etc.
264
+ - **One column per feature** — numeric, categorical, datetime, or free text are all fine
265
+ - **One target column** — the outcome to analyze. Must have at least 2 distinct values.
266
+ - **Missing values are OK** — Disco handles them automatically. Don't drop rows or impute beforehand.
267
+
268
+ Not supported: images, raw text documents, nested/hierarchical JSON, multi-sheet Excel (use the first sheet or export to CSV).
269
+
233
270
  ## File Size Limits
234
271
 
235
272
  Uploads up to **5 GB**. Files are uploaded directly to cloud storage using presigned URLs.
@@ -245,16 +282,30 @@ Supported formats: **CSV**, **TSV**, **Excel (.xlsx)**, **JSON**, **Parquet**, *
245
282
  @dataclass
246
283
  class EngineResult:
247
284
  run_id: str
285
+ report_id: str | None # Report UUID (used in report_url)
248
286
  status: str # "pending", "processing", "completed", "failed"
287
+ dataset_title: str | None # Title of the dataset
288
+ dataset_description: str | None # Description of the dataset
289
+ total_rows: int | None
290
+ target_column: str | None # Column being predicted/analyzed
291
+ task: str | None # "regression", "binary_classification", "multiclass_classification"
249
292
  summary: Summary | None # LLM-generated insights
250
293
  patterns: list[Pattern] # Discovered patterns (the core output)
251
294
  columns: list[Column] # Feature info and statistics
252
- feature_importance: FeatureImportance | None # Global importance scores
253
295
  correlation_matrix: list[CorrelationEntry] # Feature correlations
254
- report_url: str | None # Shareable link to interactive web report
255
- task: str | None # "regression", "binary_classification", "multiclass_classification"
256
- total_rows: int | None
296
+ feature_importance: FeatureImportance | None # Global importance scores
297
+ job_id: str | None # Job ID for tracking
298
+ job_status: str | None # Job queue status
299
+ queue_position: int | None # Position in queue when pending (1 = next up)
300
+ current_step: str | None # Active pipeline step (preprocessing, training, interpreting, reporting)
301
+ current_step_message: str | None # Human-readable description of the current step
302
+ estimated_seconds: int | None # Estimated total processing time in seconds
303
+ estimated_wait_seconds: int | None # Estimated queue wait time in seconds (pending only)
257
304
  error_message: str | None
305
+ report_url: str | None # Shareable link to interactive web report
306
+ hints: list[str] # Upgrade hints (non-empty for free-tier users with hidden patterns)
307
+ hidden_deep_count: int # Patterns hidden for free-tier accounts (upgrade to see all)
308
+ hidden_deep_novel_count: int # Novel patterns hidden for free-tier accounts
258
309
  ```
259
310
 
260
311
  ### Pattern
@@ -263,6 +314,8 @@ class EngineResult:
263
314
  @dataclass
264
315
  class Pattern:
265
316
  id: str
317
+ task: str # "regression", "binary_classification", "multiclass_classification"
318
+ target_column: str # Column being analyzed
266
319
  description: str # Human-readable description
267
320
  conditions: list[dict] # Conditions defining the pattern
268
321
  p_value: float # FDR-adjusted p-value
@@ -272,8 +325,10 @@ class Pattern:
272
325
  citations: list[dict] # Academic citations
273
326
  target_change_direction: str # "max" (increases target) or "min" (decreases)
274
327
  abs_target_change: float # Magnitude of effect
328
+ target_score: float # Mean target value (regression) or class fraction (classification) in the subgroup
275
329
  support_count: int # Rows matching this pattern
276
330
  support_percentage: float # Percentage of dataset
331
+ target_class: str | None # For classification tasks
277
332
  target_mean: float | None # For regression tasks
278
333
  target_std: float | None
279
334
  ```
@@ -323,6 +378,7 @@ class Summary:
323
378
  overview: str # High-level summary of findings
324
379
  key_insights: list[str] # Main takeaways
325
380
  novel_patterns: PatternGroup # Novel pattern IDs and explanation
381
+ selected_pattern_id: str | None # ID of the highlighted/featured pattern
326
382
  ```
327
383
 
328
384
  ### Column
@@ -342,17 +398,22 @@ class Column:
342
398
  std: float | None
343
399
  min: float | None
344
400
  max: float | None
401
+ iqr_min: float | None # 25th percentile
402
+ iqr_max: float | None # 75th percentile
403
+ mode: str | None # Most common value (categorical columns)
404
+ approx_unique: int | None # Approximate distinct value count
405
+ null_percentage: float | None
345
406
  feature_importance_score: float | None # Signed importance score
346
407
  ```
347
408
 
348
409
  ### FeatureImportance
349
410
 
350
- Computed using **Hierarchical Perturbation (HiPe)**, an ablation-based method. Scores are **signed** — positive means the feature increases the prediction, negative means it decreases it.
411
+ Scores are **signed** — positive means the feature increases the prediction, negative means it decreases it.
351
412
 
352
413
  ```python
353
414
  @dataclass
354
415
  class FeatureImportance:
355
- kind: str # "global"
416
+ kind: str # "global" | "local"
356
417
  baseline: float # Baseline model output
357
418
  scores: list[FeatureImportanceScore]
358
419
 
@@ -366,12 +427,13 @@ class FeatureImportanceScore:
366
427
  ## Error Handling
367
428
 
368
429
  ```python
369
- from discovery import (
370
- Engine,
430
+ from discovery import Engine
431
+ from discovery.errors import (
371
432
  AuthenticationError,
372
433
  InsufficientCreditsError,
373
434
  RateLimitError,
374
435
  RunFailedError,
436
+ RunNotFoundError,
375
437
  PaymentRequiredError,
376
438
  )
377
439
 
@@ -381,11 +443,15 @@ except AuthenticationError as e:
381
443
  print(e.suggestion) # "Check your API key at https://disco.leap-labs.com/developers"
382
444
  except InsufficientCreditsError as e:
383
445
  print(f"Need {e.credits_required}, have {e.credits_available}")
384
- print(e.suggestion) # "Purchase credits or run publicly for free"
446
+ print(e.suggestion) # "Run with visibility='public' (free, results published) or purchase credits with engine.purchase_credits()."
385
447
  except RateLimitError as e:
386
448
  print(f"Retry after {e.retry_after} seconds")
387
449
  except RunFailedError as e:
388
450
  print(f"Run {e.run_id} failed: {e}")
451
+ except RunNotFoundError as e:
452
+ print(f"Run {e.run_id} not found — may have been cleaned up")
453
+ except PaymentRequiredError as e:
454
+ print(e.suggestion) # "Attach a payment method with engine.add_payment_method(...)"
389
455
  except TimeoutError:
390
456
  pass # Retrieve later with engine.wait_for_completion(run_id)
391
457
  ```
@@ -395,7 +461,7 @@ All errors include a `suggestion` field with actionable instructions.
395
461
 
396
462
  ## MCP Server
397
463
 
398
- Disco is available as an [MCP server](https://disco.leap-labs.com/.well-known/mcp.json) with tools for the full discovery lifecycle — estimate, analyze, check status, get results, manage account.
464
+ Disco is available as an [MCP server](https://disco.leap-labs.com/.well-known/mcp.json) with tools for the full discovery lifecycle — estimate, analyze, check status, get results, manage account. To subscribe or purchase credits via MCP, call `discovery_add_payment_method` first to attach a Stripe payment method.
399
465
 
400
466
  ```json
401
467
  {
@@ -41,19 +41,26 @@ Get your API key from the [Developers page](https://disco.leap-labs.com/develope
41
41
  await engine.discover(
42
42
  file: str | Path | pd.DataFrame, # Dataset to analyze
43
43
  target_column: str, # Column to predict/analyze
44
- analysis_depth: int = 2, # 1=fast, higher=deeper search
44
+ analysis_depth: int = 2, # 2=default, higher=deeper analysis
45
45
  visibility: str = "public", # "public" (free) or "private" (credits)
46
46
  title: str | None = None, # Dataset title
47
47
  description: str | None = None, # Dataset description
48
48
  column_descriptions: dict[str, str] | None = None, # Improves pattern explanations
49
- excluded_columns: list[str] | None = None, # Columns to exclude (e.g., IDs)
49
+ excluded_columns: list[str] | None = None, # Columns to exclude — see below
50
+ use_llms: bool = False, # True = LLM explanations (costs more) — see below
50
51
  timeout: float = 1800, # Max seconds to wait
52
+ # Additional kwargs forwarded to run_async():
53
+ # task, author, source_url, timeseries_groups, ...
51
54
  )
52
55
  ```
53
56
 
54
57
  > **Tip:** Providing `column_descriptions` significantly improves pattern explanations. If your columns have non-obvious names, always describe them.
55
58
 
56
- > **Depth and visibility:** Public runs are free; results are published to the public gallery. Private runs consume credits based on file size × depth.
59
+ > **`use_llms`:** Default `False`. Setting it to `True` is slower and more expensive, but you get smarter pre-processing, literature context, and novelty assessment. Set to `True` if you want Disco-generated pattern descriptions, novelty assessment with citations, and report summaries. **Public runs always use LLMs regardless of this setting.** What changes when `False`: pattern descriptions fall back to generic text, novelty is not assessed (all patterns marked confirmatory, no citations), report summaries are omitted, integer columns with few unique values (e.g. "month" 1-12, "hour" 0-23) may be misclassified as categorical instead of continuous, and high-cardinality text columns get generic cluster names instead of descriptive ones. Use `engine.estimate()` to check credit cost before running.
60
+
61
+ > **Visibility:** `"public"` runs are free but results are published, and analysis depth is locked to 2. `"private"` runs keep results confidential and consume credits.
62
+
63
+ > **`excluded_columns`:** Always exclude identifiers (row IDs, UUIDs), data leakage (target renamed/reformatted), and tautological columns (alternative encodings of the same construct as the target). For example, if your target is `serious`, exclude `serious_outcome`, `not_serious`, `death` — they're part of the same classification system.
57
64
 
58
65
 
59
66
  ## Examples
@@ -74,32 +81,25 @@ result = await engine.discover(
74
81
  "age": "Patient age in years",
75
82
  "bmi": "Body mass index",
76
83
  },
77
- excluded_columns=["patient_id", "timestamp"],
84
+ excluded_columns=["patient_id", "timestamp", "outcome_text"], # IDs + tautological
78
85
  )
79
86
  ```
80
87
 
81
- ### Inspecting Columns Before Running
82
-
83
- If you need to see the dataset's columns before choosing a target column, upload first and inspect:
88
+ ### Running in the Background
84
89
 
85
- ```python
86
- # Upload once and get the server's parsed column list
87
- upload = await engine.upload_file(file="data.csv", title="My dataset")
88
- print(upload["columns"]) # [{"name": "col1", "type": "continuous", ...}, ...]
89
- print(upload["rowCount"]) # e.g., 5000
90
+ Runs take 3–15 minutes. While waiting, the SDK logs progress automatically:
90
91
 
91
- # Pass the result to avoid re-uploading
92
- result = await engine.run_async(
93
- file="data.csv",
94
- target_column="col1",
95
- wait=True,
96
- upload_result=upload, # skips the upload step
97
- )
92
+ ```
93
+ Waiting for run abc123 to complete...
94
+ Status: waiting (position 2 in queue) | Est. wait: ~8 min | Upgrade at disco.leap-labs.com/account for priority processing
95
+ Status: processing (preprocessing — Processing data...) | Elapsed: 34.2s | ETA: ~6 min
96
+ Status: processing (training — Modelling data...) | Elapsed: 98.7s | ETA: ~4 min
97
+ Status: processing (interpreting — Extracting patterns...) | Elapsed: 284.1s | ETA: ~2 min
98
+ Status: processing (reporting — Building report...) | Elapsed: 412.3s | ETA: ~1 min
99
+ Run completed in 467.8s
98
100
  ```
99
101
 
100
- ### Running in the Background
101
-
102
- Runs take 3–15 minutes. If you need to do other work while Disco runs:
102
+ If you need to do other work while Disco runs:
103
103
 
104
104
  ```python
105
105
  import asyncio
@@ -124,6 +124,29 @@ async def main():
124
124
  result = asyncio.run(main())
125
125
  ```
126
126
 
127
+ ### Inspecting Columns Before Running
128
+
129
+ If you need to see the dataset's columns before choosing a target column — e.g., when column names are not obvious — upload first, inspect, then run without re-uploading:
130
+
131
+ ```python
132
+ # Upload once and get the server's parsed column list
133
+ upload = await engine.upload_file(file="data.csv", title="My dataset")
134
+ # upload["file"] -> {"key": "uploads/abc123.csv", "name": "data.csv",
135
+ # "size": 1048576, "fileHash": "sha256:..."}
136
+ # upload["columns"] -> [{"name": "col1", "type": "continuous", ...}, ...]
137
+ # upload["rowCount"] -> 5000
138
+ print(upload["columns"])
139
+ print(upload["rowCount"])
140
+
141
+ # Pass the result to avoid re-uploading
142
+ result = await engine.run_async(
143
+ file="data.csv",
144
+ target_column="col1",
145
+ wait=True,
146
+ upload_result=upload, # skips the upload step
147
+ )
148
+ ```
149
+
127
150
  ### Synchronous Usage
128
151
 
129
152
  For scripts and Jupyter notebooks:
@@ -175,7 +198,7 @@ print(f"Explore: {result.report_url}")
175
198
 
176
199
  ## Credits and Pricing
177
200
 
178
- - **Public runs**: Free. Results published to public gallery.
201
+ - **Public runs**: Free. Results published to public gallery. Locked to depth=2.
179
202
  - **Private runs**: Credits scale with file size, depth, and run configuration. $0.10 per credit. Use `engine.estimate()` to check cost before running.
180
203
 
181
204
  ```python
@@ -186,13 +209,27 @@ estimate = await engine.estimate(
186
209
  analysis_depth=2,
187
210
  visibility="private",
188
211
  )
189
- # estimate["cost"]["credits"] -> 21
190
- # estimate["account"]["sufficient"] -> True/False
212
+ # estimate["cost"]["credits"] -> 55
213
+ # estimate["cost"]["price_usd"] -> 5.5
214
+ # estimate["time_estimate"]["estimated_seconds"] -> 360
215
+ # estimate["account"]["sufficient"] -> True/False
216
+ # estimate["limits"]["max_analysis_depth"] -> 23 (num_columns - 2)
191
217
  ```
192
218
 
193
219
  Manage credits and plans at [disco.leap-labs.com/account](https://disco.leap-labs.com/account).
194
220
 
195
221
 
222
+ ## Expected Data Format
223
+
224
+ Disco expects a **flat table** — columns for features, rows for samples.
225
+
226
+ - **One row per observation** — a patient, a sample, a transaction, a measurement, etc.
227
+ - **One column per feature** — numeric, categorical, datetime, or free text are all fine
228
+ - **One target column** — the outcome to analyze. Must have at least 2 distinct values.
229
+ - **Missing values are OK** — Disco handles them automatically. Don't drop rows or impute beforehand.
230
+
231
+ Not supported: images, raw text documents, nested/hierarchical JSON, multi-sheet Excel (use the first sheet or export to CSV).
232
+
196
233
  ## File Size Limits
197
234
 
198
235
  Uploads up to **5 GB**. Files are uploaded directly to cloud storage using presigned URLs.
@@ -208,16 +245,30 @@ Supported formats: **CSV**, **TSV**, **Excel (.xlsx)**, **JSON**, **Parquet**, *
208
245
  @dataclass
209
246
  class EngineResult:
210
247
  run_id: str
248
+ report_id: str | None # Report UUID (used in report_url)
211
249
  status: str # "pending", "processing", "completed", "failed"
250
+ dataset_title: str | None # Title of the dataset
251
+ dataset_description: str | None # Description of the dataset
252
+ total_rows: int | None
253
+ target_column: str | None # Column being predicted/analyzed
254
+ task: str | None # "regression", "binary_classification", "multiclass_classification"
212
255
  summary: Summary | None # LLM-generated insights
213
256
  patterns: list[Pattern] # Discovered patterns (the core output)
214
257
  columns: list[Column] # Feature info and statistics
215
- feature_importance: FeatureImportance | None # Global importance scores
216
258
  correlation_matrix: list[CorrelationEntry] # Feature correlations
217
- report_url: str | None # Shareable link to interactive web report
218
- task: str | None # "regression", "binary_classification", "multiclass_classification"
219
- total_rows: int | None
259
+ feature_importance: FeatureImportance | None # Global importance scores
260
+ job_id: str | None # Job ID for tracking
261
+ job_status: str | None # Job queue status
262
+ queue_position: int | None # Position in queue when pending (1 = next up)
263
+ current_step: str | None # Active pipeline step (preprocessing, training, interpreting, reporting)
264
+ current_step_message: str | None # Human-readable description of the current step
265
+ estimated_seconds: int | None # Estimated total processing time in seconds
266
+ estimated_wait_seconds: int | None # Estimated queue wait time in seconds (pending only)
220
267
  error_message: str | None
268
+ report_url: str | None # Shareable link to interactive web report
269
+ hints: list[str] # Upgrade hints (non-empty for free-tier users with hidden patterns)
270
+ hidden_deep_count: int # Patterns hidden for free-tier accounts (upgrade to see all)
271
+ hidden_deep_novel_count: int # Novel patterns hidden for free-tier accounts
221
272
  ```
222
273
 
223
274
  ### Pattern
@@ -226,6 +277,8 @@ class EngineResult:
226
277
  @dataclass
227
278
  class Pattern:
228
279
  id: str
280
+ task: str # "regression", "binary_classification", "multiclass_classification"
281
+ target_column: str # Column being analyzed
229
282
  description: str # Human-readable description
230
283
  conditions: list[dict] # Conditions defining the pattern
231
284
  p_value: float # FDR-adjusted p-value
@@ -235,8 +288,10 @@ class Pattern:
235
288
  citations: list[dict] # Academic citations
236
289
  target_change_direction: str # "max" (increases target) or "min" (decreases)
237
290
  abs_target_change: float # Magnitude of effect
291
+ target_score: float # Mean target value (regression) or class fraction (classification) in the subgroup
238
292
  support_count: int # Rows matching this pattern
239
293
  support_percentage: float # Percentage of dataset
294
+ target_class: str | None # For classification tasks
240
295
  target_mean: float | None # For regression tasks
241
296
  target_std: float | None
242
297
  ```
@@ -286,6 +341,7 @@ class Summary:
286
341
  overview: str # High-level summary of findings
287
342
  key_insights: list[str] # Main takeaways
288
343
  novel_patterns: PatternGroup # Novel pattern IDs and explanation
344
+ selected_pattern_id: str | None # ID of the highlighted/featured pattern
289
345
  ```
290
346
 
291
347
  ### Column
@@ -305,17 +361,22 @@ class Column:
305
361
  std: float | None
306
362
  min: float | None
307
363
  max: float | None
364
+ iqr_min: float | None # 25th percentile
365
+ iqr_max: float | None # 75th percentile
366
+ mode: str | None # Most common value (categorical columns)
367
+ approx_unique: int | None # Approximate distinct value count
368
+ null_percentage: float | None
308
369
  feature_importance_score: float | None # Signed importance score
309
370
  ```
310
371
 
311
372
  ### FeatureImportance
312
373
 
313
- Computed using **Hierarchical Perturbation (HiPe)**, an ablation-based method. Scores are **signed** — positive means the feature increases the prediction, negative means it decreases it.
374
+ Scores are **signed** — positive means the feature increases the prediction, negative means it decreases it.
314
375
 
315
376
  ```python
316
377
  @dataclass
317
378
  class FeatureImportance:
318
- kind: str # "global"
379
+ kind: str # "global" | "local"
319
380
  baseline: float # Baseline model output
320
381
  scores: list[FeatureImportanceScore]
321
382
 
@@ -329,12 +390,13 @@ class FeatureImportanceScore:
329
390
  ## Error Handling
330
391
 
331
392
  ```python
332
- from discovery import (
333
- Engine,
393
+ from discovery import Engine
394
+ from discovery.errors import (
334
395
  AuthenticationError,
335
396
  InsufficientCreditsError,
336
397
  RateLimitError,
337
398
  RunFailedError,
399
+ RunNotFoundError,
338
400
  PaymentRequiredError,
339
401
  )
340
402
 
@@ -344,11 +406,15 @@ except AuthenticationError as e:
344
406
  print(e.suggestion) # "Check your API key at https://disco.leap-labs.com/developers"
345
407
  except InsufficientCreditsError as e:
346
408
  print(f"Need {e.credits_required}, have {e.credits_available}")
347
- print(e.suggestion) # "Purchase credits or run publicly for free"
409
+ print(e.suggestion) # "Run with visibility='public' (free, results published) or purchase credits with engine.purchase_credits()."
348
410
  except RateLimitError as e:
349
411
  print(f"Retry after {e.retry_after} seconds")
350
412
  except RunFailedError as e:
351
413
  print(f"Run {e.run_id} failed: {e}")
414
+ except RunNotFoundError as e:
415
+ print(f"Run {e.run_id} not found — may have been cleaned up")
416
+ except PaymentRequiredError as e:
417
+ print(e.suggestion) # "Attach a payment method with engine.add_payment_method(...)"
352
418
  except TimeoutError:
353
419
  pass # Retrieve later with engine.wait_for_completion(run_id)
354
420
  ```
@@ -358,7 +424,7 @@ All errors include a `suggestion` field with actionable instructions.
358
424
 
359
425
  ## MCP Server
360
426
 
361
- Disco is available as an [MCP server](https://disco.leap-labs.com/.well-known/mcp.json) with tools for the full discovery lifecycle — estimate, analyze, check status, get results, manage account.
427
+ Disco is available as an [MCP server](https://disco.leap-labs.com/.well-known/mcp.json) with tools for the full discovery lifecycle — estimate, analyze, check status, get results, manage account. To subscribe or purchase credits via MCP, call `discovery_add_payment_method` first to attach a Stripe payment method.
362
428
 
363
429
  ```json
364
430
  {
@@ -1,6 +1,6 @@
1
1
  """Disco Python SDK."""
2
2
 
3
- __version__ = "0.2.93"
3
+ __version__ = "0.2.94"
4
4
 
5
5
  from discovery.client import Engine
6
6
  from discovery.types import (
@@ -0,0 +1,118 @@
1
+ """CrewAI tool wrapper for Disco (Discovery Engine).
2
+
3
+ Install: pip install discovery-engine-api crewai
4
+ Usage:
5
+ from discovery.integrations.crewai import DiscoTool
6
+ tool = DiscoTool(api_key="disco_...")
7
+ agent = Agent(tools=[tool], ...)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ from typing import Any
14
+
15
+ from crewai.tools import BaseTool
16
+ from pydantic import BaseModel, Field
17
+
18
+
19
class DiscoInput(BaseModel):
    """Input for the Disco discovery tool."""

    # Location of the dataset; required (no default).
    file_url: str = Field(
        description=(
            "URL of the tabular dataset to analyse "
            "(CSV, Excel, Parquet, JSON, etc.)"
        )
    )

    # Outcome column the analysis is run against; required (no default).
    target_column: str = Field(
        description=(
            "The column to predict/explain — "
            "the outcome you want to understand"
        )
    )

    # Publication mode of the run; defaults to "public".
    visibility: str = Field(
        default="public",
        description=(
            "'public' (free, results published) or "
            "'private' (costs credits, results private)"
        ),
    )

    # Search depth; defaults to 2.
    analysis_depth: int = Field(
        default=2,
        description=(
            "Analysis depth — higher means deeper analysis "
            "but more credits. Default 2."
        ),
    )

    # Columns left out of the analysis; a fresh empty list per instance.
    excluded_columns: list[str] = Field(
        default_factory=list,
        description=(
            "Columns to exclude "
            "(IDs, data leakage, tautological columns)"
        ),
    )

    # Opt-in switch for LLM-backed output; defaults to False.
    use_llms: bool = Field(
        default=False,
        description=(
            "If True, enables LLM-powered summaries, literature context, "
            "and novelty assessment. Slower and more expensive. "
            "Public runs always use LLMs."
        ),
    )
44
+
45
+
46
class DiscoTool(BaseTool):
    """Superhuman exploratory data analysis.

    Disco finds novel, statistically validated patterns in tabular data — the
    feature interactions, subgroup effects, and conditional relationships that
    correlation analysis, LLMs, and manual exploration miss. Every finding comes
    with p-values, effect sizes, and academic literature citations.

    Free for public data. No ML expertise required.
    """

    # Tool identity and argument schema exposed to the agent framework.
    name: str = "disco"
    description: str = (
        "Automated scientific discovery from tabular data. Use when you need to find "
        "patterns, interactions, or subgroup effects in a dataset — especially when you "
        "don't know what to look for. Returns statistically validated patterns with "
        "p-values, effect sizes, and literature citations. Free for public data."
    )
    args_schema: type[BaseModel] = DiscoInput
    # Declared as a model field so the key passed to __init__ is stored on the tool.
    api_key: str = ""

    def __init__(self, api_key: str, **kwargs: Any):
        # Forward the key as a regular field value alongside any extra options.
        super().__init__(api_key=api_key, **kwargs)

    def _run(
        self,
        file_url: str,
        target_column: str,
        visibility: str = "public",
        analysis_depth: int = 2,
        excluded_columns: list[str] | None = None,
        use_llms: bool = False,
    ) -> str:
        # Run a synchronous Disco discovery and return the findings as a JSON string.
        # The SDK is imported here rather than at module import time.
        from discovery import Engine

        result = Engine(api_key=self.api_key).discover_sync(
            file=file_url,
            target_column=target_column,
            visibility=visibility,
            analysis_depth=analysis_depth,
            # A missing or empty exclusion list is always sent as [].
            excluded_columns=excluded_columns or [],
            use_llms=use_llms,
        )

        # Flatten each pattern object into a plain dict of the fields we report.
        findings = [
            {
                "description": pattern.description,
                "conditions": pattern.conditions,
                "p_value": pattern.p_value,
                "effect_size": pattern.abs_target_change,
                "direction": pattern.target_change_direction,
                "support_count": pattern.support_count,
                "support_percentage": pattern.support_percentage,
                "novelty": pattern.novelty_type,
                "novelty_explanation": pattern.novelty_explanation,
                "citations": pattern.citations,
            }
            for pattern in result.patterns
        ]

        payload = {
            "report_url": result.report_url,
            "pattern_count": len(findings),
            "patterns": findings,
        }

        # The overview is added only when the result carries a truthy summary.
        summary = getattr(result, "summary", None)
        if summary:
            payload["summary"] = summary.overview

        # default=str stringifies any value json cannot serialise natively.
        return json.dumps(payload, indent=2, default=str)
@@ -0,0 +1,122 @@
1
+ """LangChain tool wrapper for Disco (Discovery Engine).
2
+
3
+ Install: pip install discovery-engine-api langchain-core
4
+ Usage:
5
+ from discovery.integrations.langchain import DiscoTool
6
+ tool = DiscoTool(api_key="disco_...")
7
+ result = tool.invoke({"file_url": "https://example.com/data.csv", "target_column": "outcome"})
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import asyncio
13
+ import json
14
+ from typing import Any
15
+
16
+ from langchain_core.tools import BaseTool
17
+ from pydantic import BaseModel, Field
18
+
19
+
20
+ class DiscoInput(BaseModel):
21
+ """Input for the Disco discovery tool."""
22
+
23
+ file_url: str = Field(
24
+ description="URL of the tabular dataset to analyse (CSV, Excel, Parquet, JSON, etc.)"
25
+ )
26
+ target_column: str = Field(
27
+ description="The column to predict/explain — the outcome you want to understand"
28
+ )
29
+ visibility: str = Field(
30
+ default="public",
31
+ description="'public' (free, results published) or 'private' (costs credits, results private)",
32
+ )
33
+ analysis_depth: int = Field(
34
+ default=2,
35
+ description="Analysis depth — higher means deeper analysis but more credits. Default 2.",
36
+ )
37
+ excluded_columns: list[str] = Field(
38
+ default_factory=list,
39
+ description="Columns to exclude (IDs, data leakage, tautological columns)",
40
+ )
41
+ use_llms: bool = Field(
42
+ default=False,
43
+ description="If True, enables LLM-powered summaries, literature context, and novelty assessment. Slower and more expensive. Public runs always use LLMs.",
44
+ )
45
+
46
+
47
+ class DiscoTool(BaseTool):
48
+ """Superhuman exploratory data analysis.
49
+
50
+ Disco finds novel, statistically validated patterns in tabular data — the
51
+ feature interactions, subgroup effects, and conditional relationships that
52
+ correlation analysis, LLMs, and manual exploration miss. Every finding comes
53
+ with p-values, effect sizes, and academic literature citations.
54
+
55
+ Free for public data. No ML expertise required.
56
+ """
57
+
58
+ name: str = "disco"
59
+ description: str = (
60
+ "Automated scientific discovery from tabular data. Use when you need to find "
61
+ "patterns, interactions, or subgroup effects in a dataset — especially when you "
62
+ "don't know what to look for. Returns statistically validated patterns with "
63
+ "p-values, effect sizes, and literature citations. Free for public data."
64
+ )
65
+ args_schema: type[BaseModel] = DiscoInput
66
+ api_key: str = ""
67
+
68
+ def __init__(self, api_key: str, **kwargs: Any):
69
+ super().__init__(api_key=api_key, **kwargs)
70
+
71
+ def _run(self, **kwargs: Any) -> str:
72
+ return asyncio.run(self._arun(**kwargs))
73
+
74
+ async def _arun(
75
+ self,
76
+ file_url: str,
77
+ target_column: str,
78
+ visibility: str = "public",
79
+ analysis_depth: int = 2,
80
+ excluded_columns: list[str] | None = None,
81
+ use_llms: bool = False,
82
+ ) -> str:
83
+ from discovery import Engine
84
+
85
+ engine = Engine(api_key=self.api_key)
86
+
87
+ result = await engine.discover(
88
+ file=file_url,
89
+ target_column=target_column,
90
+ visibility=visibility,
91
+ analysis_depth=analysis_depth,
92
+ excluded_columns=excluded_columns or [],
93
+ use_llms=use_llms,
94
+ )
95
+
96
+ patterns = []
97
+ for p in result.patterns:
98
+ patterns.append(
99
+ {
100
+ "description": p.description,
101
+ "conditions": p.conditions,
102
+ "p_value": p.p_value,
103
+ "effect_size": p.abs_target_change,
104
+ "direction": p.target_change_direction,
105
+ "support_count": p.support_count,
106
+ "support_percentage": p.support_percentage,
107
+ "novelty": p.novelty_type,
108
+ "novelty_explanation": p.novelty_explanation,
109
+ "citations": p.citations,
110
+ }
111
+ )
112
+
113
+ output = {
114
+ "report_url": result.report_url,
115
+ "pattern_count": len(patterns),
116
+ "patterns": patterns,
117
+ }
118
+
119
+ if hasattr(result, "summary") and result.summary:
120
+ output["summary"] = result.summary.overview
121
+
122
+ return json.dumps(output, indent=2, default=str)
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "discovery-engine-api"
3
- version = "0.2.93"
3
+ version = "0.2.94"
4
4
  description = "Python SDK for Disco API"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -1,96 +0,0 @@
1
- """CrewAI tool wrapper for Disco.
2
-
3
- Usage:
4
- from discovery.integrations.crewai import DiscoveryEngineTool
5
-
6
- tool = DiscoveryEngineTool(api_key="disco_...")
7
- # Add to your CrewAI agent
8
- agent = Agent(tools=[tool], ...)
9
- """
10
-
11
- from __future__ import annotations
12
-
13
- import json
14
- from typing import Any
15
-
16
- from crewai.tools import BaseTool
17
- from pydantic import Field
18
-
19
- from discovery import Engine
20
-
21
-
22
class DiscoveryEngineTool(BaseTool):
    """CrewAI tool that runs Disco on tabular data.

    Finds novel, statistically validated patterns — feature interactions,
    subgroup effects, and conditional relationships — that correlation analysis,
    LLMs, and hypothesis-driven approaches miss.
    """

    name: str = "Disco"
    description: str = (
        "Run Disco on tabular data to find novel, statistically "
        "validated patterns that you cannot find with pandas, SQL, or by prompting "
        "an LLM to analyze data. Use when you need to go beyond correlation. "
        "Input: JSON with 'file' (path), 'target_column' (column to analyze). "
        "Optional: 'visibility' (public/private), 'analysis_depth' (search depth). "
        "Returns patterns with conditions, p-values, novelty scores, citations."
    )
    api_key: str = Field(description="Disco API key (disco_...)")
    quiet: bool = Field(default=True, description="Suppress progress output")

    def _run(self, query: str) -> str:
        """Run Disco on the request encoded in ``query`` and return JSON text.

        ``query`` must be a JSON object with ``file`` and ``target_column``;
        ``analysis_depth`` (default 2) and ``visibility`` (default "public")
        are optional. Every failure is reported as a JSON object with an
        ``error`` key instead of raising.
        """
        try:
            params = json.loads(query)
        except (json.JSONDecodeError, TypeError):
            # TypeError: the query was not a str/bytes value at all.
            params = None

        # Valid JSON that is not an object (a bare string, list, number) has
        # no .get(); report it like malformed input rather than crashing.
        if not isinstance(params, dict):
            return json.dumps({"error": "Input must be JSON with 'file' and 'target_column' keys."})

        file_path = params.get("file")
        target_column = params.get("target_column")
        if not file_path or not target_column:
            return json.dumps({"error": "Missing required keys: 'file' and 'target_column'."})

        engine = Engine(api_key=self.api_key, quiet=self.quiet)

        try:
            result = engine.discover_sync(
                file=file_path,
                target_column=target_column,
                analysis_depth=params.get("analysis_depth", 2),
                visibility=params.get("visibility", "public"),
            )
        except Exception as e:
            # Tool boundary: hand the failure (and any suggestion attribute
            # the exception carries) back to the agent as data.
            return json.dumps({"error": str(e), "suggestion": getattr(e, "suggestion", None)})

        return _format_result(result)
67
-
68
-
69
- def _format_result(result: Any) -> str:
70
- """Format EngineResult as a JSON string."""
71
- patterns = []
72
- for p in result.patterns:
73
- patterns.append(
74
- {
75
- "description": p.description,
76
- "conditions": p.conditions,
77
- "p_value": p.p_value,
78
- "novelty_type": p.novelty_type,
79
- "novelty_explanation": p.novelty_explanation,
80
- "effect_size": p.abs_target_change,
81
- "direction": p.target_change_direction,
82
- "support_percentage": p.support_percentage,
83
- }
84
- )
85
-
86
- output: dict[str, Any] = {
87
- "status": result.status,
88
- "patterns": patterns,
89
- "report_url": result.report_url,
90
- "dashboard_urls": result.dashboard_urls,
91
- }
92
- if result.summary:
93
- output["summary"] = result.summary.overview
94
- output["key_insights"] = result.summary.key_insights
95
-
96
- return json.dumps(output, indent=2)
@@ -1,105 +0,0 @@
1
- """LangChain tool wrapper for Disco.
2
-
3
- Usage:
4
- from discovery.integrations.langchain import DiscoveryEngineTool
5
-
6
- tool = DiscoveryEngineTool(api_key="disco_...")
7
- # Add to your agent's tool list
8
- agent = initialize_agent(tools=[tool], llm=llm)
9
- """
10
-
11
- from __future__ import annotations
12
-
13
- import asyncio
14
- import json
15
- from typing import Any
16
-
17
- from langchain.tools import BaseTool
18
- from pydantic import Field
19
-
20
- from discovery import Engine
21
-
22
-
23
class DiscoveryEngineTool(BaseTool):
    """LangChain tool that runs Disco on tabular data.

    Finds novel, statistically validated patterns — feature interactions,
    subgroup effects, and conditional relationships — that correlation analysis,
    LLMs, and hypothesis-driven approaches miss.
    """

    name: str = "discovery_engine"
    description: str = (
        "Run Disco on tabular data to find novel, statistically "
        "validated patterns that you cannot find with pandas, SQL, or by prompting "
        "an LLM to analyze data. Use this when you need to go beyond correlation "
        "and find things nobody thought to look for. "
        "Input should be a JSON string with keys: "
        '"file" (path to CSV/Excel/Parquet), "target_column" (column to analyze). '
        'Optional: "visibility" (public/private), "analysis_depth" (search depth). '
        "Returns structured patterns with conditions, p-values, novelty scores, "
        "and citations."
    )
    api_key: str = Field(description="Disco API key (disco_...)")
    quiet: bool = Field(default=True, description="Suppress progress output")

    def _run(self, query: str) -> str:
        """Run synchronously by driving ``_arun`` to completion.

        ``asyncio.run`` creates and closes a fresh loop, replacing the
        deprecated ``get_event_loop().run_until_complete`` pattern. If the
        calling thread already has a running loop, the coroutine is executed
        on a short-lived worker thread that owns its own loop, and this call
        blocks until it finishes.
        """
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            # No loop is running in this thread, so asyncio.run is safe.
            return asyncio.run(self._arun(query))

        from concurrent.futures import ThreadPoolExecutor

        def _run_in_fresh_loop() -> str:
            return asyncio.run(self._arun(query))

        with ThreadPoolExecutor(max_workers=1) as pool:
            return pool.submit(_run_in_fresh_loop).result()

    async def _arun(self, query: str) -> str:
        """Run Disco asynchronously on the request encoded in ``query``.

        ``query`` must be a JSON object with ``file`` and ``target_column``;
        ``analysis_depth`` (default 2) and ``visibility`` (default "public")
        are optional. Every failure is reported as a JSON object with an
        ``error`` key instead of raising.
        """
        try:
            params = json.loads(query)
        except (json.JSONDecodeError, TypeError):
            # TypeError: the query was not a str/bytes value at all.
            params = None

        # Anything other than a JSON object (including a bare file path or a
        # JSON list/number) is rejected: a target column is always required.
        if not isinstance(params, dict):
            return json.dumps({"error": "Input must be JSON with 'file' and 'target_column' keys."})

        file_path = params.get("file")
        target_column = params.get("target_column")
        if not file_path or not target_column:
            return json.dumps({"error": "Missing required keys: 'file' and 'target_column'."})

        engine = Engine(api_key=self.api_key, quiet=self.quiet)

        try:
            result = await engine.discover(
                file=file_path,
                target_column=target_column,
                analysis_depth=params.get("analysis_depth", 2),
                visibility=params.get("visibility", "public"),
            )
        except Exception as e:
            # Tool boundary: hand the failure (and any suggestion attribute
            # the exception carries) back to the agent as data.
            return json.dumps({"error": str(e), "suggestion": getattr(e, "suggestion", None)})

        return _format_result(result)
76
-
77
-
78
- def _format_result(result: Any) -> str:
79
- """Format EngineResult as a JSON string for the LLM."""
80
- patterns = []
81
- for p in result.patterns:
82
- patterns.append(
83
- {
84
- "description": p.description,
85
- "conditions": p.conditions,
86
- "p_value": p.p_value,
87
- "novelty_type": p.novelty_type,
88
- "novelty_explanation": p.novelty_explanation,
89
- "effect_size": p.abs_target_change,
90
- "direction": p.target_change_direction,
91
- "support_percentage": p.support_percentage,
92
- }
93
- )
94
-
95
- output: dict[str, Any] = {
96
- "status": result.status,
97
- "patterns": patterns,
98
- "report_url": result.report_url,
99
- "dashboard_urls": result.dashboard_urls,
100
- }
101
- if result.summary:
102
- output["summary"] = result.summary.overview
103
- output["key_insights"] = result.summary.key_insights
104
-
105
- return json.dumps(output, indent=2)