discovery-engine-api 0.1.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,56 @@
1
+ # trained models and generated prototypes
2
+ /models
3
+ /results
4
+ /train_results
5
+ research/automl/trained_model
6
+ research/automl/leap_files
7
+ *.ckpt
8
+ */train_results/*
9
+
10
+ # DS_STORE
11
+ .DS_Store
12
+
13
+ # python
14
+ __pycache__
15
+
16
+ # virtualenv
17
+ .venv
18
+ venv
19
+
20
+ # pytest
21
+ .pytest_cache
22
+
23
+ # coverage
24
+ htmlcov/
25
+ .coverage
26
+ .coverage.*
27
+ *.cover
28
+
29
+ # ruff
30
+ .ruff_cache
31
+
32
+ # dotenv
33
+ *.env
34
+
35
+ # pickle files
36
+ *.pkl
37
+ *.pickle
38
+
39
+ # leap_ie
40
+ leap_files
41
+ **/leap_files/
42
+
43
+ # packages/data
44
+ /datasets
45
+
46
+ # packages/reporting
47
+ discovery_report
48
+
49
+ # packages/training
50
+ wandb
51
+
52
+ # logs
53
+ lightning_logs
54
+ .deepeval
55
+ logs/*
56
+ logs/memory_monitor.log
@@ -0,0 +1,356 @@
1
+ Metadata-Version: 2.4
2
+ Name: discovery-engine-api
3
+ Version: 0.1.5
4
+ Summary: Python SDK for the Discovery Engine API
5
+ Project-URL: Homepage, https://github.com/leap-laboratories/discovery
6
+ Project-URL: Documentation, https://github.com/leap-laboratories/discovery
7
+ Project-URL: Repository, https://github.com/leap-laboratories/discovery
8
+ Author: Leap Laboratories
9
+ License: MIT
10
+ Keywords: api,data-analysis,discovery,machine-learning,sdk
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Requires-Python: >=3.10
21
+ Requires-Dist: httpx>=0.24.0
22
+ Requires-Dist: pydantic>=2.0.0
23
+ Provides-Extra: dev
24
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
25
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
26
+ Provides-Extra: jupyter
27
+ Requires-Dist: nest-asyncio>=1.5.0; extra == 'jupyter'
28
+ Provides-Extra: pandas
29
+ Requires-Dist: pandas>=2.0.0; extra == 'pandas'
30
+ Description-Content-Type: text/markdown
31
+
32
+ # Discovery Engine Python API
33
+
34
+ The Discovery Engine Python API provides a simple programmatic interface to run analyses via Python, offering an alternative to using the web dashboard. Instead of uploading datasets and configuring analyses through the UI, you can automate your discovery workflows directly from your Python code or scripts.
35
+
36
+ All analyses run through the API are fully integrated with your Discovery Engine account. Results are automatically displayed in the dashboard, where you can view detailed reports, explore patterns, and share findings with your team. Your account management, credit balance, and subscription settings are all handled through the dashboard.
37
+
38
+ ## Installation
39
+
40
+ ```bash
41
+ pip install discovery-engine-api
42
+ ```
43
+
44
+ For pandas DataFrame support:
45
+
46
+ ```bash
47
+ pip install discovery-engine-api[pandas]
48
+ ```
49
+
50
+ For Jupyter notebook support:
51
+
52
+ ```bash
53
+ pip install discovery-engine-api[jupyter]
54
+ ```
55
+
56
+ This installs `nest-asyncio`, which is required to use `engine.run()` in Jupyter notebooks. Alternatively, you can use `await engine.run_async()` directly in Jupyter notebooks without installing the jupyter extra.
57
+
58
+ ## Configuration
59
+
60
+ ### API Keys
61
+
62
+ Get your API key from the [Developers page](https://disco.leap-labs.com/developers) in your Discovery Engine dashboard.
63
+
64
+ ## Quick Start
65
+
66
+ ```python
67
+ from discovery import Engine
68
+
69
+ # Initialize engine
70
+ engine = Engine(api_key="your-api-key")
71
+
72
+ # Run analysis on a dataset and wait for results
73
+ result = engine.run(
74
+ file="data.csv",
75
+ target_column="diagnosis",
76
+ mode="fast",
77
+ description="Rare diseases dataset",
78
+ wait=True # Wait for completion and return full results
79
+ )
80
+
81
+ print(f"Run ID: {result.run_id}")
82
+ print(f"Status: {result.status}")
83
+ print(f"Found {len(result.patterns)} patterns")
84
+ ```
85
+
86
+
87
+ ## Examples
88
+
89
+ ### Working with Pandas DataFrames
90
+
91
+ ```python
92
+ import pandas as pd
93
+ from discovery import Engine
94
+
95
+ df = pd.read_csv("data.csv")
96
+ # or create DataFrame directly
97
+
98
+ engine = Engine(api_key="your-api-key")
99
+ result = engine.run(
100
+ file=df, # Pass DataFrame directly
101
+ target_column="outcome",
102
+ column_descriptions={
103
+ "age": "Patient age in years",
104
+ "heart rate": None
105
+ },
106
+ wait=True
107
+ )
108
+ ```
109
+
110
+
111
+ ### Async Workflow
112
+
113
+ ```python
114
+ import asyncio
115
+ from discovery import Engine
116
+
117
+ async def run_analysis():
118
+ async with Engine(api_key="your-api-key") as engine:
119
+ # Start analysis without waiting
120
+ result = await engine.run_async(
121
+ file="data.csv",
122
+ target_column="target",
123
+ wait=False
124
+ )
125
+ print(f"Started run: {result.run_id}")
126
+
127
+ # Later, get results
128
+ result = await engine.get_results(result.run_id)
129
+
130
+ # Or wait for completion
131
+ result = await engine.wait_for_completion(result.run_id, timeout=1200)
132
+ return result
133
+
134
+ result = asyncio.run(run_analysis())
135
+ ```
136
+
137
+ ### Using in Jupyter Notebooks
138
+
139
+ In Jupyter notebooks, you have two options:
140
+
141
+ **Option 1: Install the jupyter extra (recommended)**
142
+ ```bash
143
+ pip install discovery-engine-api[jupyter]
144
+ ```
145
+
146
+ Then use `engine.run()` as normal:
147
+ ```python
148
+ from discovery import Engine
149
+
150
+ engine = Engine(api_key="your-api-key")
151
+ result = engine.run(file="data.csv", target_column="target", wait=True)
152
+ ```
153
+
154
+ **Option 2: Use async directly**
155
+ ```python
156
+ from discovery import Engine
157
+
158
+ engine = Engine(api_key="your-api-key")
159
+ result = await engine.run_async(file="data.csv", target_column="target", wait=True)
160
+ ```
161
+
162
+
163
+ ## Configuration Options
164
+
165
+ The `run()` and `run_async()` methods accept the following parameters:
166
+
167
+ | Parameter | Type | Default | Description |
168
+ |-----------|------|---------|-------------|
169
+ | `file` | `str`, `Path`, or `DataFrame` | **Required** | Dataset file path or pandas DataFrame |
170
+ | `target_column` | `str` | **Required** | Name of column to predict |
171
+ | `mode` | `"fast"` / `"deep"` | `"fast"` | Analysis depth |
172
+ | `title` | `str` | `None` | Optional dataset title |
173
+ | `description` | `str` | `None` | Optional dataset description |
174
+ | `column_descriptions` | `Dict[str, Optional[str]]` | `None` | Optional column name -> description mapping (a value of `None` leaves that column undescribed) |
175
+ | `visibility` | `"public"` / `"private"` | `"public"` | Dataset visibility (private requires credits) |
176
+ | `auto_report_use_llm_evals` | `bool` | `True` | Use LLM for pattern descriptions |
177
+ | `author` | `str` | `None` | Optional dataset author attribution |
178
+ | `source_url` | `str` | `None` | Optional source URL for dataset attribution |
179
+ | `wait` | `bool` | `False` | Wait for analysis to complete and return full results |
180
+ | `wait_timeout` | `float` | `None` | Maximum seconds to wait for completion (only if `wait=True`) |
181
+
182
+
183
+ ## Credits and Pricing
184
+
185
+ - **Public datasets**: Free (0 credits required)
186
+ - **Private datasets**:
187
+ - Fast mode: 1 credit per MB
188
+ - Deep mode: 3 credits per MB
189
+
190
+ If you don't have enough credits for a private run, the SDK will raise an `httpx.HTTPStatusError` with an error message like:
191
+ ```
192
+ Insufficient credits. You need X credits but only have Y available.
193
+ ```
194
+
195
+ **Solutions:**
196
+ 1. Make your dataset public (set `visibility="public"`) - completely free
197
+ 2. Visit [https://disco.leap-labs.com/account](https://disco.leap-labs.com/account) to:
198
+ - Purchase additional credits
199
+ - Upgrade to a subscription plan that includes more credits
200
+
201
+
202
+ ## Return Value
203
+
204
+ The `run()` and `run_async()` methods return an `EngineResult` object with the following fields:
205
+
206
+ ### EngineResult
207
+
208
+ ```python
209
+ @dataclass
210
+ class EngineResult:
211
+ # Identifiers
212
+ run_id: str # Unique run identifier
213
+ report_id: Optional[str] # Report ID (if report created)
214
+ status: str # "pending", "processing", "completed", "failed"
215
+
216
+ # Dataset metadata
217
+ dataset_title: Optional[str] # Dataset title
218
+ dataset_description: Optional[str] # Dataset description
219
+ total_rows: Optional[int] # Number of rows in dataset
220
+ target_column: Optional[str] # Name of target column
221
+ task: Optional[str] # "regression", "binary_classification", or "multiclass_classification"
222
+
223
+ # LLM-generated summary
224
+ summary: Optional[Summary] # Summary object with overview, insights, etc.
225
+
226
+ # Discovered patterns
227
+ patterns: List[Pattern] # List of discovered patterns
228
+
229
+ # Column/feature information
230
+ columns: List[Column] # List of columns with statistics and importance
231
+
232
+ # Correlation matrix
233
+ correlation_matrix: List[CorrelationEntry] # Feature correlations
234
+
235
+ # Global feature importance
236
+ feature_importance: Optional[FeatureImportance] # Feature importance scores
237
+
238
+ # Job tracking
239
+ job_id: Optional[str] # Job ID for tracking processing
240
+ job_status: Optional[str] # Job status
241
+ error_message: Optional[str] # Error message if analysis failed
242
+ ```
243
+
244
+ ### Summary
245
+
246
+ ```python
247
+ @dataclass
248
+ class Summary:
249
+ overview: str # High-level explanation of findings
250
+ key_insights: List[str] # List of main takeaways
251
+ novel_patterns: PatternGroup # Novel pattern explanations
252
+ surprising_findings: PatternGroup # Surprising findings
253
+ statistically_significant: PatternGroup # Statistically significant patterns
254
+ data_insights: Optional[DataInsights] # Important features, correlations
255
+ selected_pattern_id: Optional[str] # ID of selected pattern
256
+ ```
257
+
258
+ ### Pattern
259
+
260
+ ```python
261
+ @dataclass
262
+ class Pattern:
263
+ id: str # Pattern identifier
264
+ task: str # Task type
265
+ target_column: str # Target column name
266
+ direction: str # "min" or "max"
267
+ p_value: float # Statistical p-value
268
+ conditions: List[Dict] # Pattern conditions (continuous, categorical, datetime)
269
+ lift_value: float # Lift value (how much the pattern increases/decreases target)
270
+ support_count: int # Number of rows matching pattern
271
+ support_percentage: float # Percentage of rows matching pattern
272
+ pattern_type: str # "validated" or "speculative"
273
+ novelty_type: str # "novel" or "confirmatory"
274
+ target_score: float # Target score for this pattern
275
+ description: str # Human-readable description
276
+ novelty_explanation: str # Explanation of novelty
277
+ target_class: Optional[str] # Target class (for classification)
278
+ target_mean: Optional[float] # Target mean (for regression)
279
+ target_std: Optional[float] # Target standard deviation
280
+ citations: List[Dict] # Academic citations
281
+ ```
282
+
283
+ ### Column
284
+
285
+ ```python
286
+ @dataclass
287
+ class Column:
288
+ id: str # Column identifier
289
+ name: str # Column name
290
+ display_name: str # Display name
291
+ type: str # "continuous" or "categorical"
292
+ data_type: str # "int", "float", "string", "boolean", "datetime"
293
+ enabled: bool # Whether column is enabled
294
+ description: Optional[str] # Column description
295
+
296
+ # Statistics
297
+ mean: Optional[float] # Mean value
298
+ median: Optional[float] # Median value
299
+ std: Optional[float] # Standard deviation
300
+ min: Optional[float] # Minimum value
301
+ max: Optional[float] # Maximum value
302
+ iqr_min: Optional[float] # IQR minimum
303
+ iqr_max: Optional[float] # IQR maximum
304
+ mode: Optional[str] # Mode value
305
+ approx_unique: Optional[int] # Approximate unique count
306
+ null_percentage: Optional[float] # Percentage of null values
307
+
308
+ # Feature importance
309
+ feature_importance_score: Optional[float] # Feature importance score
310
+ ```
311
+
312
+ ### FeatureImportance
313
+
314
+ ```python
315
+ @dataclass
316
+ class FeatureImportance:
317
+ kind: str # Feature importance type: "global"
318
+ baseline: float # Baseline model output
319
+ scores: List[FeatureImportanceScore] # List of feature scores
320
+ ```
321
+
322
+ ### CorrelationEntry
323
+
324
+ ```python
325
+ @dataclass
326
+ class CorrelationEntry:
327
+ feature_x: str # First feature name
328
+ feature_y: str # Second feature name
329
+ value: float # Correlation value (-1 to 1)
330
+ ```
331
+
332
+ ### Pattern
333
+
334
+ ```python
335
+ @dataclass
336
+ class Pattern:
337
+ id: str
338
+ task: str
339
+ target_column: str
340
+ direction: str # "min" or "max"
341
+ p_value: float
342
+ conditions: List[Dict] # Continuous, categorical, or datetime conditions
343
+ lift_value: float
344
+ support_count: int
345
+ support_percentage: float
346
+ pattern_type: str # "validated" or "speculative"
347
+ novelty_type: str # "novel" or "confirmatory"
348
+ target_score: float
349
+ description: str
350
+ novelty_explanation: str
351
+ target_class: Optional[str]
352
+ target_mean: Optional[float]
353
+ target_std: Optional[float]
354
+ citations: List[Dict]
355
+ ```
356
+