discovery-engine-api 0.1.52 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
discovery/types.py ADDED
@@ -0,0 +1,256 @@
+ """Type definitions for the Discovery SDK."""
+
+ from dataclasses import dataclass, field
+ from typing import Any, Dict, List, Literal, Optional, Union
+
+
+ @dataclass
+ class FileInfo:
+     """Information about an uploaded file."""
+
+     file_path: str # GCS path
+     file_hash: str
+     file_size: int
+     mime_type: str
+
+
+ @dataclass
+ class TimeseriesGroup:
+     """Timeseries column group metadata."""
+
+     base_name: str
+     columns: List[str]
+     num_timesteps: int
+     pattern_matched: str
+     dtype: str # "numeric" or "categorical"
+
+
+ # Pattern types
+
+
+ @dataclass
+ class PatternContinuousCondition:
+     """A continuous condition in a pattern."""
+
+     type: Literal["continuous"]
+     feature: str
+     min_value: float
+     max_value: float
+     min_q: Optional[float] = None
+     max_q: Optional[float] = None
+
+
+ @dataclass
+ class PatternCategoricalCondition:
+     """A categorical condition in a pattern."""
+
+     type: Literal["categorical"]
+     feature: str
+     values: List[Union[str, int, float, bool, None]]
+
+
+ @dataclass
+ class PatternDatetimeCondition:
+     """A datetime condition in a pattern."""
+
+     type: Literal["datetime"]
+     feature: str
+     min_value: float # epoch milliseconds
+     max_value: float # epoch milliseconds
+     min_datetime: str # human-readable
+     max_datetime: str # human-readable
+     min_q: Optional[float] = None
+     max_q: Optional[float] = None
+
+
+ PatternCondition = Union[
+     PatternContinuousCondition, PatternCategoricalCondition, PatternDatetimeCondition
+ ]
+
+
+ @dataclass
+ class PatternCitation:
+     """Academic citation for a pattern."""
+
+     url: str
+     title: Optional[str] = None
+     doi: Optional[str] = None
+     authors: Optional[List[str]] = None
+     year: Optional[str] = None
+     journal: Optional[str] = None
+     volume: Optional[str] = None
+     issue: Optional[str] = None
+     pages: Optional[str] = None
+
+
+ @dataclass
+ class Pattern:
+     """A discovered pattern in the data."""
+
+     id: str
+     task: str # regression, binary_classification, multiclass_classification
+     target_column: str
+     direction: str # "min" or "max"
+     p_value: float
+     conditions: List[Dict[str, Any]] # PatternCondition as dicts
+     lift_value: float
+     support_count: int
+     support_percentage: float
+     pattern_type: str # "validated" or "speculative"
+     novelty_type: str # "novel" or "confirmatory"
+     target_score: float
+     description: str
+     novelty_explanation: str
+     target_class: Optional[str] = None
+     target_mean: Optional[float] = None
+     target_std: Optional[float] = None
+     citations: List[Dict[str, Any]] = field(default_factory=list)
+
+
+ # Column/Feature types
+
+
+ @dataclass
+ class Column:
+     """Information about a dataset column/feature."""
+
+     id: str
+     name: str
+     display_name: str
+     type: str # "continuous" or "categorical"
+     data_type: str # "int", "float", "string", "boolean", "datetime"
+     enabled: bool
+     description: Optional[str] = None
+
+     # Statistics
+     mean: Optional[float] = None
+     median: Optional[float] = None
+     std: Optional[float] = None
+     min: Optional[float] = None
+     max: Optional[float] = None
+     iqr_min: Optional[float] = None
+     iqr_max: Optional[float] = None
+     mode: Optional[str] = None
+     approx_unique: Optional[int] = None
+     null_percentage: Optional[float] = None
+
+     # Feature importance
+     feature_importance_score: Optional[float] = None
+
+
+ # Summary types (LLM-generated)
+
+
+ @dataclass
+ class DataInsights:
+     """LLM-generated data insights."""
+
+     important_features: List[str]
+     important_features_explanation: str
+     strong_correlations: List[Dict[str, str]] # [{"feature1": "...", "feature2": "..."}]
+     strong_correlations_explanation: str
+     notable_relationships: List[str]
+
+
+ @dataclass
+ class PatternGroup:
+     """A group of patterns with explanation."""
+
+     pattern_ids: List[str]
+     explanation: str
+
+
+ @dataclass
+ class Summary:
+     """LLM-generated summary of the analysis."""
+
+     overview: str
+     key_insights: List[str]
+     novel_patterns: PatternGroup
+     surprising_findings: PatternGroup
+     statistically_significant: PatternGroup
+     data_insights: DataInsights
+     selected_pattern_id: Optional[str] = None
+
+
+ # Feature importance types
+
+
+ @dataclass
+ class FeatureImportanceScore:
+     """A single feature importance score."""
+
+     feature: str
+     score: float
+
+
+ @dataclass
+ class FeatureImportance:
+     """Global feature importance information."""
+
+     kind: str # "global" or "local"
+     baseline: float # expected model output
+     scores: List[FeatureImportanceScore]
+
+
+ # Correlation matrix types
+
+
+ @dataclass
+ class CorrelationEntry:
+     """A single correlation matrix entry."""
+
+     feature_x: str
+     feature_y: str
+     value: float
+
+
+ # Main result type
+
+
+ @dataclass
+ class EngineResult:
+     """Complete result of an engine run."""
+
+     # Identifiers
+     run_id: str
+     report_id: Optional[str] = None
+     status: str = "pending" # pending, processing, completed, failed
+
+     # Dataset metadata
+     dataset_title: Optional[str] = None
+     dataset_description: Optional[str] = None
+     total_rows: Optional[int] = None
+     target_column: Optional[str] = None
+     task: Optional[str] = None # regression, binary_classification, multiclass_classification
+
+     # LLM-generated summary
+     summary: Optional[Summary] = None
+
+     # Discovered patterns
+     patterns: List[Pattern] = field(default_factory=list)
+
+     # Column/feature information with stats and importance
+     columns: List[Column] = field(default_factory=list)
+
+     # Correlation matrix
+     correlation_matrix: List[CorrelationEntry] = field(default_factory=list)
+
+     # Global feature importance
+     feature_importance: Optional[FeatureImportance] = None
+
+     # Job tracking
+     job_id: Optional[str] = None
+     job_status: Optional[str] = None
+     error_message: Optional[str] = None
+
+
+ @dataclass
+ class RunStatus:
+     """Status of a run."""
+
+     run_id: str
+     status: str
+     job_id: Optional[str] = None
+     job_status: Optional[str] = None
+     error_message: Optional[str] = None
discovery_engine_api-0.1.52.dist-info/METADATA ADDED
@@ -0,0 +1,354 @@
+ Metadata-Version: 2.4
+ Name: discovery-engine-api
+ Version: 0.1.52
+ Summary: Python SDK for the Discovery Engine API
+ Project-URL: Homepage, https://github.com/leap-laboratories/discovery
+ Project-URL: Documentation, https://github.com/leap-laboratories/discovery
+ Project-URL: Repository, https://github.com/leap-laboratories/discovery
+ Author: Leap Laboratories
+ License: MIT
+ Keywords: api,data-analysis,discovery,machine-learning,sdk
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Requires-Python: >=3.10
+ Requires-Dist: httpx>=0.24.0
+ Requires-Dist: pydantic>=2.0.0
+ Provides-Extra: dev
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
+ Provides-Extra: jupyter
+ Requires-Dist: nest-asyncio>=1.5.0; extra == 'jupyter'
+ Provides-Extra: pandas
+ Requires-Dist: pandas>=2.0.0; extra == 'pandas'
+ Description-Content-Type: text/markdown
+
+ # Discovery Engine Python API
+
+ The Discovery Engine Python API provides a simple programmatic interface to run analyses via Python, offering an alternative to using the web dashboard. Instead of uploading datasets and configuring analyses through the UI, you can automate your discovery workflows directly from your Python code or scripts.
+
+ All analyses run through the API are fully integrated with your Discovery Engine account. Results are automatically displayed in the dashboard, where you can view detailed reports, explore patterns, and share findings with your team. Your account management, credit balance, and subscription settings are all handled through the dashboard.
+
+ ## Installation
+
+ ```bash
+ pip install discovery-engine-api
+ ```
+
+ For pandas DataFrame support:
+
+ ```bash
+ pip install discovery-engine-api[pandas]
+ ```
+
+ For Jupyter notebook support:
+
+ ```bash
+ pip install discovery-engine-api[jupyter]
+ ```
+
+ This installs `nest-asyncio`, which is required to use `engine.run()` in Jupyter notebooks. Alternatively, you can call `await engine.run_async()` directly without installing the jupyter extra.
+
+ ## Configuration
+
+ ### API Keys
+
+ Get your API key from the [Developers page](https://disco.leap-labs.com/developers) in your Discovery Engine dashboard.
+
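+ The examples below pass the key directly for brevity. If you prefer not to hard-code it, you can read it from an environment variable first; the variable name in this sketch is only a convention, not something the SDK reads on its own:
+
+ ```python
+ import os
+
+ from discovery import Engine
+
+ # DISCOVERY_API_KEY is an illustrative name; use whatever your environment provides.
+ api_key = os.environ["DISCOVERY_API_KEY"]
+ engine = Engine(api_key=api_key)
+ ```
+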
+ ## Quick Start
+
+ ```python
+ from discovery import Engine
+
+ # Initialize engine
+ engine = Engine(api_key="your-api-key")
+
+ # Run analysis on a dataset and wait for results
+ result = engine.run(
+     file="data.csv",
+     target_column="diagnosis",
+     mode="fast",
+     description="Rare diseases dataset",
+     excluded_columns=["patient_id"], # Exclude ID column from analysis
+     wait=True # Wait for completion and return full results
+ )
+
+ print(f"Run ID: {result.run_id}")
+ print(f"Status: {result.status}")
+ print(f"Found {len(result.patterns)} patterns")
+ ```
+
+
+ ## Examples
+
+ ### Working with Pandas DataFrames
+
+ ```python
+ import pandas as pd
+ from discovery import Engine
+
+ df = pd.read_csv("data.csv")
+ # or create DataFrame directly
+
+ engine = Engine(api_key="your-api-key")
+ result = engine.run(
+     file=df, # Pass DataFrame directly
+     target_column="outcome",
+     column_descriptions={
+         "age": "Patient age in years",
+         "heart rate": None
+     },
+     excluded_columns=["id", "timestamp"], # Exclude ID and timestamp columns from analysis
+     wait=True
+ )
+ ```
+
+
+ ### Async Workflow
+
+ ```python
+ import asyncio
+ from discovery import Engine
+
+ async def run_analysis():
+     async with Engine(api_key="your-api-key") as engine:
+         # Start analysis without waiting
+         result = await engine.run_async(
+             file="data.csv",
+             target_column="target",
+             wait=False
+         )
+         print(f"Started run: {result.run_id}")
+
+         # Later, get results
+         result = await engine.get_results(result.run_id)
+
+         # Or wait for completion
+         result = await engine.wait_for_completion(result.run_id, timeout=1200)
+         return result
+
+ result = asyncio.run(run_analysis())
+ ```
+
+ ### Using in Jupyter Notebooks
+
+ In Jupyter notebooks, you have two options:
+
+ **Option 1: Install the jupyter extra (recommended)**
+ ```bash
+ pip install discovery-engine-api[jupyter]
+ ```
+
+ Then use `engine.run()` as normal:
+ ```python
+ from discovery import Engine
+
+ engine = Engine(api_key="your-api-key")
+ result = engine.run(file="data.csv", target_column="target", wait=True)
+ ```
+
+ **Option 2: Use async directly**
+ ```python
+ from discovery import Engine
+
+ engine = Engine(api_key="your-api-key")
+ result = await engine.run_async(file="data.csv", target_column="target", wait=True)
+ ```
+
+
+ ## Configuration Options
+
+ The `run()` and `run_async()` methods accept the following parameters; a combined example follows the table:
+
+ | Parameter | Type | Default | Description |
+ |-----------|------|---------|-------------|
+ | `file` | `str`, `Path`, or `DataFrame` | **Required** | Dataset file path or pandas DataFrame |
+ | `target_column` | `str` | **Required** | Name of column to predict |
+ | `mode` | `"fast"` / `"deep"` | `"fast"` | Analysis depth |
+ | `title` | `str` | `None` | Optional dataset title |
+ | `description` | `str` | `None` | Optional dataset description |
+ | `column_descriptions` | `Dict[str, str]` | `None` | Optional column name -> description mapping |
+ | `excluded_columns` | `List[str]` | `None` | Optional list of column names to exclude from analysis (e.g., IDs, timestamps) |
+ | `visibility` | `"public"` / `"private"` | `"public"` | Dataset visibility (private requires credits) |
+ | `auto_report_use_llm_evals` | `bool` | `True` | Use LLM for pattern descriptions |
+ | `author` | `str` | `None` | Optional dataset author attribution |
+ | `source_url` | `str` | `None` | Optional source URL for dataset attribution |
+ | `wait` | `bool` | `False` | Wait for analysis to complete and return full results |
+ | `wait_timeout` | `float` | `None` | Maximum seconds to wait for completion (only if `wait=True`) |
+
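+ For example, a single call combining several of the options above (the values are illustrative):
+
+ ```python
+ from discovery import Engine
+
+ engine = Engine(api_key="your-api-key")
+ result = engine.run(
+     file="data.csv",
+     target_column="diagnosis",
+     mode="deep", # deeper analysis than the default "fast"
+     title="Clinical cohort",
+     description="De-identified clinical measurements",
+     column_descriptions={"age": "Patient age in years"},
+     excluded_columns=["patient_id"], # drop identifier columns
+     visibility="private", # private runs require credits
+     auto_report_use_llm_evals=True,
+     author="Example Lab",
+     source_url="https://example.com/dataset",
+     wait=True,
+     wait_timeout=1800, # stop waiting after 30 minutes
+ )
+ print(result.status)
+ ```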
+
+ ## Credits and Pricing
+
+ If you don't have enough credits for a private run, the SDK raises an `httpx.HTTPStatusError` with an error message like the following (an example of handling it follows the list below):
+ ```
+ Insufficient credits. You need X credits but only have Y available.
+ ```
+
+ **Solutions:**
+ 1. Make your dataset public (set `visibility="public"`), which is completely free
+ 2. Visit [https://disco.leap-labs.com/account](https://disco.leap-labs.com/account) to:
+    - Purchase additional credits
+    - Upgrade to a subscription plan that includes more credits
+
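+ A minimal sketch of catching that error and falling back to a free public run (the exact message and status code may differ):
+
+ ```python
+ import httpx
+
+ from discovery import Engine
+
+ engine = Engine(api_key="your-api-key")
+ try:
+     result = engine.run(
+         file="data.csv",
+         target_column="target",
+         visibility="private",
+         wait=True,
+     )
+ except httpx.HTTPStatusError as exc:
+     print(f"Private run rejected: {exc}")
+     # Fall back to a public run, which does not consume credits
+     result = engine.run(
+         file="data.csv",
+         target_column="target",
+         visibility="public",
+         wait=True,
+     )
+ ```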
+
+ ## Return Value
+
+ The `run()` and `run_async()` methods return an `EngineResult` object with the following fields:
+
+ ### EngineResult
+
+ ```python
+ @dataclass
+ class EngineResult:
+     # Identifiers
+     run_id: str # Unique run identifier
+     report_id: Optional[str] # Report ID (if report created)
+     status: str # "pending", "processing", "completed", "failed"
+
+     # Dataset metadata
+     dataset_title: Optional[str] # Dataset title
+     dataset_description: Optional[str] # Dataset description
+     total_rows: Optional[int] # Number of rows in dataset
+     target_column: Optional[str] # Name of target column
+     task: Optional[str] # "regression", "binary_classification", or "multiclass_classification"
+
+     # LLM-generated summary
+     summary: Optional[Summary] # Summary object with overview, insights, etc.
+
+     # Discovered patterns
+     patterns: List[Pattern] # List of discovered patterns
+
+     # Column/feature information
+     columns: List[Column] # List of columns with statistics and importance
+
+     # Correlation matrix
+     correlation_matrix: List[CorrelationEntry] # Feature correlations
+
+     # Global feature importance
+     feature_importance: Optional[FeatureImportance] # Feature importance scores
+
+     # Job tracking
+     job_id: Optional[str] # Job ID for tracking processing
+     job_status: Optional[str] # Job status
+     error_message: Optional[str] # Error message if analysis failed
+ ```
+
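+ As a quick sketch of reading a completed result (field names as defined above):
+
+ ```python
+ from discovery import Engine
+
+ engine = Engine(api_key="your-api-key")
+ result = engine.run(file="data.csv", target_column="target", wait=True)
+
+ if result.status == "completed":
+     print(f"{result.total_rows} rows, task: {result.task}")
+     if result.summary is not None:
+         print(result.summary.overview)
+     for pattern in result.patterns[:5]:
+         print(f"- {pattern.description} (p={pattern.p_value:.3g}, lift={pattern.lift_value:.2f})")
+ else:
+     print(f"Run {result.run_id} ended with status {result.status}: {result.error_message}")
+ ```
+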
+ ### Summary
+
+ ```python
+ @dataclass
+ class Summary:
+     overview: str # High-level explanation of findings
+     key_insights: List[str] # List of main takeaways
+     novel_patterns: PatternGroup # Novel pattern explanations
+     surprising_findings: PatternGroup # Surprising findings
+     statistically_significant: PatternGroup # Statistically significant patterns
+     data_insights: DataInsights # Important features, correlations
+     selected_pattern_id: Optional[str] # ID of selected pattern
+ ```
+
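+ Each `PatternGroup` holds pattern IDs plus an explanation rather than full `Pattern` objects; a short sketch of resolving them against `result.patterns` (continuing with `result` from the example above):
+
+ ```python
+ # result is the EngineResult returned by engine.run(..., wait=True) above
+ summary = result.summary
+ if summary is not None:
+     print(summary.overview)
+     for insight in summary.key_insights:
+         print(f"* {insight}")
+
+     # Look up the full Pattern objects referenced by a PatternGroup
+     patterns_by_id = {p.id: p for p in result.patterns}
+     print(summary.novel_patterns.explanation)
+     for pattern_id in summary.novel_patterns.pattern_ids:
+         pattern = patterns_by_id.get(pattern_id)
+         if pattern is not None:
+             print(f"- {pattern.description}")
+ ```
+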
+ ### Pattern
+
+ ```python
+ @dataclass
+ class Pattern:
+     id: str # Pattern identifier
+     task: str # Task type
+     target_column: str # Target column name
+     direction: str # "min" or "max"
+     p_value: float # Statistical p-value
+     conditions: List[Dict] # Pattern conditions (continuous, categorical, datetime)
+     lift_value: float # Lift value (how much the pattern increases/decreases target)
+     support_count: int # Number of rows matching pattern
+     support_percentage: float # Percentage of rows matching pattern
+     pattern_type: str # "validated" or "speculative"
+     novelty_type: str # "novel" or "confirmatory"
+     target_score: float # Target score for this pattern
+     description: str # Human-readable description
+     novelty_explanation: str # Explanation of novelty
+     target_class: Optional[str] # Target class (for classification)
+     target_mean: Optional[float] # Target mean (for regression)
+     target_std: Optional[float] # Target standard deviation
+     citations: List[Dict] # Academic citations
+ ```
+
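+ Conditions are plain dicts whose keys mirror the condition dataclasses in `discovery/types.py` above; a rough sketch of filtering patterns and printing their conditions (the p-value threshold is illustrative):
+
+ ```python
+ # result is the EngineResult returned by engine.run(..., wait=True) above
+ validated = [
+     p for p in result.patterns
+     if p.pattern_type == "validated" and p.p_value < 0.05
+ ]
+
+ for pattern in validated:
+     print(f"{pattern.description} ({pattern.support_percentage:.1f}% of rows)")
+     for cond in pattern.conditions:
+         if cond["type"] == "continuous":
+             print(f"  {cond['feature']} in [{cond['min_value']}, {cond['max_value']}]")
+         elif cond["type"] == "categorical":
+             print(f"  {cond['feature']} in {cond['values']}")
+         else: # datetime
+             print(f"  {cond['feature']} between {cond['min_datetime']} and {cond['max_datetime']}")
+ ```
+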
+ ### Column
+
+ ```python
+ @dataclass
+ class Column:
+     id: str # Column identifier
+     name: str # Column name
+     display_name: str # Display name
+     type: str # "continuous" or "categorical"
+     data_type: str # "int", "float", "string", "boolean", "datetime"
+     enabled: bool # Whether column is enabled
+     description: Optional[str] # Column description
+
+     # Statistics
+     mean: Optional[float] # Mean value
+     median: Optional[float] # Median value
+     std: Optional[float] # Standard deviation
+     min: Optional[float] # Minimum value
+     max: Optional[float] # Maximum value
+     iqr_min: Optional[float] # IQR minimum
+     iqr_max: Optional[float] # IQR maximum
+     mode: Optional[str] # Mode value
+     approx_unique: Optional[int] # Approximate unique count
+     null_percentage: Optional[float] # Percentage of null values
+
+     # Feature importance
+     feature_importance_score: Optional[float] # Feature importance score
+ ```
+
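+ A brief sketch of scanning columns for data-quality issues using these fields (the 20% threshold is illustrative):
+
+ ```python
+ # result is the EngineResult returned by engine.run(..., wait=True) above
+ for col in result.columns:
+     if not col.enabled:
+         continue
+     if col.null_percentage is not None and col.null_percentage > 20:
+         print(f"{col.display_name}: {col.null_percentage:.1f}% null values")
+     if col.type == "continuous" and col.mean is not None and col.std is not None:
+         print(f"{col.display_name}: mean={col.mean:.3g}, std={col.std:.3g}")
+ ```
+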
+ ### FeatureImportance
+
+ ```python
+ @dataclass
+ class FeatureImportance:
+     kind: str # Feature importance type: "global"
+     baseline: float # Baseline model output
+     scores: List[FeatureImportanceScore] # List of feature scores
+ ```
+
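+ For instance, to list the top features sorted by score:
+
+ ```python
+ # result is the EngineResult returned by engine.run(..., wait=True) above
+ fi = result.feature_importance
+ if fi is not None:
+     print(f"Baseline model output: {fi.baseline}")
+     for entry in sorted(fi.scores, key=lambda s: s.score, reverse=True)[:10]:
+         print(f"{entry.feature}: {entry.score:.4f}")
+ ```
+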
+ ### CorrelationEntry
+
+ ```python
+ @dataclass
+ class CorrelationEntry:
+     feature_x: str # First feature name
+     feature_y: str # Second feature name
+     value: float # Correlation value (-1 to 1)
+ ```
+
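+ The correlation matrix is returned as a flat list of entries; a sketch of turning it into a pairwise lookup table and reporting strong correlations (the 0.8 threshold is illustrative):
+
+ ```python
+ # result is the EngineResult returned by engine.run(..., wait=True) above
+ correlations = {}
+ for entry in result.correlation_matrix:
+     correlations[(entry.feature_x, entry.feature_y)] = entry.value
+     correlations[(entry.feature_y, entry.feature_x)] = entry.value
+
+ # Report strongly correlated pairs (each entry once)
+ strong = [e for e in result.correlation_matrix if abs(e.value) > 0.8]
+ for entry in sorted(strong, key=lambda e: -abs(e.value)):
+     print(f"{entry.feature_x} ~ {entry.feature_y}: {entry.value:+.2f}")
+ ```
+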
discovery_engine_api-0.1.52.dist-info/RECORD ADDED
@@ -0,0 +1,6 @@
+ discovery/__init__.py,sha256=22eLF6JiAFVrSm8c7J7pgJ7TuHMpeC4aT381IDV7XeQ,586
+ discovery/client.py,sha256=4qRYvnf7oU0HUC5M15S8fKbQjyw4C_qQ7g4ExJEeRwo,32174
+ discovery/types.py,sha256=4Z3gKdxWnOpymEjBGCzAeUGjwRT2A0aCpmuwctbE4w0,6008
+ discovery_engine_api-0.1.52.dist-info/METADATA,sha256=BdW0EssYoebqi51M4x3-gVsmHcXwl2s_YvxP7cgu_1c,12675
+ discovery_engine_api-0.1.52.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ discovery_engine_api-0.1.52.dist-info/RECORD,,
discovery_engine_api-0.1.52.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.28.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any