discovery_engine_api-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
discovery/__init__.py ADDED
@@ -0,0 +1,34 @@
+ """Discovery Engine Python SDK."""
+
+ __version__ = "0.1.0"  # Updated to trigger TestPyPI publish
+
+ from discovery.client import Engine
+ from discovery.types import (
+     Column,
+     CorrelationEntry,
+     DataInsights,
+     EngineResult,
+     FeatureImportance,
+     FeatureImportanceScore,
+     FileInfo,
+     Pattern,
+     PatternGroup,
+     RunStatus,
+     Summary,
+ )
+
+ __all__ = [
+     "Engine",
+     "EngineResult",
+     "Column",
+     "CorrelationEntry",
+     "DataInsights",
+     "FeatureImportance",
+     "FeatureImportanceScore",
+     "FileInfo",
+     "Pattern",
+     "PatternGroup",
+     "RunStatus",
+     "Summary",
+     "__version__",
+ ]
discovery/client.py ADDED
@@ -0,0 +1,747 @@
+ """Discovery Engine Python SDK."""
+
+ import asyncio
+ import json
+ import os
+ import time
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Union
+
+ import httpx
+
+ try:
+     import pandas as pd
+ except ImportError:
+     pd = None
+
+ from discovery.types import (
+     Column,
+     CorrelationEntry,
+     DataInsights,
+     EngineResult,
+     FeatureImportance,
+     FeatureImportanceScore,
+     FileInfo,
+     Pattern,
+     PatternGroup,
+     RunStatus,
+     Summary,
+ )
+
+
+ class Engine:
+     """Engine for the Discovery Engine API."""
+
+     # Production dashboard URL (can be overridden via DISCOVERY_API_URL env var for testing)
+     # The SDK calls the dashboard API which handles all report creation and credit deduction
+     _DEFAULT_BASE_URL = "https://disco.leap-labs.com"
+
+     def __init__(self, api_key: str):
+         """
+         Initialize the Discovery Engine.
+
+         Args:
+             api_key: Your API key
+         """
+
+         print("Initializing Discovery Engine...")
+         self.api_key = api_key
+         # Use DISCOVERY_API_URL env var if set (for testing/custom deployments),
+         # otherwise use the production default
+         self.base_url = os.getenv("DISCOVERY_API_URL", self._DEFAULT_BASE_URL).rstrip("/")
+         self._organization_id: Optional[str] = None
+         self._client: Optional[httpx.AsyncClient] = None
+         self._org_fetched = False
+
+     async def _ensure_organization_id(self) -> str:
+         """
+         Ensure we have an organization ID, fetching from API if needed.
+
+         The organization ID is required for API requests to identify which
+         organization the user belongs to (multi-tenancy support).
+
+         Returns:
+             Organization ID string
+
+         Raises:
+             ValueError: If no organization is found or API request fails
+         """
+         if self._organization_id:
+             return self._organization_id
+
+         if not self._org_fetched:
+             # Fetch user's organizations and use the first one
+             try:
+                 orgs = await self.get_organizations()
+                 if orgs:
+                     self._organization_id = orgs[0]["id"]
+             except ValueError as e:
+                 # Re-raise with more context
+                 raise ValueError(
+                     f"Failed to fetch organization: {e}. "
+                     "Please ensure your API key is valid and you belong to an organization."
+                 ) from e
+             self._org_fetched = True
+
+         if not self._organization_id:
+             raise ValueError(
+                 "No organization found for your account. "
+                 "Please contact support if this issue persists."
+             )
+
+         return self._organization_id
+
+     async def _get_client(self) -> httpx.AsyncClient:
+         """Get or create the HTTP client."""
+         if self._client is None:
+             headers = {"Authorization": f"Bearer {self.api_key}"}
+             self._client = httpx.AsyncClient(
+                 base_url=self.base_url,
+                 headers=headers,
+                 timeout=60.0,
+             )
+         return self._client
+
+     async def _get_client_with_org(self) -> httpx.AsyncClient:
+         """Get HTTP client (no longer needs org header for dashboard API)."""
+         return await self._get_client()
+
+     async def close(self):
+         """Close the HTTP client."""
+         if self._client:
+             await self._client.aclose()
+             self._client = None
+
+     async def __aenter__(self):
+         """Async context manager entry."""
+         return self
+
+     async def __aexit__(self, exc_type, exc_val, exc_tb):
+         """Async context manager exit."""
+         await self.close()
+
+     async def get_organizations(self) -> List[Dict[str, Any]]:
+         """
+         Get the organizations you belong to.
+
+         Note: This is no longer needed for the simplified SDK workflow,
+         but kept for backwards compatibility.
+
+         Returns:
+             List of organizations with id, name, and slug
+
+         Raises:
+             ValueError: If the API request fails
+         """
+         # Organizations are handled automatically by the dashboard API
+         # Return empty list for now - not needed for report creation
+         return []
+
+     async def upload_file(
+         self, file: Union[str, Path, "pd.DataFrame"], filename: Optional[str] = None
+     ) -> FileInfo:
+         """
+         Upload a file to the API.
+
+         Args:
+             file: File path, Path object, or pandas DataFrame
+             filename: Optional filename (for DataFrame uploads)
+
+         Returns:
+             FileInfo with file_path, file_hash, file_size, mime_type
+         """
+         client = await self._get_client_with_org()
+
+         if pd is not None and isinstance(file, pd.DataFrame):
+             # Convert DataFrame to CSV in memory
+             import io
+
+             buffer = io.BytesIO()
+             file.to_csv(buffer, index=False)
+             buffer.seek(0)
+             file_content = buffer.getvalue()
+             filename = filename or "dataset.csv"
+             mime_type = "text/csv"
+         else:
+             # Read file from disk
+             file_path = Path(file)
+             if not file_path.exists():
+                 raise FileNotFoundError(f"File not found: {file_path}")
+             file_content = file_path.read_bytes()
+             filename = filename or file_path.name
+             mime_type = (
+                 "text/csv" if file_path.suffix == ".csv" else "application/vnd.apache.parquet"
+             )
+
+         # Upload file
+         files = {"file": (filename, file_content, mime_type)}
+         response = await client.post("/v1/upload", files=files)
+         response.raise_for_status()
+
+         data = response.json()
+         return FileInfo(
+             file_path=data["file_path"],
+             file_hash=data["file_hash"],
+             file_size=data["file_size"],
+             mime_type=data["mime_type"],
+         )
+
+     async def create_dataset(
+         self,
+         title: Optional[str] = None,
+         description: Optional[str] = None,
+         total_rows: int = 0,
+         dataset_size_mb: Optional[float] = None,
+         author: Optional[str] = None,
+         source_url: Optional[str] = None,
+     ) -> Dict[str, Any]:
+         """
+         Create a dataset record.
+
+         Args:
+             title: Dataset title
+             description: Dataset description
+             total_rows: Number of rows in the dataset
+             dataset_size_mb: Dataset size in MB
+             author: Optional author attribution
+             source_url: Optional source URL
+
+         Returns:
+             Dataset record with ID
+         """
+         client = await self._get_client_with_org()
+
+         response = await client.post(
+             "/v1/run-datasets",
+             json={
+                 "title": title,
+                 "description": description,
+                 "total_rows": total_rows,
+                 "dataset_size_mb": dataset_size_mb,
+                 "author": author,
+                 "source_url": source_url,
+             },
+         )
+         response.raise_for_status()
+         return response.json()
+
+     async def create_file_record(self, dataset_id: str, file_info: FileInfo) -> Dict[str, Any]:
+         """
+         Create a file record for a dataset.
+
+         Args:
+             dataset_id: Dataset ID
+             file_info: FileInfo from upload_file()
+
+         Returns:
+             File record with ID
+         """
+         client = await self._get_client_with_org()
+
+         response = await client.post(
+             f"/v1/run-datasets/{dataset_id}/files",
+             json={
+                 "mime_type": file_info.mime_type,
+                 "file_path": file_info.file_path,
+                 "file_hash": file_info.file_hash,
+                 "file_size": file_info.file_size,
+             },
+         )
+         response.raise_for_status()
+         return response.json()
+
+     async def create_columns(
+         self, dataset_id: str, columns: List[Dict[str, Any]]
+     ) -> List[Dict[str, Any]]:
+         """
+         Create column records for a dataset.
+
+         Args:
+             dataset_id: Dataset ID
+             columns: List of column definitions with full metadata
+
+         Returns:
+             List of column records with IDs
+         """
+         client = await self._get_client_with_org()
+
+         response = await client.post(
+             f"/v1/run-datasets/{dataset_id}/columns",
+             json=columns,
+         )
+         response.raise_for_status()
+         return response.json()
+
+     async def create_run(
+         self,
+         dataset_id: str,
+         target_column_id: str,
+         task: str = "regression",
+         mode: str = "fast",
+         visibility: str = "public",
+         timeseries_groups: Optional[List[Dict[str, Any]]] = None,
+         target_column_override: Optional[str] = None,
+         auto_report_use_llm_evals: bool = True,
+         author: Optional[str] = None,
+         source_url: Optional[str] = None,
+     ) -> Dict[str, Any]:
+         """
+         Create a run and enqueue it for processing.
+
+         Args:
+             dataset_id: Dataset ID
+             target_column_id: Target column ID
+             task: Task type (regression, binary_classification, multiclass_classification)
+             mode: Analysis mode ("fast" or "deep")
+             visibility: Dataset visibility ("public" or "private")
+             timeseries_groups: Optional list of timeseries column groups
+             target_column_override: Optional override for target column name
+             auto_report_use_llm_evals: Use LLM evaluations
+             author: Optional dataset author
+             source_url: Optional source URL
+
+         Returns:
+             Run record with ID and job information
+         """
+         client = await self._get_client_with_org()
+
+         payload = {
+             "run_target_column_id": target_column_id,
+             "task": task,
+             "mode": mode,
+             "visibility": visibility,
+             "auto_report_use_llm_evals": auto_report_use_llm_evals,
+         }
+
+         if timeseries_groups:
+             payload["timeseries_groups"] = timeseries_groups
+         if target_column_override:
+             payload["target_column_override"] = target_column_override
+         if author:
+             payload["author"] = author
+         if source_url:
+             payload["source_url"] = source_url
+
+         response = await client.post(
+             f"/v1/run-datasets/{dataset_id}/runs",
+             json=payload,
+         )
+         response.raise_for_status()
+         return response.json()
+
+     async def get_results(self, run_id: str) -> EngineResult:
+         """
+         Get complete analysis results for a run.
+
+         This returns all data that the Discovery dashboard displays:
+         - LLM-generated summary with key insights
+         - All discovered patterns with conditions, citations, and explanations
+         - Column/feature information with statistics and importance scores
+         - Correlation matrix
+         - Global feature importance
+
+         Args:
+             run_id: The run ID
+
+         Returns:
+             EngineResult with complete analysis data
+         """
+         client = await self._get_client()
+
+         # Call dashboard API for results
+         response = await client.get(f"/api/runs/{run_id}/results")
+         response.raise_for_status()
+
+         data = response.json()
+         return self._parse_analysis_result(data)
+
+     async def get_run_status(self, run_id: str) -> RunStatus:
+         """
+         Get the status of a run.
+
+         Args:
+             run_id: Run ID
+
+         Returns:
+             RunStatus with current status information
+         """
+         client = await self._get_client_with_org()
+
+         response = await client.get(f"/v1/runs/{run_id}/results")
+         response.raise_for_status()
+
+         data = response.json()
+         return RunStatus(
+             run_id=data["run_id"],
+             status=data["status"],
+             job_id=data.get("job_id"),
+             job_status=data.get("job_status"),
+             error_message=data.get("error_message"),
+         )
+
+     async def wait_for_completion(
+         self,
+         run_id: str,
+         poll_interval: float = 5.0,
+         timeout: Optional[float] = None,
+     ) -> EngineResult:
+         """
+         Wait for a run to complete and return the results.
+
+         Args:
+             run_id: Run ID
+             poll_interval: Seconds between status checks (default: 5)
+             timeout: Maximum seconds to wait (None = no timeout)
+
+         Returns:
+             EngineResult with complete analysis data
+
+         Raises:
+             TimeoutError: If the run doesn't complete within the timeout
+             RuntimeError: If the run fails
+         """
+         start_time = time.time()
+
+         while True:
+             result = await self.get_results(run_id)
+
+             if result.status == "completed":
+                 return result
+             elif result.status == "failed":
+                 raise RuntimeError(
+                     f"Run {run_id} failed: {result.error_message or 'Unknown error'}"
+                 )
+
+             if timeout and (time.time() - start_time) > timeout:
+                 raise TimeoutError(f"Run {run_id} did not complete within {timeout} seconds")
+
+             await asyncio.sleep(poll_interval)
+
+     async def run_async(
+         self,
+         file: Union[str, Path, "pd.DataFrame"],
+         target_column: str,
+         mode: str = "fast",
+         title: Optional[str] = None,
+         description: Optional[str] = None,
+         column_descriptions: Optional[Dict[str, str]] = None,
+         task: Optional[str] = None,
+         visibility: str = "public",
+         timeseries_groups: Optional[List[Dict[str, Any]]] = None,
+         target_column_override: Optional[str] = None,
+         auto_report_use_llm_evals: bool = True,
+         author: Optional[str] = None,
+         source_url: Optional[str] = None,
+         wait: bool = False,
+         wait_timeout: Optional[float] = None,
+         **kwargs,
+     ) -> EngineResult:
+         """
+         Run analysis on a dataset (async).
+
+         This method calls the dashboard API which handles the entire workflow:
+         file upload, dataset creation, column inference, run creation, and credit deduction.
+
+         Args:
+             file: File path, Path object, or pandas DataFrame
+             target_column: Name of the target column
+             mode: Analysis mode ("fast" or "deep", default: "fast")
+             title: Optional dataset title
+             description: Optional dataset description
+             column_descriptions: Optional dict mapping column names to descriptions
+             task: Task type (regression, binary_classification, multiclass_classification) - auto-detected if None
+             visibility: Dataset visibility ("public" or "private", default: "public")
+             timeseries_groups: Optional list of timeseries column groups
+             target_column_override: Optional override for target column name
+             auto_report_use_llm_evals: Use LLM evaluations (default: True)
+             author: Optional dataset author
+             source_url: Optional source URL
+             wait: If True, wait for analysis to complete and return full results
+             wait_timeout: Maximum seconds to wait for completion (only if wait=True)
+
+         Returns:
+             EngineResult with run_id and (if wait=True) complete results
+         """
+         client = await self._get_client()
+
+         # Prepare file for upload
+         if pd is not None and isinstance(file, pd.DataFrame):
+             # Convert DataFrame to CSV in memory
+             import io
+
+             buffer = io.BytesIO()
+             file.to_csv(buffer, index=False)
+             buffer.seek(0)
+             file_content = buffer.getvalue()
+             filename = (title + ".csv") if title else "dataset.csv"
+             mime_type = "text/csv"
+         else:
+             # Read file from disk
+             file_path = Path(file)
+             if not file_path.exists():
+                 raise FileNotFoundError(f"File not found: {file_path}")
+             file_content = file_path.read_bytes()
+             filename = file_path.name
+             mime_type = (
+                 "text/csv" if file_path.suffix == ".csv" else "application/vnd.apache.parquet"
+             )
+
+         # Prepare multipart form data
+         files = {"file": (filename, file_content, mime_type)}
+         data: Dict[str, Any] = {
+             "target_column": target_column,
+             "mode": mode,
+             "visibility": visibility,
+         }
+
+         if description:
+             data["description"] = description
+         if author:
+             data["author"] = author
+         if source_url:
+             data["source_url"] = source_url
+         if column_descriptions:
+             data["column_descriptions"] = json.dumps(column_descriptions)
+         if timeseries_groups:
+             data["timeseries_groups"] = json.dumps(timeseries_groups)
+
+         # Call dashboard API to create report
+         # httpx automatically handles multipart/form-data when both files and data are provided
+         response = await client.post("/api/reports/create", files=files, data=data)
+         response.raise_for_status()
+
+         result_data = response.json()
+
+         # Check if duplicate
+         if result_data.get("duplicate"):
+             # For duplicates, get the run_id and fetch results
+             report_id = result_data.get("report_id")
+             run_id = result_data.get("run_id")
+
+             if not report_id or not run_id:
+                 raise ValueError("Duplicate report found but missing report_id or run_id")
+
+             # If wait is True, fetch the full results for the existing report
+             if wait:
+                 return await self.get_results(run_id)
+
+             # Otherwise return a minimal result with the run_id
+             return EngineResult(
+                 run_id=run_id,
+                 status="completed",
+                 report_id=report_id,
+             )
+
+         run_id = result_data["run_id"]
+
+         if wait:
+             # Wait for completion and return full results
+             return await self.wait_for_completion(run_id, timeout=wait_timeout)
+
+         # Return minimal result with pending status
+         return EngineResult(
+             run_id=run_id,
+             status="pending",
+         )
+
+     def run(
+         self,
+         file: Union[str, Path, "pd.DataFrame"],
+         target_column: str,
+         mode: str = "fast",
+         title: Optional[str] = None,
+         description: Optional[str] = None,
+         column_descriptions: Optional[Dict[str, str]] = None,
+         task: Optional[str] = None,
+         visibility: str = "public",
+         timeseries_groups: Optional[List[Dict[str, Any]]] = None,
+         target_column_override: Optional[str] = None,
+         auto_report_use_llm_evals: bool = True,
+         author: Optional[str] = None,
+         source_url: Optional[str] = None,
+         wait: bool = False,
+         wait_timeout: Optional[float] = None,
+         **kwargs,
+     ) -> EngineResult:
+         """
+         Run analysis on a dataset (synchronous wrapper).
+
+         This is a synchronous wrapper around run_async().
+
+         Args:
+             file: File path, Path object, or pandas DataFrame
+             target_column: Name of the target column
+             mode: Analysis mode ("fast" or "deep", default: "fast")
+             title: Optional dataset title
+             description: Optional dataset description
+             column_descriptions: Optional dict mapping column names to descriptions
+             task: Task type (regression, binary_classification, multiclass_classification) - auto-detected if None
+             visibility: Dataset visibility ("public" or "private", default: "public")
+             timeseries_groups: Optional list of timeseries column groups
+             target_column_override: Optional override for target column name
+             auto_report_use_llm_evals: Use LLM evaluations (default: True)
+             author: Optional dataset author
+             source_url: Optional source URL
+             wait: If True, wait for analysis to complete and return full results
+             wait_timeout: Maximum seconds to wait for completion (only if wait=True)
+             **kwargs: Additional arguments passed to run_async()
+
+         Returns:
+             EngineResult with run_id and (if wait=True) complete results
+         """
+         return asyncio.run(
+             self.run_async(
+                 file,
+                 target_column,
+                 mode,
+                 title=title,
+                 description=description,
+                 column_descriptions=column_descriptions,
+                 task=task,
+                 visibility=visibility,
+                 timeseries_groups=timeseries_groups,
+                 target_column_override=target_column_override,
+                 auto_report_use_llm_evals=auto_report_use_llm_evals,
+                 author=author,
+                 source_url=source_url,
+                 wait=wait,
+                 wait_timeout=wait_timeout,
+                 **kwargs,
+             )
+         )
+
+     def _parse_analysis_result(self, data: Dict[str, Any]) -> EngineResult:
+         """Parse API response into EngineResult dataclass."""
+         # Parse summary
+         summary = None
+         if data.get("summary"):
+             summary = self._parse_summary(data["summary"])
+
+         # Parse patterns
+         patterns = []
+         for p in data.get("patterns", []):
+             patterns.append(
+                 Pattern(
+                     id=p["id"],
+                     task=p.get("task", "regression"),
+                     target_column=p.get("target_column", ""),
+                     direction=p.get("direction", "max"),
+                     p_value=p.get("p_value", 0),
+                     conditions=p.get("conditions", []),
+                     lift_value=p.get("lift_value", 0),
+                     support_count=p.get("support_count", 0),
+                     support_percentage=p.get("support_percentage", 0),
+                     pattern_type=p.get("pattern_type", "validated"),
+                     novelty_type=p.get("novelty_type", "confirmatory"),
+                     target_score=p.get("target_score", 0),
+                     target_class=p.get("target_class"),
+                     target_mean=p.get("target_mean"),
+                     target_std=p.get("target_std"),
+                     description=p.get("description", ""),
+                     novelty_explanation=p.get("novelty_explanation", ""),
+                     citations=p.get("citations", []),
+                 )
+             )
+
+         # Parse columns
+         columns = []
+         for c in data.get("columns", []):
+             columns.append(
+                 Column(
+                     id=c["id"],
+                     name=c["name"],
+                     display_name=c.get("display_name", c["name"]),
+                     type=c.get("type", "continuous"),
+                     data_type=c.get("data_type", "float"),
+                     enabled=c.get("enabled", True),
+                     description=c.get("description"),
+                     mean=c.get("mean"),
+                     median=c.get("median"),
+                     std=c.get("std"),
+                     min=c.get("min"),
+                     max=c.get("max"),
+                     iqr_min=c.get("iqr_min"),
+                     iqr_max=c.get("iqr_max"),
+                     mode=c.get("mode"),
+                     approx_unique=c.get("approx_unique"),
+                     null_percentage=c.get("null_percentage"),
+                     feature_importance_score=c.get("feature_importance_score"),
+                 )
+             )
+
+         # Parse correlation matrix
+         correlation_matrix = []
+         for entry in data.get("correlation_matrix", []):
+             correlation_matrix.append(
+                 CorrelationEntry(
+                     feature_x=entry["feature_x"],
+                     feature_y=entry["feature_y"],
+                     value=entry["value"],
+                 )
+             )
+
+         # Parse feature importance
+         feature_importance = None
+         if data.get("feature_importance"):
+             fi = data["feature_importance"]
+             scores = [
+                 FeatureImportanceScore(feature=s["feature"], score=s["score"])
+                 for s in fi.get("scores", [])
+             ]
+             feature_importance = FeatureImportance(
+                 kind=fi.get("kind", "global"),
+                 baseline=fi.get("baseline", 0),
+                 scores=scores,
+             )
+
+         return EngineResult(
+             run_id=data["run_id"],
+             report_id=data.get("report_id"),
+             status=data.get("status", "unknown"),
+             dataset_title=data.get("dataset_title"),
+             dataset_description=data.get("dataset_description"),
+             total_rows=data.get("total_rows"),
+             target_column=data.get("target_column"),
+             task=data.get("task"),
+             summary=summary,
+             patterns=patterns,
+             columns=columns,
+             correlation_matrix=correlation_matrix,
+             feature_importance=feature_importance,
+             job_id=data.get("job_id"),
+             job_status=data.get("job_status"),
+             error_message=data.get("error_message"),
+         )
+
+     def _parse_summary(self, data: Dict[str, Any]) -> Summary:
+         """Parse summary data into Summary dataclass."""
+         # Parse data insights
+         data_insights = None
+         if data.get("data_insights"):
+             di = data["data_insights"]
+             data_insights = DataInsights(
+                 important_features=di.get("important_features", []),
+                 important_features_explanation=di.get("important_features_explanation", ""),
+                 strong_correlations=di.get("strong_correlations", []),
+                 strong_correlations_explanation=di.get("strong_correlations_explanation", ""),
+                 notable_relationships=di.get("notable_relationships", []),
+             )
+
+         return Summary(
+             overview=data.get("overview", ""),
+             key_insights=data.get("key_insights", []),
+             novel_patterns=PatternGroup(
+                 pattern_ids=data.get("novel_patterns", {}).get("pattern_ids", []),
+                 explanation=data.get("novel_patterns", {}).get("explanation", ""),
+             ),
+             surprising_findings=PatternGroup(
+                 pattern_ids=data.get("surprising_findings", {}).get("pattern_ids", []),
+                 explanation=data.get("surprising_findings", {}).get("explanation", ""),
+             ),
+             statistically_significant=PatternGroup(
+                 pattern_ids=data.get("statistically_significant", {}).get("pattern_ids", []),
+                 explanation=data.get("statistically_significant", {}).get("explanation", ""),
+             ),
+             data_insights=data_insights,
+             selected_pattern_id=data.get("selected_pattern_id"),
+         )
discovery/types.py ADDED
@@ -0,0 +1,256 @@
+ """Type definitions for the Discovery SDK."""
+
+ from dataclasses import dataclass, field
+ from typing import Any, Dict, List, Literal, Optional, Union
+
+
+ @dataclass
+ class FileInfo:
+     """Information about an uploaded file."""
+
+     file_path: str  # GCS path
+     file_hash: str
+     file_size: int
+     mime_type: str
+
+
+ @dataclass
+ class TimeseriesGroup:
+     """Timeseries column group metadata."""
+
+     base_name: str
+     columns: List[str]
+     num_timesteps: int
+     pattern_matched: str
+     dtype: str  # "numeric" or "categorical"
+
+
+ # Pattern types
+
+
+ @dataclass
+ class PatternContinuousCondition:
+     """A continuous condition in a pattern."""
+
+     type: Literal["continuous"]
+     feature: str
+     min_value: float
+     max_value: float
+     min_q: Optional[float] = None
+     max_q: Optional[float] = None
+
+
+ @dataclass
+ class PatternCategoricalCondition:
+     """A categorical condition in a pattern."""
+
+     type: Literal["categorical"]
+     feature: str
+     values: List[Union[str, int, float, bool, None]]
+
+
+ @dataclass
+ class PatternDatetimeCondition:
+     """A datetime condition in a pattern."""
+
+     type: Literal["datetime"]
+     feature: str
+     min_value: float  # epoch milliseconds
+     max_value: float  # epoch milliseconds
+     min_datetime: str  # human-readable
+     max_datetime: str  # human-readable
+     min_q: Optional[float] = None
+     max_q: Optional[float] = None
+
+
+ PatternCondition = Union[
+     PatternContinuousCondition, PatternCategoricalCondition, PatternDatetimeCondition
+ ]
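+
+ # Note: on the wire, conditions arrive as plain dicts shaped like the dataclasses
+ # above (see Pattern.conditions below). Illustrative examples only; the feature
+ # names and values here are hypothetical:
+ #   {"type": "continuous", "feature": "age", "min_value": 40.0, "max_value": 65.0}
+ #   {"type": "categorical", "feature": "smoker", "values": ["yes"]}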
+
+
+ @dataclass
+ class PatternCitation:
+     """Academic citation for a pattern."""
+
+     url: str
+     title: Optional[str] = None
+     doi: Optional[str] = None
+     authors: Optional[List[str]] = None
+     year: Optional[str] = None
+     journal: Optional[str] = None
+     volume: Optional[str] = None
+     issue: Optional[str] = None
+     pages: Optional[str] = None
+
+
+ @dataclass
+ class Pattern:
+     """A discovered pattern in the data."""
+
+     id: str
+     task: str  # regression, binary_classification, multiclass_classification
+     target_column: str
+     direction: str  # "min" or "max"
+     p_value: float
+     conditions: List[Dict[str, Any]]  # PatternCondition as dicts
+     lift_value: float
+     support_count: int
+     support_percentage: float
+     pattern_type: str  # "validated" or "speculative"
+     novelty_type: str  # "novel" or "confirmatory"
+     target_score: float
+     description: str
+     novelty_explanation: str
+     target_class: Optional[str] = None
+     target_mean: Optional[float] = None
+     target_std: Optional[float] = None
+     citations: List[Dict[str, Any]] = field(default_factory=list)
+
+
+ # Column/Feature types
+
+
+ @dataclass
+ class Column:
+     """Information about a dataset column/feature."""
+
+     id: str
+     name: str
+     display_name: str
+     type: str  # "continuous" or "categorical"
+     data_type: str  # "int", "float", "string", "boolean", "datetime"
+     enabled: bool
+     description: Optional[str] = None
+
+     # Statistics
+     mean: Optional[float] = None
+     median: Optional[float] = None
+     std: Optional[float] = None
+     min: Optional[float] = None
+     max: Optional[float] = None
+     iqr_min: Optional[float] = None
+     iqr_max: Optional[float] = None
+     mode: Optional[str] = None
+     approx_unique: Optional[int] = None
+     null_percentage: Optional[float] = None
+
+     # Feature importance
+     feature_importance_score: Optional[float] = None
+
+
+ # Summary types (LLM-generated)
+
+
+ @dataclass
+ class DataInsights:
+     """LLM-generated data insights."""
+
+     important_features: List[str]
+     important_features_explanation: str
+     strong_correlations: List[Dict[str, str]]  # [{"feature1": "...", "feature2": "..."}]
+     strong_correlations_explanation: str
+     notable_relationships: List[str]
+
+
+ @dataclass
+ class PatternGroup:
+     """A group of patterns with explanation."""
+
+     pattern_ids: List[str]
+     explanation: str
+
+
+ @dataclass
+ class Summary:
+     """LLM-generated summary of the analysis."""
+
+     overview: str
+     key_insights: List[str]
+     novel_patterns: PatternGroup
+     surprising_findings: PatternGroup
+     statistically_significant: PatternGroup
+     data_insights: Optional[DataInsights]  # may be None when no insights are returned
+     selected_pattern_id: Optional[str] = None
+
+
+ # Feature importance types
+
+
+ @dataclass
+ class FeatureImportanceScore:
+     """A single feature importance score."""
+
+     feature: str
+     score: float
+
+
+ @dataclass
+ class FeatureImportance:
+     """Global feature importance information."""
+
+     kind: str  # "global" or "local"
+     baseline: float  # expected model output
+     scores: List[FeatureImportanceScore]
+
+
+ # Correlation matrix types
+
+
+ @dataclass
+ class CorrelationEntry:
+     """A single correlation matrix entry."""
+
+     feature_x: str
+     feature_y: str
+     value: float
+
+
+ # Main result type
+
+
+ @dataclass
+ class EngineResult:
+     """Complete result of an engine run."""
+
+     # Identifiers
+     run_id: str
+     report_id: Optional[str] = None
+     status: str = "pending"  # pending, processing, completed, failed
+
+     # Dataset metadata
+     dataset_title: Optional[str] = None
+     dataset_description: Optional[str] = None
+     total_rows: Optional[int] = None
+     target_column: Optional[str] = None
+     task: Optional[str] = None  # regression, binary_classification, multiclass_classification
+
+     # LLM-generated summary
+     summary: Optional[Summary] = None
+
+     # Discovered patterns
+     patterns: List[Pattern] = field(default_factory=list)
+
+     # Column/feature information with stats and importance
+     columns: List[Column] = field(default_factory=list)
+
+     # Correlation matrix
+     correlation_matrix: List[CorrelationEntry] = field(default_factory=list)
+
+     # Global feature importance
+     feature_importance: Optional[FeatureImportance] = None
+
+     # Job tracking
+     job_id: Optional[str] = None
+     job_status: Optional[str] = None
+     error_message: Optional[str] = None
+
+
+ @dataclass
+ class RunStatus:
+     """Status of a run."""
+
+     run_id: str
+     status: str
+     job_id: Optional[str] = None
+     job_status: Optional[str] = None
+     error_message: Optional[str] = None
discovery_engine_api-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,318 @@
+ Metadata-Version: 2.4
+ Name: discovery-engine-api
+ Version: 0.1.0
+ Summary: Python SDK for the Discovery Engine API
+ Project-URL: Homepage, https://github.com/leap-laboratories/discovery
+ Project-URL: Documentation, https://github.com/leap-laboratories/discovery
+ Project-URL: Repository, https://github.com/leap-laboratories/discovery
+ Author: Leap Laboratories
+ License: MIT
+ Keywords: api,data-analysis,discovery,machine-learning,sdk
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Requires-Python: >=3.10
+ Requires-Dist: httpx>=0.24.0
+ Requires-Dist: pydantic>=2.0.0
+ Provides-Extra: dev
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
+ Provides-Extra: pandas
+ Requires-Dist: pandas>=2.0.0; extra == 'pandas'
+ Description-Content-Type: text/markdown
+
+ # Discovery Engine Python API
+
+ The Discovery Engine Python API provides a simple programmatic interface for running analyses from Python, as an alternative to the web dashboard. Instead of uploading datasets and configuring analyses through the UI, you can automate your discovery workflows directly from your own code or scripts.
+
+ All analyses run through the API are fully integrated with your Discovery Engine account. Results are automatically displayed in the dashboard, where you can view detailed reports, explore patterns, and share findings with your team. Account management, credit balance, and subscription settings are all handled through the dashboard; the API is simply a programmatic interface to the same discovery engine.
+
+ ## Installation
+
+ ```bash
+ pip install discovery-engine-api
+ ```
+
+ For pandas DataFrame support:
+
+ ```bash
+ pip install discovery-engine-api[pandas]
+ ```
+
+
+ ## Quick Start
+
+ ```python
+ from discovery import Engine
+
+ # Initialize the engine
+ engine = Engine(api_key="your-api-key")
+
+ # Run analysis on a dataset and wait for results
+ result = engine.run(
+     file="data.csv",
+     target_column="diagnosis",
+     mode="fast",
+     description="Rare diseases dataset",
+     wait=True,  # Wait for completion and return full results
+ )
+
+ print(f"Run ID: {result.run_id}")
+ print(f"Status: {result.status}")
+ print(f"Found {len(result.patterns)} patterns")
+ ```
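+
+ If you start a run with `wait=False`, you can check on it later with `get_run_status()`. A minimal sketch (the run ID below is a placeholder; `get_run_status` is async, so it is wrapped in a coroutine):
+
+ ```python
+ import asyncio
+
+ from discovery import Engine
+
+ async def check_status(run_id: str) -> None:
+     async with Engine(api_key="your-api-key") as engine:
+         status = await engine.get_run_status(run_id)
+         # status.status is e.g. "pending", "processing", "completed", or "failed"
+         print(status.status, status.error_message)
+
+ # run_id comes from an earlier engine.run(..., wait=False) call
+ asyncio.run(check_status("your-run-id"))
+ ```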
+
+
+ ## Examples
+
+ ### Working with Pandas DataFrames
+
+ ```python
+ import pandas as pd
+ from discovery import Engine
+
+ df = pd.read_csv("data.csv")
+ # or create a DataFrame directly
+
+ engine = Engine(api_key="your-api-key")
+ result = engine.run(
+     file=df,  # Pass the DataFrame directly
+     target_column="outcome",
+     column_descriptions={
+         "age": "Patient age in years",
+         "heart rate": "Resting heart rate in beats per minute",
+     },
+     wait=True,
+ )
+ ```
+
+
+ ### Async Workflow
+
+ ```python
+ import asyncio
+ from discovery import Engine
+
+ async def run_analysis():
+     async with Engine(api_key="your-api-key") as engine:
+         # Start analysis without waiting
+         result = await engine.run_async(
+             file="data.csv",
+             target_column="target",
+             wait=False,
+         )
+         print(f"Started run: {result.run_id}")
+
+         # Later, get results
+         result = await engine.get_results(result.run_id)
+
+         # Or wait for completion
+         result = await engine.wait_for_completion(result.run_id, timeout=600)
+         return result
+
+ result = asyncio.run(run_analysis())
+ ```
+
+
+ ## Configuration Options
+
+ The `run()` and `run_async()` methods accept the following parameters:
+
+ | Parameter | Type | Default | Description |
+ |-----------|------|---------|-------------|
+ | `file` | `str`, `Path`, or `DataFrame` | **Required** | Dataset file path or pandas DataFrame |
+ | `target_column` | `str` | **Required** | Name of the column to predict |
+ | `mode` | `"fast"` / `"deep"` | `"fast"` | Analysis depth |
+ | `title` | `str` | `None` | Optional dataset title |
+ | `description` | `str` | `None` | Optional dataset description |
+ | `column_descriptions` | `Dict[str, str]` | `None` | Optional column name -> description mapping |
+ | `task` | `str` | `None` | Override the auto-detected task type: `"regression"`, `"binary_classification"`, or `"multiclass_classification"` |
+ | `visibility` | `"public"` / `"private"` | `"public"` | Dataset visibility (private requires credits) |
+ | `timeseries_groups` | `List[Dict]` | `None` | Timeseries column groups for feature extraction (see the sketch below) |
+ | `target_column_override` | `str` | `None` | Optional override for the target column name |
+ | `auto_report_use_llm_evals` | `bool` | `True` | Use LLM for pattern descriptions |
+ | `author` | `str` | `None` | Optional dataset author attribution |
+ | `source_url` | `str` | `None` | Optional source URL for the dataset |
+ | `wait` | `bool` | `False` | Wait for analysis to complete and return full results |
+ | `wait_timeout` | `float` | `None` | Maximum seconds to wait for completion (only if `wait=True`) |
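+
+ Most parameters are scalars; `timeseries_groups` is the exception. The `TimeseriesGroup` type in `discovery/types.py` suggests the expected shape of each group; the sketch below follows that shape with hypothetical column names and values:
+
+ ```python
+ # Each group describes a set of columns that together form one time series.
+ # Field names follow discovery.types.TimeseriesGroup; values are illustrative.
+ groups = [
+     {
+         "base_name": "glucose",
+         "columns": ["glucose_t0", "glucose_t1", "glucose_t2"],
+         "num_timesteps": 3,
+         "pattern_matched": "glucose_t{n}",
+         "dtype": "numeric",  # "numeric" or "categorical"
+     }
+ ]
+
+ result = engine.run(
+     file="data.csv",
+     target_column="outcome",
+     timeseries_groups=groups,  # JSON-encoded and sent with the request
+     wait=True,
+ )
+ ```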
+
+
+ ## Credits and Pricing
+
+ - **Public datasets**: Free (0 credits required)
+ - **Private datasets**:
+   - Fast mode: 1 credit per MB
+   - Deep mode: 3 credits per MB
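+
+ For example, a 20 MB private dataset costs 20 credits in fast mode (1 × 20) and 60 credits in deep mode (3 × 20); the same dataset analyzed as public costs nothing.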
+
+ If you don't have enough credits for a private run, the SDK raises an `httpx.HTTPStatusError` with an error message like:
+
+ ```
+ Insufficient credits. You need X credits but only have Y available.
+ ```
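+
+ Because this surfaces as an `httpx.HTTPStatusError` (raised by the SDK's internal `raise_for_status()` call), you can catch it and inspect the response. A minimal sketch:
+
+ ```python
+ import httpx
+
+ from discovery import Engine
+
+ engine = Engine(api_key="your-api-key")
+
+ try:
+     result = engine.run(
+         file="data.csv",
+         target_column="outcome",
+         visibility="private",
+         wait=True,
+     )
+ except httpx.HTTPStatusError as exc:
+     # The response body carries the error details, e.g. the credit message above
+     print(f"Request failed ({exc.response.status_code}): {exc.response.text}")
+ ```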
+
+ **Solutions:**
+ 1. Make your dataset public (set `visibility="public"`) - completely free
+ 2. Visit [https://disco.leap-labs.com/account](https://disco.leap-labs.com/account) to:
+    - Purchase additional credits
+    - Upgrade to a subscription plan that includes more credits
+
+
+ ## Return Value
+
+ The `run()` and `run_async()` methods return an `EngineResult` object with the following fields:
+
+ ### EngineResult
+
+ ```python
+ @dataclass
+ class EngineResult:
+     # Identifiers
+     run_id: str  # Unique run identifier
+     report_id: Optional[str]  # Report ID (if a report was created)
+     status: str  # "pending", "processing", "completed", "failed"
+
+     # Dataset metadata
+     dataset_title: Optional[str]  # Dataset title
+     dataset_description: Optional[str]  # Dataset description
+     total_rows: Optional[int]  # Number of rows in the dataset
+     target_column: Optional[str]  # Name of the target column
+     task: Optional[str]  # "regression", "binary_classification", or "multiclass_classification"
+
+     # LLM-generated summary
+     summary: Optional[Summary]  # Summary object with overview, insights, etc.
+
+     # Discovered patterns
+     patterns: List[Pattern]  # List of discovered patterns
+
+     # Column/feature information
+     columns: List[Column]  # List of columns with statistics and importance
+
+     # Correlation matrix
+     correlation_matrix: List[CorrelationEntry]  # Feature correlations
+
+     # Global feature importance
+     feature_importance: Optional[FeatureImportance]  # Feature importance scores
+
+     # Job tracking
+     job_id: Optional[str]  # Job ID for tracking processing
+     job_status: Optional[str]  # Job status
+     error_message: Optional[str]  # Error message if the analysis failed
+ ```
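+
+ In practice you usually read only a handful of these fields. A short sketch of walking a completed result (assuming `result` came from `run(..., wait=True)`):
+
+ ```python
+ print(f"{result.dataset_title}: {result.total_rows} rows, task={result.task}")
+
+ if result.summary:
+     print(result.summary.overview)
+     for insight in result.summary.key_insights:
+         print(f"- {insight}")
+
+ # Show the strongest patterns first (smallest p-values)
+ for pattern in sorted(result.patterns, key=lambda p: p.p_value)[:5]:
+     print(f"{pattern.description} (p={pattern.p_value:.3g}, lift={pattern.lift_value:.2f})")
+ ```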
+
+ ### Summary
+
+ ```python
+ @dataclass
+ class Summary:
+     overview: str  # High-level explanation of findings
+     key_insights: List[str]  # List of main takeaways
+     novel_patterns: PatternGroup  # Novel pattern explanations
+     surprising_findings: PatternGroup  # Surprising findings
+     statistically_significant: PatternGroup  # Statistically significant patterns
+     data_insights: Optional[DataInsights]  # Important features, correlations
+     selected_pattern_id: Optional[str]  # ID of the selected pattern
+ ```
+
+ ### Pattern
+
+ ```python
+ @dataclass
+ class Pattern:
+     id: str  # Pattern identifier
+     task: str  # Task type
+     target_column: str  # Target column name
+     direction: str  # "min" or "max"
+     p_value: float  # Statistical p-value
+     conditions: List[Dict]  # Pattern conditions (continuous, categorical, datetime)
+     lift_value: float  # Lift value (how much the pattern increases/decreases the target)
+     support_count: int  # Number of rows matching the pattern
+     support_percentage: float  # Percentage of rows matching the pattern
+     pattern_type: str  # "validated" or "speculative"
+     novelty_type: str  # "novel" or "confirmatory"
+     target_score: float  # Target score for this pattern
+     description: str  # Human-readable description
+     novelty_explanation: str  # Explanation of novelty
+     target_class: Optional[str]  # Target class (for classification)
+     target_mean: Optional[float]  # Target mean (for regression)
+     target_std: Optional[float]  # Target standard deviation
+     citations: List[Dict]  # Academic citations
+ ```
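+
+ Each entry in `conditions` is a plain dict tagged by a `"type"` key, mirroring the condition dataclasses in `discovery/types.py`. A sketch of filtering patterns and unpacking their conditions:
+
+ ```python
+ validated = [p for p in result.patterns if p.pattern_type == "validated"]
+
+ for pattern in validated:
+     for cond in pattern.conditions:
+         if cond["type"] == "continuous":
+             print(f"{cond['feature']} in [{cond['min_value']}, {cond['max_value']}]")
+         elif cond["type"] == "categorical":
+             print(f"{cond['feature']} in {cond['values']}")
+ ```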
+
+ ### Column
+
+ ```python
+ @dataclass
+ class Column:
+     id: str  # Column identifier
+     name: str  # Column name
+     display_name: str  # Display name
+     type: str  # "continuous" or "categorical"
+     data_type: str  # "int", "float", "string", "boolean", "datetime"
+     enabled: bool  # Whether the column is enabled
+     description: Optional[str]  # Column description
+
+     # Statistics
+     mean: Optional[float]  # Mean value
+     median: Optional[float]  # Median value
+     std: Optional[float]  # Standard deviation
+     min: Optional[float]  # Minimum value
+     max: Optional[float]  # Maximum value
+     iqr_min: Optional[float]  # IQR minimum
+     iqr_max: Optional[float]  # IQR maximum
+     mode: Optional[str]  # Mode value
+     approx_unique: Optional[int]  # Approximate unique count
+     null_percentage: Optional[float]  # Percentage of null values
+
+     # Feature importance
+     feature_importance_score: Optional[float]  # Feature importance score
+ ```
+
+ ### FeatureImportance
+
+ ```python
+ @dataclass
+ class FeatureImportance:
+     kind: str  # Feature importance type: "global"
+     baseline: float  # Baseline model output
+     scores: List[FeatureImportanceScore]  # List of feature scores
+ ```
+
+ ### CorrelationEntry
+
+ ```python
+ @dataclass
+ class CorrelationEntry:
+     feature_x: str  # First feature name
+     feature_y: str  # Second feature name
+     value: float  # Correlation value (-1 to 1)
+ ```
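+
+ The matrix arrives as a flat list of pairwise entries. To pull out the strong correlations (the 0.7 cutoff is an arbitrary choice for this sketch):
+
+ ```python
+ strong = [
+     c for c in result.correlation_matrix
+     if c.feature_x != c.feature_y and abs(c.value) >= 0.7
+ ]
+ for c in sorted(strong, key=lambda c: -abs(c.value)):
+     print(f"{c.feature_x} ~ {c.feature_y}: {c.value:+.2f}")
+ ```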
+
discovery_engine_api-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,6 @@
+ discovery/__init__.py,sha256=3A09KLiF4vW1oQFpRgv2iuyfmLXar9LbbNNY4i1DBZ8,624
+ discovery/client.py,sha256=oZz4eTP3K8jKfYmDkBzvr_7MTKZCtnA2b3DlUmW-ObI,26857
+ discovery/types.py,sha256=4Z3gKdxWnOpymEjBGCzAeUGjwRT2A0aCpmuwctbE4w0,6008
+ discovery_engine_api-0.1.0.dist-info/METADATA,sha256=C_0ZJlvrIZGRNuVfPJznVjU-ywDAfEJ5eT4O42Scats,11739
+ discovery_engine_api-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+ discovery_engine_api-0.1.0.dist-info/RECORD,,
discovery_engine_api-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.28.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any