discovery-engine-api 0.1.34__tar.gz → 0.1.50__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: discovery-engine-api
-Version: 0.1.34
+Version: 0.1.50
 Summary: Python SDK for the Discovery Engine API
 Project-URL: Homepage, https://github.com/leap-laboratories/discovery
 Project-URL: Documentation, https://github.com/leap-laboratories/discovery
@@ -1,6 +1,6 @@
 """Discovery Engine Python SDK."""
 
-__version__ = "0.1.34"
+__version__ = "0.1.50"
 
 from discovery.client import Engine
 from discovery.types import (
@@ -32,9 +32,9 @@ from discovery.types import (
 class Engine:
     """Engine for the Discovery Engine API."""
 
-    # Production dashboard URL (can be overridden via DISCOVERY_API_URL env var for testing)
-    # The SDK calls the dashboard API which handles all report creation and credit deduction
-    _DEFAULT_BASE_URL = "https://disco.leap-labs.com"
+    # Production API URL (can be overridden via DISCOVERY_API_URL env var for testing)
+    # This points to the Modal-deployed FastAPI API
+    _DEFAULT_BASE_URL = "https://leap-labs-production--discovery-api.modal.run"
 
     def __init__(self, api_key: str):
         """
@@ -296,7 +296,7 @@ class Engine:
         dataset_id: str,
         target_column_id: str,
         task: str = "regression",
-        mode: str = "fast",
+        depth_iterations: int = 1,
         visibility: str = "public",
         timeseries_groups: Optional[List[Dict[str, Any]]] = None,
         target_column_override: Optional[str] = None,
@@ -311,7 +311,7 @@ class Engine:
             dataset_id: Dataset ID
             target_column_id: Target column ID
             task: Task type (regression, binary_classification, multiclass_classification)
-            mode: Analysis mode ("fast" or "deep")
+            depth_iterations: Number of iterative feature removal cycles (1 = fastest)
             visibility: Dataset visibility ("public" or "private")
             timeseries_groups: Optional list of timeseries column groups
             target_column_override: Optional override for target column name
@@ -327,7 +327,7 @@ class Engine:
         payload = {
             "run_target_column_id": target_column_id,
             "task": task,
-            "mode": mode,
+            "depth_iterations": depth_iterations,
             "visibility": visibility,
             "auto_report_use_llm_evals": auto_report_use_llm_evals,
         }
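This is the breaking change for create_run callers: the string mode argument is gone, and an integer depth_iterations is forwarded verbatim in the JSON payload. A hedged call sketch (the IDs are placeholders; the signature and keyword are taken from this diff):

    # Placeholder IDs for illustration only.
    run = await engine.create_run(
        dataset_id="ds_123",
        target_column_id="col_456",
        task="regression",
        depth_iterations=1,  # 1 = fastest, per the updated docstring
    )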
@@ -458,7 +458,7 @@ class Engine:
         self,
         file: Union[str, Path, "pd.DataFrame"],
         target_column: str,
-        mode: str = "fast",
+        depth_iterations: int = 1,
         title: Optional[str] = None,
         description: Optional[str] = None,
         column_descriptions: Optional[Dict[str, str]] = None,
@@ -483,7 +483,7 @@ class Engine:
         Args:
             file: File path, Path object, or pandas DataFrame
             target_column: Name of the target column
-            mode: Analysis mode ("fast" or "deep", default: "fast")
+            depth_iterations: Number of iterative feature removal cycles (1 = fastest)
             title: Optional dataset title
             description: Optional dataset description
             column_descriptions: Optional dict mapping column names to descriptions
@@ -535,7 +535,7 @@ class Engine:
         files = {"file": (filename, file_content, mime_type)}
         data: Dict[str, Any] = {
             "target_column": target_column,
-            "mode": mode,
+            "depth_iterations": str(depth_iterations),
             "visibility": visibility,
         }
 
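Unlike the JSON payload in create_run, this request is multipart/form-data, where every form field travels as text, hence the str(depth_iterations) conversion (the server-side parsing back to int is not in this diff). A minimal sketch of the same httpx pattern, with a placeholder URL:

    import httpx

    # When both `files` and `data` are passed, httpx encodes the request as
    # multipart/form-data; all `data` values must be strings.
    files = {"file": ("data.csv", b"age,price\n25,150000\n", "text/csv")}
    data = {"target_column": "price", "depth_iterations": str(1), "visibility": "public"}
    response = httpx.post("https://example.invalid/api/reports/create", files=files, data=data)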
@@ -553,7 +553,9 @@ class Engine:
             data["timeseries_groups"] = json.dumps(timeseries_groups)
 
         # Call dashboard API to create report
-        print(f"🚀 Uploading file and creating run (mode: {mode}, target: {target_column})...")
+        print(
+            f"🚀 Uploading file and creating run (depth: {depth_iterations}, target: {target_column})..."
+        )
         # httpx automatically handles multipart/form-data when both files and data are provided
         response = await client.post("/api/reports/create", files=files, data=data)
         response.raise_for_status()
@@ -607,7 +609,7 @@ class Engine:
         self,
         file: Union[str, Path, "pd.DataFrame"],
         target_column: str,
-        mode: str = "fast",
+        depth_iterations: int = 1,
         title: Optional[str] = None,
         description: Optional[str] = None,
         column_descriptions: Optional[Dict[str, str]] = None,
@@ -631,7 +633,7 @@ class Engine:
         Args:
             file: File path, Path object, or pandas DataFrame
             target_column: Name of the target column
-            mode: Analysis mode ("fast" or "deep", default: "fast")
+            depth_iterations: Number of iterative feature removal cycles (1 = fastest)
             title: Optional dataset title
             description: Optional dataset description
             column_descriptions: Optional dict mapping column names to descriptions
@@ -653,7 +655,7 @@ class Engine:
         coro = self.run_async(
             file,
             target_column,
-            mode,
+            depth_iterations,
             title=title,
             description=description,
             column_descriptions=column_descriptions,
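The synchronous run wrapper forwards depth_iterations positionally to run_async, so for SDK users the migration is a one-keyword swap. A sketch (the file name and API key are placeholders):

    from discovery import Engine

    engine = Engine(api_key="disco_...")  # placeholder key

    # 0.1.34: engine.run("houses.csv", target_column="price", mode="fast")
    # 0.1.50:
    result = engine.run("houses.csv", target_column="price", depth_iterations=1)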
@@ -1,6 +1,6 @@
 [project]
 name = "discovery-engine-api"
-version = "0.1.34"
+version = "0.1.50"
 description = "Python SDK for the Discovery Engine API"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -0,0 +1,53 @@
+"""
+Pytest configuration for Discovery Engine SDK client tests.
+
+This module provides session-level checks and warnings for API key configuration.
+"""
+
+import os
+
+
+def pytest_configure(config):
+    """
+    Print API key status at the start of the test session.
+
+    This runs once before any tests and provides visibility into whether
+    E2E tests will be executed or skipped.
+    """
+    api_key = os.getenv("DISCOVERY_API_KEY")
+    environment = os.getenv("ENVIRONMENT", "staging")
+
+    separator = "=" * 70
+    print(f"\n{separator}")
+    print("Discovery Engine SDK - E2E Test Configuration")
+    print(separator)
+
+    if not api_key:
+        print("⚠️ WARNING: DISCOVERY_API_KEY is NOT SET!")
+        print("")
+        print(" All E2E tests that require the real API will be SKIPPED.")
+        print("")
+        print(" To enable E2E tests:")
+        print(" • In CI: Add DISCOVERY_API_KEY to GitHub Secrets")
+        print(" • Locally: export DISCOVERY_API_KEY='disco_...'")
+        print("")
+    elif not api_key.startswith("disco_"):
+        print("⚠️ WARNING: DISCOVERY_API_KEY format appears INVALID!")
+        print(f" Key starts with: '{api_key[:10]}...'")
+        print(" Expected format: disco_<token>")
+        print("")
+        print(" E2E tests may fail with authentication errors.")
+        print("")
+    else:
+        print("✅ DISCOVERY_API_KEY is configured (format looks valid)")
+        print(f"✅ Environment: {environment}")
+        api_url = (
+            "https://leap-labs-production--discovery-api.modal.run"
+            if environment == "production"
+            else "https://leap-labs-staging--discovery-api.modal.run"
+        )
+        print(f"✅ API URL: {api_url}")
+        print("")
+
+    print(separator)
+    print("")
@@ -463,7 +463,7 @@ class TestCreateRun:
             dataset_id=dataset_id,
             target_column_id=target_column_id,
             task="regression",
-            mode="fast",
+            depth_iterations=1,
         )
 
         assert result == sample_run
@@ -472,7 +472,7 @@ class TestCreateRun:
         payload = call_args[1]["json"]
         assert payload["run_target_column_id"] == target_column_id
         assert payload["task"] == "regression"
-        assert payload["mode"] == "fast"
+        assert payload["depth_iterations"] == 1
 
     @pytest.mark.asyncio
     async def test_create_run_with_optional_params(self, client, mock_httpx_client, sample_run):
@@ -492,7 +492,7 @@ class TestCreateRun:
             dataset_id=dataset_id,
             target_column_id=target_column_id,
             task="regression",
-            mode="fast",
+            depth_iterations=1,
             timeseries_groups=timeseries_groups,
             target_column_override="price_override",
             author="Test Author",
@@ -680,7 +680,7 @@ class TestRunAsync:
         result = await client.run_async(
             file=df,
             target_column="price",
-            mode="fast",
+            depth_iterations=1,
         )
 
         assert isinstance(result, EngineResult)
@@ -795,7 +795,7 @@ class TestRun:
         result = client.run(
             file=df,
             target_column="price",
-            mode="fast",
+            depth_iterations=1,
         )
 
         assert isinstance(result, EngineResult)
@@ -0,0 +1,343 @@
+"""
+End-to-end tests for the Discovery Engine Python SDK.
+
+These tests call the real API and exercise the full flow including Modal.
+They are skipped if API credentials are not available.
+
+To run these tests locally:
+    # Set required environment variables
+    export DISCOVERY_API_KEY="your-api-key"
+
+    # Optional: Set environment (defaults to staging)
+    export ENVIRONMENT="staging"  # or "production"
+
+    # Run e2e tests
+    pytest engine/packages/client/tests/test_client_e2e.py -v
+
+    # Or run all tests except e2e
+    pytest -m "not e2e"
+
+To run in CI (GitHub Actions):
+    Set these secrets in GitHub:
+    - DISCOVERY_API_KEY: Your API key
+    - ENVIRONMENT: "staging" or "production" (optional, defaults to staging)
+
+    The tests will:
+    - Auto-detect environment from ENVIRONMENT or VERCEL_ENV
+    - Use staging URL (https://leap-labs-staging--discovery-api.modal.run) by default
+    - Use production URL (https://leap-labs-production--discovery-api.modal.run) if ENVIRONMENT=production
+    - Skip gracefully if DISCOVERY_API_KEY is not set
+"""
+
+import io
+import os
+import sys
+
+import pandas as pd
+import pytest
+
+# Test data - simple regression dataset
+TEST_DATA_CSV = """age,income,experience,price
+25,50000,2,150000
+30,60000,5,180000
+35,70000,8,220000
+40,80000,12,250000
+45,90000,15,280000
+28,55000,3,160000
+32,65000,6,190000
+38,75000,10,230000
+42,85000,13,260000
+48,95000,18,300000
+"""
+
+
+# Hardcoded API URLs (Modal-hosted FastAPI backend)
+STAGING_API_URL = "https://leap-labs-staging--discovery-api.modal.run"
+PRODUCTION_API_URL = "https://leap-labs-production--discovery-api.modal.run"
+
+
+def get_api_key() -> str | None:
+    """Get API key from environment variable."""
+    return os.getenv("DISCOVERY_API_KEY")
+
+
+def validate_api_key_format(api_key: str) -> tuple[bool, str]:
+    """
+    Validate that the API key has the expected format.
+
+    Returns:
+        Tuple of (is_valid, error_message)
+    """
+    if not api_key:
+        return False, "API key is empty"
+
+    if not api_key.startswith("disco_"):
+        return False, f"API key should start with 'disco_', got: '{api_key[:10]}...'"
+
+    # Expected format: disco_<base64-like-string>
+    # Should be at least 20 characters total
+    if len(api_key) < 20:
+        return False, f"API key appears too short ({len(api_key)} chars)"
+
+    return True, ""
+
+
+def print_api_key_warning(message: str) -> None:
+    """Print a loud warning about API key issues."""
+    separator = "=" * 70
+    print(f"\n{separator}", file=sys.stderr)
+    print("⚠️ WARNING: DISCOVERY_API_KEY ISSUE", file=sys.stderr)
+    print(separator, file=sys.stderr)
+    print(f" {message}", file=sys.stderr)
+    print("", file=sys.stderr)
+    print(" E2E tests against the real API will be SKIPPED.", file=sys.stderr)
+    print("", file=sys.stderr)
+    print(" To fix this:", file=sys.stderr)
+    print(" 1. Get a valid API key from the Discovery dashboard", file=sys.stderr)
+    print(" 2. Set it in GitHub: Settings → Secrets → DISCOVERY_API_KEY", file=sys.stderr)
+    print(" 3. Or locally: export DISCOVERY_API_KEY='disco_...'", file=sys.stderr)
+    print(f"{separator}\n", file=sys.stderr)
+
+
+def get_environment() -> str:
+    """
+    Determine the current environment (staging or production).
+
+    Checks environment variables in order:
+    1. ENVIRONMENT (set in CI/GitHub Actions)
+    2. VERCEL_ENV (set in Vercel deployments)
+    3. Defaults to staging
+    """
+    env = os.getenv("ENVIRONMENT") or os.getenv("VERCEL_ENV")
+    if env == "production":
+        return "production"
+    return "staging"
+
+
+def get_api_url() -> str:
+    """
+    Get API URL based on environment.
+
+    Returns:
+        - Production URL if environment is production
+        - Staging URL otherwise (default)
+    """
+    env = get_environment()
+    if env == "production":
+        return PRODUCTION_API_URL
+    return STAGING_API_URL
+
+
+@pytest.fixture
+def api_key():
+    """Get API key from environment, skip test if not available or invalid."""
+    key = get_api_key()
+
+    if not key:
+        print_api_key_warning("DISCOVERY_API_KEY environment variable is NOT SET")
+        pytest.skip("DISCOVERY_API_KEY environment variable not set")
+
+    is_valid, error_message = validate_api_key_format(key)
+    if not is_valid:
+        print_api_key_warning(f"DISCOVERY_API_KEY format is INVALID: {error_message}")
+        pytest.skip(f"DISCOVERY_API_KEY format invalid: {error_message}")
+
+    return key
+
+
+@pytest.fixture
+def api_url():
+    """Get API URL from environment (optional)."""
+    return get_api_url()
+
+
+@pytest.fixture
+def test_dataframe():
+    """Create test DataFrame from CSV string."""
+    try:
+        return pd.read_csv(io.StringIO(TEST_DATA_CSV))
+    except ImportError:
+        pytest.skip("pandas not available")
+
+
+@pytest.fixture
+def engine(api_key, api_url):
+    """Create Engine instance with API key and optional URL."""
+    from discovery import Engine
+
+    engine = Engine(api_key=api_key)
+    if api_url:
+        engine.base_url = api_url.rstrip("/")
+    return engine
+
+
+def print_auth_error_warning(error: Exception, api_url: str) -> None:
+    """Print a loud warning about authentication failures."""
+    separator = "=" * 70
+    print(f"\n{separator}", file=sys.stderr)
+    print("❌ ERROR: API AUTHENTICATION FAILED", file=sys.stderr)
+    print(separator, file=sys.stderr)
+    print(f" API URL: {api_url}", file=sys.stderr)
+    print(f" Error: {error}", file=sys.stderr)
+    print("", file=sys.stderr)
+    print(" This usually means:", file=sys.stderr)
+    print(" • The DISCOVERY_API_KEY is invalid or expired", file=sys.stderr)
+    print(" • The key doesn't have permission for this environment", file=sys.stderr)
+    print("", file=sys.stderr)
+    print(" To fix this:", file=sys.stderr)
+    print(" 1. Get a new API key from the Discovery dashboard", file=sys.stderr)
+    print(" 2. Update the DISCOVERY_API_KEY secret in GitHub", file=sys.stderr)
+    print(f"{separator}\n", file=sys.stderr)
+
+
+@pytest.mark.e2e
+@pytest.mark.asyncio
+async def test_client_e2e_full_flow(engine, test_dataframe, api_url):
+    """
+    Test the full end-to-end flow: upload, analyze, wait for completion.
+
+    This test:
+    1. Uploads a test dataset via the API
+    2. Creates a run
+    3. Waits for Modal to process the job
+    4. Verifies results are returned
+
+    This exercises the complete production flow including Modal.
+    """
+    try:
+        # Run analysis with wait=True to exercise full flow including Modal
+        result = await engine.run_async(
+            file=test_dataframe,
+            target_column="price",
+            depth_iterations=1,
+            description="E2E test dataset - house price prediction",
+            column_descriptions={
+                "age": "Age of the property owner",
+                "income": "Annual income in USD",
+                "experience": "Years of work experience",
+                "price": "House price in USD",
+            },
+            auto_report_use_llm_evals=False,  # Disable LLMs for faster test
+            wait=True,  # Wait for completion (exercises Modal)
+            wait_timeout=600,  # 10 minute timeout
+        )
+    except Exception as e:
+        error_str = str(e).lower()
+        if (
+            "401" in error_str
+            or "403" in error_str
+            or "unauthorized" in error_str
+            or "forbidden" in error_str
+        ):
+            print_auth_error_warning(e, api_url)
+            pytest.fail(f"API authentication failed - check DISCOVERY_API_KEY: {e}")
+        raise
+
+    # Verify results
+    assert result is not None, "Result should not be None"
+    assert result.run_id is not None, "Run ID should be set"
+    assert result.status == "completed", f"Run should be completed, got status: {result.status}"
+
+    # Verify we got patterns (at least one pattern should be found)
+    assert result.patterns is not None, "Patterns should not be None"
+    assert len(result.patterns) > 0, f"Should find at least one pattern, got {len(result.patterns)}"
+
+    # Verify summary exists
+    assert result.summary is not None, "Summary should not be None"
+
+    # Verify feature importance exists (if available)
+    # Note: Feature importance might be None in some cases, so we don't assert it exists
+
+
+@pytest.mark.e2e
+@pytest.mark.asyncio
+async def test_client_e2e_async_workflow(engine, test_dataframe, api_url):
+    """
+    Test async workflow: start analysis, then wait for completion separately.
+
+    This tests the async pattern where you start a run and check status later.
+    """
+    try:
+        # Start analysis without waiting
+        result = await engine.run_async(
+            file=test_dataframe,
+            target_column="price",
+            depth_iterations=1,
+            auto_report_use_llm_evals=False,
+            wait=False,  # Don't wait immediately
+        )
+    except Exception as e:
+        error_str = str(e).lower()
+        if (
+            "401" in error_str
+            or "403" in error_str
+            or "unauthorized" in error_str
+            or "forbidden" in error_str
+        ):
+            print_auth_error_warning(e, api_url)
+            pytest.fail(f"API authentication failed - check DISCOVERY_API_KEY: {e}")
+        raise
+
+    assert result is not None, "Result should not be None"
+    assert result.run_id is not None, "Run ID should be set"
+    run_id = result.run_id
+
+    # Now wait for completion separately
+    completed_result = await engine.wait_for_completion(
+        run_id=run_id,
+        poll_interval=5.0,  # Check every 5 seconds
+        timeout=600,  # 10 minute timeout
+    )
+
+    # Verify completion
+    assert (
+        completed_result.status == "completed"
+    ), f"Run should be completed, got: {completed_result.status}"
+    assert completed_result.patterns is not None, "Patterns should not be None"
+    assert len(completed_result.patterns) > 0, "Should find at least one pattern"
+
+
+@pytest.mark.e2e
+@pytest.mark.asyncio
+async def test_client_e2e_get_results(engine, test_dataframe, api_url):
+    """
+    Test getting results for an existing run.
+
+    This tests the get_results method which can be used to check status
+    of a run that was started elsewhere.
+    """
+    try:
+        # Start a run
+        result = await engine.run_async(
+            file=test_dataframe,
+            target_column="price",
+            depth_iterations=1,
+            auto_report_use_llm_evals=False,
+            wait=False,
+        )
+    except Exception as e:
+        error_str = str(e).lower()
+        if (
+            "401" in error_str
+            or "403" in error_str
+            or "unauthorized" in error_str
+            or "forbidden" in error_str
+        ):
+            print_auth_error_warning(e, api_url)
+            pytest.fail(f"API authentication failed - check DISCOVERY_API_KEY: {e}")
+        raise
+
+    run_id = result.run_id
+
+    # Get results immediately (might still be processing)
+    initial_result = await engine.get_results(run_id)
+    assert initial_result is not None, "Should get initial result"
+    assert initial_result.run_id == run_id, "Run ID should match"
+
+    # Wait for completion
+    final_result = await engine.wait_for_completion(run_id, timeout=600)
+
+    # Verify final results
+    assert final_result.status == "completed", "Run should complete"
+    assert final_result.patterns is not None, "Patterns should be available"
+    assert len(final_result.patterns) > 0, "Should find patterns"
@@ -1,241 +0,0 @@
-"""
-End-to-end tests for the Discovery Engine Python SDK.
-
-These tests call the real API and exercise the full flow including Modal.
-They are skipped if API credentials are not available.
-
-To run these tests locally:
-    # Set required environment variables
-    export DISCOVERY_API_KEY="your-api-key"
-
-    # Optional: Set environment (defaults to staging)
-    export ENVIRONMENT="staging"  # or "production"
-
-    # Run e2e tests
-    pytest engine/packages/client/tests/test_client_e2e.py -v
-
-    # Or run all tests except e2e
-    pytest -m "not e2e"
-
-To run in CI (GitHub Actions):
-    Set these secrets in GitHub:
-    - DISCOVERY_API_KEY: Your API key
-    - ENVIRONMENT: "staging" or "production" (optional, defaults to staging)
-
-    The tests will:
-    - Auto-detect environment from ENVIRONMENT or VERCEL_ENV
-    - Use staging URL (https://staging.disco.leap-labs.com) by default
-    - Use production URL (https://disco.leap-labs.com) if ENVIRONMENT=production
-    - Skip gracefully if DISCOVERY_API_KEY is not set
-"""
-
-import io
-import os
-
-import pandas as pd
-import pytest
-
-# Test data - simple regression dataset
-TEST_DATA_CSV = """age,income,experience,price
-25,50000,2,150000
-30,60000,5,180000
-35,70000,8,220000
-40,80000,12,250000
-45,90000,15,280000
-28,55000,3,160000
-32,65000,6,190000
-38,75000,10,230000
-42,85000,13,260000
-48,95000,18,300000
-"""
-
-
-# Hardcoded API URLs (these don't change)
-STAGING_API_URL = "https://staging.disco.leap-labs.com"
-PRODUCTION_API_URL = "https://disco.leap-labs.com"
-
-
-def get_api_key() -> str | None:
-    """Get API key from environment variable."""
-    return os.getenv("DISCOVERY_API_KEY")
-
-
-def get_environment() -> str:
-    """
-    Determine the current environment (staging or production).
-
-    Checks environment variables in order:
-    1. ENVIRONMENT (set in CI/GitHub Actions)
-    2. VERCEL_ENV (set in Vercel deployments)
-    3. Defaults to staging
-    """
-    env = os.getenv("ENVIRONMENT") or os.getenv("VERCEL_ENV")
-    if env == "production":
-        return "production"
-    return "staging"
-
-
-def get_api_url() -> str:
-    """
-    Get API URL based on environment.
-
-    Returns:
-        - Production URL if environment is production
-        - Staging URL otherwise (default)
-    """
-    env = get_environment()
-    if env == "production":
-        return PRODUCTION_API_URL
-    return STAGING_API_URL
-
-
-@pytest.fixture
-def api_key():
-    """Get API key from environment, skip test if not available."""
-    key = get_api_key()
-    if not key:
-        pytest.skip("DISCOVERY_API_KEY environment variable not set")
-    return key
-
-
-@pytest.fixture
-def api_url():
-    """Get API URL from environment (optional)."""
-    return get_api_url()
-
-
-@pytest.fixture
-def test_dataframe():
-    """Create test DataFrame from CSV string."""
-    try:
-        return pd.read_csv(io.StringIO(TEST_DATA_CSV))
-    except ImportError:
-        pytest.skip("pandas not available")
-
-
-@pytest.fixture
-def engine(api_key, api_url):
-    """Create Engine instance with API key and optional URL."""
-    from discovery import Engine
-
-    engine = Engine(api_key=api_key)
-    if api_url:
-        engine.base_url = api_url.rstrip("/")
-    return engine
-
-
-@pytest.mark.e2e
-@pytest.mark.asyncio
-async def test_client_e2e_full_flow(engine, test_dataframe):
-    """
-    Test the full end-to-end flow: upload, analyze, wait for completion.
-
-    This test:
-    1. Uploads a test dataset via the API
-    2. Creates a run
-    3. Waits for Modal to process the job
-    4. Verifies results are returned
-
-    This exercises the complete production flow including Modal.
-    """
-    # Run analysis with wait=True to exercise full flow including Modal
-    result = await engine.run_async(
-        file=test_dataframe,
-        target_column="price",
-        mode="fast",
-        description="E2E test dataset - house price prediction",
-        column_descriptions={
-            "age": "Age of the property owner",
-            "income": "Annual income in USD",
-            "experience": "Years of work experience",
-            "price": "House price in USD",
-        },
-        auto_report_use_llm_evals=False,  # Disable LLMs for faster test
-        wait=True,  # Wait for completion (exercises Modal)
-        wait_timeout=600,  # 10 minute timeout
-    )
-
-    # Verify results
-    assert result is not None, "Result should not be None"
-    assert result.run_id is not None, "Run ID should be set"
-    assert result.status == "completed", f"Run should be completed, got status: {result.status}"
-
-    # Verify we got patterns (at least one pattern should be found)
-    assert result.patterns is not None, "Patterns should not be None"
-    assert len(result.patterns) > 0, f"Should find at least one pattern, got {len(result.patterns)}"
-
-    # Verify summary exists
-    assert result.summary is not None, "Summary should not be None"
-
-    # Verify feature importance exists (if available)
-    # Note: Feature importance might be None in some cases, so we don't assert it exists
-
-
-@pytest.mark.e2e
-@pytest.mark.asyncio
-async def test_client_e2e_async_workflow(engine, test_dataframe):
-    """
-    Test async workflow: start analysis, then wait for completion separately.
-
-    This tests the async pattern where you start a run and check status later.
-    """
-    # Start analysis without waiting
-    result = await engine.run_async(
-        file=test_dataframe,
-        target_column="price",
-        mode="fast",
-        auto_report_use_llm_evals=False,
-        wait=False,  # Don't wait immediately
-    )
-
-    assert result is not None, "Result should not be None"
-    assert result.run_id is not None, "Run ID should be set"
-    run_id = result.run_id
-
-    # Now wait for completion separately
-    completed_result = await engine.wait_for_completion(
-        run_id=run_id,
-        poll_interval=5.0,  # Check every 5 seconds
-        timeout=600,  # 10 minute timeout
-    )
-
-    # Verify completion
-    assert (
-        completed_result.status == "completed"
-    ), f"Run should be completed, got: {completed_result.status}"
-    assert completed_result.patterns is not None, "Patterns should not be None"
-    assert len(completed_result.patterns) > 0, "Should find at least one pattern"
-
-
-@pytest.mark.e2e
-@pytest.mark.asyncio
-async def test_client_e2e_get_results(engine, test_dataframe):
-    """
-    Test getting results for an existing run.
-
-    This tests the get_results method which can be used to check status
-    of a run that was started elsewhere.
-    """
-    # Start a run
-    result = await engine.run_async(
-        file=test_dataframe,
-        target_column="price",
-        mode="fast",
-        auto_report_use_llm_evals=False,
-        wait=False,
-    )
-
-    run_id = result.run_id
-
-    # Get results immediately (might still be processing)
-    initial_result = await engine.get_results(run_id)
-    assert initial_result is not None, "Should get initial result"
-    assert initial_result.run_id == run_id, "Run ID should match"
-
-    # Wait for completion
-    final_result = await engine.wait_for_completion(run_id, timeout=600)
-
-    # Verify final results
-    assert final_result.status == "completed", "Run should complete"
-    assert final_result.patterns is not None, "Patterns should be available"
-    assert len(final_result.patterns) > 0, "Should find patterns"