ds-agent-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/bin/ds-agent.js +451 -0
  2. package/ds_agent/__init__.py +8 -0
  3. package/package.json +28 -0
  4. package/requirements.txt +126 -0
  5. package/setup.py +35 -0
  6. package/src/__init__.py +7 -0
  7. package/src/_compress_tool_result.py +118 -0
  8. package/src/api/__init__.py +4 -0
  9. package/src/api/app.py +1626 -0
  10. package/src/cache/__init__.py +5 -0
  11. package/src/cache/cache_manager.py +561 -0
  12. package/src/cli.py +2886 -0
  13. package/src/dynamic_prompts.py +281 -0
  14. package/src/orchestrator.py +4799 -0
  15. package/src/progress_manager.py +139 -0
  16. package/src/reasoning/__init__.py +332 -0
  17. package/src/reasoning/business_summary.py +431 -0
  18. package/src/reasoning/data_understanding.py +356 -0
  19. package/src/reasoning/model_explanation.py +383 -0
  20. package/src/reasoning/reasoning_trace.py +239 -0
  21. package/src/registry/__init__.py +3 -0
  22. package/src/registry/tools_registry.py +3 -0
  23. package/src/session_memory.py +448 -0
  24. package/src/session_store.py +370 -0
  25. package/src/storage/__init__.py +19 -0
  26. package/src/storage/artifact_store.py +620 -0
  27. package/src/storage/helpers.py +116 -0
  28. package/src/storage/huggingface_storage.py +694 -0
  29. package/src/storage/r2_storage.py +0 -0
  30. package/src/storage/user_files_service.py +288 -0
  31. package/src/tools/__init__.py +335 -0
  32. package/src/tools/advanced_analysis.py +823 -0
  33. package/src/tools/advanced_feature_engineering.py +708 -0
  34. package/src/tools/advanced_insights.py +578 -0
  35. package/src/tools/advanced_preprocessing.py +549 -0
  36. package/src/tools/advanced_training.py +906 -0
  37. package/src/tools/agent_tool_mapping.py +326 -0
  38. package/src/tools/auto_pipeline.py +420 -0
  39. package/src/tools/autogluon_training.py +1480 -0
  40. package/src/tools/business_intelligence.py +860 -0
  41. package/src/tools/cloud_data_sources.py +581 -0
  42. package/src/tools/code_interpreter.py +390 -0
  43. package/src/tools/computer_vision.py +614 -0
  44. package/src/tools/data_cleaning.py +614 -0
  45. package/src/tools/data_profiling.py +593 -0
  46. package/src/tools/data_type_conversion.py +268 -0
  47. package/src/tools/data_wrangling.py +433 -0
  48. package/src/tools/eda_reports.py +284 -0
  49. package/src/tools/enhanced_feature_engineering.py +241 -0
  50. package/src/tools/feature_engineering.py +302 -0
  51. package/src/tools/matplotlib_visualizations.py +1327 -0
  52. package/src/tools/model_training.py +520 -0
  53. package/src/tools/nlp_text_analytics.py +761 -0
  54. package/src/tools/plotly_visualizations.py +497 -0
  55. package/src/tools/production_mlops.py +852 -0
  56. package/src/tools/time_series.py +507 -0
  57. package/src/tools/tools_registry.py +2133 -0
  58. package/src/tools/visualization_engine.py +559 -0
  59. package/src/utils/__init__.py +42 -0
  60. package/src/utils/error_recovery.py +313 -0
  61. package/src/utils/parallel_executor.py +402 -0
  62. package/src/utils/polars_helpers.py +248 -0
  63. package/src/utils/schema_extraction.py +132 -0
  64. package/src/utils/semantic_layer.py +392 -0
  65. package/src/utils/token_budget.py +411 -0
  66. package/src/utils/validation.py +377 -0
  67. package/src/workflow_state.py +154 -0
@@ -0,0 +1,581 @@
1
+ """
2
+ Cloud Data Sources - BigQuery Integration
3
+ Tools for loading and writing data to/from Google BigQuery.
4
+ Compatible with existing DataScienceCopilot tool registry.
5
+ """
6
+
7
+ import polars as pl
8
+ import pandas as pd
9
+ from typing import Dict, Any, Optional, Literal
10
+ from pathlib import Path
11
+ import sys
12
+ import os
13
+
14
+ # Add parent directory to path
15
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
+
17
+ from ds_agent.utils.validation import validate_dataframe
18
+
19
+ try:
20
+ from google.cloud import bigquery
21
+ from google.oauth2 import service_account
22
+ BIGQUERY_AVAILABLE = True
23
+ except ImportError:
24
+ BIGQUERY_AVAILABLE = False
25
+ bigquery = None
26
+ service_account = None
27
+
28
+
29
def _get_bigquery_client(project_id: str) -> 'bigquery.Client':
    """
    Initialize a BigQuery client with credentials from the environment.

    Credential sources (in order of priority):
    1. GOOGLE_APPLICATION_CREDENTIALS env var (service account JSON path)
    2. Default application credentials (gcloud auth application-default login)

    Args:
        project_id: Google Cloud project ID

    Returns:
        BigQuery client instance

    Raises:
        ImportError: If google-cloud-bigquery not installed
        EnvironmentError: If credentials not found
    """
    if not BIGQUERY_AVAILABLE:
        raise ImportError(
            "google-cloud-bigquery is not installed. "
            "Install it with: pip install google-cloud-bigquery"
        )

    # Prefer an explicit service-account key file if one is configured.
    creds_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")

    if creds_path and Path(creds_path).exists():
        # Use service account JSON
        credentials = service_account.Credentials.from_service_account_file(creds_path)
        return bigquery.Client(project=project_id, credentials=credentials)

    # Fall back to Application Default Credentials.
    try:
        return bigquery.Client(project=project_id)
    except Exception as e:
        # Chain the original exception so the root cause is preserved
        # in the traceback instead of being flattened into the message.
        raise EnvironmentError(
            "BigQuery credentials not found. Either:\n"
            "1. Set GOOGLE_APPLICATION_CREDENTIALS to service account JSON path\n"
            "2. Run: gcloud auth application-default login\n"
            f"Error: {str(e)}"
        ) from e
73
+
74
+
75
def load_bigquery_table(
    project_id: str,
    dataset: str,
    table: str,
    limit: Optional[int] = None,
    columns: Optional[list] = None,
    where_clause: Optional[str] = None
) -> Dict[str, Any]:
    """
    Load data from BigQuery table into a Polars DataFrame.

    This tool allows the agent to load data from BigQuery for analysis.
    Supports sampling via LIMIT and column selection for memory efficiency.

    Args:
        project_id: Google Cloud project ID
        dataset: BigQuery dataset name
        table: BigQuery table name
        limit: Optional row limit for sampling (e.g., 10000 for large tables).
            An explicit 0 is honored as ``LIMIT 0``.
        columns: Optional list of column names to load (default: all columns)
        where_clause: Optional SQL WHERE clause for filtering (without WHERE keyword)
            Example: "created_at > '2024-01-01'"

    Returns:
        Dictionary with:
        - success: bool
        - data_path: str (saved CSV path for downstream tools)
        - df_info: dict (shape, columns, memory_usage)
        - message: str
        - query_stats: dict (bytes processed, rows returned)

    Examples:
        >>> # Load full table
        >>> load_bigquery_table("my-project", "analytics", "users")

        >>> # Sample 10K rows for exploration
        >>> load_bigquery_table("my-project", "analytics", "events", limit=10000)

        >>> # Load specific columns with filter
        >>> load_bigquery_table(
        ...     "my-project", "sales", "transactions",
        ...     columns=["customer_id", "amount", "date"],
        ...     where_clause="date >= '2024-01-01'",
        ...     limit=50000
        ... )
    """
    try:
        # Initialize client
        client = _get_bigquery_client(project_id)

        # Build query
        table_ref = f"{project_id}.{dataset}.{table}"

        columns_str = ", ".join(columns) if columns else "*"

        query = f"SELECT {columns_str} FROM `{table_ref}`"

        # SECURITY NOTE: where_clause (and columns) are interpolated directly
        # into the SQL string — there is no parameterization. Callers must not
        # pass untrusted input here.
        if where_clause:
            query += f" WHERE {where_clause}"

        # Use an explicit None check so limit=0 produces LIMIT 0 rather than
        # silently loading the full table.
        if limit is not None:
            query += f" LIMIT {limit}"

        # Execute query
        query_job = client.query(query)

        # Load results into pandas (BigQuery SDK returns pandas)
        df_pandas = query_job.to_dataframe()

        # Convert to Polars for consistency with existing tools
        df = pl.from_pandas(df_pandas)

        # Validate
        validate_dataframe(df)

        # Save to outputs/data/ for downstream tool compatibility
        output_dir = Path("./outputs/data")
        output_dir.mkdir(parents=True, exist_ok=True)

        output_path = output_dir / f"bigquery_{dataset}_{table}.csv"
        df.write_csv(output_path)

        # Get query statistics
        bytes_processed = query_job.total_bytes_processed or 0
        bytes_billed = query_job.total_bytes_billed or 0

        return {
            "success": True,
            "data_path": str(output_path),
            "df_info": {
                "rows": df.shape[0],
                "columns": df.shape[1],
                "column_names": df.columns,
                "memory_mb": round(df.estimated_size("mb"), 2)
            },
            "query_stats": {
                "bytes_processed": bytes_processed,
                "bytes_processed_mb": round(bytes_processed / 1024 / 1024, 2),
                "bytes_billed": bytes_billed,
                "bytes_billed_mb": round(bytes_billed / 1024 / 1024, 2),
                "rows_returned": len(df)
            },
            "message": f"✅ Loaded {len(df):,} rows from {table_ref}. Saved to {output_path}",
            "table_reference": table_ref,
            "query": query
        }

    except ImportError as e:
        return {
            "success": False,
            "error": str(e),
            "error_type": "ImportError",
            "message": "BigQuery library not installed. Run: pip install google-cloud-bigquery"
        }

    except Exception as e:
        return {
            "success": False,
            "error": str(e),
            "error_type": type(e).__name__,
            "message": f"Failed to load BigQuery table: {str(e)}"
        }
200
+
201
+
202
def write_bigquery_table(
    file_path: str,
    project_id: str,
    dataset: str,
    table: str,
    mode: Literal["append", "overwrite", "fail"] = "append"
) -> Dict[str, Any]:
    """
    Write DataFrame to BigQuery table from CSV/Parquet file.

    This tool allows the agent to save predictions, metrics, or processed data
    back to BigQuery for downstream consumption.

    Args:
        file_path: Path to CSV or Parquet file containing data to write
        project_id: Google Cloud project ID
        dataset: BigQuery dataset name
        table: BigQuery table name
        mode: Write mode
            - "append": Add rows to existing table
            - "overwrite": Replace table contents
            - "fail": Raise error if table exists

    Returns:
        Dictionary with:
        - success: bool
        - table_reference: str
        - rows_written: int
        - message: str

    Examples:
        >>> # Write predictions to BigQuery
        >>> write_bigquery_table(
        ...     "./outputs/data/predictions.csv",
        ...     "my-project",
        ...     "ml_results",
        ...     "churn_predictions",
        ...     mode="append"
        ... )

        >>> # Overwrite existing metrics table
        >>> write_bigquery_table(
        ...     "./outputs/data/metrics.csv",
        ...     "my-project",
        ...     "ml_results",
        ...     "model_metrics",
        ...     mode="overwrite"
        ... )
    """
    try:
        # Initialize client (raises ImportError first if the SDK is missing,
        # preserving the original error precedence)
        client = _get_bigquery_client(project_id)

        # Map mode -> BigQuery write disposition. Validate up front so an
        # invalid mode fails fast, before any file I/O or DataFrame loading.
        dispositions = {
            "append": bigquery.WriteDisposition.WRITE_APPEND,
            "overwrite": bigquery.WriteDisposition.WRITE_TRUNCATE,
            "fail": bigquery.WriteDisposition.WRITE_EMPTY,
        }
        if mode not in dispositions:
            return {
                "success": False,
                "error": f"Invalid mode: {mode}. Use 'append', 'overwrite', or 'fail'",
                "error_type": "ValueError"
            }
        write_disposition = dispositions[mode]

        # Load data from file
        file_path = Path(file_path)
        if not file_path.exists():
            return {
                "success": False,
                "error": f"File not found: {file_path}",
                "error_type": "FileNotFoundError"
            }

        # Load based on extension
        suffix = file_path.suffix.lower()
        if suffix == ".csv":
            df = pl.read_csv(file_path)
        elif suffix == ".parquet":
            df = pl.read_parquet(file_path)
        else:
            return {
                "success": False,
                "error": f"Unsupported file format: {file_path.suffix}",
                "error_type": "ValueError"
            }

        # Convert to pandas (BigQuery SDK requires pandas)
        df_pandas = df.to_pandas()

        # Build table reference
        table_ref = f"{project_id}.{dataset}.{table}"

        # Configure job
        job_config = bigquery.LoadJobConfig(
            write_disposition=write_disposition,
            autodetect=True  # Auto-detect schema from DataFrame
        )

        # Execute write job
        job = client.load_table_from_dataframe(
            df_pandas,
            table_ref,
            job_config=job_config
        )

        # Wait for completion
        job.result()

        return {
            "success": True,
            "table_reference": table_ref,
            "rows_written": len(df_pandas),
            "mode": mode,
            "message": f"✅ Wrote {len(df_pandas):,} rows to {table_ref} (mode: {mode})",
            "table_info": {
                "project": project_id,
                "dataset": dataset,
                "table": table,
                "columns": df.columns,
                "rows": len(df)
            }
        }

    except ImportError as e:
        return {
            "success": False,
            "error": str(e),
            "error_type": "ImportError",
            "message": "BigQuery library not installed. Run: pip install google-cloud-bigquery"
        }

    except Exception as e:
        return {
            "success": False,
            "error": str(e),
            "error_type": type(e).__name__,
            "message": f"Failed to write to BigQuery: {str(e)}"
        }
342
+
343
+
344
def profile_bigquery_table(
    project_id: str,
    dataset: str,
    table: str
) -> Dict[str, Any]:
    """
    Profile a BigQuery table without loading all data.

    Returns metadata including row count, column types, null counts,
    and table size. Useful for initial exploration before full load.

    Args:
        project_id: Google Cloud project ID
        dataset: BigQuery dataset name
        table: BigQuery table name

    Returns:
        Dictionary with:
        - success: bool
        - table_reference: str
        - row_count: int
        - columns: list of dicts with column info
        - table_size_mb: float
        - created: str (timestamp)
        - modified: str (timestamp)
        - message: str

    Examples:
        >>> # Quick profile before loading
        >>> profile_bigquery_table("my-project", "analytics", "events")
        {
            "success": True,
            "row_count": 1000000,
            "columns": [
                {"name": "user_id", "type": "STRING", "mode": "NULLABLE"},
                {"name": "event_time", "type": "TIMESTAMP", "mode": "REQUIRED"},
                ...
            ],
            "table_size_mb": 125.5
        }
    """
    try:
        # Initialize client
        client = _get_bigquery_client(project_id)

        # Get table metadata
        table_ref = f"{project_id}.{dataset}.{table}"
        table_obj = client.get_table(table_ref)

        # Extract schema information
        columns_info = []
        for field in table_obj.schema:
            columns_info.append({
                "name": field.name,
                "type": field.field_type,
                "mode": field.mode,  # NULLABLE, REQUIRED, REPEATED
                "description": field.description or ""
            })

        # Get null counts via query (sample for efficiency)
        null_counts = {}
        try:
            # Backtick-quote identifiers so column names that are reserved
            # words or contain special characters don't break the query.
            null_exprs = ", ".join(
                f"COUNTIF(`{col['name']}` IS NULL) AS `{col['name']}_nulls`"
                for col in columns_info
            )
            # Use TABLESAMPLE for large tables (1% sample)
            sample_query = f"""
            SELECT {null_exprs}
            FROM `{table_ref}`
            TABLESAMPLE SYSTEM (1 PERCENT)
            """

            query_job = client.query(sample_query)
            result = query_job.result()
            row = next(iter(result))

            for col in columns_info:
                null_counts[col["name"]] = row.get(f'{col["name"]}_nulls', 0)
        except Exception:
            # If sampling fails (e.g. views don't support TABLESAMPLE),
            # degrade gracefully and skip null counts.
            null_counts = {col["name"]: "N/A" for col in columns_info}

        # Table size information
        table_size_bytes = table_obj.num_bytes or 0
        table_size_mb = round(table_size_bytes / 1024 / 1024, 2)

        return {
            "success": True,
            "table_reference": table_ref,
            "profile": {
                "row_count": table_obj.num_rows,
                "column_count": len(columns_info),
                "table_size_mb": table_size_mb,
                "table_size_gb": round(table_size_mb / 1024, 2)
            },
            "columns": columns_info,
            "null_counts_sample": null_counts,
            "metadata": {
                "created": table_obj.created.isoformat() if table_obj.created else None,
                "modified": table_obj.modified.isoformat() if table_obj.modified else None,
                "location": table_obj.location,
                "expiration": table_obj.expires.isoformat() if table_obj.expires else None
            },
            "message": f"✅ Profiled {table_ref}: {table_obj.num_rows:,} rows, {len(columns_info)} columns, {table_size_mb} MB",
            "recommendation": (
                f"Table has {table_obj.num_rows:,} rows. "
                f"Consider using limit={min(10000, table_obj.num_rows)} for initial exploration."
                if table_obj.num_rows > 10000 else
                f"Table is small ({table_obj.num_rows:,} rows), safe to load fully."
            )
        }

    except ImportError as e:
        return {
            "success": False,
            "error": str(e),
            "error_type": "ImportError",
            "message": "BigQuery library not installed. Run: pip install google-cloud-bigquery"
        }

    except Exception as e:
        return {
            "success": False,
            "error": str(e),
            "error_type": type(e).__name__,
            "message": f"Failed to profile BigQuery table: {str(e)}"
        }
470
+
471
+
472
def query_bigquery(
    project_id: str,
    query: str,
    output_path: Optional[str] = None,
    limit: Optional[int] = None
) -> Dict[str, Any]:
    """
    Execute a custom BigQuery SQL query and return results as DataFrame.

    This tool allows the agent to run custom SQL queries for complex
    data transformations before analysis.

    Args:
        project_id: Google Cloud project ID
        query: SQL query to execute
        output_path: Optional path to save results (default: auto-generated)
        limit: Optional row limit to append to query

    Returns:
        Dictionary with:
        - success: bool
        - data_path: str
        - df_info: dict
        - query_stats: dict
        - message: str

    Examples:
        >>> # Custom aggregation query
        >>> query_bigquery(
        ...     "my-project",
        ...     '''
        ...     SELECT
        ...         customer_id,
        ...         SUM(amount) as total_spent,
        ...         COUNT(*) as num_orders
        ...     FROM `my-project.sales.orders`
        ...     WHERE date >= '2024-01-01'
        ...     GROUP BY customer_id
        ...     '''
        ... )
    """
    try:
        # Initialize client
        client = _get_bigquery_client(project_id)

        # Append LIMIT if requested. Strip trailing whitespace BEFORE the
        # semicolon strip: a query ending in ";\n" would otherwise keep its
        # semicolon and produce invalid SQL like "...; LIMIT 100".
        # `is not None` so an explicit limit=0 yields LIMIT 0.
        if limit is not None:
            query = f"{query.rstrip().rstrip(';')} LIMIT {limit}"

        # Execute query
        query_job = client.query(query)
        df_pandas = query_job.to_dataframe()

        # Convert to Polars
        df = pl.from_pandas(df_pandas)

        # Determine output path
        if output_path is None:
            output_dir = Path("./outputs/data")
            output_dir.mkdir(parents=True, exist_ok=True)
            output_path = str(output_dir / "bigquery_query_result.csv")

        # Save results
        df.write_csv(output_path)

        # Get query statistics
        bytes_processed = query_job.total_bytes_processed or 0

        return {
            "success": True,
            "data_path": output_path,
            "df_info": {
                "rows": df.shape[0],
                "columns": df.shape[1],
                "column_names": df.columns,
                "memory_mb": round(df.estimated_size("mb"), 2)
            },
            "query_stats": {
                "bytes_processed": bytes_processed,
                "bytes_processed_mb": round(bytes_processed / 1024 / 1024, 2),
                "rows_returned": len(df)
            },
            "message": f"✅ Query returned {len(df):,} rows. Saved to {output_path}",
            "query": query
        }

    except ImportError as e:
        return {
            "success": False,
            "error": str(e),
            "error_type": "ImportError",
            "message": "BigQuery library not installed. Run: pip install google-cloud-bigquery"
        }

    except Exception as e:
        return {
            "success": False,
            "error": str(e),
            "error_type": type(e).__name__,
            "message": f"Failed to execute BigQuery query: {str(e)}"
        }
573
+
574
+
575
# Public API exported to the DataScienceCopilot tool registry;
# keeps `from ... import *` limited to the four agent-facing tools.
__all__ = [
    'load_bigquery_table',
    'write_bigquery_table',
    'profile_bigquery_table',
    'query_bigquery'
]