datatoolpack 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,299 @@
1
+ Metadata-Version: 2.4
2
+ Name: datatoolpack
3
+ Version: 0.2.0
4
+ Summary: Official Python SDK for the AutoData ML data preparation pipeline API
5
+ Home-page: https://autodata.datatoolpack.com
6
+ Author: AutoData Team
7
+ Author-email: support@datatoolpack.com
8
+ Project-URL: Documentation, https://autodata.datatoolpack.com/docs
9
+ Project-URL: Bug Tracker, https://github.com/datatoolpack/autodata-client/issues
10
+ Keywords: autodata machine-learning data-preparation synthetic-data ml-pipeline
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.8
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
23
+ Requires-Python: >=3.8
24
+ Description-Content-Type: text/markdown
25
+ Requires-Dist: requests>=2.25.0
26
+ Dynamic: author
27
+ Dynamic: author-email
28
+ Dynamic: classifier
29
+ Dynamic: description
30
+ Dynamic: description-content-type
31
+ Dynamic: home-page
32
+ Dynamic: keywords
33
+ Dynamic: project-url
34
+ Dynamic: requires-dist
35
+ Dynamic: requires-python
36
+ Dynamic: summary
37
+
38
+ # AutoData Python Client
39
+
40
+ Official Python SDK for the [AutoData](https://autodata.datatoolpack.com) ML data preparation pipeline API.
41
+
42
+ ## Installation
43
+
44
+ ```bash
45
+ pip install datatoolpack
46
+ ```
47
+
48
+ Or install from source:
49
+
50
+ ```bash
51
+ git clone https://github.com/datatoolpack/autodata-client
52
+ cd autodata-client
53
+ pip install .
54
+ ```
55
+
56
+ ## Quick Start
57
+
58
+ ```python
59
+ from autodata import AutoDataClient
60
+
61
+ client = AutoDataClient(
62
+ api_key="dtpk_YOUR_API_KEY",
63
+ base_url="https://autodata.datatoolpack.com",
64
+ )
65
+
66
+ result = client.process(
67
+ file_path="data.csv",
68
+ target_columns=["price"],
69
+ output_rows=20000,
70
+ )
71
+ print(result["files"])
72
+ # [{'name': 'dsg.csv', 'url': '/download/...', 'size': 1048576, 'description': '...'}]
73
+ ```
74
+
75
+ Get your API key from the [AutoData dashboard](https://autodata.datatoolpack.com/dashboard) → API Keys tab.
76
+
77
+ ---
78
+
79
+ ## Reference
80
+
81
+ ### `AutoDataClient(api_key, base_url, timeout)`
82
+
83
+ | Parameter | Type | Default | Description |
84
+ |------------|-------|--------------------------------------|------------------------------------|
85
+ | `api_key` | `str` | required | API key starting with `dtpk_` |
86
+ | `base_url` | `str` | `"https://autodata.datatoolpack.com"` | Server URL (no trailing slash) |
87
+ | `timeout` | `int` | `120` | Request timeout in seconds |
88
+
89
+ ---
90
+
91
+ ### `client.process(...)` — Upload & run pipeline
92
+
93
+ ```python
94
+ result = client.process(
95
+ file_path="data.csv", # Path to input CSV
96
+ target_columns=["price"], # y-column(s) for ML
97
+ output_rows=20000, # Target row count in output
98
+ tools={ # Toggle pipeline steps (all optional)
99
+ "anomaly": False, # Anomaly detection (off by default)
100
+ "dtc": True, # Data Type Conversion
101
+ "mdh": True, # Missing Data Handler
102
+ "cds": True, # Column Scaling
103
+ "dsm": True, # Data Split Manager
104
+ "dsg": True, # Synthetic Data Generator
105
+ },
106
+ advanced_params={ # Fine-grained parameters (all optional)
107
+ "excluded_columns": ["id"], # Columns to drop before processing
108
+ "text_mode": 0, # 0=none, 1=neural, 2=tfidf
109
+ "text_cleaning": True, # Clean text before encoding
110
+ "zscore_limit": 3.0, # Z-score outlier threshold
111
+ "dsg_mode": "copula", # "copula" or "gan"
112
+ "similarity_p": 95, # Similarity percentile for DSG
113
+ },
114
+ wait=True, # Block until complete (default True)
115
+ poll_interval=2, # Status poll interval in seconds
116
+ download_path="./outputs/", # Where to save files (default auto)
117
+ output_preferences=["dsg.csv"], # Which files to download (default all)
118
+ compressed=True, # Download as ZIP (default True)
119
+ )
120
+ ```
121
+
122
+ **Returns** a dict:
123
+
124
+ ```python
125
+ {
126
+ "session_id": "abc123...",
127
+ "status": "completed",
128
+ "files": [
129
+ {"name": "dsg.csv", "url": "/download/.../dsg.csv", "size": 2097152, "description": "Synthetic data"},
130
+ {"name": "dsm_train.csv", ...},
131
+ ...
132
+ ],
133
+ "row_count": 20000,
134
+ "duration_seconds": 42.1,
135
+ }
136
+ ```
137
+
138
+ Set `wait=False` to get back immediately with just `session_id` and `status`:
139
+
140
+ ```python
141
+ result = client.process(file_path="data.csv", target_columns="price", wait=False)
142
+ session_id = result["session_id"]
143
+ ```
144
+
145
+ ---
146
+
147
+ ### `client.get_status(session_id)` — Poll progress
148
+
149
+ ```python
150
+ status = client.get_status(session_id)
151
+ # {
152
+ # "status": "running", # queued | running | completed | error | cancelled
153
+ # "message": "Running MDH...",
154
+ # "current_step": 3,
155
+ # "total_steps": 6,
156
+ # "progress_percent": 50,
157
+ # "duration_seconds": 15.3,
158
+ # }
159
+ ```
160
+
161
+ ---
162
+
163
+ ### `client.get_result(session_id)` — Fetch completed results
164
+
165
+ ```python
166
+ result = client.get_result(session_id)
167
+ # {"status": "completed", "files": [...], "row_count": ..., "duration_seconds": ...}
168
+ ```
169
+
170
+ ---
171
+
172
+ ### `client.wait_for_completion(session_id, poll_interval)` — Block until done
173
+
174
+ ```python
175
+ result = client.wait_for_completion(session_id, poll_interval=3)
176
+ ```
177
+
178
+ Prints live progress to stdout. Raises `AutoDataError` if processing fails.
179
+
180
+ ---
181
+
182
+ ### `client.cancel(session_id)` — Cancel a running job
183
+
184
+ ```python
185
+ cancelled = client.cancel(session_id) # True if acknowledged
186
+ ```
187
+
188
+ ---
189
+
190
+ ### `client.download_results(session_id, ...)` — Download output files
191
+
192
+ ```python
193
+ path = client.download_results(
194
+ session_id,
195
+ download_path="./my_outputs/", # Directory to save into
196
+ output_preferences=["dsg.csv"], # Specific files only (None = all)
197
+ compressed=True, # ZIP download (default) or individual files
198
+ )
199
+ print(f"Saved to {path}")
200
+ ```
201
+
202
+ ---
203
+
204
+ ### `client.download_file(url, output_path)` — Download a single file
205
+
206
+ ```python
207
+ client.download_file("/download/abc123.../dsg.csv", "dsg.csv")
208
+ ```
209
+
210
+ ---
211
+
212
+ ### `client.list_keys()` — List API keys
213
+
214
+ ```python
215
+ keys = client.list_keys()
216
+ # [{"id": "...", "name": "My Key", "prefix": "dtpk_abc123", "created_at": "..."}]
217
+ ```
218
+
219
+ ---
220
+
221
+ ### `client.get_usage()` — Usage statistics
222
+
223
+ ```python
224
+ usage = client.get_usage()
225
+ # {
226
+ # "daily_credits_used": 500,
227
+ # "daily_credit_limit": 10000,
228
+ # "daily_remaining": 9500,
229
+ # "lifetime_credits_used": 12340,
230
+ # "lifetime_credit_limit": 1000000,
231
+ # "lifetime_remaining": 987660,
232
+ # "daily_request_count": 3,
233
+ # "last_used_at": "2026-04-12T10:30:00Z",
234
+ # }
235
+ ```
236
+
237
+ ---
238
+
239
+ ## Error Handling
240
+
241
+ All API errors raise `AutoDataError`:
242
+
243
+ ```python
244
+ from autodata import AutoDataClient, AutoDataError
245
+
246
+ client = AutoDataClient(api_key="dtpk_...")
247
+
248
+ try:
249
+ result = client.process("data.csv", target_columns="price")
250
+ except AutoDataError as e:
251
+ print(f"API error {e.status_code}: {e}")
252
+ except FileNotFoundError as e:
253
+ print(f"File not found: {e}")
254
+ ```
255
+
256
+ `AutoDataError` attributes:
257
+ - `str(e)` — human-readable error message from the server
258
+ - `e.status_code` — HTTP status code (e.g. `401`, `429`, `500`), or `None` for non-HTTP errors
259
+
260
+ ---
261
+
262
+ ## Advanced Example: Non-blocking with manual polling
263
+
264
+ ```python
265
+ import time
266
+ from autodata import AutoDataClient, AutoDataError
267
+
268
+ client = AutoDataClient(api_key="dtpk_...")
269
+
270
+ # Start job without blocking
271
+ job = client.process("large_dataset.csv", target_columns=["churn"], wait=False)
272
+ session_id = job["session_id"]
273
+ print(f"Job started: {session_id}")
274
+
275
+ # Poll manually
276
+ while True:
277
+ status = client.get_status(session_id)
278
+ print(f" {status['progress_percent']}% — {status['message']}")
279
+ if status["status"] == "completed":
280
+ break
281
+ elif status["status"] in ("error", "cancelled"):
282
+ raise AutoDataError(f"Job {status['status']}: {status['message']}")
283
+ time.sleep(5)
284
+
285
+ # Download results
286
+ path = client.download_results(session_id, download_path="./outputs/")
287
+ print(f"Results saved to {path}")
288
+ ```
289
+
290
+ ---
291
+
292
+ ## Requirements
293
+
294
+ - Python ≥ 3.8
295
+ - `requests` ≥ 2.25.0
296
+
297
+ ## License
298
+
299
+ MIT
@@ -0,0 +1,262 @@
1
+ # AutoData Python Client
2
+
3
+ Official Python SDK for the [AutoData](https://autodata.datatoolpack.com) ML data preparation pipeline API.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install datatoolpack
9
+ ```
10
+
11
+ Or install from source:
12
+
13
+ ```bash
14
+ git clone https://github.com/datatoolpack/autodata-client
15
+ cd autodata-client
16
+ pip install .
17
+ ```
18
+
19
+ ## Quick Start
20
+
21
+ ```python
22
+ from autodata import AutoDataClient
23
+
24
+ client = AutoDataClient(
25
+ api_key="dtpk_YOUR_API_KEY",
26
+ base_url="https://autodata.datatoolpack.com",
27
+ )
28
+
29
+ result = client.process(
30
+ file_path="data.csv",
31
+ target_columns=["price"],
32
+ output_rows=20000,
33
+ )
34
+ print(result["files"])
35
+ # [{'name': 'dsg.csv', 'url': '/download/...', 'size': 1048576, 'description': '...'}]
36
+ ```
37
+
38
+ Get your API key from the [AutoData dashboard](https://autodata.datatoolpack.com/dashboard) → API Keys tab.
39
+
40
+ ---
41
+
42
+ ## Reference
43
+
44
+ ### `AutoDataClient(api_key, base_url, timeout)`
45
+
46
+ | Parameter | Type | Default | Description |
47
+ |------------|-------|--------------------------------------|------------------------------------|
48
+ | `api_key` | `str` | required | API key starting with `dtpk_` |
49
+ | `base_url` | `str` | `"https://autodata.datatoolpack.com"` | Server URL (no trailing slash) |
50
+ | `timeout` | `int` | `120` | Request timeout in seconds |
51
+
52
+ ---
53
+
54
+ ### `client.process(...)` — Upload & run pipeline
55
+
56
+ ```python
57
+ result = client.process(
58
+ file_path="data.csv", # Path to input CSV
59
+ target_columns=["price"], # y-column(s) for ML
60
+ output_rows=20000, # Target row count in output
61
+ tools={ # Toggle pipeline steps (all optional)
62
+ "anomaly": False, # Anomaly detection (off by default)
63
+ "dtc": True, # Data Type Conversion
64
+ "mdh": True, # Missing Data Handler
65
+ "cds": True, # Column Scaling
66
+ "dsm": True, # Data Split Manager
67
+ "dsg": True, # Synthetic Data Generator
68
+ },
69
+ advanced_params={ # Fine-grained parameters (all optional)
70
+ "excluded_columns": ["id"], # Columns to drop before processing
71
+ "text_mode": 0, # 0=none, 1=neural, 2=tfidf
72
+ "text_cleaning": True, # Clean text before encoding
73
+ "zscore_limit": 3.0, # Z-score outlier threshold
74
+ "dsg_mode": "copula", # "copula" or "gan"
75
+ "similarity_p": 95, # Similarity percentile for DSG
76
+ },
77
+ wait=True, # Block until complete (default True)
78
+ poll_interval=2, # Status poll interval in seconds
79
+ download_path="./outputs/", # Where to save files (default auto)
80
+ output_preferences=["dsg.csv"], # Which files to download (default all)
81
+ compressed=True, # Download as ZIP (default True)
82
+ )
83
+ ```
84
+
85
+ **Returns** a dict:
86
+
87
+ ```python
88
+ {
89
+ "session_id": "abc123...",
90
+ "status": "completed",
91
+ "files": [
92
+ {"name": "dsg.csv", "url": "/download/.../dsg.csv", "size": 2097152, "description": "Synthetic data"},
93
+ {"name": "dsm_train.csv", ...},
94
+ ...
95
+ ],
96
+ "row_count": 20000,
97
+ "duration_seconds": 42.1,
98
+ }
99
+ ```
100
+
101
+ Set `wait=False` to get back immediately with just `session_id` and `status`:
102
+
103
+ ```python
104
+ result = client.process(file_path="data.csv", target_columns="price", wait=False)
105
+ session_id = result["session_id"]
106
+ ```
107
+
108
+ ---
109
+
110
+ ### `client.get_status(session_id)` — Poll progress
111
+
112
+ ```python
113
+ status = client.get_status(session_id)
114
+ # {
115
+ # "status": "running", # queued | running | completed | error | cancelled
116
+ # "message": "Running MDH...",
117
+ # "current_step": 3,
118
+ # "total_steps": 6,
119
+ # "progress_percent": 50,
120
+ # "duration_seconds": 15.3,
121
+ # }
122
+ ```
123
+
124
+ ---
125
+
126
+ ### `client.get_result(session_id)` — Fetch completed results
127
+
128
+ ```python
129
+ result = client.get_result(session_id)
130
+ # {"status": "completed", "files": [...], "row_count": ..., "duration_seconds": ...}
131
+ ```
132
+
133
+ ---
134
+
135
+ ### `client.wait_for_completion(session_id, poll_interval)` — Block until done
136
+
137
+ ```python
138
+ result = client.wait_for_completion(session_id, poll_interval=3)
139
+ ```
140
+
141
+ Prints live progress to stdout. Raises `AutoDataError` if processing fails.
142
+
143
+ ---
144
+
145
+ ### `client.cancel(session_id)` — Cancel a running job
146
+
147
+ ```python
148
+ cancelled = client.cancel(session_id) # True if acknowledged
149
+ ```
150
+
151
+ ---
152
+
153
+ ### `client.download_results(session_id, ...)` — Download output files
154
+
155
+ ```python
156
+ path = client.download_results(
157
+ session_id,
158
+ download_path="./my_outputs/", # Directory to save into
159
+ output_preferences=["dsg.csv"], # Specific files only (None = all)
160
+ compressed=True, # ZIP download (default) or individual files
161
+ )
162
+ print(f"Saved to {path}")
163
+ ```
164
+
165
+ ---
166
+
167
+ ### `client.download_file(url, output_path)` — Download a single file
168
+
169
+ ```python
170
+ client.download_file("/download/abc123.../dsg.csv", "dsg.csv")
171
+ ```
172
+
173
+ ---
174
+
175
+ ### `client.list_keys()` — List API keys
176
+
177
+ ```python
178
+ keys = client.list_keys()
179
+ # [{"id": "...", "name": "My Key", "prefix": "dtpk_abc123", "created_at": "..."}]
180
+ ```
181
+
182
+ ---
183
+
184
+ ### `client.get_usage()` — Usage statistics
185
+
186
+ ```python
187
+ usage = client.get_usage()
188
+ # {
189
+ # "daily_credits_used": 500,
190
+ # "daily_credit_limit": 10000,
191
+ # "daily_remaining": 9500,
192
+ # "lifetime_credits_used": 12340,
193
+ # "lifetime_credit_limit": 1000000,
194
+ # "lifetime_remaining": 987660,
195
+ # "daily_request_count": 3,
196
+ # "last_used_at": "2026-04-12T10:30:00Z",
197
+ # }
198
+ ```
199
+
200
+ ---
201
+
202
+ ## Error Handling
203
+
204
+ All API errors raise `AutoDataError`:
205
+
206
+ ```python
207
+ from autodata import AutoDataClient, AutoDataError
208
+
209
+ client = AutoDataClient(api_key="dtpk_...")
210
+
211
+ try:
212
+ result = client.process("data.csv", target_columns="price")
213
+ except AutoDataError as e:
214
+ print(f"API error {e.status_code}: {e}")
215
+ except FileNotFoundError as e:
216
+ print(f"File not found: {e}")
217
+ ```
218
+
219
+ `AutoDataError` attributes:
220
+ - `str(e)` — human-readable error message from the server
221
+ - `e.status_code` — HTTP status code (e.g. `401`, `429`, `500`), or `None` for non-HTTP errors
222
+
223
+ ---
224
+
225
+ ## Advanced Example: Non-blocking with manual polling
226
+
227
+ ```python
228
+ import time
229
+ from autodata import AutoDataClient, AutoDataError
230
+
231
+ client = AutoDataClient(api_key="dtpk_...")
232
+
233
+ # Start job without blocking
234
+ job = client.process("large_dataset.csv", target_columns=["churn"], wait=False)
235
+ session_id = job["session_id"]
236
+ print(f"Job started: {session_id}")
237
+
238
+ # Poll manually
239
+ while True:
240
+ status = client.get_status(session_id)
241
+ print(f" {status['progress_percent']}% — {status['message']}")
242
+ if status["status"] == "completed":
243
+ break
244
+ elif status["status"] in ("error", "cancelled"):
245
+ raise AutoDataError(f"Job {status['status']}: {status['message']}")
246
+ time.sleep(5)
247
+
248
+ # Download results
249
+ path = client.download_results(session_id, download_path="./outputs/")
250
+ print(f"Results saved to {path}")
251
+ ```
252
+
253
+ ---
254
+
255
+ ## Requirements
256
+
257
+ - Python ≥ 3.8
258
+ - `requests` ≥ 2.25.0
259
+
260
+ ## License
261
+
262
+ MIT
@@ -0,0 +1,3 @@
1
+ from .client import AutoDataClient, AutoDataError
2
+
3
+ __all__ = ['AutoDataClient', 'AutoDataError']
@@ -0,0 +1,345 @@
1
+ """
2
+ AutoData Python Client
3
+ ======================
4
+ Official Python SDK for the AutoData ML data preparation pipeline API.
5
+
6
+ Usage:
7
+ from autodata import AutoDataClient
8
+
9
+ client = AutoDataClient(api_key="dtpk_...", base_url="https://autodata.datatoolpack.com")
10
+
11
+ result = client.process(
12
+ file_path="data.csv",
13
+ target_columns=["price"],
14
+ output_rows=20000,
15
+ )
16
+ print(result["files"])
17
+ """
18
+
19
+ import io
20
+ import json
21
+ import os
22
+ import time
23
+ import zipfile
24
+ from typing import Dict, List, Optional, Union
25
+
26
+
27
+ try:
28
+ import requests
29
+ except ImportError as exc: # pragma: no cover
30
+ raise ImportError("The 'requests' package is required. Install with: pip install requests") from exc
31
+
32
+
33
+ class AutoDataError(Exception):
34
+ """Raised for API errors returned by the AutoData server."""
35
+ def __init__(self, message: str, status_code: Optional[int] = None):
36
+ super().__init__(message)
37
+ self.status_code = status_code
38
+
39
+
40
class AutoDataClient:
    """
    Client for the AutoData REST API v1.

    All endpoints authenticate with an API key issued from the AutoData dashboard.
    Pass the key as the ``api_key`` argument — it is sent as a Bearer token.

    Args:
        api_key: API key string starting with ``dtpk_``.
        base_url: Base URL of the AutoData server (no trailing slash).
        timeout: HTTP request timeout in seconds (default 120).

    Raises:
        ValueError: If ``api_key`` does not start with ``dtpk_``.
    """

    def __init__(
        self,
        api_key: str,
        base_url: str = "https://autodata.datatoolpack.com",
        timeout: int = 120,
    ):
        if not api_key or not api_key.startswith("dtpk_"):
            raise ValueError("api_key must start with 'dtpk_'. Get yours from the AutoData dashboard.")
        self.api_key = api_key
        # Normalize so _url() can append path fragments without double slashes.
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        # A single Session reuses TCP connections and carries the auth header
        # on every request made by this client.
        self._session = requests.Session()
        self._session.headers.update({"Authorization": f"Bearer {api_key}"})

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _url(self, path: str) -> str:
        """Return the absolute API URL for a server-relative *path* (e.g. ``/status/x``)."""
        return f"{self.base_url}/api/v1{path}"

    def _raise_for_error(self, response: requests.Response) -> None:
        """Raise :class:`AutoDataError` for any non-success response.

        Prefers the server's JSON ``error`` field as the message; falls back
        to the raw body (or the bare HTTP status) when the body is not JSON.
        """
        if not response.ok:
            try:
                msg = response.json().get("error", response.text)
            except Exception:
                # Body was not JSON (HTML error page, empty body, ...).
                msg = response.text or f"HTTP {response.status_code}"
            raise AutoDataError(msg, status_code=response.status_code)

    # ------------------------------------------------------------------
    # Core pipeline methods
    # ------------------------------------------------------------------

    def process(
        self,
        file_path: str,
        target_columns: Union[str, List[str]],
        output_rows: int = 10000,
        tools: Optional[Dict[str, bool]] = None,
        advanced_params: Optional[Dict] = None,
        wait: bool = True,
        poll_interval: int = 2,
        download_path: Union[str, bool, None] = None,
        output_preferences: Optional[List[str]] = None,
        compressed: bool = True,
    ) -> Dict:
        """
        Upload a CSV file and start the AutoData pipeline.

        Args:
            file_path:       Path to the input CSV file.
            target_columns:  Target column name(s) for ML (y-columns). A bare
                             string is wrapped into a one-element list.
            output_rows:     Desired number of rows in the output dataset.
            tools:           Dict of tool toggles. Keys: anomaly, dtc, mdh,
                             dor, cds, dsm, dsg. Default is all standard
                             tools enabled (anomaly/dor off by default).
            advanced_params: Fine-grained parameters:
                             excluded_columns (list[str])
                             text_mode        (int: 0=none, 1=neural, 2=tfidf)
                             text_cleaning    (bool)
                             zscore_limit     (float, default 3.0)
                             dsg_mode         (str: 'copula'|'gan')
                             similarity_p     (float, default 95)
            wait:            If True (default), block until processing
                             completes and download results automatically.
            poll_interval:   Seconds between status polls (default 2).
            download_path:   Directory to save results. Defaults to
                             ``./auto_data_outputs/<session_id>/``. Pass
                             ``False`` to skip the automatic download entirely.
            output_preferences: Subset of filenames to download. Downloads
                             all files when None.
            compressed:      Download as a ZIP archive (default True).

        Returns:
            Dict with session_id, status, and files list. When ``wait`` is
            False, only the immediate server response (session_id + status).

        Raises:
            FileNotFoundError: If file_path does not exist.
            AutoDataError: On API errors or failed processing.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        config = {
            # The server expects a list even for a single target column.
            "target_columns": [target_columns] if isinstance(target_columns, str) else target_columns,
            "output_rows": output_rows,
            "tools": tools or {},
            "advanced_params": advanced_params or {},
        }

        with open(file_path, "rb") as f:
            response = self._session.post(
                self._url("/process"),
                files={"file": (os.path.basename(file_path), f, "text/csv")},
                data={"config": json.dumps(config)},
                timeout=self.timeout,
            )
        self._raise_for_error(response)
        result = response.json()
        session_id = result["session_id"]

        if not wait:
            return result

        try:
            final = self.wait_for_completion(session_id, poll_interval=poll_interval)
        except KeyboardInterrupt:
            # Best effort: don't leave an orphaned job running server-side.
            print("\nInterrupted — cancelling job on server…")
            self.cancel(session_id)
            raise

        # download_path=False is an explicit opt-out of the automatic download;
        # None means "use the default directory".
        if download_path is not False:
            self.download_results(
                session_id,
                download_path=download_path,
                output_preferences=output_preferences,
                compressed=compressed,
            )
        return final

    def get_status(self, session_id: str) -> Dict:
        """
        Poll the processing status of a session.

        Returns a dict with keys: status, message, current_step, total_steps,
        progress_percent, duration_seconds.
        """
        r = self._session.get(self._url(f"/status/{session_id}"), timeout=self.timeout)
        self._raise_for_error(r)
        return r.json()

    def get_result(self, session_id: str) -> Dict:
        """
        Retrieve the final results of a completed session.

        Returns a dict with keys: status, files (list of {name, url, size, description}),
        row_count, duration_seconds.
        """
        r = self._session.get(self._url(f"/result/{session_id}"), timeout=self.timeout)
        self._raise_for_error(r)
        return r.json()

    def cancel(self, session_id: str) -> bool:
        """
        Cancel a running or queued session.

        Returns True if cancellation was acknowledged. API failures are
        reported as a warning and yield False rather than raising, so this
        is safe to call from interrupt/cleanup paths.
        """
        try:
            r = self._session.post(self._url(f"/cancel/{session_id}"), timeout=self.timeout)
            self._raise_for_error(r)
            return r.json().get("cancelled", False)
        except AutoDataError as e:
            print(f"Warning: cancel failed — {e}")
            return False

    def wait_for_completion(self, session_id: str, poll_interval: int = 2) -> Dict:
        """
        Block until a session reaches *completed*, *error*, or *cancelled*.

        Prints progress updates to stdout. Raises ``AutoDataError`` on failure.
        Returns the final result dict (same shape as :meth:`get_result`).
        """
        print(f"Waiting for session {session_id}…")
        while True:
            data = self.get_status(session_id)
            status = data.get("status", "unknown")
            pct = data.get("progress_percent", 0)
            msg = data.get("message", "")

            if status == "completed":
                # Trailing spaces overwrite any longer progress line left by \r.
                print(f"\r✓ Completed ({pct}%): {msg}           ")
                return self.get_result(session_id)
            elif status == "error":
                raise AutoDataError(f"Processing failed: {msg}")
            elif status == "cancelled":
                raise AutoDataError("Processing was cancelled")

            print(f"\r  {pct:3d}% — {msg[:60]:<60}", end="", flush=True)
            time.sleep(poll_interval)

    # ------------------------------------------------------------------
    # File download helpers
    # ------------------------------------------------------------------

    def download_results(
        self,
        session_id: str,
        download_path: Optional[str] = None,
        output_preferences: Optional[List[str]] = None,
        compressed: bool = True,
    ) -> str:
        """
        Download result files for a completed session.

        Args:
            session_id:         Session to download.
            download_path:      Directory to save files (created if missing).
            output_preferences: Subset of filenames to include. None = all.
            compressed:         If True (default), downloads a single ZIP archive
                                and extracts it. If False, downloads each file
                                individually.

        Returns:
            Absolute path to the download directory.

        Raises:
            AutoDataError: On API errors or a corrupt archive.
        """
        if not download_path:
            download_path = os.path.join(os.getcwd(), "auto_data_outputs", session_id)
        os.makedirs(download_path, exist_ok=True)

        if compressed:
            body = {}
            if output_preferences:
                body["files"] = output_preferences
            # Archives can be large — allow at least 5 minutes regardless of
            # the configured per-request timeout.
            r = self._session.post(
                self._url(f"/download-archive/{session_id}"),
                json=body,
                timeout=max(self.timeout, 300),
            )
            self._raise_for_error(r)
            try:
                archive = zipfile.ZipFile(io.BytesIO(r.content))
                archive.extractall(download_path)
                print(f"Extracted results to {download_path}")
            except zipfile.BadZipFile as exc:
                # Chain the cause so debugging shows the underlying zip error.
                raise AutoDataError("Server returned an invalid ZIP archive") from exc
        else:
            result = self.get_result(session_id)
            for f in result.get("files", []):
                fname = f["name"]
                if output_preferences and fname not in output_preferences:
                    continue
                url = f["url"]
                # Server may return relative download URLs.
                if not url.startswith("http"):
                    url = f"{self.base_url}{url}"
                out = os.path.join(download_path, fname)
                with self._session.get(url, stream=True, timeout=max(self.timeout, 300)) as resp:
                    resp.raise_for_status()
                    with open(out, "wb") as fh:
                        for chunk in resp.iter_content(chunk_size=65536):
                            fh.write(chunk)
                print(f"  Downloaded {fname}")

        return os.path.abspath(download_path)

    def download_file(self, url: str, output_path: str) -> None:
        """Download a single file by URL (absolute or server-relative) to *output_path*."""
        if not url.startswith("http"):
            url = f"{self.base_url}{url}"
        with self._session.get(url, stream=True, timeout=max(self.timeout, 300)) as r:
            r.raise_for_status()
            with open(output_path, "wb") as fh:
                for chunk in r.iter_content(chunk_size=65536):
                    fh.write(chunk)
        print(f"Downloaded to {output_path}")

    # ------------------------------------------------------------------
    # Account / API key management
    # ------------------------------------------------------------------

    def list_keys(self) -> List[Dict]:
        """
        List all active API keys for the authenticated account.

        Returns a list of key metadata dicts (no secret values exposed).
        """
        r = self._session.get(self._url("/keys"), timeout=self.timeout)
        self._raise_for_error(r)
        return r.json().get("api_keys", [])

    def get_usage(self) -> Dict:
        """
        Get credit usage statistics for the current API key.

        Returns a dict with:
            daily_credits_used, daily_credit_limit, daily_remaining,
            lifetime_credits_used, lifetime_credit_limit, lifetime_remaining,
            daily_request_count, last_used_at.
        """
        r = self._session.get(self._url("/usage"), timeout=self.timeout)
        self._raise_for_error(r)
        return r.json()

    # ------------------------------------------------------------------
    # Deprecated aliases (backward compat)
    # ------------------------------------------------------------------

    def get_status_deprecated(self, session_id: str) -> Dict:  # noqa: D401
        """Deprecated alias for get_status()."""
        return self.get_status(session_id)

    def cancel_session(self, session_id: str) -> bool:
        """Deprecated alias for cancel(). Use cancel() instead."""
        return self.cancel(session_id)
@@ -0,0 +1,299 @@
1
+ Metadata-Version: 2.4
2
+ Name: datatoolpack
3
+ Version: 0.2.0
4
+ Summary: Official Python SDK for the AutoData ML data preparation pipeline API
5
+ Home-page: https://autodata.datatoolpack.com
6
+ Author: AutoData Team
7
+ Author-email: support@datatoolpack.com
8
+ Project-URL: Documentation, https://autodata.datatoolpack.com/docs
9
+ Project-URL: Bug Tracker, https://github.com/datatoolpack/autodata-client/issues
10
+ Keywords: autodata machine-learning data-preparation synthetic-data ml-pipeline
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.8
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
23
+ Requires-Python: >=3.8
24
+ Description-Content-Type: text/markdown
25
+ Requires-Dist: requests>=2.25.0
26
+ Dynamic: author
27
+ Dynamic: author-email
28
+ Dynamic: classifier
29
+ Dynamic: description
30
+ Dynamic: description-content-type
31
+ Dynamic: home-page
32
+ Dynamic: keywords
33
+ Dynamic: project-url
34
+ Dynamic: requires-dist
35
+ Dynamic: requires-python
36
+ Dynamic: summary
37
+
38
+ # AutoData Python Client
39
+
40
+ Official Python SDK for the [AutoData](https://autodata.datatoolpack.com) ML data preparation pipeline API.
41
+
42
+ ## Installation
43
+
44
+ ```bash
45
+ pip install datatoolpack
46
+ ```
47
+
48
+ Or install from source:
49
+
50
+ ```bash
51
+ git clone https://github.com/datatoolpack/autodata-client
52
+ cd autodata-client
53
+ pip install .
54
+ ```
55
+
56
+ ## Quick Start
57
+
58
+ ```python
59
+ from autodata import AutoDataClient
60
+
61
+ client = AutoDataClient(
62
+ api_key="dtpk_YOUR_API_KEY",
63
+ base_url="https://autodata.datatoolpack.com",
64
+ )
65
+
66
+ result = client.process(
67
+ file_path="data.csv",
68
+ target_columns=["price"],
69
+ output_rows=20000,
70
+ )
71
+ print(result["files"])
72
+ # [{'name': 'dsg.csv', 'url': '/download/...', 'size': 1048576, 'description': '...'}]
73
+ ```
74
+
75
+ Get your API key from the [AutoData dashboard](https://autodata.datatoolpack.com/dashboard) → API Keys tab.
76
+
77
+ ---
78
+
79
+ ## Reference
80
+
81
+ ### `AutoDataClient(api_key, base_url, timeout)`
82
+
83
+ | Parameter | Type | Default | Description |
84
+ |------------|-------|--------------------------------------|------------------------------------|
85
+ | `api_key` | `str` | required | API key starting with `dtpk_` |
86
+ | `base_url` | `str` | `"https://autodata.datatoolpack.com"` | Server URL (no trailing slash) |
87
+ | `timeout` | `int` | `120` | Request timeout in seconds |
88
+
89
+ ---
90
+
91
+ ### `client.process(...)` — Upload & run pipeline
92
+
93
+ ```python
94
+ result = client.process(
95
+ file_path="data.csv", # Path to input CSV
96
+ target_columns=["price"], # y-column(s) for ML
97
+ output_rows=20000, # Target row count in output
98
+ tools={ # Toggle pipeline steps (all optional)
99
+ "anomaly": False, # Anomaly detection (off by default)
100
+ "dtc": True, # Data Type Conversion
101
+ "mdh": True, # Missing Data Handler
102
+ "cds": True, # Column Scaling
103
+ "dsm": True, # Data Split Manager
104
+ "dsg": True, # Synthetic Data Generator
105
+ },
106
+ advanced_params={ # Fine-grained parameters (all optional)
107
+ "excluded_columns": ["id"], # Columns to drop before processing
108
+ "text_mode": 0, # 0=none, 1=neural, 2=tfidf
109
+ "text_cleaning": True, # Clean text before encoding
110
+ "zscore_limit": 3.0, # Z-score outlier threshold
111
+ "dsg_mode": "copula", # "copula" or "gan"
112
+ "similarity_p": 95, # Similarity percentile for DSG
113
+ },
114
+ wait=True, # Block until complete (default True)
115
+ poll_interval=2, # Status poll interval in seconds
116
+ download_path="./outputs/", # Where to save files (default auto)
117
+ output_preferences=["dsg.csv"], # Which files to download (default all)
118
+ compressed=True, # Download as ZIP (default True)
119
+ )
120
+ ```
121
+
122
+ **Returns** a dict:
123
+
124
+ ```python
125
+ {
126
+ "session_id": "abc123...",
127
+ "status": "completed",
128
+ "files": [
129
+ {"name": "dsg.csv", "url": "/download/.../dsg.csv", "size": 2097152, "description": "Synthetic data"},
130
+ {"name": "dsm_train.csv", ...},
131
+ ...
132
+ ],
133
+ "row_count": 20000,
134
+ "duration_seconds": 42.1,
135
+ }
136
+ ```
137
+
138
+ Set `wait=False` to get back immediately with just `session_id` and `status`:
139
+
140
+ ```python
141
+ result = client.process(file_path="data.csv", target_columns="price", wait=False)
142
+ session_id = result["session_id"]
143
+ ```
144
+
145
+ ---
146
+
147
+ ### `client.get_status(session_id)` — Poll progress
148
+
149
+ ```python
150
+ status = client.get_status(session_id)
151
+ # {
152
+ # "status": "running", # queued | running | completed | error | cancelled
153
+ # "message": "Running MDH...",
154
+ # "current_step": 3,
155
+ # "total_steps": 6,
156
+ # "progress_percent": 50,
157
+ # "duration_seconds": 15.3,
158
+ # }
159
+ ```
160
+
161
+ ---
162
+
163
+ ### `client.get_result(session_id)` — Fetch completed results
164
+
165
+ ```python
166
+ result = client.get_result(session_id)
167
+ # {"status": "completed", "files": [...], "row_count": ..., "duration_seconds": ...}
168
+ ```
169
+
170
+ ---
171
+
172
+ ### `client.wait_for_completion(session_id, poll_interval)` — Block until done
173
+
174
+ ```python
175
+ result = client.wait_for_completion(session_id, poll_interval=3)
176
+ ```
177
+
178
+ Prints live progress to stdout. Raises `AutoDataError` if processing fails.
179
+
180
+ ---
181
+
182
+ ### `client.cancel(session_id)` — Cancel a running job
183
+
184
+ ```python
185
+ cancelled = client.cancel(session_id) # True if acknowledged
186
+ ```
187
+
188
+ ---
189
+
190
+ ### `client.download_results(session_id, ...)` — Download output files
191
+
192
+ ```python
193
+ path = client.download_results(
194
+ session_id,
195
+ download_path="./my_outputs/", # Directory to save into
196
+ output_preferences=["dsg.csv"], # Specific files only (None = all)
197
+ compressed=True, # ZIP download (default) or individual files
198
+ )
199
+ print(f"Saved to {path}")
200
+ ```
201
+
202
+ ---
203
+
204
+ ### `client.download_file(url, output_path)` — Download a single file
205
+
206
+ ```python
207
+ client.download_file("/download/abc123.../dsg.csv", "dsg.csv")
208
+ ```
209
+
210
+ ---
211
+
212
+ ### `client.list_keys()` — List API keys
213
+
214
+ ```python
215
+ keys = client.list_keys()
216
+ # [{"id": "...", "name": "My Key", "prefix": "dtpk_abc123", "created_at": "..."}]
217
+ ```
218
+
219
+ ---
220
+
221
+ ### `client.get_usage()` — Usage statistics
222
+
223
+ ```python
224
+ usage = client.get_usage()
225
+ # {
226
+ # "daily_credits_used": 500,
227
+ # "daily_credit_limit": 10000,
228
+ # "daily_remaining": 9500,
229
+ # "lifetime_credits_used": 12340,
230
+ # "lifetime_credit_limit": 1000000,
231
+ # "lifetime_remaining": 987660,
232
+ # "daily_request_count": 3,
233
+ # "last_used_at": "2026-04-12T10:30:00Z",
234
+ # }
235
+ ```
236
+
237
+ ---
238
+
239
+ ## Error Handling
240
+
241
+ All API errors raise `AutoDataError`:
242
+
243
+ ```python
244
+ from autodata import AutoDataClient, AutoDataError
245
+
246
+ client = AutoDataClient(api_key="dtpk_...")
247
+
248
+ try:
249
+ result = client.process("data.csv", target_columns="price")
250
+ except AutoDataError as e:
251
+ print(f"API error {e.status_code}: {e}")
252
+ except FileNotFoundError as e:
253
+ print(f"File not found: {e}")
254
+ ```
255
+
256
+ `AutoDataError` attributes:
257
+ - `str(e)` — human-readable error message from the server
258
+ - `e.status_code` — HTTP status code (e.g. `401`, `429`, `500`), or `None` for non-HTTP errors
259
+
260
+ ---
261
+
262
+ ## Advanced Example: Non-blocking with manual polling
263
+
264
+ ```python
265
+ import time
266
+ from autodata import AutoDataClient, AutoDataError
267
+
268
+ client = AutoDataClient(api_key="dtpk_...")
269
+
270
+ # Start job without blocking
271
+ job = client.process("large_dataset.csv", target_columns=["churn"], wait=False)
272
+ session_id = job["session_id"]
273
+ print(f"Job started: {session_id}")
274
+
275
+ # Poll manually
276
+ while True:
277
+ status = client.get_status(session_id)
278
+ print(f" {status['progress_percent']}% — {status['message']}")
279
+ if status["status"] == "completed":
280
+ break
281
+ elif status["status"] in ("error", "cancelled"):
282
+ raise AutoDataError(f"Job {status['status']}: {status['message']}")
283
+ time.sleep(5)
284
+
285
+ # Download results
286
+ path = client.download_results(session_id, download_path="./outputs/")
287
+ print(f"Results saved to {path}")
288
+ ```
289
+
290
+ ---
291
+
292
+ ## Requirements
293
+
294
+ - Python ≥ 3.8
295
+ - `requests` ≥ 2.25.0
296
+
297
+ ## License
298
+
299
+ MIT
@@ -0,0 +1,10 @@
1
+ README.md
2
+ pyproject.toml
3
+ setup.py
4
+ autodata/__init__.py
5
+ autodata/client.py
6
+ datatoolpack.egg-info/PKG-INFO
7
+ datatoolpack.egg-info/SOURCES.txt
8
+ datatoolpack.egg-info/dependency_links.txt
9
+ datatoolpack.egg-info/requires.txt
10
+ datatoolpack.egg-info/top_level.txt
@@ -0,0 +1 @@
1
+ requests>=2.25.0
@@ -0,0 +1 @@
1
+ autodata
@@ -0,0 +1,3 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,41 @@
1
"""Packaging script for the datatoolpack distribution."""
from setuptools import setup, find_packages
import os

# The long description is the README that ships alongside this script.
_ROOT = os.path.abspath(os.path.dirname(__file__))
with open(os.path.join(_ROOT, "README.md"), encoding="utf-8") as readme:
    _LONG_DESCRIPTION = readme.read()

_CLASSIFIERS = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
    "Topic :: Software Development :: Libraries :: Python Modules",
]

setup(
    name="datatoolpack",
    version="0.2.0",
    description="Official Python SDK for the AutoData ML data preparation pipeline API",
    long_description=_LONG_DESCRIPTION,
    long_description_content_type="text/markdown",
    author="AutoData Team",
    author_email="support@datatoolpack.com",
    url="https://autodata.datatoolpack.com",
    project_urls={
        "Documentation": "https://autodata.datatoolpack.com/docs",
        "Bug Tracker": "https://github.com/datatoolpack/autodata-client/issues",
    },
    packages=find_packages(exclude=["tests*"]),
    python_requires=">=3.8",
    install_requires=[
        "requests>=2.25.0",
    ],
    classifiers=_CLASSIFIERS,
    keywords="autodata machine-learning data-preparation synthetic-data ml-pipeline",
)