datalier 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
datalier-0.1.0/LICENSE ADDED
@@ -0,0 +1,10 @@
1
+
2
+ MIT License
3
+
4
+ Copyright (c) 2026 Concave AI
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software", "Datalier by Concave AI"), to deal in the Software ("Datalier by Concave AI") without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software ("Datalier by Concave AI") is furnished to do so, subject to the following conditions:
7
+
8
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software ("Datalier by Concave AI").
9
+
10
+ THE SOFTWARE ("Datalier by Concave AI") IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE ("Datalier by Concave AI") OR THE USE OR OTHER DEALINGS IN THE SOFTWARE ("Datalier by Concave AI").
@@ -0,0 +1,2 @@
1
+ include README.md
2
+ include LICENSE
@@ -0,0 +1,105 @@
1
+ Metadata-Version: 2.4
2
+ Name: datalier
3
+ Version: 0.1.0
4
+ Summary: Datalier by Concave AI - data infrastructure for AI model training
5
+ Author-email: Aniket Nerali <aniket.nerali@theconcaveai.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://theconcaveai.com
8
+ Project-URL: Documentation, https://theconcaveai.com/docs
9
+ Project-URL: Repository, https://github.com/thesineo/concave-platform
10
+ Project-URL: Issues, https://github.com/thesineo/concave-platform/issues
11
+ Keywords: datalier,concave-ai,ai,machine-learning,training-data,annotation,rlhf,rlaif,data-quality,data-infrastructure
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.8
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
23
+ Requires-Python: >=3.8
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: httpx>=0.27.0
27
+ Dynamic: license-file
28
+
29
+ # Datalier — by Concave AI
30
+
31
+ Data infrastructure for AI model training. One SDK — raw data in, quality-assured training data out.
32
+
33
+ ## Installation
34
+
35
+ ```bash
36
+ pip install datalier
37
+ ```
38
+
39
+ ## Quick Start
40
+
41
+ ```python
42
+ from datalier import DatalierClient
43
+
44
+ client = DatalierClient(
45
+ api_key="sk_concave_...",
46
+ base_url="https://api.theconcaveai.com", # or http://localhost:8000 for local
47
+ )
48
+
49
+ # Upload a dataset
50
+ dataset = client.upload("training_pairs.jsonl", dataset_type="rlhf")
51
+ print(f"Uploaded: {dataset.id} — {dataset.row_count} rows")
52
+
53
+ # Transform (validate, deduplicate, detect PII)
54
+ result = client.transform(dataset.id, steps=["validate", "dedup", "pii_scan"])
55
+ print(f"PII found: {result['pii_detection']['total_pii_found']}")
56
+
57
+ # Label with RLAIF (AI handles 80-90%, humans review edge cases)
58
+ result = client.label(dataset.id, dataset_type="rlhf", min_kappa=0.70)
59
+ print(f"Auto-labeled: {result['ai_labeled']}/{result['total_tasks']}")
60
+
61
+ # Check quality metrics
62
+ quality = client.get_quality(dataset.id)
63
+ print(f"Kappa: {quality.kappa} | Gold Accuracy: {quality.gold_accuracy}")
64
+
65
+ # Approve and version
66
+ client.approve(dataset.id)
67
+ version = client.snapshot(dataset.id)
68
+ print(f"Snapshot: {version['version']} — hash: {version['snapshot_hash'][:8]}...")
69
+
70
+ # Export as DPO format for training
71
+ download_url = client.export(dataset.id, fmt="dpo", version="v1.0")
72
+
73
+ # Monitor model performance (Layer 5)
74
+ model = client.register_model("my_model_v1", trained_on_dataset_id=dataset.id, trained_on_version="v1.0")
75
+ client.submit_metrics(model.model_id, accuracy=0.89, f1=0.85)
76
+ drift = client.get_drift(model.model_id)
77
+ ```
78
+
79
+ ## Platform Layers
80
+
81
+ | Layer | Function | SDK Methods |
82
+ |-------|----------|-------------|
83
+ | **1 - Ingest** | Upload from any source | `upload()`, `list_datasets()`, `get_dataset()` |
84
+ | **2 - Prepare** | Validate, dedup, PII scan | `transform()`, `get_profile()`, `get_pii_report()`, `redact()` |
85
+ | **3 - Label** | RLAIF + human review | `label()`, `get_quality()`, `approve()` |
86
+ | **4 - Version** | Snapshots + lineage | `snapshot()`, `list_versions()`, `get_lineage()`, `rollback()`, `export()` |
87
+ | **5 - Observe** | Monitor + re-label loop | `register_model()`, `submit_metrics()`, `get_drift()`, `trigger_relabel()` |
88
+
89
+ ## Using with Local Dev Server
90
+
91
+ ```python
92
+ client = DatalierClient(
93
+ api_key="your-jwt-token",
94
+ base_url="http://localhost:8000",
95
+ )
96
+ ```
97
+
98
+ ## Links
99
+
100
+ - **Website**: [theconcaveai.com](https://theconcaveai.com)
101
+ - **Documentation**: [theconcaveai.com/docs](https://theconcaveai.com/docs)
102
+
103
+ ## License
104
+
105
+ MIT — Concave AI 2026
@@ -0,0 +1,77 @@
1
+ # Datalier — by Concave AI
2
+
3
+ Data infrastructure for AI model training. One SDK — raw data in, quality-assured training data out.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install datalier
9
+ ```
10
+
11
+ ## Quick Start
12
+
13
+ ```python
14
+ from datalier import DatalierClient
15
+
16
+ client = DatalierClient(
17
+ api_key="sk_concave_...",
18
+ base_url="https://api.theconcaveai.com", # or http://localhost:8000 for local
19
+ )
20
+
21
+ # Upload a dataset
22
+ dataset = client.upload("training_pairs.jsonl", dataset_type="rlhf")
23
+ print(f"Uploaded: {dataset.id} — {dataset.row_count} rows")
24
+
25
+ # Transform (validate, deduplicate, detect PII)
26
+ result = client.transform(dataset.id, steps=["validate", "dedup", "pii_scan"])
27
+ print(f"PII found: {result['pii_detection']['total_pii_found']}")
28
+
29
+ # Label with RLAIF (AI handles 80-90%, humans review edge cases)
30
+ result = client.label(dataset.id, dataset_type="rlhf", min_kappa=0.70)
31
+ print(f"Auto-labeled: {result['ai_labeled']}/{result['total_tasks']}")
32
+
33
+ # Check quality metrics
34
+ quality = client.get_quality(dataset.id)
35
+ print(f"Kappa: {quality.kappa} | Gold Accuracy: {quality.gold_accuracy}")
36
+
37
+ # Approve and version
38
+ client.approve(dataset.id)
39
+ version = client.snapshot(dataset.id)
40
+ print(f"Snapshot: {version['version']} — hash: {version['snapshot_hash'][:8]}...")
41
+
42
+ # Export as DPO format for training
43
+ download_url = client.export(dataset.id, fmt="dpo", version="v1.0")
44
+
45
+ # Monitor model performance (Layer 5)
46
+ model = client.register_model("my_model_v1", trained_on_dataset_id=dataset.id, trained_on_version="v1.0")
47
+ client.submit_metrics(model.model_id, accuracy=0.89, f1=0.85)
48
+ drift = client.get_drift(model.model_id)
49
+ ```
50
+
51
+ ## Platform Layers
52
+
53
+ | Layer | Function | SDK Methods |
54
+ |-------|----------|-------------|
55
+ | **1 - Ingest** | Upload from any source | `upload()`, `list_datasets()`, `get_dataset()` |
56
+ | **2 - Prepare** | Validate, dedup, PII scan | `transform()`, `get_profile()`, `get_pii_report()`, `redact()` |
57
+ | **3 - Label** | RLAIF + human review | `label()`, `get_quality()`, `approve()` |
58
+ | **4 - Version** | Snapshots + lineage | `snapshot()`, `list_versions()`, `get_lineage()`, `rollback()`, `export()` |
59
+ | **5 - Observe** | Monitor + re-label loop | `register_model()`, `submit_metrics()`, `get_drift()`, `trigger_relabel()` |
60
+
61
+ ## Using with Local Dev Server
62
+
63
+ ```python
64
+ client = DatalierClient(
65
+ api_key="your-jwt-token",
66
+ base_url="http://localhost:8000",
67
+ )
68
+ ```
69
+
70
+ ## Links
71
+
72
+ - **Website**: [theconcaveai.com](https://theconcaveai.com)
73
+ - **Documentation**: [theconcaveai.com/docs](https://theconcaveai.com/docs)
74
+
75
+ ## License
76
+
77
+ MIT — Concave AI 2026
@@ -0,0 +1,7 @@
1
# Package entry point: re-exports the public names of ``datalier.client``.
from datalier.client import (
    DatalierClient,
    DatasetInfo,
    QualityInfo,
    ModelInfo,
)

__version__ = "0.1.0"

# Backwards-compat alias: ``JobInfo`` is the very same class object as
# ``ModelInfo``, kept so that older imports of ``JobInfo`` keep resolving.
JobInfo = ModelInfo

__all__ = [
    "DatalierClient",
    "DatasetInfo",
    "QualityInfo",
    "ModelInfo",
    "JobInfo",
]
@@ -0,0 +1,355 @@
1
+ """
2
+ Datalier by Concave AI — Python SDK
3
+
4
+ pip install datalier
5
+
6
+ Usage:
7
+ from datalier import DatalierClient
8
+
9
+ client = DatalierClient(api_key="sk_concave_...", base_url="http://localhost:8000")
10
+
11
+ dataset = client.upload("data.jsonl", dataset_type="rlhf")
12
+ client.transform(dataset.id)
13
+ client.label(dataset.id)
14
+ quality = client.get_quality(dataset.id)
15
+ client.approve(dataset.id)
16
+ client.snapshot(dataset.id)
17
+ url = client.export(dataset.id, fmt="dpo")
18
+ """
19
+ import httpx
20
+ from typing import Optional, List
21
+ from dataclasses import dataclass, field
22
+
23
+
24
@dataclass
class DatasetInfo:
    """Client-side summary of a dataset, as built by ``upload()`` and ``get_dataset()``."""

    # Server-assigned dataset identifier.
    id: str
    # Display name of the dataset.
    name: str
    # Dataset type label (the client passes values such as "generic" or "rlhf").
    type: str
    # Status string exactly as reported by the API.
    status: str
    # File format string exactly as reported by the API.
    file_format: str
    # Number of rows reported by the API.
    row_count: int
    # Schema mapping; each instance gets its own fresh dict when omitted.
    schema: dict = field(default_factory=dict)
33
+
34
+
35
@dataclass
class QualityInfo:
    """Quality metrics for one dataset, as built by ``get_quality()``."""

    # Identifier of the dataset the metrics belong to.
    dataset_id: str
    # Kappa agreement score; ``get_quality()`` falls back to -1.0 when absent.
    kappa: float
    # Gold-set accuracy as reported by the API.
    gold_accuracy: float
    # Bias score as reported by the API.
    bias_score: float
    # Overall status string as reported by the API.
    overall_status: str
    # Automation rate as reported by the API; 0.0 when not supplied.
    automation_rate: float = 0.0
43
+
44
+
45
@dataclass
class ModelInfo:
    """Summary of a model registered for monitoring, as built by ``register_model()``.

    Fields are normally read as attributes (``model.model_id``).  Read-only
    dict-style access (``model["model_id"]``) is also supported because the
    published Quick Start subscripts the value returned by
    ``register_model()``; without it that example raises ``TypeError``.
    """

    # Server-assigned model identifier.
    model_id: str
    # Model name as reported by the API.
    name: str
    # Status string exactly as reported by the API.
    status: str
    # Drift score; 0.0 when not supplied.
    drift_score: float = 0.0

    def __getitem__(self, key: str):
        """Return the attribute named ``key``.

        Raises:
            KeyError: if ``key`` is not an attribute set on this instance.
        """
        # The instance __dict__ holds exactly the dataclass fields (plus any
        # attribute a caller attached later), so a missing key surfaces as
        # the KeyError a dict would raise.
        return vars(self)[key]
51
+
52
+
53
class DatalierAPIError(Exception):
    """Raised when the API answers with an HTTP status code of 400 or above.

    It subclasses ``Exception`` and keeps the historical message format
    ("API Error <status>: <detail>"), so existing ``except Exception``
    handlers and message checks keep working.  The status code and the
    extracted detail are additionally exposed as attributes.
    """

    def __init__(self, status_code: int, detail) -> None:
        super().__init__(f"API Error {status_code}: {detail}")
        self.status_code = status_code
        self.detail = detail


class DatalierClient:
    """
    Datalier by Concave AI — SDK Client.

    Provides programmatic access to all 5 platform layers:
      1. Ingest   — upload and connect data sources
      2. Prepare  — validate, deduplicate, detect and redact PII
      3. Label    — RLAIF pre-labeling with human review loop
      4. Version  — immutable snapshots with full lineage
      5. Observe  — model performance monitoring + drift detection

    Every public method issues exactly one HTTP request through a shared
    ``httpx.Client`` and returns either the decoded JSON body or a small
    dataclass built from it.  The instance can be used as a context manager,
    which closes the underlying HTTP client on exit.
    """

    def __init__(
        self,
        api_key: str,
        base_url: str = "https://api.theconcaveai.com",
    ):
        """Create a client that sends ``api_key`` as a Bearer token.

        Args:
            api_key: Credential placed in the ``Authorization`` header.
            base_url: API root; trailing slashes are stripped.
        """
        self.api_key = api_key
        self.base_url = base_url.rstrip("/")
        self._client = httpx.Client(
            base_url=self.base_url,
            headers={"Authorization": f"Bearer {api_key}"},
            timeout=120.0,
        )

    def _request(self, method: str, path: str, **kwargs) -> dict:
        """Send one request and return the decoded JSON body.

        Args:
            method: HTTP verb, e.g. "GET" or "POST".
            path: Path relative to ``base_url``.
            **kwargs: Forwarded unchanged to ``httpx.Client.request``.

        Returns:
            The decoded JSON body, or ``{}`` when a successful response
            carries no body at all (e.g. 204 No Content).

        Raises:
            DatalierAPIError: if the response status is 400 or above.
        """
        response = self._client.request(method, path, **kwargs)
        if response.status_code >= 400:
            try:
                detail = response.json().get("detail", response.text)
            except (ValueError, AttributeError):
                # Body is not JSON (ValueError) or is JSON but not an object
                # (AttributeError on .get): fall back to the raw text.
                detail = response.text
            raise DatalierAPIError(response.status_code, detail)
        if not response.content:
            # An empty success body cannot be decoded as JSON.
            return {}
        return response.json()

    def close(self):
        """Close the underlying HTTP client."""
        self._client.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()

    # ─── Layer 1: Ingest ──────────────────────────────────────────────────────

    @staticmethod
    def _filename_of(filepath) -> str:
        """Return the final path component of ``filepath`` (str or path-like)."""
        # Local import: the module header does not import ``os``.
        import os

        return os.path.basename(os.fspath(filepath))

    def upload(self, filepath: str, name: Optional[str] = None,
               dataset_type: str = "generic") -> "DatasetInfo":
        """Upload a local file and create a new dataset.

        Args:
            filepath: Path of the file to upload; ``str`` or any path-like
                object is accepted.
            name: Dataset name; defaults to the file's base name.
            dataset_type: Type label sent to the API and stored on the result.

        Returns:
            A ``DatasetInfo`` built from the upload response.

        Raises:
            OSError: if the file cannot be opened.
            DatalierAPIError: if the API rejects the upload.
        """
        # os.path.basename honours the platform's separators, unlike a manual
        # split on "/", and os.fspath lets pathlib.Path arguments through.
        filename = self._filename_of(filepath)
        with open(filepath, "rb") as handle:
            result = self._request(
                "POST", "/api/v1/datasets/upload",
                data={"name": name or filename, "dataset_type": dataset_type},
                files={"file": (filename, handle)},
            )
        return DatasetInfo(
            id=result["dataset_id"],
            name=result["name"],
            type=dataset_type,
            status=result["status"],
            file_format=result["file_format"],
            row_count=result["row_count"],
            schema=result.get("schema_json", {}),
        )

    def get_dataset(self, dataset_id: str) -> "DatasetInfo":
        """Get dataset details; optional response keys fall back to defaults."""
        d = self._request("GET", f"/api/v1/datasets/{dataset_id}")
        return DatasetInfo(
            id=d["id"],
            name=d["name"],
            type=d.get("type", "generic"),
            status=d["status"],
            file_format=d.get("file_format", ""),
            row_count=d.get("row_count", 0),
            schema=d.get("schema_json", {}),
        )

    # ─── Layer 2: Prepare ─────────────────────────────────────────────────────

    def transform(self, dataset_id: str,
                  steps: Optional[List[str]] = None) -> dict:
        """
        Run the data preparation pipeline.

        ``steps`` defaults to ``["validate", "dedup", "pii_scan"]`` when
        omitted or empty.
        Returns full results: validation report, dedup stats, PII findings.
        """
        steps = steps or ["validate", "dedup", "pii_scan"]
        return self._request(
            "POST", f"/api/v1/datasets/{dataset_id}/transform",
            json={"steps": steps},
        )

    def get_profile(self, dataset_id: str) -> dict:
        """Get data profile — column types, null rates, unique counts."""
        return self._request("GET", f"/api/v1/datasets/{dataset_id}/profile")

    def get_pii_report(self, dataset_id: str) -> dict:
        """Get PII detection report with findings per field."""
        return self._request("GET", f"/api/v1/datasets/{dataset_id}/pii")

    def redact(self, dataset_id: str, fields: List[str],
               method: str = "mask") -> dict:
        """Redact PII fields. method: 'mask' | 'hash' | 'remove'"""
        return self._request(
            "POST", f"/api/v1/datasets/{dataset_id}/redact",
            json={"fields": fields, "method": method},
        )

    # ─── Layer 3: Label ───────────────────────────────────────────────────────

    def label(self, dataset_id: str, dataset_type: str = "rlhf",
              rlaif_model: str = "gpt-4o-mini",
              confidence_threshold: float = 0.85,
              min_kappa: float = 0.70) -> dict:
        """
        Start the RLAIF pre-labeling pipeline.

        ``dataset_type`` is sent to the API under the JSON key ``"type"``.
        Returns: total_tasks, ai_labeled, human_review, automation_rate.
        """
        return self._request(
            "POST", f"/api/v1/datasets/{dataset_id}/label",
            json={
                "type": dataset_type,
                "rlaif_model": rlaif_model,
                "confidence_threshold": confidence_threshold,
                "min_kappa": min_kappa,
            },
        )

    def get_quality(self, dataset_id: str) -> "QualityInfo":
        """Get real-time quality metrics: kappa, gold accuracy, bias score.

        Keys missing from the response fall back to defaults; in particular
        ``kappa`` becomes -1.0 and ``overall_status`` "pending_reviews".
        """
        q = self._request("GET", f"/api/v1/datasets/{dataset_id}/quality")
        return QualityInfo(
            dataset_id=q.get("dataset_id", dataset_id),
            kappa=q.get("kappa", -1.0),
            gold_accuracy=q.get("gold_accuracy", 0.0),
            bias_score=q.get("bias_score", 0.0),
            overall_status=q.get("overall_status", "pending_reviews"),
            automation_rate=q.get("automation_rate", 0.0),
        )

    def list_tasks(self, dataset_id: str,
                   status: Optional[str] = None) -> dict:
        """List tasks, optionally filtered by status (e.g. 'human_review')."""
        params = {}
        if status:
            params["status"] = status
        return self._request("GET", f"/api/v1/datasets/{dataset_id}/tasks",
                             params=params)

    def submit_review(self, dataset_id: str, task_id: str,
                      label: dict, reasoning: str = "") -> dict:
        """Submit a human annotation for a task.

        ``reasoning`` is only included in the request body when non-empty.
        """
        body = {"label": label}
        if reasoning:
            body["reasoning"] = reasoning
        return self._request(
            "PUT", f"/api/v1/datasets/{dataset_id}/tasks/{task_id}",
            json=body,
        )

    def approve(self, dataset_id: str) -> dict:
        """Approve a labeled dataset — marks it ready for versioning."""
        return self._request(
            "POST", f"/api/v1/datasets/{dataset_id}/label/approve"
        )

    # ─── Layer 4: Version ─────────────────────────────────────────────────────

    def snapshot(self, dataset_id: str, bump: str = "minor") -> dict:
        """
        Create an immutable versioned snapshot.

        ``bump`` is sent as a query parameter.
        Returns: version, snapshot_hash, row_count, lineage_summary.
        """
        return self._request(
            "POST", f"/api/v1/datasets/{dataset_id}/snapshot",
            params={"bump": bump},
        )

    def list_versions(self, dataset_id: str) -> dict:
        """List all versions with quality scores."""
        return self._request("GET", f"/api/v1/datasets/{dataset_id}/versions")

    def get_lineage(self, dataset_id: str, version: str) -> dict:
        """Get full provenance for a dataset version."""
        return self._request(
            "GET", f"/api/v1/datasets/{dataset_id}/versions/{version}/lineage"
        )

    def rollback(self, dataset_id: str, version: str) -> dict:
        """Rollback the active dataset to a previous version."""
        return self._request(
            "POST", f"/api/v1/datasets/{dataset_id}/rollback/{version}"
        )

    def diff(self, dataset_id: str, version_a: str, version_b: str) -> dict:
        """Compare two versions — shows added, removed, unchanged rows."""
        return self._request(
            "GET", f"/api/v1/datasets/{dataset_id}/diff/{version_a}/{version_b}"
        )

    def export(self, dataset_id: str, fmt: str = "jsonl",
               version: Optional[str] = None) -> str:
        """
        Export in training-ready format.

        fmt: 'jsonl' | 'dpo' | 'conll' | 'coco' | 'csv' | 'json'
        ``version`` is only sent when given.
        Returns the download URL (the response's ``download_url`` value).
        """
        params: dict = {"format": fmt}
        if version:
            params["version"] = version
        result = self._request(
            "POST", f"/api/v1/datasets/{dataset_id}/export",
            params=params,
        )
        return result["download_url"]

    # ─── Layer 5: Observe ─────────────────────────────────────────────────────

    def register_model(self, name: str,
                       trained_on_dataset_id: Optional[str] = None,
                       trained_on_version: Optional[str] = None,
                       min_accuracy: float = 0.80,
                       max_drift_psi: float = 0.15) -> "ModelInfo":
        """Register a trained model for performance monitoring.

        Returns:
            A ``ModelInfo`` built from the response's ``model_id``, ``name``
            and ``status`` values.
        """
        result = self._request(
            "POST", "/api/v1/models/register",
            json={
                "name": name,
                "trained_on_dataset_id": trained_on_dataset_id,
                "trained_on_version": trained_on_version,
                "min_accuracy": min_accuracy,
                "max_drift_psi": max_drift_psi,
            },
        )
        return ModelInfo(
            model_id=result["model_id"],
            name=result["name"],
            status=result["status"],
        )

    def submit_metrics(self, model_id: str, **metrics) -> dict:
        """
        Submit performance metrics from your training platform.

        Accepts any keyword args: accuracy=0.89, f1=0.85, loss=0.31, etc.
        The keyword arguments are sent as the JSON body unchanged.  Per the
        SDK's original documentation the service then detects degradation
        and triggers a re-labeling plan; that happens server-side and is not
        visible in this client.
        """
        return self._request(
            "POST", f"/api/v1/models/{model_id}/metrics",
            json=metrics,
        )

    def ingest_webhook(self, model_id: str, payload: dict,
                       source: str = "auto") -> dict:
        """
        Push metrics from MLflow, W&B, HuggingFace Evaluate, or generic.

        source: 'auto' | 'mlflow' | 'wandb' | 'huggingface' | 'generic'
        """
        return self._request(
            "POST", f"/api/v1/models/{model_id}/webhook-ingest",
            json={"source": source, "payload": payload},
        )

    def submit_drift(self, model_id: str,
                     training_distribution: dict,
                     production_distribution: dict) -> dict:
        """Submit training vs production distributions for PSI drift analysis."""
        return self._request(
            "POST", f"/api/v1/models/{model_id}/drift",
            json={
                "training_distribution": training_distribution,
                "production_distribution": production_distribution,
            },
        )

    def get_drift(self, model_id: str) -> dict:
        """Get latest drift score and history for a model."""
        return self._request("GET", f"/api/v1/models/{model_id}/drift")

    def get_performance(self, model_id: str) -> dict:
        """Get full performance history with trend analysis."""
        return self._request("GET", f"/api/v1/models/{model_id}/performance")

    def trigger_relabel(self, model_id: str,
                        categories: Optional[List[str]] = None) -> dict:
        """Generate a re-labeling plan and get step-by-step loop instructions.

        ``categories`` is only included in the request body when non-empty.
        """
        body = {}
        if categories:
            body["categories"] = categories
        return self._request(
            "POST", f"/api/v1/models/{model_id}/trigger-relabel",
            json=body,
        )

    def get_relabel_jobs(self, model_id: str) -> dict:
        """Get full re-labeling job history for a model."""
        return self._request(
            "GET", f"/api/v1/models/{model_id}/relabel-jobs"
        )

    def dashboard(self) -> dict:
        """Platform-wide health dashboard — all models, alerts, quality."""
        return self._request("GET", "/api/v1/observe/dashboard")


# Alias for backwards compatibility
ConcaveClient = DatalierClient
@@ -0,0 +1,105 @@
1
+ Metadata-Version: 2.4
2
+ Name: datalier
3
+ Version: 0.1.0
4
+ Summary: Datalier by Concave AI - data infrastructure for AI model training
5
+ Author-email: Aniket Nerali <aniket.nerali@theconcaveai.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://theconcaveai.com
8
+ Project-URL: Documentation, https://theconcaveai.com/docs
9
+ Project-URL: Repository, https://github.com/thesineo/concave-platform
10
+ Project-URL: Issues, https://github.com/thesineo/concave-platform/issues
11
+ Keywords: datalier,concave-ai,ai,machine-learning,training-data,annotation,rlhf,rlaif,data-quality,data-infrastructure
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.8
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
23
+ Requires-Python: >=3.8
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: httpx>=0.27.0
27
+ Dynamic: license-file
28
+
29
+ # Datalier — by Concave AI
30
+
31
+ Data infrastructure for AI model training. One SDK — raw data in, quality-assured training data out.
32
+
33
+ ## Installation
34
+
35
+ ```bash
36
+ pip install datalier
37
+ ```
38
+
39
+ ## Quick Start
40
+
41
+ ```python
42
+ from datalier import DatalierClient
43
+
44
+ client = DatalierClient(
45
+ api_key="sk_concave_...",
46
+ base_url="https://api.theconcaveai.com", # or http://localhost:8000 for local
47
+ )
48
+
49
+ # Upload a dataset
50
+ dataset = client.upload("training_pairs.jsonl", dataset_type="rlhf")
51
+ print(f"Uploaded: {dataset.id} — {dataset.row_count} rows")
52
+
53
+ # Transform (validate, deduplicate, detect PII)
54
+ result = client.transform(dataset.id, steps=["validate", "dedup", "pii_scan"])
55
+ print(f"PII found: {result['pii_detection']['total_pii_found']}")
56
+
57
+ # Label with RLAIF (AI handles 80-90%, humans review edge cases)
58
+ result = client.label(dataset.id, dataset_type="rlhf", min_kappa=0.70)
59
+ print(f"Auto-labeled: {result['ai_labeled']}/{result['total_tasks']}")
60
+
61
+ # Check quality metrics
62
+ quality = client.get_quality(dataset.id)
63
+ print(f"Kappa: {quality.kappa} | Gold Accuracy: {quality.gold_accuracy}")
64
+
65
+ # Approve and version
66
+ client.approve(dataset.id)
67
+ version = client.snapshot(dataset.id)
68
+ print(f"Snapshot: {version['version']} — hash: {version['snapshot_hash'][:8]}...")
69
+
70
+ # Export as DPO format for training
71
+ download_url = client.export(dataset.id, fmt="dpo", version="v1.0")
72
+
73
+ # Monitor model performance (Layer 5)
74
+ model = client.register_model("my_model_v1", trained_on_dataset_id=dataset.id, trained_on_version="v1.0")
75
+ client.submit_metrics(model.model_id, accuracy=0.89, f1=0.85)
76
+ drift = client.get_drift(model.model_id)
77
+ ```
78
+
79
+ ## Platform Layers
80
+
81
+ | Layer | Function | SDK Methods |
82
+ |-------|----------|-------------|
83
+ | **1 - Ingest** | Upload from any source | `upload()`, `list_datasets()`, `get_dataset()` |
84
+ | **2 - Prepare** | Validate, dedup, PII scan | `transform()`, `get_profile()`, `get_pii_report()`, `redact()` |
85
+ | **3 - Label** | RLAIF + human review | `label()`, `get_quality()`, `approve()` |
86
+ | **4 - Version** | Snapshots + lineage | `snapshot()`, `list_versions()`, `get_lineage()`, `rollback()`, `export()` |
87
+ | **5 - Observe** | Monitor + re-label loop | `register_model()`, `submit_metrics()`, `get_drift()`, `trigger_relabel()` |
88
+
89
+ ## Using with Local Dev Server
90
+
91
+ ```python
92
+ client = DatalierClient(
93
+ api_key="your-jwt-token",
94
+ base_url="http://localhost:8000",
95
+ )
96
+ ```
97
+
98
+ ## Links
99
+
100
+ - **Website**: [theconcaveai.com](https://theconcaveai.com)
101
+ - **Documentation**: [theconcaveai.com/docs](https://theconcaveai.com/docs)
102
+
103
+ ## License
104
+
105
+ MIT — Concave AI 2026
@@ -0,0 +1,12 @@
1
+ LICENSE
2
+ MANIFEST.in
3
+ README.md
4
+ pyproject.toml
5
+ setup.py
6
+ datalier/__init__.py
7
+ datalier/client.py
8
+ datalier.egg-info/PKG-INFO
9
+ datalier.egg-info/SOURCES.txt
10
+ datalier.egg-info/dependency_links.txt
11
+ datalier.egg-info/requires.txt
12
+ datalier.egg-info/top_level.txt
@@ -0,0 +1 @@
1
+ httpx>=0.27.0
@@ -0,0 +1 @@
1
+ datalier
@@ -0,0 +1,37 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "datalier"
7
+ version = "0.1.0"
8
+ description = "Datalier by Concave AI - data infrastructure for AI model training"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.8"
12
+ authors = [
13
+ {name = "Aniket Nerali", email = "aniket.nerali@theconcaveai.com"}
14
+ ]
15
+ keywords = ["datalier", "concave-ai", "ai", "machine-learning", "training-data", "annotation", "rlhf", "rlaif", "data-quality", "data-infrastructure"]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "Intended Audience :: Developers",
19
+ "Intended Audience :: Science/Research",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.8",
22
+ "Programming Language :: Python :: 3.9",
23
+ "Programming Language :: Python :: 3.10",
24
+ "Programming Language :: Python :: 3.11",
25
+ "Programming Language :: Python :: 3.12",
26
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
27
+ "Topic :: Software Development :: Libraries :: Python Modules",
28
+ ]
29
+ dependencies = [
30
+ "httpx>=0.27.0",
31
+ ]
32
+
33
+ [project.urls]
34
+ Homepage = "https://theconcaveai.com"
35
+ Documentation = "https://theconcaveai.com/docs"
36
+ Repository = "https://github.com/thesineo/concave-platform"
37
+ Issues = "https://github.com/thesineo/concave-platform/issues"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,2 @@
1
# Minimal setuptools shim: only package discovery is configured here; the
# project metadata is declared in pyproject.toml.
import setuptools

setuptools.setup(packages=setuptools.find_packages())