datalier 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datalier-0.1.0/LICENSE +10 -0
- datalier-0.1.0/MANIFEST.in +2 -0
- datalier-0.1.0/PKG-INFO +105 -0
- datalier-0.1.0/README.md +77 -0
- datalier-0.1.0/datalier/__init__.py +7 -0
- datalier-0.1.0/datalier/client.py +355 -0
- datalier-0.1.0/datalier.egg-info/PKG-INFO +105 -0
- datalier-0.1.0/datalier.egg-info/SOURCES.txt +12 -0
- datalier-0.1.0/datalier.egg-info/dependency_links.txt +1 -0
- datalier-0.1.0/datalier.egg-info/requires.txt +1 -0
- datalier-0.1.0/datalier.egg-info/top_level.txt +1 -0
- datalier-0.1.0/pyproject.toml +37 -0
- datalier-0.1.0/setup.cfg +4 -0
- datalier-0.1.0/setup.py +2 -0
datalier-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
cat > ~/concave-platform/sdk/LICENSE << 'EOF'
|
|
2
|
+
MIT License
|
|
3
|
+
|
|
4
|
+
Copyright (c) 2026 Concave AI
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software", "Datalier by Concave AI"), to deal in the Software ("Datalier by Concave AI") without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software ("Datalier by Concave AI") is furnished to do so, subject to the following conditions:
|
|
7
|
+
|
|
8
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software ("Datalier by Concave AI").
|
|
9
|
+
|
|
10
|
+
THE SOFTWARE ("Datalier by Concave AI") IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE ("Datalier by Concave AI") OR THE USE OR OTHER DEALINGS IN THE SOFTWARE ("Datalier by Concave AI").
|
datalier-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datalier
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Datalier by Concave AI - data infrastructure for AI model training
|
|
5
|
+
Author-email: Aniket Nerali <aniket.nerali@theconcaveai.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://theconcaveai.com
|
|
8
|
+
Project-URL: Documentation, https://theconcaveai.com/docs
|
|
9
|
+
Project-URL: Repository, https://github.com/thesineo/concave-platform
|
|
10
|
+
Project-URL: Issues, https://github.com/thesineo/concave-platform/issues
|
|
11
|
+
Keywords: datalier,concave-ai,ai,machine-learning,training-data,annotation,rlhf,rlaif,data-quality,data-infrastructure
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
23
|
+
Requires-Python: >=3.8
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: httpx>=0.27.0
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# Datalier — by Concave AI
|
|
30
|
+
|
|
31
|
+
Data infrastructure for AI model training. One SDK — raw data in, quality-assured training data out.
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install datalier
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Quick Start
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from datalier import DatalierClient
|
|
43
|
+
|
|
44
|
+
client = DatalierClient(
|
|
45
|
+
api_key="sk_concave_...",
|
|
46
|
+
base_url="https://api.theconcaveai.com", # or http://localhost:8000 for local
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
# Upload a dataset
|
|
50
|
+
dataset = client.upload("training_pairs.jsonl", dataset_type="rlhf")
|
|
51
|
+
print(f"Uploaded: {dataset.id} — {dataset.row_count} rows")
|
|
52
|
+
|
|
53
|
+
# Transform (validate, deduplicate, detect PII)
|
|
54
|
+
result = client.transform(dataset.id, steps=["validate", "dedup", "pii_scan"])
|
|
55
|
+
print(f"PII found: {result['pii_detection']['total_pii_found']}")
|
|
56
|
+
|
|
57
|
+
# Label with RLAIF (AI handles 80-90%, humans review edge cases)
|
|
58
|
+
result = client.label(dataset.id, dataset_type="rlhf", min_kappa=0.70)
|
|
59
|
+
print(f"Auto-labeled: {result['ai_labeled']}/{result['total_tasks']}")
|
|
60
|
+
|
|
61
|
+
# Check quality metrics
|
|
62
|
+
quality = client.get_quality(dataset.id)
|
|
63
|
+
print(f"Kappa: {quality.kappa} | Gold Accuracy: {quality.gold_accuracy}")
|
|
64
|
+
|
|
65
|
+
# Approve and version
|
|
66
|
+
client.approve(dataset.id)
|
|
67
|
+
version = client.snapshot(dataset.id)
|
|
68
|
+
print(f"Snapshot: {version['version']} — hash: {version['snapshot_hash'][:8]}...")
|
|
69
|
+
|
|
70
|
+
# Export as DPO format for training
|
|
71
|
+
download_url = client.export(dataset.id, fmt="dpo", version="v1.0")
|
|
72
|
+
|
|
73
|
+
# Monitor model performance (Layer 5)
|
|
74
|
+
model = client.register_model("my_model_v1", trained_on_dataset_id=dataset.id, trained_on_version="v1.0")
|
|
75
|
+
client.submit_metrics(model.model_id, accuracy=0.89, f1=0.85)
|
|
76
|
+
drift = client.get_drift(model.model_id)
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Platform Layers
|
|
80
|
+
|
|
81
|
+
| Layer | Function | SDK Methods |
|
|
82
|
+
|-------|----------|-------------|
|
|
83
|
+
| **1 - Ingest** | Upload from any source | `upload()`, `list_datasets()`, `get_dataset()` |
|
|
84
|
+
| **2 - Prepare** | Validate, dedup, PII scan | `transform()`, `get_profile()`, `get_pii_report()`, `redact()` |
|
|
85
|
+
| **3 - Label** | RLAIF + human review | `label()`, `get_quality()`, `approve()` |
|
|
86
|
+
| **4 - Version** | Snapshots + lineage | `snapshot()`, `list_versions()`, `get_lineage()`, `rollback()`, `export()` |
|
|
87
|
+
| **5 - Observe** | Monitor + re-label loop | `register_model()`, `submit_metrics()`, `get_drift()`, `trigger_relabel()` |
|
|
88
|
+
|
|
89
|
+
## Using with Local Dev Server
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
client = DatalierClient(
|
|
93
|
+
api_key="your-jwt-token",
|
|
94
|
+
base_url="http://localhost:8000",
|
|
95
|
+
)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Links
|
|
99
|
+
|
|
100
|
+
- **Website**: [theconcaveai.com](https://theconcaveai.com)
|
|
101
|
+
- **Documentation**: [theconcaveai.com/docs](https://theconcaveai.com/docs)
|
|
102
|
+
|
|
103
|
+
## License
|
|
104
|
+
|
|
105
|
+
MIT — Concave AI 2026
|
datalier-0.1.0/README.md
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# Datalier — by Concave AI
|
|
2
|
+
|
|
3
|
+
Data infrastructure for AI model training. One SDK — raw data in, quality-assured training data out.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install datalier
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick Start
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from datalier import DatalierClient
|
|
15
|
+
|
|
16
|
+
client = DatalierClient(
|
|
17
|
+
api_key="sk_concave_...",
|
|
18
|
+
base_url="https://api.theconcaveai.com", # or http://localhost:8000 for local
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
# Upload a dataset
|
|
22
|
+
dataset = client.upload("training_pairs.jsonl", dataset_type="rlhf")
|
|
23
|
+
print(f"Uploaded: {dataset.id} — {dataset.row_count} rows")
|
|
24
|
+
|
|
25
|
+
# Transform (validate, deduplicate, detect PII)
|
|
26
|
+
result = client.transform(dataset.id, steps=["validate", "dedup", "pii_scan"])
|
|
27
|
+
print(f"PII found: {result['pii_detection']['total_pii_found']}")
|
|
28
|
+
|
|
29
|
+
# Label with RLAIF (AI handles 80-90%, humans review edge cases)
|
|
30
|
+
result = client.label(dataset.id, dataset_type="rlhf", min_kappa=0.70)
|
|
31
|
+
print(f"Auto-labeled: {result['ai_labeled']}/{result['total_tasks']}")
|
|
32
|
+
|
|
33
|
+
# Check quality metrics
|
|
34
|
+
quality = client.get_quality(dataset.id)
|
|
35
|
+
print(f"Kappa: {quality.kappa} | Gold Accuracy: {quality.gold_accuracy}")
|
|
36
|
+
|
|
37
|
+
# Approve and version
|
|
38
|
+
client.approve(dataset.id)
|
|
39
|
+
version = client.snapshot(dataset.id)
|
|
40
|
+
print(f"Snapshot: {version['version']} — hash: {version['snapshot_hash'][:8]}...")
|
|
41
|
+
|
|
42
|
+
# Export as DPO format for training
|
|
43
|
+
download_url = client.export(dataset.id, fmt="dpo", version="v1.0")
|
|
44
|
+
|
|
45
|
+
# Monitor model performance (Layer 5)
|
|
46
|
+
model = client.register_model("my_model_v1", trained_on_dataset_id=dataset.id, trained_on_version="v1.0")
|
|
47
|
+
client.submit_metrics(model.model_id, accuracy=0.89, f1=0.85)
|
|
48
|
+
drift = client.get_drift(model.model_id)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Platform Layers
|
|
52
|
+
|
|
53
|
+
| Layer | Function | SDK Methods |
|
|
54
|
+
|-------|----------|-------------|
|
|
55
|
+
| **1 - Ingest** | Upload from any source | `upload()`, `list_datasets()`, `get_dataset()` |
|
|
56
|
+
| **2 - Prepare** | Validate, dedup, PII scan | `transform()`, `get_profile()`, `get_pii_report()`, `redact()` |
|
|
57
|
+
| **3 - Label** | RLAIF + human review | `label()`, `get_quality()`, `approve()` |
|
|
58
|
+
| **4 - Version** | Snapshots + lineage | `snapshot()`, `list_versions()`, `get_lineage()`, `rollback()`, `export()` |
|
|
59
|
+
| **5 - Observe** | Monitor + re-label loop | `register_model()`, `submit_metrics()`, `get_drift()`, `trigger_relabel()` |
|
|
60
|
+
|
|
61
|
+
## Using with Local Dev Server
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
client = DatalierClient(
|
|
65
|
+
api_key="your-jwt-token",
|
|
66
|
+
base_url="http://localhost:8000",
|
|
67
|
+
)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Links
|
|
71
|
+
|
|
72
|
+
- **Website**: [theconcaveai.com](https://theconcaveai.com)
|
|
73
|
+
- **Documentation**: [theconcaveai.com/docs](https://theconcaveai.com/docs)
|
|
74
|
+
|
|
75
|
+
## License
|
|
76
|
+
|
|
77
|
+
MIT — Concave AI 2026
|
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Datalier by Concave AI — Python SDK
|
|
3
|
+
|
|
4
|
+
pip install datalier
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
from datalier import DatalierClient
|
|
8
|
+
|
|
9
|
+
client = DatalierClient(api_key="sk_concave_...", base_url="http://localhost:8000")
|
|
10
|
+
|
|
11
|
+
dataset = client.upload("data.jsonl", dataset_type="rlhf")
|
|
12
|
+
client.transform(dataset.id)
|
|
13
|
+
client.label(dataset.id)
|
|
14
|
+
quality = client.get_quality(dataset.id)
|
|
15
|
+
client.approve(dataset.id)
|
|
16
|
+
client.snapshot(dataset.id)
|
|
17
|
+
url = client.export(dataset.id, fmt="dpo")
|
|
18
|
+
"""
|
|
19
|
+
import httpx
|
|
20
|
+
from typing import Optional, List
|
|
21
|
+
from dataclasses import dataclass, field
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class DatasetInfo:
    """Plain record describing one dataset, built from API response fields."""

    id: str  # dataset identifier
    name: str
    type: str  # dataset type string, e.g. "generic" or "rlhf"
    status: str
    file_format: str
    row_count: int
    # default_factory gives every instance its own dict instead of a shared one
    schema: dict = field(default_factory=dict)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
class QualityInfo:
    """Plain record holding the quality metrics reported for one dataset."""

    dataset_id: str
    kappa: float
    gold_accuracy: float
    bias_score: float
    overall_status: str
    automation_rate: float = 0.0  # optional; defaults to 0.0 when not supplied
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class ModelInfo:
    """Plain record describing a model registered for monitoring.

    Fields are normally read as attributes (``info.model_id``). Read-only
    dict-style access (``info["model_id"]``) is also supported so that callers
    which treat the registration result as a mapping do not fail with
    ``TypeError: 'ModelInfo' object is not subscriptable``.
    """

    model_id: str
    name: str
    status: str
    drift_score: float = 0.0

    def __getitem__(self, key: str):
        """Return the field named *key*.

        Raises:
            KeyError: if *key* is not one of this dataclass's fields.
        """
        # Restrict lookups to declared fields so methods and dunder
        # attributes are never exposed through the mapping interface.
        if key in self.__dataclass_fields__:
            return getattr(self, key)
        raise KeyError(key)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class DatalierClient:
|
|
54
|
+
"""
|
|
55
|
+
Datalier by Concave AI — SDK Client.
|
|
56
|
+
|
|
57
|
+
Provides programmatic access to all 5 platform layers:
|
|
58
|
+
1. Ingest — upload and connect data sources
|
|
59
|
+
2. Prepare — validate, deduplicate, detect and redact PII
|
|
60
|
+
3. Label — RLAIF pre-labeling with human review loop
|
|
61
|
+
4. Version — immutable snapshots with full lineage
|
|
62
|
+
5. Observe — model performance monitoring + drift detection
|
|
63
|
+
"""
|
|
64
|
+
|
|
65
|
+
def __init__(
    self,
    api_key: str,
    base_url: str = "https://api.theconcaveai.com",
    timeout: float = 120.0,
):
    """Create a client bound to one API key and base URL.

    Args:
        api_key: Credential sent as ``Authorization: Bearer <api_key>`` on
            every request.
        base_url: Root URL of the API; trailing slashes are stripped.
        timeout: Timeout in seconds handed to the underlying ``httpx.Client``.
            Defaults to 120.0, the value that was previously hard-coded.
    """
    self.api_key = api_key
    self.base_url = base_url.rstrip("/")
    self._client = httpx.Client(
        base_url=self.base_url,
        headers={"Authorization": f"Bearer {api_key}"},
        timeout=timeout,
    )
|
|
77
|
+
|
|
78
|
+
def _request(self, method: str, path: str, **kwargs) -> dict:
    """Send one HTTP request and return the decoded JSON body.

    Args:
        method: HTTP verb, e.g. ``"GET"`` or ``"POST"``.
        path: Request path, resolved against the client's base URL.
        **kwargs: Passed through unchanged to ``httpx.Client.request``.

    Returns:
        The parsed JSON response, or ``{}`` when a successful response
        carries no body (for example HTTP 204).

    Raises:
        Exception: for any status code >= 400, with the message
            ``"API Error <status>: <detail>"``.
    """
    response = self._client.request(method, path, **kwargs)
    if response.status_code >= 400:
        try:
            detail = response.json().get("detail", response.text)
        except (ValueError, AttributeError):
            # ValueError: body is not JSON. AttributeError: body is JSON but
            # not an object (e.g. a list). Fall back to the raw text.
            detail = response.text
        raise Exception(f"API Error {response.status_code}: {detail}")
    # A successful response may legitimately be empty; decoding it as JSON
    # would raise, so report it as an empty result instead.
    if response.status_code == 204 or not response.content:
        return {}
    return response.json()
|
|
87
|
+
|
|
88
|
+
def close(self):
    """Close the underlying HTTP client held in ``self._client``."""
    http_client = self._client
    http_client.close()
|
|
90
|
+
|
|
91
|
+
def __enter__(self):
    """Enter a ``with`` block; the client itself is the managed object."""
    return self
|
|
93
|
+
|
|
94
|
+
def __exit__(self, *exc_info):
    """Leave a ``with`` block by calling ``close()``.

    Returns None, so an exception raised inside the block is not suppressed.
    """
    self.close()
|
|
96
|
+
|
|
97
|
+
# ─── Layer 1: Ingest ──────────────────────────────────────────────────────
|
|
98
|
+
|
|
99
|
+
def upload(self, filepath: str, name: Optional[str] = None,
           dataset_type: str = "generic") -> DatasetInfo:
    """Upload a local file and create a new dataset.

    Args:
        filepath: Path of the file to upload (``str`` or ``os.PathLike``).
        name: Dataset name; defaults to the file's base name.
        dataset_type: Dataset type string sent to the API and stored on the
            returned ``DatasetInfo``.

    Returns:
        A ``DatasetInfo`` built from the upload response.

    Raises:
        OSError: if the file cannot be opened.
        Exception: if the API responds with an error status.
    """
    import os  # function-scope import keeps this block self-contained

    # Use the platform's path rules rather than splitting on "/": this also
    # handles Windows separators and path-like objects.
    filename = os.path.basename(os.fspath(filepath))
    with open(filepath, "rb") as f:
        result = self._request(
            "POST", "/api/v1/datasets/upload",
            data={"name": name or filename, "dataset_type": dataset_type},
            files={"file": (filename, f)},
        )
    return DatasetInfo(
        id=result["dataset_id"],
        name=result["name"],
        type=dataset_type,
        status=result["status"],
        file_format=result["file_format"],
        row_count=result["row_count"],
        schema=result.get("schema_json", {}),
    )
|
|
118
|
+
|
|
119
|
+
def get_dataset(self, dataset_id: str) -> DatasetInfo:
    """Fetch one dataset by id and wrap the response in a ``DatasetInfo``.

    ``id``, ``name`` and ``status`` must be present in the response; the
    remaining fields fall back to defaults when absent.
    """
    payload = self._request("GET", f"/api/v1/datasets/{dataset_id}")
    return DatasetInfo(
        id=payload["id"],
        name=payload["name"],
        type=payload.get("type", "generic"),
        status=payload["status"],
        file_format=payload.get("file_format", ""),
        row_count=payload.get("row_count", 0),
        schema=payload.get("schema_json", {}),
    )
|
|
127
|
+
|
|
128
|
+
# ─── Layer 2: Prepare ─────────────────────────────────────────────────────
|
|
129
|
+
|
|
130
|
+
def transform(self, dataset_id: str,
              steps: Optional[List[str]] = None) -> dict:
    """Request the data preparation pipeline for a dataset.

    Args:
        dataset_id: Dataset to process.
        steps: Pipeline step names; when omitted or empty, defaults to
            ``["validate", "dedup", "pii_scan"]``.

    Returns:
        The decoded JSON response from the transform endpoint.
    """
    pipeline = steps if steps else ["validate", "dedup", "pii_scan"]
    endpoint = f"/api/v1/datasets/{dataset_id}/transform"
    return self._request("POST", endpoint, json={"steps": pipeline})
|
|
141
|
+
|
|
142
|
+
def get_profile(self, dataset_id: str) -> dict:
    """Return the decoded response of the dataset's ``/profile`` endpoint."""
    endpoint = f"/api/v1/datasets/{dataset_id}/profile"
    return self._request("GET", endpoint)
|
|
145
|
+
|
|
146
|
+
def get_pii_report(self, dataset_id: str) -> dict:
    """Return the decoded response of the dataset's ``/pii`` endpoint."""
    endpoint = f"/api/v1/datasets/{dataset_id}/pii"
    return self._request("GET", endpoint)
|
|
149
|
+
|
|
150
|
+
def redact(self, dataset_id: str, fields: List[str],
           method: str = "mask") -> dict:
    """Request redaction of the given fields.

    Args:
        dataset_id: Dataset to redact.
        fields: Names of the fields to redact.
        method: Redaction method; documented values are ``'mask'``,
            ``'hash'`` and ``'remove'``.

    Returns:
        The decoded JSON response from the redact endpoint.
    """
    request_body = {"fields": fields, "method": method}
    endpoint = f"/api/v1/datasets/{dataset_id}/redact"
    return self._request("POST", endpoint, json=request_body)
|
|
157
|
+
|
|
158
|
+
# ─── Layer 3: Label ───────────────────────────────────────────────────────
|
|
159
|
+
|
|
160
|
+
def label(self, dataset_id: str, dataset_type: str = "rlhf",
          rlaif_model: str = "gpt-4o-mini",
          confidence_threshold: float = 0.85,
          min_kappa: float = 0.70) -> dict:
    """Start the labeling pipeline for a dataset.

    All four settings are forwarded verbatim in the JSON request body
    (``dataset_type`` under the key ``"type"``).

    Returns:
        The decoded JSON response from the label endpoint.
    """
    settings = {
        "type": dataset_type,
        "rlaif_model": rlaif_model,
        "confidence_threshold": confidence_threshold,
        "min_kappa": min_kappa,
    }
    endpoint = f"/api/v1/datasets/{dataset_id}/label"
    return self._request("POST", endpoint, json=settings)
|
|
177
|
+
|
|
178
|
+
def get_quality(self, dataset_id: str) -> QualityInfo:
    """Fetch quality metrics for a dataset as a ``QualityInfo``.

    Every field is optional in the response. Missing values fall back to
    the requested ``dataset_id``, ``-1.0`` for kappa, ``0.0`` for the other
    numeric metrics and ``"pending_reviews"`` for the overall status.
    """
    metrics = self._request("GET", f"/api/v1/datasets/{dataset_id}/quality")
    read = metrics.get
    return QualityInfo(
        dataset_id=read("dataset_id", dataset_id),
        kappa=read("kappa", -1.0),
        gold_accuracy=read("gold_accuracy", 0.0),
        bias_score=read("bias_score", 0.0),
        overall_status=read("overall_status", "pending_reviews"),
        automation_rate=read("automation_rate", 0.0),
    )
|
|
189
|
+
|
|
190
|
+
def list_tasks(self, dataset_id: str,
               status: Optional[str] = None) -> dict:
    """List a dataset's tasks.

    Args:
        dataset_id: Dataset whose tasks are listed.
        status: Optional status filter (e.g. ``'human_review'``); sent as a
            query parameter only when non-empty.

    Returns:
        The decoded JSON response from the tasks endpoint.
    """
    query = {"status": status} if status else {}
    endpoint = f"/api/v1/datasets/{dataset_id}/tasks"
    return self._request("GET", endpoint, params=query)
|
|
198
|
+
|
|
199
|
+
def submit_review(self, dataset_id: str, task_id: str,
                  label: dict, reasoning: str = "") -> dict:
    """Submit an annotation for one task.

    Args:
        dataset_id: Dataset that owns the task.
        task_id: Task being annotated.
        label: Annotation payload, sent under the key ``"label"``.
        reasoning: Optional free text; included only when non-empty.

    Returns:
        The decoded JSON response from the task endpoint.
    """
    if reasoning:
        annotation = {"label": label, "reasoning": reasoning}
    else:
        annotation = {"label": label}
    endpoint = f"/api/v1/datasets/{dataset_id}/tasks/{task_id}"
    return self._request("PUT", endpoint, json=annotation)
|
|
209
|
+
|
|
210
|
+
def approve(self, dataset_id: str) -> dict:
    """POST to the dataset's ``/label/approve`` endpoint and return the response."""
    endpoint = f"/api/v1/datasets/{dataset_id}/label/approve"
    return self._request("POST", endpoint)
|
|
215
|
+
|
|
216
|
+
# ─── Layer 4: Version ─────────────────────────────────────────────────────
|
|
217
|
+
|
|
218
|
+
def snapshot(self, dataset_id: str, bump: str = "minor") -> dict:
    """Request a versioned snapshot of a dataset.

    Args:
        dataset_id: Dataset to snapshot.
        bump: Version bump kind, sent as the ``bump`` query parameter.

    Returns:
        The decoded JSON response from the snapshot endpoint.
    """
    endpoint = f"/api/v1/datasets/{dataset_id}/snapshot"
    query = {"bump": bump}
    return self._request("POST", endpoint, params=query)
|
|
227
|
+
|
|
228
|
+
def list_versions(self, dataset_id: str) -> dict:
    """Return the decoded response of the dataset's ``/versions`` endpoint."""
    endpoint = f"/api/v1/datasets/{dataset_id}/versions"
    return self._request("GET", endpoint)
|
|
231
|
+
|
|
232
|
+
def get_lineage(self, dataset_id: str, version: str) -> dict:
    """Return the decoded lineage response for one dataset version."""
    endpoint = f"/api/v1/datasets/{dataset_id}/versions/{version}/lineage"
    return self._request("GET", endpoint)
|
|
237
|
+
|
|
238
|
+
def rollback(self, dataset_id: str, version: str) -> dict:
    """POST a rollback request for the dataset to the given version."""
    endpoint = f"/api/v1/datasets/{dataset_id}/rollback/{version}"
    return self._request("POST", endpoint)
|
|
243
|
+
|
|
244
|
+
def diff(self, dataset_id: str, version_a: str, version_b: str) -> dict:
    """Return the decoded comparison of two versions of a dataset."""
    endpoint = f"/api/v1/datasets/{dataset_id}/diff/{version_a}/{version_b}"
    return self._request("GET", endpoint)
|
|
249
|
+
|
|
250
|
+
def export(self, dataset_id: str, fmt: str = "jsonl",
           version: Optional[str] = None) -> str:
    """Request an export and return its download URL.

    Args:
        dataset_id: Dataset to export.
        fmt: Export format, sent as the ``format`` query parameter.
            Documented values: ``'jsonl'``, ``'dpo'``, ``'conll'``,
            ``'coco'``, ``'csv'``, ``'json'``.
        version: Optional version; sent only when non-empty.

    Returns:
        The ``download_url`` value of the response.

    Raises:
        KeyError: if the response has no ``download_url`` key.
    """
    query: dict = {"format": fmt}
    if version:
        query.update(version=version)
    endpoint = f"/api/v1/datasets/{dataset_id}/export"
    response = self._request("POST", endpoint, params=query)
    return response["download_url"]
|
|
265
|
+
|
|
266
|
+
# ─── Layer 5: Observe ─────────────────────────────────────────────────────
|
|
267
|
+
|
|
268
|
+
def register_model(self, name: str,
                   trained_on_dataset_id: Optional[str] = None,
                   trained_on_version: Optional[str] = None,
                   min_accuracy: float = 0.80,
                   max_drift_psi: float = 0.15) -> ModelInfo:
    """Register a model and return it as a ``ModelInfo``.

    All arguments are forwarded verbatim in the JSON body; unset optional
    values are sent as ``None``. The response must contain ``model_id``,
    ``name`` and ``status``.
    """
    registration = {
        "name": name,
        "trained_on_dataset_id": trained_on_dataset_id,
        "trained_on_version": trained_on_version,
        "min_accuracy": min_accuracy,
        "max_drift_psi": max_drift_psi,
    }
    created = self._request("POST", "/api/v1/models/register", json=registration)
    return ModelInfo(
        model_id=created["model_id"],
        name=created["name"],
        status=created["status"],
    )
|
|
289
|
+
|
|
290
|
+
def submit_metrics(self, model_id: str, **metrics) -> dict:
    """Submit performance metrics for a model.

    Any keyword arguments (e.g. ``accuracy=0.89, f1=0.85, loss=0.31``) are
    sent unchanged as the JSON request body.

    Returns:
        The decoded JSON response from the metrics endpoint.
    """
    endpoint = f"/api/v1/models/{model_id}/metrics"
    return self._request("POST", endpoint, json=metrics)
|
|
300
|
+
|
|
301
|
+
def ingest_webhook(self, model_id: str, payload: dict,
                   source: str = "auto") -> dict:
    """Push a metrics payload from an external tool.

    Args:
        model_id: Model the payload belongs to.
        payload: Raw payload, forwarded under the key ``"payload"``.
        source: Origin of the payload; documented values are ``'auto'``,
            ``'mlflow'``, ``'wandb'``, ``'huggingface'`` and ``'generic'``.

    Returns:
        The decoded JSON response from the webhook-ingest endpoint.
    """
    envelope = {"source": source, "payload": payload}
    endpoint = f"/api/v1/models/{model_id}/webhook-ingest"
    return self._request("POST", endpoint, json=envelope)
|
|
311
|
+
|
|
312
|
+
def submit_drift(self, model_id: str,
                 training_distribution: dict,
                 production_distribution: dict) -> dict:
    """Submit training and production distributions for drift analysis.

    Both distributions are forwarded unchanged in the JSON request body.

    Returns:
        The decoded JSON response from the drift endpoint.
    """
    distributions = {
        "training_distribution": training_distribution,
        "production_distribution": production_distribution,
    }
    endpoint = f"/api/v1/models/{model_id}/drift"
    return self._request("POST", endpoint, json=distributions)
|
|
323
|
+
|
|
324
|
+
def get_drift(self, model_id: str) -> dict:
    """Return the decoded response of the model's ``/drift`` endpoint."""
    endpoint = f"/api/v1/models/{model_id}/drift"
    return self._request("GET", endpoint)
|
|
327
|
+
|
|
328
|
+
def get_performance(self, model_id: str) -> dict:
    """Return the decoded response of the model's ``/performance`` endpoint."""
    endpoint = f"/api/v1/models/{model_id}/performance"
    return self._request("GET", endpoint)
|
|
331
|
+
|
|
332
|
+
def trigger_relabel(self, model_id: str,
                    categories: Optional[List[str]] = None) -> dict:
    """Request a re-labeling plan for a model.

    Args:
        model_id: Model to build the plan for.
        categories: Optional category names; included in the body only when
            the list is non-empty.

    Returns:
        The decoded JSON response from the trigger-relabel endpoint.
    """
    plan_request = {"categories": categories} if categories else {}
    endpoint = f"/api/v1/models/{model_id}/trigger-relabel"
    return self._request("POST", endpoint, json=plan_request)
|
|
342
|
+
|
|
343
|
+
def get_relabel_jobs(self, model_id: str) -> dict:
    """Return the decoded response of the model's ``/relabel-jobs`` endpoint."""
    endpoint = f"/api/v1/models/{model_id}/relabel-jobs"
    return self._request("GET", endpoint)
|
|
348
|
+
|
|
349
|
+
def dashboard(self) -> dict:
    """Return the decoded response of the ``/api/v1/observe/dashboard`` endpoint."""
    endpoint = "/api/v1/observe/dashboard"
    return self._request("GET", endpoint)
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
# Alias for backwards compatibility
|
|
355
|
+
ConcaveClient = DatalierClient
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datalier
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Datalier by Concave AI - data infrastructure for AI model training
|
|
5
|
+
Author-email: Aniket Nerali <aniket.nerali@theconcaveai.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://theconcaveai.com
|
|
8
|
+
Project-URL: Documentation, https://theconcaveai.com/docs
|
|
9
|
+
Project-URL: Repository, https://github.com/thesineo/concave-platform
|
|
10
|
+
Project-URL: Issues, https://github.com/thesineo/concave-platform/issues
|
|
11
|
+
Keywords: datalier,concave-ai,ai,machine-learning,training-data,annotation,rlhf,rlaif,data-quality,data-infrastructure
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
23
|
+
Requires-Python: >=3.8
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: httpx>=0.27.0
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# Datalier — by Concave AI
|
|
30
|
+
|
|
31
|
+
Data infrastructure for AI model training. One SDK — raw data in, quality-assured training data out.
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install datalier
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## Quick Start
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from datalier import DatalierClient
|
|
43
|
+
|
|
44
|
+
client = DatalierClient(
|
|
45
|
+
api_key="sk_concave_...",
|
|
46
|
+
base_url="https://api.theconcaveai.com", # or http://localhost:8000 for local
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
# Upload a dataset
|
|
50
|
+
dataset = client.upload("training_pairs.jsonl", dataset_type="rlhf")
|
|
51
|
+
print(f"Uploaded: {dataset.id} — {dataset.row_count} rows")
|
|
52
|
+
|
|
53
|
+
# Transform (validate, deduplicate, detect PII)
|
|
54
|
+
result = client.transform(dataset.id, steps=["validate", "dedup", "pii_scan"])
|
|
55
|
+
print(f"PII found: {result['pii_detection']['total_pii_found']}")
|
|
56
|
+
|
|
57
|
+
# Label with RLAIF (AI handles 80-90%, humans review edge cases)
|
|
58
|
+
result = client.label(dataset.id, dataset_type="rlhf", min_kappa=0.70)
|
|
59
|
+
print(f"Auto-labeled: {result['ai_labeled']}/{result['total_tasks']}")
|
|
60
|
+
|
|
61
|
+
# Check quality metrics
|
|
62
|
+
quality = client.get_quality(dataset.id)
|
|
63
|
+
print(f"Kappa: {quality.kappa} | Gold Accuracy: {quality.gold_accuracy}")
|
|
64
|
+
|
|
65
|
+
# Approve and version
|
|
66
|
+
client.approve(dataset.id)
|
|
67
|
+
version = client.snapshot(dataset.id)
|
|
68
|
+
print(f"Snapshot: {version['version']} — hash: {version['snapshot_hash'][:8]}...")
|
|
69
|
+
|
|
70
|
+
# Export as DPO format for training
|
|
71
|
+
download_url = client.export(dataset.id, fmt="dpo", version="v1.0")
|
|
72
|
+
|
|
73
|
+
# Monitor model performance (Layer 5)
|
|
74
|
+
model = client.register_model("my_model_v1", trained_on_dataset_id=dataset.id, trained_on_version="v1.0")
|
|
75
|
+
client.submit_metrics(model.model_id, accuracy=0.89, f1=0.85)
|
|
76
|
+
drift = client.get_drift(model.model_id)
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## Platform Layers
|
|
80
|
+
|
|
81
|
+
| Layer | Function | SDK Methods |
|
|
82
|
+
|-------|----------|-------------|
|
|
83
|
+
| **1 - Ingest** | Upload from any source | `upload()`, `list_datasets()`, `get_dataset()` |
|
|
84
|
+
| **2 - Prepare** | Validate, dedup, PII scan | `transform()`, `get_profile()`, `get_pii_report()`, `redact()` |
|
|
85
|
+
| **3 - Label** | RLAIF + human review | `label()`, `get_quality()`, `approve()` |
|
|
86
|
+
| **4 - Version** | Snapshots + lineage | `snapshot()`, `list_versions()`, `get_lineage()`, `rollback()`, `export()` |
|
|
87
|
+
| **5 - Observe** | Monitor + re-label loop | `register_model()`, `submit_metrics()`, `get_drift()`, `trigger_relabel()` |
|
|
88
|
+
|
|
89
|
+
## Using with Local Dev Server
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
client = DatalierClient(
|
|
93
|
+
api_key="your-jwt-token",
|
|
94
|
+
base_url="http://localhost:8000",
|
|
95
|
+
)
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Links
|
|
99
|
+
|
|
100
|
+
- **Website**: [theconcaveai.com](https://theconcaveai.com)
|
|
101
|
+
- **Documentation**: [theconcaveai.com/docs](https://theconcaveai.com/docs)
|
|
102
|
+
|
|
103
|
+
## License
|
|
104
|
+
|
|
105
|
+
MIT — Concave AI 2026
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
MANIFEST.in
|
|
3
|
+
README.md
|
|
4
|
+
pyproject.toml
|
|
5
|
+
setup.py
|
|
6
|
+
datalier/__init__.py
|
|
7
|
+
datalier/client.py
|
|
8
|
+
datalier.egg-info/PKG-INFO
|
|
9
|
+
datalier.egg-info/SOURCES.txt
|
|
10
|
+
datalier.egg-info/dependency_links.txt
|
|
11
|
+
datalier.egg-info/requires.txt
|
|
12
|
+
datalier.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
httpx>=0.27.0
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
datalier
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "datalier"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Datalier by Concave AI - data infrastructure for AI model training"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.8"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Aniket Nerali", email = "aniket.nerali@theconcaveai.com"}
|
|
14
|
+
]
|
|
15
|
+
keywords = ["datalier", "concave-ai", "ai", "machine-learning", "training-data", "annotation", "rlhf", "rlaif", "data-quality", "data-infrastructure"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 3 - Alpha",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Intended Audience :: Science/Research",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.8",
|
|
22
|
+
"Programming Language :: Python :: 3.9",
|
|
23
|
+
"Programming Language :: Python :: 3.10",
|
|
24
|
+
"Programming Language :: Python :: 3.11",
|
|
25
|
+
"Programming Language :: Python :: 3.12",
|
|
26
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
27
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
28
|
+
]
|
|
29
|
+
dependencies = [
|
|
30
|
+
"httpx>=0.27.0",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.urls]
|
|
34
|
+
Homepage = "https://theconcaveai.com"
|
|
35
|
+
Documentation = "https://theconcaveai.com/docs"
|
|
36
|
+
Repository = "https://github.com/thesineo/concave-platform"
|
|
37
|
+
Issues = "https://github.com/thesineo/concave-platform/issues"
|
datalier-0.1.0/setup.cfg
ADDED
datalier-0.1.0/setup.py
ADDED