lql-cli 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,398 @@
1
+ import json
2
+ import os
3
+ import sys
4
+ from pathlib import Path
5
+ from typing import Annotated, Optional
6
+
7
+ import typer
8
+
9
+ from .._opts import ApiUrlOpt, JsonOpt, ProfileOpt
10
+ from ..api import ApiClient
11
+ from ..output import print_error, print_json, print_table
12
+ from ..util import q
13
+
14
+ app = typer.Typer(help="Manage datasets")
15
+
16
+
17
+ def _truncate(v: object, n: int = 80) -> str:
18
+ s = json.dumps(v) if isinstance(v, (dict, list)) else ("" if v is None else str(v))
19
+ return s[: n - 3] + "..." if len(s) > n else s
20
+
21
+
22
@app.command("list")
def list_datasets(
    workspace: Annotated[Optional[str], typer.Option("--workspace", help="Filter by workspace ID")] = None,
    json_out: JsonOpt = False,
    profile: ProfileOpt = None,
    api_url: ApiUrlOpt = None,
) -> None:
    """List datasets."""
    client = ApiClient(profile=profile, api_url=api_url)
    # The workspace filter is only sent when one was actually supplied.
    query = {"workspace_id": workspace} if workspace else {}
    datasets = client.get("/v1/datasets", params=query).json()

    def as_row(entry):
        # A row count of 0 is a real value, so only a missing count is blanked.
        row_count = entry.get("row_count")
        return [
            entry.get("id") or "",
            entry.get("display_name") or entry.get("name") or "",
            entry.get("workspace_id") or "",
            "" if row_count is None else row_count,
        ]

    # print_table receives the --json flag and the raw payload alongside the
    # rendered rows (presumably so it can emit JSON itself; confirm in ..output).
    print_table(
        ["ID", "Name", "Workspace", "Rows"],
        [as_row(entry) for entry in datasets],
        json_out,
        datasets,
    )
49
+
50
+
51
@app.command("show")
def show(
    id: Annotated[str, typer.Argument(help="Dataset ID")],
    json_out: JsonOpt = False,
    profile: ProfileOpt = None,
    api_url: ApiUrlOpt = None,
) -> None:
    """Show dataset details."""
    client = ApiClient(profile=profile, api_url=api_url)
    dataset = client.get(f"/v1/datasets/{q(id)}").json()
    if json_out:
        print_json(dataset)
        return

    def text(field):
        # Missing or empty fields are shown as a blank cell.
        return dataset.get(field) or ""

    # Unlike the text fields, a row count of 0 must stay visible.
    row_count = dataset.get("row_count")
    print_table(
        ["Field", "Value"],
        [
            ["ID", text("id")],
            ["Name", dataset.get("display_name") or dataset.get("name") or ""],
            ["Workspace", text("workspace_id")],
            ["HF Repo", text("hf_repo_id")],
            ["Split", text("hf_split")],
            ["Rows", "" if row_count is None else row_count],
            ["Status", text("sync_status")],
        ],
        False,
        [dataset],
    )
78
+
79
+
80
@app.command("create")
def create(
    workspace: Annotated[str, typer.Option("--workspace", help="Workspace ID")],
    hf_repo: Annotated[Optional[str], typer.Option("--hf-repo", help="HuggingFace repo (e.g. org/dataset)")] = None,
    hf_bucket: Annotated[Optional[str], typer.Option("--hf-bucket", help="HuggingFace storage bucket")] = None,
    key: Annotated[Optional[str], typer.Option("--key", help="Path or glob within the bucket (with --hf-bucket)")] = None,
    name: Annotated[Optional[str], typer.Option("--name", help="Display name")] = None,
    split: Annotated[str, typer.Option("--split", help="Dataset split (HF repo only)")] = "train",
    json_out: JsonOpt = False,
    profile: ProfileOpt = None,
    api_url: ApiUrlOpt = None,
) -> None:
    """Create a dataset from a HuggingFace repo or storage bucket."""
    # Validate the source flags before any client is built; the first rule
    # that fails is the one reported.
    problem = None
    if not (hf_repo or hf_bucket):
        problem = ("Either --hf-repo or --hf-bucket is required.", "missing_source")
    elif hf_repo and hf_bucket:
        problem = ("--hf-repo and --hf-bucket are mutually exclusive.", "ambiguous_source")
    elif hf_bucket and not key:
        problem = ("--key is required with --hf-bucket (path or glob, e.g. data/*.parquet).", "missing_key")
    if problem is not None:
        print_error(*problem)
        raise typer.Exit(1)

    client = ApiClient(profile=profile, api_url=api_url)
    if hf_bucket:
        # Default display name: last path segment of the key, else the bucket.
        last_segment = key.split("/")[-1] if key else None
        payload = {
            "workspace_id": workspace,
            "source_type": "hf_bucket",
            "hf_bucket": hf_bucket,
            "hf_bucket_key": key,
            "display_name": name or last_segment or hf_bucket,
        }
    else:
        payload = {
            "workspace_id": workspace,
            "hf_repo_id": hf_repo,
            "hf_split": split or "train",
            "display_name": name or hf_repo,
        }

    created = client.post("/v1/datasets", json=payload).json()
    if json_out:
        print_json(created)
        return

    # The sync status is appended in parentheses only when the API returned one.
    sync_status = created.get("sync_status")
    suffix = f" ({sync_status})" if sync_status else ""
    sys.stdout.write(f"Created dataset: {created.get('id')}{suffix}\n")
125
+
126
+
127
@app.command("sync")
def sync(
    id: Annotated[str, typer.Argument(help="Dataset ID")],
    json_out: JsonOpt = False,
    profile: ProfileOpt = None,
    api_url: ApiUrlOpt = None,
) -> None:
    """Trigger a dataset sync."""
    client = ApiClient(profile=profile, api_url=api_url)
    # The response body is decoded in both output modes, even though the
    # plain-text mode only prints a confirmation.
    result = client.post(f"/v1/datasets/{q(id)}/sync").json()
    if not json_out:
        sys.stdout.write(f"Sync started for dataset: {id}\n")
        return
    print_json(result)
141
+
142
+
143
@app.command("schema")
def schema(
    id: Annotated[str, typer.Argument(help="Dataset ID")],
    json_out: JsonOpt = False,
    profile: ProfileOpt = None,
    api_url: ApiUrlOpt = None,
) -> None:
    """Get dataset schema."""
    client = ApiClient(profile=profile, api_url=api_url)
    payload = client.get(f"/v1/datasets/{q(id)}/schema").json()
    if json_out:
        print_json(payload)
        return

    # Accept either an object with a "columns" key or a bare list of columns.
    if isinstance(payload, dict):
        columns = payload.get("columns")
    else:
        columns = payload
    if not columns:
        columns = []

    table = [[col.get("name") or "", col.get("type") or ""] for col in columns]
    print_table(["Column", "Type"], table, False, columns)
164
+
165
+
166
+ def _fmt_null_pct(nr: float) -> str:
167
+ # Avoid rounding a not-quite-0/100 null rate to a misleading "0%"/"100%".
168
+ pct = nr * 100
169
+ if 0 < pct < 0.5:
170
+ return "<1%"
171
+ if 99.5 < pct < 100:
172
+ return ">99%"
173
+ return f"{round(pct)}%"
174
+
175
+
176
@app.command("profile")
def profile_cmd(
    id: Annotated[str, typer.Argument(help="Dataset ID")],
    full_content: Annotated[bool, typer.Option("--full-content", help="Scan every row for content stats (slow)")] = False,
    skip_content: Annotated[bool, typer.Option("--skip-content", help="Skip content/token stats entirely")] = False,
    json_out: JsonOpt = False,
    profile: ProfileOpt = None,
    api_url: ApiUrlOpt = None,
) -> None:
    """Profile a dataset: per-column nulls/cardinality/numeric stats/top values + content token stats."""
    client = ApiClient(profile=profile, api_url=api_url)
    # Each enabled switch is sent as the literal string "true"; disabled
    # switches are omitted from the query entirely.
    switches = (("full_content", full_content), ("skip_content", skip_content))
    query = {flag: "true" for flag, enabled in switches if enabled}
    report = client.get(f"/v1/datasets/{q(id)}/profile", params=query).json()

    if json_out:
        print_json(report)
        return
    if report.get("skip_reason"):
        sys.stdout.write(f"Profile unavailable: {report['skip_reason']}\n")
        return

    sys.stdout.write(f"Rows: {report.get('total_rows', '?')}\n")
    column_profiles = report.get("columns") or []

    def describe(column):
        """Build the table row for one profiled column."""
        stats = column.get("numeric_stats")
        numeric = ""
        if stats:
            numeric = f"{stats['min']}/{stats['p50']}/{stats['p95']}/{stats['max']}"

        # Top values take precedence over a skip reason; at most three shown.
        top_values = column.get("top_values")
        if top_values:
            note = ", ".join(f"{tv.get('value')}:{tv.get('count')}" for tv in top_values[:3])
        elif column.get("skip_reason"):
            note = f"[{column['skip_reason']}]"
        else:
            note = ""
        if len(note) > 48:
            note = note[:45] + "..."

        null_rate = column.get("null_rate")
        distinct = column.get("approx_distinct")
        return [
            column.get("name") or "",
            str(column.get("type") or "")[:28],
            "" if null_rate is None else _fmt_null_pct(float(null_rate)),
            "" if distinct is None else str(distinct),
            numeric,
            note,
        ]

    print_table(
        ["Column", "Type", "Null %", "~Distinct", "Numeric min/p50/p95/max", "Top / note"],
        [describe(column) for column in column_profiles],
        False,
        column_profiles,
    )

    content = report.get("content_stats")
    if not content:
        return

    if content.get("sampled"):
        sys.stdout.write(
            f"\n⚠ content stats sampled from the first shard "
            f"({content.get('sample_rows')} / {content.get('total_rows')} rows). "
            "max_chars is sample-bound — use --full-content for an exact scan.\n"
        )
    else:
        sys.stdout.write(f"\nContent stats (full scan, {content.get('sample_rows')} rows):\n")

    def content_row(entry):
        """Build the table row for one content-stats entry."""
        max_chars = entry.get("max_chars")
        avg_turns = entry.get("avg_turns")
        return [
            entry.get("column") or "",
            entry.get("text_field") or "",
            f"{entry.get('p50_chars', '')} / ~{entry.get('p50_tokens_est', '')}",
            f"{entry.get('p95_chars', '')} / ~{entry.get('p95_tokens_est', '')}",
            "" if max_chars is None else max_chars,
            "" if avg_turns is None else f"{avg_turns} / {entry.get('max_turns')}",
        ]

    content_columns = content.get("columns") or []
    print_table(
        ["Column", "Field", "P50 ch/~tok", "P95 ch/~tok", "Max ch", "Turns avg/max"],
        [content_row(entry) for entry in content_columns],
        False,
        content_columns,
    )
253
+
254
+
255
@app.command("rows")
def rows(
    id: Annotated[str, typer.Argument(help="Dataset ID")],
    limit: Annotated[str, typer.Option("--limit", help="Number of rows")] = "20",
    offset: Annotated[str, typer.Option("--offset", help="Row offset")] = "0",
    json_out: JsonOpt = False,
    profile: ProfileOpt = None,
    api_url: ApiUrlOpt = None,
) -> None:
    """Get dataset rows."""
    client = ApiClient(profile=profile, api_url=api_url)
    paging = {"limit": limit, "offset": offset}
    payload = client.get(f"/v1/datasets/{q(id)}/rows", params=paging).json()
    if json_out:
        print_json(payload)
        return

    # Accept either an object with a "rows" key or a bare list of rows.
    records = payload.get("rows") if isinstance(payload, dict) else payload
    if not records:
        sys.stdout.write("No rows.\n")
        return

    # Column headers come from the first row only.
    columns = list(records[0].keys())
    table = [[_truncate(record.get(column)) for column in columns] for record in records]
    print_table(columns, table, False, records)
282
+
283
+
284
@app.command("delete")
def delete(
    id: Annotated[str, typer.Argument(help="Dataset ID")],
    profile: ProfileOpt = None,
    api_url: ApiUrlOpt = None,
) -> None:
    """Delete a dataset."""
    # The confirmation is printed only after the DELETE call has returned.
    ApiClient(profile=profile, api_url=api_url).delete(f"/v1/datasets/{q(id)}")
    confirmation = f"Deleted dataset: {id}\n"
    sys.stdout.write(confirmation)
294
+
295
+
296
@app.command("push")
def push(
    id: Annotated[str, typer.Argument(help="Dataset ID")],
    json_out: JsonOpt = False,
    profile: ProfileOpt = None,
    api_url: ApiUrlOpt = None,
) -> None:
    """Push dataset to HuggingFace."""
    client = ApiClient(profile=profile, api_url=api_url)
    result = client.post(f"/v1/datasets/{q(id)}/push").json()
    if json_out:
        print_json(result)
        return
    # Falls back to "unknown" when the response carries no job_id key.
    job_id = result.get("job_id", "unknown")
    sys.stdout.write(f"Push started. Job ID: {job_id}\n")
310
+
311
+
312
@app.command("push-status")
def push_status(
    id: Annotated[str, typer.Argument(help="Dataset ID")],
    job: Annotated[Optional[str], typer.Option("--job", help="Specific job ID")] = None,
    json_out: JsonOpt = False,
    profile: ProfileOpt = None,
    api_url: ApiUrlOpt = None,
) -> None:
    """Get push job status for a dataset."""
    client = ApiClient(profile=profile, api_url=api_url)
    # With --job a single job is requested; otherwise the job collection.
    collection = f"/v1/datasets/{q(id)}/push-jobs"
    url = f"{collection}/{q(job)}" if job else collection
    payload = client.get(url).json()
    if json_out:
        print_json(payload)
        return

    # A non-list response is wrapped so both shapes render as a table.
    jobs = payload if isinstance(payload, list) else [payload]
    table = [
        [entry.get("id") or "", entry.get("status") or "", entry.get("created_at") or ""]
        for entry in jobs
    ]
    print_table(["Job ID", "Status", "Created"], table, False, jobs)
334
+
335
+
336
@app.command("upload")
def upload(
    file: Annotated[str, typer.Argument(help="Local file to upload")],
    workspace: Annotated[str, typer.Option("--workspace", help="Workspace ID")],
    name: Annotated[str, typer.Option("--name", help="Repo name (will upload to liquid-ai/<name>)")],
    split: Annotated[str, typer.Option("--split", help="Dataset split")] = "train",
    json_out: JsonOpt = False,
    profile: ProfileOpt = None,
    api_url: ApiUrlOpt = None,
) -> None:
    """Upload a local file to HuggingFace and create a dataset."""
    # In --json mode stdout must carry only the final JSON document, so the
    # progress lines are diverted to stderr; otherwise they stay on stdout.
    progress = sys.stderr if json_out else sys.stdout

    hf_token = os.environ.get("LQL_HF_TOKEN")
    if not hf_token:
        print_error("LQL_HF_TOKEN environment variable is required for upload.", "missing_hf_token")
        raise typer.Exit(1)

    resolved = Path(file).resolve()
    if not resolved.exists():
        print_error(f"File not found: {resolved}", "file_not_found")
        raise typer.Exit(1)
    # A directory exists but cannot be uploaded as a single file; reject it
    # here instead of failing later inside the HuggingFace client.
    if not resolved.is_file():
        print_error(f"Not a regular file: {resolved}", "not_a_file")
        raise typer.Exit(1)

    # Validate LQL auth + API URL *before* the external HF side effect, so a
    # missing/expired token or bad API URL can't leave an orphaned HF upload.
    client = ApiClient(profile=profile, api_url=api_url)

    effective_split = split or "train"
    repo_name = f"liquid-ai/{name}"
    progress.write(f"Uploading to HuggingFace: {repo_name}...\n")

    try:
        from huggingface_hub import HfApi  # lazy import: keeps startup fast

        hf = HfApi(token=hf_token)
        hf.create_repo(repo_id=repo_name, repo_type="dataset", exist_ok=True)
        hf.upload_file(
            path_or_fileobj=str(resolved),
            path_in_repo=f"data/{effective_split}/{resolved.name}",
            repo_id=repo_name,
            repo_type="dataset",
        )
    except Exception as e:
        # Exception objects are truthy even with an empty message, so the
        # fallback text has to be chosen from str(e), not from e itself.
        print_error(f"HuggingFace upload failed: {str(e) or 'unknown error'}", "hf_upload_failed")
        raise typer.Exit(1) from e
    progress.write("Uploaded to HuggingFace.\n")

    # NOTE(review): this payload uses "hf_id" / "split", while the `create`
    # command posts "hf_repo_id" / "hf_split" to the same endpoint. Confirm
    # against the API which spelling the server expects.
    create_res = client.post(
        "/v1/datasets",
        json={
            "workspace_id": workspace,
            "hf_id": repo_name,
            "display_name": name,
            "split": effective_split,
        },
    ).json()
    dataset_id = create_res.get("id")
    if not dataset_id:
        # Without an id there is nothing to sync; stop with an explicit error
        # rather than requesting a sync for "None".
        print_error("Dataset was created but the response did not include an id.", "missing_dataset_id")
        raise typer.Exit(1)
    progress.write(f"Created dataset: {dataset_id}\n")

    client.post(f"/v1/datasets/{q(dataset_id)}/sync")
    progress.write("Sync started.\n")

    if json_out:
        print_json({"id": dataset_id, "hf_repo": repo_name, "status": "syncing"})
    else:
        sys.stdout.write(f"Dataset ID: {dataset_id}\nStatus: syncing\n")
lql/commands/edits.py ADDED
@@ -0,0 +1,95 @@
1
+ import json
2
+ import sys
3
+ from typing import Annotated, Optional
4
+
5
+ import typer
6
+
7
+ from .._opts import ApiUrlOpt, JsonOpt, ProfileOpt
8
+ from ..api import ApiClient
9
+ from ..output import print_error, print_json, print_table
10
+ from ..util import q
11
+
12
+ app = typer.Typer(help="Manage row edits")
13
+
14
+
15
@app.command("list")
def list_edits(
    dataset_id: Annotated[str, typer.Argument(help="Dataset ID")],
    limit: Annotated[str, typer.Option("--limit", help="Number of edits")] = "50",
    json_out: JsonOpt = False,
    profile: ProfileOpt = None,
    api_url: ApiUrlOpt = None,
) -> None:
    """List edits for a dataset."""
    client = ApiClient(profile=profile, api_url=api_url)
    edits = client.get(f"/v1/datasets/{q(dataset_id)}/edits", params={"limit": limit}).json()

    def as_row(edit):
        # The new value is shown as JSON, hard-cut to its first 60 characters.
        return [
            edit.get("id") or "",
            edit.get("row_external_id") or "",
            edit.get("column") or "",
            json.dumps(edit.get("new_value"))[:60],
        ]

    print_table(
        ["ID", "Row", "Column", "Value"],
        [as_row(edit) for edit in edits],
        json_out,
        edits,
    )
40
+
41
+
42
@app.command("count")
def count(
    dataset_id: Annotated[str, typer.Argument(help="Dataset ID")],
    json_out: JsonOpt = False,
    profile: ProfileOpt = None,
    api_url: ApiUrlOpt = None,
) -> None:
    """Count edits for a dataset."""
    client = ApiClient(profile=profile, api_url=api_url)
    counts = client.get(f"/v1/datasets/{q(dataset_id)}/edits/count").json()
    if json_out:
        print_json(counts)
        return
    # A response without a "pending" key is reported as zero pending edits.
    pending = counts.get("pending", 0)
    sys.stdout.write(f"Pending edits: {pending}\n")
56
+
57
+
58
@app.command("add")
def add(
    dataset_id: Annotated[str, typer.Argument(help="Dataset ID")],
    row: Annotated[str, typer.Option("--row", help="Row external ID")],
    column: Annotated[str, typer.Option("--column", help="Column name")],
    value: Annotated[str, typer.Option("--value", help="New value as JSON")],
    json_out: JsonOpt = False,
    profile: ProfileOpt = None,
    api_url: ApiUrlOpt = None,
) -> None:
    """Add an edit to a row."""
    # --value must be valid JSON; it is parsed before any client is built so
    # a malformed value never reaches the API.
    try:
        new_value = json.loads(value)
    except ValueError:
        print_error(f"Invalid JSON for --value: {value}", "invalid_json")
        raise typer.Exit(1)

    client = ApiClient(profile=profile, api_url=api_url)
    payload = {"row_external_id": row, "column": column, "new_value": new_value}
    created = client.post(f"/v1/datasets/{q(dataset_id)}/edits", json=payload).json()
    if json_out:
        print_json(created)
        return
    # Falls back to "ok" when the response carries no id key.
    edit_id = created.get("id", "ok")
    sys.stdout.write(f"Created edit: {edit_id}\n")
83
+
84
+
85
@app.command("delete")
def delete(
    dataset_id: Annotated[str, typer.Argument(help="Dataset ID")],
    edit_id: Annotated[str, typer.Argument(help="Edit ID")],
    profile: ProfileOpt = None,
    api_url: ApiUrlOpt = None,
) -> None:
    """Delete an edit."""
    # The confirmation is printed only after the DELETE call has returned.
    ApiClient(profile=profile, api_url=api_url).delete(
        f"/v1/datasets/{q(dataset_id)}/edits/{q(edit_id)}"
    )
    confirmation = f"Deleted edit: {edit_id}\n"
    sys.stdout.write(confirmation)