podstack 1.3.11.tar.gz → 1.3.13.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {podstack-1.3.11 → podstack-1.3.13}/PKG-INFO +307 -8
  2. podstack-1.3.13/README.md +714 -0
  3. podstack-1.3.13/podstack/registry/autolog.py +196 -0
  4. {podstack-1.3.11 → podstack-1.3.13}/podstack/registry/client.py +258 -64
  5. {podstack-1.3.11 → podstack-1.3.13}/podstack/registry/experiment.py +60 -0
  6. {podstack-1.3.11 → podstack-1.3.13}/podstack.egg-info/PKG-INFO +307 -8
  7. {podstack-1.3.11 → podstack-1.3.13}/podstack.egg-info/SOURCES.txt +1 -0
  8. {podstack-1.3.11 → podstack-1.3.13}/pyproject.toml +1 -1
  9. podstack-1.3.11/README.md +0 -415
  10. {podstack-1.3.11 → podstack-1.3.13}/LICENSE +0 -0
  11. {podstack-1.3.11 → podstack-1.3.13}/podstack/__init__.py +0 -0
  12. {podstack-1.3.11 → podstack-1.3.13}/podstack/annotations.py +0 -0
  13. {podstack-1.3.11 → podstack-1.3.13}/podstack/client.py +0 -0
  14. {podstack-1.3.11 → podstack-1.3.13}/podstack/exceptions.py +0 -0
  15. {podstack-1.3.11 → podstack-1.3.13}/podstack/execution.py +0 -0
  16. {podstack-1.3.11 → podstack-1.3.13}/podstack/gpu_runner.py +0 -0
  17. {podstack-1.3.11 → podstack-1.3.13}/podstack/models.py +0 -0
  18. {podstack-1.3.11 → podstack-1.3.13}/podstack/notebook.py +0 -0
  19. {podstack-1.3.11 → podstack-1.3.13}/podstack/registry/__init__.py +0 -0
  20. {podstack-1.3.11 → podstack-1.3.13}/podstack/registry/exceptions.py +0 -0
  21. {podstack-1.3.11 → podstack-1.3.13}/podstack/registry/model.py +0 -0
  22. {podstack-1.3.11 → podstack-1.3.13}/podstack/registry/model_utils.py +0 -0
  23. {podstack-1.3.11 → podstack-1.3.13}/podstack.egg-info/dependency_links.txt +0 -0
  24. {podstack-1.3.11 → podstack-1.3.13}/podstack.egg-info/requires.txt +0 -0
  25. {podstack-1.3.11 → podstack-1.3.13}/podstack.egg-info/top_level.txt +0 -0
  26. {podstack-1.3.11 → podstack-1.3.13}/podstack_gpu/__init__.py +0 -0
  27. {podstack-1.3.11 → podstack-1.3.13}/podstack_gpu/app.py +0 -0
  28. {podstack-1.3.11 → podstack-1.3.13}/podstack_gpu/exceptions.py +0 -0
  29. {podstack-1.3.11 → podstack-1.3.13}/podstack_gpu/image.py +0 -0
  30. {podstack-1.3.11 → podstack-1.3.13}/podstack_gpu/runner.py +0 -0
  31. {podstack-1.3.11 → podstack-1.3.13}/podstack_gpu/secret.py +0 -0
  32. {podstack-1.3.11 → podstack-1.3.13}/podstack_gpu/utils.py +0 -0
  33. {podstack-1.3.11 → podstack-1.3.13}/podstack_gpu/volume.py +0 -0
  34. {podstack-1.3.11 → podstack-1.3.13}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: podstack
- Version: 1.3.11
+ Version: 1.3.13
  Summary: Official Python SDK for Podstack GPU Notebook Platform
  Author-email: Podstack <support@podstack.ai>
  License-Expression: MIT
@@ -302,13 +302,13 @@ with registry.start_run(name="training-v1") as run:
      registry.log_artifact("model.pt", "model")
      registry.log_artifact("training_curves.png", "plots")

-     # Log dataset metadata
-     registry.log_dataset(
-         name="imdb-reviews",
-         path="s3://datasets/imdb",
-         num_rows=50000,
-         num_features=2
-     )
+     # Log dataset provenance (first-class resource, deduped by content hash)
+     registry.log_dataset("imdb-reviews", path="data/imdb.csv", context="training")
+
+     # Or pass a DataFrame — schema and row/feature counts are auto-computed
+     import pandas as pd
+     df = pd.read_csv("data/imdb.csv")
+     registry.log_dataset("imdb-reviews", df=df, context="training")

  ```

  ### Log and Load Models
@@ -360,6 +360,305 @@ runs = registry.search_runs(
  )
  ```

+ ### Dataset Tracking & Lineage
+
+ Podstack tracks datasets as first-class resources, linking them to runs and model versions so you can always answer *"what data was this model trained on?"*
+
+ The lineage chain is:
+
+ ```
+ Dataset(s) ──[logged to]──▶ Run ──[run_id]──▶ ModelVersion
+ ```
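+
+ Each arrow corresponds to an SDK call (a sketch; the calls are documented in the sections below):
+
+ ```python
+ with registry.start_run("training-v1") as run:
+     run.log_dataset("imdb-reviews", path="data/imdb.csv")  # Dataset ─▶ Run
+
+ registry.register_model("sentiment-bert", run_id=run.id)   # Run ─▶ ModelVersion
+ ```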
+
+ #### `log_dataset()` — log a dataset to the active run
+
+ ```python
+ dataset = registry.log_dataset(
+     name="imdb-reviews",   # required — human-readable name
+     path="data/imdb.csv",  # local path or URI (s3://, gcs://, https://)
+     context="training",    # "training" | "validation" | "test" (default: "training")
+ )
+ ```
+
+ The dataset is stored as a **project-level resource** and linked to the current run.
+ Subsequent calls with the same file produce the same dataset record — no duplicates.
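+
+ A minimal illustration of that guarantee (a sketch assuming both calls hash identical file bytes):
+
+ ```python
+ ds_a = registry.log_dataset("imdb-reviews", path="data/imdb.csv")
+ ds_b = registry.log_dataset("imdb-reviews", path="data/imdb.csv")
+ assert ds_a.id == ds_b.id  # same content hash → same Dataset record
+ ```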
+
+ **Auto-enrichment from a local file:**
+
+ ```python
+ # SHA-256 digest is computed automatically for files ≤ 500 MB.
+ # This enables deduplication across runs — if two runs use the exact
+ # same file, they share one Dataset record in the registry.
+ dataset = registry.log_dataset("imdb-reviews", path="data/imdb.csv")
+ print(dataset.digest)  # "a3f2c1..." — hex SHA-256
+ ```
+
+ **Auto-enrichment from a pandas DataFrame:**
+
+ ```python
+ import pandas as pd
+
+ df = pd.read_csv("data/imdb.csv")
+
+ dataset = registry.log_dataset(
+     name="imdb-reviews",
+     df=df,
+     context="training",
+ )
+ # schema and profile are computed automatically:
+ print(dataset.schema)   # {"text": "object", "label": "int64"}
+ print(dataset.profile)  # {"num_rows": 50000, "num_features": 2}
+ ```
+
+ **Pass both `path` and `df`** to get digest dedup *and* schema inference:
+
+ ```python
+ dataset = registry.log_dataset("imdb-reviews", path="data/imdb.csv", df=df)
+ ```
+
+ **All parameters:**
+
+ | Parameter | Type | Default | Description |
+ |-----------|------|---------|-------------|
+ | `name` | `str` | required | Human-readable dataset name |
+ | `path` | `str` | `None` | Local file path or URI (`s3://`, `gcs://`, `https://`) |
+ | `df` | `DataFrame` | `None` | pandas DataFrame — schema and profile auto-computed |
+ | `context` | `str` | `"training"` | Role of the dataset: `"training"`, `"validation"`, or `"test"` |
+ | `digest` | `str` | `None` | SHA-256 hex digest. Computed from `path` if not provided |
+ | `source_type` | `str` | `"local"` | Storage backend: `"local"`, `"s3"`, `"gcs"`, `"url"` |
+ | `tags` | `dict` | `None` | Arbitrary string key-value tags |
+
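+ The remaining parameters cover remote data that the SDK cannot hash or inspect locally. A sketch (the URI, digest, and tag values are illustrative, not from the package docs):
+
+ ```python
+ dataset = registry.log_dataset(
+     name="imdb-reviews-s3",
+     path="s3://datasets/imdb/reviews.csv",
+     source_type="s3",           # backend cannot be inferred from a local file
+     digest="a3f2c1d8e9b0...",   # precomputed SHA-256 (full 64-char hex in practice)
+     tags={"owner": "ml-team", "snapshot": "2024-06"},
+ )
+ ```
+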
+ **Returns:** `Dataset` object with fields:
+
+ | Field | Type | Description |
+ |-------|------|-------------|
+ | `id` | `str` | UUID of the dataset record |
+ | `name` | `str` | Dataset name |
+ | `digest` | `str` | SHA-256 hex digest (empty if not computed) |
+ | `source_type` | `str` | Storage backend |
+ | `source` | `str` | File path or URI |
+ | `schema` | `dict` | Column → dtype mapping |
+ | `profile` | `dict` | `num_rows`, `num_features`, and any other stats |
+ | `tags` | `dict` | Tags dict |
+ | `created_at` | `str` | ISO 8601 timestamp |
+
+ **Via the `Run` object** (equivalent to calling `registry.log_dataset()`):
+
+ ```python
+ with registry.start_run("training-v1") as run:
+     dataset = run.log_dataset("imdb-reviews", df=df, context="training")
+ ```
+
+ #### Multiple datasets per run
+
+ Log validation and test sets alongside the training set:
+
+ ```python
+ with registry.start_run("bert-finetune") as run:
+     run.log_dataset("imdb-train", df=train_df, context="training")
+     run.log_dataset("imdb-val", df=val_df, context="validation")
+     run.log_dataset("imdb-test", df=test_df, context="test")
+ ```
+
+ #### `get_run_datasets()` — retrieve datasets logged to a run
+
+ Returns every `Dataset` object linked to a run, in the order they were logged.
+
+ ```python
+ datasets = registry.get_run_datasets(run_id)
+ ```
+
+ **Parameters:**
+
+ | Parameter | Type | Description |
+ |-----------|------|-------------|
+ | `run_id` | `str` | ID of the run to query |
+
+ **Returns:** `list[Dataset]` — the same `Dataset` objects returned by `log_dataset()`.
+
+ **Fields on each `Dataset`:**
+
+ | Field | Type | Description |
+ |-------|------|-------------|
+ | `id` | `str` | UUID of the dataset record |
+ | `name` | `str` | Human-readable name |
+ | `digest` | `str` | SHA-256 hex digest (empty if not computed at log time) |
+ | `source_type` | `str` | `"local"`, `"s3"`, `"gcs"`, or `"url"` |
+ | `source` | `str` | File path or URI that was passed to `log_dataset()` |
+ | `schema` | `dict` | Column → dtype mapping (e.g. `{"text": "object", "label": "int64"}`) |
+ | `profile` | `dict` | Stats dict; always contains `num_rows` and `num_features` when a DataFrame was passed |
+ | `tags` | `dict` | Key-value tags |
+ | `created_at` | `str` | ISO 8601 timestamp |
+
+ **Examples:**
+
+ ```python
+ from podstack import registry
+
+ registry.init(api_key="...", project_id="...")
+
+ datasets = registry.get_run_datasets("3a9f12c4-...")
+
+ # Inspect each dataset
+ for ds in datasets:
+     print(ds.name)
+     print(f"  source : {ds.source}")
+     print(f"  digest : {ds.digest[:16]}…")
+     print(f"  rows   : {ds.profile.get('num_rows', 'unknown')}")
+     print(f"  schema : {ds.schema}")
+ ```
+
+ Retrieving the datasets for a run object you already hold:
+
+ ```python
+ with registry.start_run("training-v1") as run:
+     run.log_dataset("train", df=train_df, context="training")
+     run.log_dataset("val", df=val_df, context="validation")
+
+ # After the run completes, retrieve everything that was logged
+ datasets = registry.get_run_datasets(run.id)
+ assert len(datasets) == 2
+ ```
+
+ Verifying deduplication — the same physical file logged across two runs
+ returns the same dataset ID:
+
+ ```python
+ # run_a and run_b are runs that each logged the same file via log_dataset(path=...)
+ ds1 = registry.get_run_datasets(run_a.id)[0]
+ ds2 = registry.get_run_datasets(run_b.id)[0]
+
+ # Same file → same digest → same Dataset record
+ assert ds1.id == ds2.id
+ assert ds1.digest == ds2.digest
+ ```
+
+ #### `get_model_lineage()` — trace a model back to its training data
+
+ Returns the full provenance chain for every version of a registered model:
+ which datasets each version was trained on, via which run.
+
+ ```python
+ lineage = registry.get_model_lineage(model_id)
+ ```
+
+ **Parameters:**
+
+ | Parameter | Type | Description |
+ |-----------|------|-------------|
+ | `model_id` | `str` | ID of the registered model |
+
+ **Returns:** `dict` with the following structure:
+
+ ```
+ {
+     "model_id": str,
+     "versions": [
+         {
+             "version": int,         # version number (1, 2, 3 …)
+             "stage": str,           # "development" | "staging" | "production" | "archived"
+             "run_id": str,          # ID of the linked training run (empty if none)
+             "run_name": str,        # display name of the run
+             "datasets": [Dataset]   # list of Dataset dicts logged to that run
+         },
+         ...
+     ]
+ }
+ ```
+
+ Each `datasets` entry has the same fields as a `Dataset` object
+ (`id`, `name`, `digest`, `source_type`, `source`, `schema`, `profile`, `tags`, `created_at`).
+
+ **Examples:**
+
+ Basic iteration:
+
+ ```python
+ from podstack import registry
+
+ registry.init(api_key="...", project_id="...")
+
+ model = registry.get_model("sentiment-bert")
+ lineage = registry.get_model_lineage(model.id)
+
+ for version in lineage["versions"]:
+     print(f"v{version['version']} · {version['stage']}")
+     print(f"  Run: {version['run_name']} ({version['run_id'][:8]}…)")
+     for ds in version["datasets"]:
+         rows = ds["profile"].get("num_rows", "?")
+         print(f"    └─ {ds['name']}  {rows} rows  sha256:{ds['digest'][:12]}…")
+ ```
+
+ Example output:
+
+ ```
+ v3 · production
+   Run: bert-finetune-v3 (3a9f12c4…)
+     └─ imdb-train  40000 rows  sha256:a3f2c1d8e9b0…
+     └─ imdb-val  5000 rows  sha256:7e4b2f1a0c3d…
+ v2 · staging
+   Run: bert-finetune-v2 (8b2e77d1…)
+     └─ imdb-train  40000 rows  sha256:a3f2c1d8e9b0…
+ v1 · archived
+   Run: bert-finetune-v1 (f1c3a0e2…)
+     └─ imdb-train  40000 rows  sha256:a3f2c1d8e9b0…
+ ```
+
+ Finding every unique dataset ever used to train any version of a model:
+
+ ```python
+ lineage = registry.get_model_lineage(model.id)
+ seen = {}
+ for version in lineage["versions"]:
+     for ds in version["datasets"]:
+         seen[ds["id"]] = ds  # dedup by ID
+
+ unique_datasets = list(seen.values())
+ print(f"{len(unique_datasets)} unique dataset(s) across all versions")
+ ```
+
+ Checking whether the production version was trained on an approved dataset:
+
+ ```python
+ APPROVED_DIGEST = "a3f2c1d8e9b0..."
+
+ lineage = registry.get_model_lineage(model.id)
+ prod = next(v for v in lineage["versions"] if v["stage"] == "production")
+
+ approved = any(ds["digest"] == APPROVED_DIGEST for ds in prod["datasets"])
+ print("Production model trained on approved data:", approved)
+ ```
+
+ #### End-to-end example
+
+ ```python
+ import pandas as pd
+ from podstack import registry
+
+ registry.init(api_key="...", project_id="...")
+ registry.set_experiment("sentiment-analysis")
+
+ # Load data
+ train_df = pd.read_csv("data/train.csv")
+ val_df = pd.read_csv("data/val.csv")
+
+ with registry.start_run("bert-finetune-v3") as run:
+     # Log datasets — digest is auto-computed, schema inferred
+     run.log_dataset("imdb-train", path="data/train.csv", df=train_df, context="training")
+     run.log_dataset("imdb-val", path="data/val.csv", df=val_df, context="validation")
+
+     # Train
+     run.log_params({"lr": 2e-5, "epochs": 3})
+     run.log_metrics({"accuracy": 0.93, "f1": 0.92})
+
+ # Register and promote the model
+ registry.register_model("sentiment-bert", run_id=run.id)
+ registry.set_model_stage("sentiment-bert", version=3, stage="production")
+
+ # Later — answer "what data trained v3?"
+ model = registry.get_model("sentiment-bert")
+ lineage = registry.get_model_lineage(model.id)
+ ```
+
  ### List and Browse

  ```python