atdata 0.2.3b1__py3-none-any.whl → 0.3.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. atdata/.gitignore +1 -0
  2. atdata/__init__.py +30 -0
  3. atdata/_exceptions.py +168 -0
  4. atdata/_helpers.py +29 -15
  5. atdata/_hf_api.py +63 -11
  6. atdata/_logging.py +70 -0
  7. atdata/_protocols.py +19 -62
  8. atdata/_schema_codec.py +5 -4
  9. atdata/_type_utils.py +28 -2
  10. atdata/atmosphere/__init__.py +19 -9
  11. atdata/atmosphere/records.py +3 -2
  12. atdata/atmosphere/schema.py +2 -2
  13. atdata/cli/__init__.py +157 -171
  14. atdata/cli/inspect.py +69 -0
  15. atdata/cli/local.py +1 -1
  16. atdata/cli/preview.py +63 -0
  17. atdata/cli/schema.py +109 -0
  18. atdata/dataset.py +428 -326
  19. atdata/lens.py +9 -2
  20. atdata/local/__init__.py +71 -0
  21. atdata/local/_entry.py +157 -0
  22. atdata/local/_index.py +940 -0
  23. atdata/local/_repo_legacy.py +218 -0
  24. atdata/local/_s3.py +349 -0
  25. atdata/local/_schema.py +380 -0
  26. atdata/manifest/__init__.py +28 -0
  27. atdata/manifest/_aggregates.py +156 -0
  28. atdata/manifest/_builder.py +163 -0
  29. atdata/manifest/_fields.py +154 -0
  30. atdata/manifest/_manifest.py +146 -0
  31. atdata/manifest/_query.py +150 -0
  32. atdata/manifest/_writer.py +74 -0
  33. atdata/promote.py +4 -4
  34. atdata/providers/__init__.py +25 -0
  35. atdata/providers/_base.py +140 -0
  36. atdata/providers/_factory.py +69 -0
  37. atdata/providers/_postgres.py +214 -0
  38. atdata/providers/_redis.py +171 -0
  39. atdata/providers/_sqlite.py +191 -0
  40. atdata/repository.py +323 -0
  41. atdata/testing.py +337 -0
  42. {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/METADATA +4 -1
  43. atdata-0.3.0b1.dist-info/RECORD +54 -0
  44. atdata/local.py +0 -1720
  45. atdata-0.2.3b1.dist-info/RECORD +0 -28
  46. {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/WHEEL +0 -0
  47. {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/entry_points.txt +0 -0
  48. {atdata-0.2.3b1.dist-info → atdata-0.3.0b1.dist-info}/licenses/LICENSE +0 -0
atdata/cli/schema.py ADDED
@@ -0,0 +1,109 @@
1
+ """``atdata schema`` commands — show and diff dataset schemas."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import sys
6
+
7
+
8
def schema_show(dataset_ref: str) -> int:
    """Display the schema of a dataset, inferred from its first sample.

    Prints a header line, the number of fields, and one ``name: type`` line
    per field to stdout. All diagnostics are written to stderr.

    Args:
        dataset_ref: Dataset URL, local path, or index reference
            (e.g. ``@local/my-dataset``).

    Returns:
        Exit code (0 success, 1 failure). Failure covers: the dataset could
        not be opened, its first sample could not be read, or it contains
        no samples.
    """
    try:
        from ..dataset import Dataset, DictSample

        ds = Dataset[DictSample](dataset_ref)
    except Exception as exc:
        print(f"Error opening dataset: {exc}", file=sys.stderr)
        return 1

    # Fetching the first sample is guarded as well, so that a failure while
    # reading is reported through the documented exit code rather than as an
    # uncaught exception.
    # NOTE(review): presumably construction can succeed while the actual read
    # fails (e.g. unreachable or corrupt data) — confirm against Dataset.head.
    try:
        samples = ds.head(1)
    except Exception as exc:
        print(f"Error reading dataset: {exc}", file=sys.stderr)
        return 1

    if not samples:
        print("No samples found — cannot infer schema.", file=sys.stderr)
        return 1

    sample = samples[0]
    # Materialise the keys once so the count and the listing agree.
    keys = list(sample.keys())
    print(f"Schema for: {dataset_ref}")
    print(f"Fields ({len(keys)}):")
    for key in keys:
        print(f" {key}: {_type_label(sample[key])}")

    return 0
39
+
40
+
41
def schema_diff(url_a: str, url_b: str) -> int:
    """Compare schemas of two datasets and print differences.

    Each schema is inferred from the first sample of its dataset as a mapping
    of field name to a short type label. Fields only in ``url_b`` are reported
    as added, fields only in ``url_a`` as removed, and fields present in both
    with different type labels as changed. The report goes to stdout;
    diagnostics go to stderr.

    Args:
        url_a: First dataset URL / path.
        url_b: Second dataset URL / path.

    Returns:
        Exit code (0 identical, 1 different, 2 error). Errors cover: a dataset
        could not be opened, its first sample could not be read, or it
        contains no samples.
    """
    try:
        from ..dataset import Dataset, DictSample

        ds_a = Dataset[DictSample](url_a)
        ds_b = Dataset[DictSample](url_b)
    except Exception as exc:
        print(f"Error opening dataset: {exc}", file=sys.stderr)
        return 2

    # Reading is guarded too: an uncaught exception that reaches the
    # interpreter exits with status 1, which this command documents as
    # "schemas differ" rather than "error".
    # NOTE(review): presumably construction can succeed while the actual read
    # fails (e.g. unreachable or corrupt data) — confirm against Dataset.head.
    try:
        samples_a = ds_a.head(1)
        samples_b = ds_b.head(1)
    except Exception as exc:
        print(f"Error reading dataset: {exc}", file=sys.stderr)
        return 2

    if not samples_a:
        print(f"No samples in {url_a}", file=sys.stderr)
        return 2
    if not samples_b:
        print(f"No samples in {url_b}", file=sys.stderr)
        return 2

    first_a = samples_a[0]
    first_b = samples_b[0]
    fields_a = {k: _type_label(first_a[k]) for k in first_a.keys()}
    fields_b = {k: _type_label(first_b[k]) for k in first_b.keys()}

    keys_a = set(fields_a)
    keys_b = set(fields_b)

    # Sorted so the report is stable regardless of field order in the samples.
    added = sorted(keys_b - keys_a)
    removed = sorted(keys_a - keys_b)
    common = sorted(keys_a & keys_b)
    changed = [k for k in common if fields_a[k] != fields_b[k]]

    if not (added or removed or changed):
        print("Schemas are identical.")
        return 0

    if added:
        print("Added:")
        for k in added:
            print(f" + {k}: {fields_b[k]}")
    if removed:
        print("Removed:")
        for k in removed:
            print(f" - {k}: {fields_a[k]}")
    if changed:
        print("Changed:")
        for k in changed:
            print(f" ~ {k}: {fields_a[k]} -> {fields_b[k]}")

    return 1
99
+
100
+
101
+ def _type_label(val: object) -> str:
102
+ """Short type label for schema display."""
103
+ import numpy as np
104
+
105
+ if isinstance(val, np.ndarray):
106
+ return f"ndarray[{val.dtype}]"
107
+ if isinstance(val, bytes):
108
+ return "bytes"
109
+ return type(val).__name__