pyspiral 0.2.5__cp310-abi3-macosx_11_0_arm64.whl → 0.4.0__cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. {pyspiral-0.2.5.dist-info → pyspiral-0.4.0.dist-info}/METADATA +12 -14
  2. pyspiral-0.4.0.dist-info/RECORD +98 -0
  3. {pyspiral-0.2.5.dist-info → pyspiral-0.4.0.dist-info}/WHEEL +1 -1
  4. spiral/__init__.py +6 -7
  5. spiral/_lib.abi3.so +0 -0
  6. spiral/adbc.py +21 -14
  7. spiral/api/__init__.py +15 -172
  8. spiral/api/admin.py +12 -26
  9. spiral/api/client.py +160 -0
  10. spiral/api/filesystems.py +100 -72
  11. spiral/api/organizations.py +45 -58
  12. spiral/api/projects.py +171 -134
  13. spiral/api/telemetry.py +19 -0
  14. spiral/api/types.py +20 -0
  15. spiral/api/workloads.py +32 -25
  16. spiral/{arrow.py → arrow_.py} +12 -0
  17. spiral/cli/__init__.py +2 -5
  18. spiral/cli/admin.py +7 -12
  19. spiral/cli/app.py +23 -6
  20. spiral/cli/console.py +1 -1
  21. spiral/cli/fs.py +83 -18
  22. spiral/cli/iceberg/__init__.py +7 -0
  23. spiral/cli/iceberg/namespaces.py +47 -0
  24. spiral/cli/iceberg/tables.py +60 -0
  25. spiral/cli/indexes/__init__.py +19 -0
  26. spiral/cli/login.py +14 -5
  27. spiral/cli/orgs.py +90 -0
  28. spiral/cli/printer.py +9 -1
  29. spiral/cli/projects.py +136 -0
  30. spiral/cli/state.py +2 -0
  31. spiral/cli/tables/__init__.py +121 -0
  32. spiral/cli/telemetry.py +18 -0
  33. spiral/cli/types.py +8 -10
  34. spiral/cli/{workload.py → workloads.py} +11 -11
  35. spiral/{catalog.py → client.py} +22 -21
  36. spiral/core/client/__init__.pyi +117 -0
  37. spiral/core/index/__init__.pyi +15 -0
  38. spiral/core/table/__init__.pyi +108 -0
  39. spiral/core/{manifests → table/manifests}/__init__.pyi +5 -23
  40. spiral/core/table/metastore/__init__.pyi +62 -0
  41. spiral/core/{spec → table/spec}/__init__.pyi +49 -92
  42. spiral/datetime_.py +27 -0
  43. spiral/expressions/__init__.py +40 -17
  44. spiral/expressions/base.py +5 -5
  45. spiral/expressions/list_.py +1 -1
  46. spiral/expressions/mp4.py +62 -0
  47. spiral/expressions/png.py +18 -0
  48. spiral/expressions/qoi.py +18 -0
  49. spiral/expressions/refs.py +23 -9
  50. spiral/expressions/struct.py +7 -5
  51. spiral/expressions/text.py +62 -0
  52. spiral/expressions/tiff.py +88 -88
  53. spiral/expressions/udf.py +3 -3
  54. spiral/iceberg/__init__.py +3 -0
  55. spiral/iceberg/client.py +33 -0
  56. spiral/indexes/__init__.py +5 -0
  57. spiral/indexes/client.py +137 -0
  58. spiral/indexes/index.py +34 -0
  59. spiral/indexes/scan.py +22 -0
  60. spiral/project.py +19 -110
  61. spiral/{proto → protogen}/_/scandal/__init__.py +32 -77
  62. spiral/protogen/_/spiral/table/__init__.py +22 -0
  63. spiral/protogen/substrait/__init__.py +3399 -0
  64. spiral/protogen/substrait/extensions/__init__.py +115 -0
  65. spiral/server.py +17 -0
  66. spiral/settings.py +31 -87
  67. spiral/substrait_.py +10 -6
  68. spiral/tables/__init__.py +12 -0
  69. spiral/tables/client.py +130 -0
  70. spiral/{dataset.py → tables/dataset.py} +36 -25
  71. spiral/tables/debug/manifests.py +70 -0
  72. spiral/tables/debug/metrics.py +56 -0
  73. spiral/{debug.py → tables/debug/scan.py} +6 -9
  74. spiral/tables/maintenance.py +12 -0
  75. spiral/tables/scan.py +193 -0
  76. spiral/tables/snapshot.py +78 -0
  77. spiral/tables/table.py +157 -0
  78. spiral/tables/transaction.py +52 -0
  79. pyspiral-0.2.5.dist-info/RECORD +0 -81
  80. spiral/api/tables.py +0 -94
  81. spiral/api/tokens.py +0 -56
  82. spiral/authn/authn.py +0 -89
  83. spiral/authn/device.py +0 -206
  84. spiral/authn/github_.py +0 -33
  85. spiral/authn/modal_.py +0 -18
  86. spiral/cli/org.py +0 -90
  87. spiral/cli/project.py +0 -107
  88. spiral/cli/table.py +0 -20
  89. spiral/cli/token.py +0 -27
  90. spiral/config.py +0 -26
  91. spiral/core/core/__init__.pyi +0 -53
  92. spiral/core/metastore/__init__.pyi +0 -91
  93. spiral/proto/_/spfs/__init__.py +0 -36
  94. spiral/proto/_/spiral/table/__init__.py +0 -225
  95. spiral/proto/_/spiraldb/metastore/__init__.py +0 -499
  96. spiral/proto/__init__.py +0 -0
  97. spiral/proto/scandal/__init__.py +0 -45
  98. spiral/proto/spiral/__init__.py +0 -0
  99. spiral/proto/spiral/table/__init__.py +0 -96
  100. spiral/scan_.py +0 -168
  101. spiral/table.py +0 -157
  102. {pyspiral-0.2.5.dist-info → pyspiral-0.4.0.dist-info}/entry_points.txt +0 -0
  103. /spiral/{authn/__init__.py → core/__init__.pyi} +0 -0
  104. /spiral/{core → protogen/_}/__init__.py +0 -0
  105. /spiral/{proto/_ → protogen/_/arrow}/__init__.py +0 -0
  106. /spiral/{proto/_/arrow → protogen/_/arrow/flight}/__init__.py +0 -0
  107. /spiral/{proto/_/arrow/flight → protogen/_/arrow/flight/protocol}/__init__.py +0 -0
  108. /spiral/{proto → protogen}/_/arrow/flight/protocol/sql/__init__.py +0 -0
  109. /spiral/{proto/_/arrow/flight/protocol → protogen/_/spiral}/__init__.py +0 -0
  110. /spiral/{proto → protogen/_}/substrait/__init__.py +0 -0
  111. /spiral/{proto → protogen/_}/substrait/extensions/__init__.py +0 -0
  112. /spiral/{proto/_/spiral → protogen}/__init__.py +0 -0
  113. /spiral/{proto → protogen}/util.py +0 -0
  114. /spiral/{proto/_/spiraldb → tables/debug}/__init__.py +0 -0
spiral/tables/debug/manifests.py ADDED
@@ -0,0 +1,70 @@
+ from spiral import datetime_
+ from spiral.core.table import TableScan
+ from spiral.core.table.manifests import FragmentManifest
+ from spiral.tables.debug.metrics import _format_bytes
+
+
+ def display_manifests(scan: TableScan):
+     """Display all manifests in a scan."""
+     if len(scan.table_ids()) != 1:
+         raise NotImplementedError("Multiple table scans are not supported.")
+     table_id = scan.table_ids()[0]
+
+     key_space_manifest: FragmentManifest = scan.key_space_scan(table_id).manifest
+     _table_of_fragments(
+         key_space_manifest,
+         title="Key Space manifest",
+     )
+
+     for column_group in scan.column_groups():
+         column_group_manifest: FragmentManifest = scan.column_group_scan(column_group).manifest
+         _table_of_fragments(
+             column_group_manifest,
+             title=f"Column Group manifest for {str(column_group)}",
+         )
+
+
+ def _table_of_fragments(manifest: FragmentManifest, title: str):
+     """Display fragments in a formatted table."""
+     # Calculate summary statistics
+     total_size = sum(fragment.size_bytes for fragment in manifest)
+     total_metadata_size = sum(len(fragment.format_metadata or b"") for fragment in manifest)
+     fragment_count = len(manifest)
+     avg_size = total_size / fragment_count if fragment_count > 0 else 0
+
+     # Print title and summary
+     print(f"\n\n{title}")
+     print(
+         f"{fragment_count} fragments, "
+         f"total: {_format_bytes(total_size)}, "
+         f"avg: {_format_bytes(int(avg_size))}, "
+         f"metadata: {_format_bytes(total_metadata_size)}"
+     )
+     print("=" * 120)
+
+     # Print header
+     print(
+         f"{'ID':<30} {'Size (Metadata)':<20} {'Format':<10} {'Key Span':<10} "
+         f"{'Level':<5} {'Committed At':<20} {'Compacted At':<20}"
+     )
+     print("=" * 120)
+
+     # Print each fragment
+     for fragment in manifest:
+         committed_str = str(datetime_.from_timestamp_micros(fragment.committed_at)) if fragment.committed_at else "N/A"
+         compacted_str = str(datetime_.from_timestamp_micros(fragment.compacted_at)) if fragment.compacted_at else "N/A"
+
+         size_with_metadata = (
+             f"{_format_bytes(fragment.size_bytes)} ({_format_bytes(len(fragment.format_metadata or b''))})"
+         )
+         key_span = f"{fragment.key_span.begin}..{fragment.key_span.end}"
+
+         print(
+             f"{fragment.id:<30} "
+             f"{size_with_metadata:<20} "
+             f"{str(fragment.format):<10} "
+             f"{key_span:<10} "
+             f"{str(fragment.level):<5} "
+             f"{committed_str:<20} "
+             f"{compacted_str:<20}"
+         )
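The helper above is wired up as `Scan._dump_manifests()` in the new spiral/tables/scan.py (shown further below). A minimal sketch of invoking it, assuming `table` is a spiral.tables.table.Table obtained from a Spiral client:

    # Hedged sketch; how `table` is obtained depends on the client setup.
    scan = table.scan()
    scan._dump_manifests()  # prints the key-space and column-group fragment tables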
spiral/tables/debug/metrics.py ADDED
@@ -0,0 +1,56 @@
+ from typing import Any
+
+
+ def display_metrics(metrics: dict[str, Any]) -> None:
+     """Display metrics in a formatted table."""
+     print(
+         f"{'Metric':<40} {'Type':<10} {'Count':<8} {'Avg':<12} {'Min':<12} "
+         f"{'Max':<12} {'P95':<12} {'P99':<12} {'StdDev':<12}"
+     )
+     print("=" * 140)
+
+     for metric_name, data in sorted(metrics.items()):
+         metric_type = data["type"]
+         count = data["count"]
+         avg = _format_value(data["avg"], metric_type, metric_name)
+         min_val = _format_value(data["min"], metric_type, metric_name)
+         max_val = _format_value(data["max"], metric_type, metric_name)
+         p95 = _format_value(data["p95"], metric_type, metric_name)
+         p99 = _format_value(data["p99"], metric_type, metric_name)
+         stddev = _format_value(data["stddev"], metric_type, metric_name)
+
+         print(
+             f"{metric_name:<40} {metric_type:<10} {count:<8} {avg:<12} {min_val:<12} "
+             f"{max_val:<12} {p95:<12} {p99:<12} {stddev:<12}"
+         )
+
+
+ def _format_duration(nanoseconds: float) -> str:
+     """Convert nanoseconds to human-readable duration."""
+     if nanoseconds >= 1_000_000_000:
+         return f"{nanoseconds / 1_000_000_000:.2f}s"
+     elif nanoseconds >= 1_000_000:
+         return f"{nanoseconds / 1_000_000:.2f}ms"
+     elif nanoseconds >= 1_000:
+         return f"{nanoseconds / 1_000:.2f}μs"
+     else:
+         return f"{nanoseconds:.0f}ns"
+
+
+ def _format_bytes(bytes_value: float) -> str:
+     """Convert bytes to human-readable size."""
+     for unit in ["B", "KB", "MB", "GB"]:
+         if bytes_value < 1024:
+             return f"{bytes_value:.1f}{unit}"
+         bytes_value /= 1024
+     return f"{bytes_value:.1f}TB"
+
+
+ def _format_value(value: float, metric_type: str, metric_name: str) -> str:
+     """Format a value based on metric type and name."""
+     if metric_type == "timer" or "duration" in metric_name:
+         return _format_duration(value)
+     elif "bytes" in metric_name:
+         return _format_bytes(value)
+     else:
+         return f"{value:,.0f}"
spiral/{debug.py → tables/debug/scan.py} RENAMED
@@ -1,8 +1,8 @@
  from datetime import datetime

- from spiral.core.core import TableScan
- from spiral.core.manifests import FragmentFile, FragmentManifest
- from spiral.core.spec import Key, KeyRange
+ from spiral.core.table import TableScan
+ from spiral.core.table.manifests import FragmentFile, FragmentManifest
+ from spiral.core.table.spec import Key
  from spiral.types_ import Timestamp


@@ -30,7 +30,7 @@ def show_scan(scan: TableScan):
      for i in range(len(cg_manifest)):
          fragment_file = cg_manifest[i]
          key_points.add(fragment_file.key_extent.min)
-         key_points.add(fragment_file.key_extent.max)
+         key_points.add(fragment_file.key_extent.max)

      # Make sure split points exist in all key points.
      for s in splits[:-1]:  # Don't take the last end.
@@ -44,9 +44,7 @@ def show_scan(scan: TableScan):
          show_manifest(cg_scan.manifest, scope=".".join(cg.path[1:]), key_points=key_points, splits=splits)


- def show_manifest(
-     manifest: FragmentManifest, scope: str = None, key_points: list[Key] = None, splits: list[KeyRange] = None
- ):
+ def show_manifest(manifest: FragmentManifest, scope: str = None, key_points: list[Key] = None, splits: list = None):
      try:
          import matplotlib.patches as patches
          import matplotlib.pyplot as plt
@@ -157,10 +155,9 @@ def _get_fragment_legend(manifest_file: FragmentFile):
          f"key_min: {manifest_file.key_extent.min}",
          f"key_max: {manifest_file.key_extent.max}",
          f"format: {manifest_file.format}",
-         f"level: {manifest_file.fs_level}",
+         f"level: {manifest_file.level}",
          f"committed_at: {_format_timestamp(manifest_file.committed_at)}",
          f"compacted_at: {_format_timestamp(manifest_file.compacted_at)}",
-         f"fs_id: {manifest_file.fs_id}",
          f"ks_id: {manifest_file.ks_id}",
      ]
  )
spiral/tables/maintenance.py ADDED
@@ -0,0 +1,12 @@
+ from spiral.core.table import TableMaintenance
+
+
+ class Maintenance:
+     """Spiral table maintenance."""
+
+     def __init__(self, maintenance: TableMaintenance):
+         self._maintenance = maintenance
+
+     def flush_wal(self):
+         """Flush the write-ahead log."""
+         self._maintenance.flush_wal()
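`Maintenance` is reached through `Table.maintenance()`, added in spiral/tables/table.py below; a hedged usage sketch:

    # Assumes `table` is a spiral.tables.table.Table instance.
    table.maintenance().flush_wal()  # flush the write-ahead log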
spiral/tables/scan.py ADDED
@@ -0,0 +1,193 @@
+ from collections.abc import Iterator
+ from typing import TYPE_CHECKING, Any
+
+ import pyarrow as pa
+ from datasets import DatasetInfo, Features
+
+ from spiral.core.table import KeyRange, TableScan
+ from spiral.core.table.spec import Schema
+ from spiral.settings import CI, DEV
+
+ if TYPE_CHECKING:
+     import dask.dataframe as dd
+     import pandas as pd
+     import polars as pl
+     from datasets import iterable_dataset
+
+
+ class Scan:
+     """Scan object."""
+
+     def __init__(
+         self,
+         scan: TableScan,
+     ):
+         # NOTE(ngates): this API is a little weird. e.g. if the query doesn't define an asof, it is resolved
+         # when we wrap it into a core.Scan. Should we expose a Query object in the Python API that's reusable
+         # and will re-resolve the asof? Or should we just expose a scan that fixes the asof at construction time?
+         self._scan = scan
+
+     @property
+     def metrics(self) -> dict[str, Any]:
+         """Returns metrics about the scan."""
+         return self._scan.metrics()
+
+     @property
+     def schema(self) -> Schema:
+         """Returns the schema of the scan."""
+         return self._scan.schema()
+
+     def is_empty(self) -> bool:
+         """Check if the Spiral is empty for the given key range.
+
+         **IMPORTANT**: False negatives are possible, but false positives are not,
+         i.e. is_empty can return False even though a scan would return zero rows.
+         """
+         return self._scan.is_empty()
+
+     def to_record_batches(
+         self,
+         key_table: pa.Table | pa.RecordBatchReader | None = None,
+         batch_size: int | None = None,
+         batch_readahead: int | None = None,
+     ) -> pa.RecordBatchReader:
+         """Read as a stream of RecordBatches.
+
+         Args:
+             key_table: a table of keys to "take" (including aux columns for cell-push-down).
+                 If None, the scan will be executed without a key table.
+             batch_size: the maximum number of rows per returned batch.
+                 IMPORTANT: This is currently only respected when the key_table is used. If the key table is a
+                 RecordBatchReader, the batch_size argument must be None, and the existing batching is respected.
+             batch_readahead: the number of batches to prefetch in the background.
+         """
+         if isinstance(key_table, pa.RecordBatchReader):
+             if batch_size is not None:
+                 raise ValueError(
+                     "batch_size must be None when key_table is a RecordBatchReader, the existing batching is respected."
+                 )
+         elif isinstance(key_table, pa.Table):
+             key_table = key_table.to_reader(max_chunksize=batch_size)
+
+         return self._scan.to_record_batches(key_table=key_table, batch_readahead=batch_readahead)
+
+     def to_table(
+         self,
+         key_table: pa.Table | pa.RecordBatchReader | None = None,
+     ) -> pa.Table:
+         """Read into a single PyArrow Table.
+
+         Args:
+             key_table: a table of keys to "take" (including aux columns for cell-push-down).
+                 If None, the scan will be executed without a key table.
+         """
+         # NOTE: Evaluates fully on the Rust side, which improves debuggability.
+         if DEV and not CI and key_table is None:
+             rb = self._scan.to_record_batch()
+             return pa.Table.from_batches([rb])
+
+         return self.to_record_batches(key_table=key_table).read_all()
+
+     def to_dask(self) -> "dd.DataFrame":
+         """Read into a Dask DataFrame.
+
+         Requires the `dask` package to be installed.
+         """
+         import dask.dataframe as dd
+         import pandas as pd
+
+         def _read_key_range(key_range: KeyRange) -> pd.DataFrame:
+             # TODO(ngates): we need a way to preserve the existing asofs? Should we copy CoreScan instead of Query?
+             raise NotImplementedError()
+
+         # Fetch a set of partition ranges
+         return dd.from_map(_read_key_range, self._split())
+
+     def to_pandas(self) -> "pd.DataFrame":
+         """Read into a Pandas DataFrame.
+
+         Requires the `pandas` package to be installed.
+         """
+         return self.to_table().to_pandas()
+
+     def to_polars(self) -> "pl.DataFrame":
+         """Read into a Polars DataFrame.
+
+         Requires the `polars` package to be installed.
+         """
+         import polars as pl
+
+         # TODO(marko): This should support lazy dataframe.
+         return pl.from_arrow(self.to_record_batches())
+
+     def to_pytorch(
+         self,
+         batch_readahead: int | None = None,
+         shuffle_batch_size: int | None = None,
+         shuffle_pool_num_rows: int | None = None,
+     ) -> "iterable_dataset.IterableDataset":
+         """Returns an iterable dataset that can be used to build a PyTorch DataLoader.
+
+         Args:
+             batch_readahead: Number of batches to prefetch in the background.
+             shuffle_batch_size: read granularity of number of rows for a shuffled scan. If left as
+                 None along with shuffle_pool_num_rows=None, shuffling is disabled.
+             shuffle_pool_num_rows: Pool size for shuffling batches.
+         """
+         from datasets.iterable_dataset import ArrowExamplesIterable, IterableDataset
+
+         def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
+             if shuffle_batch_size is None and shuffle_pool_num_rows is None:
+                 stream = self.to_record_batches(
+                     batch_readahead=batch_readahead,
+                 )
+             else:
+                 stream = self._scan.to_shuffled_record_batches(
+                     batch_readahead, shuffle_batch_size, shuffle_pool_num_rows
+                 )
+
+             # This key is unused when training with IterableDataset.
+             # Default implementation returns shard id, e.g. parquet row group id.
+             for i, rb in enumerate(stream):
+                 yield i, pa.Table.from_batches([rb], stream.schema)
+
+         def _hf_compatible_schema(schema: pa.Schema) -> pa.Schema:
+             """
+             Replace string-view columns in the schema with strings. We use this converted schema
+             as Features in the returned Dataset.
+             Remove this method once we have https://github.com/huggingface/datasets/pull/7718
+             """
+             new_fields = [
+                 pa.field(field.name, pa.string(), nullable=field.nullable, metadata=field.metadata)
+                 if field.type == pa.string_view()
+                 else field
+                 for field in schema
+             ]
+             return pa.schema(new_fields)
+
+         # NOTE: generate_tables_fn type annotations are wrong, return type must be an iterable of tuples.
+         ex_iterable = ArrowExamplesIterable(generate_tables_fn=_generate_tables, kwargs={})
+         info = DatasetInfo(features=Features.from_arrow_schema(_hf_compatible_schema(self.schema.to_arrow())))
+         return IterableDataset(ex_iterable=ex_iterable, info=info)
+
+     def _split(self) -> list[KeyRange]:
+         # Splits the scan into a set of key ranges.
+         return self._scan.split()
+
+     def _debug(self):
+         # Visualizes the scan, mainly for debugging purposes.
+         from spiral.tables.debug.scan import show_scan
+
+         show_scan(self._scan)
+
+     def _dump_manifests(self):
+         # Print manifests in a human-readable format.
+         from spiral.tables.debug.manifests import display_manifests
+
+         display_manifests(self._scan)
+
+     def _dump_metrics(self):
+         # Print metrics in a human-readable format.
+         from spiral.tables.debug.metrics import display_metrics
+
+         display_metrics(self.metrics)
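A hedged sketch of the new Scan surface, assuming `scan` comes from `Table.scan()` (the key column name is hypothetical):

    import pyarrow as pa

    # Stream batches; batch_size is only honored when a key table is given,
    # and must be None when the key table is already a RecordBatchReader.
    keys = pa.table({"key": [1, 2, 3]})
    for batch in scan.to_record_batches(key_table=keys, batch_size=2):
        ...

    # Eager materialization.
    df = scan.to_pandas()

    # Shuffled streaming for PyTorch; leaving both shuffle knobs as None disables shuffling.
    ds = scan.to_pytorch(shuffle_batch_size=1024, shuffle_pool_num_rows=65_536)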
spiral/tables/snapshot.py ADDED
@@ -0,0 +1,78 @@
+ from typing import TYPE_CHECKING
+
+ from spiral.core.table import TableSnapshot
+ from spiral.expressions import ExprLike
+ from spiral.tables.scan import Scan
+ from spiral.types_ import Timestamp
+
+ if TYPE_CHECKING:
+     import duckdb
+     import polars as pl
+     import pyarrow.dataset
+
+     from spiral.tables import Tables
+     from spiral.tables.table import Table
+
+
+ class Snapshot:
+     """Spiral table snapshot.
+
+     A snapshot represents a point-in-time view of a table.
+     """
+
+     def __init__(self, tables: "Tables", snapshot: TableSnapshot):
+         self._tables = tables
+         self._snapshot = snapshot
+
+     @property
+     def asof(self) -> Timestamp:
+         """Returns the asof timestamp of the snapshot."""
+         return self._snapshot.asof
+
+     @property
+     def client(self) -> "Tables":
+         """Returns the client used by the snapshot."""
+         return self._tables
+
+     @property
+     def table(self) -> "Table":
+         """Returns the table associated with the snapshot."""
+         from spiral.tables.table import Table
+
+         return Table(self._tables, self._snapshot.table)
+
+     def to_dataset(self) -> "pyarrow.dataset.Dataset":
+         """Returns a PyArrow Dataset representing the table."""
+         from .dataset import TableDataset
+
+         return TableDataset(self)
+
+     def to_polars(self) -> "pl.LazyFrame":
+         """Returns a Polars LazyFrame for the Spiral table."""
+         import polars as pl
+
+         return pl.scan_pyarrow_dataset(self.to_dataset())
+
+     def to_duckdb(self) -> "duckdb.DuckDBPyRelation":
+         """Returns a DuckDB relation for the Spiral table."""
+         import duckdb
+
+         return duckdb.from_arrow(self.to_dataset())
+
+     def scan(
+         self,
+         *projections: ExprLike,
+         where: ExprLike | None = None,
+         exclude_keys: bool = False,
+     ) -> Scan:
+         """Reads the snapshot. If projections are not provided, the entire table is read."""
+         if not projections:
+             # Use table as the default projection.
+             projections = [self._snapshot.table.__expr__]
+
+         return self._tables.scan(
+             *projections,
+             where=where,
+             asof=self._snapshot.asof,
+             exclude_keys=exclude_keys,
+         )
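A hedged sketch of the Snapshot integrations above, assuming `table` is a spiral.tables.table.Table:

    snap = table.snapshot()  # pins the asof timestamp; accepts datetime or int microseconds
    lf = snap.to_polars()    # Polars LazyFrame over the PyArrow dataset
    rel = snap.to_duckdb()   # DuckDB relation over the same dataset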
spiral/tables/table.py ADDED
@@ -0,0 +1,157 @@
+ from datetime import datetime
+ from typing import TYPE_CHECKING
+
+ from spiral.core.table import Table as CoreTable
+ from spiral.core.table.spec import Schema
+ from spiral.expressions.base import Expr, ExprLike
+ from spiral.settings import settings
+ from spiral.tables.maintenance import Maintenance
+ from spiral.tables.scan import Scan
+ from spiral.tables.snapshot import Snapshot
+ from spiral.tables.transaction import Transaction
+
+ if TYPE_CHECKING:
+     from spiral.tables import Tables
+
+
+ class Table(Expr):
+     """API for interacting with a SpiralDB Table.
+
+     Different catalog implementations should ultimately construct a Table object.
+     """
+
+     # TODO(marko): Make identifier required.
+     def __init__(self, tables: "Tables", table: CoreTable, *, identifier: str | None = None):
+         super().__init__(table.__expr__)
+
+         self._tables = tables
+         self._table = table
+         self._identifier = identifier
+         self._key_schema = self._table.key_schema
+         self._key_columns = set(self._key_schema.names)
+
+     @property
+     def client(self) -> "Tables":
+         """Returns the client used by the table."""
+         return self._tables
+
+     @property
+     def table_id(self) -> str:
+         return self._table.id
+
+     @property
+     def identifier(self) -> str:
+         """Returns the fully qualified identifier of the table."""
+         return self._identifier or self._table.id
+
+     @property
+     def dataset(self) -> str | None:
+         """Returns the dataset of the table."""
+         if self._identifier is None:
+             return None
+         _, dataset, _ = self._identifier.split(".")
+         return dataset
+
+     @property
+     def name(self) -> str | None:
+         """Returns the name of the table."""
+         if self._identifier is None:
+             return None
+         _, _, name = self._identifier.split(".")
+         return name
+
+     @property
+     def last_modified_at(self) -> int:
+         return self._table.get_wal(asof=None).last_modified_at
+
+     def __str__(self):
+         return self.identifier
+
+     def __repr__(self):
+         return f'Table("{self.identifier}")'
+
+     def __getitem__(self, item: str) -> Expr:
+         from spiral import expressions as se
+
+         if item in self._key_columns:
+             return se.key(name=item)
+
+         return super().__getitem__(item)
+
+     def select(self, *paths: str, exclude: list[str] = None) -> "Expr":
+         # Override an expression select in the root column group to split between keys and columns.
+         if exclude is not None:
+             if set(exclude) & self._key_columns:
+                 raise ValueError(
+                     "Cannot use 'exclude' arg with key columns. Use 'exclude_keys' and an explicit select of keys."
+                 )
+
+         key_paths = set(paths) & self._key_columns
+         other_paths = set(paths) - key_paths
+         if not key_paths:
+             return super().select(*paths, exclude=exclude)
+
+         from spiral import expressions as se
+
+         return se.merge(se.pack({key: se.key(key) for key in key_paths}), super().select(*other_paths, exclude=exclude))
+
+     @property
+     def key_schema(self) -> Schema:
+         """Returns the key schema of the table."""
+         return self._key_schema
+
+     @property
+     def schema(self) -> Schema:
+         """Returns the FULL schema of the table.
+
+         NOTE: This can be expensive for large tables.
+         """
+         return self._table.get_schema(asof=None)
+
+     def scan(
+         self,
+         *projections: ExprLike,
+         where: ExprLike | None = None,
+         asof: datetime | int | None = None,
+         exclude_keys: bool = False,
+     ) -> Scan:
+         """Reads the table. If projections are not provided, the entire table is read."""
+         if not projections:
+             projections = [self]
+
+         return self._tables.scan(*projections, where=where, asof=asof, exclude_keys=exclude_keys)
+
+     def write(
+         self,
+         expr: ExprLike,
+         *,
+         partition_size_bytes: int | None = None,
+     ) -> None:
+         """Write an expression to the table inside a single transaction.
+
+         :param expr: The expression to write. Must evaluate to a struct array.
+         :param partition_size_bytes: The maximum partition size in bytes.
+         """
+         with self.txn() as txn:
+             txn.write(
+                 expr,
+                 partition_size_bytes=partition_size_bytes,
+             )
+
+     def snapshot(self, asof: datetime | int | None = None) -> Snapshot:
+         """Returns a snapshot of the table at the given timestamp."""
+         if isinstance(asof, datetime):
+             asof = int(asof.timestamp() * 1_000_000)
+         return Snapshot(self._tables, self._table.get_snapshot(asof=asof))
+
+     def txn(self) -> Transaction:
+         """Begins a new transaction. The transaction must be committed for writes to become visible.
+
+         IMPORTANT: While a transaction can be used to atomically write data to the table,
+         the primary key values must be unique within the transaction.
+         """
+         return Transaction(self._tables._spiral.open_transaction(self._table, settings().file_format))
+
+     def maintenance(self) -> Maintenance:
+         """Access maintenance operations for a table."""
+         return Maintenance(self._tables._spiral.open_maintenance(self._table, settings().file_format))
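A hedged sketch of the write/read round trip, assuming `se.lift` accepts a PyArrow table as an ExprLike (column names are hypothetical):

    import pyarrow as pa

    # Single-transaction write; primary key values must be unique within the transaction.
    table.write(pa.table({"key": [1, 2], "value": ["a", "b"]}))

    # Read back at the latest snapshot; no projections means the whole table.
    result = table.scan().to_table()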
spiral/tables/transaction.py ADDED
@@ -0,0 +1,52 @@
+ from typing import TYPE_CHECKING
+
+ from spiral.core.table import TableTransaction
+
+ if TYPE_CHECKING:
+     from spiral.expressions.base import ExprLike
+
+
+ class Transaction:
+     """Spiral table transaction.
+
+     IMPORTANT: While a transaction can be used to atomically write data to the table,
+     the primary key values must be unique within the transaction.
+     """
+
+     def __init__(self, transaction: TableTransaction):
+         self._transaction = transaction
+
+     @property
+     def status(self) -> str:
+         """The status of the transaction."""
+         return self._transaction.status
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_value, traceback):
+         if exc_type is None:
+             self._transaction.commit()
+         else:
+             self._transaction.abort()
+
+     def write(self, expr: "ExprLike", *, partition_size_bytes: int | None = None):
+         """Write an expression to the table inside a single transaction.
+
+         :param expr: The expression to write. Must evaluate to a struct array.
+         :param partition_size_bytes: The maximum partition size in bytes.
+             If not provided, the default partition size is used.
+         """
+         from spiral import expressions as se
+
+         expr = se.lift(expr)
+
+         self._transaction.write(expr.__expr__, partition_size_bytes=partition_size_bytes)
+
+     def commit(self):
+         """Commit the transaction."""
+         self._transaction.commit()
+
+     def abort(self):
+         """Abort the transaction."""
+         self._transaction.abort()
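The context-manager behavior above commits on a clean exit and aborts if the block raises; a minimal sketch:

    with table.txn() as txn:
        txn.write(expr)  # `expr` is any ExprLike evaluating to a struct array
    # Reaching here without an exception means the transaction was committed.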