pyspiral 0.4.4__cp310-abi3-macosx_11_0_arm64.whl → 0.6.0__cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. {pyspiral-0.4.4.dist-info → pyspiral-0.6.0.dist-info}/METADATA +10 -5
  2. pyspiral-0.6.0.dist-info/RECORD +99 -0
  3. {pyspiral-0.4.4.dist-info → pyspiral-0.6.0.dist-info}/WHEEL +1 -1
  4. spiral/__init__.py +10 -3
  5. spiral/_lib.abi3.so +0 -0
  6. spiral/adbc.py +29 -11
  7. spiral/api/__init__.py +14 -0
  8. spiral/api/client.py +5 -1
  9. spiral/api/key_space_indexes.py +23 -0
  10. spiral/api/projects.py +17 -2
  11. spiral/api/text_indexes.py +56 -0
  12. spiral/api/types.py +2 -0
  13. spiral/api/workers.py +40 -0
  14. spiral/cli/__init__.py +15 -6
  15. spiral/cli/admin.py +2 -4
  16. spiral/cli/app.py +4 -2
  17. spiral/cli/fs.py +5 -6
  18. spiral/cli/iceberg.py +97 -0
  19. spiral/cli/key_spaces.py +68 -0
  20. spiral/cli/login.py +6 -7
  21. spiral/cli/orgs.py +7 -8
  22. spiral/cli/printer.py +3 -3
  23. spiral/cli/projects.py +5 -6
  24. spiral/cli/tables.py +131 -0
  25. spiral/cli/telemetry.py +3 -4
  26. spiral/cli/text.py +115 -0
  27. spiral/cli/types.py +3 -4
  28. spiral/cli/workloads.py +7 -8
  29. spiral/client.py +111 -8
  30. spiral/core/authn/__init__.pyi +27 -0
  31. spiral/core/client/__init__.pyi +135 -63
  32. spiral/core/table/__init__.pyi +36 -26
  33. spiral/core/table/metastore/__init__.pyi +0 -4
  34. spiral/core/table/spec/__init__.pyi +0 -2
  35. spiral/{tables/dataset.py → dataset.py} +13 -7
  36. spiral/{tables/debug → debug}/manifests.py +17 -6
  37. spiral/{tables/debug → debug}/scan.py +7 -7
  38. spiral/expressions/base.py +3 -3
  39. spiral/expressions/udf.py +1 -1
  40. spiral/{iceberg/client.py → iceberg.py} +1 -3
  41. spiral/key_space_index.py +44 -0
  42. spiral/project.py +171 -18
  43. spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +1668 -1110
  44. spiral/protogen/_/google/protobuf/__init__.py +2190 -0
  45. spiral/protogen/_/message_pool.py +3 -0
  46. spiral/protogen/_/py.typed +0 -0
  47. spiral/protogen/_/scandal/__init__.py +138 -126
  48. spiral/protogen/_/spfs/__init__.py +72 -0
  49. spiral/protogen/_/spql/__init__.py +61 -0
  50. spiral/protogen/_/substrait/__init__.py +5256 -2459
  51. spiral/protogen/_/substrait/extensions/__init__.py +103 -49
  52. spiral/{tables/scan.py → scan.py} +37 -44
  53. spiral/settings.py +14 -3
  54. spiral/snapshot.py +55 -0
  55. spiral/streaming_/__init__.py +3 -0
  56. spiral/streaming_/reader.py +117 -0
  57. spiral/streaming_/stream.py +146 -0
  58. spiral/substrait_.py +9 -9
  59. spiral/table.py +257 -0
  60. spiral/text_index.py +17 -0
  61. spiral/{tables/transaction.py → transaction.py} +11 -15
  62. pyspiral-0.4.4.dist-info/RECORD +0 -98
  63. spiral/cli/iceberg/__init__.py +0 -7
  64. spiral/cli/iceberg/namespaces.py +0 -47
  65. spiral/cli/iceberg/tables.py +0 -60
  66. spiral/cli/indexes/__init__.py +0 -19
  67. spiral/cli/tables/__init__.py +0 -121
  68. spiral/core/index/__init__.pyi +0 -15
  69. spiral/iceberg/__init__.py +0 -3
  70. spiral/indexes/__init__.py +0 -5
  71. spiral/indexes/client.py +0 -137
  72. spiral/indexes/index.py +0 -34
  73. spiral/indexes/scan.py +0 -22
  74. spiral/protogen/_/spiral/table/__init__.py +0 -22
  75. spiral/protogen/substrait/__init__.py +0 -3399
  76. spiral/protogen/substrait/extensions/__init__.py +0 -115
  77. spiral/tables/__init__.py +0 -12
  78. spiral/tables/client.py +0 -130
  79. spiral/tables/maintenance.py +0 -12
  80. spiral/tables/snapshot.py +0 -78
  81. spiral/tables/table.py +0 -145
  82. {pyspiral-0.4.4.dist-info → pyspiral-0.6.0.dist-info}/entry_points.txt +0 -0
  83. /spiral/{protogen/_/spiral → debug}/__init__.py +0 -0
  84. /spiral/{tables/debug → debug}/metrics.py +0 -0
  85. /spiral/{tables/debug → protogen/_/google}/__init__.py +0 -0
spiral/client.py CHANGED
@@ -1,24 +1,29 @@
1
+ from datetime import datetime, timedelta
1
2
  from typing import TYPE_CHECKING
2
3
 
3
4
  import jwt
5
+ import pyarrow as pa
4
6
 
5
7
  from spiral.api import SpiralAPI
6
8
  from spiral.api.projects import CreateProjectRequest, CreateProjectResponse
9
+ from spiral.core.client import Operations
7
10
  from spiral.core.client import Spiral as CoreSpiral
11
+ from spiral.datetime_ import timestamp_micros
12
+ from spiral.expressions import ExprLike
13
+ from spiral.scan import Scan
8
14
  from spiral.settings import Settings, settings
9
15
 
10
16
  if TYPE_CHECKING:
11
17
  from spiral.iceberg import Iceberg
18
+ from spiral.key_space_index import KeySpaceIndex
12
19
  from spiral.project import Project
20
+ from spiral.table import Table
21
+ from spiral.text_index import TextIndex
13
22
 
14
23
 
15
24
  class Spiral:
16
25
  def __init__(self, config: Settings | None = None):
17
26
  self._config = config or settings()
18
- self._api = self._config.api
19
- self._core = CoreSpiral(
20
- api_url=self._config.spiraldb.uri, spfs_url=self._config.spfs.uri, authn=self._config.authn
21
- )
22
27
  self._org = None
23
28
 
24
29
  @property
@@ -27,7 +32,11 @@ class Spiral:
27
32
 
28
33
  @property
29
34
  def api(self) -> SpiralAPI:
30
- return self._api
35
+ return self._config.api
36
+
37
+ @property
38
+ def _core(self) -> CoreSpiral:
39
+ return self._config.core
31
40
 
32
41
  @property
33
42
  def organization(self) -> str:
@@ -45,7 +54,7 @@ class Spiral:
45
54
  """List project IDs."""
46
55
  from .project import Project
47
56
 
48
- return [Project(self, id=p.id, name=p.name) for p in self.api.project.list()]
57
+ return [Project(self, project_id=p.id, name=p.name) for p in self.api.project.list()]
49
58
 
50
59
  def create_project(
51
60
  self,
@@ -64,7 +73,102 @@ class Spiral:
64
73
  from spiral.project import Project
65
74
 
66
75
  # We avoid an API call since we'd just be fetching a human-readable name. Seems a waste in most cases.
67
- return Project(self, id=project_id, name=project_id)
76
+ return Project(self, project_id=project_id, name=project_id)
77
+
78
+ def table(self, table_id: str) -> "Table":
79
+ """Open a table using an ID."""
80
+ from spiral.table import Table
81
+
82
+ return Table(self, self._core.table(table_id))
83
+
84
+ def text_index(self, index_id: str) -> "TextIndex":
85
+ """Open a text index using an ID."""
86
+ from spiral.text_index import TextIndex
87
+
88
+ return TextIndex(self._core.text_index(index_id))
89
+
90
+ def key_space_index(self, index_id: str) -> "KeySpaceIndex":
91
+ """Open a key space index using an ID."""
92
+ from spiral.key_space_index import KeySpaceIndex
93
+
94
+ return KeySpaceIndex(self._core.key_space_index(index_id))
95
+
96
+ def scan(
97
+ self,
98
+ *projections: ExprLike,
99
+ where: ExprLike | None = None,
100
+ asof: datetime | int | None = None,
101
+ exclude_keys: bool = False,
102
+ ) -> Scan:
103
+ """Starts a read transaction on the Spiral.
104
+
105
+ Args:
106
+ projections: a set of expressions that return struct arrays.
107
+ where: a query expression to apply to the data.
108
+ asof: only data written before the given timestamp will be returned, caveats around compaction.
109
+ exclude_keys: whether to exclude the key columns in the scan result, defaults to False.
110
+ Note that if a projection includes a key column, it will be included in the result.
111
+ """
112
+ from spiral import expressions as se
113
+
114
+ if isinstance(asof, datetime):
115
+ asof = timestamp_micros(asof)
116
+
117
+ # Combine all projections into a single struct.
118
+ projection = se.merge(*projections)
119
+ if where is not None:
120
+ where = se.lift(where)
121
+
122
+ return Scan(
123
+ self._core.scan(
124
+ projection.__expr__,
125
+ filter=where.__expr__ if where else None,
126
+ asof=asof,
127
+ exclude_keys=exclude_keys,
128
+ ),
129
+ )
130
+
131
+ # TODO(marko): This should be query, and search should be query + scan.
132
+ def search(
133
+ self,
134
+ top_k: int,
135
+ *rank_by: ExprLike,
136
+ filters: ExprLike | None = None,
137
+ freshness_window: timedelta | None = None,
138
+ ) -> pa.RecordBatchReader:
139
+ """Queries the index with the given rank by and filters clauses. Returns a stream of scored keys.
140
+
141
+ Args:
142
+ top_k: The number of top results to return.
143
+ rank_by: Rank by expressions are combined for scoring.
144
+ See `se.text.find` and `se.text.boost` for scoring expressions.
145
+ filters: The `filters` expression is used to filter the results.
146
+ It must return a boolean value and use only conjunctions (ANDs). Expressions in filters
147
+ statement are considered either a `must` or `must_not` clause in search terminology.
148
+ freshness_window: If provided, the index will not be refreshed if its freshness does not exceed this window.
149
+ """
150
+ from spiral import expressions as se
151
+
152
+ if not rank_by:
153
+ raise ValueError("At least one rank by expression is required.")
154
+ rank_by = se.or_(*rank_by)
155
+ if filters is not None:
156
+ filters = se.lift(filters)
157
+
158
+ if freshness_window is None:
159
+ freshness_window = timedelta(seconds=0)
160
+ freshness_window_s = int(freshness_window.total_seconds())
161
+
162
+ return self._core.search(
163
+ top_k=top_k,
164
+ rank_by=rank_by.__expr__,
165
+ filters=filters.__expr__ if filters else None,
166
+ freshness_window_s=freshness_window_s,
167
+ )
168
+
169
+ def _ops(self) -> Operations:
170
+ """Access maintenance operations."""
171
+ return self._core._ops(format=settings().file_format)
68
172
 
69
173
  @property
70
174
  def iceberg(self) -> "Iceberg":
@@ -72,7 +176,6 @@ class Spiral:
72
176
  Apache Iceberg is a powerful open-source table format designed for high-performance data lakes.
73
177
  Iceberg brings reliability, scalability, and advanced features like time travel, schema evolution,
74
178
  and ACID transactions to your warehouse.
75
-
76
179
  """
77
180
  from spiral.iceberg import Iceberg
78
181
 
@@ -0,0 +1,27 @@
1
+ from spiral.api.types import OrgId
2
+
3
+ class Token:
4
+ def __init__(self, value: str): ...
5
+ def expose_secret(self) -> str: ...
6
+
7
+ class Authn:
8
+ @staticmethod
9
+ def from_token(token: Token) -> Authn: ...
10
+ @staticmethod
11
+ def from_fallback() -> Authn: ...
12
+ @staticmethod
13
+ def from_device() -> Authn: ...
14
+ def token(self) -> Token | None: ...
15
+
16
+ class DeviceCodeAuth:
17
+ @staticmethod
18
+ def default() -> DeviceCodeAuth:
19
+ """Return the static device code instance."""
20
+ ...
21
+ def authenticate(self, force: bool = False, org_id: OrgId | None = None) -> Token:
22
+ """Authenticate using device code flow."""
23
+ ...
24
+
25
+ def logout(self) -> None:
26
+ """Logout from the device authentication session."""
27
+ ...
@@ -1,34 +1,11 @@
1
- from spiral.api.types import DatasetName, IndexName, OrgId, ProjectId, RootUri, TableName
2
- from spiral.core.index import SearchScan, TextIndex
3
- from spiral.core.table import Table, TableMaintenance, TableScan, TableSnapshot, TableTransaction
4
- from spiral.core.table.spec import Schema
5
- from spiral.expressions import Expr
6
-
7
- class Token:
8
- def __init__(self, value: str): ...
9
- def expose_secret(self) -> str: ...
10
-
11
- class Authn:
12
- @staticmethod
13
- def from_token(token: Token) -> Authn: ...
14
- @staticmethod
15
- def from_fallback() -> Authn: ...
16
- @staticmethod
17
- def from_device() -> Authn: ...
18
- def token(self) -> Token | None: ...
19
-
20
- class DeviceCodeAuth:
21
- @staticmethod
22
- def default() -> DeviceCodeAuth:
23
- """Return the static device code instance."""
24
- ...
25
- def authenticate(self, force: bool = False, org_id: OrgId | None = None) -> Token:
26
- """Authenticate using device code flow."""
27
- ...
1
+ from typing import Any, Literal
28
2
 
29
- def logout(self) -> None:
30
- """Logout from the device authentication session."""
31
- ...
3
+ import pyarrow as pa
4
+ from spiral.api.types import DatasetName, IndexName, ProjectId, RootUri, TableName
5
+ from spiral.core.authn import Authn
6
+ from spiral.core.table import ColumnGroupState, KeyRange, KeySpaceState, Scan, Snapshot, Table, Transaction
7
+ from spiral.core.table.spec import ColumnGroup, Schema
8
+ from spiral.expressions import Expr
32
9
 
33
10
  class Spiral:
34
11
  """A client for Spiral database"""
@@ -43,6 +20,39 @@ class Spiral:
43
20
  def authn(self) -> Authn:
44
21
  """Get the current authentication context."""
45
22
  ...
23
+
24
+ def scan(
25
+ self,
26
+ projection: Expr,
27
+ filter: Expr | None = None,
28
+ asof: int | None = None,
29
+ exclude_keys: bool = False,
30
+ ) -> Scan:
31
+ """Construct a table scan."""
32
+ ...
33
+
34
+ def transaction(self, table: Table, format: str | None = None) -> Transaction:
35
+ """Begin a table transaction."""
36
+ ...
37
+
38
+ def search(
39
+ self,
40
+ top_k: int,
41
+ rank_by: Expr,
42
+ *,
43
+ filters: Expr | None = None,
44
+ freshness_window_s: int | None = None,
45
+ ) -> pa.RecordBatchReader:
46
+ """Search an index.
47
+
48
+ Searching an index returns a stream of record batches that match table's key schema + float score column.
49
+ """
50
+ ...
51
+
52
+ def table(self, table_id: str) -> Table:
53
+ """Get a table."""
54
+ ...
55
+
46
56
  def create_table(
47
57
  self,
48
58
  project_id: ProjectId,
@@ -56,63 +66,125 @@ class Spiral:
56
66
  """Create a new table in the specified project."""
57
67
  ...
58
68
 
59
- def get_table(self, table_id: str) -> Table:
60
- """Get and open table."""
61
-
62
- def open_table(self, table_id: str, key_schema: Schema, root_uri: RootUri) -> Table:
63
- """Open a table. This does not make any network calls."""
69
+ def text_index(self, index_id: str) -> TextIndex:
70
+ """Get a text index."""
64
71
  ...
65
72
 
66
- def open_table_scan(
73
+ def create_text_index(
67
74
  self,
75
+ project_id: ProjectId,
76
+ name: IndexName,
68
77
  projection: Expr,
69
78
  filter: Expr | None = None,
70
- asof: int | None = None,
71
- exclude_keys: bool = False,
72
- ) -> TableScan:
73
- """Construct a table scan."""
79
+ *,
80
+ root_uri: RootUri | None = None,
81
+ exist_ok: bool = False,
82
+ ) -> TextIndex:
83
+ """Create a new index in the specified project."""
74
84
  ...
75
85
 
76
- def open_transaction(self, table: Table, format: str | None = None) -> TableTransaction:
77
- """Being transaction."""
86
+ def key_space_index(self, index_id: str) -> KeySpaceIndex:
87
+ """Get a key space index."""
78
88
  ...
79
89
 
80
- def open_maintenance(self, table: Table, format: str | None = None) -> TableMaintenance:
81
- """Access maintenance operations for a table."""
82
- ...
83
- def create_text_index(
90
+ def create_key_space_index(
84
91
  self,
85
92
  project_id: ProjectId,
86
93
  name: IndexName,
94
+ granularity: int,
87
95
  projection: Expr,
88
96
  filter: Expr | None = None,
89
97
  *,
90
98
  root_uri: RootUri | None = None,
91
99
  exist_ok: bool = False,
92
- ) -> TextIndex:
93
- """Create a new index in the specified project."""
100
+ ) -> KeySpaceIndex:
101
+ """Create a new key space index in the specified project."""
94
102
  ...
95
103
 
96
- def get_text_index(self, index_id: str) -> TextIndex:
97
- """Get a text-based index."""
104
+ def _ops(self, *, format: str | None = None) -> Operations:
105
+ """Access maintenance operations.
106
+
107
+ IMPORTANT: This API is internal and is currently exposed for development & testing.
108
+ Maintenance operations are run by SpiralDB.
109
+ """
98
110
  ...
99
111
 
100
- def open_search_scan(
112
+ class TextIndex:
113
+ id: str
114
+
115
+ class KeySpaceIndex:
116
+ id: str
117
+ table_id: str
118
+ granularity: int
119
+ projection: Expr
120
+ filter: Expr
121
+ asof: int
122
+
123
+ class Shard:
124
+ key_range: KeyRange
125
+ cardinality: int
126
+
127
+ class Operations:
128
+ def flush_wal(self, table: Table) -> None:
129
+ """
130
+ Flush the write-ahead log of the table.
131
+ """
132
+ ...
133
+ def compact_key_space(
101
134
  self,
102
- rank_by: Expr,
103
- top_k: int,
104
- # NOTE(marko): Required for now.
105
- freshness_window_s: int,
106
135
  *,
107
- filter: Expr | None = None,
108
- ) -> SearchScan:
109
- """Query an index."""
136
+ table: Table,
137
+ mode: Literal["plan", "read", "write"] | None = None,
138
+ partition_bytes_min: int | None = None,
139
+ ):
140
+ """
141
+ Compact the key space of the table.
142
+ """
110
143
  ...
111
-
112
- def _sync_snapshot(self, index_id: str, snapshot: TableSnapshot) -> None:
144
+ def compact_column_group(
145
+ self,
146
+ table: Table,
147
+ column_group: ColumnGroup,
148
+ *,
149
+ mode: Literal["plan", "read", "write"] | None = None,
150
+ partition_bytes_min: int | None = None,
151
+ ):
113
152
  """
114
- Synchronize an index with a table snapshot.
115
-
116
- IMPORTANT: This is only exposed for testing purposes and should not be used.
153
+ Compact a column group in the table.
154
+ """
155
+ ...
156
+ def update_text_index(self, index: TextIndex, snapshot: Snapshot) -> None:
157
+ """
158
+ Index table changes up to the given snapshot.
159
+ """
160
+ ...
161
+ def update_key_space_index(self, index: KeySpaceIndex, snapshot: Snapshot) -> None:
162
+ """
163
+ Index table changes up to the given snapshot.
164
+ """
165
+ ...
166
+ def key_space_state(self, table: Table, *, asof: int | None = None) -> KeySpaceState:
167
+ """
168
+ The key space state for the table.
169
+ """
170
+ ...
171
+ def column_group_state(
172
+ self, table: Table, key_space_state: KeySpaceState, column_group: ColumnGroup, *, asof: int | None = None
173
+ ) -> ColumnGroupState:
174
+ """
175
+ The state of the column group of the table.
176
+ """
177
+ ...
178
+ def column_groups_states(
179
+ self, table: Table, key_space_state: KeySpaceState, *, asof: int | None = None
180
+ ) -> list[ColumnGroupState]:
181
+ """
182
+ The state of each column group of the table.
183
+ """
184
+ ...
185
+ def compute_shards(self, index: KeySpaceIndex) -> list[Shard]:
186
+ """
187
+ Compute the scan shards from a key space index.
117
188
  """
118
189
  ...
190
+ def metrics(self) -> dict[str, Any]: ...
@@ -1,4 +1,4 @@
1
- from typing import Any, Literal
1
+ from typing import Any
2
2
 
3
3
  import pyarrow as pa
4
4
  from spiral.expressions import Expr
@@ -42,22 +42,24 @@ class Table:
42
42
 
43
43
  def get_wal(self, *, asof: int | None) -> WriteAheadLog: ...
44
44
  def get_schema(self, *, asof: int | None) -> Schema: ...
45
- def get_snapshot(self, *, asof: int | None) -> TableSnapshot: ...
45
+ def get_snapshot(self, *, asof: int | None) -> Snapshot: ...
46
46
 
47
- class TableSnapshot:
47
+ class Snapshot:
48
48
  """A snapshot of a table at a specific point in time."""
49
49
 
50
50
  asof: int
51
51
  table: Table
52
52
  wal: WriteAheadLog
53
53
 
54
- class TableScan:
54
+ class Scan:
55
55
  def key_schema(self) -> Schema: ...
56
56
  def schema(self) -> Schema: ...
57
57
  def is_empty(self) -> bool: ...
58
- def split(self) -> list[KeyRange]: ...
58
+ def splits(self) -> list[KeyRange]: ...
59
59
  def table_ids(self) -> list[str]: ...
60
60
  def column_groups(self) -> list[ColumnGroup]: ...
61
+ def column_group_state(self, column_group: ColumnGroup) -> ColumnGroupState: ...
62
+ def key_space_state(self, table_id: str) -> KeySpaceState: ...
61
63
  def to_record_batches(
62
64
  self,
63
65
  key_table: pa.Table | pa.RecordBatch | None = None,
@@ -65,25 +67,29 @@ class TableScan:
65
67
  ) -> pa.RecordBatchReader: ...
66
68
  def to_shuffled_record_batches(
67
69
  self,
70
+ strategy: ShuffleStrategy | None = None,
68
71
  batch_readahead: int | None = None,
69
- shuffle_buffer_size: int | None = None,
70
- shuffle_pool_num_rows: int | None = None,
71
72
  ) -> pa.RecordBatchReader: ...
72
- def column_group_scan(self, column_group: ColumnGroup) -> ColumnGroupScan: ...
73
- def key_space_scan(self, table_id: str) -> KeySpaceScan: ...
74
73
  def metrics(self) -> dict[str, Any]: ...
74
+ def _prepare_shard(
75
+ self,
76
+ output_path: str,
77
+ key_range: KeyRange,
78
+ expected_cardinality: int | None = None,
79
+ ) -> None: ...
75
80
 
76
- class KeySpaceScan:
81
+ class KeySpaceState:
77
82
  manifest: FragmentManifest
78
83
 
79
84
  def key_schema(self) -> Schema: ...
80
85
 
81
- class ColumnGroupScan:
86
+ class ColumnGroupState:
82
87
  manifest: FragmentManifest
88
+ column_group: ColumnGroup
83
89
 
84
90
  def schema(self) -> Schema: ...
85
91
 
86
- class TableTransaction:
92
+ class Transaction:
87
93
  status: str
88
94
 
89
95
  def write(self, expr: Expr, *, partition_size_bytes: int | None = None): ...
@@ -91,19 +97,23 @@ class TableTransaction:
91
97
  def abort(self): ...
92
98
  def metrics(self) -> dict[str, Any]: ...
93
99
 
94
- class TableMaintenance:
95
- def flush_wal(self): ...
96
- def compact_key_space(
97
- self,
98
- *,
99
- mode: Literal["plan", "read", "write"] | None = None,
100
- partition_bytes_min: int | None = None,
101
- ): ...
102
- def compact_column_group(
100
+ class ShuffleStrategy:
101
+ # Results are buffered in a pool of `buffer_size` rows and shuffled again.
102
+ shuffle_buffer_size: int
103
+
104
+ # All randomness is derived from this seed. If None, a random seed is generated from the OS.
105
+ seed: int | None
106
+
107
+ # `approximate_batch_size` controls the maximum approximate size of each shard. Shards that
108
+ # are larger than this size are further split assuming uniform distribution of keys. Note
109
+ # that this is a best-effort and can be widely off. The purpose of this is to improve
110
+ # shuffling, rather than to support sharding. If not present, splits derived from the table
111
+ # are used in the attempt to minimize wasted reads.
112
+ approximate_buffer_size: int | None
113
+
114
+ def __init__(
103
115
  self,
104
- column_group: ColumnGroup,
105
- *,
106
- mode: Literal["plan", "read", "write"] | None = None,
107
- partition_bytes_min: int | None = None,
116
+ seed: int | None = None,
117
+ shard_size: int | None = None,
118
+ buffer_size: int | None = None,
108
119
  ): ...
109
- def metrics(self) -> dict[str, Any]: ...
@@ -56,7 +56,3 @@ class PyMetastore:
56
56
  authn: Authn,
57
57
  ) -> PyMetastore:
58
58
  """Construct a PyMetastore backed by an HTTP metastore service."""
59
-
60
- @staticmethod
61
- def test(table_id: str, root_uri: str, key_schema: Schema) -> PyMetastore:
62
- """Construct a PyMetastore backed by an in-memory mock metastore service."""
@@ -101,8 +101,6 @@ class FragmentLevel:
101
101
  ...
102
102
 
103
103
  class Key:
104
- key: bytes
105
-
106
104
  def __init__(self, key: bytes): ...
107
105
  def __bytes__(self): ...
108
106
  def step(self) -> Key:
@@ -4,14 +4,15 @@ import pyarrow as pa
4
4
  import pyarrow.compute as pc
5
5
  import pyarrow.dataset as ds
6
6
 
7
- from spiral.tables import Scan, Snapshot
7
+ from spiral.scan import Scan
8
+ from spiral.snapshot import Snapshot
8
9
 
9
10
 
10
- class TableDataset(ds.Dataset):
11
+ class Dataset(ds.Dataset):
11
12
  def __init__(self, snapshot: Snapshot):
12
13
  self._snapshot = snapshot
13
14
  self._table = snapshot.table
14
- self._schema: pa.Schema = self._snapshot._snapshot.table.get_schema(asof=self._snapshot.asof).to_arrow()
15
+ self._schema: pa.Schema = self._snapshot.schema().to_arrow()
15
16
 
16
17
  # We don't actually initialize a Dataset, we just implement enough of the API
17
18
  # to fool both DuckDB and Polars.
@@ -43,7 +44,7 @@ class TableDataset(ds.Dataset):
43
44
  memory_pool,
44
45
  ).count_rows()
45
46
 
46
- def filter(self, expression: pc.Expression) -> "TableDataset":
47
+ def filter(self, expression: pc.Expression) -> "Dataset":
47
48
  raise NotImplementedError("filter not implemented")
48
49
 
49
50
  def get_fragments(self, filter: pc.Expression | None = None):
@@ -89,7 +90,7 @@ class TableDataset(ds.Dataset):
89
90
  def join_asof(self, right_dataset, on, by, tolerance, right_on=None, right_by=None):
90
91
  raise NotImplementedError("join_asof not implemented")
91
92
 
92
- def replace_schema(self, schema: pa.Schema) -> "TableDataset":
93
+ def replace_schema(self, schema: pa.Schema) -> "Dataset":
93
94
  raise NotImplementedError("replace_schema not implemented")
94
95
 
95
96
  def scanner(
@@ -112,13 +113,18 @@ class TableDataset(ds.Dataset):
112
113
  )
113
114
 
114
115
  scan = (
115
- self._snapshot.scan(
116
+ self._table.spiral.scan(
116
117
  {c: self._table[c] for c in columns},
117
118
  where=filter,
119
+ asof=self._snapshot.asof,
118
120
  exclude_keys=True,
119
121
  )
120
122
  if columns
121
- else self._snapshot.scan(where=filter)
123
+ else self._table.spiral.scan(
124
+ self._table,
125
+ where=filter,
126
+ asof=self._snapshot.asof,
127
+ )
122
128
  )
123
129
  self._last_scan = scan
124
130
 
@@ -1,23 +1,34 @@
1
+ from collections.abc import Iterable
2
+
1
3
  from spiral import datetime_
2
- from spiral.core.table import TableScan
4
+ from spiral.core.table import Scan
3
5
  from spiral.core.table.manifests import FragmentManifest
4
- from spiral.tables.debug.metrics import _format_bytes
6
+ from spiral.core.table.spec import ColumnGroup
7
+ from spiral.debug.metrics import _format_bytes
5
8
 
6
9
 
7
- def display_manifests(scan: TableScan):
10
+ def display_scan_manifests(scan: Scan):
8
11
  """Display all manifests in a scan."""
9
12
  if len(scan.table_ids()) != 1:
10
13
  raise NotImplementedError("Multiple table scans are not supported.")
11
14
  table_id = scan.table_ids()[0]
15
+ key_space_manifest = scan.key_space_state(table_id).manifest
16
+ column_group_manifests = (
17
+ (column_group, scan.column_group_state(column_group).manifest) for column_group in scan.column_groups()
18
+ )
19
+
20
+ display_manifests(key_space_manifest, column_group_manifests)
21
+
12
22
 
13
- key_space_manifest: FragmentManifest = scan.key_space_scan(table_id).manifest
23
+ def display_manifests(
24
+ key_space_manifest: FragmentManifest, column_group_manifests: Iterable[tuple[ColumnGroup, FragmentManifest]]
25
+ ):
14
26
  _table_of_fragments(
15
27
  key_space_manifest,
16
28
  title="Key Space manifest",
17
29
  )
18
30
 
19
- for column_group in scan.column_groups():
20
- column_group_manifest: FragmentManifest = scan.column_group_scan(column_group).manifest
31
+ for column_group, column_group_manifest in column_group_manifests:
21
32
  _table_of_fragments(
22
33
  column_group_manifest,
23
34
  title=f"Column Group manifest for {str(column_group)}",