pyspiral-0.4.4-cp310-abi3-macosx_11_0_arm64.whl → pyspiral-0.6.0-cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. {pyspiral-0.4.4.dist-info → pyspiral-0.6.0.dist-info}/METADATA +10 -5
  2. pyspiral-0.6.0.dist-info/RECORD +99 -0
  3. {pyspiral-0.4.4.dist-info → pyspiral-0.6.0.dist-info}/WHEEL +1 -1
  4. spiral/__init__.py +10 -3
  5. spiral/_lib.abi3.so +0 -0
  6. spiral/adbc.py +29 -11
  7. spiral/api/__init__.py +14 -0
  8. spiral/api/client.py +5 -1
  9. spiral/api/key_space_indexes.py +23 -0
  10. spiral/api/projects.py +17 -2
  11. spiral/api/text_indexes.py +56 -0
  12. spiral/api/types.py +2 -0
  13. spiral/api/workers.py +40 -0
  14. spiral/cli/__init__.py +15 -6
  15. spiral/cli/admin.py +2 -4
  16. spiral/cli/app.py +4 -2
  17. spiral/cli/fs.py +5 -6
  18. spiral/cli/iceberg.py +97 -0
  19. spiral/cli/key_spaces.py +68 -0
  20. spiral/cli/login.py +6 -7
  21. spiral/cli/orgs.py +7 -8
  22. spiral/cli/printer.py +3 -3
  23. spiral/cli/projects.py +5 -6
  24. spiral/cli/tables.py +131 -0
  25. spiral/cli/telemetry.py +3 -4
  26. spiral/cli/text.py +115 -0
  27. spiral/cli/types.py +3 -4
  28. spiral/cli/workloads.py +7 -8
  29. spiral/client.py +111 -8
  30. spiral/core/authn/__init__.pyi +27 -0
  31. spiral/core/client/__init__.pyi +135 -63
  32. spiral/core/table/__init__.pyi +36 -26
  33. spiral/core/table/metastore/__init__.pyi +0 -4
  34. spiral/core/table/spec/__init__.pyi +0 -2
  35. spiral/{tables/dataset.py → dataset.py} +13 -7
  36. spiral/{tables/debug → debug}/manifests.py +17 -6
  37. spiral/{tables/debug → debug}/scan.py +7 -7
  38. spiral/expressions/base.py +3 -3
  39. spiral/expressions/udf.py +1 -1
  40. spiral/{iceberg/client.py → iceberg.py} +1 -3
  41. spiral/key_space_index.py +44 -0
  42. spiral/project.py +171 -18
  43. spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +1668 -1110
  44. spiral/protogen/_/google/protobuf/__init__.py +2190 -0
  45. spiral/protogen/_/message_pool.py +3 -0
  46. spiral/protogen/_/py.typed +0 -0
  47. spiral/protogen/_/scandal/__init__.py +138 -126
  48. spiral/protogen/_/spfs/__init__.py +72 -0
  49. spiral/protogen/_/spql/__init__.py +61 -0
  50. spiral/protogen/_/substrait/__init__.py +5256 -2459
  51. spiral/protogen/_/substrait/extensions/__init__.py +103 -49
  52. spiral/{tables/scan.py → scan.py} +37 -44
  53. spiral/settings.py +14 -3
  54. spiral/snapshot.py +55 -0
  55. spiral/streaming_/__init__.py +3 -0
  56. spiral/streaming_/reader.py +117 -0
  57. spiral/streaming_/stream.py +146 -0
  58. spiral/substrait_.py +9 -9
  59. spiral/table.py +257 -0
  60. spiral/text_index.py +17 -0
  61. spiral/{tables/transaction.py → transaction.py} +11 -15
  62. pyspiral-0.4.4.dist-info/RECORD +0 -98
  63. spiral/cli/iceberg/__init__.py +0 -7
  64. spiral/cli/iceberg/namespaces.py +0 -47
  65. spiral/cli/iceberg/tables.py +0 -60
  66. spiral/cli/indexes/__init__.py +0 -19
  67. spiral/cli/tables/__init__.py +0 -121
  68. spiral/core/index/__init__.pyi +0 -15
  69. spiral/iceberg/__init__.py +0 -3
  70. spiral/indexes/__init__.py +0 -5
  71. spiral/indexes/client.py +0 -137
  72. spiral/indexes/index.py +0 -34
  73. spiral/indexes/scan.py +0 -22
  74. spiral/protogen/_/spiral/table/__init__.py +0 -22
  75. spiral/protogen/substrait/__init__.py +0 -3399
  76. spiral/protogen/substrait/extensions/__init__.py +0 -115
  77. spiral/tables/__init__.py +0 -12
  78. spiral/tables/client.py +0 -130
  79. spiral/tables/maintenance.py +0 -12
  80. spiral/tables/snapshot.py +0 -78
  81. spiral/tables/table.py +0 -145
  82. {pyspiral-0.4.4.dist-info → pyspiral-0.6.0.dist-info}/entry_points.txt +0 -0
  83. /spiral/{protogen/_/spiral → debug}/__init__.py +0 -0
  84. /spiral/{tables/debug → debug}/metrics.py +0 -0
  85. /spiral/{tables/debug → protogen/_/google}/__init__.py +0 -0
spiral/protogen/_/substrait/extensions/__init__.py CHANGED
@@ -1,115 +1,169 @@
 # Generated by the protocol buffer compiler. DO NOT EDIT!
 # sources: substrait/extensions/extensions.proto
-# plugin: python-betterproto
+# plugin: python-betterproto2
 # This file has been @generated

+__all__ = (
+    "AdvancedExtension",
+    "SimpleExtensionDeclaration",
+    "SimpleExtensionDeclarationExtensionFunction",
+    "SimpleExtensionDeclarationExtensionType",
+    "SimpleExtensionDeclarationExtensionTypeVariation",
+    "SimpleExtensionUri",
+)
+
 from dataclasses import dataclass
-from typing import List

-import betterproto
-import betterproto.lib.google.protobuf as betterproto_lib_google_protobuf
+import betterproto2
+
+from ...message_pool import default_message_pool
+
+_COMPILER_VERSION = "0.8.0"
+betterproto2.check_compiler_version(_COMPILER_VERSION)


 @dataclass(eq=False, repr=False)
-class SimpleExtensionUri(betterproto.Message):
-    extension_uri_anchor: int = betterproto.uint32_field(1)
+class AdvancedExtension(betterproto2.Message):
     """
-    A surrogate key used in the context of a single plan used to reference the
-    URI associated with an extension.
+    A generic object that can be used to embed additional extension information
+    into the serialized substrait plan.
     """

-    uri: str = betterproto.string_field(2)
+    optimization: "list[__google__protobuf__.Any]" = betterproto2.field(1, betterproto2.TYPE_MESSAGE, repeated=True)
     """
-    The URI where this extension YAML can be retrieved. This is the "namespace"
-    of this extension.
+    An optimization is helpful information that don't influence semantics. May
+    be ignored by a consumer.
+    """
+
+    enhancement: "__google__protobuf__.Any | None" = betterproto2.field(2, betterproto2.TYPE_MESSAGE, optional=True)
+    """
+    An enhancement alter semantics. Cannot be ignored by a consumer.
     """


+default_message_pool.register_message("substrait.extensions", "AdvancedExtension", AdvancedExtension)
+
+
 @dataclass(eq=False, repr=False)
-class SimpleExtensionDeclaration(betterproto.Message):
+class SimpleExtensionDeclaration(betterproto2.Message):
     """
     Describes a mapping between a specific extension entity and the uri where
-    that extension can be found.
+    that extension can be found.
+
+    Oneofs:
+        - mapping_type:
     """

-    extension_type: "SimpleExtensionDeclarationExtensionType" = (
-        betterproto.message_field(1, group="mapping_type")
+    extension_type: "SimpleExtensionDeclarationExtensionType | None" = betterproto2.field(
+        1, betterproto2.TYPE_MESSAGE, optional=True, group="mapping_type"
     )
-    extension_type_variation: "SimpleExtensionDeclarationExtensionTypeVariation" = (
-        betterproto.message_field(2, group="mapping_type")
+
+    extension_type_variation: "SimpleExtensionDeclarationExtensionTypeVariation | None" = betterproto2.field(
+        2, betterproto2.TYPE_MESSAGE, optional=True, group="mapping_type"
     )
-    extension_function: "SimpleExtensionDeclarationExtensionFunction" = (
-        betterproto.message_field(3, group="mapping_type")
+
+    extension_function: "SimpleExtensionDeclarationExtensionFunction | None" = betterproto2.field(
+        3, betterproto2.TYPE_MESSAGE, optional=True, group="mapping_type"
     )


-@dataclass(eq=False, repr=False)
-class SimpleExtensionDeclarationExtensionType(betterproto.Message):
-    """Describes a Type"""
+default_message_pool.register_message("substrait.extensions", "SimpleExtensionDeclaration", SimpleExtensionDeclaration)
+

-    extension_uri_reference: int = betterproto.uint32_field(1)
+@dataclass(eq=False, repr=False)
+class SimpleExtensionDeclarationExtensionFunction(betterproto2.Message):
+    extension_uri_reference: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
     """
     references the extension_uri_anchor defined for a specific extension URI.
     """

-    type_anchor: int = betterproto.uint32_field(2)
+    function_anchor: "int" = betterproto2.field(2, betterproto2.TYPE_UINT32)
     """
     A surrogate key used in the context of a single plan to reference a
-    specific extension type
+    specific function
+    """
+
+    name: "str" = betterproto2.field(3, betterproto2.TYPE_STRING)
     """
+    A function signature compound name
+    """
+

-    name: str = betterproto.string_field(3)
-    """the name of the type in the defined extension YAML."""
+default_message_pool.register_message(
+    "substrait.extensions", "SimpleExtensionDeclaration.ExtensionFunction", SimpleExtensionDeclarationExtensionFunction
+)


 @dataclass(eq=False, repr=False)
-class SimpleExtensionDeclarationExtensionTypeVariation(betterproto.Message):
-    extension_uri_reference: int = betterproto.uint32_field(1)
+class SimpleExtensionDeclarationExtensionType(betterproto2.Message):
+    """
+    Describes a Type
+    """
+
+    extension_uri_reference: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
     """
     references the extension_uri_anchor defined for a specific extension URI.
     """

-    type_variation_anchor: int = betterproto.uint32_field(2)
+    type_anchor: "int" = betterproto2.field(2, betterproto2.TYPE_UINT32)
     """
     A surrogate key used in the context of a single plan to reference a
-    specific type variation
+    specific extension type
+    """
+
+    name: "str" = betterproto2.field(3, betterproto2.TYPE_STRING)
+    """
+    the name of the type in the defined extension YAML.
     """

-    name: str = betterproto.string_field(3)
-    """the name of the type in the defined extension YAML."""
+
+default_message_pool.register_message(
+    "substrait.extensions", "SimpleExtensionDeclaration.ExtensionType", SimpleExtensionDeclarationExtensionType
+)


 @dataclass(eq=False, repr=False)
-class SimpleExtensionDeclarationExtensionFunction(betterproto.Message):
-    extension_uri_reference: int = betterproto.uint32_field(1)
+class SimpleExtensionDeclarationExtensionTypeVariation(betterproto2.Message):
+    extension_uri_reference: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
     """
     references the extension_uri_anchor defined for a specific extension URI.
     """

-    function_anchor: int = betterproto.uint32_field(2)
+    type_variation_anchor: "int" = betterproto2.field(2, betterproto2.TYPE_UINT32)
     """
     A surrogate key used in the context of a single plan to reference a
-    specific function
+    specific type variation
+    """
+
+    name: "str" = betterproto2.field(3, betterproto2.TYPE_STRING)
+    """
+    the name of the type in the defined extension YAML.
     """

-    name: str = betterproto.string_field(3)
-    """A function signature compound name"""
+
+default_message_pool.register_message(
+    "substrait.extensions",
+    "SimpleExtensionDeclaration.ExtensionTypeVariation",
+    SimpleExtensionDeclarationExtensionTypeVariation,
+)


 @dataclass(eq=False, repr=False)
-class AdvancedExtension(betterproto.Message):
+class SimpleExtensionUri(betterproto2.Message):
+    extension_uri_anchor: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
     """
-    A generic object that can be used to embed additional extension information
-    into the serialized substrait plan.
+    A surrogate key used in the context of a single plan used to reference the
+    URI associated with an extension.
     """

-    optimization: List[
-        "betterproto_lib_google_protobuf.Any"
-    ] = betterproto.message_field(1)
+    uri: "str" = betterproto2.field(2, betterproto2.TYPE_STRING)
     """
-    An optimization is helpful information that don't influence semantics. May
-    be ignored by a consumer.
+    The URI where this extension YAML can be retrieved. This is the "namespace"
+    of this extension.
     """

-    enhancement: "betterproto_lib_google_protobuf.Any" = betterproto.message_field(2)
-    """An enhancement alter semantics. Cannot be ignored by a consumer."""
+
+default_message_pool.register_message("substrait.extensions", "SimpleExtensionURI", SimpleExtensionUri)
+
+
+from ...google import protobuf as __google__protobuf__
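Note: this hunk moves the module from betterproto to betterproto2 and registers every message in a shared message pool (the Python class SimpleExtensionUri is registered under its proto name "SimpleExtensionURI"). As a minimal orientation sketch, assuming only the names visible in this hunk (the anchor values below are hypothetical): the generated messages are plain dataclasses, and the mapping_type oneof group means at most one of the three declaration fields should be set.

from spiral.protogen._.substrait.extensions import (
    SimpleExtensionDeclaration,
    SimpleExtensionDeclarationExtensionFunction,
    SimpleExtensionUri,
)

# Anchor an extension YAML, then declare a function from it (hypothetical values).
uri = SimpleExtensionUri(extension_uri_anchor=1, uri="https://example.com/functions.yaml")
decl = SimpleExtensionDeclaration(
    extension_function=SimpleExtensionDeclarationExtensionFunction(
        extension_uri_reference=1,  # points back at extension_uri_anchor above
        function_anchor=10,
        name="add:i64_i64",
    )
)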
spiral/{tables/scan.py → scan.py} RENAMED
@@ -2,40 +2,36 @@ from collections.abc import Iterator
 from typing import TYPE_CHECKING, Any

 import pyarrow as pa
-from datasets import DatasetInfo, Features

-from spiral.core.table import KeyRange, TableScan
+from spiral.core.table import KeyRange, ShuffleStrategy
+from spiral.core.table import Scan as CoreScan
 from spiral.core.table.spec import Schema
 from spiral.settings import CI, DEV

 if TYPE_CHECKING:
     import dask.dataframe as dd
+    import datasets.iterable_dataset as hf  # noqa
     import pandas as pd
     import polars as pl
-    from datasets import iterable_dataset
+    import streaming  # noqa
+    import torch.utils.data as torchdata  # noqa


 class Scan:
     """Scan object."""

-    def __init__(
-        self,
-        scan: TableScan,
-    ):
-        # NOTE(ngates): this API is a little weird. e.g. if the query doesn't define an asof, it is resolved
-        # when we wrap it into a core.Scan. Should we expose a Query object in the Python API that's reusable
-        # and will re-resolve the asof? Or should we just expose a scan that fixes the asof at construction time?
-        self._scan = scan
+    def __init__(self, core: CoreScan):
+        self.core = core

     @property
     def metrics(self) -> dict[str, Any]:
         """Returns metrics about the scan."""
-        return self._scan.metrics()
+        return self.core.metrics()

     @property
     def schema(self) -> Schema:
         """Returns the schema of the scan."""
-        return self._scan.schema()
+        return self.core.schema()

     def is_empty(self) -> bool:
         """Check if the Spiral is empty for the given key range.
@@ -43,7 +39,7 @@ class Scan:
         **IMPORTANT**: False negatives are possible, but false positives are not,
         i.e. is_empty can return False and scan can return zero rows.
         """
-        return self._scan.is_empty()
+        return self.core.is_empty()

     def to_record_batches(
         self,
@@ -69,7 +65,7 @@ class Scan:
         elif isinstance(key_table, pa.Table):
             key_table = key_table.to_reader(max_chunksize=batch_size)

-        return self._scan.to_record_batches(key_table=key_table, batch_readahead=batch_readahead)
+        return self.core.to_record_batches(key_table=key_table, batch_readahead=batch_readahead)

     def to_table(
         self,
@@ -83,7 +79,7 @@ class Scan:
         """
         # NOTE: Evaluates fully on Rust side which improved debuggability.
         if DEV and not CI and key_table is None:
-            rb = self._scan.to_record_batch()
+            rb = self.core.to_record_batch()
             return pa.Table.from_batches([rb])

         return self.to_record_batches(key_table=key_table).read_all()
@@ -97,11 +93,11 @@ class Scan:
         import pandas as pd

         def _read_key_range(key_range: KeyRange) -> pd.DataFrame:
-            # TODO(ngates): we need a way to preserve the existing asofs? Should we copy CoreScan instead of Query?
+            # TODO(ngates): we need a way to preserve the existing asofs?
            raise NotImplementedError()

         # Fetch a set of partition ranges
-        return dd.from_map(_read_key_range, self.split())
+        return dd.from_map(_read_key_range, self._splits())

     def to_pandas(self) -> "pd.DataFrame":
         """Read into a Pandas DataFrame.
@@ -117,34 +113,31 @@ class Scan:
         """
         import polars as pl

-        # TODO(marko): This should support lazy dataframe.
         return pl.from_arrow(self.to_record_batches())

-    def to_pytorch(
+    def to_iterable_dataset(
         self,
+        shuffle: ShuffleStrategy | None = None,
         batch_readahead: int | None = None,
-        shuffle_batch_size: int | None = None,
-        shuffle_pool_num_rows: int | None = None,
-    ) -> "iterable_dataset.IterableDataset":
-        """Returns an iterable dataset that can be used to build a PyTorch DataLoader.
+    ) -> "hf.IterableDataset":
+        """Returns an Huggingface's IterableDataset.
+
+        Requires `datasets` package to be installed.

         Args:
-            batch_readahead: Number of batches to prefetch in the background.
-            shuffle_batch_size: read granularity of number of rows for a shuffled scan. If left as
-                None along with shuffle_pool_num_rows=None, shuffling is disabled.
-            shuffle_pool_num_rows: Pool size for shuffling batches.
+            shuffle: Controls sample shuffling. If None, no shuffling is performed.
+            batch_readahead: Controls how many batches to read ahead concurrently.
+                If pipeline includes work after reading (e.g. decoding, transforming, ...) this can be set higher.
+                Otherwise, it should be kept low to reduce next batch latency. Defaults to 2.
         """
+        from datasets import DatasetInfo, Features
         from datasets.iterable_dataset import ArrowExamplesIterable, IterableDataset

         def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
-            if shuffle_batch_size is None and shuffle_pool_num_rows is None:
-                stream = self.to_record_batches(
-                    batch_readahead=batch_readahead,
-                )
-            else:
-                stream = self._scan.to_shuffled_record_batches(
-                    batch_readahead, shuffle_batch_size, shuffle_pool_num_rows
-                )
+            stream = self.core.to_shuffled_record_batches(
+                shuffle,
+                batch_readahead,
+            )

             # This key is unused when training with IterableDataset.
             # Default implementation returns shard id, e.g. parquet row group id.
@@ -166,28 +159,28 @@ class Scan:
             return pa.schema(new_fields)

         # NOTE: generate_tables_fn type annotations are wrong, return type must be an iterable of tuples.
-        ex_iterable = ArrowExamplesIterable(generate_tables_fn=_generate_tables, kwargs={})
+        ex_iterable = ArrowExamplesIterable(generate_tables_fn=_generate_tables, kwargs={})  # type: ignore
         info = DatasetInfo(features=Features.from_arrow_schema(_hf_compatible_schema(self.schema.to_arrow())))
         return IterableDataset(ex_iterable=ex_iterable, info=info)

-    def _split(self) -> list[KeyRange]:
+    def _splits(self) -> list[KeyRange]:
         # Splits the scan into a set of key ranges.
-        return self._scan.split()
+        return self.core.splits()

     def _debug(self):
         # Visualizes the scan, mainly for debugging purposes.
-        from spiral.tables.debug.scan import show_scan
+        from spiral.debug.scan import show_scan

-        show_scan(self._scan)
+        show_scan(self.core)

     def _dump_manifests(self):
         # Print manifests in a human-readable format.
-        from spiral.tables.debug.manifests import display_manifests
+        from spiral.debug.manifests import display_scan_manifests

-        display_manifests(self._scan)
+        display_scan_manifests(self._core)

     def _dump_metrics(self):
         # Print metrics in a human-readable format.
-        from spiral.tables.debug.metrics import display_metrics
+        from spiral.debug.metrics import display_metrics

         display_metrics(self.metrics)
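Note: to_pytorch is replaced by to_iterable_dataset, which returns a Hugging Face IterableDataset, and the two shuffle_* knobs collapse into a single ShuffleStrategy argument. A hedged usage sketch, assuming `scan` is a spiral.scan.Scan obtained elsewhere (how a ShuffleStrategy is constructed is not shown in this diff; recent `datasets` releases make IterableDataset directly consumable by a PyTorch DataLoader):

import torch.utils.data as torchdata

ds = scan.to_iterable_dataset(batch_readahead=2)  # shuffle=None: no shuffling
loader = torchdata.DataLoader(ds, batch_size=32)
for batch in loader:
    ...  # decode / transform / train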
spiral/settings.py CHANGED
@@ -1,7 +1,7 @@
 import functools
 import os
 from pathlib import Path
-from typing import Annotated
+from typing import TYPE_CHECKING, Annotated

 import typer
 from pydantic import Field, ValidatorFunctionWrapHandler, WrapValidator
@@ -12,8 +12,11 @@ from pydantic_settings import (
     SettingsConfigDict,
 )

-from spiral.api import SpiralAPI
-from spiral.core.client import Authn, DeviceCodeAuth, Token
+from spiral.core.authn import Authn, DeviceCodeAuth, Token
+from spiral.core.client import Spiral
+
+if TYPE_CHECKING:
+    from spiral.api import SpiralAPI

 DEV = "PYTEST_VERSION" in os.environ or bool(os.environ.get("SPIRAL_DEV", None))
 CI = "GITHUB_ACTIONS" in os.environ
@@ -74,6 +77,14 @@ class Settings(BaseSettings):

         return SpiralAPI(self.authn, base_url=self.spiraldb.uri)

+    @functools.cached_property
+    def core(self) -> Spiral:
+        return Spiral(
+            api_url=self.spiraldb.uri,
+            spfs_url=self.spfs.uri,
+            authn=self.authn,
+        )
+
     @functools.cached_property
     def authn(self):
         if self.spiraldb.token:
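Note: Settings now exposes a cached `core` handle to the Rust-backed Spiral client alongside the existing `api` property, with the SpiralAPI import deferred to type-checking time. A hedged sketch (assuming Settings is constructed from the environment, as a pydantic BaseSettings normally is):

from spiral.settings import Settings

settings = Settings()
client = settings.core           # builds Spiral(api_url=..., spfs_url=..., authn=...)
assert client is settings.core   # functools.cached_property: built once, then reused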
spiral/snapshot.py ADDED
@@ -0,0 +1,55 @@
+from typing import TYPE_CHECKING
+
+from spiral.core.table import Snapshot as CoreSnapshot
+from spiral.core.table.spec import Schema
+from spiral.types_ import Timestamp
+
+if TYPE_CHECKING:
+    import duckdb
+    import polars as pl
+    import pyarrow.dataset as ds
+
+    from spiral.table import Table
+
+
+class Snapshot:
+    """Spiral table snapshot.
+
+    A snapshot represents a point-in-time view of a table.
+    """
+
+    def __init__(self, table: "Table", core: CoreSnapshot):
+        self.core = core
+        self._table = table
+
+    @property
+    def asof(self) -> Timestamp:
+        """Returns the asof timestamp of the snapshot."""
+        return self.core.asof
+
+    def schema(self) -> Schema:
+        """Returns the schema of the snapshot."""
+        return self.core.table.get_schema(asof=self.asof)
+
+    @property
+    def table(self) -> "Table":
+        """Returns the table associated with the snapshot."""
+        return self._table
+
+    def to_dataset(self) -> "ds.Dataset":
+        """Returns a PyArrow Dataset representing the table."""
+        from spiral.dataset import Dataset
+
+        return Dataset(self)
+
+    def to_polars(self) -> "pl.LazyFrame":
+        """Returns a Polars LazyFrame for the Spiral table."""
+        import polars as pl
+
+        return pl.scan_pyarrow_dataset(self.to_dataset())
+
+    def to_duckdb(self) -> "duckdb.DuckDBPyRelation":
+        """Returns a DuckDB relation for the Spiral table."""
+        import duckdb
+
+        return duckdb.from_arrow(self.to_dataset())
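Note: a hedged usage sketch of the new Snapshot API. How a Snapshot is obtained is not shown in this hunk, so `table.snapshot()` below is a hypothetical accessor (spiral/table.py is also new in this release):

snap = table.snapshot()   # hypothetical accessor on spiral.table.Table
ts = snap.asof            # point-in-time timestamp of the view
lf = snap.to_polars()     # Polars LazyFrame via pl.scan_pyarrow_dataset
rel = snap.to_duckdb()    # DuckDB relation over the same PyArrow dataset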
spiral/streaming_/__init__.py ADDED
@@ -0,0 +1,3 @@
+from .stream import SpiralStream
+
+__all__ = ["SpiralStream"]
spiral/streaming_/reader.py ADDED
@@ -0,0 +1,117 @@
+import dataclasses
+import functools
+import os
+from typing import Any
+
+import vortex as vx
+
+from spiral.core.client import Shard
+
+
+# Fake streaming.base.format.base.reader.FileInfo
+# Dataset manages decompression instead of the Stream in MDS.
+# So we return our own fake FileInfo that has None for compressed file.
+@dataclasses.dataclass
+class FileInfo:
+    basename: str
+    hashes: dict[str, str] = dataclasses.field(default_factory=dict)
+
+    @property
+    def bytes(self):
+        raise NotImplementedError("FileInfo.bytes should NOT be called.")
+
+
+class SpiralReader:
+    """
+    An MDS (streaming) compatible Reader.
+    """
+
+    def __init__(self, shard: Shard, basepath):
+        self._shard = shard
+        self._cardinality = shard.cardinality
+        self._basepath = basepath
+        self._scan: vx.RepeatedScan | None = None
+
+    @property
+    def shard(self) -> Shard:
+        return self._shard
+
+    @property
+    def size(self):
+        """Get the number of samples in this shard.
+
+        Returns:
+            int: Sample count.
+        """
+        return self._cardinality
+
+    @property
+    def samples(self):
+        """Get the number of samples in this shard.
+
+        Returns:
+            int: Sample count.
+        """
+        return self._cardinality
+
+    def __len__(self) -> int:
+        """Get the number of samples in this shard.
+
+        Returns:
+            int: Sample count.
+        """
+        return self._cardinality
+
+    @property
+    def file_pairs(self) -> list[tuple[FileInfo, FileInfo | None]]:
+        """Get the infos from raw and compressed file.
+
+        MDS uses this because dataset manages decompression of the shards, not stream...
+        """
+        return [(FileInfo(basename=self.filename), None)]
+
+    @functools.cached_property
+    def filename(self) -> str:
+        """Used by SpiralStream to identify shard's file-on-disk, if it exists."""
+        # TODO(marko): This might be too long...
+        return (
+            bytes(self._shard.key_range.begin).hex()
+            + "_"
+            + bytes(self._shard.key_range.end).hex()
+            + "_"
+            + str(self._shard.cardinality)
+            + ".vortex"
+        )
+
+    @functools.cached_property
+    def filepath(self) -> str:
+        """Full path to the shard's file-on-disk, if it exists."""
+        return os.path.join(self._basepath, self.filename)
+
+    def evict(self) -> int:
+        """Remove all files belonging to this shard."""
+
+        # Clean up the scan handle first. This will make sure memory is freed.
+        self._scan = None
+
+        # Try to evict file.
+        try:
+            stat = os.stat(self.filepath)
+            os.remove(self.filepath)
+            return stat.st_size
+        except FileNotFoundError:
+            # Nothing to evict.
+            return 0
+
+    def __getitem__(self, item):
+        return self.get_item(item)
+
+    def get_item(self, idx: int) -> dict[str, Any]:
+        if self._scan is None:
+            # TODO(marko): vx.open should throw FileNotFoundError instead of
+            #  ValueError: No such file or directory (os error 2)
+            # Check if shard is ready on disk. This must throw FileNotFoundError.
+            if not os.path.exists(self.filepath):
+                raise FileNotFoundError(f"Shard not found: {self.filepath}")
+            self._scan = vx.open(self.filepath, without_segment_cache=True).to_repeated_scan()
+        return self._scan.scalar_at(idx).as_py()
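Note: the reader contract in one hedged sketch. Everything below uses only methods defined in the file above; the `shard` and `basepath` values are assumed to come from SpiralStream, which is also responsible for materializing the .vortex file on disk:

reader = SpiralReader(shard, basepath="/tmp/spiral_cache")  # shard: spiral.core.client.Shard
n = len(reader)         # shard cardinality, as reported to MDS
sample = reader[0]      # dict[str, Any]; lazily opens the .vortex file, raises FileNotFoundError if absent
freed = reader.evict()  # drops the scan handle, deletes the file, returns bytes freed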