pyspiral-0.5.0-cp310-abi3-macosx_11_0_arm64.whl → pyspiral-0.6.1-cp310-abi3-macosx_11_0_arm64.whl

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only.
Files changed (86)
  1. {pyspiral-0.5.0.dist-info → pyspiral-0.6.1.dist-info}/METADATA +7 -3
  2. pyspiral-0.6.1.dist-info/RECORD +99 -0
  3. {pyspiral-0.5.0.dist-info → pyspiral-0.6.1.dist-info}/WHEEL +1 -1
  4. spiral/__init__.py +11 -3
  5. spiral/_lib.abi3.so +0 -0
  6. spiral/adbc.py +6 -6
  7. spiral/api/__init__.py +8 -2
  8. spiral/api/client.py +1 -1
  9. spiral/api/key_space_indexes.py +23 -0
  10. spiral/api/projects.py +15 -0
  11. spiral/api/text_indexes.py +1 -1
  12. spiral/cli/__init__.py +15 -6
  13. spiral/cli/admin.py +2 -4
  14. spiral/cli/app.py +4 -2
  15. spiral/cli/fs.py +5 -6
  16. spiral/cli/iceberg.py +97 -0
  17. spiral/cli/key_spaces.py +89 -0
  18. spiral/cli/login.py +6 -7
  19. spiral/cli/orgs.py +7 -8
  20. spiral/cli/printer.py +3 -3
  21. spiral/cli/projects.py +5 -6
  22. spiral/cli/tables.py +131 -0
  23. spiral/cli/telemetry.py +3 -4
  24. spiral/cli/text.py +115 -0
  25. spiral/cli/types.py +3 -4
  26. spiral/cli/workloads.py +7 -8
  27. spiral/client.py +111 -8
  28. spiral/core/authn/__init__.pyi +27 -0
  29. spiral/core/client/__init__.pyi +152 -63
  30. spiral/core/table/__init__.pyi +17 -27
  31. spiral/core/table/metastore/__init__.pyi +0 -4
  32. spiral/core/table/spec/__init__.pyi +0 -2
  33. spiral/{tables/dataset.py → dataset.py} +13 -7
  34. spiral/{tables/debug → debug}/manifests.py +15 -6
  35. spiral/{tables/debug → debug}/scan.py +3 -3
  36. spiral/expressions/base.py +3 -3
  37. spiral/expressions/udf.py +1 -1
  38. spiral/{iceberg/client.py → iceberg.py} +1 -3
  39. spiral/key_space_index.py +44 -0
  40. spiral/project.py +171 -18
  41. spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +1668 -1110
  42. spiral/protogen/_/google/protobuf/__init__.py +2190 -0
  43. spiral/protogen/_/message_pool.py +3 -0
  44. spiral/protogen/_/py.typed +0 -0
  45. spiral/protogen/_/scandal/__init__.py +138 -126
  46. spiral/protogen/_/spfs/__init__.py +72 -0
  47. spiral/protogen/_/spql/__init__.py +61 -0
  48. spiral/protogen/_/substrait/__init__.py +5256 -2459
  49. spiral/protogen/_/substrait/extensions/__init__.py +103 -49
  50. spiral/{tables/scan.py → scan.py} +38 -44
  51. spiral/settings.py +14 -3
  52. spiral/snapshot.py +55 -0
  53. spiral/streaming_/__init__.py +3 -0
  54. spiral/streaming_/reader.py +131 -0
  55. spiral/streaming_/stream.py +146 -0
  56. spiral/substrait_.py +9 -9
  57. spiral/table.py +259 -0
  58. spiral/text_index.py +17 -0
  59. spiral/{tables/transaction.py → transaction.py} +11 -15
  60. pyspiral-0.5.0.dist-info/RECORD +0 -103
  61. spiral/cli/iceberg/__init__.py +0 -7
  62. spiral/cli/iceberg/namespaces.py +0 -47
  63. spiral/cli/iceberg/tables.py +0 -60
  64. spiral/cli/indexes/__init__.py +0 -40
  65. spiral/cli/indexes/args.py +0 -39
  66. spiral/cli/indexes/workers.py +0 -59
  67. spiral/cli/tables/__init__.py +0 -88
  68. spiral/cli/tables/args.py +0 -42
  69. spiral/core/index/__init__.pyi +0 -7
  70. spiral/iceberg/__init__.py +0 -3
  71. spiral/indexes/__init__.py +0 -5
  72. spiral/indexes/client.py +0 -137
  73. spiral/indexes/index.py +0 -28
  74. spiral/indexes/scan.py +0 -22
  75. spiral/protogen/_/spiral/table/__init__.py +0 -22
  76. spiral/protogen/substrait/__init__.py +0 -3399
  77. spiral/protogen/substrait/extensions/__init__.py +0 -115
  78. spiral/tables/__init__.py +0 -12
  79. spiral/tables/client.py +0 -133
  80. spiral/tables/maintenance.py +0 -12
  81. spiral/tables/snapshot.py +0 -78
  82. spiral/tables/table.py +0 -145
  83. {pyspiral-0.5.0.dist-info → pyspiral-0.6.1.dist-info}/entry_points.txt +0 -0
  84. /spiral/{protogen/_/spiral → debug}/__init__.py +0 -0
  85. /spiral/{tables/debug → debug}/metrics.py +0 -0
  86. /spiral/{tables/debug → protogen/_/google}/__init__.py +0 -0
spiral/protogen/_/substrait/extensions/__init__.py CHANGED
@@ -1,115 +1,169 @@
  # Generated by the protocol buffer compiler. DO NOT EDIT!
  # sources: substrait/extensions/extensions.proto
- # plugin: python-betterproto
+ # plugin: python-betterproto2
  # This file has been @generated

+ __all__ = (
+     "AdvancedExtension",
+     "SimpleExtensionDeclaration",
+     "SimpleExtensionDeclarationExtensionFunction",
+     "SimpleExtensionDeclarationExtensionType",
+     "SimpleExtensionDeclarationExtensionTypeVariation",
+     "SimpleExtensionUri",
+ )
+
  from dataclasses import dataclass
- from typing import List

- import betterproto
- import betterproto.lib.google.protobuf as betterproto_lib_google_protobuf
+ import betterproto2
+
+ from ...message_pool import default_message_pool
+
+ _COMPILER_VERSION = "0.8.0"
+ betterproto2.check_compiler_version(_COMPILER_VERSION)


  @dataclass(eq=False, repr=False)
- class SimpleExtensionUri(betterproto.Message):
-     extension_uri_anchor: int = betterproto.uint32_field(1)
+ class AdvancedExtension(betterproto2.Message):
      """
-     A surrogate key used in the context of a single plan used to reference the
-     URI associated with an extension.
+     A generic object that can be used to embed additional extension information
+     into the serialized substrait plan.
      """

-     uri: str = betterproto.string_field(2)
+     optimization: "list[__google__protobuf__.Any]" = betterproto2.field(1, betterproto2.TYPE_MESSAGE, repeated=True)
      """
-     The URI where this extension YAML can be retrieved. This is the "namespace"
-     of this extension.
+     An optimization is helpful information that don't influence semantics. May
+     be ignored by a consumer.
+     """
+
+     enhancement: "__google__protobuf__.Any | None" = betterproto2.field(2, betterproto2.TYPE_MESSAGE, optional=True)
+     """
+     An enhancement alter semantics. Cannot be ignored by a consumer.
      """


+ default_message_pool.register_message("substrait.extensions", "AdvancedExtension", AdvancedExtension)
+
+
  @dataclass(eq=False, repr=False)
- class SimpleExtensionDeclaration(betterproto.Message):
+ class SimpleExtensionDeclaration(betterproto2.Message):
      """
      Describes a mapping between a specific extension entity and the uri where
-     that extension can be found.
+     that extension can be found.
+
+     Oneofs:
+         - mapping_type:
      """

-     extension_type: "SimpleExtensionDeclarationExtensionType" = (
-         betterproto.message_field(1, group="mapping_type")
+     extension_type: "SimpleExtensionDeclarationExtensionType | None" = betterproto2.field(
+         1, betterproto2.TYPE_MESSAGE, optional=True, group="mapping_type"
      )
-     extension_type_variation: "SimpleExtensionDeclarationExtensionTypeVariation" = (
-         betterproto.message_field(2, group="mapping_type")
+
+     extension_type_variation: "SimpleExtensionDeclarationExtensionTypeVariation | None" = betterproto2.field(
+         2, betterproto2.TYPE_MESSAGE, optional=True, group="mapping_type"
      )
-     extension_function: "SimpleExtensionDeclarationExtensionFunction" = (
-         betterproto.message_field(3, group="mapping_type")
+
+     extension_function: "SimpleExtensionDeclarationExtensionFunction | None" = betterproto2.field(
+         3, betterproto2.TYPE_MESSAGE, optional=True, group="mapping_type"
      )


- @dataclass(eq=False, repr=False)
- class SimpleExtensionDeclarationExtensionType(betterproto.Message):
-     """Describes a Type"""
+ default_message_pool.register_message("substrait.extensions", "SimpleExtensionDeclaration", SimpleExtensionDeclaration)
+

-     extension_uri_reference: int = betterproto.uint32_field(1)
+ @dataclass(eq=False, repr=False)
+ class SimpleExtensionDeclarationExtensionFunction(betterproto2.Message):
+     extension_uri_reference: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
      """
      references the extension_uri_anchor defined for a specific extension URI.
      """

-     type_anchor: int = betterproto.uint32_field(2)
+     function_anchor: "int" = betterproto2.field(2, betterproto2.TYPE_UINT32)
      """
      A surrogate key used in the context of a single plan to reference a
-     specific extension type
+     specific function
+     """
+
+     name: "str" = betterproto2.field(3, betterproto2.TYPE_STRING)
      """
+     A function signature compound name
+     """
+

-     name: str = betterproto.string_field(3)
-     """the name of the type in the defined extension YAML."""
+ default_message_pool.register_message(
+     "substrait.extensions", "SimpleExtensionDeclaration.ExtensionFunction", SimpleExtensionDeclarationExtensionFunction
+ )


  @dataclass(eq=False, repr=False)
- class SimpleExtensionDeclarationExtensionTypeVariation(betterproto.Message):
-     extension_uri_reference: int = betterproto.uint32_field(1)
+ class SimpleExtensionDeclarationExtensionType(betterproto2.Message):
+     """
+     Describes a Type
+     """
+
+     extension_uri_reference: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
      """
      references the extension_uri_anchor defined for a specific extension URI.
      """

-     type_variation_anchor: int = betterproto.uint32_field(2)
+     type_anchor: "int" = betterproto2.field(2, betterproto2.TYPE_UINT32)
      """
      A surrogate key used in the context of a single plan to reference a
-     specific type variation
+     specific extension type
+     """
+
+     name: "str" = betterproto2.field(3, betterproto2.TYPE_STRING)
+     """
+     the name of the type in the defined extension YAML.
      """

-     name: str = betterproto.string_field(3)
-     """the name of the type in the defined extension YAML."""
+
+ default_message_pool.register_message(
+     "substrait.extensions", "SimpleExtensionDeclaration.ExtensionType", SimpleExtensionDeclarationExtensionType
+ )


  @dataclass(eq=False, repr=False)
- class SimpleExtensionDeclarationExtensionFunction(betterproto.Message):
-     extension_uri_reference: int = betterproto.uint32_field(1)
+ class SimpleExtensionDeclarationExtensionTypeVariation(betterproto2.Message):
+     extension_uri_reference: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
      """
      references the extension_uri_anchor defined for a specific extension URI.
      """

-     function_anchor: int = betterproto.uint32_field(2)
+     type_variation_anchor: "int" = betterproto2.field(2, betterproto2.TYPE_UINT32)
      """
      A surrogate key used in the context of a single plan to reference a
-     specific function
+     specific type variation
+     """
+
+     name: "str" = betterproto2.field(3, betterproto2.TYPE_STRING)
+     """
+     the name of the type in the defined extension YAML.
      """

-     name: str = betterproto.string_field(3)
-     """A function signature compound name"""
+
+ default_message_pool.register_message(
+     "substrait.extensions",
+     "SimpleExtensionDeclaration.ExtensionTypeVariation",
+     SimpleExtensionDeclarationExtensionTypeVariation,
+ )


  @dataclass(eq=False, repr=False)
- class AdvancedExtension(betterproto.Message):
+ class SimpleExtensionUri(betterproto2.Message):
+     extension_uri_anchor: "int" = betterproto2.field(1, betterproto2.TYPE_UINT32)
      """
-     A generic object that can be used to embed additional extension information
-     into the serialized substrait plan.
+     A surrogate key used in the context of a single plan used to reference the
+     URI associated with an extension.
      """

-     optimization: List[
-         "betterproto_lib_google_protobuf.Any"
-     ] = betterproto.message_field(1)
+     uri: "str" = betterproto2.field(2, betterproto2.TYPE_STRING)
      """
-     An optimization is helpful information that don't influence semantics. May
-     be ignored by a consumer.
+     The URI where this extension YAML can be retrieved. This is the "namespace"
+     of this extension.
      """

-     enhancement: "betterproto_lib_google_protobuf.Any" = betterproto.message_field(2)
-     """An enhancement alter semantics. Cannot be ignored by a consumer."""
+
+ default_message_pool.register_message("substrait.extensions", "SimpleExtensionURI", SimpleExtensionUri)
+
+
+ from ...google import protobuf as __google__protobuf__
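The regenerated module swaps betterproto for betterproto2: fields carry explicit wire-type arguments, oneof members become `| None`, and every message self-registers in a shared message pool. A minimal sketch of using the regenerated dataclasses, assuming the pyspiral wheel is installed; the URI and function name are illustrative values, and `bytes(msg)` serialization is the betterproto-family convention, assumed unchanged in betterproto2:

    # Sketch only: exercises the regenerated betterproto2 messages shown above.
    from spiral.protogen._.substrait.extensions import (
        SimpleExtensionDeclaration,
        SimpleExtensionDeclarationExtensionFunction,
        SimpleExtensionUri,
    )

    # Anchor an extension YAML, then declare a function from it.
    uri = SimpleExtensionUri(extension_uri_anchor=1, uri="https://example.com/functions.yaml")
    decl = SimpleExtensionDeclaration(
        extension_function=SimpleExtensionDeclarationExtensionFunction(
            extension_uri_reference=1,
            function_anchor=10,
            name="add:i64_i64",
        )
    )
    payload = bytes(decl)  # betterproto-style wire serialization (assumed)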
spiral/{tables/scan.py → scan.py} RENAMED
@@ -2,40 +2,37 @@ from collections.abc import Iterator
  from typing import TYPE_CHECKING, Any

  import pyarrow as pa
- from datasets import DatasetInfo, Features

- from spiral.core.table import KeyRange, TableScan
+ from spiral.core.client import ShuffleStrategy
+ from spiral.core.table import KeyRange
+ from spiral.core.table import Scan as CoreScan
  from spiral.core.table.spec import Schema
  from spiral.settings import CI, DEV

  if TYPE_CHECKING:
      import dask.dataframe as dd
+     import datasets.iterable_dataset as hf  # noqa
      import pandas as pd
      import polars as pl
-     from datasets import iterable_dataset
+     import streaming  # noqa
+     import torch.utils.data as torchdata  # noqa


  class Scan:
      """Scan object."""

-     def __init__(
-         self,
-         scan: TableScan,
-     ):
-         # NOTE(ngates): this API is a little weird. e.g. if the query doesn't define an asof, it is resolved
-         # when we wrap it into a core.Scan. Should we expose a Query object in the Python API that's reusable
-         # and will re-resolve the asof? Or should we just expose a scan that fixes the asof at construction time?
-         self._scan = scan
+     def __init__(self, core: CoreScan):
+         self.core = core

      @property
      def metrics(self) -> dict[str, Any]:
          """Returns metrics about the scan."""
-         return self._scan.metrics()
+         return self.core.metrics()

      @property
      def schema(self) -> Schema:
          """Returns the schema of the scan."""
-         return self._scan.schema()
+         return self.core.schema()

      def is_empty(self) -> bool:
          """Check if the Spiral is empty for the given key range.
@@ -43,7 +40,7 @@ class Scan:
          **IMPORTANT**: False negatives are possible, but false positives are not,
          i.e. is_empty can return False and scan can return zero rows.
          """
-         return self._scan.is_empty()
+         return self.core.is_empty()

      def to_record_batches(
          self,
@@ -69,7 +66,7 @@ class Scan:
          elif isinstance(key_table, pa.Table):
              key_table = key_table.to_reader(max_chunksize=batch_size)

-         return self._scan.to_record_batches(key_table=key_table, batch_readahead=batch_readahead)
+         return self.core.to_record_batches(key_table=key_table, batch_readahead=batch_readahead)

      def to_table(
          self,
@@ -83,7 +80,7 @@ class Scan:
          """
          # NOTE: Evaluates fully on Rust side which improved debuggability.
          if DEV and not CI and key_table is None:
-             rb = self._scan.to_record_batch()
+             rb = self.core.to_record_batch()
              return pa.Table.from_batches([rb])

          return self.to_record_batches(key_table=key_table).read_all()
@@ -97,11 +94,11 @@ class Scan:
          import pandas as pd

          def _read_key_range(key_range: KeyRange) -> pd.DataFrame:
-             # TODO(ngates): we need a way to preserve the existing asofs? Should we copy CoreScan instead of Query?
+             # TODO(ngates): we need a way to preserve the existing asofs?
              raise NotImplementedError()

          # Fetch a set of partition ranges
-         return dd.from_map(_read_key_range, self.split())
+         return dd.from_map(_read_key_range, self._splits())

      def to_pandas(self) -> "pd.DataFrame":
          """Read into a Pandas DataFrame.
@@ -117,34 +114,31 @@ class Scan:
          """
          import polars as pl

-         # TODO(marko): This should support lazy dataframe.
          return pl.from_arrow(self.to_record_batches())

-     def to_pytorch(
+     def to_iterable_dataset(
          self,
+         shuffle: ShuffleStrategy | None = None,
          batch_readahead: int | None = None,
-         shuffle_batch_size: int | None = None,
-         shuffle_pool_num_rows: int | None = None,
-     ) -> "iterable_dataset.IterableDataset":
-         """Returns an iterable dataset that can be used to build a PyTorch DataLoader.
+     ) -> "hf.IterableDataset":
+         """Returns a Hugging Face IterableDataset.
+
+         Requires the `datasets` package to be installed.

          Args:
-             batch_readahead: Number of batches to prefetch in the background.
-             shuffle_batch_size: read granularity of number of rows for a shuffled scan. If left as
-                 None along with shuffle_pool_num_rows=None, shuffling is disabled.
-             shuffle_pool_num_rows: Pool size for shuffling batches.
+             shuffle: Controls sample shuffling. If None, no shuffling is performed.
+             batch_readahead: Controls how many batches to read ahead concurrently.
+                 If the pipeline includes work after reading (e.g. decoding, transforming, ...), this can be set higher.
+                 Otherwise, it should be kept low to reduce next-batch latency. Defaults to 2.
          """
+         from datasets import DatasetInfo, Features
          from datasets.iterable_dataset import ArrowExamplesIterable, IterableDataset

          def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
-             if shuffle_batch_size is None and shuffle_pool_num_rows is None:
-                 stream = self.to_record_batches(
-                     batch_readahead=batch_readahead,
-                 )
-             else:
-                 stream = self._scan.to_shuffled_record_batches(
-                     batch_readahead, shuffle_batch_size, shuffle_pool_num_rows
-                 )
+             stream = self.core.to_shuffled_record_batches(
+                 shuffle,
+                 batch_readahead,
+             )

              # This key is unused when training with IterableDataset.
              # Default implementation returns shard id, e.g. parquet row group id.
@@ -166,28 +160,28 @@ class Scan:
              return pa.schema(new_fields)

          # NOTE: generate_tables_fn type annotations are wrong, return type must be an iterable of tuples.
-         ex_iterable = ArrowExamplesIterable(generate_tables_fn=_generate_tables, kwargs={})
+         ex_iterable = ArrowExamplesIterable(generate_tables_fn=_generate_tables, kwargs={})  # type: ignore
          info = DatasetInfo(features=Features.from_arrow_schema(_hf_compatible_schema(self.schema.to_arrow())))
          return IterableDataset(ex_iterable=ex_iterable, info=info)

-     def _split(self) -> list[KeyRange]:
+     def _splits(self) -> list[KeyRange]:
          # Splits the scan into a set of key ranges.
-         return self._scan.split()
+         return self.core.splits()

      def _debug(self):
          # Visualizes the scan, mainly for debugging purposes.
-         from spiral.tables.debug.scan import show_scan
+         from spiral.debug.scan import show_scan

-         show_scan(self._scan)
+         show_scan(self.core)

      def _dump_manifests(self):
          # Print manifests in a human-readable format.
-         from spiral.tables.debug.manifests import display_manifests
+         from spiral.debug.manifests import display_scan_manifests

-         display_manifests(self._scan)
+         display_scan_manifests(self.core)

      def _dump_metrics(self):
          # Print metrics in a human-readable format.
-         from spiral.tables.debug.metrics import display_metrics
+         from spiral.debug.metrics import display_metrics

          display_metrics(self.metrics)
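The rename from to_pytorch to to_iterable_dataset reflects what the method actually returns: a Hugging Face IterableDataset, which a PyTorch DataLoader can consume directly. A sketch of the new call, assuming `scan` is a spiral.scan.Scan obtained elsewhere (scan construction is not part of this hunk):

    # Sketch only: `scan` is an assumed spiral.scan.Scan instance.
    from torch.utils.data import DataLoader

    ds = scan.to_iterable_dataset(batch_readahead=2)  # hf.IterableDataset; replaces to_pytorch()
    loader = DataLoader(ds, batch_size=32)            # hf IterableDataset is torch-compatible
    for batch in loader:
        ...  # decode / transform / train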
spiral/settings.py CHANGED
@@ -1,7 +1,7 @@
  import functools
  import os
  from pathlib import Path
- from typing import Annotated
+ from typing import TYPE_CHECKING, Annotated

  import typer
  from pydantic import Field, ValidatorFunctionWrapHandler, WrapValidator
@@ -12,8 +12,11 @@ from pydantic_settings import (
      SettingsConfigDict,
  )

- from spiral.api import SpiralAPI
- from spiral.core.client import Authn, DeviceCodeAuth, Token
+ from spiral.core.authn import Authn, DeviceCodeAuth, Token
+ from spiral.core.client import Spiral
+
+ if TYPE_CHECKING:
+     from spiral.api import SpiralAPI

  DEV = "PYTEST_VERSION" in os.environ or bool(os.environ.get("SPIRAL_DEV", None))
  CI = "GITHUB_ACTIONS" in os.environ
@@ -74,6 +77,14 @@ class Settings(BaseSettings):

          return SpiralAPI(self.authn, base_url=self.spiraldb.uri)

+     @functools.cached_property
+     def core(self) -> Spiral:
+         return Spiral(
+             api_url=self.spiraldb.uri,
+             spfs_url=self.spfs.uri,
+             authn=self.authn,
+         )
+
      @functools.cached_property
      def authn(self):
          if self.spiraldb.token:
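The new `core` property mirrors the existing `api` property: both are `functools.cached_property`, so the underlying client is constructed once per Settings instance. A sketch, assuming Settings is instantiated from its usual pydantic-settings sources:

    # Sketch only: field values come from env vars / config via pydantic-settings.
    from spiral.settings import Settings

    settings = Settings()
    sp = settings.core           # builds spiral.core.client.Spiral on first access
    assert settings.core is sp   # cached_property: later accesses reuse the instance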
spiral/snapshot.py ADDED
@@ -0,0 +1,55 @@
+ from typing import TYPE_CHECKING
+
+ from spiral.core.table import Snapshot as CoreSnapshot
+ from spiral.core.table.spec import Schema
+ from spiral.types_ import Timestamp
+
+ if TYPE_CHECKING:
+     import duckdb
+     import polars as pl
+     import pyarrow.dataset as ds
+
+     from spiral.table import Table
+
+
+ class Snapshot:
+     """Spiral table snapshot.
+
+     A snapshot represents a point-in-time view of a table.
+     """
+
+     def __init__(self, table: "Table", core: CoreSnapshot):
+         self.core = core
+         self._table = table
+
+     @property
+     def asof(self) -> Timestamp:
+         """Returns the asof timestamp of the snapshot."""
+         return self.core.asof
+
+     def schema(self) -> Schema:
+         """Returns the schema of the snapshot."""
+         return self.core.table.get_schema(asof=self.asof)
+
+     @property
+     def table(self) -> "Table":
+         """Returns the table associated with the snapshot."""
+         return self._table
+
+     def to_dataset(self) -> "ds.Dataset":
+         """Returns a PyArrow Dataset representing the table."""
+         from spiral.dataset import Dataset
+
+         return Dataset(self)
+
+     def to_polars(self) -> "pl.LazyFrame":
+         """Returns a Polars LazyFrame for the Spiral table."""
+         import polars as pl
+
+         return pl.scan_pyarrow_dataset(self.to_dataset())
+
+     def to_duckdb(self) -> "duckdb.DuckDBPyRelation":
+         """Returns a DuckDB relation for the Spiral table."""
+         import duckdb
+
+         return duckdb.from_arrow(self.to_dataset())
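Everything on Snapshot funnels through to_dataset(): Polars and DuckDB both wrap the same PyArrow Dataset. A usage sketch, assuming `snapshot` comes from the new spiral.table.Table API (the Table side is not expanded in this view):

    # Sketch only: `snapshot` is an assumed spiral.snapshot.Snapshot instance.
    print(snapshot.asof)          # point-in-time timestamp of the view
    lf = snapshot.to_polars()     # polars.LazyFrame via pl.scan_pyarrow_dataset
    rel = snapshot.to_duckdb()    # duckdb relation via duckdb.from_arrow
    df = lf.head(10).collect()    # queries evaluate against the same snapshot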
spiral/streaming_/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .stream import SpiralStream
+
+ __all__ = ["SpiralStream"]
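SpiralStream is the package's MosaicML `streaming` (MDS) integration. A hypothetical wiring, assuming SpiralStream can be built from a scan; its actual constructor lives in stream.py, which this view does not expand:

    # Sketch only: the SpiralStream constructor arguments are an assumption.
    from streaming import StreamingDataset
    from spiral.streaming_ import SpiralStream

    stream = SpiralStream(scan, local="/tmp/spiral-cache")  # hypothetical signature
    dataset = StreamingDataset(streams=[stream], batch_size=32)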
spiral/streaming_/reader.py ADDED
@@ -0,0 +1,131 @@
+ import dataclasses
+ import functools
+ import os
+ from typing import Any
+
+ import vortex as vx
+
+ from spiral.core.client import Shard
+
+
+ # Fake streaming.base.format.base.reader.FileInfo.
+ # The dataset manages decompression instead of the Stream in MDS,
+ # so we return our own fake FileInfo that has None for the compressed file.
+ @dataclasses.dataclass
+ class FileInfo:
+     basename: str
+     hashes: dict[str, str] = dataclasses.field(default_factory=dict)
+
+     @property
+     def bytes(self):
+         raise NotImplementedError("FileInfo.bytes should NOT be called.")
+
+
+ class SpiralReader:
+     """
+     An MDS (streaming) compatible Reader.
+     """
+
+     def __init__(self, shard: Shard, basepath):
+         self._shard = shard
+         self._cardinality = shard.cardinality
+         self._basepath = basepath
+         self._scan: vx.RepeatedScan | None = None
+
+     @property
+     def shard(self) -> Shard:
+         return self._shard
+
+     @property
+     def size(self):
+         """Get the number of samples in this shard.
+
+         Returns:
+             int: Sample count.
+         """
+         return self._cardinality
+
+     @property
+     def samples(self):
+         """Get the number of samples in this shard.
+
+         Returns:
+             int: Sample count.
+         """
+         return self._cardinality
+
+     def __len__(self) -> int:
+         """Get the number of samples in this shard.
+
+         Returns:
+             int: Sample count.
+         """
+         return self._cardinality
+
+     @property
+     def file_pairs(self) -> list[tuple[FileInfo, FileInfo | None]]:
+         """Get the infos for the raw and compressed files.
+
+         MDS uses this because the dataset manages decompression of the shards, not the stream.
+         """
+         return [(FileInfo(basename=self.filename), None)]
+
+     def get_max_size(self) -> int:
+         """Get the full size of this shard.
+
+         "Max" in this case means both the raw (decompressed) and zip (compressed) versions are
+         resident (assuming it has a zip form). This is the maximum disk usage the shard can reach.
+         When compression was used, even if keep_zip is ``False``, the zip form must still be
+         resident at the same time as the raw form during shard decompression.
+
+         Returns:
+             int: Size in bytes.
+         """
+         # TODO(marko): This is used to check that the cache limit is attainable...
+         return 0
+
+     @functools.cached_property
+     def filename(self) -> str:
+         """Used by SpiralStream to identify the shard's file-on-disk, if it exists."""
+         # TODO(marko): This might be too long...
+         return (
+             bytes(self._shard.key_range.begin).hex()
+             + "_"
+             + bytes(self._shard.key_range.end).hex()
+             + "_"
+             + str(self._shard.cardinality)
+             + ".vortex"
+         )
+
+     @functools.cached_property
+     def filepath(self) -> str:
+         """Full path to the shard's file-on-disk, if it exists."""
+         return os.path.join(self._basepath, self.filename)
+
+     def evict(self) -> int:
+         """Remove all files belonging to this shard."""
+
+         # Clean up the scan handle first. This will make sure memory is freed.
+         self._scan = None
+
+         # Try to evict the file.
+         try:
+             stat = os.stat(self.filepath)
+             os.remove(self.filepath)
+             return stat.st_size
+         except FileNotFoundError:
+             # Nothing to evict.
+             return 0
+
+     def __getitem__(self, item):
+         return self.get_item(item)
+
+     def get_item(self, idx: int) -> dict[str, Any]:
+         if self._scan is None:
+             # TODO(marko): vx.open should throw FileNotFoundError instead of
+             #   ValueError: No such file or directory (os error 2).
+             # Check if the shard is ready on disk. This must throw FileNotFoundError.
+             if not os.path.exists(self.filepath):
+                 raise FileNotFoundError(f"Shard not found: {self.filepath}")
+             self._scan = vx.open(self.filepath, without_segment_cache=True).to_repeated_scan()
+         return self._scan.scalar_at(idx).as_py()
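The reader keeps each shard resident as a single .vortex file and only opens a vortex scan handle on first access; eviction drops the handle before unlinking the file so memory is freed and MDS's cache accounting stays correct. A lifecycle sketch, assuming `shard` is a spiral.core.client.Shard delivered by SpiralStream (which normally constructs readers itself):

    # Sketch only: `shard` and the cache path are assumptions for illustration.
    reader = SpiralReader(shard, basepath="/tmp/spiral-cache")

    len(reader)              # sample count, from shard.cardinality
    sample = reader[0]       # lazily opens <basepath>/<filename> via vortex;
                             # raises FileNotFoundError if the shard is not on disk yet
    freed = reader.evict()   # drops the scan handle, removes the file, returns bytes freed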