pyspiral 0.6.6__cp312-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyspiral-0.6.6.dist-info/METADATA +51 -0
- pyspiral-0.6.6.dist-info/RECORD +102 -0
- pyspiral-0.6.6.dist-info/WHEEL +4 -0
- pyspiral-0.6.6.dist-info/entry_points.txt +2 -0
- spiral/__init__.py +35 -0
- spiral/_lib.abi3.so +0 -0
- spiral/adbc.py +411 -0
- spiral/api/__init__.py +78 -0
- spiral/api/admin.py +15 -0
- spiral/api/client.py +164 -0
- spiral/api/filesystems.py +134 -0
- spiral/api/key_space_indexes.py +23 -0
- spiral/api/organizations.py +77 -0
- spiral/api/projects.py +219 -0
- spiral/api/telemetry.py +19 -0
- spiral/api/text_indexes.py +56 -0
- spiral/api/types.py +22 -0
- spiral/api/workers.py +40 -0
- spiral/api/workloads.py +52 -0
- spiral/arrow_.py +216 -0
- spiral/cli/__init__.py +88 -0
- spiral/cli/__main__.py +4 -0
- spiral/cli/admin.py +14 -0
- spiral/cli/app.py +104 -0
- spiral/cli/console.py +95 -0
- spiral/cli/fs.py +76 -0
- spiral/cli/iceberg.py +97 -0
- spiral/cli/key_spaces.py +89 -0
- spiral/cli/login.py +24 -0
- spiral/cli/orgs.py +89 -0
- spiral/cli/printer.py +53 -0
- spiral/cli/projects.py +147 -0
- spiral/cli/state.py +5 -0
- spiral/cli/tables.py +174 -0
- spiral/cli/telemetry.py +17 -0
- spiral/cli/text.py +115 -0
- spiral/cli/types.py +50 -0
- spiral/cli/workloads.py +58 -0
- spiral/client.py +178 -0
- spiral/core/__init__.pyi +0 -0
- spiral/core/_tools/__init__.pyi +5 -0
- spiral/core/authn/__init__.pyi +27 -0
- spiral/core/client/__init__.pyi +237 -0
- spiral/core/table/__init__.pyi +101 -0
- spiral/core/table/manifests/__init__.pyi +35 -0
- spiral/core/table/metastore/__init__.pyi +58 -0
- spiral/core/table/spec/__init__.pyi +213 -0
- spiral/dataloader.py +285 -0
- spiral/dataset.py +255 -0
- spiral/datetime_.py +27 -0
- spiral/debug/__init__.py +0 -0
- spiral/debug/manifests.py +87 -0
- spiral/debug/metrics.py +56 -0
- spiral/debug/scan.py +266 -0
- spiral/expressions/__init__.py +276 -0
- spiral/expressions/base.py +157 -0
- spiral/expressions/http.py +86 -0
- spiral/expressions/io.py +100 -0
- spiral/expressions/list_.py +68 -0
- spiral/expressions/mp4.py +62 -0
- spiral/expressions/png.py +18 -0
- spiral/expressions/qoi.py +18 -0
- spiral/expressions/refs.py +58 -0
- spiral/expressions/str_.py +39 -0
- spiral/expressions/struct.py +59 -0
- spiral/expressions/text.py +62 -0
- spiral/expressions/tiff.py +223 -0
- spiral/expressions/udf.py +46 -0
- spiral/grpc_.py +32 -0
- spiral/iceberg.py +31 -0
- spiral/iterable_dataset.py +106 -0
- spiral/key_space_index.py +44 -0
- spiral/project.py +199 -0
- spiral/protogen/_/__init__.py +0 -0
- spiral/protogen/_/arrow/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/protocol/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +2548 -0
- spiral/protogen/_/google/__init__.py +0 -0
- spiral/protogen/_/google/protobuf/__init__.py +2310 -0
- spiral/protogen/_/message_pool.py +3 -0
- spiral/protogen/_/py.typed +0 -0
- spiral/protogen/_/scandal/__init__.py +190 -0
- spiral/protogen/_/spfs/__init__.py +72 -0
- spiral/protogen/_/spql/__init__.py +61 -0
- spiral/protogen/_/substrait/__init__.py +6196 -0
- spiral/protogen/_/substrait/extensions/__init__.py +169 -0
- spiral/protogen/__init__.py +0 -0
- spiral/protogen/util.py +41 -0
- spiral/py.typed +0 -0
- spiral/scan.py +285 -0
- spiral/server.py +17 -0
- spiral/settings.py +114 -0
- spiral/snapshot.py +56 -0
- spiral/streaming_/__init__.py +3 -0
- spiral/streaming_/reader.py +133 -0
- spiral/streaming_/stream.py +157 -0
- spiral/substrait_.py +274 -0
- spiral/table.py +293 -0
- spiral/text_index.py +17 -0
- spiral/transaction.py +58 -0
- spiral/types_.py +6 -0
@@ -0,0 +1,106 @@
|
|
1
|
+
from collections.abc import Callable, Iterator
|
2
|
+
from typing import TYPE_CHECKING
|
3
|
+
|
4
|
+
import pyarrow as pa
|
5
|
+
|
6
|
+
if TYPE_CHECKING:
|
7
|
+
import datasets.iterable_dataset as hf # noqa
|
8
|
+
import streaming # noqa
|
9
|
+
import torch.utils.data as torchdata # noqa
|
10
|
+
|
11
|
+
|
12
|
+
def _hf_compatible_schema(schema: pa.Schema) -> pa.Schema:
    """
    Build a copy of `schema` in which string-view / binary-view types become plain string / binary.

    The substitution is applied recursively through struct, list, large list, fixed-size list and
    map types; every other type is returned untouched. Field names, nullability and field metadata
    of schema fields and struct members are carried over.

    The converted schema is what gets turned into Features for the returned Dataset.
    Remove this method once we have https://github.com/huggingface/datasets/pull/7718
    """

    def _without_views(dtype: pa.DataType) -> pa.DataType:
        # Leaf cases: the two view types are swapped for their non-view counterparts.
        if dtype == pa.string_view():
            return pa.string()
        if dtype == pa.binary_view():
            return pa.binary()

        # Nested cases: rebuild the container around the converted child type(s).
        kinds = pa.types
        if kinds.is_struct(dtype):
            return pa.struct([_rebuild(member) for member in dtype])
        if kinds.is_list(dtype):
            return pa.list_(_without_views(dtype.value_type))
        if kinds.is_large_list(dtype):
            return pa.large_list(_without_views(dtype.value_type))
        if kinds.is_fixed_size_list(dtype):
            # Passing the list size keeps the result a fixed-size list.
            return pa.list_(_without_views(dtype.value_type), dtype.list_size)
        if kinds.is_map(dtype):
            return pa.map_(_without_views(dtype.key_type), _without_views(dtype.item_type))

        # Anything else is passed through unchanged.
        return dtype

    def _rebuild(field: pa.Field) -> pa.Field:
        # Same name / nullability / metadata, converted type.
        return pa.field(field.name, _without_views(field.type), nullable=field.nullable, metadata=field.metadata)

    return pa.schema([_rebuild(column) for column in schema])
|
48
|
+
|
49
|
+
|
50
|
+
def to_iterable_dataset(stream: pa.RecordBatchReader) -> "hf.IterableDataset":
    """
    Wrap a record batch reader in a Hugging Face `IterableDataset`.

    Each record batch read from `stream` is exposed as a single-batch `pa.Table`. The dataset's
    features are derived from the stream schema after view types have been replaced via
    `_hf_compatible_schema`.
    """
    from datasets import DatasetInfo, Features
    from datasets.builder import ArrowExamplesIterable
    from datasets.iterable_dataset import IterableDataset

    def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
        # The yielded key is unused when training with IterableDataset.
        # The default implementation yields a shard id, e.g. a parquet row group id;
        # here it is simply the batch's position in the stream.
        for position, batch in enumerate(stream):
            yield position, pa.Table.from_batches([batch], stream.schema)

    class _ArrowExamplesIterable(ArrowExamplesIterable):
        def __init__(self, generate_tables_fn: Callable[..., Iterator[tuple[int, pa.Table]]], features: Features):
            # NOTE: generate_tables_fn type annotations are wrong, return type must be an iterable of tuples.
            super().__init__(generate_tables_fn, kwargs={})  # type: ignore
            self._features = features

        @property
        def is_typed(self) -> bool:
            return True

        @property
        def features(self) -> Features:
            return self._features

    # TODO(marko): This is temporary until we stop returning IterableDataset from this function.
    class _IterableDataset(IterableDataset):
        # Differences from datasets.iterable_dataset.IterableDataset:
        #   - No torch handling (which attempts to deal with worker processes).
        #   - Assumes an arrow iterator.
        def __iter__(self):
            from datasets.formatting import get_formatter

            source = self._prepare_ex_iterable_for_iteration()
            formatting = self._formatting
            use_arrow = formatting and (source.iter_arrow or formatting.is_table)

            if not use_arrow:
                for _, example in source:
                    # no need to format thanks to FormattedExamplesIterable
                    yield example
                return

            formatter = get_formatter(formatting.format_type, features=self.features)
            for _, pa_table in source.iter_arrow():
                yield formatter.format_row(pa_table)

        def map(self, *args, **kwargs):
            # map() builds a brand new IterableDataset, so the result has to be "patched"
            # to keep using the __iter__ above (i.e. to keep avoiding torch handling).
            mapped = super().map(*args, **kwargs)
            if isinstance(mapped, IterableDataset):
                mapped.__class__ = _IterableDataset  # type: ignore
            return mapped

    target_features = Features.from_arrow_schema(_hf_compatible_schema(stream.schema))
    return _IterableDataset(
        ex_iterable=_ArrowExamplesIterable(_generate_tables, target_features),
        info=DatasetInfo(features=target_features),
    )
|
@@ -0,0 +1,44 @@
|
|
1
|
+
from spiral.core.client import KeySpaceIndex as CoreKeySpaceIndex
|
2
|
+
from spiral.expressions import Expr
|
3
|
+
from spiral.types_ import Timestamp
|
4
|
+
|
5
|
+
|
6
|
+
class KeySpaceIndex:
    """
    A KeySpaceIndex represents an optionally materialized key space, defined by a projection and a
    filter over a table. It can be used to efficiently and precisely shard the table for parallel
    processing or distributed training.

    An index is defined by:
    - A granularity that defines the target size of key ranges in the index.
      IMPORTANT: Actual key ranges may be smaller, but will not exceed twice the granularity.
    - A projection expression that defines which columns are included in the resulting key space.
    - An optional filter expression that defines which rows are included in the index.
    """

    def __init__(self, core: CoreKeySpaceIndex, *, name: str | None = None):
        # `core` is the underlying core index object; `name` is an optional display name.
        self._name = name
        self.core = core

    @property
    def index_id(self) -> str:
        """The id of the underlying core index."""
        return self.core.id

    @property
    def table_id(self) -> str:
        """The table id reported by the underlying core index."""
        return self.core.table_id

    @property
    def name(self) -> str:
        """The name given at construction, or the index id when no (or an empty) name was given."""
        if self._name:
            return self._name
        return self.index_id

    @property
    def asof(self) -> Timestamp:
        """The `asof` timestamp reported by the underlying core index."""
        return self.core.asof

    @property
    def projection(self) -> Expr:
        """The core index's projection, wrapped in an `Expr`."""
        return Expr(self.core.projection)

    @property
    def filter(self) -> Expr | None:
        """The core index's filter wrapped in an `Expr`, or None when the index has no filter."""
        if self.core.filter is None:
            return None
        return Expr(self.core.filter)
|
spiral/project.py
ADDED
@@ -0,0 +1,199 @@
|
|
1
|
+
from typing import TYPE_CHECKING, Any
|
2
|
+
|
3
|
+
import pyarrow as pa
|
4
|
+
|
5
|
+
from spiral.api.projects import KeySpaceIndexResource, TableResource, TextIndexResource
|
6
|
+
from spiral.core.table.spec import Schema
|
7
|
+
from spiral.expressions import ExprLike
|
8
|
+
from spiral.key_space_index import KeySpaceIndex
|
9
|
+
from spiral.table import Table
|
10
|
+
from spiral.text_index import TextIndex
|
11
|
+
from spiral.types_ import Uri
|
12
|
+
|
13
|
+
if TYPE_CHECKING:
|
14
|
+
from spiral.client import Spiral
|
15
|
+
|
16
|
+
|
17
|
+
class Project:
    """A handle to one project, bound to a `Spiral` client.

    Provides listing, lookup and creation of the project's tables, text indexes and
    key space indexes.
    """

    def __init__(self, spiral: "Spiral", project_id: str, name: str | None = None):
        self._spiral = spiral
        self._id = project_id
        self._name = name

    def __str__(self):
        return self._id

    def __repr__(self):
        return f"Project(id={self._id}{', name=' + self._name if self._name else ''})"

    @property
    def id(self) -> str:
        """The project id."""
        return self._id

    @property
    def name(self) -> str:
        """The project name, falling back to the project id when no name is set."""
        return self._name or self._id

    def list_tables(self) -> list[TableResource]:
        """Return the table resources the API lists for this project."""
        return list(self._spiral.api.project.list_tables(self._id))

    def list_text_indexes(self) -> list[TextIndexResource]:
        """Return the text index resources the API lists for this project."""
        return list(self._spiral.api.project.list_text_indexes(self._id))

    def list_key_space_indexes(self) -> list[KeySpaceIndexResource]:
        """Return the key space index resources the API lists for this project."""
        return list(self._spiral.api.project.list_key_space_indexes(self._id))

    def table(self, identifier: str) -> Table:
        """Open a table with a `dataset.table` identifier, or `table` name using the `default` dataset.

        Raises:
            ValueError: If the identifier is malformed or no matching table is listed.
        """
        dataset, table = self._parse_table_identifier(identifier)

        matches = list(self._spiral.api.project.list_tables(project_id=self._id, dataset=dataset, table=table))
        if not matches:
            raise ValueError(f"Table not found: {self._id}.{dataset}.{table}")
        # The first listed match is used.
        res = matches[0]

        return Table(
            self._spiral, self._spiral._core.table(res.id), identifier=f"{res.project_id}.{res.dataset}.{res.table}"
        )

    def create_table(
        self,
        identifier: str,
        *,
        key_schema: pa.Schema | Any,
        root_uri: Uri | None = None,
        exist_ok: bool = False,
    ) -> Table:
        """Create a new table in the project.

        Args:
            identifier: The table identifier, in the form `dataset.table` or `table`.
            key_schema: The schema of the table's keys. Anything that is not already a
                `pa.Schema` is passed through `pa.schema(...)` first.
            root_uri: The root URI for the table.
            exist_ok: If True, do not raise an error if the table already exists.

        Raises:
            ValueError: If the identifier is malformed.
        """
        dataset, table = self._parse_table_identifier(identifier)

        if not isinstance(key_schema, pa.Schema):
            key_schema = pa.schema(key_schema)
        key_schema = Schema.from_arrow(key_schema)

        core_table = self._spiral._core.create_table(
            project_id=self._id,
            dataset=dataset,
            table=table,
            key_schema=key_schema,
            root_uri=root_uri,
            exist_ok=exist_ok,
        )

        return Table(self._spiral, core_table, identifier=f"{self._id}.{dataset}.{table}")

    def _parse_table_identifier(self, identifier: str) -> tuple[str, str]:
        """Split `identifier` into `(dataset, table)`.

        A bare `table` name maps to the `default` dataset.

        Raises:
            ValueError: If the identifier has more than two dot-separated parts, or any
                part is empty (e.g. `""`, `"dataset."`, `".table"`).
        """
        parts = identifier.split(".")
        # Reject empty components up front so an empty dataset/table name is never
        # forwarded to the API as if it were a real name.
        if len(parts) > 2 or not all(parts):
            raise ValueError(f"Invalid table identifier: {self._id}.{identifier}")
        if len(parts) == 1:
            return "default", parts[0]
        return parts[0], parts[1]

    @staticmethod
    def _lift_index_exprs(projections: tuple[ExprLike, ...], where: ExprLike | None) -> tuple[Any, Any]:
        """Merge `projections` into one expression and lift `where` (left as None if not given).

        Raises:
            ValueError: If no projection was supplied.
        """
        # Imported locally, as the index-creation methods originally did.
        from spiral import expressions as se

        if not projections:
            raise ValueError("At least one projection is required.")
        projection = se.merge(*projections)
        lifted_where = se.lift(where) if where is not None else None
        return projection, lifted_where

    def text_index(self, name: str) -> TextIndex:
        """Returns the index with the given name.

        Raises:
            ValueError: If no text index with that name is listed for the project.
        """
        matches = list(self._spiral.api.project.list_text_indexes(project_id=self._id, name=name))
        if not matches:
            raise ValueError(f"Index not found: {name}")
        res = matches[0]

        return TextIndex(self._spiral._core.text_index(res.id), name=name)

    def create_text_index(
        self,
        name: str,
        *projections: ExprLike,
        where: ExprLike | None = None,
        root_uri: Uri | None = None,
        exist_ok: bool = False,
    ) -> TextIndex:
        """Creates a text index over the table projection.

        See `se.text.field` for how to create and configure indexable fields.

        Args:
            name: The index name. Must be unique within the project.
            projections: At least one projection expression is required.
                All projections must reference the same table.
            where: An optional filter expression to apply to the index.
            root_uri: The root URI for the index.
            exist_ok: If True, do not raise an error if the index already exists.

        Raises:
            ValueError: If no projection is given.
        """
        projection, where = self._lift_index_exprs(projections, where)

        core_index = self._spiral._core.create_text_index(
            project_id=self._id,
            name=name,
            projection=projection.__expr__,
            # Compare against None explicitly instead of relying on the truthiness of an
            # expression object; this matches the check used when lifting `where`.
            filter=where.__expr__ if where is not None else None,
            root_uri=root_uri,
            # TODO(marko): Validate that if an index exists, it's the same?
            exist_ok=exist_ok,
        )

        return TextIndex(core_index, name=name)

    def key_space_index(self, name: str) -> KeySpaceIndex:
        """Returns the index with the given name.

        Raises:
            ValueError: If no key space index with that name is listed for the project.
        """
        matches = list(self._spiral.api.project.list_key_space_indexes(project_id=self._id, name=name))
        if not matches:
            raise ValueError(f"Index not found: {name}")
        res = matches[0]

        return KeySpaceIndex(self._spiral._core.key_space_index(res.id), name=name)

    def create_key_space_index(
        self,
        name: str,
        granularity: int,
        *projections: ExprLike,
        where: ExprLike | None = None,
        root_uri: Uri | None = None,
        exist_ok: bool = False,
    ) -> KeySpaceIndex:
        """Creates a key space index over the table projection.

        Args:
            name: The index name. Must be unique within the project.
            granularity: The granularity at which to store keys, i.e. the size of desired key ranges.
                The key ranges will not be greater than 2x the granularity, but may be smaller.
            projections: At least one projection expression is required.
                All projections must reference the same table.
            where: An optional filter expression to apply to the index.
            root_uri: The root URI for the index.
            exist_ok: If True, do not raise an error if the index already exists.

        Raises:
            ValueError: If no projection is given.
        """
        projection, where = self._lift_index_exprs(projections, where)

        core_index = self._spiral._core.create_key_space_index(
            project_id=self._id,
            name=name,
            granularity=granularity,
            projection=projection.__expr__,
            # Compare against None explicitly instead of relying on the truthiness of an
            # expression object; this matches the check used when lifting `where`.
            filter=where.__expr__ if where is not None else None,
            root_uri=root_uri,
            # TODO(marko): Validate that if an index exists, it's the same?
            exist_ok=exist_ok,
        )

        return KeySpaceIndex(core_index, name=name)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|