pyspiral 0.6.6__cp312-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. pyspiral-0.6.6.dist-info/METADATA +51 -0
  2. pyspiral-0.6.6.dist-info/RECORD +102 -0
  3. pyspiral-0.6.6.dist-info/WHEEL +4 -0
  4. pyspiral-0.6.6.dist-info/entry_points.txt +2 -0
  5. spiral/__init__.py +35 -0
  6. spiral/_lib.abi3.so +0 -0
  7. spiral/adbc.py +411 -0
  8. spiral/api/__init__.py +78 -0
  9. spiral/api/admin.py +15 -0
  10. spiral/api/client.py +164 -0
  11. spiral/api/filesystems.py +134 -0
  12. spiral/api/key_space_indexes.py +23 -0
  13. spiral/api/organizations.py +77 -0
  14. spiral/api/projects.py +219 -0
  15. spiral/api/telemetry.py +19 -0
  16. spiral/api/text_indexes.py +56 -0
  17. spiral/api/types.py +22 -0
  18. spiral/api/workers.py +40 -0
  19. spiral/api/workloads.py +52 -0
  20. spiral/arrow_.py +216 -0
  21. spiral/cli/__init__.py +88 -0
  22. spiral/cli/__main__.py +4 -0
  23. spiral/cli/admin.py +14 -0
  24. spiral/cli/app.py +104 -0
  25. spiral/cli/console.py +95 -0
  26. spiral/cli/fs.py +76 -0
  27. spiral/cli/iceberg.py +97 -0
  28. spiral/cli/key_spaces.py +89 -0
  29. spiral/cli/login.py +24 -0
  30. spiral/cli/orgs.py +89 -0
  31. spiral/cli/printer.py +53 -0
  32. spiral/cli/projects.py +147 -0
  33. spiral/cli/state.py +5 -0
  34. spiral/cli/tables.py +174 -0
  35. spiral/cli/telemetry.py +17 -0
  36. spiral/cli/text.py +115 -0
  37. spiral/cli/types.py +50 -0
  38. spiral/cli/workloads.py +58 -0
  39. spiral/client.py +178 -0
  40. spiral/core/__init__.pyi +0 -0
  41. spiral/core/_tools/__init__.pyi +5 -0
  42. spiral/core/authn/__init__.pyi +27 -0
  43. spiral/core/client/__init__.pyi +237 -0
  44. spiral/core/table/__init__.pyi +101 -0
  45. spiral/core/table/manifests/__init__.pyi +35 -0
  46. spiral/core/table/metastore/__init__.pyi +58 -0
  47. spiral/core/table/spec/__init__.pyi +213 -0
  48. spiral/dataloader.py +285 -0
  49. spiral/dataset.py +255 -0
  50. spiral/datetime_.py +27 -0
  51. spiral/debug/__init__.py +0 -0
  52. spiral/debug/manifests.py +87 -0
  53. spiral/debug/metrics.py +56 -0
  54. spiral/debug/scan.py +266 -0
  55. spiral/expressions/__init__.py +276 -0
  56. spiral/expressions/base.py +157 -0
  57. spiral/expressions/http.py +86 -0
  58. spiral/expressions/io.py +100 -0
  59. spiral/expressions/list_.py +68 -0
  60. spiral/expressions/mp4.py +62 -0
  61. spiral/expressions/png.py +18 -0
  62. spiral/expressions/qoi.py +18 -0
  63. spiral/expressions/refs.py +58 -0
  64. spiral/expressions/str_.py +39 -0
  65. spiral/expressions/struct.py +59 -0
  66. spiral/expressions/text.py +62 -0
  67. spiral/expressions/tiff.py +223 -0
  68. spiral/expressions/udf.py +46 -0
  69. spiral/grpc_.py +32 -0
  70. spiral/iceberg.py +31 -0
  71. spiral/iterable_dataset.py +106 -0
  72. spiral/key_space_index.py +44 -0
  73. spiral/project.py +199 -0
  74. spiral/protogen/_/__init__.py +0 -0
  75. spiral/protogen/_/arrow/__init__.py +0 -0
  76. spiral/protogen/_/arrow/flight/__init__.py +0 -0
  77. spiral/protogen/_/arrow/flight/protocol/__init__.py +0 -0
  78. spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +2548 -0
  79. spiral/protogen/_/google/__init__.py +0 -0
  80. spiral/protogen/_/google/protobuf/__init__.py +2310 -0
  81. spiral/protogen/_/message_pool.py +3 -0
  82. spiral/protogen/_/py.typed +0 -0
  83. spiral/protogen/_/scandal/__init__.py +190 -0
  84. spiral/protogen/_/spfs/__init__.py +72 -0
  85. spiral/protogen/_/spql/__init__.py +61 -0
  86. spiral/protogen/_/substrait/__init__.py +6196 -0
  87. spiral/protogen/_/substrait/extensions/__init__.py +169 -0
  88. spiral/protogen/__init__.py +0 -0
  89. spiral/protogen/util.py +41 -0
  90. spiral/py.typed +0 -0
  91. spiral/scan.py +285 -0
  92. spiral/server.py +17 -0
  93. spiral/settings.py +114 -0
  94. spiral/snapshot.py +56 -0
  95. spiral/streaming_/__init__.py +3 -0
  96. spiral/streaming_/reader.py +133 -0
  97. spiral/streaming_/stream.py +157 -0
  98. spiral/substrait_.py +274 -0
  99. spiral/table.py +293 -0
  100. spiral/text_index.py +17 -0
  101. spiral/transaction.py +58 -0
  102. spiral/types_.py +6 -0
@@ -0,0 +1,106 @@
1
+ from collections.abc import Callable, Iterator
2
+ from typing import TYPE_CHECKING
3
+
4
+ import pyarrow as pa
5
+
6
+ if TYPE_CHECKING:
7
+ import datasets.iterable_dataset as hf # noqa
8
+ import streaming # noqa
9
+ import torch.utils.data as torchdata # noqa
10
+
11
+
12
def _hf_compatible_schema(schema: pa.Schema) -> pa.Schema:
    """
    Build a copy of ``schema`` in which view types are swapped for their plain counterparts.

    ``string_view`` becomes ``string`` and ``binary_view`` becomes ``binary``. Struct, list,
    large-list, fixed-size-list and map types are rebuilt around their converted children;
    every other type is passed through untouched.
    The converted schema is used as Features in the returned Dataset.
    Remove this method once we have https://github.com/huggingface/datasets/pull/7718
    """

    def _plain(dtype: pa.DataType) -> pa.DataType:
        # Leaf view types map directly onto their non-view equivalents.
        if dtype == pa.string_view():
            return pa.string()
        if dtype == pa.binary_view():
            return pa.binary()

        # Container types: convert the children and rebuild the container.
        if pa.types.is_struct(dtype):
            return pa.struct(
                [
                    pa.field(child.name, _plain(child.type), nullable=child.nullable, metadata=child.metadata)
                    for child in dtype
                ]
            )
        if pa.types.is_list(dtype):
            return pa.list_(_plain(dtype.value_type))
        if pa.types.is_large_list(dtype):
            return pa.large_list(_plain(dtype.value_type))
        if pa.types.is_fixed_size_list(dtype):
            # Passing the list size to pa.list_ keeps the rebuilt type fixed-size.
            return pa.list_(_plain(dtype.value_type), dtype.list_size)
        if pa.types.is_map(dtype):
            return pa.map_(_plain(dtype.key_type), _plain(dtype.item_type))

        return dtype

    return pa.schema(
        [
            pa.field(column.name, _plain(column.type), nullable=column.nullable, metadata=column.metadata)
            for column in schema
        ]
    )
48
+
49
+
50
def to_iterable_dataset(stream: pa.RecordBatchReader) -> "hf.IterableDataset":
    """
    Wrap a record batch reader in a Hugging Face ``IterableDataset``.

    Every record batch read from ``stream`` is exposed as a single-batch Arrow table. The
    dataset's features are derived from the stream schema after view types have been replaced
    by `_hf_compatible_schema`.
    """
    from datasets import DatasetInfo, Features
    from datasets.builder import ArrowExamplesIterable
    from datasets.iterable_dataset import IterableDataset

    def _generate_tables(**kwargs) -> Iterator[tuple[int, pa.Table]]:
        # This key is unused when training with IterableDataset.
        # Default implementation returns shard id, e.g. parquet row group id.
        for position, batch in enumerate(stream):
            yield position, pa.Table.from_batches([batch], stream.schema)

    # TODO(marko): This is temporary until we stop returning IterableDataset from this function.
    class _IterableDataset(IterableDataset):
        # Diff with datasets.iterable_dataset.IterableDataset:
        #  - Removes torch handling which attempts to handle worker processes.
        #  - Assumes arrow iterator.
        def __iter__(self):
            from datasets.formatting import get_formatter

            ex_iterable = self._prepare_ex_iterable_for_iteration()
            formatting = self._formatting
            use_arrow = formatting and (ex_iterable.iter_arrow or formatting.is_table)

            if not use_arrow:
                for _, example in ex_iterable:
                    # no need to format thanks to FormattedExamplesIterable
                    yield example
                return

            formatter = get_formatter(formatting.format_type, features=self.features)
            for _, pa_table in ex_iterable.iter_arrow():
                yield formatter.format_row(pa_table)

        def map(self, *args, **kwargs):
            # Map constructs a new IterableDataset, so we need to "patch" it
            mapped = super().map(*args, **kwargs)
            if isinstance(mapped, IterableDataset):
                # Patch __iter__ to avoid torch handling
                mapped.__class__ = _IterableDataset  # type: ignore
            return mapped

    class _ArrowExamplesIterable(ArrowExamplesIterable):
        def __init__(self, generate_tables_fn: Callable[..., Iterator[tuple[int, pa.Table]]], features: Features):
            # NOTE: generate_tables_fn type annotations are wrong, return type must be an iterable of tuples.
            super().__init__(generate_tables_fn, kwargs={})  # type: ignore
            self._features = features

        @property
        def is_typed(self) -> bool:
            return True

        @property
        def features(self) -> Features:
            return self._features

    features = Features.from_arrow_schema(_hf_compatible_schema(stream.schema))
    return _IterableDataset(
        ex_iterable=_ArrowExamplesIterable(_generate_tables, features),
        info=DatasetInfo(features=features),
    )
@@ -0,0 +1,44 @@
1
+ from spiral.core.client import KeySpaceIndex as CoreKeySpaceIndex
2
+ from spiral.expressions import Expr
3
+ from spiral.types_ import Timestamp
4
+
5
+
6
class KeySpaceIndex:
    """
    KeySpaceIndex represents an optionally materialized key space, defined by a projection and a filter over a table.
    It can be used to efficiently and precisely shard the table for parallel processing or distributed training.

    An index is defined by:
    - A granularity that defines the target size of key ranges in the index.
      IMPORTANT: Actual key ranges may be smaller, but will not exceed twice the granularity.
    - A projection expression that defines which columns are included in the resulting key space.
    - An optional filter expression that defines which rows are included in the index.
    """

    def __init__(self, core: CoreKeySpaceIndex, *, name: str | None = None):
        """Wrap the core index handle, optionally remembering a display name."""
        self._name = name
        self.core = core

    @property
    def index_id(self) -> str:
        """The id reported by the core index."""
        return self.core.id

    @property
    def table_id(self) -> str:
        """The table id reported by the core index."""
        return self.core.table_id

    @property
    def name(self) -> str:
        """The name given at construction, or the index id when no (non-empty) name was given."""
        if self._name:
            return self._name
        return self.index_id

    @property
    def asof(self) -> Timestamp:
        """The `asof` timestamp reported by the core index."""
        return self.core.asof

    @property
    def projection(self) -> Expr:
        """The core projection wrapped in an `Expr`."""
        return Expr(self.core.projection)

    @property
    def filter(self) -> Expr | None:
        """The core filter wrapped in an `Expr`, or None when the core index has no filter."""
        core_filter = self.core.filter
        if core_filter is None:
            return None
        return Expr(core_filter)
spiral/project.py ADDED
@@ -0,0 +1,199 @@
1
+ from typing import TYPE_CHECKING, Any
2
+
3
+ import pyarrow as pa
4
+
5
+ from spiral.api.projects import KeySpaceIndexResource, TableResource, TextIndexResource
6
+ from spiral.core.table.spec import Schema
7
+ from spiral.expressions import ExprLike
8
+ from spiral.key_space_index import KeySpaceIndex
9
+ from spiral.table import Table
10
+ from spiral.text_index import TextIndex
11
+ from spiral.types_ import Uri
12
+
13
+ if TYPE_CHECKING:
14
+ from spiral.client import Spiral
15
+
16
+
17
class Project:
    """Handle for a single project, used to look up and create its tables and indexes."""

    def __init__(self, spiral: "Spiral", project_id: str, name: str | None = None):
        """Bind the project id (and optional display name) to a client.

        Args:
            spiral: The client used for all API and core calls.
            project_id: The project identifier.
            name: Optional human-readable name; `name` falls back to the id when absent.
        """
        self._spiral = spiral
        self._id = project_id
        self._name = name

    def __str__(self):
        return self._id

    def __repr__(self):
        return f"Project(id={self._id}{', name=' + self._name if self._name else ''})"

    @property
    def id(self) -> str:
        """The project identifier."""
        return self._id

    @property
    def name(self) -> str:
        """The project name, or the project id when no (non-empty) name was given."""
        return self._name or self._id

    def list_tables(self) -> list[TableResource]:
        """List the table resources of this project."""
        return list(self._spiral.api.project.list_tables(self._id))

    def list_text_indexes(self) -> list[TextIndexResource]:
        """List the text index resources of this project."""
        return list(self._spiral.api.project.list_text_indexes(self._id))

    def list_key_space_indexes(self) -> list[KeySpaceIndexResource]:
        """List the key space index resources of this project."""
        return list(self._spiral.api.project.list_key_space_indexes(self._id))

    def table(self, identifier: str) -> Table:
        """Open a table with a `dataset.table` identifier, or `table` name using the `default` dataset.

        Raises:
            ValueError: If the identifier is malformed or no matching table is found.
        """
        dataset, table = self._parse_table_identifier(identifier)

        matches = list(self._spiral.api.project.list_tables(project_id=self._id, dataset=dataset, table=table))
        if not matches:
            raise ValueError(f"Table not found: {self._id}.{dataset}.{table}")
        # Only the first match is used.
        resource = matches[0]

        return Table(
            self._spiral,
            self._spiral._core.table(resource.id),
            identifier=f"{resource.project_id}.{resource.dataset}.{resource.table}",
        )

    def create_table(
        self,
        identifier: str,
        *,
        key_schema: pa.Schema | Any,
        root_uri: Uri | None = None,
        exist_ok: bool = False,
    ) -> Table:
        """Create a new table in the project.

        Args:
            identifier: The table identifier, in the form `dataset.table` or `table`.
            key_schema: The schema of the table's keys. Anything that is not already a
                `pa.Schema` is passed through `pa.schema(...)` first.
            root_uri: The root URI for the table.
            exist_ok: If True, do not raise an error if the table already exists.

        Raises:
            ValueError: If the identifier is malformed.
        """
        dataset, table = self._parse_table_identifier(identifier)

        if not isinstance(key_schema, pa.Schema):
            key_schema = pa.schema(key_schema)

        core_table = self._spiral._core.create_table(
            project_id=self._id,
            dataset=dataset,
            table=table,
            key_schema=Schema.from_arrow(key_schema),
            root_uri=root_uri,
            exist_ok=exist_ok,
        )

        return Table(self._spiral, core_table, identifier=f"{self._id}.{dataset}.{table}")

    def _parse_table_identifier(self, identifier: str) -> tuple[str, str]:
        """Split `identifier` into `(dataset, table)`, using the `default` dataset for a bare table name.

        Raises:
            ValueError: If the identifier has more than two dot-separated parts.
        """
        parts = identifier.split(".")
        if len(parts) == 1:
            return "default", parts[0]
        if len(parts) == 2:
            return parts[0], parts[1]
        raise ValueError(f"Invalid table identifier: {self._id}.{identifier}")

    @staticmethod
    def _lower_index_expressions(
        projections: tuple[ExprLike, ...], where: ExprLike | None
    ) -> tuple[Any, Any | None]:
        """Validate index inputs and return the `(projection, filter)` core expressions.

        Raises:
            ValueError: If no projection is given.
        """
        if not projections:
            raise ValueError("At least one projection is required.")

        # Imported at call time, as the original methods did (presumably to avoid an import cycle; TODO confirm).
        from spiral import expressions as se

        projection = se.merge(*projections)
        # Test against None explicitly: the lifted filter is an expression object, and relying on
        # its truthiness could drop the filter or fail, depending on how it implements `__bool__`.
        filter_expr = se.lift(where).__expr__ if where is not None else None
        return projection.__expr__, filter_expr

    def text_index(self, name: str) -> TextIndex:
        """Returns the index with the given name.

        Raises:
            ValueError: If no text index with that name is found.
        """
        matches = list(self._spiral.api.project.list_text_indexes(project_id=self._id, name=name))
        if not matches:
            raise ValueError(f"Index not found: {name}")

        return TextIndex(self._spiral._core.text_index(matches[0].id), name=name)

    def create_text_index(
        self,
        name: str,
        *projections: ExprLike,
        where: ExprLike | None = None,
        root_uri: Uri | None = None,
        exist_ok: bool = False,
    ) -> TextIndex:
        """Creates a text index over the table projection.

        See `se.text.field` for how to create and configure indexable fields.

        Args:
            name: The index name. Must be unique within the project.
            projections: At least one projection expression is required.
                All projections must reference the same table.
            where: An optional filter expression to apply to the index.
            root_uri: The root URI for the index.
            exist_ok: If True, do not raise an error if the index already exists.

        Raises:
            ValueError: If no projection is given.
        """
        projection, filter_expr = self._lower_index_expressions(projections, where)

        core_index = self._spiral._core.create_text_index(
            project_id=self._id,
            name=name,
            projection=projection,
            filter=filter_expr,
            root_uri=root_uri,
            # TODO(marko): Validate that if an index exists, it's the same?
            exist_ok=exist_ok,
        )

        return TextIndex(core_index, name=name)

    def key_space_index(self, name: str) -> KeySpaceIndex:
        """Returns the index with the given name.

        Raises:
            ValueError: If no key space index with that name is found.
        """
        matches = list(self._spiral.api.project.list_key_space_indexes(project_id=self._id, name=name))
        if not matches:
            raise ValueError(f"Index not found: {name}")

        return KeySpaceIndex(self._spiral._core.key_space_index(matches[0].id), name=name)

    def create_key_space_index(
        self,
        name: str,
        granularity: int,
        *projections: ExprLike,
        where: ExprLike | None = None,
        root_uri: Uri | None = None,
        exist_ok: bool = False,
    ) -> KeySpaceIndex:
        """Creates a key space index over the table projection.

        Args:
            name: The index name. Must be unique within the project.
            granularity: The granularity at which to store keys, i.e. the size of desired key ranges.
                The key ranges will not be greater than 2x the granularity, but may be smaller.
            projections: At least one projection expression is required.
                All projections must reference the same table.
            where: An optional filter expression to apply to the index.
            root_uri: The root URI for the index.
            exist_ok: If True, do not raise an error if the index already exists.

        Raises:
            ValueError: If no projection is given.
        """
        projection, filter_expr = self._lower_index_expressions(projections, where)

        core_index = self._spiral._core.create_key_space_index(
            project_id=self._id,
            name=name,
            granularity=granularity,
            projection=projection,
            filter=filter_expr,
            root_uri=root_uri,
            # TODO(marko): Validate that if an index exists, it's the same?
            exist_ok=exist_ok,
        )

        return KeySpaceIndex(core_index, name=name)
File without changes
File without changes
File without changes
File without changes