pyspiral 0.8.9__cp311-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114) hide show
  1. pyspiral-0.8.9.dist-info/METADATA +53 -0
  2. pyspiral-0.8.9.dist-info/RECORD +114 -0
  3. pyspiral-0.8.9.dist-info/WHEEL +4 -0
  4. pyspiral-0.8.9.dist-info/entry_points.txt +3 -0
  5. spiral/__init__.py +55 -0
  6. spiral/_lib.abi3.so +0 -0
  7. spiral/adbc.py +411 -0
  8. spiral/api/__init__.py +78 -0
  9. spiral/api/admin.py +15 -0
  10. spiral/api/client.py +165 -0
  11. spiral/api/filesystems.py +152 -0
  12. spiral/api/key_space_indexes.py +23 -0
  13. spiral/api/organizations.py +78 -0
  14. spiral/api/projects.py +219 -0
  15. spiral/api/telemetry.py +19 -0
  16. spiral/api/text_indexes.py +56 -0
  17. spiral/api/types.py +23 -0
  18. spiral/api/workers.py +40 -0
  19. spiral/api/workloads.py +52 -0
  20. spiral/arrow_.py +202 -0
  21. spiral/cli/__init__.py +89 -0
  22. spiral/cli/__main__.py +4 -0
  23. spiral/cli/admin.py +33 -0
  24. spiral/cli/app.py +108 -0
  25. spiral/cli/console.py +95 -0
  26. spiral/cli/fs.py +109 -0
  27. spiral/cli/iceberg.py +97 -0
  28. spiral/cli/key_spaces.py +103 -0
  29. spiral/cli/login.py +25 -0
  30. spiral/cli/orgs.py +81 -0
  31. spiral/cli/printer.py +53 -0
  32. spiral/cli/projects.py +148 -0
  33. spiral/cli/state.py +7 -0
  34. spiral/cli/tables.py +225 -0
  35. spiral/cli/telemetry.py +17 -0
  36. spiral/cli/text.py +115 -0
  37. spiral/cli/types.py +50 -0
  38. spiral/cli/workloads.py +86 -0
  39. spiral/client.py +279 -0
  40. spiral/core/__init__.pyi +0 -0
  41. spiral/core/_tools/__init__.pyi +5 -0
  42. spiral/core/authn/__init__.pyi +21 -0
  43. spiral/core/client/__init__.pyi +270 -0
  44. spiral/core/config/__init__.pyi +35 -0
  45. spiral/core/expr/__init__.pyi +15 -0
  46. spiral/core/expr/images/__init__.pyi +3 -0
  47. spiral/core/expr/list_/__init__.pyi +4 -0
  48. spiral/core/expr/pushdown/__init__.pyi +3 -0
  49. spiral/core/expr/refs/__init__.pyi +4 -0
  50. spiral/core/expr/s3/__init__.pyi +3 -0
  51. spiral/core/expr/str_/__init__.pyi +3 -0
  52. spiral/core/expr/struct_/__init__.pyi +6 -0
  53. spiral/core/expr/text/__init__.pyi +5 -0
  54. spiral/core/expr/udf/__init__.pyi +14 -0
  55. spiral/core/expr/video/__init__.pyi +3 -0
  56. spiral/core/table/__init__.pyi +142 -0
  57. spiral/core/table/manifests/__init__.pyi +35 -0
  58. spiral/core/table/metastore/__init__.pyi +58 -0
  59. spiral/core/table/spec/__init__.pyi +214 -0
  60. spiral/dataloader.py +310 -0
  61. spiral/dataset.py +264 -0
  62. spiral/datetime_.py +27 -0
  63. spiral/debug/__init__.py +0 -0
  64. spiral/debug/manifests.py +103 -0
  65. spiral/debug/metrics.py +56 -0
  66. spiral/debug/scan.py +266 -0
  67. spiral/demo.py +100 -0
  68. spiral/enrichment.py +290 -0
  69. spiral/expressions/__init__.py +274 -0
  70. spiral/expressions/base.py +186 -0
  71. spiral/expressions/file.py +17 -0
  72. spiral/expressions/http.py +17 -0
  73. spiral/expressions/list_.py +77 -0
  74. spiral/expressions/pushdown.py +12 -0
  75. spiral/expressions/s3.py +16 -0
  76. spiral/expressions/str_.py +39 -0
  77. spiral/expressions/struct.py +59 -0
  78. spiral/expressions/text.py +62 -0
  79. spiral/expressions/tiff.py +225 -0
  80. spiral/expressions/udf.py +66 -0
  81. spiral/grpc_.py +32 -0
  82. spiral/iceberg.py +31 -0
  83. spiral/iterable_dataset.py +106 -0
  84. spiral/key_space_index.py +44 -0
  85. spiral/project.py +247 -0
  86. spiral/protogen/_/__init__.py +0 -0
  87. spiral/protogen/_/arrow/__init__.py +0 -0
  88. spiral/protogen/_/arrow/flight/__init__.py +0 -0
  89. spiral/protogen/_/arrow/flight/protocol/__init__.py +0 -0
  90. spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +2548 -0
  91. spiral/protogen/_/google/__init__.py +0 -0
  92. spiral/protogen/_/google/protobuf/__init__.py +2310 -0
  93. spiral/protogen/_/message_pool.py +3 -0
  94. spiral/protogen/_/py.typed +0 -0
  95. spiral/protogen/_/scandal/__init__.py +190 -0
  96. spiral/protogen/_/spfs/__init__.py +72 -0
  97. spiral/protogen/_/spql/__init__.py +61 -0
  98. spiral/protogen/_/substrait/__init__.py +6196 -0
  99. spiral/protogen/_/substrait/extensions/__init__.py +169 -0
  100. spiral/protogen/__init__.py +0 -0
  101. spiral/protogen/util.py +41 -0
  102. spiral/py.typed +0 -0
  103. spiral/scan.py +383 -0
  104. spiral/server.py +37 -0
  105. spiral/settings.py +36 -0
  106. spiral/snapshot.py +61 -0
  107. spiral/streaming_/__init__.py +3 -0
  108. spiral/streaming_/reader.py +133 -0
  109. spiral/streaming_/stream.py +156 -0
  110. spiral/substrait_.py +274 -0
  111. spiral/table.py +216 -0
  112. spiral/text_index.py +17 -0
  113. spiral/transaction.py +156 -0
  114. spiral/types_.py +6 -0
@@ -0,0 +1,56 @@
1
+ from typing import Any
2
+
3
+
4
def display_metrics(metrics: dict[str, Any]) -> None:
    """Print a fixed-width table with one row per metric, ordered by metric name.

    Every value in ``metrics`` is read as a mapping with the keys ``type``,
    ``count``, ``avg``, ``min``, ``max``, ``p95``, ``p99`` and ``stddev``.
    """
    row_template = "{:<40} {:<10} {:<10} {:<12} {:<12} {:<12} {:<12} {:<12} {:<12}"
    print(row_template.format("Metric", "Type", "Count", "Avg", "Min", "Max", "P95", "P99", "StdDev"))
    print("=" * 142)

    stat_keys = ("avg", "min", "max", "p95", "p99", "stddev")
    for name, stats in sorted(metrics.items()):
        kind = stats["type"]
        total = f"{int(stats['count']):,}"
        # The statistics share one formatter; the metric type/name decide the unit.
        rendered = [_format_value(stats[key], kind, name) for key in stat_keys]
        print(row_template.format(name, kind, total, *rendered))
26
+
27
+
28
+ def _format_duration(nanoseconds: float) -> str:
29
+ """Convert nanoseconds to human-readable duration."""
30
+ if nanoseconds >= 1_000_000_000:
31
+ return f"{nanoseconds / 1_000_000_000:.2f}s"
32
+ elif nanoseconds >= 1_000_000:
33
+ return f"{nanoseconds / 1_000_000:.2f}ms"
34
+ elif nanoseconds >= 1_000:
35
+ return f"{nanoseconds / 1_000:.2f}μs"
36
+ else:
37
+ return f"{nanoseconds:.0f}ns"
38
+
39
+
40
+ def _format_bytes(bytes_value: float) -> str:
41
+ """Convert bytes to human-readable size."""
42
+ for unit in ["B", "KB", "MB", "GB"]:
43
+ if bytes_value < 1024:
44
+ return f"{bytes_value:.1f}{unit}"
45
+ bytes_value /= 1024
46
+ return f"{bytes_value:.1f}TB"
47
+
48
+
49
+ def _format_value(value: float, metric_type: str, metric_name: str) -> str:
50
+ """Format a value based on metric type and name."""
51
+ if metric_type == "timer" or "duration" in metric_name:
52
+ return _format_duration(value)
53
+ elif "bytes" in metric_name:
54
+ return _format_bytes(value)
55
+ else:
56
+ return f"{value:,.0f}"
spiral/debug/scan.py ADDED
@@ -0,0 +1,266 @@
1
+ from datetime import datetime
2
+
3
+ from spiral.core.table import Scan
4
+ from spiral.core.table.manifests import FragmentFile, FragmentManifest
5
+ from spiral.core.table.spec import Key
6
+ from spiral.types_ import Timestamp
7
+
8
+
9
def show_scan(scan: Scan):
    """Plot the key space manifest and every column group manifest of a scan.

    All plots share one sorted list of key points (fragment extents plus the
    shard split ends) so that their x-axes line up.

    Raises:
        NotImplementedError: if the scan covers more than one table.
    """
    table_ids = scan.table_ids()
    if len(table_ids) > 1:
        raise NotImplementedError("Multiple table scan is not supported.")
    table_id = table_ids[0]
    column_groups = scan.column_groups()

    splits = [shard.key_range for shard in scan.shards()]
    key_space_state = scan.key_space_state(table_id)

    bounds = set()

    def _collect_extents(manifest) -> None:
        # Manifests are accessed by length and index, as in the rest of this module.
        for position in range(len(manifest)):
            extent = manifest[position].key_extent
            bounds.add(extent.min)
            bounds.add(extent.max)

    key_space_manifest = key_space_state.manifest
    _collect_extents(key_space_manifest)
    for column_group in column_groups:
        _collect_extents(scan.column_group_state(column_group).manifest)

    # Split ends must be key points as well; the end of the last split is skipped.
    for key_range in splits[:-1]:
        bounds.add(key_range.end)
    key_points = sorted(bounds)

    show_manifest(key_space_manifest, scope="Key space", key_points=key_points, splits=splits)
    for column_group in scan.column_groups():
        state = scan.column_group_state(column_group)
        # The first path element is skipped (the table id, per the original comment).
        scope = ".".join(column_group.path[1:])
        show_manifest(state.manifest, scope=scope, key_points=key_points, splits=splits)
45
+
46
+
47
def show_manifest(
    manifest: FragmentManifest,
    scope: str | None = None,
    key_points: list[Key] | None = None,
    splits: list | None = None,
):
    """Draw every fragment of ``manifest`` as a rectangle and show the figure.

    The x-axis is the index of a fragment's min/max key within ``key_points``;
    the height is the rank of the fragment's ``size_bytes`` among the distinct
    sizes in the manifest (so axes are ordinal, not to scale).

    Args:
        manifest: Fragment manifest, accessed via ``len()`` and integer indexing.
        scope: Optional prefix for the plot title.
        key_points: Sorted key points to align the x-axis with other plots.
            When omitted they are derived from the fragments (and ``splits``).
        splits: Optional key ranges; the ``end`` of every split except the last
            is marked on the x-axis.

    Raises:
        ImportError: if matplotlib is not installed.
    """
    try:
        import matplotlib.patches as patches
        import matplotlib.pyplot as plt
    except ImportError as err:
        # Chain the original error so the real import failure stays visible.
        raise ImportError("matplotlib is required for debug") from err

    total_fragments = len(manifest)
    fragments: list[FragmentFile] = [manifest[i] for i in range(total_fragments)]

    size_points = sorted({fragment.size_bytes for fragment in fragments})

    if key_points is None:
        derived = set()
        for fragment in fragments:
            derived.add(fragment.key_extent.min)
            derived.add(fragment.key_extent.max)
        if splits is not None:
            derived.update(split.end for split in splits[:-1])
        key_points = sorted(derived)

    # Create figure and axis with specified size
    fig, ax = plt.subplots(figsize=(12, 8))

    # One rectangle per fragment, added in manifest order: the hover handler
    # maps ax.patches[i] back to manifest[i].
    for i, fragment in enumerate(fragments):
        left = key_points.index(fragment.key_extent.min)
        right = key_points.index(fragment.key_extent.max)
        height = size_points.index(fragment.size_bytes) + 1

        rect = patches.Rectangle(
            (left, 0),  # (x, y)
            right - left,  # width
            height,  # height
            facecolor=_get_fragment_color(fragment, i, total_fragments),
            edgecolor="black",
            alpha=0.5,
            linewidth=1,
            label=fragment.id,  # label for legend
        )
        ax.add_patch(rect)

    # Set axis limits with some padding
    ax.set_xlim(-0.5, len(key_points) - 1 + 0.5)
    ax.set_ylim(-0.5, len(size_points) + 0.5)

    # Mark split boundaries just below the x-axis
    if splits is not None:
        for pos in [key_points.index(split.end) for split in splits[:-1]]:
            ax.annotate("▲", xy=(pos, 0), ha="center", va="top", color="red", annotation_clip=False)

    ax.grid(True, linestyle="--", alpha=0.7, zorder=0)

    ax.set_title("Fragment Distribution" if scope is None else f"{scope} Fragment Distribution")
    ax.set_xlabel("Key Index")
    ax.set_ylabel("Size Index")

    # An empty manifest has no labelled artists; skip the legend instead of
    # triggering matplotlib's "no handles" warning. The hover handler already
    # copes with a missing legend.
    if total_fragments:
        ax.legend(bbox_to_anchor=(1, 1), loc="upper left", fontsize="small")

    # Adjust layout to prevent label cutoff
    plt.tight_layout()

    plot = FragmentManifestPlot(fig, ax, manifest)
    fig.canvas.mpl_connect("motion_notify_event", plot.hover)

    plt.show()
134
+
135
+
136
def _get_fragment_color(manifest_file: FragmentFile, color_index, total_colors):
    """Return a fill color for a fragment based on its position in the manifest.

    Compacted fragments (``compacted_at`` set) get a gray RGB tuple whose shade
    ranges from 0.3 upward with the index; all others get a viridis color.
    """
    import matplotlib.cm as cm

    is_compacted = manifest_file.compacted_at is not None
    fraction = color_index / total_colors
    if not is_compacted:
        return cm.viridis(fraction)

    # Vary the gray level so different compacted fragments stay distinguishable.
    shade = 0.3 + (0.5 * fraction)
    return (shade, shade, shade)
147
+
148
+
149
+ def _get_human_size(size_bytes: int) -> str:
150
+ # Convert bytes to a human-readable format
151
+ for unit in ["B", "KB", "MB", "GB", "TB"]:
152
+ if size_bytes < 1024:
153
+ return f"{size_bytes:.2f} {unit}"
154
+ size_bytes /= 1024
155
+ return f"{size_bytes:.2f} PB"
156
+
157
+
158
+ def _maybe_truncate(text, max_length: int = 30) -> str:
159
+ text = str(text)
160
+ if len(text) <= max_length:
161
+ return text
162
+
163
+ half_length = (max_length - 3) // 2
164
+ return text[:half_length] + "..." + text[-half_length:]
165
+
166
+
167
def _get_fragment_legend(manifest_file: FragmentFile):
    """Build the multi-line ``label: value`` tooltip text for one fragment."""
    rows = [
        ("id", manifest_file.id),
        ("size", f"{_get_human_size(manifest_file.size_bytes)} ({manifest_file.size_bytes} bytes)"),
        ("key_span", manifest_file.key_span),
        ("key_min", _maybe_truncate(manifest_file.key_extent.min)),
        ("key_max", _maybe_truncate(manifest_file.key_extent.max)),
        ("format", manifest_file.format),
        ("level", manifest_file.level),
        ("committed_at", _format_timestamp(manifest_file.committed_at)),
        ("compacted_at", _format_timestamp(manifest_file.compacted_at)),
        ("ks_id", manifest_file.ks_id),
    ]
    return "\n".join(f"{label}: {value}" for label, value in rows)
182
+
183
+
184
+ def _format_timestamp(ts: Timestamp | None) -> str:
185
+ # Format timestamp or show None
186
+ if ts is None:
187
+ return "None"
188
+ try:
189
+ return datetime.fromtimestamp(ts / 1e6).strftime("%Y-%m-%d %H:%M:%S")
190
+ except ValueError:
191
+ return str(ts)
192
+
193
+
194
class FragmentManifestPlot:
    """Hover behaviour for a fragment plot: tooltip text plus rectangle highlight.

    ``hover`` is meant to be connected to the figure's ``motion_notify_event``.
    It relies on ``ax.patches[i]`` and the i-th legend text both corresponding
    to ``manifest[i]``.
    """

    def __init__(self, fig, ax, manifest: FragmentManifest):
        self.fig = fig
        self.ax = ax
        self.manifest = manifest

        # Hidden tooltip anchored to the bottom right corner of the axes.
        tooltip_options = {
            "xy": (0.98, 0.02),
            "xycoords": "axes fraction",
            "bbox": {"boxstyle": "round,pad=0.5", "fc": "white", "ec": "gray", "alpha": 0.8},
            "ha": "right",
            "va": "bottom",
            "visible": False,
        }
        self.annotation = ax.annotate("", **tooltip_options)
        self.highlighted_rect = None
        self.highlighted_legend = None

    def hover(self, event):
        """Show details for the legend entry or rectangle under the cursor, else hide them."""
        if event.inaxes != self.ax:
            # Outside the axes only the legend entries can be hit.
            legend = self.ax.get_legend()
            candidates = legend.get_texts() if legend and legend.contains(event)[0] else ()
        else:
            candidates = self.ax.patches

        for position, artist in enumerate(candidates):
            if artist.contains(event)[0]:
                self._show_legend(self.manifest[position], position, artist)
                return

        self._hide_legend()

    def _show_legend(self, manifest_file, index, highlight_obj):
        """Display the tooltip for ``manifest_file`` and emphasise its rectangle."""
        from matplotlib.patches import Rectangle

        self.annotation.set_text(_get_fragment_legend(manifest_file))
        self.annotation.set_visible(True)

        previous = self.highlighted_rect
        if isinstance(highlight_obj, Rectangle):
            # Hovering a rectangle directly: only reset a different, older highlight.
            target = highlight_obj
            if previous and previous != target:
                previous.set_alpha(0.5)
        else:
            # Hovering a legend text: highlight the rectangle at the same index.
            if previous:
                previous.set_alpha(0.5)
            target = self.ax.patches[index]

        target.set_alpha(0.8)
        self.highlighted_rect = target

        self.fig.canvas.draw_idle()

    def _hide_legend(self):
        """Hide the tooltip and restore the highlighted rectangle's alpha, if shown."""
        if not self.annotation.get_visible():
            return
        self.annotation.set_visible(False)
        if self.highlighted_rect:
            self.highlighted_rect.set_alpha(0.5)
        self.fig.canvas.draw_idle()
spiral/demo.py ADDED
@@ -0,0 +1,100 @@
1
+ """Demo data to play with SpiralDB"""
2
+
3
+ import functools
4
+ import time
5
+
6
+ import duckdb
7
+ import pandas as pd
8
+ import pyarrow as pa
9
+ from datasets import load_dataset
10
+
11
+ from spiral import Project, Spiral, Table
12
+
13
+
14
def _install_duckdb_extension(name: str, max_retries: int = 3) -> None:
    """Install and load a DuckDB extension, retrying on ``duckdb.IOException``.

    Waits 0.5s, 1.0s, ... between attempts and re-raises the error from the
    final attempt.
    """
    final_attempt = max_retries - 1
    attempt = 0
    while attempt < max_retries:
        try:
            duckdb.execute(f"INSTALL {name}; LOAD {name};")
        except duckdb.IOException:
            if attempt >= final_attempt:
                raise
            # Linear backoff before the next try.
            time.sleep(0.5 * (attempt + 1))
        else:
            return
        attempt += 1
25
+
26
+
27
@functools.lru_cache(maxsize=1)
def demo_project(sp: Spiral) -> Project:
    """Create a project with the ``demo`` id prefix; the latest result is cached per ``sp``."""
    project = sp.create_project(id_prefix="demo")
    return project
30
+
31
+
32
@functools.lru_cache(maxsize=1)
def images(sp: Spiral) -> Table:
    """Create the ``openimages.images-v1`` demo table and write 10 metadata rows.

    The rows come from the Open Images validation TSV (url, size, etag) and are
    keyed by a generated ``idx`` column. Table creation uses ``exist_ok=False``.
    """
    project = demo_project(sp)
    table = project.create_table(
        "openimages.images-v1",
        key_schema=pa.schema([("idx", pa.int64())]),
        exist_ok=False,
    )

    # Load URLs from a TSV file
    frame = pd.read_csv(
        "https://storage.googleapis.com/cvdf-datasets/oid/open-images-dataset-validation.tsv",
        names=["url", "size", "etag"],
        skiprows=1,
        sep="\t",
        header=None,
    )
    # For this example, we load just a few rows, but Spiral can handle many more.
    sample = pa.Table.from_pandas(frame[:10])
    keyed = sample.append_column("idx", pa.array(range(len(sample))))

    # Write just the metadata - lightweight and fast
    table.write(keyed)
    return table
53
+
54
+
55
@functools.lru_cache(maxsize=1)
def gharchive(sp: Spiral, limit=100, period=None) -> Table:
    """Create the ``gharchive.events`` demo table from one hour of GH Archive data.

    Reads up to ``limit`` events for ``period`` (default: 2023-01-01, hour 0)
    through DuckDB, de-duplicates them, drops the ``org`` column and writes the
    result keyed by ``(created_at, id)``. Table creation uses ``exist_ok=False``.
    """
    if period is None:
        period = pd.Period("2023-01-01T00:00:00Z", freq="h")

    _install_duckdb_extension("httpfs")

    day = period.strftime("%Y-%m-%d")
    hour = str(period.hour)
    json_gz_url = f"https://data.gharchive.org/{day}-{hour}.json.gz"

    relation = duckdb.read_json(json_gz_url, union_by_name=True).limit(limit)
    relation = relation.select("""
            * REPLACE (
                cast(created_at AS TIMESTAMP_MS) AS created_at,
            )
            """)
    arrow_table = relation.to_arrow_table()

    events = duckdb.from_arrow(arrow_table).order("created_at, id").distinct().to_arrow_table()

    # Re-insert the key columns with the desired types; created_at ends up
    # first and id second.
    recast = events.drop_columns("id")
    recast = recast.add_column(0, "id", events["id"].cast(pa.large_string()))
    recast = recast.drop_columns("created_at")
    recast = recast.add_column(0, "created_at", events["created_at"].cast(pa.timestamp("ms")))
    events = recast.drop_columns("org")

    key_schema = pa.schema([("created_at", pa.timestamp("ms")), ("id", pa.string_view())])
    table = demo_project(sp).create_table("gharchive.events", key_schema=key_schema, exist_ok=False)
    table.write(events, push_down_nulls=True)
    return table
87
+
88
+
89
@functools.lru_cache(maxsize=1)
def fineweb(sp: Spiral, limit=100) -> Table:
    """Create the ``fineweb.v1`` demo table from the first ``limit`` FineWeb rows.

    Rows are streamed from the ``sample-10BT`` config of ``HuggingFaceFW/fineweb``
    and written keyed by ``id``. Table creation uses ``exist_ok=False``.
    """
    project = demo_project(sp)
    key_schema = pa.schema([("id", pa.string_view())])
    table = project.create_table("fineweb.v1", key_schema=key_schema, exist_ok=False)

    dataset = load_dataset("HuggingFaceFW/fineweb", "sample-10BT", streaming=True)
    rows = dataset["train"].take(limit).to_list()

    table.write(pa.Table.from_pylist(rows), push_down_nulls=True)
    return table