pyspiral 0.8.9__cp311-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyspiral-0.8.9.dist-info/METADATA +53 -0
- pyspiral-0.8.9.dist-info/RECORD +114 -0
- pyspiral-0.8.9.dist-info/WHEEL +4 -0
- pyspiral-0.8.9.dist-info/entry_points.txt +3 -0
- spiral/__init__.py +55 -0
- spiral/_lib.abi3.so +0 -0
- spiral/adbc.py +411 -0
- spiral/api/__init__.py +78 -0
- spiral/api/admin.py +15 -0
- spiral/api/client.py +165 -0
- spiral/api/filesystems.py +152 -0
- spiral/api/key_space_indexes.py +23 -0
- spiral/api/organizations.py +78 -0
- spiral/api/projects.py +219 -0
- spiral/api/telemetry.py +19 -0
- spiral/api/text_indexes.py +56 -0
- spiral/api/types.py +23 -0
- spiral/api/workers.py +40 -0
- spiral/api/workloads.py +52 -0
- spiral/arrow_.py +202 -0
- spiral/cli/__init__.py +89 -0
- spiral/cli/__main__.py +4 -0
- spiral/cli/admin.py +33 -0
- spiral/cli/app.py +108 -0
- spiral/cli/console.py +95 -0
- spiral/cli/fs.py +109 -0
- spiral/cli/iceberg.py +97 -0
- spiral/cli/key_spaces.py +103 -0
- spiral/cli/login.py +25 -0
- spiral/cli/orgs.py +81 -0
- spiral/cli/printer.py +53 -0
- spiral/cli/projects.py +148 -0
- spiral/cli/state.py +7 -0
- spiral/cli/tables.py +225 -0
- spiral/cli/telemetry.py +17 -0
- spiral/cli/text.py +115 -0
- spiral/cli/types.py +50 -0
- spiral/cli/workloads.py +86 -0
- spiral/client.py +279 -0
- spiral/core/__init__.pyi +0 -0
- spiral/core/_tools/__init__.pyi +5 -0
- spiral/core/authn/__init__.pyi +21 -0
- spiral/core/client/__init__.pyi +270 -0
- spiral/core/config/__init__.pyi +35 -0
- spiral/core/expr/__init__.pyi +15 -0
- spiral/core/expr/images/__init__.pyi +3 -0
- spiral/core/expr/list_/__init__.pyi +4 -0
- spiral/core/expr/pushdown/__init__.pyi +3 -0
- spiral/core/expr/refs/__init__.pyi +4 -0
- spiral/core/expr/s3/__init__.pyi +3 -0
- spiral/core/expr/str_/__init__.pyi +3 -0
- spiral/core/expr/struct_/__init__.pyi +6 -0
- spiral/core/expr/text/__init__.pyi +5 -0
- spiral/core/expr/udf/__init__.pyi +14 -0
- spiral/core/expr/video/__init__.pyi +3 -0
- spiral/core/table/__init__.pyi +142 -0
- spiral/core/table/manifests/__init__.pyi +35 -0
- spiral/core/table/metastore/__init__.pyi +58 -0
- spiral/core/table/spec/__init__.pyi +214 -0
- spiral/dataloader.py +310 -0
- spiral/dataset.py +264 -0
- spiral/datetime_.py +27 -0
- spiral/debug/__init__.py +0 -0
- spiral/debug/manifests.py +103 -0
- spiral/debug/metrics.py +56 -0
- spiral/debug/scan.py +266 -0
- spiral/demo.py +100 -0
- spiral/enrichment.py +290 -0
- spiral/expressions/__init__.py +274 -0
- spiral/expressions/base.py +186 -0
- spiral/expressions/file.py +17 -0
- spiral/expressions/http.py +17 -0
- spiral/expressions/list_.py +77 -0
- spiral/expressions/pushdown.py +12 -0
- spiral/expressions/s3.py +16 -0
- spiral/expressions/str_.py +39 -0
- spiral/expressions/struct.py +59 -0
- spiral/expressions/text.py +62 -0
- spiral/expressions/tiff.py +225 -0
- spiral/expressions/udf.py +66 -0
- spiral/grpc_.py +32 -0
- spiral/iceberg.py +31 -0
- spiral/iterable_dataset.py +106 -0
- spiral/key_space_index.py +44 -0
- spiral/project.py +247 -0
- spiral/protogen/_/__init__.py +0 -0
- spiral/protogen/_/arrow/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/protocol/__init__.py +0 -0
- spiral/protogen/_/arrow/flight/protocol/sql/__init__.py +2548 -0
- spiral/protogen/_/google/__init__.py +0 -0
- spiral/protogen/_/google/protobuf/__init__.py +2310 -0
- spiral/protogen/_/message_pool.py +3 -0
- spiral/protogen/_/py.typed +0 -0
- spiral/protogen/_/scandal/__init__.py +190 -0
- spiral/protogen/_/spfs/__init__.py +72 -0
- spiral/protogen/_/spql/__init__.py +61 -0
- spiral/protogen/_/substrait/__init__.py +6196 -0
- spiral/protogen/_/substrait/extensions/__init__.py +169 -0
- spiral/protogen/__init__.py +0 -0
- spiral/protogen/util.py +41 -0
- spiral/py.typed +0 -0
- spiral/scan.py +383 -0
- spiral/server.py +37 -0
- spiral/settings.py +36 -0
- spiral/snapshot.py +61 -0
- spiral/streaming_/__init__.py +3 -0
- spiral/streaming_/reader.py +133 -0
- spiral/streaming_/stream.py +156 -0
- spiral/substrait_.py +274 -0
- spiral/table.py +216 -0
- spiral/text_index.py +17 -0
- spiral/transaction.py +156 -0
- spiral/types_.py +6 -0
spiral/debug/metrics.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def display_metrics(metrics: dict[str, Any]) -> None:
    """Print one left-aligned table row per metric, sorted by metric name.

    Each entry of ``metrics`` is read through the keys ``type``, ``count``,
    ``avg``, ``min``, ``max``, ``p95``, ``p99`` and ``stddev``. The count is
    printed as a thousands-separated integer; every other statistic is
    rendered by ``_format_value``.
    """
    # Column widths, in the order the columns are printed.
    widths = (40, 10, 10, 12, 12, 12, 12, 12, 12)
    titles = ("Metric", "Type", "Count", "Avg", "Min", "Max", "P95", "P99", "StdDev")
    stat_keys = ("avg", "min", "max", "p95", "p99", "stddev")

    def _row(cells) -> str:
        # Left-align every cell in its column, single space between columns.
        return " ".join(f"{cell:<{width}}" for cell, width in zip(cells, widths))

    print(_row(titles))
    print("=" * 142)

    for metric_name, data in sorted(metrics.items()):
        metric_type = data["type"]
        count = f"{int(data['count']):,}"
        stats = [_format_value(data[key], metric_type, metric_name) for key in stat_keys]
        print(_row([metric_name, metric_type, count, *stats]))
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _format_duration(nanoseconds: float) -> str:
|
|
29
|
+
"""Convert nanoseconds to human-readable duration."""
|
|
30
|
+
if nanoseconds >= 1_000_000_000:
|
|
31
|
+
return f"{nanoseconds / 1_000_000_000:.2f}s"
|
|
32
|
+
elif nanoseconds >= 1_000_000:
|
|
33
|
+
return f"{nanoseconds / 1_000_000:.2f}ms"
|
|
34
|
+
elif nanoseconds >= 1_000:
|
|
35
|
+
return f"{nanoseconds / 1_000:.2f}μs"
|
|
36
|
+
else:
|
|
37
|
+
return f"{nanoseconds:.0f}ns"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _format_bytes(bytes_value: float) -> str:
|
|
41
|
+
"""Convert bytes to human-readable size."""
|
|
42
|
+
for unit in ["B", "KB", "MB", "GB"]:
|
|
43
|
+
if bytes_value < 1024:
|
|
44
|
+
return f"{bytes_value:.1f}{unit}"
|
|
45
|
+
bytes_value /= 1024
|
|
46
|
+
return f"{bytes_value:.1f}TB"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _format_value(value: float, metric_type: str, metric_name: str) -> str:
|
|
50
|
+
"""Format a value based on metric type and name."""
|
|
51
|
+
if metric_type == "timer" or "duration" in metric_name:
|
|
52
|
+
return _format_duration(value)
|
|
53
|
+
elif "bytes" in metric_name:
|
|
54
|
+
return _format_bytes(value)
|
|
55
|
+
else:
|
|
56
|
+
return f"{value:,.0f}"
|
spiral/debug/scan.py
ADDED
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
|
|
3
|
+
from spiral.core.table import Scan
|
|
4
|
+
from spiral.core.table.manifests import FragmentFile, FragmentManifest
|
|
5
|
+
from spiral.core.table.spec import Key
|
|
6
|
+
from spiral.types_ import Timestamp
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def show_scan(scan: Scan):
    """Displays a scan in a way that is useful for debugging.

    One plot is shown for the key space manifest and one per column group.
    All plots share the same list of key points so their x-axes line up.

    Raises:
        NotImplementedError: if the scan covers more than one table.
    """
    table_ids = scan.table_ids()
    if len(table_ids) > 1:
        raise NotImplementedError("Multiple table scan is not supported.")
    table_id = table_ids[0]
    column_groups = scan.column_groups()

    splits = [shard.key_range for shard in scan.shards()]
    key_space_manifest = scan.key_space_state(table_id).manifest

    def _extent_bounds(manifest):
        # Yield the min and max key of every fragment in the manifest.
        for position in range(len(manifest)):
            fragment = manifest[position]
            yield fragment.key_extent.min
            yield fragment.key_extent.max

    # Gather key bounds from every manifest so all visualizations are aligned.
    boundaries = set(_extent_bounds(key_space_manifest))
    for group in column_groups:
        boundaries.update(_extent_bounds(scan.column_group_state(group).manifest))

    # Split points must be present too; the end of the last split is skipped.
    boundaries.update(key_range.end for key_range in splits[:-1])
    key_points = sorted(boundaries)

    show_manifest(key_space_manifest, scope="Key space", key_points=key_points, splits=splits)
    for group in scan.column_groups():
        group_state = scan.column_group_state(group)
        # The first path element is the table id, which is left out of the title.
        show_manifest(group_state.manifest, scope=".".join(group.path[1:]), key_points=key_points, splits=splits)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def show_manifest(
    manifest: FragmentManifest,
    scope: str | None = None,
    key_points: list[Key] | None = None,
    splits: list | None = None,
):
    """Plot every fragment of ``manifest`` as a rectangle and show the figure.

    The x-axis is the ordinal position of a fragment's min/max key within
    ``key_points``; the height of a rectangle is the rank of the fragment's
    size among the distinct sizes in the manifest. Hovering a rectangle or a
    legend entry shows the fragment details.

    Args:
        manifest: The manifest to draw; accessed by ``len()`` and integer index.
        scope: Optional prefix for the plot title.
        key_points: Keys defining the x-axis. When omitted they are derived
            from the fragment key extents (plus split ends, if given).
        splits: Optional key ranges; the ``end`` of every range except the
            last is marked with a red triangle on the x-axis.

    Raises:
        ImportError: if matplotlib is not installed.
    """
    try:
        import matplotlib.patches as patches
        import matplotlib.pyplot as plt
    except ImportError as err:
        raise ImportError("matplotlib is required for debug") from err

    total_fragments = len(manifest)
    # Fetch every fragment once instead of re-indexing the manifest per pass.
    fragments: list[FragmentFile] = [manifest[i] for i in range(total_fragments)]

    size_points = sorted({fragment.size_bytes for fragment in fragments})

    if key_points is None:
        derived_points = set()
        for fragment in fragments:
            derived_points.add(fragment.key_extent.min)
            derived_points.add(fragment.key_extent.max)
        if splits is not None:
            derived_points.update(split.end for split in splits[:-1])
        key_points = derived_points

    # Positions on the x-axis are indexes into this sorted list.
    key_points = sorted(key_points)

    # Create figure and axis with specified size
    fig, ax = plt.subplots(figsize=(12, 8))

    # Plot each rectangle
    for i, manifest_file in enumerate(fragments):
        left = key_points.index(manifest_file.key_extent.min)
        right = key_points.index(manifest_file.key_extent.max)
        height = size_points.index(manifest_file.size_bytes) + 1

        color = _get_fragment_color(manifest_file, i, total_fragments)

        rect = patches.Rectangle(
            (left, 0),  # (x, y)
            right - left,  # width
            height,  # height
            facecolor=color,  # fill color
            edgecolor="black",  # border color
            alpha=0.5,  # transparency
            linewidth=1,  # border width
            label=manifest_file.id,  # label for legend
        )
        ax.add_patch(rect)

    # Set axis limits with some padding
    ax.set_xlim(-0.5, len(key_points) - 1 + 0.5)
    ax.set_ylim(-0.5, len(size_points) + 0.5)

    # Create split markers on x-axis
    if splits is not None:
        split_positions = [key_points.index(split.end) for split in splits[:-1]]

        # Add split markers at the bottom
        for pos in split_positions:
            ax.annotate("▲", xy=(pos, 0), ha="center", va="top", color="red", annotation_clip=False)

    # Add grid
    ax.grid(True, linestyle="--", alpha=0.7, zorder=0)

    # Add labels and title
    ax.set_title("Fragment Distribution" if scope is None else f"{scope} Fragment Distribution")
    ax.set_xlabel("Key Index")
    ax.set_ylabel("Size Index")

    # Add legend; with no fragments there are no labeled artists to list.
    if fragments:
        ax.legend(bbox_to_anchor=(1, 1), loc="upper left", fontsize="small")

    # Adjust layout to prevent label cutoff
    plt.tight_layout()

    plot = FragmentManifestPlot(fig, ax, manifest)
    fig.canvas.mpl_connect("motion_notify_event", plot.hover)
    # The canvas holds only a weak reference to bound-method callbacks. Anchor
    # the plot on the figure so hovering keeps working when plt.show() returns
    # immediately (interactive / notebook backends).
    fig._fragment_manifest_plot = plot

    plt.show()
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _get_fragment_color(manifest_file: FragmentFile, color_index, total_colors):
    """Return the fill color for one fragment.

    Fragments with a ``compacted_at`` value get a gray RGB tuple whose shade
    ranges from 0.3 upwards with ``color_index``; all other fragments are
    colored from the viridis colormap at ``color_index / total_colors``.
    """
    from matplotlib import cm

    fraction = color_index / total_colors
    if manifest_file.compacted_at is None:
        # Non-compacted fragments use the viridis colormap.
        return cm.viridis(fraction)

    # Compacted fragments: a gray shade that varies with the index so
    # different compacted fragments remain distinguishable.
    shade = 0.3 + (0.5 * fraction)
    return (shade, shade, shade)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _get_human_size(size_bytes: int) -> str:
|
|
150
|
+
# Convert bytes to a human-readable format
|
|
151
|
+
for unit in ["B", "KB", "MB", "GB", "TB"]:
|
|
152
|
+
if size_bytes < 1024:
|
|
153
|
+
return f"{size_bytes:.2f} {unit}"
|
|
154
|
+
size_bytes /= 1024
|
|
155
|
+
return f"{size_bytes:.2f} PB"
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _maybe_truncate(text, max_length: int = 30) -> str:
|
|
159
|
+
text = str(text)
|
|
160
|
+
if len(text) <= max_length:
|
|
161
|
+
return text
|
|
162
|
+
|
|
163
|
+
half_length = (max_length - 3) // 2
|
|
164
|
+
return text[:half_length] + "..." + text[-half_length:]
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _get_fragment_legend(manifest_file: FragmentFile):
    """Build the multi-line tooltip text describing one fragment.

    Each line has the form ``label: value``; keys are shortened with
    ``_maybe_truncate`` and timestamps rendered with ``_format_timestamp``.
    """
    details = (
        ("id", manifest_file.id),
        ("size", f"{_get_human_size(manifest_file.size_bytes)} ({manifest_file.size_bytes} bytes)"),
        ("key_span", manifest_file.key_span),
        ("key_min", _maybe_truncate(manifest_file.key_extent.min)),
        ("key_max", _maybe_truncate(manifest_file.key_extent.max)),
        ("format", manifest_file.format),
        ("level", manifest_file.level),
        ("committed_at", _format_timestamp(manifest_file.committed_at)),
        ("compacted_at", _format_timestamp(manifest_file.compacted_at)),
        ("ks_id", manifest_file.ks_id),
    )
    return "\n".join(f"{label}: {value}" for label, value in details)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _format_timestamp(ts: Timestamp | None) -> str:
|
|
185
|
+
# Format timestamp or show None
|
|
186
|
+
if ts is None:
|
|
187
|
+
return "None"
|
|
188
|
+
try:
|
|
189
|
+
return datetime.fromtimestamp(ts / 1e6).strftime("%Y-%m-%d %H:%M:%S")
|
|
190
|
+
except ValueError:
|
|
191
|
+
return str(ts)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
class FragmentManifestPlot:
    """Hover behavior for a fragment plot: tooltip plus rectangle highlight.

    ``hover`` is meant to be connected to the figure's motion event. It
    assumes the i-th patch of the axes and the i-th legend text both belong
    to ``manifest[i]``.
    """

    def __init__(self, fig, ax, manifest: FragmentManifest):
        self.fig = fig
        self.ax = ax
        self.manifest = manifest

        # Tooltip anchored near the bottom-right corner of the axes,
        # hidden until something is hovered.
        tooltip_box = dict(boxstyle="round,pad=0.5", fc="white", ec="gray", alpha=0.8)
        self.annotation = ax.annotate(
            "",
            xy=(0.98, 0.02),
            xycoords="axes fraction",
            bbox=tooltip_box,
            ha="right",
            va="bottom",
            visible=False,
        )
        self.highlighted_rect = None
        self.highlighted_legend = None

    def hover(self, event):
        """Show details for the artist under the cursor, or hide the tooltip."""
        if event.inaxes == self.ax:
            # Inside the plot: test the fragment rectangles.
            candidates = self.ax.patches
        else:
            # Outside the plot: only the legend entries can be hovered.
            legend = self.ax.get_legend()
            on_legend = legend and legend.contains(event)[0]
            candidates = legend.get_texts() if on_legend else ()

        for position, artist in enumerate(candidates):
            if artist.contains(event)[0]:
                self._show_legend(self.manifest[position], position, artist)
                return

        self._hide_legend()

    def _show_legend(self, manifest_file, index, highlight_obj):
        """Display the tooltip for ``manifest_file`` and highlight its rectangle."""
        from matplotlib.patches import Rectangle

        tooltip = self.annotation
        tooltip.set_text(_get_fragment_legend(manifest_file))
        tooltip.set_visible(True)

        previous = self.highlighted_rect
        if not isinstance(highlight_obj, Rectangle):
            # Hovering a legend text: highlight the rectangle at the same index.
            if previous:
                previous.set_alpha(0.5)
            target = self.ax.patches[index]
        else:
            # Hovering a rectangle: un-highlight the previous one if it differs.
            if previous and previous != highlight_obj:
                previous.set_alpha(0.5)
            target = highlight_obj

        target.set_alpha(0.8)
        self.highlighted_rect = target

        self.fig.canvas.draw_idle()

    def _hide_legend(self):
        """Hide the tooltip and restore the highlighted rectangle's alpha."""
        if not self.annotation.get_visible():
            return
        self.annotation.set_visible(False)
        if self.highlighted_rect:
            self.highlighted_rect.set_alpha(0.5)
        self.fig.canvas.draw_idle()
|
spiral/demo.py
ADDED
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""Demo data to play with SpiralDB"""
|
|
2
|
+
|
|
3
|
+
import functools
|
|
4
|
+
import time
|
|
5
|
+
|
|
6
|
+
import duckdb
|
|
7
|
+
import pandas as pd
|
|
8
|
+
import pyarrow as pa
|
|
9
|
+
from datasets import load_dataset
|
|
10
|
+
|
|
11
|
+
from spiral import Project, Spiral, Table
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _install_duckdb_extension(name: str, max_retries: int = 3) -> None:
    """Install and load a DuckDB extension, retrying on ``duckdb.IOException``.

    Between attempts the function sleeps 0.5s times the attempt number. The
    exception from the final attempt is re-raised.
    """
    final_attempt = max_retries - 1
    for attempt in range(max_retries):
        try:
            duckdb.execute(f"INSTALL {name}; LOAD {name};")
        except duckdb.IOException:
            if attempt >= final_attempt:
                raise
            # Linearly growing pause before the next try.
            time.sleep(0.5 * (attempt + 1))
        else:
            return
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@functools.lru_cache(maxsize=1)
def demo_project(sp: Spiral) -> Project:
    """Create a project with the "demo" id prefix; cached for the latest ``sp``."""
    project = sp.create_project(id_prefix="demo")
    return project
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@functools.lru_cache(maxsize=1)
def images(sp: Spiral) -> Table:
    """Create the ``openimages.images-v1`` demo table and write ten rows to it.

    The rows come from the Open Images validation TSV (columns ``url``,
    ``size``, ``etag``) and are keyed by a generated ``idx`` column. The
    result is cached for the most recent ``sp``.
    """
    key_schema = pa.schema([("idx", pa.int64())])
    table = demo_project(sp).create_table("openimages.images-v1", key_schema=key_schema, exist_ok=False)

    # Load URLs from a TSV file
    listing = pd.read_csv(
        "https://storage.googleapis.com/cvdf-datasets/oid/open-images-dataset-validation.tsv",
        names=["url", "size", "etag"],
        skiprows=1,
        sep="\t",
        header=None,
    )

    # For this example, we load just a few rows, but Spiral can handle many more.
    sample = pa.Table.from_pandas(listing[:10])
    row_ids = pa.array(range(len(sample)))
    sample = sample.append_column("idx", row_ids)

    # Only the per-image metadata columns are written here.
    table.write(sample)
    return table
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@functools.lru_cache(maxsize=1)
def gharchive(sp: Spiral, limit=100, period=None) -> Table:
    """Create the ``gharchive.events`` demo table from one hour of GH Archive.

    Reads up to ``limit`` events for ``period`` (defaulting to the hour
    starting 2023-01-01T00:00) through DuckDB, keys them by
    ``(created_at, id)`` and writes them to a new table. The result is cached
    for the most recent argument combination.
    """
    if period is None:
        period = pd.Period("2023-01-01T00:00:00Z", freq="h")

    _install_duckdb_extension("httpfs")

    day = period.strftime("%Y-%m-%d")
    json_gz_url = f"https://data.gharchive.org/{day}-{str(period.hour)}.json.gz"

    raw = duckdb.read_json(json_gz_url, union_by_name=True)
    limited = raw.limit(limit)
    projected = limited.select("""
        * REPLACE (
            cast(created_at AS TIMESTAMP_MS) AS created_at,
        )
        """)
    arrow_table = projected.to_arrow_table()

    events = duckdb.from_arrow(arrow_table).order("created_at, id").distinct().to_arrow_table()

    # Re-insert the key columns at the front with the desired Arrow types,
    # then drop the "org" column.
    id_column = events["id"].cast(pa.large_string())
    created_column = events["created_at"].cast(pa.timestamp("ms"))
    reshaped = events.drop_columns("id").add_column(0, "id", id_column)
    reshaped = reshaped.drop_columns("created_at").add_column(0, "created_at", created_column)
    events = reshaped.drop_columns("org")

    key_schema = pa.schema([("created_at", pa.timestamp("ms")), ("id", pa.string_view())])
    table = demo_project(sp).create_table("gharchive.events", key_schema=key_schema, exist_ok=False)
    table.write(events, push_down_nulls=True)
    return table
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@functools.lru_cache(maxsize=1)
def fineweb(sp: Spiral, limit=100) -> Table:
    """Create the ``fineweb.v1`` demo table from the first ``limit`` FineWeb rows.

    Rows are streamed from the ``sample-10BT`` configuration of
    ``HuggingFaceFW/fineweb`` and written keyed by ``id``. The result is
    cached for the most recent argument combination.
    """
    key_schema = pa.schema([("id", pa.string_view())])
    table = demo_project(sp).create_table("fineweb.v1", key_schema=key_schema, exist_ok=False)

    dataset = load_dataset("HuggingFaceFW/fineweb", "sample-10BT", streaming=True)
    head = dataset["train"].take(limit)
    rows = head.to_list()
    arrow_table = pa.Table.from_pylist(rows)

    table.write(arrow_table, push_down_nulls=True)
    return table
|