more-compute 0.1.4__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. frontend/app/globals.css +322 -77
  2. frontend/app/layout.tsx +98 -82
  3. frontend/components/Cell.tsx +234 -95
  4. frontend/components/Notebook.tsx +430 -199
  5. frontend/components/{AddCellButton.tsx → cell/AddCellButton.tsx} +0 -2
  6. frontend/components/cell/MonacoCell.tsx +726 -0
  7. frontend/components/layout/ConnectionBanner.tsx +41 -0
  8. frontend/components/{Sidebar.tsx → layout/Sidebar.tsx} +16 -11
  9. frontend/components/modals/ConfirmModal.tsx +154 -0
  10. frontend/components/modals/SuccessModal.tsx +140 -0
  11. frontend/components/output/MarkdownRenderer.tsx +116 -0
  12. frontend/components/popups/ComputePopup.tsx +674 -365
  13. frontend/components/popups/MetricsPopup.tsx +11 -7
  14. frontend/components/popups/SettingsPopup.tsx +11 -13
  15. frontend/contexts/PodWebSocketContext.tsx +247 -0
  16. frontend/eslint.config.mjs +11 -0
  17. frontend/lib/monaco-themes.ts +160 -0
  18. frontend/lib/settings.ts +128 -26
  19. frontend/lib/themes.json +9973 -0
  20. frontend/lib/websocket-native.ts +19 -8
  21. frontend/lib/websocket.ts +59 -11
  22. frontend/next.config.ts +8 -0
  23. frontend/package-lock.json +1705 -3
  24. frontend/package.json +8 -1
  25. frontend/styling_README.md +18 -0
  26. kernel_run.py +159 -42
  27. more_compute-0.2.0.dist-info/METADATA +126 -0
  28. more_compute-0.2.0.dist-info/RECORD +100 -0
  29. morecompute/__version__.py +1 -1
  30. morecompute/execution/executor.py +31 -20
  31. morecompute/execution/worker.py +68 -7
  32. morecompute/models/__init__.py +31 -0
  33. morecompute/models/api_models.py +197 -0
  34. morecompute/notebook.py +50 -7
  35. morecompute/server.py +574 -94
  36. morecompute/services/data_manager.py +379 -0
  37. morecompute/services/lsp_service.py +335 -0
  38. morecompute/services/pod_manager.py +122 -20
  39. morecompute/services/pod_monitor.py +138 -0
  40. morecompute/services/prime_intellect.py +87 -63
  41. morecompute/utils/config_util.py +59 -0
  42. morecompute/utils/special_commands.py +11 -5
  43. morecompute/utils/zmq_util.py +51 -0
  44. frontend/components/MarkdownRenderer.tsx +0 -84
  45. frontend/components/popups/PythonPopup.tsx +0 -292
  46. more_compute-0.1.4.dist-info/METADATA +0 -173
  47. more_compute-0.1.4.dist-info/RECORD +0 -86
  48. /frontend/components/{CellButton.tsx → cell/CellButton.tsx} +0 -0
  49. /frontend/components/{ErrorModal.tsx → modals/ErrorModal.tsx} +0 -0
  50. /frontend/components/{CellOutput.tsx → output/CellOutput.tsx} +0 -0
  51. /frontend/components/{ErrorDisplay.tsx → output/ErrorDisplay.tsx} +0 -0
  52. {more_compute-0.1.4.dist-info → more_compute-0.2.0.dist-info}/WHEEL +0 -0
  53. {more_compute-0.1.4.dist-info → more_compute-0.2.0.dist-info}/entry_points.txt +0 -0
  54. {more_compute-0.1.4.dist-info → more_compute-0.2.0.dist-info}/licenses/LICENSE +0 -0
  55. {more_compute-0.1.4.dist-info → more_compute-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,379 @@
+import os
+import shutil
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from .prime_intellect import PrimeIntellectService, CreateDiskRequest, DiskConfig, ProviderConfig, TeamConfig
+
+class DatasetInfo:
+    """
+    container for dataset metadata
+    """
+    def __init__(self, name: str, size_gb: float | None = None, splits: dict | None = None, features: dict | None = None):
+        self.name = name
+        self.size_gb = size_gb
+        self.splits = splits if splits is not None else {}
+        self.features = features if features is not None else {}
+
+class DataManager:
+    """
+    Dataset manager.
+
+    Integrates with Hugging Face datasets and Prime Intellect disk management.
+    """
+    def __init__(
+        self,
+        cache_dir: str | Path | None = None,
+        max_cache_size_gb: float = 50,
+        prime_intellect: "PrimeIntellectService | None" = None
+    ):
+        """
+        Initialize the DataManager.
+
+        args:
+            cache_dir: directory for dataset cache (default: ~/.cache/huggingface/datasets)
+            max_cache_size_gb: max cache size in GB
+            prime_intellect: optional PrimeIntellectService for disk management
+        """
+        self.cache_dir = Path(cache_dir or Path.home() / ".cache" / "huggingface" / "datasets")
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+        self.max_cache_size_gb = max_cache_size_gb
+        self.prime_intellect = prime_intellect
+
+    def get_dataset_info(self, dataset_name: str, config: str | None = None) -> DatasetInfo:
+        """
+        get dataset metadata without downloading it
+
+        args:
+            dataset_name: HuggingFace dataset name
+            config: optional dataset configuration
+
+        returns:
+            DatasetInfo with size, splits, features
+        """
+        try:
+            from datasets import load_dataset_builder
+            builder = load_dataset_builder(dataset_name, config)
+            size_bytes = builder.info.dataset_size if builder.info.dataset_size else None
+            size_gb = size_bytes / (1024**3) if size_bytes else None
+            return DatasetInfo(
+                name=dataset_name,
+                size_gb=size_gb,
+                splits=dict(builder.info.splits) if builder.info.splits else {},
+                features=dict(builder.info.features) if builder.info.features else {}
+            )
+        except Exception:
+            # fallback: return unknown size
+            return DatasetInfo(name=dataset_name)
+
+    def check_environment(self) -> dict[str, object]:
+        """
+        check the current environment and available resources
+
+        returns:
+            dict with available_gb, is_remote, pod_id, cache_usage_gb
+        """
+
+        available_space = shutil.disk_usage(self.cache_dir).free / (1024**3)
+        is_remote_pod = os.getenv('MC_POD_ID') is not None
+
+        return {
+            'available_gb': available_space,
+            'is_remote': is_remote_pod,
+            'pod_id': os.getenv('MC_POD_ID'),
+            'cache_usage_gb': self.get_cache_size()
+        }
+
+    def get_cache_size(self) -> float:
+        """
+        get total cache size in GB
+        """
+        try:
+            total = 0
+            for path in self.cache_dir.rglob('*'):
+                if path.is_file():
+                    total += path.stat().st_size
+            return total / (1024**3)
+        except Exception:
+            return 0.0
+
+    def list_cache_dataset(self) -> list[dict[str, object]]:
+        """
+        list all cached datasets
+        """
+        datasets = []
+        try:
+            for item in self.cache_dir.iterdir():
+                if item.is_dir():
+                    size = sum(f.stat().st_size for f in item.rglob('*') if f.is_file())
+                    datasets.append({
+                        'name': item.name,
+                        'size_gb': size / (1024**3),
+                        'path': str(item)
+                    })
+            return sorted(datasets, key=lambda x: x['size_gb'], reverse=True)  # type: ignore
+        except Exception:
+            return []
+
+    def clear_cache(self, dataset_id: str | None = None) -> dict[str, str]:
+        """
+        clear a specific dataset or the entire cache
+
+        args:
+            dataset_id: optional dataset name to clear; if None, clear everything
+
+        returns:
+            dict with status message
+        """
+
+        try:
+            if dataset_id:
+                for item in self.cache_dir.iterdir():
+                    if dataset_id in item.name:
+                        if item.is_dir():
+                            shutil.rmtree(item)
+                        else:
+                            item.unlink()
+                return {"status": "ok", "message": f"Cleared {dataset_id}"}
+            else:
+                shutil.rmtree(self.cache_dir)  # clear entire cache
+                self.cache_dir.mkdir(parents=True)
+                return {"status": "ok", "message": "Cleared all cache"}
+        except Exception as e:
+            return {"status": "error", "message": str(e)}
+
+    # Core smart loading methods - TO BE IMPLEMENTED
+
+    async def load_smart(
+        self,
+        dataset_name: str,
+        config: str | None = None,
+        split: str | None = None,
+        auto_stream_threshold_gb: float = 10.0,
+        **kwargs
+    ) -> dict[str, object]:
+        """
+        Smart dataset loading with automatic streaming/downloading decisions.
+
+        This is the method that decides what to do based on:
+        - Dataset size (from metadata check)
+        - Available disk space
+        - Whether running on a remote pod
+        - User threshold preferences
+
+        Args:
+            dataset_name: HuggingFace dataset name (e.g., "openai/gsm8k")
+            config: Optional dataset configuration
+            split: Optional split to load
+            auto_stream_threshold_gb: Auto-stream datasets larger than this (default: 10GB)
+            **kwargs: Additional arguments for load_dataset
+
+        Returns:
+            Dict with:
+            - action: "download" | "stream" | "disk_needed" | "recommend_stream"
+            - recommendation: User-facing message explaining the decision
+            - import_code: Python code snippet the user can run
+            - info: DatasetInfo object with metadata
+            - alternatives: (optional) Dict of alternative approaches
+
+        """
+        info = self.get_dataset_info(dataset_name, config)
+        size_gb = info.size_gb
+        env = self.check_environment()
+        available_gb = env['available_gb']
+        is_remote = env['is_remote']
+        config_str = f', "{config}"' if config else ''
+        split_str = f', split="{split}"' if split else ''
+
+        # case 1: size unknown for whatever reason
+        if size_gb is None:
+            return {
+                "action": "stream",
+                "recommendation": "Dataset size unknown, streaming for safety",
+                "import_code": f'from datasets import load_dataset\ndataset = load_dataset("{dataset_name}"{config_str}, streaming=True)',
+                "info": info
+            }
+
+        # case 2: too big for available space
+        if size_gb > available_gb * 0.8:
+            # if on a remote pod with Prime Intellect, suggest an external disk
+            if is_remote and self.prime_intellect:
+                return await self._handle_remote_storage(dataset_name, size_gb, env['pod_id'])
+            else:
+                return {
+                    "action": "stream",
+                    "recommendation": f"Dataset ({size_gb:.1f}GB) too large for available space ({available_gb:.1f}GB). Auto-streaming.",
+                    "import_code": f'from datasets import load_dataset\ndataset = load_dataset("{dataset_name}"{config_str}, streaming=True)',
+                    "info": info
+                }
+
+        # case 3: large dataset, recommend streaming
+        if size_gb > auto_stream_threshold_gb:
+            return {
+                "action": "recommend_stream",
+                "recommendation": f"Large dataset ({size_gb:.1f}GB). Recommend streaming to save space.",
+                "info": info,
+                "alternatives": {
+                    "stream": f'load_dataset("{dataset_name}"{config_str}, streaming=True)',
+                    "download": f'load_dataset("{dataset_name}"{config_str})',
+                    "subset": f'load_dataset("{dataset_name}"{config_str}, split="train[:1000]")'
+                }
+            }
+
+        # case 4: small enough to download
+        return {
+            "action": "download",
+            "recommendation": f"Downloading {size_gb:.1f}GB to cache...",
+            "import_code": f'from datasets import load_dataset\ndataset = load_dataset("{dataset_name}"{config_str}{split_str})',
+            "info": info
+        }
+
+
+    async def _handle_remote_storage(
+        self,
+        dataset_name: str,
+        size_gb: float,
+        pod_id: str
+    ) -> dict[str, object]:
+        """
+        Handle storage when dataset is too large for pod's default disk.
+
+        Called by load_smart() when:
+        1. Dataset won't fit on pod disk
+        2. Running on remote pod (MC_POD_ID env var set)
+        3. Prime Intellect service is configured
+
+        Args:
+            dataset_name: Dataset name
+            size_gb: Dataset size in GB
+            pod_id: Current pod ID from MC_POD_ID env var
+
+        Returns:
+            Dict with:
+            - action: "disk_needed"
+            - recommendation: Explanation of the problem
+            - disk_size_gb: Recommended disk size (dataset size * 1.2)
+            - pod_id: Pod to attach disk to
+            - alternatives: Dict of other options (stream, subset)
+            - estimated_cost: Monthly cost estimate
+        """
+        if not self.prime_intellect:
+            return {
+                "action": "stream",
+                "recommendation": "Dataset too large, streaming recommended"
+            }
+        recommended_disk_size = int(size_gb * 1.2)
+        estimated_cost_monthly = recommended_disk_size * 0.10
+
+        return {
+            "action": "disk_needed",
+            "recommendation": f"Dataset ({size_gb:.1f}GB) requires external disk",
+            "disk_size_gb": recommended_disk_size,
+            "pod_id": pod_id,
+            "alternatives": {
+                "stream": "Stream the dataset (recommended for training)",
+                "create_disk": f"Create {recommended_disk_size}GB disk and attach to pod",
+                "subset": "Load subset for testing"
+            },
+            "estimated_cost": f"${estimated_cost_monthly:.2f}/month"
+        }
+
+
+    async def create_and_attach_disk(
+        self,
+        pod_id: str,
+        disk_name: str,
+        size_gb: int,
+        provider_type: str = "runpod"
+    ) -> dict[str, object]:
+        """
+        Create a disk via Prime Intellect API and attach it to a pod.
+
+        Args:
+            pod_id: Pod to attach disk to
+            disk_name: Human-readable name for the disk
+            size_gb: Disk size in GB
+            provider_type: Cloud provider (default: "runpod")
+
+        Returns:
+            Dict with:
+            - status: "ok" | "error"
+            - disk_id: Created disk ID (if successful)
+            - disk_name: Disk name
+            - size_gb: Disk size
+            - mount_path: Where disk will be mounted (e.g., /mnt/disks/disk-abc123)
+            - message: Instructions for using the disk
+            - error: Error message (if status == "error")
+        """
+        if not self.prime_intellect:
+            return {"status": "error", "message": "Prime Intellect not configured"}
+
+        try:
+            from .prime_intellect import CreateDiskRequest, DiskConfig, ProviderConfig
+            disk_config = DiskConfig(name=disk_name, size=size_gb)
+            provider_config = ProviderConfig(type=provider_type)
+            disk_request = CreateDiskRequest(disk=disk_config, provider=provider_config)
+            disk_response = await self.prime_intellect.create_disks(disk_request)
+            return {
+                "status": "ok",
+                "disk_id": disk_response.id,
+                "disk_name": disk_response.name,
+                "size_gb": disk_response.size,
+                "mount_path": f"/mnt/disks/{disk_response.id}",
+                "message": f"Disk created successfully. Use cache_dir='/mnt/disks/{disk_response.id}' when loading dataset"
+            }
+
+        except Exception as e:
+            return {"status": "error", "message": str(e)}
+
+    def to_pytorch_dataloader(self, dataset, batch_size: int = 32, **kwargs):
+        """
+        Convert HuggingFace dataset to PyTorch DataLoader.
+
+        Convenience helper for training. Sets dataset format to 'torch'
+        and wraps in DataLoader.
+
+        Args:
+            dataset: HuggingFace dataset
+            batch_size: Batch size for training
+            **kwargs: Additional DataLoader arguments (shuffle, num_workers, etc.)
+
+        Returns:
+            torch.utils.data.DataLoader
+        """
+        from torch.utils.data import DataLoader
+        dataset.set_format("torch")
+        return DataLoader(dataset, batch_size=batch_size, **kwargs)
+
+    def load_subset(
+        self,
+        dataset_name: str,
+        num_samples: int = 1000,
+        split: str = "train",
+        config: str | None = None
+    ) -> dict[str, object]:
+        """
+        Generate code to load a small subset for testing/development.
+
+        Args:
+            dataset_name: HuggingFace dataset name
+            num_samples: Number of samples to load
+            split: Which split to use
+            config: Optional dataset configuration
+
+        Returns:
+            Dict with:
+            - action: "subset"
+            - num_samples: Number of samples
+            - import_code: Code to load the subset
+            - recommendation: Explanation
+        """
+        config_str = f', "{config}"' if config else ''
+
+        return {
+            "action": "subset",
+            "num_samples": num_samples,
+            "import_code": f'from datasets import load_dataset\ndataset = load_dataset("{dataset_name}"{config_str}, split="{split}[:{num_samples}]")',
+            "recommendation": f"Loading {num_samples} samples from {split} split for testing"
+        }
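
For orientation, a minimal usage sketch of the new DataManager added in this release. This is illustrative only, not part of the package: it assumes the import path implied by the file list above (morecompute/services/data_manager.py) and uses only the constructor and load_smart() signature shown in this diff.

import asyncio
from morecompute.services.data_manager import DataManager  # assumed path, per file list above

async def main():
    # Default cache location (~/.cache/huggingface/datasets), no Prime Intellect service attached.
    manager = DataManager()

    # Ask for a loading plan; load_smart() only recommends, it does not download anything itself.
    plan = await manager.load_smart("openai/gsm8k", split="train")

    print(plan["action"])           # "download", "stream", "recommend_stream", or "disk_needed"
    print(plan["recommendation"])   # human-readable explanation of the decision
    print(plan.get("import_code"))  # snippet the notebook can offer to run, when present

asyncio.run(main())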