dreadnode 1.0.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,126 @@
1
+ """
2
+ Artifact storage implementation for fsspec-compatible file systems.
3
+ Provides efficient uploading of files and directories with deduplication.
4
+ """
5
+
6
+ import hashlib
7
+ from pathlib import Path
8
+
9
+ import fsspec # type: ignore[import-untyped]
10
+
11
+ from dreadnode.util import logger
12
+
13
CHUNK_SIZE = 8 * 1024 * 1024  # 8MB read size when streaming large files


class ArtifactStorage:
    """
    Storage for artifacts with efficient handling of large files and directories.

    Supports:
    - Content-based deduplication using SHA1 hashing
    - Batch uploads for directories handled by fsspec
    """

    def __init__(self, file_system: "fsspec.AbstractFileSystem"):
        """
        Initialize artifact storage with an fsspec-compatible file system.

        Args:
            file_system: FSSpec-compatible file system
        """
        self._file_system = file_system

    def store_file(self, file_path: Path, target_key: str) -> str:
        """
        Store a file in the storage system, skipping the upload when an object
        already exists at the target key.

        Args:
            file_path: Path to the local file
            target_key: Key/path where the file should be stored

        Returns:
            Full URI with protocol to the stored file
        """
        # Keys are content-addressed by the callers, so an existing key is
        # assumed to hold identical content and the upload is skipped.
        if self._file_system.exists(target_key):
            logger.debug("Artifact already exists at %s, skipping upload.", target_key)
        else:
            self._file_system.put(str(file_path), target_key)
            logger.debug("Artifact successfully stored at %s", target_key)

        return str(self._file_system.unstrip_protocol(target_key))

    def batch_upload_files(self, source_paths: list[str], target_paths: list[str]) -> list[str]:
        """
        Upload multiple files in a single batch operation, skipping files whose
        target keys already exist.

        Args:
            source_paths: List of local file paths
            target_paths: List of target keys/paths (one per source path)

        Returns:
            List of URIs for the uploaded files

        Raises:
            ValueError: If source_paths and target_paths differ in length.
        """
        if not source_paths:
            return []

        logger.debug("Batch uploading %d files", len(source_paths))

        srcs: list[str] = []
        dsts: list[str] = []

        # strict=True: a length mismatch between sources and targets is a
        # caller bug; fail loudly instead of silently dropping the unpaired tail.
        for src, dst in zip(source_paths, target_paths, strict=True):
            if not self._file_system.exists(dst):
                srcs.append(src)
                dsts.append(dst)

        if srcs:
            self._file_system.put(srcs, dsts)
            logger.debug("Batch upload completed for %d files", len(srcs))
        else:
            logger.debug("All files already exist, skipping upload")

        return [str(self._file_system.unstrip_protocol(target)) for target in target_paths]

    def compute_file_hash(self, file_path: Path, stream_threshold_mb: int = 10) -> str:
        """
        Compute SHA1 hash of a file, using streaming only for larger files.

        Args:
            file_path: Path to the file
            stream_threshold_mb: Size threshold in MB for streaming vs. loading whole file

        Returns:
            First 16 chars of SHA1 hash
        """
        file_size = file_path.stat().st_size
        stream_threshold = stream_threshold_mb * 1024 * 1024  # Convert MB to bytes

        sha1 = hashlib.sha1()  # noqa: S324 - dedup key, not a security boundary

        if file_size < stream_threshold:
            # Small file: one whole-file read is cheaper than chunked streaming.
            sha1.update(file_path.read_bytes())
        else:
            # Large file: stream in fixed-size chunks to bound memory usage.
            with file_path.open("rb") as f:
                for chunk in iter(lambda: f.read(CHUNK_SIZE), b""):
                    sha1.update(chunk)

        return sha1.hexdigest()[:16]

    def compute_file_hashes(self, file_paths: list[Path]) -> dict[str, str]:
        """
        Compute SHA1 hashes for multiple files.

        Args:
            file_paths: List of file paths to hash

        Returns:
            Dictionary mapping resolved POSIX file paths to their hash values
        """
        return {
            file_path.resolve().as_posix(): self.compute_file_hash(file_path)
            for file_path in file_paths
        }
@@ -0,0 +1,455 @@
1
+ """
2
+ Tree structure builder for artifacts with directory hierarchy preservation.
3
+ Provides efficient uploads and tree construction for frontend to consume.
4
+ """
5
+
6
+ import hashlib
7
+ import os
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+ from typing import Literal, TypedDict, Union
11
+
12
+ from dreadnode.artifact.storage import ArtifactStorage
13
+ from dreadnode.util import logger
14
+
15
+
16
class FileNode(TypedDict):
    """
    A leaf node in the artifact tree describing one stored file.

    Carries the file's content hash, storage URI, size in bytes, and the
    resolved local path it was uploaded from.
    """

    type: Literal["file"]
    hash: str
    uri: str
    size_bytes: int
    final_real_path: str


class DirectoryNode(TypedDict):
    """
    An interior node in the artifact tree describing one directory.

    Carries the directory's resolved path, a hash derived from its children,
    and the child file/directory nodes themselves.
    """

    type: Literal["dir"]
    dir_path: str
    hash: str
    children: list[Union["DirectoryNode", FileNode]]
39
+
40
+
41
@dataclass
class ArtifactTreeBuilder:
    """
    Builds a hierarchical tree structure for artifacts while uploading them to storage.
    Preserves directory structure and handles efficient, deduplicated uploads.
    """

    storage: "ArtifactStorage"  # performs hashing and (batch) uploads
    prefix_path: "str | None" = None  # storage key prefix; required before any upload

    def process_artifact(self, local_uri: "str | Path") -> "DirectoryNode":
        """
        Process an artifact (file or directory) and build its tree representation.

        Args:
            local_uri: Path to the local file or directory

        Returns:
            Directory tree structure representing the artifact

        Raises:
            FileNotFoundError: If the path doesn't exist
        """
        local_path = Path(local_uri).expanduser().resolve()
        if not local_path.exists():
            raise FileNotFoundError(f"{local_path} does not exist")

        if local_path.is_dir():
            return self._process_directory(local_path)

        return self._process_single_file(local_path)

    def _target_key(self, file_hash: str, file_extension: str) -> str:
        """
        Build the content-addressed storage key for a file.

        Args:
            file_hash: Truncated SHA1 of the file contents
            file_extension: File suffix (including the leading dot); may be empty

        Returns:
            The storage key under the configured prefix.

        Raises:
            ValueError: If no prefix path is configured.
        """
        if not self.prefix_path:
            raise ValueError("Prefix path is invalid or empty")
        prefix = self.prefix_path.rstrip("/")
        return f"{prefix}/artifacts/{file_hash}{file_extension}"

    def _process_single_file(self, file_path: Path) -> "DirectoryNode":
        """
        Process a single file and wrap it in a directory node for its parent.

        Args:
            file_path: Path to the file to be processed

        Returns:
            DirectoryNode containing the single file
        """
        file_node = self._process_file(file_path)

        return {
            "type": "dir",
            "dir_path": file_path.parent.resolve().as_posix(),
            # A single-file artifact inherits the file's hash as its dir hash.
            "hash": file_node["hash"],
            "children": [file_node],
        }

    def _process_directory(self, dir_path: Path) -> "DirectoryNode":
        """
        Process a directory and all its contents efficiently.

        Files are hashed first so identical content is uploaded only once.
        Duplicate files are materialized *after* the batch upload so that every
        node — including duplicates — carries the final storage URI.

        Args:
            dir_path: Path to the directory to be processed.

        Returns:
            DirectoryNode: A hierarchical tree structure representing the
            directory and its contents.
        """
        logger.debug("Processing directory: %s", dir_path)

        all_files = [
            Path(root) / file_name
            for root, _, files in os.walk(dir_path)
            for file_name in files
        ]

        file_hashes = self.storage.compute_file_hashes(all_files)

        source_paths: list[str] = []
        target_paths: list[str] = []
        file_nodes_by_path: "dict[Path, FileNode]" = {}
        first_node_by_hash: "dict[str, FileNode]" = {}
        # (path, hash) pairs deferred until URIs exist; copying the cached node
        # before the upload would freeze an empty uri on the duplicate.
        duplicates: "list[tuple[Path, str]]" = []

        for file_path in all_files:
            file_path_str = file_path.resolve().as_posix()
            file_hash = file_hashes.get(file_path_str)
            if not file_hash:
                raise ValueError(f"File {file_path} not found in hash computation")

            # Duplicate content within this directory: resolve after upload.
            if file_hash in first_node_by_hash:
                duplicates.append((file_path, file_hash))
                continue

            target_key = self._target_key(file_hash, file_path.suffix)

            source_paths.append(file_path_str)
            target_paths.append(target_key)

            # URI is filled in after the batch upload completes.
            file_node: "FileNode" = {
                "type": "file",
                "uri": "",
                "hash": file_hash,
                "size_bytes": file_path.stat().st_size,
                "final_real_path": file_path_str,
            }

            file_nodes_by_path[file_path] = file_node
            first_node_by_hash[file_hash] = file_node

        if source_paths:
            logger.debug("Uploading %d files in batch", len(source_paths))
            uris = self.storage.batch_upload_files(source_paths, target_paths)

            for file_path_str, uri in zip(source_paths, uris):
                node_path = Path(file_path_str)
                if node_path in file_nodes_by_path:
                    file_nodes_by_path[node_path]["uri"] = uri

        # Materialize duplicate files now that the shared node has its URI.
        for file_path, file_hash in duplicates:
            duplicate_node = first_node_by_hash[file_hash].copy()
            duplicate_node["final_real_path"] = file_path.resolve().as_posix()
            file_nodes_by_path[file_path] = duplicate_node

        return self._build_tree_structure(dir_path, file_nodes_by_path)

    def _build_tree_structure(
        self,
        base_dir: Path,
        file_nodes_by_path: "dict[Path, FileNode]",
    ) -> "DirectoryNode":
        """
        Build a hierarchical tree structure from processed files.

        Intermediate directory nodes are created on demand while walking each
        file's path relative to ``base_dir``, so every nested directory appears
        exactly once and each file is attached to its direct parent.

        Args:
            base_dir: The root directory for the tree structure.
            file_nodes_by_path: Mapping of absolute file paths to their FileNode objects.

        Returns:
            DirectoryNode: the root node, whose children mirror the on-disk
            layout and whose directory hashes are computed bottom-up from
            child hashes.
        """
        dir_structure: "dict[str, DirectoryNode]" = {}

        # Create root node
        root_dir_path = base_dir.resolve().as_posix()
        root_node: "DirectoryNode" = {
            "type": "dir",
            "dir_path": root_dir_path,
            "hash": "",  # computed later, bottom-up
            "children": [],
        }
        dir_structure[root_dir_path] = root_node

        for file_path, file_node in file_nodes_by_path.items():
            try:
                parts = file_path.relative_to(base_dir).parts
            except ValueError:
                logger.debug("File %s is not relative to base directory %s", file_path, base_dir)
                continue

            # File sits directly in the root directory.
            if len(parts) == 1:
                root_node["children"].append(file_node)
                continue

            # Walk/create intermediate directory nodes along the file's path.
            current_dir = base_dir
            current_dir_str = current_dir.resolve().as_posix()
            for part in parts[:-1]:
                next_dir = current_dir / part
                next_dir_str = next_dir.resolve().as_posix()
                if next_dir_str not in dir_structure:
                    dir_node: "DirectoryNode" = {
                        "type": "dir",
                        "dir_path": next_dir_str,
                        "hash": "",  # computed later, bottom-up
                        "children": [],
                    }
                    dir_structure[next_dir_str] = dir_node
                    dir_structure[current_dir_str]["children"].append(dir_node)
                current_dir = next_dir
                current_dir_str = next_dir_str

            # Attach the file to its (now existing) parent directory.
            parent_dir_str = file_path.parent.resolve().as_posix()
            if parent_dir_str in dir_structure:
                dir_structure[parent_dir_str]["children"].append(file_node)

        self._compute_directory_hashes(dir_structure)
        return root_node

    def _compute_directory_hashes(self, dir_structure: "dict[str, DirectoryNode]") -> None:
        """
        Compute hashes for all directories in the structure, bottom-up.

        Args:
            dir_structure: Dictionary mapping directory paths to DirectoryNode objects
        """
        parents = self._map_parent_child_relationships(dir_structure)
        leaf_dirs = self._find_leaf_directories(dir_structure, parents)
        self._process_directories_bottom_up(dir_structure, parents, leaf_dirs)

    def _map_parent_child_relationships(
        self,
        dir_structure: "dict[str, DirectoryNode]",
    ) -> "dict[str, str]":
        """
        Create a mapping of parent-child relationships for directories.

        Args:
            dir_structure: Dictionary mapping directory paths to DirectoryNode objects

        Returns:
            A dictionary mapping child directory paths to their parent directory paths.
        """
        parents: "dict[str, str]" = {}
        for dir_path, dir_node in dir_structure.items():
            for child in dir_node["children"]:
                if child["type"] == "dir":
                    parents[child["dir_path"]] = dir_path
        return parents

    def _find_leaf_directories(
        self,
        dir_structure: "dict[str, DirectoryNode]",
        parents: "dict[str, str]",
    ) -> "set[str]":
        """
        Find leaf directories (those with no directory children).

        Args:
            dir_structure: Dictionary mapping directory paths to DirectoryNode objects
            parents: Dictionary mapping child directory paths to parent directory paths

        Returns:
            A set of leaf directory paths.
        """
        # Build the parent set once: testing membership against
        # parents.values() rescans every value per directory (O(n^2) overall).
        parent_paths = set(parents.values())
        return {dir_path for dir_path in dir_structure if dir_path not in parent_paths}

    def _process_directories_bottom_up(
        self,
        dir_structure: "dict[str, DirectoryNode]",
        parents: "dict[str, str]",
        leaf_dirs: "set[str]",
    ) -> None:
        """
        Process directories bottom-up starting from leaf directories.

        A parent is queued only once all of its child directories have been
        hashed, so every directory hash is computed from final child hashes.

        Args:
            dir_structure: Dictionary mapping directory paths to DirectoryNode objects
            parents: Dictionary mapping child directory paths to parent directory paths
            leaf_dirs: Set of leaf directory paths
        """
        processed: "set[str]" = set()
        while leaf_dirs:
            dir_path = leaf_dirs.pop()
            dir_node = dir_structure[dir_path]

            # Children hashes are final by construction of the traversal order.
            dir_node["hash"] = self._compute_directory_hash(dir_node)
            processed.add(dir_path)

            parent_path = parents.get(dir_path)
            if parent_path is not None and self._are_all_children_processed(
                dir_structure[parent_path], processed
            ):
                leaf_dirs.add(parent_path)

    def _compute_directory_hash(self, dir_node: "DirectoryNode") -> str:
        """
        Compute the hash for a directory based on its children.

        Args:
            dir_node: The DirectoryNode object

        Returns:
            The computed hash as a string.
        """
        # Sorting makes the hash independent of child insertion order.
        child_hashes = sorted(child["hash"] for child in dir_node["children"])
        hash_input = "|".join(child_hashes)
        return hashlib.sha1(hash_input.encode()).hexdigest()[:16]  # noqa: S324

    def _are_all_children_processed(
        self, parent_node: "DirectoryNode", processed: "set[str]"
    ) -> bool:
        """
        Check if all children of a parent directory have been processed.

        Args:
            parent_node: The parent DirectoryNode object
            processed: Set of processed directory paths

        Returns:
            True if all children are processed, False otherwise.
        """
        return all(
            child["dir_path"] in processed
            for child in parent_node["children"]
            if child["type"] == "dir"
        )

    def _process_file(self, file_path: Path) -> "FileNode":
        """
        Process a single file by hashing and uploading it to storage.

        The file's truncated SHA1 hash forms its content-addressed storage key,
        so the storage layer can skip re-uploading identical content.

        Args:
            file_path: Path to the file to be processed.

        Returns:
            FileNode: A dictionary representing the processed file, including
            its metadata and storage URI.

        Raises:
            ValueError: If no prefix path is configured.
        """
        file_hash = self.storage.compute_file_hash(file_path)
        target_key = self._target_key(file_hash, file_path.suffix)
        uri = self.storage.store_file(file_path, target_key)

        return {
            "type": "file",
            "uri": uri,
            "hash": file_hash,
            "size_bytes": file_path.stat().st_size,
            "final_real_path": file_path.resolve().as_posix(),
        }
dreadnode/constants.py ADDED
@@ -0,0 +1,16 @@
1
# Names of the environment variables the client reads.

ENV_SERVER_URL = "DREADNODE_SERVER_URL"
ENV_SERVER = "DREADNODE_SERVER"  # alternative to SERVER_URL
ENV_API_TOKEN = "DREADNODE_API_TOKEN"  # noqa: S105
ENV_API_KEY = "DREADNODE_API_KEY"  # alternative to API_TOKEN
ENV_LOCAL_DIR = "DREADNODE_LOCAL_DIR"
ENV_PROJECT = "DREADNODE_PROJECT"

# Fallbacks applied when the corresponding variables are unset.

DEFAULT_SERVER_URL = "https://platform.dreadnode.io"
DEFAULT_LOCAL_OBJECT_DIR = ".dreadnode/objects"

# S3 storage defaults.
MAX_INLINE_OBJECT_BYTES = 10 * 1024  # 10KB
File without changes