dreadnode 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,599 @@
1
+ """
2
+ Utility for merging artifact tree structures while preserving directory hierarchy.
3
+ """
4
+
5
+ import hashlib
6
+ from pathlib import Path
7
+ from typing import cast
8
+
9
+ from dreadnode.artifact.tree_builder import DirectoryNode, FileNode
10
+
11
+
12
class ArtifactMerger:
    """
    Merge artifact tree structures while preserving directory hierarchy.

    Trees are nested dict nodes (``DirectoryNode`` / ``FileNode``). Each added
    tree is integrated with the trees already held: identical roots are merged
    in place, parent/child roots are nested, partially overlapping trees are
    combined node by node, and unrelated trees are kept as separate roots.

    Example:
        ```python
        merger = ArtifactMerger()
        merger.add_tree(tree1)  # first tree is stored directly
        merger.add_tree(tree2)  # later trees are merged where they relate
        merged_trees = merger.get_merged_trees()
        ```
    """

    def __init__(self) -> None:
        # Absolute path (dir_path / final_real_path) -> node, across all roots.
        self._path_map: "dict[str, DirectoryNode | FileNode]" = {}
        # File content hash -> every file node carrying that hash.
        self._hash_map: "dict[str, list[FileNode]]" = {}
        # Current merged roots; related trees collapse into a single root.
        self._merged_trees: "list[DirectoryNode]" = []

    def add_tree(self, new_tree: "DirectoryNode") -> None:
        """
        Add a new artifact tree, merging with existing trees if needed.

        The tree is integrated in the first applicable way:
        same root -> merge in place; parent of existing roots -> adopt them;
        child of an existing root -> nest under it; partial path overlap ->
        merge the overlapping nodes; otherwise keep it as a separate root.

        Args:
            new_tree: New directory tree to add.
        """
        # First artifact - nothing to merge against.
        if not self._merged_trees:
            self._merged_trees = [new_tree]
            self._build_maps(new_tree)
            return

        new_dir_path = new_tree["dir_path"]

        # Case 1: same root directory as an existing tree - merge in place.
        for existing_tree in self._merged_trees:
            if existing_tree["dir_path"] == new_dir_path:
                self._merge_directory_nodes(existing_tree, new_tree)
                self._build_maps()  # rebuild lookup maps after mutation
                return

        # Case 2: new tree is a parent of one or more existing trees.
        adopted = []
        for existing_tree in self._merged_trees:
            existing_dir_path = existing_tree["dir_path"]
            # "/" suffix avoids false prefix matches (e.g. /data vs /database).
            if existing_dir_path.startswith(new_dir_path + "/"):
                rel_path = existing_dir_path[len(new_dir_path) + 1 :].split("/")
                self._place_tree_at_path(new_tree, existing_tree, rel_path)
                adopted.append(existing_tree)

        if adopted:
            # The adopted roots now live inside new_tree; drop them as roots.
            for child in adopted:
                if child in self._merged_trees:
                    self._merged_trees.remove(child)
            self._merged_trees.append(new_tree)
            self._build_maps()
            return

        # Case 3: new tree is a child of an existing tree.
        for existing_tree in self._merged_trees:
            existing_dir_path = existing_tree["dir_path"]
            if new_dir_path.startswith(existing_dir_path + "/"):
                rel_path = new_dir_path[len(existing_dir_path) + 1 :].split("/")
                self._place_tree_at_path(existing_tree, new_tree, rel_path)
                self._build_maps()
                return

        # Case 4: no ancestry - look for overlapping paths.
        new_path_map: "dict[str, DirectoryNode | FileNode]" = {}
        new_hash_map: "dict[str, list[FileNode]]" = {}
        self._build_path_and_hash_maps(new_tree, new_path_map, new_hash_map)

        path_overlaps = set(self._path_map) & set(new_path_map)
        if path_overlaps and self._handle_overlaps(path_overlaps, new_path_map):
            # Successfully merged via the overlapping nodes.
            self._build_maps()
            return

        # Case 5: completely unrelated - keep as a separate root.
        self._merged_trees.append(new_tree)
        self._build_maps()

    def get_merged_trees(self) -> "list[DirectoryNode]":
        """
        Return the current merged trees.

        Returns:
            The list of merged root directory nodes; related inputs typically
            collapse into a single root.
        """
        return self._merged_trees

    def _place_tree_at_path(
        self,
        parent_tree: "DirectoryNode",
        child_tree: "DirectoryNode",
        path_parts: "list[str]",
    ) -> None:
        """
        Graft ``child_tree`` under ``parent_tree`` at the given relative path.

        Intermediate directories are created as needed (with an empty hash,
        recomputed by the final merge), then the child tree is merged at the
        target location.

        Args:
            parent_tree: Tree to graft into (mutated).
            child_tree: Tree being placed.
            path_parts: Path components from parent to child,
                e.g. ``["audio", "sub1"]``.
        """
        current = parent_tree

        for part in path_parts:
            if not part:  # tolerate empty components from split()
                continue

            # Reuse an existing directory child with this name if present.
            next_node = None
            for child in current["children"]:
                if child["type"] == "dir" and Path(child["dir_path"]).name == part:
                    next_node = child
                    break

            if next_node is None:
                next_node = {
                    "type": "dir",
                    "dir_path": f"{current['dir_path']}/{part}",
                    "hash": "",  # filled in by _update_directory_hash
                    "children": [],
                }
                current["children"].append(next_node)

            current = next_node

        # Merge the trees at the final location.
        self._merge_directory_nodes(current, child_tree)

    def _handle_overlaps(
        self,
        overlaps: "set[str]",
        new_path_map: "dict[str, DirectoryNode | FileNode]",
    ) -> bool:
        """
        Merge nodes whose paths exist both in the held trees and a new tree.

        Directory/directory overlaps are merged recursively; file/file
        overlaps either share URIs (same hash) or replace the stale node
        (different hash).

        Args:
            overlaps: Paths present in both path maps.
            new_path_map: Path map of the incoming tree.

        Returns:
            True if at least one node was merged or updated.
        """
        merged = False

        # Shortest paths first so parents are handled before their children.
        for path in sorted(overlaps, key=len):
            existing_node = self._path_map.get(path)
            new_node = new_path_map.get(path)

            if not existing_node or not new_node:
                continue

            if existing_node["type"] == "dir" and new_node["type"] == "dir":
                self._merge_directory_nodes(
                    cast("DirectoryNode", existing_node),
                    cast("DirectoryNode", new_node),
                )
                merged = True
            elif existing_node["type"] == "file" and new_node["type"] == "file":
                existing_file = cast("FileNode", existing_node)
                new_file = cast("FileNode", new_node)

                if existing_file["hash"] == new_file["hash"]:
                    # Identical content - just share URIs.
                    self._propagate_uri(existing_file, new_file)
                    merged = True
                else:
                    # Content changed - swap in the new file wherever it lives.
                    for tree in self._merged_trees:
                        if self._update_file_in_tree(tree, existing_file, new_file):
                            merged = True
                            break

        return merged

    def _propagate_uri(self, file1: "FileNode", file2: "FileNode") -> None:
        """
        Copy the URI between two identical files so both end up with one.

        If exactly one of the two nodes has a URI it is copied to the other;
        otherwise nothing changes.

        Args:
            file1: First file node.
            file2: Second file node.
        """
        if not file1["uri"] and file2["uri"]:
            file1["uri"] = file2["uri"]
        elif not file2["uri"] and file1["uri"]:
            file2["uri"] = file1["uri"]

    def _update_file_in_tree(
        self,
        tree: "DirectoryNode",
        old_file: "FileNode",
        new_file: "FileNode",
    ) -> bool:
        """
        Replace ``old_file`` with ``new_file`` wherever it sits in ``tree``.

        The match is by identity (the exact node object), searching
        recursively through subdirectories.

        Args:
            tree: Directory tree to search.
            old_file: Node to replace.
            new_file: Replacement node.

        Returns:
            True if the node was found and replaced.
        """
        for i, child in enumerate(tree["children"]):
            if child is old_file:  # identity, not equality: replace that node
                tree["children"][i] = new_file
                return True

            if child["type"] == "dir" and self._update_file_in_tree(
                cast("DirectoryNode", child),
                old_file,
                new_file,
            ):
                return True
        return False

    def _build_maps(self, new_tree: "DirectoryNode | None" = None) -> None:
        """
        Rebuild the path and hash lookup maps.

        Args:
            new_tree: When given, the maps are built from this tree alone
                (used for the very first tree). Otherwise they are rebuilt
                from all merged trees and URIs are re-propagated by hash.
        """
        self._path_map.clear()
        self._hash_map.clear()

        # Explicit None check rather than truthiness - the argument is a
        # sentinel, not a value to be tested for emptiness.
        if new_tree is not None:
            self._build_path_and_hash_maps(new_tree, self._path_map, self._hash_map)
        else:
            for tree in self._merged_trees:
                self._build_path_and_hash_maps(tree, self._path_map, self._hash_map)
            self._propagate_uris_by_hash()

    def _propagate_uris_by_hash(self) -> None:
        """
        Give every file sharing a content hash the same URI.

        If several nodes carry the same hash but only some have a URI, the
        first non-empty URI found is copied to the rest.
        """
        for file_nodes in self._hash_map.values():
            if len(file_nodes) <= 1:
                continue

            uri = next((node["uri"] for node in file_nodes if node["uri"]), "")
            if not uri:
                continue

            for node in file_nodes:
                if not node["uri"]:
                    node["uri"] = uri

    def _build_path_and_hash_maps(
        self,
        node: "DirectoryNode | FileNode",
        path_map: "dict[str, DirectoryNode | FileNode]",
        hash_map: "dict[str, list[FileNode]]",
    ) -> None:
        """
        Recursively populate both lookup maps from a node.

        Directories are keyed by ``dir_path`` and recursed into; files are
        keyed by ``final_real_path`` and grouped by hash.

        Args:
            node: The node to process.
            path_map: Map of paths to nodes (mutated).
            hash_map: Map of file hashes to file nodes (mutated).
        """
        if node["type"] == "dir":
            dir_node = cast("DirectoryNode", node)
            path_map[dir_node["dir_path"]] = dir_node

            for child in dir_node["children"]:
                self._build_path_and_hash_maps(child, path_map, hash_map)
        else:  # file node
            file_node = cast("FileNode", node)
            path_map[file_node["final_real_path"]] = file_node
            # setdefault replaces the manual "if key not in dict" dance.
            hash_map.setdefault(file_node["hash"], []).append(file_node)

    def _merge_directory_nodes(self, target_dir: "DirectoryNode", source_dir: "DirectoryNode") -> None:
        """
        Merge the contents of ``source_dir`` into ``target_dir``.

        Children are combined (recursively for directories, with duplicate
        and conflict handling for files), then the target's hash is
        recomputed from its new children.

        Args:
            target_dir: Directory merged into (mutated).
            source_dir: Directory merged from.
        """
        # Index the target's children once for O(1) lookups per source child.
        path_to_index, hash_to_index = self._build_indices(target_dir)

        for source_child in source_dir["children"]:
            if source_child["type"] == "dir":
                self._merge_directory_child(
                    target_dir,
                    cast("DirectoryNode", source_child),
                    path_to_index,
                )
            else:  # file
                self._merge_file_child(
                    target_dir,
                    cast("FileNode", source_child),
                    path_to_index,
                    hash_to_index,
                )

        self._update_directory_hash(target_dir)

    def _build_indices(self, dir_node: "DirectoryNode") -> "tuple[dict[str, int], dict[str, int]]":
        """
        Index a directory's children for efficient lookups during merging.

        Returns:
            ``(path_to_index, hash_to_index)`` where paths cover both files
            and subdirectories, and hashes cover files only.
        """
        path_to_index: "dict[str, int]" = {}
        hash_to_index: "dict[str, int]" = {}

        for i, child in enumerate(dir_node["children"]):
            if child["type"] == "dir":
                path_to_index[cast("DirectoryNode", child)["dir_path"]] = i
            else:  # file
                file_child = cast("FileNode", child)
                path_to_index[file_child["final_real_path"]] = i
                hash_to_index[file_child["hash"]] = i

        return path_to_index, hash_to_index

    def _merge_directory_child(
        self,
        target_dir: "DirectoryNode",
        source_dir: "DirectoryNode",
        path_to_index: "dict[str, int]",
    ) -> None:
        """Merge one directory child from source into the target directory."""
        dir_path = source_dir["dir_path"]
        if dir_path in path_to_index:
            # Present in both trees - merge recursively.
            existing_child = target_dir["children"][path_to_index[dir_path]]
            if existing_child["type"] == "dir":
                self._merge_directory_nodes(
                    cast("DirectoryNode", existing_child),
                    source_dir,
                )
        else:
            # Only in source - adopt wholesale.
            target_dir["children"].append(source_dir)

    def _merge_file_child(
        self,
        target_dir: "DirectoryNode",
        source_file: "FileNode",
        path_to_index: "dict[str, int]",
        hash_to_index: "dict[str, int]",
    ) -> None:
        """Merge one file child from source into the target directory."""
        file_path = source_file["final_real_path"]
        file_hash = source_file["hash"]

        existing_index = path_to_index.get(file_path)
        if existing_index is not None:
            existing_child = target_dir["children"][existing_index]
            # Verify the node type BEFORE comparing hashes: previously the
            # hash was compared first, so a directory that collided with the
            # file's path could be silently clobbered by the file.
            if existing_child["type"] == "file":
                if existing_child["hash"] != file_hash:
                    # Same path, new content - take the newer file.
                    target_dir["children"][existing_index] = source_file
                else:
                    # Identical file - just share URIs.
                    self._propagate_uri(cast("FileNode", existing_child), source_file)
                return
            # Pathological path collision with a directory: keep the
            # directory and append the file as an additional child.
            target_dir["children"].append(source_file)
            return

        if file_hash in hash_to_index:
            # Same content already exists under a different path.
            existing_child = target_dir["children"][hash_to_index[file_hash]]
            if existing_child["type"] == "file":
                self._propagate_uri(cast("FileNode", existing_child), source_file)

            # Share the URI with every known file carrying this hash.
            if source_file["uri"] and file_hash in self._hash_map:
                for other_file in self._hash_map[file_hash]:
                    if not other_file["uri"]:
                        other_file["uri"] = source_file["uri"]
            target_dir["children"].append(source_file)
            return

        # File only in source - add it to the target.
        target_dir["children"].append(source_file)

    def _update_directory_hash(self, dir_node: "DirectoryNode") -> str:
        """
        Recompute a directory's hash from its children, recursively.

        Child hashes are sorted so the result is independent of child order.

        Args:
            dir_node: The directory to update (mutated).

        Returns:
            The updated hash (first 16 hex chars of SHA-1 over joined child
            hashes; non-cryptographic use, hence the noqa).
        """
        child_hashes = []

        for child in dir_node["children"]:
            if child["type"] == "file":
                child_hashes.append(cast("FileNode", child)["hash"])
            else:
                child_hashes.append(self._update_directory_hash(cast("DirectoryNode", child)))

        child_hashes.sort()  # order-independent hash
        hash_input = "|".join(child_hashes)
        dir_hash = hashlib.sha1(hash_input.encode()).hexdigest()[:16]  # noqa: S324

        dir_node["hash"] = dir_hash
        return dir_hash