loom-code 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loom_code/__init__.py +22 -0
- loom_code/_post_commit.py +119 -0
- loom_code/agent.py +544 -0
- loom_code/approval.py +616 -0
- loom_code/browse/__init__.py +291 -0
- loom_code/browse/act.py +467 -0
- loom_code/browse/observe.py +249 -0
- loom_code/browse/session.py +96 -0
- loom_code/browse/verify.py +194 -0
- loom_code/checkpoint.py +283 -0
- loom_code/cli.py +495 -0
- loom_code/code_index.py +703 -0
- loom_code/compact.py +143 -0
- loom_code/consent.py +47 -0
- loom_code/credentials.py +527 -0
- loom_code/edit_tool.py +635 -0
- loom_code/extensions.py +522 -0
- loom_code/file_history.py +322 -0
- loom_code/file_tools.py +93 -0
- loom_code/git_hook.py +200 -0
- loom_code/grep_tool.py +430 -0
- loom_code/hooks.py +297 -0
- loom_code/loominit/__init__.py +23 -0
- loom_code/loominit/_ast_walk.py +429 -0
- loom_code/loominit/_files.py +284 -0
- loom_code/loominit/_graph.py +141 -0
- loom_code/loominit/_resolve.py +392 -0
- loom_code/loominit/_tests_map.py +108 -0
- loom_code/loominit/extractor.py +332 -0
- loom_code/loominit/repomap.py +225 -0
- loom_code/loominit/schema.py +242 -0
- loom_code/lsp_tools.py +396 -0
- loom_code/mcp_host.py +79 -0
- loom_code/operator.py +449 -0
- loom_code/paste.py +97 -0
- loom_code/paths.py +52 -0
- loom_code/permissions.py +177 -0
- loom_code/project.py +104 -0
- loom_code/prompts.py +451 -0
- loom_code/render.py +783 -0
- loom_code/repl.py +4080 -0
- loom_code/rules.py +267 -0
- loom_code/sandboxed_bash.py +176 -0
- loom_code/scribe.py +88 -0
- loom_code/skills/__init__.py +16 -0
- loom_code/skills/graphify/SKILL.md +97 -0
- loom_code/skills/graphify/tools.py +570 -0
- loom_code/trust.py +216 -0
- loom_code/turn.py +169 -0
- loom_code/web_fetch.py +370 -0
- loom_code/workers.py +758 -0
- loom_code/worktree.py +134 -0
- loom_code-0.1.1.dist-info/METADATA +224 -0
- loom_code-0.1.1.dist-info/RECORD +58 -0
- loom_code-0.1.1.dist-info/WHEEL +5 -0
- loom_code-0.1.1.dist-info/entry_points.txt +2 -0
- loom_code-0.1.1.dist-info/licenses/LICENSE +21 -0
- loom_code-0.1.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
"""File discovery + per-file metadata (hash, lang, git heat).
|
|
2
|
+
|
|
3
|
+
The structural extractor calls :func:`discover_files` once at the
|
|
4
|
+
start of indexing; everything downstream uses the returned
|
|
5
|
+
:class:`DiscoveredFile` list as the canonical "what files exist".
|
|
6
|
+
|
|
7
|
+
Discovery strategy:
|
|
8
|
+
|
|
9
|
+
* **In a git repo** — use ``git ls-files --cached --others
|
|
10
|
+
--exclude-standard``. This respects ``.gitignore`` for free, which
|
|
11
|
+
is the *only* reliable way to skip a project's actual ignore set
|
|
12
|
+
(venvs, build outputs, generated code). The alternative — re-
|
|
13
|
+
implementing gitignore semantics — is a tar pit.
|
|
14
|
+
* **No git** — walk the tree, skip a hard-coded set of well-known
|
|
15
|
+
noise directories (``.venv``, ``node_modules``, ``__pycache__``,
|
|
16
|
+
etc.). Less accurate but covers the "loose folder" case.
|
|
17
|
+
|
|
18
|
+
Git heat (commits touching a file in the last 90 days) comes from
|
|
19
|
+
``git log --since=90.days --name-only`` parsed once at discovery
|
|
20
|
+
time. If the repo is huge, this is the most expensive step in
|
|
21
|
+
discovery — still O(seconds). Cached in the returned dataclass so
|
|
22
|
+
no other module needs to re-run git.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
import hashlib
|
|
28
|
+
import subprocess
|
|
29
|
+
from dataclasses import dataclass
|
|
30
|
+
from datetime import datetime
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
from typing import Literal
|
|
33
|
+
|
|
34
|
+
# Hard-coded noise directories for the non-git walker. Add only
# names that are universally noise. When in doubt, do NOT add a name
# here (i.e. leave that directory in the index): the user is in a git
# repo 99% of the time, so this set matters only for the fallback
# path. Matching is by exact path-component name.
_NOISE_DIRS: frozenset[str] = frozenset(
    {
        ".git",
        ".hg",
        ".svn",
        ".venv",
        "venv",
        "env",
        "__pycache__",
        "node_modules",
        ".pytest_cache",
        ".mypy_cache",
        ".ruff_cache",
        ".tox",
        ".nox",
        "dist",
        "build",
        ".eggs",
        ".loom",  # our own output dir — never re-index ourselves
        ".idea",
        ".vscode",
    }
)
|
|
61
|
+
|
|
62
|
+
# File extensions we recognize. Anything else is skipped — the index
# is for code understanding, not asset cataloguing. Markdown is kept
# because docs often capture architecture decisions that supplement
# the LLM-generated narrative.
#
# Keys are lowercase and include the leading dot; ``discover_files``
# lowercases a file's suffix before looking it up here.
_LANG_BY_EXT: dict[
    str, Literal["python", "markdown", "toml", "yaml", "json"]
] = {
    ".py": "python",
    ".pyi": "python",
    ".md": "markdown",
    ".markdown": "markdown",
    ".toml": "toml",
    ".yaml": "yaml",
    ".yml": "yaml",
    ".json": "json",
}
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass(frozen=True)
class DiscoveredFile:
    """One file the extractor will inspect, with everything it needs
    to know up front. ``rel_path`` is relative to the discovery root,
    in POSIX form.

    Every field — including ``sha256`` — is filled in eagerly by
    :func:`discover_files`, which reads each file's bytes once.
    Instances are immutable (``frozen=True``).
    """

    # Path relative to the discovery root, "/"-separated.
    rel_path: str
    # ``root / rel_path`` as built by discover_files.
    abs_path: Path
    # Language tag derived from the lowercased file suffix. ``"other"``
    # is allowed by the type, but discover_files currently skips files
    # whose suffix is not recognized.
    lang: Literal["python", "markdown", "toml", "yaml", "json", "other"]
    # Length of the raw file contents, in bytes.
    size_bytes: int
    # Number of newline characters, plus one when a non-empty file does
    # not end with a newline.
    lines: int
    # Hex-encoded SHA-256 digest of the raw file bytes.
    sha256: str
    # Modification time as a timezone-aware datetime in the local zone.
    mtime: datetime
    # Occurrence count from the git heat map; ``None`` when the path has
    # no entry there (including when discovery ran outside a git repo).
    git_changes_90d: int | None
    # Result of the ``_is_test_path`` heuristic applied to ``rel_path``.
    is_test: bool
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def is_git_repo(root: Path) -> bool:
    """Return True when ``root`` lies inside a git work tree.

    Asks ``git rev-parse --is-inside-work-tree`` instead of probing for
    a ``.git/`` directory, which keeps the answer correct for
    submodules and worktrees (where ``.git`` is a file) and for any
    subdirectory of a repository.

    Returns False — never raises — when git cannot be launched at all:
    no git binary on ``PATH``, ``root`` missing, ``root`` not a
    directory, or ``root`` not accessible. By our operational
    definition a tree we cannot run git in is not a git repo, and the
    caller falls back to the plain directory walk.
    """
    try:
        result = subprocess.run(
            ["git", "rev-parse", "--is-inside-work-tree"],
            cwd=str(root),
            capture_output=True,
            text=True,
            check=False,
        )
    except OSError:
        # Covers FileNotFoundError (no git binary / missing cwd) as well
        # as NotADirectoryError and PermissionError for an unusable cwd.
        return False
    # git prints "false" (exit 0) inside a bare repo or the .git dir
    # itself, so the exit status alone is not sufficient.
    return result.returncode == 0 and result.stdout.strip() == "true"
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def discover_files(root: Path) -> list[DiscoveredFile]:
    """Enumerate every indexable file under ``root``.

    Only files whose lowercased suffix appears in :data:`_LANG_BY_EXT`
    are returned; everything else is kept OUT of the index for now —
    the annotator can't do anything useful with binary blobs and
    including them just bloats ``files[]``. (Future: re-enable for
    shell / docker / etc. with a language filter in the extractor.)

    Order is deterministic (sorted by ``rel_path``, duplicates removed)
    so the resulting :class:`schema.LoomIndex` is byte-stable across
    runs that see the same tree — important for diff-aware refresh.

    Returns an empty list when ``root`` doesn't exist. Individual files
    that cannot be read or stat'ed (permissions, deleted mid-scan) are
    skipped rather than aborting the whole discovery.
    """
    if not root.exists():
        return []

    rel_paths = _list_paths(root)
    git_heat = _git_heat(root) if is_git_repo(root) else {}

    out: list[DiscoveredFile] = []
    # ``set`` drops repeats: ``git ls-files`` can list an unmerged path
    # once per index stage, which would otherwise yield duplicate rows.
    for rel in sorted(set(rel_paths)):
        abs_path = root / rel
        # Decide by extension BEFORE touching the file contents, so
        # large unrecognized files are never read into memory.
        lang = _LANG_BY_EXT.get(abs_path.suffix.lower())
        if lang is None:
            continue
        if not abs_path.is_file():
            # Tracked-but-deleted entries, submodule directories, etc.
            continue
        try:
            data = abs_path.read_bytes()
            mtime_ts = abs_path.stat().st_mtime
        except OSError:
            continue
        # Counting on the raw bytes matches counting on the decoded
        # text: a 0x0A byte is always a newline in UTF-8, and lossy
        # decoding never adds or removes one.
        n_lines = data.count(b"\n") + (
            1 if data and not data.endswith(b"\n") else 0
        )
        out.append(
            DiscoveredFile(
                rel_path=rel,
                abs_path=abs_path,
                lang=lang,
                size_bytes=len(data),
                lines=n_lines,
                sha256=hashlib.sha256(data).hexdigest(),
                mtime=datetime.fromtimestamp(mtime_ts).astimezone(),
                git_changes_90d=git_heat.get(rel),
                is_test=_is_test_path(rel),
            )
        )
    return out
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# ---------------------------------------------------------------------------
|
|
179
|
+
# Internals
|
|
180
|
+
# ---------------------------------------------------------------------------
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _list_paths(root: Path) -> list[str]:
    """Return the POSIX-style relative paths found under ``root``.

    Inside a git work tree the listing is delegated to
    :func:`_git_list` (which gets ``.gitignore`` handling for free);
    anywhere else :func:`_walk_list` walks the tree and filters the
    well-known noise directories.
    """
    lister = _git_list if is_git_repo(root) else _walk_list
    return lister(root)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _git_list(root: Path) -> list[str]:
    """``git ls-files -z --cached --others --exclude-standard`` —
    tracked + untracked but not ignored, relative to ``root``.

    ``-z`` makes git emit NUL-terminated, *unquoted* paths. Without it
    git C-quotes any name containing non-ASCII or special characters
    (``"caf\\303\\251.py"``), and such entries would never match a real
    file on disk. Splitting on NUL also preserves leading / trailing
    whitespace in file names.

    Submodule contents are not listed (``ls-files`` does not recurse
    into them by default), which is what we want; submodule code
    belongs to a different repo's index.

    Falls back to :func:`_walk_list` when git cannot be launched or
    exits non-zero — better partial coverage than zero.
    """
    try:
        proc = subprocess.run(
            [
                "git",
                "ls-files",
                "-z",
                "--cached",
                "--others",
                "--exclude-standard",
            ],
            cwd=str(root),
            capture_output=True,
            text=True,
            # Paths are raw bytes now that quoting is off; decode
            # leniently so one odd file name cannot abort discovery.
            encoding="utf-8",
            errors="replace",
            check=False,
        )
    except OSError:
        return _walk_list(root)
    if proc.returncode != 0:
        return _walk_list(root)
    return [entry for entry in proc.stdout.split("\0") if entry]
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _walk_list(root: Path) -> list[str]:
    """Manual walk for trees that are not git repos.

    Directories named in :data:`_NOISE_DIRS` are pruned *before* they
    are entered, so a large ``node_modules`` or ``.venv`` is never
    traversed at all. As before, a regular file whose own name is in
    :data:`_NOISE_DIRS` is skipped too.

    Returns paths relative to ``root`` in POSIX form (``as_posix``), so
    the result has the same shape on every OS. The list is sorted for
    determinism. Symlinked directories are not followed.
    """
    # Function-scope import: this module does not import ``os`` at top
    # level, and the block must stay self-contained.
    import os

    out: list[str] = []
    for dirpath, dirnames, filenames in os.walk(root):
        # In-place slice assignment is how os.walk is told not to
        # descend into a directory.
        dirnames[:] = [name for name in dirnames if name not in _NOISE_DIRS]
        here = Path(dirpath)
        for name in filenames:
            if name in _NOISE_DIRS:
                continue
            candidate = here / name
            # os.walk reports broken symlinks and special files as
            # "files" too; keep only real (or symlinked) regular files.
            if candidate.is_file():
                out.append(candidate.relative_to(root).as_posix())
    out.sort()
    return out
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def _git_heat(root: Path) -> dict[str, int]:
    """Return ``{rel_path: n_commits_in_last_90d}``.

    Uses ``git log --since=90.days --name-only --pretty=format:
    --no-merges --relative`` — one path per line per commit, with blank
    lines between commits. Counting occurrences gives the heat score
    directly.

    * ``--relative`` makes git print paths relative to ``root`` (and
      ignore changes outside it). Without it, paths are relative to the
      repository top level and never match ``rel_path`` when ``root``
      is a subdirectory of the repo.
    * ``core.quotePath=false`` stops git from octal-escaping non-ASCII
      file names, so those paths match too.

    Returns ``{}`` on any subprocess error (git missing, unusable
    ``root``, timeout, non-zero exit) — heat is a hint, not a
    correctness guarantee.
    """
    command = [
        "git",
        "-c",
        "core.quotePath=false",
        "log",
        "--since=90.days",
        "--name-only",
        "--pretty=format:",
        "--no-merges",
        "--relative",
    ]
    try:
        proc = subprocess.run(
            command,
            cwd=str(root),
            capture_output=True,
            text=True,
            # Unquoted paths are raw bytes; decode leniently so a
            # non-UTF-8 file name degrades to a miss, not a crash.
            encoding="utf-8",
            errors="replace",
            check=False,
            timeout=30,
        )
    except (subprocess.TimeoutExpired, OSError):
        return {}
    if proc.returncode != 0:
        return {}

    counts: dict[str, int] = {}
    for raw in proc.stdout.splitlines():
        path = raw.strip()
        if path:
            counts[path] = counts.get(path, 0) + 1
    return counts
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _is_test_path(rel: str) -> bool:
    """Best-effort guess at whether ``rel`` is a test file.

    ``rel`` is a "/"-separated relative path. It counts as a test when
    any path component is exactly ``tests`` or ``test``, or when the
    final component starts with ``test_`` (any extension) or ends with
    ``_test.py``. This loosely follows common pytest naming and covers
    the vast majority of Python projects.
    """
    segments = rel.split("/")
    if any(segment in ("tests", "test") for segment in segments):
        return True
    basename = segments[-1]
    if basename.startswith("test_"):
        return True
    return basename.endswith("_test.py")
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
"""PageRank over the file-level import graph.
|
|
2
|
+
|
|
3
|
+
Hand-rolled power iteration — networkx would be a more familiar
|
|
4
|
+
implementation but it's a 5 MB transitive-dependency chain to do
|
|
5
|
+
roughly twenty lines of arithmetic. The loomflow design rule "no
|
|
6
|
+
SDK at module top" applies here too: cheap things should be self-
|
|
7
|
+
contained.
|
|
8
|
+
|
|
9
|
+
Math: standard PageRank with a damping factor (0.85). For a graph
|
|
10
|
+
where node ``i`` has out-edges to neighbours ``N(i)``::
|
|
11
|
+
|
|
12
|
+
pr(j) = (1 - d) / n + d * sum( pr(i) / |N(i)| for i in inbound(j) )   (n = number of files, d = damping factor)
|
|
13
|
+
|
|
14
|
+
We iterate until L1 change drops below tolerance or 100 iterations
|
|
15
|
+
elapse. Dangling nodes (no out-edges) distribute their score
|
|
16
|
+
uniformly across all nodes, the textbook fix.
|
|
17
|
+
|
|
18
|
+
The result is per-FILE — file-level centrality, which Aider also
|
|
19
|
+
uses. Per-symbol PageRank requires a call graph, which we don't
|
|
20
|
+
extract in v1 (see :mod:`_ast_walk` design note). Each symbol
|
|
21
|
+
inherits its file's PageRank score in :mod:`extractor`'s aggregation
|
|
22
|
+
step.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
from collections import defaultdict
|
|
28
|
+
|
|
29
|
+
# Probability of following an import edge rather than "teleporting"
# to a uniformly random file.
_DAMPING = 0.85
# Stop iterating once the L1 change between two rounds drops below this.
_TOLERANCE = 1e-6
# Hard cap on power-iteration rounds.
_MAX_ITERATIONS = 100


def pagerank_file_graph(
    *, files: list[str], edges: list[tuple[str, str]]
) -> dict[str, float]:
    """Compute PageRank for a directed graph of files.

    ``files`` is the full node set (every indexed file). ``edges`` is
    ``[(from_path, to_path), ...]`` — only RESOLVED imports, so
    third-party / stdlib edges don't dominate. Edges with an endpoint
    outside ``files`` and self-loops are ignored; a repeated edge
    counts once per occurrence.

    Duplicate entries in ``files`` are collapsed (first occurrence
    wins) so the node count matches the number of distinct files and
    the scores keep summing to 1.

    Returns ``{rel_path: score}`` for every distinct file in
    ``files``; with no usable edges every file gets the uniform 1/N
    score. Returns ``{}`` if ``files`` is empty (degenerate case).
    """
    # Order-preserving de-duplication; the caller should pass unique
    # paths, but a repeat would inflate N and the dangling mass.
    nodes = list(dict.fromkeys(files))
    n = len(nodes)
    if n == 0:
        return {}

    # Inbound and outbound adjacency, pre-seeded for every node so the
    # iteration below never has to create entries on lookup. Edges
    # whose endpoints aren't in our node set are dropped — defensive:
    # the caller should already have filtered, but never trust that.
    node_set = set(nodes)
    out_adj: dict[str, list[str]] = {f: [] for f in nodes}
    in_adj: dict[str, list[str]] = {f: [] for f in nodes}
    for src, dst in edges:
        if src in node_set and dst in node_set and src != dst:
            out_adj[src].append(dst)
            in_adj[dst].append(src)

    # Both are invariant across iterations, so compute them once.
    out_degree = {f: len(targets) for f, targets in out_adj.items()}
    dangling_nodes = [f for f in nodes if out_degree[f] == 0]

    # Initialise uniform.
    pr = dict.fromkeys(nodes, 1.0 / n)
    base = (1.0 - _DAMPING) / n

    for _ in range(_MAX_ITERATIONS):
        # Dangling mass: score held by nodes with no out-edges,
        # redistributed uniformly to every node so the system stays
        # stochastic.
        dangling_share = _DAMPING * sum(pr[f] for f in dangling_nodes) / n

        new_pr: dict[str, float] = {}
        for f in nodes:
            inbound_mass = sum(pr[src] / out_degree[src] for src in in_adj[f])
            new_pr[f] = base + dangling_share + _DAMPING * inbound_mass

        # L1 convergence check — converges fast on typical repos.
        delta = sum(abs(new_pr[f] - pr[f]) for f in nodes)
        pr = new_pr
        if delta < _TOLERANCE:
            break

    return pr
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def cluster_by_path_prefix(
    files: list[str], *, max_files_per_cluster: int = 50
) -> dict[str, list[str]]:
    """Group files by their top-level directory, then split oversized
    clusters by the NEXT directory level.

    Recursion is shallow (depth 3) — beyond that, clusters get too
    fine-grained to be useful. The result is a ``{cluster_id: [paths]}``
    map with each path list sorted; ``cluster_id`` is a directory
    prefix, or the bare filename for files at the repo root.

    When an oversized directory is split, the files that sit directly
    in it stay together under that directory's own prefix instead of
    each becoming a one-file cluster.

    Example for the loomflow tree::

        loomflow/agent/*.py          → cluster "loomflow/agent"
        loomflow/architecture/*.py   → cluster "loomflow/architecture"
        loomflow/memory/ (>50 files) → split into
                                       "loomflow/memory/postgres",
                                       "loomflow/memory/chroma", ...
                                       plus "loomflow/memory" for the
                                       files directly inside it

    This is a deliberately simple heuristic — most well-organized
    codebases already group by directory by convention. Import-graph
    community detection would do better on tangled codebases but
    adds complexity for marginal gain on the typical case.
    """
    return _cluster(files, depth=1, max_files=max_files_per_cluster)


def _cluster(
    files: list[str], depth: int, max_files: int
) -> dict[str, list[str]]:
    """Cluster by the first ``depth`` directory components. If any
    resulting cluster exceeds ``max_files`` (and ``depth`` is below 3),
    recurse on it with ``depth+1``.

    A file with fewer than ``depth`` directory components is keyed by
    its full directory; a file with no directory at all (repo root) is
    its own cluster, keyed by its filename.
    """
    groups: dict[str, list[str]] = defaultdict(list)
    for path in files:
        dir_parts = path.split("/")[:-1]
        if dir_parts:
            # Slicing past the end is harmless, so shallow files fall
            # back to their containing directory as the key.
            key = "/".join(dir_parts[:depth])
        else:
            # Top-level file (e.g. ``cli.py``) — it's its own cluster.
            key = path
        groups[key].append(path)

    out: dict[str, list[str]] = {}
    for key, paths in groups.items():
        if len(paths) <= max_files or depth >= 3:
            out[key] = sorted(paths)
            continue
        # Recurse one level deeper for the oversized cluster. Every key
        # it produces is ``key`` itself or extends it, so results from
        # different groups cannot collide.
        out.update(_cluster(paths, depth=depth + 1, max_files=max_files))
    return out
|