repolens-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
repolens/fetcher.py ADDED
@@ -0,0 +1,198 @@
1
+ """GitHub API client — fetches file trees and content without cloning."""
2
+ import base64
3
+ import re
4
+ import time
5
+ from typing import Optional
6
+
7
+ import requests
8
+
9
+ from .models import FileNode
10
+
11
# Base URL that every GitHub REST API endpoint in this module is built on.
GITHUB_API = "https://api.github.com"

# Maps a lower-case file extension (leading dot included) to a language label.
SUPPORTED_EXTENSIONS = {
    # Scripting / web
    ".py": "python",
    ".js": "javascript",
    ".jsx": "javascript",
    ".ts": "typescript",
    ".tsx": "typescript",
    # Compiled / JVM / systems
    ".go": "go",
    ".java": "java",
    ".rb": "ruby",
    ".rs": "rust",
    ".cpp": "cpp",
    ".c": "c",
    ".h": "c",
    ".cs": "csharp",
    ".php": "php",
    ".swift": "swift",
    ".kt": "kotlin",
    ".scala": "scala",
    # Documentation and configuration formats
    ".md": "markdown",
    ".json": "json",
    ".yaml": "yaml",
    ".yml": "yaml",
    ".toml": "toml",
}
37
+
38
+
39
def parse_github_url(url: str) -> tuple[str, str, Optional[str]]:
    """Parse a GitHub repository reference into ``(owner, repo, branch_or_None)``.

    Accepted forms (scheme and host are matched case-insensitively):
        - https://github.com/owner/repo
        - https://www.github.com/owner/repo.git
        - https://github.com/owner/repo/tree/branch
        - github.com/owner/repo
        - owner/repo

    A trailing ``?query`` is discarded, as is a ``#fragment`` attached to the
    repository segment. Everything after ``/tree/`` is returned as the branch,
    so branch names containing ``/`` are preserved.

    Raises:
        ValueError: if no non-empty owner and repository name can be extracted.
    """
    # "?" cannot occur in owner, repo or git ref names, so anything after it
    # is a browser query string.
    cleaned = url.strip().partition("?")[0].rstrip("/")
    cleaned = re.sub(r"^https?://", "", cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r"^(?:www\.)?github\.com/", "", cleaned, flags=re.IGNORECASE)

    parts = cleaned.split("/")
    if len(parts) < 2:
        raise ValueError(f"Cannot parse GitHub URL: {cleaned!r}. Expected owner/repo format.")

    owner = parts[0]
    # Only the repo segment is stripped of a fragment: "#" is legal inside a
    # branch name, but never inside a repository name.
    repo = parts[1].partition("#")[0].removesuffix(".git")
    if not owner or not repo:
        raise ValueError(f"Cannot parse GitHub URL: {cleaned!r}. Expected owner/repo format.")

    branch = None
    if len(parts) >= 4 and parts[2] == "tree":
        branch = "/".join(parts[3:])

    return owner, repo, branch
65
+
66
+
67
+ def _headers(token: Optional[str]) -> dict:
68
+ h = {"Accept": "application/vnd.github+json", "X-GitHub-Api-Version": "2022-11-28"}
69
+ if token:
70
+ h["Authorization"] = f"Bearer {token}"
71
+ return h
72
+
73
+
74
def _get(url: str, token: Optional[str], params: dict | None = None) -> dict:
    """Issue a GET request against the GitHub API and return the decoded JSON.

    Args:
        url: Fully-qualified endpoint URL.
        token: Optional GitHub token, sent as a bearer credential.
        params: Optional query-string parameters.

    Returns:
        The parsed JSON body. Most endpoints used here return an object; the
        contents endpoint returns a JSON array for directories.

    Raises:
        RuntimeError: the response signals that a rate limit was hit.
        ValueError: the resource does not exist (HTTP 404).
        requests.HTTPError: any other non-success status.
    """
    resp = requests.get(url, headers=_headers(token), params=params, timeout=30)

    # GitHub answers rate limiting with either 403 or 429; treat both alike so
    # a 429 is not mistaken for an ordinary HTTP failure by callers.
    if resp.status_code in (403, 429) and "rate limit" in resp.text.lower():
        now = int(time.time())
        # Fall back to "one minute from now" when the reset header is absent.
        reset = int(resp.headers.get("X-RateLimit-Reset", now + 60))
        wait = max(0, reset - now) + 1
        raise RuntimeError(
            f"GitHub rate limit hit. Resets in {wait}s. Set GITHUB_TOKEN to increase limits."
        )

    if resp.status_code == 404:
        raise ValueError(f"Not found: {url}")

    resp.raise_for_status()
    return resp.json()
86
+
87
+
88
def get_repo_info(owner: str, repo: str, token: Optional[str] = None) -> dict:
    """Return the repository metadata object for ``owner/repo``."""
    endpoint = f"{GITHUB_API}/repos/{owner}/{repo}"
    return _get(endpoint, token)
90
+
91
+
92
def get_default_branch(owner: str, repo: str, token: Optional[str] = None) -> str:
    """Return the ``default_branch`` value from the repository metadata."""
    return get_repo_info(owner, repo, token)["default_branch"]
95
+
96
+
97
def fetch_file_tree(
    owner: str,
    repo: str,
    branch: str,
    token: Optional[str] = None,
    max_files: int = 500,
) -> list[FileNode]:
    """Fetch the file listing of *branch* via the GitHub Trees API (no cloning).

    Three requests are made: branch -> commit SHA, commit -> tree SHA, and the
    recursive tree itself. Only ``blob`` entries (files) are kept.

    Args:
        owner: Repository owner.
        repo: Repository name.
        branch: Branch whose tree is listed.
        token: Optional GitHub token.
        max_files: Upper bound on the number of returned nodes.

    Returns:
        At most *max_files* ``FileNode`` objects in the order GitHub lists
        them, each with ``path``, ``size`` and detected ``language`` set
        (``language`` is ``""`` for unrecognised extensions).

    Raises:
        Whatever ``_get`` raises for a failed request.
    """
    repo_api = f"{GITHUB_API}/repos/{owner}/{repo}"

    # Resolve branch -> commit -> tree.
    branch_data = _get(f"{repo_api}/branches/{branch}", token)
    commit_sha = branch_data["commit"]["sha"]

    commit_data = _get(f"{repo_api}/git/commits/{commit_sha}", token)
    tree_sha = commit_data["tree"]["sha"]

    tree_data = _get(
        f"{repo_api}/git/trees/{tree_sha}",
        token,
        params={"recursive": "1"},
    )

    # "truncated" is set by GitHub when the listing exceeds the API's own
    # limit; it is unrelated to max_files.
    if tree_data.get("truncated"):
        print(" Warning: GitHub truncated the repo tree (API size limit); some files are missing.")

    nodes: list[FileNode] = []
    capped = False
    for item in tree_data.get("tree", []):
        if item["type"] != "blob":
            continue
        # Check the cap before appending so that max_files=0 yields no files.
        if len(nodes) >= max_files:
            capped = True
            break

        path = item["path"]
        ext = "." + path.rsplit(".", 1)[-1] if "." in path else ""
        language = SUPPORTED_EXTENSIONS.get(ext.lower(), "")
        # FileNode declares path/size/language/content only, so the blob SHA
        # is not forwarded (passing it raised TypeError).
        nodes.append(
            FileNode(
                path=path,
                size=item.get("size", 0),
                language=language,
            )
        )

    if capped:
        print(f" Warning: repo has more than {max_files} files. Showing first {max_files}.")

    return nodes
142
+
143
+
144
def fetch_file_content(
    owner: str,
    repo: str,
    path: str,
    branch: str,
    token: Optional[str] = None,
) -> Optional[str]:
    """Fetch the decoded text of a single file through the contents API.

    Args:
        owner: Repository owner.
        repo: Repository name.
        path: Repo-relative file path.
        branch: Ref to read the file from.
        token: Optional GitHub token.

    Returns:
        The file content decoded as UTF-8 (undecodable bytes are replaced, not
        rejected), or ``None`` when the file is missing, the request fails
        with an HTTP error, the path is a directory, the payload is not
        base64-encoded, the file is larger than 500 KB, or the base64 payload
        is malformed.

    Raises:
        RuntimeError: propagated from ``_get`` when a rate limit is hit.
    """
    # Local import keeps the block self-contained.
    from urllib.parse import quote

    # Escape characters such as "#", "?" and spaces, which would otherwise be
    # read as URL syntax; "/" stays literal because it separates path segments.
    encoded_path = quote(path, safe="/")

    try:
        data = _get(
            f"{GITHUB_API}/repos/{owner}/{repo}/contents/{encoded_path}",
            token,
            params={"ref": branch},
        )
    except (ValueError, requests.HTTPError):
        return None

    if isinstance(data, list):
        return None  # It's a directory

    if data.get("encoding") != "base64":
        return None

    if data.get("size", 0) > 500_000:  # skip files > 500KB
        return None

    raw = data.get("content", "")
    try:
        decoded = base64.b64decode(raw)
    except (ValueError, TypeError):
        # binascii.Error (bad padding) is a ValueError; TypeError covers a
        # null/non-string "content" field.
        return None
    return decoded.decode("utf-8", errors="replace")
176
+
177
+
178
def fetch_source_files(
    owner: str,
    repo: str,
    branch: str,
    files: list[FileNode],
    token: Optional[str] = None,
    languages: Optional[set[str]] = None,
    max_fetch: int = 200,
) -> None:
    """Populate ``content`` on the source-code nodes of *files*, in place.

    Nodes without a language, and markdown/json/yaml/toml nodes, are skipped.
    When *languages* is given, only nodes whose language is in that set are
    fetched. At most *max_fetch* nodes are fetched; progress is printed after
    every 20th file.
    """
    non_code = ("markdown", "json", "yaml", "toml")

    selected = []
    for node in files:
        if not node.language or node.language in non_code:
            continue
        if languages is not None and node.language not in languages:
            continue
        selected.append(node)
    source_files = selected[:max_fetch]

    print(f" Fetching content for {len(source_files)} source files...")
    i = 0
    for file_node in source_files:
        i += 1
        if i % 20 == 0:
            print(f" {i}/{len(source_files)}...")
        file_node.content = fetch_file_content(owner, repo, file_node.path, branch, token)
repolens/graph.py ADDED
@@ -0,0 +1,126 @@
1
+ """Build and analyse dependency and call graphs from FileAnalysis data."""
2
+ from __future__ import annotations
3
+
4
+ from .models import FileAnalysis, FunctionNode, GraphStats
5
+
6
+
7
def build_graph(analyses: dict[str, FileAnalysis]) -> GraphStats:
    """Build import-graph and call-graph statistics from per-file analyses.

    Args:
        analyses: Mapping of file path -> ``FileAnalysis``.

    Returns:
        A ``GraphStats`` with:
          - ``import_edges``: de-duplicated resolved imports per file;
          - ``in_degree``: number of importing files per path (paths that are
            imported but not analysed are included too);
          - ``circular_deps``: cycles reported by ``_find_cycles``;
          - ``entry_points``: analysed files that nothing imports;
          - ``hub_files``: up to 10 ``(path, in_degree)`` pairs, most imported
            first;
          - ``functions``: ``"path::name"`` -> copied ``FunctionNode`` whose
            ``calls`` hold resolved function ids and whose ``callers`` hold
            the ids of functions calling it (both without duplicates).
    """
    stats = GraphStats()

    # ── Import graph ──────────────────────────────────────────────────────────
    in_degree: dict[str, int] = dict.fromkeys(analyses, 0)
    for path, fa in analyses.items():
        unique_imports = list(dict.fromkeys(fa.resolved_imports))  # dedupe, keep order
        stats.import_edges[path] = unique_imports
        for dep in unique_imports:
            in_degree[dep] = in_degree.get(dep, 0) + 1
    stats.in_degree = in_degree

    # ── Circular dependency detection ─────────────────────────────────────────
    stats.circular_deps = _find_cycles(stats.import_edges)

    # ── Entry points (source files nobody imports) ────────────────────────────
    stats.entry_points = [p for p in analyses if in_degree[p] == 0]

    # ── Hub files (most imported) ─────────────────────────────────────────────
    stats.hub_files = sorted(
        ((p, in_degree[p]) for p in analyses),
        key=lambda pair: pair[1],
        reverse=True,
    )[:10]

    # ── Function index & caller resolution ────────────────────────────────────
    # name -> list of function ids carrying that name, used to resolve calls.
    name_index: dict[str, list[str]] = {}
    for path, fa in analyses.items():
        for fn in fa.functions:
            fid = f"{path}::{fn.name}"
            # Copy so the caller's FunctionNode objects are not mutated below.
            # Same-named functions in one file share an id; the last one wins.
            stats.functions[fid] = FunctionNode(
                name=fn.name,
                file_path=fn.file_path,
                line_start=fn.line_start,
                line_end=fn.line_end,
                calls=list(fn.calls),
                docstring=fn.docstring,
            )
            ids = name_index.setdefault(fn.name, [])
            if fid not in ids:
                ids.append(fid)

    # Resolve raw call names to function ids and back-populate callers.
    for fid, fn in stats.functions.items():
        resolved_calls: list[str] = []
        for call in fn.calls:
            base_name = call.split(".")[-1]  # handle obj.method style
            resolved_calls.extend(name_index.get(base_name, []))
        fn.calls = list(dict.fromkeys(resolved_calls))
        # Register from the de-duplicated list so each caller appears once
        # per callee, even if the raw calls mention the callee several times.
        for callee in fn.calls:
            stats.functions[callee].callers.append(fid)

    return stats
66
+
67
+
68
+ def _find_cycles(edges: dict[str, list[str]]) -> list[list[str]]:
69
+ """Detect all simple cycles using iterative DFS (Johnson-lite)."""
70
+ WHITE, GRAY, BLACK = 0, 1, 2
71
+ color: dict[str, int] = {n: WHITE for n in edges}
72
+ # include all targets too
73
+ for deps in edges.values():
74
+ for d in deps:
75
+ color.setdefault(d, WHITE)
76
+
77
+ cycles: list[list[str]] = []
78
+ parent: dict[str, str | None] = {}
79
+
80
+ for start in list(color):
81
+ if color[start] != WHITE:
82
+ continue
83
+ stack = [(start, iter(edges.get(start, [])))]
84
+ color[start] = GRAY
85
+ path = [start]
86
+ parent[start] = None
87
+
88
+ while stack:
89
+ node, children = stack[-1]
90
+ try:
91
+ child = next(children)
92
+ if color.get(child, WHITE) == GRAY:
93
+ # Found a back-edge → extract cycle
94
+ idx = path.index(child)
95
+ cycle = path[idx:]
96
+ # Deduplicate: only add if we haven't seen this cycle (by sorted set)
97
+ cycle_key = frozenset(cycle)
98
+ if not any(frozenset(c) == cycle_key for c in cycles):
99
+ cycles.append(cycle)
100
+ elif color.get(child, WHITE) == WHITE:
101
+ color[child] = GRAY
102
+ parent[child] = node
103
+ path.append(child)
104
+ stack.append((child, iter(edges.get(child, []))))
105
+ except StopIteration:
106
+ color[node] = BLACK
107
+ stack.pop()
108
+ if path and path[-1] == node:
109
+ path.pop()
110
+
111
+ return cycles
112
+
113
+
114
def importers_of(path: str, stats: GraphStats) -> list[str]:
    """List every file whose import edges contain *path*."""
    importers: list[str] = []
    for src, deps in stats.import_edges.items():
        if path in deps:
            importers.append(src)
    return importers
117
+
118
+
119
def callers_of(function_id: str, stats: GraphStats) -> list[str]:
    """Return the ``callers`` list of *function_id*, or ``[]`` if it is unknown."""
    fn = stats.functions.get(function_id)
    if not fn:
        return []
    return fn.callers
122
+
123
+
124
def callees_of(function_id: str, stats: GraphStats) -> list[str]:
    """Return the ``calls`` list of *function_id*, or ``[]`` if it is unknown."""
    fn = stats.functions.get(function_id)
    if not fn:
        return []
    return fn.calls
repolens/models.py ADDED
@@ -0,0 +1,52 @@
1
+ from dataclasses import dataclass, field
2
+ from typing import Optional
3
+
4
+
5
@dataclass
class FileNode:
    """A single file discovered in a repository."""

    # Repo-relative path.
    path: str
    # File size; defaults to 0.
    size: int = 0
    # Language label; empty string by default.
    language: str = ""
    # File text, or None when no content is attached.
    content: Optional[str] = None
11
+
12
+
13
@dataclass
class FunctionNode:
    """A function found in a source file, plus its call relationships."""

    name: str
    file_path: str
    line_start: int
    line_end: int
    # Raw call names found in the body.
    calls: list[str] = field(default_factory=list)
    # Populated by the graph builder.
    callers: list[str] = field(default_factory=list)
    docstring: Optional[str] = None
22
+
23
+
24
@dataclass
class FileAnalysis:
    """Imports, functions and classes extracted from one file."""

    path: str
    language: str
    raw_imports: list[str] = field(default_factory=list)
    # Repo-relative paths.
    resolved_imports: list[str] = field(default_factory=list)
    functions: list[FunctionNode] = field(default_factory=list)
    classes: list[str] = field(default_factory=list)
32
+
33
+
34
@dataclass
class GraphStats:
    """Aggregated import-graph and call-graph results."""

    # file_path -> list of files it imports (within repo).
    import_edges: dict[str, list[str]] = field(default_factory=dict)
    # file_path -> in-degree (how many files import it).
    in_degree: dict[str, int] = field(default_factory=dict)
    circular_deps: list[list[str]] = field(default_factory=list)
    # Nothing imports these.
    entry_points: list[str] = field(default_factory=list)
    # (path, in_degree) top-10.
    hub_files: list[tuple[str, int]] = field(default_factory=list)
    # function_id "file::name" -> FunctionNode.
    functions: dict[str, FunctionNode] = field(default_factory=dict)
45
+
46
+
47
@dataclass
class RepoAnalysis:
    """Top-level result: discovered files, per-file analyses and graph stats."""

    # Absolute path to scanned directory.
    root: str
    files: list[FileNode] = field(default_factory=list)
    file_analyses: dict[str, FileAnalysis] = field(default_factory=dict)
    stats: GraphStats = field(default_factory=GraphStats)
repolens/scanner.py ADDED
@@ -0,0 +1,69 @@
1
+ import os
2
+ from pathlib import Path
3
+
4
+ from .models import FileNode
5
+
6
# Lower-case file suffix (leading dot included) -> language label.
# Files with any other suffix are ignored by the scanner.
SUPPORTED_EXTENSIONS: dict[str, str] = {
    ".py": "python",
    ".js": "javascript",
    ".jsx": "javascript",
    ".ts": "typescript",
    ".tsx": "typescript",
    ".go": "go",
    ".rs": "rust",
    ".java": "java",
    ".rb": "ruby",
    ".cpp": "cpp",
    ".c": "c",
    ".h": "c",
    ".cs": "csharp",
    ".php": "php",
    ".swift": "swift",
    ".kt": "kotlin",
    ".scala": "scala",
    ".md": "markdown",
}

# Directory names that are never descended into.
SKIP_DIRS = {
    # version control
    ".git",
    ".hg",
    ".svn",
    # dependency and tool caches
    "node_modules",
    "__pycache__",
    ".mypy_cache",
    ".ruff_cache",
    # virtual environments
    ".venv",
    "venv",
    "env",
    ".env",
    # build output
    "dist",
    "build",
    "out",
    "target",
    # editor settings
    ".idea",
    ".vscode",
    # bundled third-party code
    "vendor",
    "third_party",
}

# Files larger than this many bytes are listed without their content.
MAX_FILE_SIZE = 500_000  # 500 KB
37
+
38
+
39
def scan(root: str, max_files: int = 2000) -> list[FileNode]:
    """Walk *root* and return a ``FileNode`` for every supported source file.

    Directories named in ``SKIP_DIRS`` and directories starting with ``.``
    are not descended into. Only files whose lower-cased suffix is in
    ``SUPPORTED_EXTENSIONS`` are reported. Content is attached for files of
    at most ``MAX_FILE_SIZE`` bytes; larger or unreadable files are listed
    with ``content=None``. Files that cannot be stat'ed are skipped.

    Args:
        root: Directory to scan.
        max_files: Upper bound on the number of returned nodes.

    Returns:
        At most *max_files* nodes, sorted by repo-relative POSIX path.
    """
    root_path = Path(root).resolve()
    nodes: list[FileNode] = []

    for dirpath, dirnames, filenames in os.walk(root_path):
        # Prune ignored directories in place; sorting makes the traversal, and
        # therefore the selection under max_files, independent of OS ordering.
        dirnames[:] = sorted(
            d for d in dirnames if d not in SKIP_DIRS and not d.startswith(".")
        )

        for filename in sorted(filenames):
            if len(nodes) >= max_files:
                break

            full_path = Path(dirpath) / filename
            language = SUPPORTED_EXTENSIONS.get(full_path.suffix.lower())
            if not language:
                continue

            try:
                size = full_path.stat().st_size
            except OSError:
                # Broken symlink or unreadable entry: skip instead of aborting.
                continue

            content: str | None = None
            if size <= MAX_FILE_SIZE:
                try:
                    content = full_path.read_text(encoding="utf-8", errors="replace")
                except OSError:
                    pass  # keep the node, just without content

            rel_path = full_path.relative_to(root_path).as_posix()
            nodes.append(FileNode(path=rel_path, size=size, language=language, content=content))

        # Stop the walk itself once the cap is reached rather than visiting
        # every remaining directory for nothing.
        if len(nodes) >= max_files:
            break

    return sorted(nodes, key=lambda n: n.path)
File without changes