repr-cli 0.1.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
repr/discovery.py ADDED
@@ -0,0 +1,472 @@
+ """
+ Git repository discovery - find repos recursively in directories.
+ """
+
+ import hashlib
+ from dataclasses import dataclass
+ from datetime import datetime
+ from pathlib import Path
+
+ from git import Repo, InvalidGitRepositoryError
+ from git.exc import GitCommandError
+
+ from .config import get_skip_patterns, get_repo_hash, set_repo_hash
+
+
+ @dataclass
+ class RepoInfo:
+     """Information about a discovered repository."""
+
+     path: Path
+     name: str
+     commit_count: int
+     user_commit_count: int  # User's own commits
+     first_commit_date: datetime | None
+     last_commit_date: datetime | None
+     primary_language: str | None
+     languages: dict[str, int] | None  # Language -> percentage mapping
+     contributors: int
+     is_fork: bool
+     remote_url: str | None
+     description: str | None  # From README first line or repo
+     frameworks: list[str] | None  # Detected frameworks/libraries
+     has_tests: bool  # Has test directory or test files
+     has_ci: bool  # Has CI/CD config (.github/workflows, .gitlab-ci, etc.)
+
+     @property
+     def age_months(self) -> int:
+         """Calculate repository age in months."""
+         if not self.first_commit_date:
+             return 0
+
+         now = datetime.now()
+         delta = now - self.first_commit_date
+         return int(delta.days / 30)
+
+     @property
+     def age_display(self) -> str:
+         """Human-readable age string."""
+         months = self.age_months
+         if months < 1:
+             return "< 1 mo"
+         elif months < 12:
+             return f"{months} mo"
+         else:
+             years = months // 12
+             return f"{years}+ yr"
+
+     def compute_hash(self) -> str:
+         """Compute a hash representing the current state of the repo."""
+         try:
+             repo = Repo(self.path)
+             head_sha = repo.head.commit.hexsha
+             commit_count = str(self.commit_count)
+             hash_input = f"{head_sha}:{commit_count}".encode()
+             return hashlib.sha256(hash_input).hexdigest()[:16]
+         except Exception:
+             return ""
+
+     def to_dict(self) -> dict:
+         """Convert to dictionary for serialization."""
+         return {
+             "path": str(self.path),
+             "name": self.name,
+             "commit_count": self.commit_count,
+             "first_commit": self.first_commit_date.isoformat() if self.first_commit_date else None,
+             "last_commit": self.last_commit_date.isoformat() if self.last_commit_date else None,
+             "languages": self.languages or {},
+             "contributors": self.contributors,
+             "is_fork": self.is_fork,
+             "remote_url": self.remote_url,
+             "age_months": self.age_months,
+         }
+
+
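For orientation, here is a small sketch of how the age helpers above behave; the RepoInfo values are invented for illustration:

    from datetime import datetime, timedelta
    from pathlib import Path

    from repr.discovery import RepoInfo

    # Hypothetical repo whose first commit landed ~550 days ago
    info = RepoInfo(
        path=Path("/tmp/example"), name="example",
        commit_count=42, user_commit_count=40,
        first_commit_date=datetime.now() - timedelta(days=550),
        last_commit_date=datetime.now(),
        primary_language=None, languages=None, contributors=1,
        is_fork=False, remote_url=None, description=None,
        frameworks=None, has_tests=False, has_ci=False,
    )
    print(info.age_months)   # 18  (days / 30, truncated)
    print(info.age_display)  # "1+ yr"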
+ def should_skip_directory(path: Path, skip_patterns: list[str]) -> bool:
+     """Check if a directory should be skipped."""
+     name = path.name
+
+     # Always skip hidden directories (the .git check itself happens elsewhere)
+     if name.startswith(".") and name != ".git":
+         return True
+
+     # Check against skip patterns
+     for pattern in skip_patterns:
+         if name == pattern or name.lower() == pattern.lower():
+             return True
+
+     return False
+
+
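A quick illustration of the matching rules above (paths are hypothetical); note that skip patterns are compared case-insensitively:

    from pathlib import Path

    from repr.discovery import should_skip_directory

    print(should_skip_directory(Path("/src/app/node_modules"), ["node_modules"]))  # True
    print(should_skip_directory(Path("/src/app/Node_Modules"), ["node_modules"]))  # True (case-insensitive)
    print(should_skip_directory(Path("/home/dev/.cache"), []))                     # True (hidden directory)
    print(should_skip_directory(Path("/home/dev/projects"), ["node_modules"]))     # False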
+ def discover_repos(
+     root_paths: list[Path],
+     skip_patterns: list[str] | None = None,
+     min_commits: int = 10,
+     use_cache: bool = True,
+ ) -> list[RepoInfo]:
+     """
+     Discover git repositories recursively.
+
+     Args:
+         root_paths: List of directories to search
+         skip_patterns: Patterns to skip (default from config)
+         min_commits: Minimum commits to include repo
+         use_cache: Whether to use cached repo hashes
+
+     Returns:
+         List of discovered repositories
+     """
+     if skip_patterns is None:
+         skip_patterns = get_skip_patterns()
+
+     repos: list[RepoInfo] = []
+     visited_paths: set[Path] = set()
+
+     for root_path in root_paths:
+         root = Path(root_path).expanduser().resolve()
+         if not root.exists():
+             continue
+
+         # Search for .git directories
+         for git_dir in _find_git_dirs(root, skip_patterns, visited_paths):
+             repo_path = git_dir.parent
+
+             try:
+                 repo_info = analyze_repo(repo_path)
+
+                 # Skip repos with too few commits
+                 if repo_info.commit_count < min_commits:
+                     continue
+
+                 # Check cache if enabled
+                 if use_cache:
+                     cached_hash = get_repo_hash(str(repo_path))
+                     current_hash = repo_info.compute_hash()
+                     if cached_hash == current_hash:
+                         repo_info._cached = True  # type: ignore
+                     else:
+                         set_repo_hash(str(repo_path), current_hash)
+
+                 repos.append(repo_info)
+
+             except (InvalidGitRepositoryError, GitCommandError, Exception):
+                 # Skip invalid or problematic repos
+                 continue
+
+     return repos
+
+
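A rough usage sketch for the discovery entry point; the paths and threshold below are illustrative, not defaults documented by the package:

    from pathlib import Path

    from repr.discovery import discover_repos

    # Scan two directory trees, keeping only repos with at least 5 commits
    repos = discover_repos(
        root_paths=[Path("~/projects"), Path("~/work")],
        min_commits=5,
        use_cache=False,
    )
    for info in sorted(repos, key=lambda r: r.commit_count, reverse=True):
        print(f"{info.name}: {info.commit_count} commits, {info.age_display}")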
+ def _find_git_dirs(
+     root: Path,
+     skip_patterns: list[str],
+     visited: set[Path],
+ ) -> list[Path]:
+     """Find all .git directories under root."""
+     git_dirs: list[Path] = []
+
+     def search(path: Path, depth: int = 0) -> None:
+         if depth > 10:  # Limit recursion depth
+             return
+
+         if path in visited:
+             return
+         visited.add(path)
+
+         # If this directory is itself a repo root, record it and stop;
+         # don't recurse into repo subdirectories.
+         if (path / ".git").is_dir():
+             git_dirs.append(path / ".git")
+             return
+
+         try:
+             for item in path.iterdir():
+                 if not item.is_dir():
+                     continue
+
+                 if should_skip_directory(item, skip_patterns):
+                     continue
+
+                 search(item, depth + 1)
+         except PermissionError:
+             pass
+
+     search(root)
+     return git_dirs
+
+
+ def analyze_repo(path: Path) -> RepoInfo:
+     """
+     Analyze a single repository.
+
+     Args:
+         path: Path to repository root
+
+     Returns:
+         RepoInfo with repository metadata
+     """
+     repo = Repo(path)
+
+     # Get user's git config for identifying their commits
+     user_email = None
+     user_name = None
+     try:
+         user_email = repo.config_reader().get_value("user", "email", default=None)
+         user_name = repo.config_reader().get_value("user", "name", default=None)
+     except Exception:
+         pass
+
+     # Get commit counts (total and user's own)
+     commit_count = 0
+     user_commit_count = 0
+     try:
+         for commit in repo.iter_commits():
+             commit_count += 1
+             # Check if commit is by user
+             if user_email and commit.author.email == user_email:
+                 user_commit_count += 1
+             elif user_name and commit.author.name == user_name:
+                 user_commit_count += 1
+     except Exception:
+         pass
+
+     # Get date range
+     first_commit_date = None
+     last_commit_date = None
+
+     try:
+         commits = list(repo.iter_commits())
+         if commits:
+             # iter_commits() yields newest first, so [0] is the most recent commit
+             last_commit_date = datetime.fromtimestamp(commits[0].committed_date)
+             first_commit_date = datetime.fromtimestamp(commits[-1].committed_date)
+     except Exception:
+         pass
+
+     # Get contributors
+     contributors = set()
+     try:
+         for commit in repo.iter_commits():
+             contributors.add(commit.author.email)
+     except Exception:
+         pass
+
+     # Get remote URL
+     remote_url = None
+     is_fork = False
+     try:
+         if repo.remotes:
+             remote = repo.remotes.origin
+             remote_url = remote.url
+             # Simple fork detection - could be improved
+             is_fork = "fork" in remote_url.lower() if remote_url else False
+     except Exception:
+         pass
+
+     # Get description from README
+     description = _get_repo_description(path)
+
+     # Detect frameworks
+     frameworks = _detect_frameworks(path)
+
+     # Check for tests
+     has_tests = _has_tests(path)
+
+     # Check for CI/CD
+     has_ci = _has_ci(path)
+
+     # Primary language and languages will be detected by the extractor
+     primary_language = None
+     languages = None
+
+     return RepoInfo(
+         path=path,
+         name=path.name,
+         commit_count=commit_count,
+         user_commit_count=user_commit_count,
+         first_commit_date=first_commit_date,
+         last_commit_date=last_commit_date,
+         primary_language=primary_language,
+         languages=languages,
+         contributors=len(contributors),
+         is_fork=is_fork,
+         remote_url=remote_url,
+         description=description,
+         frameworks=frameworks,
+         has_tests=has_tests,
+         has_ci=has_ci,
+     )
+
+
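A minimal sketch of calling analyze_repo directly on a single checkout (the path is hypothetical; note the caller is responsible for expanding "~"):

    from pathlib import Path

    from repr.discovery import analyze_repo

    info = analyze_repo(Path("~/projects/my-app").expanduser())
    print(info.name, info.commit_count, info.contributors, info.frameworks)
    print(info.to_dict())  # JSON-friendly summary (a subset of the fields above)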
+ def _get_repo_description(path: Path) -> str | None:
+     """Extract description from README file."""
+     readme_names = ["README.md", "README.rst", "README.txt", "README"]
+     for name in readme_names:
+         readme_path = path / name
+         if readme_path.exists():
+             try:
+                 content = readme_path.read_text(errors='ignore')
+                 lines = content.strip().split('\n')
+                 # Skip headings (#) and image/badge lines (!), return the first plain text line
+                 for line in lines:
+                     line = line.strip()
+                     if line and not line.startswith('#') and not line.startswith('!'):
+                         # Truncate to a reasonable length
+                         return line[:200] if len(line) > 200 else line
+             except Exception:
+                 pass
+     return None
+
+
+ def _detect_frameworks(path: Path) -> list[str] | None:
+     """Detect frameworks and major libraries used."""
+     frameworks = []
+
+     # Python frameworks
+     requirements_files = ["requirements.txt", "requirements.in", "pyproject.toml", "setup.py"]
+     python_frameworks = {
+         "fastapi": "FastAPI", "django": "Django", "flask": "Flask",
+         "pytorch": "PyTorch", "torch": "PyTorch", "tensorflow": "TensorFlow",
+         "pandas": "Pandas", "numpy": "NumPy", "scikit-learn": "scikit-learn",
+         "celery": "Celery", "sqlalchemy": "SQLAlchemy", "pydantic": "Pydantic",
+     }
+
+     for req_file in requirements_files:
+         req_path = path / req_file
+         if req_path.exists():
+             try:
+                 content = req_path.read_text(errors='ignore').lower()
+                 for key, name in python_frameworks.items():
+                     if key in content and name not in frameworks:
+                         frameworks.append(name)
+             except Exception:
+                 pass
+
+     # JavaScript/TypeScript frameworks
+     package_json = path / "package.json"
+     if package_json.exists():
+         try:
+             import json
+             data = json.loads(package_json.read_text())
+             deps = {**data.get("dependencies", {}), **data.get("devDependencies", {})}
+             js_frameworks = {
+                 "react": "React", "next": "Next.js", "vue": "Vue",
+                 "angular": "Angular", "svelte": "Svelte", "express": "Express",
+                 "nestjs": "NestJS", "@nestjs/core": "NestJS",
+                 "tailwindcss": "Tailwind", "typescript": "TypeScript",
+             }
+             for key, name in js_frameworks.items():
+                 if key in deps and name not in frameworks:
+                     frameworks.append(name)
+         except Exception:
+             pass
+
+     # Rust frameworks
+     cargo_toml = path / "Cargo.toml"
+     if cargo_toml.exists():
+         try:
+             content = cargo_toml.read_text(errors='ignore').lower()
+             rust_frameworks = {
+                 "actix": "Actix", "axum": "Axum", "tokio": "Tokio",
+                 "rocket": "Rocket", "warp": "Warp",
+             }
+             for key, name in rust_frameworks.items():
+                 if key in content and name not in frameworks:
+                     frameworks.append(name)
+         except Exception:
+             pass
+
+     # Go frameworks
+     go_mod = path / "go.mod"
+     if go_mod.exists():
+         try:
+             content = go_mod.read_text(errors='ignore').lower()
+             go_frameworks = {
+                 "gin-gonic": "Gin", "echo": "Echo", "fiber": "Fiber",
+             }
+             for key, name in go_frameworks.items():
+                 if key in content and name not in frameworks:
+                     frameworks.append(name)
+         except Exception:
+             pass
+
+     return frameworks if frameworks else None
+
+
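To illustrate the substring-based detection above against a made-up project layout:

    import json
    from pathlib import Path

    from repr.discovery import _detect_frameworks

    # Hypothetical project: a FastAPI backend with a React/TypeScript frontend
    project = Path("/tmp/demo-project")
    project.mkdir(parents=True, exist_ok=True)
    (project / "requirements.txt").write_text("fastapi\npydantic\nsqlalchemy\n")
    (project / "package.json").write_text(json.dumps({
        "dependencies": {"react": "^18.2.0"},
        "devDependencies": {"typescript": "^5.4.0"},
    }))

    print(_detect_frameworks(project))
    # ['FastAPI', 'SQLAlchemy', 'Pydantic', 'React', 'TypeScript']  (order follows the lookup tables)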
+ def _has_tests(path: Path) -> bool:
+     """Check if repository has tests."""
+     test_indicators = [
+         "tests", "test", "__tests__", "spec", "specs",
+         "pytest.ini", "jest.config.js", "jest.config.ts",
+         ".pytest_cache", "conftest.py",
+     ]
+     for indicator in test_indicators:
+         if (path / indicator).exists():
+             return True
+
+     # Check for test files anywhere in the tree
+     for pattern in ["**/test_*.py", "**/*_test.py", "**/*.test.ts", "**/*.spec.ts"]:
+         if list(path.glob(pattern)):
+             return True
+
+     return False
+
+
+ def _has_ci(path: Path) -> bool:
+     """Check if repository has CI/CD configuration."""
+     ci_paths = [
+         ".github/workflows",
+         ".gitlab-ci.yml",
+         ".circleci",
+         "Jenkinsfile",
+         ".travis.yml",
+         "azure-pipelines.yml",
+         ".drone.yml",
+         "bitbucket-pipelines.yml",
+     ]
+     for ci_path in ci_paths:
+         if (path / ci_path).exists():
+             return True
+     return False
+
+
+ def is_config_only_repo(path: Path) -> bool:
+     """
+     Check if a repository only contains config files (dotfiles, etc.).
+
+     Args:
+         path: Path to repository root
+
+     Returns:
+         True if repo appears to be config-only
+     """
+     config_indicators = {
+         "dotfiles",
+         ".dotfiles",
+         "config",
+         ".config",
+     }
+
+     # Check repo name
+     if path.name.lower() in config_indicators:
+         return True
+
+     # Check file types
+     code_extensions = {
+         ".py", ".js", ".ts", ".tsx", ".jsx",
+         ".go", ".rs", ".java", ".kt", ".swift",
+         ".c", ".cpp", ".h", ".hpp",
+         ".rb", ".php", ".cs", ".scala",
+     }
+
+     has_code = False
+     try:
+         for file in path.rglob("*"):
+             if file.is_file() and file.suffix in code_extensions:
+                 # Check it's not in a hidden directory
+                 parts = file.relative_to(path).parts
+                 if not any(p.startswith(".") for p in parts[:-1]):
+                     has_code = True
+                     break
+     except Exception:
+         pass
+
+     return not has_code
+
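Finally, a hedged end-to-end sketch of how these pieces could compose; filtering on is_config_only_repo is an assumption here, since discover_repos does not apply it on its own:

    from pathlib import Path

    from repr.discovery import discover_repos, is_config_only_repo

    repos = discover_repos([Path("~/code")], min_commits=10)

    # Drop config-only repos such as dotfiles (illustrative filtering)
    repos = [r for r in repos if not is_config_only_repo(r.path)]

    for r in repos:
        flags = [name for name, present in [("tests", r.has_tests), ("ci", r.has_ci)] if present]
        print(f"{r.name:20} {r.age_display:>7}  {', '.join(flags) or '-'}")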