repr-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
repr/extractor.py ADDED
@@ -0,0 +1,388 @@
1
+ """
2
+ Extract signals from repositories - languages, dependencies, etc.
3
+ """
4
+
5
+ import json
6
+ import re
7
+ from collections import Counter
8
+ from pathlib import Path
9
+
10
+ from pygments.lexers import get_lexer_for_filename, ClassNotFound
11
+
12
+
13
# Maps a file suffix (leading dot included) to the language name reported
# by detect_languages(). Several suffixes may share one language.
LANGUAGE_EXTENSIONS: dict[str, str] = {
    # General-purpose and web languages
    ".py": "Python",
    ".js": "JavaScript",
    ".ts": "TypeScript",
    ".tsx": "TypeScript",
    ".jsx": "JavaScript",
    ".go": "Go",
    ".rs": "Rust",
    ".java": "Java",
    ".kt": "Kotlin",
    ".swift": "Swift",
    # C family (headers are attributed by suffix only)
    ".c": "C",
    ".cpp": "C++",
    ".h": "C",
    ".hpp": "C++",
    ".rb": "Ruby",
    ".php": "PHP",
    ".cs": "C#",
    ".scala": "Scala",
    ".clj": "Clojure",
    ".ex": "Elixir",
    ".exs": "Elixir",
    ".erl": "Erlang",
    ".hs": "Haskell",
    ".lua": "Lua",
    # Both spellings are listed; detect_languages() lower-cases suffixes
    # before the lookup, so it only ever hits ".r".
    ".r": "R",
    ".R": "R",
    ".jl": "Julia",
    ".dart": "Dart",
    # Front-end component formats
    ".vue": "Vue",
    ".svelte": "Svelte",
    # Query and shell scripts
    ".sql": "SQL",
    ".sh": "Shell",
    ".bash": "Shell",
    ".zsh": "Shell",
}
50
+
51
# Directory/file names that are ignored while scanning a repository:
# dependency caches, virtual environments, VCS metadata and build output.
SKIP_PATTERNS: set[str] = {
    # Third-party code and environments
    "node_modules",
    "venv",
    ".venv",
    "vendor",
    # Caches and version-control metadata
    "__pycache__",
    ".git",
    # Build and report output
    "dist",
    "build",
    ".next",
    "target",
    "coverage",
}
65
+
66
+
67
def detect_languages(repo_path: Path) -> dict[str, float]:
    """
    Detect languages used in a repository.

    Walks the tree under ``repo_path``, maps each file's lower-cased suffix
    through ``LANGUAGE_EXTENSIONS`` and reports every language's share of
    the recognised files. Directories named in ``SKIP_PATTERNS`` are pruned
    before they are entered, and directories that cannot be listed are
    skipped instead of aborting the whole scan.

    Args:
        repo_path: Path to repository

    Returns:
        Dictionary of language -> percentage (rounded to one decimal),
        ordered from most to least common with ties broken by name.
        Languages below 1% are omitted. An empty dictionary is returned
        when no recognised source file is found.
    """
    import os  # function-scope import: the module's import header stays untouched

    language_counts: Counter[str] = Counter()

    # os.walk swallows directory-listing errors by default (onerror=None),
    # so one unreadable directory no longer truncates the result.
    for dirpath, dirnames, filenames in os.walk(repo_path):
        # In-place pruning stops os.walk from descending into skipped trees.
        dirnames[:] = [name for name in dirnames if name not in SKIP_PATTERNS]

        for filename in filenames:
            if filename in SKIP_PATTERNS:
                continue
            language = LANGUAGE_EXTENSIONS.get(Path(filename).suffix.lower())
            if language is None:
                continue
            # filenames also contains broken symlinks and special files;
            # only stat the candidates that would actually be counted.
            if os.path.isfile(os.path.join(dirpath, filename)):
                language_counts[language] += 1

    total_files = sum(language_counts.values())
    if total_files == 0:
        return {}

    # Sort by count (descending) then name so the output is deterministic
    # regardless of the order in which the filesystem lists entries.
    ranked = sorted(language_counts.items(), key=lambda item: (-item[1], item[0]))

    result: dict[str, float] = {}
    for language, count in ranked:
        percentage = (count / total_files) * 100
        if percentage >= 1:  # Only include if >= 1%
            result[language] = round(percentage, 1)

    return result
115
+
116
+
117
def get_primary_language(repo_path: Path) -> str | None:
    """
    Return the language with the largest share in a repository.

    Args:
        repo_path: Path to repository

    Returns:
        The name of the language with the highest percentage according to
        ``detect_languages``, or None when no language was detected.
    """
    shares = detect_languages(repo_path)
    if not shares:
        return None
    # max() keeps the first key on ties, exactly like keying on dict.get
    return max(shares, key=lambda name: shares[name])
131
+
132
+
133
def detect_dependencies(repo_path: Path) -> dict[str, list[str]]:
    """
    Detect dependencies from package files.

    Looks for well-known manifest files directly inside ``repo_path`` and
    runs the matching parser on each one that exists. Results from several
    manifests of the same ecosystem (``requirements.txt`` and
    ``pyproject.toml``) are merged.

    Args:
        repo_path: Path to repository

    Returns:
        Dictionary of ecosystem -> list of dependency names. Ecosystems
        without any detected dependency are omitted. Each list is
        de-duplicated and keeps first-seen order, so the result is stable
        between runs.
    """
    # (ecosystem key, manifest file name, parser), in reporting order
    manifests = (
        ("python", "requirements.txt", _parse_requirements_txt),
        ("python", "pyproject.toml", _parse_pyproject_toml),
        ("nodejs", "package.json", _parse_package_json),
        ("go", "go.mod", _parse_go_mod),
        ("rust", "Cargo.toml", _parse_cargo_toml),
        ("ruby", "Gemfile", _parse_gemfile),
    )

    collected: dict[str, list[str]] = {}
    for ecosystem, filename, parser in manifests:
        manifest_path = repo_path / filename
        if not manifest_path.exists():
            continue
        names = parser(manifest_path)
        if names:
            collected.setdefault(ecosystem, []).extend(names)

    # dict.fromkeys de-duplicates while preserving order; set() would
    # reorder the names differently on every interpreter start.
    return {
        ecosystem: list(dict.fromkeys(names))
        for ecosystem, names in collected.items()
    }
189
+
190
+
191
+ def _parse_requirements_txt(path: Path) -> list[str]:
192
+ """Parse Python requirements.txt file."""
193
+ deps = []
194
+ try:
195
+ content = path.read_text()
196
+ for line in content.splitlines():
197
+ line = line.strip()
198
+ if not line or line.startswith("#") or line.startswith("-"):
199
+ continue
200
+ # Extract package name (before version specifier)
201
+ match = re.match(r"^([a-zA-Z0-9_-]+)", line)
202
+ if match:
203
+ deps.append(match.group(1).lower())
204
+ except Exception:
205
+ pass
206
+ return deps
207
+
208
+
209
+ def _parse_pyproject_toml(path: Path) -> list[str]:
210
+ """Parse Python pyproject.toml dependencies."""
211
+ deps = []
212
+ try:
213
+ content = path.read_text()
214
+ # Simple parsing - look for dependencies array
215
+ in_deps = False
216
+ for line in content.splitlines():
217
+ if "dependencies" in line and "=" in line:
218
+ in_deps = True
219
+ continue
220
+ if in_deps:
221
+ if line.strip().startswith("]"):
222
+ in_deps = False
223
+ continue
224
+ # Extract package name
225
+ match = re.search(r'"([a-zA-Z0-9_-]+)', line)
226
+ if match:
227
+ deps.append(match.group(1).lower())
228
+ except Exception:
229
+ pass
230
+ return deps
231
+
232
+
233
+ def _parse_package_json(path: Path) -> list[str]:
234
+ """Parse Node.js package.json dependencies."""
235
+ deps = []
236
+ try:
237
+ content = json.loads(path.read_text())
238
+ for key in ["dependencies", "devDependencies"]:
239
+ if key in content:
240
+ deps.extend(content[key].keys())
241
+ except Exception:
242
+ pass
243
+ return deps
244
+
245
+
246
+ def _parse_go_mod(path: Path) -> list[str]:
247
+ """Parse Go go.mod dependencies."""
248
+ deps = []
249
+ try:
250
+ content = path.read_text()
251
+ in_require = False
252
+ for line in content.splitlines():
253
+ line = line.strip()
254
+ if line.startswith("require ("):
255
+ in_require = True
256
+ continue
257
+ if in_require:
258
+ if line == ")":
259
+ in_require = False
260
+ continue
261
+ # Extract module path
262
+ parts = line.split()
263
+ if parts:
264
+ deps.append(parts[0])
265
+ elif line.startswith("require "):
266
+ parts = line.split()
267
+ if len(parts) >= 2:
268
+ deps.append(parts[1])
269
+ except Exception:
270
+ pass
271
+ return deps
272
+
273
+
274
+ def _parse_cargo_toml(path: Path) -> list[str]:
275
+ """Parse Rust Cargo.toml dependencies."""
276
+ deps = []
277
+ try:
278
+ content = path.read_text()
279
+ in_deps = False
280
+ for line in content.splitlines():
281
+ line = line.strip()
282
+ if line == "[dependencies]" or line == "[dev-dependencies]":
283
+ in_deps = True
284
+ continue
285
+ if line.startswith("[") and in_deps:
286
+ in_deps = False
287
+ continue
288
+ if in_deps and "=" in line:
289
+ name = line.split("=")[0].strip()
290
+ if name and not name.startswith("#"):
291
+ deps.append(name)
292
+ except Exception:
293
+ pass
294
+ return deps
295
+
296
+
297
+ def _parse_gemfile(path: Path) -> list[str]:
298
+ """Parse Ruby Gemfile dependencies."""
299
+ deps = []
300
+ try:
301
+ content = path.read_text()
302
+ for line in content.splitlines():
303
+ line = line.strip()
304
+ if line.startswith("gem "):
305
+ # Extract gem name
306
+ match = re.search(r"gem ['\"]([^'\"]+)['\"]", line)
307
+ if match:
308
+ deps.append(match.group(1))
309
+ except Exception:
310
+ pass
311
+ return deps
312
+
313
+
314
def get_file_tree(repo_path: Path, max_depth: int = 3) -> dict:
    """
    Build a nested mapping that mirrors a repository's directory layout.

    Directories appear under their name with a trailing "/" and map to
    their own subtree (or to "..." once ``max_depth`` is exceeded); files
    map to None. Hidden entries, names in ``SKIP_PATTERNS`` and empty
    directories are left out.

    Args:
        repo_path: Path to repository
        max_depth: Maximum depth to traverse

    Returns:
        Nested dictionary representing file tree
    """
    def build_tree(node: Path, level: int = 0) -> dict | str:
        # Past the depth limit the subtree is replaced by a placeholder
        if level > max_depth:
            return "..."

        if node.is_file():
            return node.name

        tree = {}
        try:
            for entry in sorted(node.iterdir()):
                name = entry.name
                is_hidden = name.startswith(".")
                if is_hidden or name in SKIP_PATTERNS:
                    continue

                if not entry.is_dir():
                    tree[name] = None
                    continue

                children = build_tree(entry, level + 1)
                if children:  # empty directories are dropped
                    tree[name + "/"] = children
        except PermissionError:
            # Keep whatever was collected before access was denied
            pass

        return tree

    return build_tree(repo_path)
351
+
352
+
353
def get_file_tree_flat(repo_path: Path, max_depth: int = 3) -> list[str]:
    """
    Get a flat list of file paths in the repository.

    Entries are listed depth-first in sorted order. Hidden entries and
    names in ``SKIP_PATTERNS`` are skipped, and directories deeper than
    ``max_depth`` are listed but not expanded.

    Args:
        repo_path: Path to repository
        max_depth: Maximum depth to traverse

    Returns:
        List of paths relative to ``repo_path``, always written with "/"
        as separator on every platform. Directories carry a trailing "/".
    """
    files: list[str] = []

    def walk(path: Path, depth: int = 0) -> None:
        if depth > max_depth:
            return

        try:
            for item in sorted(path.iterdir()):
                # Skip hidden and common skip patterns
                if item.name.startswith(".") or item.name in SKIP_PATTERNS:
                    continue

                # as_posix() keeps separators consistent with the "/" that
                # marks directories; str() would yield "src\\pkg/" on Windows.
                rel_path = item.relative_to(repo_path).as_posix()

                if item.is_dir():
                    files.append(rel_path + "/")
                    walk(item, depth + 1)
                else:
                    files.append(rel_path)
        except PermissionError:
            # Keep whatever was collected before access was denied
            pass

    walk(repo_path)
    return files
388
+