skill_seekers-2.7.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skill_seekers/__init__.py +22 -0
- skill_seekers/cli/__init__.py +39 -0
- skill_seekers/cli/adaptors/__init__.py +120 -0
- skill_seekers/cli/adaptors/base.py +221 -0
- skill_seekers/cli/adaptors/claude.py +485 -0
- skill_seekers/cli/adaptors/gemini.py +453 -0
- skill_seekers/cli/adaptors/markdown.py +269 -0
- skill_seekers/cli/adaptors/openai.py +503 -0
- skill_seekers/cli/ai_enhancer.py +310 -0
- skill_seekers/cli/api_reference_builder.py +373 -0
- skill_seekers/cli/architectural_pattern_detector.py +525 -0
- skill_seekers/cli/code_analyzer.py +1462 -0
- skill_seekers/cli/codebase_scraper.py +1225 -0
- skill_seekers/cli/config_command.py +563 -0
- skill_seekers/cli/config_enhancer.py +431 -0
- skill_seekers/cli/config_extractor.py +871 -0
- skill_seekers/cli/config_manager.py +452 -0
- skill_seekers/cli/config_validator.py +394 -0
- skill_seekers/cli/conflict_detector.py +528 -0
- skill_seekers/cli/constants.py +72 -0
- skill_seekers/cli/dependency_analyzer.py +757 -0
- skill_seekers/cli/doc_scraper.py +2332 -0
- skill_seekers/cli/enhance_skill.py +488 -0
- skill_seekers/cli/enhance_skill_local.py +1096 -0
- skill_seekers/cli/enhance_status.py +194 -0
- skill_seekers/cli/estimate_pages.py +433 -0
- skill_seekers/cli/generate_router.py +1209 -0
- skill_seekers/cli/github_fetcher.py +534 -0
- skill_seekers/cli/github_scraper.py +1466 -0
- skill_seekers/cli/guide_enhancer.py +723 -0
- skill_seekers/cli/how_to_guide_builder.py +1267 -0
- skill_seekers/cli/install_agent.py +461 -0
- skill_seekers/cli/install_skill.py +178 -0
- skill_seekers/cli/language_detector.py +614 -0
- skill_seekers/cli/llms_txt_detector.py +60 -0
- skill_seekers/cli/llms_txt_downloader.py +104 -0
- skill_seekers/cli/llms_txt_parser.py +150 -0
- skill_seekers/cli/main.py +558 -0
- skill_seekers/cli/markdown_cleaner.py +132 -0
- skill_seekers/cli/merge_sources.py +806 -0
- skill_seekers/cli/package_multi.py +77 -0
- skill_seekers/cli/package_skill.py +241 -0
- skill_seekers/cli/pattern_recognizer.py +1825 -0
- skill_seekers/cli/pdf_extractor_poc.py +1166 -0
- skill_seekers/cli/pdf_scraper.py +617 -0
- skill_seekers/cli/quality_checker.py +519 -0
- skill_seekers/cli/rate_limit_handler.py +438 -0
- skill_seekers/cli/resume_command.py +160 -0
- skill_seekers/cli/run_tests.py +230 -0
- skill_seekers/cli/setup_wizard.py +93 -0
- skill_seekers/cli/split_config.py +390 -0
- skill_seekers/cli/swift_patterns.py +560 -0
- skill_seekers/cli/test_example_extractor.py +1081 -0
- skill_seekers/cli/test_unified_simple.py +179 -0
- skill_seekers/cli/unified_codebase_analyzer.py +572 -0
- skill_seekers/cli/unified_scraper.py +932 -0
- skill_seekers/cli/unified_skill_builder.py +1605 -0
- skill_seekers/cli/upload_skill.py +162 -0
- skill_seekers/cli/utils.py +432 -0
- skill_seekers/mcp/__init__.py +33 -0
- skill_seekers/mcp/agent_detector.py +316 -0
- skill_seekers/mcp/git_repo.py +273 -0
- skill_seekers/mcp/server.py +231 -0
- skill_seekers/mcp/server_fastmcp.py +1249 -0
- skill_seekers/mcp/server_legacy.py +2302 -0
- skill_seekers/mcp/source_manager.py +285 -0
- skill_seekers/mcp/tools/__init__.py +115 -0
- skill_seekers/mcp/tools/config_tools.py +251 -0
- skill_seekers/mcp/tools/packaging_tools.py +826 -0
- skill_seekers/mcp/tools/scraping_tools.py +842 -0
- skill_seekers/mcp/tools/source_tools.py +828 -0
- skill_seekers/mcp/tools/splitting_tools.py +212 -0
- skill_seekers/py.typed +0 -0
- skill_seekers-2.7.3.dist-info/METADATA +2027 -0
- skill_seekers-2.7.3.dist-info/RECORD +79 -0
- skill_seekers-2.7.3.dist-info/WHEEL +5 -0
- skill_seekers-2.7.3.dist-info/entry_points.txt +19 -0
- skill_seekers-2.7.3.dist-info/licenses/LICENSE +21 -0
- skill_seekers-2.7.3.dist-info/top_level.txt +1 -0
skill_seekers/cli/github_fetcher.py
@@ -0,0 +1,534 @@
"""
GitHub Three-Stream Fetcher

Fetches from GitHub and splits into 3 streams:
- Stream 1: Code (for C3.x analysis)
- Stream 2: Documentation (README, CONTRIBUTING, docs/*.md)
- Stream 3: Insights (issues, metadata)

This is the foundation of the unified codebase analyzer architecture.
"""

import os
import subprocess
import tempfile
from collections import Counter
from dataclasses import dataclass
from pathlib import Path

import requests

from .config_manager import get_config_manager
from .rate_limit_handler import RateLimitError, RateLimitHandler, create_github_headers


@dataclass
class CodeStream:
    """Code files for C3.x analysis."""

    directory: Path
    files: list[Path]


@dataclass
class DocsStream:
    """Documentation files from repository."""

    readme: str | None
    contributing: str | None
    docs_files: list[dict]  # [{"path": "docs/oauth.md", "content": "..."}]


@dataclass
class InsightsStream:
    """GitHub metadata and issues."""

    metadata: dict  # stars, forks, language, etc.
    common_problems: list[dict]
    known_solutions: list[dict]
    top_labels: list[dict]


@dataclass
class ThreeStreamData:
    """Complete output from GitHub fetcher."""

    code_stream: CodeStream
    docs_stream: DocsStream
    insights_stream: InsightsStream


class GitHubThreeStreamFetcher:
    """
    Fetch from GitHub and split into 3 streams.

    Usage:
        fetcher = GitHubThreeStreamFetcher(
            repo_url="https://github.com/facebook/react",
            github_token=os.getenv('GITHUB_TOKEN')
        )

        three_streams = fetcher.fetch()

        # Now you have:
        # - three_streams.code_stream (for C3.x)
        # - three_streams.docs_stream (for doc parser)
        # - three_streams.insights_stream (for issue analyzer)
    """

    def __init__(
        self,
        repo_url: str,
        github_token: str | None = None,
        interactive: bool = True,
        profile_name: str | None = None,
    ):
        """
        Initialize fetcher.

        Args:
            repo_url: GitHub repository URL (e.g., https://github.com/owner/repo)
            github_token: Optional GitHub API token for higher rate limits
            interactive: Whether to show interactive prompts (False for CI/CD)
            profile_name: Name of the GitHub profile being used
        """
        self.repo_url = repo_url
        self.github_token = github_token or os.getenv("GITHUB_TOKEN")
        self.owner, self.repo = self.parse_repo_url(repo_url)
        self.interactive = interactive

        # Initialize rate limit handler
        config = get_config_manager()
        if not profile_name and self.github_token:
            profile_name = config.get_profile_for_token(self.github_token)

        self.rate_limiter = RateLimitHandler(
            token=self.github_token, interactive=interactive, profile_name=profile_name
        )

    def parse_repo_url(self, url: str) -> tuple[str, str]:
        """
        Parse GitHub URL to extract owner and repo.

        Args:
            url: GitHub URL (https://github.com/owner/repo or git@github.com:owner/repo.git)

        Returns:
            Tuple of (owner, repo)
        """
        # Remove .git suffix if present
        if url.endswith(".git"):
            url = url[:-4]  # Remove last 4 characters (.git)

        # Handle git@ URLs (SSH format)
        if url.startswith("git@github.com:"):
            parts = url.replace("git@github.com:", "").split("/")
            if len(parts) >= 2:
                return parts[0], parts[1]

        # Handle HTTPS URLs
        if "github.com/" in url:
            parts = url.split("github.com/")[-1].split("/")
            if len(parts) >= 2:
                return parts[0], parts[1]

        raise ValueError(f"Invalid GitHub URL: {url}")

    def fetch(self, output_dir: Path = None) -> ThreeStreamData:
        """
        Fetch everything and split into 3 streams.

        Args:
            output_dir: Directory to clone repository to (default: /tmp)

        Returns:
            ThreeStreamData with all 3 streams

        Raises:
            RateLimitError: If rate limit cannot be handled
        """
        # Check rate limit upfront
        if not self.rate_limiter.check_upfront():
            raise RateLimitError("Rate limit check failed during startup")

        if output_dir is None:
            output_dir = Path(tempfile.mkdtemp(prefix="github_fetch_"))

        print(f"📦 Cloning {self.repo_url}...")
        local_path = self.clone_repo(output_dir)

        print("🔍 Fetching GitHub metadata...")
        metadata = self.fetch_github_metadata()

        print("🐛 Fetching issues...")
        issues = self.fetch_issues(max_issues=100)

        print("📂 Classifying files...")
        code_files, doc_files = self.classify_files(local_path)
        print(f" - Code: {len(code_files)} files")
        print(f" - Docs: {len(doc_files)} files")

        print(f"📊 Analyzing {len(issues)} issues...")
        issue_insights = self.analyze_issues(issues)

        # Build three streams
        return ThreeStreamData(
            code_stream=CodeStream(directory=local_path, files=code_files),
            docs_stream=DocsStream(
                readme=self.read_file(local_path / "README.md"),
                contributing=self.read_file(local_path / "CONTRIBUTING.md"),
                docs_files=[
                    {"path": str(f.relative_to(local_path)), "content": self.read_file(f)}
                    for f in doc_files
                    if f.name not in ["README.md", "CONTRIBUTING.md"]
                ],
            ),
            insights_stream=InsightsStream(
                metadata=metadata,
                common_problems=issue_insights["common_problems"],
                known_solutions=issue_insights["known_solutions"],
                top_labels=issue_insights["top_labels"],
            ),
        )

    def clone_repo(self, output_dir: Path) -> Path:
        """
        Clone repository to local directory.

        Args:
            output_dir: Parent directory for clone

        Returns:
            Path to cloned repository
        """
        repo_dir = output_dir / self.repo
        repo_dir.mkdir(parents=True, exist_ok=True)

        # Clone with depth 1 for speed
        cmd = ["git", "clone", "--depth", "1", self.repo_url, str(repo_dir)]
        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode != 0:
            raise RuntimeError(f"Failed to clone repository: {result.stderr}")

        return repo_dir

    def fetch_github_metadata(self) -> dict:
        """
        Fetch repo metadata via GitHub API.

        Returns:
            Dict with stars, forks, language, open_issues, etc.

        Raises:
            RateLimitError: If rate limit cannot be handled
        """
        url = f"https://api.github.com/repos/{self.owner}/{self.repo}"
        headers = create_github_headers(self.github_token)

        try:
            response = requests.get(url, headers=headers, timeout=10)

            # Check for rate limit
            if not self.rate_limiter.check_response(response):
                raise RateLimitError("Rate limit exceeded and cannot continue")

            response.raise_for_status()
            data = response.json()

            return {
                "stars": data.get("stargazers_count", 0),
                "forks": data.get("forks_count", 0),
                "open_issues": data.get("open_issues_count", 0),
                "language": data.get("language", "Unknown"),
                "description": data.get("description", ""),
                "homepage": data.get("homepage", ""),
                "created_at": data.get("created_at", ""),
                "updated_at": data.get("updated_at", ""),
                "html_url": data.get("html_url", ""),  # NEW: Repository URL
                "license": data.get("license", {}),  # NEW: License info
            }
        except RateLimitError:
            raise
        except Exception as e:
            print(f"⚠️ Failed to fetch metadata: {e}")
            return {
                "stars": 0,
                "forks": 0,
                "open_issues": 0,
                "language": "Unknown",
                "description": "",
                "homepage": "",
                "created_at": "",
                "updated_at": "",
                "html_url": "",  # NEW: Repository URL
                "license": {},  # NEW: License info
            }

    def fetch_issues(self, max_issues: int = 100) -> list[dict]:
        """
        Fetch GitHub issues (open + closed).

        Args:
            max_issues: Maximum number of issues to fetch

        Returns:
            List of issue dicts
        """
        all_issues = []

        # Fetch open issues
        all_issues.extend(self._fetch_issues_page(state="open", max_count=max_issues // 2))

        # Fetch closed issues
        all_issues.extend(self._fetch_issues_page(state="closed", max_count=max_issues // 2))

        return all_issues

    def _fetch_issues_page(self, state: str, max_count: int) -> list[dict]:
        """
        Fetch one page of issues.

        Args:
            state: 'open' or 'closed'
            max_count: Maximum issues to fetch

        Returns:
            List of issues

        Raises:
            RateLimitError: If rate limit cannot be handled
        """
        url = f"https://api.github.com/repos/{self.owner}/{self.repo}/issues"
        headers = create_github_headers(self.github_token)

        params = {
            "state": state,
            "per_page": min(max_count, 100),  # GitHub API limit
            "sort": "comments",
            "direction": "desc",
        }

        try:
            response = requests.get(url, headers=headers, params=params, timeout=10)

            # Check for rate limit
            if not self.rate_limiter.check_response(response):
                raise RateLimitError("Rate limit exceeded and cannot continue")

            response.raise_for_status()
            issues = response.json()

            # Filter out pull requests (they appear in issues endpoint)
            issues = [issue for issue in issues if "pull_request" not in issue]

            return issues
        except RateLimitError:
            raise
        except Exception as e:
            print(f"⚠️ Failed to fetch {state} issues: {e}")
            return []

    def classify_files(self, repo_path: Path) -> tuple[list[Path], list[Path]]:
        """
        Split files into code vs documentation.

        Code patterns:
        - *.py, *.js, *.ts, *.go, *.rs, *.java, etc.
        - In src/, lib/, pkg/, etc.

        Doc patterns:
        - README.md, CONTRIBUTING.md, CHANGELOG.md
        - docs/**/*.md, doc/**/*.md
        - *.rst (reStructuredText)

        Args:
            repo_path: Path to repository

        Returns:
            Tuple of (code_files, doc_files)
        """
        code_files = []
        doc_files = []

        # Documentation patterns
        doc_patterns = [
            "**/README.md",
            "**/CONTRIBUTING.md",
            "**/CHANGELOG.md",
            "**/LICENSE.md",
            "docs/*.md",  # Files directly in docs/
            "docs/**/*.md",  # Files in subdirectories of docs/
            "doc/*.md",  # Files directly in doc/
            "doc/**/*.md",  # Files in subdirectories of doc/
            "documentation/*.md",  # Files directly in documentation/
            "documentation/**/*.md",  # Files in subdirectories of documentation/
            "**/*.rst",
        ]

        # Code extensions
        code_extensions = [
            ".py",
            ".js",
            ".ts",
            ".jsx",
            ".tsx",
            ".go",
            ".rs",
            ".java",
            ".kt",
            ".c",
            ".cpp",
            ".h",
            ".hpp",
            ".rb",
            ".php",
            ".swift",
            ".cs",
            ".scala",
            ".clj",
            ".cljs",
        ]

        # Directories to exclude
        exclude_dirs = [
            "node_modules",
            "__pycache__",
            "venv",
            ".venv",
            ".git",
            "build",
            "dist",
            ".tox",
            ".pytest_cache",
            "htmlcov",
            ".mypy_cache",
            ".eggs",
            "*.egg-info",
        ]

        for file_path in repo_path.rglob("*"):
            if not file_path.is_file():
                continue

            # Check excluded directories first
            if any(exclude in str(file_path) for exclude in exclude_dirs):
                continue

            # Skip hidden files (but allow docs in docs/ directories)
            is_in_docs_dir = any(
                pattern in str(file_path) for pattern in ["docs/", "doc/", "documentation/"]
            )
            if any(part.startswith(".") for part in file_path.parts) and not is_in_docs_dir:
                continue

            # Check if documentation
            is_doc = any(file_path.match(pattern) for pattern in doc_patterns)

            if is_doc:
                doc_files.append(file_path)
            elif file_path.suffix in code_extensions:
                code_files.append(file_path)

        return code_files, doc_files

    def analyze_issues(self, issues: list[dict]) -> dict:
        """
        Analyze GitHub issues to extract insights.

        Returns:
            {
                "common_problems": [
                    {
                        "title": "OAuth setup fails",
                        "number": 42,
                        "labels": ["question", "oauth"],
                        "comments": 15,
                        "state": "open"
                    },
                    ...
                ],
                "known_solutions": [
                    {
                        "title": "Fixed OAuth redirect",
                        "number": 35,
                        "labels": ["bug", "oauth"],
                        "comments": 8,
                        "state": "closed"
                    },
                    ...
                ],
                "top_labels": [
                    {"label": "question", "count": 23},
                    {"label": "bug", "count": 15},
                    ...
                ]
            }
        """
        common_problems = []
        known_solutions = []
        all_labels = []

        for issue in issues:
            # Handle both string labels and dict labels (GitHub API format)
            raw_labels = issue.get("labels", [])
            labels = []
            for label in raw_labels:
                if isinstance(label, dict):
                    labels.append(label.get("name", ""))
                else:
                    labels.append(str(label))
            all_labels.extend(labels)

            issue_data = {
                "title": issue.get("title", ""),
                "number": issue.get("number", 0),
                "labels": labels,
                "comments": issue.get("comments", 0),
                "state": issue.get("state", "unknown"),
            }

            # Open issues with many comments = common problems
            if issue["state"] == "open" and issue.get("comments", 0) >= 5:
                common_problems.append(issue_data)

            # Closed issues with comments = known solutions
            elif issue["state"] == "closed" and issue.get("comments", 0) > 0:
                known_solutions.append(issue_data)

        # Count label frequency
        label_counts = Counter(all_labels)

        return {
            "common_problems": sorted(common_problems, key=lambda x: x["comments"], reverse=True)[
                :10
            ],
            "known_solutions": sorted(known_solutions, key=lambda x: x["comments"], reverse=True)[
                :10
            ],
            "top_labels": [
                {"label": label, "count": count} for label, count in label_counts.most_common(10)
            ],
        }

    def read_file(self, file_path: Path) -> str | None:
        """
        Read file content safely.

        Args:
            file_path: Path to file

        Returns:
            File content or None if file doesn't exist or can't be read
        """
        if not file_path.exists():
            return None

        try:
            return file_path.read_text(encoding="utf-8")
        except Exception:
            # Try with different encoding
            try:
                return file_path.read_text(encoding="latin-1")
            except Exception:
                return None