skill-seekers 2.7.3 (skill_seekers-2.7.3-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. skill_seekers/__init__.py +22 -0
  2. skill_seekers/cli/__init__.py +39 -0
  3. skill_seekers/cli/adaptors/__init__.py +120 -0
  4. skill_seekers/cli/adaptors/base.py +221 -0
  5. skill_seekers/cli/adaptors/claude.py +485 -0
  6. skill_seekers/cli/adaptors/gemini.py +453 -0
  7. skill_seekers/cli/adaptors/markdown.py +269 -0
  8. skill_seekers/cli/adaptors/openai.py +503 -0
  9. skill_seekers/cli/ai_enhancer.py +310 -0
  10. skill_seekers/cli/api_reference_builder.py +373 -0
  11. skill_seekers/cli/architectural_pattern_detector.py +525 -0
  12. skill_seekers/cli/code_analyzer.py +1462 -0
  13. skill_seekers/cli/codebase_scraper.py +1225 -0
  14. skill_seekers/cli/config_command.py +563 -0
  15. skill_seekers/cli/config_enhancer.py +431 -0
  16. skill_seekers/cli/config_extractor.py +871 -0
  17. skill_seekers/cli/config_manager.py +452 -0
  18. skill_seekers/cli/config_validator.py +394 -0
  19. skill_seekers/cli/conflict_detector.py +528 -0
  20. skill_seekers/cli/constants.py +72 -0
  21. skill_seekers/cli/dependency_analyzer.py +757 -0
  22. skill_seekers/cli/doc_scraper.py +2332 -0
  23. skill_seekers/cli/enhance_skill.py +488 -0
  24. skill_seekers/cli/enhance_skill_local.py +1096 -0
  25. skill_seekers/cli/enhance_status.py +194 -0
  26. skill_seekers/cli/estimate_pages.py +433 -0
  27. skill_seekers/cli/generate_router.py +1209 -0
  28. skill_seekers/cli/github_fetcher.py +534 -0
  29. skill_seekers/cli/github_scraper.py +1466 -0
  30. skill_seekers/cli/guide_enhancer.py +723 -0
  31. skill_seekers/cli/how_to_guide_builder.py +1267 -0
  32. skill_seekers/cli/install_agent.py +461 -0
  33. skill_seekers/cli/install_skill.py +178 -0
  34. skill_seekers/cli/language_detector.py +614 -0
  35. skill_seekers/cli/llms_txt_detector.py +60 -0
  36. skill_seekers/cli/llms_txt_downloader.py +104 -0
  37. skill_seekers/cli/llms_txt_parser.py +150 -0
  38. skill_seekers/cli/main.py +558 -0
  39. skill_seekers/cli/markdown_cleaner.py +132 -0
  40. skill_seekers/cli/merge_sources.py +806 -0
  41. skill_seekers/cli/package_multi.py +77 -0
  42. skill_seekers/cli/package_skill.py +241 -0
  43. skill_seekers/cli/pattern_recognizer.py +1825 -0
  44. skill_seekers/cli/pdf_extractor_poc.py +1166 -0
  45. skill_seekers/cli/pdf_scraper.py +617 -0
  46. skill_seekers/cli/quality_checker.py +519 -0
  47. skill_seekers/cli/rate_limit_handler.py +438 -0
  48. skill_seekers/cli/resume_command.py +160 -0
  49. skill_seekers/cli/run_tests.py +230 -0
  50. skill_seekers/cli/setup_wizard.py +93 -0
  51. skill_seekers/cli/split_config.py +390 -0
  52. skill_seekers/cli/swift_patterns.py +560 -0
  53. skill_seekers/cli/test_example_extractor.py +1081 -0
  54. skill_seekers/cli/test_unified_simple.py +179 -0
  55. skill_seekers/cli/unified_codebase_analyzer.py +572 -0
  56. skill_seekers/cli/unified_scraper.py +932 -0
  57. skill_seekers/cli/unified_skill_builder.py +1605 -0
  58. skill_seekers/cli/upload_skill.py +162 -0
  59. skill_seekers/cli/utils.py +432 -0
  60. skill_seekers/mcp/__init__.py +33 -0
  61. skill_seekers/mcp/agent_detector.py +316 -0
  62. skill_seekers/mcp/git_repo.py +273 -0
  63. skill_seekers/mcp/server.py +231 -0
  64. skill_seekers/mcp/server_fastmcp.py +1249 -0
  65. skill_seekers/mcp/server_legacy.py +2302 -0
  66. skill_seekers/mcp/source_manager.py +285 -0
  67. skill_seekers/mcp/tools/__init__.py +115 -0
  68. skill_seekers/mcp/tools/config_tools.py +251 -0
  69. skill_seekers/mcp/tools/packaging_tools.py +826 -0
  70. skill_seekers/mcp/tools/scraping_tools.py +842 -0
  71. skill_seekers/mcp/tools/source_tools.py +828 -0
  72. skill_seekers/mcp/tools/splitting_tools.py +212 -0
  73. skill_seekers/py.typed +0 -0
  74. skill_seekers-2.7.3.dist-info/METADATA +2027 -0
  75. skill_seekers-2.7.3.dist-info/RECORD +79 -0
  76. skill_seekers-2.7.3.dist-info/WHEEL +5 -0
  77. skill_seekers-2.7.3.dist-info/entry_points.txt +19 -0
  78. skill_seekers-2.7.3.dist-info/licenses/LICENSE +21 -0
  79. skill_seekers-2.7.3.dist-info/top_level.txt +1 -0
skill_seekers/cli/github_fetcher.py
@@ -0,0 +1,534 @@
+ """
+ GitHub Three-Stream Fetcher
+
+ Fetches from GitHub and splits into 3 streams:
+ - Stream 1: Code (for C3.x analysis)
+ - Stream 2: Documentation (README, CONTRIBUTING, docs/*.md)
+ - Stream 3: Insights (issues, metadata)
+
+ This is the foundation of the unified codebase analyzer architecture.
+ """
+
+ import os
+ import subprocess
+ import tempfile
+ from collections import Counter
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ import requests
+
+ from .config_manager import get_config_manager
+ from .rate_limit_handler import RateLimitError, RateLimitHandler, create_github_headers
+
+
+ @dataclass
+ class CodeStream:
+     """Code files for C3.x analysis."""
+
+     directory: Path
+     files: list[Path]
+
+
+ @dataclass
+ class DocsStream:
+     """Documentation files from repository."""
+
+     readme: str | None
+     contributing: str | None
+     docs_files: list[dict]  # [{"path": "docs/oauth.md", "content": "..."}]
+
+
+ @dataclass
+ class InsightsStream:
+     """GitHub metadata and issues."""
+
+     metadata: dict  # stars, forks, language, etc.
+     common_problems: list[dict]
+     known_solutions: list[dict]
+     top_labels: list[dict]
+
+
+ @dataclass
+ class ThreeStreamData:
+     """Complete output from GitHub fetcher."""
+
+     code_stream: CodeStream
+     docs_stream: DocsStream
+     insights_stream: InsightsStream
+
+
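These four dataclasses are plain containers. A minimal sketch of how they compose (values are made up; the names are assumed importable from the module above):

    streams = ThreeStreamData(
        code_stream=CodeStream(directory=Path("/tmp/react"), files=[Path("/tmp/react/src/index.js")]),
        docs_stream=DocsStream(readme="# React", contributing=None, docs_files=[]),
        insights_stream=InsightsStream(metadata={"stars": 0}, common_problems=[], known_solutions=[], top_labels=[]),
    )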
+ class GitHubThreeStreamFetcher:
+     """
+     Fetch from GitHub and split into 3 streams.
+
+     Usage:
+         fetcher = GitHubThreeStreamFetcher(
+             repo_url="https://github.com/facebook/react",
+             github_token=os.getenv('GITHUB_TOKEN')
+         )
+
+         three_streams = fetcher.fetch()
+
+         # Now you have:
+         # - three_streams.code_stream (for C3.x)
+         # - three_streams.docs_stream (for doc parser)
+         # - three_streams.insights_stream (for issue analyzer)
+     """
+
+     def __init__(
+         self,
+         repo_url: str,
+         github_token: str | None = None,
+         interactive: bool = True,
+         profile_name: str | None = None,
+     ):
+         """
+         Initialize fetcher.
+
+         Args:
+             repo_url: GitHub repository URL (e.g., https://github.com/owner/repo)
+             github_token: Optional GitHub API token for higher rate limits
+             interactive: Whether to show interactive prompts (False for CI/CD)
+             profile_name: Name of the GitHub profile being used
+         """
+         self.repo_url = repo_url
+         self.github_token = github_token or os.getenv("GITHUB_TOKEN")
+         self.owner, self.repo = self.parse_repo_url(repo_url)
+         self.interactive = interactive
+
+         # Initialize rate limit handler
+         config = get_config_manager()
+         if not profile_name and self.github_token:
+             profile_name = config.get_profile_for_token(self.github_token)
+
+         self.rate_limiter = RateLimitHandler(
+             token=self.github_token, interactive=interactive, profile_name=profile_name
+         )
+
+     def parse_repo_url(self, url: str) -> tuple[str, str]:
+         """
+         Parse GitHub URL to extract owner and repo.
+
+         Args:
+             url: GitHub URL (https://github.com/owner/repo or git@github.com:owner/repo.git)
+
+         Returns:
+             Tuple of (owner, repo)
+         """
+         # Remove .git suffix if present
+         if url.endswith(".git"):
+             url = url[:-4]  # Remove last 4 characters (.git)
+
+         # Handle git@ URLs (SSH format)
+         if url.startswith("git@github.com:"):
+             parts = url.replace("git@github.com:", "").split("/")
+             if len(parts) >= 2:
+                 return parts[0], parts[1]
+
+         # Handle HTTPS URLs
+         if "github.com/" in url:
+             parts = url.split("github.com/")[-1].split("/")
+             if len(parts) >= 2:
+                 return parts[0], parts[1]
+
+         raise ValueError(f"Invalid GitHub URL: {url}")
+
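Both accepted URL shapes normalize to the same (owner, repo) pair. A quick sketch with a real-looking repo (note that constructing the fetcher also initializes the config manager and rate-limit handler, so this assumes those can start without a token):

    fetcher = GitHubThreeStreamFetcher("https://github.com/facebook/react")
    assert (fetcher.owner, fetcher.repo) == ("facebook", "react")
    # The SSH form parses to the same pair:
    assert fetcher.parse_repo_url("git@github.com:facebook/react.git") == ("facebook", "react")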
+     def fetch(self, output_dir: Path | None = None) -> ThreeStreamData:
+         """
+         Fetch everything and split into 3 streams.
+
+         Args:
+             output_dir: Directory to clone repository to (default: a fresh temp directory)
+
+         Returns:
+             ThreeStreamData with all 3 streams
+
+         Raises:
+             RateLimitError: If rate limit cannot be handled
+         """
+         # Check rate limit upfront
+         if not self.rate_limiter.check_upfront():
+             raise RateLimitError("Rate limit check failed during startup")
+
+         if output_dir is None:
+             output_dir = Path(tempfile.mkdtemp(prefix="github_fetch_"))
+
+         print(f"📦 Cloning {self.repo_url}...")
+         local_path = self.clone_repo(output_dir)
+
+         print("🔍 Fetching GitHub metadata...")
+         metadata = self.fetch_github_metadata()
+
+         print("🐛 Fetching issues...")
+         issues = self.fetch_issues(max_issues=100)
+
+         print("📂 Classifying files...")
+         code_files, doc_files = self.classify_files(local_path)
+         print(f"   - Code: {len(code_files)} files")
+         print(f"   - Docs: {len(doc_files)} files")
+
+         print(f"📊 Analyzing {len(issues)} issues...")
+         issue_insights = self.analyze_issues(issues)
+
+         # Build three streams
+         return ThreeStreamData(
+             code_stream=CodeStream(directory=local_path, files=code_files),
+             docs_stream=DocsStream(
+                 readme=self.read_file(local_path / "README.md"),
+                 contributing=self.read_file(local_path / "CONTRIBUTING.md"),
+                 docs_files=[
+                     {"path": str(f.relative_to(local_path)), "content": self.read_file(f)}
+                     for f in doc_files
+                     if f.name not in ["README.md", "CONTRIBUTING.md"]
+                 ],
+             ),
+             insights_stream=InsightsStream(
+                 metadata=metadata,
+                 common_problems=issue_insights["common_problems"],
+                 known_solutions=issue_insights["known_solutions"],
+                 top_labels=issue_insights["top_labels"],
+             ),
+         )
+
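A usage sketch for fetch(), mirroring the class docstring; it assumes network access, git on PATH, and (optionally) a token in GITHUB_TOKEN:

    fetcher = GitHubThreeStreamFetcher(
        repo_url="https://github.com/facebook/react",
        github_token=os.getenv("GITHUB_TOKEN"),
        interactive=False,  # suppress prompts, e.g. in CI
    )
    streams = fetcher.fetch()
    print(len(streams.code_stream.files), "code files")
    print(streams.insights_stream.metadata.get("stars"), "stars")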
+     def clone_repo(self, output_dir: Path) -> Path:
+         """
+         Clone repository to local directory.
+
+         Args:
+             output_dir: Parent directory for clone
+
+         Returns:
+             Path to cloned repository
+         """
+         repo_dir = output_dir / self.repo
+         repo_dir.mkdir(parents=True, exist_ok=True)
+
+         # Clone with depth 1 for speed
+         cmd = ["git", "clone", "--depth", "1", self.repo_url, str(repo_dir)]
+         result = subprocess.run(cmd, capture_output=True, text=True)
+
+         if result.returncode != 0:
+             raise RuntimeError(f"Failed to clone repository: {result.stderr}")
+
+         return repo_dir
+
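The clone is shallow (--depth 1): fast, but with no history. Also note that fetch() defaults output_dir to a mkdtemp() directory that is never deleted, so callers own cleanup; a sketch (fetcher as in the earlier sketches):

    import shutil
    import tempfile

    tmp = Path(tempfile.mkdtemp(prefix="github_fetch_"))
    try:
        repo_dir = fetcher.clone_repo(tmp)
        print(sum(1 for p in repo_dir.rglob("*") if p.is_file()), "files checked out")
    finally:
        shutil.rmtree(tmp, ignore_errors=True)  # mkdtemp dirs are not auto-removed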
+     def fetch_github_metadata(self) -> dict:
+         """
+         Fetch repo metadata via GitHub API.
+
+         Returns:
+             Dict with stars, forks, language, open_issues, etc.
+
+         Raises:
+             RateLimitError: If rate limit cannot be handled
+         """
+         url = f"https://api.github.com/repos/{self.owner}/{self.repo}"
+         headers = create_github_headers(self.github_token)
+
+         try:
+             response = requests.get(url, headers=headers, timeout=10)
+
+             # Check for rate limit
+             if not self.rate_limiter.check_response(response):
+                 raise RateLimitError("Rate limit exceeded and cannot continue")
+
+             response.raise_for_status()
+             data = response.json()
+
+             return {
+                 "stars": data.get("stargazers_count", 0),
+                 "forks": data.get("forks_count", 0),
+                 "open_issues": data.get("open_issues_count", 0),
+                 "language": data.get("language", "Unknown"),
+                 "description": data.get("description", ""),
+                 "homepage": data.get("homepage", ""),
+                 "created_at": data.get("created_at", ""),
+                 "updated_at": data.get("updated_at", ""),
+                 "html_url": data.get("html_url", ""),  # NEW: Repository URL
+                 "license": data.get("license") or {},  # NEW: License info (the API returns null when absent)
+             }
+         except RateLimitError:
+             raise
+         except Exception as e:
+             print(f"⚠️ Failed to fetch metadata: {e}")
+             return {
+                 "stars": 0,
+                 "forks": 0,
+                 "open_issues": 0,
+                 "language": "Unknown",
+                 "description": "",
+                 "homepage": "",
+                 "created_at": "",
+                 "updated_at": "",
+                 "html_url": "",  # NEW: Repository URL
+                 "license": {},  # NEW: License info
+             }
+
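The happy path returns a flat dict, so downstream code can index it directly; a sketch with hypothetical values (fetcher as above):

    meta = fetcher.fetch_github_metadata()
    # e.g. {"stars": 231000, "forks": 47000, "language": "JavaScript",
    #       "html_url": "https://github.com/facebook/react",
    #       "license": {"key": "mit", "name": "MIT License"}, ...}
    print(f"{meta['stars']:,} stars, license: {meta['license'].get('name', 'n/a')}")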
+     def fetch_issues(self, max_issues: int = 100) -> list[dict]:
+         """
+         Fetch GitHub issues (open + closed).
+
+         Args:
+             max_issues: Maximum number of issues to fetch
+
+         Returns:
+             List of issue dicts
+         """
+         all_issues = []
+
+         # Fetch open issues
+         all_issues.extend(self._fetch_issues_page(state="open", max_count=max_issues // 2))
+
+         # Fetch closed issues
+         all_issues.extend(self._fetch_issues_page(state="closed", max_count=max_issues // 2))
+
+         return all_issues
+
+     def _fetch_issues_page(self, state: str, max_count: int) -> list[dict]:
+         """
+         Fetch one page of issues.
+
+         Args:
+             state: 'open' or 'closed'
+             max_count: Maximum issues to fetch
+
+         Returns:
+             List of issues
+
+         Raises:
+             RateLimitError: If rate limit cannot be handled
+         """
+         url = f"https://api.github.com/repos/{self.owner}/{self.repo}/issues"
+         headers = create_github_headers(self.github_token)
+
+         params = {
+             "state": state,
+             "per_page": min(max_count, 100),  # GitHub API limit
+             "sort": "comments",
+             "direction": "desc",
+         }
+
+         try:
+             response = requests.get(url, headers=headers, params=params, timeout=10)
+
+             # Check for rate limit
+             if not self.rate_limiter.check_response(response):
+                 raise RateLimitError("Rate limit exceeded and cannot continue")
+
+             response.raise_for_status()
+             issues = response.json()
+
+             # Filter out pull requests (they appear in issues endpoint)
+             issues = [issue for issue in issues if "pull_request" not in issue]
+
+             return issues
+         except RateLimitError:
+             raise
+         except Exception as e:
+             print(f"⚠️ Failed to fetch {state} issues: {e}")
+             return []
+
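The filter near the end exists because GitHub's /issues endpoint also returns pull requests, distinguishable by a pull_request key. A self-contained sketch of just that behavior on synthetic data:

    raw = [
        {"number": 1, "title": "Bug report"},
        {"number": 2, "title": "Fix bug", "pull_request": {"url": "..."}},
    ]
    issues_only = [item for item in raw if "pull_request" not in item]
    assert [item["number"] for item in issues_only] == [1]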
+     def classify_files(self, repo_path: Path) -> tuple[list[Path], list[Path]]:
+         """
+         Split files into code vs documentation.
+
+         Code patterns:
+         - *.py, *.js, *.ts, *.go, *.rs, *.java, etc.
+         - In src/, lib/, pkg/, etc.
+
+         Doc patterns:
+         - README.md, CONTRIBUTING.md, CHANGELOG.md
+         - docs/**/*.md, doc/**/*.md
+         - *.rst (reStructuredText)
+
+         Args:
+             repo_path: Path to repository
+
+         Returns:
+             Tuple of (code_files, doc_files)
+         """
+         code_files = []
+         doc_files = []
+
+         # Documentation patterns
+         doc_patterns = [
+             "**/README.md",
+             "**/CONTRIBUTING.md",
+             "**/CHANGELOG.md",
+             "**/LICENSE.md",
+             "docs/*.md",  # Files directly in docs/
+             "docs/**/*.md",  # Files in subdirectories of docs/
+             "doc/*.md",  # Files directly in doc/
+             "doc/**/*.md",  # Files in subdirectories of doc/
+             "documentation/*.md",  # Files directly in documentation/
+             "documentation/**/*.md",  # Files in subdirectories of documentation/
+             "**/*.rst",
+         ]
+
+         # Code extensions
+         code_extensions = [
+             ".py",
+             ".js",
+             ".ts",
+             ".jsx",
+             ".tsx",
+             ".go",
+             ".rs",
+             ".java",
+             ".kt",
+             ".c",
+             ".cpp",
+             ".h",
+             ".hpp",
+             ".rb",
+             ".php",
+             ".swift",
+             ".cs",
+             ".scala",
+             ".clj",
+             ".cljs",
+         ]
+
+         # Directories to exclude
+         exclude_dirs = [
+             "node_modules",
+             "__pycache__",
+             "venv",
+             ".venv",
+             ".git",
+             "build",
+             "dist",
+             ".tox",
+             ".pytest_cache",
+             "htmlcov",
+             ".mypy_cache",
+             ".eggs",
+             ".egg-info",  # substring match below, so a "*" glob would never hit
+         ]
+
+         for file_path in repo_path.rglob("*"):
+             if not file_path.is_file():
+                 continue
+
+             # Check excluded directories first
+             if any(exclude in str(file_path) for exclude in exclude_dirs):
+                 continue
+
+             # Skip hidden files (but allow docs in docs/ directories)
+             is_in_docs_dir = any(
+                 pattern in str(file_path) for pattern in ["docs/", "doc/", "documentation/"]
+             )
+             if any(part.startswith(".") for part in file_path.parts) and not is_in_docs_dir:
+                 continue
+
+             # Check if documentation
+             is_doc = any(file_path.match(pattern) for pattern in doc_patterns)
+
+             if is_doc:
+                 doc_files.append(file_path)
+             elif file_path.suffix in code_extensions:
+                 code_files.append(file_path)
+
+         return code_files, doc_files
+
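classify_files reads no instance state, so any fetcher instance works; a runnable sketch of its contract on a tiny synthetic tree (hypothetical paths, fetcher as above):

    import tempfile

    root = Path(tempfile.mkdtemp())
    (root / "src").mkdir()
    (root / "src" / "app.py").write_text("print('hi')")
    (root / "docs").mkdir()
    (root / "docs" / "guide.md").write_text("# Guide")
    (root / "node_modules").mkdir()
    (root / "node_modules" / "x.js").write_text("x")

    code_files, doc_files = fetcher.classify_files(root)
    # app.py -> code (suffix match); guide.md -> docs (docs/*.md); x.js -> skipped (excluded dir)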
+     def analyze_issues(self, issues: list[dict]) -> dict:
+         """
+         Analyze GitHub issues to extract insights.
+
+         Returns:
+             {
+                 "common_problems": [
+                     {
+                         "title": "OAuth setup fails",
+                         "number": 42,
+                         "labels": ["question", "oauth"],
+                         "comments": 15,
+                         "state": "open"
+                     },
+                     ...
+                 ],
+                 "known_solutions": [
+                     {
+                         "title": "Fixed OAuth redirect",
+                         "number": 35,
+                         "labels": ["bug", "oauth"],
+                         "comments": 8,
+                         "state": "closed"
+                     },
+                     ...
+                 ],
+                 "top_labels": [
+                     {"label": "question", "count": 23},
+                     {"label": "bug", "count": 15},
+                     ...
+                 ]
+             }
+         """
+         common_problems = []
+         known_solutions = []
+         all_labels = []
+
+         for issue in issues:
+             # Handle both string labels and dict labels (GitHub API format)
+             raw_labels = issue.get("labels", [])
+             labels = []
+             for label in raw_labels:
+                 if isinstance(label, dict):
+                     labels.append(label.get("name", ""))
+                 else:
+                     labels.append(str(label))
+             all_labels.extend(labels)
+
+             issue_data = {
+                 "title": issue.get("title", ""),
+                 "number": issue.get("number", 0),
+                 "labels": labels,
+                 "comments": issue.get("comments", 0),
+                 "state": issue.get("state", "unknown"),
+             }
+
+             # Open issues with many comments = common problems
+             if issue.get("state") == "open" and issue.get("comments", 0) >= 5:
+                 common_problems.append(issue_data)
+
+             # Closed issues with comments = known solutions
+             elif issue.get("state") == "closed" and issue.get("comments", 0) > 0:
+                 known_solutions.append(issue_data)
+
+         # Count label frequency
+         label_counts = Counter(all_labels)
+
+         return {
+             "common_problems": sorted(common_problems, key=lambda x: x["comments"], reverse=True)[
+                 :10
+             ],
+             "known_solutions": sorted(known_solutions, key=lambda x: x["comments"], reverse=True)[
+                 :10
+             ],
+             "top_labels": [
+                 {"label": label, "count": count} for label, count in label_counts.most_common(10)
+             ],
+         }
+
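A runnable sketch on two synthetic issues, exercising both branches and both label formats the loop handles (dicts and strings); thresholds follow the code above:

    insights = fetcher.analyze_issues([
        {"title": "OAuth setup fails", "number": 42, "labels": [{"name": "question"}],
         "comments": 15, "state": "open"},
        {"title": "Fixed OAuth redirect", "number": 35, "labels": ["bug"],
         "comments": 8, "state": "closed"},
    ])
    assert insights["common_problems"][0]["number"] == 42  # open with >= 5 comments
    assert insights["known_solutions"][0]["number"] == 35  # closed with > 0 comments
    assert {d["label"] for d in insights["top_labels"]} == {"question", "bug"}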
+     def read_file(self, file_path: Path) -> str | None:
+         """
+         Read file content safely.
+
+         Args:
+             file_path: Path to file
+
+         Returns:
+             File content or None if file doesn't exist or can't be read
+         """
+         if not file_path.exists():
+             return None
+
+         try:
+             return file_path.read_text(encoding="utf-8")
+         except Exception:
+             # Try with different encoding
+             try:
+                 return file_path.read_text(encoding="latin-1")
+             except Exception:
+                 return None
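
Tying the file together, a final end-to-end sketch of the intended flow (hypothetical repo URL; same assumptions as the earlier sketches):

    if __name__ == "__main__":
        fetcher = GitHubThreeStreamFetcher("https://github.com/owner/repo", interactive=False)
        streams = fetcher.fetch()
        # Per the module docstring: code_stream -> C3.x analysis,
        # docs_stream -> doc parser, insights_stream -> issue analyzer.
        print(
            f"{len(streams.code_stream.files)} code files, "
            f"{len(streams.docs_stream.docs_files)} doc files, "
            f"{len(streams.insights_stream.common_problems)} common problems"
        )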