skill_seekers-2.7.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. skill_seekers/__init__.py +22 -0
  2. skill_seekers/cli/__init__.py +39 -0
  3. skill_seekers/cli/adaptors/__init__.py +120 -0
  4. skill_seekers/cli/adaptors/base.py +221 -0
  5. skill_seekers/cli/adaptors/claude.py +485 -0
  6. skill_seekers/cli/adaptors/gemini.py +453 -0
  7. skill_seekers/cli/adaptors/markdown.py +269 -0
  8. skill_seekers/cli/adaptors/openai.py +503 -0
  9. skill_seekers/cli/ai_enhancer.py +310 -0
  10. skill_seekers/cli/api_reference_builder.py +373 -0
  11. skill_seekers/cli/architectural_pattern_detector.py +525 -0
  12. skill_seekers/cli/code_analyzer.py +1462 -0
  13. skill_seekers/cli/codebase_scraper.py +1225 -0
  14. skill_seekers/cli/config_command.py +563 -0
  15. skill_seekers/cli/config_enhancer.py +431 -0
  16. skill_seekers/cli/config_extractor.py +871 -0
  17. skill_seekers/cli/config_manager.py +452 -0
  18. skill_seekers/cli/config_validator.py +394 -0
  19. skill_seekers/cli/conflict_detector.py +528 -0
  20. skill_seekers/cli/constants.py +72 -0
  21. skill_seekers/cli/dependency_analyzer.py +757 -0
  22. skill_seekers/cli/doc_scraper.py +2332 -0
  23. skill_seekers/cli/enhance_skill.py +488 -0
  24. skill_seekers/cli/enhance_skill_local.py +1096 -0
  25. skill_seekers/cli/enhance_status.py +194 -0
  26. skill_seekers/cli/estimate_pages.py +433 -0
  27. skill_seekers/cli/generate_router.py +1209 -0
  28. skill_seekers/cli/github_fetcher.py +534 -0
  29. skill_seekers/cli/github_scraper.py +1466 -0
  30. skill_seekers/cli/guide_enhancer.py +723 -0
  31. skill_seekers/cli/how_to_guide_builder.py +1267 -0
  32. skill_seekers/cli/install_agent.py +461 -0
  33. skill_seekers/cli/install_skill.py +178 -0
  34. skill_seekers/cli/language_detector.py +614 -0
  35. skill_seekers/cli/llms_txt_detector.py +60 -0
  36. skill_seekers/cli/llms_txt_downloader.py +104 -0
  37. skill_seekers/cli/llms_txt_parser.py +150 -0
  38. skill_seekers/cli/main.py +558 -0
  39. skill_seekers/cli/markdown_cleaner.py +132 -0
  40. skill_seekers/cli/merge_sources.py +806 -0
  41. skill_seekers/cli/package_multi.py +77 -0
  42. skill_seekers/cli/package_skill.py +241 -0
  43. skill_seekers/cli/pattern_recognizer.py +1825 -0
  44. skill_seekers/cli/pdf_extractor_poc.py +1166 -0
  45. skill_seekers/cli/pdf_scraper.py +617 -0
  46. skill_seekers/cli/quality_checker.py +519 -0
  47. skill_seekers/cli/rate_limit_handler.py +438 -0
  48. skill_seekers/cli/resume_command.py +160 -0
  49. skill_seekers/cli/run_tests.py +230 -0
  50. skill_seekers/cli/setup_wizard.py +93 -0
  51. skill_seekers/cli/split_config.py +390 -0
  52. skill_seekers/cli/swift_patterns.py +560 -0
  53. skill_seekers/cli/test_example_extractor.py +1081 -0
  54. skill_seekers/cli/test_unified_simple.py +179 -0
  55. skill_seekers/cli/unified_codebase_analyzer.py +572 -0
  56. skill_seekers/cli/unified_scraper.py +932 -0
  57. skill_seekers/cli/unified_skill_builder.py +1605 -0
  58. skill_seekers/cli/upload_skill.py +162 -0
  59. skill_seekers/cli/utils.py +432 -0
  60. skill_seekers/mcp/__init__.py +33 -0
  61. skill_seekers/mcp/agent_detector.py +316 -0
  62. skill_seekers/mcp/git_repo.py +273 -0
  63. skill_seekers/mcp/server.py +231 -0
  64. skill_seekers/mcp/server_fastmcp.py +1249 -0
  65. skill_seekers/mcp/server_legacy.py +2302 -0
  66. skill_seekers/mcp/source_manager.py +285 -0
  67. skill_seekers/mcp/tools/__init__.py +115 -0
  68. skill_seekers/mcp/tools/config_tools.py +251 -0
  69. skill_seekers/mcp/tools/packaging_tools.py +826 -0
  70. skill_seekers/mcp/tools/scraping_tools.py +842 -0
  71. skill_seekers/mcp/tools/source_tools.py +828 -0
  72. skill_seekers/mcp/tools/splitting_tools.py +212 -0
  73. skill_seekers/py.typed +0 -0
  74. skill_seekers-2.7.3.dist-info/METADATA +2027 -0
  75. skill_seekers-2.7.3.dist-info/RECORD +79 -0
  76. skill_seekers-2.7.3.dist-info/WHEEL +5 -0
  77. skill_seekers-2.7.3.dist-info/entry_points.txt +19 -0
  78. skill_seekers-2.7.3.dist-info/licenses/LICENSE +21 -0
  79. skill_seekers-2.7.3.dist-info/top_level.txt +1 -0
skill_seekers/cli/github_scraper.py
@@ -0,0 +1,1466 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ GitHub Repository to Claude Skill Converter (Tasks C1.1-C1.12)
4
+
5
+ Converts GitHub repositories into Claude AI skills by extracting:
6
+ - README and documentation
7
+ - Code structure and signatures
8
+ - GitHub Issues, Changelog, and Releases
9
+ - Usage examples from tests
10
+
11
+ Usage:
12
+ skill-seekers github --repo facebook/react
13
+ skill-seekers github --config configs/react_github.json
14
+ skill-seekers github --repo owner/repo --token $GITHUB_TOKEN
15
+ """
16
+
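For reference, the `--config` path in the usage text above expects a JSON object whose keys match what `GitHubScraper.__init__` reads later in this file. A minimal sketch, written as the equivalent Python dict; all values are illustrative, not shipped defaults:

```python
# Illustrative config for `skill-seekers github --config ...`.
# Every key below is read by GitHubScraper.__init__ in this file;
# the values are made-up examples.
example_config = {
    "repo": "facebook/react",                 # required: owner/repo
    "name": "react",                          # optional: defaults to the repo basename
    "description": "Use when building UIs",   # optional: auto-extracted from README if omitted
    "include_issues": True,
    "max_issues": 100,
    "include_changelog": True,
    "include_releases": True,
    "include_code": False,
    "code_analysis_depth": "surface",         # 'surface', 'deep', or 'full'
    "file_patterns": [],                      # fnmatch patterns, e.g. ["src/*.py"]
    "local_repo_path": "~/src/react",         # optional: enables local filesystem mode
    "exclude_dirs_additional": ["fixtures"],  # extends EXCLUDED_DIRS below
}
```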
17
+ import argparse
18
+ import json
19
+ import logging
20
+ import os
21
+ import re
22
+ import sys
23
+ from pathlib import Path
24
+ from typing import Any, Optional
25
+
26
+ try:
27
+ from github import Github, GithubException, Repository
28
+ from github.GithubException import RateLimitExceededException
29
+ except ImportError:
30
+ print("Error: PyGithub not installed. Run: pip install PyGithub")
31
+ sys.exit(1)
32
+
33
+ # Try to import pathspec for .gitignore support
34
+ try:
35
+ import pathspec
36
+
37
+ PATHSPEC_AVAILABLE = True
38
+ except ImportError:
39
+ PATHSPEC_AVAILABLE = False
40
+
41
+ # Configure logging FIRST (before using logger)
42
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
43
+ logger = logging.getLogger(__name__)
44
+
45
+ # Import code analyzer for deep code analysis
46
+ try:
47
+ from .code_analyzer import CodeAnalyzer
48
+
49
+ CODE_ANALYZER_AVAILABLE = True
50
+ except ImportError:
51
+ CODE_ANALYZER_AVAILABLE = False
52
+ logger.warning("Code analyzer not available - deep analysis disabled")
53
+
54
+ # Directories to exclude from local repository analysis
55
+ EXCLUDED_DIRS = {
56
+ "venv",
57
+ "env",
58
+ ".venv",
59
+ ".env", # Virtual environments
60
+ "node_modules",
61
+ "__pycache__",
62
+ ".pytest_cache", # Dependencies and caches
63
+ ".git",
64
+ ".svn",
65
+ ".hg", # Version control
66
+ "build",
67
+ "dist",
68
+ "*.egg-info", # Build artifacts
69
+ "htmlcov",
70
+ ".coverage", # Coverage reports
71
+ ".tox",
72
+ ".nox", # Testing environments
73
+ ".mypy_cache",
74
+ ".ruff_cache", # Linter caches
75
+ }
76
+
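One caveat worth flagging: `"*.egg-info"` is written as a glob, but `should_exclude_dir()` later in this file tests set membership and substring containment, not glob matching, so a real `foo.egg-info` directory slips through the defaults. A quick standalone check:

```python
import fnmatch

excluded = {"*.egg-info"}

"foo.egg-info" in excluded                     # False: set membership is exact
"*.egg-info" in "pkg/foo.egg-info"             # False: the substring test needs a literal '*'
fnmatch.fnmatch("foo.egg-info", "*.egg-info")  # True: glob matching would catch it
```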
77
+
78
+ def extract_description_from_readme(readme_content: str, repo_name: str) -> str:
79
+ """
80
+ Extract a meaningful description from README content for skill description.
81
+
82
+ Parses README to find the first meaningful paragraph that describes
83
+ what the project does, suitable for "Use when..." format.
84
+
85
+ Args:
86
+ readme_content: README.md content
87
+ repo_name: Repository name (e.g., 'facebook/react')
88
+
89
+ Returns:
90
+ Description string, or improved fallback if extraction fails
91
+ """
92
+ if not readme_content:
93
+ return f"Use when working with {repo_name.split('/')[-1]}"
94
+
95
+ try:
96
+ lines = readme_content.split("\n")
97
+
98
+ # Skip badges, images, title - find first meaningful text paragraph
99
+ meaningful_paragraph = None
100
+ in_code_block = False
101
+
102
+ for _i, line in enumerate(lines):
103
+ stripped = line.strip()
104
+
105
+ # Track code blocks
106
+ if stripped.startswith("```"):
107
+ in_code_block = not in_code_block
108
+ continue
109
+
110
+ # Skip if in code block
111
+ if in_code_block:
112
+ continue
113
+
114
+ # Skip empty lines, badges, images, HTML
115
+ if not stripped or stripped.startswith(("#", "!", "<", "[![")):
116
+ continue
117
+
118
+ # Skip lines that are just links or badges
119
+ if stripped.startswith("[") and "](" in stripped and len(stripped) < 100:
120
+ continue
121
+
122
+ # Found a meaningful paragraph (> 20 chars; cleaned and truncated below)
123
+ if len(stripped) > 20: # Meaningful length
124
+ meaningful_paragraph = stripped
125
+ break
126
+
127
+ if meaningful_paragraph:
128
+ # Clean up and extract purpose
129
+ # Remove markdown formatting
130
+ clean = re.sub(r"\[([^\]]+)\]\([^\)]+\)", r"\1", meaningful_paragraph) # Links
131
+ clean = re.sub(r"[*_`]", "", clean) # Bold, italic, code
132
+ clean = re.sub(r"<[^>]+>", "", clean) # HTML tags
133
+
134
+ # Truncate if too long (keep first sentence or ~150 chars)
135
+ if ". " in clean:
136
+ first_sentence = clean.split(". ")[0] + "."
137
+ if len(first_sentence) < 200:
138
+ clean = first_sentence
139
+
140
+ if len(clean) > 150:
141
+ clean = clean[:147] + "..."
142
+
143
+ # Format as "Use when..." description
144
+ # If it already starts with action words, use as-is
145
+ action_words = ["build", "create", "develop", "work", "use", "implement", "manage"]
146
+ if any(clean.lower().startswith(word) for word in action_words):
147
+ return f"Use when {clean.lower()}"
148
+ else:
149
+ return f"Use when working with {clean.lower()}"
150
+
151
+ except Exception as e:
152
+ logger.debug(f"Could not extract description from README: {e}")
153
+
154
+ # Improved fallback
155
+ project_name = repo_name.split("/")[-1]
156
+ return f"Use when working with {project_name}"
157
+
158
+
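Tracing the logic above on a small README shows the shape of the output; the lowercasing of the whole sentence is a side effect of `clean.lower()` in the non-action-word branch:

```python
readme = """# React

React is a JavaScript library for building user interfaces."""

extract_description_from_readme(readme, "facebook/react")
# -> "Use when working with react is a javascript library for building user interfaces."

extract_description_from_readme("", "facebook/react")
# -> "Use when working with react"   (empty README falls back to the repo basename)
```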
159
+ class GitHubScraper:
160
+ """
161
+ GitHub Repository Scraper (C1.1-C1.9)
162
+
163
+ Extracts repository information for skill generation:
164
+ - Repository structure
165
+ - README files
166
+ - Code comments and docstrings
167
+ - Programming language detection
168
+ - Function/class signatures
169
+ - Test examples
170
+ - GitHub Issues
171
+ - CHANGELOG
172
+ - Releases
173
+ """
174
+
175
+ def __init__(self, config: dict[str, Any], local_repo_path: str | None = None):
176
+ """Initialize GitHub scraper with configuration."""
177
+ self.config = config
178
+ self.repo_name = config["repo"]
179
+ self.name = config.get("name", self.repo_name.split("/")[-1])
180
+ # Set initial description (will be improved after README extraction if not in config)
181
+ self.description = config.get(
182
+ "description", f"Use when working with {self.repo_name.split('/')[-1]}"
183
+ )
184
+
185
+ # Local repository path (optional - enables unlimited analysis)
186
+ self.local_repo_path = local_repo_path or config.get("local_repo_path")
187
+ if self.local_repo_path:
188
+ self.local_repo_path = os.path.expanduser(self.local_repo_path)
189
+ logger.info(f"Local repository mode enabled: {self.local_repo_path}")
190
+
191
+ # Configure directory exclusions (smart defaults + optional customization)
192
+ self.excluded_dirs = set(EXCLUDED_DIRS) # Start with smart defaults
193
+
194
+ # Option 1: Replace mode - Use only specified exclusions
195
+ if "exclude_dirs" in config:
196
+ self.excluded_dirs = set(config["exclude_dirs"])
197
+ logger.warning(
198
+ f"Using custom directory exclusions ({len(self.excluded_dirs)} dirs) - defaults overridden"
199
+ )
200
+ logger.debug(f"Custom exclusions: {sorted(self.excluded_dirs)}")
201
+
202
+ # Option 2: Extend mode - Add to default exclusions
203
+ elif "exclude_dirs_additional" in config:
204
+ additional = set(config["exclude_dirs_additional"])
205
+ self.excluded_dirs = self.excluded_dirs.union(additional)
206
+ logger.info(
207
+ f"Added {len(additional)} custom directory exclusions (total: {len(self.excluded_dirs)})"
208
+ )
209
+ logger.debug(f"Additional exclusions: {sorted(additional)}")
210
+
211
+ # Load .gitignore for additional exclusions (C2.1)
212
+ self.gitignore_spec = None
213
+ if self.local_repo_path:
214
+ self.gitignore_spec = self._load_gitignore()
215
+
216
+ # GitHub client setup (C1.1)
217
+ token = self._get_token()
218
+ self.github = Github(token) if token else Github()
219
+ self.repo: Repository.Repository | None = None
220
+
221
+ # Options
222
+ self.include_issues = config.get("include_issues", True)
223
+ self.max_issues = config.get("max_issues", 100)
224
+ self.include_changelog = config.get("include_changelog", True)
225
+ self.include_releases = config.get("include_releases", True)
226
+ self.include_code = config.get("include_code", False)
227
+ self.code_analysis_depth = config.get(
228
+ "code_analysis_depth", "surface"
229
+ ) # 'surface', 'deep', 'full'
230
+ self.file_patterns = config.get("file_patterns", [])
231
+
232
+ # Initialize code analyzer if deep analysis requested
233
+ self.code_analyzer = None
234
+ if self.code_analysis_depth != "surface" and CODE_ANALYZER_AVAILABLE:
235
+ self.code_analyzer = CodeAnalyzer(depth=self.code_analysis_depth)
236
+ logger.info(f"Code analysis depth: {self.code_analysis_depth}")
237
+
238
+ # Output paths
239
+ self.skill_dir = f"output/{self.name}"
240
+ self.data_file = f"output/{self.name}_github_data.json"
241
+
242
+ # Extracted data storage
243
+ self.extracted_data = {
244
+ "repo_info": {},
245
+ "readme": "",
246
+ "file_tree": [],
247
+ "languages": {},
248
+ "signatures": [],
249
+ "test_examples": [],
250
+ "issues": [],
251
+ "changelog": "",
252
+ "releases": [],
253
+ }
254
+
255
+ def _get_token(self) -> str | None:
256
+ """
257
+ Get GitHub token from env var or config (both options supported).
258
+ Priority: GITHUB_TOKEN env var > config file > None
259
+ """
260
+ # Try environment variable first (recommended)
261
+ token = os.getenv("GITHUB_TOKEN")
262
+ if token:
263
+ logger.info("Using GitHub token from GITHUB_TOKEN environment variable")
264
+ return token
265
+
266
+ # Fall back to config file
267
+ token = self.config.get("github_token")
268
+ if token:
269
+ logger.warning("Using GitHub token from config file (less secure)")
270
+ return token
271
+
272
+ logger.warning(
273
+ "No GitHub token provided - using unauthenticated access (lower rate limits)"
274
+ )
275
+ return None
276
+
277
+ def scrape(self) -> dict[str, Any]:
278
+ """
279
+ Main scraping entry point.
280
+ Executes all C1 tasks in sequence.
281
+ """
282
+ try:
283
+ logger.info(f"Starting GitHub scrape for: {self.repo_name}")
284
+
285
+ # C1.1: Fetch repository
286
+ self._fetch_repository()
287
+
288
+ # C1.2: Extract README
289
+ self._extract_readme()
290
+
291
+ # C1.3-C1.6: Extract code structure
292
+ self._extract_code_structure()
293
+
294
+ # C1.7: Extract Issues
295
+ if self.include_issues:
296
+ self._extract_issues()
297
+
298
+ # C1.8: Extract CHANGELOG
299
+ if self.include_changelog:
300
+ self._extract_changelog()
301
+
302
+ # C1.9: Extract Releases
303
+ if self.include_releases:
304
+ self._extract_releases()
305
+
306
+ # Save extracted data
307
+ self._save_data()
308
+
309
+ logger.info(f"āœ… Scraping complete! Data saved to: {self.data_file}")
310
+ return self.extracted_data
311
+
312
+ except RateLimitExceededException:
313
+ logger.error("GitHub API rate limit exceeded. Please wait or use authentication token.")
314
+ raise
315
+ except GithubException as e:
316
+ logger.error(f"GitHub API error: {e}")
317
+ raise
318
+ except Exception as e:
319
+ logger.error(f"Unexpected error during scraping: {e}")
320
+ raise
321
+
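Putting the pieces together, the minimal programmatic entry point looks like this (a sketch; `owner/repo` is a placeholder, and a `GITHUB_TOKEN` environment variable is picked up by `_get_token()` below):

```python
from skill_seekers.cli.github_scraper import GitHubScraper

config = {"repo": "owner/repo", "include_issues": True, "max_issues": 25}
scraper = GitHubScraper(config)
data = scraper.scrape()  # also writes output/<name>_github_data.json as a side effect

print(data["repo_info"]["stars"])
print(len(data["issues"]), "issues captured")
```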
322
+ def _fetch_repository(self):
323
+ """C1.1: Fetch repository structure using GitHub API."""
324
+ logger.info(f"Fetching repository: {self.repo_name}")
325
+
326
+ try:
327
+ self.repo = self.github.get_repo(self.repo_name)
328
+
329
+ # Extract basic repo info
330
+ self.extracted_data["repo_info"] = {
331
+ "name": self.repo.name,
332
+ "full_name": self.repo.full_name,
333
+ "description": self.repo.description,
334
+ "url": self.repo.html_url,
335
+ "homepage": self.repo.homepage,
336
+ "stars": self.repo.stargazers_count,
337
+ "forks": self.repo.forks_count,
338
+ "open_issues": self.repo.open_issues_count,
339
+ "default_branch": self.repo.default_branch,
340
+ "created_at": self.repo.created_at.isoformat() if self.repo.created_at else None,
341
+ "updated_at": self.repo.updated_at.isoformat() if self.repo.updated_at else None,
342
+ "language": self.repo.language,
343
+ "license": self.repo.license.name if self.repo.license else None,
344
+ "topics": self.repo.get_topics(),
345
+ }
346
+
347
+ logger.info(
348
+ f"Repository fetched: {self.repo.full_name} ({self.repo.stargazers_count} stars)"
349
+ )
350
+
351
+ except GithubException as e:
352
+ if e.status == 404:
353
+ raise ValueError(f"Repository not found: {self.repo_name}") from e
354
+ raise
355
+
356
+ def _get_file_content(self, file_path: str) -> str | None:
357
+ """
358
+ Safely get file content, handling symlinks and encoding issues.
359
+
360
+ Args:
361
+ file_path: Path to file in repository
362
+
363
+ Returns:
364
+ File content as string, or None if file not found/error
365
+ """
366
+ try:
367
+ content = self.repo.get_contents(file_path)
368
+ if not content:
369
+ return None
370
+
371
+ # Handle symlinks - follow the target to get actual file
372
+ if hasattr(content, "type") and content.type == "symlink":
373
+ target = getattr(content, "target", None)
374
+ if target:
375
+ target = target.strip()
376
+ logger.debug(f"File {file_path} is a symlink to {target}, following...")
377
+ try:
378
+ content = self.repo.get_contents(target)
379
+ except GithubException as e:
380
+ logger.warning(f"Failed to follow symlink {file_path} -> {target}: {e}")
381
+ return None
382
+ else:
383
+ logger.warning(f"Symlink {file_path} has no target")
384
+ return None
385
+
386
+ # Handle large files (encoding="none") - download via URL
387
+ # GitHub API doesn't base64-encode files >1MB
388
+ if hasattr(content, "encoding") and content.encoding in [None, "none"]:
389
+ download_url = getattr(content, "download_url", None)
390
+ file_size = getattr(content, "size", 0)
391
+
392
+ if download_url:
393
+ logger.info(
394
+ f"File {file_path} is large ({file_size:,} bytes), downloading via URL..."
395
+ )
396
+ try:
397
+ import requests
398
+
399
+ response = requests.get(download_url, timeout=30)
400
+ response.raise_for_status()
401
+ return response.text
402
+ except Exception as e:
403
+ logger.warning(f"Failed to download {file_path} from {download_url}: {e}")
404
+ return None
405
+ else:
406
+ logger.warning(
407
+ f"File {file_path} has no download URL (encoding={content.encoding})"
408
+ )
409
+ return None
410
+
411
+ # Handle regular files - decode content
412
+ try:
413
+ if isinstance(content.decoded_content, bytes):
414
+ return content.decoded_content.decode("utf-8")
415
+ else:
416
+ return str(content.decoded_content)
417
+ except (UnicodeDecodeError, AttributeError, LookupError, AssertionError) as e:
418
+ logger.warning(f"Encoding issue with {file_path}: {e}")
419
+ # Try alternative encoding
420
+ try:
421
+ if isinstance(content.decoded_content, bytes):
422
+ return content.decoded_content.decode("latin-1")
423
+ except Exception:
424
+ return None
425
+ return None
426
+
427
+ except GithubException:
428
+ return None
429
+ except Exception as e:
430
+ logger.warning(f"Error reading {file_path}: {e}")
431
+ return None
432
+
433
+ def _extract_readme(self):
434
+ """C1.2: Extract README.md files."""
435
+ logger.info("Extracting README...")
436
+
437
+ # Try common README locations
438
+ readme_files = [
439
+ "README.md",
440
+ "README.rst",
441
+ "README.txt",
442
+ "README",
443
+ "docs/README.md",
444
+ ".github/README.md",
445
+ ]
446
+
447
+ for readme_path in readme_files:
448
+ readme_content = self._get_file_content(readme_path)
449
+ if readme_content:
450
+ self.extracted_data["readme"] = readme_content
451
+ logger.info(f"README found: {readme_path}")
452
+
453
+ # Update description if not explicitly set in config
454
+ if "description" not in self.config:
455
+ smart_description = extract_description_from_readme(
456
+ self.extracted_data["readme"], self.repo_name
457
+ )
458
+ self.description = smart_description
459
+ logger.debug(f"Generated description: {self.description}")
460
+
461
+ return
462
+
463
+ logger.warning("No README found in repository")
464
+
465
+ def _extract_code_structure(self):
466
+ """
467
+ C1.3-C1.6: Extract code structure, languages, signatures, and test examples.
468
+ Surface layer only - no full implementation code.
469
+ """
470
+ logger.info("Extracting code structure...")
471
+
472
+ # C1.4: Get language breakdown
473
+ self._extract_languages()
474
+
475
+ # Get file tree
476
+ self._extract_file_tree()
477
+
478
+ # Extract signatures and test examples
479
+ if self.include_code:
480
+ self._extract_signatures_and_tests()
481
+
482
+ def _extract_languages(self):
483
+ """C1.4: Detect programming languages in repository."""
484
+ logger.info("Detecting programming languages...")
485
+
486
+ try:
487
+ languages = self.repo.get_languages()
488
+ total_bytes = sum(languages.values())
489
+
490
+ self.extracted_data["languages"] = {
491
+ lang: {
492
+ "bytes": bytes_count,
493
+ "percentage": round((bytes_count / total_bytes) * 100, 2)
494
+ if total_bytes > 0
495
+ else 0,
496
+ }
497
+ for lang, bytes_count in languages.items()
498
+ }
499
+
500
+ logger.info(f"Languages detected: {', '.join(languages.keys())}")
501
+
502
+ except GithubException as e:
503
+ logger.warning(f"Could not fetch languages: {e}")
504
+
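For concreteness, with a `get_languages()` payload of `{"Python": 7500, "Shell": 2500}` the block above reduces to simple arithmetic, no API calls:

```python
languages = {"Python": 7500, "Shell": 2500}
total = sum(languages.values())  # 10000

breakdown = {
    lang: {"bytes": n, "percentage": round(n / total * 100, 2)}
    for lang, n in languages.items()
}
# {'Python': {'bytes': 7500, 'percentage': 75.0},
#  'Shell': {'bytes': 2500, 'percentage': 25.0}}
```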
505
+ def should_exclude_dir(self, dir_name: str, dir_path: str | None = None) -> bool:
506
+ """
507
+ Check if directory should be excluded from analysis.
508
+
509
+ Args:
510
+ dir_name: Directory name (e.g., "Examples & Extras")
511
+ dir_path: Full relative path (e.g., "TextMesh Pro/Examples & Extras")
512
+
513
+ Returns:
514
+ True if directory should be excluded
515
+ """
516
+ # Check directory name
517
+ if dir_name in self.excluded_dirs or dir_name.startswith("."):
518
+ return True
519
+
520
+ # Check full path if provided (for nested exclusions like "TextMesh Pro/Examples & Extras")
521
+ if dir_path:
522
+ for excluded in self.excluded_dirs:
523
+ # Match if path contains the exclusion pattern
524
+ if excluded in dir_path or dir_path.startswith(excluded):
525
+ return True
526
+
527
+ # Check .gitignore rules if available (C2.1)
528
+ if self.gitignore_spec and dir_path:
529
+ # For directories, we need to check both with and without trailing slash
530
+ # as .gitignore patterns can match either way
531
+ dir_path_with_slash = dir_path if dir_path.endswith("/") else dir_path + "/"
532
+ if self.gitignore_spec.match_file(dir_path) or self.gitignore_spec.match_file(
533
+ dir_path_with_slash
534
+ ):
535
+ logger.debug(f"Directory excluded by .gitignore: {dir_path}")
536
+ return True
537
+
538
+ return False
539
+
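A few illustrative calls against the method above, assuming `scraper = GitHubScraper({"repo": "owner/repo"})` with the default exclusions and no `.gitignore` spec loaded:

```python
scraper.should_exclude_dir("node_modules")   # True: name is in EXCLUDED_DIRS
scraper.should_exclude_dir(".github")        # True: hidden directories are always skipped
scraper.should_exclude_dir("src")            # False
scraper.should_exclude_dir("tools", "my-build-tools")
# True: the path check is a substring test, so "build" matches inside "my-build-tools"
```

The substring test is deliberately broad, so overly generic entries in `exclude_dirs` can knock out unrelated paths.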
540
+ def _load_gitignore(self) -> Optional["pathspec.PathSpec"]:
541
+ """
542
+ Load .gitignore file and create pathspec matcher (C2.1).
543
+
544
+ Returns:
545
+ PathSpec object if .gitignore found, None otherwise
546
+ """
547
+ if not PATHSPEC_AVAILABLE:
548
+ logger.warning("pathspec not installed - .gitignore support disabled")
549
+ logger.warning("Install with: pip install pathspec")
550
+ return None
551
+
552
+ if not self.local_repo_path:
553
+ return None
554
+
555
+ gitignore_path = Path(self.local_repo_path) / ".gitignore"
556
+ if not gitignore_path.exists():
557
+ logger.debug(f"No .gitignore found in {self.local_repo_path}")
558
+ return None
559
+
560
+ try:
561
+ with open(gitignore_path, encoding="utf-8") as f:
562
+ spec = pathspec.PathSpec.from_lines("gitwildmatch", f)
563
+ logger.info(f"Loaded .gitignore from {gitignore_path}")
564
+ return spec
565
+ except Exception as e:
566
+ logger.warning(f"Failed to load .gitignore: {e}")
567
+ return None
568
+
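A small standalone check of the trailing-slash handling that `should_exclude_dir()` relies on, using `pathspec`'s documented `gitwildmatch` style:

```python
import pathspec

spec = pathspec.PathSpec.from_lines("gitwildmatch", ["build/", "*.log"])

spec.match_file("build")    # may be False: "build/" is a directory-only pattern
spec.match_file("build/")   # True: the trailing slash marks the path as a directory
spec.match_file("app.log")  # True
```

This asymmetry is why the caller checks both `dir_path` and `dir_path + "/"`.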
569
+ def _extract_file_tree(self):
570
+ """Extract repository file tree structure (dual-mode: GitHub API or local filesystem)."""
571
+ logger.info("Building file tree...")
572
+
573
+ if self.local_repo_path:
574
+ # Local filesystem mode - unlimited files
575
+ self._extract_file_tree_local()
576
+ else:
577
+ # GitHub API mode - limited by API rate limits
578
+ self._extract_file_tree_github()
579
+
580
+ def _extract_file_tree_local(self):
581
+ """Extract file tree from local filesystem (unlimited files)."""
582
+ if not os.path.exists(self.local_repo_path):
583
+ logger.error(f"Local repository path not found: {self.local_repo_path}")
584
+ return
585
+
586
+ # Log exclusions for debugging
587
+ logger.info(
588
+ f"Directory exclusions ({len(self.excluded_dirs)} total): {sorted(list(self.excluded_dirs)[:10])}"
589
+ )
590
+
591
+ file_tree = []
592
+ excluded_count = 0
593
+ for root, dirs, files in os.walk(self.local_repo_path):
594
+ # Calculate relative path from repo root first (needed for exclusion checks)
595
+ rel_root = os.path.relpath(root, self.local_repo_path)
596
+ if rel_root == ".":
597
+ rel_root = ""
598
+
599
+ # Exclude directories in-place to prevent os.walk from descending into them
600
+ # Pass both dir name and full path for path-based exclusions
601
+ filtered_dirs = []
602
+ for d in dirs:
603
+ dir_path = os.path.join(rel_root, d) if rel_root else d
604
+ if self.should_exclude_dir(d, dir_path):
605
+ excluded_count += 1
606
+ logger.debug(f"Excluding directory: {dir_path}")
607
+ else:
608
+ filtered_dirs.append(d)
609
+ dirs[:] = filtered_dirs
610
+
611
+ # Add directories
612
+ for dir_name in dirs:
613
+ dir_path = os.path.join(rel_root, dir_name) if rel_root else dir_name
614
+ file_tree.append({"path": dir_path, "type": "dir", "size": None})
615
+
616
+ # Add files
617
+ for file_name in files:
618
+ file_path = os.path.join(rel_root, file_name) if rel_root else file_name
619
+ full_path = os.path.join(root, file_name)
620
+ try:
621
+ file_size = os.path.getsize(full_path)
622
+ except OSError:
623
+ file_size = None
624
+
625
+ file_tree.append({"path": file_path, "type": "file", "size": file_size})
626
+
627
+ self.extracted_data["file_tree"] = file_tree
628
+ logger.info(
629
+ f"File tree built (local mode): {len(file_tree)} items ({excluded_count} directories excluded)"
630
+ )
631
+
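The `dirs[:] = filtered_dirs` slice assignment above is what makes the pruning work: `os.walk` keeps reading the same list object to decide where to descend, so it must be mutated in place. A minimal sketch:

```python
import os

for root, dirs, files in os.walk("."):
    # In-place mutation prunes the walk; rebinding `dirs = [...]`
    # would have no effect on traversal.
    dirs[:] = [d for d in dirs if d != ".git"]
    print(root)
```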
632
+ def _extract_file_tree_github(self):
633
+ """Extract file tree from GitHub API (rate-limited)."""
634
+ try:
635
+ contents = self.repo.get_contents("")
636
+ file_tree = []
637
+
638
+ while contents:
639
+ file_content = contents.pop(0)
640
+
641
+ file_info = {
642
+ "path": file_content.path,
643
+ "type": file_content.type,
644
+ "size": file_content.size if file_content.type == "file" else None,
645
+ }
646
+ file_tree.append(file_info)
647
+
648
+ if file_content.type == "dir":
649
+ contents.extend(self.repo.get_contents(file_content.path))
650
+
651
+ self.extracted_data["file_tree"] = file_tree
652
+ logger.info(f"File tree built (GitHub API mode): {len(file_tree)} items")
653
+
654
+ except GithubException as e:
655
+ logger.warning(f"Could not build file tree: {e}")
656
+
657
+ def _extract_signatures_and_tests(self):
658
+ """
659
+ C1.3, C1.5, C1.6: Extract signatures, docstrings, and test examples.
660
+
661
+ Extraction depth depends on code_analysis_depth setting:
662
+ - surface: File tree only (minimal)
663
+ - deep: Parse files for signatures, parameters, types
664
+ - full: Complete AST analysis (future enhancement)
665
+ """
666
+ if self.code_analysis_depth == "surface":
667
+ logger.info("Code extraction: Surface level (file tree only)")
668
+ return
669
+
670
+ if not self.code_analyzer:
671
+ logger.warning("Code analyzer not available - skipping deep analysis")
672
+ return
673
+
674
+ logger.info(f"Extracting code signatures ({self.code_analysis_depth} analysis)...")
675
+
676
+ # Get primary language for the repository
677
+ languages = self.extracted_data.get("languages", {})
678
+ if not languages:
679
+ logger.warning("No languages detected - skipping code analysis")
680
+ return
681
+
682
+ # Determine primary language
683
+ primary_language = max(languages.items(), key=lambda x: x[1]["bytes"])[0]
684
+ logger.info(f"Primary language: {primary_language}")
685
+
686
+ # Determine file extensions to analyze
687
+ extension_map = {
688
+ "Python": [".py"],
689
+ "JavaScript": [".js", ".jsx"],
690
+ "TypeScript": [".ts", ".tsx"],
691
+ "C": [".c", ".h"],
692
+ "C++": [".cpp", ".hpp", ".cc", ".hh", ".cxx"],
693
+ }
694
+
695
+ extensions = extension_map.get(primary_language, [])
696
+ if not extensions:
697
+ logger.warning(f"No file extensions mapped for {primary_language}")
698
+ return
699
+
700
+ # Analyze files matching patterns and extensions
701
+ analyzed_files = []
702
+ file_tree = self.extracted_data.get("file_tree", [])
703
+
704
+ for file_info in file_tree:
705
+ file_path = file_info["path"]
706
+
707
+ # Check if file matches extension
708
+ if not any(file_path.endswith(ext) for ext in extensions):
709
+ continue
710
+
711
+ # Check if file matches patterns (if specified)
712
+ if self.file_patterns:
713
+ import fnmatch
714
+
715
+ if not any(fnmatch.fnmatch(file_path, pattern) for pattern in self.file_patterns):
716
+ continue
717
+
718
+ # Analyze this file
719
+ try:
720
+ # Read file content based on mode
721
+ if self.local_repo_path:
722
+ # Local mode - read from filesystem
723
+ full_path = os.path.join(self.local_repo_path, file_path)
724
+ with open(full_path, encoding="utf-8") as f:
725
+ content = f.read()
726
+ else:
727
+ # GitHub API mode - fetch from API
728
+ file_content = self.repo.get_contents(file_path)
729
+ content = file_content.decoded_content.decode("utf-8")
730
+
731
+ analysis_result = self.code_analyzer.analyze_file(
732
+ file_path, content, primary_language
733
+ )
734
+
735
+ if analysis_result and (
736
+ analysis_result.get("classes") or analysis_result.get("functions")
737
+ ):
738
+ analyzed_files.append(
739
+ {"file": file_path, "language": primary_language, **analysis_result}
740
+ )
741
+
742
+ logger.debug(
743
+ f"Analyzed {file_path}: "
744
+ f"{len(analysis_result.get('classes', []))} classes, "
745
+ f"{len(analysis_result.get('functions', []))} functions"
746
+ )
747
+
748
+ except Exception as e:
749
+ logger.debug(f"Could not analyze {file_path}: {e}")
750
+ continue
751
+
752
+ # Limit number of files analyzed to avoid rate limits (GitHub API mode only)
753
+ if not self.local_repo_path and len(analyzed_files) >= 50:
754
+ logger.info("Reached analysis limit (50 files, GitHub API mode)")
755
+ break
756
+
757
+ self.extracted_data["code_analysis"] = {
758
+ "depth": self.code_analysis_depth,
759
+ "language": primary_language,
760
+ "files_analyzed": len(analyzed_files),
761
+ "files": analyzed_files,
762
+ }
763
+
764
+ # Calculate totals
765
+ total_classes = sum(len(f.get("classes", [])) for f in analyzed_files)
766
+ total_functions = sum(len(f.get("functions", [])) for f in analyzed_files)
767
+
768
+ logger.info(
769
+ f"Code analysis complete: {len(analyzed_files)} files, {total_classes} classes, {total_functions} functions"
770
+ )
771
+
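One subtlety with the `file_patterns` filter above: `fnmatch` is not path-aware, so `*` happily crosses `/` boundaries. Patterns in a config need to account for that:

```python
import fnmatch

fnmatch.fnmatch("src/utils/helpers.py", "src/*.py")  # True: '*' also matches "utils/"
fnmatch.fnmatch("src/utils/helpers.py", "*.py")      # True
fnmatch.fnmatch("docs/conf.py", "src/*.py")          # False
```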
772
+ def _extract_issues(self):
773
+ """C1.7: Extract GitHub Issues (open/closed, labels, milestones)."""
774
+ logger.info(f"Extracting GitHub Issues (max {self.max_issues})...")
775
+
776
+ try:
777
+ # Fetch recent issues (open + closed)
778
+ issues = self.repo.get_issues(state="all", sort="updated", direction="desc")
779
+
780
+ issue_list = []
781
+ for issue in issues[: self.max_issues]:
782
+ # Skip pull requests (they appear in issues)
783
+ if issue.pull_request:
784
+ continue
785
+
786
+ issue_data = {
787
+ "number": issue.number,
788
+ "title": issue.title,
789
+ "state": issue.state,
790
+ "labels": [label.name for label in issue.labels],
791
+ "milestone": issue.milestone.title if issue.milestone else None,
792
+ "created_at": issue.created_at.isoformat() if issue.created_at else None,
793
+ "updated_at": issue.updated_at.isoformat() if issue.updated_at else None,
794
+ "closed_at": issue.closed_at.isoformat() if issue.closed_at else None,
795
+ "url": issue.html_url,
796
+ "body": issue.body[:500] if issue.body else None, # First 500 chars
797
+ }
798
+ issue_list.append(issue_data)
799
+
800
+ self.extracted_data["issues"] = issue_list
801
+ logger.info(f"Extracted {len(issue_list)} issues")
802
+
803
+ except GithubException as e:
804
+ logger.warning(f"Could not fetch issues: {e}")
805
+
806
+ def _extract_changelog(self):
807
+ """C1.8: Extract CHANGELOG.md and release notes."""
808
+ logger.info("Extracting CHANGELOG...")
809
+
810
+ # Try common changelog locations
811
+ changelog_files = [
812
+ "CHANGELOG.md",
813
+ "CHANGES.md",
814
+ "HISTORY.md",
815
+ "CHANGELOG.rst",
816
+ "CHANGELOG.txt",
817
+ "CHANGELOG",
818
+ "docs/CHANGELOG.md",
819
+ ".github/CHANGELOG.md",
820
+ ]
821
+
822
+ for changelog_path in changelog_files:
823
+ changelog_content = self._get_file_content(changelog_path)
824
+ if changelog_content:
825
+ self.extracted_data["changelog"] = changelog_content
826
+ logger.info(f"CHANGELOG found: {changelog_path}")
827
+ return
828
+
829
+ logger.warning("No CHANGELOG found in repository")
830
+
831
+ def _extract_releases(self):
832
+ """C1.9: Extract GitHub Releases with version history."""
833
+ logger.info("Extracting GitHub Releases...")
834
+
835
+ try:
836
+ releases = self.repo.get_releases()
837
+
838
+ release_list = []
839
+ for release in releases:
840
+ release_data = {
841
+ "tag_name": release.tag_name,
842
+ "name": release.title,
843
+ "body": release.body,
844
+ "draft": release.draft,
845
+ "prerelease": release.prerelease,
846
+ "created_at": release.created_at.isoformat() if release.created_at else None,
847
+ "published_at": release.published_at.isoformat()
848
+ if release.published_at
849
+ else None,
850
+ "url": release.html_url,
851
+ "tarball_url": release.tarball_url,
852
+ "zipball_url": release.zipball_url,
853
+ }
854
+ release_list.append(release_data)
855
+
856
+ self.extracted_data["releases"] = release_list
857
+ logger.info(f"Extracted {len(release_list)} releases")
858
+
859
+ except GithubException as e:
860
+ logger.warning(f"Could not fetch releases: {e}")
861
+
862
+ def _save_data(self):
863
+ """Save extracted data to JSON file."""
864
+ os.makedirs("output", exist_ok=True)
865
+
866
+ with open(self.data_file, "w", encoding="utf-8") as f:
867
+ json.dump(self.extracted_data, f, indent=2, ensure_ascii=False)
868
+
869
+ logger.info(f"Data saved to: {self.data_file}")
870
+
871
+
872
+ class GitHubToSkillConverter:
873
+ """
874
+ Convert extracted GitHub data to Claude skill format (C1.10).
875
+ """
876
+
877
+ def __init__(self, config: dict[str, Any]):
878
+ """Initialize converter with configuration."""
879
+ self.config = config
880
+ self.name = config.get("name", config["repo"].split("/")[-1])
881
+
882
+ # Paths
883
+ self.data_file = f"output/{self.name}_github_data.json"
884
+ self.skill_dir = f"output/{self.name}"
885
+
886
+ # Load extracted data
887
+ self.data = self._load_data()
888
+
889
+ # Set description (smart extraction from README if available)
890
+ if "description" in config:
891
+ self.description = config["description"]
892
+ else:
893
+ # Try to extract from README in loaded data
894
+ readme_content = self.data.get("readme", "")
895
+ repo_name = config["repo"]
896
+ if readme_content:
897
+ self.description = extract_description_from_readme(readme_content, repo_name)
898
+ else:
899
+ self.description = f"Use when working with {repo_name.split('/')[-1]}"
900
+
901
+ def _load_data(self) -> dict[str, Any]:
902
+ """Load extracted GitHub data from JSON."""
903
+ if not os.path.exists(self.data_file):
904
+ raise FileNotFoundError(f"Data file not found: {self.data_file}")
905
+
906
+ with open(self.data_file, encoding="utf-8") as f:
907
+ return json.load(f)
908
+
909
+ def build_skill(self):
910
+ """Build complete skill structure."""
911
+ logger.info(f"Building skill for: {self.name}")
912
+
913
+ # Create directories
914
+ os.makedirs(self.skill_dir, exist_ok=True)
915
+ os.makedirs(f"{self.skill_dir}/references", exist_ok=True)
916
+ os.makedirs(f"{self.skill_dir}/scripts", exist_ok=True)
917
+ os.makedirs(f"{self.skill_dir}/assets", exist_ok=True)
918
+
919
+ # Generate SKILL.md
920
+ self._generate_skill_md()
921
+
922
+ # Generate reference files
923
+ self._generate_references()
924
+
925
+ logger.info(f"āœ… Skill built successfully: {self.skill_dir}/")
926
+
927
+ def _generate_skill_md(self):
928
+ """Generate main SKILL.md file (rich version with C3.x data if available)."""
929
+ repo_info = self.data.get("repo_info", {})
930
+ c3_data = self.data.get("c3_analysis", {})
931
+ has_c3_data = bool(c3_data)
932
+
933
+ # Generate skill name (lowercase, hyphens only, max 64 chars)
934
+ skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]
935
+
936
+ # Truncate description to 1024 chars if needed
937
+ desc = self.description[:1024] if len(self.description) > 1024 else self.description
938
+
939
+ # Build skill content
940
+ skill_content = f"""---
941
+ name: {skill_name}
942
+ description: {desc}
943
+ ---
944
+
945
+ # {repo_info.get("name", self.name)}
946
+
947
+ {self.description}
948
+
949
+ ## Description
950
+
951
+ {repo_info.get("description", "GitHub repository skill")}
952
+
953
+ **Repository:** [{repo_info.get("full_name", "N/A")}]({repo_info.get("url", "#")})
954
+ **Language:** {repo_info.get("language", "N/A")}
955
+ **Stars:** {repo_info.get("stars", 0):,}
956
+ **License:** {repo_info.get("license", "N/A")}
957
+
958
+ ## When to Use This Skill
959
+
960
+ Use this skill when you need to:
961
+ - Understand how to use {repo_info.get("name", self.name)}
962
+ - Look up API documentation and implementation details
963
+ - Find real-world usage examples from the codebase
964
+ - Review design patterns and architecture
965
+ - Check for known issues or recent changes
966
+ - Explore release history and changelogs
967
+ """
968
+
969
+ # Add Quick Reference section (enhanced with C3.x if available)
970
+ skill_content += "\n## ⚔ Quick Reference\n\n"
971
+
972
+ # Repository info
973
+ skill_content += "### Repository Info\n"
974
+ skill_content += f"- **Homepage:** {repo_info.get('homepage', 'N/A')}\n"
975
+ skill_content += f"- **Topics:** {', '.join(repo_info.get('topics', []))}\n"
976
+ skill_content += f"- **Open Issues:** {repo_info.get('open_issues', 0)}\n"
977
+ skill_content += f"- **Last Updated:** {repo_info.get('updated_at', 'N/A')[:10]}\n\n"
978
+
979
+ # Languages
980
+ skill_content += "### Languages\n"
981
+ skill_content += self._format_languages() + "\n\n"
982
+
983
+ # Add C3.x pattern summary if available
984
+ if has_c3_data and c3_data.get("patterns"):
985
+ skill_content += self._format_pattern_summary(c3_data)
986
+
987
+ # Add code examples if available (C3.2 test examples)
988
+ if has_c3_data and c3_data.get("test_examples"):
989
+ skill_content += self._format_code_examples(c3_data)
990
+
991
+ # Add API Reference if available (C2.5)
992
+ if has_c3_data and c3_data.get("api_reference"):
993
+ skill_content += self._format_api_reference(c3_data)
994
+
995
+ # Add Architecture Overview if available (C3.7)
996
+ if has_c3_data and c3_data.get("architecture"):
997
+ skill_content += self._format_architecture(c3_data)
998
+
999
+ # Add Known Issues section
1000
+ skill_content += self._format_known_issues()
1001
+
1002
+ # Add Recent Releases
1003
+ skill_content += "### Recent Releases\n"
1004
+ skill_content += self._format_recent_releases() + "\n\n"
1005
+
1006
+ # Available References
1007
+ skill_content += "## šŸ“– Available References\n\n"
1008
+ skill_content += "- `references/README.md` - Complete README documentation\n"
1009
+ skill_content += "- `references/CHANGELOG.md` - Version history and changes\n"
1010
+ skill_content += "- `references/issues.md` - Recent GitHub issues\n"
1011
+ skill_content += "- `references/releases.md` - Release notes\n"
1012
+ skill_content += "- `references/file_structure.md` - Repository structure\n"
1013
+
1014
+ if has_c3_data:
1015
+ skill_content += "\n### Codebase Analysis References\n\n"
1016
+ if c3_data.get("patterns"):
1017
+ skill_content += (
1018
+ "- `references/codebase_analysis/patterns/` - Design patterns detected\n"
1019
+ )
1020
+ if c3_data.get("test_examples"):
1021
+ skill_content += (
1022
+ "- `references/codebase_analysis/examples/` - Test examples extracted\n"
1023
+ )
1024
+ if c3_data.get("config_patterns"):
1025
+ skill_content += (
1026
+ "- `references/codebase_analysis/configuration/` - Configuration analysis\n"
1027
+ )
1028
+ if c3_data.get("architecture"):
1029
+ skill_content += (
1030
+ "- `references/codebase_analysis/ARCHITECTURE.md` - Architecture overview\n"
1031
+ )
1032
+
1033
+ # Usage
1034
+ skill_content += "\n## šŸ’» Usage\n\n"
1035
+ skill_content += "See README.md for complete usage instructions and examples.\n\n"
1036
+
1037
+ # Footer
1038
+ skill_content += "---\n\n"
1039
+ if has_c3_data:
1040
+ skill_content += "**Generated by Skill Seeker** | GitHub Repository Scraper with C3.x Codebase Analysis\n"
1041
+ else:
1042
+ skill_content += "**Generated by Skill Seeker** | GitHub Repository Scraper\n"
1043
+
1044
+ # Write to file
1045
+ skill_path = f"{self.skill_dir}/SKILL.md"
1046
+ with open(skill_path, "w", encoding="utf-8") as f:
1047
+ f.write(skill_content)
1048
+
1049
+ line_count = len(skill_content.split("\n"))
1050
+ logger.info(f"Generated: {skill_path} ({line_count} lines)")
1051
+
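The name and description sanitization above can be sanity-checked in isolation:

```python
name = "My Skill_Name"
skill_name = name.lower().replace("_", "-").replace(" ", "-")[:64]
# "my-skill-name"  (lowercase, hyphens only, capped at 64 chars)

description = "x" * 2000
desc = description[:1024] if len(description) > 1024 else description
len(desc)  # 1024
```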
1052
+ def _format_languages(self) -> str:
1053
+ """Format language breakdown."""
1054
+ languages = self.data.get("languages", {})
1055
+ if not languages:
1056
+ return "No language data available"
1057
+
1058
+ lines = []
1059
+ for lang, info in sorted(languages.items(), key=lambda x: x[1]["bytes"], reverse=True):
1060
+ lines.append(f"- **{lang}:** {info['percentage']:.1f}%")
1061
+
1062
+ return "\n".join(lines)
1063
+
1064
+ def _format_recent_releases(self) -> str:
1065
+ """Format recent releases (top 3)."""
1066
+ releases = self.data.get("releases", [])
1067
+ if not releases:
1068
+ return "No releases available"
1069
+
1070
+ lines = []
1071
+ for release in releases[:3]:
1072
+ lines.append(
1073
+ f"- **{release['tag_name']}** ({release['published_at'][:10]}): {release['name']}"
1074
+ )
1075
+
1076
+ return "\n".join(lines)
1077
+
1078
+ def _format_pattern_summary(self, c3_data: dict[str, Any]) -> str:
1079
+ """Format design patterns summary (C3.1)."""
1080
+ patterns_data = c3_data.get("patterns", [])
1081
+ if not patterns_data:
1082
+ return ""
1083
+
1084
+ # Count patterns by type (deduplicate by class, keep highest confidence)
1085
+ pattern_counts = {}
1086
+ by_class = {}
1087
+
1088
+ for pattern_file in patterns_data:
1089
+ for pattern in pattern_file.get("patterns", []):
1090
+ ptype = pattern.get("pattern_type", "Unknown")
1091
+ cls = pattern.get("class_name", "")
1092
+ confidence = pattern.get("confidence", 0)
1093
+
1094
+ # Skip low confidence
1095
+ if confidence < 0.7:
1096
+ continue
1097
+
1098
+ # Deduplicate by class, counting each class/pattern pair exactly once
1099
+ key = f"{cls}:{ptype}"
1100
+ if key not in by_class:
1101
+ pattern_counts[ptype] = pattern_counts.get(ptype, 0) + 1
1102
+ by_class[key] = pattern
1103
+ elif by_class[key]["confidence"] < confidence:
1104
+ by_class[key] = pattern
1105
+
1106
+ if not pattern_counts:
1107
+ return ""
1108
+
1109
+ content = "### Design Patterns Detected\n\n"
1110
+ content += "*From C3.1 codebase analysis (confidence > 0.7)*\n\n"
1111
+
1112
+ # Top 5 pattern types
1113
+ for ptype, count in sorted(pattern_counts.items(), key=lambda x: x[1], reverse=True)[:5]:
1114
+ content += f"- **{ptype}**: {count} instances\n"
1115
+
1116
+ content += f"\n*Total: {len(by_class)} high-confidence patterns*\n\n"
1117
+ return content
1118
+
1119
+ def _format_code_examples(self, c3_data: dict[str, Any]) -> str:
1120
+ """Format code examples (C3.2)."""
1121
+ examples_data = c3_data.get("test_examples", {})
1122
+ examples = examples_data.get("examples", [])
1123
+
1124
+ if not examples:
1125
+ return ""
1126
+
1127
+ # Filter high-value examples (complexity > 0.7)
1128
+ high_value = [ex for ex in examples if ex.get("complexity_score", 0) > 0.7]
1129
+
1130
+ if not high_value:
1131
+ return ""
1132
+
1133
+ content = "## šŸ“ Code Examples\n\n"
1134
+ content += "*High-quality examples from codebase (C3.2)*\n\n"
1135
+
1136
+ # Top 10 examples
1137
+ for ex in sorted(high_value, key=lambda x: x.get("complexity_score", 0), reverse=True)[:10]:
1138
+ desc = ex.get("description", "Example")
1139
+ lang = ex.get("language", "python")
1140
+ code = ex.get("code", "")
1141
+ complexity = ex.get("complexity_score", 0)
1142
+
1143
+ content += f"**{desc}** (complexity: {complexity:.2f})\n\n"
1144
+ content += f"```{lang}\n{code}\n```\n\n"
1145
+
1146
+ return content
1147
+
1148
+ def _format_api_reference(self, c3_data: dict[str, Any]) -> str:
1149
+ """Format API reference (C2.5)."""
1150
+ api_ref = c3_data.get("api_reference", {})
1151
+
1152
+ if not api_ref:
1153
+ return ""
1154
+
1155
+ content = "## šŸ”§ API Reference\n\n"
1156
+ content += "*Extracted from codebase analysis (C2.5)*\n\n"
1157
+
1158
+ # Top 5 modules
1159
+ for module_name, module_md in list(api_ref.items())[:5]:
1160
+ content += f"### {module_name}\n\n"
1161
+ # First 500 chars of module documentation
1162
+ content += module_md[:500]
1163
+ if len(module_md) > 500:
1164
+ content += "...\n\n"
1165
+ else:
1166
+ content += "\n\n"
1167
+
1168
+ content += "*See `references/codebase_analysis/api_reference/` for complete API docs*\n\n"
1169
+ return content
1170
+
1171
+ def _format_architecture(self, c3_data: dict[str, Any]) -> str:
1172
+ """Format architecture overview (C3.7)."""
1173
+ arch_data = c3_data.get("architecture", {})
1174
+
1175
+ if not arch_data:
1176
+ return ""
1177
+
1178
+ content = "## šŸ—ļø Architecture Overview\n\n"
1179
+ content += "*From C3.7 codebase analysis*\n\n"
1180
+
1181
+ # Architecture patterns
1182
+ patterns = arch_data.get("patterns", [])
1183
+ if patterns:
1184
+ content += "**Architectural Patterns:**\n"
1185
+ for pattern in patterns[:5]:
1186
+ content += (
1187
+ f"- {pattern.get('name', 'Unknown')}: {pattern.get('description', 'N/A')}\n"
1188
+ )
1189
+ content += "\n"
1190
+
1191
+ # Dependencies (C2.6)
1192
+ dep_data = c3_data.get("dependency_graph", {})
1193
+ if dep_data:
1194
+ total_deps = dep_data.get("total_dependencies", 0)
1195
+ circular = len(dep_data.get("circular_dependencies", []))
1196
+ if total_deps > 0:
1197
+ content += f"**Dependencies:** {total_deps} total"
1198
+ if circular > 0:
1199
+ content += f" (āš ļø {circular} circular dependencies detected)"
1200
+ content += "\n\n"
1201
+
1202
+ content += "*See `references/codebase_analysis/ARCHITECTURE.md` for complete overview*\n\n"
1203
+ return content
1204
+
1205
+ def _format_known_issues(self) -> str:
1206
+ """Format known issues from GitHub."""
1207
+ issues = self.data.get("issues", [])
1208
+
1209
+ if not issues:
1210
+ return ""
1211
+
1212
+ content = "## āš ļø Known Issues\n\n"
1213
+ content += "*Recent issues from GitHub*\n\n"
1214
+
1215
+ # Top 5 issues
1216
+ for issue in issues[:5]:
1217
+ title = issue.get("title", "Untitled")
1218
+ number = issue.get("number", 0)
1219
+ labels = ", ".join(issue.get("labels", []))
1220
+ content += f"- **#{number}**: {title}"
1221
+ if labels:
1222
+ content += f" [`{labels}`]"
1223
+ content += "\n"
1224
+
1225
+ content += "\n*See `references/issues.md` for complete list*\n\n"
1226
+ return content
1227
+
1228
+ def _generate_references(self):
1229
+ """Generate all reference files."""
1230
+ # README
1231
+ if self.data.get("readme"):
1232
+ readme_path = f"{self.skill_dir}/references/README.md"
1233
+ with open(readme_path, "w", encoding="utf-8") as f:
1234
+ f.write(self.data["readme"])
1235
+ logger.info(f"Generated: {readme_path}")
1236
+
1237
+ # CHANGELOG
1238
+ if self.data.get("changelog"):
1239
+ changelog_path = f"{self.skill_dir}/references/CHANGELOG.md"
1240
+ with open(changelog_path, "w", encoding="utf-8") as f:
1241
+ f.write(self.data["changelog"])
1242
+ logger.info(f"Generated: {changelog_path}")
1243
+
1244
+ # Issues
1245
+ if self.data.get("issues"):
1246
+ self._generate_issues_reference()
1247
+
1248
+ # Releases
1249
+ if self.data.get("releases"):
1250
+ self._generate_releases_reference()
1251
+
1252
+ # File structure
1253
+ if self.data.get("file_tree"):
1254
+ self._generate_file_structure_reference()
1255
+
1256
+ def _generate_issues_reference(self):
1257
+ """Generate issues.md reference file."""
1258
+ issues = self.data["issues"]
1259
+
1260
+ content = f"# GitHub Issues\n\nRecent issues from the repository ({len(issues)} total).\n\n"
1261
+
1262
+ # Group by state
1263
+ open_issues = [i for i in issues if i["state"] == "open"]
1264
+ closed_issues = [i for i in issues if i["state"] == "closed"]
1265
+
1266
+ content += f"## Open Issues ({len(open_issues)})\n\n"
1267
+ for issue in open_issues[:20]:
1268
+ labels = ", ".join(issue["labels"]) if issue["labels"] else "No labels"
1269
+ content += f"### #{issue['number']}: {issue['title']}\n"
1270
+ content += f"**Labels:** {labels} | **Created:** {issue['created_at'][:10]}\n"
1271
+ content += f"[View on GitHub]({issue['url']})\n\n"
1272
+
1273
+ content += f"\n## Recently Closed Issues ({len(closed_issues)})\n\n"
1274
+ for issue in closed_issues[:10]:
1275
+ labels = ", ".join(issue["labels"]) if issue["labels"] else "No labels"
1276
+ content += f"### #{issue['number']}: {issue['title']}\n"
1277
+ content += f"**Labels:** {labels} | **Closed:** {issue['closed_at'][:10]}\n"
1278
+ content += f"[View on GitHub]({issue['url']})\n\n"
1279
+
1280
+ issues_path = f"{self.skill_dir}/references/issues.md"
1281
+ with open(issues_path, "w", encoding="utf-8") as f:
1282
+ f.write(content)
1283
+ logger.info(f"Generated: {issues_path}")
1284
+
1285
+ def _generate_releases_reference(self):
1286
+ """Generate releases.md reference file."""
1287
+ releases = self.data["releases"]
1288
+
1289
+ content = (
1290
+ f"# Releases\n\nVersion history for this repository ({len(releases)} releases).\n\n"
1291
+ )
1292
+
1293
+ for release in releases:
1294
+ content += f"## {release['tag_name']}: {release['name']}\n"
1295
+ content += f"**Published:** {release['published_at'][:10]}\n"
1296
+ if release["prerelease"]:
1297
+ content += "**Pre-release**\n"
1298
+ content += f"\n{release['body']}\n\n"
1299
+ content += f"[View on GitHub]({release['url']})\n\n---\n\n"
1300
+
1301
+ releases_path = f"{self.skill_dir}/references/releases.md"
1302
+ with open(releases_path, "w", encoding="utf-8") as f:
1303
+ f.write(content)
1304
+ logger.info(f"Generated: {releases_path}")
1305
+
1306
+ def _generate_file_structure_reference(self):
1307
+ """Generate file_structure.md reference file."""
1308
+ file_tree = self.data["file_tree"]
1309
+
1310
+ content = "# Repository File Structure\n\n"
1311
+ content += f"Total items: {len(file_tree)}\n\n"
1312
+ content += "```\n"
1313
+
1314
+ # Build tree structure
1315
+ for item in file_tree:
1316
+ indent = " " * item["path"].count("/")
1317
+ icon = "šŸ“" if item["type"] == "dir" else "šŸ“„"
1318
+ content += f"{indent}{icon} {os.path.basename(item['path'])}\n"
1319
+
1320
+ content += "```\n"
1321
+
1322
+ structure_path = f"{self.skill_dir}/references/file_structure.md"
1323
+ with open(structure_path, "w", encoding="utf-8") as f:
1324
+ f.write(content)
1325
+ logger.info(f"Generated: {structure_path}")
1326
+
1327
+
1328
+ def main():
1329
+ """C1.10: CLI tool entry point."""
1330
+ parser = argparse.ArgumentParser(
1331
+ description="GitHub Repository to Claude Skill Converter",
1332
+ formatter_class=argparse.RawDescriptionHelpFormatter,
1333
+ epilog="""
1334
+ Examples:
1335
+ skill-seekers github --repo facebook/react
1336
+ skill-seekers github --config configs/react_github.json
1337
+ skill-seekers github --repo owner/repo --token $GITHUB_TOKEN
1338
+ """,
1339
+ )
1340
+
1341
+ parser.add_argument("--repo", help="GitHub repository (owner/repo)")
1342
+ parser.add_argument("--config", help="Path to config JSON file")
1343
+ parser.add_argument("--token", help="GitHub personal access token")
1344
+ parser.add_argument("--name", help="Skill name (default: repo name)")
1345
+ parser.add_argument("--description", help="Skill description")
1346
+ parser.add_argument("--no-issues", action="store_true", help="Skip GitHub issues")
1347
+ parser.add_argument("--no-changelog", action="store_true", help="Skip CHANGELOG")
1348
+ parser.add_argument("--no-releases", action="store_true", help="Skip releases")
1349
+ parser.add_argument("--max-issues", type=int, default=100, help="Max issues to fetch")
1350
+ parser.add_argument("--scrape-only", action="store_true", help="Only scrape, don't build skill")
1351
+ parser.add_argument(
1352
+ "--enhance",
1353
+ action="store_true",
1354
+ help="Enhance SKILL.md using Claude API after building (requires API key)",
1355
+ )
1356
+ parser.add_argument(
1357
+ "--enhance-local",
1358
+ action="store_true",
1359
+ help="Enhance SKILL.md using Claude Code (no API key needed)",
1360
+ )
1361
+ parser.add_argument(
1362
+ "--api-key", type=str, help="Anthropic API key for --enhance (or set ANTHROPIC_API_KEY)"
1363
+ )
1364
+ parser.add_argument(
1365
+ "--non-interactive",
1366
+ action="store_true",
1367
+ help="Non-interactive mode for CI/CD (fail fast on rate limits)",
1368
+ )
1369
+ parser.add_argument("--profile", type=str, help="GitHub profile name to use from config")
1370
+
1371
+ args = parser.parse_args()
1372
+
1373
+ # Build config from args or file
1374
+ if args.config:
1375
+ with open(args.config, encoding="utf-8") as f:
1376
+ config = json.load(f)
1377
+ # Override with CLI args if provided
1378
+ if args.non_interactive:
1379
+ config["interactive"] = False
1380
+ if args.profile:
1381
+ config["github_profile"] = args.profile
1382
+ elif args.repo:
1383
+ config = {
1384
+ "repo": args.repo,
1385
+ "name": args.name or args.repo.split("/")[-1],
1386
+ "description": args.description or f"Use when working with {args.repo.split('/')[-1]}",
1387
+ "github_token": args.token,
1388
+ "include_issues": not args.no_issues,
1389
+ "include_changelog": not args.no_changelog,
1390
+ "include_releases": not args.no_releases,
1391
+ "max_issues": args.max_issues,
1392
+ "interactive": not args.non_interactive,
1393
+ "github_profile": args.profile,
1394
+ }
1395
+ else:
1396
+ parser.error("Either --repo or --config is required")
1397
+
1398
+ try:
1399
+ # Phase 1: Scrape GitHub repository
1400
+ scraper = GitHubScraper(config)
1401
+ scraper.scrape()
1402
+
1403
+ if args.scrape_only:
1404
+ logger.info("Scrape complete (--scrape-only mode)")
1405
+ return
1406
+
1407
+ # Phase 2: Build skill
1408
+ converter = GitHubToSkillConverter(config)
1409
+ converter.build_skill()
1410
+
1411
+ skill_name = config.get("name", config["repo"].split("/")[-1])
1412
+ skill_dir = f"output/{skill_name}"
1413
+
1414
+ # Phase 3: Optional enhancement
1415
+ if args.enhance or args.enhance_local:
1416
+ logger.info("\nšŸ“ Enhancing SKILL.md with Claude...")
1417
+
1418
+ if args.enhance_local:
1419
+ # Local enhancement using Claude Code
1420
+ from pathlib import Path
1421
+
1422
+ from skill_seekers.cli.enhance_skill_local import LocalSkillEnhancer
1423
+
1424
+ enhancer = LocalSkillEnhancer(Path(skill_dir))
1425
+ enhancer.run(headless=True)
1426
+ logger.info("āœ… Local enhancement complete!")
1427
+
1428
+ elif args.enhance:
1429
+ # API-based enhancement
1430
+ import os
1431
+
1432
+ api_key = args.api_key or os.environ.get("ANTHROPIC_API_KEY")
1433
+ if not api_key:
1434
+ logger.error(
1435
+ "āŒ ANTHROPIC_API_KEY not set. Use --api-key or set environment variable."
1436
+ )
1437
+ logger.info("šŸ’” Tip: Use --enhance-local instead (no API key needed)")
1438
+ else:
1439
+ # Import and run API enhancement
1440
+ try:
1441
+ from skill_seekers.cli.enhance_skill import enhance_skill_md
1442
+
1443
+ enhance_skill_md(skill_dir, api_key)
1444
+ logger.info("āœ… API enhancement complete!")
1445
+ except ImportError:
1446
+ logger.error(
1447
+ "āŒ API enhancement not available. Install: pip install anthropic"
1448
+ )
1449
+ logger.info("šŸ’” Tip: Use --enhance-local instead (no API key needed)")
1450
+
1451
+ logger.info(f"\nāœ… Success! Skill created at: {skill_dir}/")
1452
+
1453
+ if not (args.enhance or args.enhance_local):
1454
+ logger.info("\nšŸ’” Optional: Enhance SKILL.md with Claude:")
1455
+ logger.info(f" Local (recommended): skill-seekers enhance {skill_dir}/")
1456
+ logger.info(" or re-run with: --enhance-local")
1457
+
1458
+ logger.info(f"\nNext step: skill-seekers package {skill_dir}/")
1459
+
1460
+ except Exception as e:
1461
+ logger.error(f"Error: {e}")
1462
+ sys.exit(1)
1463
+
1464
+
1465
+ if __name__ == "__main__":
1466
+ main()
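End to end, `main()` boils down to the two-phase flow below, so the same pipeline can be driven from Python (a sketch; the repo name is a placeholder, and `--scrape-only` corresponds to stopping after `scrape()`):

```python
from skill_seekers.cli.github_scraper import GitHubScraper, GitHubToSkillConverter

config = {"repo": "owner/repo", "name": "repo"}

GitHubScraper(config).scrape()                # Phase 1: output/repo_github_data.json
GitHubToSkillConverter(config).build_skill()  # Phase 2: output/repo/SKILL.md + references/

# Then package it: skill-seekers package output/repo/
```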