skill_seekers-2.7.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. skill_seekers/__init__.py +22 -0
  2. skill_seekers/cli/__init__.py +39 -0
  3. skill_seekers/cli/adaptors/__init__.py +120 -0
  4. skill_seekers/cli/adaptors/base.py +221 -0
  5. skill_seekers/cli/adaptors/claude.py +485 -0
  6. skill_seekers/cli/adaptors/gemini.py +453 -0
  7. skill_seekers/cli/adaptors/markdown.py +269 -0
  8. skill_seekers/cli/adaptors/openai.py +503 -0
  9. skill_seekers/cli/ai_enhancer.py +310 -0
  10. skill_seekers/cli/api_reference_builder.py +373 -0
  11. skill_seekers/cli/architectural_pattern_detector.py +525 -0
  12. skill_seekers/cli/code_analyzer.py +1462 -0
  13. skill_seekers/cli/codebase_scraper.py +1225 -0
  14. skill_seekers/cli/config_command.py +563 -0
  15. skill_seekers/cli/config_enhancer.py +431 -0
  16. skill_seekers/cli/config_extractor.py +871 -0
  17. skill_seekers/cli/config_manager.py +452 -0
  18. skill_seekers/cli/config_validator.py +394 -0
  19. skill_seekers/cli/conflict_detector.py +528 -0
  20. skill_seekers/cli/constants.py +72 -0
  21. skill_seekers/cli/dependency_analyzer.py +757 -0
  22. skill_seekers/cli/doc_scraper.py +2332 -0
  23. skill_seekers/cli/enhance_skill.py +488 -0
  24. skill_seekers/cli/enhance_skill_local.py +1096 -0
  25. skill_seekers/cli/enhance_status.py +194 -0
  26. skill_seekers/cli/estimate_pages.py +433 -0
  27. skill_seekers/cli/generate_router.py +1209 -0
  28. skill_seekers/cli/github_fetcher.py +534 -0
  29. skill_seekers/cli/github_scraper.py +1466 -0
  30. skill_seekers/cli/guide_enhancer.py +723 -0
  31. skill_seekers/cli/how_to_guide_builder.py +1267 -0
  32. skill_seekers/cli/install_agent.py +461 -0
  33. skill_seekers/cli/install_skill.py +178 -0
  34. skill_seekers/cli/language_detector.py +614 -0
  35. skill_seekers/cli/llms_txt_detector.py +60 -0
  36. skill_seekers/cli/llms_txt_downloader.py +104 -0
  37. skill_seekers/cli/llms_txt_parser.py +150 -0
  38. skill_seekers/cli/main.py +558 -0
  39. skill_seekers/cli/markdown_cleaner.py +132 -0
  40. skill_seekers/cli/merge_sources.py +806 -0
  41. skill_seekers/cli/package_multi.py +77 -0
  42. skill_seekers/cli/package_skill.py +241 -0
  43. skill_seekers/cli/pattern_recognizer.py +1825 -0
  44. skill_seekers/cli/pdf_extractor_poc.py +1166 -0
  45. skill_seekers/cli/pdf_scraper.py +617 -0
  46. skill_seekers/cli/quality_checker.py +519 -0
  47. skill_seekers/cli/rate_limit_handler.py +438 -0
  48. skill_seekers/cli/resume_command.py +160 -0
  49. skill_seekers/cli/run_tests.py +230 -0
  50. skill_seekers/cli/setup_wizard.py +93 -0
  51. skill_seekers/cli/split_config.py +390 -0
  52. skill_seekers/cli/swift_patterns.py +560 -0
  53. skill_seekers/cli/test_example_extractor.py +1081 -0
  54. skill_seekers/cli/test_unified_simple.py +179 -0
  55. skill_seekers/cli/unified_codebase_analyzer.py +572 -0
  56. skill_seekers/cli/unified_scraper.py +932 -0
  57. skill_seekers/cli/unified_skill_builder.py +1605 -0
  58. skill_seekers/cli/upload_skill.py +162 -0
  59. skill_seekers/cli/utils.py +432 -0
  60. skill_seekers/mcp/__init__.py +33 -0
  61. skill_seekers/mcp/agent_detector.py +316 -0
  62. skill_seekers/mcp/git_repo.py +273 -0
  63. skill_seekers/mcp/server.py +231 -0
  64. skill_seekers/mcp/server_fastmcp.py +1249 -0
  65. skill_seekers/mcp/server_legacy.py +2302 -0
  66. skill_seekers/mcp/source_manager.py +285 -0
  67. skill_seekers/mcp/tools/__init__.py +115 -0
  68. skill_seekers/mcp/tools/config_tools.py +251 -0
  69. skill_seekers/mcp/tools/packaging_tools.py +826 -0
  70. skill_seekers/mcp/tools/scraping_tools.py +842 -0
  71. skill_seekers/mcp/tools/source_tools.py +828 -0
  72. skill_seekers/mcp/tools/splitting_tools.py +212 -0
  73. skill_seekers/py.typed +0 -0
  74. skill_seekers-2.7.3.dist-info/METADATA +2027 -0
  75. skill_seekers-2.7.3.dist-info/RECORD +79 -0
  76. skill_seekers-2.7.3.dist-info/WHEEL +5 -0
  77. skill_seekers-2.7.3.dist-info/entry_points.txt +19 -0
  78. skill_seekers-2.7.3.dist-info/licenses/LICENSE +21 -0
  79. skill_seekers-2.7.3.dist-info/top_level.txt +1 -0
skill_seekers/cli/unified_scraper.py
@@ -0,0 +1,932 @@
+#!/usr/bin/env python3
+"""
+Unified Multi-Source Scraper
+
+Orchestrates scraping from multiple sources (documentation, GitHub, PDF),
+detects conflicts, merges intelligently, and builds unified skills.
+
+This is the main entry point for unified config workflow.
+
+Usage:
+    skill-seekers unified --config configs/godot_unified.json
+    skill-seekers unified --config configs/react_unified.json --merge-mode claude-enhanced
+"""
+
+import argparse
+import json
+import logging
+import os
+import shutil
+import subprocess
+import sys
+from pathlib import Path
+from typing import Any
+
+# Import validators and scrapers
+try:
+    from skill_seekers.cli.config_validator import validate_config
+    from skill_seekers.cli.conflict_detector import ConflictDetector
+    from skill_seekers.cli.merge_sources import ClaudeEnhancedMerger, RuleBasedMerger
+    from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
+except ImportError as e:
+    print(f"Error importing modules: {e}")
+    print("Make sure you're running from the project root directory")
+    sys.exit(1)
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+
+
+class UnifiedScraper:
+    """
+    Orchestrates multi-source scraping and merging.
+
+    Main workflow:
+    1. Load and validate unified config
+    2. Scrape all sources (docs, GitHub, PDF)
+    3. Detect conflicts between sources
+    4. Merge intelligently (rule-based or Claude-enhanced)
+    5. Build unified skill
+    """
+
+    def __init__(self, config_path: str, merge_mode: str | None = None):
+        """
+        Initialize unified scraper.
+
+        Args:
+            config_path: Path to unified config JSON
+            merge_mode: Override config merge_mode ('rule-based' or 'claude-enhanced')
+        """
+        self.config_path = config_path
+
+        # Validate and load config
+        logger.info(f"Loading config: {config_path}")
+        self.validator = validate_config(config_path)
+        self.config = self.validator.config
+
+        # Determine merge mode
+        self.merge_mode = merge_mode or self.config.get("merge_mode", "rule-based")
+        logger.info(f"Merge mode: {self.merge_mode}")
+
+        # Storage for scraped data - use lists to support multiple sources of same type
+        self.scraped_data = {
+            "documentation": [],  # List of doc sources
+            "github": [],  # List of github sources
+            "pdf": [],  # List of pdf sources
+        }
+
+        # Track source index for unique naming (multi-source support)
+        self._source_counters = {"documentation": 0, "github": 0, "pdf": 0}
+
+        # Output paths - cleaner organization
+        self.name = self.config["name"]
+        self.output_dir = f"output/{self.name}"  # Final skill only
+
+        # Use hidden cache directory for intermediate files
+        self.cache_dir = f".skillseeker-cache/{self.name}"
+        self.sources_dir = f"{self.cache_dir}/sources"
+        self.data_dir = f"{self.cache_dir}/data"
+        self.repos_dir = f"{self.cache_dir}/repos"
+        self.logs_dir = f"{self.cache_dir}/logs"
+
+        # Create directories
+        os.makedirs(self.output_dir, exist_ok=True)
+        os.makedirs(self.sources_dir, exist_ok=True)
+        os.makedirs(self.data_dir, exist_ok=True)
+        os.makedirs(self.repos_dir, exist_ok=True)
+        os.makedirs(self.logs_dir, exist_ok=True)
+
+        # Setup file logging
+        self._setup_logging()
+
+    def _setup_logging(self):
+        """Setup file logging for this scraping session."""
+        from datetime import datetime
+
+        # Create log filename with timestamp
+        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        log_file = f"{self.logs_dir}/unified_{timestamp}.log"
+
+        # Add file handler to root logger
+        file_handler = logging.FileHandler(log_file, encoding="utf-8")
+        file_handler.setLevel(logging.DEBUG)
+
+        # Create formatter
+        formatter = logging.Formatter(
+            "%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
+        )
+        file_handler.setFormatter(formatter)
+
+        # Add to root logger
+        logging.getLogger().addHandler(file_handler)
+
+        logger.info(f"📝 Logging to: {log_file}")
+        logger.info(f"🗂️ Cache directory: {self.cache_dir}")
+
+    def scrape_all_sources(self):
+        """
+        Scrape all configured sources.
+
+        Routes to appropriate scraper based on source type.
+        """
+        logger.info("=" * 60)
+        logger.info("PHASE 1: Scraping all sources")
+        logger.info("=" * 60)
+
+        if not self.validator.is_unified:
+            logger.warning("Config is not unified format, converting...")
+            self.config = self.validator.convert_legacy_to_unified()
+
+        sources = self.config.get("sources", [])
+
+        for i, source in enumerate(sources):
+            source_type = source["type"]
+            logger.info(f"\n[{i + 1}/{len(sources)}] Scraping {source_type} source...")
+
+            try:
+                if source_type == "documentation":
+                    self._scrape_documentation(source)
+                elif source_type == "github":
+                    self._scrape_github(source)
+                elif source_type == "pdf":
+                    self._scrape_pdf(source)
+                else:
+                    logger.warning(f"Unknown source type: {source_type}")
+            except Exception as e:
+                logger.error(f"Error scraping {source_type}: {e}")
+                logger.info("Continuing with other sources...")
+
+        logger.info(f"\n✅ Scraped {len(self.scraped_data)} sources successfully")
+
+    def _scrape_documentation(self, source: dict[str, Any]):
+        """Scrape documentation website."""
+        # Create temporary config for doc scraper
+        doc_config = {
+            "name": f"{self.name}_docs",
+            "base_url": source["base_url"],
+            "selectors": source.get("selectors", {}),
+            "url_patterns": source.get("url_patterns", {}),
+            "categories": source.get("categories", {}),
+            "rate_limit": source.get("rate_limit", 0.5),
+            "max_pages": source.get("max_pages", 100),
+        }
+
+        # Pass through llms.txt settings (so unified configs behave the same as doc_scraper configs)
+        if "llms_txt_url" in source:
+            doc_config["llms_txt_url"] = source.get("llms_txt_url")
+
+        if "skip_llms_txt" in source:
+            doc_config["skip_llms_txt"] = source.get("skip_llms_txt")
+
+        # Optional: support overriding start URLs
+        if "start_urls" in source:
+            doc_config["start_urls"] = source.get("start_urls")
+
+        # Write temporary config
+        temp_config_path = os.path.join(self.data_dir, "temp_docs_config.json")
+        with open(temp_config_path, "w", encoding="utf-8") as f:
+            json.dump(doc_config, f, indent=2)
+
+        # Run doc_scraper as subprocess
+        logger.info(f"Scraping documentation from {source['base_url']}")
+
+        doc_scraper_path = Path(__file__).parent / "doc_scraper.py"
+        cmd = [sys.executable, str(doc_scraper_path), "--config", temp_config_path, "--fresh"]
+
+        result = subprocess.run(cmd, capture_output=True, text=True, stdin=subprocess.DEVNULL)
+
+        if result.returncode != 0:
+            logger.error(f"Documentation scraping failed with return code {result.returncode}")
+            logger.error(f"STDERR: {result.stderr}")
+            logger.error(f"STDOUT: {result.stdout}")
+            return
+
+        # Log subprocess output for debugging
+        if result.stdout:
+            logger.info(f"Doc scraper output: {result.stdout[-500:]}")  # Last 500 chars
+
+        # Load scraped data
+        docs_data_file = f"output/{doc_config['name']}_data/summary.json"
+
+        if os.path.exists(docs_data_file):
+            with open(docs_data_file, encoding="utf-8") as f:
+                summary = json.load(f)
+
+            # Append to documentation list (multi-source support)
+            self.scraped_data["documentation"].append(
+                {
+                    "source_id": doc_config["name"],
+                    "base_url": source["base_url"],
+                    "pages": summary.get("pages", []),
+                    "total_pages": summary.get("total_pages", 0),
+                    "data_file": docs_data_file,
+                    "refs_dir": "",  # Will be set after moving to cache
+                }
+            )
+
+            logger.info(f"✅ Documentation: {summary.get('total_pages', 0)} pages scraped")
+        else:
+            logger.warning("Documentation data file not found")
+
+        # Clean up temp config
+        if os.path.exists(temp_config_path):
+            os.remove(temp_config_path)
+
+        # Move intermediate files to cache to keep output/ clean
+        docs_output_dir = f"output/{doc_config['name']}"
+        docs_data_dir = f"output/{doc_config['name']}_data"
+
+        if os.path.exists(docs_output_dir):
+            cache_docs_dir = os.path.join(self.sources_dir, f"{doc_config['name']}")
+            if os.path.exists(cache_docs_dir):
+                shutil.rmtree(cache_docs_dir)
+            shutil.move(docs_output_dir, cache_docs_dir)
+            logger.info(f"📦 Moved docs output to cache: {cache_docs_dir}")
+
+            # Update refs_dir in scraped_data with cache location
+            refs_dir_path = os.path.join(cache_docs_dir, "references")
+            if self.scraped_data["documentation"]:
+                self.scraped_data["documentation"][-1]["refs_dir"] = refs_dir_path
+
+        if os.path.exists(docs_data_dir):
+            cache_data_dir = os.path.join(self.data_dir, f"{doc_config['name']}_data")
+            if os.path.exists(cache_data_dir):
+                shutil.rmtree(cache_data_dir)
+            shutil.move(docs_data_dir, cache_data_dir)
+            logger.info(f"📦 Moved docs data to cache: {cache_data_dir}")
+
+    def _clone_github_repo(self, repo_name: str, idx: int = 0) -> str | None:
+        """
+        Clone GitHub repository to cache directory for C3.x analysis.
+        Reuses existing clone if already present.
+
+        Args:
+            repo_name: GitHub repo in format "owner/repo"
+            idx: Source index for unique naming when multiple repos
+
+        Returns:
+            Path to cloned repo, or None if clone failed
+        """
+        # Clone to cache repos folder for future reuse
+        repo_dir_name = f"{idx}_{repo_name.replace('/', '_')}"  # e.g., 0_encode_httpx
+        clone_path = os.path.join(self.repos_dir, repo_dir_name)
+
+        # Check if already cloned
+        if os.path.exists(clone_path) and os.path.isdir(os.path.join(clone_path, ".git")):
+            logger.info(f"♻️ Found existing repository clone: {clone_path}")
+            logger.info(" Reusing for C3.x analysis (skip re-cloning)")
+            return clone_path
+
+        # repos_dir already created in __init__
+
+        # Clone repo (full clone, not shallow - for complete analysis)
+        repo_url = f"https://github.com/{repo_name}.git"
+        logger.info(f"🔄 Cloning repository for C3.x analysis: {repo_url}")
+        logger.info(f" → {clone_path}")
+        logger.info(" 💾 Clone will be saved for future reuse")
+
+        try:
+            result = subprocess.run(
+                ["git", "clone", repo_url, clone_path],
+                capture_output=True,
+                text=True,
+                timeout=600,  # 10 minute timeout for full clone
+            )
+
+            if result.returncode == 0:
+                logger.info("✅ Repository cloned successfully")
+                logger.info(f" 📁 Saved to: {clone_path}")
+                return clone_path
+            else:
+                logger.error(f"❌ Git clone failed: {result.stderr}")
+                # Clean up failed clone
+                if os.path.exists(clone_path):
+                    shutil.rmtree(clone_path)
+                return None
+
+        except subprocess.TimeoutExpired:
+            logger.error("❌ Git clone timed out after 10 minutes")
+            if os.path.exists(clone_path):
+                shutil.rmtree(clone_path)
+            return None
+        except Exception as e:
+            logger.error(f"❌ Git clone failed: {e}")
+            if os.path.exists(clone_path):
+                shutil.rmtree(clone_path)
+            return None
+
+    def _scrape_github(self, source: dict[str, Any]):
+        """Scrape GitHub repository."""
+        try:
+            from skill_seekers.cli.github_scraper import GitHubScraper
+        except ImportError:
+            logger.error("github_scraper.py not found")
+            return
+
+        # Multi-source support: Get unique index for this GitHub source
+        idx = self._source_counters["github"]
+        self._source_counters["github"] += 1
+
+        # Extract repo identifier for unique naming
+        repo = source["repo"]
+        repo_id = repo.replace("/", "_")
+
+        # Check if we need to clone for C3.x analysis
+        enable_codebase_analysis = source.get("enable_codebase_analysis", True)
+        local_repo_path = source.get("local_repo_path")
+        cloned_repo_path = None
+
+        # Auto-clone if C3.x analysis is enabled but no local path provided
+        if enable_codebase_analysis and not local_repo_path:
+            logger.info("🔬 C3.x codebase analysis enabled - cloning repository...")
+            cloned_repo_path = self._clone_github_repo(repo, idx=idx)
+            if cloned_repo_path:
+                local_repo_path = cloned_repo_path
+                logger.info(f"✅ Using cloned repo for C3.x analysis: {local_repo_path}")
+            else:
+                logger.warning("⚠️ Failed to clone repo - C3.x analysis will be skipped")
+                enable_codebase_analysis = False
+
+        # Create config for GitHub scraper
+        github_config = {
+            "repo": repo,
+            "name": f"{self.name}_github_{idx}_{repo_id}",
+            "github_token": source.get("github_token"),
+            "include_issues": source.get("include_issues", True),
+            "max_issues": source.get("max_issues", 100),
+            "include_changelog": source.get("include_changelog", True),
+            "include_releases": source.get("include_releases", True),
+            "include_code": source.get("include_code", True),
+            "code_analysis_depth": source.get("code_analysis_depth", "surface"),
+            "file_patterns": source.get("file_patterns", []),
+            "local_repo_path": local_repo_path,  # Use cloned path if available
+        }
+
+        # Pass directory exclusions if specified (optional)
+        if "exclude_dirs" in source:
+            github_config["exclude_dirs"] = source["exclude_dirs"]
+        if "exclude_dirs_additional" in source:
+            github_config["exclude_dirs_additional"] = source["exclude_dirs_additional"]
+
+        # Scrape
+        logger.info(f"Scraping GitHub repository: {source['repo']}")
+        scraper = GitHubScraper(github_config)
+        github_data = scraper.scrape()
+
+        # Run C3.x codebase analysis if enabled and local_repo_path available
+        if enable_codebase_analysis and local_repo_path:
+            logger.info("🔬 Running C3.x codebase analysis...")
+            try:
+                c3_data = self._run_c3_analysis(local_repo_path, source)
+                if c3_data:
+                    github_data["c3_analysis"] = c3_data
+                    logger.info("✅ C3.x analysis complete")
+                else:
+                    logger.warning("⚠️ C3.x analysis returned no data")
+            except Exception as e:
+                logger.warning(f"⚠️ C3.x analysis failed: {e}")
+                import traceback
+
+                logger.debug(f"Traceback: {traceback.format_exc()}")
+                # Continue without C3.x data - graceful degradation
+
+        # Note: We keep the cloned repo in output/ for future reuse
+        if cloned_repo_path:
+            logger.info(f"📁 Repository clone saved for future use: {cloned_repo_path}")
+
+        # Save data to unified location with unique filename
+        github_data_file = os.path.join(self.data_dir, f"github_data_{idx}_{repo_id}.json")
+        with open(github_data_file, "w", encoding="utf-8") as f:
+            json.dump(github_data, f, indent=2, ensure_ascii=False)
+
+        # ALSO save to the location GitHubToSkillConverter expects (with C3.x data!)
+        converter_data_file = f"output/{github_config['name']}_github_data.json"
+        with open(converter_data_file, "w", encoding="utf-8") as f:
+            json.dump(github_data, f, indent=2, ensure_ascii=False)
+
+        # Append to list instead of overwriting (multi-source support)
+        self.scraped_data["github"].append(
+            {
+                "repo": repo,
+                "repo_id": repo_id,
+                "idx": idx,
+                "data": github_data,
+                "data_file": github_data_file,
+            }
+        )
+
+        # Build standalone SKILL.md for synthesis using GitHubToSkillConverter
+        try:
+            from skill_seekers.cli.github_scraper import GitHubToSkillConverter
+
+            # Use github_config which has the correct name field
+            # Converter will load from output/{name}_github_data.json which now has C3.x data
+            converter = GitHubToSkillConverter(config=github_config)
+            converter.build_skill()
+            logger.info("✅ GitHub: Standalone SKILL.md created")
+        except Exception as e:
+            logger.warning(f"⚠️ Failed to build standalone GitHub SKILL.md: {e}")
+
+        # Move intermediate files to cache to keep output/ clean
+        github_output_dir = f"output/{github_config['name']}"
+        github_data_file_path = f"output/{github_config['name']}_github_data.json"
+
+        if os.path.exists(github_output_dir):
+            cache_github_dir = os.path.join(self.sources_dir, github_config["name"])
+            if os.path.exists(cache_github_dir):
+                shutil.rmtree(cache_github_dir)
+            shutil.move(github_output_dir, cache_github_dir)
+            logger.info(f"📦 Moved GitHub output to cache: {cache_github_dir}")
+
+        if os.path.exists(github_data_file_path):
+            cache_github_data = os.path.join(
+                self.data_dir, f"{github_config['name']}_github_data.json"
+            )
+            if os.path.exists(cache_github_data):
+                os.remove(cache_github_data)
+            shutil.move(github_data_file_path, cache_github_data)
+            logger.info(f"📦 Moved GitHub data to cache: {cache_github_data}")
+
+        logger.info("✅ GitHub: Repository scraped successfully")
+
+    def _scrape_pdf(self, source: dict[str, Any]):
+        """Scrape PDF document."""
+        try:
+            from skill_seekers.cli.pdf_scraper import PDFToSkillConverter
+        except ImportError:
+            logger.error("pdf_scraper.py not found")
+            return
+
+        # Multi-source support: Get unique index for this PDF source
+        idx = self._source_counters["pdf"]
+        self._source_counters["pdf"] += 1
+
+        # Extract PDF identifier for unique naming (filename without extension)
+        pdf_path = source["path"]
+        pdf_id = os.path.splitext(os.path.basename(pdf_path))[0]
+
+        # Create config for PDF scraper
+        pdf_config = {
+            "name": f"{self.name}_pdf_{idx}_{pdf_id}",
+            "pdf": source["path"],
+            "extract_tables": source.get("extract_tables", False),
+            "ocr": source.get("ocr", False),
+            "password": source.get("password"),
+        }
+
+        # Scrape
+        logger.info(f"Scraping PDF: {source['path']}")
+        converter = PDFToSkillConverter(pdf_config)
+        pdf_data = converter.extract_all()
+
+        # Save data
+        pdf_data_file = os.path.join(self.data_dir, f"pdf_data_{idx}_{pdf_id}.json")
+        with open(pdf_data_file, "w", encoding="utf-8") as f:
+            json.dump(pdf_data, f, indent=2, ensure_ascii=False)
+
+        # Append to list instead of overwriting
+        self.scraped_data["pdf"].append(
+            {
+                "pdf_path": pdf_path,
+                "pdf_id": pdf_id,
+                "idx": idx,
+                "data": pdf_data,
+                "data_file": pdf_data_file,
+            }
+        )
+
+        # Build standalone SKILL.md for synthesis
+        try:
+            converter.build_skill()
+            logger.info("✅ PDF: Standalone SKILL.md created")
+        except Exception as e:
+            logger.warning(f"⚠️ Failed to build standalone PDF SKILL.md: {e}")
+
+        logger.info(f"✅ PDF: {len(pdf_data.get('pages', []))} pages extracted")
+
+    def _load_json(self, file_path: Path) -> dict:
+        """
+        Load JSON file safely.
+
+        Args:
+            file_path: Path to JSON file
+
+        Returns:
+            Dict with JSON data, or empty dict if file doesn't exist or is invalid
+        """
+        if not file_path.exists():
+            logger.warning(f"JSON file not found: {file_path}")
+            return {}
+
+        try:
+            with open(file_path, encoding="utf-8") as f:
+                return json.load(f)
+        except (OSError, json.JSONDecodeError) as e:
+            logger.warning(f"Failed to load JSON {file_path}: {e}")
+            return {}
+
+    def _load_guide_collection(self, tutorials_dir: Path) -> dict:
+        """
+        Load how-to guide collection from tutorials directory.
+
+        Args:
+            tutorials_dir: Path to tutorials directory
+
+        Returns:
+            Dict with guide collection data
+        """
+        if not tutorials_dir.exists():
+            logger.warning(f"Tutorials directory not found: {tutorials_dir}")
+            return {"guides": []}
+
+        collection_file = tutorials_dir / "guide_collection.json"
+        if collection_file.exists():
+            return self._load_json(collection_file)
+
+        # Fallback: scan for individual guide JSON files
+        guides = []
+        for guide_file in tutorials_dir.glob("guide_*.json"):
+            guide_data = self._load_json(guide_file)
+            if guide_data:
+                guides.append(guide_data)
+
+        return {"guides": guides, "total_count": len(guides)}
+
+    def _load_api_reference(self, api_dir: Path) -> dict[str, Any]:
+        """
+        Load API reference markdown files from api_reference directory.
+
+        Args:
+            api_dir: Path to api_reference directory
+
+        Returns:
+            Dict mapping module names to markdown content, or empty dict if not found
+        """
+        if not api_dir.exists():
+            logger.debug(f"API reference directory not found: {api_dir}")
+            return {}
+
+        api_refs = {}
+        for md_file in api_dir.glob("*.md"):
+            try:
+                module_name = md_file.stem
+                api_refs[module_name] = md_file.read_text(encoding="utf-8")
+            except OSError as e:
+                logger.warning(f"Failed to read API reference {md_file}: {e}")
+
+        return api_refs
+
+    def _run_c3_analysis(self, local_repo_path: str, source: dict[str, Any]) -> dict[str, Any]:
+        """
+        Run comprehensive C3.x codebase analysis.
+
+        Calls codebase_scraper.analyze_codebase() with all C3.x features enabled,
+        loads the results into memory, and cleans up temporary files.
+
+        Args:
+            local_repo_path: Path to local repository
+            source: GitHub source configuration dict
+
+        Returns:
+            Dict with keys: patterns, test_examples, how_to_guides,
+            config_patterns, architecture
+        """
+        try:
+            from skill_seekers.cli.codebase_scraper import analyze_codebase
+        except ImportError:
+            logger.error("codebase_scraper.py not found")
+            return {}
+
+        # Create temp output dir for C3.x analysis
+        temp_output = Path(self.data_dir) / "c3_analysis_temp"
+        temp_output.mkdir(parents=True, exist_ok=True)
+
+        logger.info(f" Analyzing codebase: {local_repo_path}")
+
+        try:
+            # Run full C3.x analysis
+            _results = analyze_codebase(
+                directory=Path(local_repo_path),
+                output_dir=temp_output,
+                depth="deep",
+                languages=None,  # Analyze all languages
+                file_patterns=source.get("file_patterns"),
+                build_api_reference=True,  # C2.5: API Reference
+                extract_comments=False,  # Not needed
+                build_dependency_graph=True,  # C2.6: Dependency Graph
+                detect_patterns=True,  # C3.1: Design patterns
+                extract_test_examples=True,  # C3.2: Test examples
+                build_how_to_guides=True,  # C3.3: How-to guides
+                extract_config_patterns=True,  # C3.4: Config patterns
+                enhance_with_ai=source.get("ai_mode", "auto") != "none",
+                ai_mode=source.get("ai_mode", "auto"),
+            )
+
+            # Load C3.x outputs into memory
+            c3_data = {
+                "patterns": self._load_json(temp_output / "patterns" / "detected_patterns.json"),
+                "test_examples": self._load_json(
+                    temp_output / "test_examples" / "test_examples.json"
+                ),
+                "how_to_guides": self._load_guide_collection(temp_output / "tutorials"),
+                "config_patterns": self._load_json(
+                    temp_output / "config_patterns" / "config_patterns.json"
+                ),
+                "architecture": self._load_json(
+                    temp_output / "architecture" / "architectural_patterns.json"
+                ),
+                "api_reference": self._load_api_reference(temp_output / "api_reference"),  # C2.5
+                "dependency_graph": self._load_json(
+                    temp_output / "dependencies" / "dependency_graph.json"
+                ),  # C2.6
+            }
+
+            # Log summary
+            total_patterns = sum(len(f.get("patterns", [])) for f in c3_data.get("patterns", []))
+            total_examples = c3_data.get("test_examples", {}).get("total_examples", 0)
+            total_guides = len(c3_data.get("how_to_guides", {}).get("guides", []))
+            total_configs = len(c3_data.get("config_patterns", {}).get("config_files", []))
+            arch_patterns = len(c3_data.get("architecture", {}).get("patterns", []))
+
+            logger.info(f" ✓ Design Patterns: {total_patterns}")
+            logger.info(f" ✓ Test Examples: {total_examples}")
+            logger.info(f" ✓ How-To Guides: {total_guides}")
+            logger.info(f" ✓ Config Files: {total_configs}")
+            logger.info(f" ✓ Architecture Patterns: {arch_patterns}")
+
+            return c3_data
+
+        except Exception as e:
+            logger.error(f"C3.x analysis failed: {e}")
+            import traceback
+
+            traceback.print_exc()
+            return {}
+
+        finally:
+            # Clean up temp directory
+            if temp_output.exists():
+                try:
+                    shutil.rmtree(temp_output)
+                except Exception as e:
+                    logger.warning(f"Failed to clean up temp directory: {e}")
+
+    def detect_conflicts(self) -> list:
+        """
+        Detect conflicts between documentation and code.
+
+        Only applicable if both documentation and GitHub sources exist.
+
+        Returns:
+            List of conflicts
+        """
+        logger.info("\n" + "=" * 60)
+        logger.info("PHASE 2: Detecting conflicts")
+        logger.info("=" * 60)
+
+        if not self.validator.needs_api_merge():
+            logger.info("No API merge needed (only one API source)")
+            return []
+
+        # Get documentation and GitHub data
+        docs_data = self.scraped_data.get("documentation", {})
+        github_data = self.scraped_data.get("github", {})
+
+        if not docs_data or not github_data:
+            logger.warning("Missing documentation or GitHub data for conflict detection")
+            return []
+
+        # Load data files
+        with open(docs_data["data_file"], encoding="utf-8") as f:
+            docs_json = json.load(f)
+
+        with open(github_data["data_file"], encoding="utf-8") as f:
+            github_json = json.load(f)
+
+        # Detect conflicts
+        detector = ConflictDetector(docs_json, github_json)
+        conflicts = detector.detect_all_conflicts()
+
+        # Save conflicts
+        conflicts_file = os.path.join(self.data_dir, "conflicts.json")
+        detector.save_conflicts(conflicts, conflicts_file)
+
+        # Print summary
+        summary = detector.generate_summary(conflicts)
+        logger.info("\n📊 Conflict Summary:")
+        logger.info(f" Total: {summary['total']}")
+        logger.info(" By Type:")
+        for ctype, count in summary["by_type"].items():
+            if count > 0:
+                logger.info(f" - {ctype}: {count}")
+        logger.info(" By Severity:")
+        for severity, count in summary["by_severity"].items():
+            if count > 0:
+                emoji = "🔴" if severity == "high" else "🟡" if severity == "medium" else "🟢"
+                logger.info(f" {emoji} {severity}: {count}")
+
+        return conflicts
+
+    def merge_sources(self, conflicts: list):
+        """
+        Merge data from multiple sources.
+
+        Args:
+            conflicts: List of detected conflicts
+        """
+        logger.info("\n" + "=" * 60)
+        logger.info(f"PHASE 3: Merging sources ({self.merge_mode})")
+        logger.info("=" * 60)
+
+        if not conflicts:
+            logger.info("No conflicts to merge")
+            return None
+
+        # Get data files
+        docs_data = self.scraped_data.get("documentation", {})
+        github_data = self.scraped_data.get("github", {})
+
+        # Load data
+        with open(docs_data["data_file"], encoding="utf-8") as f:
+            docs_json = json.load(f)
+
+        with open(github_data["data_file"], encoding="utf-8") as f:
+            github_json = json.load(f)
+
+        # Choose merger
+        if self.merge_mode == "claude-enhanced":
+            merger = ClaudeEnhancedMerger(docs_json, github_json, conflicts)
+        else:
+            merger = RuleBasedMerger(docs_json, github_json, conflicts)
+
+        # Merge
+        merged_data = merger.merge_all()
+
+        # Save merged data
+        merged_file = os.path.join(self.data_dir, "merged_data.json")
+        with open(merged_file, "w", encoding="utf-8") as f:
+            json.dump(merged_data, f, indent=2, ensure_ascii=False)
+
+        logger.info(f"✅ Merged data saved: {merged_file}")
+
+        return merged_data
+
+    def build_skill(self, merged_data: dict | None = None):
+        """
+        Build final unified skill.
+
+        Args:
+            merged_data: Merged API data (if conflicts were resolved)
+        """
+        logger.info("\n" + "=" * 60)
+        logger.info("PHASE 4: Building unified skill")
+        logger.info("=" * 60)
+
+        # Load conflicts if they exist
+        conflicts = []
+        conflicts_file = os.path.join(self.data_dir, "conflicts.json")
+        if os.path.exists(conflicts_file):
+            with open(conflicts_file, encoding="utf-8") as f:
+                conflicts_data = json.load(f)
+                conflicts = conflicts_data.get("conflicts", [])
+
+        # Build skill
+        builder = UnifiedSkillBuilder(
+            self.config, self.scraped_data, merged_data, conflicts, cache_dir=self.cache_dir
+        )
+
+        builder.build()
+
+        logger.info(f"✅ Unified skill built: {self.output_dir}/")
+
+    def run(self):
+        """
+        Execute complete unified scraping workflow.
+        """
+        logger.info("\n" + "🚀 " * 20)
+        logger.info(f"Unified Scraper: {self.config['name']}")
+        logger.info("🚀 " * 20 + "\n")
+
+        try:
+            # Phase 1: Scrape all sources
+            self.scrape_all_sources()
+
+            # Phase 2: Detect conflicts (if applicable)
+            conflicts = self.detect_conflicts()
+
+            # Phase 3: Merge sources (if conflicts exist)
+            merged_data = None
+            if conflicts:
+                merged_data = self.merge_sources(conflicts)
+
+            # Phase 4: Build skill
+            self.build_skill(merged_data)
+
+            logger.info("\n" + "✅ " * 20)
+            logger.info("Unified scraping complete!")
+            logger.info("✅ " * 20 + "\n")
+
+            logger.info(f"📁 Output: {self.output_dir}/")
+            logger.info(f"📁 Data: {self.data_dir}/")
+
+        except KeyboardInterrupt:
+            logger.info("\n\n⚠️ Scraping interrupted by user")
+            sys.exit(1)
+        except Exception as e:
+            logger.error(f"\n\n❌ Error during scraping: {e}")
+            import traceback
+
+            traceback.print_exc()
+            sys.exit(1)
+
+
+def main():
+    """Main entry point."""
+    parser = argparse.ArgumentParser(
+        description="Unified multi-source scraper",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+# Basic usage with unified config
+skill-seekers unified --config configs/godot_unified.json
+
+# Override merge mode
+skill-seekers unified --config configs/react_unified.json --merge-mode claude-enhanced
+
+# Backward compatible with legacy configs
+skill-seekers unified --config configs/react.json
+        """,
+    )
+
+    parser.add_argument("--config", "-c", required=True, help="Path to unified config JSON file")
+    parser.add_argument(
+        "--merge-mode",
+        "-m",
+        choices=["rule-based", "claude-enhanced"],
+        help="Override config merge mode",
+    )
+    parser.add_argument(
+        "--skip-codebase-analysis",
+        action="store_true",
+        help="Skip C3.x codebase analysis for GitHub sources (default: enabled)",
+    )
+    parser.add_argument(
+        "--fresh",
+        action="store_true",
+        help="Clear any existing data and start fresh (ignore checkpoints)",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Preview what will be scraped without actually scraping",
+    )
+
+    args = parser.parse_args()
+
+    # Create scraper
+    scraper = UnifiedScraper(args.config, args.merge_mode)
+
+    # Disable codebase analysis if requested
+    if args.skip_codebase_analysis:
+        for source in scraper.config.get("sources", []):
+            if source["type"] == "github":
+                source["enable_codebase_analysis"] = False
+                logger.info(
+                    f"⏭️ Skipping codebase analysis for GitHub source: {source.get('repo', 'unknown')}"
+                )
+
+    # Handle --fresh flag (clear cache)
+    if args.fresh:
+        import shutil
+
+        if os.path.exists(scraper.cache_dir):
+            logger.info(f"🧹 Clearing cache: {scraper.cache_dir}")
+            shutil.rmtree(scraper.cache_dir)
+            # Recreate directories
+            os.makedirs(scraper.sources_dir, exist_ok=True)
+            os.makedirs(scraper.data_dir, exist_ok=True)
+            os.makedirs(scraper.repos_dir, exist_ok=True)
+            os.makedirs(scraper.logs_dir, exist_ok=True)
+
+    # Handle --dry-run flag
+    if args.dry_run:
+        logger.info("🔍 DRY RUN MODE - Preview only, no scraping will occur")
+        logger.info(f"\nWould scrape {len(scraper.config.get('sources', []))} sources:")
+        for idx, source in enumerate(scraper.config.get("sources", []), 1):
+            source_type = source.get("type", "unknown")
+            if source_type == "documentation":
+                logger.info(f" {idx}. Documentation: {source.get('base_url', 'N/A')}")
+            elif source_type == "github":
+                logger.info(f" {idx}. GitHub: {source.get('repo', 'N/A')}")
+            elif source_type == "pdf":
+                logger.info(f" {idx}. PDF: {source.get('pdf_path', 'N/A')}")
+        logger.info(f"\nOutput directory: {scraper.output_dir}")
+        logger.info(f"Merge mode: {scraper.merge_mode}")
+        return
+
+    # Run scraper
+    scraper.run()
+
+
+if __name__ == "__main__":
+    main()
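
For orientation only, the snippet below is a minimal, hypothetical sketch of driving this module programmatically instead of through the `skill-seekers unified` CLI shown in the docstring. It is not part of the package; the config keys (name, merge_mode, sources[].type, base_url, repo) mirror the fields UnifiedScraper reads above, while the full schema enforced by validate_config may require additional fields, and the example repo/URL are placeholders.

# Illustrative sketch (not shipped with skill-seekers): write a unified config
# and run the scraper programmatically. Field names mirror what unified_scraper.py
# reads; config_validator may require more than shown here.
import json

from skill_seekers.cli.unified_scraper import UnifiedScraper

config = {
    "name": "httpx",  # used for output/ and .skillseeker-cache/ paths
    "merge_mode": "rule-based",  # or "claude-enhanced"
    "sources": [
        {"type": "documentation", "base_url": "https://www.python-httpx.org/", "max_pages": 50},
        {"type": "github", "repo": "encode/httpx", "enable_codebase_analysis": False},
    ],
}

with open("configs/httpx_unified.json", "w", encoding="utf-8") as f:
    json.dump(config, f, indent=2)

scraper = UnifiedScraper("configs/httpx_unified.json", merge_mode=None)
scraper.run()  # scrape -> detect conflicts -> merge -> build skill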