skill_seekers-2.7.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skill_seekers/__init__.py +22 -0
- skill_seekers/cli/__init__.py +39 -0
- skill_seekers/cli/adaptors/__init__.py +120 -0
- skill_seekers/cli/adaptors/base.py +221 -0
- skill_seekers/cli/adaptors/claude.py +485 -0
- skill_seekers/cli/adaptors/gemini.py +453 -0
- skill_seekers/cli/adaptors/markdown.py +269 -0
- skill_seekers/cli/adaptors/openai.py +503 -0
- skill_seekers/cli/ai_enhancer.py +310 -0
- skill_seekers/cli/api_reference_builder.py +373 -0
- skill_seekers/cli/architectural_pattern_detector.py +525 -0
- skill_seekers/cli/code_analyzer.py +1462 -0
- skill_seekers/cli/codebase_scraper.py +1225 -0
- skill_seekers/cli/config_command.py +563 -0
- skill_seekers/cli/config_enhancer.py +431 -0
- skill_seekers/cli/config_extractor.py +871 -0
- skill_seekers/cli/config_manager.py +452 -0
- skill_seekers/cli/config_validator.py +394 -0
- skill_seekers/cli/conflict_detector.py +528 -0
- skill_seekers/cli/constants.py +72 -0
- skill_seekers/cli/dependency_analyzer.py +757 -0
- skill_seekers/cli/doc_scraper.py +2332 -0
- skill_seekers/cli/enhance_skill.py +488 -0
- skill_seekers/cli/enhance_skill_local.py +1096 -0
- skill_seekers/cli/enhance_status.py +194 -0
- skill_seekers/cli/estimate_pages.py +433 -0
- skill_seekers/cli/generate_router.py +1209 -0
- skill_seekers/cli/github_fetcher.py +534 -0
- skill_seekers/cli/github_scraper.py +1466 -0
- skill_seekers/cli/guide_enhancer.py +723 -0
- skill_seekers/cli/how_to_guide_builder.py +1267 -0
- skill_seekers/cli/install_agent.py +461 -0
- skill_seekers/cli/install_skill.py +178 -0
- skill_seekers/cli/language_detector.py +614 -0
- skill_seekers/cli/llms_txt_detector.py +60 -0
- skill_seekers/cli/llms_txt_downloader.py +104 -0
- skill_seekers/cli/llms_txt_parser.py +150 -0
- skill_seekers/cli/main.py +558 -0
- skill_seekers/cli/markdown_cleaner.py +132 -0
- skill_seekers/cli/merge_sources.py +806 -0
- skill_seekers/cli/package_multi.py +77 -0
- skill_seekers/cli/package_skill.py +241 -0
- skill_seekers/cli/pattern_recognizer.py +1825 -0
- skill_seekers/cli/pdf_extractor_poc.py +1166 -0
- skill_seekers/cli/pdf_scraper.py +617 -0
- skill_seekers/cli/quality_checker.py +519 -0
- skill_seekers/cli/rate_limit_handler.py +438 -0
- skill_seekers/cli/resume_command.py +160 -0
- skill_seekers/cli/run_tests.py +230 -0
- skill_seekers/cli/setup_wizard.py +93 -0
- skill_seekers/cli/split_config.py +390 -0
- skill_seekers/cli/swift_patterns.py +560 -0
- skill_seekers/cli/test_example_extractor.py +1081 -0
- skill_seekers/cli/test_unified_simple.py +179 -0
- skill_seekers/cli/unified_codebase_analyzer.py +572 -0
- skill_seekers/cli/unified_scraper.py +932 -0
- skill_seekers/cli/unified_skill_builder.py +1605 -0
- skill_seekers/cli/upload_skill.py +162 -0
- skill_seekers/cli/utils.py +432 -0
- skill_seekers/mcp/__init__.py +33 -0
- skill_seekers/mcp/agent_detector.py +316 -0
- skill_seekers/mcp/git_repo.py +273 -0
- skill_seekers/mcp/server.py +231 -0
- skill_seekers/mcp/server_fastmcp.py +1249 -0
- skill_seekers/mcp/server_legacy.py +2302 -0
- skill_seekers/mcp/source_manager.py +285 -0
- skill_seekers/mcp/tools/__init__.py +115 -0
- skill_seekers/mcp/tools/config_tools.py +251 -0
- skill_seekers/mcp/tools/packaging_tools.py +826 -0
- skill_seekers/mcp/tools/scraping_tools.py +842 -0
- skill_seekers/mcp/tools/source_tools.py +828 -0
- skill_seekers/mcp/tools/splitting_tools.py +212 -0
- skill_seekers/py.typed +0 -0
- skill_seekers-2.7.3.dist-info/METADATA +2027 -0
- skill_seekers-2.7.3.dist-info/RECORD +79 -0
- skill_seekers-2.7.3.dist-info/WHEEL +5 -0
- skill_seekers-2.7.3.dist-info/entry_points.txt +19 -0
- skill_seekers-2.7.3.dist-info/licenses/LICENSE +21 -0
- skill_seekers-2.7.3.dist-info/top_level.txt +1 -0
skill_seekers/cli/unified_scraper.py
@@ -0,0 +1,932 @@
#!/usr/bin/env python3
"""
Unified Multi-Source Scraper

Orchestrates scraping from multiple sources (documentation, GitHub, PDF),
detects conflicts, merges intelligently, and builds unified skills.

This is the main entry point for unified config workflow.

Usage:
    skill-seekers unified --config configs/godot_unified.json
    skill-seekers unified --config configs/react_unified.json --merge-mode claude-enhanced
"""

import argparse
import json
import logging
import os
import shutil
import subprocess
import sys
from pathlib import Path
from typing import Any

# Import validators and scrapers
try:
    from skill_seekers.cli.config_validator import validate_config
    from skill_seekers.cli.conflict_detector import ConflictDetector
    from skill_seekers.cli.merge_sources import ClaudeEnhancedMerger, RuleBasedMerger
    from skill_seekers.cli.unified_skill_builder import UnifiedSkillBuilder
except ImportError as e:
    print(f"Error importing modules: {e}")
    print("Make sure you're running from the project root directory")
    sys.exit(1)

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


class UnifiedScraper:
    """
    Orchestrates multi-source scraping and merging.

    Main workflow:
    1. Load and validate unified config
    2. Scrape all sources (docs, GitHub, PDF)
    3. Detect conflicts between sources
    4. Merge intelligently (rule-based or Claude-enhanced)
    5. Build unified skill
    """

    def __init__(self, config_path: str, merge_mode: str | None = None):
        """
        Initialize unified scraper.

        Args:
            config_path: Path to unified config JSON
            merge_mode: Override config merge_mode ('rule-based' or 'claude-enhanced')
        """
        self.config_path = config_path

        # Validate and load config
        logger.info(f"Loading config: {config_path}")
        self.validator = validate_config(config_path)
        self.config = self.validator.config

        # Determine merge mode
        self.merge_mode = merge_mode or self.config.get("merge_mode", "rule-based")
        logger.info(f"Merge mode: {self.merge_mode}")

        # Storage for scraped data - use lists to support multiple sources of same type
        self.scraped_data = {
            "documentation": [],  # List of doc sources
            "github": [],  # List of github sources
            "pdf": [],  # List of pdf sources
        }

        # Track source index for unique naming (multi-source support)
        self._source_counters = {"documentation": 0, "github": 0, "pdf": 0}

        # Output paths - cleaner organization
        self.name = self.config["name"]
        self.output_dir = f"output/{self.name}"  # Final skill only

        # Use hidden cache directory for intermediate files
        self.cache_dir = f".skillseeker-cache/{self.name}"
        self.sources_dir = f"{self.cache_dir}/sources"
        self.data_dir = f"{self.cache_dir}/data"
        self.repos_dir = f"{self.cache_dir}/repos"
        self.logs_dir = f"{self.cache_dir}/logs"

        # Create directories
        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(self.sources_dir, exist_ok=True)
        os.makedirs(self.data_dir, exist_ok=True)
        os.makedirs(self.repos_dir, exist_ok=True)
        os.makedirs(self.logs_dir, exist_ok=True)

        # Setup file logging
        self._setup_logging()

    def _setup_logging(self):
        """Setup file logging for this scraping session."""
        from datetime import datetime

        # Create log filename with timestamp
        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        log_file = f"{self.logs_dir}/unified_{timestamp}.log"

        # Add file handler to root logger
        file_handler = logging.FileHandler(log_file, encoding="utf-8")
        file_handler.setLevel(logging.DEBUG)

        # Create formatter
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
        )
        file_handler.setFormatter(formatter)

        # Add to root logger
        logging.getLogger().addHandler(file_handler)

        logger.info(f"📝 Logging to: {log_file}")
        logger.info(f"🗂️ Cache directory: {self.cache_dir}")

    def scrape_all_sources(self):
        """
        Scrape all configured sources.

        Routes to appropriate scraper based on source type.
        """
        logger.info("=" * 60)
        logger.info("PHASE 1: Scraping all sources")
        logger.info("=" * 60)

        if not self.validator.is_unified:
            logger.warning("Config is not unified format, converting...")
            self.config = self.validator.convert_legacy_to_unified()

        sources = self.config.get("sources", [])

        for i, source in enumerate(sources):
            source_type = source["type"]
            logger.info(f"\n[{i + 1}/{len(sources)}] Scraping {source_type} source...")

            try:
                if source_type == "documentation":
                    self._scrape_documentation(source)
                elif source_type == "github":
                    self._scrape_github(source)
                elif source_type == "pdf":
                    self._scrape_pdf(source)
                else:
                    logger.warning(f"Unknown source type: {source_type}")
            except Exception as e:
                logger.error(f"Error scraping {source_type}: {e}")
                logger.info("Continuing with other sources...")

        logger.info(f"\n✅ Scraped {len(self.scraped_data)} sources successfully")

    def _scrape_documentation(self, source: dict[str, Any]):
        """Scrape documentation website."""
        # Create temporary config for doc scraper
        doc_config = {
            "name": f"{self.name}_docs",
            "base_url": source["base_url"],
            "selectors": source.get("selectors", {}),
            "url_patterns": source.get("url_patterns", {}),
            "categories": source.get("categories", {}),
            "rate_limit": source.get("rate_limit", 0.5),
            "max_pages": source.get("max_pages", 100),
        }

        # Pass through llms.txt settings (so unified configs behave the same as doc_scraper configs)
        if "llms_txt_url" in source:
            doc_config["llms_txt_url"] = source.get("llms_txt_url")

        if "skip_llms_txt" in source:
            doc_config["skip_llms_txt"] = source.get("skip_llms_txt")

        # Optional: support overriding start URLs
        if "start_urls" in source:
            doc_config["start_urls"] = source.get("start_urls")

        # Write temporary config
        temp_config_path = os.path.join(self.data_dir, "temp_docs_config.json")
        with open(temp_config_path, "w", encoding="utf-8") as f:
            json.dump(doc_config, f, indent=2)

        # Run doc_scraper as subprocess
        logger.info(f"Scraping documentation from {source['base_url']}")

        doc_scraper_path = Path(__file__).parent / "doc_scraper.py"
        cmd = [sys.executable, str(doc_scraper_path), "--config", temp_config_path, "--fresh"]

        result = subprocess.run(cmd, capture_output=True, text=True, stdin=subprocess.DEVNULL)

        if result.returncode != 0:
            logger.error(f"Documentation scraping failed with return code {result.returncode}")
            logger.error(f"STDERR: {result.stderr}")
            logger.error(f"STDOUT: {result.stdout}")
            return

        # Log subprocess output for debugging
        if result.stdout:
            logger.info(f"Doc scraper output: {result.stdout[-500:]}")  # Last 500 chars

        # Load scraped data
        docs_data_file = f"output/{doc_config['name']}_data/summary.json"

        if os.path.exists(docs_data_file):
            with open(docs_data_file, encoding="utf-8") as f:
                summary = json.load(f)

            # Append to documentation list (multi-source support)
            self.scraped_data["documentation"].append(
                {
                    "source_id": doc_config["name"],
                    "base_url": source["base_url"],
                    "pages": summary.get("pages", []),
                    "total_pages": summary.get("total_pages", 0),
                    "data_file": docs_data_file,
                    "refs_dir": "",  # Will be set after moving to cache
                }
            )

            logger.info(f"✅ Documentation: {summary.get('total_pages', 0)} pages scraped")
        else:
            logger.warning("Documentation data file not found")

        # Clean up temp config
        if os.path.exists(temp_config_path):
            os.remove(temp_config_path)

        # Move intermediate files to cache to keep output/ clean
        docs_output_dir = f"output/{doc_config['name']}"
        docs_data_dir = f"output/{doc_config['name']}_data"

        if os.path.exists(docs_output_dir):
            cache_docs_dir = os.path.join(self.sources_dir, f"{doc_config['name']}")
            if os.path.exists(cache_docs_dir):
                shutil.rmtree(cache_docs_dir)
            shutil.move(docs_output_dir, cache_docs_dir)
            logger.info(f"📦 Moved docs output to cache: {cache_docs_dir}")

            # Update refs_dir in scraped_data with cache location
            refs_dir_path = os.path.join(cache_docs_dir, "references")
            if self.scraped_data["documentation"]:
                self.scraped_data["documentation"][-1]["refs_dir"] = refs_dir_path

        if os.path.exists(docs_data_dir):
            cache_data_dir = os.path.join(self.data_dir, f"{doc_config['name']}_data")
            if os.path.exists(cache_data_dir):
                shutil.rmtree(cache_data_dir)
            shutil.move(docs_data_dir, cache_data_dir)
            logger.info(f"📦 Moved docs data to cache: {cache_data_dir}")

    def _clone_github_repo(self, repo_name: str, idx: int = 0) -> str | None:
        """
        Clone GitHub repository to cache directory for C3.x analysis.
        Reuses existing clone if already present.

        Args:
            repo_name: GitHub repo in format "owner/repo"
            idx: Source index for unique naming when multiple repos

        Returns:
            Path to cloned repo, or None if clone failed
        """
        # Clone to cache repos folder for future reuse
        repo_dir_name = f"{idx}_{repo_name.replace('/', '_')}"  # e.g., 0_encode_httpx
        clone_path = os.path.join(self.repos_dir, repo_dir_name)

        # Check if already cloned
        if os.path.exists(clone_path) and os.path.isdir(os.path.join(clone_path, ".git")):
            logger.info(f"♻️ Found existing repository clone: {clone_path}")
            logger.info(" Reusing for C3.x analysis (skip re-cloning)")
            return clone_path

        # repos_dir already created in __init__

        # Clone repo (full clone, not shallow - for complete analysis)
        repo_url = f"https://github.com/{repo_name}.git"
        logger.info(f"📥 Cloning repository for C3.x analysis: {repo_url}")
        logger.info(f" → {clone_path}")
        logger.info(" 💾 Clone will be saved for future reuse")

        try:
            result = subprocess.run(
                ["git", "clone", repo_url, clone_path],
                capture_output=True,
                text=True,
                timeout=600,  # 10 minute timeout for full clone
            )

            if result.returncode == 0:
                logger.info("✅ Repository cloned successfully")
                logger.info(f" 📁 Saved to: {clone_path}")
                return clone_path
            else:
                logger.error(f"❌ Git clone failed: {result.stderr}")
                # Clean up failed clone
                if os.path.exists(clone_path):
                    shutil.rmtree(clone_path)
                return None

        except subprocess.TimeoutExpired:
            logger.error("❌ Git clone timed out after 10 minutes")
            if os.path.exists(clone_path):
                shutil.rmtree(clone_path)
            return None
        except Exception as e:
            logger.error(f"❌ Git clone failed: {e}")
            if os.path.exists(clone_path):
                shutil.rmtree(clone_path)
            return None

    def _scrape_github(self, source: dict[str, Any]):
        """Scrape GitHub repository."""
        try:
            from skill_seekers.cli.github_scraper import GitHubScraper
        except ImportError:
            logger.error("github_scraper.py not found")
            return

        # Multi-source support: Get unique index for this GitHub source
        idx = self._source_counters["github"]
        self._source_counters["github"] += 1

        # Extract repo identifier for unique naming
        repo = source["repo"]
        repo_id = repo.replace("/", "_")

        # Check if we need to clone for C3.x analysis
        enable_codebase_analysis = source.get("enable_codebase_analysis", True)
        local_repo_path = source.get("local_repo_path")
        cloned_repo_path = None

        # Auto-clone if C3.x analysis is enabled but no local path provided
        if enable_codebase_analysis and not local_repo_path:
            logger.info("🔬 C3.x codebase analysis enabled - cloning repository...")
            cloned_repo_path = self._clone_github_repo(repo, idx=idx)
            if cloned_repo_path:
                local_repo_path = cloned_repo_path
                logger.info(f"✅ Using cloned repo for C3.x analysis: {local_repo_path}")
            else:
                logger.warning("⚠️ Failed to clone repo - C3.x analysis will be skipped")
                enable_codebase_analysis = False

        # Create config for GitHub scraper
        github_config = {
            "repo": repo,
            "name": f"{self.name}_github_{idx}_{repo_id}",
            "github_token": source.get("github_token"),
            "include_issues": source.get("include_issues", True),
            "max_issues": source.get("max_issues", 100),
            "include_changelog": source.get("include_changelog", True),
            "include_releases": source.get("include_releases", True),
            "include_code": source.get("include_code", True),
            "code_analysis_depth": source.get("code_analysis_depth", "surface"),
            "file_patterns": source.get("file_patterns", []),
            "local_repo_path": local_repo_path,  # Use cloned path if available
        }

        # Pass directory exclusions if specified (optional)
        if "exclude_dirs" in source:
            github_config["exclude_dirs"] = source["exclude_dirs"]
        if "exclude_dirs_additional" in source:
            github_config["exclude_dirs_additional"] = source["exclude_dirs_additional"]

        # Scrape
        logger.info(f"Scraping GitHub repository: {source['repo']}")
        scraper = GitHubScraper(github_config)
        github_data = scraper.scrape()

        # Run C3.x codebase analysis if enabled and local_repo_path available
        if enable_codebase_analysis and local_repo_path:
            logger.info("🔬 Running C3.x codebase analysis...")
            try:
                c3_data = self._run_c3_analysis(local_repo_path, source)
                if c3_data:
                    github_data["c3_analysis"] = c3_data
                    logger.info("✅ C3.x analysis complete")
                else:
                    logger.warning("⚠️ C3.x analysis returned no data")
            except Exception as e:
                logger.warning(f"⚠️ C3.x analysis failed: {e}")
                import traceback

                logger.debug(f"Traceback: {traceback.format_exc()}")
                # Continue without C3.x data - graceful degradation

        # Note: We keep the cloned repo in output/ for future reuse
        if cloned_repo_path:
            logger.info(f"📁 Repository clone saved for future use: {cloned_repo_path}")

        # Save data to unified location with unique filename
        github_data_file = os.path.join(self.data_dir, f"github_data_{idx}_{repo_id}.json")
        with open(github_data_file, "w", encoding="utf-8") as f:
            json.dump(github_data, f, indent=2, ensure_ascii=False)

        # ALSO save to the location GitHubToSkillConverter expects (with C3.x data!)
        converter_data_file = f"output/{github_config['name']}_github_data.json"
        with open(converter_data_file, "w", encoding="utf-8") as f:
            json.dump(github_data, f, indent=2, ensure_ascii=False)

        # Append to list instead of overwriting (multi-source support)
        self.scraped_data["github"].append(
            {
                "repo": repo,
                "repo_id": repo_id,
                "idx": idx,
                "data": github_data,
                "data_file": github_data_file,
            }
        )

        # Build standalone SKILL.md for synthesis using GitHubToSkillConverter
        try:
            from skill_seekers.cli.github_scraper import GitHubToSkillConverter

            # Use github_config which has the correct name field
            # Converter will load from output/{name}_github_data.json which now has C3.x data
            converter = GitHubToSkillConverter(config=github_config)
            converter.build_skill()
            logger.info("✅ GitHub: Standalone SKILL.md created")
        except Exception as e:
            logger.warning(f"⚠️ Failed to build standalone GitHub SKILL.md: {e}")

        # Move intermediate files to cache to keep output/ clean
        github_output_dir = f"output/{github_config['name']}"
        github_data_file_path = f"output/{github_config['name']}_github_data.json"

        if os.path.exists(github_output_dir):
            cache_github_dir = os.path.join(self.sources_dir, github_config["name"])
            if os.path.exists(cache_github_dir):
                shutil.rmtree(cache_github_dir)
            shutil.move(github_output_dir, cache_github_dir)
            logger.info(f"📦 Moved GitHub output to cache: {cache_github_dir}")

        if os.path.exists(github_data_file_path):
            cache_github_data = os.path.join(
                self.data_dir, f"{github_config['name']}_github_data.json"
            )
            if os.path.exists(cache_github_data):
                os.remove(cache_github_data)
            shutil.move(github_data_file_path, cache_github_data)
            logger.info(f"📦 Moved GitHub data to cache: {cache_github_data}")

        logger.info("✅ GitHub: Repository scraped successfully")

    def _scrape_pdf(self, source: dict[str, Any]):
        """Scrape PDF document."""
        try:
            from skill_seekers.cli.pdf_scraper import PDFToSkillConverter
        except ImportError:
            logger.error("pdf_scraper.py not found")
            return

        # Multi-source support: Get unique index for this PDF source
        idx = self._source_counters["pdf"]
        self._source_counters["pdf"] += 1

        # Extract PDF identifier for unique naming (filename without extension)
        pdf_path = source["path"]
        pdf_id = os.path.splitext(os.path.basename(pdf_path))[0]

        # Create config for PDF scraper
        pdf_config = {
            "name": f"{self.name}_pdf_{idx}_{pdf_id}",
            "pdf": source["path"],
            "extract_tables": source.get("extract_tables", False),
            "ocr": source.get("ocr", False),
            "password": source.get("password"),
        }

        # Scrape
        logger.info(f"Scraping PDF: {source['path']}")
        converter = PDFToSkillConverter(pdf_config)
        pdf_data = converter.extract_all()

        # Save data
        pdf_data_file = os.path.join(self.data_dir, f"pdf_data_{idx}_{pdf_id}.json")
        with open(pdf_data_file, "w", encoding="utf-8") as f:
            json.dump(pdf_data, f, indent=2, ensure_ascii=False)

        # Append to list instead of overwriting
        self.scraped_data["pdf"].append(
            {
                "pdf_path": pdf_path,
                "pdf_id": pdf_id,
                "idx": idx,
                "data": pdf_data,
                "data_file": pdf_data_file,
            }
        )

        # Build standalone SKILL.md for synthesis
        try:
            converter.build_skill()
            logger.info("✅ PDF: Standalone SKILL.md created")
        except Exception as e:
            logger.warning(f"⚠️ Failed to build standalone PDF SKILL.md: {e}")

        logger.info(f"✅ PDF: {len(pdf_data.get('pages', []))} pages extracted")

    def _load_json(self, file_path: Path) -> dict:
        """
        Load JSON file safely.

        Args:
            file_path: Path to JSON file

        Returns:
            Dict with JSON data, or empty dict if file doesn't exist or is invalid
        """
        if not file_path.exists():
            logger.warning(f"JSON file not found: {file_path}")
            return {}

        try:
            with open(file_path, encoding="utf-8") as f:
                return json.load(f)
        except (OSError, json.JSONDecodeError) as e:
            logger.warning(f"Failed to load JSON {file_path}: {e}")
            return {}

    def _load_guide_collection(self, tutorials_dir: Path) -> dict:
        """
        Load how-to guide collection from tutorials directory.

        Args:
            tutorials_dir: Path to tutorials directory

        Returns:
            Dict with guide collection data
        """
        if not tutorials_dir.exists():
            logger.warning(f"Tutorials directory not found: {tutorials_dir}")
            return {"guides": []}

        collection_file = tutorials_dir / "guide_collection.json"
        if collection_file.exists():
            return self._load_json(collection_file)

        # Fallback: scan for individual guide JSON files
        guides = []
        for guide_file in tutorials_dir.glob("guide_*.json"):
            guide_data = self._load_json(guide_file)
            if guide_data:
                guides.append(guide_data)

        return {"guides": guides, "total_count": len(guides)}

    def _load_api_reference(self, api_dir: Path) -> dict[str, Any]:
        """
        Load API reference markdown files from api_reference directory.

        Args:
            api_dir: Path to api_reference directory

        Returns:
            Dict mapping module names to markdown content, or empty dict if not found
        """
        if not api_dir.exists():
            logger.debug(f"API reference directory not found: {api_dir}")
            return {}

        api_refs = {}
        for md_file in api_dir.glob("*.md"):
            try:
                module_name = md_file.stem
                api_refs[module_name] = md_file.read_text(encoding="utf-8")
            except OSError as e:
                logger.warning(f"Failed to read API reference {md_file}: {e}")

        return api_refs

    def _run_c3_analysis(self, local_repo_path: str, source: dict[str, Any]) -> dict[str, Any]:
        """
        Run comprehensive C3.x codebase analysis.

        Calls codebase_scraper.analyze_codebase() with all C3.x features enabled,
        loads the results into memory, and cleans up temporary files.

        Args:
            local_repo_path: Path to local repository
            source: GitHub source configuration dict

        Returns:
            Dict with keys: patterns, test_examples, how_to_guides,
            config_patterns, architecture
        """
        try:
            from skill_seekers.cli.codebase_scraper import analyze_codebase
        except ImportError:
            logger.error("codebase_scraper.py not found")
            return {}

        # Create temp output dir for C3.x analysis
        temp_output = Path(self.data_dir) / "c3_analysis_temp"
        temp_output.mkdir(parents=True, exist_ok=True)

        logger.info(f" Analyzing codebase: {local_repo_path}")

        try:
            # Run full C3.x analysis
            _results = analyze_codebase(
                directory=Path(local_repo_path),
                output_dir=temp_output,
                depth="deep",
                languages=None,  # Analyze all languages
                file_patterns=source.get("file_patterns"),
                build_api_reference=True,  # C2.5: API Reference
                extract_comments=False,  # Not needed
                build_dependency_graph=True,  # C2.6: Dependency Graph
                detect_patterns=True,  # C3.1: Design patterns
                extract_test_examples=True,  # C3.2: Test examples
                build_how_to_guides=True,  # C3.3: How-to guides
                extract_config_patterns=True,  # C3.4: Config patterns
                enhance_with_ai=source.get("ai_mode", "auto") != "none",
                ai_mode=source.get("ai_mode", "auto"),
            )

            # Load C3.x outputs into memory
            c3_data = {
                "patterns": self._load_json(temp_output / "patterns" / "detected_patterns.json"),
                "test_examples": self._load_json(
                    temp_output / "test_examples" / "test_examples.json"
                ),
                "how_to_guides": self._load_guide_collection(temp_output / "tutorials"),
                "config_patterns": self._load_json(
                    temp_output / "config_patterns" / "config_patterns.json"
                ),
                "architecture": self._load_json(
                    temp_output / "architecture" / "architectural_patterns.json"
                ),
                "api_reference": self._load_api_reference(temp_output / "api_reference"),  # C2.5
                "dependency_graph": self._load_json(
                    temp_output / "dependencies" / "dependency_graph.json"
                ),  # C2.6
            }

            # Log summary
            total_patterns = sum(len(f.get("patterns", [])) for f in c3_data.get("patterns", []))
            total_examples = c3_data.get("test_examples", {}).get("total_examples", 0)
            total_guides = len(c3_data.get("how_to_guides", {}).get("guides", []))
            total_configs = len(c3_data.get("config_patterns", {}).get("config_files", []))
            arch_patterns = len(c3_data.get("architecture", {}).get("patterns", []))

            logger.info(f" → Design Patterns: {total_patterns}")
            logger.info(f" → Test Examples: {total_examples}")
            logger.info(f" → How-To Guides: {total_guides}")
            logger.info(f" → Config Files: {total_configs}")
            logger.info(f" → Architecture Patterns: {arch_patterns}")

            return c3_data

        except Exception as e:
            logger.error(f"C3.x analysis failed: {e}")
            import traceback

            traceback.print_exc()
            return {}

        finally:
            # Clean up temp directory
            if temp_output.exists():
                try:
                    shutil.rmtree(temp_output)
                except Exception as e:
                    logger.warning(f"Failed to clean up temp directory: {e}")

    def detect_conflicts(self) -> list:
        """
        Detect conflicts between documentation and code.

        Only applicable if both documentation and GitHub sources exist.

        Returns:
            List of conflicts
        """
        logger.info("\n" + "=" * 60)
        logger.info("PHASE 2: Detecting conflicts")
        logger.info("=" * 60)

        if not self.validator.needs_api_merge():
            logger.info("No API merge needed (only one API source)")
            return []

        # Get documentation and GitHub data
        docs_data = self.scraped_data.get("documentation", {})
        github_data = self.scraped_data.get("github", {})

        if not docs_data or not github_data:
            logger.warning("Missing documentation or GitHub data for conflict detection")
            return []

        # Load data files
        with open(docs_data["data_file"], encoding="utf-8") as f:
            docs_json = json.load(f)

        with open(github_data["data_file"], encoding="utf-8") as f:
            github_json = json.load(f)

        # Detect conflicts
        detector = ConflictDetector(docs_json, github_json)
        conflicts = detector.detect_all_conflicts()

        # Save conflicts
        conflicts_file = os.path.join(self.data_dir, "conflicts.json")
        detector.save_conflicts(conflicts, conflicts_file)

        # Print summary
        summary = detector.generate_summary(conflicts)
        logger.info("\n📊 Conflict Summary:")
        logger.info(f" Total: {summary['total']}")
        logger.info(" By Type:")
        for ctype, count in summary["by_type"].items():
            if count > 0:
                logger.info(f" - {ctype}: {count}")
        logger.info(" By Severity:")
        for severity, count in summary["by_severity"].items():
            if count > 0:
                emoji = "🔴" if severity == "high" else "🟡" if severity == "medium" else "🟢"
                logger.info(f" {emoji} {severity}: {count}")

        return conflicts

    def merge_sources(self, conflicts: list):
        """
        Merge data from multiple sources.

        Args:
            conflicts: List of detected conflicts
        """
        logger.info("\n" + "=" * 60)
        logger.info(f"PHASE 3: Merging sources ({self.merge_mode})")
        logger.info("=" * 60)

        if not conflicts:
            logger.info("No conflicts to merge")
            return None

        # Get data files
        docs_data = self.scraped_data.get("documentation", {})
        github_data = self.scraped_data.get("github", {})

        # Load data
        with open(docs_data["data_file"], encoding="utf-8") as f:
            docs_json = json.load(f)

        with open(github_data["data_file"], encoding="utf-8") as f:
            github_json = json.load(f)

        # Choose merger
        if self.merge_mode == "claude-enhanced":
            merger = ClaudeEnhancedMerger(docs_json, github_json, conflicts)
        else:
            merger = RuleBasedMerger(docs_json, github_json, conflicts)

        # Merge
        merged_data = merger.merge_all()

        # Save merged data
        merged_file = os.path.join(self.data_dir, "merged_data.json")
        with open(merged_file, "w", encoding="utf-8") as f:
            json.dump(merged_data, f, indent=2, ensure_ascii=False)

        logger.info(f"✅ Merged data saved: {merged_file}")

        return merged_data

    def build_skill(self, merged_data: dict | None = None):
        """
        Build final unified skill.

        Args:
            merged_data: Merged API data (if conflicts were resolved)
        """
        logger.info("\n" + "=" * 60)
        logger.info("PHASE 4: Building unified skill")
        logger.info("=" * 60)

        # Load conflicts if they exist
        conflicts = []
        conflicts_file = os.path.join(self.data_dir, "conflicts.json")
        if os.path.exists(conflicts_file):
            with open(conflicts_file, encoding="utf-8") as f:
                conflicts_data = json.load(f)
                conflicts = conflicts_data.get("conflicts", [])

        # Build skill
        builder = UnifiedSkillBuilder(
            self.config, self.scraped_data, merged_data, conflicts, cache_dir=self.cache_dir
        )

        builder.build()

        logger.info(f"✅ Unified skill built: {self.output_dir}/")

    def run(self):
        """
        Execute complete unified scraping workflow.
        """
        logger.info("\n" + "🚀 " * 20)
        logger.info(f"Unified Scraper: {self.config['name']}")
        logger.info("🚀 " * 20 + "\n")

        try:
            # Phase 1: Scrape all sources
            self.scrape_all_sources()

            # Phase 2: Detect conflicts (if applicable)
            conflicts = self.detect_conflicts()

            # Phase 3: Merge sources (if conflicts exist)
            merged_data = None
            if conflicts:
                merged_data = self.merge_sources(conflicts)

            # Phase 4: Build skill
            self.build_skill(merged_data)

            logger.info("\n" + "✅ " * 20)
            logger.info("Unified scraping complete!")
            logger.info("✅ " * 20 + "\n")

            logger.info(f"📁 Output: {self.output_dir}/")
            logger.info(f"📊 Data: {self.data_dir}/")

        except KeyboardInterrupt:
            logger.info("\n\n⚠️ Scraping interrupted by user")
            sys.exit(1)
        except Exception as e:
            logger.error(f"\n\n❌ Error during scraping: {e}")
            import traceback

            traceback.print_exc()
            sys.exit(1)


def main():
    """Main entry point."""
    parser = argparse.ArgumentParser(
        description="Unified multi-source scraper",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Basic usage with unified config
  skill-seekers unified --config configs/godot_unified.json

  # Override merge mode
  skill-seekers unified --config configs/react_unified.json --merge-mode claude-enhanced

  # Backward compatible with legacy configs
  skill-seekers unified --config configs/react.json
        """,
    )

    parser.add_argument("--config", "-c", required=True, help="Path to unified config JSON file")
    parser.add_argument(
        "--merge-mode",
        "-m",
        choices=["rule-based", "claude-enhanced"],
        help="Override config merge mode",
    )
    parser.add_argument(
        "--skip-codebase-analysis",
        action="store_true",
        help="Skip C3.x codebase analysis for GitHub sources (default: enabled)",
    )
    parser.add_argument(
        "--fresh",
        action="store_true",
        help="Clear any existing data and start fresh (ignore checkpoints)",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview what will be scraped without actually scraping",
    )

    args = parser.parse_args()

    # Create scraper
    scraper = UnifiedScraper(args.config, args.merge_mode)

    # Disable codebase analysis if requested
    if args.skip_codebase_analysis:
        for source in scraper.config.get("sources", []):
            if source["type"] == "github":
                source["enable_codebase_analysis"] = False
                logger.info(
                    f"⏭️ Skipping codebase analysis for GitHub source: {source.get('repo', 'unknown')}"
                )

    # Handle --fresh flag (clear cache)
    if args.fresh:
        import shutil

        if os.path.exists(scraper.cache_dir):
            logger.info(f"🧹 Clearing cache: {scraper.cache_dir}")
            shutil.rmtree(scraper.cache_dir)
            # Recreate directories
            os.makedirs(scraper.sources_dir, exist_ok=True)
            os.makedirs(scraper.data_dir, exist_ok=True)
            os.makedirs(scraper.repos_dir, exist_ok=True)
            os.makedirs(scraper.logs_dir, exist_ok=True)

    # Handle --dry-run flag
    if args.dry_run:
        logger.info("🔍 DRY RUN MODE - Preview only, no scraping will occur")
        logger.info(f"\nWould scrape {len(scraper.config.get('sources', []))} sources:")
        for idx, source in enumerate(scraper.config.get("sources", []), 1):
            source_type = source.get("type", "unknown")
            if source_type == "documentation":
                logger.info(f" {idx}. Documentation: {source.get('base_url', 'N/A')}")
            elif source_type == "github":
                logger.info(f" {idx}. GitHub: {source.get('repo', 'N/A')}")
            elif source_type == "pdf":
                logger.info(f" {idx}. PDF: {source.get('pdf_path', 'N/A')}")
        logger.info(f"\nOutput directory: {scraper.output_dir}")
        logger.info(f"Merge mode: {scraper.merge_mode}")
        return

    # Run scraper
    scraper.run()


if __name__ == "__main__":
    main()
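
For orientation, the sketch below shows one way the module above could be driven programmatically instead of through the `skill-seekers unified` CLI documented in its docstring. It is a minimal sketch, not part of the package: the config file path and all field values are illustrative placeholders, and whether such a minimal config passes `validate_config` is not confirmed by this diff. Only `UnifiedScraper`, its `(config_path, merge_mode)` constructor, `run()`, and the source fields it reads (`name`, `merge_mode`, `sources`, `base_url`, `repo`) come from the code shown.

# Illustrative sketch only (not shipped in the package). Assumes a unified
# config JSON shaped like the one unified_scraper.py reads; values are made up.
import json

from skill_seekers.cli.unified_scraper import UnifiedScraper

example_config = {
    "name": "example_framework",  # final skill lands in output/example_framework/
    "merge_mode": "rule-based",   # or "claude-enhanced"
    "sources": [
        {"type": "documentation", "base_url": "https://docs.example.com/"},
        {"type": "github", "repo": "example-org/example-framework"},
    ],
}

with open("configs/example_unified.json", "w", encoding="utf-8") as f:
    json.dump(example_config, f, indent=2)

# Scrape all sources, detect conflicts, merge, and build the unified skill.
UnifiedScraper("configs/example_unified.json", merge_mode="rule-based").run()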