skill_seekers-2.7.3-py3-none-any.whl
This diff shows the contents of a publicly available package version as released to one of the supported registries. The information is provided for informational purposes only and reflects the package as it appears in its public registry.
- skill_seekers/__init__.py +22 -0
- skill_seekers/cli/__init__.py +39 -0
- skill_seekers/cli/adaptors/__init__.py +120 -0
- skill_seekers/cli/adaptors/base.py +221 -0
- skill_seekers/cli/adaptors/claude.py +485 -0
- skill_seekers/cli/adaptors/gemini.py +453 -0
- skill_seekers/cli/adaptors/markdown.py +269 -0
- skill_seekers/cli/adaptors/openai.py +503 -0
- skill_seekers/cli/ai_enhancer.py +310 -0
- skill_seekers/cli/api_reference_builder.py +373 -0
- skill_seekers/cli/architectural_pattern_detector.py +525 -0
- skill_seekers/cli/code_analyzer.py +1462 -0
- skill_seekers/cli/codebase_scraper.py +1225 -0
- skill_seekers/cli/config_command.py +563 -0
- skill_seekers/cli/config_enhancer.py +431 -0
- skill_seekers/cli/config_extractor.py +871 -0
- skill_seekers/cli/config_manager.py +452 -0
- skill_seekers/cli/config_validator.py +394 -0
- skill_seekers/cli/conflict_detector.py +528 -0
- skill_seekers/cli/constants.py +72 -0
- skill_seekers/cli/dependency_analyzer.py +757 -0
- skill_seekers/cli/doc_scraper.py +2332 -0
- skill_seekers/cli/enhance_skill.py +488 -0
- skill_seekers/cli/enhance_skill_local.py +1096 -0
- skill_seekers/cli/enhance_status.py +194 -0
- skill_seekers/cli/estimate_pages.py +433 -0
- skill_seekers/cli/generate_router.py +1209 -0
- skill_seekers/cli/github_fetcher.py +534 -0
- skill_seekers/cli/github_scraper.py +1466 -0
- skill_seekers/cli/guide_enhancer.py +723 -0
- skill_seekers/cli/how_to_guide_builder.py +1267 -0
- skill_seekers/cli/install_agent.py +461 -0
- skill_seekers/cli/install_skill.py +178 -0
- skill_seekers/cli/language_detector.py +614 -0
- skill_seekers/cli/llms_txt_detector.py +60 -0
- skill_seekers/cli/llms_txt_downloader.py +104 -0
- skill_seekers/cli/llms_txt_parser.py +150 -0
- skill_seekers/cli/main.py +558 -0
- skill_seekers/cli/markdown_cleaner.py +132 -0
- skill_seekers/cli/merge_sources.py +806 -0
- skill_seekers/cli/package_multi.py +77 -0
- skill_seekers/cli/package_skill.py +241 -0
- skill_seekers/cli/pattern_recognizer.py +1825 -0
- skill_seekers/cli/pdf_extractor_poc.py +1166 -0
- skill_seekers/cli/pdf_scraper.py +617 -0
- skill_seekers/cli/quality_checker.py +519 -0
- skill_seekers/cli/rate_limit_handler.py +438 -0
- skill_seekers/cli/resume_command.py +160 -0
- skill_seekers/cli/run_tests.py +230 -0
- skill_seekers/cli/setup_wizard.py +93 -0
- skill_seekers/cli/split_config.py +390 -0
- skill_seekers/cli/swift_patterns.py +560 -0
- skill_seekers/cli/test_example_extractor.py +1081 -0
- skill_seekers/cli/test_unified_simple.py +179 -0
- skill_seekers/cli/unified_codebase_analyzer.py +572 -0
- skill_seekers/cli/unified_scraper.py +932 -0
- skill_seekers/cli/unified_skill_builder.py +1605 -0
- skill_seekers/cli/upload_skill.py +162 -0
- skill_seekers/cli/utils.py +432 -0
- skill_seekers/mcp/__init__.py +33 -0
- skill_seekers/mcp/agent_detector.py +316 -0
- skill_seekers/mcp/git_repo.py +273 -0
- skill_seekers/mcp/server.py +231 -0
- skill_seekers/mcp/server_fastmcp.py +1249 -0
- skill_seekers/mcp/server_legacy.py +2302 -0
- skill_seekers/mcp/source_manager.py +285 -0
- skill_seekers/mcp/tools/__init__.py +115 -0
- skill_seekers/mcp/tools/config_tools.py +251 -0
- skill_seekers/mcp/tools/packaging_tools.py +826 -0
- skill_seekers/mcp/tools/scraping_tools.py +842 -0
- skill_seekers/mcp/tools/source_tools.py +828 -0
- skill_seekers/mcp/tools/splitting_tools.py +212 -0
- skill_seekers/py.typed +0 -0
- skill_seekers-2.7.3.dist-info/METADATA +2027 -0
- skill_seekers-2.7.3.dist-info/RECORD +79 -0
- skill_seekers-2.7.3.dist-info/WHEEL +5 -0
- skill_seekers-2.7.3.dist-info/entry_points.txt +19 -0
- skill_seekers-2.7.3.dist-info/licenses/LICENSE +21 -0
- skill_seekers-2.7.3.dist-info/top_level.txt +1 -0
--- /dev/null
+++ skill_seekers/cli/enhance_status.py
@@ -0,0 +1,194 @@
+#!/usr/bin/env python3
+"""
+Check Enhancement Status
+
+Monitor the status of background/daemon enhancement processes.
+
+Usage:
+    skill-seekers enhance-status output/react/
+    skill-seekers enhance-status output/react/ --watch
+    skill-seekers enhance-status output/react/ --json
+"""
+
+import json
+import sys
+import time
+from pathlib import Path
+
+
+def read_status(skill_dir):
+    """Read enhancement status from file.
+
+    Args:
+        skill_dir: Path to skill directory
+
+    Returns:
+        dict: Status data or None if not found
+    """
+    status_file = Path(skill_dir) / ".enhancement_status.json"
+
+    if not status_file.exists():
+        return None
+
+    try:
+        return json.loads(status_file.read_text(encoding="utf-8"))
+    except Exception as e:
+        return {"error": f"Failed to read status: {e}"}
+
+
+def format_status(status):
+    """Format status for display.
+
+    Args:
+        status: Status dict
+
+    Returns:
+        str: Formatted status string
+    """
+    if not status:
+        return "❌ No enhancement in progress (no status file found)"
+
+    if "error" in status:
+        return f"❌ {status['error']}"
+
+    # Status emoji mapping
+    status_emojis = {"pending": "⏳", "running": "🔄", "completed": "✅", "failed": "❌"}
+
+    emoji = status_emojis.get(status.get("status", ""), "❓")
+    status_text = status.get("status", "unknown").upper()
+    message = status.get("message", "")
+    progress = status.get("progress", 0.0)
+    timestamp = status.get("timestamp", "unknown")
+    error = status.get("error")
+    pid = status.get("pid")
+
+    # Build output
+    lines = []
+    lines.append(f"\n{'=' * 60}")
+    lines.append(f"ENHANCEMENT STATUS: {status_text}")
+    lines.append(f"{'=' * 60}\n")
+
+    lines.append(f"{emoji} Status: {status_text}")
+
+    if message:
+        lines.append(f" Message: {message}")
+
+    if progress > 0:
+        progress_pct = int(progress * 100)
+        progress_bar = "█" * (progress_pct // 5) + "░" * (20 - progress_pct // 5)
+        lines.append(f" Progress: [{progress_bar}] {progress_pct}%")
+
+    if pid:
+        lines.append(f" PID: {pid}")
+
+    lines.append(f" Timestamp: {timestamp}")
+
+    if error:
+        lines.append(f"\n❌ Error: {error}")
+
+    lines.append("")
+
+    return "\n".join(lines)
+
+
+def watch_status(skill_dir, interval=2):
+    """Watch status in real-time.
+
+    Args:
+        skill_dir: Path to skill directory
+        interval: Update interval in seconds
+    """
+    print(f"👀 Watching enhancement status for: {skill_dir}")
+    print(f" Update interval: {interval} seconds")
+    print(" Press Ctrl+C to stop\n")
+
+    try:
+        last_status = None
+
+        while True:
+            status = read_status(skill_dir)
+
+            # Only print if status changed
+            if status != last_status:
+                # Clear screen (optional, comment out if you don't want this)
+                # os.system('clear' if os.name != 'nt' else 'cls')
+
+                print(format_status(status))
+                last_status = status
+
+            # Exit if completed or failed
+            if status and status.get("status") in ["completed", "failed"]:
+                break
+
+            time.sleep(interval)
+
+    except KeyboardInterrupt:
+        print("\n\n👋 Stopped watching")
+        sys.exit(0)
+
+
+def main():
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Check enhancement status",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Check status once
+  skill-seekers enhance-status output/react/
+
+  # Watch status in real-time
+  skill-seekers enhance-status output/react/ --watch
+
+  # Get JSON output (for scripts)
+  skill-seekers enhance-status output/react/ --json
+        """,
+    )
+
+    parser.add_argument("skill_directory", help="Path to skill directory (e.g., output/react/)")
+
+    parser.add_argument(
+        "--watch",
+        "-w",
+        action="store_true",
+        help="Watch status in real-time (updates every 2 seconds)",
+    )
+
+    parser.add_argument("--json", action="store_true", help="Output raw JSON (for scripting)")
+
+    parser.add_argument(
+        "--interval", type=int, default=2, help="Watch update interval in seconds (default: 2)"
+    )
+
+    args = parser.parse_args()
+
+    # Watch mode
+    if args.watch:
+        watch_status(args.skill_directory, args.interval)
+        return
+
+    # Read status
+    status = read_status(args.skill_directory)
+
+    # JSON output
+    if args.json:
+        print(json.dumps(status, indent=2))
+        return
+
+    # Human-readable output
+    print(format_status(status))
+
+    # Exit code based on status
+    if not status:
+        sys.exit(2)  # No status found
+    elif status.get("status") == "completed":
+        sys.exit(0)  # Success
+    elif status.get("status") == "failed":
+        sys.exit(1)  # Failed
+    else:
+        sys.exit(0)  # In progress
+
+
+if __name__ == "__main__":
+    main()
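The status file that `read_status()` parses is produced elsewhere in the package (by the background enhancement process), and its writer is not part of this diff. As a minimal sketch, assuming only the keys that `format_status()` reads above, a compatible producer could look like this; `write_status` and its argument values are illustrative, not part of the package:

# Hypothetical writer for the .enhancement_status.json file consumed by
# read_status() above. The key set mirrors what format_status() reads;
# the helper name and example values are assumptions, not package API.
import json
import os
from datetime import datetime, timezone
from pathlib import Path

def write_status(skill_dir, status, message="", progress=0.0, error=None):
    payload = {
        "status": status,      # one of: "pending", "running", "completed", "failed"
        "message": message,
        "progress": progress,  # fraction in [0.0, 1.0]; rendered as a 20-char bar
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "pid": os.getpid(),
    }
    if error:
        payload["error"] = error
    status_file = Path(skill_dir) / ".enhancement_status.json"
    status_file.write_text(json.dumps(payload, indent=2), encoding="utf-8")

write_status("output/react/", "running", message="Enhancing skill", progress=0.45)

Note that `progress` is stored as a fraction: `format_status()` multiplies it by 100 and draws one bar segment per 5 percent.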
--- /dev/null
+++ skill_seekers/cli/estimate_pages.py
@@ -0,0 +1,433 @@
+#!/usr/bin/env python3
+"""
+Page Count Estimator for Skill Seeker
+Quickly estimates how many pages a config will scrape without downloading content
+"""
+
+import json
+import os
+import sys
+import time
+from pathlib import Path
+from urllib.parse import urljoin, urlparse
+
+import requests
+from bs4 import BeautifulSoup
+
+# Add parent directory to path for imports when run as script
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from skill_seekers.cli.constants import (
+    DEFAULT_MAX_DISCOVERY,
+    DEFAULT_RATE_LIMIT,
+    DISCOVERY_THRESHOLD,
+)
+
+
+def estimate_pages(config, max_discovery=DEFAULT_MAX_DISCOVERY, timeout=30):
+    """
+    Estimate total pages that will be scraped
+
+    Args:
+        config: Configuration dictionary
+        max_discovery: Maximum pages to discover (safety limit, use -1 for unlimited)
+        timeout: Timeout for HTTP requests in seconds
+
+    Returns:
+        dict with estimation results
+    """
+    base_url = config["base_url"]
+    start_urls = config.get("start_urls", [base_url])
+    url_patterns = config.get("url_patterns", {"include": [], "exclude": []})
+    rate_limit = config.get("rate_limit", DEFAULT_RATE_LIMIT)
+
+    visited = set()
+    pending = list(start_urls)
+    discovered = 0
+
+    include_patterns = url_patterns.get("include", [])
+    exclude_patterns = url_patterns.get("exclude", [])
+
+    # Handle unlimited mode
+    unlimited = max_discovery == -1 or max_discovery is None
+
+    print(f"🔍 Estimating pages for: {config['name']}")
+    print(f"📍 Base URL: {base_url}")
+    print(f"🎯 Start URLs: {len(start_urls)}")
+    print(f"⏱️ Rate limit: {rate_limit}s")
+
+    if unlimited:
+        print("🔢 Max discovery: UNLIMITED (will discover all pages)")
+        print("⚠️ WARNING: This may take a long time!")
+    else:
+        print(f"🔢 Max discovery: {max_discovery}")
+
+    print()
+
+    start_time = time.time()
+
+    # Loop condition: stop if no more URLs, or if limit reached (when not unlimited)
+    while pending and (unlimited or discovered < max_discovery):
+        url = pending.pop(0)
+
+        # Skip if already visited
+        if url in visited:
+            continue
+
+        visited.add(url)
+        discovered += 1
+
+        # Progress indicator
+        if discovered % 10 == 0:
+            elapsed = time.time() - start_time
+            rate = discovered / elapsed if elapsed > 0 else 0
+            print(f"⏳ Discovered: {discovered} pages ({rate:.1f} pages/sec)", end="\r")
+
+        try:
+            # HEAD request first to check if page exists (faster)
+            head_response = requests.head(url, timeout=timeout, allow_redirects=True)
+
+            # Skip non-HTML content
+            content_type = head_response.headers.get("Content-Type", "")
+            if "text/html" not in content_type:
+                continue
+
+            # Now GET the page to find links
+            response = requests.get(url, timeout=timeout)
+            response.raise_for_status()
+
+            soup = BeautifulSoup(response.content, "html.parser")
+
+            # Find all links
+            for link in soup.find_all("a", href=True):
+                href = link["href"]
+                full_url = urljoin(url, href)
+
+                # Normalize URL
+                parsed = urlparse(full_url)
+                full_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"
+
+                # Check if URL is valid
+                if not is_valid_url(full_url, base_url, include_patterns, exclude_patterns):
+                    continue
+
+                # Add to pending if not visited
+                if full_url not in visited and full_url not in pending:
+                    pending.append(full_url)
+
+            # Rate limiting
+            time.sleep(rate_limit)
+
+        except requests.RequestException:
+            # Silently skip errors during estimation
+            pass
+        except Exception:
+            # Silently skip other errors
+            pass
+
+    elapsed = time.time() - start_time
+
+    # Results
+    results = {
+        "discovered": discovered,
+        "pending": len(pending),
+        "estimated_total": discovered + len(pending),
+        "elapsed_seconds": round(elapsed, 2),
+        "discovery_rate": round(discovered / elapsed if elapsed > 0 else 0, 2),
+        "hit_limit": (not unlimited) and (discovered >= max_discovery),
+        "unlimited": unlimited,
+    }
+
+    return results
+
+
+def is_valid_url(url, base_url, include_patterns, exclude_patterns):
+    """Check if URL should be crawled"""
+    # Must be same domain
+    if not url.startswith(base_url.rstrip("/")):
+        return False
+
+    # Check exclude patterns first
+    if exclude_patterns:
+        for pattern in exclude_patterns:
+            if pattern in url:
+                return False
+
+    # Check include patterns (if specified)
+    if include_patterns:
+        return any(pattern in url for pattern in include_patterns)
+
+    # If no include patterns, accept by default
+    return True
+
+
+def print_results(results, config):
+    """Print estimation results"""
+    print()
+    print("=" * 70)
+    print("📊 ESTIMATION RESULTS")
+    print("=" * 70)
+    print()
+    print(f"Config: {config['name']}")
+    print(f"Base URL: {config['base_url']}")
+    print()
+    print(f"✅ Pages Discovered: {results['discovered']}")
+    print(f"⏳ Pages Pending: {results['pending']}")
+    print(f"📈 Estimated Total: {results['estimated_total']}")
+    print()
+    print(f"⏱️ Time Elapsed: {results['elapsed_seconds']}s")
+    print(f"⚡ Discovery Rate: {results['discovery_rate']} pages/sec")
+
+    if results.get("unlimited", False):
+        print()
+        print("✅ UNLIMITED MODE - Discovered all reachable pages")
+        print(f" Total pages: {results['estimated_total']}")
+    elif results["hit_limit"]:
+        print()
+        print("⚠️ Hit discovery limit - actual total may be higher")
+        print(" Increase max_discovery parameter for more accurate estimate")
+
+    print()
+    print("=" * 70)
+    print("💡 RECOMMENDATIONS")
+    print("=" * 70)
+    print()
+
+    estimated = results["estimated_total"]
+    current_max = config.get("max_pages", 100)
+
+    if estimated <= current_max:
+        print(f"✅ Current max_pages ({current_max}) is sufficient")
+    else:
+        recommended = min(estimated + 50, DISCOVERY_THRESHOLD)  # Add 50 buffer, cap at threshold
+        print(f"⚠️ Current max_pages ({current_max}) may be too low")
+        print(f"📝 Recommended max_pages: {recommended}")
+        print(f" (Estimated {estimated} + 50 buffer)")
+
+    # Estimate time for full scrape
+    rate_limit = config.get("rate_limit", DEFAULT_RATE_LIMIT)
+    estimated_time = (estimated * rate_limit) / 60  # in minutes
+
+    print()
+    print(f"⏱️ Estimated full scrape time: {estimated_time:.1f} minutes")
+    print(f" (Based on rate_limit: {rate_limit}s)")
+
+    print()
+
+
+def load_config(config_path):
+    """Load configuration from JSON file"""
+    try:
+        with open(config_path) as f:
+            config = json.load(f)
+        return config
+    except FileNotFoundError:
+        print(f"❌ Error: Config file not found: {config_path}")
+        sys.exit(1)
+    except json.JSONDecodeError as e:
+        print(f"❌ Error: Invalid JSON in config file: {e}")
+        sys.exit(1)
+
+
+def find_configs_directory():
+    """
+    Find the configs directory using the same logic as the API.
+
+    Returns:
+        Path to configs directory or None if not found
+    """
+    # Get the package root (src/skill_seekers/)
+    package_root = Path(__file__).parent.parent
+
+    # Try API configs_repo first (production)
+    api_config_dir = package_root.parent.parent / "api" / "configs_repo" / "official"
+    if api_config_dir.exists():
+        return api_config_dir
+
+    # Fallback to configs (local development)
+    local_config_dir = package_root.parent.parent / "configs"
+    if local_config_dir.exists():
+        return local_config_dir
+
+    return None
+
+
+def list_all_configs():
+    """
+    List all available configuration files.
+    Uses the same directory logic as the API.
+    """
+    config_dir = find_configs_directory()
+
+    if not config_dir:
+        print("❌ Error: No config directory found")
+        print(" Tried: api/configs_repo/official/ and configs/")
+        return 1
+
+    print()
+    print("=" * 70)
+    print("📋 AVAILABLE CONFIGS")
+    print("=" * 70)
+    print()
+    print(f"📁 Config directory: {config_dir}")
+    print()
+
+    # Find all JSON files recursively
+    config_files = sorted(config_dir.rglob("*.json"))
+
+    if not config_files:
+        print("⚠️ No config files found")
+        return 1
+
+    # Group by category (subdirectory)
+    by_category = {}
+    for config_file in config_files:
+        # Get relative path from config_dir
+        rel_path = config_file.relative_to(config_dir)
+
+        # Category is the first directory in the path, or "root" if in root
+        category = rel_path.parts[0] if len(rel_path.parts) > 1 else "root"
+
+        if category not in by_category:
+            by_category[category] = []
+
+        # Try to load the config to get name and description
+        try:
+            with open(config_file) as f:
+                config_data = json.load(f)
+
+            name = config_data.get("name", config_file.stem)
+            description = config_data.get("description", "No description")
+
+            # Truncate description if too long
+            if len(description) > 60:
+                description = description[:57] + "..."
+
+            by_category[category].append(
+                {
+                    "file": config_file.name,
+                    "path": str(rel_path),
+                    "name": name,
+                    "description": description,
+                }
+            )
+        except Exception as e:
+            # If we can't parse the config, just use the filename
+            by_category[category].append(
+                {
+                    "file": config_file.name,
+                    "path": str(rel_path),
+                    "name": config_file.stem,
+                    "description": f"⚠️ Error loading config: {e}",
+                }
+            )
+
+    # Print configs by category
+    total = 0
+    for category in sorted(by_category.keys()):
+        configs = by_category[category]
+        total += len(configs)
+
+        print(f"📦 {category.upper()}")
+        print("-" * 70)
+
+        for config in configs:
+            print(f"  • {config['name']}")
+            print(f"    File: {config['path']}")
+            print(f"    Description: {config['description']}")
+            print()
+
+    print("=" * 70)
+    print(f"📊 Total: {total} configs found")
+    print("=" * 70)
+    print()
+
+    return 0
+
+
+def main():
+    """Main entry point"""
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Estimate page count for Skill Seeker configs",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # List all available configs
+  skill-seekers estimate --all
+
+  # Estimate pages for a config
+  skill-seekers estimate configs/react.json
+
+  # Estimate with higher discovery limit
+  skill-seekers estimate configs/godot.json --max-discovery 2000
+
+  # Quick estimate (stop at 100 pages)
+  skill-seekers estimate configs/vue.json --max-discovery 100
+        """,
+    )
+
+    parser.add_argument("config", nargs="?", help="Path to config JSON file")
+    parser.add_argument(
+        "--all",
+        action="store_true",
+        help="List all available configs from api/configs_repo/official/",
+    )
+    parser.add_argument(
+        "--max-discovery",
+        "-m",
+        type=int,
+        default=DEFAULT_MAX_DISCOVERY,
+        help=f"Maximum pages to discover (default: {DEFAULT_MAX_DISCOVERY}, use -1 for unlimited)",
+    )
+    parser.add_argument(
+        "--unlimited",
+        "-u",
+        action="store_true",
+        help="Remove discovery limit - discover all pages (same as --max-discovery -1)",
+    )
+    parser.add_argument(
+        "--timeout",
+        "-t",
+        type=int,
+        default=30,
+        help="HTTP request timeout in seconds (default: 30)",
+    )
+
+    args = parser.parse_args()
+
+    # Handle --all flag
+    if args.all:
+        return list_all_configs()
+
+    # If not --all, config is required
+    if not args.config:
+        parser.error("the following arguments are required: config (or use --all to list configs)")
+
+    # Handle unlimited flag
+    max_discovery = -1 if args.unlimited else args.max_discovery
+
+    # Load config
+    config = load_config(args.config)
+
+    # Run estimation
+    try:
+        results = estimate_pages(config, max_discovery, args.timeout)
+        print_results(results, config)
+
+        # Return exit code based on results
+        if results["hit_limit"]:
+            return 2  # Warning: hit limit
+        return 0  # Success
+
+    except KeyboardInterrupt:
+        print("\n\n⚠️ Estimation interrupted by user")
+        return 1
+    except Exception as e:
+        print(f"\n\n❌ Error during estimation: {e}")
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
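Both hunks above are standalone CLI entry points, but `estimate_pages()` and `print_results()` are plain module-level functions and can also be called directly. A minimal sketch of programmatic use, assuming a hypothetical config dict that supplies only the keys the function actually reads (`name`, `base_url`, plus the optional `start_urls`, `url_patterns`, `rate_limit`, `max_pages`); the URL and pattern values here are placeholders:

# Hypothetical library-style use of the estimator; the import path follows
# the file list above, but the config values are made-up examples.
from skill_seekers.cli.estimate_pages import estimate_pages, print_results

config = {
    "name": "example-docs",
    "base_url": "https://example.com/docs/",
    "start_urls": ["https://example.com/docs/"],
    "url_patterns": {"include": ["/docs/"], "exclude": ["/changelog"]},
    "rate_limit": 0.5,  # seconds to sleep between requests
    "max_pages": 100,   # used by print_results() for its recommendation
}

results = estimate_pages(config, max_discovery=50, timeout=10)
print_results(results, config)
print(results["estimated_total"], results["hit_limit"])

The `hit_limit` flag in the returned dict is what drives the CLI's exit-code convention (2 when the discovery limit was reached, 0 otherwise).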