skill-seekers 2.7.3 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skill_seekers/__init__.py +22 -0
- skill_seekers/cli/__init__.py +39 -0
- skill_seekers/cli/adaptors/__init__.py +120 -0
- skill_seekers/cli/adaptors/base.py +221 -0
- skill_seekers/cli/adaptors/claude.py +485 -0
- skill_seekers/cli/adaptors/gemini.py +453 -0
- skill_seekers/cli/adaptors/markdown.py +269 -0
- skill_seekers/cli/adaptors/openai.py +503 -0
- skill_seekers/cli/ai_enhancer.py +310 -0
- skill_seekers/cli/api_reference_builder.py +373 -0
- skill_seekers/cli/architectural_pattern_detector.py +525 -0
- skill_seekers/cli/code_analyzer.py +1462 -0
- skill_seekers/cli/codebase_scraper.py +1225 -0
- skill_seekers/cli/config_command.py +563 -0
- skill_seekers/cli/config_enhancer.py +431 -0
- skill_seekers/cli/config_extractor.py +871 -0
- skill_seekers/cli/config_manager.py +452 -0
- skill_seekers/cli/config_validator.py +394 -0
- skill_seekers/cli/conflict_detector.py +528 -0
- skill_seekers/cli/constants.py +72 -0
- skill_seekers/cli/dependency_analyzer.py +757 -0
- skill_seekers/cli/doc_scraper.py +2332 -0
- skill_seekers/cli/enhance_skill.py +488 -0
- skill_seekers/cli/enhance_skill_local.py +1096 -0
- skill_seekers/cli/enhance_status.py +194 -0
- skill_seekers/cli/estimate_pages.py +433 -0
- skill_seekers/cli/generate_router.py +1209 -0
- skill_seekers/cli/github_fetcher.py +534 -0
- skill_seekers/cli/github_scraper.py +1466 -0
- skill_seekers/cli/guide_enhancer.py +723 -0
- skill_seekers/cli/how_to_guide_builder.py +1267 -0
- skill_seekers/cli/install_agent.py +461 -0
- skill_seekers/cli/install_skill.py +178 -0
- skill_seekers/cli/language_detector.py +614 -0
- skill_seekers/cli/llms_txt_detector.py +60 -0
- skill_seekers/cli/llms_txt_downloader.py +104 -0
- skill_seekers/cli/llms_txt_parser.py +150 -0
- skill_seekers/cli/main.py +558 -0
- skill_seekers/cli/markdown_cleaner.py +132 -0
- skill_seekers/cli/merge_sources.py +806 -0
- skill_seekers/cli/package_multi.py +77 -0
- skill_seekers/cli/package_skill.py +241 -0
- skill_seekers/cli/pattern_recognizer.py +1825 -0
- skill_seekers/cli/pdf_extractor_poc.py +1166 -0
- skill_seekers/cli/pdf_scraper.py +617 -0
- skill_seekers/cli/quality_checker.py +519 -0
- skill_seekers/cli/rate_limit_handler.py +438 -0
- skill_seekers/cli/resume_command.py +160 -0
- skill_seekers/cli/run_tests.py +230 -0
- skill_seekers/cli/setup_wizard.py +93 -0
- skill_seekers/cli/split_config.py +390 -0
- skill_seekers/cli/swift_patterns.py +560 -0
- skill_seekers/cli/test_example_extractor.py +1081 -0
- skill_seekers/cli/test_unified_simple.py +179 -0
- skill_seekers/cli/unified_codebase_analyzer.py +572 -0
- skill_seekers/cli/unified_scraper.py +932 -0
- skill_seekers/cli/unified_skill_builder.py +1605 -0
- skill_seekers/cli/upload_skill.py +162 -0
- skill_seekers/cli/utils.py +432 -0
- skill_seekers/mcp/__init__.py +33 -0
- skill_seekers/mcp/agent_detector.py +316 -0
- skill_seekers/mcp/git_repo.py +273 -0
- skill_seekers/mcp/server.py +231 -0
- skill_seekers/mcp/server_fastmcp.py +1249 -0
- skill_seekers/mcp/server_legacy.py +2302 -0
- skill_seekers/mcp/source_manager.py +285 -0
- skill_seekers/mcp/tools/__init__.py +115 -0
- skill_seekers/mcp/tools/config_tools.py +251 -0
- skill_seekers/mcp/tools/packaging_tools.py +826 -0
- skill_seekers/mcp/tools/scraping_tools.py +842 -0
- skill_seekers/mcp/tools/source_tools.py +828 -0
- skill_seekers/mcp/tools/splitting_tools.py +212 -0
- skill_seekers/py.typed +0 -0
- skill_seekers-2.7.3.dist-info/METADATA +2027 -0
- skill_seekers-2.7.3.dist-info/RECORD +79 -0
- skill_seekers-2.7.3.dist-info/WHEEL +5 -0
- skill_seekers-2.7.3.dist-info/entry_points.txt +19 -0
- skill_seekers-2.7.3.dist-info/licenses/LICENSE +21 -0
- skill_seekers-2.7.3.dist-info/top_level.txt +1 -0
skill_seekers/cli/llms_txt_downloader.py

@@ -0,0 +1,104 @@
+"""ABOUTME: Downloads llms.txt files from documentation URLs with retry logic"""
+
+import time
+
+import requests
+
+
+class LlmsTxtDownloader:
+    """Download llms.txt content from URLs with retry logic"""
+
+    def __init__(self, url: str, timeout: int = 30, max_retries: int = 3):
+        self.url = url
+        self.timeout = timeout
+        self.max_retries = max_retries
+
+    def get_proper_filename(self) -> str:
+        """
+        Extract filename from URL and convert .txt to .md
+
+        Returns:
+            Proper filename with .md extension
+
+        Examples:
+            https://hono.dev/llms-full.txt -> llms-full.md
+            https://hono.dev/llms.txt -> llms.md
+            https://hono.dev/llms-small.txt -> llms-small.md
+        """
+        # Extract filename from URL
+        from urllib.parse import urlparse
+
+        parsed = urlparse(self.url)
+        filename = parsed.path.split("/")[-1]
+
+        # Replace .txt with .md
+        if filename.endswith(".txt"):
+            filename = filename[:-4] + ".md"
+
+        return filename
+
+    def _is_markdown(self, content: str) -> bool:
+        """
+        Check if content looks like markdown (not HTML).
+
+        Returns:
+            True if content contains markdown patterns and is NOT HTML
+        """
+        # First, reject HTML content (common redirect trap)
+        content_start = content.strip()[:500].lower()
+        html_indicators = [
+            "<!doctype html",
+            "<html",
+            "<!doctype",
+            "<head>",
+            "<meta charset",
+        ]
+        if any(indicator in content_start for indicator in html_indicators):
+            return False
+
+        # Then check for markdown patterns
+        markdown_patterns = ["# ", "## ", "```", "- ", "* ", "`"]
+        return any(pattern in content for pattern in markdown_patterns)
+
+    def download(self) -> str | None:
+        """
+        Download llms.txt content with retry logic.
+
+        Returns:
+            String content or None if download fails
+        """
+        headers = {"User-Agent": "Skill-Seekers-llms.txt-Reader/1.0"}
+
+        for attempt in range(self.max_retries):
+            try:
+                response = requests.get(self.url, headers=headers, timeout=self.timeout)
+                response.raise_for_status()
+
+                content = response.text
+
+                # Validate content is not empty
+                if len(content) < 100:
+                    print(f"⚠️ Content too short ({len(content)} chars), rejecting")
+                    return None
+
+                # Validate content looks like markdown
+                if not self._is_markdown(content):
+                    print("⚠️ Content doesn't look like markdown")
+                    return None
+
+                return content
+
+            except requests.RequestException as e:
+                if attempt < self.max_retries - 1:
+                    # Calculate exponential backoff delay: 1s, 2s, 4s, etc.
+                    delay = 2**attempt
+                    print(f"⚠️ Attempt {attempt + 1}/{self.max_retries} failed: {e}")
+                    print(f"   Retrying in {delay}s...")
+                    time.sleep(delay)
+                else:
+                    print(
+                        f"❌ Failed to download {self.url} after {self.max_retries} attempts: {e}"
+                    )
+                    return None
+
+        return None
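For readers evaluating this release, a minimal usage sketch of the downloader follows. It is not part of the packaged diff; the import path is inferred from the file listing above.

# Hypothetical usage sketch; import path assumed from the file listing above.
from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader

downloader = LlmsTxtDownloader("https://hono.dev/llms-full.txt", timeout=30, max_retries=3)
content = downloader.download()  # markdown text, or None after validation/retry failure
if content is not None:
    # get_proper_filename() maps llms-full.txt -> llms-full.md
    with open(downloader.get_proper_filename(), "w", encoding="utf-8") as f:
        f.write(content)

As the code above shows, download() retries with exponential backoff (1s, 2s, 4s) on request errors and rejects responses that are shorter than 100 characters or that look like HTML rather than markdown.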
skill_seekers/cli/llms_txt_parser.py

@@ -0,0 +1,150 @@
+"""ABOUTME: Parses llms.txt markdown content into structured page data"""
+
+import re
+from urllib.parse import urljoin
+
+
+class LlmsTxtParser:
+    """Parse llms.txt markdown content into page structures"""
+
+    def __init__(self, content: str, base_url: str = None):
+        self.content = content
+        self.base_url = base_url
+
+    def extract_urls(self) -> list[str]:
+        """
+        Extract all URLs from the llms.txt content.
+
+        Supports both markdown-style links [text](url) and bare URLs.
+        Resolves relative URLs using base_url if provided.
+        Filters out malformed URLs with invalid anchor patterns.
+
+        Returns:
+            List of unique, cleaned URLs found in the content.
+            Returns empty list if no valid URLs found.
+
+        Note:
+            - Markdown links: [Getting Started](./docs/guide.md)
+            - Bare URLs: https://example.com/api.md
+            - Relative paths resolved with base_url
+            - Invalid anchors (#section/path.md) are stripped
+        """
+        urls = set()
+
+        # Match markdown links: [text](url)
+        md_links = re.findall(r"\[([^\]]*)\]\(([^)]+)\)", self.content)
+        for _, url in md_links:
+            if url.startswith("http"):
+                clean_url = self._clean_url(url)
+                if clean_url:
+                    urls.add(clean_url)
+            elif self.base_url and not url.startswith("#"):
+                clean_url = self._clean_url(urljoin(self.base_url, url))
+                if clean_url:
+                    urls.add(clean_url)
+
+        # Match bare URLs
+        bare_urls = re.findall(r'https?://[^\s\)\]<>"\']+', self.content)
+        for url in bare_urls:
+            # Clean trailing punctuation
+            url = url.rstrip(".,;:")
+            clean_url = self._clean_url(url)
+            if clean_url:
+                urls.add(clean_url)
+
+        return list(urls)
+
+    def _clean_url(self, url: str) -> str:
+        """
+        Clean and validate URL, removing invalid anchor patterns.
+
+        Detects and strips malformed anchors that contain path separators.
+        Valid: https://example.com/page.md#section
+        Invalid: https://example.com/page#section/index.html.md
+
+        Args:
+            url: URL to clean (absolute or relative)
+
+        Returns:
+            Cleaned URL with malformed anchors stripped.
+            Returns base URL if anchor contains '/' (malformed).
+            Returns original URL if anchor is valid or no anchor present.
+
+        Example:
+            >>> parser._clean_url("https://ex.com/page#sec/path.md")
+            "https://ex.com/page"
+            >>> parser._clean_url("https://ex.com/page.md#section")
+            "https://ex.com/page.md#section"
+        """
+        # Skip URLs with path after anchor (e.g., #section/index.html.md)
+        # These are malformed and return duplicate HTML content
+        if "#" in url:
+            anchor_pos = url.index("#")
+            after_anchor = url[anchor_pos + 1 :]
+            # If there's a path separator after anchor, it's invalid
+            if "/" in after_anchor:
+                # Extract the base URL without the malformed anchor
+                return url[:anchor_pos]
+        return url
+
+    def parse(self) -> list[dict]:
+        """
+        Parse markdown content into page structures.
+
+        Returns:
+            List of page dicts with title, content, code_samples, headings
+        """
+        pages = []
+
+        # Split by h1 headers (# Title)
+        sections = re.split(r"\n# ", self.content)
+
+        for section in sections:
+            if not section.strip():
+                continue
+
+            # First line is title
+            lines = section.split("\n")
+            title = lines[0].strip("#").strip()
+
+            # Parse content
+            page = self._parse_section("\n".join(lines[1:]), title)
+            pages.append(page)
+
+        return pages
+
+    def _parse_section(self, content: str, title: str) -> dict:
+        """Parse a single section into page structure"""
+        page = {
+            "title": title,
+            "content": "",
+            "code_samples": [],
+            "headings": [],
+            "url": f"llms-txt#{title.lower().replace(' ', '-')}",
+            "links": [],
+        }
+
+        # Extract code blocks
+        code_blocks = re.findall(r"```(\w+)?\n(.*?)```", content, re.DOTALL)
+        for lang, code in code_blocks:
+            page["code_samples"].append({"code": code.strip(), "language": lang or "unknown"})
+
+        # Extract h2/h3 headings
+        headings = re.findall(r"^(#{2,3})\s+(.+)$", content, re.MULTILINE)
+        for level_markers, text in headings:
+            page["headings"].append(
+                {
+                    "level": f"h{len(level_markers)}",
+                    "text": text.strip(),
+                    "id": text.lower().replace(" ", "-"),
+                }
+            )
+
+        # Remove code blocks from content for plain text
+        content_no_code = re.sub(r"```.*?```", "", content, flags=re.DOTALL)
+
+        # Extract paragraphs
+        paragraphs = [p.strip() for p in content_no_code.split("\n\n") if len(p.strip()) > 20]
+        page["content"] = "\n\n".join(paragraphs)
+
+        return page
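Similarly, a minimal sketch chaining the two new modules; combining them this way illustrates the API surface above rather than documented behavior, and the import paths are again assumed from the file listing.

# Hypothetical end-to-end sketch; chaining downloader and parser is an assumption.
from skill_seekers.cli.llms_txt_downloader import LlmsTxtDownloader
from skill_seekers.cli.llms_txt_parser import LlmsTxtParser

content = LlmsTxtDownloader("https://hono.dev/llms.txt").download()
if content:
    parser = LlmsTxtParser(content, base_url="https://hono.dev/")
    urls = parser.extract_urls()  # unique absolute URLs, malformed anchors stripped
    pages = parser.parse()        # one dict per "# " section: title, content,
                                  # code_samples, headings, url, links
    print(f"Found {len(urls)} linked URLs and {len(pages)} pages")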