skill_seekers-2.7.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- skill_seekers/__init__.py +22 -0
- skill_seekers/cli/__init__.py +39 -0
- skill_seekers/cli/adaptors/__init__.py +120 -0
- skill_seekers/cli/adaptors/base.py +221 -0
- skill_seekers/cli/adaptors/claude.py +485 -0
- skill_seekers/cli/adaptors/gemini.py +453 -0
- skill_seekers/cli/adaptors/markdown.py +269 -0
- skill_seekers/cli/adaptors/openai.py +503 -0
- skill_seekers/cli/ai_enhancer.py +310 -0
- skill_seekers/cli/api_reference_builder.py +373 -0
- skill_seekers/cli/architectural_pattern_detector.py +525 -0
- skill_seekers/cli/code_analyzer.py +1462 -0
- skill_seekers/cli/codebase_scraper.py +1225 -0
- skill_seekers/cli/config_command.py +563 -0
- skill_seekers/cli/config_enhancer.py +431 -0
- skill_seekers/cli/config_extractor.py +871 -0
- skill_seekers/cli/config_manager.py +452 -0
- skill_seekers/cli/config_validator.py +394 -0
- skill_seekers/cli/conflict_detector.py +528 -0
- skill_seekers/cli/constants.py +72 -0
- skill_seekers/cli/dependency_analyzer.py +757 -0
- skill_seekers/cli/doc_scraper.py +2332 -0
- skill_seekers/cli/enhance_skill.py +488 -0
- skill_seekers/cli/enhance_skill_local.py +1096 -0
- skill_seekers/cli/enhance_status.py +194 -0
- skill_seekers/cli/estimate_pages.py +433 -0
- skill_seekers/cli/generate_router.py +1209 -0
- skill_seekers/cli/github_fetcher.py +534 -0
- skill_seekers/cli/github_scraper.py +1466 -0
- skill_seekers/cli/guide_enhancer.py +723 -0
- skill_seekers/cli/how_to_guide_builder.py +1267 -0
- skill_seekers/cli/install_agent.py +461 -0
- skill_seekers/cli/install_skill.py +178 -0
- skill_seekers/cli/language_detector.py +614 -0
- skill_seekers/cli/llms_txt_detector.py +60 -0
- skill_seekers/cli/llms_txt_downloader.py +104 -0
- skill_seekers/cli/llms_txt_parser.py +150 -0
- skill_seekers/cli/main.py +558 -0
- skill_seekers/cli/markdown_cleaner.py +132 -0
- skill_seekers/cli/merge_sources.py +806 -0
- skill_seekers/cli/package_multi.py +77 -0
- skill_seekers/cli/package_skill.py +241 -0
- skill_seekers/cli/pattern_recognizer.py +1825 -0
- skill_seekers/cli/pdf_extractor_poc.py +1166 -0
- skill_seekers/cli/pdf_scraper.py +617 -0
- skill_seekers/cli/quality_checker.py +519 -0
- skill_seekers/cli/rate_limit_handler.py +438 -0
- skill_seekers/cli/resume_command.py +160 -0
- skill_seekers/cli/run_tests.py +230 -0
- skill_seekers/cli/setup_wizard.py +93 -0
- skill_seekers/cli/split_config.py +390 -0
- skill_seekers/cli/swift_patterns.py +560 -0
- skill_seekers/cli/test_example_extractor.py +1081 -0
- skill_seekers/cli/test_unified_simple.py +179 -0
- skill_seekers/cli/unified_codebase_analyzer.py +572 -0
- skill_seekers/cli/unified_scraper.py +932 -0
- skill_seekers/cli/unified_skill_builder.py +1605 -0
- skill_seekers/cli/upload_skill.py +162 -0
- skill_seekers/cli/utils.py +432 -0
- skill_seekers/mcp/__init__.py +33 -0
- skill_seekers/mcp/agent_detector.py +316 -0
- skill_seekers/mcp/git_repo.py +273 -0
- skill_seekers/mcp/server.py +231 -0
- skill_seekers/mcp/server_fastmcp.py +1249 -0
- skill_seekers/mcp/server_legacy.py +2302 -0
- skill_seekers/mcp/source_manager.py +285 -0
- skill_seekers/mcp/tools/__init__.py +115 -0
- skill_seekers/mcp/tools/config_tools.py +251 -0
- skill_seekers/mcp/tools/packaging_tools.py +826 -0
- skill_seekers/mcp/tools/scraping_tools.py +842 -0
- skill_seekers/mcp/tools/source_tools.py +828 -0
- skill_seekers/mcp/tools/splitting_tools.py +212 -0
- skill_seekers/py.typed +0 -0
- skill_seekers-2.7.3.dist-info/METADATA +2027 -0
- skill_seekers-2.7.3.dist-info/RECORD +79 -0
- skill_seekers-2.7.3.dist-info/WHEEL +5 -0
- skill_seekers-2.7.3.dist-info/entry_points.txt +19 -0
- skill_seekers-2.7.3.dist-info/licenses/LICENSE +21 -0
- skill_seekers-2.7.3.dist-info/top_level.txt +1 -0
skill_seekers/cli/pdf_scraper.py
@@ -0,0 +1,617 @@
#!/usr/bin/env python3
"""
PDF Documentation to Claude Skill Converter (Task B1.6)

Converts PDF documentation into Claude AI skills.
Uses pdf_extractor_poc.py for extraction, builds skill structure.

Usage:
    python3 pdf_scraper.py --config configs/manual_pdf.json
    python3 pdf_scraper.py --pdf manual.pdf --name myskill
    python3 pdf_scraper.py --from-json manual_extracted.json
"""

import argparse
import json
import os
import re
import sys
from pathlib import Path

# Import the PDF extractor
from .pdf_extractor_poc import PDFExtractor

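# Note: the relative import above means this module runs as part of the
# installed package (e.g. `python -m skill_seekers.cli.pdf_scraper` or an
# installed CLI entry point); the bare `python3 pdf_scraper.py` invocations in
# the docstring would fail with "attempted relative import" as a loose script.
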
def infer_description_from_pdf(pdf_metadata: dict | None = None, name: str = "") -> str:
    """
    Infer skill description from PDF metadata or document properties.

    Tries to extract meaningful description from:
    1. PDF metadata fields (title, subject, keywords)
    2. Falls back to improved template

    Args:
        pdf_metadata: PDF metadata dictionary with title, subject, etc.
        name: Skill name for fallback

    Returns:
        Description string suitable for "Use when..." format
    """
    if pdf_metadata:
        # Try to use subject field (often contains description)
        if "subject" in pdf_metadata and pdf_metadata["subject"]:
            desc = str(pdf_metadata["subject"]).strip()
            if len(desc) > 20:
                if len(desc) > 150:
                    desc = desc[:147] + "..."
                return f"Use when {desc.lower()}"

        # Try title field if meaningful
        if "title" in pdf_metadata and pdf_metadata["title"]:
            title = str(pdf_metadata["title"]).strip()
            # Skip if it's just the filename
            if len(title) > 10 and not title.endswith(".pdf"):
                return f"Use when working with {title.lower()}"

    # Improved fallback
    return (
        f"Use when referencing {name} documentation"
        if name
        else "Use when referencing this documentation"
    )

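# Example (illustrative values):
#   infer_description_from_pdf({"subject": "Reference manual for the Zigbee stack"})
#     -> "Use when reference manual for the zigbee stack"
#   infer_description_from_pdf(None, name="zigbee")
#     -> "Use when referencing zigbee documentation"
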
class PDFToSkillConverter:
    """Convert PDF documentation to Claude skill"""

    def __init__(self, config):
        self.config = config
        self.name = config["name"]
        self.pdf_path = config.get("pdf_path", "")
        # Set initial description (will be improved after extraction if metadata available)
        self.description = config.get(
            "description", f"Use when referencing {self.name} documentation"
        )

        # Paths
        self.skill_dir = f"output/{self.name}"
        self.data_file = f"output/{self.name}_extracted.json"

        # Extraction options
        self.extract_options = config.get("extract_options", {})

        # Categories
        self.categories = config.get("categories", {})

        # Extracted data
        self.extracted_data = None

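    # Illustrative config (keys are the ones read above; values are examples):
    #   {
    #     "name": "manual",
    #     "pdf_path": "docs/manual.pdf",
    #     "description": "Use when referencing manual documentation",
    #     "extract_options": {"chunk_size": 10, "min_quality": 5.0},
    #     "categories": {"getting_started": ["install", "setup"]}
    #   }
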
    def extract_pdf(self):
        """Extract content from PDF using pdf_extractor_poc.py"""
        print(f"\n🔍 Extracting from PDF: {self.pdf_path}")

        # Create extractor with options
        extractor = PDFExtractor(
            self.pdf_path,
            verbose=True,
            chunk_size=self.extract_options.get("chunk_size", 10),
            min_quality=self.extract_options.get("min_quality", 5.0),
            extract_images=self.extract_options.get("extract_images", True),
            image_dir=f"{self.skill_dir}/assets/images",
            min_image_size=self.extract_options.get("min_image_size", 100),
        )

        # Extract
        result = extractor.extract_all()

        if not result:
            print("❌ Extraction failed")
            raise RuntimeError(f"Failed to extract PDF: {self.pdf_path}")

        # Save extracted data
        with open(self.data_file, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        print(f"\n💾 Saved extracted data to: {self.data_file}")
        self.extracted_data = result
        return True

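    # Note: infer_description_from_pdf() is available to refine self.description
    # here when the config omits one; assuming the extractor's result dict
    # carries a "metadata" entry, that would look like:
    #   self.description = infer_description_from_pdf(result.get("metadata"), self.name)
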
    def load_extracted_data(self, json_path):
        """Load previously extracted data from JSON"""
        print(f"\n📂 Loading extracted data from: {json_path}")

        with open(json_path, encoding="utf-8") as f:
            self.extracted_data = json.load(f)

        # .get() avoids a KeyError on partial or hand-edited extraction files
        print(f"✅ Loaded {self.extracted_data.get('total_pages', 0)} pages")
        return True

    def categorize_content(self):
        """Categorize pages based on chapters or keywords"""
        print("\n📋 Categorizing content...")

        categorized = {}

        # Use chapters if available
        if self.extracted_data.get("chapters"):
            for chapter in self.extracted_data["chapters"]:
                category_key = self._sanitize_filename(chapter["title"])
                categorized[category_key] = {"title": chapter["title"], "pages": []}

            # Assign pages to chapters (pages outside every chapter range are dropped)
            for page in self.extracted_data["pages"]:
                page_num = page["page_number"]

                # Find which chapter this page belongs to
                for chapter in self.extracted_data["chapters"]:
                    if chapter["start_page"] <= page_num <= chapter["end_page"]:
                        category_key = self._sanitize_filename(chapter["title"])
                        categorized[category_key]["pages"].append(page)
                        break

        # Fall back to keyword-based categorization
        elif self.categories:
            # Check if categories is already in the right format (for tests)
            # If first value is a list of dicts (pages), use as-is
            first_value = next(iter(self.categories.values()))
            if isinstance(first_value, list) and first_value and isinstance(first_value[0], dict):
                # Already categorized - convert to expected format
                for cat_key, pages in self.categories.items():
                    categorized[cat_key] = {
                        "title": cat_key.replace("_", " ").title(),
                        "pages": pages,
                    }
            else:
                # Keyword-based categorization
                # Initialize categories
                for cat_key in self.categories:
                    categorized[cat_key] = {"title": cat_key.replace("_", " ").title(), "pages": []}

                # Categorize by keywords
                for page in self.extracted_data["pages"]:
                    text = page.get("text", "").lower()
                    headings_text = " ".join([h["text"] for h in page.get("headings", [])]).lower()

                    # Score against each category
                    scores = {}
                    for cat_key, keywords in self.categories.items():
                        # Handle both string keywords and dict keywords (shouldn't happen, but be safe)
                        if isinstance(keywords, list):
                            score = sum(
                                1
                                for kw in keywords
                                if isinstance(kw, str)
                                and (kw.lower() in text or kw.lower() in headings_text)
                            )
                        else:
                            score = 0
                        if score > 0:
                            scores[cat_key] = score

                    # Assign to highest scoring category
                    if scores:
                        best_cat = max(scores, key=scores.get)
                        categorized[best_cat]["pages"].append(page)
                    else:
                        # Default category
                        if "other" not in categorized:
                            categorized["other"] = {"title": "Other", "pages": []}
                        categorized["other"]["pages"].append(page)

        else:
            # No categorization - use single category
            categorized["content"] = {"title": "Content", "pages": self.extracted_data["pages"]}

        print(f"✅ Created {len(categorized)} categories")
        for cat_data in categorized.values():
            print(f" - {cat_data['title']}: {len(cat_data['pages'])} pages")

        return categorized

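    # Illustrative scoring: with categories {"api": ["endpoint", "request"],
    # "setup": ["install"]}, a page mentioning "endpoint" and "install" scores
    # {"api": 1, "setup": 1}; max() keeps the first maximum in insertion order,
    # so the page lands in "api".
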
    def build_skill(self):
        """Build complete skill structure"""
        print(f"\n🏗️ Building skill: {self.name}")

        # Create directories
        os.makedirs(f"{self.skill_dir}/references", exist_ok=True)
        os.makedirs(f"{self.skill_dir}/scripts", exist_ok=True)
        os.makedirs(f"{self.skill_dir}/assets", exist_ok=True)

        # Categorize content
        categorized = self.categorize_content()

        # Generate reference files
        print("\n📝 Generating reference files...")
        for cat_key, cat_data in categorized.items():
            self._generate_reference_file(cat_key, cat_data)

        # Generate index
        self._generate_index(categorized)

        # Generate SKILL.md
        self._generate_skill_md(categorized)

        print(f"\n✅ Skill built successfully: {self.skill_dir}/")
        print(f"\n📦 Next step: Package with: skill-seekers package {self.skill_dir}/")

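    # Resulting layout (from the steps above):
    #   output/<name>/SKILL.md
    #   output/<name>/references/index.md + one .md per category
    #   output/<name>/assets/   (extracted images)
    #   output/<name>/scripts/  (created empty)
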
    def _generate_reference_file(self, cat_key, cat_data):
        """Generate a reference markdown file for a category"""
        filename = f"{self.skill_dir}/references/{cat_key}.md"

        with open(filename, "w", encoding="utf-8") as f:
            f.write(f"# {cat_data['title']}\n\n")

            for page in cat_data["pages"]:
                # Add headings as section markers
                if page.get("headings"):
                    f.write(f"## {page['headings'][0]['text']}\n\n")

                # Add text content
                if page.get("text"):
                    # Limit to first 1000 chars per page to avoid huge files
                    text = page["text"][:1000]
                    f.write(f"{text}\n\n")

                # Add code samples (check both 'code_samples' and 'code_blocks' for compatibility)
                code_list = page.get("code_samples") or page.get("code_blocks")
                if code_list:
                    f.write("### Code Examples\n\n")
                    for code in code_list[:3]:  # Limit to top 3
                        lang = code.get("language", "")
                        f.write(f"```{lang}\n{code['code']}\n```\n\n")

                # Add images
                if page.get("images"):
                    # Create assets directory if needed
                    assets_dir = os.path.join(self.skill_dir, "assets")
                    os.makedirs(assets_dir, exist_ok=True)

                    f.write("### Images\n\n")
                    for img in page["images"]:
                        # Save image to assets
                        img_filename = f"page_{page['page_number']}_img_{img['index']}.png"
                        img_path = os.path.join(assets_dir, img_filename)

                        with open(img_path, "wb") as img_file:
                            img_file.write(img["data"])

                        # Add markdown image reference
                        f.write(f"![Image {img['index']}](../assets/{img_filename})\n\n")

                f.write("---\n\n")

        print(f" Generated: {filename}")

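    # Illustrative fragment of a generated references/<cat_key>.md:
    #   # Getting Started
    #   ## Installation
    #   <first 1000 chars of page text>
    #   ### Code Examples
    #   ![Image 0](../assets/page_12_img_0.png)
    #   ---
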
    def _generate_index(self, categorized):
        """Generate reference index"""
        filename = f"{self.skill_dir}/references/index.md"

        with open(filename, "w", encoding="utf-8") as f:
            f.write(f"# {self.name.title()} Documentation Reference\n\n")
            f.write("## Categories\n\n")

            for cat_key, cat_data in categorized.items():
                page_count = len(cat_data["pages"])
                f.write(f"- [{cat_data['title']}]({cat_key}.md) ({page_count} pages)\n")

            f.write("\n## Statistics\n\n")
            stats = self.extracted_data.get("quality_statistics", {})
            f.write(f"- Total pages: {self.extracted_data.get('total_pages', 0)}\n")
            f.write(f"- Code blocks: {self.extracted_data.get('total_code_blocks', 0)}\n")
            f.write(f"- Images: {self.extracted_data.get('total_images', 0)}\n")
            if stats:
                f.write(f"- Average code quality: {stats.get('average_quality', 0):.1f}/10\n")
                f.write(f"- Valid code blocks: {stats.get('valid_code_blocks', 0)}\n")

        print(f" Generated: {filename}")

    def _generate_skill_md(self, categorized):
        """Generate main SKILL.md file (enhanced with rich content)"""
        filename = f"{self.skill_dir}/SKILL.md"

        # Generate skill name (lowercase, hyphens only, max 64 chars)
        skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]

        # Truncate description to 1024 chars if needed (slicing is a no-op when shorter)
        desc = self.description[:1024]

        with open(filename, "w", encoding="utf-8") as f:
            # Write YAML frontmatter
            f.write("---\n")
            f.write(f"name: {skill_name}\n")
            f.write(f"description: {desc}\n")
            f.write("---\n\n")

            f.write(f"# {self.name.title()} Documentation Skill\n\n")
            f.write(f"{self.description}\n\n")

            # Enhanced "When to Use" section
            f.write("## 💡 When to Use This Skill\n\n")
            f.write("Use this skill when you need to:\n")
            f.write(f"- Understand {self.name} concepts and fundamentals\n")
            f.write("- Look up API references and technical specifications\n")
            f.write("- Find code examples and implementation patterns\n")
            f.write("- Review tutorials, guides, and best practices\n")
            f.write("- Explore the complete documentation structure\n\n")

            # Chapter Overview (PDF structure)
            f.write("## 📖 Chapter Overview\n\n")
            total_pages = self.extracted_data.get("total_pages", 0)
            f.write(f"**Total Pages:** {total_pages}\n\n")
            f.write("**Content Breakdown:**\n\n")
            for cat_data in categorized.values():
                page_count = len(cat_data["pages"])
                f.write(f"- **{cat_data['title']}**: {page_count} pages\n")
            f.write("\n")

            # Extract key concepts from headings
            f.write(self._format_key_concepts())

            # Quick Reference with patterns
            f.write("## ⚡ Quick Reference\n\n")
            f.write(self._format_patterns_from_content())

            # Enhanced code examples section (top 15, grouped by language)
            # Check both 'code_samples' and 'code_blocks', matching _generate_reference_file
            all_code = []
            for page in self.extracted_data["pages"]:
                all_code.extend(page.get("code_samples") or page.get("code_blocks") or [])

            # Sort by quality and get top 15
            all_code.sort(key=lambda x: x.get("quality_score", 0), reverse=True)
            top_code = all_code[:15]

            if top_code:
                f.write("## 📝 Code Examples\n\n")
                f.write("*High-quality examples extracted from documentation*\n\n")

                # Group by language
                by_lang = {}
                for code in top_code:
                    lang = code.get("language", "unknown")
                    if lang not in by_lang:
                        by_lang[lang] = []
                    by_lang[lang].append(code)

                # Display grouped by language
                for lang in sorted(by_lang.keys()):
                    examples = by_lang[lang]
                    f.write(f"### {lang.title()} Examples ({len(examples)})\n\n")

                    for i, code in enumerate(examples[:5], 1):  # Top 5 per language
                        quality = code.get("quality_score", 0)
                        code_text = code.get("code", "")

                        f.write(f"**Example {i}** (Quality: {quality:.1f}/10):\n\n")
                        f.write(f"```{lang}\n")

                        # Show full code if short, truncate if long
                        if len(code_text) <= 500:
                            f.write(code_text)
                        else:
                            f.write(code_text[:500] + "\n...")

                        f.write("\n```\n\n")

            # Statistics
            f.write("## 📊 Documentation Statistics\n\n")
            f.write(f"- **Total Pages**: {total_pages}\n")
            total_code_blocks = self.extracted_data.get("total_code_blocks", 0)
            f.write(f"- **Code Blocks**: {total_code_blocks}\n")
            total_images = self.extracted_data.get("total_images", 0)
            f.write(f"- **Images/Diagrams**: {total_images}\n")

            # Language statistics
            langs = self.extracted_data.get("languages_detected", {})
            if langs:
                f.write(f"- **Programming Languages**: {len(langs)}\n\n")
                f.write("**Language Breakdown:**\n\n")
                for lang, count in sorted(langs.items(), key=lambda x: x[1], reverse=True):
                    f.write(f"- {lang}: {count} examples\n")
                f.write("\n")

            # Quality metrics
            quality_stats = self.extracted_data.get("quality_statistics", {})
            if quality_stats:
                avg_quality = quality_stats.get("average_quality", 0)
                valid_blocks = quality_stats.get("valid_code_blocks", 0)
                f.write("**Code Quality:**\n\n")
                f.write(f"- Average Quality Score: {avg_quality:.1f}/10\n")
                f.write(f"- Valid Code Blocks: {valid_blocks}\n\n")

            # Navigation
            f.write("## 🗺️ Navigation\n\n")
            f.write("**Reference Files:**\n\n")
            # Link by cat_key so these match the filenames _generate_reference_file wrote
            for cat_key, cat_data in categorized.items():
                f.write(f"- `references/{cat_key}.md` - {cat_data['title']}\n")
            f.write("\n")
            f.write("See `references/index.md` for complete documentation structure.\n\n")

            # Footer
            f.write("---\n\n")
            f.write("**Generated by Skill Seeker** | PDF Documentation Scraper\n")

        with open(filename, encoding="utf-8") as f:
            line_count = len(f.read().split("\n"))
        print(f" Generated: {filename} ({line_count} lines)")

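    # Illustrative frontmatter produced above for name="my_manual":
    #   ---
    #   name: my-manual
    #   description: Use when referencing my_manual documentation
    #   ---
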
    def _format_key_concepts(self) -> str:
        """Extract key concepts from headings across all pages."""
        all_headings = []

        for page in self.extracted_data.get("pages", []):
            headings = page.get("headings", [])
            for heading in headings:
                text = heading.get("text", "").strip()
                level = heading.get("level", "h1")
                if text and len(text) > 3:  # Skip very short headings
                    all_headings.append((level, text))

        if not all_headings:
            return ""

        content = "## 🔑 Key Concepts\n\n"
        content += "*Main topics covered in this documentation*\n\n"

        # Group by level and show top concepts
        h1_headings = [text for level, text in all_headings if level == "h1"]
        h2_headings = [text for level, text in all_headings if level == "h2"]

        if h1_headings:
            content += "**Major Topics:**\n\n"
            for heading in h1_headings[:10]:  # Top 10
                content += f"- {heading}\n"
            content += "\n"

        if h2_headings:
            content += "**Subtopics:**\n\n"
            for heading in h2_headings[:15]:  # Top 15
                content += f"- {heading}\n"
            content += "\n"

        return content

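    # Illustrative output (given an h1 "Introduction" and an h2 "Installing dependencies"):
    #   ## 🔑 Key Concepts
    #   **Major Topics:**
    #   - Introduction
    #   **Subtopics:**
    #   - Installing dependencies
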
    def _format_patterns_from_content(self) -> str:
        """Extract common documentation patterns from page headings."""
        patterns = []

        # Common pattern keywords to match against headings (hoisted out of the
        # page loop; rebuilding the list for every page was wasted work)
        pattern_keywords = [
            "getting started",
            "installation",
            "configuration",
            "usage",
            "api",
            "examples",
            "tutorial",
            "guide",
            "best practices",
            "troubleshooting",
            "faq",
        ]

        # Simple pattern extraction from headings
        for page in self.extracted_data.get("pages", []):
            headings = page.get("headings", [])

            for heading in headings:
                heading_text = heading.get("text", "").lower()
                for keyword in pattern_keywords:
                    if keyword in heading_text:
                        page_num = page.get("page_number", 0)
                        patterns.append(
                            {
                                "type": keyword.title(),
                                "heading": heading.get("text", ""),
                                "page": page_num,
                            }
                        )
                        break  # Only add once per heading

        if not patterns:
            return "*See reference files for detailed content*\n\n"

        content = "*Common documentation patterns found:*\n\n"

        # Group by type
        by_type = {}
        for pattern in patterns:
            ptype = pattern["type"]
            if ptype not in by_type:
                by_type[ptype] = []
            by_type[ptype].append(pattern)

        # Display grouped patterns
        for ptype in sorted(by_type.keys()):
            items = by_type[ptype]
            content += f"**{ptype}** ({len(items)} sections):\n"
            for item in items[:3]:  # Top 3 per type
                content += f"- {item['heading']} (page {item['page']})\n"
            content += "\n"

        return content

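    # Illustrative output:
    #   **Installation** (2 sections):
    #   - Installing the CLI (page 4)
    #   - Installing from source (page 6)
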
    def _sanitize_filename(self, name):
        """Convert string to safe filename"""
        # Remove special chars, replace spaces with underscores
        safe = re.sub(r"[^\w\s-]", "", name.lower())
        safe = re.sub(r"[-\s]+", "_", safe)
        return safe
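    # Example: _sanitize_filename("Getting Started: Setup & Config")
    #   -> "getting_started_setup_config"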


def main():
    parser = argparse.ArgumentParser(
        description="Convert PDF documentation to Claude skill",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument("--config", help="PDF config JSON file")
    parser.add_argument("--pdf", help="Direct PDF file path")
    parser.add_argument("--name", help="Skill name (with --pdf)")
    parser.add_argument("--from-json", help="Build skill from extracted JSON")
    parser.add_argument("--description", help="Skill description")

    args = parser.parse_args()

    # Validate inputs
    if not (args.config or args.pdf or args.from_json):
        parser.error("Must specify --config, --pdf, or --from-json")

    # Load or create config
    if args.config:
        with open(args.config) as f:
            config = json.load(f)
    elif args.from_json:
        # Build from extracted JSON (note: this path returns before the
        # try/except below, so unexpected errors surface as raw tracebacks)
        name = Path(args.from_json).stem.replace("_extracted", "")
        config = {
            "name": name,
            "description": args.description or f"Use when referencing {name} documentation",
        }
        converter = PDFToSkillConverter(config)
        converter.load_extracted_data(args.from_json)
        converter.build_skill()
        return
    else:
        # Direct PDF mode
        if not args.name:
            parser.error("Must specify --name with --pdf")
        config = {
            "name": args.name,
            "pdf_path": args.pdf,
            "description": args.description or f"Use when referencing {args.name} documentation",
            "extract_options": {
                "chunk_size": 10,
                "min_quality": 5.0,
                "extract_images": True,
                "min_image_size": 100,
            },
        }

    # Create converter
    try:
        converter = PDFToSkillConverter(config)

        # Extract if needed (extract_pdf() raises RuntimeError on failure and
        # returns True otherwise, so the falsy check is a belt-and-braces guard)
        if config.get("pdf_path"):
            if not converter.extract_pdf():
                print("\n❌ PDF extraction failed - see error above", file=sys.stderr)
                sys.exit(1)

        # Build skill
        converter.build_skill()

    except RuntimeError as e:
        print(f"\n❌ Error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ Unexpected error during PDF processing: {e}", file=sys.stderr)
        import traceback

        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()