skill-seekers 2.7.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. skill_seekers/__init__.py +22 -0
  2. skill_seekers/cli/__init__.py +39 -0
  3. skill_seekers/cli/adaptors/__init__.py +120 -0
  4. skill_seekers/cli/adaptors/base.py +221 -0
  5. skill_seekers/cli/adaptors/claude.py +485 -0
  6. skill_seekers/cli/adaptors/gemini.py +453 -0
  7. skill_seekers/cli/adaptors/markdown.py +269 -0
  8. skill_seekers/cli/adaptors/openai.py +503 -0
  9. skill_seekers/cli/ai_enhancer.py +310 -0
  10. skill_seekers/cli/api_reference_builder.py +373 -0
  11. skill_seekers/cli/architectural_pattern_detector.py +525 -0
  12. skill_seekers/cli/code_analyzer.py +1462 -0
  13. skill_seekers/cli/codebase_scraper.py +1225 -0
  14. skill_seekers/cli/config_command.py +563 -0
  15. skill_seekers/cli/config_enhancer.py +431 -0
  16. skill_seekers/cli/config_extractor.py +871 -0
  17. skill_seekers/cli/config_manager.py +452 -0
  18. skill_seekers/cli/config_validator.py +394 -0
  19. skill_seekers/cli/conflict_detector.py +528 -0
  20. skill_seekers/cli/constants.py +72 -0
  21. skill_seekers/cli/dependency_analyzer.py +757 -0
  22. skill_seekers/cli/doc_scraper.py +2332 -0
  23. skill_seekers/cli/enhance_skill.py +488 -0
  24. skill_seekers/cli/enhance_skill_local.py +1096 -0
  25. skill_seekers/cli/enhance_status.py +194 -0
  26. skill_seekers/cli/estimate_pages.py +433 -0
  27. skill_seekers/cli/generate_router.py +1209 -0
  28. skill_seekers/cli/github_fetcher.py +534 -0
  29. skill_seekers/cli/github_scraper.py +1466 -0
  30. skill_seekers/cli/guide_enhancer.py +723 -0
  31. skill_seekers/cli/how_to_guide_builder.py +1267 -0
  32. skill_seekers/cli/install_agent.py +461 -0
  33. skill_seekers/cli/install_skill.py +178 -0
  34. skill_seekers/cli/language_detector.py +614 -0
  35. skill_seekers/cli/llms_txt_detector.py +60 -0
  36. skill_seekers/cli/llms_txt_downloader.py +104 -0
  37. skill_seekers/cli/llms_txt_parser.py +150 -0
  38. skill_seekers/cli/main.py +558 -0
  39. skill_seekers/cli/markdown_cleaner.py +132 -0
  40. skill_seekers/cli/merge_sources.py +806 -0
  41. skill_seekers/cli/package_multi.py +77 -0
  42. skill_seekers/cli/package_skill.py +241 -0
  43. skill_seekers/cli/pattern_recognizer.py +1825 -0
  44. skill_seekers/cli/pdf_extractor_poc.py +1166 -0
  45. skill_seekers/cli/pdf_scraper.py +617 -0
  46. skill_seekers/cli/quality_checker.py +519 -0
  47. skill_seekers/cli/rate_limit_handler.py +438 -0
  48. skill_seekers/cli/resume_command.py +160 -0
  49. skill_seekers/cli/run_tests.py +230 -0
  50. skill_seekers/cli/setup_wizard.py +93 -0
  51. skill_seekers/cli/split_config.py +390 -0
  52. skill_seekers/cli/swift_patterns.py +560 -0
  53. skill_seekers/cli/test_example_extractor.py +1081 -0
  54. skill_seekers/cli/test_unified_simple.py +179 -0
  55. skill_seekers/cli/unified_codebase_analyzer.py +572 -0
  56. skill_seekers/cli/unified_scraper.py +932 -0
  57. skill_seekers/cli/unified_skill_builder.py +1605 -0
  58. skill_seekers/cli/upload_skill.py +162 -0
  59. skill_seekers/cli/utils.py +432 -0
  60. skill_seekers/mcp/__init__.py +33 -0
  61. skill_seekers/mcp/agent_detector.py +316 -0
  62. skill_seekers/mcp/git_repo.py +273 -0
  63. skill_seekers/mcp/server.py +231 -0
  64. skill_seekers/mcp/server_fastmcp.py +1249 -0
  65. skill_seekers/mcp/server_legacy.py +2302 -0
  66. skill_seekers/mcp/source_manager.py +285 -0
  67. skill_seekers/mcp/tools/__init__.py +115 -0
  68. skill_seekers/mcp/tools/config_tools.py +251 -0
  69. skill_seekers/mcp/tools/packaging_tools.py +826 -0
  70. skill_seekers/mcp/tools/scraping_tools.py +842 -0
  71. skill_seekers/mcp/tools/source_tools.py +828 -0
  72. skill_seekers/mcp/tools/splitting_tools.py +212 -0
  73. skill_seekers/py.typed +0 -0
  74. skill_seekers-2.7.3.dist-info/METADATA +2027 -0
  75. skill_seekers-2.7.3.dist-info/RECORD +79 -0
  76. skill_seekers-2.7.3.dist-info/WHEEL +5 -0
  77. skill_seekers-2.7.3.dist-info/entry_points.txt +19 -0
  78. skill_seekers-2.7.3.dist-info/licenses/LICENSE +21 -0
  79. skill_seekers-2.7.3.dist-info/top_level.txt +1 -0
@@ -0,0 +1,617 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ PDF Documentation to Claude Skill Converter (Task B1.6)
4
+
5
+ Converts PDF documentation into Claude AI skills.
6
+ Uses pdf_extractor_poc.py for extraction, builds skill structure.
7
+
8
+ Usage:
9
+ python3 pdf_scraper.py --config configs/manual_pdf.json
10
+ python3 pdf_scraper.py --pdf manual.pdf --name myskill
11
+ python3 pdf_scraper.py --from-json manual_extracted.json
12
+ """
13
+
14
+ import argparse
15
+ import json
16
+ import os
17
+ import re
18
+ import sys
19
+ from pathlib import Path
20
+
21
+ # Import the PDF extractor
22
+ from .pdf_extractor_poc import PDFExtractor
23
+
24
+
25
def infer_description_from_pdf(pdf_metadata: dict = None, name: str = "") -> str:
    """
    Build a "Use when ..." skill description from PDF metadata.

    Candidates are tried in this order:
    1. The ``subject`` field, when its stripped text is longer than 20
       characters (clipped to 147 characters plus "..." past 150).
    2. The ``title`` field, when its stripped text is longer than 10
       characters and does not end in ".pdf".
    3. A generic sentence built from ``name`` (or a nameless variant).

    Args:
        pdf_metadata: Metadata dictionary; may be None or empty.
        name: Skill name used by the generic fallback.

    Returns:
        The description string; metadata-derived text is lowercased.
    """
    metadata = pdf_metadata or {}

    subject = metadata.get("subject")
    if subject:
        summary = str(subject).strip()
        if len(summary) > 20:
            clipped = summary if len(summary) <= 150 else summary[:147] + "..."
            return f"Use when {clipped.lower()}"

    raw_title = metadata.get("title")
    if raw_title:
        heading = str(raw_title).strip()
        # A title that is really a filename carries no descriptive value.
        looks_like_filename = heading.endswith(".pdf")
        if len(heading) > 10 and not looks_like_filename:
            return f"Use when working with {heading.lower()}"

    if not name:
        return "Use when referencing this documentation"
    return f"Use when referencing {name} documentation"
62
+
63
+
64
+ class PDFToSkillConverter:
65
+ """Convert PDF documentation to Claude skill"""
66
+
67
def __init__(self, config):
    """
    Store the conversion settings taken from ``config``.

    ``config["name"]`` is required. ``pdf_path``, ``description``,
    ``extract_options`` and ``categories`` are optional and fall back to
    an empty string, a generic description, and empty dicts respectively.
    """
    self.config = config
    skill_name = config["name"]
    self.name = skill_name
    self.pdf_path = config.get("pdf_path", "")

    # Starting description; a generic sentence is used when none is configured.
    fallback_description = f"Use when referencing {skill_name} documentation"
    self.description = config.get("description", fallback_description)

    # Output locations are derived from the skill name.
    self.skill_dir = f"output/{skill_name}"
    self.data_file = f"output/{skill_name}_extracted.json"

    # Options forwarded to the extractor and category definitions.
    self.extract_options = config.get("extract_options", {})
    self.categories = config.get("categories", {})

    # Filled in by extract_pdf() or load_extracted_data().
    self.extracted_data = None
88
+
89
def extract_pdf(self):
    """
    Extract content from ``self.pdf_path`` and cache it as JSON.

    A ``PDFExtractor`` is built from ``self.extract_options`` (defaults:
    chunk_size=10, min_quality=5.0, extract_images=True,
    min_image_size=100) and its ``extract_all()`` result is written to
    ``self.data_file`` and stored on ``self.extracted_data``.

    Returns:
        True on success.

    Raises:
        RuntimeError: If the extractor returns a falsy result.
    """
    print(f"\n🔍 Extracting from PDF: {self.pdf_path}")

    options = self.extract_options
    extractor = PDFExtractor(
        self.pdf_path,
        verbose=True,
        chunk_size=options.get("chunk_size", 10),
        min_quality=options.get("min_quality", 5.0),
        extract_images=options.get("extract_images", True),
        image_dir=f"{self.skill_dir}/assets/images",
        min_image_size=options.get("min_image_size", 100),
    )

    result = extractor.extract_all()

    if not result:
        print("❌ Extraction failed")
        raise RuntimeError(f"Failed to extract PDF: {self.pdf_path}")

    # The data file lives under "output/", which nothing in this method has
    # created yet; make sure it exists so the write below cannot fail with
    # FileNotFoundError on a fresh checkout.
    data_dir = os.path.dirname(self.data_file)
    if data_dir:
        os.makedirs(data_dir, exist_ok=True)

    with open(self.data_file, "w", encoding="utf-8") as f:
        json.dump(result, f, indent=2, ensure_ascii=False)

    print(f"\n💾 Saved extracted data to: {self.data_file}")
    self.extracted_data = result
    return True
118
+
119
def load_extracted_data(self, json_path):
    """
    Load previously extracted data from a JSON file.

    Args:
        json_path: Path of the JSON file to read (UTF-8).

    Returns:
        True once the data has been stored on ``self.extracted_data``.
    """
    print(f"\n📂 Loading extracted data from: {json_path}")

    with open(json_path, encoding="utf-8") as f:
        self.extracted_data = json.load(f)

    # "total_pages" is optional everywhere else in this class, so a missing
    # key must not abort loading just to print a status line; fall back to
    # counting the page entries.
    page_count = self.extracted_data.get(
        "total_pages", len(self.extracted_data.get("pages", []))
    )
    print(f"✅ Loaded {page_count} pages")
    return True
128
+
129
def categorize_content(self):
    """
    Group extracted pages into categories.

    Strategy, in priority order:
    1. Chapters from the extracted data: one category per chapter, keyed
       by the sanitized chapter title; a page goes to the first chapter
       whose ``start_page``..``end_page`` range contains its page number.
    2. ``self.categories``: either already-categorized page lists (used
       as-is) or keyword lists scored against page text and headings.
    3. A single "content" category holding every page.

    Pages that match no chapter or keyword are kept under "other" rather
    than being dropped.

    Returns:
        Dict mapping category key to ``{"title": str, "pages": list}``.
    """
    print("\n📋 Categorizing content...")

    categorized = {}
    chapters = self.extracted_data.get("chapters")

    if chapters:
        # Sanitize each title once instead of once per page per chapter.
        keyed_chapters = [
            (self._sanitize_filename(chapter["title"]), chapter) for chapter in chapters
        ]
        for category_key, chapter in keyed_chapters:
            categorized[category_key] = {"title": chapter["title"], "pages": []}

        for page in self.extracted_data["pages"]:
            page_num = page["page_number"]
            for category_key, chapter in keyed_chapters:
                if chapter["start_page"] <= page_num <= chapter["end_page"]:
                    categorized[category_key]["pages"].append(page)
                    break
            else:
                # Pages outside every chapter range (cover, TOC, appendix)
                # used to vanish from the skill; keep them, mirroring the
                # keyword branch's default category.
                categorized.setdefault("other", {"title": "Other", "pages": []})
                categorized["other"]["pages"].append(page)

    elif self.categories:
        # Categories may already hold page dicts (this shape is used by tests).
        first_value = next(iter(self.categories.values()))
        if isinstance(first_value, list) and first_value and isinstance(first_value[0], dict):
            for cat_key, pages in self.categories.items():
                categorized[cat_key] = {
                    "title": cat_key.replace("_", " ").title(),
                    "pages": pages,
                }
        else:
            for cat_key in self.categories:
                categorized[cat_key] = {"title": cat_key.replace("_", " ").title(), "pages": []}

            for page in self.extracted_data["pages"]:
                text = page.get("text", "").lower()
                headings_text = " ".join(
                    h.get("text", "") for h in page.get("headings", [])
                ).lower()

                # Score = number of keywords found in the page text or headings.
                scores = {}
                for cat_key, keywords in self.categories.items():
                    if not isinstance(keywords, list):
                        continue
                    score = sum(
                        1
                        for kw in keywords
                        if isinstance(kw, str)
                        and (kw.lower() in text or kw.lower() in headings_text)
                    )
                    if score > 0:
                        scores[cat_key] = score

                if scores:
                    # Ties resolve to the category defined first.
                    best_cat = max(scores, key=scores.get)
                    categorized[best_cat]["pages"].append(page)
                else:
                    categorized.setdefault("other", {"title": "Other", "pages": []})
                    categorized["other"]["pages"].append(page)

    else:
        categorized["content"] = {"title": "Content", "pages": self.extracted_data["pages"]}

    print(f"✅ Created {len(categorized)} categories")
    for cat_data in categorized.values():
        print(f" - {cat_data['title']}: {len(cat_data['pages'])} pages")

    return categorized
210
+
211
def build_skill(self):
    """
    Produce the full skill directory from the extracted data.

    Creates the references/scripts/assets folders under ``self.skill_dir``,
    categorizes the pages, then writes one reference file per category,
    the reference index, and SKILL.md.
    """
    print(f"\n🏗️ Building skill: {self.name}")

    # Directory skeleton of the skill.
    for subfolder in ("references", "scripts", "assets"):
        os.makedirs(f"{self.skill_dir}/{subfolder}", exist_ok=True)

    grouped = self.categorize_content()

    print("\n📝 Generating reference files...")
    for key in grouped:
        self._generate_reference_file(key, grouped[key])

    self._generate_index(grouped)
    self._generate_skill_md(grouped)

    print(f"\n✅ Skill built successfully: {self.skill_dir}/")
    print(f"\n📦 Next step: Package with: skill-seekers package {self.skill_dir}/")
236
+
237
def _generate_reference_file(self, cat_key, cat_data):
    """
    Write ``references/<cat_key>.md`` for one category.

    For every page the file receives, in order: the first heading (if
    any), the first 1000 characters of text, up to three code samples
    (taken from ``code_samples`` or, failing that, ``code_blocks``), the
    page's images, and a ``---`` separator. Image bytes are written to
    ``<skill_dir>/assets`` and linked relatively from the markdown.
    """
    target = f"{self.skill_dir}/references/{cat_key}.md"

    with open(target, "w", encoding="utf-8") as out:
        out.write(f"# {cat_data['title']}\n\n")

        for page in cat_data["pages"]:
            # First heading doubles as the section marker.
            headings = page.get("headings")
            if headings:
                out.write(f"## {headings[0]['text']}\n\n")

            # Cap text per page to keep reference files small.
            body = page.get("text")
            if body:
                out.write(f"{body[:1000]}\n\n")

            # Either key may carry the code list.
            snippets = page.get("code_samples") or page.get("code_blocks")
            if snippets:
                out.write("### Code Examples\n\n")
                for snippet in snippets[:3]:
                    fence = snippet.get("language", "")
                    out.write(f"```{fence}\n{snippet['code']}\n```\n\n")

            images = page.get("images")
            if images:
                assets_dir = os.path.join(self.skill_dir, "assets")
                os.makedirs(assets_dir, exist_ok=True)

                out.write("### Images\n\n")
                for image in images:
                    image_name = f"page_{page['page_number']}_img_{image['index']}.png"
                    with open(os.path.join(assets_dir, image_name), "wb") as sink:
                        sink.write(image["data"])
                    out.write(f"![Image {image['index']}](../assets/{image_name})\n\n")

            out.write("---\n\n")

    print(f" Generated: {target}")
284
+
285
def _generate_index(self, categorized):
    """
    Write ``references/index.md``: one link per category followed by the
    document statistics read from ``self.extracted_data``.
    """
    target = f"{self.skill_dir}/references/index.md"
    data = self.extracted_data

    with open(target, "w", encoding="utf-8") as out:
        lines = [
            f"# {self.name.title()} Documentation Reference\n\n",
            "## Categories\n\n",
        ]
        lines.extend(
            f"- [{entry['title']}]({key}.md) ({len(entry['pages'])} pages)\n"
            for key, entry in categorized.items()
        )

        lines.append("\n## Statistics\n\n")
        quality = data.get("quality_statistics", {})
        counters = (
            ("Total pages", "total_pages"),
            ("Code blocks", "total_code_blocks"),
            ("Images", "total_images"),
        )
        for label, field in counters:
            lines.append(f"- {label}: {data.get(field, 0)}\n")

        # Quality figures are only listed when the extractor supplied them.
        if quality:
            lines.append(f"- Average code quality: {quality.get('average_quality', 0):.1f}/10\n")
            lines.append(f"- Valid code blocks: {quality.get('valid_code_blocks', 0)}\n")

        out.writelines(lines)

    print(f" Generated: {target}")
307
+
308
def _generate_skill_md(self, categorized):
    """
    Write the main ``SKILL.md`` file for the skill.

    Sections, in order: YAML frontmatter (name, description), title and
    description, "When to Use", chapter overview, key concepts, quick
    reference, the best code examples grouped by language, statistics,
    and navigation links to the per-category reference files.

    Args:
        categorized: Mapping of category key to
            ``{"title": str, "pages": list}``; the keys are the reference
            file stems used by the reference-file writer.
    """
    filename = f"{self.skill_dir}/SKILL.md"

    # Skill name: lowercased, underscores/spaces turned into hyphens, max 64 chars.
    skill_name = self.name.lower().replace("_", "-").replace(" ", "-")[:64]

    # Frontmatter description is capped at 1024 characters.
    desc = self.description[:1024]

    with open(filename, "w", encoding="utf-8") as f:
        # YAML frontmatter
        f.write("---\n")
        f.write(f"name: {skill_name}\n")
        f.write(f"description: {desc}\n")
        f.write("---\n\n")

        f.write(f"# {self.name.title()} Documentation Skill\n\n")
        f.write(f"{self.description}\n\n")

        f.write("## 💡 When to Use This Skill\n\n")
        f.write("Use this skill when you need to:\n")
        f.write(f"- Understand {self.name} concepts and fundamentals\n")
        f.write("- Look up API references and technical specifications\n")
        f.write("- Find code examples and implementation patterns\n")
        f.write("- Review tutorials, guides, and best practices\n")
        f.write("- Explore the complete documentation structure\n\n")

        # Chapter overview (PDF structure)
        f.write("## 📖 Chapter Overview\n\n")
        total_pages = self.extracted_data.get("total_pages", 0)
        f.write(f"**Total Pages:** {total_pages}\n\n")
        f.write("**Content Breakdown:**\n\n")
        for cat_data in categorized.values():
            f.write(f"- **{cat_data['title']}**: {len(cat_data['pages'])} pages\n")
        f.write("\n")

        f.write(self._format_key_concepts())

        f.write("## ⚡ Quick Reference\n\n")
        f.write(self._format_patterns_from_content())

        # Collect code from either key, matching the reference-file writer,
        # so data that only carries "code_blocks" still yields examples.
        all_code = []
        for page in self.extracted_data["pages"]:
            all_code.extend(page.get("code_samples") or page.get("code_blocks") or [])

        # Best 15 by quality; a missing or null score counts as 0.
        all_code.sort(key=lambda sample: sample.get("quality_score") or 0, reverse=True)
        top_code = all_code[:15]

        if top_code:
            f.write("## 📝 Code Examples\n\n")
            f.write("*High-quality examples extracted from documentation*\n\n")

            # A null language is grouped under "unknown" so that sorting the
            # keys and calling .title() cannot fail on None.
            by_lang = {}
            for code in top_code:
                by_lang.setdefault(code.get("language") or "unknown", []).append(code)

            for lang in sorted(by_lang):
                examples = by_lang[lang]
                f.write(f"### {lang.title()} Examples ({len(examples)})\n\n")

                for i, code in enumerate(examples[:5], 1):  # Top 5 per language
                    quality = code.get("quality_score") or 0
                    code_text = code.get("code", "")

                    f.write(f"**Example {i}** (Quality: {quality:.1f}/10):\n\n")
                    f.write(f"```{lang}\n")

                    # Show full code if short, truncate if long
                    if len(code_text) <= 500:
                        f.write(code_text)
                    else:
                        f.write(code_text[:500] + "\n...")

                    f.write("\n```\n\n")

        # Statistics
        f.write("## 📊 Documentation Statistics\n\n")
        f.write(f"- **Total Pages**: {total_pages}\n")
        total_code_blocks = self.extracted_data.get("total_code_blocks", 0)
        f.write(f"- **Code Blocks**: {total_code_blocks}\n")
        total_images = self.extracted_data.get("total_images", 0)
        f.write(f"- **Images/Diagrams**: {total_images}\n")

        langs = self.extracted_data.get("languages_detected", {})
        if langs:
            f.write(f"- **Programming Languages**: {len(langs)}\n\n")
            f.write("**Language Breakdown:**\n\n")
            for lang, count in sorted(langs.items(), key=lambda x: x[1], reverse=True):
                f.write(f"- {lang}: {count} examples\n")
            f.write("\n")
        else:
            # Close the list with a blank line; otherwise the next paragraph
            # or heading is glued to the last bullet in rendered markdown.
            f.write("\n")

        quality_stats = self.extracted_data.get("quality_statistics", {})
        if quality_stats:
            avg_quality = quality_stats.get("average_quality", 0)
            valid_blocks = quality_stats.get("valid_code_blocks", 0)
            f.write("**Code Quality:**\n\n")
            f.write(f"- Average Quality Score: {avg_quality:.1f}/10\n")
            f.write(f"- Valid Code Blocks: {valid_blocks}\n\n")

        # Navigation: reference files are written as "<category key>.md", so
        # link by key. Re-sanitizing the title pointed at files that do not
        # exist whenever the key is not the sanitized title (e.g. a
        # hyphenated key from the config).
        f.write("## 🗺️ Navigation\n\n")
        f.write("**Reference Files:**\n\n")
        for cat_key, cat_data in categorized.items():
            f.write(f"- `references/{cat_key}.md` - {cat_data['title']}\n")
        f.write("\n")
        f.write("See `references/index.md` for complete documentation structure.\n\n")

        # Footer
        f.write("---\n\n")
        f.write("**Generated by Skill Seeker** | PDF Documentation Scraper\n")

    with open(filename, encoding="utf-8") as f:
        line_count = len(f.read().split("\n"))
    print(f" Generated: {filename} ({line_count} lines)")
437
+
438
def _format_key_concepts(self) -> str:
    """
    Render the "Key Concepts" section from page headings.

    Headings whose stripped text is longer than three characters are
    considered; up to 10 "h1" headings are listed as major topics and up
    to 15 "h2" headings as subtopics. A heading without a level counts as
    "h1". Returns an empty string when no heading qualifies.
    """
    major_topics = []
    subtopics = []
    found_any = False

    for page in self.extracted_data.get("pages", []):
        for entry in page.get("headings", []):
            label = entry.get("text", "").strip()
            if len(label) <= 3:  # too short to be a meaningful topic
                continue
            found_any = True
            level = entry.get("level", "h1")
            if level == "h1":
                major_topics.append(label)
            elif level == "h2":
                subtopics.append(label)

    if not found_any:
        return ""

    parts = [
        "## 🔑 Key Concepts\n\n",
        "*Main topics covered in this documentation*\n\n",
    ]
    sections = (
        ("**Major Topics:**\n\n", major_topics, 10),
        ("**Subtopics:**\n\n", subtopics, 15),
    )
    for caption, topics, limit in sections:
        if topics:
            parts.append(caption)
            parts.extend(f"- {topic}\n" for topic in topics[:limit])
            parts.append("\n")

    return "".join(parts)
473
+
474
def _format_patterns_from_content(self) -> str:
    """
    Render the quick-reference list of common documentation sections.

    Every page heading is checked against a fixed list of section
    keywords ("installation", "api", "faq", ...); the first keyword that
    matches classifies the heading. Up to three headings per keyword type
    are listed with their page number, types in alphabetical order.

    A keyword must start at a word boundary in the heading, so "REST
    APIs" counts as an API section while "Rapid Prototyping" or "Capital
    Gains" (which merely contain the letters "api") do not.

    Returns:
        The markdown fragment, or a short pointer to the reference files
        when no heading matches.
    """
    pattern_keywords = (
        "getting started",
        "installation",
        "configuration",
        "usage",
        "api",
        "examples",
        "tutorial",
        "guide",
        "best practices",
        "troubleshooting",
        "faq",
    )
    # Compiled once per call; the lookbehind rejects matches that begin in
    # the middle of a word while still accepting suffixes such as plurals.
    matchers = [
        (keyword, re.compile(rf"(?<!\w){re.escape(keyword)}")) for keyword in pattern_keywords
    ]

    patterns = []
    for page in self.extracted_data.get("pages", []):
        page_num = page.get("page_number", 0)
        for heading in page.get("headings", []):
            heading_text = heading.get("text", "")
            lowered = heading_text.lower()
            for keyword, matcher in matchers:
                if matcher.search(lowered):
                    patterns.append(
                        {
                            "type": keyword.title(),
                            "heading": heading_text,
                            "page": page_num,
                        }
                    )
                    break  # Only add once per heading

    if not patterns:
        return "*See reference files for detailed content*\n\n"

    # Group by type, preserving document order inside each group.
    by_type = {}
    for pattern in patterns:
        by_type.setdefault(pattern["type"], []).append(pattern)

    parts = ["*Common documentation patterns found:*\n\n"]
    for ptype in sorted(by_type):
        items = by_type[ptype]
        parts.append(f"**{ptype}** ({len(items)} sections):\n")
        parts.extend(f"- {item['heading']} (page {item['page']})\n" for item in items[:3])
        parts.append("\n")

    return "".join(parts)
535
+
536
def _sanitize_filename(self, name):
    """
    Convert a title into a string that is safe to use as a filename stem.

    The name is lowercased, every character other than word characters,
    whitespace and hyphens is removed, and each run of hyphens/whitespace
    becomes a single underscore.

    Returns:
        The sanitized stem, or "untitled" when nothing usable remains
        (e.g. a title made only of punctuation), so callers never build
        a nameless ".md" file.
    """
    safe = re.sub(r"[^\w\s-]", "", name.lower())
    safe = re.sub(r"[-\s]+", "_", safe)
    return safe or "untitled"
542
+
543
+
544
def main():
    """
    Command-line entry point.

    Three input modes are supported:
    - ``--config FILE``: read the converter config from a JSON file.
    - ``--from-json FILE``: skip extraction and build from previously
      extracted data; the skill name is the file stem without
      "_extracted".
    - ``--pdf FILE --name NAME``: extract the PDF with default options.

    Every failure after argument parsing is reported on stderr and ends
    the process with exit status 1.
    """
    parser = argparse.ArgumentParser(
        description="Convert PDF documentation to Claude skill",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument("--config", help="PDF config JSON file")
    parser.add_argument("--pdf", help="Direct PDF file path")
    parser.add_argument("--name", help="Skill name (with --pdf)")
    parser.add_argument("--from-json", help="Build skill from extracted JSON")
    parser.add_argument("--description", help="Skill description")

    args = parser.parse_args()

    # Validate inputs (parser.error exits before any work is attempted).
    if not (args.config or args.pdf or args.from_json):
        parser.error("Must specify --config, --pdf, or --from-json")
    if not (args.config or args.from_json) and not args.name:
        parser.error("Must specify --name with --pdf")

    # All modes share one error boundary so that a missing/invalid config or
    # JSON file is reported the same way as an extraction failure.
    try:
        if args.config:
            # Explicit encoding: configs are JSON and must not depend on the
            # platform's default locale encoding.
            with open(args.config, encoding="utf-8") as f:
                config = json.load(f)
        elif args.from_json:
            name = Path(args.from_json).stem.replace("_extracted", "")
            config = {
                "name": name,
                "description": args.description or f"Use when referencing {name} documentation",
            }
            converter = PDFToSkillConverter(config)
            converter.load_extracted_data(args.from_json)
            converter.build_skill()
            return
        else:
            # Direct PDF mode
            config = {
                "name": args.name,
                "pdf_path": args.pdf,
                "description": args.description
                or f"Use when referencing {args.name} documentation",
                "extract_options": {
                    "chunk_size": 10,
                    "min_quality": 5.0,
                    "extract_images": True,
                    "min_image_size": 100,
                },
            }

        converter = PDFToSkillConverter(config)

        # Without a PDF there is no data to build from; say so explicitly
        # instead of failing later on the unset extracted data.
        if not config.get("pdf_path"):
            raise RuntimeError(
                "Config does not define 'pdf_path'; nothing to extract "
                "(use --from-json to build from extracted data)"
            )

        if not converter.extract_pdf():
            print("\n❌ PDF extraction failed - see error above", file=sys.stderr)
            sys.exit(1)

        converter.build_skill()

    except RuntimeError as e:
        print(f"\n❌ Error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ Unexpected error during PDF processing: {e}", file=sys.stderr)
        import traceback

        traceback.print_exc()
        sys.exit(1)
614
+
615
+
616
+ if __name__ == "__main__":
617
+ main()