doctoskills 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doctoskills-1.0.0/PKG-INFO +13 -0
- doctoskills-1.0.0/backend/__init__.py +17 -0
- doctoskills-1.0.0/backend/cli.py +85 -0
- doctoskills-1.0.0/backend/config.py +46 -0
- doctoskills-1.0.0/backend/converter.py +165 -0
- doctoskills-1.0.0/backend/models/Config.py +8 -0
- doctoskills-1.0.0/backend/models/SkillData.py +7 -0
- doctoskills-1.0.0/backend/models/__init__.py +4 -0
- doctoskills-1.0.0/backend/utils/__init__.py +9 -0
- doctoskills-1.0.0/backend/utils/ai_skill_data_generator.py +106 -0
- doctoskills-1.0.0/backend/utils/browser.py +81 -0
- doctoskills-1.0.0/backend/utils/content_processor.py +89 -0
- doctoskills-1.0.0/backend/utils/file_manager.py +102 -0
- doctoskills-1.0.0/doctoskills.egg-info/PKG-INFO +13 -0
- doctoskills-1.0.0/doctoskills.egg-info/SOURCES.txt +19 -0
- doctoskills-1.0.0/doctoskills.egg-info/dependency_links.txt +1 -0
- doctoskills-1.0.0/doctoskills.egg-info/entry_points.txt +2 -0
- doctoskills-1.0.0/doctoskills.egg-info/requires.txt +8 -0
- doctoskills-1.0.0/doctoskills.egg-info/top_level.txt +1 -0
- doctoskills-1.0.0/pyproject.toml +29 -0
- doctoskills-1.0.0/setup.cfg +4 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: doctoskills
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Convert documentation pages into skill markdown files.
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
7
|
+
Requires-Dist: google-genai>=1.60.0
|
|
8
|
+
Requires-Dist: markdownify>=0.11.6
|
|
9
|
+
Requires-Dist: platformdirs>=4.0.0
|
|
10
|
+
Requires-Dist: selenium>=4.15.0
|
|
11
|
+
Requires-Dist: typer>=0.12.0
|
|
12
|
+
Requires-Dist: webdriver-manager>=4.0.0
|
|
13
|
+
Requires-Dist: requests>=2.31.0
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Doc2Skills Backend - Documentation to Markdown Converter
|
|
3
|
+
Converts HTML documentation to structured markdown skill files
|
|
4
|
+
"""
|
|
5
|
+
from .config import ConverterConfig
|
|
6
|
+
from .converter import (
|
|
7
|
+
DocumentationConverter,
|
|
8
|
+
convert_single_page
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
__version__ = "1.0.0"
|
|
12
|
+
__all__ = [
|
|
13
|
+
'ConverterConfig',
|
|
14
|
+
'DocumentationConverter',
|
|
15
|
+
'convert_single_page'
|
|
16
|
+
]
|
|
17
|
+
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from typing import Optional
|
|
2
|
+
|
|
3
|
+
import typer
|
|
4
|
+
|
|
5
|
+
from .config import ConverterConfig
|
|
6
|
+
from .converter import convert_single_page
|
|
7
|
+
from .utils.file_manager import FileManager
|
|
8
|
+
|
|
9
|
+
app = typer.Typer(help="Convert documentation pages into skill files.")
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def prompt_for_api_key(file_manager: FileManager) -> None:
    """Prompt until a non-blank Gemini API key is entered, then store it.

    The key is read with terminal echo disabled so the secret is not shown
    on screen while it is typed or pasted.

    Args:
        file_manager: Manager whose ``update_file_api_key`` receives the
            stripped key.
    """
    api_key = ""
    while not api_key:
        # hide_input=True keeps the secret out of the terminal output.
        api_key = typer.prompt("Enter gemini api key", hide_input=True).strip()
        if not api_key:
            typer.echo("Gemini API key is required.")

    file_manager.update_file_api_key(api_key)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def ensure_config_api_key(file_manager: FileManager) -> None:
    """Make sure a config file exists and that it carries an API key.

    A default config file is written when none exists. The user is only
    prompted when the stored key is empty or whitespace.

    Args:
        file_manager: Manager used to create, load and update the config file.
    """
    config_missing = not file_manager.config_file_exist()
    if config_missing:
        file_manager.prepare_default_configfile()

    stored_key = file_manager.load_config().api_key
    if stored_key.strip():
        # A usable key is already stored; nothing to ask for.
        return

    prompt_for_api_key(file_manager)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@app.command()
def init() -> None:
    """Create the default config file and save a Gemini API key."""
    file_manager = FileManager(ConverterConfig())

    # First run: write the defaults, then ask for the key.
    if not file_manager.config_file_exist():
        file_manager.prepare_default_configfile()
        prompt_for_api_key(file_manager)
        typer.echo("Config file created.")
        return

    config = file_manager.load_config()
    if config.api_key.strip():
        # Nothing was written, so do not claim the file was updated.
        typer.echo("Config file already has an API key; nothing changed.")
        return

    prompt_for_api_key(file_manager)
    typer.echo("Config file updated.")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@app.command("update-api-key")
def update_api_key() -> None:
    """Update the Gemini API key in the config file."""
    manager = FileManager(ConverterConfig())

    # The key can only be stored in an existing file, so create the defaults first.
    config_present = manager.config_file_exist()
    if not config_present:
        manager.prepare_default_configfile()

    prompt_for_api_key(manager)
    typer.echo("API key updated.")
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@app.command()
|
|
61
|
+
def add(
|
|
62
|
+
page_url: str = typer.Argument(..., help="Documentation page URL to convert."),
|
|
63
|
+
output: Optional[str] = typer.Option(
|
|
64
|
+
None,
|
|
65
|
+
"--output",
|
|
66
|
+
"-o",
|
|
67
|
+
help="Directory where the generated skill file will be saved.",
|
|
68
|
+
),
|
|
69
|
+
) -> None:
|
|
70
|
+
"""Convert one documentation page into a skill file."""
|
|
71
|
+
if not page_url.strip():
|
|
72
|
+
raise typer.BadParameter("Page URL is required.")
|
|
73
|
+
|
|
74
|
+
if output is None or not output.strip():
|
|
75
|
+
raise typer.BadParameter("Output directory is required. Use --output ./skills.")
|
|
76
|
+
|
|
77
|
+
file_manager = FileManager(ConverterConfig())
|
|
78
|
+
ensure_config_api_key(file_manager)
|
|
79
|
+
|
|
80
|
+
result = convert_single_page(page_url=page_url, output_dir=output)
|
|
81
|
+
if not result.get("success"):
|
|
82
|
+
typer.echo(result.get("error", "Conversion failed."), err=True)
|
|
83
|
+
raise typer.Exit(code=1)
|
|
84
|
+
|
|
85
|
+
typer.echo(f"Skill file created: {result.get('output_file')}")
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Configuration module for Doc2Skills converter
|
|
3
|
+
"""
|
|
4
|
+
import os
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Optional
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from platformdirs import user_config_dir
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class ConverterConfig:
    """Settings shared by the browser, content and file helpers.

    Only the annotated attributes are dataclass fields (constructor
    arguments). The un-annotated ``default_*`` and ``config_*`` names are
    plain class attributes.
    """

    # --- Browser settings ---
    headless: bool = True
    window_size: str = "1920,1080"
    page_load_timeout: int = 30

    # --- Extraction settings ---
    output_base_dir: str = ""

    # Tags removed from a page before it is converted to markdown.
    cleanup_tags: tuple = ('nav', 'footer', 'script', 'style', 'header', 'aside')

    # Location of the JSON config file.
    config_dir = Path(user_config_dir("DocToSkill"))
    config_file_path = config_dir / "config.json"

    # Values written into a freshly created config file.
    default_api_key = ""
    default_max_content_size = 3500
    default_model="gemma-4-31b-it"

    # --- Chrome driver settings ---
    chrome_driver_path: Optional[str] = None

    def get_output_dir(self, domain: str) -> str:
        """Return ``output_base_dir`` joined with ``domain``."""
        base_dir = self.output_base_dir
        return os.path.join(base_dir, domain)

    def ensure_output_dir(self, domain: str) -> str:
        """Create the output directory for ``domain`` if needed and return it."""
        target_dir = self.get_output_dir(domain)
        os.makedirs(target_dir, exist_ok=True)
        return target_dir
|
|
46
|
+
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Main converter module with both approaches:
|
|
3
|
+
1. Full documentation conversion (all pages)
|
|
4
|
+
2. Single page conversion (user-specified URL)
|
|
5
|
+
|
|
6
|
+
Supports asynchronous processing for improved performance
|
|
7
|
+
"""
|
|
8
|
+
import time
|
|
9
|
+
import asyncio
|
|
10
|
+
from typing import Optional, List, Dict
|
|
11
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
12
|
+
from .config import ConverterConfig
|
|
13
|
+
from .utils import BrowserManager, ContentProcessor, FileManager
|
|
14
|
+
from .models.SkillData import SkillData
|
|
15
|
+
|
|
16
|
+
#(Changing here) just keep single converter
|
|
17
|
+
class DocumentationConverter:
    """
    Converter that turns a single documentation page into a markdown skill file.

    Holds one BrowserManager, one ContentProcessor and one FileManager, all
    built from the same ConverterConfig.
    """

    def __init__(self, config: Optional[ConverterConfig] = None):
        """
        Args:
            config: Converter settings; a default ConverterConfig is used when omitted.
        """
        self.config = config or ConverterConfig()
        self.browser_manager = BrowserManager(self.config)
        self.content_processor = ContentProcessor(self.config)
        self.file_manager = FileManager(self.config)

    def convert_single_page(self, page_url: str, output_dir: Optional[str] = None,
                            custom_title: Optional[str] = None) -> Dict[str, object]:
        """
        Convert a single documentation page to markdown

        Args:
            page_url: URL of the specific page to convert
            output_dir: Directory for the generated file. When omitted, the
                config's ``output_base_dir`` is used, and the current
                directory when that is empty as well.
            custom_title: Optional title reported in the result

        Returns:
            Dictionary with the keys 'success', 'url', 'title', 'output_file'
            and 'output_directory', plus 'error' when the conversion failed.
            Failures are reported through the dictionary, not raised.
        """
        result = {
            'success': False,
            'url': page_url,
            'title': custom_title or 'Untitled',
            'output_file': '',
            'output_directory': ''
        }

        try:
            print(f"[*] Starting Single Page Conversion")
            print(f"[*] Source: {page_url}")

            # Resolve the destination before the browser starts so a missing
            # directory argument cannot fail after the page load and AI call.
            target_dir = (
                output_dir
                or getattr(self.config, 'output_base_dir', '')
                or '.'
            )

            with self.browser_manager as browser:
                page_result = self._process_single_page(
                    browser=browser,
                    url=page_url,
                    title=custom_title,
                    output_dir=target_dir,
                    extract_title=custom_title is None
                )

            result.update(page_result)

            if result['success']:
                print(f"\n[✓] Single Page Conversion Complete!")

        except Exception as e:
            # Boundary handler: callers read result['error'] instead of catching.
            print(f"\n[✗] Conversion failed: {str(e)}")
            result['error'] = str(e)

        return result

    def _process_single_page(self, browser: BrowserManager, url: str,
                             title: Optional[str], output_dir: str,
                             page_num: Optional[int] = None,
                             total_pages: Optional[int] = None,
                             extract_title: bool = False) -> Dict[str, object]:
        """
        Load one page, convert it to markdown, generate its metadata and save it.

        Args:
            browser: BrowserManager instance with an initialized driver
            url: Page URL
            title: Page title (or None)
            output_dir: Output directory path
            page_num: Current page number (for progress display)
            total_pages: Total number of pages (for progress display)
            extract_title: When True, report the generated title in the result

        Returns:
            Dictionary with 'success', 'url', 'title' and 'output_file'; on
            success also 'output_directory', on failure also 'error'.
        """
        import os  # local import: this module does not import os at the top

        result = {
            'success': False,
            'url': url,
            'title': title or 'Untitled',
            'output_file': ''
        }

        try:
            # Display progress
            if page_num and total_pages:
                print(f"[{page_num}/{total_pages}] Processing: {result['title']}")
            else:
                print(f"[*] Processing: {url}")

            browser.load_page(url, wait_time=2.0)
            html_content = browser.get_page_source()

            markdown_content = self.content_processor.convert_to_markdown(html_content)
            skill_data: SkillData = self.content_processor.get_skill_metedata_with_ai(markdown_content)

            # Report the generated title unless the caller supplied one.
            if extract_title or not title:
                result['title'] = skill_data.title

            formatted_content = self.content_processor.format_markdown_document(
                skill_data,
                content=markdown_content
            )

            output_file = self.file_manager.save_markdown_file(
                output_dir,
                skill_data.file_name,
                formatted_content,
            )

            result['output_file'] = output_file
            result['output_directory'] = os.path.dirname(output_file)
            result['success'] = True

        except Exception as e:
            print(f" [✗] Failed: {str(e)}")
            result['error'] = str(e)

        return result
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def convert_single_page(page_url: str, config: Optional[ConverterConfig] = None,
                        output_dir: Optional[str] = None,
                        title: Optional[str] = None) -> Dict[str, any]:
    """
    Convenience wrapper: build a DocumentationConverter and convert one page.

    Args:
        page_url: URL of the specific page to convert
        config: Optional ConverterConfig instance
        output_dir: Optional custom output directory name
        title: Optional custom title for the page

    Returns:
        The result dictionary produced by
        DocumentationConverter.convert_single_page
    """
    return DocumentationConverter(config).convert_single_page(
        page_url=page_url,
        output_dir=output_dir,
        custom_title=title,
    )
|
|
165
|
+
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Utilities package for Doc2Skills converter
|
|
3
|
+
"""
|
|
4
|
+
from .browser import BrowserManager
|
|
5
|
+
from .content_processor import ContentProcessor
|
|
6
|
+
from .file_manager import FileManager
|
|
7
|
+
from .ai_skill_data_generator import AISkillDataGen
|
|
8
|
+
|
|
9
|
+
__all__ = ['BrowserManager', 'ContentProcessor', 'FileManager','AISkillDataGen']
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
from google import genai
|
|
2
|
+
from google.genai import types
|
|
3
|
+
import json
|
|
4
|
+
from ..models.SkillData import SkillData
|
|
5
|
+
from ..models.Config import Config
|
|
6
|
+
from .file_manager import FileManager
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
SKILL_METADATA_PROMPT = """
|
|
10
|
+
You are an AI agent skill metadata generator.
|
|
11
|
+
Your task is to generate metadata for a skill based only on the provided documentation text.
|
|
12
|
+
Rules:
|
|
13
|
+
- Use only information explicitly present in the documentation.
|
|
14
|
+
- Do not invent, assume, or add information from prior knowledge.
|
|
15
|
+
- The title must clearly describe the main capability or topic of the skill.
|
|
16
|
+
- Avoid generic titles such as:
|
|
17
|
+
- Documentation Overview
|
|
18
|
+
- Guide
|
|
19
|
+
- Introduction
|
|
20
|
+
- Reference
|
|
21
|
+
- Maximum title length: 30 characters.
|
|
22
|
+
- The description must:
|
|
23
|
+
- Explain what knowledge or capability this skill provides.
|
|
24
|
+
- Explain when an AI agent should use this skill.
|
|
25
|
+
- Mention the main topics covered.
|
|
26
|
+
- Must start with Use this skill when working
|
|
27
|
+
- Maximum description length: 300 characters.
|
|
28
|
+
- Generate a file_name suitable for storing the skill as a file.
|
|
29
|
+
- file_name must:
|
|
30
|
+
- be lowercase
|
|
31
|
+
- use hyphens instead of spaces
|
|
32
|
+
- contain only letters, numbers, and hyphens
|
|
33
|
+
- not include a file extension
|
|
34
|
+
Return JSON only.
|
|
35
|
+
Output format:
|
|
36
|
+
{
|
|
37
|
+
"file_name": "",
|
|
38
|
+
"title": "",
|
|
39
|
+
"description": ""
|
|
40
|
+
}
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
REQUIRED_FIELDS = ("file_name", "title", "description")
|
|
45
|
+
|
|
46
|
+
class AISkillDataGen:
    """Handles generation of skill metadata such as title and description."""

    @staticmethod
    def get_gen_data(doc_text: str) -> SkillData:
        """Ask the configured model for skill metadata describing ``doc_text``.

        Args:
            doc_text: Documentation text; only the first
                ``config.max_content_size`` characters are sent to the model.

        Returns:
            SkillData built from the validated ``file_name``, ``title`` and
            ``description`` fields of the model reply.

        Raises:
            ValueError: If the text or API key is missing, or the reply is not
                a JSON object with the required non-empty string fields.
            RuntimeError: If the reply is empty.
        """
        AISkillDataGen._validate_doc_text(doc_text)

        config: Config = FileManager().load_config()
        # A whitespace-only key is as unusable as a missing one.
        resolved_api_key = (config.api_key or "").strip()
        if not resolved_api_key:
            raise ValueError("Google GenAI API key is required.")

        doc_content = doc_text[:config.max_content_size]
        client = genai.Client(api_key=resolved_api_key)
        response = client.models.generate_content(
            model=config.model,
            contents=f"{SKILL_METADATA_PROMPT}\nDocumentation text:\n{doc_content}",
            config=types.GenerateContentConfig(
                response_mime_type="application/json",
            )
        )

        data = AISkillDataGen._parse_response(response.text)
        AISkillDataGen._validate_skill_data(data)
        # Pass only the validated fields so an extra key in the reply cannot
        # break SkillData construction.
        return SkillData(**{field: data[field] for field in REQUIRED_FIELDS})

    @staticmethod
    def _parse_response(response_text) -> dict:
        """Decode the model reply into a dict.

        A reply wrapped in a Markdown code fence (with or without a language
        tag) is unwrapped before decoding.

        Raises:
            RuntimeError: If the reply is empty or whitespace.
            ValueError: If the reply is not valid JSON or not a JSON object.
        """
        import re  # local import: this module does not import re at the top

        if not response_text or not response_text.strip():
            raise RuntimeError("AI response was empty.")

        payload = response_text.strip()
        fenced = re.match(r"^```[A-Za-z0-9_-]*\s*(.*?)\s*```$", payload, re.DOTALL)
        if fenced:
            payload = fenced.group(1)

        try:
            data = json.loads(payload)
        except json.JSONDecodeError as exc:
            raise ValueError("AI response must be valid JSON.") from exc

        if not isinstance(data, dict):
            raise ValueError("AI response JSON must be an object.")

        return data

    @staticmethod
    def _validate_doc_text(doc_text: str) -> None:
        """Raise ValueError unless ``doc_text`` is a non-blank string."""
        if not isinstance(doc_text, str) or not doc_text.strip():
            raise ValueError("Documentation content is required.")

    @staticmethod
    def _validate_skill_data(data: dict) -> None:
        """Check the required fields and strip them in place.

        Raises:
            ValueError: If a required field is missing or is not a non-empty
                string.
        """
        missing_fields = [field for field in REQUIRED_FIELDS if field not in data]
        if missing_fields:
            raise ValueError(f"AI response missing required fields: {', '.join(missing_fields)}.")

        for field in REQUIRED_FIELDS:
            if not isinstance(data[field], str) or not data[field].strip():
                raise ValueError(f"AI response field '{field}' must be a non-empty string.")
            data[field] = data[field].strip()
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Browser utilities for web scraping
|
|
3
|
+
Shared functions for both full documentation and single page conversion
|
|
4
|
+
"""
|
|
5
|
+
import os
|
|
6
|
+
import time
|
|
7
|
+
from typing import Optional
|
|
8
|
+
from selenium import webdriver
|
|
9
|
+
from selenium.webdriver.chrome.options import Options
|
|
10
|
+
from selenium.webdriver.chrome.service import Service
|
|
11
|
+
from selenium.webdriver.common.by import By
|
|
12
|
+
from ..config import ConverterConfig
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class BrowserManager:
    """Manages browser instance and operations"""

    # Seconds to wait after navigation when neither the caller nor the config
    # provides a value.
    DEFAULT_LOAD_WAIT = 2.0

    def __init__(self, config: ConverterConfig):
        """
        Args:
            config: Converter settings (headless flag, window size, timeouts).
        """
        self.config = config
        self.driver: Optional[webdriver.Chrome] = None

    def initialize_driver(self) -> webdriver.Chrome:
        """Initialize and return a Chrome WebDriver instance

        Uses ``config.chrome_driver_path`` when it is set; otherwise the driver
        binary is obtained through webdriver_manager.

        Raises:
            RuntimeError: If no driver path is configured and webdriver_manager
                is not installed.
        """
        # Quit a previously created driver so re-initialising does not leak it.
        self.close()

        chrome_options = Options()

        if self.config.headless:
            chrome_options.add_argument("--headless")

        chrome_options.add_argument(f"--window-size={self.config.window_size}")
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")

        driver_path = getattr(self.config, "chrome_driver_path", None)
        if driver_path:
            service = Service(executable_path=driver_path)
            print(f"[*] Using ChromeDriver from {driver_path}")
        else:
            try:
                from webdriver_manager.chrome import ChromeDriverManager
            except ImportError as exc:
                raise RuntimeError(
                    "ChromeDriver not found. Either run in Docker or install webdriver-manager: "
                    "pip install webdriver-manager"
                ) from exc
            service = Service(ChromeDriverManager().install())
            print("[*] Using ChromeDriver from webdriver_manager")

        driver = webdriver.Chrome(service=service, options=chrome_options)
        try:
            driver.set_page_load_timeout(self.config.page_load_timeout)
        except Exception:
            # Do not leave a browser process behind when setup fails.
            driver.quit()
            raise

        self.driver = driver
        return self.driver

    def load_page(self, url: str, wait_time: Optional[float] = None) -> None:
        """Load a page and wait for it to render

        Args:
            url: Address to open.
            wait_time: Seconds to sleep after navigation. ``0`` means no wait.
                When None, ``config.initial_load_wait`` is used if the config
                defines it, otherwise ``DEFAULT_LOAD_WAIT``.

        Raises:
            RuntimeError: If the driver has not been initialized.
        """
        if not self.driver:
            raise RuntimeError("Driver not initialized. Call initialize_driver() first.")

        print(f"[*] Loading: {url}...")
        self.driver.get(url)

        if wait_time is None:
            # The config may not define initial_load_wait, so fall back safely.
            wait_time = getattr(self.config, "initial_load_wait", self.DEFAULT_LOAD_WAIT)
        time.sleep(max(0.0, wait_time))

    def get_page_source(self) -> str:
        """Get the current page source

        Raises:
            RuntimeError: If the driver has not been initialized.
        """
        if not self.driver:
            raise RuntimeError("Driver not initialized.")
        return self.driver.page_source

    def close(self) -> None:
        """Close the browser; does nothing when no driver is open."""
        if self.driver:
            self.driver.quit()
            self.driver = None

    def __enter__(self):
        """Context manager entry: start the driver and return this manager."""
        self.initialize_driver()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit: always close the browser."""
        self.close()
|
|
81
|
+
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Content processing utilities
|
|
3
|
+
Shared functions for HTML parsing and markdown conversion
|
|
4
|
+
"""
|
|
5
|
+
import re
|
|
6
|
+
from typing import List, Optional, Tuple
|
|
7
|
+
from bs4 import BeautifulSoup
|
|
8
|
+
from markdownify import markdownify as md
|
|
9
|
+
from ..config import ConverterConfig
|
|
10
|
+
from .ai_skill_data_generator import AISkillDataGen
|
|
11
|
+
from ..models import SkillData
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ContentProcessor:
    """Handles HTML parsing and markdown conversion"""

    def __init__(self, config: ConverterConfig):
        """
        Args:
            config: Converter settings; ``cleanup_tags`` lists the tags to drop.
        """
        self.config = config

    def parse_html(self, html_content: str) -> BeautifulSoup:
        """Parse HTML content into BeautifulSoup object"""
        return BeautifulSoup(html_content, 'html.parser')

    def cleanup_html(self, soup: BeautifulSoup) -> BeautifulSoup:
        """Remove unwanted HTML elements and strip links while keeping their text."""
        for tag in soup(self.config.cleanup_tags):
            tag.decompose()

        # unwrap() drops the <a> element but keeps its children in place.
        for a in soup.find_all("a"):
            a.unwrap()

        return soup

    def convert_to_markdown(self, html_content: str) -> str:
        """Convert HTML to markdown format (ATX-style headings)."""
        soup = self.parse_html(html_content)
        soup = self.cleanup_html(soup)
        return md(str(soup), heading_style="ATX")

    def sanitize_filename(self, name: str, max_length: int = 100) -> str:
        """
        Convert a string into a safe filename with length limit

        Args:
            name: Original name to sanitize
            max_length: Maximum length for the filename (default: 100)

        Returns:
            Name without special characters, with every run of whitespace
            replaced by one underscore, truncated to ``max_length``
        """
        safe_name = re.sub(r'[^\w\s-]', '', name).strip()
        # Collapse any whitespace (spaces, tabs, newlines) so none of it ends
        # up in the file name.
        safe_name = re.sub(r'\s+', '_', safe_name)

        # Truncate if too long, keeping the most important part (beginning)
        if len(safe_name) > max_length:
            safe_name = safe_name[:max_length].rstrip('_')

        return safe_name

    @staticmethod
    def get_skill_metedata_with_ai(doc_content_md: str) -> SkillData:
        """Generate skill metadata for the given markdown via AISkillDataGen."""
        return AISkillDataGen.get_gen_data(doc_content_md)

    def format_markdown_document(self, skill_data: SkillData, content: str) -> str:
        """
        Build the final skill document: front matter, a title heading, then the content.

        Args:
            skill_data: Object providing ``title`` and ``description``
            content: Main markdown content

        Returns:
            Formatted markdown skill
        """
        # Front matter values must stay on one line, so fold internal newlines
        # and repeated whitespace into single spaces.
        title = " ".join(str(skill_data.title).split())
        description = " ".join(str(skill_data.description).split())

        header = "\n".join([
            "---",
            f"title: {title}",
            f"description: {description}",
            "---",
            "",
        ])

        # The top-level heading carries the title instead of being left empty.
        return f"{header}# {title}\n\n{content}"
|
|
89
|
+
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""
|
|
2
|
+
File management utilities
|
|
3
|
+
Shared functions for file operations
|
|
4
|
+
"""
|
|
5
|
+
import os
|
|
6
|
+
from typing import Optional
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
from ..config import ConverterConfig
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from platformdirs import user_config_dir
|
|
11
|
+
import json
|
|
12
|
+
from ..models.Config import Config
|
|
13
|
+
|
|
14
|
+
# (Changing here) Files must not be generated like this; the location must come from the config URI.
|
|
15
|
+
class FileManager:
    """Manages file operations: skill output files and the JSON config file."""

    def __init__(self, config: Optional[ConverterConfig] = None):
        """
        Args:
            config: Converter settings; a default ConverterConfig is used when omitted.
        """
        self.config = config or ConverterConfig()

    def extract_domain_from_url(self, url: str) -> str:
        """
        Return the site name taken from a URL's host.

        Picks the label just before the top-level domain
        (``https://www.i18next.com/x`` -> ``i18next``), lower-cased and with
        hyphens turned into underscores. This is a plain heuristic; multi-part
        suffixes such as ``co.uk`` are not recognised. Returns '' when no host
        can be found.
        """
        host = urlparse(url).hostname
        if host is None:
            # No scheme given ("react.dev/learn"): retry as a network location.
            host = urlparse(f"//{url}").hostname or ""

        labels = [label for label in host.split(".") if label]
        if not labels:
            return ""

        site = labels[-2] if len(labels) >= 2 else labels[0]
        return site.lower().replace("-", "_")

    def extract_library_name(self, url: str) -> str:
        """
        Extract library/package name from URL for metadata
        Returns capitalized library name (e.g., 'React', 'I18next')
        """
        domain = self.extract_domain_from_url(url)
        # capitalize() only upper-cases the first character of each word, so
        # 'i18next' becomes 'I18next' rather than 'I18Next'.
        return "".join(word.capitalize() for word in domain.split("_") if word)

    def save_markdown_file(self, directory: str, filename: str, content: str) -> str:
        """
        Save markdown content to a file
        Returns the full path of the saved file
        """
        filepath = self.prepare_output_file_path(directory, filename)

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)

        return filepath

    def prepare_output_file_path(self, directory: str, filename: str) -> str:
        """
        Create the output directory if needed and return ``<directory>/<filename>.md``.

        Only the last path component of ``filename`` is used, so a name that
        contains separators or ``..`` cannot place the file outside
        ``directory``. A trailing ``.md`` is dropped before the extension is
        appended.

        Raises:
            ValueError: If ``directory`` is an existing file or ``filename``
                has no usable name part.
        """
        path = Path(directory).expanduser().resolve()

        if path.exists() and path.is_file():
            raise ValueError("Output path must be a directory.")

        # The name may come from AI output, so treat it as untrusted input.
        safe_name = Path(str(filename).replace("\\", "/")).name.strip()
        safe_name = safe_name.removesuffix(".md")
        if safe_name in ("", ".", ".."):
            raise ValueError("Output file name is required.")

        path.mkdir(parents=True, exist_ok=True)

        file_path = path / f"{safe_name}.md"
        return str(file_path)

    def prepare_default_configfile(self) -> bool:
        """
        Write a config file holding the default api_key, model and max_content_size.

        The file is written to ``self.config.config_file_path``, the same
        location every other method of this class reads and writes.

        Returns:
            True once the file has been written
        """
        config = {
            "api_key": self.config.default_api_key,
            "model": self.config.default_model,
            "max_content_size": self.config.default_max_content_size
        }

        config_file = Path(self.config.config_file_path)
        config_file.parent.mkdir(parents=True, exist_ok=True)

        with open(config_file, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=4)

        return True

    def update_file_api_key(self, api_key: str):
        """Store ``api_key`` in the config file, creating the file first if it is missing."""
        if not self.config_file_exist():
            self.prepare_default_configfile()

        with open(self.config.config_file_path, "r", encoding="utf-8") as f:
            config = json.load(f)

        config["api_key"] = api_key

        with open(self.config.config_file_path, "w", encoding="utf-8") as f:
            json.dump(config, f, indent=4)

    def load_config(self) -> Config:
        """Read the config file and return its values as a Config object."""
        with open(self.config.config_file_path, "r", encoding="utf-8") as f:
            config = json.load(f)

        return Config(**config)

    def config_file_exist(self) -> bool:
        """Return True when the config path exists and is a regular file."""
        path = Path(self.config.config_file_path)
        return path.exists() and path.is_file()
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: doctoskills
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Convert documentation pages into skill markdown files.
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
7
|
+
Requires-Dist: google-genai>=1.60.0
|
|
8
|
+
Requires-Dist: markdownify>=0.11.6
|
|
9
|
+
Requires-Dist: platformdirs>=4.0.0
|
|
10
|
+
Requires-Dist: selenium>=4.15.0
|
|
11
|
+
Requires-Dist: typer>=0.12.0
|
|
12
|
+
Requires-Dist: webdriver-manager>=4.0.0
|
|
13
|
+
Requires-Dist: requests>=2.31.0
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
pyproject.toml
|
|
2
|
+
backend/__init__.py
|
|
3
|
+
backend/cli.py
|
|
4
|
+
backend/config.py
|
|
5
|
+
backend/converter.py
|
|
6
|
+
backend/models/Config.py
|
|
7
|
+
backend/models/SkillData.py
|
|
8
|
+
backend/models/__init__.py
|
|
9
|
+
backend/utils/__init__.py
|
|
10
|
+
backend/utils/ai_skill_data_generator.py
|
|
11
|
+
backend/utils/browser.py
|
|
12
|
+
backend/utils/content_processor.py
|
|
13
|
+
backend/utils/file_manager.py
|
|
14
|
+
doctoskills.egg-info/PKG-INFO
|
|
15
|
+
doctoskills.egg-info/SOURCES.txt
|
|
16
|
+
doctoskills.egg-info/dependency_links.txt
|
|
17
|
+
doctoskills.egg-info/entry_points.txt
|
|
18
|
+
doctoskills.egg-info/requires.txt
|
|
19
|
+
doctoskills.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
backend
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
[project]
|
|
7
|
+
name = "doctoskills"
|
|
8
|
+
version = "1.0.0"
|
|
9
|
+
description = "Convert documentation pages into skill markdown files."
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
|
|
12
|
+
dependencies = [
|
|
13
|
+
"beautifulsoup4>=4.12.0",
|
|
14
|
+
"google-genai>=1.60.0",
|
|
15
|
+
"markdownify>=0.11.6",
|
|
16
|
+
"platformdirs>=4.0.0",
|
|
17
|
+
"selenium>=4.15.0",
|
|
18
|
+
"typer>=0.12.0",
|
|
19
|
+
"webdriver-manager>=4.0.0",
|
|
20
|
+
"requests>=2.31.0"
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
[project.scripts]
|
|
25
|
+
doctoskills = "backend.cli:app"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
[tool.setuptools.packages.find]
|
|
29
|
+
include = ["backend*"]
|