doculift-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doculift/__init__.py +0 -0
- doculift/__main__.py +4 -0
- doculift/app.py +97 -0
- doculift/cli.py +177 -0
- doculift/scraper.py +442 -0
- doculift/static/css/style.css +481 -0
- doculift/static/doculift_logo.png +0 -0
- doculift/static/js/main.js +82 -0
- doculift/templates/index.html +91 -0
- doculift_cli-0.1.0.dist-info/METADATA +240 -0
- doculift_cli-0.1.0.dist-info/RECORD +14 -0
- doculift_cli-0.1.0.dist-info/WHEEL +5 -0
- doculift_cli-0.1.0.dist-info/entry_points.txt +2 -0
- doculift_cli-0.1.0.dist-info/top_level.txt +1 -0
doculift/__init__.py
ADDED
|
File without changes
|
doculift/__main__.py
ADDED
doculift/app.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
from flask import Flask, render_template, request, jsonify, send_from_directory
import threading
import uuid
import os
from .scraper import DocuLiftScraper

app = Flask(__name__)

# In-memory storage for jobs.
# Production should use Redis/Celery, but for this scale a global dict is fine.
# Schema: job_id -> {"scraper": DocuLiftScraper, "status": str,
#                    "progress": int, "files": list}
jobs = {}

# All scrape results are written under this package-relative directory,
# one subdirectory per job id.
OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "outputs")
# exist_ok=True avoids the check-then-create race of the previous
# `if not os.path.exists(...): os.makedirs(...)` pattern.
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@app.route("/")
def index():
    """Serve the single-page web UI template."""
    return render_template("index.html")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@app.route("/scrape", methods=["POST"])
def start_scrape():
    """Create a scraping job from a JSON request and run it in the background.

    Expected JSON body keys:
        urls         -- list of starting URLs (required, non-empty)
        format       -- output file format, default "md"
        max_pages    -- page cap per starting URL, default 500
        scope_type   -- "section" or "domain", default "section"
        extract_mode -- "content" or "urls", default "content"

    Returns ``{"job_id": ...}`` on success, or a 400 error payload.
    """
    # silent=True yields None (instead of raising) for a missing or
    # malformed JSON body, so we can answer with a clean 400.
    data = request.get_json(silent=True) or {}
    urls = data.get("urls", [])
    output_format = data.get("format", "md")
    try:
        max_pages = int(data.get("max_pages", 500))
    except (TypeError, ValueError):
        # Previously an unguarded int() turned bad client input into a 500.
        return jsonify({"error": "max_pages must be an integer"}), 400
    scope_type = data.get("scope_type", "section")
    extract_mode = data.get("extract_mode", "content")

    if not urls:
        return jsonify({"error": "No URLs provided"}), 400

    job_id = str(uuid.uuid4())
    scraper = DocuLiftScraper(
        urls,
        output_format=output_format,
        max_pages=max_pages,
        scope_type=scope_type,
        extract_mode=extract_mode,
    )

    jobs[job_id] = {"scraper": scraper, "status": "pending", "progress": 0, "files": []}

    # Daemon thread: a running job must not block interpreter shutdown.
    thread = threading.Thread(target=run_scraper_task, args=(job_id, scraper))
    thread.daemon = True
    thread.start()

    return jsonify({"job_id": job_id})
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def run_scraper_task(job_id, scraper):
    """Background worker: run *scraper* and record the outcome in ``jobs``.

    Status transitions: "running" -> "completed", or "error: <message>"
    on any failure.  get_status() keys off the "error" text, so the error
    status format must not change.
    """
    try:
        jobs[job_id]["status"] = "running"
        job_dir = os.path.join(OUTPUT_DIR, job_id)
        scraper.run(job_dir)

        # Sorted for a deterministic listing in the UI; guard against the
        # scraper finishing without ever creating the directory.
        if os.path.isdir(job_dir):
            jobs[job_id]["files"] = sorted(os.listdir(job_dir))
        else:
            jobs[job_id]["files"] = []
        jobs[job_id]["status"] = "completed"
        jobs[job_id]["progress"] = 100
    except Exception as e:
        # Top-level boundary for the worker thread: surface the failure
        # to the polling endpoint rather than dying silently.
        jobs[job_id]["status"] = f"error: {str(e)}"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@app.route("/status/<job_id>")
def get_status(job_id):
    """Return live progress for a job, or 404 if the id is unknown."""
    job = jobs.get(job_id)
    if not job:
        return jsonify({"error": "Job not found"}), 404

    lifecycle = job["status"]
    return jsonify(
        {
            # Live fields come from the scraper object, which updates them
            # while running; job["status"] only tracks the job lifecycle.
            "status": job["scraper"].status,
            "progress": job["scraper"].progress,
            # Terminal states are "completed" and "error: <msg>" (set by
            # run_scraper_task).  The old expression also tested
            # `lifecycle in [..., "error"]`, a value that is never stored.
            "is_finished": lifecycle == "completed" or lifecycle.startswith("error"),
            "files": job["files"],
            "job_id": job_id,
            "urls_extracted": job["scraper"].urls_extracted,
            "per_url_stats": job["scraper"].per_url_stats,
        }
    )
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@app.route("/download/<job_id>/<filename>")
def download_file(job_id, filename):
    """Serve a generated output file for a known job.

    ``send_from_directory`` sanitizes *filename*, but *job_id* is
    interpolated into the directory path unchecked, so a crafted id
    (e.g. containing "..") could escape OUTPUT_DIR.  Restricting to
    ids present in the jobs table closes that traversal hole.
    """
    if job_id not in jobs:
        return jsonify({"error": "Job not found"}), 404
    return send_from_directory(os.path.join(OUTPUT_DIR, job_id), filename)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
if __name__ == "__main__":
    # debug=True is disabled to prevent arbitrary code execution vulnerabilities
    # in production-like environments, satisfying Bandit B201.
    # Port 5001 matches the default of the CLI's `doculift ui` command.
    app.run(debug=False, port=5001)
|
doculift/cli.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
#!/usr/bin/env python3
import click
import os
import sys

# Ensure the scripts can import our local modules.
# NOTE(review): the imports below are all package-relative (`from .scraper`),
# so this sys.path hack looks redundant — confirm before removing.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# Rich configuration for beautiful terminal output
from rich.console import Console  # noqa: E402
from rich.progress import (  # noqa: E402
    Progress,
    SpinnerColumn,
    BarColumn,
    TimeElapsedColumn,
)
from rich.panel import Panel  # noqa: E402

from .scraper import DocuLiftScraper  # noqa: E402
from .app import app, OUTPUT_DIR  # noqa: E402

# Shared Rich console used by every command for terminal output.
console = Console()

# Accept both -h and --help on every command.
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Root command group.  The docstring below doubles as the `--help` text
# click prints (the \b markers disable click's paragraph re-wrapping),
# so its wording is user-facing and should not be edited casually.
@click.group(context_settings=CONTEXT_SETTINGS)
def cli():
    """
    \b
    DocuLift - Effortlessly lift documentation for AI.

    A hybrid tool to siphon documentation websites into clean Markdown files
    or URL lists, optimized for Large Language Models.

    \b
    Examples:
      # Start the local Web UI
      doculift ui

      # Extract full markdown content from a documentation section
      doculift scrape https://docs.docker.com/reference/

      # Just extract all the URLs from a section, up to 1000 pages
      doculift scrape https://paketo.io/docs/ --mode urls --max-pages 1000
    """
    pass
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@cli.command()
@click.option("--port", default=5001, help="Port to run the Flask web UI on.")
def ui(port):
    """Launch the DocuLift Web User Interface."""
    banner = (
        f"[bold green]🚀 Starting DocuLift Web UI "
        f"on port {port}...[/bold green]"
    )
    console.print(banner)
    # Debug mode stays off (Bandit B201: Flask debug allows code execution).
    app.run(debug=False, port=port)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@cli.command()
@click.argument("urls", nargs=-1, required=True)
@click.option(
    "-m",
    "--mode",
    type=click.Choice(["content", "urls"]),
    default="content",
    help='Extract full "content" (Markdown) or just "urls".',
)
@click.option(
    "-s",
    "--scope",
    type=click.Choice(["section", "domain"]),
    default="section",
    help=(
        'Restrict crawl to the starting "section" folder, '
        'or allow the entire "domain".'
    ),
)
@click.option(
    "-f",
    "--format",
    "output_format",
    default="md",
    help="Output file format (default: md).",
)
@click.option(
    "-p",
    "--max-pages",
    type=int,
    default=500,
    help="Maximum number of pages to scan PER starting URL.",
)
def scrape(urls, mode, scope, output_format, max_pages):
    """
    Scrape target URLs from the terminal.

    URLS: One or more starting documentation URLs separated by spaces.
    """
    console.print(
        Panel(
            f"[bold blue]DocuLift CLI[/bold blue]\nTarget URLs: {len(urls)}"
            f"\nMode: {mode.upper()}\nScope: {scope.capitalize()}",
            expand=False,
        )
    )

    # Initialize the scraper class directly
    scraper = DocuLiftScraper(
        start_urls=list(urls),
        output_format=output_format,
        max_pages=max_pages,
        scope_type=scope,
        extract_mode=mode,
    )

    # Use Rich to create a beautiful progress bar in the terminal
    with Progress(
        SpinnerColumn(),
        "[progress.description]{task.description}",
        BarColumn(),
        "[progress.percentage]{task.percentage:>3.0f}%",
        TimeElapsedColumn(),
        console=console,
    ) as progress:

        # scraper.run() is synchronous and holds its thread, so it runs in
        # a background worker while the main thread polls its status and
        # progress attributes for Rich.  Any exception is captured (not
        # just printed) so the command can report failure afterwards
        # instead of claiming success and exiting 0.
        import threading
        import time

        task = progress.add_task("[cyan]Initializing...", total=100)
        worker_errors = []

        def run_scraper():
            try:
                scraper.run(OUTPUT_DIR)
            except Exception as e:
                worker_errors.append(e)

        thread = threading.Thread(target=run_scraper)
        thread.daemon = True
        thread.start()

        # Poll and update until finished
        while thread.is_alive():
            progress.update(
                task,
                description=f"[cyan]{scraper.status}",
                completed=scraper.progress,
            )
            time.sleep(0.5)

        # Final update
        progress.update(
            task, description=f"[green]{scraper.status}", completed=100
        )

    if worker_errors:
        # Previously the error was printed from the worker thread and the
        # command still printed "Mission Accomplished" and exited 0.
        console.print(
            f"[bold red]Error during scraping: {worker_errors[0]}[/bold red]"
        )
        sys.exit(1)

    # Print Summary
    console.print("\n[bold green]✅ Mission Accomplished![/bold green]")
    for url, count in scraper.per_url_stats.items():
        label = "URLs extracted" if mode == "urls" else "Pages scraped"
        console.print(
            f" [dim]•[/dim] {url} -> [bold cyan]{count}[/bold cyan] {label}"
        )

    console.print(
        "\n[reset]Check the [bold yellow]outputs/[/bold yellow] directory "
        "for your files.[/reset]\n"
    )
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
if __name__ == "__main__":
    # Allow running the module directly (e.g. `python -m doculift.cli`);
    # the installed console entry point presumably invokes cli() as well —
    # verify against entry_points.txt.
    cli()
|