doculift-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
doculift/__init__.py ADDED
File without changes
doculift/__main__.py ADDED
@@ -0,0 +1,4 @@
from .cli import cli

# Allow running the package directly: `python -m doculift`.
if __name__ == "__main__":
    cli()
doculift/app.py ADDED
@@ -0,0 +1,97 @@
from flask import Flask, render_template, request, jsonify, send_from_directory
import threading
import uuid
import os
from .scraper import DocuLiftScraper

app = Flask(__name__)

# In-memory storage for jobs
# Production should use Redis/Celery, but for this scale a global dict is fine.
# Each entry: {"scraper": DocuLiftScraper, "status": str, "progress": int, "files": list}
jobs = {}

# All scrape output is written under <package dir>/outputs/<job_id>/.
OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "outputs")
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
16
+
17
+
@app.route("/")
def index():
    """Serve the single-page web UI."""
    return render_template("index.html")
21
+
22
+
@app.route("/scrape", methods=["POST"])
def start_scrape():
    """Create a scrape job from a JSON payload and run it in a background thread.

    Expected JSON body:
        urls: list of starting URLs (required, non-empty)
        format: output file extension (default "md")
        max_pages: per-URL page limit, integer (default 500)
        scope_type: "section" or "domain" (default "section")
        extract_mode: "content" or "urls" (default "content")

    Returns JSON {"job_id": ...} on success, or a 400 error for bad input.
    """
    # silent=True returns None for a missing/malformed JSON body instead of
    # raising, which would otherwise surface as an unhandled 400/500.
    data = request.get_json(silent=True) or {}
    urls = data.get("urls", [])
    output_format = data.get("format", "md")
    scope_type = data.get("scope_type", "section")
    extract_mode = data.get("extract_mode", "content")

    if not urls:
        return jsonify({"error": "No URLs provided"}), 400

    try:
        max_pages = int(data.get("max_pages", 500))
    except (TypeError, ValueError):
        # A non-numeric max_pages is a client error, not a server crash.
        return jsonify({"error": "max_pages must be an integer"}), 400

    job_id = str(uuid.uuid4())
    scraper = DocuLiftScraper(
        urls,
        output_format=output_format,
        max_pages=max_pages,
        scope_type=scope_type,
        extract_mode=extract_mode,
    )

    jobs[job_id] = {"scraper": scraper, "status": "pending", "progress": 0, "files": []}

    # Daemon thread so a hung scrape cannot block interpreter shutdown.
    thread = threading.Thread(target=run_scraper_task, args=(job_id, scraper))
    thread.daemon = True
    thread.start()

    return jsonify({"job_id": job_id})
52
+
53
+
def run_scraper_task(job_id, scraper):
    """Execute a scrape job in a worker thread and record its outcome in `jobs`.

    On success: stores the produced file names, marks the job "completed" and
    progress 100. On failure: stores an "error: <message>" status string,
    which get_status() detects by prefix.
    """
    try:
        jobs[job_id]["status"] = "running"
        job_dir = os.path.join(OUTPUT_DIR, job_id)
        scraper.run(job_dir)

        # The scraper may never create job_dir when nothing was extracted;
        # treat that as a successful run with zero files rather than letting
        # os.listdir raise and mislabel the job as failed.
        files = os.listdir(job_dir) if os.path.isdir(job_dir) else []
        jobs[job_id]["files"] = files
        jobs[job_id]["status"] = "completed"
        jobs[job_id]["progress"] = 100
    except Exception as e:
        # Keep the "error: " prefix — get_status() relies on it for is_finished.
        jobs[job_id]["status"] = f"error: {str(e)}"
67
+
68
+
@app.route("/status/<job_id>")
def get_status(job_id):
    """Report live progress for a job, merging scraper state with bookkeeping.

    Returns 404 JSON when the job id is unknown.
    """
    job = jobs.get(job_id)
    if not job:
        return jsonify({"error": "Job not found"}), 404

    # The job status is "pending"/"running"/"completed" or "error: <msg>";
    # the bare value "error" never occurs, so a prefix check covers all
    # failures (the original list-membership test for "error" was dead code).
    status = job["status"]
    return jsonify(
        {
            "status": job["scraper"].status,
            "progress": job["scraper"].progress,
            "is_finished": status == "completed" or status.startswith("error"),
            "files": job["files"],
            "job_id": job_id,
            "urls_extracted": job["scraper"].urls_extracted,
            "per_url_stats": job["scraper"].per_url_stats,
        }
    )
87
+
88
+
@app.route("/download/<job_id>/<filename>")
def download_file(job_id, filename):
    """Stream a generated output file for a finished job.

    Only job ids present in the in-memory registry are served. This also
    blocks path traversal through the <job_id> URL segment:
    send_from_directory only sanitizes the filename component, not the
    directory we join job_id into.
    """
    if job_id not in jobs:
        return jsonify({"error": "Job not found"}), 404
    return send_from_directory(os.path.join(OUTPUT_DIR, job_id), filename)
92
+
93
+
if __name__ == "__main__":
    # debug=True is disabled to prevent arbitrary code execution vulnerabilities
    # in production-like environments, satisfying Bandit B201.
    app.run(debug=False, port=5001)
doculift/cli.py ADDED
@@ -0,0 +1,177 @@
#!/usr/bin/env python3
import click
import os
import sys

# Ensure the scripts can import our local modules
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

# Rich configuration for beautiful terminal output
from rich.console import Console  # noqa: E402
from rich.progress import (  # noqa: E402
    Progress,
    SpinnerColumn,
    BarColumn,
    TimeElapsedColumn,
)
from rich.panel import Panel  # noqa: E402

from .scraper import DocuLiftScraper  # noqa: E402
from .app import app, OUTPUT_DIR  # noqa: E402

# Shared Rich console for all CLI output.
console = Console()

# Accept `-h` in addition to Click's default `--help`.
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
25
+
26
+
@click.group(context_settings=CONTEXT_SETTINGS)
def cli():
    """
    \b
    DocuLift - Effortlessly lift documentation for AI.

    A hybrid tool to siphon documentation websites into clean Markdown files
    or URL lists, optimized for Large Language Models.

    \b
    Examples:
      # Start the local Web UI
      doculift ui

      # Extract full markdown content from a documentation section
      doculift scrape https://docs.docker.com/reference/

      # Just extract all the URLs from a section, up to 1000 pages
      doculift scrape https://paketo.io/docs/ --mode urls --max-pages 1000
    """
    # Click renders this docstring as the --help text; subcommands attach
    # themselves via @cli.command().
    pass
48
+
49
+
@cli.command()
@click.option("--port", default=5001, help="Port to run the Flask web UI on.")
def ui(port):
    """Launch the DocuLift Web User Interface."""
    banner = f"[bold green]🚀 Starting DocuLift Web UI on port {port}...[/bold green]"
    console.print(banner)
    app.run(debug=False, port=port)
59
+
60
+
@cli.command()
@click.argument("urls", nargs=-1, required=True)
@click.option(
    "-m",
    "--mode",
    type=click.Choice(["content", "urls"]),
    default="content",
    help='Extract full "content" (Markdown) or just "urls".',
)
@click.option(
    "-s",
    "--scope",
    type=click.Choice(["section", "domain"]),
    default="section",
    help=(
        'Restrict crawl to the starting "section" folder, '
        'or allow the entire "domain".'
    ),
)
@click.option(
    "-f",
    "--format",
    "output_format",
    default="md",
    help="Output file format (default: md).",
)
@click.option(
    "-p",
    "--max-pages",
    type=int,
    default=500,
    help="Maximum number of pages to scan PER starting URL.",
)
def scrape(urls, mode, scope, output_format, max_pages):
    """
    Scrape target URLs from the terminal.

    URLS: One or more starting documentation URLs separated by spaces.
    """
    import threading
    import time

    console.print(
        Panel(
            f"[bold blue]DocuLift CLI[/bold blue]\nTarget URLs: {len(urls)}"
            f"\nMode: {mode.upper()}\nScope: {scope.capitalize()}",
            expand=False,
        )
    )

    # Initialize the scraper class directly
    scraper = DocuLiftScraper(
        start_urls=list(urls),
        output_format=output_format,
        max_pages=max_pages,
        scope_type=scope,
        extract_mode=mode,
    )

    # scraper.run() is synchronous, so it runs in a background thread while
    # the main thread polls its status/progress attributes to drive Rich.
    # Any exception is captured (not just printed) so the command can fail
    # loudly afterwards instead of printing a success summary and exiting 0.
    errors = []

    def run_scraper():
        try:
            scraper.run(OUTPUT_DIR)
        except Exception as e:
            errors.append(e)

    # Use Rich to create a beautiful progress bar in the terminal
    with Progress(
        SpinnerColumn(),
        "[progress.description]{task.description}",
        BarColumn(),
        "[progress.percentage]{task.percentage:>3.0f}%",
        TimeElapsedColumn(),
        console=console,
    ) as progress:
        task = progress.add_task("[cyan]Initializing...", total=100)

        thread = threading.Thread(target=run_scraper)
        thread.daemon = True
        thread.start()

        # Poll and update until finished
        while thread.is_alive():
            progress.update(
                task,
                description=f"[cyan]{scraper.status}",
                completed=scraper.progress,
            )
            time.sleep(0.5)

        # Final update
        progress.update(
            task, description=f"[green]{scraper.status}", completed=100
        )

    if errors:
        # Surface the failure with a nonzero exit code for scripted callers.
        raise click.ClickException(f"Error during scraping: {errors[0]}")

    # Print Summary
    console.print("\n[bold green]✅ Mission Accomplished![/bold green]")
    for url, count in scraper.per_url_stats.items():
        label = "URLs extracted" if mode == "urls" else "Pages scraped"
        console.print(
            f"  [dim]•[/dim] {url} -> [bold cyan]{count}[/bold cyan] {label}"
        )

    console.print(
        "\n[reset]Check the [bold yellow]outputs/[/bold yellow] directory "
        "for your files.[/reset]\n"
    )
174
+
175
+
if __name__ == "__main__":
    # Allow running this module as a script as well as via the entry point.
    cli()