doculift-cli 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ include README.md
2
+ include requirements.txt
3
+ recursive-include src/doculift/templates *
4
+ recursive-include src/doculift/static *
5
+ global-exclude *.py[cod]
6
+ global-exclude __pycache__
@@ -0,0 +1,240 @@
1
+ Metadata-Version: 2.4
2
+ Name: doculift-cli
3
+ Version: 0.1.0
4
+ Summary: A powerful CLI & web scraper that lifts documentation for Large Language Models.
5
+ Author: M.J. Shetty
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/mjshetty/doculift
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Topic :: Utilities
13
+ Requires-Python: >=3.10
14
+ Description-Content-Type: text/markdown
15
+ Requires-Dist: flask>=3.0.0
16
+ Requires-Dist: requests
17
+ Requires-Dist: beautifulsoup4
18
+ Requires-Dist: playwright
19
+ Requires-Dist: click
20
+ Requires-Dist: rich
21
+ Provides-Extra: dev
22
+ Requires-Dist: black; extra == "dev"
23
+ Requires-Dist: flake8; extra == "dev"
24
+ Requires-Dist: bandit; extra == "dev"
25
+ Requires-Dist: build; extra == "dev"
26
+ Requires-Dist: twine; extra == "dev"
27
+
28
+ # DocuLift
29
+
30
+ **DocuLift** is a web scraping tool that lifts documentation websites into clean, aggregated files optimized for feeding into Large Language Models like Google NotebookLM, Claude, or ChatGPT.
31
+
32
+ It handles dynamic Single Page Applications (SPAs), respects site structure, and produces output in two modes: full content extraction or URL-only extraction.
33
+
34
+ ---
35
+
36
+ ## Features
37
+
38
+ - **Two Extract Modes** — choose between extracting full page content or just collecting URLs (see [When to Use Each Mode](#when-to-use-each-mode))
39
+ - **Dynamic Content Scraping** — uses Playwright (headless Chromium) to render JavaScript-heavy sites (React, Vue, etc.) before extraction
40
+ - **Smart Scoping**:
41
+ - **Section Only** — stays within the folder boundary of the starting URL (e.g. starting at `.../docs/agents/overview` scrapes everything under `.../docs/agents/`)
42
+ - **Entire Domain** — crawls all pages under the target domain
43
+ - **Intelligent Aggregation** — combines multiple pages into single files, auto-splits at ~500KB (NotebookLM's per-file limit), generates meaningful filenames
44
+ - **Multi-URL Support** — submit multiple starting URLs in one job; each is crawled independently and produces its own output file(s)
45
+ - **Per-URL stats** — on completion, the UI shows how many pages or URLs were collected per starting URL
46
+ - **Clean Extraction** — removes navigation, footers, sidebars, ads, and scripts; focuses on main content
47
+
48
+ ---
49
+
50
+ ## When to Use Each Mode
51
+
52
+ ### Extract Content
53
+ Crawls each page and converts its content to Markdown (or text/CSV). Use this when you want to feed documentation directly into an LLM as context.
54
+
55
+ - **Best for**: NotebookLM, Claude Projects, ChatGPT — any tool that accepts uploaded documents
56
+ - **Output**: One or more `.md` files per starting URL, split at ~500KB
57
+ - **Typical workflow**: Extract content → upload files to NotebookLM → ask questions
58
+
59
+ ### Extract URLs Only
60
+ Crawls the site and collects every discovered URL within scope, writing them to a plain `.txt` file — one URL per line, no other content.
61
+
62
+ **Use this when NotebookLM's URL limit is the bottleneck.**
63
+
64
+ NotebookLM supports adding web URLs as sources, but has a cap on how many you can add per notebook. When a documentation section has hundreds of pages, you'll hit that limit quickly. The recommended three-step workflow is:
65
+
66
+ 1. **Run "Extract URLs Only"** on the target documentation to get a full list of all pages within scope
67
+ 2. **Review and trim** the URL list down to the most relevant pages
68
+ 3. **Add the trimmed URLs directly to NotebookLM** as web sources — NotebookLM fetches and indexes them itself, giving you live, citable sources rather than static file uploads
69
+
70
+ This approach gives you fine-grained control over exactly which pages NotebookLM indexes, without wasting your URL quota on irrelevant pages.
71
+
72
+ ---
73
+
74
+ ## Tech Stack
75
+
76
+ | Layer | Technology |
77
+ |---|---|
78
+ | Backend | Python 3.10+, Flask |
79
+ | Scraping | Playwright (headless Chromium) |
80
+ | Parsing | BeautifulSoup4 |
81
+ | Frontend | HTML5, CSS (Glassmorphism), Vanilla JS |
82
+ | CI/CD | GitHub Actions, Black, Flake8, Bandit |
83
+
84
+ ---
85
+
86
+ ## Continuous Integration (CI/CD)
87
+
88
+ DocuLift includes a pre-configured GitHub Actions pipeline (`.github/workflows/ci.yml`) that automatically runs on every push and pull request to the `main` or `master` branches.
89
+
90
+ The pipeline executes the following checks to ensure code quality and security:
91
+
92
+ 1. **Code Formatting (Black)**
93
+ - Automatically checks that all Python files adhere to standard `black` formatting rules.
94
+ 2. **Linting (Flake8)**
95
+ - Scans for syntax errors, undefined names, and unused imports.
96
+ - Enforces a maximum line length and complexity thresholds.
97
+ 3. **Security Scanning (Bandit)**
98
+ - Analyzes Python code for common security vulnerabilities.
99
+ - Ensures safe configurations (e.g., verifying `debug=False` for Flask in production environments).
100
+
101
+ *Note: The pipeline strictly fails if any high-severity security issues are found, preventing insecure code from being merged.*
102
+
103
+ ---
104
+
105
+ ## Installation
106
+
107
+ ### Prerequisites
108
+ - Python 3.10 or higher
109
+ - `pip`
110
+
111
+ ### Steps
112
+
113
+ 1. **Clone the repository**
114
+ ```bash
115
+ git clone <repository-url>
116
+ cd doculift
117
+ ```
118
+
119
+ 2. **Create a virtual environment**
120
+ ```bash
121
+ python3 -m venv venv
122
+ source venv/bin/activate # Windows: venv\Scripts\activate
123
+ ```
124
+
125
+ 3. **Install dependencies**
126
+ ```bash
127
+ pip install -r requirements.txt
128
+ ```
129
+
130
+ 4. **Install Chromium**
131
+ ```bash
132
+ playwright install chromium
133
+ ```
134
+
135
+ 5. **Start the app**
136
+ ```bash
137
+ python3 app.py
138
+ ```
139
+ Open `http://127.0.0.1:5001` in your browser.
140
+
141
+ ---
142
+
143
+ ## Usage
144
+
145
+ DocuLift is a hybrid tool. You can run it via a beautiful Web interface, or directly from your terminal.
146
+
147
+ ### 1. Web User Interface
148
+
149
+ Start the local server:
150
+ ```bash
151
+ doculift ui
152
+ # or
153
+ doculift ui --port 5001
154
+ ```
155
+ Then open `http://127.0.0.1:5001` in your browser.
156
+
157
+ 1. **Enter target URLs** — one per line (e.g. `https://docs.docker.com/reference/`)
158
+ 2. **Choose Extract Mode** — *Extract Content* or *Extract URLs Only*
159
+ 3. **Choose Scoping Strategy** — *Section Only* (recommended) or *Entire Domain*
160
+ 4. **Choose Output Format** — Markdown, Plain Text, or CSV (applies to content mode)
161
+ 5. **Set Max Pages per URL** — default 500; each starting URL is crawled independently up to this limit
162
+ 6. **Click "Siphon Content"** and watch the progress bar
163
+ 7. On completion, per-URL stats are shown and files are available for download
164
+
165
+ ### 2. Command Line Interface (CLI)
166
+
167
+ Run extraction directly from your terminal with a beautiful progress bar. Files will be saved into the `./outputs` folder automatically.
168
+
169
+ ```bash
170
+ # See all available commands and options
171
+ doculift --help
172
+
173
+ # See options specific to the scrape command
174
+ doculift scrape --help
175
+
176
+ # Example: Extract full markdown content from a documentation section
177
+ doculift scrape https://docs.docker.com/reference/
178
+
179
+ # Example: Extract only URLs, capped at 1000 pages, from multiple sources
180
+ doculift scrape https://paketo.io/docs/ https://docs.docker.com/ --mode urls --max-pages 1000
181
+ ```
182
+
183
+ ---
184
+
185
+ ## How It Works
186
+
187
+ ```
188
+ User submits URLs + config
189
+
190
+ Background thread spawned (one per job)
191
+
192
+ For each starting URL:
193
+ ├── Determine scope (section boundary or full domain)
194
+ ├── BFS crawl with Playwright (handles JS rendering)
195
+ ├── [Content mode] Clean HTML → Markdown, buffer → split files at 500KB
196
+ └── [URL mode] Collect discovered links → single .txt file
197
+
198
+ Per-URL stats displayed, files available for download
199
+ ```
200
+
201
+ **Key crawl behaviours:**
202
+ - Each starting URL gets an independent BFS with its own visited set — URLs are not cross-contaminated between starting points
203
+ - `max_pages` applies per starting URL, not globally
204
+ - Pages already scraped by an earlier starting URL in the same job are skipped to avoid duplication
205
+ - Fragment URLs (`#anchor`) are normalised and deduplicated
206
+
207
+ ---
208
+
209
+ ## API
210
+
211
+ Trigger jobs programmatically:
212
+
213
+ ```bash
214
+ curl -X POST http://127.0.0.1:5001/scrape \
215
+ -H "Content-Type: application/json" \
216
+ -d '{
217
+ "urls": ["https://docs.docker.com/reference/", "https://paketo.io/docs/"],
218
+ "format": "md",
219
+ "max_pages": 200,
220
+ "scope_type": "section",
221
+ "extract_mode": "content"
222
+ }'
223
+ ```
224
+
225
+ Response:
226
+ ```json
227
+ { "job_id": "abc123" }
228
+ ```
229
+
230
+ Poll for status:
231
+ ```bash
232
+ curl http://127.0.0.1:5001/status/abc123
233
+ ```
234
+
235
+ Response fields: `status`, `progress`, `is_finished`, `files`, `per_url_stats`, `urls_extracted`.
236
+
237
+ Download a file:
238
+ ```
239
+ GET /download/<job_id>/<filename>
240
+ ```
@@ -0,0 +1,213 @@
1
+ # DocuLift
2
+
3
+ **DocuLift** is a web scraping tool that lifts documentation websites into clean, aggregated files optimized for feeding into Large Language Models like Google NotebookLM, Claude, or ChatGPT.
4
+
5
+ It handles dynamic Single Page Applications (SPAs), respects site structure, and produces output in two modes: full content extraction or URL-only extraction.
6
+
7
+ ---
8
+
9
+ ## Features
10
+
11
+ - **Two Extract Modes** — choose between extracting full page content or just collecting URLs (see [When to Use Each Mode](#when-to-use-each-mode))
12
+ - **Dynamic Content Scraping** — uses Playwright (headless Chromium) to render JavaScript-heavy sites (React, Vue, etc.) before extraction
13
+ - **Smart Scoping**:
14
+ - **Section Only** — stays within the folder boundary of the starting URL (e.g. starting at `.../docs/agents/overview` scrapes everything under `.../docs/agents/`)
15
+ - **Entire Domain** — crawls all pages under the target domain
16
+ - **Intelligent Aggregation** — combines multiple pages into single files, auto-splits at ~500KB (NotebookLM's per-file limit), generates meaningful filenames
17
+ - **Multi-URL Support** — submit multiple starting URLs in one job; each is crawled independently and produces its own output file(s)
18
+ - **Per-URL stats** — on completion, the UI shows how many pages or URLs were collected per starting URL
19
+ - **Clean Extraction** — removes navigation, footers, sidebars, ads, and scripts; focuses on main content
20
+
21
+ ---
22
+
23
+ ## When to Use Each Mode
24
+
25
+ ### Extract Content
26
+ Crawls each page and converts its content to Markdown (or text/CSV). Use this when you want to feed documentation directly into an LLM as context.
27
+
28
+ - **Best for**: NotebookLM, Claude Projects, ChatGPT — any tool that accepts uploaded documents
29
+ - **Output**: One or more `.md` files per starting URL, split at ~500KB
30
+ - **Typical workflow**: Extract content → upload files to NotebookLM → ask questions
31
+
32
+ ### Extract URLs Only
33
+ Crawls the site and collects every discovered URL within scope, writing them to a plain `.txt` file — one URL per line, no other content.
34
+
35
+ **Use this when NotebookLM's URL limit is the bottleneck.**
36
+
37
+ NotebookLM supports adding web URLs as sources, but has a cap on how many you can add per notebook. When a documentation section has hundreds of pages, you'll hit that limit quickly. The recommended three-step workflow is:
38
+
39
+ 1. **Run "Extract URLs Only"** on the target documentation to get a full list of all pages within scope
40
+ 2. **Review and trim** the URL list down to the most relevant pages
41
+ 3. **Add the trimmed URLs directly to NotebookLM** as web sources — NotebookLM fetches and indexes them itself, giving you live, citable sources rather than static file uploads
42
+
43
+ This approach gives you fine-grained control over exactly which pages NotebookLM indexes, without wasting your URL quota on irrelevant pages.
44
+
45
+ ---
46
+
47
+ ## Tech Stack
48
+
49
+ | Layer | Technology |
50
+ |---|---|
51
+ | Backend | Python 3.10+, Flask |
52
+ | Scraping | Playwright (headless Chromium) |
53
+ | Parsing | BeautifulSoup4 |
54
+ | Frontend | HTML5, CSS (Glassmorphism), Vanilla JS |
55
+ | CI/CD | GitHub Actions, Black, Flake8, Bandit |
56
+
57
+ ---
58
+
59
+ ## Continuous Integration (CI/CD)
60
+
61
+ DocuLift includes a pre-configured GitHub Actions pipeline (`.github/workflows/ci.yml`) that automatically runs on every push and pull request to the `main` or `master` branches.
62
+
63
+ The pipeline executes the following checks to ensure code quality and security:
64
+
65
+ 1. **Code Formatting (Black)**
66
+ - Automatically checks that all Python files adhere to standard `black` formatting rules.
67
+ 2. **Linting (Flake8)**
68
+ - Scans for syntax errors, undefined names, and unused imports.
69
+ - Enforces a maximum line length and complexity thresholds.
70
+ 3. **Security Scanning (Bandit)**
71
+ - Analyzes Python code for common security vulnerabilities.
72
+ - Ensures safe configurations (e.g., verifying `debug=False` for Flask in production environments).
73
+
74
+ *Note: The pipeline strictly fails if any high-severity security issues are found, preventing insecure code from being merged.*
75
+
76
+ ---
77
+
78
+ ## Installation
79
+
80
+ ### Prerequisites
81
+ - Python 3.10 or higher
82
+ - `pip`
83
+
84
+ ### Steps
85
+
86
+ 1. **Clone the repository**
87
+ ```bash
88
+ git clone <repository-url>
89
+ cd doculift
90
+ ```
91
+
92
+ 2. **Create a virtual environment**
93
+ ```bash
94
+ python3 -m venv venv
95
+ source venv/bin/activate # Windows: venv\Scripts\activate
96
+ ```
97
+
98
+ 3. **Install dependencies**
99
+ ```bash
100
+ pip install -r requirements.txt
101
+ ```
102
+
103
+ 4. **Install Chromium**
104
+ ```bash
105
+ playwright install chromium
106
+ ```
107
+
108
+ 5. **Start the app**
109
+ ```bash
110
+ python3 app.py
111
+ ```
112
+ Open `http://127.0.0.1:5001` in your browser.
113
+
114
+ ---
115
+
116
+ ## Usage
117
+
118
+ DocuLift is a hybrid tool. You can run it via a beautiful Web interface, or directly from your terminal.
119
+
120
+ ### 1. Web User Interface
121
+
122
+ Start the local server:
123
+ ```bash
124
+ doculift ui
125
+ # or
126
+ doculift ui --port 5001
127
+ ```
128
+ Then open `http://127.0.0.1:5001` in your browser.
129
+
130
+ 1. **Enter target URLs** — one per line (e.g. `https://docs.docker.com/reference/`)
131
+ 2. **Choose Extract Mode** — *Extract Content* or *Extract URLs Only*
132
+ 3. **Choose Scoping Strategy** — *Section Only* (recommended) or *Entire Domain*
133
+ 4. **Choose Output Format** — Markdown, Plain Text, or CSV (applies to content mode)
134
+ 5. **Set Max Pages per URL** — default 500; each starting URL is crawled independently up to this limit
135
+ 6. **Click "Siphon Content"** and watch the progress bar
136
+ 7. On completion, per-URL stats are shown and files are available for download
137
+
138
+ ### 2. Command Line Interface (CLI)
139
+
140
+ Run extraction directly from your terminal with a beautiful progress bar. Files will be saved into the `./outputs` folder automatically.
141
+
142
+ ```bash
143
+ # See all available commands and options
144
+ doculift --help
145
+
146
+ # See options specific to the scrape command
147
+ doculift scrape --help
148
+
149
+ # Example: Extract full markdown content from a documentation section
150
+ doculift scrape https://docs.docker.com/reference/
151
+
152
+ # Example: Extract only URLs, capped at 1000 pages, from multiple sources
153
+ doculift scrape https://paketo.io/docs/ https://docs.docker.com/ --mode urls --max-pages 1000
154
+ ```
155
+
156
+ ---
157
+
158
+ ## How It Works
159
+
160
+ ```
161
+ User submits URLs + config
162
+
163
+ Background thread spawned (one per job)
164
+
165
+ For each starting URL:
166
+ ├── Determine scope (section boundary or full domain)
167
+ ├── BFS crawl with Playwright (handles JS rendering)
168
+ ├── [Content mode] Clean HTML → Markdown, buffer → split files at 500KB
169
+ └── [URL mode] Collect discovered links → single .txt file
170
+
171
+ Per-URL stats displayed, files available for download
172
+ ```
173
+
174
+ **Key crawl behaviours:**
175
+ - Each starting URL gets an independent BFS with its own visited set — URLs are not cross-contaminated between starting points
176
+ - `max_pages` applies per starting URL, not globally
177
+ - Pages already scraped by an earlier starting URL in the same job are skipped to avoid duplication
178
+ - Fragment URLs (`#anchor`) are normalised and deduplicated
179
+
180
+ ---
181
+
182
+ ## API
183
+
184
+ Trigger jobs programmatically:
185
+
186
+ ```bash
187
+ curl -X POST http://127.0.0.1:5001/scrape \
188
+ -H "Content-Type: application/json" \
189
+ -d '{
190
+ "urls": ["https://docs.docker.com/reference/", "https://paketo.io/docs/"],
191
+ "format": "md",
192
+ "max_pages": 200,
193
+ "scope_type": "section",
194
+ "extract_mode": "content"
195
+ }'
196
+ ```
197
+
198
+ Response:
199
+ ```json
200
+ { "job_id": "abc123" }
201
+ ```
202
+
203
+ Poll for status:
204
+ ```bash
205
+ curl http://127.0.0.1:5001/status/abc123
206
+ ```
207
+
208
+ Response fields: `status`, `progress`, `is_finished`, `files`, `per_url_stats`, `urls_extracted`.
209
+
210
+ Download a file:
211
+ ```
212
+ GET /download/<job_id>/<filename>
213
+ ```
@@ -0,0 +1,48 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "doculift-cli"
7
+ version = "0.1.0"
8
+ description = "A powerful CLI & web scraper that lifts documentation for Large Language Models."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "MIT" }
12
+ authors = [
13
+ { name = "M.J. Shetty" }
14
+ ]
15
+ classifiers = [
16
+ "Programming Language :: Python :: 3",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Operating System :: OS Independent",
19
+ "Intended Audience :: Developers",
20
+ "Topic :: Utilities",
21
+ ]
22
+ dependencies = [
23
+ "flask>=3.0.0",
24
+ "requests",
25
+ "beautifulsoup4",
26
+ "playwright",
27
+ "click",
28
+ "rich"
29
+ ]
30
+
31
+ [project.optional-dependencies]
32
+ dev = [
33
+ "black",
34
+ "flake8",
35
+ "bandit",
36
+ "build",
37
+ "twine"
38
+ ]
39
+
40
+ [project.urls]
41
+ "Homepage" = "https://github.com/mjshetty/doculift"
42
+
43
+ [project.scripts]
44
+ doculift = "doculift.cli:cli"
45
+
46
+ [tool.setuptools.packages.find]
47
+ where = ["src"]
48
+ include = ["doculift*"]
@@ -0,0 +1,9 @@
1
+ flask
2
+ requests
3
+ beautifulsoup4
4
+ playwright
5
+ flake8
6
+ black
7
+ bandit
8
+ click
9
+ rich
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,4 @@
1
# Package entry point: enables `python -m doculift`, dispatching to the
# Click command group defined in doculift/cli.py.
from .cli import cli

if __name__ == "__main__":
    cli()
@@ -0,0 +1,97 @@
1
from flask import Flask, render_template, request, jsonify, send_from_directory
import threading
import uuid
import os
from .scraper import DocuLiftScraper

app = Flask(__name__)

# In-memory storage for jobs.
# Production should use Redis/Celery, but for this scale a global dict is fine.
jobs = {}

# All scrape output is written beneath this directory, one subfolder per job.
OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "outputs")
# exist_ok=True avoids the check-then-create race of the original
# `if not os.path.exists(...)` pattern (TOCTOU under concurrent startup).
os.makedirs(OUTPUT_DIR, exist_ok=True)
16
+
17
+
18
@app.route("/")
def index():
    """Render and return the single-page web UI."""
    page = render_template("index.html")
    return page
21
+
22
+
23
@app.route("/scrape", methods=["POST"])
def start_scrape():
    """Start a scrape job in a background thread.

    Expects a JSON body with "urls" (list of starting URLs) plus optional
    "format", "max_pages", "scope_type" and "extract_mode" settings.
    Returns {"job_id": ...} on success, or a 400 error for a missing or
    invalid payload.
    """
    # silent=True yields None (instead of raising) for a missing or
    # malformed JSON body, so a bad request becomes a clean 400, not a 500.
    data = request.get_json(silent=True) or {}
    urls = data.get("urls", [])
    output_format = data.get("format", "md")
    scope_type = data.get("scope_type", "section")
    extract_mode = data.get("extract_mode", "content")
    try:
        max_pages = int(data.get("max_pages", 500))
    except (TypeError, ValueError):
        # Previously an unparsable max_pages raised and surfaced as a 500.
        return jsonify({"error": "max_pages must be an integer"}), 400

    if not urls:
        return jsonify({"error": "No URLs provided"}), 400

    job_id = str(uuid.uuid4())
    scraper = DocuLiftScraper(
        urls,
        output_format=output_format,
        max_pages=max_pages,
        scope_type=scope_type,
        extract_mode=extract_mode,
    )

    jobs[job_id] = {"scraper": scraper, "status": "pending", "progress": 0, "files": []}

    # Daemon thread: a running job must not block interpreter shutdown.
    thread = threading.Thread(target=run_scraper_task, args=(job_id, scraper))
    thread.daemon = True
    thread.start()

    return jsonify({"job_id": job_id})
52
+
53
+
54
def run_scraper_task(job_id, scraper):
    """Execute *scraper* for *job_id*, recording outcome in the jobs dict."""
    job = jobs[job_id]
    try:
        job["status"] = "running"
        target_dir = os.path.join(OUTPUT_DIR, job_id)
        scraper.run(target_dir)

        # Expose every file the scraper produced so the UI can offer downloads.
        job["files"] = os.listdir(target_dir)
        job["status"] = "completed"
        job["progress"] = 100
    except Exception as exc:
        # Surface the failure reason to the status endpoint via the status string.
        job["status"] = f"error: {exc}"
67
+
68
+
69
@app.route("/status/<job_id>")
def get_status(job_id):
    """Report live progress for *job_id*; returns 404 if the job is unknown."""
    job = jobs.get(job_id)
    if not job:
        return jsonify({"error": "Job not found"}), 404

    # A job is finished when it completed or its status records a failure.
    # run_scraper_task stores failures as "error: <message>", so the exact
    # value "error" never occurs; the original redundant membership test
    # (`in ["completed", "error"]`) is simplified away without changing
    # the result for any status string.
    status = job["status"]
    is_finished = status == "completed" or "error" in status

    return jsonify(
        {
            "status": job["scraper"].status,
            "progress": job["scraper"].progress,
            "is_finished": is_finished,
            "files": job["files"],
            "job_id": job_id,
            "urls_extracted": job["scraper"].urls_extracted,
            "per_url_stats": job["scraper"].per_url_stats,
        }
    )
87
+
88
+
89
@app.route("/download/<job_id>/<filename>")
def download_file(job_id, filename):
    """Serve one output file produced by a job.

    Only job ids present in the in-memory registry are served: this stops a
    crafted job_id (e.g. "..") from pointing the directory join outside
    OUTPUT_DIR. send_from_directory already guards *filename* against
    path traversal, but the directory argument it receives is trusted, so
    job_id must be validated here.
    """
    if job_id not in jobs:
        return jsonify({"error": "Job not found"}), 404
    return send_from_directory(os.path.join(OUTPUT_DIR, job_id), filename)
92
+
93
+
94
if __name__ == "__main__":
    # debug=True is disabled to prevent arbitrary code execution vulnerabilities
    # in production-like environments, satisfying Bandit B201.
    # Serves on the Flask default host (127.0.0.1), port 5001.
    app.run(debug=False, port=5001)