doculift 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doculift-0.1.0/MANIFEST.in +6 -0
- doculift-0.1.0/PKG-INFO +229 -0
- doculift-0.1.0/README.md +202 -0
- doculift-0.1.0/pyproject.toml +48 -0
- doculift-0.1.0/requirements.txt +9 -0
- doculift-0.1.0/setup.cfg +4 -0
- doculift-0.1.0/src/doculift/__init__.py +0 -0
- doculift-0.1.0/src/doculift/__main__.py +4 -0
- doculift-0.1.0/src/doculift/app.py +97 -0
- doculift-0.1.0/src/doculift/cli.py +177 -0
- doculift-0.1.0/src/doculift/scraper.py +442 -0
- doculift-0.1.0/src/doculift/static/css/style.css +481 -0
- doculift-0.1.0/src/doculift/static/doculift_logo.png +0 -0
- doculift-0.1.0/src/doculift/static/js/main.js +82 -0
- doculift-0.1.0/src/doculift/templates/index.html +91 -0
- doculift-0.1.0/src/doculift.egg-info/PKG-INFO +229 -0
- doculift-0.1.0/src/doculift.egg-info/SOURCES.txt +19 -0
- doculift-0.1.0/src/doculift.egg-info/dependency_links.txt +1 -0
- doculift-0.1.0/src/doculift.egg-info/entry_points.txt +2 -0
- doculift-0.1.0/src/doculift.egg-info/requires.txt +13 -0
- doculift-0.1.0/src/doculift.egg-info/top_level.txt +1 -0
doculift-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: doculift
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A powerful CLI & web scraper that lifts documentation for Large Language Models.
|
|
5
|
+
Author: M.J. Shetty
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/mjshetty/doculift
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Topic :: Utilities
|
|
13
|
+
Requires-Python: >=3.10
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
Requires-Dist: flask>=3.0.0
|
|
16
|
+
Requires-Dist: requests
|
|
17
|
+
Requires-Dist: beautifulsoup4
|
|
18
|
+
Requires-Dist: playwright
|
|
19
|
+
Requires-Dist: click
|
|
20
|
+
Requires-Dist: rich
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: black; extra == "dev"
|
|
23
|
+
Requires-Dist: flake8; extra == "dev"
|
|
24
|
+
Requires-Dist: bandit; extra == "dev"
|
|
25
|
+
Requires-Dist: build; extra == "dev"
|
|
26
|
+
Requires-Dist: twine; extra == "dev"
|
|
27
|
+
|
|
28
|
+
# DocuLift
|
|
29
|
+
|
|
30
|
+
**DocuLift** is a web scraping tool that lifts documentation websites into clean, aggregated files optimized for feeding into Large Language Models like Google NotebookLM, Claude, or ChatGPT.
|
|
31
|
+
|
|
32
|
+
It handles dynamic Single Page Applications (SPAs), respects site structure, and produces output in two modes: full content extraction or URL-only extraction.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Features
|
|
37
|
+
|
|
38
|
+
- **Two Extract Modes** — choose between extracting full page content or just collecting URLs (see [When to Use Each Mode](#when-to-use-each-mode))
|
|
39
|
+
- **Dynamic Content Scraping** — uses Playwright (headless Chromium) to render JavaScript-heavy sites (React, Vue, etc.) before extraction
|
|
40
|
+
- **Smart Scoping**:
|
|
41
|
+
- **Section Only** — stays within the folder boundary of the starting URL (e.g. starting at `.../docs/agents/overview` scrapes everything under `.../docs/agents/`)
|
|
42
|
+
- **Entire Domain** — crawls all pages under the target domain
|
|
43
|
+
- **Intelligent Aggregation** — combines multiple pages into single files, auto-splits at ~500KB (NotebookLM's per-file limit), generates meaningful filenames
|
|
44
|
+
- **Multi-URL Support** — submit multiple starting URLs in one job; each is crawled independently and produces its own output file(s)
|
|
45
|
+
- **Per-URL stats** — on completion, the UI shows how many pages or URLs were collected per starting URL
|
|
46
|
+
- **Clean Extraction** — removes navigation, footers, sidebars, ads, and scripts; focuses on main content
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## When to Use Each Mode
|
|
51
|
+
|
|
52
|
+
### Extract Content
|
|
53
|
+
Crawls each page and converts its content to Markdown (or text/CSV). Use this when you want to feed documentation directly into an LLM as context.
|
|
54
|
+
|
|
55
|
+
- **Best for**: NotebookLM, Claude Projects, ChatGPT — any tool that accepts uploaded documents
|
|
56
|
+
- **Output**: One or more `.md` files per starting URL, split at ~500KB
|
|
57
|
+
- **Typical workflow**: Extract content → upload files to NotebookLM → ask questions
|
|
58
|
+
|
|
59
|
+
### Extract URLs Only
|
|
60
|
+
Crawls the site and collects every discovered URL within scope, writing them to a plain `.txt` file — one URL per line, no other content.
|
|
61
|
+
|
|
62
|
+
**Use this when NotebookLM's URL limit is the bottleneck.**
|
|
63
|
+
|
|
64
|
+
NotebookLM supports adding web URLs as sources, but has a cap on how many you can add per notebook. When a documentation section has hundreds of pages, you'll hit that limit quickly. The recommended three-step workflow is:
|
|
65
|
+
|
|
66
|
+
1. **Run "Extract URLs Only"** on the target documentation to get a full list of all pages within scope
|
|
67
|
+
2. **Review and trim** the URL list down to the most relevant pages
|
|
68
|
+
3. **Add the trimmed URLs directly to NotebookLM** as web sources — NotebookLM fetches and indexes them itself, giving you live, citable sources rather than static file uploads
|
|
69
|
+
|
|
70
|
+
This approach gives you fine-grained control over exactly which pages NotebookLM indexes, without wasting your URL quota on irrelevant pages.
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## Tech Stack
|
|
75
|
+
|
|
76
|
+
| Layer | Technology |
|
|
77
|
+
|---|---|
|
|
78
|
+
| Backend | Python 3.10+, Flask |
|
|
79
|
+
| Scraping | Playwright (headless Chromium) |
|
|
80
|
+
| Parsing | BeautifulSoup4 |
|
|
81
|
+
| Frontend | HTML5, CSS (Glassmorphism), Vanilla JS |
|
|
82
|
+
| CI/CD | GitHub Actions, Black, Flake8, Bandit |
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
## Continuous Integration (CI/CD)
|
|
87
|
+
|
|
88
|
+
DocuLift includes a pre-configured GitHub Actions pipeline (`.github/workflows/ci.yml`) that automatically runs on every push and pull request to the `main` or `master` branches.
|
|
89
|
+
|
|
90
|
+
The pipeline executes the following checks to ensure code quality and security:
|
|
91
|
+
|
|
92
|
+
1. **Code Formatting (Black)**
|
|
93
|
+
- Automatically checks that all Python files adhere to standard `black` formatting rules.
|
|
94
|
+
2. **Linting (Flake8)**
|
|
95
|
+
- Scans for syntax errors, undefined names, and unused imports.
|
|
96
|
+
- Enforces a maximum line length and complexity thresholds.
|
|
97
|
+
3. **Security Scanning (Bandit)**
|
|
98
|
+
- Analyzes Python code for common security vulnerabilities.
|
|
99
|
+
- Ensures safe configurations (e.g., verifying `debug=False` for Flask in production environments).
|
|
100
|
+
|
|
101
|
+
*Note: The pipeline strictly fails if any high-severity security issues are found, preventing insecure code from being merged.*
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## Installation
|
|
106
|
+
|
|
107
|
+
DocuLift is published on PyPI as `doculift`. We recommend installing it in a virtual environment or using `pipx`.
|
|
108
|
+
|
|
109
|
+
### Prerequisites
|
|
110
|
+
- Python 3.10 or higher
|
|
111
|
+
|
|
112
|
+
### Steps
|
|
113
|
+
|
|
114
|
+
1. **Install the package via pip**
|
|
115
|
+
```bash
|
|
116
|
+
pip install doculift
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
2. **Install Chromium (required for dynamic page scraping)**
|
|
120
|
+
```bash
|
|
121
|
+
playwright install chromium
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
3. **Start the Web UI**
|
|
125
|
+
```bash
|
|
126
|
+
doculift ui
|
|
127
|
+
```
|
|
128
|
+
Open `http://127.0.0.1:5001` in your browser.
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
## Usage
|
|
133
|
+
|
|
134
|
+
DocuLift is a hybrid tool. You can run it via a beautiful Web interface, or directly from your terminal.
|
|
135
|
+
|
|
136
|
+
### 1. Web User Interface
|
|
137
|
+
|
|
138
|
+
Start the local server:
|
|
139
|
+
```bash
|
|
140
|
+
doculift ui
|
|
141
|
+
# or
|
|
142
|
+
doculift ui --port 5001
|
|
143
|
+
```
|
|
144
|
+
Then open `http://127.0.0.1:5001` in your browser.
|
|
145
|
+
|
|
146
|
+
1. **Enter target URLs** — one per line (e.g. `https://docs.docker.com/reference/`)
|
|
147
|
+
2. **Choose Extract Mode** — *Extract Content* or *Extract URLs Only*
|
|
148
|
+
3. **Choose Scoping Strategy** — *Section Only* (recommended) or *Entire Domain*
|
|
149
|
+
4. **Choose Output Format** — Markdown, Plain Text, or CSV (applies to content mode)
|
|
150
|
+
5. **Set Max Pages per URL** — default 500; each starting URL is crawled independently up to this limit
|
|
151
|
+
6. **Click "Siphon Content"** and watch the progress bar
|
|
152
|
+
7. On completion, per-URL stats are shown and files are available for download
|
|
153
|
+
|
|
154
|
+
### 2. Command Line Interface (CLI)
|
|
155
|
+
|
|
156
|
+
Run extraction directly from your terminal with a beautiful progress bar. Files will be saved into the `./outputs` folder automatically.
|
|
157
|
+
|
|
158
|
+
```bash
|
|
159
|
+
# See all available commands and options
|
|
160
|
+
doculift --help
|
|
161
|
+
|
|
162
|
+
# See options specific to the scrape command
|
|
163
|
+
doculift scrape --help
|
|
164
|
+
|
|
165
|
+
# Example: Extract full markdown content from a documentation section
|
|
166
|
+
doculift scrape https://docs.docker.com/reference/
|
|
167
|
+
|
|
168
|
+
# Example: Extract only URLs, capped at 1000 pages, from multiple sources
|
|
169
|
+
doculift scrape https://paketo.io/docs/ https://docs.docker.com/ --mode urls --max-pages 1000
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
---
|
|
173
|
+
|
|
174
|
+
## How It Works
|
|
175
|
+
|
|
176
|
+
```
|
|
177
|
+
User submits URLs + config
|
|
178
|
+
↓
|
|
179
|
+
Background thread spawned (one per job)
|
|
180
|
+
↓
|
|
181
|
+
For each starting URL:
|
|
182
|
+
├── Determine scope (section boundary or full domain)
|
|
183
|
+
├── BFS crawl with Playwright (handles JS rendering)
|
|
184
|
+
├── [Content mode] Clean HTML → Markdown, buffer → split files at 500KB
|
|
185
|
+
└── [URL mode] Collect discovered links → single .txt file
|
|
186
|
+
↓
|
|
187
|
+
Per-URL stats displayed, files available for download
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
**Key crawl behaviours:**
|
|
191
|
+
- Each starting URL gets an independent BFS with its own visited set — URLs are not cross-contaminated between starting points
|
|
192
|
+
- `max_pages` applies per starting URL, not globally
|
|
193
|
+
- Pages already scraped by an earlier starting URL in the same job are skipped to avoid duplication
|
|
194
|
+
- Fragment URLs (`#anchor`) are normalised and deduplicated
|
|
195
|
+
|
|
196
|
+
---
|
|
197
|
+
|
|
198
|
+
## API
|
|
199
|
+
|
|
200
|
+
Trigger jobs programmatically:
|
|
201
|
+
|
|
202
|
+
```bash
|
|
203
|
+
curl -X POST http://127.0.0.1:5001/scrape \
|
|
204
|
+
-H "Content-Type: application/json" \
|
|
205
|
+
-d '{
|
|
206
|
+
"urls": ["https://docs.docker.com/reference/", "https://paketo.io/docs/"],
|
|
207
|
+
"format": "md",
|
|
208
|
+
"max_pages": 200,
|
|
209
|
+
"scope_type": "section",
|
|
210
|
+
"extract_mode": "content"
|
|
211
|
+
}'
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
Response:
|
|
215
|
+
```json
|
|
216
|
+
{ "job_id": "abc123" }
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
Poll for status:
|
|
220
|
+
```bash
|
|
221
|
+
curl http://127.0.0.1:5001/status/abc123
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
Response fields: `status`, `progress`, `is_finished`, `files`, `per_url_stats`, `urls_extracted`.
|
|
225
|
+
|
|
226
|
+
Download a file:
|
|
227
|
+
```
|
|
228
|
+
GET /download/<job_id>/<filename>
|
|
229
|
+
```
|
doculift-0.1.0/README.md
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
# DocuLift
|
|
2
|
+
|
|
3
|
+
**DocuLift** is a web scraping tool that lifts documentation websites into clean, aggregated files optimized for feeding into Large Language Models like Google NotebookLM, Claude, or ChatGPT.
|
|
4
|
+
|
|
5
|
+
It handles dynamic Single Page Applications (SPAs), respects site structure, and produces output in two modes: full content extraction or URL-only extraction.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Features
|
|
10
|
+
|
|
11
|
+
- **Two Extract Modes** — choose between extracting full page content or just collecting URLs (see [When to Use Each Mode](#when-to-use-each-mode))
|
|
12
|
+
- **Dynamic Content Scraping** — uses Playwright (headless Chromium) to render JavaScript-heavy sites (React, Vue, etc.) before extraction
|
|
13
|
+
- **Smart Scoping**:
|
|
14
|
+
- **Section Only** — stays within the folder boundary of the starting URL (e.g. starting at `.../docs/agents/overview` scrapes everything under `.../docs/agents/`)
|
|
15
|
+
- **Entire Domain** — crawls all pages under the target domain
|
|
16
|
+
- **Intelligent Aggregation** — combines multiple pages into single files, auto-splits at ~500KB (NotebookLM's per-file limit), generates meaningful filenames
|
|
17
|
+
- **Multi-URL Support** — submit multiple starting URLs in one job; each is crawled independently and produces its own output file(s)
|
|
18
|
+
- **Per-URL stats** — on completion, the UI shows how many pages or URLs were collected per starting URL
|
|
19
|
+
- **Clean Extraction** — removes navigation, footers, sidebars, ads, and scripts; focuses on main content
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## When to Use Each Mode
|
|
24
|
+
|
|
25
|
+
### Extract Content
|
|
26
|
+
Crawls each page and converts its content to Markdown (or text/CSV). Use this when you want to feed documentation directly into an LLM as context.
|
|
27
|
+
|
|
28
|
+
- **Best for**: NotebookLM, Claude Projects, ChatGPT — any tool that accepts uploaded documents
|
|
29
|
+
- **Output**: One or more `.md` files per starting URL, split at ~500KB
|
|
30
|
+
- **Typical workflow**: Extract content → upload files to NotebookLM → ask questions
|
|
31
|
+
|
|
32
|
+
### Extract URLs Only
|
|
33
|
+
Crawls the site and collects every discovered URL within scope, writing them to a plain `.txt` file — one URL per line, no other content.
|
|
34
|
+
|
|
35
|
+
**Use this when NotebookLM's URL limit is the bottleneck.**
|
|
36
|
+
|
|
37
|
+
NotebookLM supports adding web URLs as sources, but has a cap on how many you can add per notebook. When a documentation section has hundreds of pages, you'll hit that limit quickly. The recommended three-step workflow is:
|
|
38
|
+
|
|
39
|
+
1. **Run "Extract URLs Only"** on the target documentation to get a full list of all pages within scope
|
|
40
|
+
2. **Review and trim** the URL list down to the most relevant pages
|
|
41
|
+
3. **Add the trimmed URLs directly to NotebookLM** as web sources — NotebookLM fetches and indexes them itself, giving you live, citable sources rather than static file uploads
|
|
42
|
+
|
|
43
|
+
This approach gives you fine-grained control over exactly which pages NotebookLM indexes, without wasting your URL quota on irrelevant pages.
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
## Tech Stack
|
|
48
|
+
|
|
49
|
+
| Layer | Technology |
|
|
50
|
+
|---|---|
|
|
51
|
+
| Backend | Python 3.10+, Flask |
|
|
52
|
+
| Scraping | Playwright (headless Chromium) |
|
|
53
|
+
| Parsing | BeautifulSoup4 |
|
|
54
|
+
| Frontend | HTML5, CSS (Glassmorphism), Vanilla JS |
|
|
55
|
+
| CI/CD | GitHub Actions, Black, Flake8, Bandit |
|
|
56
|
+
|
|
57
|
+
---
|
|
58
|
+
|
|
59
|
+
## Continuous Integration (CI/CD)
|
|
60
|
+
|
|
61
|
+
DocuLift includes a pre-configured GitHub Actions pipeline (`.github/workflows/ci.yml`) that automatically runs on every push and pull request to the `main` or `master` branches.
|
|
62
|
+
|
|
63
|
+
The pipeline executes the following checks to ensure code quality and security:
|
|
64
|
+
|
|
65
|
+
1. **Code Formatting (Black)**
|
|
66
|
+
- Automatically checks that all Python files adhere to standard `black` formatting rules.
|
|
67
|
+
2. **Linting (Flake8)**
|
|
68
|
+
- Scans for syntax errors, undefined names, and unused imports.
|
|
69
|
+
- Enforces a maximum line length and complexity thresholds.
|
|
70
|
+
3. **Security Scanning (Bandit)**
|
|
71
|
+
- Analyzes Python code for common security vulnerabilities.
|
|
72
|
+
- Ensures safe configurations (e.g., verifying `debug=False` for Flask in production environments).
|
|
73
|
+
|
|
74
|
+
*Note: The pipeline strictly fails if any high-severity security issues are found, preventing insecure code from being merged.*
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## Installation
|
|
79
|
+
|
|
80
|
+
DocuLift is published on PyPI as `doculift`. We recommend installing it in a virtual environment or using `pipx`.
|
|
81
|
+
|
|
82
|
+
### Prerequisites
|
|
83
|
+
- Python 3.10 or higher
|
|
84
|
+
|
|
85
|
+
### Steps
|
|
86
|
+
|
|
87
|
+
1. **Install the package via pip**
|
|
88
|
+
```bash
|
|
89
|
+
pip install doculift
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
2. **Install Chromium (required for dynamic page scraping)**
|
|
93
|
+
```bash
|
|
94
|
+
playwright install chromium
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
3. **Start the Web UI**
|
|
98
|
+
```bash
|
|
99
|
+
doculift ui
|
|
100
|
+
```
|
|
101
|
+
Open `http://127.0.0.1:5001` in your browser.
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## Usage
|
|
106
|
+
|
|
107
|
+
DocuLift is a hybrid tool. You can run it via a beautiful Web interface, or directly from your terminal.
|
|
108
|
+
|
|
109
|
+
### 1. Web User Interface
|
|
110
|
+
|
|
111
|
+
Start the local server:
|
|
112
|
+
```bash
|
|
113
|
+
doculift ui
|
|
114
|
+
# or
|
|
115
|
+
doculift ui --port 5001
|
|
116
|
+
```
|
|
117
|
+
Then open `http://127.0.0.1:5001` in your browser.
|
|
118
|
+
|
|
119
|
+
1. **Enter target URLs** — one per line (e.g. `https://docs.docker.com/reference/`)
|
|
120
|
+
2. **Choose Extract Mode** — *Extract Content* or *Extract URLs Only*
|
|
121
|
+
3. **Choose Scoping Strategy** — *Section Only* (recommended) or *Entire Domain*
|
|
122
|
+
4. **Choose Output Format** — Markdown, Plain Text, or CSV (applies to content mode)
|
|
123
|
+
5. **Set Max Pages per URL** — default 500; each starting URL is crawled independently up to this limit
|
|
124
|
+
6. **Click "Siphon Content"** and watch the progress bar
|
|
125
|
+
7. On completion, per-URL stats are shown and files are available for download
|
|
126
|
+
|
|
127
|
+
### 2. Command Line Interface (CLI)
|
|
128
|
+
|
|
129
|
+
Run extraction directly from your terminal with a beautiful progress bar. Files will be saved into the `./outputs` folder automatically.
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
# See all available commands and options
|
|
133
|
+
doculift --help
|
|
134
|
+
|
|
135
|
+
# See options specific to the scrape command
|
|
136
|
+
doculift scrape --help
|
|
137
|
+
|
|
138
|
+
# Example: Extract full markdown content from a documentation section
|
|
139
|
+
doculift scrape https://docs.docker.com/reference/
|
|
140
|
+
|
|
141
|
+
# Example: Extract only URLs, capped at 1000 pages, from multiple sources
|
|
142
|
+
doculift scrape https://paketo.io/docs/ https://docs.docker.com/ --mode urls --max-pages 1000
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## How It Works
|
|
148
|
+
|
|
149
|
+
```
|
|
150
|
+
User submits URLs + config
|
|
151
|
+
↓
|
|
152
|
+
Background thread spawned (one per job)
|
|
153
|
+
↓
|
|
154
|
+
For each starting URL:
|
|
155
|
+
├── Determine scope (section boundary or full domain)
|
|
156
|
+
├── BFS crawl with Playwright (handles JS rendering)
|
|
157
|
+
├── [Content mode] Clean HTML → Markdown, buffer → split files at 500KB
|
|
158
|
+
└── [URL mode] Collect discovered links → single .txt file
|
|
159
|
+
↓
|
|
160
|
+
Per-URL stats displayed, files available for download
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
**Key crawl behaviours:**
|
|
164
|
+
- Each starting URL gets an independent BFS with its own visited set — URLs are not cross-contaminated between starting points
|
|
165
|
+
- `max_pages` applies per starting URL, not globally
|
|
166
|
+
- Pages already scraped by an earlier starting URL in the same job are skipped to avoid duplication
|
|
167
|
+
- Fragment URLs (`#anchor`) are normalised and deduplicated
|
|
168
|
+
|
|
169
|
+
---
|
|
170
|
+
|
|
171
|
+
## API
|
|
172
|
+
|
|
173
|
+
Trigger jobs programmatically:
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
curl -X POST http://127.0.0.1:5001/scrape \
|
|
177
|
+
-H "Content-Type: application/json" \
|
|
178
|
+
-d '{
|
|
179
|
+
"urls": ["https://docs.docker.com/reference/", "https://paketo.io/docs/"],
|
|
180
|
+
"format": "md",
|
|
181
|
+
"max_pages": 200,
|
|
182
|
+
"scope_type": "section",
|
|
183
|
+
"extract_mode": "content"
|
|
184
|
+
}'
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
Response:
|
|
188
|
+
```json
|
|
189
|
+
{ "job_id": "abc123" }
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
Poll for status:
|
|
193
|
+
```bash
|
|
194
|
+
curl http://127.0.0.1:5001/status/abc123
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
Response fields: `status`, `progress`, `is_finished`, `files`, `per_url_stats`, `urls_extracted`.
|
|
198
|
+
|
|
199
|
+
Download a file:
|
|
200
|
+
```
|
|
201
|
+
GET /download/<job_id>/<filename>
|
|
202
|
+
```
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "doculift"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A powerful CLI & web scraper that lifts documentation for Large Language Models."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "M.J. Shetty" }
|
|
14
|
+
]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
"Intended Audience :: Developers",
|
|
20
|
+
"Topic :: Utilities",
|
|
21
|
+
]
|
|
22
|
+
dependencies = [
|
|
23
|
+
"flask>=3.0.0",
|
|
24
|
+
"requests",
|
|
25
|
+
"beautifulsoup4",
|
|
26
|
+
"playwright",
|
|
27
|
+
"click",
|
|
28
|
+
"rich"
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.optional-dependencies]
|
|
32
|
+
dev = [
|
|
33
|
+
"black",
|
|
34
|
+
"flake8",
|
|
35
|
+
"bandit",
|
|
36
|
+
"build",
|
|
37
|
+
"twine"
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
[project.urls]
|
|
41
|
+
"Homepage" = "https://github.com/mjshetty/doculift"
|
|
42
|
+
|
|
43
|
+
[project.scripts]
|
|
44
|
+
doculift = "doculift.cli:cli"
|
|
45
|
+
|
|
46
|
+
[tool.setuptools.packages.find]
|
|
47
|
+
where = ["src"]
|
|
48
|
+
include = ["doculift*"]
|
doculift-0.1.0/setup.cfg
ADDED
|
File without changes
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
from flask import Flask, render_template, request, jsonify, send_from_directory
|
|
2
|
+
import threading
|
|
3
|
+
import uuid
|
|
4
|
+
import os
|
|
5
|
+
from .scraper import DocuLiftScraper
|
|
6
|
+
|
|
7
|
+
app = Flask(__name__)

# In-memory storage for jobs.
# Production should use Redis/Celery, but for this scale a global dict is fine.
jobs = {}

# All scrape results are written under <package dir>/outputs/<job_id>/.
OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "outputs")
# exist_ok avoids the race between an existence check and the mkdir.
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@app.route("/")
def index():
    """Serve the single-page web UI (templates/index.html)."""
    return render_template("index.html")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@app.route("/scrape", methods=["POST"])
def start_scrape():
    """Kick off a background scrape job.

    Expects a JSON body with:
        urls (list[str]):   starting URLs (required, non-empty)
        format (str):       output format, default "md"
        max_pages (int):    per-URL crawl cap, default 500
        scope_type (str):   scoping strategy, default "section"
        extract_mode (str): "content" or "urls", default "content"

    Returns JSON {"job_id": ...} immediately; poll /status/<job_id> for
    progress, or a 400 with an "error" key on bad input.
    """
    # silent=True returns None (instead of aborting with 400/415) for a
    # missing or malformed JSON body, so we can answer with a uniform error.
    data = request.get_json(silent=True) or {}
    urls = data.get("urls", [])
    output_format = data.get("format", "md")
    scope_type = data.get("scope_type", "section")
    extract_mode = data.get("extract_mode", "content")

    try:
        max_pages = int(data.get("max_pages", 500))
    except (TypeError, ValueError):
        # Previously a bad value bubbled up as a 500; report it properly.
        return jsonify({"error": "max_pages must be an integer"}), 400

    if not urls:
        return jsonify({"error": "No URLs provided"}), 400

    job_id = str(uuid.uuid4())
    scraper = DocuLiftScraper(
        urls,
        output_format=output_format,
        max_pages=max_pages,
        scope_type=scope_type,
        extract_mode=extract_mode,
    )

    jobs[job_id] = {"scraper": scraper, "status": "pending", "progress": 0, "files": []}

    # Daemon thread: an in-flight job must not block interpreter shutdown.
    thread = threading.Thread(target=run_scraper_task, args=(job_id, scraper))
    thread.daemon = True
    thread.start()

    return jsonify({"job_id": job_id})
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def run_scraper_task(job_id, scraper):
    """Background worker: run the scraper and record the outcome in ``jobs``.

    Output files are written into OUTPUT_DIR/<job_id>/. On success the job
    record gets the file list, status "completed" and progress 100; on
    failure the status is set to "error: <message>" (the /status endpoint
    treats any status containing "error" as finished).
    """
    job = jobs[job_id]
    try:
        job["status"] = "running"
        job_dir = os.path.join(OUTPUT_DIR, job_id)
        scraper.run(job_dir)

        # A run can legitimately produce zero files (nothing in scope), in
        # which case the scraper may never create the directory — don't let
        # that listdir failure masquerade as a job error.
        job["files"] = sorted(os.listdir(job_dir)) if os.path.isdir(job_dir) else []
        job["status"] = "completed"
        job["progress"] = 100
    except Exception as e:  # broad on purpose: surface any failure to the UI
        job["status"] = f"error: {str(e)}"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@app.route("/status/<job_id>")
def get_status(job_id):
    """Report live progress for a job, or 404 for an unknown id.

    Human-readable status text and percentage come from the scraper object
    (updated by the worker thread); terminal state is tracked on the job
    record itself.
    """
    job = jobs.get(job_id)
    if not job:
        return jsonify({"error": "Job not found"}), 404

    # Terminal states are "completed" or any "error: ..." string set by
    # run_scraper_task. (The old check also compared against a bare "error"
    # status that is never assigned anywhere — dead condition, removed.)
    is_finished = job["status"] == "completed" or "error" in job["status"]

    return jsonify(
        {
            "status": job["scraper"].status,
            "progress": job["scraper"].progress,
            "is_finished": is_finished,
            "files": job["files"],
            "job_id": job_id,
            "urls_extracted": job["scraper"].urls_extracted,
            "per_url_stats": job["scraper"].per_url_stats,
        }
    )
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@app.route("/download/<job_id>/<filename>")
def download_file(job_id, filename):
    """Stream one output file of a job to the browser.

    ``send_from_directory`` already guards ``filename`` against path
    traversal, but ``job_id`` forms the directory component — reject any id
    this server never issued so that part cannot be attacker-chosen.
    """
    if job_id not in jobs:
        return jsonify({"error": "Job not found"}), 404
    return send_from_directory(os.path.join(OUTPUT_DIR, job_id), filename)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
if __name__ == "__main__":
    # Keep debug disabled: the Werkzeug debugger permits arbitrary code
    # execution if ever exposed, and Bandit (B201) flags debug=True.
    app.run(debug=False, port=5001)
|