doc2lora 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doc2lora-1.0.0/LICENSE +21 -0
- doc2lora-1.0.0/PKG-INFO +603 -0
- doc2lora-1.0.0/README.md +520 -0
- doc2lora-1.0.0/doc2lora/__init__.py +7 -0
- doc2lora-1.0.0/doc2lora/cli.py +523 -0
- doc2lora-1.0.0/doc2lora/core.py +289 -0
- doc2lora-1.0.0/doc2lora/deploy.py +230 -0
- doc2lora-1.0.0/doc2lora/lora_trainer.py +605 -0
- doc2lora-1.0.0/doc2lora/parsers.py +881 -0
- doc2lora-1.0.0/doc2lora/utils.py +432 -0
- doc2lora-1.0.0/doc2lora.egg-info/PKG-INFO +603 -0
- doc2lora-1.0.0/doc2lora.egg-info/SOURCES.txt +24 -0
- doc2lora-1.0.0/doc2lora.egg-info/dependency_links.txt +1 -0
- doc2lora-1.0.0/doc2lora.egg-info/entry_points.txt +2 -0
- doc2lora-1.0.0/doc2lora.egg-info/requires.txt +65 -0
- doc2lora-1.0.0/doc2lora.egg-info/top_level.txt +1 -0
- doc2lora-1.0.0/pyproject.toml +135 -0
- doc2lora-1.0.0/setup.cfg +4 -0
- doc2lora-1.0.0/setup.py +7 -0
- doc2lora-1.0.0/tests/test_cli.py +64 -0
- doc2lora-1.0.0/tests/test_core.py +121 -0
- doc2lora-1.0.0/tests/test_deploy.py +82 -0
- doc2lora-1.0.0/tests/test_lora_trainer.py +96 -0
- doc2lora-1.0.0/tests/test_new_formats.py +232 -0
- doc2lora-1.0.0/tests/test_parsers.py +342 -0
- doc2lora-1.0.0/tests/test_utils.py +103 -0
doc2lora-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 The Earth App
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
doc2lora-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,603 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: doc2lora
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A library for fine-tuning LLMs using LoRA by using a folder of documents as input
|
|
5
|
+
Author-email: Gregory Mitchell <gregory@earth-app.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/earth-app/doc2lora
|
|
8
|
+
Project-URL: Repository, https://github.com/earth-app/doc2lora
|
|
9
|
+
Project-URL: Issues, https://github.com/earth-app/doc2lora/issues
|
|
10
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: torch>=1.13.0
|
|
25
|
+
Requires-Dist: transformers>=4.21.0
|
|
26
|
+
Requires-Dist: peft>=0.3.0
|
|
27
|
+
Requires-Dist: datasets>=2.0.0
|
|
28
|
+
Requires-Dist: click>=8.0.0
|
|
29
|
+
Requires-Dist: tqdm>=4.64.0
|
|
30
|
+
Requires-Dist: numpy>=1.21.0
|
|
31
|
+
Requires-Dist: pandas>=1.3.0
|
|
32
|
+
Provides-Extra: docs
|
|
33
|
+
Requires-Dist: pypdf>=3.0.0; extra == "docs"
|
|
34
|
+
Requires-Dist: python-docx>=0.8.11; extra == "docs"
|
|
35
|
+
Requires-Dist: beautifulsoup4>=4.11.0; extra == "docs"
|
|
36
|
+
Requires-Dist: lxml>=4.9.0; extra == "docs"
|
|
37
|
+
Requires-Dist: pyyaml>=6.0; extra == "docs"
|
|
38
|
+
Requires-Dist: openpyxl>=3.0.0; extra == "docs"
|
|
39
|
+
Requires-Dist: python-pptx>=0.6.21; extra == "docs"
|
|
40
|
+
Requires-Dist: odfpy>=1.4.1; extra == "docs"
|
|
41
|
+
Requires-Dist: striprtf>=0.0.26; extra == "docs"
|
|
42
|
+
Requires-Dist: EbookLib>=0.18; extra == "docs"
|
|
43
|
+
Requires-Dist: py7zr>=0.20.0; extra == "docs"
|
|
44
|
+
Provides-Extra: audio
|
|
45
|
+
Requires-Dist: SpeechRecognition>=3.10.0; extra == "audio"
|
|
46
|
+
Requires-Dist: pydub>=0.25.1; extra == "audio"
|
|
47
|
+
Provides-Extra: r2
|
|
48
|
+
Requires-Dist: boto3>=1.26.0; extra == "r2"
|
|
49
|
+
Requires-Dist: python-dotenv>=1.0.0; extra == "r2"
|
|
50
|
+
Provides-Extra: quant
|
|
51
|
+
Requires-Dist: bitsandbytes>=0.41.0; extra == "quant"
|
|
52
|
+
Provides-Extra: all
|
|
53
|
+
Requires-Dist: pypdf>=3.0.0; extra == "all"
|
|
54
|
+
Requires-Dist: python-docx>=0.8.11; extra == "all"
|
|
55
|
+
Requires-Dist: beautifulsoup4>=4.11.0; extra == "all"
|
|
56
|
+
Requires-Dist: lxml>=4.9.0; extra == "all"
|
|
57
|
+
Requires-Dist: pyyaml>=6.0; extra == "all"
|
|
58
|
+
Requires-Dist: openpyxl>=3.0.0; extra == "all"
|
|
59
|
+
Requires-Dist: python-pptx>=0.6.21; extra == "all"
|
|
60
|
+
Requires-Dist: odfpy>=1.4.1; extra == "all"
|
|
61
|
+
Requires-Dist: striprtf>=0.0.26; extra == "all"
|
|
62
|
+
Requires-Dist: EbookLib>=0.18; extra == "all"
|
|
63
|
+
Requires-Dist: py7zr>=0.20.0; extra == "all"
|
|
64
|
+
Requires-Dist: SpeechRecognition>=3.10.0; extra == "all"
|
|
65
|
+
Requires-Dist: pydub>=0.25.1; extra == "all"
|
|
66
|
+
Requires-Dist: boto3>=1.26.0; extra == "all"
|
|
67
|
+
Requires-Dist: python-dotenv>=1.0.0; extra == "all"
|
|
68
|
+
Requires-Dist: bitsandbytes>=0.41.0; extra == "all"
|
|
69
|
+
Provides-Extra: test
|
|
70
|
+
Requires-Dist: pytest>=7.0.0; extra == "test"
|
|
71
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "test"
|
|
72
|
+
Provides-Extra: dev
|
|
73
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
74
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
75
|
+
Requires-Dist: black>=22.0.0; extra == "dev"
|
|
76
|
+
Requires-Dist: flake8>=5.0.0; extra == "dev"
|
|
77
|
+
Requires-Dist: isort>=5.10.0; extra == "dev"
|
|
78
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
79
|
+
Requires-Dist: pre-commit>=3.0.0; extra == "dev"
|
|
80
|
+
Requires-Dist: build>=1.0.0; extra == "dev"
|
|
81
|
+
Requires-Dist: twine>=4.0.0; extra == "dev"
|
|
82
|
+
Dynamic: license-file
|
|
83
|
+
|
|
84
|
+
# doc2lora
|
|
85
|
+
|
|
86
|
+
This repository is a small library for fine-tuning LLMs using LoRA (Low-Rank Adaptation) by using a folder of documents as input. It is designed to be simple and easy to use, allowing users to quickly adapt large language models to specific tasks or domains.
|
|
87
|
+
|
|
88
|
+
The library allows you to pass a folder of documents (local or from R2 bucket) and turn them into a LoRA Adapter. It is particularly useful for fine-tuning models on domain-specific data, such as legal documents, medical texts, or any other specialized corpus. It is intended to be used with Cloudflare Workers AI or similar platforms that support LLM fine-tuning.
|
|
89
|
+
|
|
90
|
+
It supports the following formats:
|
|
91
|
+
|
|
92
|
+
- **Markdown / reStructuredText**: `.md`, `.rst` files
|
|
93
|
+
- **Text**: `.txt` files or blank text files
|
|
94
|
+
- **PDF**: `.pdf` files
|
|
95
|
+
- **HTML**: `.html` files
|
|
96
|
+
- **Word Documents**: `.docx` files
|
|
97
|
+
- **PowerPoint**: `.pptx` files (slide text + speaker notes)
|
|
98
|
+
- **OpenDocument**: `.odt`, `.ods` files
|
|
99
|
+
- **Rich Text**: `.rtf` files
|
|
100
|
+
- **EPUB e-books**: `.epub` files
|
|
101
|
+
- **Excel Spreadsheets**: `.xlsx` files
|
|
102
|
+
- **CSV**: `.csv` files
|
|
103
|
+
- **JSON**: `.json` files
|
|
104
|
+
- **Jupyter notebooks**: `.ipynb` files (markdown + code cells)
|
|
105
|
+
- **YAML**: `.yaml` / `.yml` files
|
|
106
|
+
- **XML**: `.xml` files
|
|
107
|
+
- **LaTeX**: `.tex` files
|
|
108
|
+
- **Source code** (read as plaintext): `.py`, `.js`, `.ts`, `.java`, `.kt`, `.rs`, `.c`/`.cpp`, `.go`, `.rb`, `.php`, `.swift`, `.dart`, `.scala`, and more
|
|
109
|
+
- **Audio** (speech-to-text): `.wav`, `.mp3`, `.m4a`, `.flac`, `.aac`, `.ogg`, and more
|
|
110
|
+
- **Archive Formats**: `.zip`, `.tar.gz`, `.tar.xz`, `.7z`, single-file `.gz`/`.bz2`/`.xz`, etc., with supported documents inside
|
|
111
|
+
|
|
112
|
+
Run `doc2lora formats` to print the full list at any time.
|
|
113
|
+
|
|
114
|
+
## Quick Start
|
|
115
|
+
|
|
116
|
+
### Installation
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
# Core install (training only):
|
|
120
|
+
pip install doc2lora
|
|
121
|
+
|
|
122
|
+
# Everything (all document formats, audio, R2, QLoRA):
|
|
123
|
+
pip install "doc2lora[all]"
|
|
124
|
+
|
|
125
|
+
# Or pick what you need via extras:
|
|
126
|
+
pip install "doc2lora[docs]" # pdf, docx, pptx, odt/ods, rtf, epub, xlsx, 7z
|
|
127
|
+
pip install "doc2lora[audio]" # speech-to-text (also needs the ffmpeg binary for mp3/m4a/aac)
|
|
128
|
+
pip install "doc2lora[r2]" # Cloudflare R2 ingestion
|
|
129
|
+
pip install "doc2lora[quant]" # 4-bit QLoRA (CUDA only)
|
|
130
|
+
|
|
131
|
+
# For local development (editable + dev tools):
|
|
132
|
+
pip install -e ".[all,dev]"
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
> Audio transcription uses the `SpeechRecognition` library (Google Web Speech by
|
|
136
|
+
> default, which needs network access). Non-WAV formats are converted with
|
|
137
|
+
> `pydub`, which requires the system `ffmpeg` binary.
|
|
138
|
+
|
|
139
|
+
### Basic Usage
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
# Test the example
|
|
143
|
+
cd examples
|
|
144
|
+
python basic_usage.py
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
## Library Usage
|
|
148
|
+
|
|
149
|
+
To use the library, you can import it into your project and call the `convert` function with the path to the folder containing your documents, or use `convert_from_r2` to process documents from an R2 bucket. The library will handle the parsing and conversion of the documents into a format suitable for LoRA fine-tuning.
|
|
150
|
+
|
|
151
|
+
The `convert` function now supports multiple input types:
|
|
152
|
+
|
|
153
|
+
- **Folder path**: Pass a path to a folder containing documents
|
|
154
|
+
- **Array of strings**: Pass document content directly as strings
|
|
155
|
+
- **Array of bytes**: Pass document content as byte arrays
|
|
156
|
+
- **Single string**: Pass individual document content
|
|
157
|
+
- **Single bytes**: Pass an individual document as bytes
|
|
158
|
+
|
|
159
|
+
### Subdirectory-Based Labeling
|
|
160
|
+
|
|
161
|
+
`doc2lora` now automatically uses subdirectory structure combined with filenames to create detailed labels, making it easy to organize training data by category.
|
|
162
|
+
|
|
163
|
+
When processing a folder, each document is automatically labeled by combining its subdirectory and filename:
|
|
164
|
+
|
|
165
|
+
```text
|
|
166
|
+
training_data/
|
|
167
|
+
├── legal/                 # Documents labeled as "legal_[filename]"
|
|
168
|
+
│   ├── contract1.pdf      # -> "legal_contract1"
|
|
169
|
+
│   └── agreement.docx     # -> "legal_agreement"
|
|
170
|
+
├── technical/             # Documents labeled as "technical_[filename]"
|
|
171
|
+
│   ├── spec.md            # -> "technical_spec"
|
|
172
|
+
│   └── guide.txt          # -> "technical_guide"
|
|
173
|
+
├── marketing/             # Documents labeled as "marketing_[filename]"
|
|
174
|
+
│   ├── campaign.html      # -> "marketing_campaign"
|
|
175
|
+
│   └── copy.txt           # -> "marketing_copy"
|
|
176
|
+
└── overview.txt           # Root-level files → "root_overview"
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
**Generated metadata includes:**
|
|
180
|
+
|
|
181
|
+
```json
|
|
182
|
+
{
|
|
183
|
+
"content": "Document content...",
|
|
184
|
+
"filename": "contract1.pdf",
|
|
185
|
+
"label": "legal_contract1",
|
|
186
|
+
"category_path": "legal",
|
|
187
|
+
"extension": ".pdf",
|
|
188
|
+
"size": 1024
|
|
189
|
+
}
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
**Use Cases:**
|
|
193
|
+
|
|
194
|
+
- **Domain + Document type**: legal_contract, legal_agreement, technical_spec, technical_guide
|
|
195
|
+
- **Difficulty + Topic**: beginner_python, intermediate_javascript, advanced_algorithms
|
|
196
|
+
- **Type + Content**: manual_installation, faq_troubleshooting, tutorial_setup
|
|
197
|
+
- **Language + Region**: en_privacy_policy, es_terms_service, fr_user_guide
|
|
198
|
+
- **Time + Event**: 2023_quarterly_report, 2024_annual_summary, current_status
|
|
199
|
+
|
|
200
|
+
```bash
|
|
201
|
+
# See the labeling feature in action
|
|
202
|
+
cd examples
|
|
203
|
+
python subdirectory_labeling_demo.py
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
### Local Documents
|
|
207
|
+
|
|
208
|
+
```py
|
|
209
|
+
from doc2lora import convert
|
|
210
|
+
|
|
211
|
+
# Method 1: Convert a folder of documents
|
|
212
|
+
convert(documents_path="path/to/documents", output_path="path/to/output.json")
|
|
213
|
+
|
|
214
|
+
# Method 2: Convert array of strings directly
|
|
215
|
+
documents = [
|
|
216
|
+
"This is document 1 content...",
|
|
217
|
+
"This is document 2 content...",
|
|
218
|
+
"This is document 3 content..."
|
|
219
|
+
]
|
|
220
|
+
convert(input_data=documents, output_path="path/to/output.json")
|
|
221
|
+
|
|
222
|
+
# Method 3: Convert single string
|
|
223
|
+
document_content = "This is my document content..."
|
|
224
|
+
convert(input_data=document_content, output_path="path/to/output.json")
|
|
225
|
+
|
|
226
|
+
# Method 4: Convert array of bytes
|
|
227
|
+
with open("doc1.txt", "rb") as f1, open("doc2.txt", "rb") as f2:
|
|
228
|
+
byte_documents = [f1.read(), f2.read()]
|
|
229
|
+
convert(input_data=byte_documents, output_path="path/to/output.json")
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
### R2 Bucket Documents
|
|
233
|
+
|
|
234
|
+
```py
|
|
235
|
+
from doc2lora import convert_from_r2
|
|
236
|
+
|
|
237
|
+
# Method 1: Direct credentials
|
|
238
|
+
convert_from_r2(
|
|
239
|
+
bucket_name="my-documents-bucket",
|
|
240
|
+
folder_prefix="training-docs", # optional
|
|
241
|
+
output_path="path/to/output.json",
|
|
242
|
+
aws_access_key_id="your-access-key",
|
|
243
|
+
aws_secret_access_key="your-secret-key",
|
|
244
|
+
endpoint_url="https://your-account.r2.cloudflarestorage.com"
|
|
245
|
+
)
|
|
246
|
+
|
|
247
|
+
# Method 2: Using .env file (recommended)
|
|
248
|
+
convert_from_r2(
|
|
249
|
+
bucket_name="my-documents-bucket",
|
|
250
|
+
folder_prefix="training-docs", # optional
|
|
251
|
+
output_path="path/to/output.json",
|
|
252
|
+
env_file=".env" # Load credentials from .env file
|
|
253
|
+
)
|
|
254
|
+
|
|
255
|
+
# The output will be a JSON file containing the LoRA adapter data
|
|
256
|
+
# You can then use this output with your LLM fine-tuning framework
|
|
257
|
+
# For example, with Cloudflare Workers AI:
|
|
258
|
+
from cloudflare_workers_ai import LLM
|
|
259
|
+
llm = LLM(model="your-model-name")
|
|
260
|
+
llm.load_lora_adapter("path/to/output.json")
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
## CLI
|
|
264
|
+
|
|
265
|
+
You can also use the library from the command line. The CLI allows you to convert a folder of documents or R2 bucket contents into a LoRA adapter JSON file.
|
|
266
|
+
|
|
267
|
+
### CLI for Local Documents
|
|
268
|
+
|
|
269
|
+
```bash
|
|
270
|
+
doc2lora convert path/to/documents --output path/to/output.json
|
|
271
|
+
|
|
272
|
+
# scan first to preview files + a rough training-time estimate
|
|
273
|
+
doc2lora scan path/to/documents --device cpu
|
|
274
|
+
|
|
275
|
+
# low-memory machine: use a smaller batch plus gradient accumulation (gradient
|
|
276
|
+
# checkpointing is already on by default). 4-bit QLoRA is available on CUDA via --load-in-4bit
|
|
277
|
+
doc2lora convert path/to/documents \
|
|
278
|
+
--batch-size 1 --gradient-accumulation-steps 8 \
|
|
279
|
+
--output adapter.json
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
### Deploy to Cloudflare Workers AI
|
|
283
|
+
|
|
284
|
+
Once you have an adapter, upload it as a Workers AI finetune with one command:
|
|
285
|
+
|
|
286
|
+
```bash
|
|
287
|
+
# uses the wrangler CLI under the hood (validates the adapter first)
|
|
288
|
+
doc2lora deploy adapter.json my-finetune-name \
|
|
289
|
+
--cf-model "@cf/mistralai/mistral-7b-instruct-v0.2-lora"
|
|
290
|
+
|
|
291
|
+
# or upload via the REST API (no wrangler needed)
|
|
292
|
+
doc2lora deploy adapter.json my-finetune-name --backend rest \
|
|
293
|
+
--account-id "$CLOUDFLARE_ACCOUNT_ID" --api-token "$CLOUDFLARE_API_TOKEN"
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
Then reference it at inference time with the `lora` parameter
|
|
297
|
+
(`env.AI.run("@cf/mistralai/mistral-7b-instruct-v0.2-lora", { ..., lora: "my-finetune-name" })`).
|
|
298
|
+
|
|
299
|
+
### CLI for R2 Bucket Documents
|
|
300
|
+
|
|
301
|
+
```bash
|
|
302
|
+
# Method 1: Set environment variables for credentials
|
|
303
|
+
export R2_ACCESS_KEY_ID="your-access-key"
|
|
304
|
+
export R2_SECRET_ACCESS_KEY="your-secret-key"
|
|
305
|
+
export R2_ENDPOINT_URL="https://your-account.r2.cloudflarestorage.com"
|
|
306
|
+
|
|
307
|
+
# Convert documents from R2 bucket
|
|
308
|
+
doc2lora convert-r2 my-documents-bucket --folder-prefix training-docs --output path/to/output.json
|
|
309
|
+
|
|
310
|
+
# Method 2: Use .env file (recommended)
|
|
311
|
+
doc2lora convert-r2 my-documents-bucket \
|
|
312
|
+
--env-file .env \
|
|
313
|
+
--folder-prefix training-docs \
|
|
314
|
+
--output path/to/output.json
|
|
315
|
+
|
|
316
|
+
# Method 3: Pass credentials directly
|
|
317
|
+
doc2lora convert-r2 my-documents-bucket \
|
|
318
|
+
--r2-access-key-id "your-access-key" \
|
|
319
|
+
--r2-secret-access-key "your-secret-key" \
|
|
320
|
+
--endpoint-url "https://your-account.r2.cloudflarestorage.com" \
|
|
321
|
+
--output path/to/output.json
|
|
322
|
+
```
|
|
323
|
+
|
|
324
|
+
## Project Structure
|
|
325
|
+
|
|
326
|
+
```text
|
|
327
|
+
doc2lora/
|
|
328
|
+
├── doc2lora/                          # Main package
|
|
329
|
+
│   ├── __init__.py                    # Package initialization
|
|
330
|
+
│   ├── core.py                        # Main convert function
|
|
331
|
+
│   ├── parsers.py                     # Document parsing logic
|
|
332
|
+
│   ├── lora_trainer.py                # LoRA training implementation
|
|
333
|
+
│   ├── cli.py                         # Command-line interface
|
|
334
|
+
│   └── utils.py                       # Utility functions
|
|
335
|
+
├── examples/                          # Example usage
|
|
336
|
+
│   ├── basic_usage.py                 # Working example script
|
|
337
|
+
│   ├── subdirectory_labeling_demo.py  # Subdirectory labeling demonstration
|
|
338
|
+
│   ├── mistral_usage.py               # Mistral model example with HF API key
|
|
339
|
+
│   ├── gemma_usage.py                 # Gemma model example for Cloudflare AI
|
|
340
|
+
│   ├── llama_usage.py                 # Llama model example for Cloudflare AI
|
|
341
|
+
│   ├── r2_usage.py                    # R2 bucket integration example
|
|
342
|
+
│   └── example_documents/             # Sample documents
|
|
343
|
+
│       ├── sample.md
|
|
344
|
+
│       ├── sample.txt
|
|
345
|
+
│       ├── sample.json
|
|
346
|
+
│       └── sample.csv
|
|
347
|
+
├── demo/                              # Complete working demonstration
|
|
348
|
+
│   ├── data/                          # Sample training documents about software development
|
|
349
|
+
│   ├── scripts/                       # Automation scripts (train_lora.sh/.bat, deploy_to_r2.sh/.bat)
|
|
350
|
+
│   ├── worker.js                      # Cloudflare Worker implementation
|
|
351
|
+
│   ├── wrangler.toml                  # Cloudflare Worker configuration
|
|
352
|
+
│   ├── index.html                     # Web interface for testing
|
|
353
|
+
│   └── README.md                      # Demo documentation
|
|
354
|
+
├── tests/                             # Test suite
|
|
355
|
+
├── requirements.txt                   # Dependencies
|
|
356
|
+
├── setup.py                           # Package setup
|
|
357
|
+
└── README.md                          # This file
|
|
358
|
+
```
|
|
359
|
+
|
|
360
|
+
## Examples
|
|
361
|
+
|
|
362
|
+
The `examples/` directory contains usage examples for different models and scenarios:
|
|
363
|
+
|
|
364
|
+
### Model-Specific Examples
|
|
365
|
+
|
|
366
|
+
1. **`mistral_usage.py`** - Complete example for Mistral models with HuggingFace authentication
|
|
367
|
+
|
|
368
|
+
```bash
|
|
369
|
+
cd examples
|
|
370
|
+
export HF_API_KEY="your_huggingface_token" # Required for Mistral models
|
|
371
|
+
python mistral_usage.py
|
|
372
|
+
```
|
|
373
|
+
|
|
374
|
+
2. **`gemma_usage.py`** - Google Gemma model fine-tuning for Cloudflare Workers AI
|
|
375
|
+
|
|
376
|
+
```bash
|
|
377
|
+
cd examples
|
|
378
|
+
python gemma_usage.py
|
|
379
|
+
```
|
|
380
|
+
|
|
381
|
+
3. **`llama_usage.py`** - Meta Llama 2 model fine-tuning with optimized parameters
|
|
382
|
+
|
|
383
|
+
```bash
|
|
384
|
+
cd examples
|
|
385
|
+
python llama_usage.py
|
|
386
|
+
```
|
|
387
|
+
|
|
388
|
+
4. **`r2_usage.py`** - R2 bucket integration with .env file support
|
|
389
|
+
|
|
390
|
+
```bash
|
|
391
|
+
cd examples
|
|
392
|
+
python r2_usage.py
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
5. **`qlora_usage.py`** - Memory-efficient 4-bit QLoRA training (CUDA) + deploy
|
|
396
|
+
|
|
397
|
+
```bash
|
|
398
|
+
cd examples
|
|
399
|
+
python qlora_usage.py
|
|
400
|
+
```
|
|
401
|
+
|
|
402
|
+
6. **`qwq_usage.py`** - Fine-tuning the QwQ-32B reasoning model
|
|
403
|
+
(`@cf/qwen/qwq-32b`) with 4-bit QLoRA; needs a 24 GB+ NVIDIA GPU
|
|
404
|
+
|
|
405
|
+
```bash
|
|
406
|
+
cd examples
|
|
407
|
+
python qwq_usage.py
|
|
408
|
+
```
|
|
409
|
+
|
|
410
|
+
### Demo Application
|
|
411
|
+
|
|
412
|
+
The `demo/` folder contains a complete working demonstration of a Cloudflare Worker using a custom LoRA adapter:
|
|
413
|
+
|
|
414
|
+
```bash
|
|
415
|
+
# 1. Train a LoRA adapter on software development data
|
|
416
|
+
cd demo
|
|
417
|
+
./scripts/train_lora.sh # or train_lora.bat on Windows
|
|
418
|
+
|
|
419
|
+
# 2. Deploy the adapter to R2 bucket
|
|
420
|
+
./scripts/deploy_to_r2.sh # or deploy_to_r2.bat on Windows
|
|
421
|
+
|
|
422
|
+
# 3. Deploy the Cloudflare Worker
|
|
423
|
+
./scripts/wrangler_deploy.sh # or wrangler_deploy.bat on Windows
|
|
424
|
+
```
|
|
425
|
+
|
|
426
|
+
The demo creates a **Software Developer Assistant** AI that provides guidance on:
|
|
427
|
+
|
|
428
|
+
- Code development and architecture
|
|
429
|
+
- Debugging and troubleshooting
|
|
430
|
+
- Team collaboration and communication
|
|
431
|
+
- Professional growth and career development
|
|
432
|
+
- Technical decision-making
|
|
433
|
+
|
|
434
|
+
**API Endpoints:**
|
|
435
|
+
|
|
436
|
+
- `GET /health` - Health check
|
|
437
|
+
- `POST /chat` - Send message and get response
|
|
438
|
+
- `POST /chat/stream` - Streaming responses
|
|
439
|
+
- `GET /docs` - API documentation
|
|
440
|
+
|
|
441
|
+
## Configuration
|
|
442
|
+
|
|
443
|
+
### GPU Support
|
|
444
|
+
|
|
445
|
+
🚀 **Automatic GPU Detection**: doc2lora now automatically detects and uses the best available device for training:
|
|
446
|
+
|
|
447
|
+
**Device Priority (Automatic):**
|
|
448
|
+
|
|
449
|
+
1. 🚀 **NVIDIA GPU (CUDA)** - Fastest training with fp16 precision and optimal memory usage
|
|
450
|
+
2. 🍎 **Apple Silicon (MPS)** - Good performance on Mac M1/M2/M3
|
|
451
|
+
3. 💻 **CPU** - Reliable fallback, works everywhere
|
|
452
|
+
|
|
453
|
+
**Automatic Detection (Recommended):**
|
|
454
|
+
|
|
455
|
+
```bash
|
|
456
|
+
# Will automatically use GPU if available, fallback to CPU
|
|
457
|
+
doc2lora convert ./docs --output adapter.json
|
|
458
|
+
```
|
|
459
|
+
|
|
460
|
+
**Manual Device Selection:**
|
|
461
|
+
|
|
462
|
+
```bash
|
|
463
|
+
# Force GPU usage
|
|
464
|
+
doc2lora convert ./docs --output adapter.json --device cuda
|
|
465
|
+
|
|
466
|
+
# Force CPU usage (useful for troubleshooting)
|
|
467
|
+
doc2lora convert ./docs --output adapter.json --device cpu
|
|
468
|
+
|
|
469
|
+
# Use Apple Silicon GPU (Mac M1/M2/M3)
|
|
470
|
+
doc2lora convert ./docs --output adapter.json --device mps
|
|
471
|
+
```
|
|
472
|
+
|
|
473
|
+
**Python API:**
|
|
474
|
+
|
|
475
|
+
```python
|
|
476
|
+
from doc2lora import convert
|
|
477
|
+
|
|
478
|
+
# Auto-detect device (recommended)
|
|
479
|
+
convert(documents_path="./docs", output_path="adapter.json")
|
|
480
|
+
|
|
481
|
+
# Specify device manually
|
|
482
|
+
convert(documents_path="./docs", output_path="adapter.json", device="cuda")
|
|
483
|
+
convert(documents_path="./docs", output_path="adapter.json", device="cpu")
|
|
484
|
+
convert(documents_path="./docs", output_path="adapter.json", device="mps") # Apple Silicon
|
|
485
|
+
```
|
|
486
|
+
|
|
487
|
+
**GPU Requirements:**
|
|
488
|
+
|
|
489
|
+
- **NVIDIA GPUs**: Requires CUDA-compatible PyTorch installation
|
|
490
|
+
- **Apple Silicon**: Requires PyTorch with MPS support (automatically included on macOS)
|
|
491
|
+
- **Memory**: 8GB+ GPU memory recommended for larger models
|
|
492
|
+
|
|
493
|
+
### Training Parameters
|
|
494
|
+
|
|
495
|
+
Common configuration options:
|
|
496
|
+
|
|
497
|
+
```bash
|
|
498
|
+
doc2lora convert ./docs \
|
|
499
|
+
--model mistralai/Mistral-7B-Instruct-v0.2 \
|
|
500
|
+
--batch-size 2 \
|
|
501
|
+
--epochs 3 \
|
|
502
|
+
--learning-rate 2e-4 \
|
|
503
|
+
--lora-r 8 \
|
|
504
|
+
--lora-alpha 16 \
|
|
505
|
+
--gradient-accumulation-steps 4 \
|
|
506
|
+
--device auto # or cuda/mps/cpu
|
|
507
|
+
```
|
|
508
|
+
|
|
509
|
+
**LoRA rank:** the default is `8` (broadest compatibility). Cloudflare Workers AI
|
|
510
|
+
now accepts adapters up to **rank 32** (with a 300MB safetensors limit), so you can
|
|
511
|
+
raise `--lora-r` up to 32 for more capacity; doc2lora only warns above 32.
|
|
512
|
+
|
|
513
|
+
**Performance / low-resource options:**
|
|
514
|
+
|
|
515
|
+
- ⚡ **Gradient checkpointing** (on by default): trades ~20% compute for a large
|
|
516
|
+
memory saving. Disable with `--no-gradient-checkpointing`.
|
|
517
|
+
- 🧮 **Gradient accumulation**: `--gradient-accumulation-steps N` emulates a larger
|
|
518
|
+
effective batch (`batch_size * N`) without the memory cost - ideal on weak machines.
|
|
519
|
+
- 🪶 **4-bit QLoRA**: `--load-in-4bit` (CUDA + `pip install "doc2lora[quant]"`) loads
|
|
520
|
+
the base model in 4-bit (nf4) so large models fit on small GPUs.
|
|
521
|
+
- 📐 **Precision**: bf16 on capable CUDA hardware, fp16 on other GPUs, fp32 on CPU.
|
|
522
|
+
- 💻 **Out of Memory**: reduce `--batch-size`, raise `--gradient-accumulation-steps`,
|
|
523
|
+
or fall back with `--device cpu` (CUDA OOM also auto-falls back to CPU).
|
|
524
|
+
|
|
525
|
+
### How long will training take?
|
|
526
|
+
|
|
527
|
+
All numbers below are **order-of-magnitude estimates** and vary widely with
|
|
528
|
+
sequence length, batch size, LoRA rank, and data shape. `doc2lora scan <dir>
|
|
529
|
+
--device <d>` prints an estimate for your own corpus.
|
|
530
|
+
|
|
531
|
+
#### Small base model (DialoGPT-small / GPT-2 class), 3 epochs
|
|
532
|
+
|
|
533
|
+
| Corpus size | CPU | Apple MPS | NVIDIA CUDA |
|
|
534
|
+
| ----------- | --------- | --------- | ----------- |
|
|
535
|
+
| ~1 MB | minutes | ~1 min | seconds |
|
|
536
|
+
| ~10 MB | ~1 hour | ~10 min | ~2 min |
|
|
537
|
+
| ~100 MB | many hrs | ~1-2 hrs | ~20 min |
|
|
538
|
+
|
|
539
|
+
#### 7B-class model (Mistral / Gemma / Llama) vs hardware and VRAM
|
|
540
|
+
|
|
541
|
+
Times below are for **3 epochs** at ~512-token sequences. The "approach" column
|
|
542
|
+
reflects what fits in memory:
|
|
543
|
+
|
|
544
|
+
- **>= 24 GB VRAM**: full fp16/bf16 LoRA fits comfortably.
|
|
545
|
+
- **12 GB VRAM**: use 4-bit QLoRA (`--load-in-4bit`) to fit a 7B model.
|
|
546
|
+
- **Apple Silicon**: 4-bit QLoRA is CUDA-only (bitsandbytes), so MPS runs **fp16
|
|
547
|
+
LoRA** and needs ~18 GB+ unified memory for a 7B model; 8 GB Macs cannot train
|
|
548
|
+
7B (use a smaller base model). MPS is also much slower than a discrete GPU.
|
|
549
|
+
|
|
550
|
+
| Hardware | Memory | 7B approach | 1 MB | 10 MB | 100 MB |
|
|
551
|
+
| ---------- | ------------------ | ------------------------ | -------- | -------- | --------- |
|
|
552
|
+
| Apple M2 | 8-24 GB unified | fp16 LoRA (16 GB+ for 7B)| ~1 hr | ~11 hrs | ~4-5 days |
|
|
553
|
+
| Apple M3 | 8-128 GB unified | fp16 LoRA | ~40 min | ~6 hrs | ~2-3 days |
|
|
554
|
+
| Apple M4 | 16-128 GB unified | fp16 LoRA | ~25 min | ~4 hrs | ~1.5 days |
|
|
555
|
+
| RTX 4070 | 12 GB | QLoRA (4-bit) required | ~10 min | ~1.5 hrs | ~17 hrs |
|
|
556
|
+
| RTX 5070 | 12 GB | QLoRA (4-bit) required | ~7 min | ~1.2 hrs | ~12 hrs |
|
|
557
|
+
| RTX 3090 | 24 GB | full fp16 LoRA | ~7 min | ~1 hr | ~11 hrs |
|
|
558
|
+
| RTX 4090 | 24 GB | full fp16 LoRA | ~4 min | ~35 min | ~6 hrs |
|
|
559
|
+
| RTX 5090 | 32 GB | full fp16 LoRA | ~2 min | ~20 min | ~3-4 hrs |
|
|
560
|
+
|
|
561
|
+
> For LoRA you usually get better results from a few hundred to a few thousand
|
|
562
|
+
> curated examples than from a huge corpus - data quality beats data quantity.
|
|
563
|
+
> The small-model table above is ~20-40x faster if you only need a lightweight
|
|
564
|
+
> adapter.
|
|
565
|
+
|
|
566
|
+
#### 32B-class model (QwQ-32B) vs hardware and VRAM
|
|
567
|
+
|
|
568
|
+
QwQ-32B (`@cf/qwen/qwq-32b`) also accepts BYO LoRA adapters. A 32B base is roughly
|
|
569
|
+
4-5x slower than 7B and only fits with **4-bit QLoRA**, which needs ~20-24 GB of
|
|
570
|
+
VRAM - so it is realistically a 24 GB+ NVIDIA job. Times are for **3 epochs** at
|
|
571
|
+
~512-token sequences (see `examples/qwq_usage.py`).
|
|
572
|
+
|
|
573
|
+
| Hardware | Memory | 32B approach | 1 MB | 10 MB | 100 MB |
|
|
574
|
+
| --------------- | -------- | ------------------------ | -------- | -------- | -------- |
|
|
575
|
+
| Apple M2/M3/M4 | unified | not practical (no 4-bit) | - | - | - |
|
|
576
|
+
| RTX 4070 / 5070 | 12 GB | too small for 32B | - | - | - |
|
|
577
|
+
| RTX 3090 | 24 GB | QLoRA (4-bit), tight | ~30 min | ~4.5 hrs | ~2 days |
|
|
578
|
+
| RTX 4090 | 24 GB | QLoRA (4-bit) | ~18 min | ~2.5 hrs | ~1 day |
|
|
579
|
+
| RTX 5090 | 32 GB | QLoRA (4-bit), roomy | ~9 min | ~1.5 hrs | ~15 hrs |
|
|
580
|
+
|
|
581
|
+
> A rank-8..32 adapter on a 32B model is still well under Cloudflare's 300 MB
|
|
582
|
+
> safetensors limit. doc2lora tags Qwen/QwQ adapters with `model_type: qwen`
|
|
583
|
+
> automatically; deploy with `--cf-model "@cf/qwen/qwq-32b"`.
|
|
584
|
+
|
|
585
|
+
## Features
|
|
586
|
+
|
|
587
|
+
- ✅ **Document Parsing**: Recursively scan directories for supported document types
|
|
588
|
+
- ✅ **Subdirectory Labeling**: Automatically label documents based on directory structure and filename
|
|
589
|
+
- ✅ **Multiple Formats**: Support for 16+ document formats including archives
|
|
590
|
+
- ✅ **Archive Support**: Extract and parse documents from ZIP and TAR archives
|
|
591
|
+
- ✅ **R2 Bucket Support**: Direct integration with Cloudflare R2 storage buckets
|
|
592
|
+
- ✅ **CLI Interface**: Easy-to-use command-line interface
|
|
593
|
+
- ✅ **Flexible Configuration**: Customizable LoRA parameters
|
|
594
|
+
- 🔄 **LoRA Training**: Fine-tune models using LoRA adaptation (requires ML dependencies)
|
|
595
|
+
- 🔄 **Export Options**: JSON format compatible with various platforms
|
|
596
|
+
|
|
597
|
+
## Status
|
|
598
|
+
|
|
599
|
+
- **Document Parsing**: ✅ Fully working
|
|
600
|
+
- **CLI Interface**: ✅ Basic functionality working
|
|
601
|
+
- **LoRA Training**: 🔄 Requires ML dependencies (torch, transformers, peft, datasets)
|
|
602
|
+
|
|
603
|
+
The core document parsing functionality works out of the box. For full LoRA training capabilities, install the ML dependencies listed above.
|