clean-web-scraper 4.3.4 → 4.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm.yml +1 -1
- package/README.md +16 -19
- package/example-usage.js +30 -28
- package/main.js +0 -1
- package/package.json +3 -3
- package/fine-tuning/fine-tuning.md +0 -333
- package/fine-tuning/llama-factory-colab-simple.py +0 -35
- package/fine-tuning/unsloth-SmolLM2-135M-simple.colab.py +0 -182
- package/fine-tuning/unsloth-llama3.2-1b-simple.colab.py +0 -174
- package/fine-tuning/unsloth-llama3.2-1b-simple.py +0 -178
package/README.md
CHANGED
|
@@ -36,10 +36,7 @@ npm i clean-web-scraper
|
|
|
36
36
|
git clone https://github.com/mlibre/Clean-Web-Scraper
|
|
37
37
|
cd Clean-Web-Scraper
|
|
38
38
|
sudo pacman -S extra/xorg-server-xvfb chromium
|
|
39
|
-
npm install
|
|
40
|
-
|
|
41
|
-
# Skip chromium download during npm installation
|
|
42
|
-
# npm install --ignore-scripts
|
|
39
|
+
npm install --ignore-scripts
|
|
43
40
|
```
|
|
44
41
|
|
|
45
42
|
## 💻 Usage
|
|
@@ -120,7 +117,7 @@ node example-usage.js
|
|
|
120
117
|
|
|
121
118
|
## 📤 Output
|
|
122
119
|
|
|
123
|
-
|
|
120
|
+
The content is saved in a clean, structured format:
|
|
124
121
|
|
|
125
122
|
- 📁 Base folder: `./folderPath/example.com/`
|
|
126
123
|
- 📑 Files preserve original URL paths
|
|
@@ -130,36 +127,36 @@ Your AI-ready content is saved in a clean, structured format:
|
|
|
130
127
|
```bash
|
|
131
128
|
example.com/
|
|
132
129
|
├── website/
|
|
133
|
-
│ ├── page1.txt
|
|
134
|
-
│ ├── page1.json
|
|
135
|
-
│ ├── page1.html
|
|
130
|
+
│ ├── page1.txt # Clean text content
|
|
131
|
+
│ ├── page1.json # Full metadata
|
|
132
|
+
│ ├── page1.html # Original HTML content
|
|
136
133
|
│ └── blog/
|
|
137
134
|
│ ├── post1.txt
|
|
138
135
|
│ └── post1.json
|
|
139
136
|
│ └── post1.html
|
|
140
|
-
├── texts/
|
|
137
|
+
├── texts/ # Numbered text files
|
|
141
138
|
│ ├── 1.txt
|
|
142
139
|
│ └── 2.txt
|
|
143
|
-
├── texts_with_metadata/
|
|
140
|
+
├── texts_with_metadata/ # When includeMetadata is true
|
|
144
141
|
│ ├── 1.txt
|
|
145
142
|
│ └── 2.txt
|
|
146
|
-
├── train.jsonl
|
|
147
|
-
├── train_with_metadata.jsonl
|
|
148
|
-
├── train.csv
|
|
149
|
-
└── train_with_metadata.csv
|
|
143
|
+
├── train.jsonl # Combined content
|
|
144
|
+
├── train_with_metadata.jsonl # When includeMetadata is true
|
|
145
|
+
├── train.csv # Clean text in CSV format
|
|
146
|
+
└── train_with_metadata.csv # When includeMetadata is true
|
|
150
147
|
|
|
151
148
|
combined/
|
|
152
|
-
├── texts/
|
|
149
|
+
├── texts/ # Combined numbered text files
|
|
153
150
|
│ ├── 1.txt
|
|
154
151
|
│ ├── 2.txt
|
|
155
152
|
│ └── n.txt
|
|
156
|
-
├── texts_with_metadata/
|
|
153
|
+
├── texts_with_metadata/ # Combined metadata text files
|
|
157
154
|
│ ├── 1.txt
|
|
158
155
|
│ ├── 2.txt
|
|
159
156
|
│ └── n.txt
|
|
160
|
-
├── combined.jsonl
|
|
157
|
+
├── combined.jsonl # Combined JSONL content
|
|
161
158
|
├── combined_with_metadata.jsonl
|
|
162
|
-
├── combined.csv
|
|
159
|
+
├── combined.csv # Combined CSV content
|
|
163
160
|
└── combined_with_metadata.csv
|
|
164
161
|
```
|
|
165
162
|
|
|
@@ -200,7 +197,7 @@ The actual article content starts here. This is the clean, processed text of the
|
|
|
200
197
|
{"text": "Another article", "metadata": {"articleTitle": "Second Page", "author": "Jane Smith"}}
|
|
201
198
|
```
|
|
202
199
|
|
|
203
|
-
### 🗃️ JSON Files In Website
|
|
200
|
+
### 🗃️ JSON Files In Website Directory (*.json)
|
|
204
201
|
|
|
205
202
|
```json
|
|
206
203
|
{
|
package/example-usage.js
CHANGED
|
@@ -49,7 +49,7 @@ async function khameneiIrFreePalestineTag ( enable )
|
|
|
49
49
|
baseURL: "https://english.khamenei.ir/news",
|
|
50
50
|
startURL: "https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
|
|
51
51
|
maxDepth: 1,
|
|
52
|
-
maxArticles:
|
|
52
|
+
maxArticles: 300,
|
|
53
53
|
exactExcludeList: [
|
|
54
54
|
"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100#",
|
|
55
55
|
"https://english.khamenei.ir/page/search.xhtml?topicid=0&period=0&q=FreePalestine&pageSize=100"
|
|
@@ -72,7 +72,7 @@ async function khameneiIrPalestineSpecialPage ( enable )
|
|
|
72
72
|
baseURL: "https://english.khamenei.ir/news",
|
|
73
73
|
startURL: "https://english.khamenei.ir/palestine-special-page",
|
|
74
74
|
maxDepth: 1,
|
|
75
|
-
maxArticles:
|
|
75
|
+
maxArticles: 300,
|
|
76
76
|
exactExcludeList: [
|
|
77
77
|
"https://english.khamenei.ir/palestine-special-page/"
|
|
78
78
|
],
|
|
@@ -102,7 +102,7 @@ async function decolonizepalestine ( enable )
|
|
|
102
102
|
"https://decolonizepalestine.com/rainbow-washing",
|
|
103
103
|
"https://decolonizepalestine.com/"
|
|
104
104
|
],
|
|
105
|
-
maxArticles:
|
|
105
|
+
maxArticles: 500,
|
|
106
106
|
scrapResultPath: "./dataset/decolonizepalestine/website",
|
|
107
107
|
jsonlOutputPath: "./dataset/decolonizepalestine/train.jsonl",
|
|
108
108
|
textOutputPath: "./dataset/decolonizepalestine/texts",
|
|
@@ -149,18 +149,18 @@ async function electronicintifada ( enable )
|
|
|
149
149
|
csvOutputPath: "./dataset/electronicintifada/train.csv",
|
|
150
150
|
includeMetadata: true,
|
|
151
151
|
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
|
152
|
-
maxArticles:
|
|
152
|
+
maxArticles: 3000,
|
|
153
153
|
maxDepth: 16,
|
|
154
154
|
batchSize: 40,
|
|
155
155
|
axiosHeaders: headers,
|
|
156
156
|
axiosMaxRetries: 2,
|
|
157
157
|
axiosRetryDelay: 8000,
|
|
158
|
-
axiosProxy: {
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
},
|
|
163
|
-
useProxyAsFallback: true,
|
|
158
|
+
// axiosProxy: {
|
|
159
|
+
// host: "localhost",
|
|
160
|
+
// port: 10808,
|
|
161
|
+
// protocol: "socks5"
|
|
162
|
+
// },
|
|
163
|
+
// useProxyAsFallback: true,
|
|
164
164
|
};
|
|
165
165
|
return await runScraper( config, enable );
|
|
166
166
|
}
|
|
@@ -228,18 +228,18 @@ async function mondoweiss ( enable )
|
|
|
228
228
|
csvOutputPath: "./dataset/mondoweiss/train.csv",
|
|
229
229
|
includeMetadata: true,
|
|
230
230
|
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
|
231
|
-
maxArticles:
|
|
231
|
+
maxArticles: 3000,
|
|
232
232
|
maxDepth: 15,
|
|
233
233
|
batchSize: 20,
|
|
234
234
|
axiosHeaders: headers,
|
|
235
235
|
axiosMaxRetries: 2,
|
|
236
236
|
axiosRetryDelay: 10000,
|
|
237
|
-
axiosProxy: {
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
},
|
|
242
|
-
useProxyAsFallback: true,
|
|
237
|
+
// axiosProxy: {
|
|
238
|
+
// host: "localhost",
|
|
239
|
+
// port: 10808,
|
|
240
|
+
// protocol: "socks5"
|
|
241
|
+
// },
|
|
242
|
+
// useProxyAsFallback: true,
|
|
243
243
|
};
|
|
244
244
|
return await runScraper( config, enable );
|
|
245
245
|
}
|
|
@@ -263,13 +263,15 @@ async function bdsmovement ( enable )
|
|
|
263
263
|
"https://bdsmovement.net/news-type",
|
|
264
264
|
"https://bdsmovement.net/cdn-cgi",
|
|
265
265
|
"https://bdsmovement.net/es/",
|
|
266
|
-
"https://bdsmovement.net/ar/"
|
|
266
|
+
"https://bdsmovement.net/ar/",
|
|
267
|
+
"https://bdsmovement.net/resource-type/",
|
|
267
268
|
],
|
|
268
269
|
exactExcludeList: [
|
|
269
270
|
"https://bdsmovement.net/",
|
|
270
271
|
"https://bdsmovement.net/shutdownnation",
|
|
271
272
|
"https://bdsmovement.net/campaigns",
|
|
272
273
|
"https://bdsmovement.net/resources",
|
|
274
|
+
"https://bdsmovement.net/news",
|
|
273
275
|
/^https:\/\/bdsmovement\.net\/resources\?page=\d+$/,
|
|
274
276
|
/^https:\/\/bdsmovement\.net\/resources\?campaign=\d+$/,
|
|
275
277
|
/^https:\/\/bdsmovement\.net\/resources\?type=\d+$/,
|
|
@@ -283,18 +285,18 @@ async function bdsmovement ( enable )
|
|
|
283
285
|
csvOutputPath: "./dataset/bdsmovement/train.csv",
|
|
284
286
|
includeMetadata: true,
|
|
285
287
|
metadataFields: ["author", "articleTitle", "pageTitle", "description", "dataScrapedDate", "url"],
|
|
286
|
-
maxArticles:
|
|
288
|
+
maxArticles: 3000,
|
|
287
289
|
maxDepth: 16,
|
|
288
|
-
batchSize:
|
|
290
|
+
batchSize: 100,
|
|
289
291
|
axiosHeaders: headers,
|
|
290
292
|
axiosMaxRetries: 2,
|
|
291
293
|
axiosRetryDelay: 8000,
|
|
292
|
-
axiosProxy: {
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
},
|
|
297
|
-
useProxyAsFallback: true
|
|
294
|
+
// axiosProxy: {
|
|
295
|
+
// host: "localhost",
|
|
296
|
+
// port: 10808,
|
|
297
|
+
// protocol: "socks5"
|
|
298
|
+
// },
|
|
299
|
+
// useProxyAsFallback: true
|
|
298
300
|
};
|
|
299
301
|
return await runScraper( config, enable );
|
|
300
302
|
}
|
|
@@ -332,8 +334,8 @@ async function palestineremembered ( enable )
|
|
|
332
334
|
batchSize: 10,
|
|
333
335
|
axiosProxy: {
|
|
334
336
|
host: "localhost",
|
|
335
|
-
port:
|
|
336
|
-
protocol: "
|
|
337
|
+
port: 10808,
|
|
338
|
+
protocol: "socks5"
|
|
337
339
|
}
|
|
338
340
|
};
|
|
339
341
|
return await runScraper( config, enable );
|
package/main.js
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "clean-web-scraper",
|
|
3
|
-
"version": "4.
|
|
3
|
+
"version": "4.4.0",
|
|
4
4
|
"main": "main.js",
|
|
5
5
|
"scripts": {
|
|
6
6
|
"start": "node main.js",
|
|
@@ -24,8 +24,8 @@
|
|
|
24
24
|
"description": "",
|
|
25
25
|
"dependencies": {
|
|
26
26
|
"@mozilla/readability": "^0.6.0",
|
|
27
|
-
"axios": "^1.
|
|
28
|
-
"eslint": "^9.
|
|
27
|
+
"axios": "^1.12.2",
|
|
28
|
+
"eslint": "^9.38.0",
|
|
29
29
|
"jsdom": "^26.0.0",
|
|
30
30
|
"puppeteer": "^24.1.1",
|
|
31
31
|
"puppeteer-real-browser": "^1.3.22"
|
|
@@ -1,333 +0,0 @@
|
|
|
1
|
-
# Fine-Tuning LLMs on Raw Text
|
|
2
|
-
|
|
3
|
-
Fine-tuning large language models (LLMs) on raw text allows them to specialize in new knowledge domains.
|
|
4
|
-
This guide walks you through fine-tuning an LLM using JSONL-formatted data, covering data preparation, model training, and deployment.
|
|
5
|
-
We use the [Unsloth](https://docs.unsloth.ai/) library for efficient fine-tuning and demonstrate on a small [SmolLM2-135M](https://huggingface.co/HuggingFaceTB/SmolLM2-135M).
|
|
6
|
-
The final model can be deployed with [Ollama](https://github.com/ollama/ollama) for local inference.
|
|
7
|
-
|
|
8
|
-
📌 **Full Code & Implementation Details**: [GitHub Repository](https://github.com/mlibre/Clean-Web-Scraper/tree/main/fine-tuning)
|
|
9
|
-
|
|
10
|
-
---
|
|
11
|
-
|
|
12
|
-
## 🛠️ Overview of the Process
|
|
13
|
-
|
|
14
|
-
Fine-tuning an LLM involves several steps:
|
|
15
|
-
|
|
16
|
-
### 1️⃣ Data Collection & Preparation
|
|
17
|
-
|
|
18
|
-
First, prepare your dataset in a structured format. Common formats for fine-tuning include **JSONL, CSV, and TXT**.
|
|
19
|
-
In this guide, we use **JSONL** because it's easy to work with and widely used.
|
|
20
|
-
|
|
21
|
-
📄 **Sample JSONL file (`train.jsonl`)**:
|
|
22
|
-
|
|
23
|
-
```json
|
|
24
|
-
{"text": "Despite facing constant oppression, Palestinians have continued to resist Israeli occupation.", "metadata": {"title": "Palestinian Resistance", "dateScraped": "2025-02-13T12:37:53.776Z"}}
|
|
25
|
-
{"text": "Palestinians have shown remarkable resilience.", "metadata": {"title": "Youth Resistance", "dateScraped": "2025-02-13T12:37:53.776Z"}}
|
|
26
|
-
```
|
|
27
|
-
|
|
28
|
-
To scrape data efficiently, we use the [Clean-Web-Scraper](https://github.com/mlibre/Clean-Web-Scraper) library.
|
|
29
|
-
This **Node.js** library extracts articles from websites, cleans them, and saves them in `JSONL` format.
|
|
30
|
-
The dataset is available on [Hugging Face](https://huggingface.co/datasets/mlibre/palestine).
|
|
31
|
-
|
|
32
|
-
---
|
|
33
|
-
|
|
34
|
-
### 2️⃣ Fine-Tuning Library – **Why Unsloth?** 🦥
|
|
35
|
-
|
|
36
|
-
At the time of writing, [Unsloth](https://docs.unsloth.ai/) is one of the **fastest and most memory-efficient** fine-tuning libraries available.
|
|
37
|
-
It supports **fine-tuning and Continued Pretraining (CPT)**, allowing LLMs to learn **new knowledge domains** efficiently.
|
|
38
|
-
|
|
39
|
-
---
|
|
40
|
-
|
|
41
|
-
### 3️⃣ Setting Up the Training Environment 🖥️
|
|
42
|
-
|
|
43
|
-
We use **Google Colab** for training, as it provides free GPU access.
|
|
44
|
-
|
|
45
|
-
---
|
|
46
|
-
|
|
47
|
-
### 4️⃣ The Model 🏗️
|
|
48
|
-
|
|
49
|
-
We use **SmolLM2-135M**, a very small 135M-parameter model, for fine-tuning. To optimize memory, we load the model in **4-bit quantization** using `Unsloth`.
|
|
50
|
-
|
|
51
|
-
---
|
|
52
|
-
|
|
53
|
-
### 5️⃣ Deployment with Ollama
|
|
54
|
-
|
|
55
|
-
After fine-tuning, we save the new model and deploy it using [Ollama](https://github.com/ollama/ollama).
|
|
56
|
-
|
|
57
|
-
---
|
|
58
|
-
|
|
59
|
-
## 💻 The Code
|
|
60
|
-
|
|
61
|
-
The provided Colab code includes all the steps to fine-tune the model.
|
|
62
|
-
|
|
63
|
-
### Installing Dependencies
|
|
64
|
-
|
|
65
|
-
```python
|
|
66
|
-
!pip install unsloth vllm
|
|
67
|
-
!pip install --upgrade pillow
|
|
68
|
-
|
|
69
|
-
# Install trl if needed
|
|
70
|
-
# !pip install git+https://github.com/huggingface/trl.git@e95f9fb74a3c3647b86f251b7e230ec51c64b72b
|
|
71
|
-
```
|
|
72
|
-
|
|
73
|
-
### Loading and Preparing the Model
|
|
74
|
-
|
|
75
|
-
Using [Unsloth’s documentation](https://docs.unsloth.ai), we load a pretrained model (a 4-bit quantized version of SmolLM2-135M) and set it up for fine-tuning with LoRA.
|
|
76
|
-
This method allows for memory efficiency while updating the model's parameters.
|
|
77
|
-
|
|
78
|
-
```python
|
|
79
|
-
from unsloth import FastLanguageModel
|
|
80
|
-
import torch
|
|
81
|
-
max_seq_length = 2048 # Choose any! Unsloth auto support RoPE Scaling internally!
|
|
82
|
-
dtype = None # None for auto detection
|
|
83
|
-
load_in_4bit = True # Use 4bit quantization to reduce memory usage (also less accuracy). Can be False.
|
|
84
|
-
|
|
85
|
-
model, tokenizer = FastLanguageModel.from_pretrained(
|
|
86
|
-
model_name = "unsloth/SmolLM2-135M-bnb-4bit",
|
|
87
|
-
max_seq_length = max_seq_length,
|
|
88
|
-
dtype = dtype,
|
|
89
|
-
load_in_4bit = load_in_4bit,
|
|
90
|
-
# token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
|
|
91
|
-
)
|
|
92
|
-
|
|
93
|
-
model = FastLanguageModel.get_peft_model(
|
|
94
|
-
model,
|
|
95
|
-
r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
|
|
96
|
-
# Higher: Better accuracy on hard tasks but increases memory and risk of overfitting.
|
|
97
|
-
# Lower: Faster, memory-efficient but may reduce accuracy.
|
|
98
|
-
|
|
99
|
-
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
|
|
100
|
-
"gate_proj", "up_proj", "down_proj", "lm_head", "embed_tokens"],
|
|
101
|
-
lora_alpha = 64, # 32, 16
|
|
102
|
-
# Higher: Learns more but may overfit.
|
|
103
|
-
# Lower: Slower to learn, more generalizable
|
|
104
|
-
|
|
105
|
-
lora_dropout = 0, # Supports any, but = 0 is optimized
|
|
106
|
-
bias = "none", # Supports any, but = "none" is optimized
|
|
107
|
-
use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
|
|
108
|
-
random_state = 3407,
|
|
109
|
-
use_rslora = True, # unsloth support rank stabilized LoRA
|
|
110
|
-
loftq_config = None, # And LoftQ
|
|
111
|
-
)
|
|
112
|
-
```
|
|
113
|
-
|
|
114
|
-
---
|
|
115
|
-
|
|
116
|
-
### Loading the Dataset 📂
|
|
117
|
-
|
|
118
|
-
Upload the JSONL dataset to Google Drive and load it into Colab:
|
|
119
|
-
|
|
120
|
-
```python
|
|
121
|
-
# Mount Google Drive to access training data
|
|
122
|
-
from google.colab import drive
|
|
123
|
-
drive.mount('/content/drive')
|
|
124
|
-
|
|
125
|
-
# Load the dataset
|
|
126
|
-
from datasets import load_dataset
|
|
127
|
-
dataset = load_dataset(
|
|
128
|
-
"json",
|
|
129
|
-
data_files = "/content/drive/MyDrive/train.jsonl",
|
|
130
|
-
split = "train",
|
|
131
|
-
)
|
|
132
|
-
|
|
133
|
-
EOS_TOKEN = tokenizer.eos_token
|
|
134
|
-
def formatting_prompts_func(examples):
|
|
135
|
-
return { "text" : [example + EOS_TOKEN for example in examples["text"]] }
|
|
136
|
-
dataset = dataset.map(formatting_prompts_func, batched = True,)
|
|
137
|
-
|
|
138
|
-
print(dataset.column_names)
|
|
139
|
-
print(dataset[0])
|
|
140
|
-
```
|
|
141
|
-
|
|
142
|
-
---
|
|
143
|
-
|
|
144
|
-
### Training the Model 🚴♂️
|
|
145
|
-
|
|
146
|
-
Fine-tuning is managed with `UnslothTrainer`, allowing optimization of batch size, learning rate, and epochs.
|
|
147
|
-
|
|
148
|
-
```python
|
|
149
|
-
from trl import SFTTrainer
|
|
150
|
-
from transformers import TrainingArguments
|
|
151
|
-
from unsloth import is_bfloat16_supported
|
|
152
|
-
from unsloth import UnslothTrainer, UnslothTrainingArguments
|
|
153
|
-
|
|
154
|
-
trainer = UnslothTrainer(
|
|
155
|
-
model = model,
|
|
156
|
-
tokenizer = tokenizer,
|
|
157
|
-
train_dataset = dataset,
|
|
158
|
-
dataset_text_field = "text",
|
|
159
|
-
max_seq_length = max_seq_length,
|
|
160
|
-
dataset_num_proc = 8, # 2
|
|
161
|
-
|
|
162
|
-
args = UnslothTrainingArguments(
|
|
163
|
-
per_device_train_batch_size = 2,
|
|
164
|
-
gradient_accumulation_steps = 8, # 4
|
|
165
|
-
|
|
166
|
-
warmup_ratio = 0.1,
|
|
167
|
-
num_train_epochs = 3, # 1, 2, 3, 4
|
|
168
|
-
# max_steps = 60,
|
|
169
|
-
|
|
170
|
-
learning_rate = 5e-5,
|
|
171
|
-
embedding_learning_rate = 5e-6,
|
|
172
|
-
|
|
173
|
-
fp16 = not is_bfloat16_supported(),
|
|
174
|
-
bf16 = is_bfloat16_supported(),
|
|
175
|
-
logging_steps = 1,
|
|
176
|
-
optim = "adamw_8bit",
|
|
177
|
-
weight_decay = 0.00,
|
|
178
|
-
lr_scheduler_type = "cosine",
|
|
179
|
-
seed = 3407,
|
|
180
|
-
output_dir = "outputs",
|
|
181
|
-
report_to = "none", # Use this for WandB etc
|
|
182
|
-
),
|
|
183
|
-
)
|
|
184
|
-
|
|
185
|
-
trainer_stats = trainer.train()
|
|
186
|
-
```
|
|
187
|
-
|
|
188
|
-
---
|
|
189
|
-
|
|
190
|
-
### Saving & Exporting the Model 💾
|
|
191
|
-
|
|
192
|
-
Once training is complete, we save the fine-tuned model.
|
|
193
|
-
For **quantized GGUF format**, use:
|
|
194
|
-
|
|
195
|
-
```python
|
|
196
|
-
# saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!
|
|
197
|
-
model.save_pretrained("lora_model") # Local saving
|
|
198
|
-
tokenizer.save_pretrained("lora_model")
|
|
199
|
-
|
|
200
|
-
# Save to 8bit Q8_0
|
|
201
|
-
if False: model.save_pretrained_gguf("model", tokenizer,)
|
|
202
|
-
# Remember to go to https://huggingface.co/settings/tokens for a token!
|
|
203
|
-
# And change your username from mlibre to your username!!
|
|
204
|
-
if False: model.push_to_hub_gguf("mlibre/model", tokenizer, token = "token")
|
|
205
|
-
|
|
206
|
-
# Save to 16bit GGUF
|
|
207
|
-
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
|
|
208
|
-
if False: model.push_to_hub_gguf("mlibre/model", tokenizer, quantization_method = "f16", token = "token")
|
|
209
|
-
|
|
210
|
-
# Save to q4_k_m GGUF
|
|
211
|
-
if True: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
|
|
212
|
-
if False: model.push_to_hub_gguf("mlibre/model", tokenizer, quantization_method = "q4_k_m", token = "token")
|
|
213
|
-
|
|
214
|
-
# Save to multiple GGUF options - much faster if you want multiple!
|
|
215
|
-
if False:
|
|
216
|
-
model.push_to_hub_gguf(
|
|
217
|
-
"mlibre/model", # Change mlibre to your username!
|
|
218
|
-
tokenizer,
|
|
219
|
-
quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
|
|
220
|
-
token = "token", # Get a token at https://huggingface.co/settings/tokens
|
|
221
|
-
)
|
|
222
|
-
```
|
|
223
|
-
|
|
224
|
-
Now, go to the model folder and download the new model (**unsloth.Q4_K_M.gguf**) along with the Ollama **Modelfile**.
|
|
225
|
-
|
|
226
|
-
---
|
|
227
|
-
|
|
228
|
-
## 🚀 Deploying the Model with Ollama
|
|
229
|
-
|
|
230
|
-
### 📥 Step 1: Install Ollama
|
|
231
|
-
|
|
232
|
-
Ollama is a lightweight, open-source LLM server that allows you to run and deploy models locally.
|
|
233
|
-
|
|
234
|
-
```bash
|
|
235
|
-
curl -fsSL https://ollama.com/install.sh | sh
|
|
236
|
-
```
|
|
237
|
-
|
|
238
|
-
### 📝 Step 2: Create the Modelfile
|
|
239
|
-
|
|
240
|
-
To run GGUF models on Ollama, we must first create a **Modelfile** that tells Ollama how to run the model.
|
|
241
|
-
If the Modelfile was not available for download in Colab for any reason, you can create it manually.
|
|
242
|
-
Navigate to the model folder and **create a new file named `Modelfile`**:
|
|
243
|
-
|
|
244
|
-
```bash
|
|
245
|
-
nano Modelfile
|
|
246
|
-
```
|
|
247
|
-
|
|
248
|
-
Inside the file, add the following:
|
|
249
|
-
|
|
250
|
-
```text
|
|
251
|
-
TEMPLATE """{{- if .Messages }}
|
|
252
|
-
{{- if .System }}<|im_start|>system
|
|
253
|
-
{{ .System }}<|im_end|>
|
|
254
|
-
{{ end }}
|
|
255
|
-
{{- range $i, $_ := .Messages }}
|
|
256
|
-
{{- $last := eq (len (slice $.Messages $i)) 1 -}}
|
|
257
|
-
{{- if eq .Role "user" }}<|im_start|>user
|
|
258
|
-
{{ .Content }}<|im_end|>
|
|
259
|
-
{{ else if eq .Role "assistant" }}<|im_start|>assistant
|
|
260
|
-
{{ .Content }}{{ if not $last }}<|im_end|>
|
|
261
|
-
{{ end }}
|
|
262
|
-
{{- end }}
|
|
263
|
-
{{- if and (ne .Role "assistant") $last }}<|im_start|>assistant
|
|
264
|
-
{{ end }}
|
|
265
|
-
{{- end }}
|
|
266
|
-
{{- else }}
|
|
267
|
-
{{- if .System }}<|im_start|>system
|
|
268
|
-
{{ .System }}<|im_end|>
|
|
269
|
-
{{ end }}{{ if .Prompt }}<|im_start|>user
|
|
270
|
-
{{ .Prompt }}<|im_end|>
|
|
271
|
-
{{ end }}<|im_start|>assistant
|
|
272
|
-
{{ end }}{{ .Response }}{{ if .Response }}<|im_end|>{{ end }}"""
|
|
273
|
-
SYSTEM You are a helpful AI assistant named SmolLM, trained by Hugging Face
|
|
274
|
-
PARAMETER stop <|im_start|>
|
|
275
|
-
PARAMETER stop <|im_end|>
|
|
276
|
-
```
|
|
277
|
-
|
|
278
|
-
Save and close the file.
|
|
279
|
-
If you are wondering where the `Modelfile` template comes from, it is taken from the original SmolLM2 model.
|
|
280
|
-
You can show it with this command:
|
|
281
|
-
|
|
282
|
-
```bash
|
|
283
|
-
ollama show --modelfile smollm2:135m
|
|
284
|
-
```
|
|
285
|
-
|
|
286
|
-
### 🏃 Step 3: Create & Run the Model
|
|
287
|
-
|
|
288
|
-
```bash
|
|
289
|
-
ollama create Modelfile
|
|
290
|
-
ollama run Modelfile:latest
|
|
291
|
-
```
|
|
292
|
-
|
|
293
|
-
---
|
|
294
|
-
|
|
295
|
-
### 🎯 Testing the Model
|
|
296
|
-
|
|
297
|
-
Let’s test both the **default SmolLM2 model** and our **fine-tuned version** to compare outputs.
|
|
298
|
-
|
|
299
|
-
📌 **Official SmolLM2 Model**:
|
|
300
|
-
|
|
301
|
-
```bash
|
|
302
|
-
ollama run smollm2:135m "Palestine is"
|
|
303
|
-
```
|
|
304
|
-
|
|
305
|
-
Example output:
|
|
306
|
-
|
|
307
|
-
```text
|
|
308
|
-
What an intriguing combination of languages you're sharing with me. I'm ready to help refine your communication style and enhance the effectiveness of our
|
|
309
|
-
exchange. How can we work together to make your writing more engaging?
|
|
310
|
-
|
|
311
|
-
```
|
|
312
|
-
|
|
313
|
-
📌 **Fine-Tuned Model**:
|
|
314
|
-
|
|
315
|
-
```bash
|
|
316
|
-
ollama run Modelfile:latest "Palestine is"
|
|
317
|
-
```
|
|
318
|
-
|
|
319
|
-
Example output:
|
|
320
|
-
|
|
321
|
-
```text
|
|
322
|
-
The world has long held Palestine in low respectability as an unappreciated people’s land. It remains that way despite the genocide and occupation. That is
|
|
323
|
-
why we have so many Palestinian children who are orphans with no homeland, yet still want to belong there. We need you to help them live in this orphanhood
|
|
324
|
-
by becoming a source of life for our future generation.
|
|
325
|
-
```
|
|
326
|
-
|
|
327
|
-
## ✊ A Call for Justice: Standing in Solidarity with Palestine
|
|
328
|
-
|
|
329
|
-
In these times of rapid technological advancement, we are reminded that progress is not measured solely by the innovations we create but also by the compassion and justice we uphold. As we fine-tune our language models to process vast quantities of raw text and specialize in new knowledge domains, we must also harness our skills to amplify the voices of the marginalized.
|
|
330
|
-
|
|
331
|
-
The struggle of the Palestinian people, who continue to face relentless oppression and violence—including the reported genocide in Gaza—demands that we not remain silent. When silence becomes complicity, our work in technology can serve as a platform to challenge injustice and promote accountability. By integrating critical perspectives into our technical endeavors, we contribute to a broader dialogue that insists on human rights, transparency, and an end to systemic violence.
|
|
332
|
-
|
|
333
|
-
Let this guide be a reminder: every line of code, every dataset we prepare, and every model we deploy carries the potential not only to advance our field but also to support those whose stories are often ignored. As we pursue excellence in machine learning, we stand in solidarity with Palestinians and all oppressed communities, affirming that technology must be a force for justice and liberation.
|
|
@@ -1,35 +0,0 @@
|
|
|
1
|
-
!rm -r /content/.ipynb_checkpoints
|
|
2
|
-
!rm -r /content/data
|
|
3
|
-
!rm -r /content/.config
|
|
4
|
-
!rm -r /content/sample_data/
|
|
5
|
-
!rm -r /content/lora_model/
|
|
6
|
-
!rm -r /content/llama.cpp/
|
|
7
|
-
!rm -r /content/outputs/
|
|
8
|
-
!rm -r /content/model/
|
|
9
|
-
!rm -r /content/huggingface_tokenizers_cache/
|
|
10
|
-
|
|
11
|
-
%cd /content/
|
|
12
|
-
%rm -rf LLaMA-Factory
|
|
13
|
-
!git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
|
|
14
|
-
%cd LLaMA-Factory
|
|
15
|
-
%ls
|
|
16
|
-
!pip install -e .[torch,bitsandbytes]
|
|
17
|
-
|
|
18
|
-
# Use this to resolve package conflicts.
|
|
19
|
-
# pip install --no-deps -e .
|
|
20
|
-
|
|
21
|
-
# dataset_info.json
|
|
22
|
-
# "dataset_name": {
|
|
23
|
-
# "file_name": "data.json",
|
|
24
|
-
# "columns": {
|
|
25
|
-
# "prompt": "text"
|
|
26
|
-
# }
|
|
27
|
-
# }
|
|
28
|
-
# [
|
|
29
|
-
# {"text": "document"},
|
|
30
|
-
# {"text": "document"}
|
|
31
|
-
# ]
|
|
32
|
-
|
|
33
|
-
# llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
|
|
34
|
-
# llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
|
|
35
|
-
# llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
|
|
@@ -1,182 +0,0 @@
|
|
|
1
|
-
# !rm -r /content/.ipynb_checkpoints
|
|
2
|
-
# !rm -r /content/data
|
|
3
|
-
# !rm -r /content/.config
|
|
4
|
-
# !rm -r /content/sample_data/
|
|
5
|
-
# !rm -r /content/lora_model/
|
|
6
|
-
# !rm -r /content/llama.cpp/
|
|
7
|
-
# !rm -r /content/outputs/
|
|
8
|
-
# !rm -r /content/model/
|
|
9
|
-
# !rm -r /content/huggingface_tokenizers_cache/
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
# Commented out IPython magic to ensure Python compatibility.
|
|
13
|
-
# %%capture
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
# !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
# Disconnect and delete the runtime
|
|
20
|
-
# !pip uninstall unsloth -y
|
|
21
|
-
# !pip install --force-reinstall --no-cache-dir --upgrade "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
# Disconnect and delete the runtime
|
|
25
|
-
# !pip uninstall unsloth -y
|
|
26
|
-
# !pip install unsloth
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
%%capture
|
|
30
|
-
import sys; modules = list(sys.modules.keys())
|
|
31
|
-
for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None
|
|
32
|
-
|
|
33
|
-
!pip install unsloth vllm
|
|
34
|
-
!pip install --upgrade pillow
|
|
35
|
-
!pip install git+https://github.com/huggingface/trl.git@e95f9fb74a3c3647b86f251b7e230ec51c64b72b
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
from google.colab import drive
|
|
39
|
-
drive.mount('/content/drive')
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
from unsloth import FastLanguageModel
|
|
43
|
-
import torch
|
|
44
|
-
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally! 2048 is also default in ollama
|
|
45
|
-
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
|
|
46
|
-
load_in_4bit = True # Use 4bit quantization to reduce memory usage (also less accuracy). Can be False.
|
|
47
|
-
|
|
48
|
-
model, tokenizer = FastLanguageModel.from_pretrained(
|
|
49
|
-
model_name = "unsloth/SmolLM2-135M-bnb-4bit",
|
|
50
|
-
max_seq_length = max_seq_length,
|
|
51
|
-
dtype = dtype,
|
|
52
|
-
load_in_4bit = load_in_4bit,
|
|
53
|
-
# token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
|
|
54
|
-
)
|
|
55
|
-
|
|
56
|
-
model = FastLanguageModel.get_peft_model(
|
|
57
|
-
model,
|
|
58
|
-
r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
|
|
59
|
-
# Higher: Better accuracy on hard tasks but increases memory and risk of overfitting.
|
|
60
|
-
# Lower: Faster, memory-efficient but may reduce accuracy.
|
|
61
|
-
|
|
62
|
-
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
|
|
63
|
-
"gate_proj", "up_proj", "down_proj", "lm_head", "embed_tokens"],
|
|
64
|
-
lora_alpha = 64, # 32, 16
|
|
65
|
-
# Higher: Learns more but may overfit.
|
|
66
|
-
# Lower: Slower to learn, more generalizable
|
|
67
|
-
|
|
68
|
-
lora_dropout = 0, # Supports any, but = 0 is optimized
|
|
69
|
-
bias = "none", # Supports any, but = "none" is optimized
|
|
70
|
-
# [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
|
|
71
|
-
use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
|
|
72
|
-
random_state = 3407,
|
|
73
|
-
use_rslora = True, # We support rank stabilized LoRA
|
|
74
|
-
loftq_config = None, # And LoftQ
|
|
75
|
-
)
|
|
76
|
-
|
|
77
|
-
from datasets import load_dataset
|
|
78
|
-
dataset = load_dataset(
|
|
79
|
-
"json",
|
|
80
|
-
data_files = "/content/drive/MyDrive/train.jsonl",
|
|
81
|
-
split = "train",
|
|
82
|
-
)
|
|
83
|
-
print(dataset.column_names)
|
|
84
|
-
print(dataset[0])
|
|
85
|
-
|
|
86
|
-
EOS_TOKEN = tokenizer.eos_token
|
|
87
|
-
def formatting_prompts_func(examples):
|
|
88
|
-
return { "text" : [example + EOS_TOKEN for example in examples["text"]] }
|
|
89
|
-
dataset = dataset.map(formatting_prompts_func, batched = True,)
|
|
90
|
-
|
|
91
|
-
print(dataset.column_names)
|
|
92
|
-
print(dataset[0])
|
|
93
|
-
|
|
94
|
-
from trl import SFTTrainer
|
|
95
|
-
from transformers import TrainingArguments
|
|
96
|
-
from unsloth import is_bfloat16_supported
|
|
97
|
-
from unsloth import UnslothTrainer, UnslothTrainingArguments
|
|
98
|
-
|
|
99
|
-
trainer = UnslothTrainer(
|
|
100
|
-
model = model,
|
|
101
|
-
tokenizer = tokenizer,
|
|
102
|
-
train_dataset = dataset,
|
|
103
|
-
dataset_text_field = "text",
|
|
104
|
-
max_seq_length = max_seq_length,
|
|
105
|
-
dataset_num_proc = 8, # 2
|
|
106
|
-
|
|
107
|
-
args = UnslothTrainingArguments(
|
|
108
|
-
per_device_train_batch_size = 2,
|
|
109
|
-
gradient_accumulation_steps = 8, # 4
|
|
110
|
-
|
|
111
|
-
warmup_ratio = 0.1,
|
|
112
|
-
num_train_epochs = 3, # 1, 2, 3, 4
|
|
113
|
-
# max_steps = 60,
|
|
114
|
-
|
|
115
|
-
learning_rate = 5e-5,
|
|
116
|
-
embedding_learning_rate = 5e-6,
|
|
117
|
-
|
|
118
|
-
fp16 = not is_bfloat16_supported(),
|
|
119
|
-
bf16 = is_bfloat16_supported(),
|
|
120
|
-
logging_steps = 1,
|
|
121
|
-
optim = "adamw_8bit",
|
|
122
|
-
weight_decay = 0.00,
|
|
123
|
-
lr_scheduler_type = "cosine",
|
|
124
|
-
seed = 3407,
|
|
125
|
-
output_dir = "outputs",
|
|
126
|
-
report_to = "none", # Use this for WandB etc
|
|
127
|
-
),
|
|
128
|
-
)
|
|
129
|
-
|
|
130
|
-
trainer_stats = trainer.train()
|
|
131
|
-
|
|
132
|
-
# saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!
|
|
133
|
-
model.save_pretrained("lora_model") # Local saving
|
|
134
|
-
tokenizer.save_pretrained("lora_model")
|
|
135
|
-
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
|
|
136
|
-
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving
|
|
137
|
-
|
|
138
|
-
# Save to 8bit Q8_0
|
|
139
|
-
if False: model.save_pretrained_gguf("model", tokenizer,)
|
|
140
|
-
# Remember to go to https://huggingface.co/settings/tokens for a token!
|
|
141
|
-
# And change your username!
|
|
142
|
-
if False: model.push_to_hub_gguf("mlibre/model", tokenizer, token = "token")
|
|
143
|
-
|
|
144
|
-
# Save to 16bit GGUF
|
|
145
|
-
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
|
|
146
|
-
if False: model.push_to_hub_gguf("mlibre/model", tokenizer, quantization_method = "f16", token = "token")
|
|
147
|
-
|
|
148
|
-
# Save to q4_k_m GGUF
|
|
149
|
-
if True: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
|
|
150
|
-
if False: model.push_to_hub_gguf("mlibre/model", tokenizer, quantization_method = "q4_k_m", token = "token")
|
|
151
|
-
|
|
152
|
-
# Save to multiple GGUF options - much faster if you want multiple!
|
|
153
|
-
if False:
|
|
154
|
-
model.push_to_hub_gguf(
|
|
155
|
-
"mlibre/model", # Change mlibre to your username!
|
|
156
|
-
tokenizer,
|
|
157
|
-
quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
|
|
158
|
-
token = "token", # Get a token at https://huggingface.co/settings/tokens
|
|
159
|
-
)
|
|
160
|
-
|
|
161
|
-
# print(tokenizer._ollama_modelfile)
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
# now in your own system:
|
|
165
|
-
curl -fsSL https://ollama.com/install.sh | sh
|
|
166
|
-
Let's first try the official smollm2
|
|
167
|
-
ollama run smollm2:135m
|
|
168
|
-
> palestine is the owner of the land not israel
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
# download the model (/content/model/unsloth.Q4_K_M.gguf)
|
|
172
|
-
ollama create unsloth_model -f ./model/Modelfile
|
|
173
|
-
|
|
174
|
-
# In colab terminal type: ollama run unsloth_model
|
|
175
|
-
# in local ollama:
|
|
176
|
-
!curl http://localhost:11434/api/chat -d '{ \
|
|
177
|
-
"model": "unsloth_model", \
|
|
178
|
-
"messages": [ \
|
|
179
|
-
{"role": "user", \
|
|
180
|
-
"content": "The palestine"} \
|
|
181
|
-
] \
|
|
182
|
-
}'
|
|
@@ -1,174 +0,0 @@
|
|
|
1
|
-
# !rm -r /content/.ipynb_checkpoints
|
|
2
|
-
# !rm -r /content/data
|
|
3
|
-
# !rm -r /content/.config
|
|
4
|
-
# !rm -r /content/sample_data/
|
|
5
|
-
# !rm -r /content/lora_model/
|
|
6
|
-
# !rm -r /content/llama.cpp/
|
|
7
|
-
# !rm -r /content/outputs/
|
|
8
|
-
# !rm -r /content/model/
|
|
9
|
-
# !rm -r /content/huggingface_tokenizers_cache/
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
# Commented out IPython magic to ensure Python compatibility.
|
|
13
|
-
# %%capture
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
# !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
# Disconnect and delete the runtime
|
|
20
|
-
# !pip uninstall unsloth -y
|
|
21
|
-
# !pip install --force-reinstall --no-cache-dir --upgrade "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
# Disconnect and delete the runtime
|
|
25
|
-
# !pip uninstall unsloth -y
|
|
26
|
-
# !pip install unsloth
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
%%capture
|
|
30
|
-
import sys; modules = list(sys.modules.keys())
|
|
31
|
-
for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None
|
|
32
|
-
|
|
33
|
-
!pip install unsloth vllm
|
|
34
|
-
!pip install --upgrade pillow
|
|
35
|
-
!pip install git+https://github.com/huggingface/trl.git@e95f9fb74a3c3647b86f251b7e230ec51c64b72b
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
from google.colab import drive
|
|
39
|
-
drive.mount('/content/drive')
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
from unsloth import FastLanguageModel
|
|
43
|
-
import torch
|
|
44
|
-
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally! 2048 is also default in ollama
|
|
45
|
-
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
|
|
46
|
-
load_in_4bit = True # Use 4bit quantization to reduce memory usage (also less accuracy). Can be False.
|
|
47
|
-
|
|
48
|
-
model, tokenizer = FastLanguageModel.from_pretrained(
|
|
49
|
-
model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
|
|
50
|
-
max_seq_length = max_seq_length,
|
|
51
|
-
dtype = dtype,
|
|
52
|
-
load_in_4bit = load_in_4bit,
|
|
53
|
-
# token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
|
|
54
|
-
)
|
|
55
|
-
|
|
56
|
-
model = FastLanguageModel.get_peft_model(
|
|
57
|
-
model,
|
|
58
|
-
r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
|
|
59
|
-
# Higher: Better accuracy on hard tasks but increases memory and risk of overfitting.
|
|
60
|
-
# Lower: Faster, memory-efficient but may reduce accuracy.
|
|
61
|
-
|
|
62
|
-
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
|
|
63
|
-
"gate_proj", "up_proj", "down_proj", "lm_head", "embed_tokens"],
|
|
64
|
-
lora_alpha = 64, # 32, 16
|
|
65
|
-
# Higher: Learns more but may overfit.
|
|
66
|
-
# Lower: Slower to learn, more generalizable
|
|
67
|
-
|
|
68
|
-
lora_dropout = 0, # Supports any, but = 0 is optimized
|
|
69
|
-
bias = "none", # Supports any, but = "none" is optimized
|
|
70
|
-
# [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
|
|
71
|
-
use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
|
|
72
|
-
random_state = 3407,
|
|
73
|
-
use_rslora = True, # We support rank stabilized LoRA
|
|
74
|
-
loftq_config = None, # And LoftQ
|
|
75
|
-
)
|
|
76
|
-
|
|
77
|
-
from datasets import load_dataset
|
|
78
|
-
dataset = load_dataset(
|
|
79
|
-
"json",
|
|
80
|
-
data_files = "/content/drive/MyDrive/train.jsonl",
|
|
81
|
-
split = "train",
|
|
82
|
-
)
|
|
83
|
-
print(dataset.column_names)
|
|
84
|
-
print(dataset[0])
|
|
85
|
-
|
|
86
|
-
EOS_TOKEN = tokenizer.eos_token
|
|
87
|
-
def formatting_prompts_func(examples):
|
|
88
|
-
return { "text" : [example + EOS_TOKEN for example in examples["text"]] }
|
|
89
|
-
dataset = dataset.map(formatting_prompts_func, batched = True,)
|
|
90
|
-
|
|
91
|
-
print(dataset.column_names)
|
|
92
|
-
print(dataset[0])
|
|
93
|
-
|
|
94
|
-
from trl import SFTTrainer
|
|
95
|
-
from transformers import TrainingArguments
|
|
96
|
-
from unsloth import is_bfloat16_supported
|
|
97
|
-
from unsloth import UnslothTrainer, UnslothTrainingArguments
|
|
98
|
-
|
|
99
|
-
trainer = UnslothTrainer(
|
|
100
|
-
model = model,
|
|
101
|
-
tokenizer = tokenizer,
|
|
102
|
-
train_dataset = dataset,
|
|
103
|
-
dataset_text_field = "text",
|
|
104
|
-
max_seq_length = max_seq_length,
|
|
105
|
-
dataset_num_proc = 8, # 2
|
|
106
|
-
|
|
107
|
-
args = UnslothTrainingArguments(
|
|
108
|
-
per_device_train_batch_size = 2,
|
|
109
|
-
gradient_accumulation_steps = 8, # 4
|
|
110
|
-
|
|
111
|
-
warmup_ratio = 0.1,
|
|
112
|
-
num_train_epochs = 3, # 1, 2, 3, 4
|
|
113
|
-
# max_steps = 60,
|
|
114
|
-
|
|
115
|
-
learning_rate = 5e-5,
|
|
116
|
-
embedding_learning_rate = 5e-6,
|
|
117
|
-
|
|
118
|
-
fp16 = not is_bfloat16_supported(),
|
|
119
|
-
bf16 = is_bfloat16_supported(),
|
|
120
|
-
logging_steps = 1,
|
|
121
|
-
optim = "adamw_8bit",
|
|
122
|
-
weight_decay = 0.00,
|
|
123
|
-
lr_scheduler_type = "cosine",
|
|
124
|
-
seed = 3407,
|
|
125
|
-
output_dir = "outputs",
|
|
126
|
-
report_to = "none", # Use this for WandB etc
|
|
127
|
-
),
|
|
128
|
-
)
|
|
129
|
-
|
|
130
|
-
trainer_stats = trainer.train()
|
|
131
|
-
|
|
132
|
-
# Saves only the LoRA adapters, not the full model. To save to 16-bit or GGUF, see the steps below.
|
|
133
|
-
model.save_pretrained("lora_model") # Local saving
|
|
134
|
-
tokenizer.save_pretrained("lora_model")
|
|
135
|
-
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
|
|
136
|
-
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving
|
|
137
|
-
|
|
138
|
-
# Save to 8bit Q8_0
|
|
139
|
-
if False: model.save_pretrained_gguf("model", tokenizer,)
|
|
140
|
-
# Remember to go to https://huggingface.co/settings/tokens for a token!
|
|
141
|
-
# And change your username!
|
|
142
|
-
if False: model.push_to_hub_gguf("mlibre/model", tokenizer, token = "token")
|
|
143
|
-
|
|
144
|
-
# Save to 16bit GGUF
|
|
145
|
-
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
|
|
146
|
-
if False: model.push_to_hub_gguf("mlibre/model", tokenizer, quantization_method = "f16", token = "token")
|
|
147
|
-
|
|
148
|
-
# Save to q4_k_m GGUF
|
|
149
|
-
if True: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
|
|
150
|
-
if False: model.push_to_hub_gguf("mlibre/model", tokenizer, quantization_method = "q4_k_m", token = "token")
|
|
151
|
-
|
|
152
|
-
# Save to multiple GGUF options - much faster if you want multiple!
|
|
153
|
-
if False:
|
|
154
|
-
model.push_to_hub_gguf(
|
|
155
|
-
"mlibre/model", # Change mlibre to your username!
|
|
156
|
-
tokenizer,
|
|
157
|
-
quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
|
|
158
|
-
token = "token", # Get a token at https://huggingface.co/settings/tokens
|
|
159
|
-
)
|
|
160
|
-
|
|
161
|
-
print(tokenizer._ollama_modelfile)
|
|
162
|
-
|
|
163
|
-
!curl -fsSL https://ollama.com/install.sh | sh
|
|
164
|
-
!ollama create unsloth_model -f ./model/Modelfile
|
|
165
|
-
|
|
166
|
-
# In colab terminal type: ollama run unsloth_model
|
|
167
|
-
# in local ollama:
|
|
168
|
-
!curl http://localhost:11434/api/chat -d '{ \
|
|
169
|
-
"model": "unsloth_model", \
|
|
170
|
-
"messages": [ \
|
|
171
|
-
{"role": "user", \
|
|
172
|
-
"content": "The palestine"} \
|
|
173
|
-
] \
|
|
174
|
-
}'
|
|
@@ -1,178 +0,0 @@
|
|
|
1
|
-
|
|
2
|
-
# ROCm, CUDA, torch, vLLM, unsloth, and related setup
|
|
3
|
-
|
|
4
|
-
# Virtual Environment
|
|
5
|
-
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
6
|
-
uv venv myenv --python 3.12 --seed
|
|
7
|
-
source myenv/bin/activate
|
|
8
|
-
|
|
9
|
-
pip uninstall unsloth -y --break-system-packages
|
|
10
|
-
|
|
11
|
-
# https://pytorch.org/get-started/locally/
|
|
12
|
-
pip3 install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2.4 --break-system-packages
|
|
13
|
-
# aria2c -x 15 "https://download.pytorch.org/whl/rocm6.2.4/torch-2.6.0%2Brocm6.2.4-cp312-cp312-manylinux_2_28_x86_64.whl"
|
|
14
|
-
pip3 install -U torch torchvision torchaudio torch-2.6.0+rocm6.2.4-cp312-cp312-manylinux_2_28_x86_64.whl --index-url https://download.pytorch.org/whl/rocm6.2.4 --break-system-packages
|
|
15
|
-
|
|
16
|
-
# pillow
|
|
17
|
-
pip install --upgrade pillow --break-system-packages
|
|
18
|
-
|
|
19
|
-
# vllm
|
|
20
|
-
pip install vllm --break-system-packages
|
|
21
|
-
# pip install git+https://github.com/huggingface/trl.git
|
|
22
|
-
# pip install ninja cmake wheel pybind11 --break-system-packages
|
|
23
|
-
# git clone --recursive https://github.com/mlc-ai/xgrammar.git
|
|
24
|
-
# cd xgrammar
|
|
25
|
-
# mkdir build && cd build/
|
|
26
|
-
# cmake ..
|
|
27
|
-
# make -j8
|
|
28
|
-
# cd ../python/
|
|
29
|
-
# pip install -e . --break-system-packages
|
|
30
|
-
# git clone https://github.com/vllm-project/vllm.git
|
|
31
|
-
# cd vllm
|
|
32
|
-
# pip install -r requirements-rocm.txt --break-system-packages
|
|
33
|
-
|
|
34
|
-
# bitsandbytes
|
|
35
|
-
# https://huggingface.co/docs/bitsandbytes/main/en/installation?platform=Linux#multi-backend
|
|
36
|
-
pip install --force-reinstall 'https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_multi-backend-refactor/bitsandbytes-0.44.1.dev0-py3-none-manylinux_2_24_x86_64.whl'
|
|
37
|
-
|
|
38
|
-
# unsloth
|
|
39
|
-
wget -qO- https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/_auto_install.py | python -
|
|
40
|
-
# pip install --force-reinstall --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git --break-system-packages
|
|
41
|
-
# pip install unsloth --break-system-packages
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
from unsloth import FastLanguageModel
|
|
47
|
-
import torch
|
|
48
|
-
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally! 2048 is also default in ollama
|
|
49
|
-
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
|
|
50
|
-
load_in_4bit = True # Use 4bit quantization to reduce memory usage (also less accuracy). Can be False.
|
|
51
|
-
|
|
52
|
-
model, tokenizer = FastLanguageModel.from_pretrained(
|
|
53
|
-
model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
|
|
54
|
-
max_seq_length = max_seq_length,
|
|
55
|
-
dtype = dtype,
|
|
56
|
-
load_in_4bit = load_in_4bit,
|
|
57
|
-
# token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
|
|
58
|
-
)
|
|
59
|
-
|
|
60
|
-
model = FastLanguageModel.get_peft_model(
|
|
61
|
-
model,
|
|
62
|
-
r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
|
|
63
|
-
# Higher: Better accuracy on hard tasks but increases memory and risk of overfitting.
|
|
64
|
-
# Lower: Faster, memory-efficient but may reduce accuracy.
|
|
65
|
-
|
|
66
|
-
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
|
|
67
|
-
"gate_proj", "up_proj", "down_proj", "lm_head", "embed_tokens"],
|
|
68
|
-
lora_alpha = 64, # 32, 16
|
|
69
|
-
# Higher: Learns more but may overfit.
|
|
70
|
-
# Lower: Slower to learn, more generalizable
|
|
71
|
-
|
|
72
|
-
lora_dropout = 0, # Supports any, but = 0 is optimized
|
|
73
|
-
bias = "none", # Supports any, but = "none" is optimized
|
|
74
|
-
# [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
|
|
75
|
-
use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
|
|
76
|
-
random_state = 3407,
|
|
77
|
-
use_rslora = True, # We support rank stabilized LoRA
|
|
78
|
-
loftq_config = None, # And LoftQ
|
|
79
|
-
)
|
|
80
|
-
|
|
81
|
-
from datasets import load_dataset
|
|
82
|
-
dataset = load_dataset(
|
|
83
|
-
"json",
|
|
84
|
-
data_files = "/content/drive/MyDrive/train.jsonl",
|
|
85
|
-
split = "train",
|
|
86
|
-
)
|
|
87
|
-
print(dataset.column_names)
|
|
88
|
-
print(dataset[0])
|
|
89
|
-
|
|
90
|
-
EOS_TOKEN = tokenizer.eos_token
|
|
91
|
-
def formatting_prompts_func(examples):
|
|
92
|
-
return { "text" : [example + EOS_TOKEN for example in examples["text"]] }
|
|
93
|
-
dataset = dataset.map(formatting_prompts_func, batched = True,)
|
|
94
|
-
|
|
95
|
-
print(dataset.column_names)
|
|
96
|
-
print(dataset[0])
|
|
97
|
-
|
|
98
|
-
from trl import SFTTrainer
|
|
99
|
-
from transformers import TrainingArguments
|
|
100
|
-
from unsloth import is_bfloat16_supported
|
|
101
|
-
from unsloth import UnslothTrainer, UnslothTrainingArguments
|
|
102
|
-
|
|
103
|
-
trainer = UnslothTrainer(
|
|
104
|
-
model = model,
|
|
105
|
-
tokenizer = tokenizer,
|
|
106
|
-
train_dataset = dataset,
|
|
107
|
-
dataset_text_field = "text",
|
|
108
|
-
max_seq_length = max_seq_length,
|
|
109
|
-
dataset_num_proc = 8, # 2
|
|
110
|
-
|
|
111
|
-
args = UnslothTrainingArguments(
|
|
112
|
-
per_device_train_batch_size = 2,
|
|
113
|
-
gradient_accumulation_steps = 8, # 4
|
|
114
|
-
|
|
115
|
-
warmup_ratio = 0.1,
|
|
116
|
-
num_train_epochs = 3, # 1, 2, 3, 4
|
|
117
|
-
# max_steps = 60,
|
|
118
|
-
|
|
119
|
-
learning_rate = 5e-5,
|
|
120
|
-
embedding_learning_rate = 5e-6,
|
|
121
|
-
|
|
122
|
-
fp16 = not is_bfloat16_supported(),
|
|
123
|
-
bf16 = is_bfloat16_supported(),
|
|
124
|
-
logging_steps = 1,
|
|
125
|
-
optim = "adamw_8bit",
|
|
126
|
-
weight_decay = 0.00,
|
|
127
|
-
lr_scheduler_type = "cosine",
|
|
128
|
-
seed = 3407,
|
|
129
|
-
output_dir = "outputs",
|
|
130
|
-
report_to = "none", # Use this for WandB etc
|
|
131
|
-
),
|
|
132
|
-
)
|
|
133
|
-
|
|
134
|
-
trainer_stats = trainer.train()
|
|
135
|
-
|
|
136
|
-
# Saves only the LoRA adapters, not the full model. To save to 16-bit or GGUF, see the steps below.
|
|
137
|
-
model.save_pretrained("lora_model") # Local saving
|
|
138
|
-
tokenizer.save_pretrained("lora_model")
|
|
139
|
-
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
|
|
140
|
-
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving
|
|
141
|
-
|
|
142
|
-
# Save to 8bit Q8_0
|
|
143
|
-
if False: model.save_pretrained_gguf("model", tokenizer,)
|
|
144
|
-
# Remember to go to https://huggingface.co/settings/tokens for a token!
|
|
145
|
-
# And change your username!
|
|
146
|
-
if False: model.push_to_hub_gguf("mlibre/model", tokenizer, token = "token")
|
|
147
|
-
|
|
148
|
-
# Save to 16bit GGUF
|
|
149
|
-
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
|
|
150
|
-
if False: model.push_to_hub_gguf("mlibre/model", tokenizer, quantization_method = "f16", token = "token")
|
|
151
|
-
|
|
152
|
-
# Save to q4_k_m GGUF
|
|
153
|
-
if True: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
|
|
154
|
-
if False: model.push_to_hub_gguf("mlibre/model", tokenizer, quantization_method = "q4_k_m", token = "token")
|
|
155
|
-
|
|
156
|
-
# Save to multiple GGUF options - much faster if you want multiple!
|
|
157
|
-
if False:
|
|
158
|
-
model.push_to_hub_gguf(
|
|
159
|
-
"mlibre/model", # Change mlibre to your username!
|
|
160
|
-
tokenizer,
|
|
161
|
-
quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
|
|
162
|
-
token = "token", # Get a token at https://huggingface.co/settings/tokens
|
|
163
|
-
)
|
|
164
|
-
|
|
165
|
-
print(tokenizer._ollama_modelfile)
|
|
166
|
-
|
|
167
|
-
!curl -fsSL https://ollama.com/install.sh | sh
|
|
168
|
-
!ollama create unsloth_model -f ./model/Modelfile
|
|
169
|
-
|
|
170
|
-
# In colab terminal type: ollama run unsloth_model
|
|
171
|
-
# in local ollama:
|
|
172
|
-
!curl http://localhost:11434/api/chat -d '{ \
|
|
173
|
-
"model": "unsloth_model", \
|
|
174
|
-
"messages": [ \
|
|
175
|
-
{"role": "user", \
|
|
176
|
-
"content": "The palestine"} \
|
|
177
|
-
] \
|
|
178
|
-
}'
|