clean-web-scraper 4.1.3 → 4.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -37,6 +37,9 @@ git clone https://github.com/mlibre/Clean-Web-Scraper
  cd Clean-Web-Scraper
  sudo pacman -S extra/xorg-server-xvfb chromium
  npm install
+
+ # Skip chromium download during npm installation
+ # npm i --ignore-scripts
  ```
 
  ## 💻 Usage
@@ -0,0 +1,35 @@
+ !rm -r /content/.ipynb_checkpoints
+ !rm -r /content/data
+ !rm -r /content/.config
+ !rm -r /content/sample_data/
+ !rm -r /content/lora_model/
+ !rm -r /content/llama.cpp/
+ !rm -r /content/outputs/
+ !rm -r /content/model/
+ !rm -r /content/huggingface_tokenizers_cache/
+
+ %cd /content/
+ %rm -rf LLaMA-Factory
+ !git clone --depth 1 https://github.com/hiyouga/LLaMA-Factory.git
+ %cd LLaMA-Factory
+ %ls
+ !pip install -e .[torch,bitsandbytes]
+
+ # Use this to resolve package conflicts.
+ # pip install --no-deps -e .
+
+ # dataset_info.json
+ # "dataset_name": {
+ # "file_name": "data.json",
+ # "columns": {
+ # "prompt": "text"
+ # }
+ # }
+ # [
+ # {"text": "document"},
+ # {"text": "document"}
+ # ]
+
+ # llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
+ # llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
+ # llamafactory-cli export examples/merge_lora/llama3_lora_sft.yaml
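Aside: the commented `dataset_info.json` entry above expects a `data.json` file shaped as a list of `{"text": ...}` objects. A minimal, hypothetical sketch (not part of the package) of building that file from a folder of scraped plain-text documents could look like the following; the `texts/` input folder and the output path are assumptions.

```python
# Hypothetical helper (not in the package): gather scraped .txt files into
# the [{"text": "..."}, ...] layout that the dataset_info.json entry expects.
import json
from pathlib import Path

docs = []
for path in sorted(Path("texts").glob("*.txt")):  # assumed scraper output folder
    content = path.read_text(encoding="utf-8").strip()
    if content:
        docs.append({"text": content})

with open("data.json", "w", encoding="utf-8") as f:
    json.dump(docs, f, ensure_ascii=False, indent=2)
```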
@@ -0,0 +1,192 @@
+ # !rm -r /content/.ipynb_checkpoints
+ # !rm -r /content/data
+ # !rm -r /content/.config
+ # !rm -r /content/sample_data/
+ # !rm -r /content/lora_model/
+ # !rm -r /content/llama.cpp/
+ # !rm -r /content/outputs/
+ # !rm -r /content/model/
+ # !rm -r /content/huggingface_tokenizers_cache/
+
+
+ # Commented out IPython magic to ensure Python compatibility.
+ # %%capture
+
+
+ # !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
+
+
+ # Disconnect and delete the runtime
+ # !pip uninstall unsloth -y
+ # !pip install --force-reinstall --no-cache-dir --upgrade "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
+
+
+ # Disconnect and delete the runtime
+ # !pip uninstall unsloth -y
+ # !pip install unsloth
+
+
+ %%capture
+ import sys; modules = list(sys.modules.keys())
+ for x in modules: sys.modules.pop(x) if "PIL" in x or "google" in x else None
+
+ !pip install unsloth vllm
+ !pip install --upgrade pillow
+ !pip install git+https://github.com/huggingface/trl.git@e95f9fb74a3c3647b86f251b7e230ec51c64b72b
+
+
+ from google.colab import drive
+ drive.mount('/content/drive')
+
+
+ from unsloth import FastLanguageModel
+ import torch
+ max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally! 2048 is also default in ollama
+ dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+ load_in_4bit = True # Use 4bit quantization to reduce memory usage (also less accuracy). Can be False.
+
+ model, tokenizer = FastLanguageModel.from_pretrained(
+ model_name = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
+ max_seq_length = max_seq_length,
+ dtype = dtype,
+ load_in_4bit = load_in_4bit,
+ # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
+ )
+
+ model = FastLanguageModel.get_peft_model(
+ model,
+ r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
+ # Higher: Better accuracy on hard tasks but increases memory and risk of overfitting.
+ # Lower: Faster, memory-efficient but may reduce accuracy.
+
+ target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+ "gate_proj", "up_proj", "down_proj", "lm_head", "embed_tokens"],
+ lora_alpha = 64, # 32, 16
+ # Higher: Learns more but may overfit.
+ # Lower: Slower to learn, more generalizable
+
+ lora_dropout = 0, # Supports any, but = 0 is optimized
+ bias = "none", # Supports any, but = "none" is optimized
+ # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
+ use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
+ random_state = 3407,
+ use_rslora = True, # We support rank stabilized LoRA
+ loftq_config = None, # And LoftQ
+ )
+
+ from datasets import load_dataset
+ dataset = load_dataset(
+ "json",
+ data_files = "/content/drive/MyDrive/train.jsonl",
+ split = "train",
+ )
+ print(dataset.column_names)
+ print(dataset[0])
+
+ EOS_TOKEN = tokenizer.eos_token
+ def formatting_prompts_func(examples):
+ return { "text" : [example + EOS_TOKEN for example in examples["text"]] }
+ dataset = dataset.map(formatting_prompts_func, batched = True,)
+
+ print(dataset.column_names)
+ print(dataset[0])
+
+ from trl import SFTTrainer
+ from transformers import TrainingArguments
+ from unsloth import is_bfloat16_supported
+ from unsloth import UnslothTrainer, UnslothTrainingArguments
+
+ trainer = UnslothTrainer(
+ model = model,
+ tokenizer = tokenizer,
+ train_dataset = dataset,
+ dataset_text_field = "text",
+ max_seq_length = max_seq_length,
+ dataset_num_proc = 8, # 2
+
+ args = UnslothTrainingArguments(
+ per_device_train_batch_size = 2,
+ gradient_accumulation_steps = 8, # 4
+
+ warmup_ratio = 0.1,
+ num_train_epochs = 3, # 1, 2, 3, 4
+ # max_steps = 60,
+
+ learning_rate = 5e-5,
+ embedding_learning_rate = 5e-6,
+
+ fp16 = not is_bfloat16_supported(),
+ bf16 = is_bfloat16_supported(),
+ logging_steps = 1,
+ optim = "adamw_8bit",
+ weight_decay = 0.00,
+ lr_scheduler_type = "cosine",
+ seed = 3407,
+ output_dir = "outputs",
+ report_to = "none", # Use this for WandB etc
+ ),
+ )
+
+ trainer_stats = trainer.train()
+
+ """
+ ### Saving, loading finetuned models
+ To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.
+
+ **[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!
+ """
+
+ model.save_pretrained("lora_model") # Local saving
+ tokenizer.save_pretrained("lora_model")
+ # model.push_to_hub("your_name/lora_model", token = "...") # Online saving
+ # tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving
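Aside: the docstring above mentions loading the finetuned model, but the script only shows saving. A minimal sketch of reloading the saved LoRA adapters for inference, reusing the variables from the training cell above, might look like the following; treat the exact call pattern as an assumption to verify against the Unsloth documentation.

```python
# Sketch (not in the package): reload the LoRA adapters saved above for inference.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "lora_model",       # local folder written by save_pretrained above
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model)  # enable Unsloth's faster inference mode

inputs = tokenizer(["Their passenger class is 3."], return_tensors = "pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens = 64)
print(tokenizer.batch_decode(outputs, skip_special_tokens = True))
```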
+
+ """
+ ### Ollama Support
+
+ [Unsloth](https://github.com/unslothai/unsloth) now allows you to automatically finetune and create a [Modelfile](https://github.com/ollama/ollama/blob/main/docs/modelfile.md), and export to [Ollama](https://ollama.com/)! This makes finetuning much easier and provides a seamless workflow from `Unsloth` to `Ollama`!
+
+ Let's first install `Ollama`!
+ """
+
+ # Save to 8bit Q8_0
+ if False: model.save_pretrained_gguf("model", tokenizer,)
+ # Remember to go to https://huggingface.co/settings/tokens for a token!
+ # And change hf to your username!
+ if False: model.push_to_hub_gguf("mlibre/model", tokenizer, token = "token")
+
+ # Save to 16bit GGUF
+ if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
+ if False: model.push_to_hub_gguf("mlibre/model", tokenizer, quantization_method = "f16", token = "token")
+
+ # Save to q4_k_m GGUF
+ if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
+ if False: model.push_to_hub_gguf("mlibre/model", tokenizer, quantization_method = "q4_k_m", token = "token")
+
+ # Save to multiple GGUF options - much faster if you want multiple!
+ if True:
+ model.push_to_hub_gguf(
+ "mlibre/model", # Change mlibre to your username!
+ tokenizer,
+ quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
+ token = "token", # Get a token at https://huggingface.co/settings/tokens
+ )
+
+ """We use `subprocess` to start `Ollama` up in a non blocking fashion! In your own desktop, you can simply open up a new `terminal` and type `ollama serve`, but in Colab, we have to use this hack!"""
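Aside: the note above describes starting `ollama serve` through `subprocess` so the cell does not block, but the corresponding code does not appear in this script. A minimal sketch of that hack, assuming the Ollama binary is already installed (the install command appears a few lines below), could be:

```python
# Sketch (not in the package): run `ollama serve` in the background so later
# steps (`ollama create`, the chat API call) can reach localhost:11434.
import subprocess, time

ollama_server = subprocess.Popen(["ollama", "serve"])  # non-blocking background process
time.sleep(5)  # crude wait for the server to start listening
```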
+
+ print(tokenizer._ollama_modelfile)
+
+ """We now will create an `Ollama` model called `unsloth_model` using the `Modelfile` which we auto generated!"""
+
+ !curl -fsSL https://ollama.com/install.sh | sh
+ !ollama create unsloth_model -f ./model/Modelfile
+
+ # In colab terminal type: ollama run unsloth_model
+ # in local ollama:
+ !curl http://localhost:11434/api/chat -d '{ \
+ "model": "unsloth_model", \
+ "messages": [ \
+ {"role": "user", \
+ "content": "Their passenger class is 3.\nTheir age is 22.0.\nThey paid $107.25 for the trip."} \
+ ] \
+ }'
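Aside: the curl call above targets Ollama's `/api/chat` endpoint. For reference, a hedged Python equivalent (using the `requests` library and `"stream": false` so the reply arrives as a single JSON object, per the public Ollama API) might look like:

```python
# Sketch (not in the package): the same /api/chat request issued from Python.
import requests

resp = requests.post(
    "http://localhost:11434/api/chat",
    json={
        "model": "unsloth_model",
        "messages": [
            {"role": "user",
             "content": "Their passenger class is 3.\nTheir age is 22.0.\nThey paid $107.25 for the trip."}
        ],
        "stream": False,  # return one JSON object instead of a stream of chunks
    },
)
print(resp.json()["message"]["content"])
```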
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "clean-web-scraper",
- "version": "4.1.3",
+ "version": "4.1.5",
  "main": "main.js",
  "scripts": {
  "start": "node main.js",