scout-ai 0.2.0 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. checksums.yaml +4 -4
  2. data/.vimproject +91 -10
  3. data/Rakefile +1 -0
  4. data/VERSION +1 -1
  5. data/bin/scout-ai +2 -0
  6. data/lib/scout/llm/agent/chat.rb +24 -0
  7. data/lib/scout/llm/agent.rb +13 -13
  8. data/lib/scout/llm/ask.rb +26 -16
  9. data/lib/scout/llm/backends/bedrock.rb +129 -0
  10. data/lib/scout/llm/backends/huggingface.rb +6 -21
  11. data/lib/scout/llm/backends/ollama.rb +69 -36
  12. data/lib/scout/llm/backends/openai.rb +85 -35
  13. data/lib/scout/llm/backends/openwebui.rb +1 -1
  14. data/lib/scout/llm/backends/relay.rb +3 -2
  15. data/lib/scout/llm/backends/responses.rb +272 -0
  16. data/lib/scout/llm/chat.rb +547 -0
  17. data/lib/scout/llm/parse.rb +70 -13
  18. data/lib/scout/llm/tools.rb +126 -5
  19. data/lib/scout/llm/utils.rb +17 -10
  20. data/lib/scout/model/base.rb +19 -0
  21. data/lib/scout/model/python/base.rb +25 -0
  22. data/lib/scout/model/python/huggingface/causal/next_token.rb +23 -0
  23. data/lib/scout/model/python/huggingface/causal.rb +29 -0
  24. data/lib/scout/model/python/huggingface/classification +0 -0
  25. data/lib/scout/model/python/huggingface/classification.rb +50 -0
  26. data/lib/scout/model/python/huggingface.rb +112 -0
  27. data/lib/scout/model/python/torch/dataloader.rb +57 -0
  28. data/lib/scout/model/python/torch/helpers.rb +84 -0
  29. data/lib/scout/model/python/torch/introspection.rb +34 -0
  30. data/lib/scout/model/python/torch/load_and_save.rb +47 -0
  31. data/lib/scout/model/python/torch.rb +94 -0
  32. data/lib/scout/model/util/run.rb +181 -0
  33. data/lib/scout/model/util/save.rb +81 -0
  34. data/lib/scout-ai.rb +3 -1
  35. data/python/scout_ai/__init__.py +35 -0
  36. data/python/scout_ai/__pycache__/__init__.cpython-310.pyc +0 -0
  37. data/python/scout_ai/__pycache__/__init__.cpython-311.pyc +0 -0
  38. data/python/scout_ai/__pycache__/huggingface.cpython-310.pyc +0 -0
  39. data/python/scout_ai/__pycache__/huggingface.cpython-311.pyc +0 -0
  40. data/python/scout_ai/__pycache__/util.cpython-310.pyc +0 -0
  41. data/python/scout_ai/__pycache__/util.cpython-311.pyc +0 -0
  42. data/python/scout_ai/atcold/__init__.py +0 -0
  43. data/python/scout_ai/atcold/plot_lib.py +141 -0
  44. data/python/scout_ai/atcold/spiral.py +27 -0
  45. data/python/scout_ai/huggingface/data.py +48 -0
  46. data/python/scout_ai/huggingface/eval.py +60 -0
  47. data/python/scout_ai/huggingface/model.py +29 -0
  48. data/python/scout_ai/huggingface/rlhf.py +83 -0
  49. data/python/scout_ai/huggingface/train/__init__.py +34 -0
  50. data/python/scout_ai/huggingface/train/__pycache__/__init__.cpython-310.pyc +0 -0
  51. data/python/scout_ai/huggingface/train/__pycache__/next_token.cpython-310.pyc +0 -0
  52. data/python/scout_ai/huggingface/train/next_token.py +315 -0
  53. data/python/scout_ai/language_model.py +70 -0
  54. data/python/scout_ai/util.py +32 -0
  55. data/scout-ai.gemspec +130 -0
  56. data/scout_commands/agent/ask +133 -15
  57. data/scout_commands/agent/kb +15 -0
  58. data/scout_commands/llm/ask +71 -12
  59. data/scout_commands/llm/process +4 -2
  60. data/test/data/cat.jpg +0 -0
  61. data/test/scout/llm/agent/test_chat.rb +14 -0
  62. data/test/scout/llm/backends/test_bedrock.rb +60 -0
  63. data/test/scout/llm/backends/test_huggingface.rb +3 -3
  64. data/test/scout/llm/backends/test_ollama.rb +48 -10
  65. data/test/scout/llm/backends/test_openai.rb +96 -11
  66. data/test/scout/llm/backends/test_responses.rb +115 -0
  67. data/test/scout/llm/test_ask.rb +1 -0
  68. data/test/scout/llm/test_chat.rb +214 -0
  69. data/test/scout/llm/test_parse.rb +81 -2
  70. data/test/scout/model/python/huggingface/causal/test_next_token.rb +59 -0
  71. data/test/scout/model/python/huggingface/test_causal.rb +33 -0
  72. data/test/scout/model/python/huggingface/test_classification.rb +30 -0
  73. data/test/scout/model/python/test_base.rb +44 -0
  74. data/test/scout/model/python/test_huggingface.rb +9 -0
  75. data/test/scout/model/python/test_torch.rb +71 -0
  76. data/test/scout/model/python/torch/test_helpers.rb +14 -0
  77. data/test/scout/model/test_base.rb +117 -0
  78. data/test/scout/model/util/test_save.rb +31 -0
  79. metadata +72 -5
  80. data/questions/coach +0 -2
data/python/scout_ai/huggingface/eval.py
@@ -0,0 +1,60 @@
+ def forward(model, features):
+     return model(**features)
+
+ def get_logits(predictions):
+     logits = predictions["logits"]
+     return [v.detach().cpu().numpy() for v in logits]
+
+ def eval_model(model, tokenizer, texts, return_logits=True):
+     features = tokenizer(texts, return_tensors='pt', truncation=True).to(model.device)
+     model.eval()
+     predictions = forward(model, features)
+     if return_logits:
+         return get_logits(predictions)
+     return predictions
+
+ def eval_causal_lm_chat(
+     model, tokenizer, messages,
+     chat_template=None,
+     chat_template_kwargs=None,
+     generation_kwargs=None
+ ):
+     """
+     Evaluate a CausalLM model given chat messages. Uses the tokenizer's chat template by default.
+
+     Args:
+         model: Huggingface CausalLM
+         tokenizer: Huggingface tokenizer
+         messages: List[Dict[str, str]] (OpenAI API style, 'role' and 'content')
+         chat_template: (Optional) Override string for the chat template.
+         chat_template_kwargs: (Optional) Dict, kwargs for apply_chat_template (like tokenize, add_generation_prompt, etc.).
+         generation_kwargs: (Optional) Dict for model.generate
+
+     Returns:
+         Generated text (or list, depending on settings).
+     """
+     chat_template_kwargs = chat_template_kwargs or {}
+     generation_kwargs = generation_kwargs or {}
+
+     # If the tokenizer supports chat templates (HF 4.34+)
+     if hasattr(tokenizer, "apply_chat_template"):
+         kwargs = dict(add_generation_prompt=True, tokenize=False)
+         kwargs.update(chat_template_kwargs)
+         if chat_template is not None:
+             # Override the tokenizer's chat template
+             tokenizer.chat_template = chat_template
+         prompt = tokenizer.apply_chat_template(messages, **kwargs)
+     else:
+         # Fallback: simple concatenation
+         prompt = "\n".join([msg['content'] for msg in messages])
+
+     # Tokenize as usual
+     inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
+     model.eval()
+     # Use generate
+     output_ids = model.generate(**inputs, **generation_kwargs)
+     # Decode only the newly generated tokens (not the prompt)
+     output_text = tokenizer.decode(
+         output_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True
+     )
+     return output_text
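The two entry points above can be exercised as follows. This is a minimal sketch; the checkpoint names (distilbert-base-uncased-finetuned-sst-2-english, TinyLlama/TinyLlama-1.1B-Chat-v1.0) are illustrative placeholders, not choices the package prescribes:

    from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer

    # eval_model: per-text logits from a classification head
    clf_name = "distilbert-base-uncased-finetuned-sst-2-english"
    clf_tokenizer = AutoTokenizer.from_pretrained(clf_name)
    clf_model = AutoModelForSequenceClassification.from_pretrained(clf_name)
    logits = eval_model(clf_model, clf_tokenizer, ["scout-ai adds a chat interface"])

    # eval_causal_lm_chat: chat-style generation through the tokenizer's chat template
    chat_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    chat_tokenizer = AutoTokenizer.from_pretrained(chat_name)
    chat_model = AutoModelForCausalLM.from_pretrained(chat_name)
    reply = eval_causal_lm_chat(
        chat_model, chat_tokenizer,
        [{"role": "user", "content": "Say hello in one short sentence."}],
        generation_kwargs={"max_new_tokens": 20},
    )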
data/python/scout_ai/huggingface/model.py
@@ -0,0 +1,29 @@
+ # huggingface_model.py
+ import importlib
+ from typing import Optional, Any
+
+ def import_module_class(module: str, class_name: str) -> Any:
+     """Dynamically import a class from a module."""
+     mod = importlib.import_module(module)
+     return getattr(mod, class_name)
+
+ def load_model(task: Optional[str], checkpoint: str, **kwargs) -> Any:
+     """Load a Huggingface model by task and checkpoint."""
+     if task is None or task.lower() == 'embedding':
+         model_class = import_module_class('transformers', 'AutoModel')
+     elif ":" in task:
+         module, class_name = task.split(":")
+         model_class = import_module_class(module, class_name)
+     else:
+         model_class = import_module_class('transformers', f'AutoModelFor{task}')
+     return model_class.from_pretrained(checkpoint, **kwargs)
+
+ def load_tokenizer(checkpoint: str, **kwargs) -> Any:
+     """Load a Huggingface tokenizer."""
+     tokenizer_class = import_module_class('transformers', 'AutoTokenizer')
+     return tokenizer_class.from_pretrained(checkpoint, **kwargs)
+
+ def load_model_and_tokenizer(task: Optional[str], checkpoint: str, **kwargs):
+     model = load_model(task, checkpoint, **kwargs)
+     tokenizer = load_tokenizer(checkpoint, **kwargs)
+     return model, tokenizer
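A brief sketch of how load_model_and_tokenizer resolves its task argument; checkpoint names are illustrative only:

    # Bare task names expand to transformers.AutoModelFor<task>
    model, tokenizer = load_model_and_tokenizer("SequenceClassification", "distilbert-base-uncased")

    # "module:Class" task strings import the class directly; None (or "embedding") falls back to AutoModel
    model, tokenizer = load_model_and_tokenizer("transformers:AutoModelForCausalLM", "distilgpt2")
    model, tokenizer = load_model_and_tokenizer(None, "distilbert-base-uncased")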
data/python/scout_ai/huggingface/rlhf.py
@@ -0,0 +1,83 @@
+ from trl import PPOTrainer, AutoModelForCausalLMWithValueHead, PPOConfig
+ import torch
+ import scout_ai
+
+ from copy import deepcopy
+ from datasets import Dataset
+
+
+ class PPOTrainerWithPrecomputedReward(PPOTrainer):
+     def get_rewards(self, **kwargs):
+         return torch.tensor(self.train_dataset['reward'], dtype=torch.float32)
+
+ def train_rlhf(path, tokenizer, pairs, rewards, config=None, generation_config=None):
+     """
+     pairs: List of tuples (messages, response)
+       - messages: List[Dict[str, str]] (OpenAI/chatML-style messages)
+       - response: string (the model output to be rewarded)
+     """
+     config = config or {}
+     device = scout_ai.device()
+
+     tokenizer.padding_side = "left"
+     tokenizer.pad_token = tokenizer.eos_token
+
+     prompts, responses = [], []
+     for pair in pairs:
+         messages, response = pair
+         # Ensure tokenizer supports chat templates (HF >= 4.34)
+         if hasattr(tokenizer, 'apply_chat_template'):
+             # Use default: add_generation_prompt is needed for LLMs like Llama, Mistral, etc.
+             prompt = tokenizer.apply_chat_template(
+                 messages, add_generation_prompt=True, tokenize=False
+             )
+         else:
+             # Fallback: join user/assistant messages
+             prompt = "\n".join(msg['content'] for msg in messages)
+         prompts.append(prompt)
+         responses.append(response)
+
+     train_dataset = Dataset.from_dict({'prompt': prompts, 'response': responses, 'reward': rewards})
+
+     # Wrap the model with a value head for PPO
+     model = AutoModelForCausalLMWithValueHead.from_pretrained(path)
+     model.to(device)
+
+     from transformers import GenerationConfig
+
+     generation_config = generation_config or GenerationConfig()
+
+     ppo_config = PPOConfig(
+         batch_size=config.get('batch_size', 4),
+         learning_rate=config.get('learning_rate', 1e-5),
+         mini_batch_size=config.get('mini_batch_size', 1),
+         gradient_accumulation_steps=1,
+     )
+
+     model.base_model_prefix = 'model'
+
+     ref_model = deepcopy(model)
+     ref_model.to(device)
+
+     model.generation_config = generation_config
+
+     ppo_trainer = PPOTrainerWithPrecomputedReward(
+         args=ppo_config,
+         model=model,
+         ref_model=ref_model,
+         reward_model=model,  # dummy
+         value_model=model,   # dummy
+         train_dataset=train_dataset,
+         processing_class=tokenizer,
+     )
+
+     stats = ppo_trainer.train()
+     model.save_pretrained(path)
+     return stats
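A hedged sketch of calling train_rlhf, assuming this file is importable as scout_ai.huggingface.rlhf and that a recent trl release providing the keyword-style PPOTrainer constructor is installed; the checkpoint, pairs, and rewards are placeholders:

    from transformers import AutoTokenizer
    from scout_ai.huggingface.rlhf import train_rlhf

    checkpoint = "distilgpt2"  # illustrative
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    pairs = [
        ([{"role": "user", "content": "Say hi"}], "Hello there!"),
        ([{"role": "user", "content": "Say hi"}], "Whatever."),
    ]
    rewards = [1.0, 0.1]
    stats = train_rlhf(checkpoint, tokenizer, pairs, rewards,
                       config={"batch_size": 2, "mini_batch_size": 1})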
data/python/scout_ai/huggingface/train/__init__.py
@@ -0,0 +1,34 @@
+ from transformers import TrainingArguments, Trainer
+ from typing import Any
+ from ..data import json_dataset, tsv_dataset, tokenize_dataset
+
+ def training_args(*args, **kwargs) -> TrainingArguments:
+     return TrainingArguments(*args, **kwargs)
+
+ def train_model(model: Any, tokenizer: Any, training_args: TrainingArguments, dataset: Any, class_weights=None, **kwargs):
+     # Ensure parameter tensors are contiguous before training/saving
+     for param in model.parameters():
+         param.data = param.data.contiguous()
+
+     if isinstance(dataset, str):
+         if dataset.endswith('.json'):
+             tokenized_dataset = json_dataset(tokenizer, dataset)
+         else:
+             tokenized_dataset = tsv_dataset(tokenizer, dataset)
+     else:
+         tokenized_dataset = tokenize_dataset(tokenizer, dataset)
+
+     if class_weights is not None:
+         import torch
+         from torch import nn
+
+         class WeightTrainer(Trainer):
+             def compute_loss(self, model, inputs, return_outputs=False):
+                 labels = inputs.get("labels")
+                 outputs = model(**inputs)
+                 logits = outputs.get('logits')
+                 loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(model.device))
+                 loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
+                 return (loss, outputs) if return_outputs else loss
+
+         trainer = WeightTrainer(model, training_args, train_dataset=tokenized_dataset["train"], tokenizer=tokenizer, **kwargs)
+     else:
+         trainer = Trainer(model, training_args, train_dataset=tokenized_dataset["train"], tokenizer=tokenizer, **kwargs)
+     trainer.train()
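A minimal sketch of driving train_model for weighted classification; the checkpoint and the training_data.tsv path are placeholders, and file loading goes through the json_dataset/tsv_dataset helpers from the sibling data.py:

    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    checkpoint = "distilbert-base-uncased"  # illustrative
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    args = training_args(output_dir="./clf_run", num_train_epochs=1, per_device_train_batch_size=8)
    # class_weights feeds the CrossEntropyLoss in WeightTrainer, up-weighting the rare label
    train_model(model, tokenizer, args, "training_data.tsv", class_weights=[1.0, 5.0])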
data/python/scout_ai/huggingface/train/next_token.py
@@ -0,0 +1,315 @@
+ import os
+ import math
+ import time
+ import shutil
+ import random
+ from dataclasses import dataclass
+ from typing import List, Optional, Dict, Any, Union
+
+ import torch
+ from torch.utils.data import DataLoader
+ from datasets import Dataset, load_dataset
+
+ from transformers import (
+     PreTrainedModel,
+     PreTrainedTokenizer,
+     get_scheduler,
+     DataCollatorForLanguageModeling
+ )
+ from torch.optim import AdamW
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+ def set_seed(seed: int):
+     random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)
+     try:
+         import numpy as np
+         np.random.seed(seed)
+     except ImportError:
+         pass
+
+ @dataclass
+ class TrainingState:
+     global_step: int = 0
+     best_eval_loss: float = float("inf")
+
+ def tokenize_function(examples, tokenizer, max_seq_length):
+     # examples: dict with key 'text' or single texts
+     # Always output input_ids and attention_mask
+     output = tokenizer(
+         examples["text"] if "text" in examples else examples,
+         truncation=True,
+         padding="max_length",
+         max_length=max_seq_length,
+         return_attention_mask=True,
+     )
+     output["labels"] = output["input_ids"].copy()
+     return output
+
+ def group_texts(examples, block_size):
+     # For paragraph-based datasets: simply return; for huge files, use this.
+     concatenated = {k: sum(examples[k], []) for k in examples.keys()}
+     total_length = len(concatenated[list(examples.keys())[0]])
+     # Drop the small remainder
+     total_length = (total_length // block_size) * block_size
+     result = {
+         k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+         for k, t in concatenated.items()
+     }
+     return result
+
+ def train_next_token(
+     model: PreTrainedModel,
+     tokenizer: PreTrainedTokenizer,
+     dataset: Union[List[str], Dataset],
+     *,
+     output_dir: str,
+     eval_dataset: Optional[Union[List[str], Dataset]] = None,
+     max_seq_length: int = 2048,
+     batch_size: int = 8,
+     gradient_accumulation_steps: int = 1,
+     num_train_epochs: int = 3,
+     learning_rate: float = 1e-4,
+     weight_decay: float = 0.01,
+     lr_scheduler_type: str = "linear",
+     warmup_steps: int = 0,
+     logging_steps: int = 50,
+     eval_steps: int = 200,
+     save_steps: int = 500,
+     save_total_limit: int = 3,
+     fp16: bool = False,
+     bf16: bool = False,
+     max_train_steps: Optional[int] = None,
+     seed: int = 42,
+     report_to: str = "none",  # or "wandb", "tensorboard"
+     use_lora: bool = False,
+     lora_config: Optional[dict] = None,
+     resume_from_checkpoint: Optional[str] = None,
+     callbacks: Optional[List] = None,
+     device_map: str = "auto",
+     dataloader_num_workers: int = 4,
+     group_by_length: bool = False,
+     description: str = "",
+ ):
+     """
+     Fine-tunes a causal LM for next-token prediction.
+     """
+     #assert isinstance(model, PreTrainedModel), "Model must be a HuggingFace PreTrainedModel"
+     #assert isinstance(tokenizer, PreTrainedTokenizer), "Tokenizer must be a HuggingFace PreTrainedTokenizer"
+     assert isinstance(dataset, (list, Dataset)), "Dataset must be a HuggingFace Dataset or a list of texts"
+
+     set_seed(seed)
+     os.makedirs(output_dir, exist_ok=True)
+
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     n_gpus = torch.cuda.device_count()
+
+     if resume_from_checkpoint:
+         logger.info(f"Loading checkpoint from {resume_from_checkpoint}")
+         model.load_state_dict(torch.load(os.path.join(resume_from_checkpoint, "pytorch_model.bin")))
+
+     model.to(device)
+
+     if fp16:
+         scaler = torch.cuda.amp.GradScaler()
+     else:
+         scaler = None
+
+     # 1. Prepare Dataset
+     if isinstance(dataset, list):
+         dataset = Dataset.from_dict({"text": dataset})
+
+     if eval_dataset is not None and isinstance(eval_dataset, list):
+         eval_dataset = Dataset.from_dict({"text": eval_dataset})
+
+     # Tokenization and formatting
+     def preprocess(examples):
+         return tokenize_function(examples, tokenizer, max_seq_length)
+
+     dataset = dataset.map(preprocess, batched=True, remove_columns=list(dataset.column_names))
+     if eval_dataset is not None:
+         eval_dataset = eval_dataset.map(preprocess, batched=True, remove_columns=list(eval_dataset.column_names))
+
+     # 2. Loader & Collator
+     data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
+
+     train_loader = DataLoader(
+         dataset,
+         batch_size=batch_size,
+         shuffle=True,
+         collate_fn=data_collator,
+         num_workers=dataloader_num_workers,
+         drop_last=True,
+     )
+     eval_loader = None
+     if eval_dataset is not None:
+         eval_loader = DataLoader(
+             eval_dataset,
+             batch_size=batch_size,
+             shuffle=False,
+             collate_fn=data_collator,
+             num_workers=dataloader_num_workers,
+         )
+
+     # 3. Optimizer & Scheduler
+     no_decay = ["bias", "LayerNorm.weight"]
+     grouped_params = [
+         {
+             "params": [
+                 p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)
+             ],
+             "weight_decay": weight_decay,
+         },
+         {
+             "params": [
+                 p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)
+             ],
+             "weight_decay": 0.0,
+         },
+     ]
+
+     optimizer = AdamW(grouped_params, lr=learning_rate)
+
+     total_train_steps = (
+         max_train_steps if max_train_steps is not None
+         else (len(train_loader) * num_train_epochs) // gradient_accumulation_steps
+     )
+
+     lr_scheduler = get_scheduler(
+         lr_scheduler_type,
+         optimizer=optimizer,
+         num_warmup_steps=warmup_steps,
+         num_training_steps=total_train_steps,
+     )
+
+     # 4. LoRA/PEFT Support (placeholder)
+     if use_lora:
+         logger.warning("PEFT/LoRA integration not yet implemented. Skipping.")
+
+     # 5. Checkpoint Management
+     saved_checkpoints = []
+
+     # 6. Training Loop
+     state = TrainingState()
+     model.train()
+     start_time = time.time()
+     for epoch in range(num_train_epochs):
+         logger.info(f"Epoch {epoch+1}/{num_train_epochs}")
+         for step, batch in enumerate(train_loader):
+             true_step = state.global_step + 1
+             batch = {k: v.to(device) for k, v in batch.items()}
+             with torch.cuda.amp.autocast(dtype=torch.float16 if fp16 else torch.bfloat16 if bf16 else torch.float32, enabled=(fp16 or bf16)):
+                 outputs = model(**batch)
+                 loss = outputs.loss
+                 loss = loss / gradient_accumulation_steps
+
+             if fp16:
+                 scaler.scale(loss).backward()
+             else:
+                 loss.backward()
+
+             if true_step % gradient_accumulation_steps == 0:
+                 if fp16:
+                     scaler.step(optimizer)
+                     scaler.update()
+                 else:
+                     optimizer.step()
+                 optimizer.zero_grad()
+                 lr_scheduler.step()
+
+             if true_step % logging_steps == 0:
+                 logger.info(f"Step {true_step}: loss {loss.item() * gradient_accumulation_steps:.4f}")
+
+             if eval_loader is not None and true_step % eval_steps == 0:
+                 eval_loss = evaluate(model, eval_loader, device, fp16, bf16)
+                 logger.info(f"Step {true_step}: eval_loss {eval_loss:.4f}, ppl {math.exp(eval_loss):.2f}")
+                 # Save best
+                 if eval_loss < state.best_eval_loss:
+                     state.best_eval_loss = eval_loss
+                     save_checkpoint(model, output_dir, "best")
+             if true_step % save_steps == 0:
+                 ckpt_dir = save_checkpoint(model, output_dir, f"step-{true_step}")
+                 saved_checkpoints.append(ckpt_dir)
+                 # Cleanup
+                 if len(saved_checkpoints) > save_total_limit:
+                     old = saved_checkpoints.pop(0)
+                     shutil.rmtree(old, ignore_errors=True)
+             state.global_step = true_step
+             if max_train_steps is not None and true_step >= max_train_steps:
+                 break
+         # End-of-epoch eval/save
+         if eval_loader is not None:
+             eval_loss = evaluate(model, eval_loader, device, fp16, bf16)
+             logger.info(f"Epoch {epoch+1} end: eval_loss {eval_loss:.4f}, ppl {math.exp(eval_loss):.2f}")
+             if eval_loss < state.best_eval_loss:
+                 state.best_eval_loss = eval_loss
+                 save_checkpoint(model, output_dir, "best")
+         save_checkpoint(model, output_dir, f"epoch-{epoch+1}")
+     logger.info(f"Training completed in {time.time() - start_time:.2f} sec on {device}")
+
+ def evaluate(model, eval_loader, device, fp16, bf16):
+     model.eval()
+     losses = []
+     for batch in eval_loader:
+         batch = {k: v.to(device) for k, v in batch.items()}
+         with torch.no_grad():
+             with torch.cuda.amp.autocast(dtype=torch.float16 if fp16 else torch.bfloat16 if bf16 else torch.float32, enabled=(fp16 or bf16)):
+                 outputs = model(**batch)
+         losses.append(outputs.loss.item())
+     model.train()
+     return sum(losses) / len(losses)
+
+ def save_checkpoint(model, output_dir, tag):
+     output_ckpt_dir = os.path.join(output_dir, tag)
+     os.makedirs(output_ckpt_dir, exist_ok=True)
+     model.save_pretrained(output_ckpt_dir)
+     return output_ckpt_dir
+
+ def main():
+     from transformers import AutoModelForCausalLM, AutoTokenizer
+
+     # Example tiny dataset: a few sentences
+     train_texts = [
+         "The quick brown fox jumps over the lazy dog.",
+         "Artificial intelligence is the future.",
+         "Llama models are great for language tasks.",
+         "Open source is important for research.",
+     ]
+     eval_texts = [
+         "Transformers enable powerful NLP models.",
+         "Fine-tuning improves performance."
+     ]
+
+     #model_name = "openlm-research/open_llama_3b"  # Replace with your local/other HF Llama checkpoint as needed
+     model_name = "distilgpt2"  # Replace with your local/other HF checkpoint as needed
+
+     tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+     # Make sure the tokenizer has a pad token (Llama-style models do not define one by default)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     model = AutoModelForCausalLM.from_pretrained(model_name)
+
+     train_next_token(
+         model=model,
+         tokenizer=tokenizer,
+         dataset=train_texts,
+         output_dir="./output_test",
+         eval_dataset=eval_texts,
+         max_seq_length=32,
+         batch_size=2,
+         num_train_epochs=1,
+         gradient_accumulation_steps=1,
+         learning_rate=5e-5,
+         fp16=False,  # Change to True if running on a GPU with enough VRAM
+         bf16=False,
+         logging_steps=1,
+         eval_steps=2,
+         save_steps=10
+     )
+
+ if __name__ == "__main__":
+     main()
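Beyond the built-in main() demo, train_next_token also accepts a datasets.Dataset with a "text" column. A hedged sketch, where corpus.txt is a placeholder path and distilgpt2 an illustrative checkpoint:

    from datasets import load_dataset
    from transformers import AutoModelForCausalLM, AutoTokenizer

    name = "distilgpt2"
    tokenizer = AutoTokenizer.from_pretrained(name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(name)

    # One training example per line of corpus.txt
    corpus = load_dataset("text", data_files={"train": "corpus.txt"})["train"]
    train_next_token(
        model=model, tokenizer=tokenizer, dataset=corpus,
        output_dir="./next_token_run",
        max_seq_length=128, batch_size=4, num_train_epochs=1,
    )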
data/python/scout_ai/language_model.py
@@ -0,0 +1,70 @@
+ import collections
+ import numpy as np
+
+ def group_texts(examples):
+     # Note: reads the module-level `block_size` setting.
+     # Concatenate all texts.
+     concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+     total_length = len(concatenated_examples[list(examples.keys())[0]])
+     # We drop the small remainder; we could add padding if the model supported it
+     # instead of this drop. Customize this part to your needs.
+     total_length = (total_length // block_size) * block_size
+     # Split by chunks of max_len.
+     result = {
+         k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+         for k, t in concatenated_examples.items()
+     }
+     result["labels"] = result["input_ids"].copy()
+     return result
+
+ def whole_word_masking_data_collator(features):
+     # Note: reads the module-level `wwm_probability` and `tokenizer` settings.
+     from transformers import default_data_collator
+     for feature in features:
+         word_ids = feature.pop("word_ids")
+
+         # Create a map between words and corresponding token indices
+         mapping = collections.defaultdict(list)
+         current_word_index = -1
+         current_word = None
+         for idx, word_id in enumerate(word_ids):
+             if word_id is not None:
+                 if word_id != current_word:
+                     current_word = word_id
+                     current_word_index += 1
+                 mapping[current_word_index].append(idx)
+
+         # Randomly mask words
+         mask = np.random.binomial(1, wwm_probability, (len(mapping),))
+         input_ids = feature["input_ids"]
+         labels = feature["labels"]
+         new_labels = [-100] * len(labels)
+         for word_id in np.where(mask)[0]:
+             word_id = word_id.item()
+             for idx in mapping[word_id]:
+                 new_labels[idx] = labels[idx]
+                 input_ids[idx] = tokenizer.mask_token_id
+         feature["labels"] = new_labels
+
+     return default_data_collator(features)
+
+ # Demo block, disabled via the "__main__2" guard
+ if __name__ == "__main__2":
+
+     from transformers import AutoModelForMaskedLM
+     from transformers import AutoTokenizer
+     import torch
+
+     model_checkpoint = "distilbert-base-uncased"
+     model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
+     tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+
+     text = "This is a great [MASK]."
+
+     inputs = tokenizer(text, return_tensors="pt")
+     token_logits = model(**inputs).logits
+     # Find the location of [MASK] and extract its logits
+     mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+     mask_token_logits = token_logits[0, mask_token_index, :]
+     # Pick the [MASK] candidates with the highest logits
+     top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
+
+     for token in top_5_tokens:
+         print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")
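group_texts and whole_word_masking_data_collator follow the Hugging Face masked-LM course recipe and read their settings (block_size, wwm_probability, tokenizer) from module globals. A hedged sketch of chunking a tokenized corpus, assuming the file is importable as scout_ai.language_model and that tokenized is a datasets.Dataset holding only input_ids/attention_mask lists:

    import scout_ai.language_model as lm

    lm.block_size = 128  # chunk length read by group_texts
    chunked = tokenized.map(lm.group_texts, batched=True)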
data/python/scout_ai/util.py
@@ -0,0 +1,32 @@
+ import random
+ import torch
+ import numpy
+
+ def set_seed(seed):
+     """
+     Set the seed in several backends
+     """
+     random.seed(seed)
+     numpy.random.seed(seed)
+     torch.manual_seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed(seed)
+         torch.cuda.manual_seed_all(seed)
+
+ def deterministic():
+     """
+     Ensure that all operations are deterministic on GPU (if used) for
+     reproducibility
+     """
+     torch.backends.cudnn.deterministic = True
+     torch.backends.cudnn.benchmark = False
+
+ def device():
+     return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
+
+ def data_directory():
+     from pathlib import Path
+     return Path.home()
+
+ def model_device(model):
+     return next(model.parameters()).device
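A short sketch of these helpers in use; it assumes they are re-exported at the package level, as the scout_ai.device() call in rlhf.py above suggests:

    import torch
    import scout_ai

    scout_ai.set_seed(42)
    scout_ai.deterministic()
    model = torch.nn.Linear(4, 2).to(scout_ai.device())
    print(scout_ai.model_device(model))  # cuda:0 when a GPU is available, otherwise cpu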