scout-ai 0.2.0 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.vimproject +91 -10
- data/Rakefile +1 -0
- data/VERSION +1 -1
- data/bin/scout-ai +2 -0
- data/lib/scout/llm/agent/chat.rb +24 -0
- data/lib/scout/llm/agent.rb +13 -13
- data/lib/scout/llm/ask.rb +26 -16
- data/lib/scout/llm/backends/bedrock.rb +129 -0
- data/lib/scout/llm/backends/huggingface.rb +6 -21
- data/lib/scout/llm/backends/ollama.rb +69 -36
- data/lib/scout/llm/backends/openai.rb +85 -35
- data/lib/scout/llm/backends/openwebui.rb +1 -1
- data/lib/scout/llm/backends/relay.rb +3 -2
- data/lib/scout/llm/backends/responses.rb +272 -0
- data/lib/scout/llm/chat.rb +547 -0
- data/lib/scout/llm/parse.rb +70 -13
- data/lib/scout/llm/tools.rb +126 -5
- data/lib/scout/llm/utils.rb +17 -10
- data/lib/scout/model/base.rb +19 -0
- data/lib/scout/model/python/base.rb +25 -0
- data/lib/scout/model/python/huggingface/causal/next_token.rb +23 -0
- data/lib/scout/model/python/huggingface/causal.rb +29 -0
- data/lib/scout/model/python/huggingface/classification +0 -0
- data/lib/scout/model/python/huggingface/classification.rb +50 -0
- data/lib/scout/model/python/huggingface.rb +112 -0
- data/lib/scout/model/python/torch/dataloader.rb +57 -0
- data/lib/scout/model/python/torch/helpers.rb +84 -0
- data/lib/scout/model/python/torch/introspection.rb +34 -0
- data/lib/scout/model/python/torch/load_and_save.rb +47 -0
- data/lib/scout/model/python/torch.rb +94 -0
- data/lib/scout/model/util/run.rb +181 -0
- data/lib/scout/model/util/save.rb +81 -0
- data/lib/scout-ai.rb +3 -1
- data/python/scout_ai/__init__.py +35 -0
- data/python/scout_ai/__pycache__/__init__.cpython-310.pyc +0 -0
- data/python/scout_ai/__pycache__/__init__.cpython-311.pyc +0 -0
- data/python/scout_ai/__pycache__/huggingface.cpython-310.pyc +0 -0
- data/python/scout_ai/__pycache__/huggingface.cpython-311.pyc +0 -0
- data/python/scout_ai/__pycache__/util.cpython-310.pyc +0 -0
- data/python/scout_ai/__pycache__/util.cpython-311.pyc +0 -0
- data/python/scout_ai/atcold/__init__.py +0 -0
- data/python/scout_ai/atcold/plot_lib.py +141 -0
- data/python/scout_ai/atcold/spiral.py +27 -0
- data/python/scout_ai/huggingface/data.py +48 -0
- data/python/scout_ai/huggingface/eval.py +60 -0
- data/python/scout_ai/huggingface/model.py +29 -0
- data/python/scout_ai/huggingface/rlhf.py +83 -0
- data/python/scout_ai/huggingface/train/__init__.py +34 -0
- data/python/scout_ai/huggingface/train/__pycache__/__init__.cpython-310.pyc +0 -0
- data/python/scout_ai/huggingface/train/__pycache__/next_token.cpython-310.pyc +0 -0
- data/python/scout_ai/huggingface/train/next_token.py +315 -0
- data/python/scout_ai/language_model.py +70 -0
- data/python/scout_ai/util.py +32 -0
- data/scout-ai.gemspec +130 -0
- data/scout_commands/agent/ask +133 -15
- data/scout_commands/agent/kb +15 -0
- data/scout_commands/llm/ask +71 -12
- data/scout_commands/llm/process +4 -2
- data/test/data/cat.jpg +0 -0
- data/test/scout/llm/agent/test_chat.rb +14 -0
- data/test/scout/llm/backends/test_bedrock.rb +60 -0
- data/test/scout/llm/backends/test_huggingface.rb +3 -3
- data/test/scout/llm/backends/test_ollama.rb +48 -10
- data/test/scout/llm/backends/test_openai.rb +96 -11
- data/test/scout/llm/backends/test_responses.rb +115 -0
- data/test/scout/llm/test_ask.rb +1 -0
- data/test/scout/llm/test_chat.rb +214 -0
- data/test/scout/llm/test_parse.rb +81 -2
- data/test/scout/model/python/huggingface/causal/test_next_token.rb +59 -0
- data/test/scout/model/python/huggingface/test_causal.rb +33 -0
- data/test/scout/model/python/huggingface/test_classification.rb +30 -0
- data/test/scout/model/python/test_base.rb +44 -0
- data/test/scout/model/python/test_huggingface.rb +9 -0
- data/test/scout/model/python/test_torch.rb +71 -0
- data/test/scout/model/python/torch/test_helpers.rb +14 -0
- data/test/scout/model/test_base.rb +117 -0
- data/test/scout/model/util/test_save.rb +31 -0
- metadata +72 -5
- data/questions/coach +0 -2
@@ -0,0 +1,60 @@
+def forward(model, features):
+    return model(**features)
+
+def get_logits(predictions):
+    logits = predictions["logits"]
+    return [v.detach().cpu().numpy() for v in logits]
+
+def eval_model(model, tokenizer, texts, return_logits=True):
+    features = tokenizer(texts, return_tensors='pt', truncation=True).to(model.device)
+    model.eval()
+    predictions = forward(model, features)
+    if return_logits:
+        return get_logits(predictions)
+    return predictions
+
+def eval_causal_lm_chat(
+    model, tokenizer, messages,
+    chat_template=None,
+    chat_template_kwargs=None,
+    generation_kwargs=None
+):
+    """
+    Evaluate a CausalLM model given chat messages. Uses the tokenizer's chat template by default.
+
+    Args:
+        model: Huggingface CausalLM
+        tokenizer: Huggingface tokenizer
+        messages: List[Dict[str, str]] (OpenAI API style, 'role' and 'content')
+        chat_template: (Optional) Override string for the chat template.
+        chat_template_kwargs: (Optional) Dict, kwargs for apply_chat_template (like tokenize, add_generation_prompt, etc).
+        generation_kwargs: (Optional) Dict for model.generate
+
+    Returns:
+        Generated text (or list, depending on settings).
+    """
+    chat_template_kwargs = chat_template_kwargs or {}
+    generation_kwargs = generation_kwargs or {}
+
+    # If the tokenizer has a chat template (HF 4.34+)
+    if hasattr(tokenizer, "apply_chat_template"):
+        kwargs = dict(add_generation_prompt=True, tokenize=False)
+        kwargs.update(chat_template_kwargs)
+        if chat_template is not None:
+            # Override the template (may require tokenizer._chat_template)
+            tokenizer._chat_template = chat_template
+        prompt = tokenizer.apply_chat_template(messages, **kwargs)
+    else:
+        # Fallback: simple concatenation
+        prompt = "\n".join([msg['content'] for msg in messages])
+
+    # Tokenize as usual
+    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
+    model.eval()
+    # Use generate
+    output_ids = model.generate(**inputs, **generation_kwargs)
+    # Decode only the newly generated tokens (not the prompt)
+    output_text = tokenizer.decode(
+        output_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True
+    )
+    return output_text
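
The 60-line hunk above matches data/python/scout_ai/huggingface/eval.py in the file list. For orientation, a minimal usage sketch of eval_causal_lm_chat, assuming the package's python/ directory is importable as scout_ai; the checkpoint name and messages below are placeholders, not values from the package:

# Hedged usage sketch: checkpoint and messages are placeholders, not package defaults.
from transformers import AutoModelForCausalLM, AutoTokenizer
from scout_ai.huggingface.eval import eval_causal_lm_chat  # assumed import path

checkpoint = "Qwen/Qwen2.5-0.5B-Instruct"  # assumption: any chat-tuned causal LM with a chat template
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Say hello in one short sentence."},
]

# generation_kwargs are forwarded to model.generate; only the newly generated tokens are decoded.
text = eval_causal_lm_chat(
    model, tokenizer, messages,
    generation_kwargs={"max_new_tokens": 20, "do_sample": False},
)
print(text)
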
@@ -0,0 +1,29 @@
+# huggingface_model.py
+import importlib
+from typing import Optional, Any
+
+def import_module_class(module: str, class_name: str) -> Any:
+    """Dynamically import a class from a module."""
+    mod = importlib.import_module(module)
+    return getattr(mod, class_name)
+
+def load_model(task: Optional[str], checkpoint: str, **kwargs) -> Any:
+    """Load a Huggingface model by task and checkpoint"""
+    if task is None or task.lower() == 'embedding':
+        model_class = import_module_class('transformers', 'AutoModel')
+    elif ":" in task:
+        module, class_name = task.split(":")
+        model_class = import_module_class(module, class_name)
+    else:
+        model_class = import_module_class('transformers', f'AutoModelFor{task}')
+    return model_class.from_pretrained(checkpoint, **kwargs)
+
+def load_tokenizer(checkpoint: str, **kwargs) -> Any:
+    """Load a Huggingface tokenizer"""
+    tokenizer_class = import_module_class('transformers', 'AutoTokenizer')
+    return tokenizer_class.from_pretrained(checkpoint, **kwargs)
+
+def load_model_and_tokenizer(task: Optional[str], checkpoint: str, **kwargs):
+    model = load_model(task, checkpoint, **kwargs)
+    tokenizer = load_tokenizer(checkpoint, **kwargs)
+    return model, tokenizer
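
This 29-line hunk matches data/python/scout_ai/huggingface/model.py. A short sketch of how the task argument selects the model class; the checkpoint names are illustrative placeholders only:

# Hedged usage sketch for load_model_and_tokenizer; checkpoints are placeholders.
from scout_ai.huggingface.model import load_model_and_tokenizer  # assumed import path

# task=None (or 'embedding') resolves to transformers.AutoModel
model, tokenizer = load_model_and_tokenizer(None, "bert-base-uncased")

# A bare task name resolves to transformers.AutoModelFor<task>
model, tokenizer = load_model_and_tokenizer("SequenceClassification", "distilbert-base-uncased")

# A "module:Class" task imports that class directly
model, tokenizer = load_model_and_tokenizer("transformers:AutoModelForCausalLM", "distilgpt2")
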
@@ -0,0 +1,83 @@
+from trl import PPOTrainer, AutoModelForCausalLMWithValueHead, PPOConfig
+import torch
+import scout_ai
+
+from copy import deepcopy
+from datasets import Dataset
+
+
+class PPOTrainerWithPrecomputedReward(PPOTrainer):
+    def get_rewards(self, **kwargs):
+        return torch.tensor(self.train_dataset['reward'], dtype=torch.float32)
+
+def train_rlhf(path, tokenizer, pairs, rewards, config=None, generation_config=None):
+    """
+    pairs: List of tuples (messages, response)
+      - messages: List[Dict[str, str]] (OpenAI/chatML-style messages)
+      - response: string (the model output to be rewarded)
+    """
+    config = config or {}
+    device = scout_ai.device()
+    device = 'cuda'
+
+    tokenizer.padding_side = "left"
+    tokenizer.pad_token = tokenizer.eos_token
+
+    prompts, responses = [], []
+    for pair in pairs:
+        messages, response = pair
+        # Ensure tokenizer supports chat template (HF >= 4.34)
+        if hasattr(tokenizer, 'apply_chat_template'):
+            # Use default: add_generation_prompt needed for LLMs like Llama, Mistral, etc.
+            prompt = tokenizer.apply_chat_template(
+                messages, add_generation_prompt=True, tokenize=False
+            )
+        else:
+            # Fallback: join user/assistant messages
+            prompt = "\n".join(msg['content'] for msg in messages)
+        prompts.append(prompt)
+        responses.append(response)
+
+    train_dataset = Dataset.from_dict({'prompt': prompts, 'response': responses, 'reward': rewards})
+
+    # Wrap model with Value Head for PPO
+    from trl import PPOTrainer, AutoModelForCausalLMWithValueHead, PPOConfig
+    model = AutoModelForCausalLMWithValueHead.from_pretrained(path)
+    model.to(device)
+
+    from transformers import GenerationConfig
+
+    generation_config = GenerationConfig()
+
+    ppo_config = PPOConfig(
+        batch_size=config.get('batch_size', 4),
+        learning_rate=config.get('learning_rate', 1e-5),
+        mini_batch_size=config.get('mini_batch_size', 1),
+        gradient_accumulation_steps=1,
+    )
+
+    model.base_model_prefix = 'model'
+
+    ref_model = deepcopy(model)
+    ref_model.to(device)
+
+    model.generation_config = generation_config
+
+    print(model)
+    print(ref_model)
+
+    ppo_trainer = PPOTrainerWithPrecomputedReward(
+        args=ppo_config,
+        model=model,
+        ref_model=ref_model,
+        reward_model=model,  # dummy
+        value_model=model,   # dummy
+        train_dataset=train_dataset,
+        processing_class=None,
+    )
+
+
+    print("Step")
+    stats = ppo_trainer.train(prompts, responses, rewards)
+    model.save
+    return stats
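
The 83-line hunk above matches data/python/scout_ai/huggingface/rlhf.py. A call-shape sketch following the train_rlhf docstring; the checkpoint, pairs, and rewards are placeholders, and the trl version this code targets is not pinned here:

# Hedged call-shape sketch for train_rlhf; all values below are placeholders.
from transformers import AutoTokenizer
from scout_ai.huggingface.rlhf import train_rlhf  # assumed import path

checkpoint = "distilgpt2"  # assumption: any small causal LM checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# One (messages, response) pair per example, plus a precomputed scalar reward for each.
pairs = [
    ([{"role": "user", "content": "Name a color."}], "Blue."),
    ([{"role": "user", "content": "Name an animal."}], "A cat."),
]
rewards = [1.0, 0.5]

stats = train_rlhf(
    checkpoint, tokenizer, pairs, rewards,
    config={"batch_size": 2, "mini_batch_size": 1},
)
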
@@ -0,0 +1,34 @@
+from transformers import TrainingArguments, Trainer
+from typing import Any
+from ..data import json_dataset, tsv_dataset, tokenize_dataset
+
+def training_args(*args, **kwargs) -> TrainingArguments:
+    return TrainingArguments(*args, **kwargs)
+
+def train_model(model: Any, tokenizer: Any, training_args: TrainingArguments, dataset: Any, class_weights=None, **kwargs):
+    for param in model.parameters():
+        param.data = param.data.contiguous()
+
+    if isinstance(dataset, str):
+        if dataset.endswith('.json'):
+            tokenized_dataset = json_dataset(tokenizer, dataset)
+        else:
+            tokenized_dataset = tsv_dataset(tokenizer, dataset)
+    else:
+        tokenized_dataset = tokenize_dataset(tokenizer, dataset)
+
+    if class_weights is not None:
+        import torch
+        from torch import nn
+        class WeightTrainer(Trainer):
+            def compute_loss(self, model, inputs, return_outputs=False):
+                labels = inputs.get("labels")
+                outputs = model(**inputs)
+                logits = outputs.get('logits')
+                loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(class_weights).to(model.device))
+                loss = loss_fct(logits.view(-1, model.config.num_labels), labels.view(-1))
+                return (loss, outputs) if return_outputs else loss
+        trainer = WeightTrainer(model, training_args, train_dataset=tokenized_dataset["train"], tokenizer=tokenizer, **kwargs)
+    else:
+        trainer = Trainer(model, training_args, train_dataset=tokenized_dataset["train"], tokenizer=tokenizer, **kwargs)
+    trainer.train()
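
This 34-line hunk matches data/python/scout_ai/huggingface/train/__init__.py. A usage sketch under the assumption that the TSV path points at a labelled text dataset that the package's tsv_dataset helper can tokenize; the checkpoint, path, and class weights are placeholders:

# Hedged usage sketch for training_args/train_model; paths and weights are placeholders.
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from scout_ai.huggingface.train import training_args, train_model  # assumed import path

checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

args = training_args(
    output_dir="./clf_output",
    num_train_epochs=1,
    per_device_train_batch_size=8,
)

# class_weights switches to the weighted-loss WeightTrainer defined in the hunk above.
train_model(model, tokenizer, args, "training_data.tsv", class_weights=[1.0, 2.0])
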
@@ -0,0 +1,315 @@
+import os
+import math
+import time
+import shutil
+import random
+from dataclasses import dataclass
+from typing import List, Optional, Dict, Any, Union
+
+import torch
+from torch.utils.data import DataLoader
+from datasets import Dataset, load_dataset
+
+from transformers import (
+    PreTrainedModel,
+    PreTrainedTokenizer,
+    get_scheduler,
+    DataCollatorForLanguageModeling
+)
+from torch.optim import AdamW
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+def set_seed(seed: int):
+    random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    try:
+        import numpy as np
+        np.random.seed(seed)
+    except ImportError:
+        pass
+
+@dataclass
+class TrainingState:
+    global_step: int = 0
+    best_eval_loss: float = float("inf")
+
+def tokenize_function(examples, tokenizer, max_seq_length):
+    # examples: dict with key 'text' or single texts
+    # Always output input_ids and attention_mask
+    output = tokenizer(
+        examples["text"] if "text" in examples else examples,
+        truncation=True,
+        padding="max_length",
+        max_length=max_seq_length,
+        return_attention_mask=True,
+    )
+    output["labels"] = output["input_ids"].copy()
+    return output
+
+def group_texts(examples, block_size):
+    # For paragraph-based datasets: simply return; for huge files, use this.
+    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
+    total_length = len(concatenated[list(examples.keys())[0]])
+    # Drop the small remainder
+    total_length = (total_length // block_size) * block_size
+    result = {
+        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+        for k, t in concatenated.items()
+    }
+    return result
+
+def train_next_token(
+    model: PreTrainedModel,
+    tokenizer: PreTrainedTokenizer,
+    dataset: Union[List[str], Dataset],
+    *,
+    output_dir: str,
+    eval_dataset: Optional[Union[List[str], Dataset]] = None,
+    max_seq_length: int = 2048,
+    batch_size: int = 8,
+    gradient_accumulation_steps: int = 1,
+    num_train_epochs: int = 3,
+    learning_rate: float = 1e-4,
+    weight_decay: float = 0.01,
+    lr_scheduler_type: str = "linear",
+    warmup_steps: int = 0,
+    logging_steps: int = 50,
+    eval_steps: int = 200,
+    save_steps: int = 500,
+    save_total_limit: int = 3,
+    fp16: bool = False,
+    bf16: bool = False,
+    max_train_steps: int = None,
+    seed: int = 42,
+    report_to: str = "none",  # or "wandb", "tensorboard"
+    use_lora: bool = False,
+    lora_config: Optional[dict] = None,
+    resume_from_checkpoint: str = None,
+    callbacks: Optional[List] = None,
+    device_map: str = "auto",
+    dataloader_num_workers: int = 4,
+    group_by_length: bool = False,
+    description: str = "",
+):
+    """
+    Fine-tunes a causal LM for next-token prediction.
+    """
+    #assert isinstance(model, PreTrainedModel), "Model must be a HuggingFace PreTrainedModel"
+    #assert isinstance(tokenizer, PreTrainedTokenizer), "Tokenizer must be a HuggingFace PreTrainedTokenizer"
+    assert isinstance(dataset, (list, Dataset)), "Dataset must be a HuggingFace Dataset or a list of texts"
+
+    set_seed(seed)
+    os.makedirs(output_dir, exist_ok=True)
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    n_gpus = torch.cuda.device_count()
+
+    if resume_from_checkpoint:
+        logger.info(f"Loading checkpoint from {resume_from_checkpoint}")
+        model.load_state_dict(torch.load(os.path.join(resume_from_checkpoint, "pytorch_model.bin")))
+
+    model.to(device)
+
+    if fp16:
+        scaler = torch.cuda.amp.GradScaler()
+    else:
+        scaler = None
+
+    # 1. Prepare Dataset
+    if isinstance(dataset, list):
+        dataset = Dataset.from_dict({"text": dataset})
+
+    if eval_dataset is not None and isinstance(eval_dataset, list):
+        eval_dataset = Dataset.from_dict({"text": eval_dataset})
+
+    # Tokenization and formatting
+    def preprocess(examples):
+        return tokenize_function(examples, tokenizer, max_seq_length)
+
+    dataset = dataset.map(preprocess, batched=True, remove_columns=list(dataset.column_names))
+    if eval_dataset is not None:
+        eval_dataset = eval_dataset.map(preprocess, batched=True, remove_columns=list(eval_dataset.column_names))
+
+    # 2. Loader & Collator
+    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
+
+    train_loader = DataLoader(
+        dataset,
+        batch_size=batch_size,
+        shuffle=True,
+        collate_fn=data_collator,
+        num_workers=dataloader_num_workers,
+        drop_last=True,
+    )
+    eval_loader = None
+    if eval_dataset is not None:
+        eval_loader = DataLoader(
+            eval_dataset,
+            batch_size=batch_size,
+            shuffle=False,
+            collate_fn=data_collator,
+            num_workers=dataloader_num_workers,
+        )
+
+    # 3. Optimizer & Scheduler
+    no_decay = ["bias", "LayerNorm.weight"]
+    grouped_params = [
+        {
+            "params": [
+                p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)
+            ],
+            "weight_decay": weight_decay,
+        },
+        {
+            "params": [
+                p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)
+            ],
+            "weight_decay": 0.0,
+        },
+    ]
+
+    optimizer = AdamW(grouped_params, lr=learning_rate)
+
+    total_train_steps = (
+        max_train_steps if max_train_steps is not None
+        else (len(train_loader) * num_train_epochs) // gradient_accumulation_steps
+    )
+
+    lr_scheduler = get_scheduler(
+        lr_scheduler_type,
+        optimizer=optimizer,
+        num_warmup_steps=warmup_steps,
+        num_training_steps=total_train_steps,
+    )
+
+    # 4. LoRA/PEFT Support (placeholder)
+    if use_lora:
+        logger.warning("PEFT/LoRA integration not yet implemented. Skipping.")
+
+    # 5. Checkpoint Management
+    saved_checkpoints = []
+
+    # 6. Training Loop
+    state = TrainingState()
+    model.train()
+    start_time = time.time()
+    for epoch in range(num_train_epochs):
+        logger.info(f"Epoch {epoch+1}/{num_train_epochs}")
+        for step, batch in enumerate(train_loader):
+            true_step = state.global_step + 1
+            batch = {k: v.to(device) for k, v in batch.items()}
+            with torch.cuda.amp.autocast(dtype=torch.float16 if fp16 else torch.bfloat16 if bf16 else torch.float32, enabled=(fp16 or bf16)):
+                outputs = model(**batch)
+                loss = outputs.loss
+                loss = loss / gradient_accumulation_steps
+
+            if fp16:
+                scaler.scale(loss).backward()
+            else:
+                loss.backward()
+
+            if true_step % gradient_accumulation_steps == 0:
+                if fp16:
+                    scaler.step(optimizer)
+                    scaler.update()
+                else:
+                    optimizer.step()
+                optimizer.zero_grad()
+                lr_scheduler.step()
+
+            if true_step % logging_steps == 0:
+                logger.info(f"Step {true_step}: loss {loss.item() * gradient_accumulation_steps:.4f}")
+
+            if eval_loader is not None and true_step % eval_steps == 0:
+                eval_loss = evaluate(model, eval_loader, device, fp16, bf16)
+                logger.info(f"Step {true_step}: eval_loss {eval_loss:.4f}, ppl {math.exp(eval_loss):.2f}")
+                # Save best
+                if eval_loss < state.best_eval_loss:
+                    state.best_eval_loss = eval_loss
+                    save_checkpoint(model, output_dir, "best")
+            if true_step % save_steps == 0:
+                ckpt_dir = save_checkpoint(model, output_dir, f"step-{true_step}")
+                saved_checkpoints.append(ckpt_dir)
+                # Cleanup
+                if len(saved_checkpoints) > save_total_limit:
+                    old = saved_checkpoints.pop(0)
+                    shutil.rmtree(old, ignore_errors=True)
+            state.global_step = true_step
+            if max_train_steps is not None and true_step >= max_train_steps:
+                break
+        # End-of-epoch eval/save
+        if eval_loader is not None:
+            eval_loss = evaluate(model, eval_loader, device, fp16, bf16)
+            logger.info(f"Epoch {epoch+1} end: eval_loss {eval_loss:.4f}, ppl {math.exp(eval_loss):.2f}")
+            if eval_loss < state.best_eval_loss:
+                state.best_eval_loss = eval_loss
+                save_checkpoint(model, output_dir, "best")
+        save_checkpoint(model, output_dir, f"epoch-{epoch+1}")
+    logger.info(f"Training completed in {time.time() - start_time:.2f} sec on {device}")
+
+def evaluate(model, eval_loader, device, fp16, bf16):
+    model.eval()
+    losses = []
+    for batch in eval_loader:
+        batch = {k: v.to(device) for k, v in batch.items()}
+        with torch.no_grad():
+            with torch.cuda.amp.autocast(dtype=torch.float16 if fp16 else torch.bfloat16 if bf16 else torch.float32, enabled=(fp16 or bf16)):
+                outputs = model(**batch)
+        losses.append(outputs.loss.item())
+    model.train()
+    return sum(losses) / len(losses)
+
+def save_checkpoint(model, output_dir, tag):
+    output_ckpt_dir = os.path.join(output_dir, tag)
+    os.makedirs(output_ckpt_dir, exist_ok=True)
+    model.save_pretrained(output_ckpt_dir)
+    return output_ckpt_dir
+
+def main():
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    # Example tiny dataset: few sentences
+    train_texts = [
+        "The quick brown fox jumps over the lazy dog.",
+        "Artificial intelligence is the future.",
+        "Llama models are great for language tasks.",
+        "Open source is important for research.",
+    ]
+    eval_texts = [
+        "Transformers enable powerful NLP models.",
+        "Fine-tuning improves performance."
+    ]
+
+    #model_name = "openlm-research/open_llama_3b"  # Replace with your local/other HF Llama checkpoint as needed
+    model_name = "distilgpt2"  # Replace with your local/other HF Llama checkpoint as needed
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+    # Make sure tokenizer pads on right for causal LMs (Llama does not have pad by default)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    model = AutoModelForCausalLM.from_pretrained(model_name)
+
+    train_next_token(
+        model=model,
+        tokenizer=tokenizer,
+        dataset=train_texts,
+        output_dir="./output_test",
+        eval_dataset=eval_texts,
+        max_seq_length=32,
+        batch_size=2,
+        num_train_epochs=1,
+        gradient_accumulation_steps=1,
+        learning_rate=5e-5,
+        fp16=False,  # Change to True if running on GPU with enough VRAM
+        bf16=False,
+        logging_steps=1,
+        eval_steps=2,
+        save_steps=10
+    )
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,70 @@
+def group_texts(examples):
+    # Concatenate all texts.
+    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+    total_length = len(concatenated_examples[list(examples.keys())[0]])
+    # We drop the small remainder; we could add padding if the model supported it instead of this drop. You can
+    # customize this part to your needs.
+    total_length = (total_length // block_size) * block_size
+    # Split by chunks of max_len.
+    result = {
+        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+        for k, t in concatenated_examples.items()
+    }
+    result["labels"] = result["input_ids"].copy()
+    return result
+
+def whole_word_masking_data_collator(features):
+    from transformers import default_data_collator
+    for feature in features:
+        word_ids = feature.pop("word_ids")
+
+        # Create a map between words and corresponding token indices
+        mapping = collections.defaultdict(list)
+        current_word_index = -1
+        current_word = None
+        for idx, word_id in enumerate(word_ids):
+            if word_id is not None:
+                if word_id != current_word:
+                    current_word = word_id
+                    current_word_index += 1
+                mapping[current_word_index].append(idx)
+
+        # Randomly mask words
+        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
+        input_ids = feature["input_ids"]
+        labels = feature["labels"]
+        new_labels = [-100] * len(labels)
+        for word_id in np.where(mask)[0]:
+            word_id = word_id.item()
+            for idx in mapping[word_id]:
+                new_labels[idx] = labels[idx]
+                input_ids[idx] = tokenizer.mask_token_id
+        feature["labels"] = new_labels
+
+    return default_data_collator(features)
+
+if __name__ == "__main__2":
+
+    from transformers import AutoModelForMaskedLM
+    from transformers import AutoTokenizer
+    import torch
+
+    model_checkpoint = "distilbert-base-uncased"
+    model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
+    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+
+    text = "This is a great [MASK]."
+
+    inputs = tokenizer(text, return_tensors="pt")
+    token_logits = model(**inputs).logits
+    # Find the location of [MASK] and extract its logits
+    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+    mask_token_logits = token_logits[0, mask_token_index, :]
+    # Pick the [MASK] candidates with the highest logits
+    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
+
+    for token in top_5_tokens:
+        print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")
+
+
+
@@ -0,0 +1,32 @@
+import random
+import torch
+import numpy
+
+def set_seed(seed):
+    """
+    Set seed in several backends
+    """
+    random.seed(seed)
+    numpy.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
+
+def deterministic():
+    """
+    Ensure that all operations are deterministic on GPU (if used) for
+    reproducibility
+    """
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+def device():
+    return torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
+
+def data_directory():
+    from pathlib import Path
+    print(Path.home())
+
+def model_device(model):
+    return next(model.parameters()).device
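
This final 32-line hunk matches data/python/scout_ai/util.py. A brief reproducibility sketch, assuming the module is importable as scout_ai.util; the Linear layer is only a stand-in model:

# Hedged sketch of the reproducibility helpers above; the model here is a placeholder.
import torch
from scout_ai.util import set_seed, deterministic, device, model_device  # assumed import path

set_seed(42)      # seeds python, numpy and torch (including CUDA when available)
deterministic()   # forces deterministic cuDNN kernels

dev = device()    # cuda:0 if available, otherwise cpu
model = torch.nn.Linear(4, 2).to(dev)
print(model_device(model))  # reports where the model's parameters live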