npcpy 1.2.22__py3-none-any.whl → 1.2.24__py3-none-any.whl

This diff shows the changes between publicly available package versions as released to a supported public registry. It is provided for informational purposes only.
npcpy/ft/rl.py CHANGED
@@ -1 +1,360 @@
- # tools for reinforcement learning
+ from dataclasses import dataclass
+
+ from datetime import datetime
+ import glob
+ import json
+ import os
+ import pandas as pd
+ try:
+     from datasets import Dataset
+
+     from peft import LoraConfig, PeftModel
+     import torch
+     from transformers import (
+         AutoModelForCausalLM,
+         AutoTokenizer
+     )
+     from trl import DPOTrainer, DPOConfig
+ except ImportError:
+     # Training dependencies are optional; fall back to None so the
+     # module stays importable without them.
+     Dataset = None
+     LoraConfig = None
+     PeftModel = None
+     DPOConfig = None
+     DPOTrainer = None
+     torch = None
+     AutoModelForCausalLM = None
+     AutoTokenizer = None
+
+
+ import random
+ from typing import List, Dict, Any, Optional, Callable
+ from npcpy.npc_compiler import NPC
+ from npcpy.llm_funcs import get_llm_response
+
+
+ @dataclass
+ class RLConfig:
+     base_model_name: str = "Qwen/Qwen3-0.6B"
+     adapter_path: str = "./rl_adapter"
+     max_iterations: int = 8
+     min_reward_gap: float = 0.4
+     num_train_epochs: int = 20
+     per_device_train_batch_size: int = 1
+     gradient_accumulation_steps: int = 2
+     learning_rate: float = 1e-6
+     beta: float = 0.5
+     max_length: int = 512
+     max_prompt_length: int = 256
+
+
+ class TaskExecutor:
+
+     def __init__(
+         self,
+         agent: NPC,
+         max_iterations: int = 8
+     ):
+         self.agent = agent
+         self.max_iterations = max_iterations
+
+     def execute_task(
+         self,
+         task_prompt: str
+     ) -> Dict[str, Any]:
+
+         messages = [
+             {
+                 "role": "system",
+                 "content": self.agent.primary_directive
+             }
+         ]
+
+         raw_responses = []
+         current_prompt = task_prompt
+
+         for i in range(self.max_iterations):
+             response_obj = self.agent.get_llm_response(
+                 current_prompt,
+                 messages=messages,
+                 auto_process_tool_calls=True
+             )
+
+             raw_responses.append(response_obj)
+             messages = response_obj.get('messages', messages)
+
+             last_content = messages[-1].get('content', '')
+
+             if self._is_complete(last_content):
+                 return {
+                     "raw_responses": raw_responses,
+                     "final_output": last_content,
+                     "total_iterations": i + 1,
+                     "completed": True
+                 }
+
+             current_prompt = (
+                 "Continue or provide final answer."
+             )
+
+         return {
+             "raw_responses": raw_responses,
+             "final_output": messages[-1].get('content', ''),
+             "total_iterations": self.max_iterations,
+             "completed": False
+         }
+
+     def _is_complete(self, content: str) -> bool:
+
+         completion_markers = [
+             "final answer:",
+             "conclusion:",
+             "result:",
+             "therefore",
+             "in summary"
+         ]
+         content_lower = content.lower()
+         return any(
+             marker in content_lower
+             for marker in completion_markers
+         )
+
+
+ def collect_traces(
+     tasks: List[Dict[str, Any]],
+     agents: List[NPC],
+     reward_fn: Callable[[Dict], float],
+     config: Optional[RLConfig] = None
+ ) -> List[Dict[str, Any]]:
+
+     if config is None:
+         config = RLConfig()
+
+     traces = []
+
+     for task in tasks:
+         task_prompt = task.get('prompt', task.get('input', ''))
+
+         for agent in agents:
+             executor = TaskExecutor(
+                 agent,
+                 max_iterations=config.max_iterations
+             )
+
+             result = executor.execute_task(task_prompt)
+
+             trace = {
+                 "agent_name": agent.name,
+                 "task_prompt": task_prompt,
+                 "final_output": result['final_output'],
+                 "total_iterations": result['total_iterations'],
+                 "completed": result['completed'],
+                 "task_metadata": task
+             }
+
+             trace['reward'] = reward_fn(trace)
+
+             traces.append(trace)
+
+             print(
+                 f"Agent {agent.name}: "
+                 f"Reward={trace['reward']:.2f}"
+             )
+
+     return traces
+
+
+ def create_preference_pairs(
+     traces: List[Dict[str, Any]],
+     min_reward_gap: float = 0.4
+ ) -> Dataset:
+
+     df = pd.DataFrame(traces)
+     df = df[df['reward'] > -1.0].copy()
+
+     if len(df) < 2:
+         return None
+
+     df = df.sort_values('reward', ascending=False)
+
+     top_quantile = df['reward'].quantile(
+         0.8,
+         interpolation='higher'
+     )
+     low_quantile = df['reward'].quantile(
+         0.2,
+         interpolation='lower'
+     )
+
+     high_traces = df[df['reward'] >= top_quantile]
+     low_traces = df[df['reward'] <= low_quantile]
+
+     pairs = []
+
+     for _, high_trace in high_traces.iterrows():
+         for _, low_trace in low_traces.iterrows():
+             reward_gap = (
+                 high_trace['reward'] - low_trace['reward']
+             )
+
+             if reward_gap >= min_reward_gap:
+                 pairs.append({
+                     "prompt": str(high_trace['task_prompt']),
+                     "chosen": str(high_trace['final_output']),
+                     "rejected": str(low_trace['final_output'])
+                 })
+
+     if len(pairs) < 5:
+         print(
+             f"Warning: Only {len(pairs)} pairs found. "
+             "May overfit."
+         )
+
+     return Dataset.from_list(pairs[:100])
+
+
+ def train_with_dpo(
+     traces: List[Dict[str, Any]],
+     config: Optional[RLConfig] = None
+ ) -> str:
+
+     if config is None:
+         config = RLConfig()
+
+     preference_dataset = create_preference_pairs(
+         traces,
+         min_reward_gap=config.min_reward_gap
+     )
+
+     if preference_dataset is None or len(preference_dataset) == 0:
+         print("No valid preference pairs. Cannot train.")
+         return None
+
+     model = AutoModelForCausalLM.from_pretrained(
+         config.base_model_name,
+         torch_dtype=torch.float32,
+         device_map="auto",
+         low_cpu_mem_usage=True
+     )
+
+     tokenizer = AutoTokenizer.from_pretrained(
+         config.base_model_name,
+         trust_remote_code=True
+     )
+
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     peft_config = LoraConfig(
+         r=8,
+         lora_alpha=16,
+         lora_dropout=0.1,
+         bias="none",
+         task_type="CAUSAL_LM",
+         target_modules=[
+             "q_proj",
+             "k_proj",
+             "v_proj",
+             "o_proj"
+         ]
+     )
+
+     training_args = DPOConfig(
+         output_dir="./dpo_results",
+         per_device_train_batch_size=(
+             config.per_device_train_batch_size
+         ),
+         gradient_accumulation_steps=(
+             config.gradient_accumulation_steps
+         ),
+         learning_rate=config.learning_rate,
+         num_train_epochs=config.num_train_epochs,
+         weight_decay=0.1,
+         beta=config.beta,
+         logging_steps=2,
+         save_steps=10,
+         remove_unused_columns=False,
+         max_length=config.max_length,
+         max_prompt_length=config.max_prompt_length,
+         dataloader_num_workers=0,
+         fp16=False,
+         bf16=False,
+         optim="adamw_torch",
+         warmup_steps=2,
+         save_strategy="steps",
+         save_total_limit=3
+     )
+
+     trainer = DPOTrainer(
+         model,
+         args=training_args,
+         train_dataset=preference_dataset,
+         peft_config=peft_config
+     )
+
+     print("Starting DPO training...")
+     trainer.train()
+
+     trainer.save_model(config.adapter_path)
+     print(f"Adapter saved to {config.adapter_path}")
+
+     return config.adapter_path
+
+
+ def run_rl_training(
+     tasks: List[Dict[str, Any]],
+     agents: List[NPC],
+     reward_fn: Callable[[Dict], float],
+     config: Optional[RLConfig] = None,
+     save_traces: bool = True
+ ) -> str:
+
+     if config is None:
+         config = RLConfig()
+
+     print(f"Collecting traces from {len(tasks)} tasks...")
+     traces = collect_traces(
+         tasks,
+         agents,
+         reward_fn,
+         config
+     )
+
+     if save_traces:
+         timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+         traces_file = f"rl_traces_{timestamp}.csv"
+         df = pd.DataFrame(traces)
+         df.to_csv(traces_file, index=False)
+         print(f"Traces saved to {traces_file}")
+
+     print("Training with DPO...")
+     adapter_path = train_with_dpo(traces, config)
+
+     return adapter_path
+
+
+ def load_rl_model(
+     base_model_id: str,
+     adapter_path: str
+ ):
+
+     print(f"Loading base model: {base_model_id}")
+     model = AutoModelForCausalLM.from_pretrained(
+         base_model_id,
+         torch_dtype=torch.float32,
+         device_map="auto",
+         attn_implementation='eager'
+     )
+
+     tokenizer = AutoTokenizer.from_pretrained(
+         base_model_id,
+         trust_remote_code=True
+     )
+
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     if adapter_path and os.path.exists(adapter_path):
+         print(f"Loading adapter: {adapter_path}")
+         model = PeftModel.from_pretrained(model, adapter_path)
+         model = model.merge_and_unload()
+
+     return model, tokenizer
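
The new rl.py module chains agent rollouts (TaskExecutor / collect_traces) into DPO preference training over high- versus low-reward traces. Below is a minimal usage sketch; the task list, the toy reward function, and the NPC constructor arguments are illustrative assumptions, not part of the package.

    from npcpy.npc_compiler import NPC
    from npcpy.ft.rl import RLConfig, run_rl_training, load_rl_model

    # Hypothetical tasks; each dict needs a 'prompt' (or 'input') key.
    tasks = [
        {"prompt": "Summarize the trade-offs between SQL and NoSQL stores."},
        {"prompt": "Explain what a LoRA adapter is in one paragraph."},
    ]

    # Assumed NPC construction; any NPC exposing .name, .primary_directive,
    # and .get_llm_response works with TaskExecutor.
    agents = [
        NPC(
            name="analyst",
            primary_directive="Answer concisely and end with 'Final answer:'.",
        ),
    ]

    def reward_fn(trace):
        # Toy reward: prefer completed runs that finish in few iterations.
        score = 1.0 if trace["completed"] else 0.0
        return score - 0.1 * max(0, trace["total_iterations"] - 2)

    config = RLConfig(base_model_name="Qwen/Qwen3-0.6B", adapter_path="./rl_adapter")
    adapter_path = run_rl_training(tasks, agents, reward_fn, config)

    # Merge the trained LoRA adapter into the base model for inference.
    if adapter_path:
        model, tokenizer = load_rl_model(config.base_model_name, adapter_path)

Note that run_rl_training only returns an adapter path when create_preference_pairs finds enough reward separation (at least min_reward_gap between chosen and rejected traces), so a reward function should spread its scores rather than return a constant.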
npcpy/ft/sft.py CHANGED
@@ -1 +1,230 @@
  # structured fine tuning of LLMs to produce structured output
+ from dataclasses import dataclass, field
+ import json
+ import numpy as np
+ import os
+ try:
+     from datasets import Dataset
+     import torch
+     from transformers import (
+         AutoModelForCausalLM,
+         AutoTokenizer,
+         TrainingArguments
+     )
+     from trl import SFTTrainer
+     from peft import LoraConfig
+ except ImportError:
+     # Training dependencies are optional; keep the module importable
+     # by falling back to None (datasets is now guarded here as well).
+     Dataset = None
+     torch = None
+     SFTTrainer = None
+     LoraConfig = None
+     AutoModelForCausalLM = None
+     AutoTokenizer = None
+     TrainingArguments = None
+
+ from typing import List, Dict, Any, Optional
+
+
+ @dataclass
+ class SFTConfig:
+     base_model_name: str = "google/gemma-3-270m-it"
+     output_model_path: str = "models/sft_model"
+     lora_r: int = 8
+     lora_alpha: int = 16
+     use_4bit: bool = False
+     fp16: bool = False
+     bf16: bool = False
+     lora_dropout: float = 0.15
+     lora_target_modules: List[str] = field(
+         default_factory=lambda: ["q_proj", "v_proj"]
+     )
+     num_train_epochs: int = 20
+     per_device_train_batch_size: int = 2
+     gradient_accumulation_steps: int = 4
+     learning_rate: float = 3e-5
+     logging_steps: int = 10
+     optim: str = "adamw_torch"
+     lr_scheduler_type: str = "cosine_with_restarts"
+     weight_decay: float = 0.01
+     max_length: int = 512
+     save_steps: int = 50
+
+
+ def format_training_examples(
+     inputs: List[str],
+     outputs: List[str],
+     format_style: str = "gemma"
+ ) -> List[Dict[str, str]]:
+
+     formatted = []
+
+     for inp, out in zip(inputs, outputs):
+         if format_style == "gemma":
+             text = (
+                 f"<start_of_turn>user\n{inp}<end_of_turn>\n"
+                 f"<start_of_turn>model\n{out}<end_of_turn>"
+             )
+         elif format_style == "llama":
+             text = (
+                 f"<|begin_of_text|><|start_header_id|>user"
+                 f"<|end_header_id|>\n\n{inp}<|eot_id|>"
+                 f"<|start_header_id|>assistant<|end_header_id|>"
+                 f"\n\n{out}<|eot_id|>"
+             )
+         else:
+             text = f"Input: {inp}\nOutput: {out}"
+
+         formatted.append({"text": text})
+
+     return formatted
+
+
+ def run_sft(
+     X: List[str],
+     y: List[str],
+     config: Optional[SFTConfig] = None,
+     validation_split: float = 0.0,
+     format_style: str = "gemma"
+ ) -> str:
+
+     if config is None:
+         config = SFTConfig()
+
+     if len(X) != len(y):
+         raise ValueError(
+             f"X and y must have same length: {len(X)} vs {len(y)}"
+         )
+
+     formatted_examples = format_training_examples(
+         X, y, format_style
+     )
+
+     if validation_split > 0:
+         split_idx = int(len(formatted_examples) * (1 - validation_split))
+         train_examples = formatted_examples[:split_idx]
+         val_examples = formatted_examples[split_idx:]
+         print(
+             f"Split: {len(train_examples)} train, "
+             f"{len(val_examples)} val"
+         )
+     else:
+         train_examples = formatted_examples
+         val_examples = []
+
+     dataset = Dataset.from_list(train_examples)
+
+     model = AutoModelForCausalLM.from_pretrained(
+         config.base_model_name,
+         trust_remote_code=True,
+         attn_implementation="eager"
+     )
+     model.config.use_cache = False
+
+     tokenizer = AutoTokenizer.from_pretrained(
+         config.base_model_name,
+         trust_remote_code=True
+     )
+     tokenizer.pad_token = tokenizer.eos_token
+     tokenizer.padding_side = "right"
+
+     peft_config = LoraConfig(
+         r=config.lora_r,
+         lora_alpha=config.lora_alpha,
+         lora_dropout=config.lora_dropout,
+         target_modules=config.lora_target_modules,
+         bias="none",
+         task_type="CAUSAL_LM"
+     )
+
+     training_args = TrainingArguments(
+         output_dir=config.output_model_path,
+         num_train_epochs=config.num_train_epochs,
+         per_device_train_batch_size=(
+             config.per_device_train_batch_size
+         ),
+         gradient_accumulation_steps=(
+             config.gradient_accumulation_steps
+         ),
+         optim=config.optim,
+         logging_steps=config.logging_steps,
+         learning_rate=config.learning_rate,
+         fp16=config.fp16,
+         bf16=config.bf16,
+         lr_scheduler_type=config.lr_scheduler_type,
+         group_by_length=True,
+         save_steps=config.save_steps,
+         weight_decay=config.weight_decay,
+     )
+
+     trainer = SFTTrainer(
+         model=model,
+         train_dataset=dataset,
+         peft_config=peft_config,
+         args=training_args,
+         max_seq_length=config.max_length
+     )
+
+     print(f"Training on {len(dataset)} examples")
+     trainer.train()
+
+     trainer.save_model(config.output_model_path)
+     print(f"Model saved to {config.output_model_path}")
+
+     return config.output_model_path
+
+
+ def load_sft_model(model_path: str):
+
+     model = AutoModelForCausalLM.from_pretrained(
+         model_path,
+         torch_dtype=torch.float32,
+         device_map="auto",
+         attn_implementation="eager"
+     )
+
+     tokenizer = AutoTokenizer.from_pretrained(
+         model_path,
+         trust_remote_code=True
+     )
+
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     return model, tokenizer
+
+
+ def predict_sft(
+     model,
+     tokenizer,
+     prompt: str,
+     max_new_tokens: int = 128,
+     temperature: float = 0.7
+ ) -> str:
+
+     device = next(model.parameters()).device
+
+     inputs = tokenizer(
+         prompt,
+         return_tensors="pt",
+         truncation=True,
+         max_length=512
+     )
+
+     input_ids = inputs.input_ids.to(device)
+     attention_mask = inputs.attention_mask.to(device)
+
+     with torch.no_grad():
+         outputs = model.generate(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             max_new_tokens=max_new_tokens,
+             temperature=temperature,
+             do_sample=temperature > 0,
+             pad_token_id=tokenizer.eos_token_id
+         )
+
+     response = tokenizer.decode(
+         outputs[0],
+         skip_special_tokens=True
+     )
+
+     return response
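
sft.py wraps TRL's SFTTrainer around instruction pairs formatted per model family (format_training_examples). A minimal usage sketch follows; the toy prompt/output strings and the output path are invented for illustration.

    from npcpy.ft.sft import SFTConfig, run_sft, load_sft_model, predict_sft

    # Toy supervised pairs (prompt -> desired structured output).
    X = [f"Extract the city as JSON: 'I flew to Paris on trip {i}.'" for i in range(8)]
    y = ['{"city": "Paris"}'] * 8

    config = SFTConfig(
        base_model_name="google/gemma-3-270m-it",
        output_model_path="models/sft_demo",
        num_train_epochs=1,
    )

    model_path = run_sft(X, y, config=config, format_style="gemma")

    model, tokenizer = load_sft_model(model_path)
    print(predict_sft(model, tokenizer, "Extract the city as JSON: 'We met in Tokyo.'"))

Note that validation_split only reports the split sizes; the held-out examples are not passed to the trainer, so any evaluation has to be run separately (for instance with predict_sft on the validation slice).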
npcpy/ft/usft.py ADDED
@@ -0,0 +1,128 @@
+ from dataclasses import dataclass, field
+ try:
+     from datasets import Dataset, load_dataset
+     import torch
+     from transformers import (
+         AutoModelForCausalLM,
+         AutoTokenizer,
+         TrainingArguments
+     )
+     from trl import SFTTrainer
+     from peft import LoraConfig
+ except ImportError:
+     # Training dependencies are optional; fall back to None so the
+     # module stays importable without them.
+     Dataset = None
+     load_dataset = None
+     torch = None
+     AutoModelForCausalLM = None
+     AutoTokenizer = None
+     TrainingArguments = None
+     SFTTrainer = None
+     LoraConfig = None
+
+ from typing import List, Optional
+
+
+ @dataclass
+ class USFTConfig:
+     base_model_name: str = "Qwen/Qwen3-0.6B"
+     output_model_path: str = "models/usft_model"
+     lora_r: int = 8
+     lora_alpha: int = 16
+     lora_dropout: float = 0.15
+     lora_target_modules: List[str] = field(
+         default_factory=lambda: ["q_proj", "v_proj"]
+     )
+     num_train_epochs: int = 3
+     per_device_train_batch_size: int = 4
+     gradient_accumulation_steps: int = 4
+     learning_rate: float = 2e-5
+     logging_steps: int = 10
+     optim: str = "adamw_torch"
+     lr_scheduler_type: str = "cosine"
+     weight_decay: float = 0.01
+     max_length: int = 512
+     save_steps: int = 100
+
+
+ def run_usft(
+     texts: List[str],
+     config: Optional[USFTConfig] = None
+ ) -> str:
+
+     if config is None:
+         config = USFTConfig()
+
+     dataset = Dataset.from_dict({"text": texts})
+
+     model = AutoModelForCausalLM.from_pretrained(
+         config.base_model_name,
+         trust_remote_code=True,
+         attn_implementation="eager"
+     )
+     model.config.use_cache = False
+
+     tokenizer = AutoTokenizer.from_pretrained(
+         config.base_model_name,
+         trust_remote_code=True
+     )
+
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     tokenizer.padding_side = "right"
+
+     peft_config = LoraConfig(
+         r=config.lora_r,
+         lora_alpha=config.lora_alpha,
+         lora_dropout=config.lora_dropout,
+         target_modules=config.lora_target_modules,
+         bias="none",
+         task_type="CAUSAL_LM"
+     )
+
+     training_args = TrainingArguments(
+         output_dir=config.output_model_path,
+         num_train_epochs=config.num_train_epochs,
+         per_device_train_batch_size=(
+             config.per_device_train_batch_size
+         ),
+         gradient_accumulation_steps=(
+             config.gradient_accumulation_steps
+         ),
+         optim=config.optim,
+         logging_steps=config.logging_steps,
+         learning_rate=config.learning_rate,
+         fp16=False,
+         bf16=torch.cuda.is_available(),
+         lr_scheduler_type=config.lr_scheduler_type,
+         save_steps=config.save_steps,
+         weight_decay=config.weight_decay,
+     )
+
+     trainer = SFTTrainer(
+         model=model,
+         train_dataset=dataset,
+         peft_config=peft_config,
+         args=training_args,
+         max_seq_length=config.max_length,
+         dataset_text_field="text"
+     )
+
+     print(f"Starting USFT on {len(dataset)} texts")
+     trainer.train()
+
+     trainer.save_model(config.output_model_path)
+     print(f"Model saved to {config.output_model_path}")
+
+     return config.output_model_path
+
+
+ def load_corpus_from_hf(dataset_name: str, split: str = "train"):
+
+     ds = load_dataset(dataset_name, split=split)
+
+     if "text" in ds.column_names:
+         return ds["text"]
+     elif "content" in ds.column_names:
+         return ds["content"]
+     else:
+         return [str(item) for item in ds]
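
usft.py adds plain (unsupervised) continued pretraining on raw text. A minimal usage sketch, assuming a Hugging Face corpus with a "text" column; the dataset name and output path are examples only.

    from npcpy.ft.usft import USFTConfig, run_usft, load_corpus_from_hf

    # Any dataset exposing a 'text' (or 'content') column works here.
    texts = load_corpus_from_hf("roneneldan/TinyStories", split="train")[:1000]

    config = USFTConfig(
        base_model_name="Qwen/Qwen3-0.6B",
        output_model_path="models/usft_demo",
        num_train_epochs=1,
    )

    run_usft(texts, config=config)

bf16 is enabled automatically when CUDA is available; on CPU the run falls back to full fp32.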