npcpy 1.2.22__py3-none-any.whl → 1.2.24__py3-none-any.whl

This diff shows the changes between publicly available package versions as released to a supported public registry. It is provided for informational purposes only.
npcpy/ft/rl.py CHANGED
@@ -1 +1,360 @@
- # tools for reinforcement learning
+ from dataclasses import dataclass
+
+ from datetime import datetime
+ import glob
+ import json
+ import os
+ import pandas as pd
+ try:
+     from datasets import Dataset
+
+     from peft import LoraConfig, PeftModel
+     import torch
+     from transformers import (
+         AutoModelForCausalLM,
+         AutoTokenizer
+     )
+     from trl import DPOTrainer, DPOConfig
+ except ImportError:
+     # Training dependencies are optional; fall back to None so the
+     # module stays importable without them.
+     Dataset = None
+     LoraConfig = None
+     PeftModel = None
+     DPOConfig = None
+     DPOTrainer = None
+     torch = None
+     AutoModelForCausalLM = None
+     AutoTokenizer = None
+
+
+ import random
+ from typing import List, Dict, Any, Optional, Callable
+ from npcpy.npc_compiler import NPC
+ from npcpy.llm_funcs import get_llm_response
+
+
+ @dataclass
+ class RLConfig:
+     base_model_name: str = "Qwen/Qwen3-0.6B"
+     adapter_path: str = "./rl_adapter"
+     max_iterations: int = 8
+     min_reward_gap: float = 0.4
+     num_train_epochs: int = 20
+     per_device_train_batch_size: int = 1
+     gradient_accumulation_steps: int = 2
+     learning_rate: float = 1e-6
+     beta: float = 0.5
+     max_length: int = 512
+     max_prompt_length: int = 256
+
+
+ class TaskExecutor:
+
+     def __init__(
+         self,
+         agent: NPC,
+         max_iterations: int = 8
+     ):
+         self.agent = agent
+         self.max_iterations = max_iterations
+
+     def execute_task(
+         self,
+         task_prompt: str
+     ) -> Dict[str, Any]:
+
+         messages = [
+             {
+                 "role": "system",
+                 "content": self.agent.primary_directive
+             }
+         ]
+
+         raw_responses = []
+         current_prompt = task_prompt
+
+         for i in range(self.max_iterations):
+             response_obj = self.agent.get_llm_response(
+                 current_prompt,
+                 messages=messages,
+                 auto_process_tool_calls=True
+             )
+
+             raw_responses.append(response_obj)
+             messages = response_obj.get('messages', messages)
+
+             last_content = messages[-1].get('content', '')
+
+             if self._is_complete(last_content):
+                 return {
+                     "raw_responses": raw_responses,
+                     "final_output": last_content,
+                     "total_iterations": i + 1,
+                     "completed": True
+                 }
+
+             current_prompt = (
+                 "Continue or provide final answer."
+             )
+
+         return {
+             "raw_responses": raw_responses,
+             "final_output": messages[-1].get('content', ''),
+             "total_iterations": self.max_iterations,
+             "completed": False
+         }
+
+     def _is_complete(self, content: str) -> bool:
+
+         completion_markers = [
+             "final answer:",
+             "conclusion:",
+             "result:",
+             "therefore",
+             "in summary"
+         ]
+         content_lower = content.lower()
+         return any(
+             marker in content_lower
+             for marker in completion_markers
+         )
+
+
+ def collect_traces(
+     tasks: List[Dict[str, Any]],
+     agents: List[NPC],
+     reward_fn: Callable[[Dict], float],
+     config: Optional[RLConfig] = None
+ ) -> List[Dict[str, Any]]:
+
+     if config is None:
+         config = RLConfig()
+
+     traces = []
+
+     for task in tasks:
+         task_prompt = task.get('prompt', task.get('input', ''))
+
+         for agent in agents:
+             executor = TaskExecutor(
+                 agent,
+                 max_iterations=config.max_iterations
+             )
+
+             result = executor.execute_task(task_prompt)
+
+             trace = {
+                 "agent_name": agent.name,
+                 "task_prompt": task_prompt,
+                 "final_output": result['final_output'],
+                 "total_iterations": result['total_iterations'],
+                 "completed": result['completed'],
+                 "task_metadata": task
+             }
+
+             trace['reward'] = reward_fn(trace)
+
+             traces.append(trace)
+
+             print(
+                 f"Agent {agent.name}: "
+                 f"Reward={trace['reward']:.2f}"
+             )
+
+     return traces
+
+
+ def create_preference_pairs(
+     traces: List[Dict[str, Any]],
+     min_reward_gap: float = 0.4
+ ) -> Dataset:
+
+     df = pd.DataFrame(traces)
+     df = df[df['reward'] > -1.0].copy()
+
+     if len(df) < 2:
+         return None
+
+     df = df.sort_values('reward', ascending=False)
+
+     top_quantile = df['reward'].quantile(
+         0.8,
+         interpolation='higher'
+     )
+     low_quantile = df['reward'].quantile(
+         0.2,
+         interpolation='lower'
+     )
+
+     high_traces = df[df['reward'] >= top_quantile]
+     low_traces = df[df['reward'] <= low_quantile]
+
+     pairs = []
+
+     for _, high_trace in high_traces.iterrows():
+         for _, low_trace in low_traces.iterrows():
+             reward_gap = (
+                 high_trace['reward'] - low_trace['reward']
+             )
+
+             if reward_gap >= min_reward_gap:
+                 pairs.append({
+                     "prompt": str(high_trace['task_prompt']),
+                     "chosen": str(high_trace['final_output']),
+                     "rejected": str(low_trace['final_output'])
+                 })
+
+     if len(pairs) < 5:
+         print(
+             f"Warning: Only {len(pairs)} pairs found. "
+             "May overfit."
+         )
+
+     return Dataset.from_list(pairs[:100])
+
+
+ def train_with_dpo(
+     traces: List[Dict[str, Any]],
+     config: Optional[RLConfig] = None
+ ) -> str:
+
+     if config is None:
+         config = RLConfig()
+
+     preference_dataset = create_preference_pairs(
+         traces,
+         min_reward_gap=config.min_reward_gap
+     )
+
+     if preference_dataset is None or len(preference_dataset) == 0:
+         print("No valid preference pairs. Cannot train.")
+         return None
+
+     model = AutoModelForCausalLM.from_pretrained(
+         config.base_model_name,
+         torch_dtype=torch.float32,
+         device_map="auto",
+         low_cpu_mem_usage=True
+     )
+
+     tokenizer = AutoTokenizer.from_pretrained(
+         config.base_model_name,
+         trust_remote_code=True
+     )
+
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     peft_config = LoraConfig(
+         r=8,
+         lora_alpha=16,
+         lora_dropout=0.1,
+         bias="none",
+         task_type="CAUSAL_LM",
+         target_modules=[
+             "q_proj",
+             "k_proj",
+             "v_proj",
+             "o_proj"
+         ]
+     )
+
+     training_args = DPOConfig(
+         output_dir="./dpo_results",
+         per_device_train_batch_size=(
+             config.per_device_train_batch_size
+         ),
+         gradient_accumulation_steps=(
+             config.gradient_accumulation_steps
+         ),
+         learning_rate=config.learning_rate,
+         num_train_epochs=config.num_train_epochs,
+         weight_decay=0.1,
+         beta=config.beta,
+         logging_steps=2,
+         save_steps=10,
+         remove_unused_columns=False,
+         max_length=config.max_length,
+         max_prompt_length=config.max_prompt_length,
+         dataloader_num_workers=0,
+         fp16=False,
+         bf16=False,
+         optim="adamw_torch",
+         warmup_steps=2,
+         save_strategy="steps",
+         save_total_limit=3
+     )
+
+     trainer = DPOTrainer(
+         model,
+         args=training_args,
+         train_dataset=preference_dataset,
+         peft_config=peft_config
+     )
+
+     print("Starting DPO training...")
+     trainer.train()
+
+     trainer.save_model(config.adapter_path)
+     print(f"Adapter saved to {config.adapter_path}")
+
+     return config.adapter_path
+
+
+ def run_rl_training(
+     tasks: List[Dict[str, Any]],
+     agents: List[NPC],
+     reward_fn: Callable[[Dict], float],
+     config: Optional[RLConfig] = None,
+     save_traces: bool = True
+ ) -> str:
+
+     if config is None:
+         config = RLConfig()
+
+     print(f"Collecting traces from {len(tasks)} tasks...")
+     traces = collect_traces(
+         tasks,
+         agents,
+         reward_fn,
+         config
+     )
+
+     if save_traces:
+         timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
+         traces_file = f"rl_traces_{timestamp}.csv"
+         df = pd.DataFrame(traces)
+         df.to_csv(traces_file, index=False)
+         print(f"Traces saved to {traces_file}")
+
+     print("Training with DPO...")
+     adapter_path = train_with_dpo(traces, config)
+
+     return adapter_path
+
+
+ def load_rl_model(
+     base_model_id: str,
+     adapter_path: str
+ ):
+
+     print(f"Loading base model: {base_model_id}")
+     model = AutoModelForCausalLM.from_pretrained(
+         base_model_id,
+         torch_dtype=torch.float32,
+         device_map="auto",
+         attn_implementation='eager'
+     )
+
+     tokenizer = AutoTokenizer.from_pretrained(
+         base_model_id,
+         trust_remote_code=True
+     )
+
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     if adapter_path and os.path.exists(adapter_path):
+         print(f"Loading adapter: {adapter_path}")
+         model = PeftModel.from_pretrained(model, adapter_path)
+         model = model.merge_and_unload()
+
+     return model, tokenizer
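
The new rl.py module chains agent rollouts (TaskExecutor / collect_traces) into DPO preference training over high- versus low-reward traces. Below is a minimal usage sketch; the task list, the toy reward function, and the NPC constructor arguments are illustrative assumptions, not part of the package.

    from npcpy.npc_compiler import NPC
    from npcpy.ft.rl import RLConfig, run_rl_training, load_rl_model

    # Hypothetical tasks; each dict needs a 'prompt' (or 'input') key.
    tasks = [
        {"prompt": "Summarize the trade-offs between SQL and NoSQL stores."},
        {"prompt": "Explain what a LoRA adapter is in one paragraph."},
    ]

    # Assumed NPC construction; any NPC exposing .name, .primary_directive,
    # and .get_llm_response works with TaskExecutor.
    agents = [
        NPC(
            name="analyst",
            primary_directive="Answer concisely and end with 'Final answer:'.",
        ),
    ]

    def reward_fn(trace):
        # Toy reward: prefer completed runs that finish in few iterations.
        score = 1.0 if trace["completed"] else 0.0
        return score - 0.1 * max(0, trace["total_iterations"] - 2)

    config = RLConfig(base_model_name="Qwen/Qwen3-0.6B", adapter_path="./rl_adapter")
    adapter_path = run_rl_training(tasks, agents, reward_fn, config)

    # Merge the trained LoRA adapter into the base model for inference.
    if adapter_path:
        model, tokenizer = load_rl_model(config.base_model_name, adapter_path)

Note that run_rl_training only returns an adapter path when create_preference_pairs finds enough reward separation (at least min_reward_gap between chosen and rejected traces), so a reward function should spread its scores rather than return a constant.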
npcpy/ft/sft.py CHANGED
@@ -1 +1,230 @@
  # structured fine tuning of LLMs to produce structured output
+ from dataclasses import dataclass, field
+ import json
+ import numpy as np
+ import os
+ try:
+     from datasets import Dataset
+     import torch
+     from transformers import (
+         AutoModelForCausalLM,
+         AutoTokenizer,
+         TrainingArguments
+     )
+     from trl import SFTTrainer
+     from peft import LoraConfig
+ except ImportError:
+     # Training dependencies are optional; keep the module importable
+     # by falling back to None (datasets is now guarded here as well).
+     Dataset = None
+     torch = None
+     SFTTrainer = None
+     LoraConfig = None
+     AutoModelForCausalLM = None
+     AutoTokenizer = None
+     TrainingArguments = None
+
+ from typing import List, Dict, Any, Optional
+
+
+ @dataclass
+ class SFTConfig:
+     base_model_name: str = "google/gemma-3-270m-it"
+     output_model_path: str = "models/sft_model"
+     lora_r: int = 8
+     lora_alpha: int = 16
+     use_4bit: bool = False
+     fp16: bool = False
+     bf16: bool = False
+     lora_dropout: float = 0.15
+     lora_target_modules: List[str] = field(
+         default_factory=lambda: ["q_proj", "v_proj"]
+     )
+     num_train_epochs: int = 20
+     per_device_train_batch_size: int = 2
+     gradient_accumulation_steps: int = 4
+     learning_rate: float = 3e-5
+     logging_steps: int = 10
+     optim: str = "adamw_torch"
+     lr_scheduler_type: str = "cosine_with_restarts"
+     weight_decay: float = 0.01
+     max_length: int = 512
+     save_steps: int = 50
+
+
+ def format_training_examples(
+     inputs: List[str],
+     outputs: List[str],
+     format_style: str = "gemma"
+ ) -> List[Dict[str, str]]:
+
+     formatted = []
+
+     for inp, out in zip(inputs, outputs):
+         if format_style == "gemma":
+             text = (
+                 f"<start_of_turn>user\n{inp}<end_of_turn>\n"
+                 f"<start_of_turn>model\n{out}<end_of_turn>"
+             )
+         elif format_style == "llama":
+             text = (
+                 f"<|begin_of_text|><|start_header_id|>user"
+                 f"<|end_header_id|>\n\n{inp}<|eot_id|>"
+                 f"<|start_header_id|>assistant<|end_header_id|>"
+                 f"\n\n{out}<|eot_id|>"
+             )
+         else:
+             text = f"Input: {inp}\nOutput: {out}"
+
+         formatted.append({"text": text})
+
+     return formatted
+
+
+ def run_sft(
+     X: List[str],
+     y: List[str],
+     config: Optional[SFTConfig] = None,
+     validation_split: float = 0.0,
+     format_style: str = "gemma"
+ ) -> str:
+
+     if config is None:
+         config = SFTConfig()
+
+     if len(X) != len(y):
+         raise ValueError(
+             f"X and y must have same length: {len(X)} vs {len(y)}"
+         )
+
+     formatted_examples = format_training_examples(
+         X, y, format_style
+     )
+
+     if validation_split > 0:
+         split_idx = int(len(formatted_examples) * (1 - validation_split))
+         train_examples = formatted_examples[:split_idx]
+         val_examples = formatted_examples[split_idx:]
+         print(
+             f"Split: {len(train_examples)} train, "
+             f"{len(val_examples)} val"
+         )
+     else:
+         train_examples = formatted_examples
+         val_examples = []
+
+     dataset = Dataset.from_list(train_examples)
+
+     model = AutoModelForCausalLM.from_pretrained(
+         config.base_model_name,
+         trust_remote_code=True,
+         attn_implementation="eager"
+     )
+     model.config.use_cache = False
+
+     tokenizer = AutoTokenizer.from_pretrained(
+         config.base_model_name,
+         trust_remote_code=True
+     )
+     tokenizer.pad_token = tokenizer.eos_token
+     tokenizer.padding_side = "right"
+
+     peft_config = LoraConfig(
+         r=config.lora_r,
+         lora_alpha=config.lora_alpha,
+         lora_dropout=config.lora_dropout,
+         target_modules=config.lora_target_modules,
+         bias="none",
+         task_type="CAUSAL_LM"
+     )
+
+     training_args = TrainingArguments(
+         output_dir=config.output_model_path,
+         num_train_epochs=config.num_train_epochs,
+         per_device_train_batch_size=(
+             config.per_device_train_batch_size
+         ),
+         gradient_accumulation_steps=(
+             config.gradient_accumulation_steps
+         ),
+         optim=config.optim,
+         logging_steps=config.logging_steps,
+         learning_rate=config.learning_rate,
+         fp16=config.fp16,
+         bf16=config.bf16,
+         lr_scheduler_type=config.lr_scheduler_type,
+         group_by_length=True,
+         save_steps=config.save_steps,
+         weight_decay=config.weight_decay,
+     )
+
+     trainer = SFTTrainer(
+         model=model,
+         train_dataset=dataset,
+         peft_config=peft_config,
+         args=training_args,
+         max_seq_length=config.max_length
+     )
+
+     print(f"Training on {len(dataset)} examples")
+     trainer.train()
+
+     trainer.save_model(config.output_model_path)
+     print(f"Model saved to {config.output_model_path}")
+
+     return config.output_model_path
+
+
+ def load_sft_model(model_path: str):
+
+     model = AutoModelForCausalLM.from_pretrained(
+         model_path,
+         torch_dtype=torch.float32,
+         device_map="auto",
+         attn_implementation="eager"
+     )
+
+     tokenizer = AutoTokenizer.from_pretrained(
+         model_path,
+         trust_remote_code=True
+     )
+
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     return model, tokenizer
+
+
+ def predict_sft(
+     model,
+     tokenizer,
+     prompt: str,
+     max_new_tokens: int = 128,
+     temperature: float = 0.7
+ ) -> str:
+
+     device = next(model.parameters()).device
+
+     inputs = tokenizer(
+         prompt,
+         return_tensors="pt",
+         truncation=True,
+         max_length=512
+     )
+
+     input_ids = inputs.input_ids.to(device)
+     attention_mask = inputs.attention_mask.to(device)
+
+     with torch.no_grad():
+         outputs = model.generate(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             max_new_tokens=max_new_tokens,
+             temperature=temperature,
+             do_sample=temperature > 0,
+             pad_token_id=tokenizer.eos_token_id
+         )
+
+     response = tokenizer.decode(
+         outputs[0],
+         skip_special_tokens=True
+     )
+
+     return response
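
sft.py wraps TRL's SFTTrainer around instruction pairs formatted per model family (format_training_examples). A minimal usage sketch follows; the toy prompt/output strings and the output path are invented for illustration.

    from npcpy.ft.sft import SFTConfig, run_sft, load_sft_model, predict_sft

    # Toy supervised pairs (prompt -> desired structured output).
    X = [f"Extract the city as JSON: 'I flew to Paris on trip {i}.'" for i in range(8)]
    y = ['{"city": "Paris"}'] * 8

    config = SFTConfig(
        base_model_name="google/gemma-3-270m-it",
        output_model_path="models/sft_demo",
        num_train_epochs=1,
    )

    model_path = run_sft(X, y, config=config, format_style="gemma")

    model, tokenizer = load_sft_model(model_path)
    print(predict_sft(model, tokenizer, "Extract the city as JSON: 'We met in Tokyo.'"))

Note that validation_split only reports the split sizes; the held-out examples are not passed to the trainer, so any evaluation has to be run separately (for instance with predict_sft on the validation slice).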
npcpy/ft/usft.py ADDED
@@ -0,0 +1,128 @@
+ from dataclasses import dataclass, field
+ try:
+     from datasets import Dataset, load_dataset
+     import torch
+     from transformers import (
+         AutoModelForCausalLM,
+         AutoTokenizer,
+         TrainingArguments
+     )
+     from trl import SFTTrainer
+     from peft import LoraConfig
+ except ImportError:
+     # Training dependencies are optional; fall back to None so the
+     # module stays importable without them.
+     Dataset = None
+     load_dataset = None
+     torch = None
+     AutoModelForCausalLM = None
+     AutoTokenizer = None
+     TrainingArguments = None
+     SFTTrainer = None
+     LoraConfig = None
+
+ from typing import List, Optional
+
+
+ @dataclass
+ class USFTConfig:
+     base_model_name: str = "Qwen/Qwen3-0.6B"
+     output_model_path: str = "models/usft_model"
+     lora_r: int = 8
+     lora_alpha: int = 16
+     lora_dropout: float = 0.15
+     lora_target_modules: List[str] = field(
+         default_factory=lambda: ["q_proj", "v_proj"]
+     )
+     num_train_epochs: int = 3
+     per_device_train_batch_size: int = 4
+     gradient_accumulation_steps: int = 4
+     learning_rate: float = 2e-5
+     logging_steps: int = 10
+     optim: str = "adamw_torch"
+     lr_scheduler_type: str = "cosine"
+     weight_decay: float = 0.01
+     max_length: int = 512
+     save_steps: int = 100
+
+
+ def run_usft(
+     texts: List[str],
+     config: Optional[USFTConfig] = None
+ ) -> str:
+
+     if config is None:
+         config = USFTConfig()
+
+     dataset = Dataset.from_dict({"text": texts})
+
+     model = AutoModelForCausalLM.from_pretrained(
+         config.base_model_name,
+         trust_remote_code=True,
+         attn_implementation="eager"
+     )
+     model.config.use_cache = False
+
+     tokenizer = AutoTokenizer.from_pretrained(
+         config.base_model_name,
+         trust_remote_code=True
+     )
+
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     tokenizer.padding_side = "right"
+
+     peft_config = LoraConfig(
+         r=config.lora_r,
+         lora_alpha=config.lora_alpha,
+         lora_dropout=config.lora_dropout,
+         target_modules=config.lora_target_modules,
+         bias="none",
+         task_type="CAUSAL_LM"
+     )
+
+     training_args = TrainingArguments(
+         output_dir=config.output_model_path,
+         num_train_epochs=config.num_train_epochs,
+         per_device_train_batch_size=(
+             config.per_device_train_batch_size
+         ),
+         gradient_accumulation_steps=(
+             config.gradient_accumulation_steps
+         ),
+         optim=config.optim,
+         logging_steps=config.logging_steps,
+         learning_rate=config.learning_rate,
+         fp16=False,
+         bf16=torch.cuda.is_available(),
+         lr_scheduler_type=config.lr_scheduler_type,
+         save_steps=config.save_steps,
+         weight_decay=config.weight_decay,
+     )
+
+     trainer = SFTTrainer(
+         model=model,
+         train_dataset=dataset,
+         peft_config=peft_config,
+         args=training_args,
+         max_seq_length=config.max_length,
+         dataset_text_field="text"
+     )
+
+     print(f"Starting USFT on {len(dataset)} texts")
+     trainer.train()
+
+     trainer.save_model(config.output_model_path)
+     print(f"Model saved to {config.output_model_path}")
+
+     return config.output_model_path
+
+
+ def load_corpus_from_hf(dataset_name: str, split: str = "train"):
+
+     ds = load_dataset(dataset_name, split=split)
+
+     if "text" in ds.column_names:
+         return ds["text"]
+     elif "content" in ds.column_names:
+         return ds["content"]
+     else:
+         return [str(item) for item in ds]
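
usft.py adds plain (unsupervised) continued pretraining on raw text. A minimal usage sketch, assuming a Hugging Face corpus with a "text" column; the dataset name and output path are examples only.

    from npcpy.ft.usft import USFTConfig, run_usft, load_corpus_from_hf

    # Any dataset exposing a 'text' (or 'content') column works here.
    texts = load_corpus_from_hf("roneneldan/TinyStories", split="train")[:1000]

    config = USFTConfig(
        base_model_name="Qwen/Qwen3-0.6B",
        output_model_path="models/usft_demo",
        num_train_epochs=1,
    )

    run_usft(texts, config=config)

bf16 is enabled automatically when CUDA is available; on CPU the run falls back to full fp32.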