red-candle 1.0.0.pre.4 → 1.0.0.pre.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +9 -2
- data/Rakefile +0 -2
- data/ext/candle/src/llm/llama.rs +402 -0
- data/ext/candle/src/llm/mod.rs +1 -0
- data/ext/candle/src/ruby/llm.rs +72 -2
- data/lib/candle/llm.rb +4 -4
- data/lib/candle/version.rb +1 -1
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 91a4c43a1a12d6d8960f1a1d190c9bfe8ea60db75f687233012d09b8c90b5020
+  data.tar.gz: 3f6ce143cd38856365231baebe25a188e7d5824d930ecdae79c1660b3ad6c787
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 273a01c438b085509a433602097b5ad4bcdb3420fc19ebe84c4fd37bf43ae5e6e1040701ecaad9e2864c0cda31d6128d4b2df6c395290994c17f135620282be6
+  data.tar.gz: 9193d01d8bfc704b982c839f9aebac23217ae5d56b5afae6228daae73b60e219b049fe8fc8d5ad13657ceb110476489c68bbb9aa6397688bf70a976a8d62d41e
data/README.md
CHANGED
@@ -45,6 +45,11 @@ results = reranker.rerank("query", ["doc1", "doc2", "doc3"])
 
 Red-Candle now supports Large Language Models (LLMs) with GPU acceleration!
 
+### Supported Models
+
+- **Llama**: Llama 2 and Llama 3 models (e.g., `TinyLlama/TinyLlama-1.1B-Chat-v1.0`, `meta-llama/Llama-2-7b-hf`, `NousResearch/Llama-2-7b-hf`)
+- **Mistral**: All Mistral models (e.g., `mistralai/Mistral-7B-Instruct-v0.1`)
+
 > ### ⚠️ Huggingface login warning
 >
 > Many models, including the one below, require you to agree to the terms. You'll need to:
@@ -62,7 +67,9 @@ device = Candle::Device.cpu # CPU (default)
 device = Candle::Device.metal # Apple GPU (Metal)
 device = Candle::Device.cuda # NVIDIA GPU (CUDA)
 
-# Load a model
+# Load a Llama model
+llm = Candle::LLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", device: device)
+# Or a Mistral model
 llm = Candle::LLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", device: device)
 
 # Generate text
@@ -86,7 +93,7 @@ response = llm.chat(messages)
 ```ruby
 # CPU works for all models
 device = Candle::Device.cpu
-llm = Candle::LLM.from_pretrained("
+llm = Candle::LLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", device: device)
 
 # Metal
 device = Candle::Device.metal
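
The README snippets above document the new multi-model loading. As a hedged end-to-end sketch of the chat interface this release describes (assumptions: `require "candle"` loads the gem, the model download succeeds, and the model id is one of the README examples):

```ruby
require "candle" # assumption: the gem's entry point is lib/candle.rb

device = Candle::Device.cpu
llm = Candle::LLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", device: device)

# Messages are role/content hashes; the Rust bindings added in this
# release read the symbol keys :role and :content.
messages = [
  { role: "system", content: "You are a concise assistant." },
  { role: "user", content: "What can red-candle do?" }
]

puts llm.chat(messages)
```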
data/Rakefile
CHANGED
data/ext/candle/src/llm/llama.rs
ADDED
@@ -0,0 +1,402 @@
+use candle_core::{DType, Device, Result as CandleResult, Tensor};
+use candle_nn::VarBuilder;
+use candle_transformers::models::llama::{Config, LlamaConfig, Llama as LlamaModel, Cache};
+use hf_hub::{api::tokio::Api, Repo};
+use tokenizers::Tokenizer;
+
+use super::{GenerationConfig, TextGeneration, TextGenerator, TokenizerWrapper};
+
+#[derive(Debug)]
+pub struct Llama {
+    model: LlamaModel,
+    tokenizer: TokenizerWrapper,
+    device: Device,
+    model_id: String,
+    eos_token_id: u32,
+    cache: Cache,
+    config: Config,
+}
+
+impl Llama {
+    /// Clear the KV cache between generations
+    pub fn clear_kv_cache(&mut self) {
+        // Since Cache doesn't expose a reset method and kvs is private,
+        // we'll recreate the cache to clear it
+        // This is a workaround until candle provides a proper reset method
+        if let Ok(new_cache) = Cache::new(self.cache.use_kv_cache, DType::F32, &self.config, &self.device) {
+            self.cache = new_cache;
+        }
+    }
+
+    /// Load a Llama model from HuggingFace Hub
+    pub async fn from_pretrained(model_id: &str, device: Device) -> CandleResult<Self> {
+        let api = Api::new()
+            .map_err(|e| candle_core::Error::Msg(format!("Failed to create HF API: {}", e)))?;
+
+        let repo = api.repo(Repo::model(model_id.to_string()));
+
+        // Download model files
+        let config_filename = repo
+            .get("config.json")
+            .await
+            .map_err(|e| candle_core::Error::Msg(format!("Failed to download config: {}", e)))?;
+
+        let tokenizer_filename = repo
+            .get("tokenizer.json")
+            .await
+            .map_err(|e| candle_core::Error::Msg(format!("Failed to download tokenizer: {}", e)))?;
+
+        // Try different file patterns for model weights
+        let weights_filenames = if let Ok(single_file) = repo.get("model.safetensors").await {
+            vec![single_file]
+        } else if let Ok(consolidated_file) = repo.get("consolidated.safetensors").await {
+            vec![consolidated_file]
+        } else {
+            // Try to find sharded model files
+            let mut sharded_files = Vec::new();
+            let mut index = 1;
+            loop {
+                // Try common shard counts for Llama models
+                let mut found = false;
+                for total in [2, 3, 4, 5, 6, 7, 8, 10, 15, 20, 30] {
+                    let filename = format!("model-{:05}-of-{:05}.safetensors", index, total);
+                    if let Ok(file) = repo.get(&filename).await {
+                        sharded_files.push(file);
+                        found = true;
+                        break;
+                    }
+                }
+                if !found {
+                    break;
+                }
+                index += 1;
+            }
+
+            if sharded_files.is_empty() {
+                return Err(candle_core::Error::Msg(
+                    "Could not find model weights. Tried: model.safetensors, consolidated.safetensors, model-*-of-*.safetensors".to_string()
+                ));
+            }
+            sharded_files
+        };
+
+        // Load config
+        let llama_config: LlamaConfig = serde_json::from_reader(std::fs::File::open(config_filename)?)
+            .map_err(|e| candle_core::Error::Msg(format!("Failed to parse config: {}", e)))?;
+        let config = llama_config.into_config(false); // Don't use flash attention for now
+
+        // Load tokenizer
+        let tokenizer = Tokenizer::from_file(tokenizer_filename)
+            .map_err(|e| candle_core::Error::Msg(format!("Failed to load tokenizer: {}", e)))?;
+
+        // Determine EOS token ID based on model type
+        let eos_token_id = if model_id.contains("Llama-3") || model_id.contains("llama-3") {
+            // Llama 3 uses different special tokens
+            {
+                let vocab = tokenizer.get_vocab(true);
+                vocab.get("<|eot_id|>")
+                    .or_else(|| vocab.get("<|end_of_text|>"))
+                    .copied()
+                    .unwrap_or(128009) // Default Llama 3 EOS
+            }
+        } else {
+            // Llama 2 and earlier
+            tokenizer
+                .get_vocab(true)
+                .get("</s>")
+                .copied()
+                .unwrap_or(2)
+        };
+
+        // Load model weights
+        let vb = unsafe {
+            VarBuilder::from_mmaped_safetensors(&weights_filenames, DType::F32, &device)?
+        };
+
+        let model = LlamaModel::load(vb, &config)?;
+        let cache = Cache::new(true, DType::F32, &config, &device)?;
+
+        Ok(Self {
+            model,
+            tokenizer: TokenizerWrapper::new(tokenizer),
+            device,
+            model_id: model_id.to_string(),
+            eos_token_id,
+            cache,
+            config,
+        })
+    }
+
+    /// Create from existing components (useful for testing)
+    pub fn new(
+        model: LlamaModel,
+        tokenizer: Tokenizer,
+        device: Device,
+        model_id: String,
+        config: &Config,
+    ) -> CandleResult<Self> {
+        let eos_token_id = if model_id.contains("Llama-3") || model_id.contains("llama-3") {
+            {
+                let vocab = tokenizer.get_vocab(true);
+                vocab.get("<|eot_id|>")
+                    .or_else(|| vocab.get("<|end_of_text|>"))
+                    .copied()
+                    .unwrap_or(128009)
+            }
+        } else {
+            tokenizer
+                .get_vocab(true)
+                .get("</s>")
+                .copied()
+                .unwrap_or(2)
+        };
+
+        let cache = Cache::new(true, DType::F32, config, &device)?;
+
+        Ok(Self {
+            model,
+            tokenizer: TokenizerWrapper::new(tokenizer),
+            device,
+            model_id,
+            eos_token_id,
+            cache,
+            config: config.clone(),
+        })
+    }
+
+    fn generate_tokens(
+        &mut self,
+        prompt_tokens: Vec<u32>,
+        config: &GenerationConfig,
+        mut callback: Option<impl FnMut(&str)>,
+    ) -> CandleResult<Vec<u32>> {
+        let mut text_gen = TextGeneration::from_config(config);
+        text_gen.set_eos_token_id(self.eos_token_id);
+        text_gen.set_tokens(prompt_tokens.clone());
+
+        let mut all_tokens = prompt_tokens.clone();
+        let start_gen = all_tokens.len();
+
+        for index in 0..config.max_length {
+            let context_size = if index > 0 { 1 } else { all_tokens.len() };
+            let start_pos = all_tokens.len().saturating_sub(context_size);
+            let ctxt = &all_tokens[start_pos..];
+
+            let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
+            let input = input.contiguous()?;
+            let logits = self.model.forward(&input, start_pos, &mut self.cache)?;
+
+            let logits = logits.squeeze(0)?;
+            let logits = if logits.dims().len() == 2 {
+                let seq_len = logits.dim(0)?;
+                logits.narrow(0, seq_len - 1, 1)?.squeeze(0)?
+            } else {
+                logits
+            };
+
+            let logits = logits.to_dtype(DType::F32)?;
+
+            let next_token = text_gen.sample_next_token(
+                &logits,
+                Some((config.repetition_penalty, config.repetition_penalty_last_n)),
+            )?;
+
+            all_tokens.push(next_token);
+
+            // Stream callback
+            if let Some(ref mut cb) = callback {
+                let token_text = self.tokenizer.token_to_piece(next_token)?;
+                cb(&token_text);
+            }
+
+            // Check stop conditions
+            if text_gen.should_stop(next_token, config.max_length) {
+                break;
+            }
+
+            // Check stop sequences
+            let generated_text = self.tokenizer.decode(&all_tokens[start_gen..], true)?;
+            if text_gen.check_stop_sequences(&generated_text, &config.stop_sequences) {
+                break;
+            }
+        }
+
+        Ok(if config.include_prompt {
+            all_tokens
+        } else {
+            all_tokens[start_gen..].to_vec()
+        })
+    }
+
+    fn generate_tokens_decoded(
+        &mut self,
+        prompt_tokens: Vec<u32>,
+        config: &GenerationConfig,
+        mut callback: Option<impl FnMut(&str)>,
+    ) -> CandleResult<Vec<u32>> {
+        let mut text_gen = TextGeneration::from_config(config);
+        text_gen.set_eos_token_id(self.eos_token_id);
+        text_gen.set_tokens(prompt_tokens.clone());
+
+        let mut all_tokens = prompt_tokens.clone();
+        let start_gen = all_tokens.len();
+        let mut previously_decoded = String::new();
+
+        for index in 0..config.max_length {
+            let context_size = if index > 0 { 1 } else { all_tokens.len() };
+            let start_pos = all_tokens.len().saturating_sub(context_size);
+            let ctxt = &all_tokens[start_pos..];
+
+            let input = Tensor::new(ctxt, &self.device)?.unsqueeze(0)?;
+            let input = input.contiguous()?;
+            let logits = self.model.forward(&input, start_pos, &mut self.cache)?;
+
+            let logits = logits.squeeze(0)?;
+            let logits = if logits.dims().len() == 2 {
+                let seq_len = logits.dim(0)?;
+                logits.narrow(0, seq_len - 1, 1)?.squeeze(0)?
+            } else {
+                logits
+            };
+
+            let logits = logits.to_dtype(DType::F32)?;
+
+            let next_token = text_gen.sample_next_token(
+                &logits,
+                Some((config.repetition_penalty, config.repetition_penalty_last_n)),
+            )?;
+
+            all_tokens.push(next_token);
+
+            // Stream callback with incremental decoding
+            if let Some(ref mut cb) = callback {
+                let current_decoded = self.tokenizer.decode(&all_tokens[start_gen..], true)?;
+
+                if current_decoded.len() > previously_decoded.len() {
+                    let new_text = &current_decoded[previously_decoded.len()..];
+                    cb(new_text);
+                    previously_decoded = current_decoded;
+                }
+            }
+
+            // Check stop conditions
+            if text_gen.should_stop(next_token, config.max_length) {
+                break;
+            }
+
+            // Check stop sequences
+            let generated_text = if callback.is_some() {
+                previously_decoded.clone()
+            } else {
+                self.tokenizer.decode(&all_tokens[start_gen..], true)?
+            };
+
+            if text_gen.check_stop_sequences(&generated_text, &config.stop_sequences) {
+                break;
+            }
+        }
+
+        Ok(if config.include_prompt {
+            all_tokens
+        } else {
+            all_tokens[start_gen..].to_vec()
+        })
+    }
+
+    /// Apply chat template based on Llama version
+    pub fn apply_chat_template(&self, messages: &[serde_json::Value]) -> CandleResult<String> {
+        let is_llama3 = self.model_id.contains("Llama-3") || self.model_id.contains("llama-3");
+
+        if is_llama3 {
+            self.apply_llama3_template(messages)
+        } else {
+            self.apply_llama2_template(messages)
+        }
+    }
+
+    fn apply_llama2_template(&self, messages: &[serde_json::Value]) -> CandleResult<String> {
+        let mut prompt = String::new();
+        let mut system_message = String::new();
+
+        for (i, message) in messages.iter().enumerate() {
+            let role = message["role"].as_str().unwrap_or("");
+            let content = message["content"].as_str().unwrap_or("");
+
+            match role {
+                "system" => {
+                    system_message = content.to_string();
+                }
+                "user" => {
+                    if i == 1 || (i == 0 && system_message.is_empty()) {
+                        // First user message
+                        if !system_message.is_empty() {
+                            prompt.push_str(&format!("<s>[INST] <<SYS>>\n{}\n<</SYS>>\n\n{} [/INST]", system_message, content));
+                        } else {
+                            prompt.push_str(&format!("<s>[INST] {} [/INST]", content));
+                        }
+                    } else {
+                        prompt.push_str(&format!(" [INST] {} [/INST]", content));
+                    }
+                }
+                "assistant" => {
+                    prompt.push_str(&format!(" {} </s>", content));
+                }
+                _ => {}
+            }
+        }
+
+        Ok(prompt)
+    }
+
+    fn apply_llama3_template(&self, messages: &[serde_json::Value]) -> CandleResult<String> {
+        let mut prompt = String::new();
+
+        prompt.push_str("<|begin_of_text|>");
+
+        for message in messages {
+            let role = message["role"].as_str().unwrap_or("");
+            let content = message["content"].as_str().unwrap_or("");
+
+            prompt.push_str(&format!("<|start_header_id|>{}<|end_header_id|>\n\n{}<|eot_id|>", role, content));
+        }
+
+        prompt.push_str("<|start_header_id|>assistant<|end_header_id|>\n\n");
+
+        Ok(prompt)
+    }
+}
+
+impl TextGenerator for Llama {
+    fn generate(
+        &mut self,
+        prompt: &str,
+        config: &GenerationConfig,
+    ) -> CandleResult<String> {
+        let prompt_tokens = self.tokenizer.encode(prompt, true)?;
+        let output_tokens = self.generate_tokens(prompt_tokens, config, None::<fn(&str)>)?;
+        self.tokenizer.decode(&output_tokens, true)
+    }
+
+    fn generate_stream(
+        &mut self,
+        prompt: &str,
+        config: &GenerationConfig,
+        mut callback: impl FnMut(&str),
+    ) -> CandleResult<String> {
+        let prompt_tokens = self.tokenizer.encode(prompt, true)?;
+        let output_tokens = self.generate_tokens_decoded(prompt_tokens, config, Some(&mut callback))?;
+        self.tokenizer.decode(&output_tokens, true)
+    }
+
+    fn model_name(&self) -> &str {
+        &self.model_id
+    }
+
+    fn device(&self) -> &Device {
+        &self.device
+    }
+
+    fn clear_cache(&mut self) {
+        self.clear_kv_cache();
+    }
+}
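
The `apply_chat_template` logic above picks a format from the model id: Llama 2 style wraps turns in `<s>[INST] ... [/INST]` with an optional `<<SYS>>` block, while Llama 3 style uses `<|start_header_id|>`/`<|eot_id|>` markers and ends with an open assistant header. An illustrative sketch of the expected prompts through the Ruby binding added later in this diff (whitespace follows the Rust format strings above; `llm` is assumed to be a loaded `Candle::LLM`):

```ruby
# Assumes llm was loaded via Candle::LLM.from_pretrained as shown earlier.
messages = [
  { role: "system", content: "Be brief." },
  { role: "user", content: "Hi" }
]

prompt = llm.apply_chat_template(messages)

# Llama 2-style model ids produce roughly:
#   "<s>[INST] <<SYS>>\nBe brief.\n<</SYS>>\n\nHi [/INST]"
#
# Model ids containing "Llama-3"/"llama-3" produce roughly:
#   "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nBe brief.<|eot_id|>" \
#   "<|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|>" \
#   "<|start_header_id|>assistant<|end_header_id|>\n\n"
```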
data/ext/candle/src/llm/mod.rs
CHANGED
data/ext/candle/src/ruby/llm.rs
CHANGED
@@ -1,19 +1,21 @@
 use magnus::{function, method, prelude::*, Error, Module, RArray, RHash, RModule, Ruby, TryConvert, Value};
 use std::cell::RefCell;
 
-use crate::llm::{GenerationConfig as RustGenerationConfig, TextGenerator, mistral::Mistral as RustMistral};
+use crate::llm::{GenerationConfig as RustGenerationConfig, TextGenerator, mistral::Mistral as RustMistral, llama::Llama as RustLlama};
 use crate::ruby::{Result as RbResult, Device as RbDevice};
 
 // Use an enum to handle different model types instead of trait objects
 #[derive(Debug)]
 enum ModelType {
     Mistral(RustMistral),
+    Llama(RustLlama),
 }
 
 impl ModelType {
     fn generate(&mut self, prompt: &str, config: &RustGenerationConfig) -> candle_core::Result<String> {
         match self {
             ModelType::Mistral(m) => m.generate(prompt, config),
+            ModelType::Llama(m) => m.generate(prompt, config),
         }
     }
 
@@ -25,6 +27,7 @@ impl ModelType {
     ) -> candle_core::Result<String> {
         match self {
             ModelType::Mistral(m) => m.generate_stream(prompt, config, callback),
+            ModelType::Llama(m) => m.generate_stream(prompt, config, callback),
         }
     }
 
@@ -32,12 +35,37 @@ impl ModelType {
     fn model_name(&self) -> &str {
         match self {
             ModelType::Mistral(m) => m.model_name(),
+            ModelType::Llama(m) => m.model_name(),
         }
     }
 
     fn clear_cache(&mut self) {
         match self {
             ModelType::Mistral(m) => m.clear_cache(),
+            ModelType::Llama(m) => m.clear_cache(),
+        }
+    }
+
+    fn apply_chat_template(&self, messages: &[serde_json::Value]) -> candle_core::Result<String> {
+        match self {
+            ModelType::Mistral(_) => {
+                // For now, use a simple template for Mistral
+                // In the future, we could implement proper Mistral chat templating
+                let mut prompt = String::new();
+                for message in messages {
+                    let role = message["role"].as_str().unwrap_or("");
+                    let content = message["content"].as_str().unwrap_or("");
+                    match role {
+                        "system" => prompt.push_str(&format!("System: {}\n\n", content)),
+                        "user" => prompt.push_str(&format!("User: {}\n\n", content)),
+                        "assistant" => prompt.push_str(&format!("Assistant: {}\n\n", content)),
+                        _ => {}
+                    }
+                }
+                prompt.push_str("Assistant: ");
+                Ok(prompt)
+            },
+            ModelType::Llama(m) => m.apply_chat_template(messages),
         }
     }
 }
@@ -180,10 +208,16 @@ impl LLM {
             })
             .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e)))?;
             ModelType::Mistral(mistral)
+        } else if model_lower.contains("llama") || model_lower.contains("meta-llama") {
+            let llama = rt.block_on(async {
+                RustLlama::from_pretrained(&model_id, candle_device).await
+            })
+            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to load model: {}", e)))?;
+            ModelType::Llama(llama)
         } else {
             return Err(Error::new(
                 magnus::exception::runtime_error(),
-                format!("Unsupported model type: {}. Currently only Mistral models are supported.", model_id),
+                format!("Unsupported model type: {}. Currently only Mistral and Llama models are supported.", model_id),
             ));
         };
 
@@ -248,6 +282,41 @@ impl LLM {
         model_ref.clear_cache();
         Ok(())
     }
+
+    /// Apply chat template to messages
+    pub fn apply_chat_template(&self, messages: RArray) -> RbResult<String> {
+        // Convert Ruby array to JSON values
+        let json_messages: Vec<serde_json::Value> = messages
+            .into_iter()
+            .filter_map(|msg| {
+                if let Ok(hash) = <RHash as TryConvert>::try_convert(msg) {
+                    let mut json_msg = serde_json::Map::new();
+
+                    if let Some(role) = hash.get(magnus::Symbol::new("role")) {
+                        if let Ok(role_str) = <String as TryConvert>::try_convert(role) {
+                            json_msg.insert("role".to_string(), serde_json::Value::String(role_str));
+                        }
+                    }
+
+                    if let Some(content) = hash.get(magnus::Symbol::new("content")) {
+                        if let Ok(content_str) = <String as TryConvert>::try_convert(content) {
+                            json_msg.insert("content".to_string(), serde_json::Value::String(content_str));
+                        }
+                    }
+
+                    Some(serde_json::Value::Object(json_msg))
+                } else {
+                    None
+                }
+            })
+            .collect();
+
+        let model = self.model.lock().unwrap();
+        let model_ref = model.borrow();
+
+        model_ref.apply_chat_template(&json_messages)
+            .map_err(|e| Error::new(magnus::exception::runtime_error(), format!("Failed to apply chat template: {}", e)))
+    }
 }
 
 // Define a standalone function for from_pretrained that handles variable arguments
@@ -290,6 +359,7 @@ pub fn init_llm(rb_candle: RModule) -> RbResult<()> {
     rb_llm.define_method("model_name", method!(LLM::model_name, 0))?;
     rb_llm.define_method("device", method!(LLM::device, 0))?;
     rb_llm.define_method("clear_cache", method!(LLM::clear_cache, 0))?;
+    rb_llm.define_method("apply_chat_template", method!(LLM::apply_chat_template, 1))?;
 
     Ok(())
 }
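
Taken together, these binding changes route any model id containing "llama" to the new Rust Llama loader and expose `apply_chat_template` to Ruby with one argument. A hedged usage sketch (error handling omitted; gated `meta-llama` repos additionally require accepting their license on Hugging Face):

```ruby
device = Candle::Device.cpu

# Dispatched to ModelType::Llama because the id contains "llama"
llm = Candle::LLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", device: device)

prompt = llm.apply_chat_template([
  { role: "user", content: "Tell me a joke." }
])

puts llm.generate(prompt)
```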
data/lib/candle/llm.rb
CHANGED
@@ -2,13 +2,13 @@ module Candle
   class LLM
     # Simple chat interface for instruction models
     def chat(messages, **options)
-      prompt =
+      prompt = apply_chat_template(messages)
       generate(prompt, **options)
     end
 
     # Streaming chat interface
     def chat_stream(messages, **options, &block)
-      prompt =
+      prompt = apply_chat_template(messages)
       generate_stream(prompt, **options, &block)
     end
 
@@ -34,8 +34,8 @@ module Candle
 
     private
 
-    #
-    #
+    # Legacy format messages method - kept for backward compatibility
+    # Use apply_chat_template for proper model-specific formatting
     def format_messages(messages)
       formatted = messages.map do |msg|
         case msg[:role]
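
With `chat` and `chat_stream` now delegating prompt construction to `apply_chat_template`, a streaming call reduces to a sketch like this (assumption based on the Rust streaming callback above: the block receives decoded text chunks as they are generated):

```ruby
# Assumes llm was loaded via Candle::LLM.from_pretrained as shown earlier.
llm.chat_stream([{ role: "user", content: "Count to three." }]) do |chunk|
  print chunk
end
puts
```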
data/lib/candle/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: red-candle
 version: !ruby/object:Gem::Version
-  version: 1.0.0.pre.
+  version: 1.0.0.pre.5
 platform: ruby
 authors:
 - Christopher Petersen
@@ -48,6 +48,7 @@ files:
 - ext/candle/rustfmt.toml
 - ext/candle/src/lib.rs
 - ext/candle/src/llm/generation_config.rs
+- ext/candle/src/llm/llama.rs
 - ext/candle/src/llm/mistral.rs
 - ext/candle/src/llm/mod.rs
 - ext/candle/src/llm/text_generation.rs