rllama 1.0.0-x86_64-linux-gnu
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +214 -0
- data/bin/rllama +8 -0
- data/lib/rllama/cli.rb +183 -0
- data/lib/rllama/context.rb +233 -0
- data/lib/rllama/cpp.rb +690 -0
- data/lib/rllama/loader.rb +210 -0
- data/lib/rllama/model.rb +103 -0
- data/lib/rllama/version.rb +5 -0
- data/lib/rllama/x86_64-linux/libggml-base.so +0 -0
- data/lib/rllama/x86_64-linux/libggml-cpu-alderlake.so +0 -0
- data/lib/rllama/x86_64-linux/libggml-cpu-haswell.so +0 -0
- data/lib/rllama/x86_64-linux/libggml-cpu-icelake.so +0 -0
- data/lib/rllama/x86_64-linux/libggml-cpu-sandybridge.so +0 -0
- data/lib/rllama/x86_64-linux/libggml-cpu-sapphirerapids.so +0 -0
- data/lib/rllama/x86_64-linux/libggml-cpu-skylakex.so +0 -0
- data/lib/rllama/x86_64-linux/libggml-cpu-sse42.so +0 -0
- data/lib/rllama/x86_64-linux/libggml-cpu-x64.so +0 -0
- data/lib/rllama/x86_64-linux/libggml-rpc.so +0 -0
- data/lib/rllama/x86_64-linux/libggml.so +0 -0
- data/lib/rllama/x86_64-linux/libllama.so +0 -0
- data/lib/rllama.rb +37 -0
- metadata +79 -0
    
checksums.yaml ADDED

@@ -0,0 +1,7 @@
---
SHA256:
  metadata.gz: 117ba4498d285d83158050dec220dab571fde8abe9f62ca9a1da07c07c1750f2
  data.tar.gz: 0e2a7bc873398ad490d3f90b9488b808bd197544c98747f916be9d7d9914ce1f
SHA512:
  metadata.gz: 128cde36b03ced401b2d7f3a9ceb84c00ef91381368e4f9b8eba2714e99af4d552e102d8a23f300440e25b20712f2b81233f8c254ec44cb82232a88fd84093ca
  data.tar.gz: 77609e417bae1d4c2b8d9e2351ce753b1adefe03f81ca761a3ad59013fecb856aaa7d1ab1c6e2a2f9a9031a057cd474f1f1c63b31d3d5dfa7802efdd636f73d7

data/README.md ADDED

@@ -0,0 +1,214 @@
# Rllama

Ruby bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) to run open-source language models locally. Run models like GPT-OSS, Qwen 3, Gemma 3, Llama 3, and many others directly in your Ruby application code.

## Installation

Add this line to your application's Gemfile:

```ruby
gem 'rllama'
```

And then execute:

```bash
bundle install
```

Or install it yourself as:

```bash
gem install rllama
```

## Usage

### Text Generation

Generate text completions using local language models:

```ruby
require 'rllama'

# Load a model
model = Rllama.load_model('lmstudio-community/gemma-3-1B-it-QAT-GGUF/gemma-3-1B-it-QAT-Q4_0.gguf')

# Generate text
result = model.generate('What is the capital of France?')
puts result.text
# => "The capital of France is Paris."

# Access generation statistics
puts "Tokens generated: #{result.stats[:tokens_generated]}"
puts "Tokens per second: #{result.stats[:tps]}"
puts "Duration: #{result.stats[:duration]} seconds"

# Don't forget to close the model when done
model.close
```

#### Generation parameters

Adjust generation with optional parameters:

```ruby
result = model.generate(
  'Write a short poem about Ruby programming',
  max_tokens: 2024,
  temperature: 0.8,
  top_k: 40,
  top_p: 0.95,
  min_p: 0.05
)
```
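
Going by the sampler chain set up in `lib/rllama/context.rb` (shipped with this gem and reproduced further down), disabling every sampler falls back to greedy, deterministic decoding. A minimal sketch, assuming `model.generate` forwards these options the same way the examples above do; the prompt is arbitrary:

```ruby
# Greedy decoding sketch: with temperature, top_k, top_p and min_p all
# disabled, the sampler chain picks the most likely token every step,
# so repeated runs produce the same output.
result = model.generate(
  'List three prime numbers',
  temperature: 0,
  top_k: 0,
  top_p: 1.0,
  min_p: nil
)
```

For reproducible sampling rather than greedy decoding, `Context#generate` also accepts a `seed:` option and echoes the seed it used back in `result.stats[:seed]`.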

#### Streaming generation

Stream generated text token-by-token:

```ruby
model.generate('Explain quantum computing') do |token|
  print token
end
```
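
Streaming does not discard the return value: `Context#generate` (below) yields each piece and still returns the result object, so, assuming `model.generate` behaves the same way when given a block, you can stream tokens and read the stats afterwards:

```ruby
# Stream tokens to the terminal and still collect the final result.
result = model.generate('Explain quantum computing') do |token|
  print token
  $stdout.flush
end

puts "\nGenerated #{result.stats[:tokens_generated]} tokens at #{result.stats[:tps].round(2)} tokens/sec"
```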

#### System prompt

Include a system prompt to guide model behavior:

```ruby
result = model.generate(
  'What are best practices for Ruby development?',
  system: 'You are an expert Ruby developer with 10 years of experience.'
)
```

#### Messages list

Pass multiple messages with roles for more complex interactions:

```ruby
result = model.generate([
  { role: 'system', content: 'You are a helpful assistant.' },
  { role: 'user', content: 'What is the capital of France?' },
  { role: 'assistant', content: 'The capital of France is Paris.' },
  { role: 'user', content: 'What is its population?' }
])
puts result.text
```

### Chat

For ongoing conversations, use a context object that maintains the conversation history:

```ruby
# Initialize a chat context
context = model.init_context

# Send messages and maintain conversation history
response1 = context.message('What is the capital of France?')
puts response1.text
# => "The capital of France is Paris."

response2 = context.message('What is the population of that city?')
puts response2.text
# => "Paris has a population of approximately 2.1 million people..."

response3 = context.message('What was my first message?')
puts response3.text
# => "Your first message was asking about the capital of France."

# The context remembers all previous messages in the conversation

# Close context when done
context.close
```
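
In `lib/rllama/context.rb` (included below), `message` is an alias for `generate`, so it accepts the same options, including a `system:` prompt, which is only applied when the conversation is still empty. A short sketch with arbitrary prompt text:

```ruby
context = model.init_context

# The system prompt is only added when the message history is empty,
# so pass it with the first message of the conversation.
reply = context.message(
  'Review this migration for potential issues.',
  system: 'You are a senior Ruby on Rails engineer.',
  temperature: 0.7,
  max_tokens: 512
)
puts reply.text

context.close
```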

### Embeddings

Generate vector embeddings for text using embedding models:

```ruby
require 'rllama'

# Load an embedding model
model = Rllama.load_model('lmstudio-community/embeddinggemma-300m-qat-GGUF/embeddinggemma-300m-qat-Q4_0.gguf')

# Generate embedding for a single text
embedding = model.embed('Hello, world!')
puts embedding.length
# => 768 (depending on your model)

# Generate embeddings for multiple sentences
embeddings = model.embed([
  'roses are red',
  'violets are blue',
  'sugar is sweet'
])

puts embeddings.length
# => 3
puts embeddings[0].length
# => 768

model.close
```

#### Vector parameters

By default, embedding vectors are normalized. You can disable normalization with `normalize: false`:

```ruby
# Generate unnormalized embeddings
embedding = model.embed('Sample text', normalize: false)

# Use a custom batch size when processing multiple texts
embeddings = model.embed(
  ['roses are red', 'violets are blue', 'sugar is sweet'],
  normalize: true,
  batch_size: 512
)
```
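
Because the vectors are normalized by default, cosine similarity between two embeddings reduces to a plain dot product. A small sketch in plain Ruby, with arbitrary example sentences and no extra libraries assumed:

```ruby
# Compare two normalized embeddings via their dot product
# (equal to cosine similarity for unit-length vectors).
a = model.embed('The weather is sunny today')
b = model.embed('It is a bright, clear day')

similarity = a.zip(b).sum { |x, y| x * y }
puts format('cosine similarity: %.3f', similarity)
```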

## CLI Chat Utility

The `rllama` command-line utility provides an interactive chat interface for conversing with language models. After installing the gem, you can start chatting immediately:

```bash
rllama
```

When you run `rllama` without arguments, it will display:

- **Downloaded models**: Any models you've already downloaded to `~/.rllama/models/`
- **Popular models**: A curated list of popular models available for download, including:
  - Gemma 3 1B
  - Llama 3.2 3B
  - Phi-4
  - Qwen3 30B
  - GPT-OSS
  - And more...

Simply enter the number of the model you want to use. If you select a model that hasn't been downloaded yet, it will be downloaded automatically from Hugging Face.

You can also specify a model path directly:

```bash
rllama path/to/your/model.gguf
```

Once the model loads, you can start chatting. Type `exit` or `quit` (or press Ctrl-C) to end the session.

## Finding Models

You can download GGUF-format models from various sources:

- [Hugging Face](https://huggingface.co/models?library=gguf) - search for models in GGUF format

## License

MIT

## Contributing

Bug reports and pull requests are welcome on GitHub at https://github.com/docusealco/rllama.
    
data/bin/rllama ADDED

data/lib/rllama/cli.rb ADDED

@@ -0,0 +1,183 @@
# frozen_string_literal: true

require 'readline'

module Rllama
  class Cli
    POPULAR_MODELS = [
      { path: 'lmstudio-community/gemma-3-1B-it-QAT-GGUF/gemma-3-1B-it-QAT-Q4_0.gguf', size: 720_425_472 },
      { path: 'lmstudio-community/gpt-oss-20b-GGUF/gpt-oss-20b-MXFP4.gguf', size: 12_109_565_632 },
      { path: 'bartowski/Llama-3.2-3B-Instruct-GGUF/Llama-3.2-3B-Instruct-Q4_K_M.gguf', size: 2_019_377_696 },
      { path: 'unsloth/Qwen3-30B-A3B-GGUF/Qwen3-30B-A3B-Q3_K_S.gguf', size: 13_292_468_800 },
      { path: 'inclusionAI/Ling-mini-2.0-GGUF/Ling-mini-2.0-Q4_K_M.gguf', size: 9_911_575_072 },
      { path: 'unsloth/gemma-3n-E4B-it-GGUF/gemma-3n-E4B-it-Q4_K_S.gguf', size: 4_404_697_216 },
      { path: 'microsoft/phi-4-gguf/phi-4-Q4_K_S.gguf', size: 8_440_762_560 }
    ].freeze

    COLOR_CODES = {
      red: 31,
      green: 32,
      yellow: 33,
      blue: 34,
      magenta: 35,
      cyan: 36
    }.freeze

    def self.start(args)
      new(args).run
    end

    def initialize(args)
      @args = args

      @model_path = args.first
    end

    def run
      model_path = select_or_load_model

      puts "\n#{colorize('Loading model...', :yellow)}"

      model = Rllama.load_model(model_path)
      context = model.init_context

      puts colorize('Model loaded successfully!', :green)
      puts "\n#{colorize('Chat started. Type your message and press Enter. Type "exit" or "quit" to end the chat.',
                         :cyan)}\n\n"

      chat_loop(context)
    rescue Interrupt
      puts "\n\n#{colorize('Chat interrupted. Goodbye!', :yellow)}"
      exit(0)
    rescue StandardError => e
      puts "\n#{colorize("Error: #{e.message}", :red)}"
      exit(1)
    ensure
      context&.close
      model&.close
    end

    private

    def select_or_load_model
      return @model_path if @model_path

      downloaded_models = find_downloaded_models
      downloaded_model_names = downloaded_models.map { |path| File.basename(path, '.gguf') }

      available_popular = POPULAR_MODELS.reject do |popular_model|
        popular_filename = File.basename(popular_model[:path], '.gguf')
        downloaded_model_names.any?(popular_filename)
      end

      all_choices = []
      current_index = 1

      unless downloaded_models.empty?
        puts "#{colorize('Downloaded models:', :cyan)}\n\n"

        downloaded_models.each do |model|
          display_name = File.basename(model, '.gguf')
          size = format_file_size(File.size(model))
          puts "  #{colorize(current_index.to_s, :green)}. #{display_name} #{colorize("(#{size})", :yellow)}"
          all_choices << model
          current_index += 1
        end

        puts "\n"
      end

      unless available_popular.empty?
        puts "#{colorize('Popular models (not downloaded):', :cyan)}\n\n"

        available_popular.each do |model|
          display_name = File.basename(model[:path], '.gguf')
          puts "  #{colorize(current_index.to_s, :green)}. " \
               "#{display_name} #{colorize("(#{format_file_size(model[:size])})", :yellow)}"
          all_choices << model[:path]
          current_index += 1
        end

        puts "\n"
      end

      if all_choices.empty?
        puts colorize('No models available', :yellow)
        exit(1)
      end

      print colorize("Enter number (1-#{all_choices.length}): ", :cyan)

      choice = $stdin.gets&.strip.to_i

      if choice < 1 || choice > all_choices.length
        puts colorize('Invalid choice', :red)

        exit(1)
      end

      all_choices[choice - 1]
    end

    def find_downloaded_models
      models_dir = File.join(Dir.home, '.rllama', 'models')

      return [] unless Dir.exist?(models_dir)

      Dir.glob(File.join(models_dir, '**', '*.gguf')).reject do |path|
        basename = File.basename(path)

        basename.start_with?('~', '!')
      end
    end

    def format_file_size(bytes)
      gb = bytes / (1024.0**3)

      if gb >= 1.0
        format('%.1fGB', gb)
      else
        mb = bytes / (1024.0**2)

        format('%dMB', mb.round)
      end
    end

    def chat_loop(context)
      loop do
        user_input = Readline.readline('> ', false)&.strip

        break if user_input.nil?

        next if user_input.empty?

        if user_input.downcase == 'exit' || user_input.downcase == 'quit'
          puts "\n#{colorize('Goodbye!', :yellow)}"

          break
        end

        puts "\n"

        print "#{colorize('Assistant:', :magenta, bold: true)} "

        context.generate(user_input) do |token|
          print token
          $stdout.flush
        end

        puts "\n\n"
      end
    end

    def colorize(text, color, bold: false)
      return text unless $stdout.tty?

      code = COLOR_CODES[color] || 37

      prefix = bold ? "\e[1;#{code}m" : "\e[#{code}m"

      "#{prefix}#{text}\e[0m"
    end
  end
end

data/lib/rllama/context.rb ADDED

@@ -0,0 +1,233 @@
# frozen_string_literal: true

module Rllama
  class Context
    attr_reader :messages, :n_ctx, :n_batch, :n_past

    def initialize(model, embeddings: false, n_ctx: nil, n_batch: nil)
      @model = model
      @n_ctx = n_ctx
      @n_batch = n_batch
      @embeddings = embeddings

      @ctx_params = Cpp.llama_context_default_params

      @ctx_params[:n_ctx] = @n_ctx
      @ctx_params[:n_batch] = @n_batch

      if @embeddings
        @ctx_params[:n_seq_max] = [@n_batch, @model.n_seq_max].min
        @ctx_params[:embeddings] = true
      end

      @pointer = Cpp.llama_init_from_model(model.pointer, @ctx_params)

      raise Error, 'Failed to create the llama_context' if @pointer.null?

      @n_past = 0
      @messages = []
    end

    def generate(message, role: 'user', max_tokens: @n_ctx, temperature: 0.8, top_k: 40, top_p: 0.95, min_p: 0.05,
                 seed: nil, system: nil)
      @messages << { role: 'system', content: system } if system && @messages.empty?

      if message.is_a?(Array)
        @messages.push(*message)
      elsif message.is_a?(Hash)
        @messages.push(message)
      else
        @messages << { role: role, content: message }
      end

      prompt_string = @model.build_chat_template(@messages)

      n_prompt_tokens = -Cpp.llama_tokenize(@model.vocab, prompt_string, prompt_string.bytesize, nil, 0, true, true)

      raise Error, 'Prompt is too long.' if n_prompt_tokens.negative?

      prompt_tokens_ptr = FFI::MemoryPointer.new(:int32, n_prompt_tokens)
      tokens_written = Cpp.llama_tokenize(@model.vocab, prompt_string, prompt_string.bytesize, prompt_tokens_ptr,
                                          n_prompt_tokens, true, true)

      raise Error, 'Failed to tokenize prompt.' if tokens_written.negative?

      new_token_count = tokens_written - @n_past

      if new_token_count.positive?
        new_tokens_ptr = prompt_tokens_ptr + (@n_past * FFI.type_size(:int32))

        batch = Cpp.llama_batch_get_one(new_tokens_ptr, new_token_count)

        raise Error, 'llama_decode failed.' if Cpp.llama_decode(@pointer, batch) != 0

        @n_past = tokens_written
      end

      chain_params = Cpp.llama_sampler_chain_default_params
      sampler_chain = Cpp.llama_sampler_chain_init(chain_params)

      Cpp.llama_sampler_chain_add(sampler_chain, Cpp.llama_sampler_init_min_p(min_p, 1)) if min_p
      Cpp.llama_sampler_chain_add(sampler_chain, Cpp.llama_sampler_init_top_k(top_k)) if top_k&.positive?
      Cpp.llama_sampler_chain_add(sampler_chain, Cpp.llama_sampler_init_top_p(top_p, 1)) if top_p && top_p < 1.0
      if temperature&.positive?
        Cpp.llama_sampler_chain_add(sampler_chain,
                                    Cpp.llama_sampler_init_temp(temperature))
      end

      is_probabilistic = temperature&.positive? || top_k&.positive? || (top_p && top_p < 1.0) || !min_p.nil?
      rng_seed = seed || (Random.new_seed & 0xFFFFFFFF)

      if is_probabilistic
        Cpp.llama_sampler_chain_add(sampler_chain, Cpp.llama_sampler_init_dist(rng_seed))
      else
        Cpp.llama_sampler_chain_add(sampler_chain, Cpp.llama_sampler_init_greedy)
      end

      n_decoded = 0

      generated_text = ''.b

      assistant_message = { role: 'assistant', content: generated_text }

      @messages << assistant_message

      start_time = Time.now

      loop do
        break if n_decoded >= max_tokens

        new_token_id = Cpp.llama_sampler_sample(sampler_chain, @pointer, -1)

        break if Cpp.llama_vocab_is_eog(@model.vocab, new_token_id)

        buffer = FFI::MemoryPointer.new(:char, 256)
        n_chars = Cpp.llama_token_to_piece(@model.vocab, new_token_id, buffer, buffer.size, 0, true)

        if n_chars >= 0
          piece_bytes = buffer.read_string(n_chars)
          utf8_piece = piece_bytes.force_encoding(Encoding::UTF_8)
          generated_text << utf8_piece
          yield utf8_piece if block_given?
        end

        token_ptr = FFI::MemoryPointer.new(:int32, 1).put_int32(0, new_token_id)
        batch = Cpp.llama_batch_get_one(token_ptr, 1)

        raise Error, 'context length has been exceeded' if @n_past >= @n_ctx
        raise Error, 'llama_decode failed.' if Cpp.llama_decode(@pointer, batch) != 0

        @n_past += 1
        n_decoded += 1
      end

      end_time = Time.now

      duration = end_time - start_time

      tps = n_decoded.positive? && duration.positive? ? n_decoded / duration : 0

      Cpp.llama_sampler_free(sampler_chain)

      Result.new(
        text: generated_text,
        stats: {
          duration:,
          tokens_generated: n_decoded,
          tps:,
          seed: rng_seed
        }
      )
    end
    alias message generate

    def embed(strings, normalize: true, batch_size: 512)
      is_array = strings.is_a?(Array)

      strings = Array(strings) unless is_array

      tokenized_strings = strings.map do |text|
        max_tokens = text.bytesize + 2
        tokens_ptr = FFI::MemoryPointer.new(:int32, max_tokens)
        count = Cpp.llama_tokenize(@model.vocab, text, text.bytesize, tokens_ptr, max_tokens, true, false)

        raise Error, "Failed to tokenize text: '#{text}'" if count.negative?

        tokens_ptr.read_array_of_int32(count)
      end

      all_embeddings = []
      batch = Cpp.llama_batch_init(batch_size, 0, 1)
      prompts_in_batch = []
      current_batch_token_count = 0

      process_batch = lambda do
        next if prompts_in_batch.empty?

        batch[:n_tokens] = current_batch_token_count

        raise Error, 'llama_decode failed' unless Cpp.llama_decode(@pointer, batch).zero?

        prompts_in_batch.each do |seq_id_in_batch|
          embd_ptr = Cpp.llama_get_embeddings_seq(@pointer, seq_id_in_batch)

          raise Error, 'Failed to get embedding' if embd_ptr.null?

          embedding = embd_ptr.read_array_of_float(@model.n_embd)

          all_embeddings << (normalize ? normalize_embedding(embedding) : embedding)
        end

        current_batch_token_count = 0
        prompts_in_batch = []
      end

      tokenized_strings.each do |tokens|
        batch_full = (current_batch_token_count + tokens.size) > batch_size
        seq_limit_reached = prompts_in_batch.size >= @model.n_seq_max
        process_batch.call if !prompts_in_batch.empty? && (batch_full || seq_limit_reached)

        seq_id = prompts_in_batch.size
        prompts_in_batch << seq_id

        tokens.each_with_index do |token_id, pos|
          idx = current_batch_token_count

          batch[:token].put_int32(idx * FFI.type_size(:int32), token_id)
          batch[:pos].put_int32(idx * FFI.type_size(:int32), pos)
          batch[:n_seq_id].put_int32(idx * FFI.type_size(:int32), 1)
          batch[:seq_id].get_pointer(idx * FFI::Pointer.size).put_int32(0, seq_id)
          batch[:logits].put_int8(idx, pos == tokens.size - 1 ? 1 : 0)

          current_batch_token_count += 1
        end
      end

      process_batch.call

      Cpp.llama_batch_free(batch)

      is_array ? all_embeddings : all_embeddings[0]
    end

    def embeddings?
      @embeddings
    end

    def close
      Cpp.llama_free(@pointer)
    end

    def norm(vec)
      Math.sqrt(vec.sum { |x| x**2 })
    end

    def normalize_embedding(vec)
      n = norm(vec)

      return vec if n.zero?

      vec.map { |x| x / n }
    end
  end
end