rllama 1.0.0-arm64-darwin → 1.0.2-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 11d7ddcc0931dc67db511e81488bc3a2a19d564c3fdffbcdae938a9cf77fbd74
- data.tar.gz: 157cb3a06a1469788ddffd31466ab363114e747587befa59583efd37e3b8796f
+ metadata.gz: ad943f40faafeaf0a622fc29006a1e70a8b650996356b8f12ac07a8fab689358
+ data.tar.gz: 5de4cfd00f49df0b1b8025417c84d7717a41c6d60c5921a706e6cc02a0a3fcb7
  SHA512:
- metadata.gz: 40c0670a3b6aa77aefedc09e8763e9175f1be7074b243dc682763389ba6187c522650cb1b5ea54f3921b129733f30d1512acfe5f8793e31ff7747ad1ec41a4e2
- data.tar.gz: 1c81121425b454ce3803a650754a9ba2882a4254c4f414d98c30e707897965493ef1f2f3288fbcbae363af11259cefe8336ebc5cec94c0fdf6a6e1cf86d0d93a
+ metadata.gz: 799e088b716958ad4363835104b99ef3edfa423a12220e07df92f1ff81a2ede03e5151d7c242ef39cc3d0e151688722b900bb35b974d225dcb6d4450994664b2
+ data.tar.gz: 550298eb56c7f451354e3d620b729c247c2b2d97bf0f9f29220bb6365ba15134b46dbf65062f6df370118e5f21cebcdd974eae6e4ea7ec9528a0ab8e34813593
data/README.md CHANGED
@@ -1,3 +1,5 @@
+ <img width="336" height="212.0" alt="Logo" src="https://github.com/user-attachments/assets/e27442fb-22d1-44cf-ba3d-f10b24c13652" />
+
  # Rllama
 
  Ruby bindings for [llama.cpp](https://github.com/ggerganov/llama.cpp) to run open-source language models locally. Run models like GPT-OSS, Qwen 3, Gemma 3, Llama 3, and many others directly in your Ruby application code.
@@ -22,6 +24,38 @@ Or install it yourself as:
  gem install rllama
  ```
 
+ ## CLI Chat
+
+ The `rllama` command-line utility provides an interactive chat interface for conversing with language models. After installing the gem, you can start chatting immediately:
+
+ ```bash
+ rllama
+ ```
+
+ When you run `rllama` without arguments, it will display:
+
+ - **Downloaded models**: Any models you've already downloaded to `~/.rllama/models/`
+ - **Popular models**: A curated list of popular models available for download, including:
+   - Gemma 3 1B
+   - Llama 3.2 3B
+   - Phi-4
+   - Qwen3 30B
+   - GPT-OSS
+
+ Simply enter the number of the model you want to use. If you select a model that hasn't been downloaded yet, it will be automatically downloaded from Hugging Face.
+
+ You can also specify a model path or URL directly:
+
+ ```bash
+ rllama path/to/your/model.gguf
+ ```
+
+ ```bash
+ rllama https://huggingface.co/microsoft/phi-4-gguf/resolve/main/phi-4-Q3_K_S.gguf
+ ```
+
+ Once the model has loaded, you can start chatting.
+
  ## Usage
 
  ### Text Generation
@@ -162,43 +196,8 @@ By default, embedding vectors are normalized. You can disable normalization with
  ```ruby
  # Generate unnormalized embeddings
  embedding = model.embed('Sample text', normalize: false)
-
- # Use custom batch size for processing multiple texts
- embeddings = model.embed(
-   ['roses are red', 'violets are blue', 'sugar is sweet'],
-   normalize: true
- )
- ```
-
- ## CLI Chat Utility
-
- The `rllama` command-line utility provides an interactive chat interface for conversing with language models. After installing the gem, you can start chatting immediately:
-
- ```bash
- rllama
- ```
-
- When you run `rllama` without arguments, it will display:
-
- - **Downloaded models**: Any models you've already downloaded to `~/.rllama/models/`
- - **Popular models**: A curated list of popular models available for download, including:
-   - Gemma 3 1B
-   - Llama 3.2 3B
-   - Phi-4
-   - Qwen3 30B
-   - GPT-OSS
-   - And more...
-
- Simply enter the number of the model you want to use. If you select a model that hasn't been downloaded yet, it will be automatically downloaded from Hugging Face.
-
- You can also specify a model path directly:
-
- ```bash
- rllama path/to/your/model.gguf
  ```
 
- Once the model loads, you can start chatting.
-
  ## Finding Models
 
  You can download GGUF format models from various sources:
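The README change above drops the multi-string example, but `embed` still accepts either a single string or an array of strings. A minimal usage sketch, assuming `model` is an already-loaded `Rllama::Model` (model loading itself is not shown in this diff):

```ruby
# Assumes `model` is an already-loaded Rllama::Model; loading is not shown in this diff.
embedding = model.embed('Sample text', normalize: false) # one unnormalized vector

embeddings = model.embed(
  ['roses are red', 'violets are blue', 'sugar is sweet'],
  normalize: true
)

puts embedding.length  # embedding dimensionality
puts embeddings.length # => 3, one vector per input string
```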
data/lib/rllama/context.rb CHANGED
@@ -12,12 +12,21 @@ module Rllama
 
  @ctx_params = Cpp.llama_context_default_params
 
- @ctx_params[:n_ctx] = @n_ctx
- @ctx_params[:n_batch] = @n_batch
+ @ctx_params[:n_ctx] = @n_ctx if @n_ctx
+ @ctx_params[:n_batch] = @n_batch if @n_batch
 
  if @embeddings
-   @ctx_params[:n_seq_max] = [@n_batch, @model.n_seq_max].min
+   seq_cap = @model.n_seq_max
+
+   if @n_batch&.positive? && seq_cap&.positive?
+     @ctx_params[:n_seq_max] = [@n_batch, seq_cap].min
+   elsif seq_cap&.positive?
+     @ctx_params[:n_seq_max] = seq_cap
+   end
+
    @ctx_params[:embeddings] = true
+   @ctx_params[:kv_unified] = true
+   @ctx_params[:n_ubatch] = @n_batch if @n_batch&.positive?
  end
 
  @pointer = Cpp.llama_init_from_model(model.pointer, @ctx_params)
@@ -141,19 +150,31 @@ module Rllama
  end
  alias message generate
 
- def embed(strings, normalize: true, batch_size: 512)
-   is_array = strings.is_a?(Array)
+ def embed(strings_or_tokens, normalize: true, batch_size: 512)
+   is_tokens = strings_or_tokens.is_a?(Array) &&
+               (strings_or_tokens[0].is_a?(Integer) ||
+                (strings_or_tokens[0].is_a?(Array) && strings_or_tokens[0][0].is_a?(Integer)))
 
-   strings = Array(strings) unless is_array
+   input_is_array = is_tokens ? strings_or_tokens[0].is_a?(Array) : strings_or_tokens.is_a?(Array)
 
-   tokenized_strings = strings.map do |text|
-     max_tokens = text.bytesize + 2
-     tokens_ptr = FFI::MemoryPointer.new(:int32, max_tokens)
-     count = Cpp.llama_tokenize(@model.vocab, text, text.bytesize, tokens_ptr, max_tokens, true, false)
+   normalized_inputs = input_is_array ? strings_or_tokens : [strings_or_tokens]
+
+   tokenized_strings =
+     if is_tokens
+       input_is_array ? strings_or_tokens : [strings_or_tokens]
+     else
+       normalized_inputs.map { |text| @model.tokenize(text) }
+     end
 
-     raise Error, "Failed to tokenize text: '#{text}'" if count.negative?
+   max_tokens_in_prompt = tokenized_strings.map(&:length).max || 0
 
-     tokens_ptr.read_array_of_int32(count)
+   if max_tokens_in_prompt > batch_size
+     raise Error, "batch_size (#{batch_size}) is smaller than the longest prompt (#{max_tokens_in_prompt} tokens)."
+   end
+
+   if max_tokens_in_prompt > @n_batch
+     raise Error, "Context n_batch (#{@n_batch}) is smaller than the longest " \
+                  "prompt (#{max_tokens_in_prompt} tokens). Increase batch_size when calling embed."
    end
 
    all_embeddings = []
@@ -166,6 +187,9 @@ module Rllama
 
  batch[:n_tokens] = current_batch_token_count
 
+ memory_ptr = Cpp.llama_get_memory(@pointer)
+ Cpp.llama_memory_clear(memory_ptr, true) unless memory_ptr.null?
+
  raise Error, 'llama_decode failed' unless Cpp.llama_decode(@pointer, batch).zero?
 
  prompts_in_batch.each do |seq_id_in_batch|
@@ -179,7 +203,8 @@ module Rllama
  end
 
  current_batch_token_count = 0
- prompts_in_batch = []
+
+ prompts_in_batch.clear
  end
 
  tokenized_strings.each do |tokens|
@@ -207,7 +232,7 @@ module Rllama
 
  Cpp.llama_batch_free(batch)
 
- is_array ? all_embeddings : all_embeddings[0]
+ input_is_array ? all_embeddings : all_embeddings[0]
  end
 
  def embeddings?
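The reworked `Context#embed` now accepts pre-tokenized input as well as strings, and tells the two apart by inspecting the first element(s) of the argument. A standalone sketch of that detection logic (not the gem's code verbatim), covering the shapes the method handles:

```ruby
# Shapes accepted by the new embed: String, Array<String>,
# Array<Integer> (one tokenized prompt), Array<Array<Integer>> (many tokenized prompts).
def tokens_input?(input)
  input.is_a?(Array) &&
    (input[0].is_a?(Integer) ||
     (input[0].is_a?(Array) && input[0][0].is_a?(Integer)))
end

tokens_input?('hello world')            # => false (plain string)
tokens_input?(['hello', 'world'])       # => false (array of strings)
tokens_input?([101, 2023, 102])         # => true  (one tokenized prompt)
tokens_input?([[101, 2023], [101, 42]]) # => true  (batch of tokenized prompts)
```

The method also now rejects any prompt longer than `batch_size` or the context's `n_batch` up front, instead of failing later inside `llama_decode`.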
data/lib/rllama/cpp.rb CHANGED
@@ -8,7 +8,7 @@ module Rllama
 
  LIB_NAME = 'llama'
 
- platform =
+ PLATFORM =
    case FFI::Platform::OS
    when 'darwin'
      FFI::Platform::ARCH == 'aarch64' ? 'arm64-darwin' : 'x86_64-darwin'
@@ -28,12 +28,27 @@ module Rllama
      "lib#{LIB_NAME}.so"
    end
 
- platform_dir = File.join(__dir__, platform)
- platform_path = File.join(platform_dir, lib_file)
+ PLATFORM_DIR = File.join(__dir__, PLATFORM)
+
+ platform_path = File.join(PLATFORM_DIR, lib_file)
 
  lib_paths = []
+
  lib_paths << platform_path if File.exist?(platform_path)
 
+ ggml_lib_file =
+   case FFI::Platform::OS
+   when 'darwin'
+     'libggml.dylib'
+   when 'windows', 'mingw32'
+     'ggml.dll'
+   else
+     'libggml.so'
+   end
+
+ ggml_platform_path = File.join(PLATFORM_DIR, ggml_lib_file)
+ lib_paths << ggml_platform_path if File.exist?(ggml_platform_path)
+
  lib_paths +=
    case FFI::Platform::OS
    when 'darwin'
@@ -436,6 +451,8 @@ module Rllama
  attach_function :llama_backend_init, [], :void
  attach_function :llama_backend_free, [], :void
  attach_function :llama_numa_init, [:int], :void # ggml_numa_strategy
+ attach_function :ggml_backend_load_all, [], :void
+ attach_function :ggml_backend_load_all_from_path, [:string], :void
 
  # Threadpool
  attach_function :llama_attach_threadpool, %i[llama_context_p ggml_threadpool_t ggml_threadpool_t], :void
@@ -681,10 +698,14 @@ module Rllama
    llama_log_set(@log_callback, nil)
  end
 
- llama_backend_init
-
  silence_log!
 
+ if File.directory?(PLATFORM_DIR)
+   ggml_backend_load_all_from_path(PLATFORM_DIR)
+ else
+   ggml_backend_load_all
+ end
+
  freeze
  end
  end
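The cpp.rb change attaches the two ggml backend-loading entry points and prefers the backends shipped alongside the gem's prebuilt libraries. A minimal sketch of the same pattern in isolation, assuming `libggml` can be resolved by FFI (the gem bundles it under `lib/rllama/<platform>/`); the module and its library lookup here are illustrative, not the gem's code:

```ruby
require 'ffi'

# Sketch of the backend-loading pattern introduced above; the function names follow the diff.
module GgmlBackends
  extend FFI::Library
  ffi_lib 'ggml' # assumption: libggml is resolvable on the loader path

  attach_function :ggml_backend_load_all, [], :void
  attach_function :ggml_backend_load_all_from_path, [:string], :void

  # Prefer backends bundled next to the gem's native libraries; otherwise let
  # ggml search its default locations.
  def self.load(platform_dir)
    if File.directory?(platform_dir)
      ggml_backend_load_all_from_path(platform_dir)
    else
      ggml_backend_load_all
    end
  end
end
```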
data/lib/rllama/loader.rb CHANGED
@@ -62,6 +62,8 @@ module Rllama
 
  local_path = File.join(dir, org, repo, file_path)
 
+ return local_path if File.exist?(local_path)
+
  puts "Destination: #{local_path}"
 
  download_file(url, local_path, "HuggingFace model: #{hf_path}")
@@ -74,6 +76,8 @@ module Rllama
 
  local_path = File.join(dir, filename)
 
+ return local_path if File.exist?(local_path)
+
  puts "Destination: #{local_path}"
 
  download_file(url, local_path, "URL: #{url}")
@@ -82,8 +86,6 @@ module Rllama
  def download_file(url, local_path, description)
    FileUtils.mkdir_p(File.dirname(local_path))
 
-   return local_path if File.exist?(local_path)
-
    temp_path = File.join(File.dirname(local_path), "~#{File.basename(local_path)}")
 
    existing_size = File.exist?(temp_path) ? File.size(temp_path) : 0
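The loader.rb change hoists the cache check out of `download_file`, so a model that is already on disk is returned before any output or network activity. A condensed sketch of the resulting flow, with hypothetical helper names standing in for the gem's private methods:

```ruby
require 'fileutils'
require 'open-uri'

# Hypothetical condensed version of the cached-download flow after this change.
# The gem's real download_file resumes partial downloads and reports progress; this does not.
def fetch(url, local_path)
  return local_path if File.exist?(local_path) # cache hit: no message, no download

  puts "Destination: #{local_path}"
  FileUtils.mkdir_p(File.dirname(local_path))
  URI.open(url, 'rb') { |remote| File.binwrite(local_path, remote.read) }
  local_path
end
```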
data/lib/rllama/model.rb CHANGED
@@ -47,11 +47,32 @@ module Rllama
  alias message generate
 
  def embed(prompt, normalize: true, batch_size: 512, &block)
-   init_embedding_context do |ctx|
-     ctx.embed(prompt, normalize:, batch_size:, &block)
+   inputs = prompt.is_a?(Array) ? prompt : [prompt]
+
+   tokenized_inputs = inputs.map { |text| tokenize(text, max_tokens: n_ctx_train) }
+   max_token_length = tokenized_inputs.map(&:length).max || 0
+
+   effective_batch_size = [batch_size, max_token_length].max
+   effective_ctx = [n_ctx_train, max_token_length].min
+
+   init_embedding_context(n_ctx: effective_ctx, n_batch: effective_batch_size) do |ctx|
+     inputs = prompt.is_a?(Array) ? tokenized_inputs : tokenized_inputs[0]
+
+     ctx.embed(inputs, normalize:, batch_size: effective_batch_size, &block)
    end
  end
 
+ def tokenize(text, max_tokens: nil)
+   size = text.bytesize + 2
+
+   tokens_ptr = FFI::MemoryPointer.new(:int32, size)
+   count = Cpp.llama_tokenize(vocab, text, text.bytesize, tokens_ptr, size, true, false)
+
+   raise Error, "Failed to tokenize text: '#{text}'" if count.negative?
+
+   tokens_ptr.read_array_of_int32([count, max_tokens].compact.min)
+ end
+
  def close
    Cpp.llama_model_free(@pointer)
  end
@@ -70,7 +91,7 @@ module Rllama
    context
  end
 
- def init_embedding_context(n_ctx: 2048, n_batch: 512, &)
+ def init_embedding_context(n_ctx: n_ctx_train, n_batch: 512, &)
    init_context(embeddings: true, n_ctx:, n_batch:, &)
  end
 
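With these model.rb changes, `Model#embed` pre-tokenizes its inputs and sizes the embedding context from the longest prompt, and `Model#tokenize` is now a public helper. A minimal usage sketch, assuming `model` is an already-loaded `Rllama::Model`:

```ruby
# Assumes `model` is an already-loaded Rllama::Model; loading is not shown in this diff.
tokens = model.tokenize('The quick brown fox jumps over the lazy dog')
puts tokens.length # number of token ids produced for the prompt

# embed now raises its batch size to the longest prompt and caps n_ctx at n_ctx_train,
# so long inputs no longer require choosing batch_size by hand.
vector = model.embed('The quick brown fox jumps over the lazy dog')
puts vector.length # embedding dimensionality
```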
data/lib/rllama/version.rb CHANGED
@@ -1,5 +1,5 @@
  # frozen_string_literal: true
 
  module Rllama
-   VERSION = '1.0.0'
+   VERSION = '1.0.2'
  end
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: rllama
  version: !ruby/object:Gem::Version
-   version: 1.0.0
+   version: 1.0.2
  platform: arm64-darwin
  authors:
  - Pete Matsyburka
  bindir: bin
  cert_chain: []
- date: 2025-10-05 00:00:00.000000000 Z
+ date: 2025-10-07 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: ffi