npm - cui-llama.rn - Versions diffs - 1.5.0 → 1.6.1 - Mend

cui-llama.rn 1.5.0 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (324) hide show

package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h ADDED Viewed

@@ -0,0 +1,134 @@
+#ifndef RNLLAMA_H
+#define RNLLAMA_H
+#include <sstream>
+#include <iostream>
+#include "chat.h"
+#include "common.h"
+#include "ggml.h"
+#include "gguf.h"
+#include "llama.h"
+#include "llama-impl.h"
+#include "sampling.h"
+#if defined(__ANDROID__)
+#include <android/log.h>
+#endif
+namespace rnllama {
+std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token);
+std::string tokens_to_str(llama_context *ctx, const std::vector<llama_token>::const_iterator begin, const std::vector<llama_token>::const_iterator end);
+lm_ggml_type kv_cache_type_from_str(const std::string & s);
+enum stop_type  // match mode taken as the `type` argument of llama_rn_context::findStoppingStrings
+{
+    STOP_FULL,     // presumably: look for a complete stop string — TODO confirm against the implementation
+    STOP_PARTIAL,  // presumably: look for the beginning of a stop string at the end of the text — TODO confirm
+};
+// Completion token output with probabilities: a token id plus a list of (token, probability) pairs.
+struct completion_token_output
+{
+    struct token_prob  // one (token id, probability) pair
+    {
+        llama_token tok;  // token id
+        float prob;       // probability associated with `tok`
+    };
+    std::vector<token_prob> probs;  // presumably filled only when probabilities are requested, empty otherwise — confirm
+    llama_token tok;                // the output token; NOTE(review): has no default initializer
+};
+// Main context class: holds the model and llama context handles, the sampler, the
+// chat templates and the bookkeeping for the completion currently being generated.
+// NOTE(review): model / ctx / ctx_sampling are raw pointers and a destructor is
+// declared; which of them the destructor releases is not visible in this header.
+struct llama_rn_context {
+    // --- completion progress ---
+    bool is_predicting = false;
+    bool is_interrupted = false;
+    bool has_next_token = false;
+    std::string generated_text;
+    std::vector<completion_token_output> generated_token_probs;
+    // --- token bookkeeping ---
+    size_t num_prompt_tokens = 0;
+    size_t num_tokens_predicted = 0;
+    size_t n_past = 0;
+    size_t n_remain = 0;
+    std::vector<llama_token> embd;
+    // --- parameters and model / context handles ---
+    common_params params;
+    common_init_result llama_init;
+    llama_model *model = nullptr;
+    float loading_progress = 0;
+    bool is_load_interrupted = false;
+    llama_context *ctx = nullptr;
+    common_sampler *ctx_sampling = nullptr;
+    common_chat_templates_ptr templates;
+    // Zero-initialised like every other scalar member, so that reading it before it
+    // has been assigned yields a defined value instead of an indeterminate one.
+    int n_ctx = 0;
+    // --- status flags ---
+    bool context_full = false;
+    bool truncated = false;
+    bool stopped_eos = false;
+    bool stopped_word = false;
+    bool stopped_limit = false;
+    std::string stopping_word;
+    bool incomplete = false;
+    std::vector<common_adapter_lora_info> lora;
+    // --- lifecycle ---
+    ~llama_rn_context();
+    void rewind();
+    bool initSampling();
+    bool loadModel(common_params &params_);
+    // --- chat template handling ---
+    bool validateModelChatTemplate(bool use_jinja, const char *name) const;
+    common_chat_params getFormattedChatWithJinja(
+      const std::string &messages,
+      const std::string &chat_template,
+      const std::string &json_schema,
+      const std::string &tools,
+      const bool &parallel_tool_calls,
+      const std::string &tool_choice
+    ) const;
+    std::string getFormattedChat(
+      const std::string &messages,
+      const std::string &chat_template
+    ) const;
+    // --- prompt handling and token generation ---
+    void truncatePrompt(std::vector<llama_token> &prompt_tokens);
+    void loadPrompt();
+    void beginCompletion();
+    completion_token_output nextToken();
+    size_t findStoppingStrings(const std::string &text, const size_t last_token_size, const stop_type type);
+    completion_token_output doCompletion();
+    // --- embeddings, benchmarking and LoRA adapters ---
+    std::vector<float> getEmbedding(common_params &embd_params);
+    std::string bench(int pp, int tg, int pl, int nr);
+    int applyLoraAdapters(std::vector<common_adapter_lora_info> lora);
+    void removeLoraAdapters();
+    std::vector<common_adapter_lora_info> getLoadedLoraAdapters();
+};
+// Logging macros. Each expands to a call to an unqualified `log(...)`, which this header does not declare.
+extern bool rnllama_verbose;  // runtime switch tested by LOG_VERBOSE when it is compiled in
+#if RNLLAMA_VERBOSE != 1  // verbose logging expands to nothing unless RNLLAMA_VERBOSE is 1
+#define LOG_VERBOSE(MSG, ...)
+#else  // RNLLAMA_VERBOSE == 1
+#define LOG_VERBOSE(MSG, ...)                                       \
+    do                                                              \
+    {                                                               \
+        if (rnllama_verbose)                                        \
+        {                                                           \
+            log("VERBOSE", __func__, __LINE__, MSG, ##__VA_ARGS__); \
+        }                                                           \
+    } while (0)
+#endif  // RNLLAMA_VERBOSE
+#define LOG_ERROR(MSG, ...) log("ERROR", __func__, __LINE__, MSG, ##__VA_ARGS__)
+#define LOG_WARNING(MSG, ...) log("WARNING", __func__, __LINE__, MSG, ##__VA_ARGS__)
+#define LOG_INFO(MSG, ...) log("INFO", __func__, __LINE__, MSG, ##__VA_ARGS__)
+} // namespace rnllama
+#endif /* RNLLAMA_H */

package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/sampling.h ADDED Viewed

@@ -0,0 +1,107 @@
+#pragma once
+#include "llama.h"
+#include "common.h"
+#include <string>
+#include <vector>
+// common_sampler extends llama_sampler with additional functionality:
+//
+//  - grammar support
+//  - custom sampler logic based on the parameters
+//  - history of the last accepted tokens
+//  - performance metrics
+//
+// The goal is to have a common implementation of the sampling logic shared across the examples.
+// For example, depending on the temperature, the sampling chain can be very simple (greedy) or more
+// complex (top-k, top-p, etc).
+//
+// Another example is related to the grammar. In general, the grammar constraints applied on the full
+// vocabulary can be very taxing. To improve performance, the grammar can be applied only to the sampled
+// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
+// grammar constraints are applied to the full vocabulary and the token is resampled.
+//
+// The common_sampler also maintains a container with the last accepted tokens. In the future, this can
+// be moved into the core llama library.
+//
+// For convenience, the common_sampler also maintains a container with the current candidate tokens.
+// This can be used to access the probabilities of the rest of the non-sampled tokens.
+//
+// TODO: measure grammar performance
+//
+struct common_sampler;
+// llama_sampler API overloads
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
+void common_sampler_free(struct common_sampler * gsmpl);
+// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
+void                    common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
+void                    common_sampler_reset (struct common_sampler * gsmpl);
+struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
+// arguments can be nullptr to skip printing
+void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
+// extended sampling implementation:
+//
+// - set logits
+// - apply the configured sampler chain
+// - check if the token fits the grammar (if any)
+// - if not: resample by first applying the grammar constraints and then sampling again (slower path)
+//
+// if grammar_first is true, the grammar is applied before the samplers (slower)
+// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
+//
+llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
+// generalized version of common_sampler_sample
+//
+// will cross-reference the sampled tokens with a batch of draft tokens and accept those that match
+// if the sampler disagrees at some point, we stop and return the accepted tokens up to now
+//
+//      common_sampler_sample_and_accept_n(gsmpl, ctx, { idx }, {});
+//
+// is equivalent to
+//
+//      common_sampler_sample(gsmpl, ctx, idx);
+//      common_sampler_accept(gsmpl, token, true);
+//
+// requires: idxs.size() == draft.size() + 1
+//
+// returns at least 1 token, up to idxs.size()
+//
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const std::vector<int> & idxs, const llama_tokens & draft, bool grammar_first = false);
+// assume idxs == [ 0, 1, 2, ..., draft.size() ]
+std::vector<llama_token> common_sampler_sample_and_accept_n(struct common_sampler * gsmpl, struct llama_context * ctx, const llama_tokens & draft, bool grammar_first = false);
+uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
+// helpers
+// access the internal list of current candidate tokens
+llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
+// get the last accepted token
+llama_token common_sampler_last(const struct common_sampler * gsmpl);
+// print the sampler chain into a string
+std::string common_sampler_print(const struct common_sampler * gsmpl);
+// get a string representation of the last accepted tokens
+std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
+char        common_sampler_type_to_chr(enum common_sampler_type cnstr);
+std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
+std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
+llama_sampler * llama_sampler_init_llg(const llama_vocab * vocab,
+                const char * grammar_kind, const char * grammar_data);

package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/speculative.h ADDED Viewed

@@ -0,0 +1,28 @@
+#pragma once
+#include "llama.h"
+#include "common.h"
+struct common_speculative;
+struct common_speculative_params {  // settings passed by value to common_speculative_gen_draft
+    int n_draft = 16;  // max drafted tokens
+    int n_reuse = 256;  // NOTE(review): not documented in this header — confirm its meaning against the implementation
+    float p_min = 0.75f; // min probability required to accept a token in the draft
+};
+struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
+void common_speculative_free(struct common_speculative * spec);
+bool common_speculative_are_compatible(
+        const struct llama_context * ctx_tgt,
+        const struct llama_context * ctx_dft);
+// sample up to n_draft tokens and add them to the batch using the draft model
+llama_tokens common_speculative_gen_draft(
+               struct common_speculative * spec,
+        struct common_speculative_params   params,
+                      const llama_tokens & prompt,
+                             llama_token   id_last);

package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unicode-data.h ADDED Viewed

@@ -0,0 +1,20 @@
+#pragma once
+#include <cstdint>
+#include <vector>
+#include <unordered_map>
+#include <unordered_set>
+struct range_nfd {  // element type of the unicode_ranges_nfd table
+    uint32_t first;  // presumably the first codepoint of the range — confirm
+    uint32_t last;   // presumably the last codepoint of the range (inclusive?) — confirm
+    uint32_t nfd;    // presumably the NFD replacement codepoint for the range — confirm
+};
+static const uint32_t MAX_CODEPOINTS = 0x110000;  // 0x10FFFF + 1
+extern const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;  // declared only; presumably (codepoint, flag bits) pairs — confirm
+extern const std::unordered_set<uint32_t> unicode_set_whitespace;  // declared only; defined outside this header
+extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase;  // declared only; presumably (codepoint, lowercase codepoint) pairs — confirm
+extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase;  // declared only; presumably (codepoint, uppercase codepoint) pairs — confirm
+extern const std::initializer_list<range_nfd> unicode_ranges_nfd;  // declared only; table of range_nfd entries

package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/unicode.h ADDED Viewed

@@ -0,0 +1,66 @@
+#pragma once
+#include <cstdint>
+#include <string>
+#include <vector>
+struct unicode_cpt_flags {  // per-codepoint property flags stored as 1-bit uint16_t bit-fields
+    enum {  // masks applied to the value returned by as_uint(); they appear to pair with the first eight bit-fields below
+        UNDEFINED       = 0x0001,
+        NUMBER          = 0x0002,  // regex: \p{N}
+        LETTER          = 0x0004,  // regex: \p{L}
+        SEPARATOR       = 0x0008,  // regex: \p{Z}
+        ACCENT_MARK     = 0x0010,  // regex: \p{M}
+        PUNCTUATION     = 0x0020,  // regex: \p{P}
+        SYMBOL          = 0x0040,  // regex: \p{S}
+        CONTROL         = 0x0080,  // regex: \p{C}
+        MASK_CATEGORIES = 0x00FF,  // bitwise OR of the eight masks above
+    };
+    // codepoint type
+    uint16_t is_undefined   : 1;
+    uint16_t is_number      : 1;  // regex: \p{N}
+    uint16_t is_letter      : 1;  // regex: \p{L}
+    uint16_t is_separator   : 1;  // regex: \p{Z}
+    uint16_t is_accent_mark : 1;  // regex: \p{M}
+    uint16_t is_punctuation : 1;  // regex: \p{P}
+    uint16_t is_symbol      : 1;  // regex: \p{S}
+    uint16_t is_control     : 1;  // regex: \p{C}
+    // helper flags
+    uint16_t is_whitespace  : 1;  // regex: \s
+    uint16_t is_lowercase   : 1;
+    uint16_t is_uppercase   : 1;
+    uint16_t is_nfd         : 1;
+    // decode from uint16
+    inline unicode_cpt_flags(const uint16_t flags = 0) {  // implicit conversion from raw flag bits; default is all bits clear
+        *reinterpret_cast<uint16_t*>(this) = flags;  // NOTE(review): type punning; assumes the bit-fields occupy exactly one uint16_t in mask order (implementation-defined) — consider std::memcpy
+    }
+    inline uint16_t as_uint() const {  // reads the object's storage back as one uint16_t
+        return *reinterpret_cast<const uint16_t*>(this);  // NOTE(review): same punning and layout assumption as the constructor
+    }
+    inline uint16_t category_flag() const {  // keeps only the bits covered by MASK_CATEGORIES
+        return this->as_uint() & MASK_CATEGORIES;
+    }
+};
+size_t unicode_len_utf8(char src);
+std::string unicode_cpt_to_utf8  (uint32_t cpt);
+uint32_t    unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
+std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
+std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);
+unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt);
+unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8);
+std::string unicode_byte_to_utf8(uint8_t byte);
+uint8_t     unicode_utf8_to_byte(const std::string & utf8);
+uint32_t unicode_tolower(uint32_t cpt);
+std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);

package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Info.plist ADDED Viewed

Binary file

package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib ADDED Viewed

Binary file

package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama ADDED Viewed

Binary file

package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h ADDED Viewed

@@ -0,0 +1,143 @@
+// Chat support (incl. tool call grammar constraining & output parsing) w/ generic & custom template handlers.
+#pragma once
+#include "common.h"
+#include <string>
+#include <vector>
+#include "minja/chat-template.hpp"
+#include "minja/minja.hpp"
+typedef minja::chat_template common_chat_template;
+struct common_chat_templates {  // released through common_chat_templates_free
+    bool has_explicit_template; // Model had a builtin template or a template override was specified.
+    std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
+    std::unique_ptr<common_chat_template> template_tool_use; // NOTE(review): no "always set" guarantee is stated for this one — presumably may be null; confirm before dereferencing
+};
+struct common_chat_tool_call {  // element type of common_chat_msg::tool_calls
+    std::string name;       // presumably the name of the tool being called — confirm
+    std::string arguments;  // held as a string; presumably JSON-encoded arguments — confirm
+    std::string id;         // presumably the identifier of this call — confirm
+};
+struct common_chat_msg_content_part {  // element type of common_chat_msg::content_parts
+    std::string type;  // kind of the part, held as a string; accepted values are not visible in this header
+    std::string text;  // text payload of the part
+};
+struct common_chat_msg {  // one chat message; converted from/to JSON by common_chat_msgs_parse_oaicompat / common_chat_msgs_to_json_oaicompat
+    std::string role;     // role string; accepted values are not visible in this header
+    std::string content;  // message content as a single string
+    std::vector<common_chat_msg_content_part> content_parts = {};  // typed content parts; empty by default
+    std::vector<common_chat_tool_call> tool_calls = {};  // tool calls attached to the message; empty by default
+    std::string reasoning_content;  // presumably reasoning text kept apart from `content` — confirm
+    std::string tool_name;     // presumably set on tool-result messages — confirm
+    std::string tool_call_id;  // presumably links a tool result to common_chat_tool_call::id — confirm
+};
+struct common_chat_tool {  // a tool description; converted from/to JSON by common_chat_tools_parse_oaicompat / common_chat_tools_to_json_oaicompat
+    std::string name;         // tool name
+    std::string description;  // tool description
+    std::string parameters;   // held as a string; presumably a JSON schema of the parameters — confirm
+};
+enum common_chat_tool_choice {  // produced from a string by common_chat_tool_choice_parse_oaicompat
+    COMMON_CHAT_TOOL_CHOICE_AUTO,      // default value of common_chat_templates_inputs::tool_choice
+    COMMON_CHAT_TOOL_CHOICE_REQUIRED,  // presumably: a tool call must be produced — confirm
+    COMMON_CHAT_TOOL_CHOICE_NONE,      // presumably: no tool call may be produced — confirm
+};
+enum common_chat_format {  // chat format identifier; taken by common_chat_format_name and common_chat_parse
+    COMMON_CHAT_FORMAT_CONTENT_ONLY,  // default value of common_chat_params::format
+    COMMON_CHAT_FORMAT_GENERIC,
+    COMMON_CHAT_FORMAT_MISTRAL_NEMO,
+    COMMON_CHAT_FORMAT_LLAMA_3_X,
+    COMMON_CHAT_FORMAT_LLAMA_3_X_WITH_BUILTIN_TOOLS,
+    COMMON_CHAT_FORMAT_DEEPSEEK_R1,
+    COMMON_CHAT_FORMAT_DEEPSEEK_R1_EXTRACT_REASONING,
+    COMMON_CHAT_FORMAT_FIREFUNCTION_V2,
+    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2,
+    COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
+    COMMON_CHAT_FORMAT_HERMES_2_PRO,
+    COMMON_CHAT_FORMAT_HERMES_2_PRO_EXTRACT_REASONING,
+    COMMON_CHAT_FORMAT_COMMAND_R7B,
+    COMMON_CHAT_FORMAT_COMMAND_R7B_EXTRACT_REASONING,
+    COMMON_CHAT_FORMAT_COUNT, // Not a format, just the number of formats; keep it last
+};
+struct common_chat_templates_inputs {  // input bundle taken by common_chat_templates_apply
+    std::vector<common_chat_msg> messages;  // the conversation to format
+    std::string grammar;      // empty by default
+    std::string json_schema;  // empty by default
+    bool add_generation_prompt = true;
+    bool use_jinja = true;
+    // Parameters below only supported when use_jinja is true
+    std::vector<common_chat_tool> tools;
+    common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
+    bool parallel_tool_calls = false;
+    bool extract_reasoning     = true;
+};
+struct common_chat_params {  // result type of common_chat_templates_apply
+    common_chat_format                  format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
+    std::string                         prompt;  // presumably the fully formatted prompt text — confirm
+    std::string                         grammar;
+    bool                                grammar_lazy = false;
+    std::vector<common_grammar_trigger> grammar_triggers;  // presumably only relevant when grammar_lazy is true — confirm
+    std::vector<std::string>            preserved_tokens;
+    std::vector<std::string>            additional_stops;  // presumably extra stop strings for the caller to apply — confirm
+};
+// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
+bool common_chat_verify_template(const std::string & tmpl, bool use_jinja);
+void common_chat_templates_free(struct common_chat_templates * tmpls);
+// Deleter functor: hands the pointer it receives to common_chat_templates_free.
+struct common_chat_templates_deleter {
+    void operator()(common_chat_templates * tmpls) {
+        common_chat_templates_free(tmpls);
+    }
+};
+// Owning pointer to common_chat_templates that is released through the deleter above.
+using common_chat_templates_ptr = std::unique_ptr<struct common_chat_templates, common_chat_templates_deleter>;
+common_chat_templates_ptr common_chat_templates_init(
+                                    const struct llama_model * model,
+                                           const std::string & chat_template_override,
+                                           const std::string & bos_token_override = "",
+                                           const std::string & eos_token_override = "");
+bool         common_chat_templates_was_explicit(const struct common_chat_templates * tmpls);
+const char * common_chat_templates_source(const struct common_chat_templates * tmpls, const char * variant = nullptr);
+struct common_chat_params      common_chat_templates_apply(
+    const struct common_chat_templates * tmpls,
+    const struct common_chat_templates_inputs & inputs);
+// Format single message, while taking into account the position of that message in chat history
+std::string common_chat_format_single(
+        const struct common_chat_templates * tmpls,
+        const std::vector<common_chat_msg> & past_msg,
+        const common_chat_msg & new_msg,
+        bool add_ass,
+        bool use_jinja);
+// Returns an example of formatted chat
+std::string common_chat_format_example(
+    const struct common_chat_templates * tmpls,
+    bool use_jinja);
+std::string               common_chat_format_name(common_chat_format format);
+common_chat_msg           common_chat_parse(      const std::string & input, common_chat_format format);
+common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
+// Parses a JSON array of messages in OpenAI's chat completion API format.
+// T can be std::string containing JSON or nlohmann::ordered_json
+template <class T> std::vector<common_chat_msg> common_chat_msgs_parse_oaicompat(const T & messages);
+template <class T> T common_chat_msgs_to_json_oaicompat(const std::vector<common_chat_msg> & msgs, bool concat_typed_text = false);
+// Parses a JSON array of tools in OpenAI's chat completion tool call API format.
+// T can be std::string containing JSON or nlohmann::ordered_json
+template <class T> std::vector<common_chat_tool> common_chat_tools_parse_oaicompat(const T & tools);
+template <class T> T common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & tools);