cui-llama.rn 1.2.6 → 1.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/README.md +3 -2
  2. package/android/src/main/CMakeLists.txt +26 -6
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +115 -27
  4. package/android/src/main/java/com/rnllama/RNLlama.java +40 -7
  5. package/android/src/main/jni.cpp +228 -40
  6. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +9 -4
  7. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +9 -4
  8. package/cpp/amx/amx.cpp +196 -0
  9. package/cpp/amx/amx.h +20 -0
  10. package/cpp/amx/common.h +101 -0
  11. package/cpp/amx/mmq.cpp +2524 -0
  12. package/cpp/amx/mmq.h +16 -0
  13. package/cpp/common.cpp +118 -251
  14. package/cpp/common.h +53 -30
  15. package/cpp/ggml-aarch64.c +46 -3395
  16. package/cpp/ggml-aarch64.h +0 -20
  17. package/cpp/ggml-alloc.c +6 -8
  18. package/cpp/ggml-backend-impl.h +33 -11
  19. package/cpp/ggml-backend-reg.cpp +423 -0
  20. package/cpp/ggml-backend.cpp +14 -676
  21. package/cpp/ggml-backend.h +46 -9
  22. package/cpp/ggml-common.h +6 -0
  23. package/cpp/ggml-cpu-aarch64.c +3823 -0
  24. package/cpp/ggml-cpu-aarch64.h +32 -0
  25. package/cpp/ggml-cpu-impl.h +14 -242
  26. package/cpp/ggml-cpu-quants.c +10835 -0
  27. package/cpp/ggml-cpu-quants.h +63 -0
  28. package/cpp/ggml-cpu.c +13971 -13720
  29. package/cpp/ggml-cpu.cpp +715 -0
  30. package/cpp/ggml-cpu.h +65 -63
  31. package/cpp/ggml-impl.h +285 -25
  32. package/cpp/ggml-metal.h +8 -8
  33. package/cpp/ggml-metal.m +1221 -728
  34. package/cpp/ggml-quants.c +189 -10681
  35. package/cpp/ggml-quants.h +78 -125
  36. package/cpp/ggml-threading.cpp +12 -0
  37. package/cpp/ggml-threading.h +12 -0
  38. package/cpp/ggml.c +688 -1460
  39. package/cpp/ggml.h +58 -244
  40. package/cpp/json-schema-to-grammar.cpp +1045 -1045
  41. package/cpp/json.hpp +24766 -24766
  42. package/cpp/llama-sampling.cpp +5 -2
  43. package/cpp/llama.cpp +409 -123
  44. package/cpp/llama.h +8 -4
  45. package/cpp/rn-llama.hpp +89 -25
  46. package/cpp/sampling.cpp +42 -3
  47. package/cpp/sampling.h +22 -1
  48. package/cpp/sgemm.cpp +608 -0
  49. package/cpp/speculative.cpp +270 -0
  50. package/cpp/speculative.h +28 -0
  51. package/cpp/unicode.cpp +11 -0
  52. package/ios/RNLlama.mm +43 -20
  53. package/ios/RNLlamaContext.h +9 -3
  54. package/ios/RNLlamaContext.mm +146 -33
  55. package/jest/mock.js +0 -1
  56. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  57. package/lib/commonjs/grammar.js +4 -2
  58. package/lib/commonjs/grammar.js.map +1 -1
  59. package/lib/commonjs/index.js +52 -15
  60. package/lib/commonjs/index.js.map +1 -1
  61. package/lib/module/NativeRNLlama.js.map +1 -1
  62. package/lib/module/grammar.js +2 -1
  63. package/lib/module/grammar.js.map +1 -1
  64. package/lib/module/index.js +51 -15
  65. package/lib/module/index.js.map +1 -1
  66. package/lib/typescript/NativeRNLlama.d.ts +122 -8
  67. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  68. package/lib/typescript/grammar.d.ts +5 -6
  69. package/lib/typescript/grammar.d.ts.map +1 -1
  70. package/lib/typescript/index.d.ts +15 -6
  71. package/lib/typescript/index.d.ts.map +1 -1
  72. package/package.json +2 -1
  73. package/src/NativeRNLlama.ts +135 -13
  74. package/src/grammar.ts +10 -8
  75. package/src/index.ts +104 -28
package/lib/typescript/index.d.ts CHANGED
@@ -1,15 +1,23 @@
- import type { NativeContextParams, NativeLlamaContext, NativeCompletionParams, NativeCompletionTokenProb, NativeCompletionResult, NativeTokenizeResult, NativeEmbeddingResult, NativeSessionLoadResult, NativeCPUFeatures } from './NativeRNLlama';
+ import type { NativeContextParams, NativeLlamaContext, NativeCompletionParams, NativeCompletionTokenProb, NativeCompletionResult, NativeTokenizeResult, NativeEmbeddingResult, NativeSessionLoadResult, NativeCPUFeatures, NativeEmbeddingParams, NativeCompletionTokenProbItem, NativeCompletionResultTimings } from './NativeRNLlama';
+ import type { SchemaGrammarConverterPropOrder, SchemaGrammarConverterBuiltinRule } from './grammar';
  import { SchemaGrammarConverter, convertJsonSchemaToGrammar } from './grammar';
- import type { RNLlamaOAICompatibleMessage } from './chat';
+ import type { RNLlamaMessagePart, RNLlamaOAICompatibleMessage } from './chat';
+ export type { NativeContextParams, NativeLlamaContext, NativeCompletionParams, NativeCompletionTokenProb, NativeCompletionResult, NativeTokenizeResult, NativeEmbeddingResult, NativeSessionLoadResult, NativeEmbeddingParams, NativeCompletionTokenProbItem, NativeCompletionResultTimings, RNLlamaMessagePart, RNLlamaOAICompatibleMessage, SchemaGrammarConverterPropOrder, SchemaGrammarConverterBuiltinRule, };
  export { SchemaGrammarConverter, convertJsonSchemaToGrammar };
  export type TokenData = {
  token: string;
  completion_probabilities?: Array<NativeCompletionTokenProb>;
  };
- export type ContextParams = NativeContextParams;
+ export type ContextParams = Omit<NativeContextParams, 'cache_type_k' | 'cache_type_v' | 'pooling_type'> & {
+ cache_type_k?: 'f16' | 'f32' | 'q8_0' | 'q4_0' | 'q4_1' | 'iq4_nl' | 'q5_0' | 'q5_1';
+ cache_type_v?: 'f16' | 'f32' | 'q8_0' | 'q4_0' | 'q4_1' | 'iq4_nl' | 'q5_0' | 'q5_1';
+ pooling_type?: 'none' | 'mean' | 'cls' | 'last' | 'rank';
+ };
+ export type EmbeddingParams = NativeEmbeddingParams;
  export type CompletionParams = Omit<NativeCompletionParams, 'emit_partial_completion' | 'prompt'> & {
  prompt?: string;
  messages?: RNLlamaOAICompatibleMessage[];
+ chatTemplate?: string;
  };
  export type BenchResult = {
  modelDesc: string;
@@ -38,18 +46,19 @@ export declare class LlamaContext {
  saveSession(filepath: string, options?: {
  tokenSize: number;
  }): Promise<number>;
- getFormattedChat(messages: RNLlamaOAICompatibleMessage[]): Promise<string>;
+ getFormattedChat(messages: RNLlamaOAICompatibleMessage[], template?: string): Promise<string>;
  completion(params: CompletionParams, callback?: (data: TokenData) => void): Promise<NativeCompletionResult>;
  stopCompletion(): Promise<void>;
  tokenizeAsync(text: string): Promise<NativeTokenizeResult>;
  tokenizeSync(text: string): NativeTokenizeResult;
  detokenize(tokens: number[]): Promise<string>;
- embedding(text: string): Promise<NativeEmbeddingResult>;
+ embedding(text: string, params?: EmbeddingParams): Promise<NativeEmbeddingResult>;
  bench(pp: number, tg: number, pl: number, nr: number): Promise<BenchResult>;
  release(): Promise<void>;
  }
  export declare function getCpuFeatures(): Promise<NativeCPUFeatures>;
  export declare function setContextLimit(limit: number): Promise<void>;
- export declare function initLlama({ model, is_model_asset: isModelAsset, ...rest }: ContextParams, progressCallback?: (progress: number) => void): Promise<LlamaContext>;
+ export declare function loadLlamaModelInfo(model: string): Promise<Object>;
+ export declare function initLlama({ model, is_model_asset: isModelAsset, pooling_type: poolingType, lora, ...rest }: ContextParams, onProgress?: (progress: number) => void): Promise<LlamaContext>;
  export declare function releaseAllLlama(): Promise<void>;
  //# sourceMappingURL=index.d.ts.map
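For context, here is a minimal sketch of how the reworked `ContextParams` typings above could be used from application code. The import path comes from this package's name; the model path and chosen values are illustrative assumptions, not part of this diff. `cache_type_k`/`cache_type_v` and `pooling_type` now take string literals, and `initLlama` accepts a progress callback as its second argument.

```ts
import { initLlama } from 'cui-llama.rn'

// Hypothetical model path and settings, shown only to exercise the new typings.
async function createContext() {
  const context = await initLlama(
    {
      model: 'file:///data/user/0/app/files/model.gguf', // hypothetical path
      n_ctx: 2048,
      flash_attn: true,      // experimental in llama.cpp, mainly useful with GPU offload
      cache_type_k: 'q8_0',  // quantized K cache (string literal union in 1.3.x)
      cache_type_v: 'q8_0',  // quantized V cache
      pooling_type: 'mean',  // 'none' | 'mean' | 'cls' | 'last' | 'rank'
      embedding: true,
      embd_normalize: 2,     // normalization mode passed through to llama.cpp
    },
    (progress) => console.log(`load progress: ${progress}`),
  )
  return context
}
```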
package/lib/typescript/index.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EACV,mBAAmB,EACnB,kBAAkB,EAClB,sBAAsB,EACtB,yBAAyB,EACzB,sBAAsB,EACtB,oBAAoB,EACpB,qBAAqB,EACrB,uBAAuB,EACvB,iBAAiB,EAClB,MAAM,iBAAiB,CAAA;AACxB,OAAO,EAAE,sBAAsB,EAAE,0BAA0B,EAAE,MAAM,WAAW,CAAA;AAC9E,OAAO,KAAK,EAAE,2BAA2B,EAAE,MAAM,QAAQ,CAAA;AAGzD,OAAO,EAAE,sBAAsB,EAAE,0BAA0B,EAAE,CAAA;AAe7D,MAAM,MAAM,SAAS,GAAG;IACtB,KAAK,EAAE,MAAM,CAAA;IACb,wBAAwB,CAAC,EAAE,KAAK,CAAC,yBAAyB,CAAC,CAAA;CAC5D,CAAA;AAOD,MAAM,MAAM,aAAa,GAAG,mBAAmB,CAAA;AAE/C,MAAM,MAAM,gBAAgB,GAAG,IAAI,CACjC,sBAAsB,EACtB,yBAAyB,GAAG,QAAQ,CACrC,GAAG;IACF,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,QAAQ,CAAC,EAAE,2BAA2B,EAAE,CAAA;CACzC,CAAA;AAED,MAAM,MAAM,WAAW,GAAG;IACxB,SAAS,EAAE,MAAM,CAAA;IACjB,SAAS,EAAE,MAAM,CAAA;IACjB,YAAY,EAAE,MAAM,CAAA;IACpB,KAAK,EAAE,MAAM,CAAA;IACb,KAAK,EAAE,MAAM,CAAA;IACb,KAAK,EAAE,MAAM,CAAA;IACb,KAAK,EAAE,MAAM,CAAA;CACd,CAAA;AAED,qBAAa,YAAY;IACvB,EAAE,EAAE,MAAM,CAAA;IAEV,GAAG,EAAE,OAAO,CAAQ;IAEpB,WAAW,EAAE,MAAM,CAAK;IAExB,KAAK,EAAE;QACL,uBAAuB,CAAC,EAAE,OAAO,CAAA;KAClC,CAAK;gBAEM,EAAE,SAAS,EAAE,GAAG,EAAE,WAAW,EAAE,KAAK,EAAE,EAAE,kBAAkB;IAOtE;;OAEG;IACG,WAAW,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,uBAAuB,CAAC;IAMrE;;OAEG;IACG,WAAW,CACf,QAAQ,EAAE,MAAM,EAChB,OAAO,CAAC,EAAE;QAAE,SAAS,EAAE,MAAM,CAAA;KAAE,GAC9B,OAAO,CAAC,MAAM,CAAC;IAIZ,gBAAgB,CACpB,QAAQ,EAAE,2BAA2B,EAAE,GACtC,OAAO,CAAC,MAAM,CAAC;IASZ,UAAU,CACd,MAAM,EAAE,gBAAgB,EACxB,QAAQ,CAAC,EAAE,CAAC,IAAI,EAAE,SAAS,KAAK,IAAI,GACnC,OAAO,CAAC,sBAAsB,CAAC;IAkClC,cAAc,IAAI,OAAO,CAAC,IAAI,CAAC;IAI/B,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,oBAAoB,CAAC;IAI1D,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,oBAAoB;IAIhD,UAAU,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC;IAI7C,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,qBAAqB,CAAC;IAIjD,KAAK,CACT,EAAE,EAAE,MAAM,EACV,EAAE,EAAE,MAAM,EACV,EAAE,EAAE,MAAM,EACV,EAAE,EAAE,MAAM,GACT,OAAO,CAAC,WAAW,CAAC;IAejB,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAG/B;AAED,wBAAsB,cAAc,IAAK,OAAO,CAAC,iBAAiB,CAAC,CAElE;AAED,wBAAsB,eAAe,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAElE;AAED,wBAAsB,SAAS,CAAC,EAC5B,KAAK,EACL,cAAc,EAAE,YAAY,EAC5B,GAAG,IAAI,EACR,EAAE,aAAa,EAChB,gBAAgB,CAAC,EAAE,CAAC,QAAQ,EAAE,MAAM,KAAK,IAAI,GAC5C,OAAO,CAAC,YAAY,CAAC,CAwBvB;AAED,wBAAsB,eAAe,IAAI,OAAO,CAAC,IAAI,CAAC,CAErD"}
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EACV,mBAAmB,EACnB,kBAAkB,EAClB,sBAAsB,EACtB,yBAAyB,EACzB,sBAAsB,EACtB,oBAAoB,EACpB,qBAAqB,EACrB,uBAAuB,EACvB,iBAAiB,EACjB,qBAAqB,EACrB,6BAA6B,EAC7B,6BAA6B,EAC9B,MAAM,iBAAiB,CAAA;AACxB,OAAO,KAAK,EAAE,+BAA+B,EAAE,iCAAiC,EAAE,MAAM,WAAW,CAAA;AACnG,OAAO,EAAE,sBAAsB,EAAE,0BAA0B,EAAE,MAAM,WAAW,CAAA;AAC9E,OAAO,KAAK,EAAE,kBAAkB,EAAE,2BAA2B,EAAE,MAAM,QAAQ,CAAA;AAG7E,YAAY,EACV,mBAAmB,EACnB,kBAAkB,EAClB,sBAAsB,EACtB,yBAAyB,EACzB,sBAAsB,EACtB,oBAAoB,EACpB,qBAAqB,EACrB,uBAAuB,EACvB,qBAAqB,EACrB,6BAA6B,EAC7B,6BAA6B,EAC7B,kBAAkB,EAClB,2BAA2B,EAC3B,+BAA+B,EAC/B,iCAAiC,GAClC,CAAA;AAED,OAAO,EAAE,sBAAsB,EAAE,0BAA0B,EAAE,CAAA;AAc7D,MAAM,MAAM,SAAS,GAAG;IACtB,KAAK,EAAE,MAAM,CAAA;IACb,wBAAwB,CAAC,EAAE,KAAK,CAAC,yBAAyB,CAAC,CAAA;CAC5D,CAAA;AAOD,MAAM,MAAM,aAAa,GAAG,IAAI,CAC9B,mBAAmB,EACnB,cAAc,GAAG,cAAc,GAAI,cAAc,CAClD,GAAG;IACF,YAAY,CAAC,EAAE,KAAK,GAAG,KAAK,GAAG,MAAM,GAAG,MAAM,GAAG,MAAM,GAAG,QAAQ,GAAG,MAAM,GAAG,MAAM,CAAA;IACpF,YAAY,CAAC,EAAE,KAAK,GAAG,KAAK,GAAG,MAAM,GAAG,MAAM,GAAG,MAAM,GAAG,QAAQ,GAAG,MAAM,GAAG,MAAM,CAAA;IACpF,YAAY,CAAC,EAAE,MAAM,GAAG,MAAM,GAAG,KAAK,GAAG,MAAM,GAAG,MAAM,CAAA;CACzD,CAAA;AAED,MAAM,MAAM,eAAe,GAAG,qBAAqB,CAAA;AAEnD,MAAM,MAAM,gBAAgB,GAAG,IAAI,CACjC,sBAAsB,EACtB,yBAAyB,GAAG,QAAQ,CACrC,GAAG;IACF,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,QAAQ,CAAC,EAAE,2BAA2B,EAAE,CAAA;IACxC,YAAY,CAAC,EAAE,MAAM,CAAA;CACtB,CAAA;AAED,MAAM,MAAM,WAAW,GAAG;IACxB,SAAS,EAAE,MAAM,CAAA;IACjB,SAAS,EAAE,MAAM,CAAA;IACjB,YAAY,EAAE,MAAM,CAAA;IACpB,KAAK,EAAE,MAAM,CAAA;IACb,KAAK,EAAE,MAAM,CAAA;IACb,KAAK,EAAE,MAAM,CAAA;IACb,KAAK,EAAE,MAAM,CAAA;CACd,CAAA;AAED,qBAAa,YAAY;IACvB,EAAE,EAAE,MAAM,CAAA;IAEV,GAAG,EAAE,OAAO,CAAQ;IAEpB,WAAW,EAAE,MAAM,CAAK;IAExB,KAAK,EAAE;QACL,uBAAuB,CAAC,EAAE,OAAO,CAAA;KAClC,CAAK;gBAEM,EAAE,SAAS,EAAE,GAAG,EAAE,WAAW,EAAE,KAAK,EAAE,EAAE,kBAAkB;IAOtE;;OAEG;IACG,WAAW,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,uBAAuB,CAAC;IAMrE;;OAEG;IACG,WAAW,CACf,QAAQ,EAAE,MAAM,EAChB,OAAO,CAAC,EAAE;QAAE,SAAS,EAAE,MAAM,CAAA;KAAE,GAC9B,OAAO,CAAC,MAAM,CAAC;IAIZ,gBAAgB,CACpB,QAAQ,EAAE,2BAA2B,EAAE,EACvC,QAAQ,CAAC,EAAE,MAAM,GAChB,OAAO,CAAC,MAAM,CAAC;IAOZ,UAAU,CACd,MAAM,EAAE,gBAAgB,EACxB,QAAQ,CAAC,EAAE,CAAC,IAAI,EAAE,SAAS,KAAK,IAAI,GACnC,OAAO,CAAC,sBAAsB,CAAC;IAkClC,cAAc,IAAI,OAAO,CAAC,IAAI,CAAC;IAI/B,aAAa,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,oBAAoB,CAAC;IAI1D,YAAY,CAAC,IAAI,EAAE,MAAM,GAAG,oBAAoB;IAIhD,UAAU,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC;IAI7C,SAAS,CACP,IAAI,EAAE,MAAM,EACZ,MAAM,CAAC,EAAE,eAAe,GACvB,OAAO,CAAC,qBAAqB,CAAC;IAI3B,KAAK,CACT,EAAE,EAAE,MAAM,EACV,EAAE,EAAE,MAAM,EACV,EAAE,EAAE,MAAM,EACV,EAAE,EAAE,MAAM,GACT,OAAO,CAAC,WAAW,CAAC;IAejB,OAAO,IAAI,OAAO,CAAC,IAAI,CAAC;CAG/B;AAED,wBAAsB,cAAc,IAAK,OAAO,CAAC,iBAAiB,CAAC,CAElE;AAED,wBAAsB,eAAe,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAElE;AAYD,wBAAsB,kBAAkB,CAAC,KAAK,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAIvE;AAWD,wBAAsB,SAAS,CAC7B,EACE,KAAK,EACL,cAAc,EAAE,YAAY,EAC5B,YAAY,EAAE,WAAW,EACzB,IAAI,EACJ,GAAG,IAAI,EACR,EAAE,aAAa,EAChB,UAAU,CAAC,EAAE,CAAC,QAAQ,EAAE,MAAM,KAAK,IAAI,GACtC,OAAO,CAAC,YAAY,CAAC,CAuCvB;AAED,wBAAsB,eAAe,IAAI,OAAO,CAAC,IAAI,CAAC,CAErD"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "cui-llama.rn",
- "version": "1.2.6",
+ "version": "1.3.3",
  "description": "Fork of llama.rn for ChatterUI",
  "main": "lib/commonjs/index",
  "module": "lib/module/index",
@@ -14,6 +14,7 @@
  "ios",
  "android",
  "cpp/*.*",
+ "cpp/amx/*.*",
  "*.podspec",
  "!lib/typescript/example",
  "!ios/build",
package/src/NativeRNLlama.ts CHANGED
@@ -1,11 +1,14 @@
  import type { TurboModule } from 'react-native'
  import { TurboModuleRegistry } from 'react-native'

+ export type NativeEmbeddingParams = {
+ embd_normalize?: number
+ }
+
  export type NativeContextParams = {
  model: string
  is_model_asset?: boolean
-
- embedding?: boolean
+ use_progress_callback?: boolean

  n_ctx?: number
  n_batch?: number
@@ -13,6 +16,20 @@ export type NativeContextParams = {
  n_threads?: number
  n_gpu_layers?: number

+ /**
+ * Enable flash attention, only recommended in GPU device (Experimental in llama.cpp)
+ */
+ flash_attn?: boolean
+
+ /**
+ * KV cache data type for the K (Experimental in llama.cpp)
+ */
+ cache_type_k?: string
+ /**
+ * KV cache data type for the V (Experimental in llama.cpp)
+ */
+ cache_type_v?: string
+
  use_mlock?: boolean
  use_mmap?: boolean
  vocab_only?: boolean
@@ -22,35 +39,134 @@ export type NativeContextParams = {

  rope_freq_base?: number
  rope_freq_scale?: number
+
+ pooling_type?: number
+
+ // Embedding params
+ embedding?: boolean
+ embd_normalize?: number
  }

  export type NativeCompletionParams = {
  prompt: string
- grammar?: string
- stop?: Array<string> // -> antiprompt
-
  n_threads?: number
+ /**
+ * Set grammar for grammar-based sampling. Default: no grammar
+ */
+ grammar?: string
+ /**
+ * Specify a JSON array of stopping strings.
+ * These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]`
+ */
+ stop?: Array<string>
+ /**
+ * Set the maximum number of tokens to predict when generating text.
+ * **Note:** May exceed the set limit slightly if the last token is a partial multibyte character.
+ * When 0,no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity.
+ */
  n_predict?: number
+ /**
+ * If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings.
+ * Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings.
+ * Default: `0`
+ */
  n_probs?: number
+ /**
+ * Limit the next token selection to the K most probable tokens. Default: `40`
+ */
  top_k?: number
+ /**
+ * Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95`
+ */
  top_p?: number
+ /**
+ * The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05`
+ */
  min_p?: number
- xtc_t?: number
- xtc_p?: number
+ /**
+ * Set the chance for token removal via XTC sampler. Default: `0.0`, which is disabled.
+ */
+ xtc_probability?: number
+ /**
+ * Set a minimum probability threshold for tokens to be removed via XTC sampler. Default: `0.1` (> `0.5` disables XTC)
+ */
+ xtc_threshold?: number
+ /**
+ * Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled.
+ */
  typical_p?: number
- temperature?: number // -> temp
+ /**
+ * Adjust the randomness of the generated text. Default: `0.8`
+ */
+ temperature?: number
+ /**
+ * Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size.
+ */
  penalty_last_n?: number
+ /**
+ * Control the repetition of token sequences in the generated text. Default: `1.0`
+ */
  penalty_repeat?: number
+ /**
+ * Repeat alpha frequency penalty. Default: `0.0`, which is disabled.
+ */
  penalty_freq?: number
+ /**
+ * Repeat alpha presence penalty. Default: `0.0`, which is disabled.
+ */
  penalty_present?: number
+ /**
+ * Penalize newline tokens when applying the repeat penalty. Default: `false`
+ */
+ penalize_nl?: boolean
+ /**
+ * Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0.
+ */
  mirostat?: number
+ /**
+ * Set the Mirostat target entropy, parameter tau. Default: `5.0`
+ */
  mirostat_tau?: number
+ /**
+ * Set the Mirostat learning rate, parameter eta. Default: `0.1`
+ */
  mirostat_eta?: number
- penalize_nl?: boolean
- seed?: number
-
+ /**
+ * Set the DRY (Don't Repeat Yourself) repetition penalty multiplier. Default: `0.0`, which is disabled.
+ */
+ dry_multiplier?: number
+ /**
+ * Set the DRY repetition penalty base value. Default: `1.75`
+ */
+ dry_base?: number
+ /**
+ * Tokens that extend repetition beyond this receive exponentially increasing penalty: multiplier * base ^ (length of repeating sequence before token - allowed length). Default: `2`
+ */
+ dry_allowed_length?: number
+ /**
+ * How many tokens to scan for repetitions. Default: `-1`, where `0` is disabled and `-1` is context size.
+ */
+ dry_penalty_last_n?: number
+ /**
+ * Specify an array of sequence breakers for DRY sampling. Only a JSON array of strings is accepted. Default: `['\n', ':', '"', '*']`
+ */
+ dry_sequence_breakers?: Array<string>
+ /**
+ * Ignore end of stream token and continue generating. Default: `false`
+ */
  ignore_eos?: boolean
+ /**
+ * Modify the likelihood of a token appearing in the generated text completion.
+ * For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood.
+ * Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings,
+ * e.g.`[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does.
+ * Default: `[]`
+ */
  logit_bias?: Array<Array<number>>
+ /**
+ * Set the random number generator (RNG) seed. Default: `-1`, which is a random seed.
+ */
+ seed?: number

  emit_partial_completion: boolean
  }
@@ -125,7 +241,9 @@ export type NativeCPUFeatures = {

  export interface Spec extends TurboModule {
  setContextLimit(limit: number): Promise<void>
- initContext(params: NativeContextParams): Promise<NativeLlamaContext>
+
+ modelInfo(path: string, skip?: string[]): Promise<Object>
+ initContext(contextId: number, params: NativeContextParams): Promise<NativeLlamaContext>

  loadSession(
  contextId: number,
@@ -150,7 +268,11 @@ export interface Spec extends TurboModule {
  chatTemplate?: string,
  ): Promise<string>
  detokenize(contextId: number, tokens: number[]): Promise<string>
- embedding(contextId: number, text: string): Promise<NativeEmbeddingResult>
+ embedding(
+ contextId: number,
+ text: string,
+ params: NativeEmbeddingParams,
+ ): Promise<NativeEmbeddingResult>
  bench(
  contextId: number,
  pp: number,
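A hedged sketch of how the newly documented sampling fields might be combined in a completion call. The context variable, message shape (the usual OpenAI-compatible role/content form), token id, and values are assumptions for illustration. Note that `xtc_t`/`xtc_p` were replaced by `xtc_threshold`/`xtc_probability`, and the DRY penalty grows as multiplier * base ^ (repeat length - allowed length).

```ts
import type { LlamaContext, TokenData } from 'cui-llama.rn'

// Assumes `context` was created with initLlama (see the earlier sketch).
async function generate(context: LlamaContext) {
  return context.completion(
    {
      messages: [{ role: 'user', content: 'Write a haiku about autumn.' }],
      n_predict: 128,
      temperature: 0.8,
      top_k: 40,
      top_p: 0.95,
      min_p: 0.05,
      // XTC sampler: renamed from xtc_t / xtc_p in this release
      xtc_probability: 0.5,
      xtc_threshold: 0.1,
      // DRY repetition penalty: multiplier * base ^ (repeat length - allowed length)
      dry_multiplier: 0.8,
      dry_base: 1.75,
      dry_allowed_length: 2,
      dry_penalty_last_n: -1, // -1 = scan the whole context
      stop: ['</s>'],
      logit_bias: [[15043, 1.0]], // hypothetical token id, boosted as in the JSDoc example
      seed: -1,
    },
    (data: TokenData) => console.log(data.token), // streamed partial tokens
  )
}
```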
package/src/grammar.ts CHANGED
@@ -74,7 +74,7 @@ function buildRepetition(
  return result
  }

- class BuiltinRule {
+ export class SchemaGrammarConverterBuiltinRule {
  content: string

  deps: string[]
@@ -85,9 +85,11 @@ class BuiltinRule {
  }
  }

+ const BuiltinRule = SchemaGrammarConverterBuiltinRule
+
  const UP_TO_15_DIGITS = buildRepetition('[0-9]', 0, 15)

- const PRIMITIVE_RULES: { [key: string]: BuiltinRule } = {
+ const PRIMITIVE_RULES: { [key: string]: SchemaGrammarConverterBuiltinRule } = {
  boolean: new BuiltinRule('("true" | "false") space', []),
  'decimal-part': new BuiltinRule(`[0-9] ${UP_TO_15_DIGITS}`, []),
  'integral-part': new BuiltinRule(`[0-9] | [1-9] ${UP_TO_15_DIGITS}`, []),
@@ -126,7 +128,7 @@ const PRIMITIVE_RULES: { [key: string]: BuiltinRule } = {
  }

  // TODO: support "uri", "email" string formats
- const STRING_FORMAT_RULES: { [key: string]: BuiltinRule } = {
+ const STRING_FORMAT_RULES: { [key: string]: SchemaGrammarConverterBuiltinRule } = {
  date: new BuiltinRule(
  '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( "0" [1-9] | [1-2] [0-9] | "3" [0-1] )',
  [],
@@ -173,7 +175,7 @@ const formatLiteral = (literal: string): string => {
  const generateConstantRule = (value: any): string =>
  formatLiteral(JSON.stringify(value))

- interface PropOrder {
+ export interface SchemaGrammarConverterPropOrder {
  [key: string]: number
  }

@@ -196,7 +198,7 @@ function* groupBy(iterable: Iterable<any>, keyFn: (x: any) => any) {
  }

  export class SchemaGrammarConverter {
- private _propOrder: PropOrder
+ private _propOrder: SchemaGrammarConverterPropOrder

  private _allowFetch: boolean

@@ -209,7 +211,7 @@ export class SchemaGrammarConverter {
  private _refsBeingResolved: Set<string>

  constructor(options: {
- prop_order?: PropOrder
+ prop_order?: SchemaGrammarConverterPropOrder
  allow_fetch?: boolean
  dotall?: boolean
  }) {
@@ -690,7 +692,7 @@ export class SchemaGrammarConverter {
  }
  }

- _addPrimitive(name: string, rule: BuiltinRule | undefined) {
+ _addPrimitive(name: string, rule: SchemaGrammarConverterBuiltinRule | undefined) {
  if (!rule) {
  throw new Error(`Rule ${name} not known`)
  }
@@ -828,7 +830,7 @@ export const convertJsonSchemaToGrammar = ({
  allowFetch,
  }: {
  schema: any
- propOrder?: PropOrder
+ propOrder?: SchemaGrammarConverterPropOrder
  dotall?: boolean
  allowFetch?: boolean
  }): string | Promise<string> => {
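Because `SchemaGrammarConverterPropOrder` (and `SchemaGrammarConverterBuiltinRule`) are now exported, a grammar can be built in user code with explicit property ordering. A small sketch follows; the JSON schema is an assumption, and the resulting string can be passed as the `grammar` completion parameter.

```ts
import { convertJsonSchemaToGrammar } from 'cui-llama.rn'
import type { SchemaGrammarConverterPropOrder } from 'cui-llama.rn'

// Emit `name` before `age` in generated objects.
const propOrder: SchemaGrammarConverterPropOrder = { name: 0, age: 1 }

async function buildGrammar(): Promise<string> {
  // The converter's return type is string | Promise<string>, so awaiting covers both cases.
  const grammar = await convertJsonSchemaToGrammar({
    schema: {
      type: 'object',
      properties: {
        name: { type: 'string' },
        age: { type: 'integer' },
      },
      required: ['name', 'age'],
    },
    propOrder,
  })
  return grammar
}
```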
package/src/index.ts CHANGED
@@ -11,17 +11,38 @@ import type {
  NativeEmbeddingResult,
  NativeSessionLoadResult,
  NativeCPUFeatures,
+ NativeEmbeddingParams,
+ NativeCompletionTokenProbItem,
+ NativeCompletionResultTimings,
  } from './NativeRNLlama'
+ import type { SchemaGrammarConverterPropOrder, SchemaGrammarConverterBuiltinRule } from './grammar'
  import { SchemaGrammarConverter, convertJsonSchemaToGrammar } from './grammar'
- import type { RNLlamaOAICompatibleMessage } from './chat'
+ import type { RNLlamaMessagePart, RNLlamaOAICompatibleMessage } from './chat'
  import { formatChat } from './chat'

+ export type {
+ NativeContextParams,
+ NativeLlamaContext,
+ NativeCompletionParams,
+ NativeCompletionTokenProb,
+ NativeCompletionResult,
+ NativeTokenizeResult,
+ NativeEmbeddingResult,
+ NativeSessionLoadResult,
+ NativeEmbeddingParams,
+ NativeCompletionTokenProbItem,
+ NativeCompletionResultTimings,
+ RNLlamaMessagePart,
+ RNLlamaOAICompatibleMessage,
+ SchemaGrammarConverterPropOrder,
+ SchemaGrammarConverterBuiltinRule,
+ }
+
  export { SchemaGrammarConverter, convertJsonSchemaToGrammar }

+ const EVENT_ON_INIT_CONTEXT_PROGRESS = '@RNLlama_onInitContextProgress'
  const EVENT_ON_TOKEN = '@RNLlama_onToken'

- const EVENT_ON_MODEL_PROGRESS = '@RNLlama_onModelProgress'
-
  let EventEmitter: NativeEventEmitter | DeviceEventEmitterStatic
  if (Platform.OS === 'ios') {
  // @ts-ignore
@@ -41,7 +62,16 @@ type TokenNativeEvent = {
  tokenResult: TokenData
  }

- export type ContextParams = NativeContextParams
+ export type ContextParams = Omit<
+ NativeContextParams,
+ 'cache_type_k' | 'cache_type_v' | 'pooling_type'
+ > & {
+ cache_type_k?: 'f16' | 'f32' | 'q8_0' | 'q4_0' | 'q4_1' | 'iq4_nl' | 'q5_0' | 'q5_1'
+ cache_type_v?: 'f16' | 'f32' | 'q8_0' | 'q4_0' | 'q4_1' | 'iq4_nl' | 'q5_0' | 'q5_1'
+ pooling_type?: 'none' | 'mean' | 'cls' | 'last' | 'rank'
+ }
+
+ export type EmbeddingParams = NativeEmbeddingParams

  export type CompletionParams = Omit<
  NativeCompletionParams,
@@ -49,6 +79,7 @@ export type CompletionParams = Omit<
  > & {
  prompt?: string
  messages?: RNLlamaOAICompatibleMessage[]
+ chatTemplate?: string
  }

  export type BenchResult = {
@@ -100,23 +131,22 @@ export class LlamaContext {

  async getFormattedChat(
  messages: RNLlamaOAICompatibleMessage[],
+ template?: string,
  ): Promise<string> {
  const chat = formatChat(messages)
- return RNLlama.getFormattedChat(
- this.id,
- chat,
- this.model?.isChatTemplateSupported ? undefined : 'chatml',
- )
+ let tmpl = this.model?.isChatTemplateSupported ? undefined : 'chatml'
+ if (template) tmpl = template // Force replace if provided
+ return RNLlama.getFormattedChat(this.id, chat, tmpl)
  }

  async completion(
  params: CompletionParams,
  callback?: (data: TokenData) => void,
  ): Promise<NativeCompletionResult> {
-
  let finalPrompt = params.prompt
- if (params.messages) { // messages always win
- finalPrompt = await this.getFormattedChat(params.messages)
+ if (params.messages) {
+ // messages always win
+ finalPrompt = await this.getFormattedChat(params.messages, params.chatTemplate)
  }

  let tokenListener: any =
@@ -162,8 +192,11 @@ export class LlamaContext {
  return RNLlama.detokenize(this.id, tokens)
  }

- embedding(text: string): Promise<NativeEmbeddingResult> {
- return RNLlama.embedding(this.id, text)
+ embedding(
+ text: string,
+ params?: EmbeddingParams,
+ ): Promise<NativeEmbeddingResult> {
+ return RNLlama.embedding(this.id, text, params || {})
  }

  async bench(
@@ -199,35 +232,78 @@ export async function setContextLimit(limit: number): Promise<void> {
  return RNLlama.setContextLimit(limit)
  }

- export async function initLlama({
+ let contextIdCounter = 0
+ const contextIdRandom = () =>
+ process.env.NODE_ENV === 'test' ? 0 : Math.floor(Math.random() * 100000)
+
+ const modelInfoSkip = [
+ // Large fields
+ 'tokenizer.ggml.tokens',
+ 'tokenizer.ggml.token_type',
+ 'tokenizer.ggml.merges',
+ ]
+ export async function loadLlamaModelInfo(model: string): Promise<Object> {
+ let path = model
+ if (path.startsWith('file://')) path = path.slice(7)
+ return RNLlama.modelInfo(path, modelInfoSkip)
+ }
+
+ const poolTypeMap = {
+ // -1 is unspecified as undefined
+ none: 0,
+ mean: 1,
+ cls: 2,
+ last: 3,
+ rank: 4,
+ }
+
+ export async function initLlama(
+ {
  model,
  is_model_asset: isModelAsset,
+ pooling_type: poolingType,
+ lora,
  ...rest
- }: ContextParams,
- progressCallback?: (progress: number) => void
+ }: ContextParams,
+ onProgress?: (progress: number) => void,
  ): Promise<LlamaContext> {
  let path = model
  if (path.startsWith('file://')) path = path.slice(7)
-
- const modelProgressListener = EventEmitter.addListener(EVENT_ON_MODEL_PROGRESS, (event) => {
- if(event.progress && progressCallback)
- progressCallback(event.progress)
- if(event.progress === 100) {
- modelProgressListener.remove()
- }
- })

+ let loraPath = lora
+ if (loraPath?.startsWith('file://')) loraPath = loraPath.slice(7)
+
+ const contextId = contextIdCounter + contextIdRandom()
+ contextIdCounter += 1
+
+ let removeProgressListener: any = null
+ if (onProgress) {
+ removeProgressListener = EventEmitter.addListener(
+ EVENT_ON_INIT_CONTEXT_PROGRESS,
+ (evt: { contextId: number; progress: number }) => {
+ if (evt.contextId !== contextId) return
+ onProgress(evt.progress)
+ },
+ )
+ }
+
+ const poolType = poolTypeMap[poolingType as keyof typeof poolTypeMap]
  const {
- contextId,
  gpu,
  reasonNoGPU,
  model: modelDetails,
- } = await RNLlama.initContext({
+ } = await RNLlama.initContext(contextId, {
  model: path,
  is_model_asset: !!isModelAsset,
+ use_progress_callback: !!onProgress,
+ pooling_type: poolType,
+ lora: loraPath,
  ...rest,
+ }).catch((err: any) => {
+ removeProgressListener?.remove()
+ throw err
  })
-
+ removeProgressListener?.remove()
  return new LlamaContext({ contextId, gpu, reasonNoGPU, model: modelDetails })
  }
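Taken together, the src/index.ts changes above suggest a flow like the following. This is a sketch under assumptions (import path, model location, and the message shape are not spelled out in this diff): inspect metadata with `loadLlamaModelInfo`, initialize with a per-context progress callback, call `embedding` with per-call params, and format chats with an optional template override.

```ts
import { initLlama, loadLlamaModelInfo } from 'cui-llama.rn'

async function demo(modelPath: string) {
  // GGUF metadata without creating a context; large tokenizer arrays are skipped internally.
  const info = await loadLlamaModelInfo(modelPath)
  console.log('model info:', info)

  const context = await initLlama(
    { model: modelPath, embedding: true, pooling_type: 'mean' },
    (progress) => console.log(`init progress: ${progress}`), // delivered via @RNLlama_onInitContextProgress
  )

  // embedding() now accepts optional per-call params (embd_normalize).
  const result = await context.embedding('hello world', { embd_normalize: 2 })
  console.log('embedding result:', result)

  // A chat template can now be forced per call; omit it to use the model's own template.
  const prompt = await context.getFormattedChat(
    [{ role: 'user', content: 'Hi!' }],
    undefined,
  )
  console.log(prompt)

  await context.release()
}
```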