npm - @inferrlm/react-native-mlx - Versions diffs - 0.4.2-alpha.2 → 0.4.2-alpha.4 - Mend

@inferrlm/react-native-mlx 0.4.2-alpha.2 → 0.4.2-alpha.4

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of each package version as it appears in its respective public registry.

Files changed (2) hide show

package/ios/Sources/HybridLLM.swift +155 -25
package/package.json +1 -1

package/ios/Sources/HybridLLM.swift CHANGED Viewed

@@ -230,10 +230,6 @@ class HybridLLM: HybridLLMSpec {
         }
         return Promise.async { [self] in
-            if self.manageHistory {
-                self.messageHistory.append(LLMMessage(role: "user", content: prompt))
-            }
             let task = Task<String, Error> {
                 log("Generating response for: \(prompt.prefix(50))...")
                 let result = try await session.respond(to: prompt)
@@ -247,6 +243,7 @@ class HybridLLM: HybridLLMSpec {
             let result = try await task.value
             if self.manageHistory {
+                self.messageHistory.append(LLMMessage(role: "user", content: prompt))
                 self.messageHistory.append(LLMMessage(role: "assistant", content: result))
             }
@@ -266,15 +263,14 @@ class HybridLLM: HybridLLMSpec {
         }
         return Promise.async { [self] in
-            if self.manageHistory {
-                self.messageHistory.append(LLMMessage(role: "user", content: prompt))
-            }
             let task = Task<String, Error> {
                 let startTime = Date()
                 var firstTokenTime: Date?
                 var tokenCount = 0
+                log("stream_start prompt=\(prompt.count)chars history=\(self.messageHistory.count) manageHistory=\(self.manageHistory)")
+                log("stream_prompt: \(prompt.prefix(300))")
                 let result = try await self.performGeneration(
                     container: container,
                     prompt: prompt,
@@ -303,7 +299,8 @@ class HybridLLM: HybridLLMSpec {
                     toolExecutionTime: 0
                 )
-                log("Stream complete - \(tokenCount) tokens, \(String(format: "%.1f", tokensPerSecond)) tokens/s")
+                log("stream_done tokens=\(tokenCount) tps=\(String(format: "%.1f", tokensPerSecond)) result=\(result.count)chars")
+                log("stream_result_preview: \(result.prefix(300))")
                 return result
             }
@@ -313,7 +310,9 @@ class HybridLLM: HybridLLMSpec {
             let result = try await task.value
             if self.manageHistory {
+                self.messageHistory.append(LLMMessage(role: "user", content: prompt))
                 self.messageHistory.append(LLMMessage(role: "assistant", content: result))
+                log("stream_history_updated count=\(self.messageHistory.count)")
             }
             return result
@@ -329,10 +328,6 @@ class HybridLLM: HybridLLMSpec {
         }
         return Promise.async { [self] in
-            if self.manageHistory {
-                self.messageHistory.append(LLMMessage(role: "user", content: prompt))
-            }
             let task = Task<String, Error> {
                 let startTime = Date()
                 var firstTokenTime: Date?
@@ -389,6 +384,7 @@ class HybridLLM: HybridLLMSpec {
             let result = try await task.value
             if self.manageHistory {
+                self.messageHistory.append(LLMMessage(role: "user", content: prompt))
                 self.messageHistory.append(LLMMessage(role: "assistant", content: result))
             }
@@ -403,11 +399,14 @@ class HybridLLM: HybridLLMSpec {
     ) -> [Chat.Message] {
         var chat: [Chat.Message] = []
+        log("build_chat depth=\(depth) history=\(self.messageHistory.count) prompt=\(prompt.count)chars")
         if !self.systemPrompt.isEmpty {
             chat.append(.system(self.systemPrompt))
+            log("  [system] \(self.systemPrompt.prefix(80))...")
         }
-        for msg in self.messageHistory {
+        for (i, msg) in self.messageHistory.enumerated() {
             switch msg.role {
             case "user": chat.append(.user(msg.content))
             case "assistant": chat.append(.assistant(msg.content))
@@ -415,18 +414,22 @@ class HybridLLM: HybridLLMSpec {
             case "tool": chat.append(.tool(msg.content))
             default: break
             }
+            log("  [\(i):\(msg.role)] \(msg.content.prefix(120))")
         }
         if depth == 0 {
             chat.append(.user(prompt))
+            log("  [prompt] \(prompt.prefix(200))")
         }
         if let toolResults {
-            for result in toolResults {
+            for (i, result) in toolResults.enumerated() {
                 chat.append(.tool(result))
+                log("  [tool_result_\(i)] \(result.prefix(100))")
             }
         }
+        log("chat_built total=\(chat.count) messages")
         return chat
     }
@@ -460,6 +463,14 @@ class HybridLLM: HybridLLMSpec {
         var output = ""
         var thinkingMachine = ThinkingStateMachine()
         var pendingToolCalls: [(id: String, tool: ToolDefinition, args: [String: Any], argsJson: String)] = []
+        var rawTokenLog = ""
+        let specialTokenPattern = try? NSRegularExpression(
+            pattern: "<\\|(?:im_end|im_start|endoftext|end|pad)\\|>",
+            options: []
+        )
+        log("perform_gen_events depth=\(depth) prompt=\(prompt.count)chars toolResults=\(toolResults?.count ?? 0)")
         let chat = buildChatMessages(prompt: prompt, toolResults: toolResults, depth: depth)
         let userInput = UserInput(
@@ -468,6 +479,7 @@ class HybridLLM: HybridLLMSpec {
         )
         let lmInput = try await container.prepare(input: userInput)
+        log("perform_gen_events input_prepared")
         let stream = try await container.perform { context in
             let parameters = GenerateParameters(maxTokens: 2048, temperature: 0.7)
@@ -478,30 +490,57 @@ class HybridLLM: HybridLLMSpec {
             )
         }
+        var chunkCount = 0
         for await generation in stream {
-            if Task.isCancelled { break }
+            if Task.isCancelled {
+                log("perform_gen_events cancelled at chunk=\(chunkCount)")
+                break
+            }
             switch generation {
             case .chunk(let text):
+                chunkCount += 1
+                rawTokenLog += text
+                if chunkCount <= 20 || chunkCount % 50 == 0 {
+                    log("raw_chunk_events[\(chunkCount)] \(text.debugDescription)")
+                }
                 let outputs = thinkingMachine.process(token: text)
                 for machineOutput in outputs {
                     switch machineOutput {
                     case .token(let token):
-                        output += token
-                        emitter.emitToken(token)
-                        onTokenProcessed()
+                        var cleaned = token
+                        if let regex = specialTokenPattern {
+                            let before = cleaned
+                            cleaned = regex.stringByReplacingMatches(
+                                in: cleaned,
+                                range: NSRange(cleaned.startIndex..., in: cleaned),
+                                withTemplate: ""
+                            )
+                            if before != cleaned {
+                                log("stripped_special_events: \(before.debugDescription) -> \(cleaned.debugDescription)")
+                            }
+                        }
+                        if !cleaned.isEmpty {
+                            output += cleaned
+                            emitter.emitToken(cleaned)
+                            onTokenProcessed()
+                        }
                     case .thinkingStart:
+                        log("thinking_start_events at chunk=\(chunkCount)")
                         emitter.emitThinkingStart()
                     case .thinkingChunk(let chunk):
                         emitter.emitThinkingChunk(chunk)
                     case .thinkingEnd(let content):
+                        log("thinking_end_events at chunk=\(chunkCount)")
                         emitter.emitThinkingEnd(content)
                     }
                 }
+                }
             case .toolCall(let toolCall):
                 log("Tool call detected: \(toolCall.function.name)")
@@ -519,12 +558,15 @@ class HybridLLM: HybridLLMSpec {
                 pendingToolCalls.append((id: toolCallId, tool: tool, args: argsDict, argsJson: argsJson))
             case .info(let info):
-                log("Generation info: \(info.generationTokenCount) tokens, \(String(format: "%.1f", info.tokensPerSecond)) tokens/s")
+                log("gen_info_events chunks=\(chunkCount) genTokens=\(info.generationTokenCount) tps=\(String(format: "%.1f", info.tokensPerSecond))")
                 let generationTime = info.tokensPerSecond > 0 ? Double(info.generationTokenCount) / info.tokensPerSecond * 1000 : 0
                 onGenerationInfo(info.generationTokenCount, generationTime)
             }
         }
+        log("perform_gen_events_loop_done chunks=\(chunkCount) output=\(output.count)chars")
+        log("raw_output_events_first500: \(rawTokenLog.prefix(500))")
         let flushOutputs = thinkingMachine.flush()
         for machineOutput in flushOutputs {
             switch machineOutput {
@@ -614,7 +656,16 @@ class HybridLLM: HybridLLMSpec {
         }
         var output = ""
+        var thinkingMachine = ThinkingStateMachine()
         var pendingToolCalls: [(tool: ToolDefinition, args: [String: Any], argsJson: String)] = []
+        var rawTokenLog = ""
+        let specialTokenPattern = try? NSRegularExpression(
+            pattern: "<\\|(?:im_end|im_start|endoftext|end|pad)\\|>",
+            options: []
+        )
+        log("perform_gen depth=\(depth) prompt=\(prompt.count)chars toolResults=\(toolResults?.count ?? 0)")
         let chat = buildChatMessages(prompt: prompt, toolResults: toolResults, depth: depth)
         let userInput = UserInput(
@@ -623,6 +674,7 @@ class HybridLLM: HybridLLMSpec {
         )
         let lmInput = try await container.prepare(input: userInput)
+        log("perform_gen input_prepared")
         let stream = try await container.perform { context in
             let parameters = GenerateParameters(maxTokens: 2048, temperature: 0.7)
@@ -633,13 +685,55 @@ class HybridLLM: HybridLLMSpec {
             )
         }
+        var chunkCount = 0
         for await generation in stream {
-            if Task.isCancelled { break }
+            if Task.isCancelled {
+                log("perform_gen cancelled at chunk=\(chunkCount)")
+                break
+            }
             switch generation {
             case .chunk(let text):
-                output += text
-                onToken(text)
+                chunkCount += 1
+                rawTokenLog += text
+                if chunkCount <= 20 || chunkCount % 50 == 0 {
+                    log("raw_chunk[\(chunkCount)] \(text.debugDescription)")
+                }
+                let outputs = thinkingMachine.process(token: text)
+                for machineOutput in outputs {
+                    switch machineOutput {
+                    case .token(let token):
+                        var cleaned = token
+                        if let regex = specialTokenPattern {
+                            let before = cleaned
+                            cleaned = regex.stringByReplacingMatches(
+                                in: cleaned,
+                                range: NSRange(cleaned.startIndex..., in: cleaned),
+                                withTemplate: ""
+                            )
+                            if before != cleaned {
+                                log("stripped_special: \(before.debugDescription) -> \(cleaned.debugDescription)")
+                            }
+                        }
+                        if !cleaned.isEmpty {
+                            output += cleaned
+                            onToken(cleaned)
+                        }
+                    case .thinkingStart:
+                        log("thinking_start at chunk=\(chunkCount)")
+                        onToken("<think>")
+                    case .thinkingChunk(let chunk):
+                        onToken(chunk)
+                    case .thinkingEnd:
+                        log("thinking_end at chunk=\(chunkCount)")
+                        onToken("</think>")
+                    }
+                }
             case .toolCall(let toolCall):
                 log("Tool call detected: \(toolCall.function.name)")
@@ -656,7 +750,38 @@ class HybridLLM: HybridLLMSpec {
                 onToolCall(toolCall.function.name, argsJson)
             case .info(let info):
-                log("Generation info: \(info.generationTokenCount) tokens, \(String(format: "%.1f", info.tokensPerSecond)) tokens/s")
+                log("gen_info chunks=\(chunkCount) genTokens=\(info.generationTokenCount) tps=\(String(format: "%.1f", info.tokensPerSecond))")
+            }
+        }
+        log("perform_gen_loop_done chunks=\(chunkCount) output=\(output.count)chars")
+        log("raw_output_first500: \(rawTokenLog.prefix(500))")
+        let flushOutputs = thinkingMachine.flush()
+        if !flushOutputs.isEmpty {
+            log("flush_outputs count=\(flushOutputs.count)")
+        }
+        for machineOutput in flushOutputs {
+            switch machineOutput {
+            case .token(let token):
+                var cleaned = token
+                if let regex = specialTokenPattern {
+                    cleaned = regex.stringByReplacingMatches(
+                        in: cleaned,
+                        range: NSRange(cleaned.startIndex..., in: cleaned),
+                        withTemplate: ""
+                    )
+                }
+                if !cleaned.isEmpty {
+                    output += cleaned
+                    onToken(cleaned)
+                }
+            case .thinkingStart:
+                onToken("<think>")
+            case .thinkingChunk(let chunk):
+                onToken(chunk)
+            case .thinkingEnd:
+                onToken("</think>")
             }
         }
@@ -710,6 +835,7 @@ class HybridLLM: HybridLLMSpec {
             return output + continuation
         }
+        log("perform_gen_result output=\(output.count)chars preview: \(output.prefix(200))")
         return output
     }
@@ -793,10 +919,14 @@ class HybridLLM: HybridLLMSpec {
     }
     func clearHistory() throws {
+        log("clear_history before=\(messageHistory.count) messages")
+        for (i, msg) in messageHistory.enumerated() {
+            log("  clearing[\(i):\(msg.role)] \(msg.content.prefix(80))")
+        }
         messageHistory = []
         if let container = self.container {
             self.session = ChatSession(container, instructions: self.systemPrompt)
         }
-        log("History and session cleared")
+        log("clear_history done session_reset")
     }
 }

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@inferrlm/react-native-mlx",
   "description": "MLX Swift integration for React Native - InferrLM fork with enhanced features",
-  "version": "0.4.2-alpha.2",
+  "version": "0.4.2-alpha.4",
   "main": "./lib/module/index.js",
   "module": "./lib/module/index.js",
   "types": "./lib/typescript/src/index.d.ts",