python-fastllm 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fastllm/__init__.py +1 -0
- fastllm/_modidx.py +245 -0
- fastllm/acomplete.py +122 -0
- fastllm/anthropic.py +298 -0
- fastllm/chat.py +622 -0
- fastllm/gemini.py +304 -0
- fastllm/openai_chat.py +219 -0
- fastllm/openai_responses.py +260 -0
- fastllm/specs/anthropic.json +1 -0
- fastllm/specs/anthropic.yml +15684 -0
- fastllm/specs/gemini.json +6951 -0
- fastllm/specs/openai.with-code-samples.json +1 -0
- fastllm/specs/openai.with-code-samples.yml +73650 -0
- fastllm/specs/spec_manifest.json +17 -0
- fastllm/streaming.py +162 -0
- fastllm/types.py +301 -0
- python_fastllm-0.0.1.dist-info/METADATA +395 -0
- python_fastllm-0.0.1.dist-info/RECORD +21 -0
- python_fastllm-0.0.1.dist-info/WHEEL +5 -0
- python_fastllm-0.0.1.dist-info/entry_points.txt +2 -0
- python_fastllm-0.0.1.dist-info/top_level.txt +1 -0
fastllm/chat.py
ADDED
|
@@ -0,0 +1,622 @@
|
|
|
1
|
+
"""High level chat api for fastllm similar to lisette"""
|
|
2
|
+
|
|
3
|
+
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/07_chat.ipynb.
|
|
4
|
+
|
|
5
|
+
# %% auto #0
|
|
6
|
+
__all__ = ['haik45', 'sonn45', 'sonn', 'sonn46', 'opus46', 'opus', 'gpt54', 'gpt54m', 'codex54', 'codex55', 'tool_dtls_tag',
|
|
7
|
+
're_tools', 'token_dtls_tag', 're_token', 'effort', 'remove_cache_ckpts', 'contents', 'stop_reason',
|
|
8
|
+
'mk_msg', 'split_tools', 'fmt2hist', 'mk_msgs', 'cite_footnote', 'postproc', 'lite_mk_func', 'ToolResponse',
|
|
9
|
+
'structured', 'StopResponse', 'FullResponse', 'search_count', 'UsageStats', 'AsyncChat', 'add_warning',
|
|
10
|
+
'astream_with_complete', 'mk_tr_details', 'mk_srv_tc_details', 'StreamFormatter', 'AsyncStreamFormatter',
|
|
11
|
+
'adisplay_stream']
|
|
12
|
+
|
|
13
|
+
# %% ../nbs/07_chat.ipynb #d5a3bc1f
|
|
14
|
+
import asyncio, base64, json, mimetypes, random, string, ast, warnings
|
|
15
|
+
from typing import Optional,Callable
|
|
16
|
+
from html import escape
|
|
17
|
+
from toolslm.funccall import mk_ns, call_func, call_func_async, get_schema
|
|
18
|
+
from fastcore.utils import *
|
|
19
|
+
from fastcore.meta import delegates
|
|
20
|
+
from fastcore import imghdr
|
|
21
|
+
from fastcore.xml import Safe
|
|
22
|
+
from dataclasses import dataclass
|
|
23
|
+
|
|
24
|
+
from .acomplete import *
|
|
25
|
+
from .acomplete import Msg, Part, PartType, ToolCall, Completion, mk_tool_res_msg, get_model_info
|
|
26
|
+
|
|
27
|
+
# %% ../nbs/07_chat.ipynb #c4b8f12b
|
|
28
|
+
# Short aliases for current model ids, grouped by vendor.
haik45 = "claude-haiku-4-5"
sonn45 = "claude-sonnet-4-5"
sonn = sonn46 = "claude-sonnet-4-6"  # `sonn` tracks the latest sonnet
opus46 = "claude-opus-4-6"
opus = "claude-opus-4-7"             # `opus` tracks the latest opus
gpt54 = "gpt-5.4"
gpt54m = "gpt-5.4-mini"
codex54 = "gpt-5.4"                  # NOTE(review): same id as `gpt54` — confirm this is intended
codex55 = "gpt-5.5"
|
|
37
|
+
|
|
38
|
+
# %% ../nbs/07_chat.ipynb #90f55ad4
|
|
39
|
+
def _bytes2content(data):
    "Convert bytes to litellm content dict (image, pdf, audio, video)"
    mtype = detect_mime(data)
    if not mtype: raise ValueError(f'Data must be a supported file type, got {data[:10]}')
    uri = f'data:{mtype};base64,{base64.b64encode(data).decode("utf-8")}'
    ptype = PartType.input_image if mtype.startswith('image/') else PartType.input_file
    return Part(type=ptype, text=uri)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# %% ../nbs/07_chat.ipynb #48c78e48
|
|
49
|
+
def _add_cache_control(msg, # LiteLLM formatted msg
                       ttl=None): # Cache TTL: '5m' (default) or '1h'
    "cache `msg` with default time-to-live (ttl) of 5minutes ('5m'), but can be set to '1h'."
    cc = {"type": "ephemeral"} | ({"ttl": ttl} if ttl else {})
    # Find the LAST cacheable part: Anthropic cache breakpoints belong on the
    # final text/tool_use block, and only a handful are allowed per request.
    cache_idx = None
    for idx, part in enumerate(msg.content):
        if part.type in (PartType.text, PartType.tool_use): cache_idx = idx
    # Previously `cache_idx` was computed but never used and cache_control was
    # merged into parts by loop index, which can mark more blocks than Anthropic
    # permits; mark only the last eligible part.
    if cache_idx is not None:
        msg.content[cache_idx].data = merge(msg.content[cache_idx].data or {}, dict(cache_control=cc))
    return msg
|
|
58
|
+
|
|
59
|
+
def _has_cache(msg):
|
|
60
|
+
"Check if msg has cache_control set"
|
|
61
|
+
return any(part.data and 'cache_control' in part.data for part in msg.content)
|
|
62
|
+
|
|
63
|
+
def remove_cache_ckpts(msg):
    "remove cache checkpoints and return msg."
    for p in (part for part in msg.content if part.data):
        p.data.pop('cache_control', None)
    return msg
|
|
68
|
+
|
|
69
|
+
def _mk_content(o):
    "Normalise one message part: wrap str/bytes, pass anything else through."
    if isinstance(o, bytes): return _bytes2content(o)
    if isinstance(o, str): return Part(type=PartType.text, text=o.strip())
    return o
|
|
73
|
+
|
|
74
|
+
def contents(c):
    "Get Msg object from Completion."
    return c.message if c.message else ''
|
|
78
|
+
|
|
79
|
+
def stop_reason(c):
    "Finish reason of Completion `c`, or 'unk' when the provider gave none."
    return c.finish_reason if c.finish_reason else 'unk'
|
|
82
|
+
|
|
83
|
+
# %% ../nbs/07_chat.ipynb #8bdd997c
|
|
84
|
+
def mk_msg(
    content, # Content: str, bytes (image), list of mixed content, or dict w 'role' and 'content' fields
    role="user", # Message role if content isn't already a dict/Message
    cache=False, # Enable Anthropic caching
    ttl=None # Cache TTL: '5m' (default) or '1h'
):
    "Create a LiteLLM compatible message."
    if content is None: return None
    # Already-canonical inputs pass straight through.
    if isinstance(content, Msg): return content
    if isinstance(content, Completion): return content.message
    if isinstance(content, dict):
        return Msg(role=content['role'], content=[Part(PartType.text, content['content'])])
    if isinstance(content, list):
        if len(content) == 1 and isinstance(content[0], str): body = [Part(PartType.text, content[0])]
        else: body = [_mk_content(o) for o in content]
    else: body = [Part(PartType.text, content)]
    res = Msg(role=role, content=body)
    if cache: res = _add_cache_control(res, ttl=ttl)
    return res
|
|
100
|
+
|
|
101
|
+
# %% ../nbs/07_chat.ipynb #db466e1c
|
|
102
|
+
# HTML <details> wrapper emitted when a tool call (with its JSON payload) is
# rendered into chat-formatted markdown output.
tool_dtls_tag = "<details class='tool-usage-details'>"
# Matches one rendered tool-call block: optional <summary>, then a ```json fence.
re_tools = re.compile(fr"^({tool_dtls_tag}\n*(?:<summary>(?P<summary>.*?)</summary>\n*)?\n*```json\n+(.*?)\n+```\n+</details>)",
                      flags=re.DOTALL|re.MULTILINE)
# Wrapper + pattern for the token-usage footer appended by `UsageStats.fmt`.
token_dtls_tag = "<details class='token-usage-details'>"
re_token = re.compile(fr"^{re.escape(token_dtls_tag)}<summary>.*?</summary>\n*\n*`.*?`\n*\n*</details>\n?",
                      flags=re.DOTALL|re.MULTILINE)
|
|
108
|
+
|
|
109
|
+
# %% ../nbs/07_chat.ipynb #45ada210
|
|
110
|
+
def _extract_tool_parts(text:str):
    "Extract (tool_use_part, tool_result_part) from <details> json block"
    # `except Exception` (not bare `except:`) so KeyboardInterrupt/SystemExit
    # still propagate; any parse failure means "not a valid tool block".
    try: d = json.loads(text.strip())
    except Exception: return None
    call = d['call']
    # Skip server tool calls in deserialization (round trip issues with Gemini/Anthropic)
    if d.get('server'): return None
    tu = Part(type=PartType.tool_use, text=None, data={'id': d['id'], 'name': call['function'], 'arguments': call['arguments']})
    tr = Part(type=PartType.tool_result, text=str(d['result']), data={'id': d['id'], 'name': call['function']})
    return tu, tr
|
|
120
|
+
|
|
121
|
+
def split_tools(s):
    "Split formatted output into (text, summary, tooljson) chunks"
    # re_tools.split yields repeating groups of 4: text, whole match, summary, json
    pieces = re_tools.split(s.strip())
    return [(txt, summ, tj) for txt, _, summ, tj in chunked(pieces, 4, pad=True)]
|
|
124
|
+
|
|
125
|
+
def fmt2hist(outp:str)->list[Msg]:
    "Transform a formatted output string into fastllm canonical Msgs"
    # Strip the token-usage footer first; it is display-only metadata.
    if token_dtls_tag in outp: outp = re_token.sub('', outp)
    # No tool blocks => the whole string is a single assistant text message.
    if tool_dtls_tag not in outp: return [Msg(role='assistant', content=[Part(type=PartType.text, text=outp.strip())])]
    hist, asst_parts, tool_parts = [], [], []
    def flush():
        # Emit the accumulated assistant turn followed by its tool results.
        if tool_parts:
            hist.append(Msg(role='assistant', content=asst_parts.copy()))
            hist.append(Msg(role='tool', content=tool_parts.copy()))
            asst_parts.clear(); tool_parts.clear()
    for txt,_,tj in split_tools(outp):
        if txt and txt.strip():
            if tool_parts: flush() # text after tool results => new assistant turn
            asst_parts.append(Part(type=PartType.text, text=txt.strip()))
        # Each tool json block contributes a tool_use part to the assistant turn
        # and a tool_result part to the following tool turn.
        if tj and (tp := _extract_tool_parts(tj)):
            asst_parts.append(tp[0])
            tool_parts.append(tp[1])
    flush()
    # Trailing assistant text with no tool results after it.
    if asst_parts: hist.append(Msg(role='assistant', content=asst_parts))
    # TODO: Is this needed?
    # if hist and hist[-1].role == 'tool':
    #     hist.append(Msg(role='assistant', content=[Part(type=PartType.text, text='.')]))
    return hist
|
|
148
|
+
|
|
149
|
+
# %% ../nbs/07_chat.ipynb #8de5ce8d
|
|
150
|
+
def _apply_cache_idxs(msgs, cache_idxs=(-1,), ttl=None):
    "Add cache control to `cache_idxs` after filtering tool-role msgs"
    # Default changed from a mutable list literal to a tuple (same values) —
    # mutable defaults are shared across calls.
    # Indices are interpreted against non-tool messages only, since tool
    # results don't take Anthropic cache breakpoints here.
    ms = [j for j,m in enumerate(msgs) if m.role != 'tool']
    for i in cache_idxs:
        try: idx = ms[i]
        except IndexError: continue  # fewer messages than requested breakpoints
        _add_cache_control(msgs[idx], ttl)
|
|
157
|
+
|
|
158
|
+
# %% ../nbs/07_chat.ipynb #85882c9c
|
|
159
|
+
def mk_msgs(
    msgs, # List of messages (each: str, bytes, list, Msg, or Completion)
    cache=False, # Enable Anthropic caching
    cache_idxs=[-1], # Cache breakpoint idxs (read-only; mutable default is not mutated)
    ttl=None, # Cache TTL: '5m' (default) or '1h'
):
    "Create a list of fastllm canonical Msgs."
    if not msgs: return []
    if not isinstance(msgs, list): msgs = [msgs]
    # Strings containing rendered tool <details> blocks are expanded back into
    # their assistant/tool message sequence via `fmt2hist`.
    msgs = L(msgs).map(lambda m: fmt2hist(m) if isinstance(m,str) and tool_dtls_tag in m else [m]).concat()
    # Assign alternating roles: plain content becomes user/assistant turns in
    # order; a tool turn is followed by another assistant turn.
    res, role = [], 'user'
    for m in msgs:
        res.append(msg := remove_cache_ckpts(mk_msg(m, role=role)))
        role = 'assistant' if msg.role in ('user','tool') else 'user'
    if cache: _apply_cache_idxs(res, cache_idxs, ttl)
    return res
|
|
175
|
+
|
|
176
|
+
# %% ../nbs/07_chat.ipynb #447aed6e
|
|
177
|
+
def cite_footnote(citations):
    'Build citation footnotes for a single Delta'
    out = []
    for c in citations:
        # Any citation missing a title aborts the whole footnote string.
        if 'title' not in c: return ''
        safe_title = c['title'].replace('"', '\\"')
        out.append(f'[*]({c["url"]} "{safe_title}")')
    return ' '.join(out)
|
|
185
|
+
|
|
186
|
+
# %% ../nbs/07_chat.ipynb #2680479a
|
|
187
|
+
def postproc(chunk):
    'Convert Anthropic citations into hyperlink text'
    if not isinstance(chunk, dict): return chunk
    if 'citations' not in chunk: return chunk
    return dict(text=cite_footnote(chunk['citations']))
|
|
191
|
+
|
|
192
|
+
# %% ../nbs/07_chat.ipynb #0e7d3980
|
|
193
|
+
def lite_mk_func(f):
    "Pass dict schemas through; otherwise build an OpenAI-style function schema from callable `f`."
    return f if isinstance(f, dict) else {'type':'function', 'function':get_schema(f, pname='parameters')}
|
|
196
|
+
|
|
197
|
+
# %% ../nbs/07_chat.ipynb #3e0afa31
|
|
198
|
+
@dataclass
class ToolResponse:
    "Wrapper letting a tool return pre-formatted result content as-is (unwrapped by `_mk_tool_result`)."
    # NOTE(review): `list[str,str]` looks malformed — `list` takes a single type
    # parameter. Left unchanged since the intended element type is unclear.
    content: list[str,str]
|
|
201
|
+
|
|
202
|
+
# %% ../nbs/07_chat.ipynb #bba6fd58
|
|
203
|
+
def _mk_tool_result(res):
    "Unwrap `ToolResponse`, and format tool result message"
    if isinstance(res, ToolResponse): return res.content
    # Strings pass through untouched; everything else is stringified.
    return res if isinstance(res, str) else str(res)
|
|
209
|
+
|
|
210
|
+
# %% ../nbs/07_chat.ipynb #a0fcc96e
|
|
211
|
+
def _call_func(tc:ToolCall, tool_schemas, ns, callf):
    "Call tool function synchronously and return formatted result"
    # Only names declared in the schemas may be invoked.
    valid = {nested_idx(o,'function','name') for o in tool_schemas or []}
    if tc.name not in valid: return f"Tool not defined in tool_schemas: {tc.name}"
    return callf(tc.name, tc.arguments, ns=ns, raise_on_err=False)
|
|
216
|
+
|
|
217
|
+
# %% ../nbs/07_chat.ipynb #dbbb66e9
|
|
218
|
+
def _lite_call_func(tc, tool_schemas, ns):
    "Call tool function synchronously and return formatted result"
    return _mk_tool_result(_call_func(tc, tool_schemas, ns, call_func))
|
|
222
|
+
|
|
223
|
+
# %% ../nbs/07_chat.ipynb #6fb0e375
|
|
224
|
+
@delegates(acomplete)
async def structured(
    m:str, # LiteLLM model string
    msgs:list, # List of messages
    tool:Callable, # Tool to be used for creating the structured output (class, dataclass or Pydantic, function, etc)
    sp:str|Part='', # System message
    **kwargs):
    "Return the value of the tool call (generally used for structured outputs)"
    t = lite_mk_func(tool)
    # Force the model to call exactly this tool, then construct the result
    # object directly from the returned arguments.
    r = await acomplete(msgs, m, system=sp, tools=[t], tool_choice=nested_idx(t, 'function', 'name'), **kwargs)
    return tool(**r.tool_calls[0].arguments)
|
|
235
|
+
|
|
236
|
+
# %% ../nbs/07_chat.ipynb #1fe8a9bc
|
|
237
|
+
def _has_search(info): return bool(info.get('search_context_cost_per_query') or info.get('supports_web_search'))
|
|
238
|
+
|
|
239
|
+
# %% ../nbs/07_chat.ipynb #2d78087b
|
|
240
|
+
# Effort-level shorthand: 'l'/'m'/'h' -> 'low'/'medium'/'high', plus 'x' -> 'max'.
effort = AttrDict({o[0]:o for o in ('low','medium','high')})
effort['x'] = 'max'
|
|
242
|
+
|
|
243
|
+
# %% ../nbs/07_chat.ipynb #e1facb77
|
|
244
|
+
def _mk_prefill(pf): return dict(text=pf)
|
|
245
|
+
|
|
246
|
+
# %% ../nbs/07_chat.ipynb #dc17f844
|
|
247
|
+
class StopResponse(str): pass
|
|
248
|
+
class FullResponse(str): pass
|
|
249
|
+
|
|
250
|
+
def _has_stop(tres_parts): return any(isinstance(p.text, StopResponse) for p in tres_parts)
|
|
251
|
+
|
|
252
|
+
# %% ../nbs/07_chat.ipynb #f58ce348
|
|
253
|
+
def _trunc_str(s, mx=2000, skip=10, replace="TRUNCATED"):
    "Truncate `s` to `mx` chars max, adding `replace` if truncated"
    if not isinstance(s, str): s = str(s)
    # 𝍁…𝍁 delimiters mark pre-approved content: strip them and return verbatim.
    if len(s)>2 and s[0]=='𝍁' and s[-1]=='𝍁': return s[1:-1]
    # FullResponse/Safe instances (checked by class name) are never truncated.
    if isinstance_str(s, ('FullResponse','Safe')): return s
    s = str(s).strip()
    if len(s)<=mx: return s
    # Drop `skip` chars from the head too, so the ellipses are visibly mid-text.
    s = s[skip:mx-skip]
    ss = s.split(' ')
    # Avoid ending on a huge unbroken token (e.g. base64): keep only its first 5 chars.
    if len(ss[-1])>150: ss[-1] = ss[-1][:5]
    s = ' '.join(ss)
    if skip: s = f"…{s}"
    s = f"{s}…"
    # Wrap in <TRUNCATED>…</TRUNCATED>-style markers so the model knows it was cut.
    if replace: s = f"<{replace}>{s}</{replace}>"
    return s
|
|
268
|
+
|
|
269
|
+
# %% ../nbs/07_chat.ipynb #ca9e447e
|
|
270
|
+
# Injected as the final user turn once the tool-call budget is spent (see `AsyncChat._call`).
_final_prompt = dict(role="user", content="You have used all your tool calls for this turn. Please summarize your findings. If you did not complete your goal, tell the user what further work is needed. You may use tools again on the next user message.")

# Prefixed to truncated tool results when retrying after a ContextWindowExceededError.
_cwe_msg = "ContextWindowExceededError: Do no more tool calls and complete your response now. Inform user that you ran out of context and explain what the cause was. This is the response to this tool call, truncated if needed: "
|
|
273
|
+
|
|
274
|
+
# %% ../nbs/07_chat.ipynb #05c20a94
|
|
275
|
+
def search_count(r):
    "Count of server-side web searches recorded in `r.usage.raw` (0 if none)."
    raw = r.usage.raw
    # Each provider reports the count under a different key path.
    for path in (('server_tool_use','web_search_requests'),  # Anthropic
                 ('server_tool_use','google_search'),        # Gemini
                 ('web_search_requests',)):                  # streaming with `include_usage`
        if cnt := nested_idx(raw, *path): return cnt
    return 0
|
|
280
|
+
|
|
281
|
+
# %% ../nbs/07_chat.ipynb #61395e0d
|
|
282
|
+
class UsageStats:
    "Accumulating token/cost statistics for one or more completions."
    def __init__(self, prompt_tokens=0, completion_tokens=0, total_tokens=0, cached_tokens=0, cache_creation_tokens=0, reasoning_tokens=0, web_search_requests=0, cost=0.0): store_attr()

    @classmethod
    def from_response(cls, r):
        # Alternate constructor from a Completion; missing usage fields count as 0.
        u = r.usage
        return cls(
            prompt_tokens=u.prompt_tokens or 0, completion_tokens=u.completion_tokens or 0, total_tokens=u.total_tokens or 0,
            cached_tokens=u.cached_tokens or 0, cache_creation_tokens=u.cache_creation_tokens or 0, reasoning_tokens=u.reasoning_tokens or 0,
            web_search_requests=search_count(r), cost=r.cost)

    def __add__(self, other):
        # Field-wise sum; `None` acts as the additive identity.
        if other is None: return self
        return UsageStats(**{k: getattr(self, k, 0) + getattr(other, k, 0)
            for k in ('prompt_tokens', 'completion_tokens', 'total_tokens', 'cached_tokens', 'cache_creation_tokens', 'reasoning_tokens', 'web_search_requests', 'cost')
        })
    # Supports `sum(...)` (which starts from 0) and `None + stats`.
    def __radd__(self, other): return self if other is None or other == 0 else self.__add__(other)

    def __repr__(self):
        # Human-readable one-liner; optional fields only shown when non-zero.
        hit = f"{100*self.cached_tokens/self.prompt_tokens:.1f}%" if self.prompt_tokens else "N/A"
        parts = [f"total={self.total_tokens:,}", f"in={self.prompt_tokens:,}", f"out={self.completion_tokens:,}", f"cached={hit}"]
        if self.cache_creation_tokens: parts.append(f"cache_new={self.cache_creation_tokens:,}")
        if self.reasoning_tokens: parts.append(f"reasoning={self.reasoning_tokens:,}")
        if getattr(self, 'web_search_requests', None): parts.append(f"searches={self.web_search_requests}")
        if self.cost: parts.append(f"${self.cost:.4f}")
        return ' | '.join(parts)

    def fmt(self):
        # Collapsible markdown footer ('' when nothing was used); stripped back
        # out of formatted history by `re_token`.
        if not self.total_tokens: return ''
        summ = f"${self.cost:.4f}" if self.cost else f"{self.total_tokens:,} tokens"
        return f"\n\n{token_dtls_tag}<summary>{summ}</summary>\n\n`{self!r}`\n\n</details>\n"
|
|
313
|
+
|
|
314
|
+
# %% ../nbs/07_chat.ipynb #67fd51cb
|
|
315
|
+
def _inject_tool_reminder(msgs, reminder):
    "Append `reminder` as an extra text part to the first message of the trailing tool-result run, without mutating the caller's messages."
    import copy
    i = len(msgs)
    # Walk back over the trailing run of tool-result messages.
    while i>0 and msgs[i-1].role=='tool': i-=1
    if i>=len(msgs): return msgs  # no trailing tool messages: nothing to inject
    msgs = list(msgs)
    # Shallow-copy the target Msg and give it a fresh content list. The previous
    # code appended to the shared Msg object, so the "transient" reminder leaked
    # into `self.hist` permanently and accumulated on every call.
    m = copy.copy(msgs[i])
    m.content = [*m.content, Part(type=PartType.text, text=reminder)]
    msgs[i] = m
    return msgs
|
|
323
|
+
|
|
324
|
+
# %% ../nbs/07_chat.ipynb #e9a14051
|
|
325
|
+
class AsyncChat:
    "Stateful async chat client: keeps history, runs tools, tracks usage."
    def __init__(
        self,
        model:str, # LiteLLM compatible model name
        sp='', # System prompt
        temp=0, # Temperature
        search=False, # Search (l,m,h), if model supports it
        tools:list=None, # Add tools
        hist:list=None, # Chat history
        ns:Optional[dict]=None, # Custom namespace for tool calling
        cache=False, # Anthropic prompt caching
        cache_idxs:list=[-1], # Anthropic cache breakpoint idxs, use `0` for sys prompt if provided
        ttl=None, # Anthropic prompt caching ttl
        api_name=None, # API to use, one of ApiName: openai (responses), openai_chat, anthropic, gemini
        vendor_name=None, # Vendor name, one of vendor_mapping which resolves api_base/api_key automatically
        api_key=None, # API key when model can't be resolved or vendor_name is not known or codex
        base_url=None, # API base url when model can't be resolved or vendor_name is not known
        extra_headers=None, # Extra HTTP headers for custom providers
        markup=0, # Cost markup multiplier (e.g. 0.5 for 50%)
        tool_reminder=None, # Prepended as a block to the first trailing tool result (transient)
    ):
        "LiteLLM chat client."
        self.model = model
        # Normalise history and tools; `store_attr()` below stores the rebound locals.
        hist,tools = mk_msgs(hist,cache,cache_idxs,ttl),listify(tools)
        # Tool-call namespace: default to the tools themselves, else module globals.
        if ns is None and tools: ns = mk_ns(tools)
        elif ns is None: ns = globals()
        self.tool_schemas = [lite_mk_func(t) for t in tools] if tools else None
        self.use = UsageStats()
        store_attr()

    def _prep_msg(self, msg=None, prefill=None):
        "Prepare the system prompt and messages list for the API call"
        sp = self.sp
        if sp:
            # Cache idx 0 refers to the system prompt; remaining positive idxs
            # shift down by one so they index into the message list.
            if 0 in self.cache_idxs: sp = _add_cache_control(Msg('',[Part(PartType.text, sp)]))
            cache_idxs = L(self.cache_idxs).filter().map(lambda o: o-1 if o>0 else o)
        else:
            cache_idxs = self.cache_idxs
        if msg: self.hist = self.hist+[msg]
        # Cache checkpoints are only applied for Anthropic (claude) models.
        self.hist = mk_msgs(self.hist, self.cache and 'claude' in self.model, cache_idxs, self.ttl)
        msgs = self.hist
        # Prefill is sent as a trailing assistant message but NOT kept in hist.
        if prefill: msgs = self.hist + [Msg(role='assistant', content=[Part(PartType.text, prefill)])]
        if self.tool_reminder: msgs = _inject_tool_reminder(msgs, self.tool_reminder)
        if 'deepseek' in self.model:
            # The `reasoning_content` in the thinking mode must be passed back to the API.
            for m in msgs:
                if m.role=='assistant':
                    if not any(p.type==PartType.thinking for p in m.content):
                        m.content.append(Part(PartType.thinking, ''))
        return sp, msgs

    @property
    def tcdict(self): return dict(tool_schemas=self.tool_schemas, ns=self.ns)  # kwargs bundle for tool-call helpers

    def _track(self, res):
        # Accumulate usage/cost (with the configured markup) for this completion.
        u = UsageStats.from_response(res)
        u.cost *= (1 + self.markup)
        self.use += u
|
|
382
|
+
|
|
383
|
+
# %% ../nbs/07_chat.ipynb #2e469ea1
|
|
384
|
+
def _srvtools(tcs):
    "Tool calls executed server-side (None when `tcs` is empty/None)."
    if not tcs: return None
    return L(tcs).filter(lambda o: o.server)

def _usrtools(tcs):
    "Tool calls that must be executed locally (None when `tcs` is empty/None)."
    if not tcs: return None
    return L(tcs).filter(lambda o: not o.server)
|
|
386
|
+
|
|
387
|
+
# %% ../nbs/07_chat.ipynb #a2e70fbb
|
|
388
|
+
def add_warning(r, msg):
    "Append a `<warning>` text part to `r.message` in place."
    wrn = Part(PartType.text, f"<warning>{msg}</warning>")
    if not r.message.content: r.message.content = [wrn]
    else: r.message.content.append(wrn)
|
|
392
|
+
|
|
393
|
+
# %% ../nbs/07_chat.ipynb #e16195f9
|
|
394
|
+
def _handle_stop_reason(res):
    "Returns (action, warning_msg) - action is 'warning', 'pause', or None"
    sr = stop_reason(res)
    if sr == 'length': return 'warning', 'Response was cut off at token limit.'
    # 'refusal' and 'content_filter' both mean provider-side filtering; use one
    # consistent message (the 'refusal' variant previously lacked the period).
    if sr in ('refusal', 'content_filter'): return 'warning', 'AI server provider content filter was applied to this request.'
    # if sr == 'pause_turn': return 'retry', None # TODO: Not a canonical finish reason
    return None, None
|
|
402
|
+
|
|
403
|
+
# %% ../nbs/07_chat.ipynb #19b87f53
|
|
404
|
+
def _think_kw(model, think, vendor_name):
    "Build provider-specific reasoning/thinking kwargs for effort level `think`."
    if not think: return {}
    # opus-4-7 uses the adaptive-thinking API with an output effort config.
    if 'opus-4-7' in model:
        e = 'xhigh' if think=='h' else effort.get(think)
        return dict(thinking={"type":"adaptive", "display":"summarized"}, output_config={"effort":e})
    # `except Exception` (not bare `except:`) so KeyboardInterrupt/SystemExit
    # still propagate; any lookup failure means "no xhigh support".
    try: xhigh = get_model_info(model, vendor_name).get('supports_xhigh_reasoning_effort')
    except Exception: xhigh = False
    eff = effort.get(think) if think!='x' else 'xhigh' if xhigh else 'high'
    # Codex expects a structured reasoning_effort dict with a summary mode.
    if vendor_name == 'codex': return dict(reasoning_effort={'effort':eff, 'summary':'auto'})
    return dict(reasoning_effort=eff)
|
|
414
|
+
|
|
415
|
+
# %% ../nbs/07_chat.ipynb #b3f28523
|
|
416
|
+
@patch
def _prep_call(self:AsyncChat, prefill, search, max_tokens, kwargs, stream=False, think=None):
    "Prepare model info, prefill, search, and provider kwargs for a completion call"
    model_info = get_model_info(self.model, self.vendor_name)
    # Default to the model's advertised output limit.
    if max_tokens is None: max_tokens = model_info.get('max_output_tokens')
    # Drop prefill silently if the model can't honour it.
    if not model_info.get("supports_assistant_prefill"): prefill = None
    # Per-call `search` overrides the instance default, and only applies when
    # the model supports web search.
    if _has_search(model_info) and (s:=ifnone(search,self.search)):
        if 'web_search_options' not in kwargs: kwargs['web_search_options'] = {}
        kwargs['web_search_options']['search_context_size'] = effort[s]
        if self.vendor_name == 'codex': kwargs['web_search_options']['type'] = 'web_search'
    else: kwargs.pop('web_search_options', None)
    # kwargs['additional_drop_params'] = ['temperature'] # TODO: What is this for?
    # Forward routing/auth options only when set.
    if self.api_name: kwargs['api_name'] = self.api_name
    if self.vendor_name: kwargs['vendor_name'] = self.vendor_name
    if self.api_key: kwargs['api_key'] = self.api_key
    if self.base_url: kwargs['base_url'] = self.base_url
    # NOTE(review): key is 'xtra_headers' while the attribute is `extra_headers`
    # — confirm `acomplete` expects this spelling.
    if self.extra_headers: kwargs['xtra_headers'] = self.extra_headers
    kwargs.update(_think_kw(self.model, think, self.vendor_name))
    return prefill, max_tokens
|
|
435
|
+
|
|
436
|
+
# %% ../nbs/07_chat.ipynb #07951b77
|
|
437
|
+
@patch
def print_hist(self:AsyncChat):
    "Print each message on a different line"
    # Rendering is delegated to `display_list` (imported via fastcore/acomplete star imports).
    return display_list(self.hist)
|
|
441
|
+
|
|
442
|
+
# %% ../nbs/07_chat.ipynb #bf84d49a
|
|
443
|
+
async def _alite_call_func(tc, tool_schemas, ns):
    "Call tool function asynchronously and return formatted result"
    raw = _call_func(tc, tool_schemas, ns, call_func_async)
    raw = await maybe_await(raw)
    return _mk_tool_result(raw)
|
|
447
|
+
|
|
448
|
+
# %% ../nbs/07_chat.ipynb #ee4fb755
|
|
449
|
+
@asave_iter
async def astream_with_complete(self, agen, postproc=noop):
    "Re-yield post-processed chunks from `agen`; the final Completion is not yielded but left on `self.value`."
    async for chunk in agen:
        # The terminal Completion is swallowed here; `self.value` tracks the
        # latest chunk, so after exhaustion it holds that final Completion
        # (read back via `.value` in `AsyncChat._call`).
        if not isinstance(chunk, Completion): yield postproc(chunk)
        self.value = chunk
|
|
454
|
+
|
|
455
|
+
# %% ../nbs/07_chat.ipynb #baf28c01
|
|
456
|
+
@patch
@delegates(acomplete)
async def _call(self:AsyncChat, msg=None, prefill=None, temp=None, think=None, search=None, stream=False, max_steps=2, step=1,
                final_prompt=None, tool_choice=None, max_tokens=None, n_workers=8, pause=0.001, tc_timeout=7200, **kwargs):
    "Recursive driver: one completion per `step`, executing tools and re-calling until the step budget is exhausted."
    if step>max_steps+1: return
    prefill, max_tokens = self._prep_call(prefill, search, max_tokens, kwargs, stream=stream, think=think)
    sp,msgs = self._prep_msg(msg,prefill)
    # Deepseek prefill requires the beta endpoint.
    if prefill and self.vendor_name == 'deepseek' and self.model in ("deepseek-v4-flash", "deepseek-v4-pro"):
        kwargs['base_url'] = 'https://api.deepseek.com/beta'
    # TODO: num_retries=2 is this needed? If so add.
    # caching removed, cache checkpoints are added for Anthropic and other providers do implicit caching
    # Temperature is suppressed entirely when thinking is enabled.
    res = await acomplete(msgs, self.model, system=sp, stream=stream,
                          tools=self.tool_schemas, tool_choice=tool_choice, max_tokens=int(max_tokens),
                          temperature=None if think else ifnone(temp,self.temp), **kwargs)
    if stream:
        if prefill: yield _mk_prefill(prefill)
        # Re-yield chunks; the wrapper stashes the final Completion in `.value`.
        res = astream_with_complete(res, postproc=postproc)
        async for chunk in res: yield chunk
        res = res.value
    m=contents(res)
    # The API does not echo the prefill back, so re-attach it to the reply text.
    if prefill: m.content[0].text = prefill + m.content[0].text
    self.hist.append(m)
    action, msg = _handle_stop_reason(res)
    if action == 'warning': add_warning(res, msg)
    elif action == 'retry':
        # Re-run the same step (no new user msg); drop the partial message after.
        async for result in self._call(
            None, prefill, temp, think, search, stream, max_steps, step,
            final_prompt, tool_choice, **kwargs): yield result
        self.hist.pop(-2) # rm incomplete srvtoolu_
        return
    self._track(res)
    yield res

    # Server-side tool calls were already executed by the provider — just surface them.
    if stcs:= _srvtools(res.tool_calls):
        for tc in stcs: yield tc
    if tcs := _usrtools(res.tool_calls):
        # Execute local tool calls concurrently, then record their results.
        tres = await parallel_async(_alite_call_func, tcs, timeout=tc_timeout, n_workers=n_workers, pause=pause, **self.tcdict)
        tmsg = mk_tool_res_msg(tcs, tres)
        # TODO: We yield tool calls at the end with their results, fastllm doesn't yield streaming tool calls during streaming as once the collation is done for simplicity, but it can
        for r in tmsg.content: yield r
        self.hist.append(tmsg)
        # Budget nearly spent (or a StopResponse seen): force a final tool-free summary turn.
        if step>=max_steps-1 or _has_stop(tmsg.content): prompt,tool_choice,search = mk_msg(final_prompt),'none',False
        else: prompt = None
        try:
            async for result in self._call(
                prompt, prefill, temp, think, search, stream, max_steps, step+1,
                final_prompt, tool_choice=tool_choice, **kwargs): yield result
        except ContextWindowExceededError:
            # Truncate the offending tool results and retry once with tools disabled.
            for p in tmsg.content:
                if len(p.text)>1000: p.text = _cwe_msg + _trunc_str(p.text, mx=1000)
            async for result in self._call(
                prompt, prefill, temp, think, search, stream, max_steps, step+1,
                final_prompt, tool_choice='none', **kwargs): yield result
|
|
509
|
+
|
|
510
|
+
# %% ../nbs/07_chat.ipynb #1361515a
|
|
511
|
+
@patch
@delegates(AsyncChat._call)
async def __call__(
    self:AsyncChat,
    msg=None, # Message str, or list of multiple message parts
    prefill=None, # Prefill AI response if model supports it
    temp=None, # Override temp set on chat initialization
    think=None, # Thinking (l,m,h)
    search=None, # Override search set on chat initialization (l,m,h)
    stream=False, # Stream results
    max_steps=2, # Maximum number of tool calls
    final_prompt=_final_prompt, # Final prompt when tool calls have ran out
    return_all=False, # Returns all intermediate ModelResponses if not streaming and has tool calls
    **kwargs
):
    "Send `msg`; return the final Completion, or the async generator itself when streaming/return_all."
    self.use = UsageStats()  # reset usage tracking for this top-level call
    result_gen = self._call(msg, prefill, temp, think, search, stream, max_steps, 1, final_prompt, **kwargs)
    if stream or return_all: return result_gen
    # Drain the generator, keeping only the last yielded item.
    # NOTE(review): `res` would be unbound if the generator yields nothing — confirm unreachable.
    async for res in result_gen: pass
    return res # normal chat behavior only return last msg
|
|
531
|
+
|
|
532
|
+
# %% ../nbs/07_chat.ipynb #115fd94f
|
|
533
|
+
def _trunc_param(v, mx=40):
    "Truncate and escape param value for display"
    # Escape backticks (markdown code spans), then truncate with no markers.
    tp = _trunc_str(str(v).replace('`', r'\`'), mx=mx, replace=None, skip=0)
    # Re-parse literals so e.g. strings render with their quotes; fall back to repr.
    try: return ast.literal_eval(tp)
    except Exception: return repr(tp).replace('\\\\', '\\')
|
|
538
|
+
|
|
539
|
+
# %% ../nbs/07_chat.ipynb #80c0abdb
|
|
540
|
+
def _tc_summary(tr):
    "Format tool call as func(params) → result string"
    arg_str = ', '.join(f"{k}={_trunc_param(v)}" for k,v in tr.data['arguments'].items())
    body = f"{tr.data['name']}({arg_str})→{_trunc_param(tr.text)}"
    return '<code>' + escape(body) + '</code>'
|
|
545
|
+
|
|
546
|
+
# %% ../nbs/07_chat.ipynb #91beb26c
|
|
547
|
+
def _srv_tc_summary(tc):
    "Format tool call as func(params) → result string"
    arg_str = ', '.join(f"{k}={_trunc_param(v)}" for k,v in tc.arguments.items())
    return '<code>' + escape(f"{tc.name}({arg_str})") + '</code>'
|
|
551
|
+
|
|
552
|
+
# %% ../nbs/07_chat.ipynb #80f344cc
|
|
553
|
+
def _trunc_content(content, mx):
|
|
554
|
+
"Truncate tool result content, respecting '_full' flag"
|
|
555
|
+
if isinstance(content, dict) and '_full' in content and len(content)==1: return content['_full']
|
|
556
|
+
return _trunc_str(content, mx=mx)
|
|
557
|
+
|
|
558
|
+
# %% ../nbs/07_chat.ipynb #3602a033
|
|
559
|
+
def mk_tr_details(tr, mx=2000):
    "Create <details> block for tool call as JSON"
    data = tr.data
    # Arguments get a more generous truncation budget than the result.
    shown_args = {k:_trunc_str(v, mx=mx*5) for k,v in data['arguments'].items()}
    payload = {'id':data['id'], 'server':False,
               'call':{'function': data['name'], 'arguments': shown_args},
               'result':_trunc_content(tr.text, mx=mx),}
    summ = f"<summary>{_tc_summary(tr)}</summary>"
    return f"\n\n{tool_dtls_tag}\n{summ}\n\n```json\n{dumps(payload, indent=2)}\n```\n\n</details>\n\n"
|
|
567
|
+
|
|
568
|
+
# %% ../nbs/07_chat.ipynb #3049001c
|
|
569
|
+
def mk_srv_tc_details(tc, mx=2000):
    "Create <details> block for tool call as JSON"
    shown_args = {k:_trunc_str(v, mx=mx*5) for k,v in tc.arguments.items()}
    payload = {'id':tc.id, 'server':True, 'call':{'function': tc.name, 'arguments': shown_args}, 'result':"Server tool call executed."}
    summ = f"<summary>{_srv_tc_summary(tc)}</summary>"
    return f"\n\n{tool_dtls_tag}\n{summ}\n\n```json\n{dumps(payload, indent=2)}\n```\n\n</details>\n\n"
|
|
575
|
+
|
|
576
|
+
# %% ../nbs/07_chat.ipynb #f0d984ec
|
|
577
|
+
# status_re = re.compile(r'^- ⏳ <code>(.*)</code> ⏳$|^🧠+$', re.MULTILINE) # TODO: Need to yield tool calls as they are done collated in fastllm `_acollect_stream`

class StreamFormatter:
    "Accumulate stream chunks into markdown: 🧠 runs for thinking, <details> blocks for tool calls/results."
    def __init__(self, mx=2000, debug=False, showthink=False):
        # outp: full markdown emitted so far; tcs: collected tool calls (not used in this class)
        self.outp,self.tcs = '',{}
        store_attr()

    def format_item(self, o):
        "Format a single item from the response stream."
        res = ''
        if self.debug: print(o)
        if isinstance(o, dict):
            if thk:=o.get('thinking'):
                if self.showthink: res += thk
                # One 🧠 per thinking chunk; start a new paragraph when the run begins.
                res+= '🧠' if not self.outp or self.outp[-1]=='🧠' else '\n\n🧠'
            # Non-thinking chunk right after a 🧠 run: close the run with a paragraph break.
            elif self.outp and self.outp[-1] == '🧠': res+= '\n\n'
            if txt:=o.get('text'): res+=f"\n\n{txt}" if res and res[-1] == '🧠' else txt
        if isinstance(o, ToolCall):
            # Server tool calls arrive as bare ToolCall objects.
            res += mk_srv_tc_details(o)
        if isinstance(o, Part) and o.type == PartType.tool_result:
            res += mk_tr_details(o,mx=self.mx)
        self.outp+=res
        return res

    def format_stream(self, rs):
        "Format the response stream for markdown display."
        for o in rs: yield self.format_item(o)
|
|
604
|
+
|
|
605
|
+
# %% ../nbs/07_chat.ipynb #0cdd4d7c
|
|
606
|
+
class AsyncStreamFormatter(StreamFormatter):
    "Async variant of `StreamFormatter` for async response streams."
    async def format_stream(self, rs):
        "Format the response stream for markdown display."
        async for o in rs: yield self.format_item(o)
|
|
610
|
+
|
|
611
|
+
# %% ../nbs/07_chat.ipynb #f4345023
|
|
612
|
+
@delegates(AsyncStreamFormatter)
async def adisplay_stream(rs, **kwargs):
    "Use IPython.display to markdown display the response stream."
    # IPython is an optional dependency; give an actionable error if missing.
    try: from IPython.display import display, Markdown
    except ModuleNotFoundError: raise ModuleNotFoundError("This function requires ipython. Please run `pip install ipython` to use.")
    fmt = AsyncStreamFormatter(**kwargs)
    md = ''
    async for o in fmt.format_stream(rs):
        md+=o
        # Re-render the whole accumulated markdown on every chunk (clear=True
        # replaces the previous output in place).
        display(Markdown(md),clear=True)
    # Return the formatter so callers can access the final `fmt.outp`.
    return fmt
|