PyPI - llm-codegen-research - Versions diffs - 2.15__tar.gz → 2.16__tar.gz - Mend

llm-codegen-research 2.15 (tar.gz) → 2.16 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

{llm_codegen_research-2.15 → llm_codegen_research-2.16}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llm-codegen-research
-Version: 2.15
+Version: 2.16
 Summary: Useful classes and methods for researching code-generation by LLMs.
 Author-email: Lukas Twist <itsluketwist@gmail.com>
 Project-URL: Homepage, https://github.com/itsluketwist/llm-codegen-research
@@ -158,7 +158,7 @@ uv add openai
 Or to upgrade dependencies:
 ```shell
-uv sync --upgrade
+uv sync --extra api --upgrade
 ```
 Check typings with `ty`:

{llm_codegen_research-2.15 → llm_codegen_research-2.16}/README.md RENAMED Viewed

@@ -138,7 +138,7 @@ uv add openai
 Or to upgrade dependencies:
 ```shell
-uv sync --upgrade
+uv sync --extra api --upgrade
 ```
 Check typings with `ty`:

{llm_codegen_research-2.15 → llm_codegen_research-2.16}/src/llm_cgr/defaults.py RENAMED Viewed

@@ -7,4 +7,7 @@ DEFAULT_MODEL = "gpt-4.1-mini-2025-04-14"
 DEFAULT_CODEBLOCK_LANGUAGE = "python"
 # the default max_tokens to be used when prompting models
-DEFAULT_MAX_TOKENS = 2000
+DEFAULT_MAX_TOKENS = 4096
+# default token budget for anthropic extended thinking (minimum allowed is 1024)
+DEFAULT_THINKING_BUDGET = 2048

{llm_codegen_research-2.15 → llm_codegen_research-2.16}/src/llm_cgr/llm/clients/__init__.py RENAMED Viewed

@@ -33,6 +33,7 @@ def get_llm(
     top_p: float | None = None,
     max_tokens: int | None = None,
     provider: str | None = None,
+    enable_reasoning: bool = False,
     tools: list[Tool] | None = None,
     max_tool_iterations: int = MAX_TOOL_ITERATIONS,
     max_tool_calls: int = MAX_TOOL_CALLS,
@@ -41,7 +42,8 @@ def get_llm(
     Initialise the correct LLM client for the given model.
     If tools are provided, returns an OpenAI_Tool_LLM instance. Tool calls
-    are currently only supported for OpenAI models.
+    are currently only supported for OpenAI models. enable_reasoning is only
+    supported by Anthropic, DeepSeek, Mistral, and TogetherAI models.
     """
     llm_class: type[Base_LLM]
     if provider is not None:
@@ -63,6 +65,8 @@ def get_llm(
             raise NotImplementedError(
                 "Tool calls are only supported for OpenAI models."
             )
+        if enable_reasoning:
+            raise ValueError("OpenAI_Tool_LLM does not support enable_reasoning.")
         return OpenAI_Tool_LLM(
             tools=tools,
             model=model,
@@ -80,6 +84,7 @@ def get_llm(
         temperature=temperature,
         top_p=top_p,
         max_tokens=max_tokens,
+        enable_reasoning=enable_reasoning,
     )

{llm_codegen_research-2.15 → llm_codegen_research-2.16}/src/llm_cgr/llm/clients/anthropic.py RENAMED Viewed

@@ -3,9 +3,14 @@
 from typing import Any, cast
 import anthropic
-from anthropic.types import MessageParam, TextBlock
+from anthropic.types import (
+    MessageParam,
+    TextBlock,
+    ThinkingBlock,
+    ThinkingConfigEnabledParam,
+)
-from llm_cgr.defaults import DEFAULT_MAX_TOKENS
+from llm_cgr.defaults import DEFAULT_MAX_TOKENS, DEFAULT_THINKING_BUDGET
 from llm_cgr.llm.clients.base import Base_LLM
@@ -19,11 +24,14 @@ class Anthropic_LLM(Base_LLM):
         temperature: float | None = None,
         top_p: float | None = None,
         max_tokens: int | None = None,
+        enable_reasoning: bool = False,
     ) -> None:
         """
         Initialise the Anthropic client.
         Requires the ANTHROPIC_API_KEY environment variable to be set.
+        Set enable_reasoning=True to enable extended thinking on supported models
+        (e.g. claude-sonnet-4-5).
         """
         super().__init__(
             model=model,
@@ -31,6 +39,7 @@ class Anthropic_LLM(Base_LLM):
             temperature=temperature,
             top_p=top_p,
             max_tokens=max_tokens,
+            enable_reasoning=enable_reasoning,
         )
         self._client = anthropic.Anthropic()
@@ -68,13 +77,50 @@ class Anthropic_LLM(Base_LLM):
         max_tokens: int | None = None,
     ) -> tuple[str, str | None]:
         """Generate a model response from the Anthropic API."""
+        # extended thinking is incompatible with custom temperature/top_p
+        thinking = (
+            ThinkingConfigEnabledParam(
+                type="enabled",
+                budget_tokens=DEFAULT_THINKING_BUDGET,
+            )
+            if self._enable_reasoning
+            else anthropic.omit
+        )
+        # custom temperature/top_p are not supported alongside extended thinking,
+        # and the api rejects requests that set both temperature and top_p
+        _temperature = (
+            temperature
+            if temperature is not None and not self._enable_reasoning
+            else anthropic.omit
+        )
+        _top_p = (
+            top_p
+            if top_p is not None
+            and not self._enable_reasoning
+            and _temperature is anthropic.omit
+            else anthropic.omit
+        )
         response = self._client.messages.create(
             model=model,
             system=system or self._system or anthropic.omit,
             messages=cast(list[MessageParam], input),
-            temperature=temperature if temperature is not None else anthropic.omit,
-            top_p=top_p if top_p is not None else anthropic.omit,
+            temperature=_temperature,
+            top_p=_top_p,
             max_tokens=max_tokens if max_tokens is not None else DEFAULT_MAX_TOKENS,
+            thinking=thinking,
+        )
+        # collect chain-of-thought from any thinking blocks; None if not present
+        thinking_blocks = [
+            block.thinking
+            for block in response.content
+            if isinstance(block, ThinkingBlock)
+        ]
+        reasoning = "\n".join(thinking_blocks) if thinking_blocks else None
+        # the final answer is always returned as a text block
+        text_block = next(
+            block for block in response.content if isinstance(block, TextBlock)
         )
-        # cast to TextBlock as non-tool, non-thinking requests always return text
-        return cast(TextBlock, response.content[0]).text, None
+        return text_block.text, reasoning

{llm_codegen_research-2.15 → llm_codegen_research-2.16}/src/llm_cgr/llm/clients/deepseek.py RENAMED Viewed

@@ -76,6 +76,8 @@ class DeepSeek_LLM(Base_LLM):
             temperature=temperature if temperature is not None else openai.omit,
             top_p=top_p if top_p is not None else openai.omit,
             max_completion_tokens=max_tokens if max_tokens is not None else openai.omit,
+            reasoning_effort="high",
+            extra_body={"thinking": {"type": "enabled"}},
         )
         message = response.choices[0].message

{llm_codegen_research-2.15 → llm_codegen_research-2.16}/src/llm_cgr/llm/clients/mistral.py RENAMED Viewed

@@ -4,6 +4,7 @@ import os
 from typing import Any
 from mistralai import client
+from mistralai.client.models import TextChunk, ThinkChunk
 from llm_cgr.llm.clients.base import Base_LLM
@@ -18,11 +19,14 @@ class Mistral_LLM(Base_LLM):
         temperature: float | None = None,
         top_p: float | None = None,
         max_tokens: int | None = None,
+        enable_reasoning: bool = False,
     ) -> None:
         """
         Initialise the Mistral client.
         Requires the MISTRAL_API_KEY environment variable to be set.
+        Set enable_reasoning=True to request chain-of-thought from reasoning
+        models (e.g. magistral-medium-latest).
         """
         super().__init__(
             model=model,
@@ -30,6 +34,7 @@ class Mistral_LLM(Base_LLM):
             temperature=temperature,
             top_p=top_p,
             max_tokens=max_tokens,
+            enable_reasoning=enable_reasoning,
         )
         self._client = client.Mistral(
             api_key=os.environ["MISTRAL_API_KEY"],
@@ -74,5 +79,23 @@ class Mistral_LLM(Base_LLM):
             temperature=temperature if temperature is not None else client.UNSET,
             top_p=top_p,
             max_tokens=max_tokens if max_tokens is not None else client.UNSET,
+            reasoning_effort="high" if self._enable_reasoning else client.UNSET,
         )
-        return response.choices[0].message.content, None
+        content = response.choices[0].message.content
+        # plain string content means no reasoning chunks were returned
+        if isinstance(content, str):
+            return content, None
+        # otherwise content is a list of chunks: thinking and final text
+        reasoning_parts = [
+            inner.text
+            for chunk in content
+            if isinstance(chunk, ThinkChunk)
+            for inner in chunk.thinking
+            if isinstance(inner, TextChunk)
+        ]
+        text_parts = [chunk.text for chunk in content if isinstance(chunk, TextChunk)]
+        reasoning = "\n".join(reasoning_parts) if reasoning_parts else None
+        return "\n".join(text_parts), reasoning

{llm_codegen_research-2.15 → llm_codegen_research-2.16}/src/llm_cgr/llm/clients/nscale.py RENAMED Viewed

@@ -19,18 +19,23 @@ class Nscale_LLM(Base_LLM):
         temperature: float | None = None,
         top_p: float | None = None,
         max_tokens: int | None = None,
+        enable_reasoning: bool = False,
     ) -> None:
         """
         Initialise the NSCALE client.
         Requires the NSCALE_API_KEY environment variable to be set.
         """
+        if enable_reasoning:
+            raise ValueError("Nscale_LLM does not support enable_reasoning.")
         super().__init__(
             model=model,
             system=system,
             temperature=temperature,
             top_p=top_p,
             max_tokens=max_tokens,
+            enable_reasoning=enable_reasoning,
         )
         self._client = openai.OpenAI(
             api_key=os.environ["NSCALE_API_KEY"],

{llm_codegen_research-2.15 → llm_codegen_research-2.16}/src/llm_cgr/llm/clients/openai.py RENAMED Viewed

@@ -18,18 +18,23 @@ class OpenAI_LLM(Base_LLM):
         temperature: float | None = None,
         top_p: float | None = None,
         max_tokens: int | None = None,
+        enable_reasoning: bool = False,
     ) -> None:
         """
         Initialise the OpenAI client.
         Requires the OPENAI_API_KEY environment variable to be set.
         """
+        if enable_reasoning:
+            raise ValueError("OpenAI_LLM does not support enable_reasoning.")
         super().__init__(
             model=model,
             system=system,
             temperature=temperature,
             top_p=top_p,
             max_tokens=max_tokens,
+            enable_reasoning=enable_reasoning,
         )
         self._client = openai.OpenAI()

{llm_codegen_research-2.15 → llm_codegen_research-2.16}/src/llm_cgr/llm/clients/together.py RENAMED Viewed

@@ -1,5 +1,6 @@
 """Class to access LLMs via the TogetherAI API."""
+import re
 from typing import Any, cast
 import together
@@ -7,6 +8,11 @@ import together
 from llm_cgr.llm.clients.base import Base_LLM
+# matches a <think>...</think> block at the start of a response, used by
+# models that embed their reasoning trace directly in the content
+_THINK_BLOCK = re.compile(r"\A<think>(.*?)</think>\s*", re.DOTALL)
 class TogetherAI_LLM(Base_LLM):
     """Class to access LLMs via the TogetherAI API."""
@@ -17,11 +23,13 @@ class TogetherAI_LLM(Base_LLM):
         temperature: float | None = None,
         top_p: float | None = None,
         max_tokens: int | None = None,
+        enable_reasoning: bool = False,
     ) -> None:
         """
         Initialise the TogetherAI client.
         Requires the TOGETHER_API_KEY environment variable to be set.
+        Set enable_reasoning=True when using a reasoning model (e.g. deepseek-ai/DeepSeek-R1).
         """
         super().__init__(
             model=model,
@@ -29,6 +37,7 @@ class TogetherAI_LLM(Base_LLM):
             temperature=temperature,
             top_p=top_p,
             max_tokens=max_tokens,
+            enable_reasoning=enable_reasoning,
         )
         self._client = together.Together()
@@ -72,4 +81,20 @@ class TogetherAI_LLM(Base_LLM):
         # cast to Any first as together doesn't publicly export the message type,
         # then cast content to str as text completions always have it set
         message = cast(Any, response.choices[0].message)
-        return cast(str, message.content), None
+        content = cast(str, message.content)
+        # chain-of-thought from reasoning models: most (e.g. DeepSeek-R1) use
+        # reasoning_content, some (e.g. Kimi-K2.6) use reasoning; None otherwise
+        reasoning = getattr(message, "reasoning_content", None) or getattr(
+            message, "reasoning", None
+        )
+        # some models embed their reasoning as a <think>...</think> block at
+        # the start of content instead of a separate field; pull it out
+        if reasoning is None:
+            think_match = _THINK_BLOCK.match(content)
+            if think_match:
+                reasoning = think_match.group(1).strip()
+                content = content[think_match.end() :]
+        return content, reasoning

{llm_codegen_research-2.15 → llm_codegen_research-2.16}/src/llm_cgr/llm/generate.py RENAMED Viewed

@@ -1,12 +1,13 @@
 """API utilities for interfacing with the generation models."""
-from typing import cast
+from typing import Literal, overload
 from llm_cgr.defaults import DEFAULT_MODEL
 from llm_cgr.llm.clients import get_llm
 from llm_cgr.llm.prompts import BOOL_SYSTEM_PROMPT, LIST_SYSTEM_PROMPT
+@overload
 def generate(
     user: str,
     model: str = DEFAULT_MODEL,
@@ -15,15 +16,47 @@ def generate(
     top_p: float | None = None,
     max_tokens: int | None = None,
     provider: str | None = None,
+    enable_reasoning: Literal[False] = False,
     **generate_kwargs,
-) -> str:
+) -> str: ...
+@overload
+def generate(
+    user: str,
+    model: str = DEFAULT_MODEL,
+    system: str | None = None,
+    temperature: float | None = None,
+    top_p: float | None = None,
+    max_tokens: int | None = None,
+    provider: str | None = None,
+    enable_reasoning: Literal[True] = True,
+    **generate_kwargs,
+) -> tuple[str, str | None]: ...
+def generate(
+    user: str,
+    model: str = DEFAULT_MODEL,
+    system: str | None = None,
+    temperature: float | None = None,
+    top_p: float | None = None,
+    max_tokens: int | None = None,
+    provider: str | None = None,
+    enable_reasoning: bool = False,
+    **generate_kwargs,
+) -> str | tuple[str, str | None]:
     """
     Simple function to quickly prompt a model for a response.
+    When enable_reasoning is True, returns a (response, reasoning) tuple instead
+    of a plain string.
     """
     client = get_llm(
         model=model,
         system=system,
         provider=provider,
+        enable_reasoning=enable_reasoning,
     )
     [result] = client.generate(
         user=user,
@@ -33,8 +66,7 @@ def generate(
         max_tokens=max_tokens,
         **generate_kwargs,
     )
-    # enable_reasoning is False by default, so result is always a plain string
-    return cast(str, result)
+    return result
 def generate_list(

{llm_codegen_research-2.15 → llm_codegen_research-2.16}/src/llm_codegen_research.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llm-codegen-research
-Version: 2.15
+Version: 2.16
 Summary: Useful classes and methods for researching code-generation by LLMs.
 Author-email: Lukas Twist <itsluketwist@gmail.com>
 Project-URL: Homepage, https://github.com/itsluketwist/llm-codegen-research
@@ -158,7 +158,7 @@ uv add openai
 Or to upgrade dependencies:
 ```shell
-uv sync --upgrade
+uv sync --extra api --upgrade
 ```
 Check typings with `ty`:

{llm_codegen_research-2.15 → llm_codegen_research-2.16}/src/llm_codegen_research.egg-info/SOURCES.txt RENAMED Viewed

@@ -39,7 +39,6 @@ src/llm_codegen_research.egg-info/top_level.txt
 tests/test_enums.py
 tests/test_json_utils.py
 tests/test_llm_api.py
-tests/test_llm_deepseek_reasoning.py
 tests/test_llm_local.py
 tests/test_llm_tool.py
 tests/test_utils.py

llm_codegen_research-2.15/tests/test_llm_deepseek_reasoning.py DELETED Viewed

@@ -1,136 +0,0 @@
-"""Tests for DeepSeek reasoning model support."""
-import pytest
-from llm_cgr.llm.clients.deepseek import DeepSeek_LLM
-# mark all tests in this file as api tests, so they can be excluded in ci
-pytestmark = pytest.mark.api
-# standard model returns no chain-of-thought; reasoner model does
-CHAT_MODEL = "deepseek-chat"
-REASONER_MODEL = "deepseek-reasoner"
-USER_PROMPT = "How many r's are in 'strawberry'?"
-def test_generate_no_reasoning():
-    """
-    Test that generate returns plain strings when enable_reasoning is False (default).
-    """
-    llm = DeepSeek_LLM(model=CHAT_MODEL)
-    results = llm.generate(user=USER_PROMPT)
-    assert isinstance(results, list)
-    assert len(results) == 1
-    # result should be a plain string, not a tuple
-    assert isinstance(results[0], str)
-    assert len(results[0]) > 0
-def test_generate_with_reasoning_returns_tuples():
-    """
-    Test that generate returns (response, reasoning) tuples when enable_reasoning is True.
-    """
-    llm = DeepSeek_LLM(model=REASONER_MODEL, enable_reasoning=True)
-    results = llm.generate(user=USER_PROMPT)
-    assert isinstance(results, list)
-    assert len(results) == 1
-    response, reasoning = results[0]
-    # response should be a non-empty string
-    assert isinstance(response, str)
-    assert len(response) > 0
-    # the reasoner model should always produce chain-of-thought
-    assert isinstance(reasoning, str)
-    assert len(reasoning) > 0
-def test_generate_non_reasoning_model_has_no_reasoning():
-    """
-    Test that a standard (non-reasoner) model returns None for reasoning even when enabled.
-    """
-    llm = DeepSeek_LLM(model=CHAT_MODEL, enable_reasoning=True)
-    results = llm.generate(user=USER_PROMPT)
-    response, reasoning = results[0]
-    assert isinstance(response, str)
-    assert len(response) > 0
-    # deepseek-chat does not produce reasoning content
-    assert reasoning is None
-def test_chat_no_reasoning():
-    """
-    Test that chat returns a plain string and history has no reasoning_content
-    when enable_reasoning is False (default).
-    """
-    llm = DeepSeek_LLM(model=CHAT_MODEL)
-    response = llm.chat(user=USER_PROMPT)
-    assert isinstance(response, str)
-    assert len(response) > 0
-    # history entries should each have exactly role and content
-    history = llm.history
-    assert all("reasoning_content" not in msg for msg in history)
-def test_chat_with_reasoning_returns_tuple():
-    """
-    Test that chat returns a (response, reasoning) tuple when enable_reasoning is True.
-    """
-    llm = DeepSeek_LLM(model=REASONER_MODEL, enable_reasoning=True)
-    result = llm.chat(user=USER_PROMPT)
-    assert isinstance(result, tuple)
-    response, reasoning = result
-    assert isinstance(response, str)
-    assert len(response) > 0
-    assert isinstance(reasoning, str)
-    assert len(reasoning) > 0
-def test_chat_reasoning_stored_in_history():
-    """
-    Test that reasoning is stored on the assistant history entry when enable_reasoning is True.
-    """
-    llm = DeepSeek_LLM(model=REASONER_MODEL, enable_reasoning=True)
-    llm.chat(user=USER_PROMPT)
-    history = llm.history
-    # find the assistant message
-    assistant_msgs = [msg for msg in history if msg["role"] == "assistant"]
-    assert len(assistant_msgs) == 1
-    assistant_msg = assistant_msgs[0]
-    assert "reasoning_content" in assistant_msg
-    assert isinstance(assistant_msg["reasoning_content"], str)
-    assert len(assistant_msg["reasoning_content"]) > 0
-def test_chat_multi_turn_reasoning_stored_per_turn():
-    """
-    Test that reasoning is captured and stored for each turn in a multi-turn chat.
-    """
-    llm = DeepSeek_LLM(model=REASONER_MODEL, enable_reasoning=True)
-    llm.chat(user="What is 2 + 2?")
-    llm.chat(user="And what is that result multiplied by 3?")
-    history = llm.history
-    assistant_msgs = [msg for msg in history if msg["role"] == "assistant"]
-    assert len(assistant_msgs) == 2
-    # both assistant turns should have reasoning attached
-    for msg in assistant_msgs:
-        assert "reasoning_content" in msg
-        assert isinstance(msg["reasoning_content"], str)
-        assert len(msg["reasoning_content"]) > 0