PyPI - llm-codegen-research - Versions diffs - 2.13__tar.gz → 2.15__tar.gz - Mend

llm-codegen-research 2.13 (tar.gz) → 2.15 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

{llm_codegen_research-2.13 → llm_codegen_research-2.15}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llm-codegen-research
-Version: 2.13
+Version: 2.15
 Summary: Useful classes and methods for researching code-generation by LLMs.
 Author-email: Lukas Twist <itsluketwist@gmail.com>
 Project-URL: Homepage, https://github.com/itsluketwist/llm-codegen-research

{llm_codegen_research-2.13 → llm_codegen_research-2.15}/src/llm_cgr/llm/clients/__init__.py RENAMED Viewed

@@ -6,7 +6,12 @@ from llm_cgr.llm.clients.deepseek import DeepSeek_LLM
 from llm_cgr.llm.clients.mistral import Mistral_LLM
 from llm_cgr.llm.clients.nscale import Nscale_LLM
 from llm_cgr.llm.clients.openai import OpenAI_LLM
-from llm_cgr.llm.clients.openai_tool import OpenAI_Tool_LLM, Tool
+from llm_cgr.llm.clients.openai_tool import (
+    MAX_TOOL_CALLS,
+    MAX_TOOL_ITERATIONS,
+    OpenAI_Tool_LLM,
+    Tool,
+)
 from llm_cgr.llm.clients.protocol import GenerationProtocol
 from llm_cgr.llm.clients.together import TogetherAI_LLM
@@ -29,6 +34,8 @@ def get_llm(
     max_tokens: int | None = None,
     provider: str | None = None,
     tools: list[Tool] | None = None,
+    max_tool_iterations: int = MAX_TOOL_ITERATIONS,
+    max_tool_calls: int = MAX_TOOL_CALLS,
 ) -> GenerationProtocol:
     """
     Initialise the correct LLM client for the given model.
@@ -63,6 +70,8 @@ def get_llm(
             temperature=temperature,
             top_p=top_p,
             max_tokens=max_tokens,
+            max_tool_iterations=max_tool_iterations,
+            max_tool_calls=max_tool_calls,
         )
     return llm_class(

{llm_codegen_research-2.13 → llm_codegen_research-2.15}/src/llm_cgr/llm/clients/anthropic.py RENAMED Viewed

@@ -66,7 +66,7 @@ class Anthropic_LLM(Base_LLM):
         temperature: float | None = None,
         top_p: float | None = None,
         max_tokens: int | None = None,
-    ) -> str:
+    ) -> tuple[str, str | None]:
         """Generate a model response from the Anthropic API."""
         response = self._client.messages.create(
             model=model,
@@ -77,4 +77,4 @@ class Anthropic_LLM(Base_LLM):
             max_tokens=max_tokens if max_tokens is not None else DEFAULT_MAX_TOKENS,
         )
         # cast to TextBlock as non-tool, non-thinking requests always return text
-        return cast(TextBlock, response.content[0]).text
+        return cast(TextBlock, response.content[0]).text, None

{llm_codegen_research-2.13 → llm_codegen_research-2.15}/src/llm_cgr/llm/clients/base.py RENAMED Viewed

@@ -1,3 +1,5 @@
+"""Base class for LLM API clients."""
 from abc import ABC, abstractmethod
 from typing import Any
@@ -12,9 +14,13 @@ class Base_LLM(ABC):
         temperature: float | None = None,
         top_p: float | None = None,
         max_tokens: int | None = None,
+        enable_reasoning: bool = False,
     ) -> None:
         """
         Initialise the LLM client.
+        When enable_reasoning is True, generate() and chat() include chain-of-thought
+        alongside responses, and reasoning is stored in the chat history.
         """
         self._model = model
         self._system = system
@@ -24,6 +30,7 @@ class Base_LLM(ABC):
         self._top_p = top_p
         self._max_tokens = max_tokens
+        self._enable_reasoning = enable_reasoning
         self._history: list[dict[str, Any]] | None = None
     def generate(
@@ -35,9 +42,12 @@ class Base_LLM(ABC):
         temperature: float | None = None,
         top_p: float | None = None,
         max_tokens: int | None = None,
-    ) -> list[str]:
+    ) -> list[str] | list[tuple[str, str | None]]:
         """
         Generate model responses from the LLMs API.
+        When enable_reasoning is True, returns a list of (response, reasoning) tuples.
+        When False, returns a list of response strings.
         """
         _model = model or self._model
         if _model is None:
@@ -48,16 +58,19 @@ class Base_LLM(ABC):
             system=system or self._system,
         )
-        _generations = []
+        _generations: list[Any] = []
         for _ in range(samples):
-            response = self._get_response(
+            response, reasoning = self._get_response(
                 input=messages,
                 model=_model,
                 temperature=temperature or self._temperature,
                 top_p=top_p or self._top_p,
                 max_tokens=max_tokens or self._max_tokens,
             )
-            _generations.append(response)
+            if self._enable_reasoning:
+                _generations.append((response, reasoning))
+            else:
+                _generations.append(response)
         return _generations
@@ -69,9 +82,12 @@ class Base_LLM(ABC):
         temperature: float | None = None,
         top_p: float | None = None,
         max_tokens: int | None = None,
-    ) -> str:
+    ) -> str | tuple[str, str | None]:
         """
         Generate a model response from the LLMs API, in the ongoing chat.
+        When enable_reasoning is True, reasoning is stored in the history and the
+        return value is a (response, reasoning) tuple instead of a plain string.
         """
         _model = model or self._model
         if _model is None:
@@ -92,7 +108,7 @@ class Base_LLM(ABC):
                 )
             )
-        response = self._get_response(
+        response, reasoning = self._get_response(
             input=self._history,
             system=system,
             model=_model,
@@ -101,13 +117,14 @@ class Base_LLM(ABC):
             max_tokens=max_tokens or self._max_tokens,
         )
-        # update the history and return
-        self._history.append(
-            self._build_message(
-                role="assistant",
-                content=response,
-            )
-        )
+        # build the assistant history entry, attaching reasoning if present
+        assistant_message = self._build_message(role="assistant", content=response)
+        if self._enable_reasoning and reasoning is not None:
+            assistant_message["reasoning_content"] = reasoning
+        self._history.append(assistant_message)
+        if self._enable_reasoning:
+            return response, reasoning
         return response
     @property
@@ -146,9 +163,10 @@ class Base_LLM(ABC):
         temperature: float | None = None,
         top_p: float | None = None,
         max_tokens: int | None = None,
-    ) -> str:
+    ) -> tuple[str, str | None]:
         """
         Generate a model response from the LLM API.
-        Returns the text response to the prompt.
+        Returns a (response, reasoning) tuple; reasoning is None for models that
+        do not produce chain-of-thought output.
         """

{llm_codegen_research-2.13 → llm_codegen_research-2.15}/src/llm_cgr/llm/clients/deepseek.py RENAMED Viewed

@@ -1,4 +1,4 @@
-"""Class to access LLMs via the OpenAI API."""
+"""Class to access LLMs via the DeepSeek API."""
 import os
 from typing import Any, cast
@@ -19,11 +19,13 @@ class DeepSeek_LLM(Base_LLM):
         temperature: float | None = None,
         top_p: float | None = None,
         max_tokens: int | None = None,
+        enable_reasoning: bool = False,
     ) -> None:
         """
         Initialise the DeepSeek client.
         Requires the DEEPSEEK_API_KEY environment variable to be set.
+        Set enable_reasoning=True when using a reasoning model (e.g. deepseek-reasoner).
         """
         super().__init__(
             model=model,
@@ -31,6 +33,7 @@ class DeepSeek_LLM(Base_LLM):
             temperature=temperature,
             top_p=top_p,
             max_tokens=max_tokens,
+            enable_reasoning=enable_reasoning,
         )
         self._client = openai.OpenAI(
             api_key=os.environ["DEEPSEEK_API_KEY"],
@@ -65,8 +68,8 @@ class DeepSeek_LLM(Base_LLM):
         temperature: float | None = None,
         top_p: float | None = None,
         max_tokens: int | None = None,
-    ) -> str:
-        """Generate a model response from the OpenAI API."""
+    ) -> tuple[str, str | None]:
+        """Generate a model response from the DeepSeek API."""
         response = self._client.chat.completions.create(
             messages=cast(list[ChatCompletionMessageParam], input),
             model=model,
@@ -74,5 +77,10 @@ class DeepSeek_LLM(Base_LLM):
             top_p=top_p if top_p is not None else openai.omit,
             max_completion_tokens=max_tokens if max_tokens is not None else openai.omit,
         )
+        message = response.choices[0].message
+        # chain-of-thought from reasoning models (e.g. deepseek-reasoner); None otherwise
+        reasoning = getattr(message, "reasoning_content", None)
         # cast to str as text completions always return string content
-        return cast(str, response.choices[0].message.content)
+        return cast(str, message.content), reasoning

{llm_codegen_research-2.13 → llm_codegen_research-2.15}/src/llm_cgr/llm/clients/mistral.py RENAMED Viewed

@@ -66,7 +66,7 @@ class Mistral_LLM(Base_LLM):
         temperature: float | None = None,
         top_p: float | None = None,
         max_tokens: int | None = None,
-    ) -> str:
+    ) -> tuple[str, str | None]:
         """Generate a model response from the MistralAI API."""
         response = self._client.chat.complete(
             model=model,
@@ -75,4 +75,4 @@ class Mistral_LLM(Base_LLM):
             top_p=top_p,
             max_tokens=max_tokens if max_tokens is not None else client.UNSET,
         )
-        return response.choices[0].message.content
+        return response.choices[0].message.content, None

{llm_codegen_research-2.13 → llm_codegen_research-2.15}/src/llm_cgr/llm/clients/nscale.py RENAMED Viewed

@@ -65,7 +65,7 @@ class Nscale_LLM(Base_LLM):
         temperature: float | None = None,
         top_p: float | None = None,
         max_tokens: int | None = None,
-    ) -> str:
+    ) -> tuple[str, str | None]:
         """Generate a model response from the OpenAI API."""
         response = self._client.chat.completions.create(
             messages=cast(list[ChatCompletionMessageParam], input),
@@ -75,4 +75,4 @@ class Nscale_LLM(Base_LLM):
             max_completion_tokens=max_tokens if max_tokens is not None else openai.omit,
         )
         # cast to str as text completions always return string content
-        return cast(str, response.choices[0].message.content)
+        return cast(str, response.choices[0].message.content), None

{llm_codegen_research-2.13 → llm_codegen_research-2.15}/src/llm_cgr/llm/clients/openai.py RENAMED Viewed

@@ -61,7 +61,7 @@ class OpenAI_LLM(Base_LLM):
         temperature: int | float | None = None,
         top_p: int | float | None = None,
         max_tokens: int | None = None,
-    ) -> str:
+    ) -> tuple[str, str | None]:
         """Generate a model response from the OpenAI API."""
         self._client.responses.input_items
         response = self._client.responses.create(
@@ -71,4 +71,4 @@ class OpenAI_LLM(Base_LLM):
             top_p=top_p if top_p is not None else openai.omit,
             max_output_tokens=max_tokens if max_tokens is not None else openai.omit,
         )
-        return response.output_text
+        return response.output_text, None

{llm_codegen_research-2.13 → llm_codegen_research-2.15}/src/llm_cgr/llm/clients/openai_tool.py RENAMED Viewed

@@ -10,8 +10,11 @@ from openai.types.responses import ResponseFunctionToolCall, ResponseInputItemPa
 from llm_cgr.llm.clients.openai import OpenAI_LLM
-# maximum number of tool-call iterations per request, to prevent runaway loops
-MAX_TOOL_ITERATIONS: int = 10
+# maximum tool-call rounds allowed within a single generate() or chat() call
+MAX_TOOL_ITERATIONS: int = 5
+# maximum total tool calls allowed across the lifetime of a client instance
+MAX_TOOL_CALLS: int = 10
 @dataclass
@@ -51,11 +54,17 @@ class OpenAI_Tool_LLM(OpenAI_LLM):
         temperature: float | None = None,
         top_p: float | None = None,
         max_tokens: int | None = None,
+        max_tool_iterations: int = MAX_TOOL_ITERATIONS,
+        max_tool_calls: int = MAX_TOOL_CALLS,
     ) -> None:
         """
         Initialise the OpenAI tool client.
         Requires the OPENAI_API_KEY environment variable to be set.
+        max_tool_iterations caps tool-call rounds within a single request.
+        max_tool_calls caps the cumulative total across all requests on this
+        instance. When either limit is reached, the model is sent a message
+        asking it to answer immediately without any further tool calls.
         """
         super().__init__(
             model=model,
@@ -65,6 +74,8 @@ class OpenAI_Tool_LLM(OpenAI_LLM):
             max_tokens=max_tokens,
         )
         self._tools = tools
+        self._max_tool_iterations = max_tool_iterations
+        self._max_tool_calls = max_tool_calls
         # cumulative count of individual tool calls made by this instance
         self._tool_calls: int = 0
@@ -90,6 +101,43 @@ class OpenAI_Tool_LLM(OpenAI_LLM):
             "parameters": tool.parameters,
         }
+    def _force_final_answer(
+        self,
+        current_input: list[Any],
+        model: str,
+        temperature: float | None,
+        top_p: float | None,
+        max_tokens: int | None,
+    ) -> str:
+        """Force the model to produce a text answer after a limit is reached.
+        Appends a user message telling the model it has used all its allowed
+        tool calls, then calls the API one final time without any tools so the
+        model cannot make further calls.
+        Returns the model's final text response.
+        """
+        # tell the model it must answer now — no more tool calls are allowed
+        current_input.append(
+            self._build_message(
+                role="user",
+                content=(
+                    "You have reached the maximum number of tool calls allowed. "
+                    "Please provide your final answer now based on the information "
+                    "you have gathered, without calling any more tools."
+                ),
+            )
+        )
+        response = self._client.responses.create(
+            input=cast(list[ResponseInputItemParam], current_input),
+            model=model,
+            temperature=temperature if temperature is not None else openai.omit,
+            top_p=top_p if top_p is not None else openai.omit,
+            max_output_tokens=max_tokens if max_tokens is not None else openai.omit,
+            # no tools provided: the model cannot make further tool calls
+        )
+        return response.output_text
     def _run_tool_loop(
         self,
         messages: list[dict[str, Any]],
@@ -101,8 +149,12 @@ class OpenAI_Tool_LLM(OpenAI_LLM):
         """Run the agentic tool-call loop for a single turn.
         Calls the OpenAI API in a loop, executing any tool calls the model
-        requests, until the model produces a final text response or the
-        MAX_TOOL_ITERATIONS safety limit is reached.
+        requests, until the model produces a final text response or a limit is
+        reached. Two limits apply:
+          - max_tool_iterations: rounds allowed within this single call.
+          - max_tool_calls: cumulative total across all calls on this instance.
+        When either limit is hit, _force_final_answer() is called, which tells
+        the model to answer immediately without making any further tool calls.
         Returns the final text response.
         """
@@ -118,7 +170,7 @@ class OpenAI_Tool_LLM(OpenAI_LLM):
         # and the richer tool-call dicts without fighting the type checker.
         current_input: list[Any] = list(messages)
-        for _ in range(MAX_TOOL_ITERATIONS):
+        for _ in range(self._max_tool_iterations):
             response = self._client.responses.create(
                 input=cast(list[ResponseInputItemParam], current_input),
                 model=model,
@@ -137,6 +189,18 @@ class OpenAI_Tool_LLM(OpenAI_LLM):
             if not function_calls:
                 return response.output_text
+            # check the overall cumulative limit before processing these calls.
+            # if adding them would exceed the limit, force a final answer now
+            # without executing any of the pending tool calls.
+            if self._tool_calls + len(function_calls) > self._max_tool_calls:
+                return self._force_final_answer(
+                    current_input=current_input,
+                    model=model,
+                    temperature=temperature,
+                    top_p=top_p,
+                    max_tokens=max_tokens,
+                )
             # increment the cumulative counter; parallel calls count individually
             self._tool_calls += len(function_calls)
@@ -172,8 +236,14 @@ class OpenAI_Tool_LLM(OpenAI_LLM):
             # loop continues: enriched input is sent back to the model
-        # safety fallback: return whatever text the model produced on the last turn
-        return response.output_text
+        # max_tool_iterations exhausted — force the model to answer now
+        return self._force_final_answer(
+            current_input=current_input,
+            model=model,
+            temperature=temperature,
+            top_p=top_p,
+            max_tokens=max_tokens,
+        )
     def generate(
         self,

{llm_codegen_research-2.13 → llm_codegen_research-2.15}/src/llm_cgr/llm/clients/protocol.py RENAMED Viewed

@@ -17,7 +17,7 @@ class GenerationProtocol(Protocol):
         temperature: float | None = None,
         top_p: float | None = None,
         max_tokens: int | None = None,
-    ) -> list[str]:
+    ) -> list[str] | list[tuple[str, str | None]]:
         """
         Generate model responses from the LLMs API.
         """
@@ -30,7 +30,7 @@ class GenerationProtocol(Protocol):
         temperature: float | None = None,
         top_p: float | None = None,
         max_tokens: int | None = None,
-    ) -> str:
+    ) -> str | tuple[str, str | None]:
         """
         Generate a model response from the LLMs API, in the ongoing chat.
         """

{llm_codegen_research-2.13 → llm_codegen_research-2.15}/src/llm_cgr/llm/clients/together.py RENAMED Viewed

@@ -60,7 +60,7 @@ class TogetherAI_LLM(Base_LLM):
         temperature: float | None = None,
         top_p: float | None = None,
         max_tokens: int | None = None,
-    ) -> str:
+    ) -> tuple[str, str | None]:
         """Generate a model response from the TogetherAI API."""
         response = self._client.chat.completions.create(
             model=model,
@@ -72,4 +72,4 @@ class TogetherAI_LLM(Base_LLM):
         # cast to Any first as together doesn't publicly export the message type,
         # then cast content to str as text completions always have it set
         message = cast(Any, response.choices[0].message)
-        return cast(str, message.content)
+        return cast(str, message.content), None

{llm_codegen_research-2.13 → llm_codegen_research-2.15}/src/llm_cgr/llm/generate.py RENAMED Viewed

@@ -1,5 +1,7 @@
 """API utilities for interfacing with the generation models."""
+from typing import cast
 from llm_cgr.defaults import DEFAULT_MODEL
 from llm_cgr.llm.clients import get_llm
 from llm_cgr.llm.prompts import BOOL_SYSTEM_PROMPT, LIST_SYSTEM_PROMPT
@@ -31,7 +33,8 @@ def generate(
         max_tokens=max_tokens,
         **generate_kwargs,
     )
-    return result
+    # enable_reasoning is False by default, so result is always a plain string
+    return cast(str, result)
 def generate_list(

{llm_codegen_research-2.13 → llm_codegen_research-2.15}/src/llm_codegen_research.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llm-codegen-research
-Version: 2.13
+Version: 2.15
 Summary: Useful classes and methods for researching code-generation by LLMs.
 Author-email: Lukas Twist <itsluketwist@gmail.com>
 Project-URL: Homepage, https://github.com/itsluketwist/llm-codegen-research

{llm_codegen_research-2.13 → llm_codegen_research-2.15}/src/llm_codegen_research.egg-info/SOURCES.txt RENAMED Viewed

@@ -39,6 +39,7 @@ src/llm_codegen_research.egg-info/top_level.txt
 tests/test_enums.py
 tests/test_json_utils.py
 tests/test_llm_api.py
+tests/test_llm_deepseek_reasoning.py
 tests/test_llm_local.py
 tests/test_llm_tool.py
 tests/test_utils.py

llm_codegen_research-2.15/tests/test_llm_deepseek_reasoning.py ADDED Viewed

@@ -0,0 +1,136 @@
+"""Tests for DeepSeek reasoning model support."""
+import pytest
+from llm_cgr.llm.clients.deepseek import DeepSeek_LLM
+# mark all tests in this file as api tests, so they can be excluded in ci
+pytestmark = pytest.mark.api
+# standard model returns no chain-of-thought; reasoner model does
+CHAT_MODEL = "deepseek-chat"
+REASONER_MODEL = "deepseek-reasoner"
+USER_PROMPT = "How many r's are in 'strawberry'?"
+def test_generate_no_reasoning():
+    """
+    Test that generate returns plain strings when enable_reasoning is False (default).
+    """
+    llm = DeepSeek_LLM(model=CHAT_MODEL)
+    results = llm.generate(user=USER_PROMPT)
+    assert isinstance(results, list)
+    assert len(results) == 1
+    # result should be a plain string, not a tuple
+    assert isinstance(results[0], str)
+    assert len(results[0]) > 0
+def test_generate_with_reasoning_returns_tuples():
+    """
+    Test that generate returns (response, reasoning) tuples when enable_reasoning is True.
+    """
+    llm = DeepSeek_LLM(model=REASONER_MODEL, enable_reasoning=True)
+    results = llm.generate(user=USER_PROMPT)
+    assert isinstance(results, list)
+    assert len(results) == 1
+    response, reasoning = results[0]
+    # response should be a non-empty string
+    assert isinstance(response, str)
+    assert len(response) > 0
+    # the reasoner model should always produce chain-of-thought
+    assert isinstance(reasoning, str)
+    assert len(reasoning) > 0
+def test_generate_non_reasoning_model_has_no_reasoning():
+    """
+    Test that a standard (non-reasoner) model returns None for reasoning even when enabled.
+    """
+    llm = DeepSeek_LLM(model=CHAT_MODEL, enable_reasoning=True)
+    results = llm.generate(user=USER_PROMPT)
+    response, reasoning = results[0]
+    assert isinstance(response, str)
+    assert len(response) > 0
+    # deepseek-chat does not produce reasoning content
+    assert reasoning is None
+def test_chat_no_reasoning():
+    """
+    Test that chat returns a plain string and history has no reasoning_content
+    when enable_reasoning is False (default).
+    """
+    llm = DeepSeek_LLM(model=CHAT_MODEL)
+    response = llm.chat(user=USER_PROMPT)
+    assert isinstance(response, str)
+    assert len(response) > 0
+    # no history entry should carry a reasoning_content key
+    history = llm.history
+    assert all("reasoning_content" not in msg for msg in history)
+def test_chat_with_reasoning_returns_tuple():
+    """
+    Test that chat returns a (response, reasoning) tuple when enable_reasoning is True.
+    """
+    llm = DeepSeek_LLM(model=REASONER_MODEL, enable_reasoning=True)
+    result = llm.chat(user=USER_PROMPT)
+    assert isinstance(result, tuple)
+    response, reasoning = result
+    assert isinstance(response, str)
+    assert len(response) > 0
+    assert isinstance(reasoning, str)
+    assert len(reasoning) > 0
+def test_chat_reasoning_stored_in_history():
+    """
+    Test that reasoning is stored on the assistant history entry when enable_reasoning is True.
+    """
+    llm = DeepSeek_LLM(model=REASONER_MODEL, enable_reasoning=True)
+    llm.chat(user=USER_PROMPT)
+    history = llm.history
+    # find the assistant message
+    assistant_msgs = [msg for msg in history if msg["role"] == "assistant"]
+    assert len(assistant_msgs) == 1
+    assistant_msg = assistant_msgs[0]
+    assert "reasoning_content" in assistant_msg
+    assert isinstance(assistant_msg["reasoning_content"], str)
+    assert len(assistant_msg["reasoning_content"]) > 0
+def test_chat_multi_turn_reasoning_stored_per_turn():
+    """
+    Test that reasoning is captured and stored for each turn in a multi-turn chat.
+    """
+    llm = DeepSeek_LLM(model=REASONER_MODEL, enable_reasoning=True)
+    llm.chat(user="What is 2 + 2?")
+    llm.chat(user="And what is that result multiplied by 3?")
+    history = llm.history
+    assistant_msgs = [msg for msg in history if msg["role"] == "assistant"]
+    assert len(assistant_msgs) == 2
+    # both assistant turns should have reasoning attached
+    for msg in assistant_msgs:
+        assert "reasoning_content" in msg
+        assert isinstance(msg["reasoning_content"], str)
+        assert len(msg["reasoning_content"]) > 0