langchain-failover 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langchain_failover-0.1.0/.gitignore +11 -0
- langchain_failover-0.1.0/CHANGELOG.md +11 -0
- langchain_failover-0.1.0/LICENSE +21 -0
- langchain_failover-0.1.0/PKG-INFO +125 -0
- langchain_failover-0.1.0/README.md +73 -0
- langchain_failover-0.1.0/pyproject.toml +49 -0
- langchain_failover-0.1.0/src/langchain_failover/__init__.py +17 -0
- langchain_failover-0.1.0/src/langchain_failover/failover.py +248 -0
- langchain_failover-0.1.0/tests/test_failover.py +107 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.1.0
|
|
4
|
+
|
|
5
|
+
- Initial release.
|
|
6
|
+
- `FailoverChatModel`: primary/secondary failover with stateful recovery.
|
|
7
|
+
- Connection-aware failover that walks the exception cause/context chain.
|
|
8
|
+
- `bind_tools` preserved across failover (binds both legs).
|
|
9
|
+
- Mid-stream safety: only fails over before the first streamed token.
|
|
10
|
+
- `create_failover_llm` convenience constructor with `/models` auto-discovery.
|
|
11
|
+
- `extract_token_metrics` helper for OpenAI-compatible and Ollama metadata.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Vinay Vobbilichetty
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: langchain-failover
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Primary/secondary failover wrapper for LangChain chat models, with tool-calling preserved across failover.
|
|
5
|
+
Project-URL: Homepage, https://github.com/vinayvobbili/langchain-failover
|
|
6
|
+
Project-URL: Repository, https://github.com/vinayvobbili/langchain-failover
|
|
7
|
+
Project-URL: Issues, https://github.com/vinayvobbili/langchain-failover/issues
|
|
8
|
+
Author-email: Vinay Vobbilichetty <vinayvobbilichetty11@gmail.com>
|
|
9
|
+
License: MIT License
|
|
10
|
+
|
|
11
|
+
Copyright (c) 2026 Vinay Vobbilichetty
|
|
12
|
+
|
|
13
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
14
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
15
|
+
in the Software without restriction, including without limitation the rights
|
|
16
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
17
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
18
|
+
furnished to do so, subject to the following conditions:
|
|
19
|
+
|
|
20
|
+
The above copyright notice and this permission notice shall be included in all
|
|
21
|
+
copies or substantial portions of the Software.
|
|
22
|
+
|
|
23
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
24
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
25
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
26
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
27
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
28
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
29
|
+
SOFTWARE.
|
|
30
|
+
License-File: LICENSE
|
|
31
|
+
Keywords: chat-model,failover,fallback,high-availability,langchain,llm,resilience
|
|
32
|
+
Classifier: Development Status :: 4 - Beta
|
|
33
|
+
Classifier: Intended Audience :: Developers
|
|
34
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
35
|
+
Classifier: Programming Language :: Python :: 3
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
40
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
41
|
+
Requires-Python: >=3.9
|
|
42
|
+
Requires-Dist: langchain-core>=0.2
|
|
43
|
+
Provides-Extra: dev
|
|
44
|
+
Requires-Dist: build>=1.0; extra == 'dev'
|
|
45
|
+
Requires-Dist: langchain-openai>=0.1; extra == 'dev'
|
|
46
|
+
Requires-Dist: pytest>=7; extra == 'dev'
|
|
47
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
48
|
+
Requires-Dist: twine>=5.0; extra == 'dev'
|
|
49
|
+
Provides-Extra: openai
|
|
50
|
+
Requires-Dist: langchain-openai>=0.1; extra == 'openai'
|
|
51
|
+
Description-Content-Type: text/markdown
|
|
52
|
+
|
|
53
|
+
# langchain-failover
|
|
54
|
+
|
|
55
|
+
A tiny, dependency-light **primary/secondary failover wrapper** for LangChain chat
|
|
56
|
+
models. Point it at two chat models; it serves from the primary, transparently
|
|
57
|
+
falls back to the secondary on connection errors, and switches back the moment the
|
|
58
|
+
primary recovers — **and tool-calling keeps working across the failover.**
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
from langchain_openai import ChatOpenAI
|
|
62
|
+
from langchain_failover import FailoverChatModel
|
|
63
|
+
|
|
64
|
+
primary = ChatOpenAI(base_url="http://gpu-box:8001/v1", api_key="x", model="local")
|
|
65
|
+
backup = ChatOpenAI(base_url="http://cpu-box:8002/v1", api_key="x", model="local")
|
|
66
|
+
|
|
67
|
+
llm = FailoverChatModel(primary=primary, secondary=backup)
|
|
68
|
+
|
|
69
|
+
llm.invoke("Summarise this incident…") # served by primary
|
|
70
|
+
# …primary host dies…
|
|
71
|
+
llm.invoke("And the next one?") # transparently served by backup
|
|
72
|
+
# …primary comes back…
|
|
73
|
+
llm.invoke("One more") # back on primary, logged as recovered
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Install
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
pip install langchain-failover # core
|
|
80
|
+
pip install "langchain-failover[openai]" # + langchain-openai for create_failover_llm
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Why not `RunnableWithFallbacks` / `.with_fallbacks()`?
|
|
84
|
+
|
|
85
|
+
LangChain ships per-invocation fallbacks, and they're great for what they do. This
|
|
86
|
+
package exists for the cases they don't cover well:
|
|
87
|
+
|
|
88
|
+
- **Stateful recovery.** `FailoverChatModel` remembers which leg it's on and logs
|
|
89
|
+
the transition both ways (`active` property tells you). `.with_fallbacks()` is
|
|
90
|
+
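stateless — every call re-tries the (possibly still-dead) primary first and nothing records which leg answered. (`FailoverChatModel` also tries the primary first on every call — that is how it notices recovery — but it tracks and logs the active leg.)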
|
|
91
|
+
- **Tool-calling survives failover.** `bind_tools` is overridden to bind on *both*
|
|
92
|
+
legs and return another `FailoverChatModel`. With strict langchain-core
|
|
93
|
+
(`>=1.4`, where `BaseChatModel.bind_tools` raises by default) naïve wrappers
|
|
94
|
+
break at bind time; agents using this one keep working.
|
|
95
|
+
- **Connection-aware, not blanket.** It only fails over on connection/network
|
|
96
|
+
errors (walking the exception's `__cause__`/`__context__` chain, so a socket
|
|
97
|
+
error wrapped three layers deep still counts). A `ValueError` from a bad prompt
|
|
98
|
+
propagates instead of being silently retried on a second endpoint.
|
|
99
|
+
- **Mid-stream safety.** During `stream()`, it only fails over if the primary dies
|
|
100
|
+
*before* the first token — so you never get duplicated, half-streamed output.
|
|
101
|
+
|
|
102
|
+
## Local-model convenience
|
|
103
|
+
|
|
104
|
+
If you run local OpenAI-compatible servers (vLLM, mlx-lm, Ollama, LM Studio) and
|
|
105
|
+
don't want to hardcode model names, `create_failover_llm` auto-discovers the served
|
|
106
|
+
model id from each endpoint's `/models`:
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
from langchain_failover import create_failover_llm
|
|
110
|
+
|
|
111
|
+
llm = create_failover_llm(
|
|
112
|
+
primary_url="http://localhost:8001/v1",
|
|
113
|
+
secondary_url="http://localhost:8002/v1",
|
|
114
|
+
)
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## Bonus helper
|
|
118
|
+
|
|
119
|
+
`extract_token_metrics(response.response_metadata)` normalises token counts and
|
|
120
|
+
timings across OpenAI-compatible and Ollama metadata shapes into a single
|
|
121
|
+
`{input_tokens, output_tokens, prompt_time, generation_time}` dict.
|
|
122
|
+
|
|
123
|
+
## License
|
|
124
|
+
|
|
125
|
+
MIT
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# langchain-failover
|
|
2
|
+
|
|
3
|
+
A tiny, dependency-light **primary/secondary failover wrapper** for LangChain chat
|
|
4
|
+
models. Point it at two chat models; it serves from the primary, transparently
|
|
5
|
+
falls back to the secondary on connection errors, and switches back the moment the
|
|
6
|
+
primary recovers — **and tool-calling keeps working across the failover.**
|
|
7
|
+
|
|
8
|
+
```python
|
|
9
|
+
from langchain_openai import ChatOpenAI
|
|
10
|
+
from langchain_failover import FailoverChatModel
|
|
11
|
+
|
|
12
|
+
primary = ChatOpenAI(base_url="http://gpu-box:8001/v1", api_key="x", model="local")
|
|
13
|
+
backup = ChatOpenAI(base_url="http://cpu-box:8002/v1", api_key="x", model="local")
|
|
14
|
+
|
|
15
|
+
llm = FailoverChatModel(primary=primary, secondary=backup)
|
|
16
|
+
|
|
17
|
+
llm.invoke("Summarise this incident…") # served by primary
|
|
18
|
+
# …primary host dies…
|
|
19
|
+
llm.invoke("And the next one?") # transparently served by backup
|
|
20
|
+
# …primary comes back…
|
|
21
|
+
llm.invoke("One more") # back on primary, logged as recovered
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Install
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install langchain-failover # core
|
|
28
|
+
pip install "langchain-failover[openai]" # + langchain-openai for create_failover_llm
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Why not `RunnableWithFallbacks` / `.with_fallbacks()`?
|
|
32
|
+
|
|
33
|
+
LangChain ships per-invocation fallbacks, and they're great for what they do. This
|
|
34
|
+
package exists for the cases they don't cover well:
|
|
35
|
+
|
|
36
|
+
- **Stateful recovery.** `FailoverChatModel` remembers which leg it's on and logs
|
|
37
|
+
the transition both ways (`active` property tells you). `.with_fallbacks()` is
|
|
38
|
+
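stateless — every call re-tries the (possibly still-dead) primary first and nothing records which leg answered. (`FailoverChatModel` also tries the primary first on every call — that is how it notices recovery — but it tracks and logs the active leg.)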
|
|
39
|
+
- **Tool-calling survives failover.** `bind_tools` is overridden to bind on *both*
|
|
40
|
+
legs and return another `FailoverChatModel`. With strict langchain-core
|
|
41
|
+
(`>=1.4`, where `BaseChatModel.bind_tools` raises by default) naïve wrappers
|
|
42
|
+
break at bind time; agents using this one keep working.
|
|
43
|
+
- **Connection-aware, not blanket.** It only fails over on connection/network
|
|
44
|
+
errors (walking the exception's `__cause__`/`__context__` chain, so a socket
|
|
45
|
+
error wrapped three layers deep still counts). A `ValueError` from a bad prompt
|
|
46
|
+
propagates instead of being silently retried on a second endpoint.
|
|
47
|
+
- **Mid-stream safety.** During `stream()`, it only fails over if the primary dies
|
|
48
|
+
*before* the first token — so you never get duplicated, half-streamed output.
|
|
49
|
+
|
|
50
|
+
## Local-model convenience
|
|
51
|
+
|
|
52
|
+
If you run local OpenAI-compatible servers (vLLM, mlx-lm, Ollama, LM Studio) and
|
|
53
|
+
don't want to hardcode model names, `create_failover_llm` auto-discovers the served
|
|
54
|
+
model id from each endpoint's `/models`:
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from langchain_failover import create_failover_llm
|
|
58
|
+
|
|
59
|
+
llm = create_failover_llm(
|
|
60
|
+
primary_url="http://localhost:8001/v1",
|
|
61
|
+
secondary_url="http://localhost:8002/v1",
|
|
62
|
+
)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Bonus helper
|
|
66
|
+
|
|
67
|
+
`extract_token_metrics(response.response_metadata)` normalises token counts and
|
|
68
|
+
timings across OpenAI-compatible and Ollama metadata shapes into a single
|
|
69
|
+
`{input_tokens, output_tokens, prompt_time, generation_time}` dict.
|
|
70
|
+
|
|
71
|
+
## License
|
|
72
|
+
|
|
73
|
+
MIT
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "langchain-failover"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Primary/secondary failover wrapper for LangChain chat models, with tool-calling preserved across failover."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { file = "LICENSE" }
|
|
12
|
+
authors = [{ name = "Vinay Vobbilichetty", email = "vinayvobbilichetty11@gmail.com" }]
|
|
13
|
+
keywords = ["langchain", "llm", "failover", "fallback", "resilience", "high-availability", "chat-model"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.9",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
24
|
+
]
|
|
25
|
+
dependencies = [
|
|
26
|
+
"langchain-core>=0.2",
|
|
27
|
+
]
|
|
28
|
+
|
|
29
|
+
[project.optional-dependencies]
|
|
30
|
+
openai = ["langchain-openai>=0.1"]
|
|
31
|
+
dev = [
|
|
32
|
+
"langchain-openai>=0.1",
|
|
33
|
+
"pytest>=7",
|
|
34
|
+
"ruff>=0.4",
|
|
35
|
+
"build>=1.0",
|
|
36
|
+
"twine>=5.0",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
[project.urls]
|
|
40
|
+
Homepage = "https://github.com/vinayvobbili/langchain-failover"
|
|
41
|
+
Repository = "https://github.com/vinayvobbili/langchain-failover"
|
|
42
|
+
Issues = "https://github.com/vinayvobbili/langchain-failover/issues"
|
|
43
|
+
|
|
44
|
+
[tool.hatch.build.targets.wheel]
|
|
45
|
+
packages = ["src/langchain_failover"]
|
|
46
|
+
|
|
47
|
+
[tool.ruff]
|
|
48
|
+
line-length = 100
|
|
49
|
+
target-version = "py39"
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""langchain-failover — a primary/secondary failover wrapper for LangChain chat models."""
|
|
2
|
+
from langchain_failover.failover import (
|
|
3
|
+
FailoverChatModel,
|
|
4
|
+
create_failover_llm,
|
|
5
|
+
extract_token_metrics,
|
|
6
|
+
is_connection_error,
|
|
7
|
+
)
|
|
8
|
+
|
|
9
|
+
# Runtime-visible package version; pyproject.toml declares the same value.
__version__ = "0.1.0"

# Public API: the names imported from ``langchain_failover.failover`` above,
# plus the version string.
__all__ = [
    "FailoverChatModel",
    "create_failover_llm",
    "extract_token_metrics",
    "is_connection_error",
    "__version__",
]
|
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
"""A LangChain chat model that fails over between a primary and a secondary model.
|
|
2
|
+
|
|
3
|
+
The wrapper delegates every call to the primary chat model. If the primary raises
|
|
4
|
+
a connection-related error it transparently retries on the secondary, and it
|
|
5
|
+
switches back to the primary the moment the primary answers again. ``bind_tools``
|
|
6
|
+
is preserved across the failover so tool-calling agents keep working when either
|
|
7
|
+
leg is the one serving the request.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
from typing import Any, Optional
|
|
13
|
+
|
|
14
|
+
from langchain_core.language_models import BaseChatModel
|
|
15
|
+
from pydantic import ConfigDict
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
# Exception *type names* (not classes) that we treat as "the endpoint is
# unreachable, try the other one." Matching by name keeps us independent of
# which HTTP/client library raised it (httpx, requests, urllib, openai, ...).
_CONNECTION_ERROR_NAMES = (
    "ConnectionError",
    "ConnectError",
    "ConnectTimeout",
    "RemoteProtocolError",
    "ConnectionRefusedError",
    "TimeoutError",
    "ReadTimeout",
    "APIConnectionError",
)


def _is_connection_type(exc_type: type) -> bool:
    """Return True if ``exc_type`` or any of its base classes is connection-like.

    The whole MRO is inspected (not just the concrete class name) so that a
    subclass of a known connection error — e.g. a timeout class deriving from
    ``APIConnectionError``, or ``BrokenPipeError`` deriving from
    ``ConnectionError`` — is recognised as well.
    """
    for klass in exc_type.__mro__:
        name = klass.__name__
        if name in _CONNECTION_ERROR_NAMES or "connection" in name.lower():
            return True
    return False


def is_connection_error(exc: BaseException) -> bool:
    """Return True if ``exc`` (or anything in its cause/context chain) looks like
    a connection/network failure worth failing over on.

    We walk ``__cause__``/``__context__`` because client libraries routinely wrap
    the original socket error inside a higher-level exception, so the interesting
    type is often several links down the chain.

    An exception counts as connection-related when its class (or a base class)
    has a name listed in ``_CONNECTION_ERROR_NAMES`` or containing
    ``"connection"``, or when the first 200 characters of its lower-cased
    message contain ``"connection"``.

    Args:
        exc: The exception raised by a model call.

    Returns:
        ``True`` if any exception in the chain matches, ``False`` otherwise.
    """
    # Track visited exceptions by identity so a cyclic chain cannot loop forever.
    seen: set[int] = set()
    current: Optional[BaseException] = exc
    while current is not None and id(current) not in seen:
        seen.add(id(current))
        if _is_connection_type(type(current)):
            return True
        if "connection" in str(current).lower()[:200]:
            return True
        # Prefer the explicit cause; fall back to the implicit context. Compare
        # against None rather than relying on the exception object's truthiness.
        cause = current.__cause__
        current = cause if cause is not None else current.__context__
    return False
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class FailoverChatModel(BaseChatModel):
    """Wraps two chat models — tries ``primary``, falls back to ``secondary``.

    All calls (``invoke``, ``stream``, ``generate``, tool-bound variants) are
    delegated to ``primary``. On a connection-related error the call is retried
    on ``secondary`` and the model remembers it is running degraded; the next
    successful ``primary`` call flips it back and logs the recovery.

    The primary is attempted first on every call; ``active`` only records which
    leg answered most recently.

    Example
    -------
    >>> from langchain_openai import ChatOpenAI
    >>> from langchain_failover import FailoverChatModel
    >>> primary = ChatOpenAI(base_url="http://localhost:8001/v1", api_key="x", model="local")
    >>> backup = ChatOpenAI(base_url="http://localhost:8002/v1", api_key="x", model="local")
    >>> llm = FailoverChatModel(primary=primary, secondary=backup)
    >>> llm.invoke("hello").content  # doctest: +SKIP
    """

    # ``Any`` rather than ``BaseChatModel`` on purpose: ``ChatModel.bind_tools``
    # commonly returns a ``Runnable`` binding rather than a ``BaseChatModel``,
    # and a strict type would reject it. Bindings are unwrapped at call time
    # (see ``_resolve_leg``) so their bound kwargs are not lost.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    primary: Any
    secondary: Any
    # Which leg answered the most recent call: "primary" or "secondary".
    _active: str = "primary"

    @property
    def _llm_type(self) -> str:
        return "failover"

    def _mark_primary_recovered(self) -> None:
        """Record that the primary answered, logging if we were degraded."""
        if self._active != "primary":
            logger.info("Failover: primary recovered, switching back")
            self._active = "primary"

    def _switch_to_secondary(self, exc: BaseException) -> None:
        """Record (and log) that the primary failed with ``exc``."""
        logger.warning(
            "Failover: primary down (%s), switching to secondary",
            type(exc).__name__,
        )
        self._active = "secondary"

    def _resolve_leg(self, leg: Any, stop: Any, kwargs: dict) -> tuple:
        """Unwrap ``RunnableBinding`` layers around ``leg`` for a low-level call.

        A tool-bound leg is typically a ``RunnableBinding`` whose ``kwargs``
        hold the tool schema. Calling ``_generate``/``_stream`` through the
        binding would reach the wrapped model without those kwargs, i.e. the
        tools would silently be dropped. We therefore peel the binding(s) off
        and merge their kwargs underneath the per-call kwargs (per-call values
        win, matching how a binding merges them on ``invoke``).

        NOTE(review): relies on ``RunnableBinding`` exposing ``bound`` and
        ``kwargs`` — confirm against the langchain-core versions you support.

        Args:
            leg: ``self.primary`` or ``self.secondary``.
            stop: The ``stop`` argument of the current call.
            kwargs: Extra keyword arguments of the current call.

        Returns:
            ``(model, stop, call_kwargs)`` ready to be passed to the model's
            ``_generate``/``_stream``. For a leg that is not a binding this is
            ``(leg, stop, dict(kwargs))``.
        """
        from langchain_core.runnables import RunnableBinding

        call_kwargs = dict(kwargs)
        while isinstance(leg, RunnableBinding):
            call_kwargs = {**leg.kwargs, **call_kwargs}
            leg = leg.bound
        # ``stop`` is passed by name below; a bound ``stop`` must not collide.
        bound_stop = call_kwargs.pop("stop", None)
        if stop is None:
            stop = bound_stop
        return leg, stop, call_kwargs

    def _generate(self, messages, stop=None, run_manager=None, **kwargs):
        """Generate on the primary; retry on the secondary for connection errors.

        Non-connection errors from the primary, and any error from the
        secondary, propagate to the caller.
        """
        try:
            model, leg_stop, leg_kwargs = self._resolve_leg(self.primary, stop, kwargs)
            result = model._generate(
                messages, stop=leg_stop, run_manager=run_manager, **leg_kwargs
            )
            self._mark_primary_recovered()
            return result
        except Exception as exc:
            if is_connection_error(exc):
                self._switch_to_secondary(exc)
                model, leg_stop, leg_kwargs = self._resolve_leg(
                    self.secondary, stop, kwargs
                )
                return model._generate(
                    messages, stop=leg_stop, run_manager=run_manager, **leg_kwargs
                )
            raise

    def _stream(self, messages, stop=None, run_manager=None, **kwargs):
        """Stream from the primary; fail over only before the first chunk."""
        # Only fail over if the primary dies *before* emitting its first chunk;
        # once tokens are flowing a mid-stream error is a real error, not a
        # connect failure, and retrying would duplicate already-yielded output.
        started = False
        try:
            model, leg_stop, leg_kwargs = self._resolve_leg(self.primary, stop, kwargs)
            for chunk in model._stream(
                messages, stop=leg_stop, run_manager=run_manager, **leg_kwargs
            ):
                if not started:
                    started = True
                    self._mark_primary_recovered()
                yield chunk
        except Exception as exc:
            if is_connection_error(exc) and not started:
                self._switch_to_secondary(exc)
                model, leg_stop, leg_kwargs = self._resolve_leg(
                    self.secondary, stop, kwargs
                )
                yield from model._stream(
                    messages, stop=leg_stop, run_manager=run_manager, **leg_kwargs
                )
            else:
                raise

    def bind_tools(self, tools, **kwargs) -> "FailoverChatModel":
        """Bind tools on both legs so failover preserves tool-calling.

        ``BaseChatModel.bind_tools`` raises ``NotImplementedError`` unless a
        subclass overrides it, so without this override any agent that binds
        tools to a ``FailoverChatModel`` would fail at bind time. Each leg's
        own ``bind_tools`` is called and the results are wrapped in a new
        ``FailoverChatModel``; when a leg returns a runnable binding, its bound
        kwargs are re-applied at call time by ``_resolve_leg``.

        Args:
            tools: Tool definitions, forwarded unchanged to both legs.
            **kwargs: Extra options, forwarded unchanged to both legs.

        Returns:
            A new ``FailoverChatModel`` over the two tool-bound legs.
        """
        return FailoverChatModel(
            primary=self.primary.bind_tools(tools, **kwargs),
            secondary=self.secondary.bind_tools(tools, **kwargs),
        )

    @property
    def active(self) -> str:
        """Which leg served the most recent call: ``"primary"`` or ``"secondary"``."""
        return self._active
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def create_failover_llm(
    primary_url: str,
    secondary_url: str,
    temperature: float = 0.1,
    api_key: str = "not-needed",
    **kwargs: Any,
) -> FailoverChatModel:
    """Build a :class:`FailoverChatModel` from two OpenAI-compatible base URLs.

    The served model id is auto-discovered from each endpoint's ``/models`` list
    (handy for local servers like vLLM, mlx-lm, Ollama, or LM Studio, where you
    often don't want to hardcode the model name). Extra ``kwargs`` are forwarded
    to both underlying ``ChatOpenAI`` instances.

    Discovery is best-effort and happens once, at construction time: if an
    endpoint cannot be queried (or lists no models) the model id falls back to
    ``"default"`` and a warning is logged so the situation is visible.

    Args:
        primary_url: Primary endpoint base URL, e.g. ``http://localhost:8001/v1``.
        secondary_url: Fallback endpoint base URL.
        temperature: Sampling temperature for both legs.
        api_key: Bearer token sent to both endpoints (many local servers ignore it).

    Returns:
        A ``FailoverChatModel`` wrapping one ``ChatOpenAI`` client per URL.

    Raises:
        ImportError: If ``langchain-openai`` is not installed.
    """
    try:
        from langchain_openai import ChatOpenAI
    except ImportError as exc:  # pragma: no cover
        raise ImportError(
            "create_failover_llm requires langchain-openai. "
            "Install it with `pip install langchain-failover[openai]`."
        ) from exc

    import json
    import urllib.request

    def _discover_model(base_url: str) -> str:
        """Return the first model id listed by ``base_url``/models, or "default"."""
        try:
            req = urllib.request.Request(
                f"{base_url.rstrip('/')}/models",
                headers={"Authorization": f"Bearer {api_key}"},
            )
            with urllib.request.urlopen(req, timeout=5) as resp:
                data = json.loads(resp.read()).get("data", [])
            if data:
                return data[0]["id"]
            logger.warning(
                "Failover LLM: %s listed no models, using model id 'default'",
                base_url,
            )
        except Exception as exc:
            # Deliberately broad: discovery must never prevent construction
            # (the endpoint may simply be down right now). Log instead of
            # hiding it, because 'default' may not be a model the server knows.
            logger.warning(
                "Failover LLM: model discovery failed for %s (%s: %s), "
                "using model id 'default'",
                base_url,
                type(exc).__name__,
                exc,
            )
        return "default"

    def _make_client(base_url: str) -> Any:
        """Build the ``ChatOpenAI`` client for one endpoint."""
        return ChatOpenAI(
            model=_discover_model(base_url),
            temperature=temperature,
            base_url=base_url,
            api_key=api_key,
            **kwargs,
        )

    logger.info("Failover LLM: primary=%s, secondary=%s", primary_url, secondary_url)
    return FailoverChatModel(
        primary=_make_client(primary_url),
        secondary=_make_client(secondary_url),
    )
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def extract_token_metrics(meta: Optional[dict]) -> dict:
    """Pull token counts and timings out of a LangChain ``response_metadata`` dict.

    Handles both OpenAI-compatible servers (``token_usage``/``usage``; no timing)
    and Ollama (``prompt_eval_count``/``eval_count`` plus nanosecond durations).
    Every field defaults to 0 when absent — or present but ``None`` — so callers
    never have to guard.

    Args:
        meta: The ``response_metadata`` mapping, or ``None``.

    Returns:
        ``{"input_tokens", "output_tokens", "prompt_time", "generation_time"}``,
        with the two times expressed in seconds.
    """
    meta = meta or {}
    usage = meta.get("token_usage") or meta.get("usage") or {}

    # Prefer the OpenAI-style usage block, then the Ollama counters; the final
    # ``or 0`` keeps an explicit ``None`` from leaking out to the caller.
    input_tokens = usage.get("prompt_tokens") or meta.get("prompt_eval_count") or 0
    output_tokens = usage.get("completion_tokens") or meta.get("eval_count") or 0

    # Durations are divided by 1e9 (nanoseconds -> seconds); a missing or
    # ``None`` duration yields 0.0.
    prompt_time = (meta.get("prompt_eval_duration") or 0) / 1e9
    generation_time = (meta.get("eval_duration") or 0) / 1e9

    return {
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "prompt_time": prompt_time,
        "generation_time": generation_time,
    }
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Tests for FailoverChatModel — no network required.
|
|
2
|
+
|
|
3
|
+
Uses tiny fake chat models that either answer or raise, so the failover,
|
|
4
|
+
recovery, streaming, and bind_tools behaviour can be exercised deterministically.
|
|
5
|
+
"""
|
|
6
|
+
from typing import Any, List, Optional
|
|
7
|
+
|
|
8
|
+
import pytest
|
|
9
|
+
from langchain_core.callbacks import CallbackManagerForLLMRun
|
|
10
|
+
from langchain_core.language_models import BaseChatModel
|
|
11
|
+
from langchain_core.messages import AIMessage
|
|
12
|
+
from langchain_core.outputs import ChatGeneration, ChatGenerationChunk, ChatResult
|
|
13
|
+
from langchain_core.messages import AIMessageChunk
|
|
14
|
+
|
|
15
|
+
from langchain_failover import FailoverChatModel, is_connection_error
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class _FakeChat(BaseChatModel):
    """Test double: returns ``reply`` on every call, or raises ``raises`` if set.

    ``calls`` counts how many times ``_generate``/``_stream`` were entered.
    """

    reply: str = "ok"
    raises: Any = None
    calls: int = 0

    class Config:
        arbitrary_types_allowed = True

    @property
    def _llm_type(self) -> str:
        return "fake"

    def _generate(
        self,
        messages,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> ChatResult:
        # Count the attempt first so failed calls are tallied as well.
        object.__setattr__(self, "calls", self.calls + 1)
        failure = self.raises
        if failure is not None:
            raise failure
        answer = AIMessage(content=self.reply)
        return ChatResult(generations=[ChatGeneration(message=answer)])

    def _stream(self, messages, stop=None, run_manager=None, **kwargs):
        # Generator body: counting and raising happen on the first ``next()``.
        object.__setattr__(self, "calls", self.calls + 1)
        failure = self.raises
        if failure is not None:
            raise failure
        piece = AIMessageChunk(content=self.reply)
        yield ChatGenerationChunk(message=piece)

    def bind_tools(self, tools, **kwargs):
        # Prefix the reply so a tool-bound fake can be told apart in assertions.
        bound_reply = f"bound:{self.reply}"
        return _FakeChat(reply=bound_reply, raises=self.raises)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_primary_serves_when_healthy():
    """With both legs healthy the primary answers and stays the active leg."""
    healthy_primary = _FakeChat(reply="primary")
    standby = _FakeChat(reply="secondary")
    llm = FailoverChatModel(primary=healthy_primary, secondary=standby)

    answer = llm.invoke("hi")

    assert answer.content == "primary"
    assert llm.active == "primary"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def test_fails_over_on_connection_error():
    """A ConnectionError from the primary is answered by the secondary."""
    broken_primary = _FakeChat(raises=ConnectionError("refused"))
    standby = _FakeChat(reply="secondary")
    llm = FailoverChatModel(primary=broken_primary, secondary=standby)

    answer = llm.invoke("hi")

    assert answer.content == "secondary"
    assert llm.active == "secondary"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def test_non_connection_error_propagates():
    """Errors unrelated to connectivity are raised, not retried on the secondary."""
    faulty_primary = _FakeChat(raises=ValueError("bad prompt"))
    standby = _FakeChat(reply="secondary")
    llm = FailoverChatModel(primary=faulty_primary, secondary=standby)

    with pytest.raises(ValueError):
        llm.invoke("hi")
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def test_recovers_back_to_primary():
    """After a failover, a healed primary serves the next call again."""
    primary = _FakeChat(raises=ConnectionError("down"))
    secondary = _FakeChat(reply="secondary")
    llm = FailoverChatModel(primary=primary, secondary=secondary)

    # Phase 1: primary is down, so the secondary answers.
    degraded_answer = llm.invoke("hi")
    assert degraded_answer.content == "secondary"
    assert llm.active == "secondary"

    # Phase 2: primary heals.
    for field, value in (("raises", None), ("reply", "primary-back")):
        object.__setattr__(primary, field, value)

    recovered_answer = llm.invoke("hi")
    assert recovered_answer.content == "primary-back"
    assert llm.active == "primary"
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def test_streaming_fails_over():
    """A primary that fails before its first chunk is replaced by the secondary.

    Besides the streamed content, this also checks that the wrapper recorded
    the switch, so a regression in the streaming path's state tracking is caught.
    """
    primary = _FakeChat(raises=ConnectionError("refused"))
    llm = FailoverChatModel(primary=primary, secondary=_FakeChat(reply="streamed"))
    chunks = list(llm.stream("hi"))
    assert "".join(c.content for c in chunks) == "streamed"
    assert llm.active == "secondary"
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def test_bind_tools_preserved_on_both_legs():
    """``bind_tools`` yields another failover wrapper over the tool-bound legs."""
    primary_leg = _FakeChat(reply="p")
    secondary_leg = _FakeChat(reply="s")
    llm = FailoverChatModel(primary=primary_leg, secondary=secondary_leg)

    bound = llm.bind_tools([])

    assert isinstance(bound, FailoverChatModel)
    # The fake prefixes "bound:" when tools are bound, proving the bound leg ran.
    answer = bound.invoke("hi")
    assert answer.content == "bound:p"
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def test_is_connection_error_walks_cause_chain():
    """A connection error hidden behind ``__cause__`` is still detected."""
    wrapped = RuntimeError("wrapper")
    wrapped.__cause__ = ConnectionRefusedError("nope")
    unrelated = ValueError("totally unrelated")

    assert is_connection_error(wrapped)
    assert not is_connection_error(unrelated)
|