jehoctor-rag-demo 0.2.0__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. {jehoctor_rag_demo-0.2.0 → jehoctor_rag_demo-0.2.1}/PKG-INFO +56 -31
  2. jehoctor_rag_demo-0.2.1/README.md +90 -0
  3. {jehoctor_rag_demo-0.2.0 → jehoctor_rag_demo-0.2.1}/pyproject.toml +38 -11
  4. jehoctor_rag_demo-0.2.1/src/rag_demo/__main__.py +42 -0
  5. jehoctor_rag_demo-0.2.1/src/rag_demo/agents/__init__.py +4 -0
  6. jehoctor_rag_demo-0.2.1/src/rag_demo/agents/base.py +40 -0
  7. jehoctor_rag_demo-0.2.1/src/rag_demo/agents/hugging_face.py +116 -0
  8. jehoctor_rag_demo-0.2.1/src/rag_demo/agents/llama_cpp.py +113 -0
  9. jehoctor_rag_demo-0.2.1/src/rag_demo/agents/ollama.py +91 -0
  10. {jehoctor_rag_demo-0.2.0 → jehoctor_rag_demo-0.2.1}/src/rag_demo/app.py +1 -1
  11. jehoctor_rag_demo-0.2.1/src/rag_demo/app_protocol.py +101 -0
  12. jehoctor_rag_demo-0.2.1/src/rag_demo/constants.py +11 -0
  13. jehoctor_rag_demo-0.2.1/src/rag_demo/logic.py +201 -0
  14. {jehoctor_rag_demo-0.2.0 → jehoctor_rag_demo-0.2.1}/src/rag_demo/modes/_logic_provider.py +3 -2
  15. {jehoctor_rag_demo-0.2.0 → jehoctor_rag_demo-0.2.1}/src/rag_demo/modes/chat.py +10 -8
  16. jehoctor_rag_demo-0.2.1/src/rag_demo/probe.py +129 -0
  17. jehoctor_rag_demo-0.2.0/README.md +0 -69
  18. jehoctor_rag_demo-0.2.0/src/rag_demo/__main__.py +0 -31
  19. jehoctor_rag_demo-0.2.0/src/rag_demo/logic.py +0 -287
  20. {jehoctor_rag_demo-0.2.0 → jehoctor_rag_demo-0.2.1}/src/rag_demo/__init__.py +0 -0
  21. {jehoctor_rag_demo-0.2.0 → jehoctor_rag_demo-0.2.1}/src/rag_demo/app.tcss +0 -0
  22. {jehoctor_rag_demo-0.2.0 → jehoctor_rag_demo-0.2.1}/src/rag_demo/db.py +0 -0
  23. {jehoctor_rag_demo-0.2.0 → jehoctor_rag_demo-0.2.1}/src/rag_demo/dirs.py +0 -0
  24. {jehoctor_rag_demo-0.2.0 → jehoctor_rag_demo-0.2.1}/src/rag_demo/markdown.py +0 -0
  25. {jehoctor_rag_demo-0.2.0 → jehoctor_rag_demo-0.2.1}/src/rag_demo/modes/__init__.py +0 -0
  26. {jehoctor_rag_demo-0.2.0 → jehoctor_rag_demo-0.2.1}/src/rag_demo/modes/chat.tcss +0 -0
  27. {jehoctor_rag_demo-0.2.0 → jehoctor_rag_demo-0.2.1}/src/rag_demo/modes/config.py +0 -0
  28. {jehoctor_rag_demo-0.2.0 → jehoctor_rag_demo-0.2.1}/src/rag_demo/modes/config.tcss +0 -0
  29. {jehoctor_rag_demo-0.2.0 → jehoctor_rag_demo-0.2.1}/src/rag_demo/modes/help.py +0 -0
  30. {jehoctor_rag_demo-0.2.0 → jehoctor_rag_demo-0.2.1}/src/rag_demo/modes/help.tcss +0 -0
  31. {jehoctor_rag_demo-0.2.0 → jehoctor_rag_demo-0.2.1}/src/rag_demo/py.typed +0 -0
  32. {jehoctor_rag_demo-0.2.0 → jehoctor_rag_demo-0.2.1}/src/rag_demo/widgets/__init__.py +0 -0
  33. {jehoctor_rag_demo-0.2.0 → jehoctor_rag_demo-0.2.1}/src/rag_demo/widgets/escapable_input.py +0 -0
{jehoctor_rag_demo-0.2.0 → jehoctor_rag_demo-0.2.1}/PKG-INFO

@@ -1,10 +1,11 @@
  Metadata-Version: 2.3
  Name: jehoctor-rag-demo
- Version: 0.2.0
+ Version: 0.2.1
  Summary: Chat with Wikipedia
  Author: James Hoctor
  Author-email: James Hoctor <JEHoctor@protonmail.com>
  Requires-Dist: aiosqlite==0.21.0
+ Requires-Dist: bitsandbytes>=0.49.1
  Requires-Dist: chromadb>=1.3.4
  Requires-Dist: datasets>=4.4.1
  Requires-Dist: httpx>=0.28.1
@@ -16,7 +17,6 @@ Requires-Dist: langchain-huggingface>=1.1.0
  Requires-Dist: langchain-ollama>=1.0.0
  Requires-Dist: langchain-openai>=1.0.2
  Requires-Dist: langgraph-checkpoint-sqlite>=3.0.1
- Requires-Dist: llama-cpp-python>=0.3.16
  Requires-Dist: nvidia-ml-py>=13.590.44
  Requires-Dist: ollama>=0.6.0
  Requires-Dist: platformdirs>=4.5.0
@@ -24,9 +24,13 @@ Requires-Dist: psutil>=7.1.3
  Requires-Dist: py-cpuinfo>=9.0.0
  Requires-Dist: pydantic>=2.12.4
  Requires-Dist: pyperclip>=1.11.0
+ Requires-Dist: sentence-transformers>=5.2.2
  Requires-Dist: textual>=6.5.0
+ Requires-Dist: transformers[torch]>=4.57.6
  Requires-Dist: typer>=0.20.0
- Requires-Python: >=3.12
+ Requires-Dist: llama-cpp-python>=0.3.16 ; extra == 'llamacpp'
+ Requires-Python: ~=3.12.0
+ Provides-Extra: llamacpp
  Description-Content-Type: text/markdown

  # RAG-demo
@@ -35,50 +39,43 @@ Chat with (a small portion of) Wikipedia

  ⚠️ RAG functionality is still under development. ⚠️

- ![app screenshot](screenshots/screenshot_062f205a.png "App screenshot (this AI response is not accurate)")
+ ![app screenshot](screenshots/screenshot_0.2.0.png "App screenshot")

  ## Requirements

- 1. [uv](https://docs.astral.sh/uv/)
- 2. At least one of the following:
-    - A suitable terminal emulator. In particular, on macOS consider using [iTerm2](https://iterm2.com/) instead of the default Terminal.app ([explanation](https://textual.textualize.io/FAQ/#why-doesnt-textual-look-good-on-macos)). On Linux, you might want to try [kitty](https://sw.kovidgoyal.net/kitty/), [wezterm](https://wezterm.org/), [alacritty](https://alacritty.org/), or [ghostty](https://ghostty.org/) instead of the terminal that came with your DE ([reason](https://darren.codes/posts/textual-copy-paste/)). Windows Terminal should be fine as far as I know.
-    - Any common web browser
+ 1. The [uv](https://docs.astral.sh/uv/) Python package manager
+    - Installing and updating `uv` is easy by following [the docs](https://docs.astral.sh/uv/getting-started/installation/).
+    - As of 2026-01-25, I'm developing using `uv` version 0.9.26, and using the new experimental `--torch-backend` option.
+ 2. A terminal emulator or web browser
+    - Any common web browser will work.
+    - Some terminal emulators will work better than others.
+      See [Notes on terminal emulators](#notes-on-terminal-emulators) below.

- ## Optional stuff that could make your experience better
+ ### Notes on terminal emulators
+
+ Certain terminal emulators will not work with some features of this program.
+ In particular, on macOS consider using [iTerm2](https://iterm2.com/) instead of the default Terminal.app ([explanation](https://textual.textualize.io/FAQ/#why-doesnt-textual-look-good-on-macos)).
+ On Linux you might want to try [kitty](https://sw.kovidgoyal.net/kitty/), [wezterm](https://wezterm.org/), [alacritty](https://alacritty.org/), or [ghostty](https://ghostty.org/), instead of the terminal that came with your desktop environment ([reason](https://darren.codes/posts/textual-copy-paste/)).
+ Windows Terminal should be fine as far as I know.
+
+ ### Optional dependencies

  1. [Hugging Face login](https://huggingface.co/docs/huggingface_hub/quick-start#login)
  2. API key for your favorite LLM provider (support coming soon)
  3. Ollama installed on your system if you have a GPU
  4. Run RAG-demo on a more capable (bigger GPU) machine over SSH if you can. It is a terminal app after all.
+ 5. A C compiler if you want to build Llama.cpp from source.

-
- ## Run from the repository
-
- First, clone this repository. Then, run one of the options below.
+ ## Run the latest version

  Run in a terminal:
  ```bash
- uv run chat
+ uvx --torch-backend=auto --from=jehoctor-rag-demo@latest chat
  ```

  Or run in a web browser:
  ```bash
- uv run textual serve chat
- ```
-
- ## Run from the latest version on PyPI
-
- TODO: test uv automatic torch backend selection:
- https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection
-
- Run in a terminal:
- ```bash
- uvx --from=jehoctor-rag-demo chat
- ```
-
- Or run in a web browser:
- ```bash
- uvx --from=jehoctor-rag-demo textual serve chat
+ uvx --torch-backend=auto --from=jehoctor-rag-demo@latest textual serve chat
  ```

  ## CUDA acceleration via Llama.cpp
@@ -86,15 +83,43 @@ uvx --from=jehoctor-rag-demo textual serve chat
  If you have an NVIDIA GPU with CUDA and build tools installed, you might be able to get CUDA acceleration without installing Ollama.

  ```bash
- CMAKE_ARGS="-DGGML_CUDA=on" uv run chat
+ CMAKE_ARGS="-DGGML_CUDA=on" uv run --extra=llamacpp chat
  ```

  ## Metal acceleration via Llama.cpp (on Apple Silicon)

  On an Apple Silicon machine, make sure `uv` runs an ARM interpreter as this should cause it to install Llama.cpp with Metal support.
+ Also, run with the extra group `llamacpp`.
+ Try this:
+
+ ```bash
+ uvx --python-platform=aarch64-apple-darwin --torch-backend=auto --from=jehoctor-rag-demo[llamacpp]@latest chat
+ ```

  ## Ollama on Linux

  Remember that you have to keep Ollama up-to-date manually on Linux.
  A recent version of Ollama (v0.11.10 or later) is required to run the [embedding model we use](https://ollama.com/library/embeddinggemma).
  See this FAQ: https://docs.ollama.com/faq#how-can-i-upgrade-ollama.
+
+ ## Project feature roadmap
+
+ - ❌ RAG functionality
+ - ❌ torch inference via the Langchain local Hugging Face inference integration
+ - ❌ uv automatic torch backend selection (see [the docs](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection))
+ - ❌ OpenAI integration
+ - ❌ Anthropic integration
+
+ ## Run from the repository
+
+ First, clone this repository. Then, run one of the options below.
+
+ Run in a terminal:
+ ```bash
+ uv run chat
+ ```
+
+ Or run in a web browser:
+ ```bash
+ uv run textual serve chat
+ ```
jehoctor_rag_demo-0.2.1/README.md (new file)

@@ -0,0 +1,90 @@
+ # RAG-demo
+
+ Chat with (a small portion of) Wikipedia
+
+ ⚠️ RAG functionality is still under development. ⚠️
+
+ ![app screenshot](screenshots/screenshot_0.2.0.png "App screenshot")
+
+ ## Requirements
+
+ 1. The [uv](https://docs.astral.sh/uv/) Python package manager
+    - Installing and updating `uv` is easy by following [the docs](https://docs.astral.sh/uv/getting-started/installation/).
+    - As of 2026-01-25, I'm developing using `uv` version 0.9.26, and using the new experimental `--torch-backend` option.
+ 2. A terminal emulator or web browser
+    - Any common web browser will work.
+    - Some terminal emulators will work better than others.
+      See [Notes on terminal emulators](#notes-on-terminal-emulators) below.
+
+ ### Notes on terminal emulators
+
+ Certain terminal emulators will not work with some features of this program.
+ In particular, on macOS consider using [iTerm2](https://iterm2.com/) instead of the default Terminal.app ([explanation](https://textual.textualize.io/FAQ/#why-doesnt-textual-look-good-on-macos)).
+ On Linux you might want to try [kitty](https://sw.kovidgoyal.net/kitty/), [wezterm](https://wezterm.org/), [alacritty](https://alacritty.org/), or [ghostty](https://ghostty.org/), instead of the terminal that came with your desktop environment ([reason](https://darren.codes/posts/textual-copy-paste/)).
+ Windows Terminal should be fine as far as I know.
+
+ ### Optional dependencies
+
+ 1. [Hugging Face login](https://huggingface.co/docs/huggingface_hub/quick-start#login)
+ 2. API key for your favorite LLM provider (support coming soon)
+ 3. Ollama installed on your system if you have a GPU
+ 4. Run RAG-demo on a more capable (bigger GPU) machine over SSH if you can. It is a terminal app after all.
+ 5. A C compiler if you want to build Llama.cpp from source.
+
+ ## Run the latest version
+
+ Run in a terminal:
+ ```bash
+ uvx --torch-backend=auto --from=jehoctor-rag-demo@latest chat
+ ```
+
+ Or run in a web browser:
+ ```bash
+ uvx --torch-backend=auto --from=jehoctor-rag-demo@latest textual serve chat
+ ```
+
+ ## CUDA acceleration via Llama.cpp
+
+ If you have an NVIDIA GPU with CUDA and build tools installed, you might be able to get CUDA acceleration without installing Ollama.
+
+ ```bash
+ CMAKE_ARGS="-DGGML_CUDA=on" uv run --extra=llamacpp chat
+ ```
+
+ ## Metal acceleration via Llama.cpp (on Apple Silicon)
+
+ On an Apple Silicon machine, make sure `uv` runs an ARM interpreter as this should cause it to install Llama.cpp with Metal support.
+ Also, run with the extra group `llamacpp`.
+ Try this:
+
+ ```bash
+ uvx --python-platform=aarch64-apple-darwin --torch-backend=auto --from=jehoctor-rag-demo[llamacpp]@latest chat
+ ```
+
+ ## Ollama on Linux
+
+ Remember that you have to keep Ollama up-to-date manually on Linux.
+ A recent version of Ollama (v0.11.10 or later) is required to run the [embedding model we use](https://ollama.com/library/embeddinggemma).
+ See this FAQ: https://docs.ollama.com/faq#how-can-i-upgrade-ollama.
+
+ ## Project feature roadmap
+
+ - ❌ RAG functionality
+ - ❌ torch inference via the Langchain local Hugging Face inference integration
+ - ❌ uv automatic torch backend selection (see [the docs](https://docs.astral.sh/uv/guides/integration/pytorch/#automatic-backend-selection))
+ - ❌ OpenAI integration
+ - ❌ Anthropic integration
+
+ ## Run from the repository
+
+ First, clone this repository. Then, run one of the options below.
+
+ Run in a terminal:
+ ```bash
+ uv run chat
+ ```
+
+ Or run in a web browser:
+ ```bash
+ uv run textual serve chat
+ ```
{jehoctor_rag_demo-0.2.0 → jehoctor_rag_demo-0.2.1}/pyproject.toml

@@ -1,16 +1,19 @@
  [project]
  name = "jehoctor-rag-demo"
- version = "0.2.0"
+ version = "0.2.1"
  description = "Chat with Wikipedia"
  readme = "README.md"
  authors = [
      { name = "James Hoctor", email = "JEHoctor@protonmail.com" }
  ]
- requires-python = ">=3.12"
+ requires-python = "~=3.12.0"
  # TODO: Reverse pinning of aiosqlite to 0.21.0 to work around this issue:
  # https://github.com/langchain-ai/langgraph/issues/6583
+ # TODO: Should I depend on xformers "for a more memory-efficient attention implementation"?
+ # https://docs.langchain.com/oss/python/integrations/llms/huggingface_pipelines
  dependencies = [
      "aiosqlite==0.21.0",
+     "bitsandbytes>=0.49.1",
      "chromadb>=1.3.4",
      "datasets>=4.4.1",
      "httpx>=0.28.1",
@@ -22,7 +25,6 @@ dependencies = [
      "langchain-ollama>=1.0.0",
      "langchain-openai>=1.0.2",
      "langgraph-checkpoint-sqlite>=3.0.1",
-     "llama-cpp-python>=0.3.16",
      "nvidia-ml-py>=13.590.44",
      "ollama>=0.6.0",
      "platformdirs>=4.5.0",
@@ -30,20 +32,31 @@ dependencies = [
      "py-cpuinfo>=9.0.0",
      "pydantic>=2.12.4",
      "pyperclip>=1.11.0",
+     "sentence-transformers>=5.2.2",
      "textual>=6.5.0",
+     "transformers[torch]>=4.57.6",
      "typer>=0.20.0",
  ]

+ [project.optional-dependencies]
+ llamacpp = [
+     "llama-cpp-python>=0.3.16",
+ ]
+
  [project.scripts]
  chat = "rag_demo.__main__:main"

  [dependency-groups]
  dev = [
-     "pytest>=8.4.2",
      "ruff>=0.14.3",
      "mypy>=1.18.2",
      "textual-dev>=1.8.0",
      "ipython>=9.7.0",
+     "ty>=0.0.13",
+     "uv-outdated>=1.0.4",
+ ]
+ test = [
+     "pytest>=8.4.2",
      "pytest-cov>=7.0.0",
      "pytest-asyncio>=1.3.0",
  ]
@@ -62,6 +75,7 @@ explicit = true
  [tool.uv.sources]
  llama-cpp-python = [
      { index = "llama-cpp-metal", marker = "platform_machine == 'arm64' and sys_platform == 'darwin'" },
+     { index = "llama-cpp-metal", marker = "platform_machine == 'aarch64' and sys_platform == 'darwin'" },
  ]

  [build-system]
@@ -75,15 +89,25 @@ module-name = "rag_demo"
  line-length = 120

  [tool.ruff.lint]
- per-file-ignores = { "__init__.py" = ["F401"] } # Ignore unused-import in all __init__.py files.
  select = ["ALL"]
  ignore = [
-     "E501", # Handled by ruff format (line-too-long)
-     "D100", # undocumented-public-module
-     "D104", # undocumented-public-package
-     "D203", # Conflicts with Google style D211/D212
-     "ANN101", # Missing type annotation for self
-     "ANN102", # Missing type annotation for cls
+     "E501",    # Handled by ruff format (line-too-long)
+     "D100",    # undocumented-public-module
+     "D104",    # undocumented-public-package
+     "D203",    # Conflicts with Google style D211/D212
+     "ANN101",  # Missing type annotation for self
+     "ANN102",  # Missing type annotation for cls
+     "PLE1205", # This rule falsely identifies Textual Loggers as standard Python Loggers, creating false positives.
+     "TRY400",  # Textual.Logger doesn't provide an exception logger, so it's fine to use Logger.error instead.
+ ]
+
+ [tool.ruff.lint.per-file-ignores]
+ "__init__.py" = ["F401"] # Ignore unused-import in all __init__.py files.
+ "tests/*" = [
+     "S101",    # Assert statements are allowed in tests.
+     "INP001",  # No need to create __init__.py files in the tests/ directory; only pytest runs the tests.
+     "C419",    # It's OK to use extra list comprehensions in tests to make the output more informative.
+     "PLR2004", # There are going to be some magic values in the tests. It's OK.
  ]

  [tool.ruff.lint.pydocstyle]
@@ -100,3 +124,6 @@ files = ["src/", "tests/"]

  [tool.mypy.plugins]
  pydantic.mypy.plugins = { enabled = true }
+
+ [tool.pytest.ini_options]
+ asyncio_mode = "auto"
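
The dependency-group change above moves pytest and its plugins out of `dev` into a new `test` group. A plausible way to exercise that group with uv (not taken from the project's docs; note that uv installs the `dev` group by default, while other groups are opt-in):

```bash
# Sync the environment with the new "test" dependency group, then run the suite.
uv sync --group test
uv run --group test pytest
```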
jehoctor_rag_demo-0.2.1/src/rag_demo/__main__.py (new file)

@@ -0,0 +1,42 @@
+ import time
+
+ # Measure the application start time.
+ APPLICATION_START_TIME = time.time()
+
+ # Disable "module import not at top of file" (aka E402) when importing Typer and other early imports. This is necessary
+ # so that the initialization of these modules is included in the application startup time.
+ from typing import Annotated  # noqa: E402
+
+ import typer  # noqa: E402
+
+ from rag_demo.constants import LocalProviderType  # noqa: E402
+
+
+ def _main(
+     name: Annotated[str | None, typer.Option(help="The name you want the AI to use with you.")] = None,
+     provider: Annotated[LocalProviderType | None, typer.Option(help="The local provider to prefer.")] = None,
+ ) -> None:
+     """Talk to Wikipedia."""
+     # Import here so that imports run within the typer.run context for prettier stack traces if errors occur.
+     # We ignore PLC0415 because we do not want these imports to be at the top of the module as is usually preferred.
+     import transformers  # noqa: PLC0415
+
+     from rag_demo.app import RAGDemo  # noqa: PLC0415
+     from rag_demo.logic import Logic  # noqa: PLC0415
+
+     # The transformers library likes to print text that interferes with the TUI. Disable it.
+     transformers.logging.set_verbosity(verbosity=transformers.logging.CRITICAL)
+     transformers.logging.disable_progress_bar()
+
+     logic = Logic(username=name, preferred_provider_type=provider, application_start_time=APPLICATION_START_TIME)
+     app = RAGDemo(logic)
+     app.run()
+
+
+ def main() -> None:
+     """Entrypoint for the rag demo, specifically the `chat` command."""
+     typer.run(_main)
+
+
+ if __name__ == "__main__":
+     main()
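
Given the `chat` script declared in pyproject.toml and the Typer options defined above, invocations like the following should work from a checkout. The name is only an example; `--provider` accepts a `LocalProviderType` value whose members live in `constants.py`, which is not shown in this diff.

```bash
# Show the Typer-generated help, then start the app with a preferred display name.
uv run chat --help
uv run chat --name Alice
```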
jehoctor_rag_demo-0.2.1/src/rag_demo/agents/__init__.py (new file)

@@ -0,0 +1,4 @@
+ from .base import Agent, AgentProvider
+ from .hugging_face import HuggingFaceAgent, HuggingFaceAgentProvider
+ from .llama_cpp import LlamaCppAgent, LlamaCppAgentProvider
+ from .ollama import OllamaAgent, OllamaAgentProvider
jehoctor_rag_demo-0.2.1/src/rag_demo/agents/base.py (new file)

@@ -0,0 +1,40 @@
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Final, Protocol
+
+ if TYPE_CHECKING:
+     from collections.abc import AsyncIterator
+     from contextlib import AbstractAsyncContextManager
+     from pathlib import Path
+
+     from rag_demo.app_protocol import AppProtocol
+     from rag_demo.constants import LocalProviderType
+
+
+ class Agent(Protocol):
+     """An LLM agent that supports streaming responses asynchronously."""
+
+     def astream(self, user_message: str, thread_id: str, app: AppProtocol) -> AsyncIterator[str]:
+         """Stream a response from the agent.
+
+         Args:
+             user_message (str): User's next prompt in the conversation.
+             thread_id (str): Identifier for the current thread/conversation.
+             app (AppProtocol): Application interface, commonly used for logging.
+
+         Yields:
+             str: A token from the agent's response.
+         """
+
+
+ class AgentProvider(Protocol):
+     """A strategy for creating LLM agents."""
+
+     type: Final[LocalProviderType]
+
+     def get_agent(self, checkpoints_sqlite_db: str | Path) -> AbstractAsyncContextManager[Agent | None]:
+         """Attempt to create an agent.
+
+         Args:
+             checkpoints_sqlite_db (str | Path): Connection string for SQLite database used for LangChain checkpoints.
+         """
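
For context, a caller that only knows about these protocols would use a provider roughly like this. This is an illustrative sketch, not code from the package: the `app` argument stands in for whatever implements `AppProtocol`, and the database path is arbitrary.

```python
# Sketch: consuming any AgentProvider through the protocol above.
from rag_demo.agents import LlamaCppAgentProvider


async def stream_once(app, db_path: str) -> str:
    provider = LlamaCppAgentProvider()
    reply = ""
    async with provider.get_agent(db_path) as agent:
        if agent is None:
            # The provider could not build an agent (e.g. llama-cpp-python is not installed).
            return reply
        async for token in agent.astream("Hello!", thread_id="example-thread", app=app):
            reply += token
    return reply
```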
jehoctor_rag_demo-0.2.1/src/rag_demo/agents/hugging_face.py (new file)

@@ -0,0 +1,116 @@
+ from __future__ import annotations
+
+ import asyncio
+ import sqlite3
+ from contextlib import asynccontextmanager
+ from typing import TYPE_CHECKING, Final
+
+ from huggingface_hub import hf_hub_download
+ from langchain.agents import create_agent
+ from langchain.messages import AIMessageChunk, HumanMessage
+ from langchain_huggingface import ChatHuggingFace, HuggingFaceEmbeddings, HuggingFacePipeline
+ from langgraph.checkpoint.sqlite import SqliteSaver
+
+ from rag_demo.constants import LocalProviderType
+
+ if TYPE_CHECKING:
+     from collections.abc import AsyncIterator
+     from pathlib import Path
+
+     from rag_demo.app_protocol import AppProtocol
+
+
+ class HuggingFaceAgent:
+     """An LLM agent powered by Hugging Face local pipelines."""
+
+     def __init__(
+         self,
+         checkpoints_sqlite_db: str | Path,
+         model_id: str,
+         embedding_model_id: str,
+     ) -> None:
+         """Initialize the HuggingFaceAgent.
+
+         Args:
+             checkpoints_sqlite_db (str | Path): Connection string for SQLite database used for LangChain checkpoints.
+             model_id (str): Hugging Face model ID for the LLM.
+             embedding_model_id (str): Hugging Face model ID for the embedding model.
+         """
+         self.checkpoints_sqlite_db = checkpoints_sqlite_db
+         self.model_id = model_id
+         self.embedding_model_id = embedding_model_id
+
+         self.llm = ChatHuggingFace(
+             llm=HuggingFacePipeline.from_model_id(
+                 model_id=model_id,
+                 task="text-generation",
+                 device_map="auto",
+                 pipeline_kwargs={"max_new_tokens": 4096},
+             ),
+         )
+         self.embed = HuggingFaceEmbeddings(model_name=embedding_model_id)
+         self.agent = create_agent(
+             model=self.llm,
+             system_prompt="You are a helpful assistant.",
+             checkpointer=SqliteSaver(sqlite3.Connection(self.checkpoints_sqlite_db, check_same_thread=False)),
+         )
+
+     async def astream(self, user_message: str, thread_id: str, app: AppProtocol) -> AsyncIterator[str]:
+         """Stream a response from the agent.
+
+         Args:
+             user_message (str): User's next prompt in the conversation.
+             thread_id (str): Identifier for the current thread/conversation.
+             app (AppProtocol): Application interface, commonly used for logging.
+
+         Yields:
+             str: A token from the agent's response.
+         """
+         agent_stream = self.agent.stream(
+             {"messages": [HumanMessage(content=user_message)]},
+             {"configurable": {"thread_id": thread_id}},
+             stream_mode="messages",
+         )
+         for message_chunk, _ in agent_stream:
+             if isinstance(message_chunk, AIMessageChunk):
+                 token = message_chunk.content
+                 if isinstance(token, str):
+                     yield token
+                 else:
+                     app.log.error("Received message content of type", type(token))
+             else:
+                 app.log.error("Received message chunk of type", type(message_chunk))
+
+
+ def _hf_downloads() -> None:
+     hf_hub_download(
+         repo_id="Qwen/Qwen3-0.6B",  # 1.5GB
+         filename="model.safetensors",
+         revision="c1899de289a04d12100db370d81485cdf75e47ca",
+     )
+     hf_hub_download(
+         repo_id="unsloth/embeddinggemma-300m",  # 1.21GB
+         filename="model.safetensors",
+         revision="bfa3c846ac738e62aa61806ef9112d34acb1dc5a",
+     )
+
+
+ class HuggingFaceAgentProvider:
+     """Create LLM agents using Hugging Face local pipelines."""
+
+     type: Final[LocalProviderType] = LocalProviderType.HUGGING_FACE
+
+     @asynccontextmanager
+     async def get_agent(self, checkpoints_sqlite_db: str | Path) -> AsyncIterator[HuggingFaceAgent]:
+         """Create a Hugging Face local pipeline agent.
+
+         Args:
+             checkpoints_sqlite_db (str | Path): Connection string for SQLite database used for LangChain checkpoints.
+         """
+         loop = asyncio.get_running_loop()
+         await loop.run_in_executor(None, _hf_downloads)
+         yield HuggingFaceAgent(
+             checkpoints_sqlite_db,
+             model_id="Qwen/Qwen3-0.6B",
+             embedding_model_id="unsloth/embeddinggemma-300m",
+         )
jehoctor_rag_demo-0.2.1/src/rag_demo/agents/llama_cpp.py (new file)

@@ -0,0 +1,113 @@
+ from __future__ import annotations
+
+ import asyncio
+ from contextlib import asynccontextmanager
+ from typing import TYPE_CHECKING, Final
+
+ import aiosqlite
+ from huggingface_hub import hf_hub_download
+ from langchain.agents import create_agent
+ from langchain.messages import AIMessageChunk, HumanMessage
+ from langchain_community.chat_models import ChatLlamaCpp
+ from langchain_community.embeddings import LlamaCppEmbeddings
+ from langgraph.checkpoint.sqlite.aio import AsyncSqliteSaver
+
+ from rag_demo import probe
+ from rag_demo.constants import LocalProviderType
+
+ if TYPE_CHECKING:
+     from collections.abc import AsyncIterator
+     from pathlib import Path
+
+     from rag_demo.app_protocol import AppProtocol
+
+
+ class LlamaCppAgent:
+     """An LLM agent powered by Llama.cpp."""
+
+     def __init__(
+         self,
+         checkpoints_conn: aiosqlite.Connection,
+         model_path: str,
+         embedding_model_path: str,
+     ) -> None:
+         """Initialize the LlamaCppAgent.
+
+         Args:
+             checkpoints_conn (aiosqlite.Connection): Connection to SQLite checkpoint database.
+             model_path (str): Path to Llama.cpp model.
+             embedding_model_path (str): Path to Llama.cpp embedding model.
+         """
+         self.checkpoints_conn = checkpoints_conn
+         self.llm = ChatLlamaCpp(model_path=model_path, verbose=False)
+         self.embed = LlamaCppEmbeddings(model_path=embedding_model_path, verbose=False)
+         self.agent = create_agent(
+             model=self.llm,
+             system_prompt="You are a helpful assistant.",
+             checkpointer=AsyncSqliteSaver(self.checkpoints_conn),
+         )
+
+     async def astream(self, user_message: str, thread_id: str, app: AppProtocol) -> AsyncIterator[str]:
+         """Stream a response from the agent.
+
+         Args:
+             user_message (str): User's next prompt in the conversation.
+             thread_id (str): Identifier for the current thread/conversation.
+             app (AppProtocol): Application interface, commonly used for logging.
+
+         Yields:
+             str: A token from the agent's response.
+         """
+         agent_stream = self.agent.astream(
+             {"messages": [HumanMessage(content=user_message)]},
+             {"configurable": {"thread_id": thread_id}},
+             stream_mode="messages",
+         )
+         async for message_chunk, _ in agent_stream:
+             if isinstance(message_chunk, AIMessageChunk):
+                 token = message_chunk.content
+                 if isinstance(token, str):
+                     yield token
+                 else:
+                     app.log.error("Received message content of type", type(token))
+             else:
+                 app.log.error("Received message chunk of type", type(message_chunk))
+
+
+ def _hf_downloads() -> tuple[str, str]:
+     model_path = hf_hub_download(
+         repo_id="bartowski/google_gemma-3-4b-it-GGUF",
+         filename="google_gemma-3-4b-it-Q6_K_L.gguf",  # 3.35GB
+         revision="71506238f970075ca85125cd749c28b1b0eee84e",
+     )
+     embedding_model_path = hf_hub_download(
+         repo_id="CompendiumLabs/bge-small-en-v1.5-gguf",
+         filename="bge-small-en-v1.5-q8_0.gguf",  # 36.8MB
+         revision="d32f8c040ea3b516330eeb75b72bcc2d3a780ab7",
+     )
+     return model_path, embedding_model_path
+
+
+ class LlamaCppAgentProvider:
+     """Create LLM agents using Llama.cpp."""
+
+     type: Final[LocalProviderType] = LocalProviderType.LLAMA_CPP
+
+     @asynccontextmanager
+     async def get_agent(self, checkpoints_sqlite_db: str | Path) -> AsyncIterator[LlamaCppAgent | None]:
+         """Attempt to create a Llama.cpp agent.
+
+         Args:
+             checkpoints_sqlite_db (str | Path): Connection string for SQLite database used for LangChain checkpoints.
+         """
+         if probe.probe_llama_available():
+             loop = asyncio.get_running_loop()
+             model_path, embedding_model_path = await loop.run_in_executor(None, _hf_downloads)
+             async with aiosqlite.connect(database=checkpoints_sqlite_db) as checkpoints_conn:
+                 yield LlamaCppAgent(
+                     checkpoints_conn=checkpoints_conn,
+                     model_path=model_path,
+                     embedding_model_path=embedding_model_path,
+                 )
+         else:
+             yield None
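
The provider-selection logic itself lives in src/rag_demo/logic.py (+201 lines, not shown in this diff). Judging only from the interfaces above, falling back across providers could look something like this hypothetical sketch; the ordering, the `app` object, and the database path are assumptions, not the package's actual behavior:

```python
# Hypothetical sketch: try providers in order and use the first one that yields an agent.
from rag_demo.agents import HuggingFaceAgentProvider, LlamaCppAgentProvider


async def answer_with_first_available(app, db_path: str, prompt: str) -> str | None:
    for provider in (LlamaCppAgentProvider(), HuggingFaceAgentProvider()):
        async with provider.get_agent(db_path) as agent:
            if agent is None:
                continue  # e.g. the optional 'llamacpp' extra is not installed
            return "".join([token async for token in agent.astream(prompt, thread_id="t1", app=app)])
    return None
```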