npcsh 0.3.31__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. npcsh/_state.py +942 -0
  2. npcsh/alicanto.py +1074 -0
  3. npcsh/guac.py +785 -0
  4. npcsh/mcp_helpers.py +357 -0
  5. npcsh/mcp_npcsh.py +822 -0
  6. npcsh/mcp_server.py +184 -0
  7. npcsh/npc.py +218 -0
  8. npcsh/npcsh.py +1161 -0
  9. npcsh/plonk.py +387 -269
  10. npcsh/pti.py +234 -0
  11. npcsh/routes.py +958 -0
  12. npcsh/spool.py +315 -0
  13. npcsh/wander.py +550 -0
  14. npcsh/yap.py +573 -0
  15. npcsh-1.0.0.dist-info/METADATA +596 -0
  16. npcsh-1.0.0.dist-info/RECORD +21 -0
  17. {npcsh-0.3.31.dist-info → npcsh-1.0.0.dist-info}/WHEEL +1 -1
  18. npcsh-1.0.0.dist-info/entry_points.txt +9 -0
  19. {npcsh-0.3.31.dist-info → npcsh-1.0.0.dist-info}/licenses/LICENSE +1 -1
  20. npcsh/audio.py +0 -210
  21. npcsh/cli.py +0 -545
  22. npcsh/command_history.py +0 -566
  23. npcsh/conversation.py +0 -291
  24. npcsh/data_models.py +0 -46
  25. npcsh/dataframes.py +0 -163
  26. npcsh/embeddings.py +0 -168
  27. npcsh/helpers.py +0 -641
  28. npcsh/image.py +0 -298
  29. npcsh/image_gen.py +0 -79
  30. npcsh/knowledge_graph.py +0 -1006
  31. npcsh/llm_funcs.py +0 -2027
  32. npcsh/load_data.py +0 -83
  33. npcsh/main.py +0 -5
  34. npcsh/model_runner.py +0 -189
  35. npcsh/npc_compiler.py +0 -2870
  36. npcsh/npc_sysenv.py +0 -383
  37. npcsh/npc_team/assembly_lines/test_pipeline.py +0 -181
  38. npcsh/npc_team/corca.npc +0 -13
  39. npcsh/npc_team/foreman.npc +0 -7
  40. npcsh/npc_team/npcsh.ctx +0 -11
  41. npcsh/npc_team/sibiji.npc +0 -4
  42. npcsh/npc_team/templates/analytics/celona.npc +0 -0
  43. npcsh/npc_team/templates/hr_support/raone.npc +0 -0
  44. npcsh/npc_team/templates/humanities/eriane.npc +0 -4
  45. npcsh/npc_team/templates/it_support/lineru.npc +0 -0
  46. npcsh/npc_team/templates/marketing/slean.npc +0 -4
  47. npcsh/npc_team/templates/philosophy/maurawa.npc +0 -0
  48. npcsh/npc_team/templates/sales/turnic.npc +0 -4
  49. npcsh/npc_team/templates/software/welxor.npc +0 -0
  50. npcsh/npc_team/tools/bash_executer.tool +0 -32
  51. npcsh/npc_team/tools/calculator.tool +0 -8
  52. npcsh/npc_team/tools/code_executor.tool +0 -16
  53. npcsh/npc_team/tools/generic_search.tool +0 -27
  54. npcsh/npc_team/tools/image_generation.tool +0 -25
  55. npcsh/npc_team/tools/local_search.tool +0 -149
  56. npcsh/npc_team/tools/npcsh_executor.tool +0 -9
  57. npcsh/npc_team/tools/screen_cap.tool +0 -27
  58. npcsh/npc_team/tools/sql_executor.tool +0 -26
  59. npcsh/response.py +0 -623
  60. npcsh/search.py +0 -248
  61. npcsh/serve.py +0 -1460
  62. npcsh/shell.py +0 -538
  63. npcsh/shell_helpers.py +0 -3529
  64. npcsh/stream.py +0 -700
  65. npcsh/video.py +0 -49
  66. npcsh-0.3.31.data/data/npcsh/npc_team/bash_executer.tool +0 -32
  67. npcsh-0.3.31.data/data/npcsh/npc_team/calculator.tool +0 -8
  68. npcsh-0.3.31.data/data/npcsh/npc_team/celona.npc +0 -0
  69. npcsh-0.3.31.data/data/npcsh/npc_team/code_executor.tool +0 -16
  70. npcsh-0.3.31.data/data/npcsh/npc_team/corca.npc +0 -13
  71. npcsh-0.3.31.data/data/npcsh/npc_team/eriane.npc +0 -4
  72. npcsh-0.3.31.data/data/npcsh/npc_team/foreman.npc +0 -7
  73. npcsh-0.3.31.data/data/npcsh/npc_team/generic_search.tool +0 -27
  74. npcsh-0.3.31.data/data/npcsh/npc_team/image_generation.tool +0 -25
  75. npcsh-0.3.31.data/data/npcsh/npc_team/lineru.npc +0 -0
  76. npcsh-0.3.31.data/data/npcsh/npc_team/local_search.tool +0 -149
  77. npcsh-0.3.31.data/data/npcsh/npc_team/maurawa.npc +0 -0
  78. npcsh-0.3.31.data/data/npcsh/npc_team/npcsh.ctx +0 -11
  79. npcsh-0.3.31.data/data/npcsh/npc_team/npcsh_executor.tool +0 -9
  80. npcsh-0.3.31.data/data/npcsh/npc_team/raone.npc +0 -0
  81. npcsh-0.3.31.data/data/npcsh/npc_team/screen_cap.tool +0 -27
  82. npcsh-0.3.31.data/data/npcsh/npc_team/sibiji.npc +0 -4
  83. npcsh-0.3.31.data/data/npcsh/npc_team/slean.npc +0 -4
  84. npcsh-0.3.31.data/data/npcsh/npc_team/sql_executor.tool +0 -26
  85. npcsh-0.3.31.data/data/npcsh/npc_team/test_pipeline.py +0 -181
  86. npcsh-0.3.31.data/data/npcsh/npc_team/turnic.npc +0 -4
  87. npcsh-0.3.31.data/data/npcsh/npc_team/welxor.npc +0 -0
  88. npcsh-0.3.31.dist-info/METADATA +0 -1853
  89. npcsh-0.3.31.dist-info/RECORD +0 -76
  90. npcsh-0.3.31.dist-info/entry_points.txt +0 -3
  91. {npcsh-0.3.31.dist-info → npcsh-1.0.0.dist-info}/top_level.txt +0 -0
npcsh/conversation.py DELETED
@@ -1,291 +0,0 @@
1
- ########
2
- ########
3
- ########
4
- ########
5
- ######## CONVERSATION
6
- ########
7
- from typing import Any, Dict, Generator, List
8
- import os
9
- import anthropic
10
-
11
- from openai import OpenAI
12
- from google.generativeai import types
13
- import google.generativeai as genai
14
- from .npc_sysenv import get_system_message
15
-
16
-
17
- def get_ollama_conversation(
18
- messages: List[Dict[str, str]],
19
- model: str,
20
- npc: Any = None,
21
- tools: list = None,
22
- images=None,
23
- **kwargs,
24
- ) -> List[Dict[str, str]]:
25
- """
26
- Function Description:
27
- This function generates a conversation using the Ollama API.
28
- Args:
29
- messages (List[Dict[str, str]]): The list of messages in the conversation.
30
- model (str): The model to use for the conversation.
31
- Keyword Args:
32
- npc (Any): The NPC object.
33
- Returns:
34
- List[Dict[str, str]]: The list of messages in the conversation.
35
- """
36
- import ollama
37
-
38
- messages_copy = messages.copy()
39
- if messages_copy[0]["role"] != "system":
40
- if npc is not None:
41
- system_message = get_system_message(npc)
42
- messages_copy.insert(0, {"role": "system", "content": system_message})
43
-
44
- response = ollama.chat(model=model, messages=messages_copy)
45
- messages_copy.append(response["message"])
46
- return messages_copy
47
-
48
-
49
- def get_openai_conversation(
50
- messages: List[Dict[str, str]],
51
- model: str,
52
- npc: Any = None,
53
- tools: list = None,
54
- api_key: str = None,
55
- images=None,
56
- **kwargs,
57
- ) -> List[Dict[str, str]]:
58
- """
59
- Function Description:
60
- This function generates a conversation using the OpenAI API.
61
- Args:
62
- messages (List[Dict[str, str]]): The list of messages in the conversation.
63
- model (str): The model to use for the conversation.
64
- Keyword Args:
65
- npc (Any): The NPC object.
66
- api_key (str): The API key for accessing the OpenAI API.
67
- Returns:
68
- List[Dict[str, str]]: The list of messages in the conversation.
69
- """
70
-
71
- try:
72
- if api_key is None:
73
- api_key = os.environ["OPENAI_API_KEY"]
74
- client = OpenAI(api_key=api_key)
75
-
76
- system_message = (
77
- get_system_message(npc) if npc else "You are a helpful assistant."
78
- )
79
-
80
- if messages is None:
81
- messages = []
82
-
83
- # Ensure the system message is at the beginning
84
- if not any(msg["role"] == "system" for msg in messages):
85
- messages.insert(0, {"role": "system", "content": system_message})
86
-
87
- # messages should already include the user's latest message
88
-
89
- # Make the API call with the messages including the latest user input
90
- completion = client.chat.completions.create(
91
- model=model, messages=messages, **kwargs
92
- )
93
-
94
- response_message = completion.choices[0].message
95
- messages.append({"role": "assistant", "content": response_message.content})
96
-
97
- return messages
98
-
99
- except Exception as e:
100
- return f"Error interacting with OpenAI: {e}"
101
-
102
-
103
- def get_openai_like_conversation(
104
- messages: List[Dict[str, str]],
105
- model: str,
106
- api_url: str,
107
- npc: Any = None,
108
- images=None,
109
- tools: list = None,
110
- api_key: str = None,
111
- **kwargs,
112
- ) -> List[Dict[str, str]]:
113
- """
114
- Function Description:
115
- This function generates a conversation using an OpenAI-like API.
116
- Args:
117
- messages (List[Dict[str, str]]): The list of messages in the conversation.
118
- model (str): The model to use for the conversation.
119
- Keyword Args:
120
- npc (Any): The NPC object.
121
- api_url (str): The URL of the API endpoint.
122
- api_key (str): The API key for accessing the API.
123
- Returns:
124
- List[Dict[str, str]]: The list of messages in the conversation.
125
- """
126
-
127
- if api_url is None:
128
- raise ValueError("api_url is required for openai-like provider")
129
- if api_key is None:
130
- api_key = "dummy_api_key"
131
- try:
132
- client = OpenAI(api_key=api_key, base_url=api_url)
133
-
134
- system_message = (
135
- get_system_message(npc) if npc else "You are a helpful assistant."
136
- )
137
-
138
- if messages is None:
139
- messages = []
140
-
141
- # Ensure the system message is at the beginning
142
- if not any(msg["role"] == "system" for msg in messages):
143
- messages.insert(0, {"role": "system", "content": system_message})
144
-
145
- # messages should already include the user's latest message
146
-
147
- # Make the API call with the messages including the latest user input
148
-
149
- completion = client.chat.completions.create(
150
- model=model, messages=messages, **kwargs
151
- )
152
- response_message = completion.choices[0].message
153
- messages.append({"role": "assistant", "content": response_message.content})
154
-
155
- return messages
156
-
157
- except Exception as e:
158
- return f"Error interacting with OpenAI: {e}"
159
-
160
- return messages
161
-
162
-
163
- def get_anthropic_conversation(
164
- messages: List[Dict[str, str]],
165
- model: str,
166
- npc: Any = None,
167
- tools: list = None,
168
- images=None,
169
- api_key: str = None,
170
- **kwargs,
171
- ) -> List[Dict[str, str]]:
172
- """
173
- Function Description:
174
- This function generates a conversation using the Anthropic API.
175
- Args:
176
- messages (List[Dict[str, str]]): The list of messages in the conversation.
177
- model (str): The model to use for the conversation.
178
- Keyword Args:
179
- npc (Any): The NPC object.
180
- api_key (str): The API key for accessing the Anthropic API.
181
- Returns:
182
- List[Dict[str, str]]: The list of messages in the conversation.
183
- """
184
-
185
- try:
186
- if api_key is None:
187
- api_key = os.getenv("ANTHROPIC_API_KEY", None)
188
- system_message = get_system_message(npc) if npc else ""
189
- client = anthropic.Anthropic(api_key=api_key)
190
- last_user_message = None
191
- for msg in reversed(messages):
192
- if msg["role"] == "user":
193
- last_user_message = msg["content"]
194
- break
195
-
196
- if last_user_message is None:
197
- raise ValueError("No user message found in the conversation history.")
198
-
199
- # if a sys message is in messages, remove it
200
- if messages[0]["role"] == "system":
201
- messages.pop(0)
202
-
203
- message = client.messages.create(
204
- model=model,
205
- system=system_message, # Include system message in each turn for Anthropic
206
- messages=messages, # Send only the last user message
207
- max_tokens=8192,
208
- **kwargs,
209
- )
210
-
211
- messages.append({"role": "assistant", "content": message.content[0].text})
212
-
213
- return messages
214
-
215
- except Exception as e:
216
- return f"Error interacting with Anthropic conversations: {e}"
217
-
218
-
219
- def get_gemini_conversation(
220
- messages: List[Dict[str, str]],
221
- model: str,
222
- npc: Any = None,
223
- tools: list = None,
224
- api_key: str = None,
225
- ) -> List[Dict[str, str]]:
226
- """
227
- Function Description:
228
- This function generates a conversation using the Gemini API.
229
- Args:
230
- messages (List[Dict[str, str]]): The list of messages in the conversation.
231
- model (str): The model to use for the conversation.
232
- Keyword Args:
233
- npc (Any): The NPC object.
234
- Returns:
235
- List[Dict[str, str]]: The list of messages in the conversation.
236
- """
237
- # Make the API call to Gemini
238
-
239
- # print(messages)
240
- response = get_gemini_response(
241
- messages[-1]["content"], model, messages=messages[1:], npc=npc
242
- )
243
- # print(response)
244
- return response.get("messages", [])
245
-
246
-
247
- def get_deepseek_conversation(
248
- messages: List[Dict[str, str]],
249
- model: str,
250
- npc: Any = None,
251
- tools: list = None,
252
- api_key: str = None,
253
- ) -> List[Dict[str, str]]:
254
- """
255
- Function Description:
256
- This function generates a conversation using the DeepSeek API.
257
- Args:
258
- messages (List[Dict[str, str]]): The list of messages in the conversation.
259
- model (str): The model to use for the conversation.
260
- Keyword Args:
261
- npc (Any): The NPC object.
262
- Returns:
263
- List[Dict[str, str]]: The list of messages in the conversation.
264
- """
265
-
266
- system_message = get_system_message(npc) if npc else "You are a helpful assistant."
267
-
268
- # Prepare the messages list
269
- if messages is None or len(messages) == 0:
270
- messages = [{"role": "system", "content": system_message}]
271
- elif not any(msg["role"] == "system" for msg in messages):
272
- messages.insert(0, {"role": "system", "content": system_message})
273
-
274
- # Make the API call to DeepSeek
275
- try:
276
- response = get_deepseek_response(
277
- messages[-1]["content"], model, messages=messages, npc=npc
278
- )
279
- messages.append(
280
- {"role": "assistant", "content": response.get("response", "No response")}
281
- )
282
-
283
- except Exception as e:
284
- messages.append(
285
- {
286
- "role": "assistant",
287
- "content": f"Error interacting with DeepSeek: {str(e)}",
288
- }
289
- )
290
-
291
- return messages
npcsh/data_models.py DELETED
@@ -1,46 +0,0 @@
1
- from pydantic import BaseModel
2
- from typing import List, Dict
3
-
4
-
5
- class NPC(BaseModel):
6
- name: str
7
- primary_directive: str
8
- model: str
9
- provider: str
10
- api_url: str
11
- tools: List[str]
12
- use_default_tools: bool
13
-
14
-
15
- class Tool(BaseModel):
16
- tool_name: str
17
- description: str
18
- steps: List[Dict[str, str]]
19
-
20
-
21
- class ToolStep(BaseModel):
22
- engine: str
23
- code: str
24
-
25
-
26
- class Context(BaseModel):
27
- databases: List[str]
28
- files: List[str]
29
- vars: List[Dict[str, str]]
30
-
31
-
32
- class Pipeline(BaseModel):
33
- steps: List[Dict[str, str]]
34
-
35
-
36
- class PipelineStep(BaseModel):
37
- tool: str
38
- args: List[str]
39
- model: str
40
- provider: str
41
- task: str
42
- npc: str
43
-
44
-
45
- class Fabrication(BaseModel):
46
- spell: str
npcsh/dataframes.py DELETED
@@ -1,163 +0,0 @@
1
- ## functions for dataframes
2
- import os
3
- import sqlite3
4
- import json
5
- import pandas as pd
6
- import numpy as np
7
- import io
8
- from PIL import Image
9
- from typing import Optional
10
-
11
- from .llm_funcs import get_llm_response
12
- from .audio import process_audio
13
- from .video import process_video
14
-
15
- from .load_data import load_pdf, load_csv, load_json, load_excel, load_txt, load_image
16
-
17
-
18
- def load_data_into_table(
19
- file_path: str, table_name: str, cursor: sqlite3.Cursor, conn: sqlite3.Connection
20
- ) -> None:
21
- """
22
- Function Description:
23
- This function is used to load data into a table.
24
- Args:
25
- file_path : str : The file path.
26
- table_name : str : The table name.
27
- cursor : sqlite3.Cursor : The SQLite cursor.
28
- conn : sqlite3.Connection : The SQLite connection.
29
- Keyword Args:
30
- None
31
- Returns:
32
- None
33
- """
34
- try:
35
- if not os.path.exists(file_path):
36
- raise FileNotFoundError(f"File not found: {file_path}")
37
-
38
- # Determine file type and load data
39
- if file_path.endswith(".csv"):
40
- df = pd.read_csv(file_path)
41
- elif file_path.endswith(".pdf"):
42
- df = load_pdf(file_path)
43
- elif file_path.endswith((".txt", ".log", ".md")):
44
- df = load_txt(file_path)
45
- elif file_path.endswith((".xls", ".xlsx")):
46
- df = load_excel(file_path)
47
- elif file_path.lower().endswith(
48
- (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".tiff")
49
- ):
50
- # Handle images as NumPy arrays
51
- df = load_image(file_path)
52
- elif file_path.lower().endswith(
53
- (".mp4", ".avi", ".mov", ".mkv")
54
- ): # Video files
55
- video_frames, audio_array = process_video(file_path)
56
- # Store video frames and audio
57
- df = pd.DataFrame(
58
- {
59
- "video_frames": [video_frames.tobytes()],
60
- "shape": [video_frames.shape],
61
- "dtype": [video_frames.dtype.str],
62
- "audio_array": (
63
- [audio_array.tobytes()] if audio_array is not None else None
64
- ),
65
- "audio_rate": [sr] if audio_array is not None else None,
66
- }
67
- )
68
-
69
- elif file_path.lower().endswith((".mp3", ".wav", ".ogg")): # Audio files
70
- audio_array, sr = process_audio(file_path)
71
- df = pd.DataFrame(
72
- {
73
- "audio_array": [audio_array.tobytes()],
74
- "audio_rate": [sr],
75
- }
76
- )
77
- else:
78
- # Attempt to load as text if no other type matches
79
- try:
80
- df = load_txt(file_path)
81
- except Exception as e:
82
- print(f"Could not load file: {e}")
83
- return
84
-
85
- # Store DataFrame in the database
86
- df.to_sql(table_name, conn, if_exists="replace", index=False)
87
- print(f"Data from '{file_path}' loaded into table '{table_name}'")
88
-
89
- except Exception as e:
90
- raise e # Re-raise the exception for handling in enter_observation_mode
91
-
92
-
93
- def create_new_table(cursor: sqlite3.Cursor, conn: sqlite3.Connection) -> None:
94
- """
95
- Function Description:
96
- This function is used to create a new table.
97
- Args:
98
- cursor : sqlite3.Cursor : The SQLite cursor.
99
- conn : sqlite3.Connection : The SQLite connection.
100
- Keyword Args:
101
- None
102
- Returns:
103
- None
104
- """
105
-
106
- table_name = input("Enter new table name: ").strip()
107
- columns = input("Enter column names separated by commas: ").strip()
108
-
109
- create_query = (
110
- f"CREATE TABLE {table_name} (id INTEGER PRIMARY KEY AUTOINCREMENT, {columns})"
111
- )
112
- cursor.execute(create_query)
113
- conn.commit()
114
- print(f"Table '{table_name}' created successfully.")
115
-
116
-
117
- def delete_table(cursor: sqlite3.Cursor, conn: sqlite3.Connection) -> None:
118
- """
119
- Function Description:
120
- This function is used to delete a table.
121
- Args:
122
- cursor : sqlite3.Cursor : The SQLite cursor.
123
- conn : sqlite3.Connection : The SQLite connection.
124
- Keyword Args:
125
- None
126
- Returns:
127
- None
128
- """
129
-
130
- table_name = input("Enter table name to delete: ").strip()
131
- cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
132
- conn.commit()
133
- print(f"Table '{table_name}' deleted successfully.")
134
-
135
-
136
- def add_observation(
137
- cursor: sqlite3.Cursor, conn: sqlite3.Connection, table_name: str
138
- ) -> None:
139
- """
140
- Function Description:
141
- This function is used to add an observation.
142
- Args:
143
- cursor : sqlite3.Cursor : The SQLite cursor.
144
- conn : sqlite3.Connection : The SQLite connection.
145
- table_name : str : The table name.
146
- Keyword Args:
147
- None
148
- Returns:
149
- None
150
- """
151
-
152
- cursor.execute(f"PRAGMA table_info({table_name})")
153
- columns = [column[1] for column in cursor.fetchall() if column[1] != "id"]
154
-
155
- values = []
156
- for column in columns:
157
- value = input(f"Enter value for {column}: ").strip()
158
- values.append(value)
159
-
160
- insert_query = f"INSERT INTO {table_name} ({','.join(columns)}) VALUES ({','.join(['?' for _ in columns])})"
161
- cursor.execute(insert_query, values)
162
- conn.commit()
163
- print("Observation added successfully.")
npcsh/embeddings.py DELETED
@@ -1,168 +0,0 @@
1
- #######
2
- #######
3
- #######
4
- #######
5
- ####### EMBEDDINGS
6
- #######
7
- from typing import List, Dict, Optional
8
- import numpy as np
9
- from npcsh.npc_sysenv import (
10
- NPCSH_VECTOR_DB_PATH,
11
- NPCSH_EMBEDDING_MODEL,
12
- NPCSH_EMBEDDING_PROVIDER,
13
- chroma_client,
14
- )
15
- from openai import OpenAI
16
- import anthropic
17
-
18
-
19
- def get_ollama_embeddings(
20
- texts: List[str], model: str = "nomic-embed-text"
21
- ) -> List[List[float]]:
22
- """Generate embeddings using Ollama."""
23
- import ollama
24
-
25
- embeddings = []
26
- for text in texts:
27
- response = ollama.embeddings(model=model, prompt=text)
28
- embeddings.append(response["embedding"])
29
- return embeddings
30
-
31
-
32
- def get_openai_embeddings(
33
- texts: List[str], model: str = "text-embedding-3-small"
34
- ) -> List[List[float]]:
35
- """Generate embeddings using OpenAI."""
36
- client = OpenAI(api_key=openai_api_key)
37
- response = client.embeddings.create(input=texts, model=model)
38
- return [embedding.embedding for embedding in response.data]
39
-
40
-
41
- def get_openai_like_embeddings(
42
- texts: List[str], model, api_url=None, api_key=None
43
- ) -> List[List[float]]:
44
- """Generate embeddings using OpenAI."""
45
- client = OpenAI(api_key=openai_api_key, base_url=api_url)
46
- response = client.embeddings.create(input=texts, model=model)
47
- return [embedding.embedding for embedding in response.data]
48
-
49
-
50
- def get_anthropic_embeddings(
51
- texts: List[str], model: str = "claude-3-haiku-20240307"
52
- ) -> List[List[float]]:
53
- """Generate embeddings using Anthropic."""
54
- client = anthropic.Anthropic(api_key=anthropic_api_key)
55
- embeddings = []
56
- for text in texts:
57
- # response = client.messages.create(
58
- # model=model, max_tokens=1024, messages=[{"role": "user", "content": text}]
59
- # )
60
- # Placeholder for actual embedding
61
- embeddings.append([0.0] * 1024) # Replace with actual embedding when available
62
- return embeddings
63
-
64
-
65
- def store_embeddings_for_model(
66
- texts,
67
- embeddings,
68
- metadata=None,
69
- model: str = NPCSH_EMBEDDING_MODEL,
70
- provider: str = NPCSH_EMBEDDING_PROVIDER,
71
- ):
72
- collection_name = f"{provider}_{model}_embeddings"
73
- collection = chroma_client.get_collection(collection_name)
74
-
75
- # Create meaningful metadata for each document (adjust as necessary)
76
- if metadata is None:
77
- metadata = [{"text_length": len(text)} for text in texts] # Example metadata
78
- print(
79
- "metadata is none, creating metadata for each document as the length of the text"
80
- )
81
- # Add embeddings to the collection with metadata
82
- collection.add(
83
- ids=[str(i) for i in range(len(texts))],
84
- embeddings=embeddings,
85
- metadatas=metadata, # Passing populated metadata
86
- documents=texts,
87
- )
88
-
89
-
90
- def delete_embeddings_from_collection(collection, ids):
91
- """Delete embeddings by id from Chroma collection."""
92
- if ids:
93
- collection.delete(ids=ids) # Only delete if ids are provided
94
-
95
-
96
- def search_similar_texts(
97
- query: str,
98
- docs_to_embed: Optional[List[str]] = None,
99
- top_k: int = 5,
100
- db_path: str = NPCSH_VECTOR_DB_PATH,
101
- embedding_model: str = NPCSH_EMBEDDING_MODEL,
102
- embedding_provider: str = NPCSH_EMBEDDING_PROVIDER,
103
- ) -> List[Dict[str, any]]:
104
- """
105
- Search for similar texts using either a Chroma database or direct embedding comparison.
106
- """
107
-
108
- print(f"\nQuery to embed: {query}")
109
- embedded_search_term = get_ollama_embeddings([query], embedding_model)[0]
110
- # print(f"Query embedding: {embedded_search_term}")
111
-
112
- if docs_to_embed is None:
113
- # Fetch from the database if no documents to embed are provided
114
- collection_name = f"{embedding_provider}_{embedding_model}_embeddings"
115
- collection = chroma_client.get_collection(collection_name)
116
- results = collection.query(
117
- query_embeddings=[embedded_search_term], n_results=top_k
118
- )
119
- # Constructing and returning results
120
- return [
121
- {"id": id, "score": float(distance), "text": document}
122
- for id, distance, document in zip(
123
- results["ids"][0], results["distances"][0], results["documents"][0]
124
- )
125
- ]
126
-
127
- print(f"\nNumber of documents to embed: {len(docs_to_embed)}")
128
-
129
- # Get embeddings for provided documents
130
- raw_embeddings = get_ollama_embeddings(docs_to_embed, embedding_model)
131
-
132
- output_embeddings = []
133
- for idx, emb in enumerate(raw_embeddings):
134
- if emb: # Exclude any empty embeddings
135
- output_embeddings.append(emb)
136
-
137
- # Convert to numpy arrays for calculations
138
- doc_embeddings = np.array(output_embeddings)
139
- query_embedding = np.array(embedded_search_term)
140
-
141
- # Check for zero-length embeddings
142
- if len(doc_embeddings) == 0:
143
- raise ValueError("No valid document embeddings found")
144
-
145
- # Normalize embeddings to avoid division by zeros
146
- doc_norms = np.linalg.norm(doc_embeddings, axis=1, keepdims=True)
147
- query_norm = np.linalg.norm(query_embedding)
148
-
149
- # Ensure no zero vectors are being used in cosine similarity
150
- if query_norm == 0:
151
- raise ValueError("Query embedding is zero-length")
152
-
153
- # Calculate cosine similarities
154
- cosine_similarities = np.dot(doc_embeddings, query_embedding) / (
155
- doc_norms.flatten() * query_norm
156
- )
157
-
158
- # Get indices of top K documents
159
- top_indices = np.argsort(cosine_similarities)[::-1][:top_k]
160
-
161
- return [
162
- {
163
- "id": str(idx),
164
- "score": float(cosine_similarities[idx]),
165
- "text": docs_to_embed[idx],
166
- }
167
- for idx in top_indices
168
- ]