ag2 0.4b1__py3-none-any.whl → 0.4.2b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ag2 might be problematic; see the registry's advisory page for more details.

Files changed (118)
  1. ag2-0.4.2b1.dist-info/METADATA +19 -0
  2. ag2-0.4.2b1.dist-info/RECORD +6 -0
  3. ag2-0.4.2b1.dist-info/top_level.txt +1 -0
  4. ag2-0.4b1.dist-info/METADATA +0 -496
  5. ag2-0.4b1.dist-info/RECORD +0 -115
  6. ag2-0.4b1.dist-info/top_level.txt +0 -1
  7. autogen/__init__.py +0 -17
  8. autogen/_pydantic.py +0 -116
  9. autogen/agentchat/__init__.py +0 -42
  10. autogen/agentchat/agent.py +0 -142
  11. autogen/agentchat/assistant_agent.py +0 -85
  12. autogen/agentchat/chat.py +0 -306
  13. autogen/agentchat/contrib/__init__.py +0 -0
  14. autogen/agentchat/contrib/agent_builder.py +0 -787
  15. autogen/agentchat/contrib/agent_optimizer.py +0 -450
  16. autogen/agentchat/contrib/capabilities/__init__.py +0 -0
  17. autogen/agentchat/contrib/capabilities/agent_capability.py +0 -21
  18. autogen/agentchat/contrib/capabilities/generate_images.py +0 -297
  19. autogen/agentchat/contrib/capabilities/teachability.py +0 -406
  20. autogen/agentchat/contrib/capabilities/text_compressors.py +0 -72
  21. autogen/agentchat/contrib/capabilities/transform_messages.py +0 -92
  22. autogen/agentchat/contrib/capabilities/transforms.py +0 -565
  23. autogen/agentchat/contrib/capabilities/transforms_util.py +0 -120
  24. autogen/agentchat/contrib/capabilities/vision_capability.py +0 -217
  25. autogen/agentchat/contrib/captainagent.py +0 -487
  26. autogen/agentchat/contrib/gpt_assistant_agent.py +0 -545
  27. autogen/agentchat/contrib/graph_rag/__init__.py +0 -0
  28. autogen/agentchat/contrib/graph_rag/document.py +0 -24
  29. autogen/agentchat/contrib/graph_rag/falkor_graph_query_engine.py +0 -76
  30. autogen/agentchat/contrib/graph_rag/graph_query_engine.py +0 -50
  31. autogen/agentchat/contrib/graph_rag/graph_rag_capability.py +0 -56
  32. autogen/agentchat/contrib/img_utils.py +0 -390
  33. autogen/agentchat/contrib/llamaindex_conversable_agent.py +0 -123
  34. autogen/agentchat/contrib/llava_agent.py +0 -176
  35. autogen/agentchat/contrib/math_user_proxy_agent.py +0 -471
  36. autogen/agentchat/contrib/multimodal_conversable_agent.py +0 -128
  37. autogen/agentchat/contrib/qdrant_retrieve_user_proxy_agent.py +0 -325
  38. autogen/agentchat/contrib/retrieve_assistant_agent.py +0 -56
  39. autogen/agentchat/contrib/retrieve_user_proxy_agent.py +0 -701
  40. autogen/agentchat/contrib/society_of_mind_agent.py +0 -203
  41. autogen/agentchat/contrib/swarm_agent.py +0 -414
  42. autogen/agentchat/contrib/text_analyzer_agent.py +0 -76
  43. autogen/agentchat/contrib/tool_retriever.py +0 -114
  44. autogen/agentchat/contrib/vectordb/__init__.py +0 -0
  45. autogen/agentchat/contrib/vectordb/base.py +0 -243
  46. autogen/agentchat/contrib/vectordb/chromadb.py +0 -326
  47. autogen/agentchat/contrib/vectordb/mongodb.py +0 -559
  48. autogen/agentchat/contrib/vectordb/pgvectordb.py +0 -958
  49. autogen/agentchat/contrib/vectordb/qdrant.py +0 -334
  50. autogen/agentchat/contrib/vectordb/utils.py +0 -126
  51. autogen/agentchat/contrib/web_surfer.py +0 -305
  52. autogen/agentchat/conversable_agent.py +0 -2908
  53. autogen/agentchat/groupchat.py +0 -1668
  54. autogen/agentchat/user_proxy_agent.py +0 -109
  55. autogen/agentchat/utils.py +0 -207
  56. autogen/browser_utils.py +0 -291
  57. autogen/cache/__init__.py +0 -10
  58. autogen/cache/abstract_cache_base.py +0 -78
  59. autogen/cache/cache.py +0 -182
  60. autogen/cache/cache_factory.py +0 -85
  61. autogen/cache/cosmos_db_cache.py +0 -150
  62. autogen/cache/disk_cache.py +0 -109
  63. autogen/cache/in_memory_cache.py +0 -61
  64. autogen/cache/redis_cache.py +0 -128
  65. autogen/code_utils.py +0 -745
  66. autogen/coding/__init__.py +0 -22
  67. autogen/coding/base.py +0 -113
  68. autogen/coding/docker_commandline_code_executor.py +0 -262
  69. autogen/coding/factory.py +0 -45
  70. autogen/coding/func_with_reqs.py +0 -203
  71. autogen/coding/jupyter/__init__.py +0 -22
  72. autogen/coding/jupyter/base.py +0 -32
  73. autogen/coding/jupyter/docker_jupyter_server.py +0 -164
  74. autogen/coding/jupyter/embedded_ipython_code_executor.py +0 -182
  75. autogen/coding/jupyter/jupyter_client.py +0 -224
  76. autogen/coding/jupyter/jupyter_code_executor.py +0 -161
  77. autogen/coding/jupyter/local_jupyter_server.py +0 -168
  78. autogen/coding/local_commandline_code_executor.py +0 -410
  79. autogen/coding/markdown_code_extractor.py +0 -44
  80. autogen/coding/utils.py +0 -57
  81. autogen/exception_utils.py +0 -46
  82. autogen/extensions/__init__.py +0 -0
  83. autogen/formatting_utils.py +0 -76
  84. autogen/function_utils.py +0 -362
  85. autogen/graph_utils.py +0 -148
  86. autogen/io/__init__.py +0 -15
  87. autogen/io/base.py +0 -105
  88. autogen/io/console.py +0 -43
  89. autogen/io/websockets.py +0 -213
  90. autogen/logger/__init__.py +0 -11
  91. autogen/logger/base_logger.py +0 -140
  92. autogen/logger/file_logger.py +0 -287
  93. autogen/logger/logger_factory.py +0 -29
  94. autogen/logger/logger_utils.py +0 -42
  95. autogen/logger/sqlite_logger.py +0 -459
  96. autogen/math_utils.py +0 -356
  97. autogen/oai/__init__.py +0 -33
  98. autogen/oai/anthropic.py +0 -428
  99. autogen/oai/bedrock.py +0 -600
  100. autogen/oai/cerebras.py +0 -264
  101. autogen/oai/client.py +0 -1148
  102. autogen/oai/client_utils.py +0 -167
  103. autogen/oai/cohere.py +0 -453
  104. autogen/oai/completion.py +0 -1216
  105. autogen/oai/gemini.py +0 -469
  106. autogen/oai/groq.py +0 -281
  107. autogen/oai/mistral.py +0 -279
  108. autogen/oai/ollama.py +0 -576
  109. autogen/oai/openai_utils.py +0 -810
  110. autogen/oai/together.py +0 -343
  111. autogen/retrieve_utils.py +0 -487
  112. autogen/runtime_logging.py +0 -163
  113. autogen/token_count_utils.py +0 -257
  114. autogen/types.py +0 -20
  115. autogen/version.py +0 -7
  116. {ag2-0.4b1.dist-info → ag2-0.4.2b1.dist-info}/LICENSE +0 -0
  117. {ag2-0.4b1.dist-info → ag2-0.4.2b1.dist-info}/NOTICE.md +0 -0
  118. {ag2-0.4b1.dist-info → ag2-0.4.2b1.dist-info}/WHEEL +0 -0
autogen/retrieve_utils.py DELETED
@@ -1,487 +0,0 @@
1
- # Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai
2
- #
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
- # Portions derived from https://github.com/microsoft/autogen are under the MIT License.
6
- # SPDX-License-Identifier: MIT
7
- import glob
8
- import hashlib
9
- import os
10
- import re
11
- from typing import Callable, List, Tuple, Union
12
- from urllib.parse import urlparse
13
-
14
- import chromadb
15
- import markdownify
16
- import requests
17
- from bs4 import BeautifulSoup
18
-
19
- if chromadb.__version__ < "0.4.15":
20
- from chromadb.api import API
21
- else:
22
- from chromadb.api import ClientAPI as API
23
- import logging
24
-
25
- import chromadb.utils.embedding_functions as ef
26
- import pypdf
27
- from chromadb.api.types import QueryResult
28
-
29
- from autogen.token_count_utils import count_token
30
-
31
- try:
32
- from unstructured.partition.auto import partition
33
-
34
- HAS_UNSTRUCTURED = True
35
- except ImportError:
36
- HAS_UNSTRUCTURED = False
37
-
38
- logger = logging.getLogger(__name__)
39
- TEXT_FORMATS = [
40
- "txt",
41
- "json",
42
- "csv",
43
- "tsv",
44
- "md",
45
- "html",
46
- "htm",
47
- "rtf",
48
- "rst",
49
- "jsonl",
50
- "log",
51
- "xml",
52
- "yaml",
53
- "yml",
54
- "pdf",
55
- ]
56
- UNSTRUCTURED_FORMATS = [
57
- "doc",
58
- "docx",
59
- "epub",
60
- "msg",
61
- "odt",
62
- "org",
63
- "pdf",
64
- "ppt",
65
- "pptx",
66
- "rtf",
67
- "rst",
68
- "xlsx",
69
- ] # These formats will be parsed by the 'unstructured' library, if installed.
70
- if HAS_UNSTRUCTURED:
71
- TEXT_FORMATS += UNSTRUCTURED_FORMATS
72
- TEXT_FORMATS = list(set(TEXT_FORMATS))
73
- VALID_CHUNK_MODES = frozenset({"one_line", "multi_lines"})
74
- RAG_MINIMUM_MESSAGE_LENGTH = int(os.environ.get("RAG_MINIMUM_MESSAGE_LENGTH", 5))
75
-
76
-
77
def split_text_to_chunks(
    text: str,
    max_tokens: int = 4000,
    chunk_mode: str = "multi_lines",
    must_break_at_empty_line: bool = True,
    overlap: int = 0,  # number of overlapping lines
):
    """Split a long text into chunks of at most ``max_tokens`` tokens each.

    Args:
        text (str): the text to split.
        max_tokens (Optional, int): token budget per chunk. Default is 4000.
        chunk_mode (Optional, str): "multi_lines" or "one_line". Default is "multi_lines".
        must_break_at_empty_line (Optional, bool): only cut chunks at blank lines.
            Forced to False in "one_line" mode. Default is True.
        overlap (Optional, int): number of lines repeated between consecutive chunks.

    Returns:
        A list of chunk strings; fragments shorter than RAG_MINIMUM_MESSAGE_LENGTH
        characters are dropped.
    """
    if chunk_mode not in VALID_CHUNK_MODES:
        raise AssertionError
    if chunk_mode == "one_line":
        # Line-by-line mode ignores paragraph boundaries and overlap.
        must_break_at_empty_line = False
        overlap = 0
    chunks = []
    lines = text.split("\n")
    if len(lines) < 3 and must_break_at_empty_line:
        logger.warning("The input text has less than 3 lines. Set `must_break_at_empty_line` to `False`")
        must_break_at_empty_line = False
    token_counts = [count_token(line) for line in lines]
    remaining = sum(token_counts)
    while remaining > max_tokens:
        # Estimate how many leading lines roughly fit in max_tokens (at least 2).
        if chunk_mode == "one_line":
            estimate = 2
        else:
            estimate = max(int(max_tokens / remaining * len(lines)), 2)
        cut = 0
        piece = ""
        # Scan backwards for the largest prefix that fits (and, when required,
        # ends at an empty line).
        for cut in reversed(range(estimate)):
            if must_break_at_empty_line and lines[cut].strip() != "":
                continue
            if sum(token_counts[:cut]) <= max_tokens:
                piece = "\n".join(lines[:cut])
                break
        if cut == 0:
            logger.warning(
                f"max_tokens is too small to fit a single line of text. Breaking this line:\n\t{lines[0][:100]} ..."
            )
            if not must_break_at_empty_line:
                split_len = max(
                    int(max_tokens / (token_counts[0] * 0.9 * len(lines[0]) + 0.1)), RAG_MINIMUM_MESSAGE_LENGTH
                )
                piece = lines[0][:split_len]
                lines[0] = lines[0][split_len:]
                token_counts[0] = count_token(lines[0])
            else:
                logger.warning("Failed to split docs with must_break_at_empty_line being True, set to False.")
                must_break_at_empty_line = False
        # Skip fragments shorter than RAG_MINIMUM_MESSAGE_LENGTH characters.
        if len(piece) >= RAG_MINIMUM_MESSAGE_LENGTH:
            chunks.append(piece)
        keep_from = cut - overlap if cut > overlap else cut
        lines = lines[keep_from:]
        token_counts = token_counts[keep_from:]
        remaining = sum(token_counts)
    tail = "\n".join(lines).strip()
    if len(tail) >= RAG_MINIMUM_MESSAGE_LENGTH:
        chunks.append(tail)
    return chunks
136
-
137
-
138
def extract_text_from_pdf(file: str) -> str:
    """Extract all page text from a PDF file.

    Returns an empty string if an encrypted file cannot be decrypted with an
    empty password.
    """
    extracted = ""
    with open(file, "rb") as fh:
        reader = pypdf.PdfReader(fh)
        if reader.is_encrypted:
            # Try the empty owner password; give up quietly on failure.
            try:
                reader.decrypt("")
            except pypdf.errors.FileNotDecryptedError as e:
                logger.warning(f"Could not decrypt PDF {file}, {e}")
                return extracted

        for page in reader.pages:
            extracted += page.extract_text()

    if not extracted.strip():  # Debugging line to check if text is empty
        logger.warning(f"Could not decrypt PDF {file}")

    return extracted
158
-
159
-
160
def split_files_to_chunks(
    files: list,
    max_tokens: int = 4000,
    chunk_mode: str = "multi_lines",
    must_break_at_empty_line: bool = True,
    custom_text_split_function: Callable = None,
) -> Tuple[List[str], List[dict]]:
    """Split a list of files into text chunks of at most ``max_tokens`` tokens.

    Args:
        files (list): file paths, or ``(path, url)`` tuples as produced by
            ``get_file_from_url``.
        max_tokens / chunk_mode / must_break_at_empty_line: forwarded to
            ``split_text_to_chunks`` unless a custom splitter is given.
        custom_text_split_function (Optional, Callable): maps a text string to a
            list of chunk strings, replacing the default splitter.

    Returns:
        ``(chunks, sources)`` where ``sources[i]`` is ``{"source": url-or-path}``
        for chunk ``i``.
    """
    all_chunks: List[str] = []
    all_sources: List[dict] = []

    for entry in files:
        # Entries may be (path, url) tuples; keep the url for source attribution.
        if isinstance(entry, tuple):
            path, origin = entry[0], entry[1]
        else:
            path, origin = entry, None
        ext = os.path.splitext(path)[1].lower()

        if HAS_UNSTRUCTURED and ext[1:] in UNSTRUCTURED_FORMATS:
            elements = partition(path)
            content = "\n".join([el.text for el in elements]) if len(elements) > 0 else ""
        elif ext == ".pdf":
            content = extract_text_from_pdf(path)
        else:
            # Treat everything else as plain text.
            with open(path, "r", encoding="utf-8", errors="ignore") as fh:
                content = fh.read()

        if not content.strip():  # Debugging line to check if text is empty after reading
            logger.warning(f"No text available in file: {path}")
            continue  # Skip to the next file if no text is available

        if custom_text_split_function is not None:
            file_chunks = custom_text_split_function(content)
        else:
            file_chunks = split_text_to_chunks(content, max_tokens, chunk_mode, must_break_at_empty_line)
        all_chunks += file_chunks
        all_sources += [{"source": origin if origin else path}] * len(file_chunks)

    return all_chunks, all_sources
202
-
203
-
204
def get_files_from_dir(dir_path: Union[str, List[str]], types: list = TEXT_FORMATS, recursive: bool = True):
    """Return all files under a directory, a URL, a file path, or a list of them.

    Args:
        dir_path: a directory, file path, URL, or a list mixing all three.
        types (Optional, list): file extensions to collect (with or without a
            leading dot); matched case-insensitively. Default is TEXT_FORMATS.
        recursive (Optional, bool): search subdirectories. Default is True.

    Raises:
        ValueError: if ``types`` is empty or ``dir_path`` is a non-existent directory.
    """
    if len(types) == 0:
        raise ValueError("types cannot be empty.")
    # Normalize extensions (strip leading dot, lowercase) and also match uppercase.
    exts = [t[1:].lower() if t.startswith(".") else t.lower() for t in set(types)]
    exts += [t.upper() for t in exts]

    files = []
    # A list of files, urls, or directories: resolve each item in turn.
    if isinstance(dir_path, list):
        for item in dir_path:
            if os.path.isfile(item):
                files.append(item)
            elif is_url(item):
                downloaded = get_file_from_url(item)
                if downloaded:
                    files.append(downloaded)
            elif os.path.exists(item):
                try:
                    files.extend(get_files_from_dir(item, types, recursive))
                except ValueError:
                    logger.warning(f"Directory {item} does not exist. Skipping.")
            else:
                logger.warning(f"File {item} does not exist. Skipping.")
        return files

    # A single file path.
    if os.path.isfile(dir_path):
        return [dir_path]

    # A single url: download it and return the local copy.
    if is_url(dir_path):
        downloaded = get_file_from_url(dir_path)
        return [downloaded] if downloaded else []

    if os.path.exists(dir_path):
        for ext in exts:  # renamed from `type`, which shadowed the builtin
            pattern = f"**/*.{ext}" if recursive else f"*.{ext}"
            files += glob.glob(os.path.join(dir_path, pattern), recursive=recursive)
    else:
        logger.error(f"Directory {dir_path} does not exist.")
        raise ValueError(f"Directory {dir_path} does not exist.")
    return files
252
-
253
-
254
def parse_html_to_markdown(html: str, url: str = None) -> str:
    """Parse HTML to markdown, prefixed with the page title as an H1 heading.

    Args:
        html (str): raw HTML to convert.
        url (Optional, str): source url; Wikipedia pages get special handling to
            extract just the article body.

    Returns:
        The markdown text with collapsed blank lines.
    """
    soup = BeautifulSoup(html, "html.parser")
    # Fix: pages without a <title> tag (or with an empty one) used to raise
    # AttributeError/TypeError here; fall back to an empty title instead.
    title = soup.title.string if soup.title and soup.title.string else ""
    # Remove javascript and style blocks
    for script in soup(["script", "style"]):
        script.extract()

    # Convert to markdown -- Wikipedia gets special attention to get a clean version of the page
    if isinstance(url, str) and url.startswith("https://en.wikipedia.org/"):
        body_elm = soup.find("div", {"id": "mw-content-text"})
        title_elm = soup.find("span", {"class": "mw-page-title-main"})

        if body_elm:
            # Prefer the on-page article title over the document title.
            main_title = title
            if title_elm and len(title_elm) > 0:
                main_title = title_elm.string
            webpage_text = "# " + main_title + "\n\n" + markdownify.MarkdownConverter().convert_soup(body_elm)
        else:
            webpage_text = markdownify.MarkdownConverter().convert_soup(soup)
    else:
        webpage_text = markdownify.MarkdownConverter().convert_soup(soup)

    # Convert newlines
    webpage_text = re.sub(r"\r\n", "\n", webpage_text)
    webpage_text = re.sub(r"\n{2,}", "\n\n", webpage_text).strip()
    webpage_text = "# " + title + "\n\n" + webpage_text
    return webpage_text
283
-
284
-
285
- def _generate_file_name_from_url(url: str, max_length=255) -> str:
286
- url_bytes = url.encode("utf-8")
287
- hash = hashlib.blake2b(url_bytes).hexdigest()
288
- parsed_url = urlparse(url)
289
- file_name = os.path.basename(url)
290
- file_name = f"{parsed_url.netloc}_{file_name}_{hash[:min(8, max_length-len(parsed_url.netloc)-len(file_name)-1)]}"
291
- return file_name
292
-
293
-
294
def get_file_from_url(url: str, save_path: str = None) -> Tuple[str, str]:
    """Download ``url`` to ``save_path`` and return ``(save_path, url)``, or None on failure.

    HTML responses are converted to markdown before saving; any other content
    type is streamed to disk as raw bytes.
    """
    if save_path is None:
        save_path = "tmp/chromadb"
        os.makedirs(save_path, exist_ok=True)
    if os.path.isdir(save_path):
        # Destination is a directory: derive a stable filename from the url.
        save_path = os.path.join(save_path, _generate_file_name_from_url(url))
    else:
        os.makedirs(os.path.dirname(save_path), exist_ok=True)

    # Some servers reject requests without a browser-like User-Agent.
    custom_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
    }
    try:
        response = requests.get(url, stream=True, headers=custom_headers, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logger.warning(f"Failed to download {url}, {e}")
        return None

    if "text/html" in response.headers.get("content-type", ""):
        # Accumulate the decoded body, convert to markdown, save as utf-8 text.
        fragments = []
        for chunk in response.iter_content(chunk_size=8192, decode_unicode=True):
            fragments.append(chunk)
        markdown_text = parse_html_to_markdown("".join(fragments), url)
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(markdown_text)
    else:
        with open(save_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    return save_path, url
329
-
330
-
331
def is_url(string: str):
    """Return True if *string* parses as a URL with both a scheme and a host."""
    try:
        parts = urlparse(string)
    except ValueError:
        return False
    return bool(parts.scheme) and bool(parts.netloc)
338
-
339
-
340
def create_vector_db_from_dir(
    dir_path: Union[str, List[str]],
    max_tokens: int = 4000,
    client: API = None,
    db_path: str = "tmp/chromadb.db",
    collection_name: str = "all-my-documents",
    get_or_create: bool = False,
    chunk_mode: str = "multi_lines",
    must_break_at_empty_line: bool = True,
    embedding_model: str = "all-MiniLM-L6-v2",
    embedding_function: Callable = None,
    custom_text_split_function: Callable = None,
    custom_text_types: List[str] = TEXT_FORMATS,
    recursive: bool = True,
    extra_docs: bool = False,
) -> API:
    """Create a chromadb vector db from all files under a directory, a single file, or a url.

    Not required if you prepared your own vector db; any chromadb-compatible API works.

    Args:
        dir_path (Union[str, List[str]]): directory, file, url, or a list of them.
        max_tokens (Optional, int): maximum tokens per chunk. Default is 4000.
        client (Optional, API): chromadb client; a PersistentClient at ``db_path``
            is created when None.
        db_path (Optional, str): chromadb path. Default is "tmp/chromadb.db"
            (was `/tmp/chromadb.db` for version <=0.2.24).
        collection_name (Optional, str): collection name. Default is "all-my-documents".
        get_or_create (Optional, bool): reuse an existing collection instead of
            failing with ValueError. Default is False.
        chunk_mode (Optional, str): "multi_lines" or "one_line". Default is "multi_lines".
        must_break_at_empty_line (Optional, bool): only cut chunks at blank lines.
        embedding_model (Optional, str): SentenceTransformer model used when
            ``embedding_function`` is None. Default is "all-MiniLM-L6-v2".
        embedding_function (Optional, Callable): custom embedding function
            (OpenAI, Cohere, HuggingFace, ... — see https://docs.trychroma.com/embeddings).
        custom_text_split_function (Optional, Callable): custom text splitter;
            defaults to ``split_text_to_chunks``.
        custom_text_types (Optional, List[str]): file types to process. Default is TEXT_FORMATS.
        recursive (Optional, bool): search documents recursively. Default is True.
        extra_docs (Optional, bool): append to the documents already in the collection.

    Returns:
        The chromadb client.
    """
    if client is None:
        client = chromadb.PersistentClient(path=db_path)
    try:
        if embedding_function is None:
            embedding_function = ef.SentenceTransformerEmbeddingFunction(embedding_model)
        collection = client.create_collection(
            collection_name,
            get_or_create=get_or_create,
            embedding_function=embedding_function,
            # https://github.com/nmslib/hnswlib#supported-distances
            # https://github.com/chroma-core/chroma/blob/566bc80f6c8ee29f7d99b6322654f32183c368c4/chromadb/segment/impl/vector/local_hnsw.py#L184
            # https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
            metadata={"hnsw:space": "ip", "hnsw:construction_ef": 30, "hnsw:M": 32},  # ip, l2, cosine
        )

        # When appending to an existing collection, offset ids by its current size.
        doc_offset = len(collection.get()["ids"]) if extra_docs else 0

        source_files = get_files_from_dir(dir_path, custom_text_types, recursive)
        if custom_text_split_function is not None:
            chunks, sources = split_files_to_chunks(
                source_files,
                custom_text_split_function=custom_text_split_function,
            )
        else:
            chunks, sources = split_files_to_chunks(source_files, max_tokens, chunk_mode, must_break_at_empty_line)
        logger.info(f"Found {len(chunks)} chunks.")
        # Upsert in batches of at most 40000 documents. NOTE: with zero chunks the
        # range step is 0, which raises ValueError and lands in the handler below
        # (original behavior, preserved).
        batch = min(40000, len(chunks))
        for i in range(0, len(chunks), batch):
            end_idx = i + min(40000, len(chunks) - i)
            collection.upsert(
                documents=chunks[i:end_idx],
                ids=[f"doc_{j+doc_offset}" for j in range(i, end_idx)],  # unique for each doc
                metadatas=sources[i:end_idx],
            )
    except ValueError as e:
        logger.warning(f"{e}")
    return client
431
-
432
-
433
def query_vector_db(
    query_texts: List[str],
    n_results: int = 10,
    client: API = None,
    db_path: str = "tmp/chromadb.db",
    collection_name: str = "all-my-documents",
    search_string: str = "",
    embedding_model: str = "all-MiniLM-L6-v2",
    embedding_function: Callable = None,
) -> QueryResult:
    """Query a chromadb-compatible vector db for documents similar to ``query_texts``.

    Not required if you prepared your own vector db and query function.

    Args:
        query_texts (List[str]): strings to embed and search for.
        n_results (Optional, int): number of results per query. Default is 10.
        client (Optional, API): chromadb-compatible client; a PersistentClient at
            ``db_path`` is created when None.
        db_path (Optional, str): path to the vector db. Default is "tmp/chromadb.db"
            (was `/tmp/chromadb.db` for version <=0.2.24).
        collection_name (Optional, str): collection to query. Default is "all-my-documents".
        search_string (Optional, str): only docs containing an exact match of this
            string are retrieved. Default is "".
        embedding_model (Optional, str): SentenceTransformer model used when
            ``embedding_function`` is None. Default is "all-MiniLM-L6-v2".
        embedding_function (Optional, Callable): custom embedding function
            (OpenAI, Cohere, HuggingFace, ... — see https://docs.trychroma.com/embeddings).

    Returns:
        A chromadb ``QueryResult`` with ids, documents, metadatas, embeddings and distances.
    """
    if client is None:
        client = chromadb.PersistentClient(path=db_path)
    collection = client.get_collection(collection_name)
    # The stored collection always reports the default embedding function, so we
    # embed the queries ourselves with the function the collection was actually
    # created with, and pass the embeddings to query().
    if embedding_function is None:
        embedding_function = ef.SentenceTransformerEmbeddingFunction(embedding_model)
    query_embeddings = embedding_function(query_texts)
    where_document = {"$contains": search_string} if search_string else None  # optional filter
    # Query/search n most similar results. You can also .get by id
    return collection.query(
        query_embeddings=query_embeddings,
        n_results=n_results,
        where_document=where_document,
    )
@@ -1,163 +0,0 @@
1
- # Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai
2
- #
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
- # Portions derived from https://github.com/microsoft/autogen are under the MIT License.
6
- # SPDX-License-Identifier: MIT
7
- from __future__ import annotations
8
-
9
- import logging
10
- import sqlite3
11
- import uuid
12
- from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, TypeVar, Union
13
-
14
- from openai import AzureOpenAI, OpenAI
15
- from openai.types.chat import ChatCompletion
16
-
17
- from autogen.logger.base_logger import BaseLogger, LLMConfig
18
- from autogen.logger.logger_factory import LoggerFactory
19
-
20
- if TYPE_CHECKING:
21
- from autogen import Agent, ConversableAgent, OpenAIWrapper
22
- from autogen.oai.anthropic import AnthropicClient
23
- from autogen.oai.bedrock import BedrockClient
24
- from autogen.oai.cerebras import CerebrasClient
25
- from autogen.oai.cohere import CohereClient
26
- from autogen.oai.gemini import GeminiClient
27
- from autogen.oai.groq import GroqClient
28
- from autogen.oai.mistral import MistralAIClient
29
- from autogen.oai.ollama import OllamaClient
30
- from autogen.oai.together import TogetherClient
31
-
32
- logger = logging.getLogger(__name__)
33
-
34
- autogen_logger = None
35
- is_logging = False
36
-
37
- F = TypeVar("F", bound=Callable[..., Any])
38
-
39
-
40
def start(
    logger: Optional[BaseLogger] = None,
    logger_type: Literal["sqlite", "file"] = "sqlite",
    config: Optional[Dict[str, Any]] = None,
) -> str:
    """
    Start logging for the runtime.

    Args:
        logger (BaseLogger): A logger instance
        logger_type (str): The type of logger to use (default: sqlite)
        config (dict): Configuration for the logger

    Returns:
        session_id (str(uuid.uuid4)): a unique id for the logging session,
        or "" if the logger failed to start.
    """
    global autogen_logger
    global is_logging

    if logger:
        autogen_logger = logger
    else:
        autogen_logger = LoggerFactory.get_logger(logger_type=logger_type, config=config)

    # Fixes: (1) the error path previously called .error() on the `logger`
    # *parameter*, which is usually None -> AttributeError; (2) `session_id` was
    # unbound when start() raised, and `return` inside `finally` masked the real
    # exception with a NameError.
    session_id = ""
    try:
        session_id = autogen_logger.start()
        is_logging = True
    except Exception as e:
        logging.getLogger(__name__).error(f"[runtime logging] Failed to start logging: {e}")
    return session_id
69
-
70
-
71
def log_chat_completion(
    invocation_id: uuid.UUID,
    client_id: int,
    wrapper_id: int,
    agent: Union[str, Agent],
    request: Dict[str, Union[float, str, List[Dict[str, str]]]],
    response: Union[str, ChatCompletion],
    is_cached: int,
    cost: float,
    start_time: str,
) -> None:
    """Forward a chat-completion record to the active runtime logger, if any."""
    if autogen_logger is not None:
        autogen_logger.log_chat_completion(
            invocation_id, client_id, wrapper_id, agent, request, response, is_cached, cost, start_time
        )
    else:
        logger.error("[runtime logging] log_chat_completion: autogen logger is None")
89
-
90
-
91
def log_new_agent(agent: ConversableAgent, init_args: Dict[str, Any]) -> None:
    """Record the creation of a ConversableAgent with the active runtime logger, if any."""
    if autogen_logger is not None:
        autogen_logger.log_new_agent(agent, init_args)
    else:
        logger.error("[runtime logging] log_new_agent: autogen logger is None")
97
-
98
-
99
def log_event(source: Union[str, Agent], name: str, **kwargs: Dict[str, Any]) -> None:
    """Record a named event (with arbitrary payload) via the active runtime logger, if any."""
    if autogen_logger is not None:
        autogen_logger.log_event(source, name, **kwargs)
    else:
        logger.error("[runtime logging] log_event: autogen logger is None")
105
-
106
-
107
def log_function_use(agent: Union[str, Agent], function: F, args: Dict[str, Any], returns: Any) -> None:
    """Record a tool/function invocation and its result with the active runtime logger.

    Fix: ``returns`` was annotated with the builtin ``any`` function instead of
    ``typing.Any`` (already imported by this module); also added the missing
    ``-> None`` return annotation for consistency with the sibling log_* helpers.
    """
    if autogen_logger is None:
        logger.error("[runtime logging] log_function_use: autogen logger is None")
        return

    autogen_logger.log_function_use(agent, function, args, returns)
113
-
114
-
115
def log_new_wrapper(wrapper: OpenAIWrapper, init_args: Dict[str, Union[LLMConfig, List[LLMConfig]]]) -> None:
    """Record the creation of an OpenAIWrapper with the active runtime logger, if any."""
    if autogen_logger is not None:
        autogen_logger.log_new_wrapper(wrapper, init_args)
    else:
        logger.error("[runtime logging] log_new_wrapper: autogen logger is None")
121
-
122
-
123
def log_new_client(
    client: Union[
        AzureOpenAI,
        OpenAI,
        CerebrasClient,
        GeminiClient,
        AnthropicClient,
        MistralAIClient,
        TogetherClient,
        GroqClient,
        CohereClient,
        OllamaClient,
        BedrockClient,
    ],
    wrapper: OpenAIWrapper,
    init_args: Dict[str, Any],
) -> None:
    """Record the creation of a model client with the active runtime logger, if any."""
    if autogen_logger is not None:
        autogen_logger.log_new_client(client, wrapper, init_args)
    else:
        logger.error("[runtime logging] log_new_client: autogen logger is None")
145
-
146
-
147
def stop() -> None:
    """Stop the active runtime logger (if any) and mark logging as disabled."""
    global is_logging
    if autogen_logger:
        autogen_logger.stop()
    is_logging = False
152
-
153
-
154
def get_connection() -> Union[None, sqlite3.Connection]:
    """Return the active runtime logger's sqlite connection, or None when no logger is set."""
    if autogen_logger is not None:
        return autogen_logger.get_connection()
    logger.error("[runtime logging] get_connection: autogen logger is None")
    return None
160
-
161
-
162
def logging_enabled() -> bool:
    """Return True while a runtime logging session is active (between start() and stop())."""
    return is_logging