ag2 0.4.1__py3-none-any.whl → 0.5.0b2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of ag2 might be problematic.

Files changed (160)
  1. {ag2-0.4.1.dist-info → ag2-0.5.0b2.dist-info}/METADATA +5 -146
  2. ag2-0.5.0b2.dist-info/RECORD +6 -0
  3. ag2-0.5.0b2.dist-info/top_level.txt +1 -0
  4. ag2-0.4.1.dist-info/RECORD +0 -158
  5. ag2-0.4.1.dist-info/top_level.txt +0 -1
  6. autogen/__init__.py +0 -17
  7. autogen/_pydantic.py +0 -116
  8. autogen/agentchat/__init__.py +0 -42
  9. autogen/agentchat/agent.py +0 -142
  10. autogen/agentchat/assistant_agent.py +0 -85
  11. autogen/agentchat/chat.py +0 -306
  12. autogen/agentchat/contrib/__init__.py +0 -0
  13. autogen/agentchat/contrib/agent_builder.py +0 -788
  14. autogen/agentchat/contrib/agent_eval/agent_eval.py +0 -107
  15. autogen/agentchat/contrib/agent_eval/criterion.py +0 -47
  16. autogen/agentchat/contrib/agent_eval/critic_agent.py +0 -47
  17. autogen/agentchat/contrib/agent_eval/quantifier_agent.py +0 -42
  18. autogen/agentchat/contrib/agent_eval/subcritic_agent.py +0 -48
  19. autogen/agentchat/contrib/agent_eval/task.py +0 -43
  20. autogen/agentchat/contrib/agent_optimizer.py +0 -450
  21. autogen/agentchat/contrib/capabilities/__init__.py +0 -0
  22. autogen/agentchat/contrib/capabilities/agent_capability.py +0 -21
  23. autogen/agentchat/contrib/capabilities/generate_images.py +0 -297
  24. autogen/agentchat/contrib/capabilities/teachability.py +0 -406
  25. autogen/agentchat/contrib/capabilities/text_compressors.py +0 -72
  26. autogen/agentchat/contrib/capabilities/transform_messages.py +0 -92
  27. autogen/agentchat/contrib/capabilities/transforms.py +0 -565
  28. autogen/agentchat/contrib/capabilities/transforms_util.py +0 -120
  29. autogen/agentchat/contrib/capabilities/vision_capability.py +0 -217
  30. autogen/agentchat/contrib/captainagent/tools/__init__.py +0 -0
  31. autogen/agentchat/contrib/captainagent/tools/data_analysis/calculate_correlation.py +0 -41
  32. autogen/agentchat/contrib/captainagent/tools/data_analysis/calculate_skewness_and_kurtosis.py +0 -29
  33. autogen/agentchat/contrib/captainagent/tools/data_analysis/detect_outlier_iqr.py +0 -29
  34. autogen/agentchat/contrib/captainagent/tools/data_analysis/detect_outlier_zscore.py +0 -29
  35. autogen/agentchat/contrib/captainagent/tools/data_analysis/explore_csv.py +0 -22
  36. autogen/agentchat/contrib/captainagent/tools/data_analysis/shapiro_wilk_test.py +0 -31
  37. autogen/agentchat/contrib/captainagent/tools/information_retrieval/arxiv_download.py +0 -26
  38. autogen/agentchat/contrib/captainagent/tools/information_retrieval/arxiv_search.py +0 -55
  39. autogen/agentchat/contrib/captainagent/tools/information_retrieval/extract_pdf_image.py +0 -54
  40. autogen/agentchat/contrib/captainagent/tools/information_retrieval/extract_pdf_text.py +0 -39
  41. autogen/agentchat/contrib/captainagent/tools/information_retrieval/get_wikipedia_text.py +0 -22
  42. autogen/agentchat/contrib/captainagent/tools/information_retrieval/get_youtube_caption.py +0 -35
  43. autogen/agentchat/contrib/captainagent/tools/information_retrieval/image_qa.py +0 -61
  44. autogen/agentchat/contrib/captainagent/tools/information_retrieval/optical_character_recognition.py +0 -62
  45. autogen/agentchat/contrib/captainagent/tools/information_retrieval/perform_web_search.py +0 -48
  46. autogen/agentchat/contrib/captainagent/tools/information_retrieval/scrape_wikipedia_tables.py +0 -34
  47. autogen/agentchat/contrib/captainagent/tools/information_retrieval/transcribe_audio_file.py +0 -22
  48. autogen/agentchat/contrib/captainagent/tools/information_retrieval/youtube_download.py +0 -36
  49. autogen/agentchat/contrib/captainagent/tools/math/calculate_circle_area_from_diameter.py +0 -22
  50. autogen/agentchat/contrib/captainagent/tools/math/calculate_day_of_the_week.py +0 -19
  51. autogen/agentchat/contrib/captainagent/tools/math/calculate_fraction_sum.py +0 -29
  52. autogen/agentchat/contrib/captainagent/tools/math/calculate_matrix_power.py +0 -32
  53. autogen/agentchat/contrib/captainagent/tools/math/calculate_reflected_point.py +0 -17
  54. autogen/agentchat/contrib/captainagent/tools/math/complex_numbers_product.py +0 -26
  55. autogen/agentchat/contrib/captainagent/tools/math/compute_currency_conversion.py +0 -24
  56. autogen/agentchat/contrib/captainagent/tools/math/count_distinct_permutations.py +0 -28
  57. autogen/agentchat/contrib/captainagent/tools/math/evaluate_expression.py +0 -29
  58. autogen/agentchat/contrib/captainagent/tools/math/find_continuity_point.py +0 -35
  59. autogen/agentchat/contrib/captainagent/tools/math/fraction_to_mixed_numbers.py +0 -40
  60. autogen/agentchat/contrib/captainagent/tools/math/modular_inverse_sum.py +0 -23
  61. autogen/agentchat/contrib/captainagent/tools/math/simplify_mixed_numbers.py +0 -37
  62. autogen/agentchat/contrib/captainagent/tools/math/sum_of_digit_factorials.py +0 -16
  63. autogen/agentchat/contrib/captainagent/tools/math/sum_of_primes_below.py +0 -16
  64. autogen/agentchat/contrib/captainagent/tools/requirements.txt +0 -10
  65. autogen/agentchat/contrib/captainagent/tools/tool_description.tsv +0 -34
  66. autogen/agentchat/contrib/captainagent.py +0 -490
  67. autogen/agentchat/contrib/gpt_assistant_agent.py +0 -545
  68. autogen/agentchat/contrib/graph_rag/__init__.py +0 -0
  69. autogen/agentchat/contrib/graph_rag/document.py +0 -30
  70. autogen/agentchat/contrib/graph_rag/falkor_graph_query_engine.py +0 -111
  71. autogen/agentchat/contrib/graph_rag/falkor_graph_rag_capability.py +0 -81
  72. autogen/agentchat/contrib/graph_rag/graph_query_engine.py +0 -56
  73. autogen/agentchat/contrib/graph_rag/graph_rag_capability.py +0 -64
  74. autogen/agentchat/contrib/img_utils.py +0 -390
  75. autogen/agentchat/contrib/llamaindex_conversable_agent.py +0 -123
  76. autogen/agentchat/contrib/llava_agent.py +0 -176
  77. autogen/agentchat/contrib/math_user_proxy_agent.py +0 -471
  78. autogen/agentchat/contrib/multimodal_conversable_agent.py +0 -128
  79. autogen/agentchat/contrib/qdrant_retrieve_user_proxy_agent.py +0 -325
  80. autogen/agentchat/contrib/retrieve_assistant_agent.py +0 -56
  81. autogen/agentchat/contrib/retrieve_user_proxy_agent.py +0 -705
  82. autogen/agentchat/contrib/society_of_mind_agent.py +0 -203
  83. autogen/agentchat/contrib/swarm_agent.py +0 -463
  84. autogen/agentchat/contrib/text_analyzer_agent.py +0 -76
  85. autogen/agentchat/contrib/tool_retriever.py +0 -120
  86. autogen/agentchat/contrib/vectordb/__init__.py +0 -0
  87. autogen/agentchat/contrib/vectordb/base.py +0 -243
  88. autogen/agentchat/contrib/vectordb/chromadb.py +0 -326
  89. autogen/agentchat/contrib/vectordb/mongodb.py +0 -559
  90. autogen/agentchat/contrib/vectordb/pgvectordb.py +0 -958
  91. autogen/agentchat/contrib/vectordb/qdrant.py +0 -334
  92. autogen/agentchat/contrib/vectordb/utils.py +0 -126
  93. autogen/agentchat/contrib/web_surfer.py +0 -305
  94. autogen/agentchat/conversable_agent.py +0 -2908
  95. autogen/agentchat/groupchat.py +0 -1668
  96. autogen/agentchat/user_proxy_agent.py +0 -109
  97. autogen/agentchat/utils.py +0 -207
  98. autogen/browser_utils.py +0 -291
  99. autogen/cache/__init__.py +0 -10
  100. autogen/cache/abstract_cache_base.py +0 -78
  101. autogen/cache/cache.py +0 -182
  102. autogen/cache/cache_factory.py +0 -85
  103. autogen/cache/cosmos_db_cache.py +0 -150
  104. autogen/cache/disk_cache.py +0 -109
  105. autogen/cache/in_memory_cache.py +0 -61
  106. autogen/cache/redis_cache.py +0 -128
  107. autogen/code_utils.py +0 -745
  108. autogen/coding/__init__.py +0 -22
  109. autogen/coding/base.py +0 -113
  110. autogen/coding/docker_commandline_code_executor.py +0 -262
  111. autogen/coding/factory.py +0 -45
  112. autogen/coding/func_with_reqs.py +0 -203
  113. autogen/coding/jupyter/__init__.py +0 -22
  114. autogen/coding/jupyter/base.py +0 -32
  115. autogen/coding/jupyter/docker_jupyter_server.py +0 -164
  116. autogen/coding/jupyter/embedded_ipython_code_executor.py +0 -182
  117. autogen/coding/jupyter/jupyter_client.py +0 -224
  118. autogen/coding/jupyter/jupyter_code_executor.py +0 -161
  119. autogen/coding/jupyter/local_jupyter_server.py +0 -168
  120. autogen/coding/local_commandline_code_executor.py +0 -410
  121. autogen/coding/markdown_code_extractor.py +0 -44
  122. autogen/coding/utils.py +0 -57
  123. autogen/exception_utils.py +0 -46
  124. autogen/extensions/__init__.py +0 -0
  125. autogen/formatting_utils.py +0 -76
  126. autogen/function_utils.py +0 -362
  127. autogen/graph_utils.py +0 -148
  128. autogen/io/__init__.py +0 -15
  129. autogen/io/base.py +0 -105
  130. autogen/io/console.py +0 -43
  131. autogen/io/websockets.py +0 -213
  132. autogen/logger/__init__.py +0 -11
  133. autogen/logger/base_logger.py +0 -140
  134. autogen/logger/file_logger.py +0 -287
  135. autogen/logger/logger_factory.py +0 -29
  136. autogen/logger/logger_utils.py +0 -42
  137. autogen/logger/sqlite_logger.py +0 -459
  138. autogen/math_utils.py +0 -356
  139. autogen/oai/__init__.py +0 -33
  140. autogen/oai/anthropic.py +0 -428
  141. autogen/oai/bedrock.py +0 -606
  142. autogen/oai/cerebras.py +0 -270
  143. autogen/oai/client.py +0 -1148
  144. autogen/oai/client_utils.py +0 -167
  145. autogen/oai/cohere.py +0 -453
  146. autogen/oai/completion.py +0 -1216
  147. autogen/oai/gemini.py +0 -469
  148. autogen/oai/groq.py +0 -281
  149. autogen/oai/mistral.py +0 -279
  150. autogen/oai/ollama.py +0 -582
  151. autogen/oai/openai_utils.py +0 -811
  152. autogen/oai/together.py +0 -343
  153. autogen/retrieve_utils.py +0 -487
  154. autogen/runtime_logging.py +0 -163
  155. autogen/token_count_utils.py +0 -259
  156. autogen/types.py +0 -20
  157. autogen/version.py +0 -7
  158. {ag2-0.4.1.dist-info → ag2-0.5.0b2.dist-info}/LICENSE +0 -0
  159. {ag2-0.4.1.dist-info → ag2-0.5.0b2.dist-info}/NOTICE.md +0 -0
  160. {ag2-0.4.1.dist-info → ag2-0.5.0b2.dist-info}/WHEEL +0 -0
autogen/retrieve_utils.py DELETED
@@ -1,487 +0,0 @@
- # Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai
- #
- # SPDX-License-Identifier: Apache-2.0
- #
- # Portions derived from https://github.com/microsoft/autogen are under the MIT License.
- # SPDX-License-Identifier: MIT
- import glob
- import hashlib
- import os
- import re
- from typing import Callable, List, Tuple, Union
- from urllib.parse import urlparse
-
- import chromadb
- import markdownify
- import requests
- from bs4 import BeautifulSoup
-
- if chromadb.__version__ < "0.4.15":
-     from chromadb.api import API
- else:
-     from chromadb.api import ClientAPI as API
- import logging
-
- import chromadb.utils.embedding_functions as ef
- import pypdf
- from chromadb.api.types import QueryResult
-
- from autogen.token_count_utils import count_token
-
- try:
-     from unstructured.partition.auto import partition
-
-     HAS_UNSTRUCTURED = True
- except ImportError:
-     HAS_UNSTRUCTURED = False
-
- logger = logging.getLogger(__name__)
- TEXT_FORMATS = [
-     "txt",
-     "json",
-     "csv",
-     "tsv",
-     "md",
-     "html",
-     "htm",
-     "rtf",
-     "rst",
-     "jsonl",
-     "log",
-     "xml",
-     "yaml",
-     "yml",
-     "pdf",
- ]
- UNSTRUCTURED_FORMATS = [
-     "doc",
-     "docx",
-     "epub",
-     "msg",
-     "odt",
-     "org",
-     "pdf",
-     "ppt",
-     "pptx",
-     "rtf",
-     "rst",
-     "xlsx",
- ]  # These formats will be parsed by the 'unstructured' library, if installed.
- if HAS_UNSTRUCTURED:
-     TEXT_FORMATS += UNSTRUCTURED_FORMATS
-     TEXT_FORMATS = list(set(TEXT_FORMATS))
- VALID_CHUNK_MODES = frozenset({"one_line", "multi_lines"})
- RAG_MINIMUM_MESSAGE_LENGTH = int(os.environ.get("RAG_MINIMUM_MESSAGE_LENGTH", 5))
-
-
- def split_text_to_chunks(
-     text: str,
-     max_tokens: int = 4000,
-     chunk_mode: str = "multi_lines",
-     must_break_at_empty_line: bool = True,
-     overlap: int = 0,  # number of overlapping lines
- ):
-     """Split a long text into chunks of max_tokens."""
-     if chunk_mode not in VALID_CHUNK_MODES:
-         raise AssertionError
-     if chunk_mode == "one_line":
-         must_break_at_empty_line = False
-         overlap = 0
-     chunks = []
-     lines = text.split("\n")
-     num_lines = len(lines)
-     if num_lines < 3 and must_break_at_empty_line:
-         logger.warning("The input text has less than 3 lines. Set `must_break_at_empty_line` to `False`")
-         must_break_at_empty_line = False
-     lines_tokens = [count_token(line) for line in lines]
-     sum_tokens = sum(lines_tokens)
-     while sum_tokens > max_tokens:
-         if chunk_mode == "one_line":
-             estimated_line_cut = 2
-         else:
-             estimated_line_cut = max(int(max_tokens / sum_tokens * len(lines)), 2)
-         cnt = 0
-         prev = ""
-         for cnt in reversed(range(estimated_line_cut)):
-             if must_break_at_empty_line and lines[cnt].strip() != "":
-                 continue
-             if sum(lines_tokens[:cnt]) <= max_tokens:
-                 prev = "\n".join(lines[:cnt])
-                 break
-         if cnt == 0:
-             logger.warning(
-                 f"max_tokens is too small to fit a single line of text. Breaking this line:\n\t{lines[0][:100]} ..."
-             )
-             if not must_break_at_empty_line:
-                 split_len = max(
-                     int(max_tokens / (lines_tokens[0] * 0.9 * len(lines[0]) + 0.1)), RAG_MINIMUM_MESSAGE_LENGTH
-                 )
-                 prev = lines[0][:split_len]
-                 lines[0] = lines[0][split_len:]
-                 lines_tokens[0] = count_token(lines[0])
-             else:
-                 logger.warning("Failed to split docs with must_break_at_empty_line being True, set to False.")
-                 must_break_at_empty_line = False
-         (
-             chunks.append(prev) if len(prev) >= RAG_MINIMUM_MESSAGE_LENGTH else None
-         )  # don't add chunks less than RAG_MINIMUM_MESSAGE_LENGTH characters
-         lines = lines[cnt - overlap if cnt > overlap else cnt :]
-         lines_tokens = lines_tokens[cnt - overlap if cnt > overlap else cnt :]
-         sum_tokens = sum(lines_tokens)
-     text_to_chunk = "\n".join(lines).strip()
-     (
-         chunks.append(text_to_chunk) if len(text_to_chunk) >= RAG_MINIMUM_MESSAGE_LENGTH else None
-     )  # don't add chunks less than RAG_MINIMUM_MESSAGE_LENGTH characters
-     return chunks
-
-
- def extract_text_from_pdf(file: str) -> str:
-     """Extract text from PDF files"""
-     text = ""
-     with open(file, "rb") as f:
-         reader = pypdf.PdfReader(f)
-         if reader.is_encrypted:  # Check if the PDF is encrypted
-             try:
-                 reader.decrypt("")
-             except pypdf.errors.FileNotDecryptedError as e:
-                 logger.warning(f"Could not decrypt PDF {file}, {e}")
-                 return text  # Return empty text if PDF could not be decrypted
-
-         for page_num in range(len(reader.pages)):
-             page = reader.pages[page_num]
-             text += page.extract_text()
-
-     if not text.strip():  # Debugging line to check if text is empty
-         logger.warning(f"Could not decrypt PDF {file}")
-
-     return text
-
-
- def split_files_to_chunks(
-     files: list,
-     max_tokens: int = 4000,
-     chunk_mode: str = "multi_lines",
-     must_break_at_empty_line: bool = True,
-     custom_text_split_function: Callable = None,
- ) -> Tuple[List[str], List[dict]]:
-     """Split a list of files into chunks of max_tokens."""
-
-     chunks = []
-     sources = []
-
-     for file in files:
-         if isinstance(file, tuple):
-             url = file[1]
-             file = file[0]
-         else:
-             url = None
-         _, file_extension = os.path.splitext(file)
-         file_extension = file_extension.lower()
-
-         if HAS_UNSTRUCTURED and file_extension[1:] in UNSTRUCTURED_FORMATS:
-             text = partition(file)
-             text = "\n".join([t.text for t in text]) if len(text) > 0 else ""
-         elif file_extension == ".pdf":
-             text = extract_text_from_pdf(file)
-         else:  # For non-PDF text-based files
-             with open(file, "r", encoding="utf-8", errors="ignore") as f:
-                 text = f.read()
-
-         if not text.strip():  # Debugging line to check if text is empty after reading
-             logger.warning(f"No text available in file: {file}")
-             continue  # Skip to the next file if no text is available
-
-         if custom_text_split_function is not None:
-             tmp_chunks = custom_text_split_function(text)
-         else:
-             tmp_chunks = split_text_to_chunks(text, max_tokens, chunk_mode, must_break_at_empty_line)
-         chunks += tmp_chunks
-         sources += [{"source": url if url else file}] * len(tmp_chunks)
-
-     return chunks, sources
-
-
- def get_files_from_dir(dir_path: Union[str, List[str]], types: list = TEXT_FORMATS, recursive: bool = True):
-     """Return a list of all the files in a given directory, a url, a file path or a list of them."""
-     if len(types) == 0:
-         raise ValueError("types cannot be empty.")
-     types = [t[1:].lower() if t.startswith(".") else t.lower() for t in set(types)]
-     types += [t.upper() for t in types]
-
-     files = []
-     # If the path is a list of files or urls, process and return them
-     if isinstance(dir_path, list):
-         for item in dir_path:
-             if os.path.isfile(item):
-                 files.append(item)
-             elif is_url(item):
-                 filepath = get_file_from_url(item)
-                 if filepath:
-                     files.append(filepath)
-             elif os.path.exists(item):
-                 try:
-                     files.extend(get_files_from_dir(item, types, recursive))
-                 except ValueError:
-                     logger.warning(f"Directory {item} does not exist. Skipping.")
-             else:
-                 logger.warning(f"File {item} does not exist. Skipping.")
-         return files
-
-     # If the path is a file, return it
-     if os.path.isfile(dir_path):
-         return [dir_path]
-
-     # If the path is a url, download it and return the downloaded file
-     if is_url(dir_path):
-         filepath = get_file_from_url(dir_path)
-         if filepath:
-             return [filepath]
-         else:
-             return []
-
-     if os.path.exists(dir_path):
-         for type in types:
-             if recursive:
-                 files += glob.glob(os.path.join(dir_path, f"**/*.{type}"), recursive=True)
-             else:
-                 files += glob.glob(os.path.join(dir_path, f"*.{type}"), recursive=False)
-     else:
-         logger.error(f"Directory {dir_path} does not exist.")
-         raise ValueError(f"Directory {dir_path} does not exist.")
-     return files
-
-
- def parse_html_to_markdown(html: str, url: str = None) -> str:
-     """Parse HTML to markdown."""
-     soup = BeautifulSoup(html, "html.parser")
-     title = soup.title.string
-     # Remove javascript and style blocks
-     for script in soup(["script", "style"]):
-         script.extract()
-
-     # Convert to markdown -- Wikipedia gets special attention to get a clean version of the page
-     if isinstance(url, str) and url.startswith("https://en.wikipedia.org/"):
-         body_elm = soup.find("div", {"id": "mw-content-text"})
-         title_elm = soup.find("span", {"class": "mw-page-title-main"})
-
-         if body_elm:
-             # What's the title
-             main_title = soup.title.string
-             if title_elm and len(title_elm) > 0:
-                 main_title = title_elm.string
-             webpage_text = "# " + main_title + "\n\n" + markdownify.MarkdownConverter().convert_soup(body_elm)
-         else:
-             webpage_text = markdownify.MarkdownConverter().convert_soup(soup)
-     else:
-         webpage_text = markdownify.MarkdownConverter().convert_soup(soup)
-
-     # Convert newlines
-     webpage_text = re.sub(r"\r\n", "\n", webpage_text)
-     webpage_text = re.sub(r"\n{2,}", "\n\n", webpage_text).strip()
-     webpage_text = "# " + title + "\n\n" + webpage_text
-     return webpage_text
-
-
- def _generate_file_name_from_url(url: str, max_length=255) -> str:
-     url_bytes = url.encode("utf-8")
-     hash = hashlib.blake2b(url_bytes).hexdigest()
-     parsed_url = urlparse(url)
-     file_name = os.path.basename(url)
-     file_name = f"{parsed_url.netloc}_{file_name}_{hash[:min(8, max_length-len(parsed_url.netloc)-len(file_name)-1)]}"
-     return file_name
-
-
- def get_file_from_url(url: str, save_path: str = None) -> Tuple[str, str]:
-     """Download a file from a URL."""
-     if save_path is None:
-         save_path = "tmp/chromadb"
-         os.makedirs(save_path, exist_ok=True)
-     if os.path.isdir(save_path):
-         filename = _generate_file_name_from_url(url)
-         save_path = os.path.join(save_path, filename)
-     else:
-         os.makedirs(os.path.dirname(save_path), exist_ok=True)
-
-     custom_headers = {
-         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
-     }
-     try:
-         response = requests.get(url, stream=True, headers=custom_headers, timeout=30)
-         response.raise_for_status()
-     except requests.exceptions.RequestException as e:
-         logger.warning(f"Failed to download {url}, {e}")
-         return None
-
-     content_type = response.headers.get("content-type", "")
-     if "text/html" in content_type:
-         # Get the content of the response
-         html = ""
-         for chunk in response.iter_content(chunk_size=8192, decode_unicode=True):
-             html += chunk
-         text = parse_html_to_markdown(html, url)
-         with open(save_path, "w", encoding="utf-8") as f:
-             f.write(text)
-     else:
-         with open(save_path, "wb") as f:
-             for chunk in response.iter_content(chunk_size=8192):
-                 f.write(chunk)
-     return save_path, url
-
-
- def is_url(string: str):
-     """Return True if the string is a valid URL."""
-     try:
-         result = urlparse(string)
-         return all([result.scheme, result.netloc])
-     except ValueError:
-         return False
-
-
- def create_vector_db_from_dir(
-     dir_path: Union[str, List[str]],
-     max_tokens: int = 4000,
-     client: API = None,
-     db_path: str = "tmp/chromadb.db",
-     collection_name: str = "all-my-documents",
-     get_or_create: bool = False,
-     chunk_mode: str = "multi_lines",
-     must_break_at_empty_line: bool = True,
-     embedding_model: str = "all-MiniLM-L6-v2",
-     embedding_function: Callable = None,
-     custom_text_split_function: Callable = None,
-     custom_text_types: List[str] = TEXT_FORMATS,
-     recursive: bool = True,
-     extra_docs: bool = False,
- ) -> API:
-     """Create a vector db from all the files in a given directory, the directory can also be a single file or a url to
-     a single file. We support chromadb compatible APIs to create the vector db, this function is not required if
-     you prepared your own vector db.
-
-     Args:
-         dir_path (Union[str, List[str]]): the path to the directory, file, url or a list of them.
-         max_tokens (Optional, int): the maximum number of tokens per chunk. Default is 4000.
-         client (Optional, API): the chromadb client. Default is None.
-         db_path (Optional, str): the path to the chromadb. Default is "tmp/chromadb.db". The default was `/tmp/chromadb.db` for version <=0.2.24.
-         collection_name (Optional, str): the name of the collection. Default is "all-my-documents".
-         get_or_create (Optional, bool): Whether to get or create the collection. Default is False. If True, the collection
-             will be returned if it already exists. Will raise ValueError if the collection already exists and get_or_create is False.
-         chunk_mode (Optional, str): the chunk mode. Default is "multi_lines".
-         must_break_at_empty_line (Optional, bool): Whether to break at empty line. Default is True.
-         embedding_model (Optional, str): the embedding model to use. Default is "all-MiniLM-L6-v2". Will be ignored if
-             embedding_function is not None.
-         embedding_function (Optional, Callable): the embedding function to use. Default is None, SentenceTransformer with
-             the given `embedding_model` will be used. If you want to use OpenAI, Cohere, HuggingFace or other embedding
-             functions, you can pass it here, follow the examples in `https://docs.trychroma.com/embeddings`.
-         custom_text_split_function (Optional, Callable): a custom function to split a string into a list of strings.
-             Default is None, will use the default function in `autogen.retrieve_utils.split_text_to_chunks`.
-         custom_text_types (Optional, List[str]): a list of file types to be processed. Default is TEXT_FORMATS.
-         recursive (Optional, bool): whether to search documents recursively in the dir_path. Default is True.
-         extra_docs (Optional, bool): whether to add more documents in the collection. Default is False
-
-     Returns:
-
-     The chromadb client.
-     """
-     if client is None:
-         client = chromadb.PersistentClient(path=db_path)
-     try:
-         embedding_function = (
-             ef.SentenceTransformerEmbeddingFunction(embedding_model)
-             if embedding_function is None
-             else embedding_function
-         )
-         collection = client.create_collection(
-             collection_name,
-             get_or_create=get_or_create,
-             embedding_function=embedding_function,
-             # https://github.com/nmslib/hnswlib#supported-distances
-             # https://github.com/chroma-core/chroma/blob/566bc80f6c8ee29f7d99b6322654f32183c368c4/chromadb/segment/impl/vector/local_hnsw.py#L184
-             # https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
-             metadata={"hnsw:space": "ip", "hnsw:construction_ef": 30, "hnsw:M": 32},  # ip, l2, cosine
-         )
-
-         length = 0
-         if extra_docs:
-             length = len(collection.get()["ids"])
-
-         if custom_text_split_function is not None:
-             chunks, sources = split_files_to_chunks(
-                 get_files_from_dir(dir_path, custom_text_types, recursive),
-                 custom_text_split_function=custom_text_split_function,
-             )
-         else:
-             chunks, sources = split_files_to_chunks(
-                 get_files_from_dir(dir_path, custom_text_types, recursive),
-                 max_tokens,
-                 chunk_mode,
-                 must_break_at_empty_line,
-             )
-         logger.info(f"Found {len(chunks)} chunks.")
-         # Upsert in batch of 40000 or less if the total number of chunks is less than 40000
-         for i in range(0, len(chunks), min(40000, len(chunks))):
-             end_idx = i + min(40000, len(chunks) - i)
-             collection.upsert(
-                 documents=chunks[i:end_idx],
-                 ids=[f"doc_{j+length}" for j in range(i, end_idx)],  # unique for each doc
-                 metadatas=sources[i:end_idx],
-             )
-     except ValueError as e:
-         logger.warning(f"{e}")
-     return client
-
-
- def query_vector_db(
-     query_texts: List[str],
-     n_results: int = 10,
-     client: API = None,
-     db_path: str = "tmp/chromadb.db",
-     collection_name: str = "all-my-documents",
-     search_string: str = "",
-     embedding_model: str = "all-MiniLM-L6-v2",
-     embedding_function: Callable = None,
- ) -> QueryResult:
-     """Query a vector db. We support chromadb compatible APIs, it's not required if you prepared your own vector db
-     and query function.
-
-     Args:
-         query_texts (List[str]): the list of strings which will be used to query the vector db.
-         n_results (Optional, int): the number of results to return. Default is 10.
-         client (Optional, API): the chromadb compatible client. Default is None, a chromadb client will be used.
-         db_path (Optional, str): the path to the vector db. Default is "tmp/chromadb.db". The default was `/tmp/chromadb.db` for version <=0.2.24.
-         collection_name (Optional, str): the name of the collection. Default is "all-my-documents".
-         search_string (Optional, str): the search string. Only docs that contain an exact match of this string will be retrieved. Default is "".
-         embedding_model (Optional, str): the embedding model to use. Default is "all-MiniLM-L6-v2". Will be ignored if
-             embedding_function is not None.
-         embedding_function (Optional, Callable): the embedding function to use. Default is None, SentenceTransformer with
-             the given `embedding_model` will be used. If you want to use OpenAI, Cohere, HuggingFace or other embedding
-             functions, you can pass it here, follow the examples in `https://docs.trychroma.com/embeddings`.
-
-     Returns:
-
-     The query result. The format is:
-
-     ```python
-     class QueryResult(TypedDict):
-         ids: List[IDs]
-         embeddings: Optional[List[List[Embedding]]]
-         documents: Optional[List[List[Document]]]
-         metadatas: Optional[List[List[Metadata]]]
-         distances: Optional[List[List[float]]]
-     ```
-     """
-     if client is None:
-         client = chromadb.PersistentClient(path=db_path)
-     # the collection's embedding function is always the default one, but we want to use the one we used to create the
-     # collection. So we compute the embeddings ourselves and pass it to the query function.
-     collection = client.get_collection(collection_name)
-     embedding_function = (
-         ef.SentenceTransformerEmbeddingFunction(embedding_model) if embedding_function is None else embedding_function
-     )
-     query_embeddings = embedding_function(query_texts)
-     # Query/search n most similar results. You can also .get by id
-     results = collection.query(
-         query_embeddings=query_embeddings,
-         n_results=n_results,
-         where_document={"$contains": search_string} if search_string else None,  # optional filter
-     )
-     return results
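
For reference, a minimal sketch of how the removed retrieve_utils helpers were typically combined, based only on the signatures in the deleted file above; the `./docs` path and the query text are hypothetical placeholders, not values taken from the package.

```python
# Hypothetical usage of the removed autogen.retrieve_utils module (present in ag2 0.4.1).
from autogen.retrieve_utils import create_vector_db_from_dir, query_vector_db

# Index every supported text file under ./docs into a local chromadb collection.
client = create_vector_db_from_dir(
    dir_path="./docs",                # placeholder directory
    db_path="tmp/chromadb.db",        # module default shown in the signature above
    collection_name="all-my-documents",
    get_or_create=True,               # reuse the collection if it already exists
)

# Retrieve the 5 most similar chunks; the result follows chromadb's QueryResult TypedDict.
results = query_vector_db(
    query_texts=["How do I configure an agent?"],  # placeholder query
    n_results=5,
    client=client,
    collection_name="all-my-documents",
)
print(results["documents"][0])
```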
autogen/runtime_logging.py DELETED
@@ -1,163 +0,0 @@
- # Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai
- #
- # SPDX-License-Identifier: Apache-2.0
- #
- # Portions derived from https://github.com/microsoft/autogen are under the MIT License.
- # SPDX-License-Identifier: MIT
- from __future__ import annotations
-
- import logging
- import sqlite3
- import uuid
- from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, TypeVar, Union
-
- from openai import AzureOpenAI, OpenAI
- from openai.types.chat import ChatCompletion
-
- from autogen.logger.base_logger import BaseLogger, LLMConfig
- from autogen.logger.logger_factory import LoggerFactory
-
- if TYPE_CHECKING:
-     from autogen import Agent, ConversableAgent, OpenAIWrapper
-     from autogen.oai.anthropic import AnthropicClient
-     from autogen.oai.bedrock import BedrockClient
-     from autogen.oai.cerebras import CerebrasClient
-     from autogen.oai.cohere import CohereClient
-     from autogen.oai.gemini import GeminiClient
-     from autogen.oai.groq import GroqClient
-     from autogen.oai.mistral import MistralAIClient
-     from autogen.oai.ollama import OllamaClient
-     from autogen.oai.together import TogetherClient
-
- logger = logging.getLogger(__name__)
-
- autogen_logger = None
- is_logging = False
-
- F = TypeVar("F", bound=Callable[..., Any])
-
-
- def start(
-     logger: Optional[BaseLogger] = None,
-     logger_type: Literal["sqlite", "file"] = "sqlite",
-     config: Optional[Dict[str, Any]] = None,
- ) -> str:
-     """
-     Start logging for the runtime.
-     Args:
-         logger (BaseLogger): A logger instance
-         logger_type (str): The type of logger to use (default: sqlite)
-         config (dict): Configuration for the logger
-     Returns:
-         session_id (str(uuid.uuid4)): a unique id for the logging session
-     """
-     global autogen_logger
-     global is_logging
-
-     if logger:
-         autogen_logger = logger
-     else:
-         autogen_logger = LoggerFactory.get_logger(logger_type=logger_type, config=config)
-
-     try:
-         session_id = autogen_logger.start()
-         is_logging = True
-     except Exception as e:
-         logger.error(f"[runtime logging] Failed to start logging: {e}")
-     finally:
-         return session_id
-
-
- def log_chat_completion(
-     invocation_id: uuid.UUID,
-     client_id: int,
-     wrapper_id: int,
-     agent: Union[str, Agent],
-     request: Dict[str, Union[float, str, List[Dict[str, str]]]],
-     response: Union[str, ChatCompletion],
-     is_cached: int,
-     cost: float,
-     start_time: str,
- ) -> None:
-     if autogen_logger is None:
-         logger.error("[runtime logging] log_chat_completion: autogen logger is None")
-         return
-
-     autogen_logger.log_chat_completion(
-         invocation_id, client_id, wrapper_id, agent, request, response, is_cached, cost, start_time
-     )
-
-
- def log_new_agent(agent: ConversableAgent, init_args: Dict[str, Any]) -> None:
-     if autogen_logger is None:
-         logger.error("[runtime logging] log_new_agent: autogen logger is None")
-         return
-
-     autogen_logger.log_new_agent(agent, init_args)
-
-
- def log_event(source: Union[str, Agent], name: str, **kwargs: Dict[str, Any]) -> None:
-     if autogen_logger is None:
-         logger.error("[runtime logging] log_event: autogen logger is None")
-         return
-
-     autogen_logger.log_event(source, name, **kwargs)
-
-
- def log_function_use(agent: Union[str, Agent], function: F, args: Dict[str, Any], returns: any):
-     if autogen_logger is None:
-         logger.error("[runtime logging] log_function_use: autogen logger is None")
-         return
-
-     autogen_logger.log_function_use(agent, function, args, returns)
-
-
- def log_new_wrapper(wrapper: OpenAIWrapper, init_args: Dict[str, Union[LLMConfig, List[LLMConfig]]]) -> None:
-     if autogen_logger is None:
-         logger.error("[runtime logging] log_new_wrapper: autogen logger is None")
-         return
-
-     autogen_logger.log_new_wrapper(wrapper, init_args)
-
-
- def log_new_client(
-     client: Union[
-         AzureOpenAI,
-         OpenAI,
-         CerebrasClient,
-         GeminiClient,
-         AnthropicClient,
-         MistralAIClient,
-         TogetherClient,
-         GroqClient,
-         CohereClient,
-         OllamaClient,
-         BedrockClient,
-     ],
-     wrapper: OpenAIWrapper,
-     init_args: Dict[str, Any],
- ) -> None:
-     if autogen_logger is None:
-         logger.error("[runtime logging] log_new_client: autogen logger is None")
-         return
-
-     autogen_logger.log_new_client(client, wrapper, init_args)
-
-
- def stop() -> None:
-     global is_logging
-     if autogen_logger:
-         autogen_logger.stop()
-     is_logging = False
-
-
- def get_connection() -> Union[None, sqlite3.Connection]:
-     if autogen_logger is None:
-         logger.error("[runtime logging] get_connection: autogen logger is None")
-         return None
-
-     return autogen_logger.get_connection()
-
-
- def logging_enabled() -> bool:
-     return is_logging
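
Likewise, a minimal sketch of how the removed autogen.runtime_logging API was driven, using only the functions defined in the deleted file above; the event name, payload, and sqlite config key are illustrative assumptions rather than values from the package.

```python
# Hypothetical usage of the removed autogen.runtime_logging module (present in ag2 0.4.1).
import autogen.runtime_logging as runtime_logging

# Start a sqlite-backed logging session; start() returns a session id string.
session_id = runtime_logging.start(logger_type="sqlite", config={"dbname": "logs.db"})  # config key assumed
print(f"session={session_id}, enabled={runtime_logging.logging_enabled()}")

# Record a custom event; extra keyword arguments are forwarded to the active logger.
runtime_logging.log_event("my_agent", "task_started", detail="illustrative payload")

# The sqlite logger exposes its connection for ad-hoc inspection (may be None otherwise).
conn = runtime_logging.get_connection()

runtime_logging.stop()
```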