ag2 0.4.1__py3-none-any.whl → 0.4.2b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ag2 might be problematic. Click here for more details.

Files changed (161) hide show
  1. ag2-0.4.2b1.dist-info/METADATA +19 -0
  2. ag2-0.4.2b1.dist-info/RECORD +6 -0
  3. ag2-0.4.2b1.dist-info/top_level.txt +1 -0
  4. ag2-0.4.1.dist-info/METADATA +0 -500
  5. ag2-0.4.1.dist-info/RECORD +0 -158
  6. ag2-0.4.1.dist-info/top_level.txt +0 -1
  7. autogen/__init__.py +0 -17
  8. autogen/_pydantic.py +0 -116
  9. autogen/agentchat/__init__.py +0 -42
  10. autogen/agentchat/agent.py +0 -142
  11. autogen/agentchat/assistant_agent.py +0 -85
  12. autogen/agentchat/chat.py +0 -306
  13. autogen/agentchat/contrib/__init__.py +0 -0
  14. autogen/agentchat/contrib/agent_builder.py +0 -788
  15. autogen/agentchat/contrib/agent_eval/agent_eval.py +0 -107
  16. autogen/agentchat/contrib/agent_eval/criterion.py +0 -47
  17. autogen/agentchat/contrib/agent_eval/critic_agent.py +0 -47
  18. autogen/agentchat/contrib/agent_eval/quantifier_agent.py +0 -42
  19. autogen/agentchat/contrib/agent_eval/subcritic_agent.py +0 -48
  20. autogen/agentchat/contrib/agent_eval/task.py +0 -43
  21. autogen/agentchat/contrib/agent_optimizer.py +0 -450
  22. autogen/agentchat/contrib/capabilities/__init__.py +0 -0
  23. autogen/agentchat/contrib/capabilities/agent_capability.py +0 -21
  24. autogen/agentchat/contrib/capabilities/generate_images.py +0 -297
  25. autogen/agentchat/contrib/capabilities/teachability.py +0 -406
  26. autogen/agentchat/contrib/capabilities/text_compressors.py +0 -72
  27. autogen/agentchat/contrib/capabilities/transform_messages.py +0 -92
  28. autogen/agentchat/contrib/capabilities/transforms.py +0 -565
  29. autogen/agentchat/contrib/capabilities/transforms_util.py +0 -120
  30. autogen/agentchat/contrib/capabilities/vision_capability.py +0 -217
  31. autogen/agentchat/contrib/captainagent/tools/__init__.py +0 -0
  32. autogen/agentchat/contrib/captainagent/tools/data_analysis/calculate_correlation.py +0 -41
  33. autogen/agentchat/contrib/captainagent/tools/data_analysis/calculate_skewness_and_kurtosis.py +0 -29
  34. autogen/agentchat/contrib/captainagent/tools/data_analysis/detect_outlier_iqr.py +0 -29
  35. autogen/agentchat/contrib/captainagent/tools/data_analysis/detect_outlier_zscore.py +0 -29
  36. autogen/agentchat/contrib/captainagent/tools/data_analysis/explore_csv.py +0 -22
  37. autogen/agentchat/contrib/captainagent/tools/data_analysis/shapiro_wilk_test.py +0 -31
  38. autogen/agentchat/contrib/captainagent/tools/information_retrieval/arxiv_download.py +0 -26
  39. autogen/agentchat/contrib/captainagent/tools/information_retrieval/arxiv_search.py +0 -55
  40. autogen/agentchat/contrib/captainagent/tools/information_retrieval/extract_pdf_image.py +0 -54
  41. autogen/agentchat/contrib/captainagent/tools/information_retrieval/extract_pdf_text.py +0 -39
  42. autogen/agentchat/contrib/captainagent/tools/information_retrieval/get_wikipedia_text.py +0 -22
  43. autogen/agentchat/contrib/captainagent/tools/information_retrieval/get_youtube_caption.py +0 -35
  44. autogen/agentchat/contrib/captainagent/tools/information_retrieval/image_qa.py +0 -61
  45. autogen/agentchat/contrib/captainagent/tools/information_retrieval/optical_character_recognition.py +0 -62
  46. autogen/agentchat/contrib/captainagent/tools/information_retrieval/perform_web_search.py +0 -48
  47. autogen/agentchat/contrib/captainagent/tools/information_retrieval/scrape_wikipedia_tables.py +0 -34
  48. autogen/agentchat/contrib/captainagent/tools/information_retrieval/transcribe_audio_file.py +0 -22
  49. autogen/agentchat/contrib/captainagent/tools/information_retrieval/youtube_download.py +0 -36
  50. autogen/agentchat/contrib/captainagent/tools/math/calculate_circle_area_from_diameter.py +0 -22
  51. autogen/agentchat/contrib/captainagent/tools/math/calculate_day_of_the_week.py +0 -19
  52. autogen/agentchat/contrib/captainagent/tools/math/calculate_fraction_sum.py +0 -29
  53. autogen/agentchat/contrib/captainagent/tools/math/calculate_matrix_power.py +0 -32
  54. autogen/agentchat/contrib/captainagent/tools/math/calculate_reflected_point.py +0 -17
  55. autogen/agentchat/contrib/captainagent/tools/math/complex_numbers_product.py +0 -26
  56. autogen/agentchat/contrib/captainagent/tools/math/compute_currency_conversion.py +0 -24
  57. autogen/agentchat/contrib/captainagent/tools/math/count_distinct_permutations.py +0 -28
  58. autogen/agentchat/contrib/captainagent/tools/math/evaluate_expression.py +0 -29
  59. autogen/agentchat/contrib/captainagent/tools/math/find_continuity_point.py +0 -35
  60. autogen/agentchat/contrib/captainagent/tools/math/fraction_to_mixed_numbers.py +0 -40
  61. autogen/agentchat/contrib/captainagent/tools/math/modular_inverse_sum.py +0 -23
  62. autogen/agentchat/contrib/captainagent/tools/math/simplify_mixed_numbers.py +0 -37
  63. autogen/agentchat/contrib/captainagent/tools/math/sum_of_digit_factorials.py +0 -16
  64. autogen/agentchat/contrib/captainagent/tools/math/sum_of_primes_below.py +0 -16
  65. autogen/agentchat/contrib/captainagent/tools/requirements.txt +0 -10
  66. autogen/agentchat/contrib/captainagent/tools/tool_description.tsv +0 -34
  67. autogen/agentchat/contrib/captainagent.py +0 -490
  68. autogen/agentchat/contrib/gpt_assistant_agent.py +0 -545
  69. autogen/agentchat/contrib/graph_rag/__init__.py +0 -0
  70. autogen/agentchat/contrib/graph_rag/document.py +0 -30
  71. autogen/agentchat/contrib/graph_rag/falkor_graph_query_engine.py +0 -111
  72. autogen/agentchat/contrib/graph_rag/falkor_graph_rag_capability.py +0 -81
  73. autogen/agentchat/contrib/graph_rag/graph_query_engine.py +0 -56
  74. autogen/agentchat/contrib/graph_rag/graph_rag_capability.py +0 -64
  75. autogen/agentchat/contrib/img_utils.py +0 -390
  76. autogen/agentchat/contrib/llamaindex_conversable_agent.py +0 -123
  77. autogen/agentchat/contrib/llava_agent.py +0 -176
  78. autogen/agentchat/contrib/math_user_proxy_agent.py +0 -471
  79. autogen/agentchat/contrib/multimodal_conversable_agent.py +0 -128
  80. autogen/agentchat/contrib/qdrant_retrieve_user_proxy_agent.py +0 -325
  81. autogen/agentchat/contrib/retrieve_assistant_agent.py +0 -56
  82. autogen/agentchat/contrib/retrieve_user_proxy_agent.py +0 -705
  83. autogen/agentchat/contrib/society_of_mind_agent.py +0 -203
  84. autogen/agentchat/contrib/swarm_agent.py +0 -463
  85. autogen/agentchat/contrib/text_analyzer_agent.py +0 -76
  86. autogen/agentchat/contrib/tool_retriever.py +0 -120
  87. autogen/agentchat/contrib/vectordb/__init__.py +0 -0
  88. autogen/agentchat/contrib/vectordb/base.py +0 -243
  89. autogen/agentchat/contrib/vectordb/chromadb.py +0 -326
  90. autogen/agentchat/contrib/vectordb/mongodb.py +0 -559
  91. autogen/agentchat/contrib/vectordb/pgvectordb.py +0 -958
  92. autogen/agentchat/contrib/vectordb/qdrant.py +0 -334
  93. autogen/agentchat/contrib/vectordb/utils.py +0 -126
  94. autogen/agentchat/contrib/web_surfer.py +0 -305
  95. autogen/agentchat/conversable_agent.py +0 -2908
  96. autogen/agentchat/groupchat.py +0 -1668
  97. autogen/agentchat/user_proxy_agent.py +0 -109
  98. autogen/agentchat/utils.py +0 -207
  99. autogen/browser_utils.py +0 -291
  100. autogen/cache/__init__.py +0 -10
  101. autogen/cache/abstract_cache_base.py +0 -78
  102. autogen/cache/cache.py +0 -182
  103. autogen/cache/cache_factory.py +0 -85
  104. autogen/cache/cosmos_db_cache.py +0 -150
  105. autogen/cache/disk_cache.py +0 -109
  106. autogen/cache/in_memory_cache.py +0 -61
  107. autogen/cache/redis_cache.py +0 -128
  108. autogen/code_utils.py +0 -745
  109. autogen/coding/__init__.py +0 -22
  110. autogen/coding/base.py +0 -113
  111. autogen/coding/docker_commandline_code_executor.py +0 -262
  112. autogen/coding/factory.py +0 -45
  113. autogen/coding/func_with_reqs.py +0 -203
  114. autogen/coding/jupyter/__init__.py +0 -22
  115. autogen/coding/jupyter/base.py +0 -32
  116. autogen/coding/jupyter/docker_jupyter_server.py +0 -164
  117. autogen/coding/jupyter/embedded_ipython_code_executor.py +0 -182
  118. autogen/coding/jupyter/jupyter_client.py +0 -224
  119. autogen/coding/jupyter/jupyter_code_executor.py +0 -161
  120. autogen/coding/jupyter/local_jupyter_server.py +0 -168
  121. autogen/coding/local_commandline_code_executor.py +0 -410
  122. autogen/coding/markdown_code_extractor.py +0 -44
  123. autogen/coding/utils.py +0 -57
  124. autogen/exception_utils.py +0 -46
  125. autogen/extensions/__init__.py +0 -0
  126. autogen/formatting_utils.py +0 -76
  127. autogen/function_utils.py +0 -362
  128. autogen/graph_utils.py +0 -148
  129. autogen/io/__init__.py +0 -15
  130. autogen/io/base.py +0 -105
  131. autogen/io/console.py +0 -43
  132. autogen/io/websockets.py +0 -213
  133. autogen/logger/__init__.py +0 -11
  134. autogen/logger/base_logger.py +0 -140
  135. autogen/logger/file_logger.py +0 -287
  136. autogen/logger/logger_factory.py +0 -29
  137. autogen/logger/logger_utils.py +0 -42
  138. autogen/logger/sqlite_logger.py +0 -459
  139. autogen/math_utils.py +0 -356
  140. autogen/oai/__init__.py +0 -33
  141. autogen/oai/anthropic.py +0 -428
  142. autogen/oai/bedrock.py +0 -606
  143. autogen/oai/cerebras.py +0 -270
  144. autogen/oai/client.py +0 -1148
  145. autogen/oai/client_utils.py +0 -167
  146. autogen/oai/cohere.py +0 -453
  147. autogen/oai/completion.py +0 -1216
  148. autogen/oai/gemini.py +0 -469
  149. autogen/oai/groq.py +0 -281
  150. autogen/oai/mistral.py +0 -279
  151. autogen/oai/ollama.py +0 -582
  152. autogen/oai/openai_utils.py +0 -811
  153. autogen/oai/together.py +0 -343
  154. autogen/retrieve_utils.py +0 -487
  155. autogen/runtime_logging.py +0 -163
  156. autogen/token_count_utils.py +0 -259
  157. autogen/types.py +0 -20
  158. autogen/version.py +0 -7
  159. {ag2-0.4.1.dist-info → ag2-0.4.2b1.dist-info}/LICENSE +0 -0
  160. {ag2-0.4.1.dist-info → ag2-0.4.2b1.dist-info}/NOTICE.md +0 -0
  161. {ag2-0.4.1.dist-info → ag2-0.4.2b1.dist-info}/WHEEL +0 -0
autogen/retrieve_utils.py DELETED
@@ -1,487 +0,0 @@
1
- # Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai
2
- #
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
- # Portions derived from https://github.com/microsoft/autogen are under the MIT License.
6
- # SPDX-License-Identifier: MIT
7
- import glob
8
- import hashlib
9
- import os
10
- import re
11
- from typing import Callable, List, Tuple, Union
12
- from urllib.parse import urlparse
13
-
14
- import chromadb
15
- import markdownify
16
- import requests
17
- from bs4 import BeautifulSoup
18
-
19
# chromadb 0.4.15 renamed the client API type from `API` to `ClientAPI`.
# BUG FIX: the original compared version strings lexicographically, which
# misorders numeric components (e.g. "0.4.9" > "0.4.15" as strings, so
# chromadb 0.4.9 would wrongly try to import ClientAPI). Compare numeric
# version components instead.
if tuple(int(part) for part in chromadb.__version__.split(".")[:3] if part.isdigit()) < (0, 4, 15):
    from chromadb.api import API
else:
    from chromadb.api import ClientAPI as API
23
- import logging
24
-
25
- import chromadb.utils.embedding_functions as ef
26
- import pypdf
27
- from chromadb.api.types import QueryResult
28
-
29
- from autogen.token_count_utils import count_token
30
-
31
- try:
32
- from unstructured.partition.auto import partition
33
-
34
- HAS_UNSTRUCTURED = True
35
- except ImportError:
36
- HAS_UNSTRUCTURED = False
37
-
38
logger = logging.getLogger(__name__)
# File extensions handled natively: read as plain text, except "pdf" which
# goes through pypdf (see extract_text_from_pdf).
TEXT_FORMATS = [
    "txt",
    "json",
    "csv",
    "tsv",
    "md",
    "html",
    "htm",
    "rtf",
    "rst",
    "jsonl",
    "log",
    "xml",
    "yaml",
    "yml",
    "pdf",
]
UNSTRUCTURED_FORMATS = [
    "doc",
    "docx",
    "epub",
    "msg",
    "odt",
    "org",
    "pdf",
    "ppt",
    "pptx",
    "rtf",
    "rst",
    "xlsx",
]  # These formats will be parsed by the 'unstructured' library, if installed.
if HAS_UNSTRUCTURED:
    TEXT_FORMATS += UNSTRUCTURED_FORMATS
    # De-duplicate the entries that appear in both lists (pdf, rtf, rst).
    TEXT_FORMATS = list(set(TEXT_FORMATS))
# The only chunking strategies split_text_to_chunks accepts.
VALID_CHUNK_MODES = frozenset({"one_line", "multi_lines"})
# Chunks shorter than this many characters are silently dropped; can be
# overridden with the RAG_MINIMUM_MESSAGE_LENGTH environment variable.
RAG_MINIMUM_MESSAGE_LENGTH = int(os.environ.get("RAG_MINIMUM_MESSAGE_LENGTH", 5))
75
-
76
-
77
def split_text_to_chunks(
    text: str,
    max_tokens: int = 4000,
    chunk_mode: str = "multi_lines",
    must_break_at_empty_line: bool = True,
    overlap: int = 0,  # number of overlapping lines
):
    """Split a long text into chunks of max_tokens.

    Args:
        text: the text to split, processed line by line.
        max_tokens: token budget per chunk (counted with count_token).
        chunk_mode: "multi_lines" packs as many lines as fit; "one_line"
            emits (roughly) one line per chunk.
        must_break_at_empty_line: in "multi_lines" mode, only cut chunks at
            blank lines; ignored (forced False) in "one_line" mode.
        overlap: number of trailing lines repeated at the start of the next
            chunk; ignored (forced 0) in "one_line" mode.

    Returns:
        A list of chunk strings; chunks shorter than
        RAG_MINIMUM_MESSAGE_LENGTH characters are dropped.
    """
    if chunk_mode not in VALID_CHUNK_MODES:
        raise AssertionError
    if chunk_mode == "one_line":
        # One-line mode never respects paragraph boundaries or overlap.
        must_break_at_empty_line = False
        overlap = 0
    chunks = []
    lines = text.split("\n")
    num_lines = len(lines)
    if num_lines < 3 and must_break_at_empty_line:
        logger.warning("The input text has less than 3 lines. Set `must_break_at_empty_line` to `False`")
        must_break_at_empty_line = False
    lines_tokens = [count_token(line) for line in lines]
    sum_tokens = sum(lines_tokens)
    # Repeatedly carve a chunk off the front of `lines` until what remains
    # fits the token budget.
    while sum_tokens > max_tokens:
        if chunk_mode == "one_line":
            estimated_line_cut = 2
        else:
            # Proportional estimate of how many leading lines fit the budget.
            estimated_line_cut = max(int(max_tokens / sum_tokens * len(lines)), 2)
        cnt = 0
        prev = ""
        # Walk the cut point backwards until the prefix fits (and, when
        # requested, ends at an empty line).
        for cnt in reversed(range(estimated_line_cut)):
            if must_break_at_empty_line and lines[cnt].strip() != "":
                continue
            if sum(lines_tokens[:cnt]) <= max_tokens:
                prev = "\n".join(lines[:cnt])
                break
        if cnt == 0:
            # No valid cut point: a single line exceeds the budget, or no
            # empty line was available.
            logger.warning(
                f"max_tokens is too small to fit a single line of text. Breaking this line:\n\t{lines[0][:100]} ..."
            )
            if not must_break_at_empty_line:
                # Split the oversized first line mid-string; the divisor
                # estimates tokens-per-character for that line.
                split_len = max(
                    int(max_tokens / (lines_tokens[0] * 0.9 * len(lines[0]) + 0.1)), RAG_MINIMUM_MESSAGE_LENGTH
                )
                prev = lines[0][:split_len]
                lines[0] = lines[0][split_len:]
                lines_tokens[0] = count_token(lines[0])
            else:
                # Relax the empty-line constraint and retry on the next pass.
                logger.warning("Failed to split docs with must_break_at_empty_line being True, set to False.")
                must_break_at_empty_line = False
        (
            chunks.append(prev) if len(prev) >= RAG_MINIMUM_MESSAGE_LENGTH else None
        )  # don't add chunks less than RAG_MINIMUM_MESSAGE_LENGTH characters
        # Keep `overlap` lines from the end of this chunk at the head of the
        # next one (when the cut is deep enough to allow it).
        lines = lines[cnt - overlap if cnt > overlap else cnt :]
        lines_tokens = lines_tokens[cnt - overlap if cnt > overlap else cnt :]
        sum_tokens = sum(lines_tokens)
    # Whatever remains fits the budget and becomes the final chunk.
    text_to_chunk = "\n".join(lines).strip()
    (
        chunks.append(text_to_chunk) if len(text_to_chunk) >= RAG_MINIMUM_MESSAGE_LENGTH else None
    )  # don't add chunks less than RAG_MINIMUM_MESSAGE_LENGTH characters
    return chunks
136
-
137
-
138
def extract_text_from_pdf(file: str) -> str:
    """Extract text from PDF files.

    Args:
        file (str): path to the PDF file.

    Returns:
        str: the concatenated text of every page, or an empty string when the
        file is encrypted and cannot be decrypted with an empty password.
    """
    text = ""
    with open(file, "rb") as f:
        reader = pypdf.PdfReader(f)
        if reader.is_encrypted:  # Check if the PDF is encrypted
            try:
                # Many "encrypted" PDFs only set an owner password; an empty
                # user password decrypts them.
                reader.decrypt("")
            except pypdf.errors.FileNotDecryptedError as e:
                logger.warning(f"Could not decrypt PDF {file}, {e}")
                return text  # Return empty text if PDF could not be decrypted

        # Iterate pages directly instead of indexing via range(len(...)).
        for page in reader.pages:
            text += page.extract_text()

    if not text.strip():
        # BUG FIX: the original logged "Could not decrypt PDF" here, which is
        # misleading — decryption succeeded (or was not needed); the pages
        # simply yielded no text (e.g. a scanned, image-only PDF).
        logger.warning(f"No text extracted from PDF {file}")

    return text
158
-
159
-
160
def split_files_to_chunks(
    files: list,
    max_tokens: int = 4000,
    chunk_mode: str = "multi_lines",
    must_break_at_empty_line: bool = True,
    custom_text_split_function: Callable = None,
) -> Tuple[List[str], List[dict]]:
    """Split a list of files into chunks of max_tokens.

    Each entry of `files` is either a local path, or a (path, url) tuple for
    files that were downloaded; the url is then recorded as the chunk source.
    Returns parallel lists (chunks, sources).
    """
    all_chunks = []
    all_sources = []

    for entry in files:
        # A (path, url) pair means the file was fetched from `url`.
        if isinstance(entry, tuple):
            path, url = entry[0], entry[1]
        else:
            path, url = entry, None
        ext = os.path.splitext(path)[1].lower()

        # Pick an extractor by extension: unstructured (if present) > pypdf > plain text.
        if HAS_UNSTRUCTURED and ext[1:] in UNSTRUCTURED_FORMATS:
            elements = partition(path)
            text = "\n".join([el.text for el in elements]) if len(elements) > 0 else ""
        elif ext == ".pdf":
            text = extract_text_from_pdf(path)
        else:  # For non-PDF text-based files
            with open(path, "r", encoding="utf-8", errors="ignore") as fh:
                text = fh.read()

        if not text.strip():
            logger.warning(f"No text available in file: {path}")
            continue  # Skip to the next file if no text is available

        if custom_text_split_function is None:
            pieces = split_text_to_chunks(text, max_tokens, chunk_mode, must_break_at_empty_line)
        else:
            pieces = custom_text_split_function(text)
        all_chunks += pieces
        all_sources += [{"source": url if url else path}] * len(pieces)

    return all_chunks, all_sources
202
-
203
-
204
def get_files_from_dir(dir_path: Union[str, List[str]], types: list = TEXT_FORMATS, recursive: bool = True):
    """Return a list of all the files in a given directory, a url, a file path or a list of them.

    `types` is a list of accepted extensions (with or without a leading dot);
    matching is attempted in both lower and upper case.
    """
    if len(types) == 0:
        raise ValueError("types cannot be empty.")
    exts = [(t[1:] if t.startswith(".") else t).lower() for t in set(types)]
    exts = exts + [e.upper() for e in exts]

    collected = []
    # A list may mix local files, directories and urls; resolve each entry.
    if isinstance(dir_path, list):
        for entry in dir_path:
            if os.path.isfile(entry):
                collected.append(entry)
            elif is_url(entry):
                downloaded = get_file_from_url(entry)
                if downloaded:
                    collected.append(downloaded)
            elif os.path.exists(entry):
                try:
                    collected.extend(get_files_from_dir(entry, types, recursive))
                except ValueError:
                    logger.warning(f"Directory {entry} does not exist. Skipping.")
            else:
                logger.warning(f"File {entry} does not exist. Skipping.")
        return collected

    # A single file path is returned as-is.
    if os.path.isfile(dir_path):
        return [dir_path]

    # A url is downloaded and its local copy returned.
    if is_url(dir_path):
        downloaded = get_file_from_url(dir_path)
        return [downloaded] if downloaded else []

    # Otherwise it must be a directory: glob for each accepted extension.
    if not os.path.exists(dir_path):
        logger.error(f"Directory {dir_path} does not exist.")
        raise ValueError(f"Directory {dir_path} does not exist.")
    for ext in exts:
        pattern = f"**/*.{ext}" if recursive else f"*.{ext}"
        collected += glob.glob(os.path.join(dir_path, pattern), recursive=recursive)
    return collected
252
-
253
-
254
def parse_html_to_markdown(html: str, url: str = None) -> str:
    """Parse HTML to markdown.

    Args:
        html (str): the raw HTML document.
        url (Optional, str): the url the HTML came from; English Wikipedia
            urls get special handling to extract only the article body.

    Returns:
        str: markdown text, prefixed with a "# <title>" heading when the page
        has a usable <title> element.
    """
    soup = BeautifulSoup(html, "html.parser")
    # BUG FIX: pages without a <title> element used to raise AttributeError
    # (soup.title is None), and an empty <title> made the final heading
    # concatenation fail with TypeError. Fall back to no heading instead.
    title = soup.title.string if soup.title else None
    # Remove javascript and style blocks
    for script in soup(["script", "style"]):
        script.extract()

    # Convert to markdown -- Wikipedia gets special attention to get a clean version of the page
    if isinstance(url, str) and url.startswith("https://en.wikipedia.org/"):
        body_elm = soup.find("div", {"id": "mw-content-text"})
        title_elm = soup.find("span", {"class": "mw-page-title-main"})

        if body_elm:
            # What's the title
            main_title = title or ""
            if title_elm and len(title_elm) > 0:
                main_title = title_elm.string
            webpage_text = "# " + main_title + "\n\n" + markdownify.MarkdownConverter().convert_soup(body_elm)
        else:
            webpage_text = markdownify.MarkdownConverter().convert_soup(soup)
    else:
        webpage_text = markdownify.MarkdownConverter().convert_soup(soup)

    # Convert newlines
    webpage_text = re.sub(r"\r\n", "\n", webpage_text)
    webpage_text = re.sub(r"\n{2,}", "\n\n", webpage_text).strip()
    if title:
        webpage_text = "# " + title + "\n\n" + webpage_text
    return webpage_text
283
-
284
-
285
- def _generate_file_name_from_url(url: str, max_length=255) -> str:
286
- url_bytes = url.encode("utf-8")
287
- hash = hashlib.blake2b(url_bytes).hexdigest()
288
- parsed_url = urlparse(url)
289
- file_name = os.path.basename(url)
290
- file_name = f"{parsed_url.netloc}_{file_name}_{hash[:min(8, max_length-len(parsed_url.netloc)-len(file_name)-1)]}"
291
- return file_name
292
-
293
-
294
def get_file_from_url(url: str, save_path: str = None) -> Tuple[str, str]:
    """Download a file from a URL.

    Args:
        url (str): the url to download.
        save_path (Optional, str): target file path or directory. Defaults to
            the "tmp/chromadb" directory; when the target is a directory, the
            file name is derived from the url.

    Returns:
        Tuple[str, str]: (local path, url) on success.
        NOTE(review): despite the annotation, this returns ``None`` when the
        download fails — callers must handle both shapes.
    """
    if save_path is None:
        save_path = "tmp/chromadb"
        os.makedirs(save_path, exist_ok=True)
    if os.path.isdir(save_path):
        filename = _generate_file_name_from_url(url)
        save_path = os.path.join(save_path, filename)
    else:
        os.makedirs(os.path.dirname(save_path), exist_ok=True)

    # Some hosts reject requests without a browser-like User-Agent header.
    custom_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36",
    }
    try:
        response = requests.get(url, stream=True, headers=custom_headers, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        logger.warning(f"Failed to download {url}, {e}")
        return None

    content_type = response.headers.get("content-type", "")
    if "text/html" in content_type:
        # Get the content of the response
        html = ""
        for chunk in response.iter_content(chunk_size=8192, decode_unicode=True):
            html += chunk
        # HTML is converted to markdown before saving so downstream chunking
        # operates on clean text rather than markup.
        text = parse_html_to_markdown(html, url)
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(text)
    else:
        # Binary (or non-HTML) content is streamed to disk unchanged.
        with open(save_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    return save_path, url
329
-
330
-
331
def is_url(string: str):
    """Return True if the string is a valid URL.

    A string counts as a url only when urlparse finds both a scheme
    (e.g. "https") and a network location (host).
    """
    try:
        parts = urlparse(string)
    except ValueError:
        return False
    return bool(parts.scheme) and bool(parts.netloc)
338
-
339
-
340
def create_vector_db_from_dir(
    dir_path: Union[str, List[str]],
    max_tokens: int = 4000,
    client: API = None,
    db_path: str = "tmp/chromadb.db",
    collection_name: str = "all-my-documents",
    get_or_create: bool = False,
    chunk_mode: str = "multi_lines",
    must_break_at_empty_line: bool = True,
    embedding_model: str = "all-MiniLM-L6-v2",
    embedding_function: Callable = None,
    custom_text_split_function: Callable = None,
    custom_text_types: List[str] = TEXT_FORMATS,
    recursive: bool = True,
    extra_docs: bool = False,
) -> API:
    """Create a vector db from all the files in a given directory, the directory can also be a single file or a url to
    a single file. We support chromadb compatible APIs to create the vector db, this function is not required if
    you prepared your own vector db.

    Args:
        dir_path (Union[str, List[str]]): the path to the directory, file, url or a list of them.
        max_tokens (Optional, int): the maximum number of tokens per chunk. Default is 4000.
        client (Optional, API): the chromadb client. Default is None.
        db_path (Optional, str): the path to the chromadb. Default is "tmp/chromadb.db". The default was `/tmp/chromadb.db` for version <=0.2.24.
        collection_name (Optional, str): the name of the collection. Default is "all-my-documents".
        get_or_create (Optional, bool): Whether to get or create the collection. Default is False. If True, the collection
            will be returned if it already exists. Will raise ValueError if the collection already exists and get_or_create is False.
        chunk_mode (Optional, str): the chunk mode. Default is "multi_lines".
        must_break_at_empty_line (Optional, bool): Whether to break at empty line. Default is True.
        embedding_model (Optional, str): the embedding model to use. Default is "all-MiniLM-L6-v2". Will be ignored if
            embedding_function is not None.
        embedding_function (Optional, Callable): the embedding function to use. Default is None, SentenceTransformer with
            the given `embedding_model` will be used. If you want to use OpenAI, Cohere, HuggingFace or other embedding
            functions, you can pass it here, follow the examples in `https://docs.trychroma.com/embeddings`.
        custom_text_split_function (Optional, Callable): a custom function to split a string into a list of strings.
            Default is None, will use the default function in `autogen.retrieve_utils.split_text_to_chunks`.
        custom_text_types (Optional, List[str]): a list of file types to be processed. Default is TEXT_FORMATS.
        recursive (Optional, bool): whether to search documents recursively in the dir_path. Default is True.
        extra_docs (Optional, bool): whether to add more documents in the collection. Default is False

    Returns:

        The chromadb client.
    """
    if client is None:
        client = chromadb.PersistentClient(path=db_path)
    try:
        embedding_function = (
            ef.SentenceTransformerEmbeddingFunction(embedding_model)
            if embedding_function is None
            else embedding_function
        )
        collection = client.create_collection(
            collection_name,
            get_or_create=get_or_create,
            embedding_function=embedding_function,
            # https://github.com/nmslib/hnswlib#supported-distances
            # https://github.com/chroma-core/chroma/blob/566bc80f6c8ee29f7d99b6322654f32183c368c4/chromadb/segment/impl/vector/local_hnsw.py#L184
            # https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md
            metadata={"hnsw:space": "ip", "hnsw:construction_ef": 30, "hnsw:M": 32},  # ip, l2, cosine
        )

        # When appending to an existing collection, offset the doc ids so new
        # documents do not overwrite the ones already stored.
        length = 0
        if extra_docs:
            length = len(collection.get()["ids"])

        if custom_text_split_function is not None:
            chunks, sources = split_files_to_chunks(
                get_files_from_dir(dir_path, custom_text_types, recursive),
                custom_text_split_function=custom_text_split_function,
            )
        else:
            chunks, sources = split_files_to_chunks(
                get_files_from_dir(dir_path, custom_text_types, recursive),
                max_tokens,
                chunk_mode,
                must_break_at_empty_line,
            )
        logger.info(f"Found {len(chunks)} chunks.")
        # Upsert in batch of 40000 or less if the total number of chunks is less than 40000
        # NOTE(review): when no chunks were found, min(40000, 0) == 0 makes
        # range() raise "range() arg 3 must not be zero" (a ValueError), which
        # is swallowed by the except clause below — confirm this is intended.
        for i in range(0, len(chunks), min(40000, len(chunks))):
            end_idx = i + min(40000, len(chunks) - i)
            collection.upsert(
                documents=chunks[i:end_idx],
                ids=[f"doc_{j+length}" for j in range(i, end_idx)],  # unique for each doc
                metadatas=sources[i:end_idx],
            )
    except ValueError as e:
        # Most likely cause: the collection already exists and get_or_create
        # is False (chromadb raises ValueError in that case).
        logger.warning(f"{e}")
    return client
431
-
432
-
433
def query_vector_db(
    query_texts: List[str],
    n_results: int = 10,
    client: API = None,
    db_path: str = "tmp/chromadb.db",
    collection_name: str = "all-my-documents",
    search_string: str = "",
    embedding_model: str = "all-MiniLM-L6-v2",
    embedding_function: Callable = None,
) -> QueryResult:
    """Query a vector db. We support chromadb compatible APIs, it's not required if you prepared your own vector db
    and query function.

    Args:
        query_texts (List[str]): the list of strings which will be used to query the vector db.
        n_results (Optional, int): the number of results to return. Default is 10.
        client (Optional, API): the chromadb compatible client. Default is None, a chromadb client will be used.
        db_path (Optional, str): the path to the vector db. Default is "tmp/chromadb.db". The default was `/tmp/chromadb.db` for version <=0.2.24.
        collection_name (Optional, str): the name of the collection. Default is "all-my-documents".
        search_string (Optional, str): the search string. Only docs that contain an exact match of this string will be retrieved. Default is "".
        embedding_model (Optional, str): the embedding model to use. Default is "all-MiniLM-L6-v2". Will be ignored if
            embedding_function is not None.
        embedding_function (Optional, Callable): the embedding function to use. Default is None, SentenceTransformer with
            the given `embedding_model` will be used. If you want to use OpenAI, Cohere, HuggingFace or other embedding
            functions, you can pass it here, follow the examples in `https://docs.trychroma.com/embeddings`.

    Returns:

        The query result. The format is:

        ```python
        class QueryResult(TypedDict):
            ids: List[IDs]
            embeddings: Optional[List[List[Embedding]]]
            documents: Optional[List[List[Document]]]
            metadatas: Optional[List[List[Metadata]]]
            distances: Optional[List[List[float]]]
        ```
    """
    if client is None:
        client = chromadb.PersistentClient(path=db_path)
    # the collection's embedding function is always the default one, but we want to use the one we used to create the
    # collection. So we compute the embeddings ourselves and pass it to the query function.
    collection = client.get_collection(collection_name)
    embedding_function = (
        ef.SentenceTransformerEmbeddingFunction(embedding_model) if embedding_function is None else embedding_function
    )
    query_embeddings = embedding_function(query_texts)
    # Query/search n most similar results. You can also .get by id
    results = collection.query(
        query_embeddings=query_embeddings,
        n_results=n_results,
        where_document={"$contains": search_string} if search_string else None,  # optional filter
    )
    return results
@@ -1,163 +0,0 @@
1
- # Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai
2
- #
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
- # Portions derived from https://github.com/microsoft/autogen are under the MIT License.
6
- # SPDX-License-Identifier: MIT
7
- from __future__ import annotations
8
-
9
- import logging
10
- import sqlite3
11
- import uuid
12
- from typing import TYPE_CHECKING, Any, Callable, Dict, List, Literal, Optional, TypeVar, Union
13
-
14
- from openai import AzureOpenAI, OpenAI
15
- from openai.types.chat import ChatCompletion
16
-
17
- from autogen.logger.base_logger import BaseLogger, LLMConfig
18
- from autogen.logger.logger_factory import LoggerFactory
19
-
20
- if TYPE_CHECKING:
21
- from autogen import Agent, ConversableAgent, OpenAIWrapper
22
- from autogen.oai.anthropic import AnthropicClient
23
- from autogen.oai.bedrock import BedrockClient
24
- from autogen.oai.cerebras import CerebrasClient
25
- from autogen.oai.cohere import CohereClient
26
- from autogen.oai.gemini import GeminiClient
27
- from autogen.oai.groq import GroqClient
28
- from autogen.oai.mistral import MistralAIClient
29
- from autogen.oai.ollama import OllamaClient
30
- from autogen.oai.together import TogetherClient
31
-
32
logger = logging.getLogger(__name__)

# Module-level singleton: the active runtime logger, set by start().
autogen_logger = None
# True while a logging session is active; exposed through logging_enabled().
is_logging = False

# Generic alias for any callable, used to annotate logged functions.
F = TypeVar("F", bound=Callable[..., Any])
38
-
39
-
40
def start(
    logger: Optional[BaseLogger] = None,
    logger_type: Literal["sqlite", "file"] = "sqlite",
    config: Optional[Dict[str, Any]] = None,
) -> str:
    """
    Start logging for the runtime.
    Args:
        logger (BaseLogger): A logger instance
        logger_type (str): The type of logger to use (default: sqlite)
        config (dict): Configuration for the logger
    Returns:
        session_id (str(uuid.uuid4)): a unique id for the logging session,
        or None if the logger failed to start
    """
    global autogen_logger
    global is_logging

    if logger:
        autogen_logger = logger
    else:
        autogen_logger = LoggerFactory.get_logger(logger_type=logger_type, config=config)

    # BUG FIX: session_id was previously unbound when autogen_logger.start()
    # raised, so the `finally: return session_id` crashed with
    # UnboundLocalError instead of reporting the original failure.
    session_id = None
    try:
        session_id = autogen_logger.start()
        is_logging = True
    except Exception as e:
        # BUG FIX: the original called `logger.error(...)`, but `logger` here
        # is the (possibly None) parameter shadowing the module-level logger.
        logging.getLogger(__name__).error(f"[runtime logging] Failed to start logging: {e}")
    return session_id
69
-
70
-
71
def log_chat_completion(
    invocation_id: uuid.UUID,
    client_id: int,
    wrapper_id: int,
    agent: Union[str, Agent],
    request: Dict[str, Union[float, str, List[Dict[str, str]]]],
    response: Union[str, ChatCompletion],
    is_cached: int,
    cost: float,
    start_time: str,
) -> None:
    """Record a single LLM chat completion with the active runtime logger.

    Logs an error and does nothing when no logging session is active.
    """
    active = autogen_logger
    if active is None:
        logger.error("[runtime logging] log_chat_completion: autogen logger is None")
        return

    active.log_chat_completion(
        invocation_id, client_id, wrapper_id, agent, request, response, is_cached, cost, start_time
    )
89
-
90
-
91
def log_new_agent(agent: ConversableAgent, init_args: Dict[str, Any]) -> None:
    """Record the construction of a new agent with the active runtime logger."""
    if autogen_logger is not None:
        autogen_logger.log_new_agent(agent, init_args)
        return
    logger.error("[runtime logging] log_new_agent: autogen logger is None")
97
-
98
-
99
def log_event(source: Union[str, Agent], name: str, **kwargs: Dict[str, Any]) -> None:
    """Record a named runtime event (with arbitrary payload) via the active logger."""
    if autogen_logger is not None:
        autogen_logger.log_event(source, name, **kwargs)
        return
    logger.error("[runtime logging] log_event: autogen logger is None")
105
-
106
-
107
def log_function_use(agent: Union[str, Agent], function: F, args: Dict[str, Any], returns: Any) -> None:
    """Record a tool/function invocation with the active runtime logger.

    Args:
        agent: the agent (or its name) that used the function.
        function: the callable that was invoked.
        args: the arguments the function was called with.
        returns: the value the function returned.

    BUG FIX: `returns` was annotated with the builtin `any` (a function, not a
    type); `typing.Any` is the correct annotation. `-> None` added for
    consistency with the other log_* helpers.
    """
    if autogen_logger is None:
        logger.error("[runtime logging] log_function_use: autogen logger is None")
        return

    autogen_logger.log_function_use(agent, function, args, returns)
113
-
114
-
115
def log_new_wrapper(wrapper: OpenAIWrapper, init_args: Dict[str, Union[LLMConfig, List[LLMConfig]]]) -> None:
    """Record the construction of an OpenAIWrapper with the active runtime logger."""
    active = autogen_logger
    if active is None:
        logger.error("[runtime logging] log_new_wrapper: autogen logger is None")
        return
    active.log_new_wrapper(wrapper, init_args)
121
-
122
-
123
def log_new_client(
    client: Union[
        AzureOpenAI,
        OpenAI,
        CerebrasClient,
        GeminiClient,
        AnthropicClient,
        MistralAIClient,
        TogetherClient,
        GroqClient,
        CohereClient,
        OllamaClient,
        BedrockClient,
    ],
    wrapper: OpenAIWrapper,
    init_args: Dict[str, Any],
) -> None:
    """Record the creation of an LLM provider client with the active runtime logger."""
    active = autogen_logger
    if active is None:
        logger.error("[runtime logging] log_new_client: autogen logger is None")
        return
    active.log_new_client(client, wrapper, init_args)
145
-
146
-
147
def stop() -> None:
    """Stop the active runtime logger (if any) and mark logging as disabled."""
    global is_logging
    active = autogen_logger
    if active:
        active.stop()
    is_logging = False
152
-
153
-
154
def get_connection() -> Union[None, sqlite3.Connection]:
    """Return the active logger's database connection, or None when no logger is set."""
    if autogen_logger is not None:
        return autogen_logger.get_connection()
    logger.error("[runtime logging] get_connection: autogen logger is None")
    return None
160
-
161
-
162
def logging_enabled() -> bool:
    """Return True while a logging session is active (i.e. between start() and stop())."""
    return is_logging