alita-sdk 0.3.314__py3-none-any.whl → 0.3.315__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alita_sdk/tools/browser/__init__.py +28 -5
- alita_sdk/tools/browser/crawler.py +4 -1
- alita_sdk/tools/browser/utils.py +16 -7
- {alita_sdk-0.3.314.dist-info → alita_sdk-0.3.315.dist-info}/METADATA +1 -1
- {alita_sdk-0.3.314.dist-info → alita_sdk-0.3.315.dist-info}/RECORD +8 -8
- {alita_sdk-0.3.314.dist-info → alita_sdk-0.3.315.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.314.dist-info → alita_sdk-0.3.315.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.314.dist-info → alita_sdk-0.3.315.dist-info}/top_level.txt +0 -0
@@ -12,6 +12,8 @@ from ..utils import get_max_toolkit_length, clean_string, TOOLKIT_SPLITTER
|
|
12
12
|
from ...configurations.browser import BrowserConfiguration
|
13
13
|
from logging import getLogger
|
14
14
|
|
15
|
+
from ...configurations.pgvector import PgVectorConfiguration
|
16
|
+
|
15
17
|
logger = getLogger(__name__)
|
16
18
|
|
17
19
|
name = "browser"
|
@@ -21,6 +23,8 @@ def get_tools(tool):
|
|
21
23
|
return BrowserToolkit().get_toolkit(
|
22
24
|
selected_tools=tool['settings'].get('selected_tools', []),
|
23
25
|
browser_configuration=tool['settings']['browser_configuration'],
|
26
|
+
pgvector_configuration=tool['settings'].get('pgvector_configuration', {}),
|
27
|
+
embedding_model=tool['settings'].get('embedding_model'),
|
24
28
|
toolkit_name=tool.get('toolkit_name', '')
|
25
29
|
).get_tools()
|
26
30
|
|
@@ -51,8 +55,21 @@ class BrowserToolkit(BaseToolkit):
|
|
51
55
|
|
52
56
|
return create_model(
|
53
57
|
name,
|
54
|
-
__config__=ConfigDict(json_schema_extra={'metadata': {"label": "Browser", "icon_url": None,
|
55
|
-
|
58
|
+
__config__=ConfigDict(json_schema_extra={'metadata': {"label": "Browser", "icon_url": None,
|
59
|
+
"categories": ["testing"],
|
60
|
+
"extra_categories": [
|
61
|
+
"web scraping", "search", "crawler"
|
62
|
+
]}}),
|
63
|
+
browser_configuration=(Optional[BrowserConfiguration],
|
64
|
+
Field(description="Browser Configuration (required for tools and `google`)",
|
65
|
+
default=None, json_schema_extra={'configuration_types': ['browser']})),
|
66
|
+
pgvector_configuration=(Optional[PgVectorConfiguration],
|
67
|
+
Field(description="PgVector configuration (required for tools `multi_url_crawler`)",
|
68
|
+
default=None, json_schema_extra={'configuration_types': ['pgvector']})),
|
69
|
+
embedding_model=(Optional[str],
|
70
|
+
Field(default=None,
|
71
|
+
description="Embedding configuration (required for tools `multi_url_crawler`)",
|
72
|
+
json_schema_extra={'configuration_model': 'embedding'})),
|
56
73
|
selected_tools=(List[Literal[tuple(selected_tools)]],
|
57
74
|
Field(default=[], json_schema_extra={'args_schemas': selected_tools})),
|
58
75
|
__validators__={
|
@@ -65,9 +82,15 @@ class BrowserToolkit(BaseToolkit):
|
|
65
82
|
if selected_tools is None:
|
66
83
|
selected_tools = []
|
67
84
|
|
68
|
-
|
85
|
+
wrapper_payload_google = {
|
69
86
|
**kwargs,
|
70
87
|
**kwargs.get('browser_configuration', {}),
|
88
|
+
**kwargs.get('pgvector_configuration', {}),
|
89
|
+
}
|
90
|
+
|
91
|
+
wrapper_payload_rag_based = {
|
92
|
+
**kwargs,
|
93
|
+
**kwargs.get('pgvector_configuration', {}),
|
71
94
|
}
|
72
95
|
|
73
96
|
tools = []
|
@@ -85,7 +108,7 @@ class BrowserToolkit(BaseToolkit):
|
|
85
108
|
if tool == 'single_url_crawler':
|
86
109
|
tool_entry = SingleURLCrawler()
|
87
110
|
elif tool == 'multi_url_crawler':
|
88
|
-
tool_entry = MultiURLCrawler()
|
111
|
+
tool_entry = MultiURLCrawler(**wrapper_payload_rag_based)
|
89
112
|
elif tool == 'get_html_content':
|
90
113
|
tool_entry = GetHTMLContent()
|
91
114
|
elif tool == 'get_pdf_content':
|
@@ -93,7 +116,7 @@ class BrowserToolkit(BaseToolkit):
|
|
93
116
|
elif tool == 'google':
|
94
117
|
try:
|
95
118
|
google_api_wrapper = GoogleSearchAPIWrapper(
|
96
|
-
**
|
119
|
+
**wrapper_payload_google
|
97
120
|
)
|
98
121
|
tool_entry = GoogleSearchResults(api_wrapper=google_api_wrapper)
|
99
122
|
# rename the tool to avoid conflicts
|
@@ -27,13 +27,16 @@ class MultiURLCrawler(BaseTool):
|
|
27
27
|
max_response_size: int = 3000
|
28
28
|
name: str = "multi_url_crawler"
|
29
29
|
description: str = "Crawls multiple URLs and returns the content related to query"
|
30
|
+
embedding_model: str = None
|
31
|
+
connection_string: str = None
|
30
32
|
args_schema: Type[BaseModel] = create_model("MultiURLCrawlerModel",
|
31
33
|
query=(str, Field(description="Query text to search pages")),
|
32
34
|
urls=(list[str], Field(description="list of URLs to search like ['url1', 'url2']")))
|
33
35
|
|
34
36
|
def _run(self, query: str, urls: list[str], run_manager=None):
|
35
37
|
urls = [url.strip() for url in urls]
|
36
|
-
return webRag(urls, self.max_response_size, query
|
38
|
+
return webRag(urls=urls, max_response_size=self.max_response_size, query=query,
|
39
|
+
connection_string=self.connection_string, embedding_model=self.embedding_model)
|
37
40
|
|
38
41
|
|
39
42
|
class GetHTMLContent(BaseTool):
|
alita_sdk/tools/browser/utils.py
CHANGED
@@ -6,9 +6,9 @@ from langchain.text_splitter import CharacterTextSplitter
|
|
6
6
|
import fitz
|
7
7
|
|
8
8
|
try:
|
9
|
-
from
|
9
|
+
from langchain_postgres import PGVector
|
10
10
|
except ImportError:
|
11
|
-
|
11
|
+
PGVector = None
|
12
12
|
|
13
13
|
from langchain_community.embeddings.sentence_transformer import (
|
14
14
|
SentenceTransformerEmbeddings,
|
@@ -32,13 +32,22 @@ def get_page(urls, html_only=False):
|
|
32
32
|
return docs_transformed
|
33
33
|
|
34
34
|
|
35
|
-
def webRag(urls, max_response_size, query):
|
36
|
-
if
|
37
|
-
return "
|
35
|
+
def webRag(urls, max_response_size, query, connection_string=None, embedding_model=None):
|
36
|
+
if PGVector is None:
|
37
|
+
return "PGVector is not initialized. Web rag is not available."
|
38
|
+
|
39
|
+
if not connection_string or not embedding_model:
|
40
|
+
return "Connection string or embedding model is missing. Web rag is not available."
|
38
41
|
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
|
39
42
|
docs = text_splitter.split_documents(get_page(urls))
|
40
|
-
embedding_function = SentenceTransformerEmbeddings(model_name=
|
41
|
-
db =
|
43
|
+
embedding_function = SentenceTransformerEmbeddings(model_name=embedding_model)
|
44
|
+
db = PGVector.from_documents(
|
45
|
+
documents=docs,
|
46
|
+
embedding=embedding_function,
|
47
|
+
collection_name="web_rag",
|
48
|
+
pre_delete_collection=True,
|
49
|
+
connection=connection_string
|
50
|
+
)
|
42
51
|
docs = db.search(query, "mmr", k=10)
|
43
52
|
text = ""
|
44
53
|
for doc in docs:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: alita_sdk
|
3
|
-
Version: 0.3.
|
3
|
+
Version: 0.3.315
|
4
4
|
Summary: SDK for building langchain agents using resources from Alita
|
5
5
|
Author-email: Artem Rozumenko <artyom.rozumenko@gmail.com>, Mikalai Biazruchka <mikalai_biazruchka@epam.com>, Roman Mitusov <roman_mitusov@epam.com>, Ivan Krakhmaliuk <lifedj27@gmail.com>, Artem Dubrovskiy <ad13box@gmail.com>
|
6
6
|
License-Expression: Apache-2.0
|
@@ -162,11 +162,11 @@ alita_sdk/tools/bitbucket/__init__.py,sha256=2VAY45Jij5dHkz6UGTmsEmOcLeJMWmcX-Wr
|
|
162
162
|
alita_sdk/tools/bitbucket/api_wrapper.py,sha256=xHrluV2aCckOK_lGd42fFz1c-pyuZAnC-A_v1SKiM5g,20006
|
163
163
|
alita_sdk/tools/bitbucket/bitbucket_constants.py,sha256=UsbhQ1iEvrKoxceTFPWTYhaXS1zSxbmjs1TwY0-P4gw,462
|
164
164
|
alita_sdk/tools/bitbucket/cloud_api_wrapper.py,sha256=QHdud-d3xcz3mOP3xb1Htk1sv9QFg7bTm1szdN_zohQ,15517
|
165
|
-
alita_sdk/tools/browser/__init__.py,sha256=
|
166
|
-
alita_sdk/tools/browser/crawler.py,sha256=
|
165
|
+
alita_sdk/tools/browser/__init__.py,sha256=faLmuMt2CzCPMmxquGcdV-TGAbKxHi7sTQyuK0VKYNs,6760
|
166
|
+
alita_sdk/tools/browser/crawler.py,sha256=Ah0tyF7lKGJIlxMY4MXEQmuDehaB_I-FmECxG27DnPw,2476
|
167
167
|
alita_sdk/tools/browser/duck_duck_go_search.py,sha256=iKws923v34o-ySXohJw-8xTDBWlj3fMsnzC_ZRuPugE,2002
|
168
168
|
alita_sdk/tools/browser/google_search_rag.py,sha256=QVHFbVwymiJGuno_HLSJOK1c_MpgMdBSTYQKf6fLRk8,1838
|
169
|
-
alita_sdk/tools/browser/utils.py,sha256=
|
169
|
+
alita_sdk/tools/browser/utils.py,sha256=J4-ZSb5TeCJnYJTsPKUOyiOC_vfTye3QtZA-T_AYEoA,2853
|
170
170
|
alita_sdk/tools/browser/wiki.py,sha256=Qh3HBFd4dkS2VavXbFJOm4b8SjVSIe5xSD7CY1vEkKE,1126
|
171
171
|
alita_sdk/tools/carrier/__init__.py,sha256=Ove5wAXBxyLS5F5ZxgydV2xKZJIR3OoMB5fMkn8jNUc,4296
|
172
172
|
alita_sdk/tools/carrier/api_wrapper.py,sha256=tP7oR_U0HX1rxqat0Jkz6oh3RB9BEr1ESKQ9J8OWDcE,9093
|
@@ -349,8 +349,8 @@ alita_sdk/tools/zephyr_scale/api_wrapper.py,sha256=A6CUEKjENt3mZlPU9lai88WV9esCD
|
|
349
349
|
alita_sdk/tools/zephyr_squad/__init__.py,sha256=0ne8XLJEQSLOWfzd2HdnqOYmQlUliKHbBED5kW_Vias,2895
|
350
350
|
alita_sdk/tools/zephyr_squad/api_wrapper.py,sha256=kmw_xol8YIYFplBLWTqP_VKPRhL_1ItDD0_vXTe_UuI,14906
|
351
351
|
alita_sdk/tools/zephyr_squad/zephyr_squad_cloud_client.py,sha256=R371waHsms4sllHCbijKYs90C-9Yu0sSR3N4SUfQOgU,5066
|
352
|
-
alita_sdk-0.3.
|
353
|
-
alita_sdk-0.3.
|
354
|
-
alita_sdk-0.3.
|
355
|
-
alita_sdk-0.3.
|
356
|
-
alita_sdk-0.3.
|
352
|
+
alita_sdk-0.3.315.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
353
|
+
alita_sdk-0.3.315.dist-info/METADATA,sha256=iwfMJ3l6wJ6OAHVEJWZ1xIUsQgfKlcgUzxdqUvS4POA,18897
|
354
|
+
alita_sdk-0.3.315.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
355
|
+
alita_sdk-0.3.315.dist-info/top_level.txt,sha256=0vJYy5p_jK6AwVb1aqXr7Kgqgk3WDtQ6t5C-XI9zkmg,10
|
356
|
+
alita_sdk-0.3.315.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|