MindsDB 25.7.2.0__py3-none-any.whl → 25.7.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +1 -1
- mindsdb/api/a2a/common/server/server.py +16 -6
- mindsdb/api/executor/command_executor.py +213 -137
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +5 -1
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +14 -3
- mindsdb/api/executor/planner/plan_join.py +3 -0
- mindsdb/api/executor/planner/plan_join_ts.py +117 -100
- mindsdb/api/executor/planner/query_planner.py +1 -0
- mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +54 -85
- mindsdb/api/http/initialize.py +16 -43
- mindsdb/api/http/namespaces/agents.py +24 -21
- mindsdb/api/http/namespaces/chatbots.py +83 -120
- mindsdb/api/http/namespaces/file.py +1 -1
- mindsdb/api/http/namespaces/jobs.py +38 -60
- mindsdb/api/http/namespaces/tree.py +69 -61
- mindsdb/api/mcp/start.py +2 -0
- mindsdb/api/mysql/mysql_proxy/utilities/dump.py +3 -2
- mindsdb/integrations/handlers/autogluon_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/autosklearn_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py +25 -5
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +3 -3
- mindsdb/integrations/handlers/flaml_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/google_calendar_handler/google_calendar_tables.py +82 -73
- mindsdb/integrations/handlers/hubspot_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +83 -76
- mindsdb/integrations/handlers/lightwood_handler/requirements.txt +4 -4
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +16 -3
- mindsdb/integrations/handlers/litellm_handler/settings.py +2 -1
- mindsdb/integrations/handlers/llama_index_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +106 -90
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +41 -39
- mindsdb/integrations/handlers/s3_handler/s3_handler.py +72 -70
- mindsdb/integrations/handlers/salesforce_handler/constants.py +208 -0
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +142 -81
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +12 -4
- mindsdb/integrations/handlers/slack_handler/slack_tables.py +141 -161
- mindsdb/integrations/handlers/tpot_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +32 -17
- mindsdb/integrations/handlers/web_handler/web_handler.py +19 -22
- mindsdb/integrations/handlers/youtube_handler/youtube_tables.py +183 -55
- mindsdb/integrations/libs/vectordatabase_handler.py +10 -1
- mindsdb/integrations/utilities/handler_utils.py +32 -12
- mindsdb/interfaces/agents/agents_controller.py +169 -110
- mindsdb/interfaces/agents/langchain_agent.py +10 -3
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +22 -8
- mindsdb/interfaces/database/database.py +38 -13
- mindsdb/interfaces/database/integrations.py +20 -5
- mindsdb/interfaces/database/projects.py +63 -16
- mindsdb/interfaces/database/views.py +86 -60
- mindsdb/interfaces/jobs/jobs_controller.py +103 -110
- mindsdb/interfaces/knowledge_base/controller.py +33 -5
- mindsdb/interfaces/knowledge_base/evaluate.py +53 -9
- mindsdb/interfaces/knowledge_base/executor.py +24 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +3 -3
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +21 -13
- mindsdb/interfaces/query_context/context_controller.py +100 -133
- mindsdb/interfaces/skills/skills_controller.py +18 -6
- mindsdb/interfaces/storage/db.py +40 -6
- mindsdb/interfaces/variables/variables_controller.py +8 -15
- mindsdb/utilities/config.py +3 -3
- mindsdb/utilities/functions.py +72 -60
- mindsdb/utilities/log.py +38 -6
- mindsdb/utilities/ps.py +7 -7
- {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/METADATA +262 -263
- {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/RECORD +69 -68
- {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/top_level.txt +0 -0
mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py
@@ -100,26 +100,25 @@ def parallel_get_all_website_links(urls) -> dict:
         return url_contents
 
     with concurrent.futures.ProcessPoolExecutor() as executor:
-        future_to_url = {
-            executor.submit(get_all_website_links, url): url for url in urls
-        }
+        future_to_url = {executor.submit(get_all_website_links, url): url for url in urls}
         for future in concurrent.futures.as_completed(future_to_url):
             url = future_to_url[future]
             try:
                 url_contents[url] = future.result()
             except Exception as exc:
-                logger.error(f
+                logger.error(f"{url} generated an exception: {exc}")
                 # don't raise the exception, just log it, continue processing other urls
 
     return url_contents
 
 
-def get_all_website_links(url) -> dict:
+def get_all_website_links(url, headers: dict = None) -> dict:
     """
     Fetch all website links from a URL.
 
     Args:
         url (str): the URL to fetch links from
+        headers (dict): a dictionary of headers to use when fetching links
 
     Returns:
         A dictionary containing the URL, the extracted links, the HTML content, the text content, and any error that occurred.
@@ -132,9 +131,12 @@ def get_all_website_links(url) -> dict:
     session = requests.Session()
 
     # Add headers to mimic a real browser request
-    headers
-
-
+    if headers is None:
+        headers = {}
+    if "User-Agent" not in headers:
+        headers["User-Agent"] = (
+            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.3"
+        )
 
     response = session.get(url, headers=headers)
     if "cookie" in response.request.headers:
@@ -157,7 +159,7 @@ def get_all_website_links(url) -> dict:
             continue
         href = urljoin(url, href)
         parsed_href = urlparse(href)
-        href = urlunparse((parsed_href.scheme, parsed_href.netloc, parsed_href.path,
+        href = urlunparse((parsed_href.scheme, parsed_href.netloc, parsed_href.path, "", "", ""))
         if not is_valid(href):
             continue
         if href in urls:
@@ -203,7 +205,15 @@ def get_readable_text_from_soup(soup) -> str:
     return html_converter.handle(str(soup))
 
 
-def get_all_website_links_recursively(
+def get_all_website_links_recursively(
+    url,
+    reviewed_urls,
+    limit=None,
+    crawl_depth: int = 1,
+    current_depth: int = 0,
+    filters: List[str] = None,
+    headers=None,
+):
     """
     Recursively gathers all links from a given website up to a specified limit.
 
@@ -227,7 +237,7 @@ def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_dept
         matches_filter = any(re.match(f, url) is not None for f in filters)
         if url not in reviewed_urls and matches_filter:
             try:
-                reviewed_urls[url] = get_all_website_links(url)
+                reviewed_urls[url] = get_all_website_links(url, headers=headers)
             except Exception as e:
                 error_message = traceback.format_exc().splitlines()[-1]
                 logger.error("An exception occurred: %s", str(e))
@@ -271,10 +281,14 @@ def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_dept
     reviewed_urls.update(new_revised_urls)
 
     for new_url in new_revised_urls:
-        get_all_website_links_recursively(
+        get_all_website_links_recursively(
+            new_url, reviewed_urls, limit, crawl_depth=crawl_depth, current_depth=current_depth + 1, filters=filters
+        )
 
 
-def get_all_websites(
+def get_all_websites(
+    urls, limit=1, html=False, crawl_depth: int = 1, filters: List[str] = None, headers: dict = None
+) -> pd.DataFrame:
     """
     Crawl a list of websites and return a DataFrame containing the results.
 
@@ -284,6 +298,7 @@ def get_all_websites(urls, limit=1, html=False, crawl_depth: int = 1, filters: L
         crawl_depth (int): Crawl depth for URLs.
         html (bool): a boolean indicating whether to include the HTML content in the results
        filters (List[str]): Crawl URLs that only match these regex patterns.
+        headers (dict): headers of request
 
     Returns:
         A DataFrame containing the results.
@@ -299,7 +314,9 @@ def get_all_websites(urls, limit=1, html=False, crawl_depth: int = 1, filters: L
         if urlparse(url).scheme == "":
             # Try HTTPS first
             url = "https://" + url
-        get_all_website_links_recursively(
+        get_all_website_links_recursively(
+            url, reviewed_urls, limit, crawl_depth=crawl_depth, filters=filters, headers=headers
+        )
 
     # Use a ThreadPoolExecutor to run the helper function in parallel.
     with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -311,9 +328,7 @@ def get_all_websites(urls, limit=1, html=False, crawl_depth: int = 1, filters: L
     columns_to_ignore = ["urls"]
     if html is False:
         columns_to_ignore += ["html_content"]
-    df = dict_to_dataframe(
-        reviewed_urls, columns_to_ignore=columns_to_ignore, index_name="url"
-    )
+    df = dict_to_dataframe(reviewed_urls, columns_to_ignore=columns_to_ignore, index_name="url")
 
     if not df.empty and df[df.error.isna()].empty:
         raise Exception(str(df.iloc[0].error))
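
The hunks above thread an optional headers dict through the crawl helpers so a caller can supply its own User-Agent, falling back to the built-in Chrome string otherwise. A minimal sketch of calling the updated helper directly, with a placeholder URL and User-Agent value (not from the diff itself):

    # Sketch only: the get_all_websites signature is taken from the diff above;
    # the URL and User-Agent string are illustrative placeholders.
    from mindsdb.integrations.handlers.web_handler.urlcrawl_helpers import get_all_websites

    df = get_all_websites(
        ["https://docs.mindsdb.com"],
        limit=1,
        crawl_depth=1,
        headers={"User-Agent": "my-crawler/1.0"},
    )
    print(df.head())
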
mindsdb/integrations/handlers/web_handler/web_handler.py
@@ -7,17 +7,11 @@ from mindsdb.utilities.security import validate_urls
 from .urlcrawl_helpers import get_all_websites
 
 from mindsdb.integrations.libs.api_handler import APIResource, APIHandler
-from mindsdb.integrations.utilities.sql_utils import
+from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
 
 
 class CrawlerTable(APIResource):
-
-    def list(
-        self,
-        conditions: List[FilterCondition] = None,
-        limit: int = None,
-        **kwargs
-    ) -> pd.DataFrame:
+    def list(self, conditions: List[FilterCondition] = None, limit: int = None, **kwargs) -> pd.DataFrame:
         """
         Selects data from the provided websites
 
@@ -30,27 +24,34 @@ class CrawlerTable(APIResource):
         urls = []
         crawl_depth = None
         per_url_limit = None
+        headers = {}
         for condition in conditions:
-            if condition.column ==
+            if condition.column == "url":
                 if condition.op == FilterOperator.IN:
                     urls = condition.value
                 elif condition.op == FilterOperator.EQUAL:
                     urls = [condition.value]
                 condition.applied = True
-            if condition.column ==
+            if condition.column == "crawl_depth" and condition.op == FilterOperator.EQUAL:
                 crawl_depth = condition.value
                 condition.applied = True
-            if condition.column ==
+            if condition.column == "per_url_limit" and condition.op == FilterOperator.EQUAL:
                 per_url_limit = condition.value
                 condition.applied = True
+            if condition.column.lower() == "user_agent" and condition.op == FilterOperator.EQUAL:
+                headers["User-Agent"] = condition.value
+                condition.applied = True
 
         if len(urls) == 0:
             raise NotImplementedError(
-                'You must specify what url you want to crawl, for example: SELECT * FROM web.crawler WHERE url = "someurl"'
+                'You must specify what url you want to crawl, for example: SELECT * FROM web.crawler WHERE url = "someurl"'
+            )
 
-        allowed_urls = config.get(
+        allowed_urls = config.get("web_crawling_allowed_sites", [])
         if allowed_urls and not validate_urls(urls, allowed_urls):
-            raise ValueError(
+            raise ValueError(
+                f"The provided URL is not allowed for web crawling. Please use any of {', '.join(allowed_urls)}."
+            )
 
         if limit is None and per_url_limit is None and crawl_depth is None:
             per_url_limit = 1
@@ -58,10 +59,10 @@ class CrawlerTable(APIResource):
             # crawl every url separately
             results = []
             for url in urls:
-                results.append(get_all_websites([url], per_url_limit, crawl_depth=crawl_depth))
+                results.append(get_all_websites([url], per_url_limit, crawl_depth=crawl_depth, headers=headers))
             result = pd.concat(results)
         else:
-            result = get_all_websites(urls, limit, crawl_depth=crawl_depth)
+            result = get_all_websites(urls, limit, crawl_depth=crawl_depth, headers=headers)
 
         if limit is not None and len(result) > limit:
             result = result[:limit]
@@ -72,11 +73,7 @@ class CrawlerTable(APIResource):
         """
         Returns the columns of the crawler table
         """
-        return [
-            'url',
-            'text_content',
-            'error'
-        ]
+        return ["url", "text_content", "error"]
 
 
 class WebHandler(APIHandler):
@@ -87,7 +84,7 @@ class WebHandler(APIHandler):
     def __init__(self, name=None, **kwargs):
         super().__init__(name)
         crawler = CrawlerTable(self)
-        self._register_table(
+        self._register_table("crawler", crawler)
 
     def check_connection(self) -> HandlerStatusResponse:
         """
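
With CrawlerTable.list updated as above, a user_agent equality filter in the WHERE clause is translated into a User-Agent request header before crawling starts. A rough sketch of the query shape this enables; the web.crawler table name comes from the handler's own error message, while the URL and agent string are placeholders:

    # Illustrative SQL only, shown as a plain string; it is not executed here.
    query = """
    SELECT url, text_content
    FROM web.crawler
    WHERE url = 'https://docs.mindsdb.com'
      AND user_agent = 'my-crawler/1.0'
      AND per_url_limit = 1;
    """
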
mindsdb/integrations/handlers/youtube_handler/youtube_tables.py
@@ -7,7 +7,7 @@ from mindsdb_sql_parser import ast
 from mindsdb.integrations.utilities.handlers.query_utilities import (
     SELECTQueryParser,
     SELECTQueryExecutor,
-    INSERTQueryParser
+    INSERTQueryParser,
 )
 
 import pandas as pd
@@ -66,9 +66,13 @@ class YoutubeCommentsTable(APITable):
         select_statement_executor = SELECTQueryExecutor(
             comments_df,
             selected_columns,
-            [
+            [
+                where_condition
+                for where_condition in where_conditions
+                if where_condition[1] not in ["video_id", "channel_id"]
+            ],
             order_by_conditions,
-            result_limit if query.limit else None
+            result_limit if query.limit else None,
         )
 
         comments_df = select_statement_executor.execute_query()
@@ -98,50 +102,30 @@ class YoutubeCommentsTable(APITable):
         values_to_insert = insert_query_parser.parse_query()
 
         for value in values_to_insert:
-            if not value.get(
-                if not value.get(
+            if not value.get("comment_id"):
+                if not value.get("comment"):
                     raise ValueError("comment is mandatory for inserting a top-level comment.")
                 else:
-                    self.insert_comment(video_id=value[
+                    self.insert_comment(video_id=value["video_id"], text=value["comment"])
 
             else:
-                if not value.get(
+                if not value.get("reply"):
                     raise ValueError("reply is mandatory for inserting a reply.")
                 else:
-                    self.insert_comment(comment_id=value[
+                    self.insert_comment(comment_id=value["comment_id"], text=value["reply"])
 
     def insert_comment(self, text, video_id: str = None, comment_id: str = None):
         # if comment_id is provided, define the request body for a reply and insert it
         if comment_id:
-            request_body = {
-                'snippet': {
-                    'parentId': comment_id,
-                    'textOriginal': text
-                }
-            }
+            request_body = {"snippet": {"parentId": comment_id, "textOriginal": text}}
 
-            self.handler.connect().comments().insert(
-                part='snippet',
-                body=request_body
-            ).execute()
+            self.handler.connect().comments().insert(part="snippet", body=request_body).execute()
 
         # else if video_id is provided, define the request body for a top-level comment and insert it
         elif video_id:
-            request_body = {
-                'snippet': {
-                    'topLevelComment': {
-                        'snippet': {
-                            'videoId': video_id,
-                            'textOriginal': text
-                        }
-                    }
-                }
-            }
+            request_body = {"snippet": {"topLevelComment": {"snippet": {"videoId": video_id, "textOriginal": text}}}}
 
-            self.handler.connect().commentThreads().insert(
-                part='snippet',
-                body=request_body
-            ).execute()
+            self.handler.connect().commentThreads().insert(part="snippet", body=request_body).execute()
 
     def get_columns(self) -> List[str]:
         """Gets all columns to be returned in pandas DataFrame responses
@@ -150,7 +134,19 @@ class YoutubeCommentsTable(APITable):
         List[str]
             List of columns
         """
-        return [
+        return [
+            "comment_id",
+            "channel_id",
+            "video_id",
+            "user_id",
+            "display_name",
+            "comment",
+            "published_at",
+            "updated_at",
+            "reply_user_id",
+            "reply_author",
+            "reply",
+        ]
 
     def get_comments(self, video_id: str, channel_id: str):
         """Pulls all the records from the given youtube api end point and returns it select()
@@ -166,7 +162,12 @@ class YoutubeCommentsTable(APITable):
         resource = (
             self.handler.connect()
             .commentThreads()
-            .list(
+            .list(
+                part="snippet, replies",
+                videoId=video_id,
+                allThreadsRelatedToChannelId=channel_id,
+                textFormat="plainText",
+            )
         )
 
         data = []
@@ -175,7 +176,7 @@ class YoutubeCommentsTable(APITable):
 
         for comment in comments["items"]:
             replies = []
-            if
+            if "replies" in comment:
                 for reply in comment["replies"]["comments"]:
                     replies.append(
                         {
@@ -222,18 +223,51 @@ class YoutubeCommentsTable(APITable):
             else:
                 break
 
-        youtube_comments_df = pd.json_normalize(
-
+        youtube_comments_df = pd.json_normalize(
+            data,
+            "replies",
+            [
+                "comment_id",
+                "channel_id",
+                "video_id",
+                "user_id",
+                "display_name",
+                "comment",
+                "published_at",
+                "updated_at",
+            ],
+            record_prefix="replies.",
+        )
+        youtube_comments_df = youtube_comments_df.rename(
+            columns={
+                "replies.user_id": "reply_user_id",
+                "replies.reply_author": "reply_author",
+                "replies.reply": "reply",
+            }
+        )
 
         # check if DataFrame is empty
         if youtube_comments_df.empty:
             return youtube_comments_df
         else:
-            return youtube_comments_df[
+            return youtube_comments_df[
+                [
+                    "comment_id",
+                    "channel_id",
+                    "video_id",
+                    "user_id",
+                    "display_name",
+                    "comment",
+                    "published_at",
+                    "updated_at",
+                    "reply_user_id",
+                    "reply_author",
+                    "reply",
+                ]
+            ]
 
 
 class YoutubeChannelsTable(APITable):
-
     """Youtube Channel Info by channel id Table implementation"""
 
     def select(self, query: ast.Select) -> pd.DataFrame:
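
In the hunk above, comment threads are now flattened with pandas.json_normalize: the per-reply record columns get a "replies." prefix and the listed top-level fields ride along as meta columns before being renamed. A toy reproduction of that flattening, using a shortened field list and made-up values:

    import pandas as pd

    data = [
        {
            "comment_id": "c1",
            "video_id": "v1",
            "comment": "top-level text",
            "replies": [{"user_id": "u2", "reply_author": "Bob", "reply": "reply text"}],
        }
    ]

    # Record columns come from the "replies" list; meta columns are copied per reply.
    df = pd.json_normalize(data, "replies", ["comment_id", "video_id", "comment"], record_prefix="replies.")
    df = df.rename(
        columns={
            "replies.user_id": "reply_user_id",
            "replies.reply_author": "reply_author",
            "replies.reply": "reply",
        }
    )
    print(df.columns.tolist())
    # ['reply_user_id', 'reply_author', 'reply', 'comment_id', 'video_id', 'comment']
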
mindsdb/integrations/handlers/youtube_handler/youtube_tables.py (continued)
@@ -263,9 +297,9 @@ class YoutubeChannelsTable(APITable):
         select_statement_executor = SELECTQueryExecutor(
             channel_df,
             selected_columns,
-            [where_condition for where_condition in where_conditions if where_condition[1] ==
+            [where_condition for where_condition in where_conditions if where_condition[1] == "channel_id"],
             order_by_conditions,
-            result_limit if query.limit else None
+            result_limit if query.limit else None,
         )
 
         channel_df = select_statement_executor.execute_query()
@@ -304,7 +338,6 @@ class YoutubeChannelsTable(APITable):
 
 
 class YoutubeVideosTable(APITable):
-
     """Youtube Video info by video id Table implementation"""
 
     def select(self, query: ast.Select) -> pd.DataFrame:
@@ -317,7 +350,7 @@ class YoutubeVideosTable(APITable):
             result_limit,
         ) = select_statement_parser.parse_query()
 
-        video_id, channel_id = None, None
+        video_id, channel_id, search_query = None, None, None
         for op, arg1, arg2 in where_conditions:
             if arg1 == "video_id":
                 if op == "=":
@@ -331,38 +364,126 @@ class YoutubeVideosTable(APITable):
                 else:
                     raise NotImplementedError("Only '=' operator is supported for channel_id column.")
 
-
-
+            elif arg1 == "query":
+                if op == "=":
+                    search_query = arg2
+                else:
+                    raise NotImplementedError("Only '=' operator is supported for query column.")
+
+        if not video_id and not channel_id and not search_query:
+            raise ValueError("At least one of video_id, channel_id, or query must be present in the WHERE clause.")
 
         if video_id:
             video_df = self.get_videos_by_video_ids([video_id])
+        elif channel_id and search_query:
+            video_df = self.get_videos_by_search_query_in_channel(search_query, channel_id, result_limit)
+        elif channel_id:
+            video_df = self.get_videos_by_channel_id(channel_id, result_limit)
         else:
-            video_df = self.
+            video_df = self.get_videos_by_search_query(search_query, result_limit)
 
         select_statement_executor = SELECTQueryExecutor(
             video_df,
             selected_columns,
-            [
+            [
+                where_condition
+                for where_condition in where_conditions
+                if where_condition[1] not in ["video_id", "channel_id", "query"]
+            ],
             order_by_conditions,
-            result_limit if query.limit else None
+            result_limit if query.limit else None,
         )
 
         video_df = select_statement_executor.execute_query()
 
         return video_df
 
-    def
+    def get_videos_by_search_query(self, search_query, limit=10):
         video_ids = []
         resource = (
             self.handler.connect()
             .search()
-            .list(part="snippet",
+            .list(part="snippet", q=search_query, type="video", maxResults=min(50, limit))
         )
-
+        total_fetched = 0
+
+        while resource and total_fetched < limit:
+            response = resource.execute()
+            for item in response["items"]:
+                video_ids.append(item["id"]["videoId"])
+                total_fetched += 1
+                if total_fetched >= limit:
+                    break
+
+            if "nextPageToken" in response and total_fetched < limit:
+                resource = (
+                    self.handler.connect()
+                    .search()
+                    .list(
+                        part="snippet",
+                        q=search_query,
+                        type="video",
+                        maxResults=min(50, limit - total_fetched),
+                        pageToken=response["nextPageToken"],
+                    )
+                )
+            else:
+                break
+
+        return self.get_videos_by_video_ids(video_ids)
+
+    def get_videos_by_search_query_in_channel(self, search_query, channel_id, limit=10):
+        """Search for videos within a specific channel"""
+        video_ids = []
+        resource = (
+            self.handler.connect()
+            .search()
+            .list(part="snippet", q=search_query, channelId=channel_id, type="video", maxResults=min(50, limit))
+        )
+        total_fetched = 0
+
+        while resource and total_fetched < limit:
+            response = resource.execute()
+            for item in response["items"]:
+                video_ids.append(item["id"]["videoId"])
+                total_fetched += 1
+                if total_fetched >= limit:
+                    break
+
+            if "nextPageToken" in response and total_fetched < limit:
+                resource = (
+                    self.handler.connect()
+                    .search()
+                    .list(
+                        part="snippet",
+                        q=search_query,
+                        channelId=channel_id,
+                        type="video",
+                        maxResults=min(50, limit - total_fetched),
+                        pageToken=response["nextPageToken"],
+                    )
+                )
+            else:
+                break
+
+        return self.get_videos_by_video_ids(video_ids)
+
+    def get_videos_by_channel_id(self, channel_id, limit=10):
+        video_ids = []
+        resource = (
+            self.handler.connect()
+            .search()
+            .list(part="snippet", channelId=channel_id, type="video", maxResults=min(50, limit))
+        )
+        total_fetched = 0
+        while resource and total_fetched < limit:
             response = resource.execute()
             for item in response["items"]:
                 video_ids.append(item["id"]["videoId"])
-
+                total_fetched += 1
+                if total_fetched >= limit:
+                    break
+            if "nextPageToken" in response and total_fetched < limit:
                 resource = (
                     self.handler.connect()
                     .search()
@@ -370,6 +491,7 @@ class YoutubeVideosTable(APITable):
                         part="snippet",
                         channelId=channel_id,
                         type="video",
+                        maxResults=min(50, limit - total_fetched),
                         pageToken=response["nextPageToken"],
                     )
                 )
@@ -388,7 +510,13 @@ class YoutubeVideosTable(APITable):
         # loop over 50 video ids at a time
         # an invalid request error is caused otherwise
         for i in range(0, len(video_ids), 50):
-            resource =
+            resource = (
+                self.handler.connect()
+                .videos()
+                .list(part="statistics,snippet,contentDetails", id=",".join(video_ids[i : i + 50]))
+                .execute()
+            )
+
             for item in resource["items"]:
                 data.append(
                     {
@@ -415,7 +543,7 @@ class YoutubeVideosTable(APITable):
                 return json_formatted_transcript
 
         except Exception as e:
-            logger.error(f"Encountered an error while fetching transcripts for video ${video_id}: ${e}"),
+            (logger.error(f"Encountered an error while fetching transcripts for video ${video_id}: ${e}"),)
            return "Transcript not available for this video"
 
     def parse_duration(self, video_id, duration):
@@ -428,7 +556,7 @@ class YoutubeVideosTable(APITable):
 
            return duration_str.strip(":")
        except Exception as e:
-            logger.error(f"Encountered an error while parsing duration for video ${video_id}: ${e}"),
+            (logger.error(f"Encountered an error while parsing duration for video ${video_id}: ${e}"),)
            return "Duration not available for this video"
 
     def get_columns(self) -> List[str]:
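
All three get_videos_by_* helpers above share one pattern: page through search().list with nextPageToken, request at most 50 results per page (the API maximum), and stop once limit ids are collected. A standalone sketch of that loop against the YouTube Data API, assuming an API key in a YOUTUBE_API_KEY environment variable (the handler itself obtains its client through self.handler.connect()):

    import os

    from googleapiclient.discovery import build  # google-api-python-client

    def search_video_ids(query: str, limit: int = 10) -> list:
        """Collect up to `limit` video ids for a search query, at most 50 per page."""
        youtube = build("youtube", "v3", developerKey=os.environ["YOUTUBE_API_KEY"])
        video_ids, page_token = [], None
        while len(video_ids) < limit:
            params = {
                "part": "snippet",
                "q": query,
                "type": "video",
                "maxResults": min(50, limit - len(video_ids)),
            }
            if page_token:
                params["pageToken"] = page_token
            response = youtube.search().list(**params).execute()
            video_ids += [item["id"]["videoId"] for item in response["items"]]
            page_token = response.get("nextPageToken")
            if not page_token:
                break
        return video_ids[:limit]
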
mindsdb/integrations/libs/vectordatabase_handler.py
@@ -334,12 +334,21 @@ class VectorStoreHandler(BaseHandler):
 
         if not df_update.empty:
             # get values of existed `created_at` and return them to metadata
-
+            origin_id_col = "_original_doc_id"
+
+            created_dates, ids = {}, {}
+            for _, row in df_existed.iterrows():
+                chunk_id = row[id_col]
+                created_dates[chunk_id] = row[metadata_col].get("_created_at")
+                ids[chunk_id] = row[metadata_col].get(origin_id_col)
 
             def keep_created_at(row):
                 val = created_dates.get(row[id_col])
                 if val:
                     row[metadata_col]["_created_at"] = val
+                # keep id column
+                if origin_id_col not in row[metadata_col]:
+                    row[metadata_col][origin_id_col] = ids.get(row[id_col])
                 return row
 
             df_update.apply(keep_created_at, axis=1)