MindsDB 25.7.2.0-py3-none-any.whl → 25.7.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of MindsDB might be problematic.

Files changed (69)
  1. mindsdb/__about__.py +1 -1
  2. mindsdb/__main__.py +1 -1
  3. mindsdb/api/a2a/common/server/server.py +16 -6
  4. mindsdb/api/executor/command_executor.py +213 -137
  5. mindsdb/api/executor/datahub/datanodes/integration_datanode.py +5 -1
  6. mindsdb/api/executor/datahub/datanodes/project_datanode.py +14 -3
  7. mindsdb/api/executor/planner/plan_join.py +3 -0
  8. mindsdb/api/executor/planner/plan_join_ts.py +117 -100
  9. mindsdb/api/executor/planner/query_planner.py +1 -0
  10. mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +54 -85
  11. mindsdb/api/http/initialize.py +16 -43
  12. mindsdb/api/http/namespaces/agents.py +24 -21
  13. mindsdb/api/http/namespaces/chatbots.py +83 -120
  14. mindsdb/api/http/namespaces/file.py +1 -1
  15. mindsdb/api/http/namespaces/jobs.py +38 -60
  16. mindsdb/api/http/namespaces/tree.py +69 -61
  17. mindsdb/api/mcp/start.py +2 -0
  18. mindsdb/api/mysql/mysql_proxy/utilities/dump.py +3 -2
  19. mindsdb/integrations/handlers/autogluon_handler/requirements.txt +1 -1
  20. mindsdb/integrations/handlers/autosklearn_handler/requirements.txt +1 -1
  21. mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py +25 -5
  22. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +3 -3
  23. mindsdb/integrations/handlers/flaml_handler/requirements.txt +1 -1
  24. mindsdb/integrations/handlers/google_calendar_handler/google_calendar_tables.py +82 -73
  25. mindsdb/integrations/handlers/hubspot_handler/requirements.txt +1 -1
  26. mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +83 -76
  27. mindsdb/integrations/handlers/lightwood_handler/requirements.txt +4 -4
  28. mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +16 -3
  29. mindsdb/integrations/handlers/litellm_handler/settings.py +2 -1
  30. mindsdb/integrations/handlers/llama_index_handler/requirements.txt +1 -1
  31. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +106 -90
  32. mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +41 -39
  33. mindsdb/integrations/handlers/s3_handler/s3_handler.py +72 -70
  34. mindsdb/integrations/handlers/salesforce_handler/constants.py +208 -0
  35. mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +142 -81
  36. mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +12 -4
  37. mindsdb/integrations/handlers/slack_handler/slack_tables.py +141 -161
  38. mindsdb/integrations/handlers/tpot_handler/requirements.txt +1 -1
  39. mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +32 -17
  40. mindsdb/integrations/handlers/web_handler/web_handler.py +19 -22
  41. mindsdb/integrations/handlers/youtube_handler/youtube_tables.py +183 -55
  42. mindsdb/integrations/libs/vectordatabase_handler.py +10 -1
  43. mindsdb/integrations/utilities/handler_utils.py +32 -12
  44. mindsdb/interfaces/agents/agents_controller.py +169 -110
  45. mindsdb/interfaces/agents/langchain_agent.py +10 -3
  46. mindsdb/interfaces/data_catalog/data_catalog_loader.py +22 -8
  47. mindsdb/interfaces/database/database.py +38 -13
  48. mindsdb/interfaces/database/integrations.py +20 -5
  49. mindsdb/interfaces/database/projects.py +63 -16
  50. mindsdb/interfaces/database/views.py +86 -60
  51. mindsdb/interfaces/jobs/jobs_controller.py +103 -110
  52. mindsdb/interfaces/knowledge_base/controller.py +33 -5
  53. mindsdb/interfaces/knowledge_base/evaluate.py +53 -9
  54. mindsdb/interfaces/knowledge_base/executor.py +24 -0
  55. mindsdb/interfaces/knowledge_base/llm_client.py +3 -3
  56. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +21 -13
  57. mindsdb/interfaces/query_context/context_controller.py +100 -133
  58. mindsdb/interfaces/skills/skills_controller.py +18 -6
  59. mindsdb/interfaces/storage/db.py +40 -6
  60. mindsdb/interfaces/variables/variables_controller.py +8 -15
  61. mindsdb/utilities/config.py +3 -3
  62. mindsdb/utilities/functions.py +72 -60
  63. mindsdb/utilities/log.py +38 -6
  64. mindsdb/utilities/ps.py +7 -7
  65. {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/METADATA +262 -263
  66. {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/RECORD +69 -68
  67. {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/WHEEL +0 -0
  68. {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/licenses/LICENSE +0 -0
  69. {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/top_level.txt +0 -0
mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py

@@ -100,26 +100,25 @@ def parallel_get_all_website_links(urls) -> dict:
  return url_contents
 
  with concurrent.futures.ProcessPoolExecutor() as executor:
- future_to_url = {
- executor.submit(get_all_website_links, url): url for url in urls
- }
+ future_to_url = {executor.submit(get_all_website_links, url): url for url in urls}
  for future in concurrent.futures.as_completed(future_to_url):
  url = future_to_url[future]
  try:
  url_contents[url] = future.result()
  except Exception as exc:
- logger.error(f'{url} generated an exception: {exc}')
+ logger.error(f"{url} generated an exception: {exc}")
  # don't raise the exception, just log it, continue processing other urls
 
  return url_contents
 
 
- def get_all_website_links(url) -> dict:
+ def get_all_website_links(url, headers: dict = None) -> dict:
  """
  Fetch all website links from a URL.
 
  Args:
  url (str): the URL to fetch links from
+ headers (dict): a dictionary of headers to use when fetching links
 
  Returns:
  A dictionary containing the URL, the extracted links, the HTML content, the text content, and any error that occurred.
@@ -132,9 +131,12 @@ def get_all_website_links(url) -> dict:
  session = requests.Session()
 
  # Add headers to mimic a real browser request
- headers = {
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"
- }
+ if headers is None:
+ headers = {}
+ if "User-Agent" not in headers:
+ headers["User-Agent"] = (
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.3"
+ )
 
  response = session.get(url, headers=headers)
  if "cookie" in response.request.headers:
@@ -157,7 +159,7 @@ def get_all_website_links(url) -> dict:
  continue
  href = urljoin(url, href)
  parsed_href = urlparse(href)
- href = urlunparse((parsed_href.scheme, parsed_href.netloc, parsed_href.path, '', '', ''))
+ href = urlunparse((parsed_href.scheme, parsed_href.netloc, parsed_href.path, "", "", ""))
  if not is_valid(href):
  continue
  if href in urls:
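
For context, the normalization in this hunk collapses links that differ only by query string or fragment: urljoin resolves relative links and urlunparse drops everything after the path. A minimal standalone sketch of the same stdlib calls (the example URLs are hypothetical):

    from urllib.parse import urljoin, urlparse, urlunparse

    base = "https://docs.mindsdb.com/setup/"        # hypothetical page being crawled
    href = "../integrations?tab=all#handlers"       # hypothetical link found on that page

    href = urljoin(base, href)
    parsed = urlparse(href)
    # keep scheme, host and path; drop params, query string and fragment
    normalized = urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", "", ""))
    print(normalized)  # https://docs.mindsdb.com/integrations
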
@@ -203,7 +205,15 @@ def get_readable_text_from_soup(soup) -> str:
  return html_converter.handle(str(soup))
 
 
- def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_depth: int = 1, current_depth: int = 0, filters: List[str] = None):
+ def get_all_website_links_recursively(
+ url,
+ reviewed_urls,
+ limit=None,
+ crawl_depth: int = 1,
+ current_depth: int = 0,
+ filters: List[str] = None,
+ headers=None,
+ ):
  """
  Recursively gathers all links from a given website up to a specified limit.
 
@@ -227,7 +237,7 @@ def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_dept
  matches_filter = any(re.match(f, url) is not None for f in filters)
  if url not in reviewed_urls and matches_filter:
  try:
- reviewed_urls[url] = get_all_website_links(url)
+ reviewed_urls[url] = get_all_website_links(url, headers=headers)
  except Exception as e:
  error_message = traceback.format_exc().splitlines()[-1]
  logger.error("An exception occurred: %s", str(e))
@@ -271,10 +281,14 @@ def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_dept
  reviewed_urls.update(new_revised_urls)
 
  for new_url in new_revised_urls:
- get_all_website_links_recursively(new_url, reviewed_urls, limit, crawl_depth=crawl_depth, current_depth=current_depth + 1, filters=filters)
+ get_all_website_links_recursively(
+ new_url, reviewed_urls, limit, crawl_depth=crawl_depth, current_depth=current_depth + 1, filters=filters
+ )
 
 
- def get_all_websites(urls, limit=1, html=False, crawl_depth: int = 1, filters: List[str] = None) -> pd.DataFrame:
+ def get_all_websites(
+ urls, limit=1, html=False, crawl_depth: int = 1, filters: List[str] = None, headers: dict = None
+ ) -> pd.DataFrame:
  """
  Crawl a list of websites and return a DataFrame containing the results.
 
@@ -284,6 +298,7 @@ def get_all_websites(urls, limit=1, html=False, crawl_depth: int = 1, filters: L
  crawl_depth (int): Crawl depth for URLs.
  html (bool): a boolean indicating whether to include the HTML content in the results
  filters (List[str]): Crawl URLs that only match these regex patterns.
+ headers (dict): headers of request
 
  Returns:
  A DataFrame containing the results.
@@ -299,7 +314,9 @@ def get_all_websites(urls, limit=1, html=False, crawl_depth: int = 1, filters: L
  if urlparse(url).scheme == "":
  # Try HTTPS first
  url = "https://" + url
- get_all_website_links_recursively(url, reviewed_urls, limit, crawl_depth=crawl_depth, filters=filters)
+ get_all_website_links_recursively(
+ url, reviewed_urls, limit, crawl_depth=crawl_depth, filters=filters, headers=headers
+ )
 
  # Use a ThreadPoolExecutor to run the helper function in parallel.
  with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -311,9 +328,7 @@ def get_all_websites(urls, limit=1, html=False, crawl_depth: int = 1, filters: L
  columns_to_ignore = ["urls"]
  if html is False:
  columns_to_ignore += ["html_content"]
- df = dict_to_dataframe(
- reviewed_urls, columns_to_ignore=columns_to_ignore, index_name="url"
- )
+ df = dict_to_dataframe(reviewed_urls, columns_to_ignore=columns_to_ignore, index_name="url")
 
  if not df.empty and df[df.error.isna()].empty:
  raise Exception(str(df.iloc[0].error))
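
The net effect of the hunks above is an optional headers dict threaded from get_all_websites down to the per-page requests call, with a default Chrome User-Agent filled in when none is supplied. A hedged usage sketch (the URL and header values are examples, not part of the diff):

    from mindsdb.integrations.handlers.web_handler.urlcrawl_helpers import get_all_websites

    df = get_all_websites(
        ["https://example.com"],
        limit=5,
        crawl_depth=1,
        headers={"User-Agent": "my-crawler/1.0"},  # overrides the default User-Agent set in get_all_website_links
    )
    print(df.head())  # columns include text_content and error, per get_columns() below
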
mindsdb/integrations/handlers/web_handler/web_handler.py

@@ -7,17 +7,11 @@ from mindsdb.utilities.security import validate_urls
  from .urlcrawl_helpers import get_all_websites
 
  from mindsdb.integrations.libs.api_handler import APIResource, APIHandler
- from mindsdb.integrations.utilities.sql_utils import (FilterCondition, FilterOperator)
+ from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
 
 
  class CrawlerTable(APIResource):
-
- def list(
- self,
- conditions: List[FilterCondition] = None,
- limit: int = None,
- **kwargs
- ) -> pd.DataFrame:
+ def list(self, conditions: List[FilterCondition] = None, limit: int = None, **kwargs) -> pd.DataFrame:
  """
  Selects data from the provided websites
 
@@ -30,27 +24,34 @@ class CrawlerTable(APIResource):
  urls = []
  crawl_depth = None
  per_url_limit = None
+ headers = {}
  for condition in conditions:
- if condition.column == 'url':
+ if condition.column == "url":
  if condition.op == FilterOperator.IN:
  urls = condition.value
  elif condition.op == FilterOperator.EQUAL:
  urls = [condition.value]
  condition.applied = True
- if condition.column == 'crawl_depth' and condition.op == FilterOperator.EQUAL:
+ if condition.column == "crawl_depth" and condition.op == FilterOperator.EQUAL:
  crawl_depth = condition.value
  condition.applied = True
- if condition.column == 'per_url_limit' and condition.op == FilterOperator.EQUAL:
+ if condition.column == "per_url_limit" and condition.op == FilterOperator.EQUAL:
  per_url_limit = condition.value
  condition.applied = True
+ if condition.column.lower() == "user_agent" and condition.op == FilterOperator.EQUAL:
+ headers["User-Agent"] = condition.value
+ condition.applied = True
 
  if len(urls) == 0:
  raise NotImplementedError(
- 'You must specify what url you want to crawl, for example: SELECT * FROM web.crawler WHERE url = "someurl"')
+ 'You must specify what url you want to crawl, for example: SELECT * FROM web.crawler WHERE url = "someurl"'
+ )
 
- allowed_urls = config.get('web_crawling_allowed_sites', [])
+ allowed_urls = config.get("web_crawling_allowed_sites", [])
  if allowed_urls and not validate_urls(urls, allowed_urls):
- raise ValueError(f"The provided URL is not allowed for web crawling. Please use any of {', '.join(allowed_urls)}.")
+ raise ValueError(
+ f"The provided URL is not allowed for web crawling. Please use any of {', '.join(allowed_urls)}."
+ )
 
  if limit is None and per_url_limit is None and crawl_depth is None:
  per_url_limit = 1
@@ -58,10 +59,10 @@ class CrawlerTable(APIResource):
  # crawl every url separately
  results = []
  for url in urls:
- results.append(get_all_websites([url], per_url_limit, crawl_depth=crawl_depth))
+ results.append(get_all_websites([url], per_url_limit, crawl_depth=crawl_depth, headers=headers))
  result = pd.concat(results)
  else:
- result = get_all_websites(urls, limit, crawl_depth=crawl_depth)
+ result = get_all_websites(urls, limit, crawl_depth=crawl_depth, headers=headers)
 
  if limit is not None and len(result) > limit:
  result = result[:limit]
@@ -72,11 +73,7 @@ class CrawlerTable(APIResource):
  """
  Returns the columns of the crawler table
  """
- return [
- 'url',
- 'text_content',
- 'error'
- ]
+ return ["url", "text_content", "error"]
 
 
  class WebHandler(APIHandler):
@@ -87,7 +84,7 @@ class WebHandler(APIHandler):
  def __init__(self, name=None, **kwargs):
  super().__init__(name)
  crawler = CrawlerTable(self)
- self._register_table('crawler', crawler)
+ self._register_table("crawler", crawler)
 
  def check_connection(self) -> HandlerStatusResponse:
  """
mindsdb/integrations/handlers/youtube_handler/youtube_tables.py

@@ -7,7 +7,7 @@ from mindsdb_sql_parser import ast
  from mindsdb.integrations.utilities.handlers.query_utilities import (
  SELECTQueryParser,
  SELECTQueryExecutor,
- INSERTQueryParser
+ INSERTQueryParser,
  )
 
  import pandas as pd
@@ -66,9 +66,13 @@ class YoutubeCommentsTable(APITable):
  select_statement_executor = SELECTQueryExecutor(
  comments_df,
  selected_columns,
- [where_condition for where_condition in where_conditions if where_condition[1] not in ['video_id', 'channel_id']],
+ [
+ where_condition
+ for where_condition in where_conditions
+ if where_condition[1] not in ["video_id", "channel_id"]
+ ],
  order_by_conditions,
- result_limit if query.limit else None
+ result_limit if query.limit else None,
  )
 
  comments_df = select_statement_executor.execute_query()
@@ -98,50 +102,30 @@ class YoutubeCommentsTable(APITable):
  values_to_insert = insert_query_parser.parse_query()
 
  for value in values_to_insert:
- if not value.get('comment_id'):
- if not value.get('comment'):
+ if not value.get("comment_id"):
+ if not value.get("comment"):
  raise ValueError("comment is mandatory for inserting a top-level comment.")
  else:
- self.insert_comment(video_id=value['video_id'], text=value['comment'])
+ self.insert_comment(video_id=value["video_id"], text=value["comment"])
 
  else:
- if not value.get('reply'):
+ if not value.get("reply"):
  raise ValueError("reply is mandatory for inserting a reply.")
  else:
- self.insert_comment(comment_id=value['comment_id'], text=value['reply'])
+ self.insert_comment(comment_id=value["comment_id"], text=value["reply"])
 
  def insert_comment(self, text, video_id: str = None, comment_id: str = None):
  # if comment_id is provided, define the request body for a reply and insert it
  if comment_id:
- request_body = {
- 'snippet': {
- 'parentId': comment_id,
- 'textOriginal': text
- }
- }
+ request_body = {"snippet": {"parentId": comment_id, "textOriginal": text}}
 
- self.handler.connect().comments().insert(
- part='snippet',
- body=request_body
- ).execute()
+ self.handler.connect().comments().insert(part="snippet", body=request_body).execute()
 
  # else if video_id is provided, define the request body for a top-level comment and insert it
  elif video_id:
- request_body = {
- 'snippet': {
- 'topLevelComment': {
- 'snippet': {
- 'videoId': video_id,
- 'textOriginal': text
- }
- }
- }
- }
+ request_body = {"snippet": {"topLevelComment": {"snippet": {"videoId": video_id, "textOriginal": text}}}}
 
- self.handler.connect().commentThreads().insert(
- part='snippet',
- body=request_body
- ).execute()
+ self.handler.connect().commentThreads().insert(part="snippet", body=request_body).execute()
 
  def get_columns(self) -> List[str]:
  """Gets all columns to be returned in pandas DataFrame responses
@@ -150,7 +134,19 @@ class YoutubeCommentsTable(APITable):
  List[str]
  List of columns
  """
- return ['comment_id', 'channel_id', 'video_id', 'user_id', 'display_name', 'comment', "published_at", "updated_at", 'reply_user_id', 'reply_author', 'reply']
+ return [
+ "comment_id",
+ "channel_id",
+ "video_id",
+ "user_id",
+ "display_name",
+ "comment",
+ "published_at",
+ "updated_at",
+ "reply_user_id",
+ "reply_author",
+ "reply",
+ ]
 
  def get_comments(self, video_id: str, channel_id: str):
  """Pulls all the records from the given youtube api end point and returns it select()
@@ -166,7 +162,12 @@ class YoutubeCommentsTable(APITable):
  resource = (
  self.handler.connect()
  .commentThreads()
- .list(part="snippet, replies", videoId=video_id, allThreadsRelatedToChannelId=channel_id, textFormat="plainText")
+ .list(
+ part="snippet, replies",
+ videoId=video_id,
+ allThreadsRelatedToChannelId=channel_id,
+ textFormat="plainText",
+ )
  )
 
  data = []
@@ -175,7 +176,7 @@ class YoutubeCommentsTable(APITable):
 
  for comment in comments["items"]:
  replies = []
- if 'replies' in comment:
+ if "replies" in comment:
  for reply in comment["replies"]["comments"]:
  replies.append(
  {
@@ -222,18 +223,51 @@ class YoutubeCommentsTable(APITable):
  else:
  break
 
- youtube_comments_df = pd.json_normalize(data, 'replies', ['comment_id', 'channel_id', 'video_id', 'user_id', 'display_name', 'comment', "published_at", "updated_at"], record_prefix='replies.')
- youtube_comments_df = youtube_comments_df.rename(columns={'replies.user_id': 'reply_user_id', 'replies.reply_author': 'reply_author', 'replies.reply': 'reply'})
+ youtube_comments_df = pd.json_normalize(
+ data,
+ "replies",
+ [
+ "comment_id",
+ "channel_id",
+ "video_id",
+ "user_id",
+ "display_name",
+ "comment",
+ "published_at",
+ "updated_at",
+ ],
+ record_prefix="replies.",
+ )
+ youtube_comments_df = youtube_comments_df.rename(
+ columns={
+ "replies.user_id": "reply_user_id",
+ "replies.reply_author": "reply_author",
+ "replies.reply": "reply",
+ }
+ )
 
  # check if DataFrame is empty
  if youtube_comments_df.empty:
  return youtube_comments_df
  else:
- return youtube_comments_df[['comment_id', 'channel_id', 'video_id', 'user_id', 'display_name', 'comment', "published_at", "updated_at", 'reply_user_id', 'reply_author', 'reply']]
+ return youtube_comments_df[
+ [
+ "comment_id",
+ "channel_id",
+ "video_id",
+ "user_id",
+ "display_name",
+ "comment",
+ "published_at",
+ "updated_at",
+ "reply_user_id",
+ "reply_author",
+ "reply",
+ ]
+ ]
 
 
  class YoutubeChannelsTable(APITable):
-
  """Youtube Channel Info by channel id Table implementation"""
 
  def select(self, query: ast.Select) -> pd.DataFrame:
@@ -263,9 +297,9 @@ class YoutubeChannelsTable(APITable):
  select_statement_executor = SELECTQueryExecutor(
  channel_df,
  selected_columns,
- [where_condition for where_condition in where_conditions if where_condition[1] == 'channel_id'],
+ [where_condition for where_condition in where_conditions if where_condition[1] == "channel_id"],
  order_by_conditions,
- result_limit if query.limit else None
+ result_limit if query.limit else None,
  )
 
  channel_df = select_statement_executor.execute_query()
@@ -304,7 +338,6 @@ class YoutubeChannelsTable(APITable):
 
 
  class YoutubeVideosTable(APITable):
-
  """Youtube Video info by video id Table implementation"""
 
  def select(self, query: ast.Select) -> pd.DataFrame:
@@ -317,7 +350,7 @@ class YoutubeVideosTable(APITable):
  result_limit,
  ) = select_statement_parser.parse_query()
 
- video_id, channel_id = None, None
+ video_id, channel_id, search_query = None, None, None
  for op, arg1, arg2 in where_conditions:
  if arg1 == "video_id":
  if op == "=":
@@ -331,38 +364,126 @@ class YoutubeVideosTable(APITable):
  else:
  raise NotImplementedError("Only '=' operator is supported for channel_id column.")
 
- if not video_id and not channel_id:
- raise ValueError("Either video_id or channel_id has to be present in where clause.")
+ elif arg1 == "query":
+ if op == "=":
+ search_query = arg2
+ else:
+ raise NotImplementedError("Only '=' operator is supported for query column.")
+
+ if not video_id and not channel_id and not search_query:
+ raise ValueError("At least one of video_id, channel_id, or query must be present in the WHERE clause.")
 
  if video_id:
  video_df = self.get_videos_by_video_ids([video_id])
+ elif channel_id and search_query:
+ video_df = self.get_videos_by_search_query_in_channel(search_query, channel_id, result_limit)
+ elif channel_id:
+ video_df = self.get_videos_by_channel_id(channel_id, result_limit)
  else:
- video_df = self.get_videos_by_channel_id(channel_id)
+ video_df = self.get_videos_by_search_query(search_query, result_limit)
 
  select_statement_executor = SELECTQueryExecutor(
  video_df,
  selected_columns,
- [where_condition for where_condition in where_conditions if where_condition[1] not in ['video_id', 'channel_id']],
+ [
+ where_condition
+ for where_condition in where_conditions
+ if where_condition[1] not in ["video_id", "channel_id", "query"]
+ ],
  order_by_conditions,
- result_limit if query.limit else None
+ result_limit if query.limit else None,
  )
 
  video_df = select_statement_executor.execute_query()
 
  return video_df
 
- def get_videos_by_channel_id(self, channel_id):
+ def get_videos_by_search_query(self, search_query, limit=10):
  video_ids = []
  resource = (
  self.handler.connect()
  .search()
- .list(part="snippet", channelId=channel_id, type="video")
+ .list(part="snippet", q=search_query, type="video", maxResults=min(50, limit))
  )
- while resource:
+ total_fetched = 0
+
+ while resource and total_fetched < limit:
+ response = resource.execute()
+ for item in response["items"]:
+ video_ids.append(item["id"]["videoId"])
+ total_fetched += 1
+ if total_fetched >= limit:
+ break
+
+ if "nextPageToken" in response and total_fetched < limit:
+ resource = (
+ self.handler.connect()
+ .search()
+ .list(
+ part="snippet",
+ q=search_query,
+ type="video",
+ maxResults=min(50, limit - total_fetched),
+ pageToken=response["nextPageToken"],
+ )
+ )
+ else:
+ break
+
+ return self.get_videos_by_video_ids(video_ids)
+
+ def get_videos_by_search_query_in_channel(self, search_query, channel_id, limit=10):
+ """Search for videos within a specific channel"""
+ video_ids = []
+ resource = (
+ self.handler.connect()
+ .search()
+ .list(part="snippet", q=search_query, channelId=channel_id, type="video", maxResults=min(50, limit))
+ )
+ total_fetched = 0
+
+ while resource and total_fetched < limit:
+ response = resource.execute()
+ for item in response["items"]:
+ video_ids.append(item["id"]["videoId"])
+ total_fetched += 1
+ if total_fetched >= limit:
+ break
+
+ if "nextPageToken" in response and total_fetched < limit:
+ resource = (
+ self.handler.connect()
+ .search()
+ .list(
+ part="snippet",
+ q=search_query,
+ channelId=channel_id,
+ type="video",
+ maxResults=min(50, limit - total_fetched),
+ pageToken=response["nextPageToken"],
+ )
+ )
+ else:
+ break
+
+ return self.get_videos_by_video_ids(video_ids)
+
+ def get_videos_by_channel_id(self, channel_id, limit=10):
+ video_ids = []
+ resource = (
+ self.handler.connect()
+ .search()
+ .list(part="snippet", channelId=channel_id, type="video", maxResults=min(50, limit))
+ )
+ total_fetched = 0
+ while resource and total_fetched < limit:
  response = resource.execute()
  for item in response["items"]:
  video_ids.append(item["id"]["videoId"])
- if "nextPageToken" in response:
+ total_fetched += 1
+ if total_fetched >= limit:
+ break
+ if "nextPageToken" in response and total_fetched < limit:
  resource = (
  self.handler.connect()
  .search()
@@ -370,6 +491,7 @@ class YoutubeVideosTable(APITable):
  part="snippet",
  channelId=channel_id,
  type="video",
+ maxResults=min(50, limit - total_fetched),
  pageToken=response["nextPageToken"],
  )
  )
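
The three search/listing methods above repeat the same limit-aware pageToken loop against the YouTube Data API. A hedged sketch that factors out that pattern as a hypothetical helper (collect_video_ids is not part of the package; `youtube` stands for the client returned by self.handler.connect(), and the example arguments are illustrative):

    def collect_video_ids(youtube, limit=10, **search_kwargs):
        """Page through search().list(...) results until `limit` video ids are collected."""
        video_ids = []
        request = youtube.search().list(part="snippet", type="video", maxResults=min(50, limit), **search_kwargs)
        while request and len(video_ids) < limit:
            response = request.execute()
            for item in response["items"]:
                video_ids.append(item["id"]["videoId"])
                if len(video_ids) >= limit:
                    break
            token = response.get("nextPageToken")
            if token and len(video_ids) < limit:
                # ask only for what is still missing, capped at the API maximum of 50 per page
                request = youtube.search().list(
                    part="snippet",
                    type="video",
                    maxResults=min(50, limit - len(video_ids)),
                    pageToken=token,
                    **search_kwargs,
                )
            else:
                break
        return video_ids

    # e.g. collect_video_ids(handler.connect(), limit=25, q="mindsdb", channelId="UC...")
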
@@ -388,7 +510,13 @@ class YoutubeVideosTable(APITable):
  # loop over 50 video ids at a time
  # an invalid request error is caused otherwise
  for i in range(0, len(video_ids), 50):
- resource = self.handler.connect().videos().list(part="statistics,snippet,contentDetails", id=",".join(video_ids[i:i + 50])).execute()
+ resource = (
+ self.handler.connect()
+ .videos()
+ .list(part="statistics,snippet,contentDetails", id=",".join(video_ids[i : i + 50]))
+ .execute()
+ )
+
  for item in resource["items"]:
  data.append(
  {
@@ -415,7 +543,7 @@ class YoutubeVideosTable(APITable):
  return json_formatted_transcript
 
  except Exception as e:
- logger.error(f"Encountered an error while fetching transcripts for video ${video_id}: ${e}"),
+ (logger.error(f"Encountered an error while fetching transcripts for video ${video_id}: ${e}"),)
  return "Transcript not available for this video"
 
  def parse_duration(self, video_id, duration):
@@ -428,7 +556,7 @@ class YoutubeVideosTable(APITable):
 
  return duration_str.strip(":")
  except Exception as e:
- logger.error(f"Encountered an error while parsing duration for video ${video_id}: ${e}"),
+ (logger.error(f"Encountered an error while parsing duration for video ${video_id}: ${e}"),)
  return "Duration not available for this video"
 
  def get_columns(self) -> List[str]:
mindsdb/integrations/libs/vectordatabase_handler.py

@@ -334,12 +334,21 @@ class VectorStoreHandler(BaseHandler):
 
  if not df_update.empty:
  # get values of existed `created_at` and return them to metadata
- created_dates = {row[id_col]: row[metadata_col].get("_created_at") for _, row in df_existed.iterrows()}
+ origin_id_col = "_original_doc_id"
+
+ created_dates, ids = {}, {}
+ for _, row in df_existed.iterrows():
+ chunk_id = row[id_col]
+ created_dates[chunk_id] = row[metadata_col].get("_created_at")
+ ids[chunk_id] = row[metadata_col].get(origin_id_col)
 
  def keep_created_at(row):
  val = created_dates.get(row[id_col])
  if val:
  row[metadata_col]["_created_at"] = val
+ # keep id column
+ if origin_id_col not in row[metadata_col]:
+ row[metadata_col][origin_id_col] = ids.get(row[id_col])
  return row
 
  df_update.apply(keep_created_at, axis=1)
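
This hunk makes the vector-store upsert carry over both _created_at and _original_doc_id from the existing chunks' metadata. A hedged, minimal pandas illustration of the same preservation step (the column names and sample values are assumptions for the sketch, not taken from the handler):

    import pandas as pd

    id_col, metadata_col = "id", "metadata"  # assumed to match the handler's column names

    df_existed = pd.DataFrame(
        [{"id": "chunk-1", "metadata": {"_created_at": "2025-07-01", "_original_doc_id": "doc-42"}}]
    )
    df_update = pd.DataFrame([{"id": "chunk-1", "metadata": {"_updated_at": "2025-07-20"}}])

    # remember the values already stored for each chunk id
    created_dates, ids = {}, {}
    for _, row in df_existed.iterrows():
        created_dates[row[id_col]] = row[metadata_col].get("_created_at")
        ids[row[id_col]] = row[metadata_col].get("_original_doc_id")

    def keep_created_at(row):
        # re-attach the original creation date and document id to the updated metadata
        if created_dates.get(row[id_col]):
            row[metadata_col]["_created_at"] = created_dates[row[id_col]]
        if "_original_doc_id" not in row[metadata_col]:
            row[metadata_col]["_original_doc_id"] = ids.get(row[id_col])
        return row

    df_update = df_update.apply(keep_created_at, axis=1)
    print(df_update.loc[0, "metadata"])
    # {'_updated_at': '2025-07-20', '_created_at': '2025-07-01', '_original_doc_id': 'doc-42'}
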