MindsDB 25.7.2.0__py3-none-any.whl → 25.7.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of MindsDB might be problematic.
- mindsdb/__about__.py +1 -1
- mindsdb/__main__.py +1 -1
- mindsdb/api/a2a/common/server/server.py +16 -6
- mindsdb/api/executor/command_executor.py +213 -137
- mindsdb/api/executor/datahub/datanodes/integration_datanode.py +5 -1
- mindsdb/api/executor/datahub/datanodes/project_datanode.py +14 -3
- mindsdb/api/executor/planner/plan_join.py +3 -0
- mindsdb/api/executor/planner/plan_join_ts.py +117 -100
- mindsdb/api/executor/planner/query_planner.py +1 -0
- mindsdb/api/executor/sql_query/steps/apply_predictor_step.py +54 -85
- mindsdb/api/http/initialize.py +16 -43
- mindsdb/api/http/namespaces/agents.py +24 -21
- mindsdb/api/http/namespaces/chatbots.py +83 -120
- mindsdb/api/http/namespaces/file.py +1 -1
- mindsdb/api/http/namespaces/jobs.py +38 -60
- mindsdb/api/http/namespaces/tree.py +69 -61
- mindsdb/api/mcp/start.py +2 -0
- mindsdb/api/mysql/mysql_proxy/utilities/dump.py +3 -2
- mindsdb/integrations/handlers/autogluon_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/autosklearn_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/bigquery_handler/bigquery_handler.py +25 -5
- mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +3 -3
- mindsdb/integrations/handlers/flaml_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/google_calendar_handler/google_calendar_tables.py +82 -73
- mindsdb/integrations/handlers/hubspot_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/langchain_handler/langchain_handler.py +83 -76
- mindsdb/integrations/handlers/lightwood_handler/requirements.txt +4 -4
- mindsdb/integrations/handlers/litellm_handler/litellm_handler.py +16 -3
- mindsdb/integrations/handlers/litellm_handler/settings.py +2 -1
- mindsdb/integrations/handlers/llama_index_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +106 -90
- mindsdb/integrations/handlers/postgres_handler/postgres_handler.py +41 -39
- mindsdb/integrations/handlers/s3_handler/s3_handler.py +72 -70
- mindsdb/integrations/handlers/salesforce_handler/constants.py +208 -0
- mindsdb/integrations/handlers/salesforce_handler/salesforce_handler.py +142 -81
- mindsdb/integrations/handlers/salesforce_handler/salesforce_tables.py +12 -4
- mindsdb/integrations/handlers/slack_handler/slack_tables.py +141 -161
- mindsdb/integrations/handlers/tpot_handler/requirements.txt +1 -1
- mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py +32 -17
- mindsdb/integrations/handlers/web_handler/web_handler.py +19 -22
- mindsdb/integrations/handlers/youtube_handler/youtube_tables.py +183 -55
- mindsdb/integrations/libs/vectordatabase_handler.py +10 -1
- mindsdb/integrations/utilities/handler_utils.py +32 -12
- mindsdb/interfaces/agents/agents_controller.py +169 -110
- mindsdb/interfaces/agents/langchain_agent.py +10 -3
- mindsdb/interfaces/data_catalog/data_catalog_loader.py +22 -8
- mindsdb/interfaces/database/database.py +38 -13
- mindsdb/interfaces/database/integrations.py +20 -5
- mindsdb/interfaces/database/projects.py +63 -16
- mindsdb/interfaces/database/views.py +86 -60
- mindsdb/interfaces/jobs/jobs_controller.py +103 -110
- mindsdb/interfaces/knowledge_base/controller.py +33 -5
- mindsdb/interfaces/knowledge_base/evaluate.py +53 -9
- mindsdb/interfaces/knowledge_base/executor.py +24 -0
- mindsdb/interfaces/knowledge_base/llm_client.py +3 -3
- mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +21 -13
- mindsdb/interfaces/query_context/context_controller.py +100 -133
- mindsdb/interfaces/skills/skills_controller.py +18 -6
- mindsdb/interfaces/storage/db.py +40 -6
- mindsdb/interfaces/variables/variables_controller.py +8 -15
- mindsdb/utilities/config.py +3 -3
- mindsdb/utilities/functions.py +72 -60
- mindsdb/utilities/log.py +38 -6
- mindsdb/utilities/ps.py +7 -7
- {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/METADATA +262 -263
- {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/RECORD +69 -68
- {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/WHEEL +0 -0
- {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/licenses/LICENSE +0 -0
- {mindsdb-25.7.2.0.dist-info → mindsdb-25.7.4.0.dist-info}/top_level.txt +0 -0
mindsdb/integrations/handlers/web_handler/urlcrawl_helpers.py
@@ -100,26 +100,25 @@ def parallel_get_all_website_links(urls) -> dict:
         return url_contents
 
     with concurrent.futures.ProcessPoolExecutor() as executor:
-        future_to_url = {
-            executor.submit(get_all_website_links, url): url for url in urls
-        }
+        future_to_url = {executor.submit(get_all_website_links, url): url for url in urls}
         for future in concurrent.futures.as_completed(future_to_url):
             url = future_to_url[future]
             try:
                 url_contents[url] = future.result()
             except Exception as exc:
-                logger.error(f
+                logger.error(f"{url} generated an exception: {exc}")
                 # don't raise the exception, just log it, continue processing other urls
 
     return url_contents
 
 
-def get_all_website_links(url) -> dict:
+def get_all_website_links(url, headers: dict = None) -> dict:
     """
     Fetch all website links from a URL.
 
     Args:
         url (str): the URL to fetch links from
+        headers (dict): a dictionary of headers to use when fetching links
 
     Returns:
         A dictionary containing the URL, the extracted links, the HTML content, the text content, and any error that occurred.
@@ -132,9 +131,12 @@ def get_all_website_links(url) -> dict:
     session = requests.Session()
 
     # Add headers to mimic a real browser request
-    headers
-
-
+    if headers is None:
+        headers = {}
+    if "User-Agent" not in headers:
+        headers["User-Agent"] = (
+            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.3"
+        )
 
     response = session.get(url, headers=headers)
     if "cookie" in response.request.headers:
@@ -157,7 +159,7 @@ def get_all_website_links(url) -> dict:
             continue
         href = urljoin(url, href)
         parsed_href = urlparse(href)
-        href = urlunparse((parsed_href.scheme, parsed_href.netloc, parsed_href.path,
+        href = urlunparse((parsed_href.scheme, parsed_href.netloc, parsed_href.path, "", "", ""))
         if not is_valid(href):
             continue
         if href in urls:
@@ -203,7 +205,15 @@ def get_readable_text_from_soup(soup) -> str:
     return html_converter.handle(str(soup))
 
 
-def get_all_website_links_recursively(
+def get_all_website_links_recursively(
+    url,
+    reviewed_urls,
+    limit=None,
+    crawl_depth: int = 1,
+    current_depth: int = 0,
+    filters: List[str] = None,
+    headers=None,
+):
     """
     Recursively gathers all links from a given website up to a specified limit.
 
@@ -227,7 +237,7 @@ def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_dept
         matches_filter = any(re.match(f, url) is not None for f in filters)
         if url not in reviewed_urls and matches_filter:
             try:
-                reviewed_urls[url] = get_all_website_links(url)
+                reviewed_urls[url] = get_all_website_links(url, headers=headers)
             except Exception as e:
                 error_message = traceback.format_exc().splitlines()[-1]
                 logger.error("An exception occurred: %s", str(e))
@@ -271,10 +281,14 @@ def get_all_website_links_recursively(url, reviewed_urls, limit=None, crawl_dept
     reviewed_urls.update(new_revised_urls)
 
     for new_url in new_revised_urls:
-        get_all_website_links_recursively(
+        get_all_website_links_recursively(
+            new_url, reviewed_urls, limit, crawl_depth=crawl_depth, current_depth=current_depth + 1, filters=filters
+        )
 
 
-def get_all_websites(
+def get_all_websites(
+    urls, limit=1, html=False, crawl_depth: int = 1, filters: List[str] = None, headers: dict = None
+) -> pd.DataFrame:
     """
     Crawl a list of websites and return a DataFrame containing the results.
 
@@ -284,6 +298,7 @@ def get_all_websites(urls, limit=1, html=False, crawl_depth: int = 1, filters: L
         crawl_depth (int): Crawl depth for URLs.
         html (bool): a boolean indicating whether to include the HTML content in the results
        filters (List[str]): Crawl URLs that only match these regex patterns.
+        headers (dict): headers of request
 
     Returns:
         A DataFrame containing the results.
@@ -299,7 +314,9 @@ def get_all_websites(urls, limit=1, html=False, crawl_depth: int = 1, filters: L
         if urlparse(url).scheme == "":
             # Try HTTPS first
             url = "https://" + url
-        get_all_website_links_recursively(
+        get_all_website_links_recursively(
+            url, reviewed_urls, limit, crawl_depth=crawl_depth, filters=filters, headers=headers
+        )
 
     # Use a ThreadPoolExecutor to run the helper function in parallel.
     with concurrent.futures.ThreadPoolExecutor() as executor:
@@ -311,9 +328,7 @@ def get_all_websites(urls, limit=1, html=False, crawl_depth: int = 1, filters: L
     columns_to_ignore = ["urls"]
     if html is False:
         columns_to_ignore += ["html_content"]
-    df = dict_to_dataframe(
-        reviewed_urls, columns_to_ignore=columns_to_ignore, index_name="url"
-    )
+    df = dict_to_dataframe(reviewed_urls, columns_to_ignore=columns_to_ignore, index_name="url")
 
     if not df.empty and df[df.error.isna()].empty:
         raise Exception(str(df.iloc[0].error))
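
The hunks above thread an optional headers dict through the crawl helpers so a caller can supply its own User-Agent, falling back to the built-in Chrome string otherwise. A minimal sketch of calling the updated helper directly, with a placeholder URL and User-Agent value (not from the diff itself):

    # Sketch only: the get_all_websites signature is taken from the diff above;
    # the URL and User-Agent string are illustrative placeholders.
    from mindsdb.integrations.handlers.web_handler.urlcrawl_helpers import get_all_websites

    df = get_all_websites(
        ["https://docs.mindsdb.com"],
        limit=1,
        crawl_depth=1,
        headers={"User-Agent": "my-crawler/1.0"},
    )
    print(df.head())
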
mindsdb/integrations/handlers/web_handler/web_handler.py
@@ -7,17 +7,11 @@ from mindsdb.utilities.security import validate_urls
 from .urlcrawl_helpers import get_all_websites
 
 from mindsdb.integrations.libs.api_handler import APIResource, APIHandler
-from mindsdb.integrations.utilities.sql_utils import
+from mindsdb.integrations.utilities.sql_utils import FilterCondition, FilterOperator
 
 
 class CrawlerTable(APIResource):
-
-    def list(
-        self,
-        conditions: List[FilterCondition] = None,
-        limit: int = None,
-        **kwargs
-    ) -> pd.DataFrame:
+    def list(self, conditions: List[FilterCondition] = None, limit: int = None, **kwargs) -> pd.DataFrame:
         """
         Selects data from the provided websites
 
@@ -30,27 +24,34 @@ class CrawlerTable(APIResource):
         urls = []
         crawl_depth = None
         per_url_limit = None
+        headers = {}
         for condition in conditions:
-            if condition.column ==
+            if condition.column == "url":
                 if condition.op == FilterOperator.IN:
                     urls = condition.value
                 elif condition.op == FilterOperator.EQUAL:
                     urls = [condition.value]
                 condition.applied = True
-            if condition.column ==
+            if condition.column == "crawl_depth" and condition.op == FilterOperator.EQUAL:
                 crawl_depth = condition.value
                 condition.applied = True
-            if condition.column ==
+            if condition.column == "per_url_limit" and condition.op == FilterOperator.EQUAL:
                 per_url_limit = condition.value
                 condition.applied = True
+            if condition.column.lower() == "user_agent" and condition.op == FilterOperator.EQUAL:
+                headers["User-Agent"] = condition.value
+                condition.applied = True
 
         if len(urls) == 0:
             raise NotImplementedError(
-                'You must specify what url you want to crawl, for example: SELECT * FROM web.crawler WHERE url = "someurl"'
+                'You must specify what url you want to crawl, for example: SELECT * FROM web.crawler WHERE url = "someurl"'
+            )
 
-        allowed_urls = config.get(
+        allowed_urls = config.get("web_crawling_allowed_sites", [])
         if allowed_urls and not validate_urls(urls, allowed_urls):
-            raise ValueError(
+            raise ValueError(
+                f"The provided URL is not allowed for web crawling. Please use any of {', '.join(allowed_urls)}."
+            )
 
         if limit is None and per_url_limit is None and crawl_depth is None:
             per_url_limit = 1
@@ -58,10 +59,10 @@ class CrawlerTable(APIResource):
             # crawl every url separately
             results = []
             for url in urls:
-                results.append(get_all_websites([url], per_url_limit, crawl_depth=crawl_depth))
+                results.append(get_all_websites([url], per_url_limit, crawl_depth=crawl_depth, headers=headers))
             result = pd.concat(results)
         else:
-            result = get_all_websites(urls, limit, crawl_depth=crawl_depth)
+            result = get_all_websites(urls, limit, crawl_depth=crawl_depth, headers=headers)
 
         if limit is not None and len(result) > limit:
             result = result[:limit]
@@ -72,11 +73,7 @@ class CrawlerTable(APIResource):
         """
         Returns the columns of the crawler table
         """
-        return [
-            'url',
-            'text_content',
-            'error'
-        ]
+        return ["url", "text_content", "error"]
 
 
 class WebHandler(APIHandler):
@@ -87,7 +84,7 @@ class WebHandler(APIHandler):
     def __init__(self, name=None, **kwargs):
         super().__init__(name)
         crawler = CrawlerTable(self)
-        self._register_table(
+        self._register_table("crawler", crawler)
 
     def check_connection(self) -> HandlerStatusResponse:
         """
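
With CrawlerTable.list updated as above, a user_agent equality filter in the WHERE clause is translated into a User-Agent request header before crawling starts. A rough sketch of the query shape this enables; the web.crawler table name comes from the handler's own error message, while the URL and agent string are placeholders:

    # Illustrative SQL only, shown as a plain string; it is not executed here.
    query = """
    SELECT url, text_content
    FROM web.crawler
    WHERE url = 'https://docs.mindsdb.com'
      AND user_agent = 'my-crawler/1.0'
      AND per_url_limit = 1;
    """
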
mindsdb/integrations/handlers/youtube_handler/youtube_tables.py
@@ -7,7 +7,7 @@ from mindsdb_sql_parser import ast
 from mindsdb.integrations.utilities.handlers.query_utilities import (
     SELECTQueryParser,
     SELECTQueryExecutor,
-    INSERTQueryParser
+    INSERTQueryParser,
 )
 
 import pandas as pd
@@ -66,9 +66,13 @@ class YoutubeCommentsTable(APITable):
         select_statement_executor = SELECTQueryExecutor(
             comments_df,
             selected_columns,
-            [
+            [
+                where_condition
+                for where_condition in where_conditions
+                if where_condition[1] not in ["video_id", "channel_id"]
+            ],
             order_by_conditions,
-            result_limit if query.limit else None
+            result_limit if query.limit else None,
         )
 
         comments_df = select_statement_executor.execute_query()
@@ -98,50 +102,30 @@ class YoutubeCommentsTable(APITable):
         values_to_insert = insert_query_parser.parse_query()
 
         for value in values_to_insert:
-            if not value.get(
-                if not value.get(
+            if not value.get("comment_id"):
+                if not value.get("comment"):
                     raise ValueError("comment is mandatory for inserting a top-level comment.")
                 else:
-                    self.insert_comment(video_id=value[
+                    self.insert_comment(video_id=value["video_id"], text=value["comment"])
 
             else:
-                if not value.get(
+                if not value.get("reply"):
                     raise ValueError("reply is mandatory for inserting a reply.")
                 else:
-                    self.insert_comment(comment_id=value[
+                    self.insert_comment(comment_id=value["comment_id"], text=value["reply"])
 
     def insert_comment(self, text, video_id: str = None, comment_id: str = None):
         # if comment_id is provided, define the request body for a reply and insert it
         if comment_id:
-            request_body = {
-                'snippet': {
-                    'parentId': comment_id,
-                    'textOriginal': text
-                }
-            }
+            request_body = {"snippet": {"parentId": comment_id, "textOriginal": text}}
 
-            self.handler.connect().comments().insert(
-                part='snippet',
-                body=request_body
-            ).execute()
+            self.handler.connect().comments().insert(part="snippet", body=request_body).execute()
 
         # else if video_id is provided, define the request body for a top-level comment and insert it
         elif video_id:
-            request_body = {
-                'snippet': {
-                    'topLevelComment': {
-                        'snippet': {
-                            'videoId': video_id,
-                            'textOriginal': text
-                        }
-                    }
-                }
-            }
+            request_body = {"snippet": {"topLevelComment": {"snippet": {"videoId": video_id, "textOriginal": text}}}}
 
-            self.handler.connect().commentThreads().insert(
-                part='snippet',
-                body=request_body
-            ).execute()
+            self.handler.connect().commentThreads().insert(part="snippet", body=request_body).execute()
 
     def get_columns(self) -> List[str]:
         """Gets all columns to be returned in pandas DataFrame responses
@@ -150,7 +134,19 @@ class YoutubeCommentsTable(APITable):
         List[str]
             List of columns
         """
-        return [
+        return [
+            "comment_id",
+            "channel_id",
+            "video_id",
+            "user_id",
+            "display_name",
+            "comment",
+            "published_at",
+            "updated_at",
+            "reply_user_id",
+            "reply_author",
+            "reply",
+        ]
 
     def get_comments(self, video_id: str, channel_id: str):
         """Pulls all the records from the given youtube api end point and returns it select()
@@ -166,7 +162,12 @@ class YoutubeCommentsTable(APITable):
         resource = (
             self.handler.connect()
             .commentThreads()
-            .list(
+            .list(
+                part="snippet, replies",
+                videoId=video_id,
+                allThreadsRelatedToChannelId=channel_id,
+                textFormat="plainText",
+            )
         )
 
         data = []
@@ -175,7 +176,7 @@ class YoutubeCommentsTable(APITable):
 
         for comment in comments["items"]:
             replies = []
-            if
+            if "replies" in comment:
                 for reply in comment["replies"]["comments"]:
                     replies.append(
                         {
@@ -222,18 +223,51 @@ class YoutubeCommentsTable(APITable):
             else:
                 break
 
-        youtube_comments_df = pd.json_normalize(
-
+        youtube_comments_df = pd.json_normalize(
+            data,
+            "replies",
+            [
+                "comment_id",
+                "channel_id",
+                "video_id",
+                "user_id",
+                "display_name",
+                "comment",
+                "published_at",
+                "updated_at",
+            ],
+            record_prefix="replies.",
+        )
+        youtube_comments_df = youtube_comments_df.rename(
+            columns={
+                "replies.user_id": "reply_user_id",
+                "replies.reply_author": "reply_author",
+                "replies.reply": "reply",
+            }
+        )
 
         # check if DataFrame is empty
         if youtube_comments_df.empty:
             return youtube_comments_df
         else:
-            return youtube_comments_df[
+            return youtube_comments_df[
+                [
+                    "comment_id",
+                    "channel_id",
+                    "video_id",
+                    "user_id",
+                    "display_name",
+                    "comment",
+                    "published_at",
+                    "updated_at",
+                    "reply_user_id",
+                    "reply_author",
+                    "reply",
+                ]
+            ]
 
 
 class YoutubeChannelsTable(APITable):
-
     """Youtube Channel Info by channel id Table implementation"""
 
     def select(self, query: ast.Select) -> pd.DataFrame:
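
In the hunk above, comment threads are now flattened with pandas.json_normalize: the per-reply record columns get a "replies." prefix and the listed top-level fields ride along as meta columns before being renamed. A toy reproduction of that flattening, using a shortened field list and made-up values:

    import pandas as pd

    data = [
        {
            "comment_id": "c1",
            "video_id": "v1",
            "comment": "top-level text",
            "replies": [{"user_id": "u2", "reply_author": "Bob", "reply": "reply text"}],
        }
    ]

    # Record columns come from the "replies" list; meta columns are copied per reply.
    df = pd.json_normalize(data, "replies", ["comment_id", "video_id", "comment"], record_prefix="replies.")
    df = df.rename(
        columns={
            "replies.user_id": "reply_user_id",
            "replies.reply_author": "reply_author",
            "replies.reply": "reply",
        }
    )
    print(df.columns.tolist())
    # ['reply_user_id', 'reply_author', 'reply', 'comment_id', 'video_id', 'comment']
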
mindsdb/integrations/handlers/youtube_handler/youtube_tables.py (continued)
@@ -263,9 +297,9 @@ class YoutubeChannelsTable(APITable):
         select_statement_executor = SELECTQueryExecutor(
             channel_df,
             selected_columns,
-            [where_condition for where_condition in where_conditions if where_condition[1] ==
+            [where_condition for where_condition in where_conditions if where_condition[1] == "channel_id"],
             order_by_conditions,
-            result_limit if query.limit else None
+            result_limit if query.limit else None,
         )
 
         channel_df = select_statement_executor.execute_query()
@@ -304,7 +338,6 @@ class YoutubeChannelsTable(APITable):
 
 
 class YoutubeVideosTable(APITable):
-
     """Youtube Video info by video id Table implementation"""
 
     def select(self, query: ast.Select) -> pd.DataFrame:
@@ -317,7 +350,7 @@ class YoutubeVideosTable(APITable):
             result_limit,
         ) = select_statement_parser.parse_query()
 
-        video_id, channel_id = None, None
+        video_id, channel_id, search_query = None, None, None
         for op, arg1, arg2 in where_conditions:
             if arg1 == "video_id":
                 if op == "=":
@@ -331,38 +364,126 @@ class YoutubeVideosTable(APITable):
                 else:
                     raise NotImplementedError("Only '=' operator is supported for channel_id column.")
 
-
-
+            elif arg1 == "query":
+                if op == "=":
+                    search_query = arg2
+                else:
+                    raise NotImplementedError("Only '=' operator is supported for query column.")
+
+        if not video_id and not channel_id and not search_query:
+            raise ValueError("At least one of video_id, channel_id, or query must be present in the WHERE clause.")
 
         if video_id:
             video_df = self.get_videos_by_video_ids([video_id])
+        elif channel_id and search_query:
+            video_df = self.get_videos_by_search_query_in_channel(search_query, channel_id, result_limit)
+        elif channel_id:
+            video_df = self.get_videos_by_channel_id(channel_id, result_limit)
         else:
-            video_df = self.
+            video_df = self.get_videos_by_search_query(search_query, result_limit)
 
         select_statement_executor = SELECTQueryExecutor(
             video_df,
             selected_columns,
-            [
+            [
+                where_condition
+                for where_condition in where_conditions
+                if where_condition[1] not in ["video_id", "channel_id", "query"]
+            ],
             order_by_conditions,
-            result_limit if query.limit else None
+            result_limit if query.limit else None,
         )
 
         video_df = select_statement_executor.execute_query()
 
         return video_df
 
-    def
+    def get_videos_by_search_query(self, search_query, limit=10):
         video_ids = []
         resource = (
             self.handler.connect()
             .search()
-            .list(part="snippet",
+            .list(part="snippet", q=search_query, type="video", maxResults=min(50, limit))
         )
-
+        total_fetched = 0
+
+        while resource and total_fetched < limit:
+            response = resource.execute()
+            for item in response["items"]:
+                video_ids.append(item["id"]["videoId"])
+                total_fetched += 1
+                if total_fetched >= limit:
+                    break
+
+            if "nextPageToken" in response and total_fetched < limit:
+                resource = (
+                    self.handler.connect()
+                    .search()
+                    .list(
+                        part="snippet",
+                        q=search_query,
+                        type="video",
+                        maxResults=min(50, limit - total_fetched),
+                        pageToken=response["nextPageToken"],
+                    )
+                )
+            else:
+                break
+
+        return self.get_videos_by_video_ids(video_ids)
+
+    def get_videos_by_search_query_in_channel(self, search_query, channel_id, limit=10):
+        """Search for videos within a specific channel"""
+        video_ids = []
+        resource = (
+            self.handler.connect()
+            .search()
+            .list(part="snippet", q=search_query, channelId=channel_id, type="video", maxResults=min(50, limit))
+        )
+        total_fetched = 0
+
+        while resource and total_fetched < limit:
+            response = resource.execute()
+            for item in response["items"]:
+                video_ids.append(item["id"]["videoId"])
+                total_fetched += 1
+                if total_fetched >= limit:
+                    break
+
+            if "nextPageToken" in response and total_fetched < limit:
+                resource = (
+                    self.handler.connect()
+                    .search()
+                    .list(
+                        part="snippet",
+                        q=search_query,
+                        channelId=channel_id,
+                        type="video",
+                        maxResults=min(50, limit - total_fetched),
+                        pageToken=response["nextPageToken"],
+                    )
+                )
+            else:
+                break
+
+        return self.get_videos_by_video_ids(video_ids)
+
+    def get_videos_by_channel_id(self, channel_id, limit=10):
+        video_ids = []
+        resource = (
+            self.handler.connect()
+            .search()
+            .list(part="snippet", channelId=channel_id, type="video", maxResults=min(50, limit))
+        )
+        total_fetched = 0
+        while resource and total_fetched < limit:
             response = resource.execute()
             for item in response["items"]:
                 video_ids.append(item["id"]["videoId"])
-
+                total_fetched += 1
+                if total_fetched >= limit:
+                    break
+            if "nextPageToken" in response and total_fetched < limit:
                 resource = (
                     self.handler.connect()
                     .search()
@@ -370,6 +491,7 @@ class YoutubeVideosTable(APITable):
                         part="snippet",
                         channelId=channel_id,
                         type="video",
+                        maxResults=min(50, limit - total_fetched),
                         pageToken=response["nextPageToken"],
                     )
                 )
@@ -388,7 +510,13 @@ class YoutubeVideosTable(APITable):
         # loop over 50 video ids at a time
         # an invalid request error is caused otherwise
         for i in range(0, len(video_ids), 50):
-            resource =
+            resource = (
+                self.handler.connect()
+                .videos()
+                .list(part="statistics,snippet,contentDetails", id=",".join(video_ids[i : i + 50]))
+                .execute()
+            )
+
             for item in resource["items"]:
                 data.append(
                     {
@@ -415,7 +543,7 @@ class YoutubeVideosTable(APITable):
                 return json_formatted_transcript
 
         except Exception as e:
-            logger.error(f"Encountered an error while fetching transcripts for video ${video_id}: ${e}"),
+            (logger.error(f"Encountered an error while fetching transcripts for video ${video_id}: ${e}"),)
            return "Transcript not available for this video"
 
     def parse_duration(self, video_id, duration):
@@ -428,7 +556,7 @@ class YoutubeVideosTable(APITable):
 
            return duration_str.strip(":")
        except Exception as e:
-            logger.error(f"Encountered an error while parsing duration for video ${video_id}: ${e}"),
+            (logger.error(f"Encountered an error while parsing duration for video ${video_id}: ${e}"),)
            return "Duration not available for this video"
 
     def get_columns(self) -> List[str]:
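
All three get_videos_by_* helpers above share one pattern: page through search().list with nextPageToken, request at most 50 results per page (the API maximum), and stop once limit ids are collected. A standalone sketch of that loop against the YouTube Data API, assuming an API key in a YOUTUBE_API_KEY environment variable (the handler itself obtains its client through self.handler.connect()):

    import os

    from googleapiclient.discovery import build  # google-api-python-client

    def search_video_ids(query: str, limit: int = 10) -> list:
        """Collect up to `limit` video ids for a search query, at most 50 per page."""
        youtube = build("youtube", "v3", developerKey=os.environ["YOUTUBE_API_KEY"])
        video_ids, page_token = [], None
        while len(video_ids) < limit:
            params = {
                "part": "snippet",
                "q": query,
                "type": "video",
                "maxResults": min(50, limit - len(video_ids)),
            }
            if page_token:
                params["pageToken"] = page_token
            response = youtube.search().list(**params).execute()
            video_ids += [item["id"]["videoId"] for item in response["items"]]
            page_token = response.get("nextPageToken")
            if not page_token:
                break
        return video_ids[:limit]
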
mindsdb/integrations/libs/vectordatabase_handler.py
@@ -334,12 +334,21 @@ class VectorStoreHandler(BaseHandler):
 
         if not df_update.empty:
             # get values of existed `created_at` and return them to metadata
-
+            origin_id_col = "_original_doc_id"
+
+            created_dates, ids = {}, {}
+            for _, row in df_existed.iterrows():
+                chunk_id = row[id_col]
+                created_dates[chunk_id] = row[metadata_col].get("_created_at")
+                ids[chunk_id] = row[metadata_col].get(origin_id_col)
 
             def keep_created_at(row):
                 val = created_dates.get(row[id_col])
                 if val:
                     row[metadata_col]["_created_at"] = val
+                # keep id column
+                if origin_id_col not in row[metadata_col]:
+                    row[metadata_col][origin_id_col] = ids.get(row[id_col])
                 return row
 
             df_update.apply(keep_created_at, axis=1)