MindsDB 25.3.3.0__py3-none-any.whl → 25.3.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic. Click here for more details.

Files changed (45)
  1. mindsdb/__about__.py +2 -2
  2. mindsdb/api/executor/datahub/datanodes/information_schema_datanode.py +2 -6
  3. mindsdb/api/executor/datahub/datanodes/mindsdb_tables.py +1 -1
  4. mindsdb/api/http/namespaces/agents.py +9 -5
  5. mindsdb/api/http/namespaces/chatbots.py +6 -5
  6. mindsdb/api/http/namespaces/databases.py +5 -6
  7. mindsdb/api/http/namespaces/skills.py +5 -4
  8. mindsdb/api/http/namespaces/views.py +6 -7
  9. mindsdb/integrations/handlers/chromadb_handler/chromadb_handler.py +23 -2
  10. mindsdb/integrations/handlers/confluence_handler/confluence_api_client.py +164 -0
  11. mindsdb/integrations/handlers/confluence_handler/confluence_handler.py +54 -59
  12. mindsdb/integrations/handlers/confluence_handler/confluence_tables.py +753 -0
  13. mindsdb/integrations/handlers/confluence_handler/connection_args.py +8 -8
  14. mindsdb/integrations/handlers/dummy_data_handler/dummy_data_handler.py +16 -6
  15. mindsdb/integrations/handlers/file_handler/tests/test_file_handler.py +64 -83
  16. mindsdb/integrations/handlers/huggingface_handler/requirements.txt +5 -4
  17. mindsdb/integrations/handlers/huggingface_handler/requirements_cpu.txt +5 -5
  18. mindsdb/integrations/handlers/langchain_handler/requirements.txt +1 -1
  19. mindsdb/integrations/handlers/lightwood_handler/requirements.txt +3 -3
  20. mindsdb/integrations/handlers/litellm_handler/requirements.txt +1 -1
  21. mindsdb/integrations/handlers/llama_index_handler/requirements.txt +1 -1
  22. mindsdb/integrations/handlers/ms_one_drive_handler/ms_graph_api_one_drive_client.py +1 -1
  23. mindsdb/integrations/handlers/ms_teams_handler/ms_graph_api_teams_client.py +278 -0
  24. mindsdb/integrations/handlers/ms_teams_handler/ms_teams_handler.py +114 -70
  25. mindsdb/integrations/handlers/ms_teams_handler/ms_teams_tables.py +431 -0
  26. mindsdb/integrations/handlers/mysql_handler/mysql_handler.py +2 -0
  27. mindsdb/integrations/handlers/pgvector_handler/pgvector_handler.py +18 -4
  28. mindsdb/integrations/handlers/ray_serve_handler/ray_serve_handler.py +18 -16
  29. mindsdb/integrations/handlers/snowflake_handler/snowflake_handler.py +26 -1
  30. mindsdb/integrations/libs/vectordatabase_handler.py +2 -2
  31. mindsdb/integrations/utilities/files/file_reader.py +3 -3
  32. mindsdb/integrations/utilities/handlers/api_utilities/microsoft/ms_graph_api_utilities.py +36 -2
  33. mindsdb/integrations/utilities/rag/settings.py +1 -0
  34. mindsdb/interfaces/chatbot/chatbot_controller.py +6 -4
  35. mindsdb/interfaces/jobs/jobs_controller.py +1 -4
  36. mindsdb/interfaces/knowledge_base/controller.py +9 -28
  37. mindsdb/interfaces/knowledge_base/preprocessing/document_preprocessor.py +1 -1
  38. mindsdb/interfaces/skills/skills_controller.py +8 -7
  39. {mindsdb-25.3.3.0.dist-info → mindsdb-25.3.4.1.dist-info}/METADATA +237 -237
  40. {mindsdb-25.3.3.0.dist-info → mindsdb-25.3.4.1.dist-info}/RECORD +43 -41
  41. {mindsdb-25.3.3.0.dist-info → mindsdb-25.3.4.1.dist-info}/WHEEL +1 -1
  42. mindsdb/integrations/handlers/confluence_handler/confluence_table.py +0 -193
  43. mindsdb/integrations/handlers/confluence_handler/requirements.txt +0 -1
  44. {mindsdb-25.3.3.0.dist-info → mindsdb-25.3.4.1.dist-info/licenses}/LICENSE +0 -0
  45. {mindsdb-25.3.3.0.dist-info → mindsdb-25.3.4.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,431 @@
1
+ from typing import List
2
+
3
+ import pandas as pd
4
+
5
+ from mindsdb.integrations.handlers.ms_teams_handler.ms_graph_api_teams_client import MSGraphAPITeamsDelegatedPermissionsClient
6
+ from mindsdb.integrations.libs.api_handler import APIResource
7
+ from mindsdb.integrations.utilities.sql_utils import (
8
+ FilterCondition,
9
+ FilterOperator,
10
+ SortColumn
11
+ )
12
+
13
+
14
class TeamsTable(APIResource):
    """
    The table abstraction for the 'teams' resource of the Microsoft Graph API.
    """
    def list(
        self,
        conditions: List[FilterCondition] = None,
        limit: int = None,
        sort: List[SortColumn] = None,
        targets: List[str] = None,
        **kwargs
    ) -> pd.DataFrame:
        """
        Executes a parsed SELECT SQL query on the 'teams' resource of the Microsoft Graph API.

        Nothing is pushed down to the API here: every group returned by the client is fetched,
        and `conditions`, `limit`, `sort` and `targets` are not consulted by this method.

        Args:
            conditions (List[FilterCondition]): The list of parsed filter conditions.
            limit (int): The maximum number of records to return.
            sort (List[SortColumn]): The list of parsed sort columns.
            targets (List[str]): The list of target columns to return.

        Returns:
            pd.DataFrame: One row per team, restricted to the columns listed by `get_columns()`.
        """
        client: MSGraphAPITeamsDelegatedPermissionsClient = self.handler.connect()
        teams = client.get_all_groups()

        # Nested attributes are flattened into '<parent>_<child>' column names.
        teams_df = pd.json_normalize(teams, sep="_")

        # `reindex` instead of `teams_df[columns]`: an empty result (or a payload that omits one
        # of the attributes) then yields empty/NaN columns rather than raising a KeyError.
        return teams_df.reindex(columns=self.get_columns())

    def get_columns(self) -> List[str]:
        """
        Retrieves the attributes (columns) of the 'teams' resource.

        Returns:
            List[str]: A list of attributes (columns) of the 'teams' resource.
        """
        return [
            "id",
            "createdDateTime",
            "displayName",
            "description",
            "internalId",
            "classification",
            "specialization",
            "visibility",
            "webUrl",
            "isArchived",
            "tenantId",
            "isMembershipLimitedToOwners",
        ]
64
+
65
+
66
class ChannelsTable(APIResource):
    """
    The table abstraction for the 'channels' resource of the Microsoft Graph API.
    """
    def list(
        self,
        conditions: List[FilterCondition] = None,
        limit: int = None,
        sort: List[SortColumn] = None,
        targets: List[str] = None,
        **kwargs
    ) -> pd.DataFrame:
        """
        Executes a parsed SELECT SQL query on the 'channels' resource of the Microsoft Graph API.

        Conditions on 'teamId' (=) and 'id' (= / IN) select which client call is made and are
        marked as applied; any other condition is left untouched. `limit`, `sort` and `targets`
        are not consulted by this method.

        Args:
            conditions (List[FilterCondition]): The list of parsed filter conditions.
            limit (int): The maximum number of records to return.
            sort (List[SortColumn]): The list of parsed sort columns.
            targets (List[str]): The list of target columns to return.

        Returns:
            pd.DataFrame: One row per channel, restricted to the columns listed by `get_columns()`.

        Raises:
            ValueError: If 'teamId' or 'id' is filtered with an unsupported operator.
        """
        client: MSGraphAPITeamsDelegatedPermissionsClient = self.handler.connect()

        team_id, channel_ids = None, None
        # `conditions` defaults to None; treat that the same as "no filters" instead of
        # failing with a TypeError when iterating.
        for condition in conditions or []:
            if condition.column == "teamId":
                if condition.op != FilterOperator.EQUAL:
                    raise ValueError(
                        f"Unsupported operator '{condition.op}' for column 'teamId'."
                    )

                team_id = condition.value
                condition.applied = True

            elif condition.column == "id":
                if condition.op == FilterOperator.EQUAL:
                    channel_ids = [condition.value]

                elif condition.op == FilterOperator.IN:
                    channel_ids = condition.value

                else:
                    raise ValueError(
                        f"Unsupported operator '{condition.op}' for column 'id'."
                    )

                condition.applied = True

        # Pick the narrowest client call the supplied filters allow.
        if team_id:
            if channel_ids:
                channels = client.get_channels_in_group_by_ids(team_id, channel_ids)

            else:
                channels = client.get_all_channels_in_group(team_id)

        elif channel_ids:
            channels = client.get_channels_across_all_groups_by_ids(channel_ids)

        else:
            channels = client.get_all_channels_across_all_groups()

        channels_df = pd.json_normalize(channels, sep="_")

        # `reindex` instead of `channels_df[columns]`: an empty result (or a payload that omits
        # one of the attributes) then yields empty/NaN columns rather than raising a KeyError.
        return channels_df.reindex(columns=self.get_columns())

    def get_columns(self) -> List[str]:
        """
        Retrieves the attributes (columns) of the 'channels' resource.

        Returns:
            List[str]: A list of attributes (columns) of the 'channels' resource.
        """
        return [
            "id",
            "createdDateTime",
            "displayName",
            "description",
            "isFavoriteByDefault",
            "email",
            "tenantId",
            "webUrl",
            "membershipType",
            "teamId",
        ]
154
+
155
+
156
class ChannelMessagesTable(APIResource):
    """
    The table abstraction for the 'channel messages' resource of the Microsoft Graph API.
    """
    def list(
        self,
        conditions: List[FilterCondition] = None,
        limit: int = None,
        sort: List[SortColumn] = None,
        targets: List[str] = None,
        **kwargs
    ) -> pd.DataFrame:
        """
        Executes a parsed SELECT SQL query on the 'channel messages' resource of the Microsoft Graph API.

        Both 'channelIdentity_teamId' and 'channelIdentity_channelId' must be filtered with '='.
        An optional 'id' filter (= / IN) narrows the lookup to specific messages. These conditions
        are marked as applied; any other condition is left untouched. `limit` is forwarded to the
        client only when no 'id' filter is given; `sort` and `targets` are not consulted here.

        Args:
            conditions (List[FilterCondition]): The list of parsed filter conditions.
            limit (int): The maximum number of records to return.
            sort (List[SortColumn]): The list of parsed sort columns.
            targets (List[str]): The list of target columns to return.

        Returns:
            pd.DataFrame: One row per message, restricted to the columns listed by `get_columns()`.

        Raises:
            ValueError: If a supported column is filtered with an unsupported operator, or if the
                team or channel identifier is missing.
        """
        client: MSGraphAPITeamsDelegatedPermissionsClient = self.handler.connect()

        group_id, channel_id, message_ids = None, None, None
        # `conditions` defaults to None; treat that the same as "no filters" so the caller gets
        # the explicit "columns are required" error below instead of a TypeError.
        for condition in conditions or []:
            if condition.column == "channelIdentity_teamId":
                if condition.op != FilterOperator.EQUAL:
                    raise ValueError(
                        f"Unsupported operator '{condition.op}' for column 'channelIdentity_teamId'."
                    )

                group_id = condition.value
                condition.applied = True

            elif condition.column == "channelIdentity_channelId":
                if condition.op != FilterOperator.EQUAL:
                    raise ValueError(
                        f"Unsupported operator '{condition.op}' for column 'channelIdentity_channelId'."
                    )

                channel_id = condition.value
                condition.applied = True

            elif condition.column == "id":
                if condition.op == FilterOperator.EQUAL:
                    message_ids = [condition.value]

                elif condition.op == FilterOperator.IN:
                    message_ids = condition.value

                else:
                    raise ValueError(
                        f"Unsupported operator '{condition.op}' for column 'id'."
                    )

                condition.applied = True

        if not group_id or not channel_id:
            raise ValueError("The 'channelIdentity_teamId' and 'channelIdentity_channelId' columns are required.")

        if message_ids:
            messages = client.get_messages_in_channel_by_ids(group_id, channel_id, message_ids)

        else:
            messages = client.get_all_messages_in_channel(group_id, channel_id, limit)

        # Nested attributes are flattened into '<parent>_<child>' column names
        # (e.g. 'from_user_id', 'body_content').
        messages_df = pd.json_normalize(messages, sep="_")

        # `reindex` instead of `messages_df[columns]`: an empty result (or a payload that omits
        # one of the attributes) then yields empty/NaN columns rather than raising a KeyError.
        return messages_df.reindex(columns=self.get_columns())

    def get_columns(self) -> List[str]:
        """
        Retrieves the attributes (columns) of the 'channel messages' resource.

        Returns:
            List[str]: A list of attributes (columns) of the 'channel messages' resource.
        """
        return [
            "id",
            "replyToId",
            "etag",
            "messageType",
            "createdDateTime",
            "lastModifiedDateTime",
            "lastEditedDateTime",
            "deletedDateTime",
            "subject",
            "summary",
            "chatId",
            "importance",
            "locale",
            "webUrl",
            "policyViolation",
            "from_application",
            "from_device",
            "from_user_id",
            "from_user_displayName",
            "from_user_userIdentityType",
            "body_contentType",
            "body_content",
            "channelIdentity_teamId",
            "channelIdentity_channelId",
        ]
265
+
266
+
267
class ChatsTable(APIResource):
    """
    The table abstraction for the 'chats' resource of the Microsoft Graph API.
    """
    def list(
        self,
        conditions: List[FilterCondition] = None,
        limit: int = None,
        sort: List[SortColumn] = None,
        targets: List[str] = None,
        **kwargs
    ) -> pd.DataFrame:
        """
        Executes a parsed SELECT SQL query on the 'chats' resource of the Microsoft Graph API.

        An 'id' filter (= / IN) switches to a lookup by chat id and is marked as applied; any
        other condition is left untouched. `limit` is forwarded to the client only when no 'id'
        filter is given; `sort` and `targets` are not consulted here.

        Args:
            conditions (List[FilterCondition]): The list of parsed filter conditions.
            limit (int): The maximum number of records to return.
            sort (List[SortColumn]): The list of parsed sort columns.
            targets (List[str]): The list of target columns to return.

        Returns:
            pd.DataFrame: One row per chat, restricted to the columns listed by `get_columns()`.

        Raises:
            ValueError: If 'id' is filtered with an unsupported operator.
        """
        client: MSGraphAPITeamsDelegatedPermissionsClient = self.handler.connect()

        chat_ids = None
        # `conditions` defaults to None; treat that the same as "no filters" instead of
        # failing with a TypeError when iterating.
        for condition in conditions or []:
            if condition.column != "id":
                continue

            if condition.op == FilterOperator.EQUAL:
                chat_ids = [condition.value]

            elif condition.op == FilterOperator.IN:
                chat_ids = condition.value

            else:
                raise ValueError(
                    f"Unsupported operator '{condition.op}' for column 'id'."
                )

            condition.applied = True

        if chat_ids:
            chats = client.get_chats_by_ids(chat_ids)

        else:
            chats = client.get_all_chats(limit)

        chats_df = pd.json_normalize(chats, sep="_")

        # `reindex` instead of `chats_df[columns]`: an empty result (or a payload that omits one
        # of the attributes) then yields empty/NaN columns rather than raising a KeyError.
        return chats_df.reindex(columns=self.get_columns())

    def get_columns(self) -> List[str]:
        """
        Retrieves the attributes (columns) of the 'chats' resource.

        Returns:
            List[str]: A list of attributes (columns) of the 'chats' resource.
        """
        return [
            "id",
            "topic",
            "createdDateTime",
            "lastUpdatedDateTime",
            "chatType",
            "webUrl",
            "isHiddenForAllMembers"
        ]
334
+
335
+
336
class ChatMessagesTable(APIResource):
    """
    The table abstraction for the 'chat messages' resource of the Microsoft Graph API.
    """
    def list(
        self,
        conditions: List[FilterCondition] = None,
        limit: int = None,
        sort: List[SortColumn] = None,
        targets: List[str] = None,
        **kwargs
    ) -> pd.DataFrame:
        """
        Executes a parsed SELECT SQL query on the 'chat messages' resource of the Microsoft Graph API.

        'chatId' must be filtered with '='. An optional 'id' filter (= / IN) narrows the lookup to
        specific messages. These conditions are marked as applied; any other condition is left
        untouched. `limit` is forwarded to the client only when no 'id' filter is given; `sort`
        and `targets` are not consulted here.

        Args:
            conditions (List[FilterCondition]): The list of parsed filter conditions.
            limit (int): The maximum number of records to return.
            sort (List[SortColumn]): The list of parsed sort columns.
            targets (List[str]): The list of target columns to return.

        Returns:
            pd.DataFrame: One row per message, restricted to the columns listed by `get_columns()`.

        Raises:
            ValueError: If a supported column is filtered with an unsupported operator, or if
                'chatId' is missing.
        """
        client: MSGraphAPITeamsDelegatedPermissionsClient = self.handler.connect()

        chat_id, message_ids = None, None
        # `conditions` defaults to None; treat that the same as "no filters" so the caller gets
        # the explicit "'chatId' column is required" error below instead of a TypeError.
        for condition in conditions or []:
            if condition.column == "chatId":
                if condition.op != FilterOperator.EQUAL:
                    raise ValueError(
                        f"Unsupported operator '{condition.op}' for column 'chatId'."
                    )

                chat_id = condition.value
                condition.applied = True

            elif condition.column == "id":
                if condition.op == FilterOperator.EQUAL:
                    message_ids = [condition.value]

                elif condition.op == FilterOperator.IN:
                    message_ids = condition.value

                else:
                    raise ValueError(
                        f"Unsupported operator '{condition.op}' for column 'id'."
                    )

                condition.applied = True

        if not chat_id:
            raise ValueError("The 'chatId' column is required.")

        if message_ids:
            messages = client.get_messages_in_chat_by_ids(chat_id, message_ids)

        else:
            messages = client.get_all_messages_in_chat(chat_id, limit)

        # Nested attributes are flattened into '<parent>_<child>' column names
        # (e.g. 'from_user_id', 'body_content').
        messages_df = pd.json_normalize(messages, sep="_")

        # `reindex` instead of `messages_df[columns]`: an empty result (or a payload that omits
        # one of the attributes) then yields empty/NaN columns rather than raising a KeyError.
        return messages_df.reindex(columns=self.get_columns())

    def get_columns(self) -> List[str]:
        """
        Retrieves the attributes (columns) of the 'chat messages' resource.

        Returns:
            List[str]: A list of attributes (columns) of the 'chat messages' resource.
        """
        return [
            "id",
            "replyToId",
            "etag",
            "messageType",
            "createdDateTime",
            "lastModifiedDateTime",
            "lastEditedDateTime",
            "deletedDateTime",
            "subject",
            "summary",
            "chatId",
            "importance",
            "locale",
            "webUrl",
            "policyViolation",
            "from_application",
            "from_device",
            "from_user_id",
            "from_user_displayName",
            "from_user_userIdentityType",
            "body_contentType",
            "body_content",
        ]
@@ -94,6 +94,8 @@ class MySQLHandler(DatabaseHandler):
94
94
  config["ssl_key"] = ssl_key
95
95
  if 'collation' not in config:
96
96
  config['collation'] = 'utf8mb4_general_ci'
97
+ if 'use_pure' not in config:
98
+ config['use_pure'] = True
97
99
  try:
98
100
  connection = mysql.connector.connect(**config)
99
101
  connection.autocommit = True
@@ -114,13 +114,27 @@ class PgVectorHandler(PostgresHandler, VectorStoreHandler):
114
114
  if conditions is None:
115
115
  return {}
116
116
 
117
- return {
118
- condition.column.split(".")[-1]: {
117
+ filter_conditions = {}
118
+
119
+ for condition in conditions:
120
+
121
+ parts = condition.column.split(".")
122
+ key = parts[0]
123
+ # converts 'col.el1.el2' to col->'el1'->>'el2'
124
+ if len(parts) > 1:
125
+ # intermediate elements
126
+ for el in parts[1:-1]:
127
+ key += f" -> '{el}'"
128
+
129
+ # last element
130
+ key += f" ->> '{parts[-1]}'"
131
+
132
+ filter_conditions[key] = {
119
133
  "op": condition.op.value,
120
134
  "value": condition.value,
121
135
  }
122
- for condition in conditions
123
- }
136
+
137
+ return filter_conditions
124
138
 
125
139
  @staticmethod
126
140
  def _construct_where_clause(filter_conditions=None):
@@ -81,27 +81,29 @@ class RayServeHandler(BaseMLEngine):
81
81
  resp = requests.post(args['predict_url'],
82
82
  json={'df': df.to_json(orient='records'), 'pred_args': pred_args},
83
83
  headers={'content-type': 'application/json; format=pandas-records'})
84
- try:
85
- if args.get('is_parquet', False):
84
+ content_type = resp.headers.get("Content-Type", "")
85
+ if "application/octet-stream" in content_type:
86
+ try:
86
87
  buffer = io.BytesIO(resp.content)
87
88
  table = pq.read_table(buffer)
88
89
  response = table.to_pandas()
89
- else:
90
+ except Exception:
91
+ error = 'Could not decode parquet.'
92
+ else:
93
+ try:
90
94
  response = resp.json()
91
- except json.JSONDecodeError:
92
- error = resp.text
93
- except Exception:
94
- error = 'Could not decode parquet.'
95
+ except json.JSONDecodeError:
96
+ error = resp.text
97
+
98
+ if 'prediction' in response:
99
+ target = args['target']
100
+ if target != 'prediction':
101
+ # rename prediction to target
102
+ response[target] = response.pop('prediction')
103
+ return pd.DataFrame(response)
95
104
  else:
96
- if 'prediction' in response:
97
- target = args['target']
98
- if target != 'prediction':
99
- # rename prediction to target
100
- response[target] = response.pop('prediction')
101
- return pd.DataFrame(response)
102
- else:
103
- # something wrong
104
- error = response
105
+ # something wrong
106
+ error = response
105
107
 
106
108
  raise RayServeException(f"Error: {error}")
107
109
 
@@ -7,6 +7,8 @@ from snowflake.connector.errors import NotSupportedError
7
7
 
8
8
  from mindsdb.utilities import log
9
9
  from mindsdb_sql_parser.ast.base import ASTNode
10
+ from mindsdb_sql_parser.ast import Select, Identifier
11
+
10
12
  from mindsdb.integrations.libs.base import DatabaseHandler
11
13
  from mindsdb.utilities.render.sqlalchemy_render import SqlalchemyRender
12
14
  from mindsdb.integrations.libs.response import (
@@ -234,7 +236,30 @@ class SnowflakeHandler(DatabaseHandler):
234
236
 
235
237
  query_str = self.renderer.get_string(query, with_failback=True)
236
238
  logger.debug(f"Executing SQL query: {query_str}")
237
- return self.native_query(query_str)
239
+ result = self.native_query(query_str)
240
+ return self.lowercase_columns(result, query)
241
+
242
def lowercase_columns(self, result, query):
    """
    Lower-cases the all-upper-case column names of a SELECT result.

    Columns whose name matches a quoted target (or a quoted alias) of the query are left
    exactly as returned. Results of non-SELECT queries, and results without a data frame,
    are returned untouched.
    NOTE(review): presumably this compensates for the backend upper-casing unquoted
    identifiers - confirm.

    Args:
        result: The response of the executed query; its `data_frame` is renamed in place.
        query: The AST node of the query that produced `result`.

    Returns:
        The same `result` object.
    """
    if not isinstance(query, Select) or result.data_frame is None:
        return result

    # Names the user explicitly quoted: these must keep their original case.
    keep_as_is = set()
    for target in query.targets or ():
        alias = getattr(target, 'alias', None)
        if alias is not None:
            # An alias, when present, is what names the output column.
            named_by = alias
        elif isinstance(target, Identifier):
            named_by = target
        else:
            continue

        if named_by.is_quoted[-1]:
            keep_as_is.add(named_by.parts[-1])

    lowered = {
        name: name.lower()
        for name in result.data_frame.columns
        if name.isupper() and name not in keep_as_is
    }
    if lowered:
        result.data_frame = result.data_frame.rename(columns=lowered)
    return result
238
263
 
239
264
  def get_tables(self) -> Response:
240
265
  """
@@ -325,7 +325,7 @@ class VectorStoreHandler(BaseHandler):
325
325
  if not df_insert.empty:
326
326
  self.insert(table_name, df_insert)
327
327
 
328
- def _dispatch_delete(self, query: Delete):
328
+ def dispatch_delete(self, query: Delete):
329
329
  """
330
330
  Dispatch delete query to the appropriate method.
331
331
  """
@@ -382,7 +382,7 @@ class VectorStoreHandler(BaseHandler):
382
382
  DropTables: self._dispatch_drop_table,
383
383
  Insert: self._dispatch_insert,
384
384
  Update: self._dispatch_update,
385
- Delete: self._dispatch_delete,
385
+ Delete: self.dispatch_delete,
386
386
  Select: self.dispatch_select,
387
387
  }
388
388
  if type(query) in dispatch_router:
@@ -309,7 +309,7 @@ class FileReader(FormatDetector):
309
309
  )
310
310
  text = file_obj.read()
311
311
 
312
- metadata = {"source": name}
312
+ metadata = {"source_file": name, "file_format": "txt"}
313
313
  documents = [Document(page_content=text, metadata=metadata)]
314
314
 
315
315
  text_splitter = RecursiveCharacterTextSplitter(
@@ -325,7 +325,7 @@ class FileReader(FormatDetector):
325
325
  )
326
326
 
327
327
  @staticmethod
328
- def read_pdf(file_obj: BytesIO, **kwargs):
328
+ def read_pdf(file_obj: BytesIO, name=None, **kwargs):
329
329
 
330
330
  with fitz.open(stream=file_obj.read()) as pdf: # open pdf
331
331
  text = chr(12).join([page.get_text() for page in pdf])
@@ -337,7 +337,7 @@ class FileReader(FormatDetector):
337
337
  split_text = text_splitter.split_text(text)
338
338
 
339
339
  return pd.DataFrame(
340
- {"content": split_text, "metadata": [{}] * len(split_text)}
340
+ {"content": split_text, "metadata": [{"file_format": "pdf", "source_file": name}] * len(split_text)}
341
341
  )
342
342
 
343
343
  @staticmethod
@@ -87,7 +87,7 @@ class MSGraphAPIBaseClient:
87
87
 
88
88
  return response
89
89
 
90
- def fetch_paginated_data(self, endpoint: Text, params: Optional[Dict] = {}) -> Generator:
90
+ def fetch_paginated_data(self, endpoint: Text, params: Optional[Dict] = None) -> Generator:
91
91
  """
92
92
  Fetches data from the Microsoft Graph API by making the specified request and handling pagination.
93
93
 
@@ -98,6 +98,8 @@ class MSGraphAPIBaseClient:
98
98
  Yields:
99
99
  List: The data fetched from the Microsoft Graph API.
100
100
  """
101
+ if params is None:
102
+ params = {}
101
103
  api_url = self._get_api_url(endpoint)
102
104
 
103
105
  # Add the pagination count to the request parameters.
@@ -115,7 +117,7 @@ class MSGraphAPIBaseClient:
115
117
  api_url = response_json.get("@odata.nextLink", "")
116
118
  yield value
117
119
 
118
- def fetch_data(self, endpoint: str, params: Optional[Dict] = {}) -> Union[List, Dict, bytes]:
120
+ def _fetch_data(self, endpoint: str, params: Optional[Dict] = {}) -> Union[List, Dict, bytes]:
119
121
  """
120
122
  Fetches data from the Microsoft Graph API by making the specified request.
121
123
 
@@ -129,4 +131,36 @@ class MSGraphAPIBaseClient:
129
131
  api_url = self._get_api_url(endpoint)
130
132
 
131
133
  response = self._make_request(api_url, params)
134
+ return response
135
+
136
def fetch_data_content(self, endpoint: str, params: Optional[Dict] = None) -> bytes:
    """
    Fetches data content from the Microsoft Graph API by making the specified request.

    Args:
        endpoint (str): The endpoint of the Microsoft Graph API to fetch data from.
        params (Optional[Dict]): The parameters to include in the request. Defaults to no parameters.

    Returns:
        bytes: The data content fetched from the Microsoft Graph API.
    """
    # Build a fresh dict per call rather than using a `{}` default: a default dict is created
    # once and shared by every call, so any downstream mutation would leak between requests.
    if params is None:
        params = {}

    response = self._fetch_data(endpoint, params)
    return response.content
149
+
150
def fetch_data_json(self, endpoint: str, params: Optional[Dict] = None) -> Union[List, Dict]:
    """
    Fetches data from the Microsoft Graph API by making the specified request and returns the JSON response.

    Args:
        endpoint (str): The endpoint of the Microsoft Graph API to fetch data from.
        params (Optional[Dict]): The parameters to include in the request. Defaults to no parameters.

    Returns:
        Union[List, Dict]: The entry under "value" when the JSON payload is an object that has
            one; otherwise the decoded JSON payload itself.
    """
    # Build a fresh dict per call rather than using a `{}` default: a default dict is created
    # once and shared by every call, so any downstream mutation would leak between requests.
    if params is None:
        params = {}

    response = self._fetch_data(endpoint, params)
    response_json = response.json()

    # Only unwrap object payloads: for a top-level JSON array, `"value" in ...` would be an
    # element test and the subsequent string index would raise a TypeError.
    if isinstance(response_json, dict) and "value" in response_json:
        return response_json["value"]
    return response_json
@@ -551,6 +551,7 @@ class ColumnSchema(BaseModel):
551
551
  Dict[Union[str, int, float], ValueSchema],
552
552
  ]
553
553
  ] = Field(
554
+ default=None,
554
555
  description="One of the following. A dict or ordered dict of {schema_value: ValueSchema, ...}, where schema value is the name given for this value description in the schema."
555
556
  )
556
557
  example_questions: Optional[List[LLMExample]] = Field(