MindsDB 25.2.2.1__py3-none-any.whl → 25.2.3.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of MindsDB might be problematic. Click here for more details.

@@ -3,12 +3,17 @@ import os
3
3
  import shutil
4
4
  from pathlib import Path
5
5
 
6
- from mindsdb.integrations.handlers.file_handler import Handler as FileHandler
6
+ import pandas as pd
7
+
7
8
  from mindsdb.interfaces.storage import db
8
9
  from mindsdb.interfaces.storage.fs import FsStore
9
10
  from mindsdb.utilities import log
10
11
  from mindsdb.utilities.config import Config
11
12
  from mindsdb.utilities.context import context as ctx
13
+ from sqlalchemy.orm.attributes import flag_modified
14
+
15
+ from mindsdb.integrations.utilities.files.file_reader import FileReader
16
+
12
17
 
13
18
  logger = log.getLogger(__name__)
14
19
 
@@ -82,31 +87,38 @@ class FileController:
82
87
 
83
88
  file_dir = None
84
89
  try:
85
- df = FileHandler.handle_source(file_path)
86
-
87
- ds_meta = {"row_count": len(df), "column_names": list(df.columns)}
90
+ pages_files, pages_index = self.get_file_pages(file_path)
88
91
 
92
+ metadata = {
93
+ 'is_feather': True,
94
+ 'pages': pages_index
95
+ }
96
+ df = pages_files[0]
89
97
  file_record = db.File(
90
98
  name=name,
91
99
  company_id=ctx.company_id,
92
100
  source_file_path=file_name,
93
101
  file_path="",
94
- row_count=ds_meta["row_count"],
95
- columns=ds_meta["column_names"],
102
+ row_count=len(df),
103
+ columns=list(df.columns),
104
+ metadata_=metadata
96
105
  )
97
106
  db.session.add(file_record)
98
- db.session.commit()
107
+ db.session.flush()
108
+
99
109
  store_file_path = f"file_{ctx.company_id}_{file_record.id}"
100
110
  file_record.file_path = store_file_path
101
- db.session.commit()
102
111
 
103
112
  file_dir = Path(self.dir).joinpath(store_file_path)
104
113
  file_dir.mkdir(parents=True, exist_ok=True)
105
- source = file_dir.joinpath(file_name)
106
- # NOTE may be delay between db record exists and file is really in folder
107
- shutil.move(file_path, str(source))
114
+
115
+ self.store_pages_as_feather(file_dir, pages_files)
116
+ # store original file
117
+ shutil.move(file_path, str(file_dir.joinpath(file_name)))
108
118
 
109
119
  self.fs_store.put(store_file_path, base_dir=self.dir)
120
+ db.session.commit()
121
+
110
122
  except Exception as e:
111
123
  logger.error(e)
112
124
  if file_dir is not None:
@@ -115,6 +127,39 @@ class FileController:
115
127
 
116
128
  return file_record.id
117
129
 
130
+ def get_file_pages(self, source_path: str):
131
+ """
132
+ Reads file and extract pages from it
133
+ Returned structures:
134
+ - page_files: dict with content, {page_num: dataframe}
135
+ - pages_index: dict, link between page name and num: {page_name: page_num}
136
+ """
137
+ file_reader = FileReader(path=source_path)
138
+ tables = file_reader.get_contents()
139
+
140
+ pages_files = {}
141
+ pages_index = {}
142
+ if len(tables) == 1:
143
+ df = list(tables.values())[0]
144
+ pages_files[0] = df
145
+ else:
146
+ # file has several pages, create a new one with info
147
+ df = pd.DataFrame(tables.keys(), columns=["Tables"])
148
+ pages_files[0] = df
149
+ for i, page_name in enumerate(tables.keys(), 1):
150
+ pages_files[i] = tables[page_name]
151
+ pages_index[page_name] = i
152
+ return pages_files, pages_index
153
+
154
+ def store_pages_as_feather(self, dest_dir: Path, pages_files: dict):
155
+ """
156
+ Stores pages in file storage dir in feather format
157
+ """
158
+
159
+ for num, df in pages_files.items():
160
+ dest = dest_dir.joinpath(f'{num}.feather')
161
+ df.to_feather(str(dest))
162
+
118
163
  def delete_file(self, name):
119
164
  file_record = (
120
165
  db.session.query(db.File)
@@ -144,3 +189,87 @@ class FileController:
144
189
  .joinpath(file_dir)
145
190
  .joinpath(Path(file_record.source_file_path).name)
146
191
  )
192
+
193
+ def get_file_data(self, name: str, page_name: str = None) -> pd.DataFrame:
194
+ """
195
+ Returns file content as dataframe
196
+
197
+ :param name: name of file
198
+ :param page_name: page name, optional
199
+ :return: Page or file content
200
+ """
201
+ file_record = (
202
+ db.session.query(db.File)
203
+ .filter_by(company_id=ctx.company_id, name=name)
204
+ .first()
205
+ )
206
+ if file_record is None:
207
+ raise Exception(f"File '{name}' does not exists")
208
+
209
+ file_dir = f"file_{ctx.company_id}_{file_record.id}"
210
+ self.fs_store.get(file_dir, base_dir=self.dir)
211
+
212
+ metadata = file_record.metadata_ or {}
213
+ if metadata.get('is_feather') is not True:
214
+ # migrate file
215
+
216
+ file_path = (
217
+ Path(self.dir)
218
+ .joinpath(file_dir)
219
+ .joinpath(Path(file_record.source_file_path).name)
220
+ )
221
+
222
+ pages_files, pages_index = self.get_file_pages(str(file_path))
223
+
224
+ self.store_pages_as_feather(file_path.parent, pages_files)
225
+ metadata['is_feather'] = True
226
+ metadata['pages'] = pages_index
227
+
228
+ file_record.metadata_ = metadata
229
+ flag_modified(file_record, 'metadata_')
230
+ db.session.commit()
231
+
232
+ if page_name is None:
233
+ num = 0
234
+ else:
235
+ num = metadata.get('pages', {}).get(page_name)
236
+ if num is None:
237
+ raise KeyError(f'Page not found: {page_name}')
238
+
239
+ path = (
240
+ Path(self.dir)
241
+ .joinpath(file_dir)
242
+ .joinpath(f'{num}.feather')
243
+ )
244
+ return pd.read_feather(path)
245
+
246
+ def set_file_data(self, name: str, df: pd.DataFrame, page_name: str = None):
247
+ """
248
+ Save file content
249
+ :param name: name of file
250
+ :param df: content to store
251
+ :param page_name: name of page, optional
252
+ """
253
+
254
+ file_record = (
255
+ db.session.query(db.File)
256
+ .filter_by(company_id=ctx.company_id, name=name)
257
+ .first()
258
+ )
259
+ if file_record is None:
260
+ raise Exception(f"File '{name}' does not exists")
261
+
262
+ file_dir = f"file_{ctx.company_id}_{file_record.id}"
263
+ self.fs_store.get(file_dir, base_dir=self.dir)
264
+
265
+ num = 0
266
+ if page_name is not None and file_record.metadata_ is not None:
267
+ num = file_record.metadata_.get('pages', {}).get(page_name, 0)
268
+
269
+ path = (
270
+ Path(self.dir)
271
+ .joinpath(file_dir)
272
+ .joinpath(f'{num}.feather')
273
+ )
274
+ df.to_feather(path)
275
+ self.fs_store.put(file_dir, base_dir=self.dir)
@@ -258,6 +258,7 @@ class File(Base):
258
258
  row_count = Column(Integer, nullable=False)
259
259
  columns = Column(Json, nullable=False)
260
260
  created_at = Column(DateTime, default=datetime.datetime.now)
261
+ metadata_: dict = Column("metadata", JSON, nullable=True)
261
262
  updated_at = Column(
262
263
  DateTime, default=datetime.datetime.now, onupdate=datetime.datetime.now
263
264
  )
@@ -0,0 +1,31 @@
1
+ """file_metadata
2
+
3
+ Revision ID: 4943359e354a
4
+ Revises: c06c35f7e8e1
5
+ Create Date: 2025-02-09 10:10:55.577407
6
+
7
+ """
8
+ from alembic import op
9
+ import sqlalchemy as sa
10
+ import mindsdb.interfaces.storage.db # noqa
11
+
12
+
13
+ # revision identifiers, used by Alembic.
14
+ revision = '4943359e354a'
15
+ down_revision = 'c06c35f7e8e1'
16
+ branch_labels = None
17
+ depends_on = None
18
+
19
+
20
+ def upgrade():
21
+ # ### commands auto generated by Alembic - please adjust! ###
22
+ with op.batch_alter_table('file', schema=None) as batch_op:
23
+ batch_op.add_column(sa.Column('metadata', sa.JSON(), nullable=True))
24
+ # ### end Alembic commands ###
25
+
26
+
27
+ def downgrade():
28
+ # ### commands auto generated by Alembic - please adjust! ###
29
+ with op.batch_alter_table('file', schema=None) as batch_op:
30
+ batch_op.drop_column('metadata')
31
+ # ### end Alembic commands ###
@@ -93,9 +93,6 @@ class SqlalchemyRender:
93
93
  if hasattr(dialect, 'preparer'):
94
94
  class Preparer(dialect.preparer):
95
95
 
96
- def __init__(self, *args, **kwargs):
97
- super().__init__(*args, **kwargs)
98
-
99
96
  def _requires_quotes(self, value: str) -> bool:
100
97
  # check force-quote flag
101
98
  if isinstance(value, AttributedStr):
@@ -242,6 +239,8 @@ class SqlalchemyRender:
242
239
 
243
240
  op = t.op.lower()
244
241
  if op in ('in', 'not in'):
242
+ if t.args[1].parentheses:
243
+ arg1 = [arg1]
245
244
  if isinstance(arg1, sa.sql.selectable.ColumnClause):
246
245
  raise NotImplementedError(f'Required list argument for: {op}')
247
246
 
@@ -536,12 +535,19 @@ class SqlalchemyRender:
536
535
  query = query.select_from(table)
537
536
 
538
537
  # other tables
538
+ has_explicit_join = False
539
539
  for item in join_list[1:]:
540
540
  table = self.to_table(item['table'])
541
541
  if item['is_implicit']:
542
542
  # add to from clause
543
- query = query.select_from(table)
543
+ if has_explicit_join:
544
+ # sqlalchemy doesn't support implicit join after explicit
545
+ # convert it to explicit
546
+ query = query.join(table, sa.text('1=1'))
547
+ else:
548
+ query = query.select_from(table)
544
549
  else:
550
+ has_explicit_join = True
545
551
  if item['condition'] is None:
546
552
  # otherwise, sqlalchemy raises "Don't know how to join to ..."
547
553
  condition = sa.text('1=1')
@@ -564,7 +570,7 @@ class SqlalchemyRender:
564
570
  condition,
565
571
  full=is_full
566
572
  )
567
- elif isinstance(from_table, ast.Union):
573
+ elif isinstance(from_table, (ast.Union, ast.Intersect, ast.Except)):
568
574
  alias = None
569
575
  if from_table.alias:
570
576
  alias = self.get_alias(from_table.alias)