lfss 0.9.2__py3-none-any.whl → 0.11.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lfss/eng/database.py CHANGED
@@ -1,10 +1,12 @@
 
-from typing import Optional, Literal, AsyncIterable, overload
+from typing import Optional, Literal, overload
+from collections.abc import AsyncIterable
 from contextlib import asynccontextmanager
 from abc import ABC
+import re
 
+import uuid, datetime
 import urllib.parse
-import uuid
 import zipfile, io, asyncio
 
 import aiosqlite, aiofiles
@@ -19,7 +21,7 @@ from .datatype import (
 )
 from .config import LARGE_BLOB_DIR, CHUNK_SIZE, LARGE_FILE_BYTES, MAX_MEM_FILE_BYTES
 from .log import get_logger
-from .utils import decode_uri_compnents, hash_credential, concurrent_wrap, debounce_async, copy_file
+from .utils import decode_uri_compnents, hash_credential, concurrent_wrap, debounce_async, static_vars
 from .error import *
 
 class DBObjectBase(ABC):
@@ -82,9 +84,12 @@ class UserConn(DBObjectBase):
         self, username: str, password: str, is_admin: bool = False,
         max_storage: int = 1073741824, permission: FileReadPermission = FileReadPermission.UNSET
     ) -> int:
-        assert not username.startswith('_'), "Error: reserved username"
-        assert not ('/' in username or len(username) > 255), "Invalid username"
-        assert urllib.parse.quote(username) == username, "Invalid username, must be URL safe"
+        def validate_username(username: str):
+            assert not set(username) & {'/', ':'}, "Invalid username"
+            assert not username.startswith('_'), "Error: reserved username"
+            assert not (len(username) > 255), "Username too long"
+            assert urllib.parse.quote(username) == username, "Invalid username, must be URL safe"
+        validate_username(username)
         self.logger.debug(f"Creating user {username}")
         credential = hash_credential(username, password)
         assert await self.get_user(username) is None, "Duplicate username"
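
Note: username validation is now factored into a local `validate_username` helper and additionally rejects `:`. A standalone sketch of the new checks (mirroring the diff; the example values are ours):

```python
import urllib.parse

def validate_username(username: str):
    assert not set(username) & {'/', ':'}, "Invalid username"
    assert not username.startswith('_'), "Error: reserved username"
    assert not (len(username) > 255), "Username too long"
    assert urllib.parse.quote(username) == username, "Invalid username, must be URL safe"

validate_username("alice")      # passes: URL-safe, no reserved prefix
# validate_username("a:b")      # AssertionError -- ':' is newly rejected
# validate_username("a b")      # AssertionError -- quote() would percent-encode the space
```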
@@ -161,7 +166,7 @@ class UserConn(DBObjectBase):
     async def list_peer_users(self, src_user: int | str, level: AccessLevel) -> list[UserRecord]:
         """
         List all users that src_user can do [AliasLevel] to, with level >= level,
-        Note: the returned list does not include src_user and admin users
+        Note: the returned list does not include src_user and is not appropriate for admin (who has all permissions for all users)
         """
         assert int(level) > AccessLevel.NONE, f"Invalid level, {level}"
         match src_user:
@@ -192,6 +197,11 @@ class FileConn(DBObjectBase):
     def parse_record(record) -> FileRecord:
         return FileRecord(*record)
 
+    @staticmethod
+    def escape_sqlike(url: str) -> str:
+        """ Escape a url for use in SQL LIKE clause (The % and _ characters) """
+        return url.replace('%', r'\%').replace('_', r'\_')
+
     @overload
     async def get_file_record(self, url: str, throw: Literal[True]) -> FileRecord: ...
     @overload
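
Note: `%` and `_` are wildcards in SQLite's LIKE, so every prefix query built from a user path now escapes them and declares `ESCAPE '\'`. Without this, a listing under `alice/my_dir/` would also match `alice/myXdir/`. A quick illustration (our own example path):

```python
# escape_sqlike turns LIKE metacharacters into literals; '\' is the escape
# character declared in the accompanying ESCAPE clause.
url = 'alice/my_dir/'
pattern = url.replace('%', r'\%').replace('_', r'\_') + '%'
assert pattern == r'alice/my\_dir/%'
# resulting SQL: SELECT * FROM fmeta WHERE url LIKE 'alice/my\_dir/%' ESCAPE '\'
```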
@@ -205,6 +215,10 @@ class FileConn(DBObjectBase):
             return self.parse_record(res)
 
     async def get_file_records(self, urls: list[str]) -> list[FileRecord]:
+        """
+        Get all file records with the given urls, only urls in the database will be returned.
+        If the urls are not in the database, they will be ignored.
+        """
         await self.cur.execute("SELECT * FROM fmeta WHERE url IN ({})".format(','.join(['?'] * len(urls))), urls)
         res = await self.cur.fetchall()
         if res is None:
@@ -220,12 +234,12 @@ class FileConn(DBObjectBase):
             await self.cur.execute("SELECT username FROM user")
             res = await self.cur.fetchall()
             dirnames = [u[0] + '/' for u in res]
-            dirs = [await self.get_path_record(u) for u in dirnames] if not skim else [DirectoryRecord(u) for u in dirnames]
+            dirs = [await self.get_dir_record(u) for u in dirnames] if not skim else [DirectoryRecord(u) for u in dirnames]
             return dirs
         else:
             # list specific users
             dirnames = [uname + '/' for uname in usernames]
-            dirs = [await self.get_path_record(u) for u in dirnames] if not skim else [DirectoryRecord(u) for u in dirnames]
+            dirs = [await self.get_dir_record(u) for u in dirnames] if not skim else [DirectoryRecord(u) for u in dirnames]
             return dirs
 
     async def count_path_dirs(self, url: str):
@@ -237,16 +251,16 @@ class FileConn(DBObjectBase):
                 url, LENGTH(?) + 1,
                 INSTR(SUBSTR(url, LENGTH(?) + 1), '/')
             ) AS dirname
-            FROM fmeta WHERE url LIKE ? AND dirname != ''
+            FROM fmeta WHERE url LIKE ? ESCAPE '\\' AND dirname != ''
         )
-        """, (url, url, url + '%'))
+        """, (url, url, self.escape_sqlike(url) + '%'))
         res = await cursor.fetchone()
         assert res is not None, "Error: count_path_dirs"
         return res[0]
 
     async def list_path_dirs(
         self, url: str,
-        offset: int = 0, limit: int = int(1e5),
+        offset: int = 0, limit: int = 10_000,
         order_by: DirSortKey = '', order_desc: bool = False,
         skim: bool = True
     ) -> list[DirectoryRecord]:
@@ -262,35 +276,41 @@ class FileConn(DBObjectBase):
                 1 + LENGTH(?),
                 INSTR(SUBSTR(url, 1 + LENGTH(?)), '/')
             ) AS dirname
-            FROM fmeta WHERE url LIKE ? AND dirname != ''
+            FROM fmeta WHERE url LIKE ? ESCAPE '\\' AND dirname != ''
         """ \
             + (f"ORDER BY {order_by} {'DESC' if order_desc else 'ASC'}" if order_by else '') \
             + " LIMIT ? OFFSET ?"
-        cursor = await self.cur.execute(sql_qury, (url, url, url + '%', limit, offset))
+        cursor = await self.cur.execute(sql_qury, (url, url, self.escape_sqlike(url) + '%', limit, offset))
         res = await cursor.fetchall()
         dirs_str = [r[0] for r in res]
         async def get_dir(dir_url):
             if skim:
                 return DirectoryRecord(dir_url)
             else:
-                return await self.get_path_record(dir_url)
+                return await self.get_dir_record(dir_url)
         dirs = [await get_dir(url + d) for d in dirs_str]
         return dirs
 
-    async def count_path_files(self, url: str, flat: bool = False):
+    async def count_dir_files(self, url: str, flat: bool = False):
         if not url.endswith('/'): url += '/'
         if url == '/': url = ''
         if flat:
-            cursor = await self.cur.execute("SELECT COUNT(*) FROM fmeta WHERE url LIKE ?", (url + '%', ))
+            cursor = await self.cur.execute(
+                "SELECT COUNT(*) FROM fmeta WHERE url LIKE ? ESCAPE '\\'",
+                (self.escape_sqlike(url) + '%', )
+            )
         else:
-            cursor = await self.cur.execute("SELECT COUNT(*) FROM fmeta WHERE url LIKE ? AND url NOT LIKE ?", (url + '%', url + '%/%'))
+            cursor = await self.cur.execute(
+                "SELECT COUNT(*) FROM fmeta WHERE url LIKE ? ESCAPE '\\' AND url NOT LIKE ? ESCAPE '\\'",
+                (self.escape_sqlike(url) + '%', self.escape_sqlike(url) + '%/%')
+            )
         res = await cursor.fetchone()
         assert res is not None, "Error: count_path_files"
         return res[0]
 
-    async def list_path_files(
+    async def list_dir_files(
         self, url: str,
-        offset: int = 0, limit: int = int(1e5),
+        offset: int = 0, limit: int = 10_000,
         order_by: FileSortKey = '', order_desc: bool = False,
         flat: bool = False,
     ) -> list[FileRecord]:
@@ -300,14 +320,14 @@ class FileConn(DBObjectBase):
         if not url.endswith('/'): url += '/'
         if url == '/': url = ''
 
-        sql_query = "SELECT * FROM fmeta WHERE url LIKE ?"
-        if not flat: sql_query += " AND url NOT LIKE ?"
+        sql_query = "SELECT * FROM fmeta WHERE url LIKE ? ESCAPE '\\'"
+        if not flat: sql_query += " AND url NOT LIKE ? ESCAPE '\\'"
         if order_by: sql_query += f" ORDER BY {order_by} {'DESC' if order_desc else 'ASC'}"
         sql_query += " LIMIT ? OFFSET ?"
         if flat:
-            cursor = await self.cur.execute(sql_query, (url + '%', limit, offset))
+            cursor = await self.cur.execute(sql_query, (self.escape_sqlike(url) + '%', limit, offset))
         else:
-            cursor = await self.cur.execute(sql_query, (url + '%', url + '%/%', limit, offset))
+            cursor = await self.cur.execute(sql_query, (self.escape_sqlike(url) + '%', self.escape_sqlike(url) + '%/%', limit, offset))
         res = await cursor.fetchall()
         files = [self.parse_record(r) for r in res]
         return files
@@ -321,17 +341,17 @@ class FileConn(DBObjectBase):
         - It cannot flatten directories
         - It cannot list directories with details
         """
-        MAX_ITEMS = int(1e4)
+        MAX_ITEMS = 10_000
         dir_count = await self.count_path_dirs(url)
-        file_count = await self.count_path_files(url, flat=False)
+        file_count = await self.count_dir_files(url, flat=False)
         if dir_count + file_count > MAX_ITEMS:
             raise TooManyItemsError("Too many items, please paginate")
         return PathContents(
             dirs = await self.list_path_dirs(url, skim=True, limit=MAX_ITEMS),
-            files = await self.list_path_files(url, flat=False, limit=MAX_ITEMS)
+            files = await self.list_dir_files(url, flat=False, limit=MAX_ITEMS)
         )
 
-    async def get_path_record(self, url: str) -> DirectoryRecord:
+    async def get_dir_record(self, url: str) -> DirectoryRecord:
         """
         Get the full record of a directory, including size, create_time, update_time, access_time etc.
         """
@@ -342,8 +362,8 @@ class FileConn(DBObjectBase):
             MAX(access_time) as access_time,
             COUNT(*) as n_files
             FROM fmeta
-            WHERE url LIKE ?
-            """, (url + '%', ))
+            WHERE url LIKE ? ESCAPE '\\'
+            """, (self.escape_sqlike(url) + '%', ))
         result = await cursor.fetchone()
         if result is None or any(val is None for val in result):
             raise PathNotFoundError(f"Path {url} not found")
@@ -367,10 +387,16 @@ class FileConn(DBObjectBase):
         if not url.endswith('/'):
             url += '/'
         if not include_subpath:
-            cursor = await self.cur.execute("SELECT SUM(file_size) FROM fmeta WHERE url LIKE ? AND url NOT LIKE ?", (url + '%', url + '%/%'))
+            cursor = await self.cur.execute(
+                "SELECT SUM(file_size) FROM fmeta WHERE url LIKE ? ESCAPE '\\' AND url NOT LIKE ? ESCAPE '\\'",
+                (self.escape_sqlike(url) + '%', self.escape_sqlike(url) + '%/%')
+            )
             res = await cursor.fetchone()
         else:
-            cursor = await self.cur.execute("SELECT SUM(file_size) FROM fmeta WHERE url LIKE ?", (url + '%', ))
+            cursor = await self.cur.execute(
+                "SELECT SUM(file_size) FROM fmeta WHERE url LIKE ? ESCAPE '\\'",
+                (self.escape_sqlike(url) + '%', )
+            )
             res = await cursor.fetchone()
         assert res is not None
         return res[0] or 0
@@ -406,56 +432,51 @@ class FileConn(DBObjectBase):
         await self._user_size_inc(owner_id, file_size)
         self.logger.info(f"File {url} created")
 
-    # not tested
     async def copy_file(self, old_url: str, new_url: str, user_id: Optional[int] = None):
+        """
+        Copy file from old_url to new_url,
+        if user_id is None, will not change the owner_id of the file. Otherwise, will change the owner_id to user_id.
+        """
         old = await self.get_file_record(old_url)
         if old is None:
             raise FileNotFoundError(f"File {old_url} not found")
         new_exists = await self.get_file_record(new_url)
         if new_exists is not None:
             raise FileExistsError(f"File {new_url} already exists")
-        new_fid = str(uuid.uuid4())
         user_id = old.owner_id if user_id is None else user_id
         await self.cur.execute(
             "INSERT INTO fmeta (url, owner_id, file_id, file_size, permission, external, mime_type) VALUES (?, ?, ?, ?, ?, ?, ?)",
-            (new_url, user_id, new_fid, old.file_size, old.permission, old.external, old.mime_type)
+            (new_url, user_id, old.file_id, old.file_size, old.permission, old.external, old.mime_type)
         )
-        if not old.external:
-            await self.set_file_blob(new_fid, await self.get_file_blob(old.file_id))
-        else:
-            await copy_file(LARGE_BLOB_DIR / old.file_id, LARGE_BLOB_DIR / new_fid)
+        await self.cur.execute("INSERT OR REPLACE INTO dupcount (file_id, count) VALUES (?, COALESCE((SELECT count FROM dupcount WHERE file_id = ?), 0) + 1)", (old.file_id, old.file_id))
         await self._user_size_inc(user_id, old.file_size)
         self.logger.info(f"Copied file {old_url} to {new_url}")
 
-    # not tested
-    async def copy_path(self, old_url: str, new_url: str, conflict_handler: Literal['skip', 'overwrite'] = 'overwrite', user_id: Optional[int] = None):
+    async def copy_dir(self, old_url: str, new_url: str, user_id: Optional[int] = None):
+        """
+        Copy all files under old_url to new_url,
+        if user_id is None, will not change the owner_id of the files. Otherwise, will change the owner_id to user_id.
+        """
         assert old_url.endswith('/'), "Old path must end with /"
         assert new_url.endswith('/'), "New path must end with /"
-        if user_id is None:
-            cursor = await self.cur.execute("SELECT * FROM fmeta WHERE url LIKE ?", (old_url + '%', ))
-            res = await cursor.fetchall()
-        else:
-            cursor = await self.cur.execute("SELECT * FROM fmeta WHERE url LIKE ? AND owner_id = ?", (old_url + '%', user_id))
-            res = await cursor.fetchall()
+        cursor = await self.cur.execute(
+            "SELECT * FROM fmeta WHERE url LIKE ? ESCAPE '\\'",
+            (self.escape_sqlike(old_url) + '%', )
+        )
+        res = await cursor.fetchall()
         for r in res:
             old_record = FileRecord(*r)
             new_r = new_url + old_record.url[len(old_url):]
-            if conflict_handler == 'overwrite':
-                await self.cur.execute("DELETE FROM fmeta WHERE url = ?", (new_r, ))
-            elif conflict_handler == 'skip':
-                if (await self.cur.execute("SELECT url FROM fmeta WHERE url = ?", (new_r, ))) is not None:
-                    continue
-            new_fid = str(uuid.uuid4())
+            if await (await self.cur.execute("SELECT url FROM fmeta WHERE url = ?", (new_r, ))).fetchone() is not None:
+                raise FileExistsError(f"File {new_r} already exists")
             user_id = old_record.owner_id if user_id is None else user_id
             await self.cur.execute(
                 "INSERT INTO fmeta (url, owner_id, file_id, file_size, permission, external, mime_type) VALUES (?, ?, ?, ?, ?, ?, ?)",
-                (new_r, user_id, new_fid, old_record.file_size, old_record.permission, old_record.external, old_record.mime_type)
+                (new_r, user_id, old_record.file_id, old_record.file_size, old_record.permission, old_record.external, old_record.mime_type)
             )
-            if not old_record.external:
-                await self.set_file_blob(new_fid, await self.get_file_blob(old_record.file_id))
-            else:
-                await copy_file(LARGE_BLOB_DIR / old_record.file_id, LARGE_BLOB_DIR / new_fid)
+            await self.cur.execute("INSERT OR REPLACE INTO dupcount (file_id, count) VALUES (?, COALESCE((SELECT count FROM dupcount WHERE file_id = ?), 0) + 1)", (old_record.file_id, old_record.file_id))
             await self._user_size_inc(user_id, old_record.file_size)
+        self.logger.info(f"Copied path {old_url} to {new_url}")
 
     async def move_file(self, old_url: str, new_url: str):
         old = await self.get_file_record(old_url)
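
Note: copying no longer duplicates blob data under a fresh `uuid4` file_id. The new row reuses the source `file_id`, and a `dupcount` row tracks the extra references; a blob is physically removed only when its last reference is unlinked (see the `unlink_file_blob*` hunks below). A self-contained sketch of the upsert's arithmetic, using the synchronous `sqlite3` module for brevity (the real code runs through aiosqlite):

```python
import sqlite3

db = sqlite3.connect(':memory:')
db.execute("CREATE TABLE dupcount (file_id TEXT PRIMARY KEY, count INTEGER)")

def add_ref(fid: str):
    # same statement the diff uses: the first copy inserts count=1,
    # every later copy increments the existing row
    db.execute(
        "INSERT OR REPLACE INTO dupcount (file_id, count) "
        "VALUES (?, COALESCE((SELECT count FROM dupcount WHERE file_id = ?), 0) + 1)",
        (fid, fid))

add_ref('blob-1'); add_ref('blob-1')
print(db.execute("SELECT count FROM dupcount WHERE file_id = 'blob-1'").fetchone())  # (2,)
```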
@@ -467,14 +488,20 @@ class FileConn(DBObjectBase):
         await self.cur.execute("UPDATE fmeta SET url = ?, create_time = CURRENT_TIMESTAMP WHERE url = ?", (new_url, old_url))
         self.logger.info(f"Moved file {old_url} to {new_url}")
 
-    async def move_path(self, old_url: str, new_url: str, user_id: Optional[int] = None):
+    async def move_dir(self, old_url: str, new_url: str, user_id: Optional[int] = None):
         assert old_url.endswith('/'), "Old path must end with /"
         assert new_url.endswith('/'), "New path must end with /"
         if user_id is None:
-            cursor = await self.cur.execute("SELECT * FROM fmeta WHERE url LIKE ?", (old_url + '%', ))
+            cursor = await self.cur.execute(
+                "SELECT * FROM fmeta WHERE url LIKE ? ESCAPE '\\'",
+                (self.escape_sqlike(old_url) + '%', )
+            )
             res = await cursor.fetchall()
         else:
-            cursor = await self.cur.execute("SELECT * FROM fmeta WHERE url LIKE ? AND owner_id = ?", (old_url + '%', user_id))
+            cursor = await self.cur.execute(
+                "SELECT * FROM fmeta WHERE url LIKE ? ESCAPE '\\' AND owner_id = ?",
+                (self.escape_sqlike(old_url) + '%', user_id)
+            )
             res = await cursor.fetchall()
         for r in res:
             new_r = new_url + r[0][len(old_url):]
@@ -497,6 +524,7 @@ class FileConn(DBObjectBase):
             return file_record
 
     async def delete_user_file_records(self, owner_id: int) -> list[FileRecord]:
+        """ Delete all records with owner_id """
         cursor = await self.cur.execute("SELECT * FROM fmeta WHERE owner_id = ?", (owner_id, ))
         res = await cursor.fetchall()
         await self.cur.execute("DELETE FROM usize WHERE user_id = ?", (owner_id, ))
@@ -505,13 +533,19 @@ class FileConn(DBObjectBase):
         self.logger.info(f"Deleted {len(ret)} file records for user {owner_id}") # type: ignore
         return ret
 
-    async def delete_path_records(self, path: str, under_owner_id: Optional[int] = None) -> list[FileRecord]:
+    async def delete_records_by_prefix(self, path: str, under_owner_id: Optional[int] = None) -> list[FileRecord]:
         """Delete all records with url starting with path"""
         # update user size
-        cursor = await self.cur.execute("SELECT DISTINCT owner_id FROM fmeta WHERE url LIKE ?", (path + '%', ))
+        cursor = await self.cur.execute(
+            "SELECT DISTINCT owner_id FROM fmeta WHERE url LIKE ? ESCAPE '\\'",
+            (self.escape_sqlike(path) + '%', )
+        )
         res = await cursor.fetchall()
         for r in res:
-            cursor = await self.cur.execute("SELECT SUM(file_size) FROM fmeta WHERE owner_id = ? AND url LIKE ?", (r[0], path + '%'))
+            cursor = await self.cur.execute(
+                "SELECT SUM(file_size) FROM fmeta WHERE owner_id = ? AND url LIKE ? ESCAPE '\\'",
+                (r[0], self.escape_sqlike(path) + '%')
+            )
             size = await cursor.fetchone()
             if size is not None:
                 await self._user_size_dec(r[0], size[0])
@@ -520,15 +554,15 @@ class FileConn(DBObjectBase):
         # but it's not a big deal... we should have only one writer
 
         if under_owner_id is None:
-            res = await self.cur.execute("DELETE FROM fmeta WHERE url LIKE ? RETURNING *", (path + '%', ))
+            res = await self.cur.execute("DELETE FROM fmeta WHERE url LIKE ? ESCAPE '\\' RETURNING *", (self.escape_sqlike(path) + '%', ))
         else:
-            res = await self.cur.execute("DELETE FROM fmeta WHERE url LIKE ? AND owner_id = ? RETURNING *", (path + '%', under_owner_id))
+            res = await self.cur.execute("DELETE FROM fmeta WHERE url LIKE ? ESCAPE '\\' AND owner_id = ? RETURNING *", (self.escape_sqlike(path) + '%', under_owner_id))
         all_f_rec = await res.fetchall()
         self.logger.info(f"Deleted {len(all_f_rec)} file(s) for path {path}") # type: ignore
         return [self.parse_record(r) for r in all_f_rec]
 
     async def set_file_blob(self, file_id: str, blob: bytes):
-        await self.cur.execute("INSERT OR REPLACE INTO blobs.fdata (file_id, data) VALUES (?, ?)", (file_id, blob))
+        await self.cur.execute("INSERT INTO blobs.fdata (file_id, data) VALUES (?, ?)", (file_id, blob))
 
     @staticmethod
     async def set_file_blob_external(file_id: str, stream: AsyncIterable[bytes])->int:
@@ -580,16 +614,78 @@ class FileConn(DBObjectBase):
             if not chunk: break
             yield chunk
 
-    @staticmethod
-    async def delete_file_blob_external(file_id: str):
+    async def unlink_file_blob_external(self, file_id: str):
+        # first check if the file has duplication
+        cursor = await self.cur.execute("SELECT count FROM dupcount WHERE file_id = ?", (file_id, ))
+        res = await cursor.fetchone()
+        if res is not None and res[0] > 0:
+            await self.cur.execute("UPDATE dupcount SET count = count - 1 WHERE file_id = ?", (file_id, ))
+            return
+
+        # finally delete the file and the duplication count
         if (LARGE_BLOB_DIR / file_id).exists():
             await aiofiles.os.remove(LARGE_BLOB_DIR / file_id)
+        await self.cur.execute("DELETE FROM dupcount WHERE file_id = ?", (file_id, ))
 
-    async def delete_file_blob(self, file_id: str):
+    async def unlink_file_blob(self, file_id: str):
+        # first check if the file has duplication
+        cursor = await self.cur.execute("SELECT count FROM dupcount WHERE file_id = ?", (file_id, ))
+        res = await cursor.fetchone()
+        if res is not None and res[0] > 0:
+            await self.cur.execute("UPDATE dupcount SET count = count - 1 WHERE file_id = ?", (file_id, ))
+            return
+
+        # finally delete the file and the duplication count
         await self.cur.execute("DELETE FROM blobs.fdata WHERE file_id = ?", (file_id, ))
+        await self.cur.execute("DELETE FROM dupcount WHERE file_id = ?", (file_id, ))
 
-    async def delete_file_blobs(self, file_ids: list[str]):
-        await self.cur.execute("DELETE FROM blobs.fdata WHERE file_id IN ({})".format(','.join(['?'] * len(file_ids))), file_ids)
+    async def _group_del(self, file_ids_all: list[str]):
+        """
+        The file_ids_all may contain duplication,
+        yield tuples of unique (to_del_ids, to_dec_ids) for each iteration,
+        every iteration should unlink one copy of the files, repeat until all re-occurrence in the input list are removed.
+        """
+        async def check_dup(file_ids: set[str]):
+            cursor = await self.cur.execute("SELECT file_id FROM dupcount WHERE file_id IN ({}) AND count > 0".format(','.join(['?'] * len(file_ids))), tuple(file_ids))
+            res = await cursor.fetchall()
+            to_dec_ids = [r[0] for r in res]
+            to_del_ids = list(file_ids - set(to_dec_ids))
+            return to_del_ids, to_dec_ids
+        # gather duplication from all file_ids
+        fid_occurrence = {}
+        for file_id in file_ids_all:
+            fid_occurrence[file_id] = fid_occurrence.get(file_id, 0) + 1
+        while fid_occurrence:
+            to_del_ids, to_dec_ids = await check_dup(set(fid_occurrence.keys()))
+            for file_id in to_del_ids:
+                del fid_occurrence[file_id]
+            for file_id in to_dec_ids:
+                fid_occurrence[file_id] -= 1
+                if fid_occurrence[file_id] == 0:
+                    del fid_occurrence[file_id]
+            yield (to_del_ids, to_dec_ids)
+
+    async def unlink_file_blobs(self, file_ids: list[str]):
+        async for (to_del_ids, to_dec_ids) in self._group_del(file_ids):
+            # delete the only copy
+            await self.cur.execute("DELETE FROM blobs.fdata WHERE file_id IN ({})".format(','.join(['?'] * len(to_del_ids))), to_del_ids)
+            await self.cur.execute("DELETE FROM dupcount WHERE file_id IN ({})".format(','.join(['?'] * len(to_del_ids))), to_del_ids)
+            # decrease duplication count
+            await self.cur.execute("UPDATE dupcount SET count = count - 1 WHERE file_id IN ({})".format(','.join(['?'] * len(to_dec_ids))), to_dec_ids)
+
+    async def unlink_file_blobs_external(self, file_ids: list[str]):
+        async def del_file(file_id: str):
+            if (LARGE_BLOB_DIR / file_id).exists():
+                await aiofiles.os.remove(LARGE_BLOB_DIR / file_id)
+        async for (to_del_ids, to_dec_ids) in self._group_del(file_ids):
+            # delete the only copy
+            await asyncio.gather(*(
+                [del_file(file_id) for file_id in to_del_ids] +
+                [self.cur.execute("DELETE FROM dupcount WHERE file_id = ?", (file_id, )) for file_id in to_del_ids]
+            ))
+            # decrease duplication count
+            await self.cur.execute("UPDATE dupcount SET count = count - 1 WHERE file_id IN ({})".format(','.join(['?'] * len(to_dec_ids))), to_dec_ids)
+
 
 _log_active_queue = []
 _log_active_lock = asyncio.Lock()
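
Note: `_group_del` exists because a bulk delete list can name the same `file_id` several times (several fmeta rows sharing one blob). Each yielded round unlinks at most one copy per id: ids with `count > 0` get a decrement (`to_dec_ids`), the rest are physically deleted (`to_del_ids`). A pure-Python trace with a dict standing in for the `dupcount` table (the decrement the caller would issue via UPDATE is inlined):

```python
dupcount = {'a': 1}                      # 'a' has one extra reference; 'b' has none

def group_del(file_ids_all):
    occ = {}
    for f in file_ids_all:               # tally re-occurrences in the input
        occ[f] = occ.get(f, 0) + 1
    while occ:
        to_dec = [f for f in occ if dupcount.get(f, 0) > 0]
        to_del = [f for f in occ if f not in to_dec]
        for f in to_del:
            del occ[f]
        for f in to_dec:
            dupcount[f] -= 1             # stands in for the caller's UPDATE
            occ[f] -= 1
            if occ[f] == 0:
                del occ[f]
        yield to_del, to_dec

print(list(group_del(['a', 'a', 'b'])))
# [(['b'], ['a']), (['a'], [])] -- 'a' first loses its extra reference; its blob
# is only physically deleted in the next round, once the refcount hits zero.
```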
@@ -621,20 +717,35 @@ async def delayed_log_access(url: str):
     _log_access_queue.append(url)
     await _log_all_access()
 
-def validate_url(url: str, is_file = True):
-    prohibited_chars = ['..', ';', "'", '"', '\\', '\0', '\n', '\r', '\t', '\x0b', '\x0c']
-    ret = not url.startswith('/') and not url.startswith('_') and not url.startswith('.')
-    ret = ret and not any([c in url for c in prohibited_chars])
+@static_vars(
+    prohibited_regex = re.compile(
+        r"^[/_.]", # start with / or _ or .
+    ),
+    prohibited_part_regex = re.compile(
+        "|".join([
+            r"^\s*\.+\s*$", # dot path
+            "[{}]".format("".join(re.escape(c) for c in ('/', "\\", "'", '"', "*"))), # prohibited characters
+        ])
+    ),
+)
+def validate_url(url: str, utype: Literal['file', 'dir'] = 'file'):
+    """ Check if a path is valid. The input path is considered url safe """
+    if len(url) > 1024:
+        raise InvalidPathError(f"URL too long: {url}")
 
-    if not ret:
+    is_valid = validate_url.prohibited_regex.search(url) is None
+    if not is_valid: # early return, no need to check further
         raise InvalidPathError(f"Invalid URL: {url}")
-
-    if is_file:
-        ret = ret and not url.endswith('/')
-    else:
-        ret = ret and url.endswith('/')
 
-    if not ret:
+    for part in url.split('/'):
+        if validate_url.prohibited_part_regex.search(urllib.parse.unquote(part)):
+            is_valid = False
+            break
+
+    if utype == 'file': is_valid = is_valid and not url.endswith('/')
+    else: is_valid = is_valid and url.endswith('/')
+
+    if not is_valid:
        raise InvalidPathError(f"Invalid URL: {url}")
 
 async def get_user(cur: aiosqlite.Cursor, user: int | str) -> Optional[UserRecord]:
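
Note: `validate_url` now length-caps paths, rejects a leading `/`, `_` or `.`, and checks each `/`-separated segment after URL-decoding, which also catches percent-encoded dot segments. Expected behavior per the regexes above (the sample paths are ours):

```python
# validate_url('alice/notes.txt')         # ok -- a file must not end with '/'
# validate_url('alice/docs/', 'dir')      # ok -- a dir must end with '/'
# validate_url('_temp/f.txt')             # InvalidPathError: leading '_'
# validate_url('alice/%2E%2E/f.txt')      # InvalidPathError: unquotes to '..' (dot path)
# validate_url('alice/a*b.txt')           # InvalidPathError: '*' is prohibited
# validate_url('x' * 2048)                # InvalidPathError: longer than 1024
```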
@@ -755,6 +866,58 @@ class Database:
                 yield blob
             ret = blob_stream()
         return ret
+
+    async def read_files_bulk(
+        self, urls: list[str],
+        skip_content = False,
+        op_user: Optional[UserRecord] = None,
+    ) -> dict[str, Optional[bytes]]:
+        """
+        A frequent use case is to read multiple files at once,
+        this method will read all files in the list and return a dict of url -> blob.
+        If the file is not found, the value will be None.
+        - skip_content: if True, will not read the content of the file, resulting in a dict of url -> b''
+
+        May raise StorageExceededError if the total size of the files exceeds MAX_MEM_FILE_BYTES
+        """
+        for url in urls:
+            validate_url(url)
+
+        async with unique_cursor() as cur:
+            fconn = FileConn(cur)
+            file_records = await fconn.get_file_records(urls)
+
+            if op_user is not None:
+                for r in file_records:
+                    if await check_path_permission(r.url, op_user, cursor=cur) >= AccessLevel.READ:
+                        continue
+                    is_allowed, reason = await check_file_read_permission(op_user, r, cursor=cur)
+                    if not is_allowed:
+                        raise PermissionDeniedError(f"Permission denied: {op_user.username} cannot read file {r.url}: {reason}")
+
+        # first check if the files are too big
+        sum_size = sum([r.file_size for r in file_records])
+        if not skip_content and sum_size > MAX_MEM_FILE_BYTES:
+            raise StorageExceededError(f"Unable to read files at once, total size {sum_size} exceeds {MAX_MEM_FILE_BYTES}")
+
+        self.logger.debug(f"Reading {len(file_records)} files{' (skip content)' if skip_content else ''}, getting {sum_size} bytes, from {urls}")
+        # read the file content
+        async with unique_cursor() as cur:
+            fconn = FileConn(cur)
+            blobs: dict[str, bytes] = {}
+            for r in file_records:
+                if skip_content:
+                    blobs[r.url] = b''
+                    continue
+
+                if r.external:
+                    blob_iter = fconn.get_file_blob_external(r.file_id)
+                    blob = b''.join([chunk async for chunk in blob_iter])
+                else:
+                    blob = await fconn.get_file_blob(r.file_id)
+                blobs[r.url] = blob
+
+        return {url: blobs.get(url, None) for url in urls}
 
     async def delete_file(self, url: str, op_user: Optional[UserRecord] = None) -> Optional[FileRecord]:
         validate_url(url)
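
Note: `read_files_bulk` is new in this release. A hypothetical call site (the `db` and `user` objects are assumed):

```python
# blobs = await db.read_files_bulk(['alice/a.txt', 'alice/missing.txt'], op_user=user)
# blobs['alice/a.txt']       -> b'...'  (file content)
# blobs['alice/missing.txt'] -> None   (urls absent from the db map to None)
#
# skip_content=True maps every found url to b'', which makes the call a cheap
# existence-plus-permission probe: no blobs are loaded and the
# MAX_MEM_FILE_BYTES guard is not tripped.
```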
@@ -771,9 +934,9 @@ class Database:
                 raise PermissionDeniedError(f"Permission denied: {op_user.username} cannot delete file {url}")
             f_id = r.file_id
             if r.external:
-                await fconn.delete_file_blob_external(f_id)
+                await fconn.unlink_file_blob_external(f_id)
             else:
-                await fconn.delete_file_blob(f_id)
+                await fconn.unlink_file_blob(f_id)
             return r
 
     async def move_file(self, old_url: str, new_url: str, op_user: Optional[UserRecord] = None):
@@ -813,9 +976,9 @@ class Database:
                 raise PermissionDeniedError(f"Permission denied: {op_user.username} cannot copy file to {new_url}")
             await fconn.copy_file(old_url, new_url, user_id=op_user.id if op_user is not None else None)
 
-    async def move_path(self, old_url: str, new_url: str, op_user: UserRecord):
-        validate_url(old_url, is_file=False)
-        validate_url(new_url, is_file=False)
+    async def move_dir(self, old_url: str, new_url: str, op_user: UserRecord):
+        validate_url(old_url, 'dir')
+        validate_url(new_url, 'dir')
 
         if new_url.startswith('/'):
             new_url = new_url[1:]
@@ -834,12 +997,11 @@ class Database:
 
         async with transaction() as cur:
             fconn = FileConn(cur)
-            await fconn.move_path(old_url, new_url, op_user.id)
+            await fconn.move_dir(old_url, new_url, op_user.id)
 
-    # not tested
-    async def copy_path(self, old_url: str, new_url: str, op_user: UserRecord):
-        validate_url(old_url, is_file=False)
-        validate_url(new_url, is_file=False)
+    async def copy_dir(self, old_url: str, new_url: str, op_user: UserRecord):
+        validate_url(old_url, 'dir')
+        validate_url(new_url, 'dir')
 
         if new_url.startswith('/'):
             new_url = new_url[1:]
@@ -858,7 +1020,7 @@ class Database:
 
         async with transaction() as cur:
             fconn = FileConn(cur)
-            await fconn.copy_path(old_url, new_url, 'overwrite', op_user.id)
+            await fconn.copy_dir(old_url, new_url, op_user.id)
 
     async def __batch_delete_file_blobs(self, fconn: FileConn, file_records: list[FileRecord], batch_size: int = 512):
         # https://github.com/langchain-ai/langchain/issues/10321
@@ -872,19 +1034,20 @@ class Database:
 
         async def del_internal():
             for i in range(0, len(internal_ids), batch_size):
-                await fconn.delete_file_blobs([r for r in internal_ids[i:i+batch_size]])
+                await fconn.unlink_file_blobs([r for r in internal_ids[i:i+batch_size]])
         async def del_external():
-            for i in range(0, len(external_ids)):
-                await fconn.delete_file_blob_external(external_ids[i])
-        await asyncio.gather(del_internal(), del_external())
+            for i in range(0, len(external_ids), batch_size):
+                await fconn.unlink_file_blobs_external([r for r in external_ids[i:i+batch_size]])
+        await del_internal()
+        await del_external()
 
-    async def delete_path(self, url: str, op_user: Optional[UserRecord] = None) -> Optional[list[FileRecord]]:
-        validate_url(url, is_file=False)
+    async def delete_dir(self, url: str, op_user: Optional[UserRecord] = None) -> Optional[list[FileRecord]]:
+        validate_url(url, 'dir')
         from_owner_id = op_user.id if op_user is not None and not (op_user.is_admin or await check_path_permission(url, op_user) >= AccessLevel.WRITE) else None
 
         async with transaction() as cur:
             fconn = FileConn(cur)
-            records = await fconn.delete_path_records(url, from_owner_id)
+            records = await fconn.delete_records_by_prefix(url, from_owner_id)
             if not records:
                 return None
             await self.__batch_delete_file_blobs(fconn, records)
@@ -908,14 +1071,15 @@ class Database:
 
         # make sure the user's directory is deleted,
         # may contain admin's files, but delete them all
-        await fconn.delete_path_records(user.username + '/')
+        await fconn.delete_records_by_prefix(user.username + '/')
 
-    async def iter_path(self, top_url: str, urls: Optional[list[str]]) -> AsyncIterable[tuple[FileRecord, bytes | AsyncIterable[bytes]]]:
+    async def iter_dir(self, top_url: str, urls: Optional[list[str]]) -> AsyncIterable[tuple[FileRecord, bytes | AsyncIterable[bytes]]]:
+        validate_url(top_url, 'dir')
         async with unique_cursor() as cur:
             fconn = FileConn(cur)
             if urls is None:
-                fcount = await fconn.count_path_files(top_url, flat=True)
-                urls = [r.url for r in (await fconn.list_path_files(top_url, flat=True, limit=fcount))]
+                fcount = await fconn.count_dir_files(top_url, flat=True)
+                urls = [r.url for r in (await fconn.list_dir_files(top_url, flat=True, limit=fcount))]
 
         for url in urls:
             if not url.startswith(top_url):
@@ -929,14 +1093,50 @@ class Database:
             else:
                 blob = await fconn.get_file_blob(f_id)
             yield r, blob
+
+    async def zip_dir_stream(self, top_url: str, op_user: Optional[UserRecord] = None) -> AsyncIterable[bytes]:
+        from stat import S_IFREG
+        from stream_zip import async_stream_zip, ZIP_64
+        if top_url.startswith('/'):
+            top_url = top_url[1:]
+
+        if op_user:
+            if await check_path_permission(top_url, op_user) < AccessLevel.READ:
+                raise PermissionDeniedError(f"Permission denied: {op_user.username} cannot zip path {top_url}")
+
+        # https://stream-zip.docs.trade.gov.uk/async-interface/
+        async def data_iter():
+            async for (r, blob) in self.iter_dir(top_url, None):
+                rel_path = r.url[len(top_url):]
+                rel_path = decode_uri_compnents(rel_path)
+                b_iter: AsyncIterable[bytes]
+                if isinstance(blob, bytes):
+                    async def blob_iter(): yield blob
+                    b_iter = blob_iter() # type: ignore
+                else:
+                    assert isinstance(blob, AsyncIterable)
+                    b_iter = blob
+                yield (
+                    rel_path,
+                    datetime.datetime.now(),
+                    S_IFREG | 0o600,
+                    ZIP_64,
+                    b_iter
+                )
+        return async_stream_zip(data_iter())
 
     @concurrent_wrap()
-    async def zip_path(self, top_url: str, urls: Optional[list[str]]) -> io.BytesIO:
+    async def zip_dir(self, top_url: str, op_user: Optional[UserRecord]) -> io.BytesIO:
         if top_url.startswith('/'):
             top_url = top_url[1:]
+
+        if op_user:
+            if await check_path_permission(top_url, op_user) < AccessLevel.READ:
+                raise PermissionDeniedError(f"Permission denied: {op_user.username} cannot zip path {top_url}")
+
         buffer = io.BytesIO()
         with zipfile.ZipFile(buffer, 'w') as zf:
-            async for (r, blob) in self.iter_path(top_url, urls):
+            async for (r, blob) in self.iter_dir(top_url, None):
                 rel_path = r.url[len(top_url):]
                 rel_path = decode_uri_compnents(rel_path)
                 if r.external:
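
Note: `zip_dir_stream` is the streaming counterpart of `zip_dir`: it yields archive bytes as stream-zip produces them (ZIP64 entries, fixed `0o600` mode, current timestamp), so a large directory never has to fit in `zip_dir`'s in-memory `io.BytesIO` buffer. A hypothetical handler wiring it up (the route and names are ours, not from the package):

```python
from fastapi.responses import StreamingResponse

async def download_dir(db, top_url: str, user):
    stream = await db.zip_dir_stream(top_url, op_user=user)  # AsyncIterable[bytes]
    return StreamingResponse(stream, media_type='application/zip')
```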
@@ -948,39 +1148,50 @@ class Database:
         buffer.seek(0)
         return buffer
 
-def check_file_read_permission(user: UserRecord, owner: UserRecord, file: FileRecord) -> tuple[bool, str]:
+async def _get_path_owner(cur: aiosqlite.Cursor, path: str) -> UserRecord:
+    path_username = path.split('/')[0]
+    uconn = UserConn(cur)
+    path_user = await uconn.get_user(path_username)
+    if path_user is None:
+        raise InvalidPathError(f"Invalid path: {path_username} is not a valid username")
+    return path_user
+
+async def check_file_read_permission(user: UserRecord, file: FileRecord, cursor: Optional[aiosqlite.Cursor] = None) -> tuple[bool, str]:
     """
     This does not consider alias level permission,
     use check_path_permission for alias level permission check first:
     ```
-    if await check_path_permission(path, user) < AccessLevel.READ:
-        read_allowed, reason = check_file_read_permission(user, owner, file)
+    if await check_path_permission(file.url, user) < AccessLevel.READ:
+        read_allowed, reason = check_file_read_permission(user, file)
     ```
+    The implementation assumes the user is not admin and is not the owner of the file/path
     """
-    if user.is_admin:
-        return True, ""
+    @asynccontextmanager
+    async def this_cur():
+        if cursor is None:
+            async with unique_cursor() as _cur:
+                yield _cur
+        else:
+            yield cursor
+
+    f_perm = file.permission
+
+    # if file permission unset, use path owner's permission as fallback
+    if f_perm == FileReadPermission.UNSET:
+        async with this_cur() as cur:
+            path_owner = await _get_path_owner(cur, file.url)
+            f_perm = path_owner.permission
 
     # check permission of the file
-    if file.permission == FileReadPermission.PRIVATE:
-        if user.id != owner.id:
-            return False, "Permission denied, private file"
-    elif file.permission == FileReadPermission.PROTECTED:
+    if f_perm == FileReadPermission.PRIVATE:
+        return False, "Permission denied, private file"
+    elif f_perm == FileReadPermission.PROTECTED:
         if user.id == 0:
             return False, "Permission denied, protected file"
-    elif file.permission == FileReadPermission.PUBLIC:
+    elif f_perm == FileReadPermission.PUBLIC:
         return True, ""
     else:
-        assert file.permission == FileReadPermission.UNSET
-
-    # use owner's permission as fallback
-    if owner.permission == FileReadPermission.PRIVATE:
-        if user.id != owner.id:
-            return False, "Permission denied, private user file"
-    elif owner.permission == FileReadPermission.PROTECTED:
-        if user.id == 0:
-            return False, "Permission denied, protected user file"
-    else:
-        assert owner.permission == FileReadPermission.PUBLIC or owner.permission == FileReadPermission.UNSET
+        assert f_perm == FileReadPermission.UNSET
 
     return True, ""
 
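Note: `check_file_read_permission` no longer takes the owner record and no longer short-circuits for admins; it resolves an UNSET file permission by looking up the path owner itself (via `_get_path_owner`), and callers are expected to have handled admin/owner access through `check_path_permission` first. The resulting decision order, summarized:

```python
# 1. f_perm = file.permission, falling back to the path owner's permission when UNSET
# 2. PRIVATE   -> deny (owners/admins are assumed never to reach this function)
#    PROTECTED -> deny only guests (user.id == 0)
#    PUBLIC / UNSET -> allow
#
# intended call pattern, per the docstring (note the function is now async):
# if await check_path_permission(file.url, user) < AccessLevel.READ:
#     allowed, reason = await check_file_read_permission(user, file)
```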
@@ -991,9 +1202,6 @@ async def check_path_permission(path: str, user: UserRecord, cursor: Optional[ai
     If the path is a file, the user will have all access if the user is the owner.
     Otherwise, the user will have alias level access w.r.t. the path user.
     """
-    if user.id == 0:
-        return AccessLevel.GUEST
-
     @asynccontextmanager
     async def this_cur():
         if cursor is None:
@@ -1002,16 +1210,18 @@ async def check_path_permission(path: str, user: UserRecord, cursor: Optional[ai
         else:
             yield cursor
 
-    # check if path user exists
-    path_username = path.split('/')[0]
+    # check if path user exists, may raise exception
     async with this_cur() as cur:
-        uconn = UserConn(cur)
-        path_user = await uconn.get_user(path_username)
-        if path_user is None:
-            raise PathNotFoundError(f"Invalid path: {path_username} is not a valid username")
+        path_owner = await _get_path_owner(cur, path)
 
-    # check if user is admin
-    if user.is_admin or user.username == path_username:
+    if user.id == 0:
+        return AccessLevel.GUEST
+
+    if user.is_admin:
+        return AccessLevel.ALL
+
+    # check if user is admin or the owner of the path
+    if user.id == path_owner.id:
         return AccessLevel.ALL
 
     # if the path is a file, check if the user is the owner
@@ -1025,4 +1235,4 @@ async def check_path_permission(path: str, user: UserRecord, cursor: Optional[ai
     # check alias level
     async with this_cur() as cur:
         uconn = UserConn(cur)
-        return await uconn.query_peer_level(user.id, path_user.id)
+        return await uconn.query_peer_level(user.id, path_owner.id)