lfss-0.9.4-py3-none-any.whl → lfss-0.11.4-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
lfss/eng/database.py CHANGED
@@ -1,10 +1,12 @@
1
1
 
2
- from typing import Optional, Literal, AsyncIterable, overload
2
+ from typing import Optional, Literal, overload
3
+ from collections.abc import AsyncIterable
3
4
  from contextlib import asynccontextmanager
4
5
  from abc import ABC
6
+ import re
5
7
 
8
+ import uuid, datetime
6
9
  import urllib.parse
7
- import uuid
8
10
  import zipfile, io, asyncio
9
11
 
10
12
  import aiosqlite, aiofiles
@@ -19,7 +21,7 @@ from .datatype import (
19
21
  )
20
22
  from .config import LARGE_BLOB_DIR, CHUNK_SIZE, LARGE_FILE_BYTES, MAX_MEM_FILE_BYTES
21
23
  from .log import get_logger
22
- from .utils import decode_uri_compnents, hash_credential, concurrent_wrap, debounce_async, copy_file
24
+ from .utils import decode_uri_compnents, hash_credential, concurrent_wrap, debounce_async, static_vars
23
25
  from .error import *
24
26
 
25
27
  class DBObjectBase(ABC):
@@ -82,9 +84,12 @@ class UserConn(DBObjectBase):
82
84
  self, username: str, password: str, is_admin: bool = False,
83
85
  max_storage: int = 1073741824, permission: FileReadPermission = FileReadPermission.UNSET
84
86
  ) -> int:
85
- assert not username.startswith('_'), "Error: reserved username"
86
- assert not ('/' in username or len(username) > 255), "Invalid username"
87
- assert urllib.parse.quote(username) == username, "Invalid username, must be URL safe"
87
+ def validate_username(username: str):
88
+ assert not set(username) & {'/', ':'}, "Invalid username"
89
+ assert not username.startswith('_'), "Error: reserved username"
90
+ assert not (len(username) > 255), "Username too long"
91
+ assert urllib.parse.quote(username) == username, "Invalid username, must be URL safe"
92
+ validate_username(username)
88
93
  self.logger.debug(f"Creating user {username}")
89
94
  credential = hash_credential(username, password)
90
95
  assert await self.get_user(username) is None, "Duplicate username"
@@ -192,6 +197,11 @@ class FileConn(DBObjectBase):
192
197
  def parse_record(record) -> FileRecord:
193
198
  return FileRecord(*record)
194
199
 
200
+ @staticmethod
201
+ def escape_sqlike(url: str) -> str:
202
+ """ Escape a url for use in SQL LIKE clause (The % and _ characters) """
203
+ return url.replace('%', r'\%').replace('_', r'\_')
204
+
195
205
  @overload
196
206
  async def get_file_record(self, url: str, throw: Literal[True]) -> FileRecord: ...
197
207
  @overload
@@ -205,6 +215,10 @@ class FileConn(DBObjectBase):
205
215
  return self.parse_record(res)
206
216
 
207
217
  async def get_file_records(self, urls: list[str]) -> list[FileRecord]:
218
+ """
219
+ Get all file records with the given urls, only urls in the database will be returned.
220
+ If the urls are not in the database, they will be ignored.
221
+ """
208
222
  await self.cur.execute("SELECT * FROM fmeta WHERE url IN ({})".format(','.join(['?'] * len(urls))), urls)
209
223
  res = await self.cur.fetchall()
210
224
  if res is None:
@@ -220,12 +234,12 @@ class FileConn(DBObjectBase):
220
234
  await self.cur.execute("SELECT username FROM user")
221
235
  res = await self.cur.fetchall()
222
236
  dirnames = [u[0] + '/' for u in res]
223
- dirs = [await self.get_path_record(u) for u in dirnames] if not skim else [DirectoryRecord(u) for u in dirnames]
237
+ dirs = [await self.get_dir_record(u) for u in dirnames] if not skim else [DirectoryRecord(u) for u in dirnames]
224
238
  return dirs
225
239
  else:
226
240
  # list specific users
227
241
  dirnames = [uname + '/' for uname in usernames]
228
- dirs = [await self.get_path_record(u) for u in dirnames] if not skim else [DirectoryRecord(u) for u in dirnames]
242
+ dirs = [await self.get_dir_record(u) for u in dirnames] if not skim else [DirectoryRecord(u) for u in dirnames]
229
243
  return dirs
230
244
 
231
245
  async def count_path_dirs(self, url: str):
@@ -237,16 +251,16 @@ class FileConn(DBObjectBase):
237
251
  url, LENGTH(?) + 1,
238
252
  INSTR(SUBSTR(url, LENGTH(?) + 1), '/')
239
253
  ) AS dirname
240
- FROM fmeta WHERE url LIKE ? AND dirname != ''
254
+ FROM fmeta WHERE url LIKE ? ESCAPE '\\' AND dirname != ''
241
255
  )
242
- """, (url, url, url + '%'))
256
+ """, (url, url, self.escape_sqlike(url) + '%'))
243
257
  res = await cursor.fetchone()
244
258
  assert res is not None, "Error: count_path_dirs"
245
259
  return res[0]
246
260
 
247
261
  async def list_path_dirs(
248
262
  self, url: str,
249
- offset: int = 0, limit: int = int(1e5),
263
+ offset: int = 0, limit: int = 10_000,
250
264
  order_by: DirSortKey = '', order_desc: bool = False,
251
265
  skim: bool = True
252
266
  ) -> list[DirectoryRecord]:
@@ -262,35 +276,41 @@ class FileConn(DBObjectBase):
262
276
  1 + LENGTH(?),
263
277
  INSTR(SUBSTR(url, 1 + LENGTH(?)), '/')
264
278
  ) AS dirname
265
- FROM fmeta WHERE url LIKE ? AND dirname != ''
279
+ FROM fmeta WHERE url LIKE ? ESCAPE '\\' AND dirname != ''
266
280
  """ \
267
281
  + (f"ORDER BY {order_by} {'DESC' if order_desc else 'ASC'}" if order_by else '') \
268
282
  + " LIMIT ? OFFSET ?"
269
- cursor = await self.cur.execute(sql_qury, (url, url, url + '%', limit, offset))
283
+ cursor = await self.cur.execute(sql_qury, (url, url, self.escape_sqlike(url) + '%', limit, offset))
270
284
  res = await cursor.fetchall()
271
285
  dirs_str = [r[0] for r in res]
272
286
  async def get_dir(dir_url):
273
287
  if skim:
274
288
  return DirectoryRecord(dir_url)
275
289
  else:
276
- return await self.get_path_record(dir_url)
290
+ return await self.get_dir_record(dir_url)
277
291
  dirs = [await get_dir(url + d) for d in dirs_str]
278
292
  return dirs
279
293
 
280
- async def count_path_files(self, url: str, flat: bool = False):
294
+ async def count_dir_files(self, url: str, flat: bool = False):
281
295
  if not url.endswith('/'): url += '/'
282
296
  if url == '/': url = ''
283
297
  if flat:
284
- cursor = await self.cur.execute("SELECT COUNT(*) FROM fmeta WHERE url LIKE ?", (url + '%', ))
298
+ cursor = await self.cur.execute(
299
+ "SELECT COUNT(*) FROM fmeta WHERE url LIKE ? ESCAPE '\\'",
300
+ (self.escape_sqlike(url) + '%', )
301
+ )
285
302
  else:
286
- cursor = await self.cur.execute("SELECT COUNT(*) FROM fmeta WHERE url LIKE ? AND url NOT LIKE ?", (url + '%', url + '%/%'))
303
+ cursor = await self.cur.execute(
304
+ "SELECT COUNT(*) FROM fmeta WHERE url LIKE ? ESCAPE '\\' AND url NOT LIKE ? ESCAPE '\\'",
305
+ (self.escape_sqlike(url) + '%', self.escape_sqlike(url) + '%/%')
306
+ )
287
307
  res = await cursor.fetchone()
288
308
  assert res is not None, "Error: count_path_files"
289
309
  return res[0]
290
310
 
291
- async def list_path_files(
311
+ async def list_dir_files(
292
312
  self, url: str,
293
- offset: int = 0, limit: int = int(1e5),
313
+ offset: int = 0, limit: int = 10_000,
294
314
  order_by: FileSortKey = '', order_desc: bool = False,
295
315
  flat: bool = False,
296
316
  ) -> list[FileRecord]:
@@ -300,14 +320,14 @@ class FileConn(DBObjectBase):
300
320
  if not url.endswith('/'): url += '/'
301
321
  if url == '/': url = ''
302
322
 
303
- sql_query = "SELECT * FROM fmeta WHERE url LIKE ?"
304
- if not flat: sql_query += " AND url NOT LIKE ?"
323
+ sql_query = "SELECT * FROM fmeta WHERE url LIKE ? ESCAPE '\\'"
324
+ if not flat: sql_query += " AND url NOT LIKE ? ESCAPE '\\'"
305
325
  if order_by: sql_query += f" ORDER BY {order_by} {'DESC' if order_desc else 'ASC'}"
306
326
  sql_query += " LIMIT ? OFFSET ?"
307
327
  if flat:
308
- cursor = await self.cur.execute(sql_query, (url + '%', limit, offset))
328
+ cursor = await self.cur.execute(sql_query, (self.escape_sqlike(url) + '%', limit, offset))
309
329
  else:
310
- cursor = await self.cur.execute(sql_query, (url + '%', url + '%/%', limit, offset))
330
+ cursor = await self.cur.execute(sql_query, (self.escape_sqlike(url) + '%', self.escape_sqlike(url) + '%/%', limit, offset))
311
331
  res = await cursor.fetchall()
312
332
  files = [self.parse_record(r) for r in res]
313
333
  return files
@@ -321,17 +341,17 @@ class FileConn(DBObjectBase):
321
341
  - It cannot flatten directories
322
342
  - It cannot list directories with details
323
343
  """
324
- MAX_ITEMS = int(1e4)
344
+ MAX_ITEMS = 10_000
325
345
  dir_count = await self.count_path_dirs(url)
326
- file_count = await self.count_path_files(url, flat=False)
346
+ file_count = await self.count_dir_files(url, flat=False)
327
347
  if dir_count + file_count > MAX_ITEMS:
328
348
  raise TooManyItemsError("Too many items, please paginate")
329
349
  return PathContents(
330
350
  dirs = await self.list_path_dirs(url, skim=True, limit=MAX_ITEMS),
331
- files = await self.list_path_files(url, flat=False, limit=MAX_ITEMS)
351
+ files = await self.list_dir_files(url, flat=False, limit=MAX_ITEMS)
332
352
  )
333
353
 
334
- async def get_path_record(self, url: str) -> DirectoryRecord:
354
+ async def get_dir_record(self, url: str) -> DirectoryRecord:
335
355
  """
336
356
  Get the full record of a directory, including size, create_time, update_time, access_time etc.
337
357
  """
@@ -342,8 +362,8 @@ class FileConn(DBObjectBase):
342
362
  MAX(access_time) as access_time,
343
363
  COUNT(*) as n_files
344
364
  FROM fmeta
345
- WHERE url LIKE ?
346
- """, (url + '%', ))
365
+ WHERE url LIKE ? ESCAPE '\\'
366
+ """, (self.escape_sqlike(url) + '%', ))
347
367
  result = await cursor.fetchone()
348
368
  if result is None or any(val is None for val in result):
349
369
  raise PathNotFoundError(f"Path {url} not found")
@@ -367,10 +387,16 @@ class FileConn(DBObjectBase):
367
387
  if not url.endswith('/'):
368
388
  url += '/'
369
389
  if not include_subpath:
370
- cursor = await self.cur.execute("SELECT SUM(file_size) FROM fmeta WHERE url LIKE ? AND url NOT LIKE ?", (url + '%', url + '%/%'))
390
+ cursor = await self.cur.execute(
391
+ "SELECT SUM(file_size) FROM fmeta WHERE url LIKE ? ESCAPE '\\' AND url NOT LIKE ? ESCAPE '\\'",
392
+ (self.escape_sqlike(url) + '%', self.escape_sqlike(url) + '%/%')
393
+ )
371
394
  res = await cursor.fetchone()
372
395
  else:
373
- cursor = await self.cur.execute("SELECT SUM(file_size) FROM fmeta WHERE url LIKE ?", (url + '%', ))
396
+ cursor = await self.cur.execute(
397
+ "SELECT SUM(file_size) FROM fmeta WHERE url LIKE ? ESCAPE '\\'",
398
+ (self.escape_sqlike(url) + '%', )
399
+ )
374
400
  res = await cursor.fetchone()
375
401
  assert res is not None
376
402
  return res[0] or 0
@@ -406,51 +432,49 @@ class FileConn(DBObjectBase):
406
432
  await self._user_size_inc(owner_id, file_size)
407
433
  self.logger.info(f"File {url} created")
408
434
 
409
- # not tested
410
435
  async def copy_file(self, old_url: str, new_url: str, user_id: Optional[int] = None):
436
+ """
437
+ Copy file from old_url to new_url,
438
+ if user_id is None, will not change the owner_id of the file. Otherwise, will change the owner_id to user_id.
439
+ """
411
440
  old = await self.get_file_record(old_url)
412
441
  if old is None:
413
442
  raise FileNotFoundError(f"File {old_url} not found")
414
443
  new_exists = await self.get_file_record(new_url)
415
444
  if new_exists is not None:
416
445
  raise FileExistsError(f"File {new_url} already exists")
417
- new_fid = str(uuid.uuid4())
418
446
  user_id = old.owner_id if user_id is None else user_id
419
447
  await self.cur.execute(
420
448
  "INSERT INTO fmeta (url, owner_id, file_id, file_size, permission, external, mime_type) VALUES (?, ?, ?, ?, ?, ?, ?)",
421
- (new_url, user_id, new_fid, old.file_size, old.permission, old.external, old.mime_type)
449
+ (new_url, user_id, old.file_id, old.file_size, old.permission, old.external, old.mime_type)
422
450
  )
423
- if not old.external:
424
- await self.set_file_blob(new_fid, await self.get_file_blob(old.file_id))
425
- else:
426
- await copy_file(LARGE_BLOB_DIR / old.file_id, LARGE_BLOB_DIR / new_fid)
451
+ await self.cur.execute("INSERT OR REPLACE INTO dupcount (file_id, count) VALUES (?, COALESCE((SELECT count FROM dupcount WHERE file_id = ?), 0) + 1)", (old.file_id, old.file_id))
427
452
  await self._user_size_inc(user_id, old.file_size)
428
453
  self.logger.info(f"Copied file {old_url} to {new_url}")
429
454
 
430
- async def copy_path(self, old_url: str, new_url: str, user_id: Optional[int] = None):
455
+ async def copy_dir(self, old_url: str, new_url: str, user_id: Optional[int] = None):
456
+ """
457
+ Copy all files under old_url to new_url,
458
+ if user_id is None, will not change the owner_id of the files. Otherwise, will change the owner_id to user_id.
459
+ """
431
460
  assert old_url.endswith('/'), "Old path must end with /"
432
461
  assert new_url.endswith('/'), "New path must end with /"
433
- if user_id is None:
434
- cursor = await self.cur.execute("SELECT * FROM fmeta WHERE url LIKE ?", (old_url + '%', ))
435
- res = await cursor.fetchall()
436
- else:
437
- cursor = await self.cur.execute("SELECT * FROM fmeta WHERE url LIKE ? AND owner_id = ?", (old_url + '%', user_id))
438
- res = await cursor.fetchall()
462
+ cursor = await self.cur.execute(
463
+ "SELECT * FROM fmeta WHERE url LIKE ? ESCAPE '\\'",
464
+ (self.escape_sqlike(old_url) + '%', )
465
+ )
466
+ res = await cursor.fetchall()
439
467
  for r in res:
440
468
  old_record = FileRecord(*r)
441
469
  new_r = new_url + old_record.url[len(old_url):]
442
470
  if await (await self.cur.execute("SELECT url FROM fmeta WHERE url = ?", (new_r, ))).fetchone() is not None:
443
471
  raise FileExistsError(f"File {new_r} already exists")
444
- new_fid = str(uuid.uuid4())
445
472
  user_id = old_record.owner_id if user_id is None else user_id
446
473
  await self.cur.execute(
447
474
  "INSERT INTO fmeta (url, owner_id, file_id, file_size, permission, external, mime_type) VALUES (?, ?, ?, ?, ?, ?, ?)",
448
- (new_r, user_id, new_fid, old_record.file_size, old_record.permission, old_record.external, old_record.mime_type)
475
+ (new_r, user_id, old_record.file_id, old_record.file_size, old_record.permission, old_record.external, old_record.mime_type)
449
476
  )
450
- if not old_record.external:
451
- await self.set_file_blob(new_fid, await self.get_file_blob(old_record.file_id))
452
- else:
453
- await copy_file(LARGE_BLOB_DIR / old_record.file_id, LARGE_BLOB_DIR / new_fid)
477
+ await self.cur.execute("INSERT OR REPLACE INTO dupcount (file_id, count) VALUES (?, COALESCE((SELECT count FROM dupcount WHERE file_id = ?), 0) + 1)", (old_record.file_id, old_record.file_id))
454
478
  await self._user_size_inc(user_id, old_record.file_size)
455
479
  self.logger.info(f"Copied path {old_url} to {new_url}")
456
480
 
@@ -464,14 +488,20 @@ class FileConn(DBObjectBase):
464
488
  await self.cur.execute("UPDATE fmeta SET url = ?, create_time = CURRENT_TIMESTAMP WHERE url = ?", (new_url, old_url))
465
489
  self.logger.info(f"Moved file {old_url} to {new_url}")
466
490
 
467
- async def move_path(self, old_url: str, new_url: str, user_id: Optional[int] = None):
491
+ async def move_dir(self, old_url: str, new_url: str, user_id: Optional[int] = None):
468
492
  assert old_url.endswith('/'), "Old path must end with /"
469
493
  assert new_url.endswith('/'), "New path must end with /"
470
494
  if user_id is None:
471
- cursor = await self.cur.execute("SELECT * FROM fmeta WHERE url LIKE ?", (old_url + '%', ))
495
+ cursor = await self.cur.execute(
496
+ "SELECT * FROM fmeta WHERE url LIKE ? ESCAPE '\\'",
497
+ (self.escape_sqlike(old_url) + '%', )
498
+ )
472
499
  res = await cursor.fetchall()
473
500
  else:
474
- cursor = await self.cur.execute("SELECT * FROM fmeta WHERE url LIKE ? AND owner_id = ?", (old_url + '%', user_id))
501
+ cursor = await self.cur.execute(
502
+ "SELECT * FROM fmeta WHERE url LIKE ? ESCAPE '\\' AND owner_id = ?",
503
+ (self.escape_sqlike(old_url) + '%', user_id)
504
+ )
475
505
  res = await cursor.fetchall()
476
506
  for r in res:
477
507
  new_r = new_url + r[0][len(old_url):]
@@ -494,6 +524,7 @@ class FileConn(DBObjectBase):
494
524
  return file_record
495
525
 
496
526
  async def delete_user_file_records(self, owner_id: int) -> list[FileRecord]:
527
+ """ Delete all records with owner_id """
497
528
  cursor = await self.cur.execute("SELECT * FROM fmeta WHERE owner_id = ?", (owner_id, ))
498
529
  res = await cursor.fetchall()
499
530
  await self.cur.execute("DELETE FROM usize WHERE user_id = ?", (owner_id, ))
@@ -502,13 +533,19 @@ class FileConn(DBObjectBase):
502
533
  self.logger.info(f"Deleted {len(ret)} file records for user {owner_id}") # type: ignore
503
534
  return ret
504
535
 
505
- async def delete_path_records(self, path: str, under_owner_id: Optional[int] = None) -> list[FileRecord]:
536
+ async def delete_records_by_prefix(self, path: str, under_owner_id: Optional[int] = None) -> list[FileRecord]:
506
537
  """Delete all records with url starting with path"""
507
538
  # update user size
508
- cursor = await self.cur.execute("SELECT DISTINCT owner_id FROM fmeta WHERE url LIKE ?", (path + '%', ))
539
+ cursor = await self.cur.execute(
540
+ "SELECT DISTINCT owner_id FROM fmeta WHERE url LIKE ? ESCAPE '\\'",
541
+ (self.escape_sqlike(path) + '%', )
542
+ )
509
543
  res = await cursor.fetchall()
510
544
  for r in res:
511
- cursor = await self.cur.execute("SELECT SUM(file_size) FROM fmeta WHERE owner_id = ? AND url LIKE ?", (r[0], path + '%'))
545
+ cursor = await self.cur.execute(
546
+ "SELECT SUM(file_size) FROM fmeta WHERE owner_id = ? AND url LIKE ? ESCAPE '\\'",
547
+ (r[0], self.escape_sqlike(path) + '%')
548
+ )
512
549
  size = await cursor.fetchone()
513
550
  if size is not None:
514
551
  await self._user_size_dec(r[0], size[0])
@@ -517,15 +554,15 @@ class FileConn(DBObjectBase):
517
554
  # but it's not a big deal... we should have only one writer
518
555
 
519
556
  if under_owner_id is None:
520
- res = await self.cur.execute("DELETE FROM fmeta WHERE url LIKE ? RETURNING *", (path + '%', ))
557
+ res = await self.cur.execute("DELETE FROM fmeta WHERE url LIKE ? ESCAPE '\\' RETURNING *", (self.escape_sqlike(path) + '%', ))
521
558
  else:
522
- res = await self.cur.execute("DELETE FROM fmeta WHERE url LIKE ? AND owner_id = ? RETURNING *", (path + '%', under_owner_id))
559
+ res = await self.cur.execute("DELETE FROM fmeta WHERE url LIKE ? ESCAPE '\\' AND owner_id = ? RETURNING *", (self.escape_sqlike(path) + '%', under_owner_id))
523
560
  all_f_rec = await res.fetchall()
524
561
  self.logger.info(f"Deleted {len(all_f_rec)} file(s) for path {path}") # type: ignore
525
562
  return [self.parse_record(r) for r in all_f_rec]
526
563
 
527
564
  async def set_file_blob(self, file_id: str, blob: bytes):
528
- await self.cur.execute("INSERT OR REPLACE INTO blobs.fdata (file_id, data) VALUES (?, ?)", (file_id, blob))
565
+ await self.cur.execute("INSERT INTO blobs.fdata (file_id, data) VALUES (?, ?)", (file_id, blob))
529
566
 
530
567
  @staticmethod
531
568
  async def set_file_blob_external(file_id: str, stream: AsyncIterable[bytes])->int:
@@ -577,16 +614,78 @@ class FileConn(DBObjectBase):
577
614
  if not chunk: break
578
615
  yield chunk
579
616
 
580
- @staticmethod
581
- async def delete_file_blob_external(file_id: str):
617
+ async def unlink_file_blob_external(self, file_id: str):
618
+ # first check if the file has duplication
619
+ cursor = await self.cur.execute("SELECT count FROM dupcount WHERE file_id = ?", (file_id, ))
620
+ res = await cursor.fetchone()
621
+ if res is not None and res[0] > 0:
622
+ await self.cur.execute("UPDATE dupcount SET count = count - 1 WHERE file_id = ?", (file_id, ))
623
+ return
624
+
625
+ # finally delete the file and the duplication count
582
626
  if (LARGE_BLOB_DIR / file_id).exists():
583
627
  await aiofiles.os.remove(LARGE_BLOB_DIR / file_id)
628
+ await self.cur.execute("DELETE FROM dupcount WHERE file_id = ?", (file_id, ))
584
629
 
585
- async def delete_file_blob(self, file_id: str):
630
+ async def unlink_file_blob(self, file_id: str):
631
+ # first check if the file has duplication
632
+ cursor = await self.cur.execute("SELECT count FROM dupcount WHERE file_id = ?", (file_id, ))
633
+ res = await cursor.fetchone()
634
+ if res is not None and res[0] > 0:
635
+ await self.cur.execute("UPDATE dupcount SET count = count - 1 WHERE file_id = ?", (file_id, ))
636
+ return
637
+
638
+ # finally delete the file and the duplication count
586
639
  await self.cur.execute("DELETE FROM blobs.fdata WHERE file_id = ?", (file_id, ))
640
+ await self.cur.execute("DELETE FROM dupcount WHERE file_id = ?", (file_id, ))
641
+
642
+ async def _group_del(self, file_ids_all: list[str]):
643
+ """
644
+ The file_ids_all may contain duplication,
645
+ yield tuples of unique (to_del_ids, to_dec_ids) for each iteration,
646
+ every iteration should unlink one copy of the files, repeat until all re-occurrence in the input list are removed.
647
+ """
648
+ async def check_dup(file_ids: set[str]):
649
+ cursor = await self.cur.execute("SELECT file_id FROM dupcount WHERE file_id IN ({}) AND count > 0".format(','.join(['?'] * len(file_ids))), tuple(file_ids))
650
+ res = await cursor.fetchall()
651
+ to_dec_ids = [r[0] for r in res]
652
+ to_del_ids = list(file_ids - set(to_dec_ids))
653
+ return to_del_ids, to_dec_ids
654
+ # gather duplication from all file_ids
655
+ fid_occurrence = {}
656
+ for file_id in file_ids_all:
657
+ fid_occurrence[file_id] = fid_occurrence.get(file_id, 0) + 1
658
+ while fid_occurrence:
659
+ to_del_ids, to_dec_ids = await check_dup(set(fid_occurrence.keys()))
660
+ for file_id in to_del_ids:
661
+ del fid_occurrence[file_id]
662
+ for file_id in to_dec_ids:
663
+ fid_occurrence[file_id] -= 1
664
+ if fid_occurrence[file_id] == 0:
665
+ del fid_occurrence[file_id]
666
+ yield (to_del_ids, to_dec_ids)
667
+
668
+ async def unlink_file_blobs(self, file_ids: list[str]):
669
+ async for (to_del_ids, to_dec_ids) in self._group_del(file_ids):
670
+ # delete the only copy
671
+ await self.cur.execute("DELETE FROM blobs.fdata WHERE file_id IN ({})".format(','.join(['?'] * len(to_del_ids))), to_del_ids)
672
+ await self.cur.execute("DELETE FROM dupcount WHERE file_id IN ({})".format(','.join(['?'] * len(to_del_ids))), to_del_ids)
673
+ # decrease duplication count
674
+ await self.cur.execute("UPDATE dupcount SET count = count - 1 WHERE file_id IN ({})".format(','.join(['?'] * len(to_dec_ids))), to_dec_ids)
587
675
 
588
- async def delete_file_blobs(self, file_ids: list[str]):
589
- await self.cur.execute("DELETE FROM blobs.fdata WHERE file_id IN ({})".format(','.join(['?'] * len(file_ids))), file_ids)
676
+ async def unlink_file_blobs_external(self, file_ids: list[str]):
677
+ async def del_file(file_id: str):
678
+ if (LARGE_BLOB_DIR / file_id).exists():
679
+ await aiofiles.os.remove(LARGE_BLOB_DIR / file_id)
680
+ async for (to_del_ids, to_dec_ids) in self._group_del(file_ids):
681
+ # delete the only copy
682
+ await asyncio.gather(*(
683
+ [del_file(file_id) for file_id in to_del_ids] +
684
+ [self.cur.execute("DELETE FROM dupcount WHERE file_id = ?", (file_id, )) for file_id in to_del_ids]
685
+ ))
686
+ # decrease duplication count
687
+ await self.cur.execute("UPDATE dupcount SET count = count - 1 WHERE file_id IN ({})".format(','.join(['?'] * len(to_dec_ids))), to_dec_ids)
688
+
590
689
 
591
690
  _log_active_queue = []
592
691
  _log_active_lock = asyncio.Lock()
@@ -618,20 +717,35 @@ async def delayed_log_access(url: str):
618
717
  _log_access_queue.append(url)
619
718
  await _log_all_access()
620
719
 
621
- def validate_url(url: str, is_file = True):
622
- prohibited_chars = ['..', ';', "'", '"', '\\', '\0', '\n', '\r', '\t', '\x0b', '\x0c']
623
- ret = not url.startswith('/') and not url.startswith('_') and not url.startswith('.')
624
- ret = ret and not any([c in url for c in prohibited_chars])
720
+ @static_vars(
721
+ prohibited_regex = re.compile(
722
+ r"^[/_.]", # start with / or _ or .
723
+ ),
724
+ prohibited_part_regex = re.compile(
725
+ "|".join([
726
+ r"^\s*\.+\s*$", # dot path
727
+ "[{}]".format("".join(re.escape(c) for c in ('/', "\\", "'", '"', "*"))), # prohibited characters
728
+ ])
729
+ ),
730
+ )
731
+ def validate_url(url: str, utype: Literal['file', 'dir'] = 'file'):
732
+ """ Check if a path is valid. The input path is considered url safe """
733
+ if len(url) > 1024:
734
+ raise InvalidPathError(f"URL too long: {url}")
625
735
 
626
- if not ret:
736
+ is_valid = validate_url.prohibited_regex.search(url) is None
737
+ if not is_valid: # early return, no need to check further
627
738
  raise InvalidPathError(f"Invalid URL: {url}")
628
-
629
- if is_file:
630
- ret = ret and not url.endswith('/')
631
- else:
632
- ret = ret and url.endswith('/')
633
739
 
634
- if not ret:
740
+ for part in url.split('/'):
741
+ if validate_url.prohibited_part_regex.search(urllib.parse.unquote(part)):
742
+ is_valid = False
743
+ break
744
+
745
+ if utype == 'file': is_valid = is_valid and not url.endswith('/')
746
+ else: is_valid = is_valid and url.endswith('/')
747
+
748
+ if not is_valid:
635
749
  raise InvalidPathError(f"Invalid URL: {url}")
636
750
 
637
751
  async def get_user(cur: aiosqlite.Cursor, user: int | str) -> Optional[UserRecord]:
@@ -752,6 +866,58 @@ class Database:
752
866
  yield blob
753
867
  ret = blob_stream()
754
868
  return ret
869
+
870
+ async def read_files_bulk(
871
+ self, urls: list[str],
872
+ skip_content = False,
873
+ op_user: Optional[UserRecord] = None,
874
+ ) -> dict[str, Optional[bytes]]:
875
+ """
876
+ A frequent use case is to read multiple files at once,
877
+ this method will read all files in the list and return a dict of url -> blob.
878
+ if the file is not found, the value will be None.
879
+ - skip_content: if True, will not read the content of the file, resulting in a dict of url -> b''
880
+
881
+ may raise StorageExceededError if the total size of the files exceeds MAX_MEM_FILE_BYTES
882
+ """
883
+ for url in urls:
884
+ validate_url(url)
885
+
886
+ async with unique_cursor() as cur:
887
+ fconn = FileConn(cur)
888
+ file_records = await fconn.get_file_records(urls)
889
+
890
+ if op_user is not None:
891
+ for r in file_records:
892
+ if await check_path_permission(r.url, op_user, cursor=cur) >= AccessLevel.READ:
893
+ continue
894
+ is_allowed, reason = await check_file_read_permission(op_user, r, cursor=cur)
895
+ if not is_allowed:
896
+ raise PermissionDeniedError(f"Permission denied: {op_user.username} cannot read file {r.url}: {reason}")
897
+
898
+ # first check if the files are too big
899
+ sum_size = sum([r.file_size for r in file_records])
900
+ if not skip_content and sum_size > MAX_MEM_FILE_BYTES:
901
+ raise StorageExceededError(f"Unable to read files at once, total size {sum_size} exceeds {MAX_MEM_FILE_BYTES}")
902
+
903
+ self.logger.debug(f"Reading {len(file_records)} files{' (skip content)' if skip_content else ''}, getting {sum_size} bytes, from {urls}")
904
+ # read the file content
905
+ async with unique_cursor() as cur:
906
+ fconn = FileConn(cur)
907
+ blobs: dict[str, bytes] = {}
908
+ for r in file_records:
909
+ if skip_content:
910
+ blobs[r.url] = b''
911
+ continue
912
+
913
+ if r.external:
914
+ blob_iter = fconn.get_file_blob_external(r.file_id)
915
+ blob = b''.join([chunk async for chunk in blob_iter])
916
+ else:
917
+ blob = await fconn.get_file_blob(r.file_id)
918
+ blobs[r.url] = blob
919
+
920
+ return {url: blobs.get(url, None) for url in urls}
755
921
 
756
922
  async def delete_file(self, url: str, op_user: Optional[UserRecord] = None) -> Optional[FileRecord]:
757
923
  validate_url(url)
@@ -768,9 +934,9 @@ class Database:
768
934
  raise PermissionDeniedError(f"Permission denied: {op_user.username} cannot delete file {url}")
769
935
  f_id = r.file_id
770
936
  if r.external:
771
- await fconn.delete_file_blob_external(f_id)
937
+ await fconn.unlink_file_blob_external(f_id)
772
938
  else:
773
- await fconn.delete_file_blob(f_id)
939
+ await fconn.unlink_file_blob(f_id)
774
940
  return r
775
941
 
776
942
  async def move_file(self, old_url: str, new_url: str, op_user: Optional[UserRecord] = None):
@@ -810,9 +976,9 @@ class Database:
810
976
  raise PermissionDeniedError(f"Permission denied: {op_user.username} cannot copy file to {new_url}")
811
977
  await fconn.copy_file(old_url, new_url, user_id=op_user.id if op_user is not None else None)
812
978
 
813
- async def move_path(self, old_url: str, new_url: str, op_user: UserRecord):
814
- validate_url(old_url, is_file=False)
815
- validate_url(new_url, is_file=False)
979
+ async def move_dir(self, old_url: str, new_url: str, op_user: UserRecord):
980
+ validate_url(old_url, 'dir')
981
+ validate_url(new_url, 'dir')
816
982
 
817
983
  if new_url.startswith('/'):
818
984
  new_url = new_url[1:]
@@ -831,12 +997,11 @@ class Database:
831
997
 
832
998
  async with transaction() as cur:
833
999
  fconn = FileConn(cur)
834
- await fconn.move_path(old_url, new_url, op_user.id)
1000
+ await fconn.move_dir(old_url, new_url, op_user.id)
835
1001
 
836
- # not tested
837
- async def copy_path(self, old_url: str, new_url: str, op_user: UserRecord):
838
- validate_url(old_url, is_file=False)
839
- validate_url(new_url, is_file=False)
1002
+ async def copy_dir(self, old_url: str, new_url: str, op_user: UserRecord):
1003
+ validate_url(old_url, 'dir')
1004
+ validate_url(new_url, 'dir')
840
1005
 
841
1006
  if new_url.startswith('/'):
842
1007
  new_url = new_url[1:]
@@ -855,7 +1020,7 @@ class Database:
855
1020
 
856
1021
  async with transaction() as cur:
857
1022
  fconn = FileConn(cur)
858
- await fconn.copy_path(old_url, new_url, op_user.id)
1023
+ await fconn.copy_dir(old_url, new_url, op_user.id)
859
1024
 
860
1025
  async def __batch_delete_file_blobs(self, fconn: FileConn, file_records: list[FileRecord], batch_size: int = 512):
861
1026
  # https://github.com/langchain-ai/langchain/issues/10321
@@ -869,19 +1034,20 @@ class Database:
869
1034
 
870
1035
  async def del_internal():
871
1036
  for i in range(0, len(internal_ids), batch_size):
872
- await fconn.delete_file_blobs([r for r in internal_ids[i:i+batch_size]])
1037
+ await fconn.unlink_file_blobs([r for r in internal_ids[i:i+batch_size]])
873
1038
  async def del_external():
874
- for i in range(0, len(external_ids)):
875
- await fconn.delete_file_blob_external(external_ids[i])
876
- await asyncio.gather(del_internal(), del_external())
1039
+ for i in range(0, len(external_ids), batch_size):
1040
+ await fconn.unlink_file_blobs_external([r for r in external_ids[i:i+batch_size]])
1041
+ await del_internal()
1042
+ await del_external()
877
1043
 
878
- async def delete_path(self, url: str, op_user: Optional[UserRecord] = None) -> Optional[list[FileRecord]]:
879
- validate_url(url, is_file=False)
1044
+ async def delete_dir(self, url: str, op_user: Optional[UserRecord] = None) -> Optional[list[FileRecord]]:
1045
+ validate_url(url, 'dir')
880
1046
  from_owner_id = op_user.id if op_user is not None and not (op_user.is_admin or await check_path_permission(url, op_user) >= AccessLevel.WRITE) else None
881
1047
 
882
1048
  async with transaction() as cur:
883
1049
  fconn = FileConn(cur)
884
- records = await fconn.delete_path_records(url, from_owner_id)
1050
+ records = await fconn.delete_records_by_prefix(url, from_owner_id)
885
1051
  if not records:
886
1052
  return None
887
1053
  await self.__batch_delete_file_blobs(fconn, records)
@@ -905,14 +1071,15 @@ class Database:
905
1071
 
906
1072
  # make sure the user's directory is deleted,
907
1073
  # may contain admin's files, but delete them all
908
- await fconn.delete_path_records(user.username + '/')
1074
+ await fconn.delete_records_by_prefix(user.username + '/')
909
1075
 
910
- async def iter_path(self, top_url: str, urls: Optional[list[str]]) -> AsyncIterable[tuple[FileRecord, bytes | AsyncIterable[bytes]]]:
1076
+ async def iter_dir(self, top_url: str, urls: Optional[list[str]]) -> AsyncIterable[tuple[FileRecord, bytes | AsyncIterable[bytes]]]:
1077
+ validate_url(top_url, 'dir')
911
1078
  async with unique_cursor() as cur:
912
1079
  fconn = FileConn(cur)
913
1080
  if urls is None:
914
- fcount = await fconn.count_path_files(top_url, flat=True)
915
- urls = [r.url for r in (await fconn.list_path_files(top_url, flat=True, limit=fcount))]
1081
+ fcount = await fconn.count_dir_files(top_url, flat=True)
1082
+ urls = [r.url for r in (await fconn.list_dir_files(top_url, flat=True, limit=fcount))]
916
1083
 
917
1084
  for url in urls:
918
1085
  if not url.startswith(top_url):
@@ -926,14 +1093,50 @@ class Database:
926
1093
  else:
927
1094
  blob = await fconn.get_file_blob(f_id)
928
1095
  yield r, blob
1096
+
1097
+ async def zip_dir_stream(self, top_url: str, op_user: Optional[UserRecord] = None) -> AsyncIterable[bytes]:
1098
+ from stat import S_IFREG
1099
+ from stream_zip import async_stream_zip, ZIP_64
1100
+ if top_url.startswith('/'):
1101
+ top_url = top_url[1:]
1102
+
1103
+ if op_user:
1104
+ if await check_path_permission(top_url, op_user) < AccessLevel.READ:
1105
+ raise PermissionDeniedError(f"Permission denied: {op_user.username} cannot zip path {top_url}")
1106
+
1107
+ # https://stream-zip.docs.trade.gov.uk/async-interface/
1108
+ async def data_iter():
1109
+ async for (r, blob) in self.iter_dir(top_url, None):
1110
+ rel_path = r.url[len(top_url):]
1111
+ rel_path = decode_uri_compnents(rel_path)
1112
+ b_iter: AsyncIterable[bytes]
1113
+ if isinstance(blob, bytes):
1114
+ async def blob_iter(): yield blob
1115
+ b_iter = blob_iter() # type: ignore
1116
+ else:
1117
+ assert isinstance(blob, AsyncIterable)
1118
+ b_iter = blob
1119
+ yield (
1120
+ rel_path,
1121
+ datetime.datetime.now(),
1122
+ S_IFREG | 0o600,
1123
+ ZIP_64,
1124
+ b_iter
1125
+ )
1126
+ return async_stream_zip(data_iter())
929
1127
 
930
1128
  @concurrent_wrap()
931
- async def zip_path(self, top_url: str, urls: Optional[list[str]]) -> io.BytesIO:
1129
+ async def zip_dir(self, top_url: str, op_user: Optional[UserRecord]) -> io.BytesIO:
932
1130
  if top_url.startswith('/'):
933
1131
  top_url = top_url[1:]
1132
+
1133
+ if op_user:
1134
+ if await check_path_permission(top_url, op_user) < AccessLevel.READ:
1135
+ raise PermissionDeniedError(f"Permission denied: {op_user.username} cannot zip path {top_url}")
1136
+
934
1137
  buffer = io.BytesIO()
935
1138
  with zipfile.ZipFile(buffer, 'w') as zf:
936
- async for (r, blob) in self.iter_path(top_url, urls):
1139
+ async for (r, blob) in self.iter_dir(top_url, None):
937
1140
  rel_path = r.url[len(top_url):]
938
1141
  rel_path = decode_uri_compnents(rel_path)
939
1142
  if r.external:
@@ -945,39 +1148,50 @@ class Database:
945
1148
  buffer.seek(0)
946
1149
  return buffer
947
1150
 
948
- def check_file_read_permission(user: UserRecord, owner: UserRecord, file: FileRecord) -> tuple[bool, str]:
1151
+ async def _get_path_owner(cur: aiosqlite.Cursor, path: str) -> UserRecord:
1152
+ path_username = path.split('/')[0]
1153
+ uconn = UserConn(cur)
1154
+ path_user = await uconn.get_user(path_username)
1155
+ if path_user is None:
1156
+ raise InvalidPathError(f"Invalid path: {path_username} is not a valid username")
1157
+ return path_user
1158
+
1159
+ async def check_file_read_permission(user: UserRecord, file: FileRecord, cursor: Optional[aiosqlite.Cursor] = None) -> tuple[bool, str]:
949
1160
  """
950
1161
  This does not consider alias level permission,
951
1162
  use check_path_permission for alias level permission check first:
952
1163
  ```
953
- if await check_path_permission(path, user) < AccessLevel.READ:
954
- read_allowed, reason = check_file_read_permission(user, owner, file)
1164
+ if await check_path_permission(file.url, user) < AccessLevel.READ:
1165
+ read_allowed, reason = check_file_read_permission(user, file)
955
1166
  ```
1167
+ The implementation assumes the user is not admin and is not the owner of the file/path
956
1168
  """
957
- if user.is_admin:
958
- return True, ""
1169
+ @asynccontextmanager
1170
+ async def this_cur():
1171
+ if cursor is None:
1172
+ async with unique_cursor() as _cur:
1173
+ yield _cur
1174
+ else:
1175
+ yield cursor
1176
+
1177
+ f_perm = file.permission
1178
+
1179
+ # if file permission unset, use path owner's permission as fallback
1180
+ if f_perm == FileReadPermission.UNSET:
1181
+ async with this_cur() as cur:
1182
+ path_owner = await _get_path_owner(cur, file.url)
1183
+ f_perm = path_owner.permission
959
1184
 
960
1185
  # check permission of the file
961
- if file.permission == FileReadPermission.PRIVATE:
962
- if user.id != owner.id:
963
- return False, "Permission denied, private file"
964
- elif file.permission == FileReadPermission.PROTECTED:
1186
+ if f_perm == FileReadPermission.PRIVATE:
1187
+ return False, "Permission denied, private file"
1188
+ elif f_perm == FileReadPermission.PROTECTED:
965
1189
  if user.id == 0:
966
1190
  return False, "Permission denied, protected file"
967
- elif file.permission == FileReadPermission.PUBLIC:
1191
+ elif f_perm == FileReadPermission.PUBLIC:
968
1192
  return True, ""
969
1193
  else:
970
- assert file.permission == FileReadPermission.UNSET
971
-
972
- # use owner's permission as fallback
973
- if owner.permission == FileReadPermission.PRIVATE:
974
- if user.id != owner.id:
975
- return False, "Permission denied, private user file"
976
- elif owner.permission == FileReadPermission.PROTECTED:
977
- if user.id == 0:
978
- return False, "Permission denied, protected user file"
979
- else:
980
- assert owner.permission == FileReadPermission.PUBLIC or owner.permission == FileReadPermission.UNSET
1194
+ assert f_perm == FileReadPermission.UNSET
981
1195
 
982
1196
  return True, ""
983
1197
 
@@ -988,9 +1202,6 @@ async def check_path_permission(path: str, user: UserRecord, cursor: Optional[ai
988
1202
  If the path is a file, the user will have all access if the user is the owner.
989
1203
  Otherwise, the user will have alias level access w.r.t. the path user.
990
1204
  """
991
- if user.id == 0:
992
- return AccessLevel.GUEST
993
-
994
1205
  @asynccontextmanager
995
1206
  async def this_cur():
996
1207
  if cursor is None:
@@ -999,16 +1210,18 @@ async def check_path_permission(path: str, user: UserRecord, cursor: Optional[ai
999
1210
  else:
1000
1211
  yield cursor
1001
1212
 
1002
- # check if path user exists
1003
- path_username = path.split('/')[0]
1213
+ # check if path user exists, may raise exception
1004
1214
  async with this_cur() as cur:
1005
- uconn = UserConn(cur)
1006
- path_user = await uconn.get_user(path_username)
1007
- if path_user is None:
1008
- raise PathNotFoundError(f"Invalid path: {path_username} is not a valid username")
1215
+ path_owner = await _get_path_owner(cur, path)
1009
1216
 
1010
- # check if user is admin
1011
- if user.is_admin or user.username == path_username:
1217
+ if user.id == 0:
1218
+ return AccessLevel.GUEST
1219
+
1220
+ if user.is_admin:
1221
+ return AccessLevel.ALL
1222
+
1223
+ # check if user is admin or the owner of the path
1224
+ if user.id == path_owner.id:
1012
1225
  return AccessLevel.ALL
1013
1226
 
1014
1227
  # if the path is a file, check if the user is the owner
@@ -1022,4 +1235,4 @@ async def check_path_permission(path: str, user: UserRecord, cursor: Optional[ai
1022
1235
  # check alias level
1023
1236
  async with this_cur() as cur:
1024
1237
  uconn = UserConn(cur)
1025
- return await uconn.query_peer_level(user.id, path_user.id)
1238
+ return await uconn.query_peer_level(user.id, path_owner.id)