lfss 0.9.2__py3-none-any.whl → 0.11.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lfss/eng/database.py CHANGED
@@ -1,10 +1,12 @@
 
-from typing import Optional, Literal, AsyncIterable, overload
+from typing import Optional, Literal, overload
+from collections.abc import AsyncIterable
 from contextlib import asynccontextmanager
 from abc import ABC
+import re
 
+import uuid, datetime
 import urllib.parse
-import uuid
 import zipfile, io, asyncio
 
 import aiosqlite, aiofiles
@@ -19,7 +21,7 @@ from .datatype import (
 )
 from .config import LARGE_BLOB_DIR, CHUNK_SIZE, LARGE_FILE_BYTES, MAX_MEM_FILE_BYTES
 from .log import get_logger
-from .utils import decode_uri_compnents, hash_credential, concurrent_wrap, debounce_async, copy_file
+from .utils import decode_uri_compnents, hash_credential, concurrent_wrap, debounce_async, static_vars
 from .error import *
 
 class DBObjectBase(ABC):
@@ -82,9 +84,12 @@ class UserConn(DBObjectBase):
         self, username: str, password: str, is_admin: bool = False,
         max_storage: int = 1073741824, permission: FileReadPermission = FileReadPermission.UNSET
     ) -> int:
-        assert not username.startswith('_'), "Error: reserved username"
-        assert not ('/' in username or len(username) > 255), "Invalid username"
-        assert urllib.parse.quote(username) == username, "Invalid username, must be URL safe"
+        def validate_username(username: str):
+            assert not set(username) & {'/', ':'}, "Invalid username"
+            assert not username.startswith('_'), "Error: reserved username"
+            assert not (len(username) > 255), "Username too long"
+            assert urllib.parse.quote(username) == username, "Invalid username, must be URL safe"
+        validate_username(username)
         self.logger.debug(f"Creating user {username}")
         credential = hash_credential(username, password)
         assert await self.get_user(username) is None, "Duplicate username"
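
Note: username validation is now factored into a local `validate_username` helper and additionally rejects `:`. A standalone sketch of the new checks (mirroring the diff; the example values are ours):

```python
import urllib.parse

def validate_username(username: str):
    assert not set(username) & {'/', ':'}, "Invalid username"
    assert not username.startswith('_'), "Error: reserved username"
    assert not (len(username) > 255), "Username too long"
    assert urllib.parse.quote(username) == username, "Invalid username, must be URL safe"

validate_username("alice")      # passes: URL-safe, no reserved prefix
# validate_username("a:b")      # AssertionError -- ':' is newly rejected
# validate_username("a b")      # AssertionError -- quote() would percent-encode the space
```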
@@ -161,7 +166,7 @@ class UserConn(DBObjectBase):
     async def list_peer_users(self, src_user: int | str, level: AccessLevel) -> list[UserRecord]:
         """
         List all users that src_user can do [AliasLevel] to, with level >= level,
-        Note: the returned list does not include src_user and admin users
+        Note: the returned list does not include src_user and is not appropriate for admin (who has all permissions for all users)
         """
         assert int(level) > AccessLevel.NONE, f"Invalid level, {level}"
         match src_user:
@@ -192,6 +197,11 @@ class FileConn(DBObjectBase):
     def parse_record(record) -> FileRecord:
         return FileRecord(*record)
 
+    @staticmethod
+    def escape_sqlike(url: str) -> str:
+        """ Escape a url for use in SQL LIKE clause (The % and _ characters) """
+        return url.replace('%', r'\%').replace('_', r'\_')
+
     @overload
     async def get_file_record(self, url: str, throw: Literal[True]) -> FileRecord: ...
     @overload
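
Note: `%` and `_` are wildcards in SQLite's LIKE, so every prefix query built from a user path now escapes them and declares `ESCAPE '\'`. Without this, a listing under `alice/my_dir/` would also match `alice/myXdir/`. A quick illustration (our own example path):

```python
# escape_sqlike turns LIKE metacharacters into literals; '\' is the escape
# character declared in the accompanying ESCAPE clause.
url = 'alice/my_dir/'
pattern = url.replace('%', r'\%').replace('_', r'\_') + '%'
assert pattern == r'alice/my\_dir/%'
# resulting SQL: SELECT * FROM fmeta WHERE url LIKE 'alice/my\_dir/%' ESCAPE '\'
```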
@@ -205,6 +215,10 @@ class FileConn(DBObjectBase):
             return self.parse_record(res)
 
     async def get_file_records(self, urls: list[str]) -> list[FileRecord]:
+        """
+        Get all file records with the given urls, only urls in the database will be returned.
+        If the urls are not in the database, they will be ignored.
+        """
         await self.cur.execute("SELECT * FROM fmeta WHERE url IN ({})".format(','.join(['?'] * len(urls))), urls)
         res = await self.cur.fetchall()
         if res is None:
@@ -220,12 +234,12 @@ class FileConn(DBObjectBase):
             await self.cur.execute("SELECT username FROM user")
             res = await self.cur.fetchall()
             dirnames = [u[0] + '/' for u in res]
-            dirs = [await self.get_path_record(u) for u in dirnames] if not skim else [DirectoryRecord(u) for u in dirnames]
+            dirs = [await self.get_dir_record(u) for u in dirnames] if not skim else [DirectoryRecord(u) for u in dirnames]
             return dirs
         else:
             # list specific users
             dirnames = [uname + '/' for uname in usernames]
-            dirs = [await self.get_path_record(u) for u in dirnames] if not skim else [DirectoryRecord(u) for u in dirnames]
+            dirs = [await self.get_dir_record(u) for u in dirnames] if not skim else [DirectoryRecord(u) for u in dirnames]
             return dirs
 
     async def count_path_dirs(self, url: str):
@@ -237,16 +251,16 @@ class FileConn(DBObjectBase):
                 url, LENGTH(?) + 1,
                 INSTR(SUBSTR(url, LENGTH(?) + 1), '/')
             ) AS dirname
-            FROM fmeta WHERE url LIKE ? AND dirname != ''
+            FROM fmeta WHERE url LIKE ? ESCAPE '\\' AND dirname != ''
         )
-        """, (url, url, url + '%'))
+        """, (url, url, self.escape_sqlike(url) + '%'))
         res = await cursor.fetchone()
         assert res is not None, "Error: count_path_dirs"
         return res[0]
 
     async def list_path_dirs(
         self, url: str,
-        offset: int = 0, limit: int = int(1e5),
+        offset: int = 0, limit: int = 10_000,
         order_by: DirSortKey = '', order_desc: bool = False,
         skim: bool = True
     ) -> list[DirectoryRecord]:
@@ -262,35 +276,41 @@ class FileConn(DBObjectBase):
                 1 + LENGTH(?),
                 INSTR(SUBSTR(url, 1 + LENGTH(?)), '/')
             ) AS dirname
-            FROM fmeta WHERE url LIKE ? AND dirname != ''
+            FROM fmeta WHERE url LIKE ? ESCAPE '\\' AND dirname != ''
         """ \
             + (f"ORDER BY {order_by} {'DESC' if order_desc else 'ASC'}" if order_by else '') \
             + " LIMIT ? OFFSET ?"
-        cursor = await self.cur.execute(sql_qury, (url, url, url + '%', limit, offset))
+        cursor = await self.cur.execute(sql_qury, (url, url, self.escape_sqlike(url) + '%', limit, offset))
         res = await cursor.fetchall()
         dirs_str = [r[0] for r in res]
         async def get_dir(dir_url):
             if skim:
                 return DirectoryRecord(dir_url)
             else:
-                return await self.get_path_record(dir_url)
+                return await self.get_dir_record(dir_url)
         dirs = [await get_dir(url + d) for d in dirs_str]
         return dirs
 
-    async def count_path_files(self, url: str, flat: bool = False):
+    async def count_dir_files(self, url: str, flat: bool = False):
         if not url.endswith('/'): url += '/'
         if url == '/': url = ''
         if flat:
-            cursor = await self.cur.execute("SELECT COUNT(*) FROM fmeta WHERE url LIKE ?", (url + '%', ))
+            cursor = await self.cur.execute(
+                "SELECT COUNT(*) FROM fmeta WHERE url LIKE ? ESCAPE '\\'",
+                (self.escape_sqlike(url) + '%', )
+            )
         else:
-            cursor = await self.cur.execute("SELECT COUNT(*) FROM fmeta WHERE url LIKE ? AND url NOT LIKE ?", (url + '%', url + '%/%'))
+            cursor = await self.cur.execute(
+                "SELECT COUNT(*) FROM fmeta WHERE url LIKE ? ESCAPE '\\' AND url NOT LIKE ? ESCAPE '\\'",
+                (self.escape_sqlike(url) + '%', self.escape_sqlike(url) + '%/%')
+            )
         res = await cursor.fetchone()
         assert res is not None, "Error: count_path_files"
         return res[0]
 
-    async def list_path_files(
+    async def list_dir_files(
         self, url: str,
-        offset: int = 0, limit: int = int(1e5),
+        offset: int = 0, limit: int = 10_000,
         order_by: FileSortKey = '', order_desc: bool = False,
         flat: bool = False,
     ) -> list[FileRecord]:
@@ -300,14 +320,14 @@ class FileConn(DBObjectBase):
         if not url.endswith('/'): url += '/'
         if url == '/': url = ''
 
-        sql_query = "SELECT * FROM fmeta WHERE url LIKE ?"
-        if not flat: sql_query += " AND url NOT LIKE ?"
+        sql_query = "SELECT * FROM fmeta WHERE url LIKE ? ESCAPE '\\'"
+        if not flat: sql_query += " AND url NOT LIKE ? ESCAPE '\\'"
         if order_by: sql_query += f" ORDER BY {order_by} {'DESC' if order_desc else 'ASC'}"
         sql_query += " LIMIT ? OFFSET ?"
         if flat:
-            cursor = await self.cur.execute(sql_query, (url + '%', limit, offset))
+            cursor = await self.cur.execute(sql_query, (self.escape_sqlike(url) + '%', limit, offset))
         else:
-            cursor = await self.cur.execute(sql_query, (url + '%', url + '%/%', limit, offset))
+            cursor = await self.cur.execute(sql_query, (self.escape_sqlike(url) + '%', self.escape_sqlike(url) + '%/%', limit, offset))
         res = await cursor.fetchall()
         files = [self.parse_record(r) for r in res]
         return files
@@ -321,17 +341,17 @@ class FileConn(DBObjectBase):
         - It cannot flatten directories
         - It cannot list directories with details
         """
-        MAX_ITEMS = int(1e4)
+        MAX_ITEMS = 10_000
         dir_count = await self.count_path_dirs(url)
-        file_count = await self.count_path_files(url, flat=False)
+        file_count = await self.count_dir_files(url, flat=False)
         if dir_count + file_count > MAX_ITEMS:
             raise TooManyItemsError("Too many items, please paginate")
         return PathContents(
             dirs = await self.list_path_dirs(url, skim=True, limit=MAX_ITEMS),
-            files = await self.list_path_files(url, flat=False, limit=MAX_ITEMS)
+            files = await self.list_dir_files(url, flat=False, limit=MAX_ITEMS)
         )
 
-    async def get_path_record(self, url: str) -> DirectoryRecord:
+    async def get_dir_record(self, url: str) -> DirectoryRecord:
         """
         Get the full record of a directory, including size, create_time, update_time, access_time etc.
         """
@@ -342,8 +362,8 @@ class FileConn(DBObjectBase):
             MAX(access_time) as access_time,
             COUNT(*) as n_files
             FROM fmeta
-            WHERE url LIKE ?
-            """, (url + '%', ))
+            WHERE url LIKE ? ESCAPE '\\'
+            """, (self.escape_sqlike(url) + '%', ))
         result = await cursor.fetchone()
         if result is None or any(val is None for val in result):
             raise PathNotFoundError(f"Path {url} not found")
@@ -367,10 +387,16 @@ class FileConn(DBObjectBase):
         if not url.endswith('/'):
             url += '/'
         if not include_subpath:
-            cursor = await self.cur.execute("SELECT SUM(file_size) FROM fmeta WHERE url LIKE ? AND url NOT LIKE ?", (url + '%', url + '%/%'))
+            cursor = await self.cur.execute(
+                "SELECT SUM(file_size) FROM fmeta WHERE url LIKE ? ESCAPE '\\' AND url NOT LIKE ? ESCAPE '\\'",
+                (self.escape_sqlike(url) + '%', self.escape_sqlike(url) + '%/%')
+            )
             res = await cursor.fetchone()
         else:
-            cursor = await self.cur.execute("SELECT SUM(file_size) FROM fmeta WHERE url LIKE ?", (url + '%', ))
+            cursor = await self.cur.execute(
+                "SELECT SUM(file_size) FROM fmeta WHERE url LIKE ? ESCAPE '\\'",
+                (self.escape_sqlike(url) + '%', )
+            )
             res = await cursor.fetchone()
         assert res is not None
         return res[0] or 0
@@ -406,56 +432,51 @@ class FileConn(DBObjectBase):
         await self._user_size_inc(owner_id, file_size)
         self.logger.info(f"File {url} created")
 
-    # not tested
     async def copy_file(self, old_url: str, new_url: str, user_id: Optional[int] = None):
+        """
+        Copy file from old_url to new_url,
+        if user_id is None, will not change the owner_id of the file. Otherwise, will change the owner_id to user_id.
+        """
         old = await self.get_file_record(old_url)
         if old is None:
             raise FileNotFoundError(f"File {old_url} not found")
         new_exists = await self.get_file_record(new_url)
         if new_exists is not None:
             raise FileExistsError(f"File {new_url} already exists")
-        new_fid = str(uuid.uuid4())
         user_id = old.owner_id if user_id is None else user_id
         await self.cur.execute(
             "INSERT INTO fmeta (url, owner_id, file_id, file_size, permission, external, mime_type) VALUES (?, ?, ?, ?, ?, ?, ?)",
-            (new_url, user_id, new_fid, old.file_size, old.permission, old.external, old.mime_type)
+            (new_url, user_id, old.file_id, old.file_size, old.permission, old.external, old.mime_type)
         )
-        if not old.external:
-            await self.set_file_blob(new_fid, await self.get_file_blob(old.file_id))
-        else:
-            await copy_file(LARGE_BLOB_DIR / old.file_id, LARGE_BLOB_DIR / new_fid)
+        await self.cur.execute("INSERT OR REPLACE INTO dupcount (file_id, count) VALUES (?, COALESCE((SELECT count FROM dupcount WHERE file_id = ?), 0) + 1)", (old.file_id, old.file_id))
         await self._user_size_inc(user_id, old.file_size)
         self.logger.info(f"Copied file {old_url} to {new_url}")
 
-    # not tested
-    async def copy_path(self, old_url: str, new_url: str, conflict_handler: Literal['skip', 'overwrite'] = 'overwrite', user_id: Optional[int] = None):
+    async def copy_dir(self, old_url: str, new_url: str, user_id: Optional[int] = None):
+        """
+        Copy all files under old_url to new_url,
+        if user_id is None, will not change the owner_id of the files. Otherwise, will change the owner_id to user_id.
+        """
         assert old_url.endswith('/'), "Old path must end with /"
         assert new_url.endswith('/'), "New path must end with /"
-        if user_id is None:
-            cursor = await self.cur.execute("SELECT * FROM fmeta WHERE url LIKE ?", (old_url + '%', ))
-            res = await cursor.fetchall()
-        else:
-            cursor = await self.cur.execute("SELECT * FROM fmeta WHERE url LIKE ? AND owner_id = ?", (old_url + '%', user_id))
-            res = await cursor.fetchall()
+        cursor = await self.cur.execute(
+            "SELECT * FROM fmeta WHERE url LIKE ? ESCAPE '\\'",
+            (self.escape_sqlike(old_url) + '%', )
+        )
+        res = await cursor.fetchall()
         for r in res:
             old_record = FileRecord(*r)
             new_r = new_url + old_record.url[len(old_url):]
-            if conflict_handler == 'overwrite':
-                await self.cur.execute("DELETE FROM fmeta WHERE url = ?", (new_r, ))
-            elif conflict_handler == 'skip':
-                if (await self.cur.execute("SELECT url FROM fmeta WHERE url = ?", (new_r, ))) is not None:
-                    continue
-            new_fid = str(uuid.uuid4())
+            if await (await self.cur.execute("SELECT url FROM fmeta WHERE url = ?", (new_r, ))).fetchone() is not None:
+                raise FileExistsError(f"File {new_r} already exists")
             user_id = old_record.owner_id if user_id is None else user_id
             await self.cur.execute(
                 "INSERT INTO fmeta (url, owner_id, file_id, file_size, permission, external, mime_type) VALUES (?, ?, ?, ?, ?, ?, ?)",
-                (new_r, user_id, new_fid, old_record.file_size, old_record.permission, old_record.external, old_record.mime_type)
+                (new_r, user_id, old_record.file_id, old_record.file_size, old_record.permission, old_record.external, old_record.mime_type)
             )
-            if not old_record.external:
-                await self.set_file_blob(new_fid, await self.get_file_blob(old_record.file_id))
-            else:
-                await copy_file(LARGE_BLOB_DIR / old_record.file_id, LARGE_BLOB_DIR / new_fid)
+            await self.cur.execute("INSERT OR REPLACE INTO dupcount (file_id, count) VALUES (?, COALESCE((SELECT count FROM dupcount WHERE file_id = ?), 0) + 1)", (old_record.file_id, old_record.file_id))
             await self._user_size_inc(user_id, old_record.file_size)
+        self.logger.info(f"Copied path {old_url} to {new_url}")
 
     async def move_file(self, old_url: str, new_url: str):
         old = await self.get_file_record(old_url)
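
Note: copying no longer duplicates blob data under a fresh `uuid4` file_id. The new row reuses the source `file_id`, and a `dupcount` row tracks the extra references; a blob is physically removed only when its last reference is unlinked (see the `unlink_file_blob*` hunks below). A self-contained sketch of the upsert's arithmetic, using the synchronous `sqlite3` module for brevity (the real code runs through aiosqlite):

```python
import sqlite3

db = sqlite3.connect(':memory:')
db.execute("CREATE TABLE dupcount (file_id TEXT PRIMARY KEY, count INTEGER)")

def add_ref(fid: str):
    # same statement the diff uses: the first copy inserts count=1,
    # every later copy increments the existing row
    db.execute(
        "INSERT OR REPLACE INTO dupcount (file_id, count) "
        "VALUES (?, COALESCE((SELECT count FROM dupcount WHERE file_id = ?), 0) + 1)",
        (fid, fid))

add_ref('blob-1'); add_ref('blob-1')
print(db.execute("SELECT count FROM dupcount WHERE file_id = 'blob-1'").fetchone())  # (2,)
```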
@@ -467,14 +488,20 @@ class FileConn(DBObjectBase):
         await self.cur.execute("UPDATE fmeta SET url = ?, create_time = CURRENT_TIMESTAMP WHERE url = ?", (new_url, old_url))
         self.logger.info(f"Moved file {old_url} to {new_url}")
 
-    async def move_path(self, old_url: str, new_url: str, user_id: Optional[int] = None):
+    async def move_dir(self, old_url: str, new_url: str, user_id: Optional[int] = None):
         assert old_url.endswith('/'), "Old path must end with /"
         assert new_url.endswith('/'), "New path must end with /"
         if user_id is None:
-            cursor = await self.cur.execute("SELECT * FROM fmeta WHERE url LIKE ?", (old_url + '%', ))
+            cursor = await self.cur.execute(
+                "SELECT * FROM fmeta WHERE url LIKE ? ESCAPE '\\'",
+                (self.escape_sqlike(old_url) + '%', )
+            )
             res = await cursor.fetchall()
         else:
-            cursor = await self.cur.execute("SELECT * FROM fmeta WHERE url LIKE ? AND owner_id = ?", (old_url + '%', user_id))
+            cursor = await self.cur.execute(
+                "SELECT * FROM fmeta WHERE url LIKE ? ESCAPE '\\' AND owner_id = ?",
+                (self.escape_sqlike(old_url) + '%', user_id)
+            )
             res = await cursor.fetchall()
         for r in res:
             new_r = new_url + r[0][len(old_url):]
@@ -497,6 +524,7 @@ class FileConn(DBObjectBase):
             return file_record
 
     async def delete_user_file_records(self, owner_id: int) -> list[FileRecord]:
+        """ Delete all records with owner_id """
         cursor = await self.cur.execute("SELECT * FROM fmeta WHERE owner_id = ?", (owner_id, ))
         res = await cursor.fetchall()
         await self.cur.execute("DELETE FROM usize WHERE user_id = ?", (owner_id, ))
@@ -505,13 +533,19 @@ class FileConn(DBObjectBase):
         self.logger.info(f"Deleted {len(ret)} file records for user {owner_id}") # type: ignore
         return ret
 
-    async def delete_path_records(self, path: str, under_owner_id: Optional[int] = None) -> list[FileRecord]:
+    async def delete_records_by_prefix(self, path: str, under_owner_id: Optional[int] = None) -> list[FileRecord]:
         """Delete all records with url starting with path"""
         # update user size
-        cursor = await self.cur.execute("SELECT DISTINCT owner_id FROM fmeta WHERE url LIKE ?", (path + '%', ))
+        cursor = await self.cur.execute(
+            "SELECT DISTINCT owner_id FROM fmeta WHERE url LIKE ? ESCAPE '\\'",
+            (self.escape_sqlike(path) + '%', )
+        )
         res = await cursor.fetchall()
         for r in res:
-            cursor = await self.cur.execute("SELECT SUM(file_size) FROM fmeta WHERE owner_id = ? AND url LIKE ?", (r[0], path + '%'))
+            cursor = await self.cur.execute(
+                "SELECT SUM(file_size) FROM fmeta WHERE owner_id = ? AND url LIKE ? ESCAPE '\\'",
+                (r[0], self.escape_sqlike(path) + '%')
+            )
             size = await cursor.fetchone()
             if size is not None:
                 await self._user_size_dec(r[0], size[0])
@@ -520,15 +554,15 @@ class FileConn(DBObjectBase):
         # but it's not a big deal... we should have only one writer
 
         if under_owner_id is None:
-            res = await self.cur.execute("DELETE FROM fmeta WHERE url LIKE ? RETURNING *", (path + '%', ))
+            res = await self.cur.execute("DELETE FROM fmeta WHERE url LIKE ? ESCAPE '\\' RETURNING *", (self.escape_sqlike(path) + '%', ))
         else:
-            res = await self.cur.execute("DELETE FROM fmeta WHERE url LIKE ? AND owner_id = ? RETURNING *", (path + '%', under_owner_id))
+            res = await self.cur.execute("DELETE FROM fmeta WHERE url LIKE ? ESCAPE '\\' AND owner_id = ? RETURNING *", (self.escape_sqlike(path) + '%', under_owner_id))
         all_f_rec = await res.fetchall()
         self.logger.info(f"Deleted {len(all_f_rec)} file(s) for path {path}") # type: ignore
         return [self.parse_record(r) for r in all_f_rec]
 
     async def set_file_blob(self, file_id: str, blob: bytes):
-        await self.cur.execute("INSERT OR REPLACE INTO blobs.fdata (file_id, data) VALUES (?, ?)", (file_id, blob))
+        await self.cur.execute("INSERT INTO blobs.fdata (file_id, data) VALUES (?, ?)", (file_id, blob))
 
     @staticmethod
     async def set_file_blob_external(file_id: str, stream: AsyncIterable[bytes])->int:
@@ -580,16 +614,78 @@ class FileConn(DBObjectBase):
             if not chunk: break
             yield chunk
 
-    @staticmethod
-    async def delete_file_blob_external(file_id: str):
+    async def unlink_file_blob_external(self, file_id: str):
+        # first check if the file has duplication
+        cursor = await self.cur.execute("SELECT count FROM dupcount WHERE file_id = ?", (file_id, ))
+        res = await cursor.fetchone()
+        if res is not None and res[0] > 0:
+            await self.cur.execute("UPDATE dupcount SET count = count - 1 WHERE file_id = ?", (file_id, ))
+            return
+
+        # finally delete the file and the duplication count
         if (LARGE_BLOB_DIR / file_id).exists():
             await aiofiles.os.remove(LARGE_BLOB_DIR / file_id)
+        await self.cur.execute("DELETE FROM dupcount WHERE file_id = ?", (file_id, ))
 
-    async def delete_file_blob(self, file_id: str):
+    async def unlink_file_blob(self, file_id: str):
+        # first check if the file has duplication
+        cursor = await self.cur.execute("SELECT count FROM dupcount WHERE file_id = ?", (file_id, ))
+        res = await cursor.fetchone()
+        if res is not None and res[0] > 0:
+            await self.cur.execute("UPDATE dupcount SET count = count - 1 WHERE file_id = ?", (file_id, ))
+            return
+
+        # finally delete the file and the duplication count
         await self.cur.execute("DELETE FROM blobs.fdata WHERE file_id = ?", (file_id, ))
+        await self.cur.execute("DELETE FROM dupcount WHERE file_id = ?", (file_id, ))
 
-    async def delete_file_blobs(self, file_ids: list[str]):
-        await self.cur.execute("DELETE FROM blobs.fdata WHERE file_id IN ({})".format(','.join(['?'] * len(file_ids))), file_ids)
+    async def _group_del(self, file_ids_all: list[str]):
+        """
+        The file_ids_all may contain duplication,
+        yield tuples of unique (to_del_ids, to_dec_ids) for each iteration,
+        every iteration should unlink one copy of the files, repeat until all re-occurrence in the input list are removed.
+        """
+        async def check_dup(file_ids: set[str]):
+            cursor = await self.cur.execute("SELECT file_id FROM dupcount WHERE file_id IN ({}) AND count > 0".format(','.join(['?'] * len(file_ids))), tuple(file_ids))
+            res = await cursor.fetchall()
+            to_dec_ids = [r[0] for r in res]
+            to_del_ids = list(file_ids - set(to_dec_ids))
+            return to_del_ids, to_dec_ids
+        # gather duplication from all file_ids
+        fid_occurrence = {}
+        for file_id in file_ids_all:
+            fid_occurrence[file_id] = fid_occurrence.get(file_id, 0) + 1
+        while fid_occurrence:
+            to_del_ids, to_dec_ids = await check_dup(set(fid_occurrence.keys()))
+            for file_id in to_del_ids:
+                del fid_occurrence[file_id]
+            for file_id in to_dec_ids:
+                fid_occurrence[file_id] -= 1
+                if fid_occurrence[file_id] == 0:
+                    del fid_occurrence[file_id]
+            yield (to_del_ids, to_dec_ids)
+
+    async def unlink_file_blobs(self, file_ids: list[str]):
+        async for (to_del_ids, to_dec_ids) in self._group_del(file_ids):
+            # delete the only copy
+            await self.cur.execute("DELETE FROM blobs.fdata WHERE file_id IN ({})".format(','.join(['?'] * len(to_del_ids))), to_del_ids)
+            await self.cur.execute("DELETE FROM dupcount WHERE file_id IN ({})".format(','.join(['?'] * len(to_del_ids))), to_del_ids)
+            # decrease duplication count
+            await self.cur.execute("UPDATE dupcount SET count = count - 1 WHERE file_id IN ({})".format(','.join(['?'] * len(to_dec_ids))), to_dec_ids)
+
+    async def unlink_file_blobs_external(self, file_ids: list[str]):
+        async def del_file(file_id: str):
+            if (LARGE_BLOB_DIR / file_id).exists():
+                await aiofiles.os.remove(LARGE_BLOB_DIR / file_id)
+        async for (to_del_ids, to_dec_ids) in self._group_del(file_ids):
+            # delete the only copy
+            await asyncio.gather(*(
+                [del_file(file_id) for file_id in to_del_ids] +
+                [self.cur.execute("DELETE FROM dupcount WHERE file_id = ?", (file_id, )) for file_id in to_del_ids]
+            ))
+            # decrease duplication count
+            await self.cur.execute("UPDATE dupcount SET count = count - 1 WHERE file_id IN ({})".format(','.join(['?'] * len(to_dec_ids))), to_dec_ids)
+
 
 _log_active_queue = []
 _log_active_lock = asyncio.Lock()
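
Note: `_group_del` exists because a bulk delete list can name the same `file_id` several times (several fmeta rows sharing one blob). Each yielded round unlinks at most one copy per id: ids with `count > 0` get a decrement (`to_dec_ids`), the rest are physically deleted (`to_del_ids`). A pure-Python trace with a dict standing in for the `dupcount` table (the decrement the caller would issue via UPDATE is inlined):

```python
dupcount = {'a': 1}                      # 'a' has one extra reference; 'b' has none

def group_del(file_ids_all):
    occ = {}
    for f in file_ids_all:               # tally re-occurrences in the input
        occ[f] = occ.get(f, 0) + 1
    while occ:
        to_dec = [f for f in occ if dupcount.get(f, 0) > 0]
        to_del = [f for f in occ if f not in to_dec]
        for f in to_del:
            del occ[f]
        for f in to_dec:
            dupcount[f] -= 1             # stands in for the caller's UPDATE
            occ[f] -= 1
            if occ[f] == 0:
                del occ[f]
        yield to_del, to_dec

print(list(group_del(['a', 'a', 'b'])))
# [(['b'], ['a']), (['a'], [])] -- 'a' first loses its extra reference; its blob
# is only physically deleted in the next round, once the refcount hits zero.
```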
@@ -621,20 +717,35 @@ async def delayed_log_access(url: str):
     _log_access_queue.append(url)
     await _log_all_access()
 
-def validate_url(url: str, is_file = True):
-    prohibited_chars = ['..', ';', "'", '"', '\\', '\0', '\n', '\r', '\t', '\x0b', '\x0c']
-    ret = not url.startswith('/') and not url.startswith('_') and not url.startswith('.')
-    ret = ret and not any([c in url for c in prohibited_chars])
+@static_vars(
+    prohibited_regex = re.compile(
+        r"^[/_.]", # start with / or _ or .
+    ),
+    prohibited_part_regex = re.compile(
+        "|".join([
+            r"^\s*\.+\s*$", # dot path
+            "[{}]".format("".join(re.escape(c) for c in ('/', "\\", "'", '"', "*"))), # prohibited characters
+        ])
+    ),
+)
+def validate_url(url: str, utype: Literal['file', 'dir'] = 'file'):
+    """ Check if a path is valid. The input path is considered url safe """
+    if len(url) > 1024:
+        raise InvalidPathError(f"URL too long: {url}")
 
-    if not ret:
+    is_valid = validate_url.prohibited_regex.search(url) is None
+    if not is_valid: # early return, no need to check further
         raise InvalidPathError(f"Invalid URL: {url}")
-
-    if is_file:
-        ret = ret and not url.endswith('/')
-    else:
-        ret = ret and url.endswith('/')
 
-    if not ret:
+    for part in url.split('/'):
+        if validate_url.prohibited_part_regex.search(urllib.parse.unquote(part)):
+            is_valid = False
+            break
+
+    if utype == 'file': is_valid = is_valid and not url.endswith('/')
+    else: is_valid = is_valid and url.endswith('/')
+
+    if not is_valid:
        raise InvalidPathError(f"Invalid URL: {url}")
 
 async def get_user(cur: aiosqlite.Cursor, user: int | str) -> Optional[UserRecord]:
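
Note: `validate_url` now length-caps paths, rejects a leading `/`, `_` or `.`, and checks each `/`-separated segment after URL-decoding, which also catches percent-encoded dot segments. Expected behavior per the regexes above (the sample paths are ours):

```python
# validate_url('alice/notes.txt')         # ok -- a file must not end with '/'
# validate_url('alice/docs/', 'dir')      # ok -- a dir must end with '/'
# validate_url('_temp/f.txt')             # InvalidPathError: leading '_'
# validate_url('alice/%2E%2E/f.txt')      # InvalidPathError: unquotes to '..' (dot path)
# validate_url('alice/a*b.txt')           # InvalidPathError: '*' is prohibited
# validate_url('x' * 2048)                # InvalidPathError: longer than 1024
```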
@@ -755,6 +866,58 @@ class Database:
                 yield blob
             ret = blob_stream()
         return ret
+
+    async def read_files_bulk(
+        self, urls: list[str],
+        skip_content = False,
+        op_user: Optional[UserRecord] = None,
+    ) -> dict[str, Optional[bytes]]:
+        """
+        A frequent use case is to read multiple files at once,
+        this method will read all files in the list and return a dict of url -> blob.
+        If the file is not found, the value will be None.
+        - skip_content: if True, will not read the content of the file, resulting in a dict of url -> b''
+
+        May raise StorageExceededError if the total size of the files exceeds MAX_MEM_FILE_BYTES
+        """
+        for url in urls:
+            validate_url(url)
+
+        async with unique_cursor() as cur:
+            fconn = FileConn(cur)
+            file_records = await fconn.get_file_records(urls)
+
+            if op_user is not None:
+                for r in file_records:
+                    if await check_path_permission(r.url, op_user, cursor=cur) >= AccessLevel.READ:
+                        continue
+                    is_allowed, reason = await check_file_read_permission(op_user, r, cursor=cur)
+                    if not is_allowed:
+                        raise PermissionDeniedError(f"Permission denied: {op_user.username} cannot read file {r.url}: {reason}")
+
+        # first check if the files are too big
+        sum_size = sum([r.file_size for r in file_records])
+        if not skip_content and sum_size > MAX_MEM_FILE_BYTES:
+            raise StorageExceededError(f"Unable to read files at once, total size {sum_size} exceeds {MAX_MEM_FILE_BYTES}")
+
+        self.logger.debug(f"Reading {len(file_records)} files{' (skip content)' if skip_content else ''}, getting {sum_size} bytes, from {urls}")
+        # read the file content
+        async with unique_cursor() as cur:
+            fconn = FileConn(cur)
+            blobs: dict[str, bytes] = {}
+            for r in file_records:
+                if skip_content:
+                    blobs[r.url] = b''
+                    continue
+
+                if r.external:
+                    blob_iter = fconn.get_file_blob_external(r.file_id)
+                    blob = b''.join([chunk async for chunk in blob_iter])
+                else:
+                    blob = await fconn.get_file_blob(r.file_id)
+                blobs[r.url] = blob
+
+        return {url: blobs.get(url, None) for url in urls}
 
     async def delete_file(self, url: str, op_user: Optional[UserRecord] = None) -> Optional[FileRecord]:
         validate_url(url)
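
Note: `read_files_bulk` is new in this release. A hypothetical call site (the `db` and `user` objects are assumed):

```python
# blobs = await db.read_files_bulk(['alice/a.txt', 'alice/missing.txt'], op_user=user)
# blobs['alice/a.txt']       -> b'...'  (file content)
# blobs['alice/missing.txt'] -> None   (urls absent from the db map to None)
#
# skip_content=True maps every found url to b'', which makes the call a cheap
# existence-plus-permission probe: no blobs are loaded and the
# MAX_MEM_FILE_BYTES guard is not tripped.
```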
@@ -771,9 +934,9 @@ class Database:
                 raise PermissionDeniedError(f"Permission denied: {op_user.username} cannot delete file {url}")
             f_id = r.file_id
             if r.external:
-                await fconn.delete_file_blob_external(f_id)
+                await fconn.unlink_file_blob_external(f_id)
             else:
-                await fconn.delete_file_blob(f_id)
+                await fconn.unlink_file_blob(f_id)
             return r
 
     async def move_file(self, old_url: str, new_url: str, op_user: Optional[UserRecord] = None):
@@ -813,9 +976,9 @@ class Database:
                 raise PermissionDeniedError(f"Permission denied: {op_user.username} cannot copy file to {new_url}")
             await fconn.copy_file(old_url, new_url, user_id=op_user.id if op_user is not None else None)
 
-    async def move_path(self, old_url: str, new_url: str, op_user: UserRecord):
-        validate_url(old_url, is_file=False)
-        validate_url(new_url, is_file=False)
+    async def move_dir(self, old_url: str, new_url: str, op_user: UserRecord):
+        validate_url(old_url, 'dir')
+        validate_url(new_url, 'dir')
 
         if new_url.startswith('/'):
             new_url = new_url[1:]
@@ -834,12 +997,11 @@ class Database:
 
         async with transaction() as cur:
             fconn = FileConn(cur)
-            await fconn.move_path(old_url, new_url, op_user.id)
+            await fconn.move_dir(old_url, new_url, op_user.id)
 
-    # not tested
-    async def copy_path(self, old_url: str, new_url: str, op_user: UserRecord):
-        validate_url(old_url, is_file=False)
-        validate_url(new_url, is_file=False)
+    async def copy_dir(self, old_url: str, new_url: str, op_user: UserRecord):
+        validate_url(old_url, 'dir')
+        validate_url(new_url, 'dir')
 
         if new_url.startswith('/'):
             new_url = new_url[1:]
@@ -858,7 +1020,7 @@ class Database:
 
         async with transaction() as cur:
             fconn = FileConn(cur)
-            await fconn.copy_path(old_url, new_url, 'overwrite', op_user.id)
+            await fconn.copy_dir(old_url, new_url, op_user.id)
 
     async def __batch_delete_file_blobs(self, fconn: FileConn, file_records: list[FileRecord], batch_size: int = 512):
         # https://github.com/langchain-ai/langchain/issues/10321
@@ -872,19 +1034,20 @@ class Database:
 
         async def del_internal():
             for i in range(0, len(internal_ids), batch_size):
-                await fconn.delete_file_blobs([r for r in internal_ids[i:i+batch_size]])
+                await fconn.unlink_file_blobs([r for r in internal_ids[i:i+batch_size]])
         async def del_external():
-            for i in range(0, len(external_ids)):
-                await fconn.delete_file_blob_external(external_ids[i])
-        await asyncio.gather(del_internal(), del_external())
+            for i in range(0, len(external_ids), batch_size):
+                await fconn.unlink_file_blobs_external([r for r in external_ids[i:i+batch_size]])
+        await del_internal()
+        await del_external()
 
-    async def delete_path(self, url: str, op_user: Optional[UserRecord] = None) -> Optional[list[FileRecord]]:
-        validate_url(url, is_file=False)
+    async def delete_dir(self, url: str, op_user: Optional[UserRecord] = None) -> Optional[list[FileRecord]]:
+        validate_url(url, 'dir')
         from_owner_id = op_user.id if op_user is not None and not (op_user.is_admin or await check_path_permission(url, op_user) >= AccessLevel.WRITE) else None
 
         async with transaction() as cur:
             fconn = FileConn(cur)
-            records = await fconn.delete_path_records(url, from_owner_id)
+            records = await fconn.delete_records_by_prefix(url, from_owner_id)
             if not records:
                 return None
             await self.__batch_delete_file_blobs(fconn, records)
@@ -908,14 +1071,15 @@ class Database:
 
         # make sure the user's directory is deleted,
         # may contain admin's files, but delete them all
-        await fconn.delete_path_records(user.username + '/')
+        await fconn.delete_records_by_prefix(user.username + '/')
 
-    async def iter_path(self, top_url: str, urls: Optional[list[str]]) -> AsyncIterable[tuple[FileRecord, bytes | AsyncIterable[bytes]]]:
+    async def iter_dir(self, top_url: str, urls: Optional[list[str]]) -> AsyncIterable[tuple[FileRecord, bytes | AsyncIterable[bytes]]]:
+        validate_url(top_url, 'dir')
         async with unique_cursor() as cur:
             fconn = FileConn(cur)
             if urls is None:
-                fcount = await fconn.count_path_files(top_url, flat=True)
-                urls = [r.url for r in (await fconn.list_path_files(top_url, flat=True, limit=fcount))]
+                fcount = await fconn.count_dir_files(top_url, flat=True)
+                urls = [r.url for r in (await fconn.list_dir_files(top_url, flat=True, limit=fcount))]
 
         for url in urls:
             if not url.startswith(top_url):
@@ -929,14 +1093,50 @@ class Database:
             else:
                 blob = await fconn.get_file_blob(f_id)
             yield r, blob
+
+    async def zip_dir_stream(self, top_url: str, op_user: Optional[UserRecord] = None) -> AsyncIterable[bytes]:
+        from stat import S_IFREG
+        from stream_zip import async_stream_zip, ZIP_64
+        if top_url.startswith('/'):
+            top_url = top_url[1:]
+
+        if op_user:
+            if await check_path_permission(top_url, op_user) < AccessLevel.READ:
+                raise PermissionDeniedError(f"Permission denied: {op_user.username} cannot zip path {top_url}")
+
+        # https://stream-zip.docs.trade.gov.uk/async-interface/
+        async def data_iter():
+            async for (r, blob) in self.iter_dir(top_url, None):
+                rel_path = r.url[len(top_url):]
+                rel_path = decode_uri_compnents(rel_path)
+                b_iter: AsyncIterable[bytes]
+                if isinstance(blob, bytes):
+                    async def blob_iter(): yield blob
+                    b_iter = blob_iter() # type: ignore
+                else:
+                    assert isinstance(blob, AsyncIterable)
+                    b_iter = blob
+                yield (
+                    rel_path,
+                    datetime.datetime.now(),
+                    S_IFREG | 0o600,
+                    ZIP_64,
+                    b_iter
+                )
+        return async_stream_zip(data_iter())
 
     @concurrent_wrap()
-    async def zip_path(self, top_url: str, urls: Optional[list[str]]) -> io.BytesIO:
+    async def zip_dir(self, top_url: str, op_user: Optional[UserRecord]) -> io.BytesIO:
         if top_url.startswith('/'):
             top_url = top_url[1:]
+
+        if op_user:
+            if await check_path_permission(top_url, op_user) < AccessLevel.READ:
+                raise PermissionDeniedError(f"Permission denied: {op_user.username} cannot zip path {top_url}")
+
         buffer = io.BytesIO()
         with zipfile.ZipFile(buffer, 'w') as zf:
-            async for (r, blob) in self.iter_path(top_url, urls):
+            async for (r, blob) in self.iter_dir(top_url, None):
                 rel_path = r.url[len(top_url):]
                 rel_path = decode_uri_compnents(rel_path)
                 if r.external:
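
Note: `zip_dir_stream` is the streaming counterpart of `zip_dir`: it yields archive bytes as stream-zip produces them (ZIP64 entries, fixed `0o600` mode, current timestamp), so a large directory never has to fit in `zip_dir`'s in-memory `io.BytesIO` buffer. A hypothetical handler wiring it up (the route and names are ours, not from the package):

```python
from fastapi.responses import StreamingResponse

async def download_dir(db, top_url: str, user):
    stream = await db.zip_dir_stream(top_url, op_user=user)  # AsyncIterable[bytes]
    return StreamingResponse(stream, media_type='application/zip')
```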
@@ -948,39 +1148,50 @@ class Database:
         buffer.seek(0)
         return buffer
 
-def check_file_read_permission(user: UserRecord, owner: UserRecord, file: FileRecord) -> tuple[bool, str]:
+async def _get_path_owner(cur: aiosqlite.Cursor, path: str) -> UserRecord:
+    path_username = path.split('/')[0]
+    uconn = UserConn(cur)
+    path_user = await uconn.get_user(path_username)
+    if path_user is None:
+        raise InvalidPathError(f"Invalid path: {path_username} is not a valid username")
+    return path_user
+
+async def check_file_read_permission(user: UserRecord, file: FileRecord, cursor: Optional[aiosqlite.Cursor] = None) -> tuple[bool, str]:
     """
     This does not consider alias level permission,
     use check_path_permission for alias level permission check first:
     ```
-    if await check_path_permission(path, user) < AccessLevel.READ:
-        read_allowed, reason = check_file_read_permission(user, owner, file)
+    if await check_path_permission(file.url, user) < AccessLevel.READ:
+        read_allowed, reason = check_file_read_permission(user, file)
     ```
+    The implementation assumes the user is not admin and is not the owner of the file/path
     """
-    if user.is_admin:
-        return True, ""
+    @asynccontextmanager
+    async def this_cur():
+        if cursor is None:
+            async with unique_cursor() as _cur:
+                yield _cur
+        else:
+            yield cursor
+
+    f_perm = file.permission
+
+    # if file permission unset, use path owner's permission as fallback
+    if f_perm == FileReadPermission.UNSET:
+        async with this_cur() as cur:
+            path_owner = await _get_path_owner(cur, file.url)
+            f_perm = path_owner.permission
 
     # check permission of the file
-    if file.permission == FileReadPermission.PRIVATE:
-        if user.id != owner.id:
-            return False, "Permission denied, private file"
-    elif file.permission == FileReadPermission.PROTECTED:
+    if f_perm == FileReadPermission.PRIVATE:
+        return False, "Permission denied, private file"
+    elif f_perm == FileReadPermission.PROTECTED:
         if user.id == 0:
             return False, "Permission denied, protected file"
-    elif file.permission == FileReadPermission.PUBLIC:
+    elif f_perm == FileReadPermission.PUBLIC:
         return True, ""
     else:
-        assert file.permission == FileReadPermission.UNSET
-
-    # use owner's permission as fallback
-    if owner.permission == FileReadPermission.PRIVATE:
-        if user.id != owner.id:
-            return False, "Permission denied, private user file"
-    elif owner.permission == FileReadPermission.PROTECTED:
-        if user.id == 0:
-            return False, "Permission denied, protected user file"
-    else:
-        assert owner.permission == FileReadPermission.PUBLIC or owner.permission == FileReadPermission.UNSET
+        assert f_perm == FileReadPermission.UNSET
 
     return True, ""
 
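Note: `check_file_read_permission` no longer takes the owner record and no longer short-circuits for admins; it resolves an UNSET file permission by looking up the path owner itself (via `_get_path_owner`), and callers are expected to have handled admin/owner access through `check_path_permission` first. The resulting decision order, summarized:

```python
# 1. f_perm = file.permission, falling back to the path owner's permission when UNSET
# 2. PRIVATE   -> deny (owners/admins are assumed never to reach this function)
#    PROTECTED -> deny only guests (user.id == 0)
#    PUBLIC / UNSET -> allow
#
# intended call pattern, per the docstring (note the function is now async):
# if await check_path_permission(file.url, user) < AccessLevel.READ:
#     allowed, reason = await check_file_read_permission(user, file)
```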
@@ -991,9 +1202,6 @@ async def check_path_permission(path: str, user: UserRecord, cursor: Optional[ai
     If the path is a file, the user will have all access if the user is the owner.
     Otherwise, the user will have alias level access w.r.t. the path user.
     """
-    if user.id == 0:
-        return AccessLevel.GUEST
-
     @asynccontextmanager
     async def this_cur():
         if cursor is None:
@@ -1002,16 +1210,18 @@ async def check_path_permission(path: str, user: UserRecord, cursor: Optional[ai
         else:
             yield cursor
 
-    # check if path user exists
-    path_username = path.split('/')[0]
+    # check if path user exists, may raise exception
     async with this_cur() as cur:
-        uconn = UserConn(cur)
-        path_user = await uconn.get_user(path_username)
-        if path_user is None:
-            raise PathNotFoundError(f"Invalid path: {path_username} is not a valid username")
+        path_owner = await _get_path_owner(cur, path)
 
-    # check if user is admin
-    if user.is_admin or user.username == path_username:
+    if user.id == 0:
+        return AccessLevel.GUEST
+
+    if user.is_admin:
+        return AccessLevel.ALL
+
+    # check if user is admin or the owner of the path
+    if user.id == path_owner.id:
         return AccessLevel.ALL
 
     # if the path is a file, check if the user is the owner
@@ -1025,4 +1235,4 @@ async def check_path_permission(path: str, user: UserRecord, cursor: Optional[ai
     # check alias level
     async with this_cur() as cur:
         uconn = UserConn(cur)
-        return await uconn.query_peer_level(user.id, path_user.id)
+        return await uconn.query_peer_level(user.id, path_owner.id)