thds.core 0.0.1__py3-none-any.whl → 1.31.20250123022540__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of thds.core might be problematic.
Files changed (70)
  1. thds/core/__init__.py +48 -0
  2. thds/core/ansi_esc.py +46 -0
  3. thds/core/cache.py +201 -0
  4. thds/core/calgitver.py +82 -0
  5. thds/core/concurrency.py +100 -0
  6. thds/core/config.py +250 -0
  7. thds/core/decos.py +55 -0
  8. thds/core/dict_utils.py +188 -0
  9. thds/core/env.py +40 -0
  10. thds/core/exit_after.py +121 -0
  11. thds/core/files.py +125 -0
  12. thds/core/fretry.py +115 -0
  13. thds/core/generators.py +56 -0
  14. thds/core/git.py +81 -0
  15. thds/core/hash_cache.py +86 -0
  16. thds/core/hashing.py +106 -0
  17. thds/core/home.py +15 -0
  18. thds/core/hostname.py +10 -0
  19. thds/core/imports.py +17 -0
  20. thds/core/inspect.py +58 -0
  21. thds/core/iterators.py +9 -0
  22. thds/core/lazy.py +83 -0
  23. thds/core/link.py +153 -0
  24. thds/core/log/__init__.py +29 -0
  25. thds/core/log/basic_config.py +171 -0
  26. thds/core/log/json_formatter.py +43 -0
  27. thds/core/log/kw_formatter.py +84 -0
  28. thds/core/log/kw_logger.py +93 -0
  29. thds/core/log/logfmt.py +302 -0
  30. thds/core/merge_args.py +168 -0
  31. thds/core/meta.json +8 -0
  32. thds/core/meta.py +518 -0
  33. thds/core/parallel.py +200 -0
  34. thds/core/pickle_visit.py +24 -0
  35. thds/core/prof.py +276 -0
  36. thds/core/progress.py +112 -0
  37. thds/core/protocols.py +17 -0
  38. thds/core/py.typed +0 -0
  39. thds/core/scaling.py +39 -0
  40. thds/core/scope.py +199 -0
  41. thds/core/source.py +238 -0
  42. thds/core/source_serde.py +104 -0
  43. thds/core/sqlite/__init__.py +21 -0
  44. thds/core/sqlite/connect.py +33 -0
  45. thds/core/sqlite/copy.py +35 -0
  46. thds/core/sqlite/ddl.py +4 -0
  47. thds/core/sqlite/functions.py +63 -0
  48. thds/core/sqlite/index.py +22 -0
  49. thds/core/sqlite/insert_utils.py +23 -0
  50. thds/core/sqlite/merge.py +84 -0
  51. thds/core/sqlite/meta.py +190 -0
  52. thds/core/sqlite/read.py +66 -0
  53. thds/core/sqlite/sqlmap.py +179 -0
  54. thds/core/sqlite/structured.py +138 -0
  55. thds/core/sqlite/types.py +64 -0
  56. thds/core/sqlite/upsert.py +139 -0
  57. thds/core/sqlite/write.py +99 -0
  58. thds/core/stack_context.py +41 -0
  59. thds/core/thunks.py +40 -0
  60. thds/core/timer.py +214 -0
  61. thds/core/tmp.py +85 -0
  62. thds/core/types.py +4 -0
  63. thds.core-1.31.20250123022540.dist-info/METADATA +68 -0
  64. thds.core-1.31.20250123022540.dist-info/RECORD +67 -0
  65. {thds.core-0.0.1.dist-info → thds.core-1.31.20250123022540.dist-info}/WHEEL +1 -1
  66. thds.core-1.31.20250123022540.dist-info/entry_points.txt +4 -0
  67. thds.core-1.31.20250123022540.dist-info/top_level.txt +1 -0
  68. thds.core-0.0.1.dist-info/METADATA +0 -8
  69. thds.core-0.0.1.dist-info/RECORD +0 -4
  70. thds.core-0.0.1.dist-info/top_level.txt +0 -1
thds/core/sqlite/structured.py ADDED
@@ -0,0 +1,138 @@
+ import functools
+ import typing as ty
+ from dataclasses import dataclass
+ from sqlite3 import Connection, OperationalError
+
+ from thds.core import config
+ from thds.core.lazy import ThreadLocalLazy
+ from thds.core.log import getLogger
+ from thds.core.types import StrOrPath
+
+ from .connect import row_connect
+ from .meta import column_names, primary_key_cols
+ from .read import matching
+ from .types import T, TableSource
+
+ SQLITE_CACHE_SIZE = config.item("cache_size", 100_000)
+ MMAP_BYTES = config.item("mmap_bytes", 8_589_934_592)
+ _logger = getLogger(__name__)
+
+
+ @dataclass
+ class TableMeta:
+     """Things which can be derived and cached once we have established the first Connection."""
+
+     conn: Connection
+     name: str
+     pk_cols: ty.Set[str]
+     colnames: ty.Set[str]
+
+
+ DbPathAndTableName = ty.Tuple[StrOrPath, str]
+
+
+ class BadPrimaryKey(ValueError):
+     pass
+
+
+ class UnknownColumns(ValueError):
+     pass
+
+
+ class _Table(ty.Protocol):
+     def __call__(self, ignore_mmap_size: bool = False) -> TableMeta:
+         ...
+
+
+ class StructTable(ty.Generic[T]):
+     def __init__(
+         self,
+         from_item: ty.Callable[[ty.Mapping[str, ty.Any]], T],
+         table_meta: ty.Callable[[ty.Optional[int]], ThreadLocalLazy[TableMeta]],
+         cache_size: ty.Optional[int] = None,
+         mmap_size: int = -1,
+     ):
+         if cache_size is None:
+             cache_size = SQLITE_CACHE_SIZE()
+         if mmap_size < 0:
+             mmap_size = MMAP_BYTES()
+
+         self._tbl = table_meta(mmap_size)
+         self.from_item = from_item
+         if cache_size:
+             # Caching is only applied to `.list` and `.get`, as `.matching` returns a consumable iterator.
+             self.get = functools.lru_cache(cache_size)(self.get)  # type: ignore
+             self.list = functools.lru_cache(cache_size)(self.list)  # type: ignore
+
+     def matching(self, **where: ty.Any) -> ty.Iterator[T]:
+         tbl = self._tbl()
+         try:
+             for item in matching(tbl.name, tbl.conn, where):
+                 yield self.from_item(item)
+         except OperationalError as e:
+             if unknown_cols := (set(where) - tbl.colnames):
+                 raise UnknownColumns(f"Can't match on columns that don't exist: {unknown_cols}") from e
+             raise
+
+     def get(self, **primary_key: ty.Any) -> ty.Optional[T]:
+         """A primary key lookup. Returns None if there is no match.
+
+         Raises if there is more than one match.
+         """
+         tbl = self._tbl()
+         if set(primary_key) != tbl.pk_cols:
+             raise BadPrimaryKey(
+                 f"Primary key must be complete; expected {tbl.pk_cols} but got {primary_key}"
+             )
+
+         t_iter = self.matching(**primary_key)
+         first = next(t_iter, None)
+         if first is None:
+             return None
+         should_be_no_next = next(t_iter, None)
+         if should_be_no_next is not None:
+             raise BadPrimaryKey(f"More than one item found for supposed primary key {primary_key}")
+         return first
+
+     def list(self, **where: ty.Any) -> ty.List[T]:
+         """List all items in the table where key/column = value."""
+         return list(self.matching(**where))
+
+
+ def autometa_factory(
+     src: ty.Callable[[], DbPathAndTableName]
+ ) -> ty.Callable[[ty.Optional[int]], ThreadLocalLazy[TableMeta]]:
+     """Use this factory to defer the connection and other settings (e.g., mmap_size) within each thread."""
+
+     def _autometa(mmap_size: ty.Optional[int] = None) -> ThreadLocalLazy[TableMeta]:
+         def _get_table_meta() -> TableMeta:
+             db_path, table_name = src()
+             conn = row_connect(db_path)
+             pk_cols = set(primary_key_cols(table_name, conn))
+             if not pk_cols:
+                 raise BadPrimaryKey(f"Found no primary key cols for table {table_name}")
+             colnames = set(column_names(table_name, conn))
+             if not colnames:
+                 raise UnknownColumns(f"Found no columns for table {table_name}")
+
+             if mmap_size:
+                 _logger.info(f"Setting sqlite mmap size to {mmap_size}")
+                 conn.execute(f"PRAGMA mmap_size={mmap_size};")
+
+             return TableMeta(conn, table_name, pk_cols, colnames)
+
+         return ThreadLocalLazy(_get_table_meta)
+
+     return _autometa
+
+
+ def struct_table_from_source(
+     from_item: ty.Callable[[ty.Mapping[str, ty.Any]], T],
+     table_source: ty.Callable[[], TableSource],
+     **kwargs,
+ ) -> StructTable[T]:
+     def extract_path_and_name() -> DbPathAndTableName:
+         return str(table_source().db_src.path()), table_source().table_name
+
+     return StructTable(from_item, autometa_factory(extract_path_and_name), **kwargs)
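For orientation, a minimal usage sketch of the API added above. The `User` type, database path, and table are hypothetical, and the import path is assumed to follow the file layout listed at the top:

from dataclasses import dataclass

from thds.core.sqlite.structured import StructTable, autometa_factory


@dataclass(frozen=True)
class User:
    id: int
    name: str


# Hypothetical: any SQLite file containing a primary-keyed `users` table.
users = StructTable(
    lambda row: User(row["id"], row["name"]),
    autometa_factory(lambda: ("users.sqlite", "users")),
)

user = users.get(id=1)  # primary-key lookup; None if no match, raises on more than one
admins = users.list(name="admin")  # cached via lru_cache when cache_size > 0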
thds/core/sqlite/types.py ADDED
@@ -0,0 +1,64 @@
+ import os
+ import sqlite3
+ import typing as ty
+ from pathlib import Path
+
+ from thds.core.source import Source
+
+
+ class DbAndTableP(ty.Protocol):
+     @property  # read-only
+     def db_path(self) -> os.PathLike:
+         ...
+
+     @property  # read-only
+     def table_name(self) -> str:
+         ...
+
+
+ class DbAndTable(ty.NamedTuple):
+     db_path: Path
+     table_name: str
+
+
+ class TableSource(ty.NamedTuple):
+     db_src: Source
+     table_name: str
+
+     @property
+     def db_path(self) -> os.PathLike:
+         return self.db_src
+
+
+ AnyDbTableSrc = ty.Union[DbAndTableP, ty.Callable[[], DbAndTableP]]
+
+
+ def resolve_lazy_db_and_table(table_src: AnyDbTableSrc) -> DbAndTableP:
+     if hasattr(table_src, "table_name"):
+         src = ty.cast(DbAndTableP, table_src)
+     else:
+         src = table_src()  # type: ignore
+     assert hasattr(src, "table_name"), "table_name must be provided"
+     return src
+
+
+ class TableMaster(ty.NamedTuple):
+     """Element/asset table and its corresponding metadata table."""
+
+     table: TableSource
+     metadata: TableSource
+
+
+ T = ty.TypeVar("T")
+
+
+ def maybe_t(
+     to_t: ty.Callable[[ty.Mapping[str, ty.Any]], T],
+     row: ty.Optional[ty.Mapping[str, ty.Any]],
+ ) -> ty.Optional[T]:
+     if row:
+         return to_t(row)
+     return None
+
+
+ Connectable = ty.Union[os.PathLike, sqlite3.Connection]
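For illustration, a small sketch of `resolve_lazy_db_and_table` accepting either a concrete value or a zero-argument callable producing one. The path and table name are hypothetical:

from pathlib import Path

from thds.core.sqlite.types import DbAndTable, resolve_lazy_db_and_table

direct = DbAndTable(Path("assets.sqlite"), "assets")

# Either the value itself or a thunk that produces it is accepted:
assert resolve_lazy_db_and_table(direct).table_name == "assets"
assert resolve_lazy_db_and_table(lambda: direct).db_path == Path("assets.sqlite")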
thds/core/sqlite/upsert.py ADDED
@@ -0,0 +1,139 @@
+ import textwrap
+ import typing as ty
+ from functools import lru_cache
+ from sqlite3 import Connection
+
+ from thds.core import generators, log
+
+ from .meta import get_table_schema, primary_key_cols
+ from .read import matching_where
+ from .write import run_batch_and_isolate_failures
+
+ logger = log.getLogger(__name__)
+
+
+ def _make_upsert_writer(
+     conn: Connection,
+     table_name: str,
+     batch_size: int = 1000,
+     max_sql_stmt_cache_size: int = 1000,
+ ) -> ty.Generator[None, ty.Mapping[str, ty.Any], str]:
+     """Upserts in SQLite are a bit... under-featured. You simply cannot ask SQLite in a
+     generic way to write the key-value pairs you've provided for a row without also
+     overwriting any key-value pairs you didn't provide with whatever the default value
+     is (often NULL).
+
+     In fact, the docs normally suggest doing a SELECT first to see if the row exists...
+
+     We _tried_ using an ON CONFLICT ... DO UPDATE SET clause, but it turns out that
+     does not work in the circumstances described below. So we ended up with an approach
+     that basically embeds the SELECT (so that this can be done in pure SQL rather than
+     requiring Python logic to run for each row).
+
+     By doing it this way, we can batch any immediately-following rows that fit the exact
+     same set of keys to be written, and finally, we can commit all of the queries at the
+     end. This won't be as fast as a true bulk insert, since there's still meaningful
+     Python running for every row (converting dict keys into a tuple) and a check against
+     the previous keyset.
+
+     Your performance will be better if you are able to make sure that your iterator of
+     rows provides rows with the same keys in the same order in batches, so that we can do
+     as little SQL formatting as possible and execute larger batches with executemany.
+     """
+
+     primary_keys = primary_key_cols(table_name, conn)
+     where_matches_primary_keys = matching_where(primary_keys)
+     all_column_names = tuple(get_table_schema(conn, table_name).keys())
+     all_column_names_comma_str = ", ".join(all_column_names)
+
+     # https://stackoverflow.com/questions/418898/upsert-not-insert-or-replace/4330694#comment65393759_7511635
+     #
+     # The above is the approach I'm taking now that I know that SQLite will (sadly) enforce a
+     # not-null constraint _before_ it actually discovers that the row already exists and that
+     # the ON CONFLICT clause would end up doing a simple UPDATE to an existing row.
+     # This is more boilerplate-y and might be slower, too, because it requires a separate SELECT -
+     # but in theory the database had to do that SELECT in order to check the ON CONFLICT clause anyway,
+     # so maybe it's a wash?
+
+     @lru_cache(maxsize=max_sql_stmt_cache_size)
+     def make_upsert_query(colnames_for_partial_row: ty.Sequence[str]) -> str:
+         """Makes a query with placeholders which are:
+
+         - the values you provide for the row keys
+         - the primary keys themselves, which must be provided in the same order as they
+           are defined in the table schema.
+
+         Prefer sorting your row keys so that you get overlap here.
+         """
+         colnames_or_placeholders = list()
+         for col in all_column_names:
+             if col in colnames_for_partial_row:
+                 colnames_or_placeholders.append(f"@{col}")  # insert/update the provided value
+             else:
+                 colnames_or_placeholders.append(col)  # use the joined default value for an update
+
+         # Construct the SQL query for the batch
+         return textwrap.dedent(
+             f"""
+             INSERT OR REPLACE INTO {table_name} ({all_column_names_comma_str})
+             SELECT {", ".join(colnames_or_placeholders)}
+             FROM ( SELECT NULL )
+             LEFT JOIN (
+                 SELECT * FROM {table_name} {where_matches_primary_keys}
+             )
+             """
+         )
+
+     cursor = None
+     batch: ty.List[ty.Mapping[str, ty.Any]] = list()
+     query = ""
+     current_keyset: ty.Tuple[str, ...] = tuple()
+
+     try:
+         row = yield
+         cursor = conn.cursor()
+         # Don't create the cursor until we receive our first actual row.
+
+         while True:
+             keyset = tuple([col for col in all_column_names if col in row])
+             if keyset != current_keyset or len(batch) >= batch_size:
+                 # send current batch:
+                 run_batch_and_isolate_failures(cursor, query, batch)
+
+                 batch = list()
+                 query = make_upsert_query(keyset)
+                 current_keyset = keyset
+
+             batch.append(row)
+             row = yield
+
+     except GeneratorExit:
+         if not query:
+             # we never got any rows
+             logger.warning(f"No rows to upsert into table '{table_name}'")
+             return ""
+
+         # Insert any remaining data in the last batch
+         run_batch_and_isolate_failures(cursor, query, batch)
+         # Commit the changes to the database
+         conn.commit()
+         return table_name
+     finally:
+         if cursor:
+             cursor.close()
+
+
+ def mappings(
+     conn: Connection,
+     table_name: str,
+     rows: ty.Iterable[ty.Mapping[str, ty.Any]],
+     *,
+     batch_size: int = 1000,
+ ) -> None:
+     """Write rows to a table, upserting on the primary keys. Will not overwrite existing
+     values that are not contained within the provided mappings.
+
+     Note that core.sqlite.write.write_mappings is likely to be significantly faster than
+     this if your rows have homogeneous keys (e.g. if you're writing the full row for each
+     mapping), because this routine needs to generate a specific SQL statement for every
+     unique combination of keys it sees (and so needs to examine the keys for every row).
+     """
+     generators.iterator_sender(_make_upsert_writer(conn, table_name, batch_size=batch_size), rows)
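To make the partial-row behavior concrete, a minimal sketch; the in-memory table and its columns are hypothetical:

import sqlite3

from thds.core.sqlite.upsert import mappings

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE kv (k TEXT PRIMARY KEY, v TEXT, note TEXT)")
conn.execute("INSERT INTO kv VALUES ('a', 'old', 'keep me')")

# Row 'a' provides only `v`, so the existing `note` survives instead of being
# overwritten with NULL; row 'b' is inserted fresh.
mappings(conn, "kv", [{"k": "a", "v": "new"}, {"k": "b", "v": "fresh"}])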
thds/core/sqlite/write.py ADDED
@@ -0,0 +1,99 @@
+ import typing as ty
+ from sqlite3 import Connection
+
+ from thds.core import generators, log
+
+ logger = log.getLogger(__name__)
+
+
+ def run_batch_and_isolate_failures(cursor, query: str, batch: ty.List[ty.Any]):
+     if not batch:
+         return
+     assert cursor, "cursor must exist"
+     assert query, "query must be non-empty"
+     try:
+         cursor.executemany(query, batch)
+     except Exception:
+         if len(batch) >= 2:
+             # Bisect the batch recursively so we can pinpoint the failing row(s).
+             run_batch_and_isolate_failures(cursor, query, batch[: len(batch) // 2])
+             run_batch_and_isolate_failures(cursor, query, batch[len(batch) // 2 :])
+         else:
+             bad_data = ""
+             for i, col in enumerate(batch[0], 1):
+                 bad_data += f"COL {i}: {col}\n  {type(col)}\n"
+             logger.exception(
+                 f"Failed during insertion; ***QUERY***\n{query} \n***FAILED-DATA***\n{bad_data}"
+             )
+             raise
+
+
+ def make_mapping_writer(
+     conn: Connection,
+     table_name: str,
+     *,
+     batch_size: int = 1000,
+     replace: bool = False,
+ ) -> ty.Generator[None, ty.Mapping[str, ty.Any], str]:
+     """Return a generator that accepts mappings via send() and writes them in batches
+     to the database. The generator itself yields nothing, but will return the table_name
+     once it has been close()d.
+
+     The reason for having a generator do this rather than a function that consumes an
+     iterator is that the former is fundamentally more flexible - a 'power user' could
+     consume multiple iterators and write to multiple of these generator writers in
+     parallel without the need for threading.
+     """
+
+     def make_query(first_row) -> str:
+         columns = ",\n  ".join(first_row.keys())
+         # Create a list of placeholders
+         placeholders = ", ".join(["?"] * len(first_row))
+
+         rpl = "OR REPLACE" if replace else ""
+         # Construct the SQL query for the batch
+         return f"INSERT {rpl} INTO {table_name} (\n  {columns}\n) VALUES ({placeholders})"
+
+     query = ""
+     cursor = None
+     batch = list()
+
+     try:
+         while True:
+             row = yield
+             if not query:
+                 query = make_query(row)
+                 cursor = conn.cursor()
+
+             batch.append(tuple(row.values()))
+
+             if len(batch) >= batch_size:
+                 run_batch_and_isolate_failures(cursor, query, batch)
+                 batch = list()
+
+     except GeneratorExit:
+         if not query:
+             # we never got any rows
+             logger.warning(f"No rows to write into table '{table_name}'")
+             return ""
+
+         # Insert any remaining data in the last batch
+         run_batch_and_isolate_failures(cursor, query, batch)
+         # Commit the changes to the database
+         conn.commit()
+         return table_name
+     finally:
+         if cursor:
+             cursor.close()
+
+
+ def write_mappings(
+     conn: Connection,
+     table_name: str,
+     rows: ty.Iterable[ty.Mapping[str, ty.Any]],
+     *,
+     batch_size: int = 1000,
+     replace: bool = False,
+ ) -> None:
+     generators.iterator_sender(
+         make_mapping_writer(conn, table_name, batch_size=batch_size, replace=replace), rows
+     )
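For illustration, a minimal sketch of driving the writer generator by hand, which is exactly what `write_mappings` does via `generators.iterator_sender`; the table is hypothetical:

import sqlite3

from thds.core.sqlite.write import make_mapping_writer

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE points (x INTEGER, y INTEGER)")

writer = make_mapping_writer(conn, "points", batch_size=2)
next(writer)  # prime the generator so it can receive send()
for row in ({"x": 1, "y": 2}, {"x": 3, "y": 4}, {"x": 5, "y": 6}):
    writer.send(row)
writer.close()  # flushes the final partial batch and commits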
thds/core/stack_context.py ADDED
@@ -0,0 +1,41 @@
+ """Allows avoiding prop-drilling of contextual information.
+ Instead of threading an argument through many layers of functions,
+ create a global StackContext, and use a with statement to set its
+ value for everything below the current place on the stack.
+ Only affects your thread/green thread (works with async).
+ """
+ import contextlib as cl
+ import contextvars as cv
+ import typing as ty
+
+ T = ty.TypeVar("T")
+ F = ty.TypeVar("F", bound=ty.Callable)
+
+
+ @cl.contextmanager
+ def stack_context(contextvar: cv.ContextVar[T], value: T) -> ty.Iterator[T]:
+     token = contextvar.set(value)
+     try:
+         yield value
+     finally:
+         contextvar.reset(token)
+
+
+ class StackContext(ty.Generic[T]):
+     """A thin wrapper around a ContextVar that requires it to be set in a
+     stack-frame-limited manner.
+
+     These should only be created at a module/global level, just like the
+     underlying ContextVar.
+     """
+
+     def __init__(self, debug_name: str, default: T):
+         """The name passed in here is only for debugging purposes, as per the
+         documentation for ContextVar.
+         """
+         self._contextvar = cv.ContextVar(debug_name, default=default)
+
+     def set(self, value: T) -> ty.ContextManager[T]:
+         return stack_context(self._contextvar, value)
+
+     def __call__(self) -> T:
+         return self._contextvar.get()
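A minimal sketch of the intended usage; the names are illustrative:

from thds.core.stack_context import StackContext

REQUEST_ID = StackContext("request_id", default="<none>")


def deep_in_the_call_stack() -> str:
    return REQUEST_ID()  # no request_id parameter threaded through


def handle_request() -> str:
    with REQUEST_ID.set("req-123"):
        return deep_in_the_call_stack()  # -> "req-123"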
thds/core/thunks.py ADDED
@@ -0,0 +1,40 @@
+ import sys
+ import typing as ty
+ from dataclasses import dataclass
+
+ if sys.version_info >= (3, 10):
+     from typing import ParamSpec
+ else:
+     from typing_extensions import ParamSpec
+
+ P = ParamSpec("P")
+ R = ty.TypeVar("R")
+
+
+ @dataclass
+ class Thunk(ty.Generic[R]):
+     """Result-typed callable with arguments partially applied beforehand."""
+
+     func: ty.Callable
+     args: P.args
+     kwargs: P.kwargs
+
+     def __init__(self, func: ty.Callable[P, R], *args: P.args, **kwargs: P.kwargs):
+         self.func = func
+         self.args = args
+         self.kwargs = kwargs
+
+     def __call__(self) -> R:
+         return ty.cast(R, self.func(*self.args, **self.kwargs))
+
+
+ def thunking(func: ty.Callable[P, R]) -> ty.Callable[P, Thunk[R]]:
+     """Converts a standard function into a function that accepts the
+     exact same arguments but returns a Thunk - something ready to be
+     executed but whose execution is deferred.
+     """
+
+     def wrapper(*args: P.args, **kwargs: P.kwargs) -> Thunk[R]:
+         return Thunk(func, *args, **kwargs)
+
+     return wrapper
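A short sketch of `thunking` in use; the `add` function is hypothetical:

from thds.core.thunks import thunking


@thunking
def add(a: int, b: int) -> int:
    return a + b


pending = add(2, 3)  # nothing executes yet; `pending` is a Thunk[int]
assert pending() == 5  # execution happens here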