thds.core 0.0.1__py3-none-any.whl → 1.31.20250123022540__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of thds.core might be problematic.
Files changed (70)
  1. thds/core/__init__.py +48 -0
  2. thds/core/ansi_esc.py +46 -0
  3. thds/core/cache.py +201 -0
  4. thds/core/calgitver.py +82 -0
  5. thds/core/concurrency.py +100 -0
  6. thds/core/config.py +250 -0
  7. thds/core/decos.py +55 -0
  8. thds/core/dict_utils.py +188 -0
  9. thds/core/env.py +40 -0
  10. thds/core/exit_after.py +121 -0
  11. thds/core/files.py +125 -0
  12. thds/core/fretry.py +115 -0
  13. thds/core/generators.py +56 -0
  14. thds/core/git.py +81 -0
  15. thds/core/hash_cache.py +86 -0
  16. thds/core/hashing.py +106 -0
  17. thds/core/home.py +15 -0
  18. thds/core/hostname.py +10 -0
  19. thds/core/imports.py +17 -0
  20. thds/core/inspect.py +58 -0
  21. thds/core/iterators.py +9 -0
  22. thds/core/lazy.py +83 -0
  23. thds/core/link.py +153 -0
  24. thds/core/log/__init__.py +29 -0
  25. thds/core/log/basic_config.py +171 -0
  26. thds/core/log/json_formatter.py +43 -0
  27. thds/core/log/kw_formatter.py +84 -0
  28. thds/core/log/kw_logger.py +93 -0
  29. thds/core/log/logfmt.py +302 -0
  30. thds/core/merge_args.py +168 -0
  31. thds/core/meta.json +8 -0
  32. thds/core/meta.py +518 -0
  33. thds/core/parallel.py +200 -0
  34. thds/core/pickle_visit.py +24 -0
  35. thds/core/prof.py +276 -0
  36. thds/core/progress.py +112 -0
  37. thds/core/protocols.py +17 -0
  38. thds/core/py.typed +0 -0
  39. thds/core/scaling.py +39 -0
  40. thds/core/scope.py +199 -0
  41. thds/core/source.py +238 -0
  42. thds/core/source_serde.py +104 -0
  43. thds/core/sqlite/__init__.py +21 -0
  44. thds/core/sqlite/connect.py +33 -0
  45. thds/core/sqlite/copy.py +35 -0
  46. thds/core/sqlite/ddl.py +4 -0
  47. thds/core/sqlite/functions.py +63 -0
  48. thds/core/sqlite/index.py +22 -0
  49. thds/core/sqlite/insert_utils.py +23 -0
  50. thds/core/sqlite/merge.py +84 -0
  51. thds/core/sqlite/meta.py +190 -0
  52. thds/core/sqlite/read.py +66 -0
  53. thds/core/sqlite/sqlmap.py +179 -0
  54. thds/core/sqlite/structured.py +138 -0
  55. thds/core/sqlite/types.py +64 -0
  56. thds/core/sqlite/upsert.py +139 -0
  57. thds/core/sqlite/write.py +99 -0
  58. thds/core/stack_context.py +41 -0
  59. thds/core/thunks.py +40 -0
  60. thds/core/timer.py +214 -0
  61. thds/core/tmp.py +85 -0
  62. thds/core/types.py +4 -0
  63. thds.core-1.31.20250123022540.dist-info/METADATA +68 -0
  64. thds.core-1.31.20250123022540.dist-info/RECORD +67 -0
  65. {thds.core-0.0.1.dist-info → thds.core-1.31.20250123022540.dist-info}/WHEEL +1 -1
  66. thds.core-1.31.20250123022540.dist-info/entry_points.txt +4 -0
  67. thds.core-1.31.20250123022540.dist-info/top_level.txt +1 -0
  68. thds.core-0.0.1.dist-info/METADATA +0 -8
  69. thds.core-0.0.1.dist-info/RECORD +0 -4
  70. thds.core-0.0.1.dist-info/top_level.txt +0 -1
thds/core/sqlite/structured.py ADDED
@@ -0,0 +1,138 @@
+ import functools
+ import typing as ty
+ from dataclasses import dataclass
+ from sqlite3 import Connection, OperationalError
+
+ from thds.core import config
+ from thds.core.lazy import ThreadLocalLazy
+ from thds.core.log import getLogger
+ from thds.core.types import StrOrPath
+
+ from .connect import row_connect
+ from .meta import column_names, primary_key_cols
+ from .read import matching
+ from .types import T, TableSource
+
+ SQLITE_CACHE_SIZE = config.item("cache_size", 100_000)
+ MMAP_BYTES = config.item("mmap_bytes", 8_589_934_592)
+ _logger = getLogger(__name__)
+
+
+ @dataclass
+ class TableMeta:
+     """Things which can be derived and cached once we have established the first Connection."""
+
+     conn: Connection
+     name: str
+     pk_cols: ty.Set[str]
+     colnames: ty.Set[str]
+
+
+ DbPathAndTableName = ty.Tuple[StrOrPath, str]
+
+
+ class BadPrimaryKey(ValueError):
+     pass
+
+
+ class UnknownColumns(ValueError):
+     pass
+
+
+ class _Table(ty.Protocol):
+     def __call__(self, ignore_mmap_size: bool = False) -> TableMeta:
+         ...
+
+
+ class StructTable(ty.Generic[T]):
+     def __init__(
+         self,
+         from_item: ty.Callable[[ty.Mapping[str, ty.Any]], T],
+         table_meta: ty.Callable[[ty.Optional[int]], ThreadLocalLazy[TableMeta]],
+         cache_size: ty.Optional[int] = None,
+         mmap_size: int = -1,
+     ):
+         if cache_size is None:
+             cache_size = SQLITE_CACHE_SIZE()
+         if mmap_size < 0:
+             mmap_size = MMAP_BYTES()
+
+         self._tbl = table_meta(mmap_size)
+         self.from_item = from_item
+         if cache_size:
+             # Caching is only applied to `.list` and `.get`, as `.matching` returns a consumable iterator.
+             self.get = functools.lru_cache(cache_size)(self.get)  # type: ignore
+             self.list = functools.lru_cache(cache_size)(self.list)  # type: ignore
+
+     def matching(self, **where: ty.Any) -> ty.Iterator[T]:
+         tbl = self._tbl()
+         try:
+             for item in matching(tbl.name, tbl.conn, where):
+                 yield self.from_item(item)
+         except OperationalError as e:
+             if unknown_cols := (set(where) - tbl.colnames):
+                 raise UnknownColumns(f"Can't match on columns that don't exist: {unknown_cols}") from e
+             raise
+
+     def get(self, **primary_key: ty.Any) -> ty.Optional[T]:
+         """A primary key lookup. Returns None if there is no match.
+
+         Raises if there is more than one match.
+         """
+         tbl = self._tbl()
+         if set(primary_key) != tbl.pk_cols:
+             raise BadPrimaryKey(
+                 f"Primary key must be complete; expected {tbl.pk_cols} but got {primary_key}"
+             )
+
+         t_iter = self.matching(**primary_key)
+         first = next(t_iter, None)
+         if first is None:
+             return None
+         should_be_no_next = next(t_iter, None)
+         if should_be_no_next is not None:
+             raise BadPrimaryKey(f"More than one item found for supposed primary key {primary_key}")
+         return first
+
+     def list(self, **where: ty.Any) -> ty.List[T]:
+         """List all items in the table where key/column = value."""
+         return list(self.matching(**where))
+
+
+ def autometa_factory(
+     src: ty.Callable[[], DbPathAndTableName]
+ ) -> ty.Callable[[ty.Optional[int]], ThreadLocalLazy[TableMeta]]:
+     """Use this factory to defer the connection and other settings (e.g., mmap_size) within each thread."""
+
+     def _autometa(mmap_size: ty.Optional[int] = None) -> ThreadLocalLazy[TableMeta]:
+         def _get_table_meta() -> TableMeta:
+             db_path, table_name = src()
+             conn = row_connect(db_path)
+             pk_cols = set(primary_key_cols(table_name, conn))
+             if not pk_cols:
+                 raise BadPrimaryKey(f"Found no primary key cols for table {table_name}")
+             colnames = set(column_names(table_name, conn))
+             if not colnames:
+                 raise UnknownColumns(f"Found no columns for table {table_name}")
+
+             if mmap_size:
+                 _logger.info(f"Setting sqlite mmap size to {mmap_size}")
+                 conn.execute(f"PRAGMA mmap_size={mmap_size};")
+
+             return TableMeta(conn, table_name, pk_cols, colnames)
+
+         return ThreadLocalLazy(_get_table_meta)
+
+     return _autometa
+
+
+ def struct_table_from_source(
+     from_item: ty.Callable[[ty.Mapping[str, ty.Any]], T],
+     table_source: ty.Callable[[], TableSource],
+     **kwargs,
+ ) -> StructTable[T]:
+     def extract_path_and_name() -> DbPathAndTableName:
+         return str(table_source().db_src.path()), table_source().table_name
+
+     return StructTable(from_item, autometa_factory(extract_path_and_name), **kwargs)
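For orientation, a minimal usage sketch of the API added above. The `User` type, database path, and table are hypothetical, and the import path is assumed to follow the file layout listed at the top:

from dataclasses import dataclass

from thds.core.sqlite.structured import StructTable, autometa_factory


@dataclass(frozen=True)
class User:
    id: int
    name: str


# Hypothetical: any SQLite file containing a primary-keyed `users` table.
users = StructTable(
    lambda row: User(row["id"], row["name"]),
    autometa_factory(lambda: ("users.sqlite", "users")),
)

user = users.get(id=1)  # primary-key lookup; None if no match, raises on more than one
admins = users.list(name="admin")  # cached via lru_cache when cache_size > 0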
thds/core/sqlite/types.py ADDED
@@ -0,0 +1,64 @@
+ import os
+ import sqlite3
+ import typing as ty
+ from pathlib import Path
+
+ from thds.core.source import Source
+
+
+ class DbAndTableP(ty.Protocol):
+     @property  # read-only
+     def db_path(self) -> os.PathLike:
+         ...
+
+     @property  # read-only
+     def table_name(self) -> str:
+         ...
+
+
+ class DbAndTable(ty.NamedTuple):
+     db_path: Path
+     table_name: str
+
+
+ class TableSource(ty.NamedTuple):
+     db_src: Source
+     table_name: str
+
+     @property
+     def db_path(self) -> os.PathLike:
+         return self.db_src
+
+
+ AnyDbTableSrc = ty.Union[DbAndTableP, ty.Callable[[], DbAndTableP]]
+
+
+ def resolve_lazy_db_and_table(table_src: AnyDbTableSrc) -> DbAndTableP:
+     if hasattr(table_src, "table_name"):
+         src = ty.cast(DbAndTableP, table_src)
+     else:
+         src = table_src()  # type: ignore
+     assert hasattr(src, "table_name"), "table_name must be provided"
+     return src
+
+
+ class TableMaster(ty.NamedTuple):
+     """Element/asset table and its corresponding metadata table."""
+
+     table: TableSource
+     metadata: TableSource
+
+
+ T = ty.TypeVar("T")
+
+
+ def maybe_t(
+     to_t: ty.Callable[[ty.Mapping[str, ty.Any]], T],
+     row: ty.Optional[ty.Mapping[str, ty.Any]],
+ ) -> ty.Optional[T]:
+     if row:
+         return to_t(row)
+     return None
+
+
+ Connectable = ty.Union[os.PathLike, sqlite3.Connection]
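For illustration, a small sketch of `resolve_lazy_db_and_table` accepting either a concrete value or a zero-argument callable producing one. The path and table name are hypothetical:

from pathlib import Path

from thds.core.sqlite.types import DbAndTable, resolve_lazy_db_and_table

direct = DbAndTable(Path("assets.sqlite"), "assets")

# Either the value itself or a thunk that produces it is accepted:
assert resolve_lazy_db_and_table(direct).table_name == "assets"
assert resolve_lazy_db_and_table(lambda: direct).db_path == Path("assets.sqlite")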
thds/core/sqlite/upsert.py ADDED
@@ -0,0 +1,139 @@
+ import textwrap
+ import typing as ty
+ from functools import lru_cache
+ from sqlite3 import Connection
+
+ from thds.core import generators, log
+
+ from .meta import get_table_schema, primary_key_cols
+ from .read import matching_where
+ from .write import run_batch_and_isolate_failures
+
+ logger = log.getLogger(__name__)
+
+
+ def _make_upsert_writer(
+     conn: Connection,
+     table_name: str,
+     batch_size: int = 1000,
+     max_sql_stmt_cache_size: int = 1000,
+ ) -> ty.Generator[None, ty.Mapping[str, ty.Any], str]:
+     """Upserts in SQLite are a bit... under-featured. You simply cannot ask SQLite in a
+     generic way to write the key-value pairs you've provided for a row without also
+     overwriting any key-value pairs you didn't provide with whatever the default value
+     is (often NULL).
+
+     In fact, the docs normally suggest doing a SELECT first to see if the row exists...
+
+     We _tried_ using an ON CONFLICT ... DO UPDATE SET clause, but it turns out that
+     does not work in the circumstances described below. So we ended up with an approach
+     that basically embeds the SELECT (so that this can be done in pure SQL rather than
+     requiring Python logic to run for each row).
+
+     By doing it this way, we can batch any immediately-following rows that fit the exact
+     same set of keys to be written, and finally, we can commit all of the queries at the
+     end. This won't be as fast as a true bulk insert, since there's still meaningful
+     Python running for every row (converting dict keys into a tuple) and a check against
+     the previous keyset.
+
+     Your performance will be better if you are able to make sure that your iterator of
+     rows provides rows with the same keys in the same order in batches, so that we can do
+     as little SQL formatting as possible and execute larger batches with executemany.
+     """
+
+     primary_keys = primary_key_cols(table_name, conn)
+     where_matches_primary_keys = matching_where(primary_keys)
+     all_column_names = tuple(get_table_schema(conn, table_name).keys())
+     all_column_names_comma_str = ", ".join(all_column_names)
+
+     # https://stackoverflow.com/questions/418898/upsert-not-insert-or-replace/4330694#comment65393759_7511635
+     #
+     # The above is the approach I'm taking now that I know that SQLite will (sadly) enforce a
+     # not-null constraint _before_ it actually discovers that the row already exists and that
+     # the ON CONFLICT clause would end up doing a simple UPDATE to an existing row.
+     # This is more boilerplate-y and might be slower, too, because it requires a separate SELECT -
+     # but in theory the database had to do that SELECT in order to check the ON CONFLICT clause anyway,
+     # so maybe it's a wash?
+
+     @lru_cache(maxsize=max_sql_stmt_cache_size)
+     def make_upsert_query(colnames_for_partial_row: ty.Sequence[str]) -> str:
+         """Makes a query with placeholders which are:
+
+         - the values you provide for the row keys
+         - the primary keys themselves, which must be provided in the same order as they
+           are defined in the table schema.
+
+         Prefer sorting your row keys so that you get overlap here.
+         """
+         colnames_or_placeholders = list()
+         for col in all_column_names:
+             if col in colnames_for_partial_row:
+                 colnames_or_placeholders.append(f"@{col}")  # insert/update the provided value
+             else:
+                 colnames_or_placeholders.append(col)  # use the joined default value for an update
+
+         # Construct the SQL query for the batch
+         return textwrap.dedent(
+             f"""
+             INSERT OR REPLACE INTO {table_name} ({all_column_names_comma_str})
+             SELECT {", ".join(colnames_or_placeholders)}
+             FROM ( SELECT NULL )
+             LEFT JOIN (
+                 SELECT * FROM {table_name} {where_matches_primary_keys}
+             )
+             """
+         )
+
+     cursor = None
+     batch: ty.List[ty.Mapping[str, ty.Any]] = list()
+     query = ""
+     current_keyset: ty.Tuple[str, ...] = tuple()
+
+     try:
+         row = yield
+         cursor = conn.cursor()
+         # Don't create the cursor until we receive our first actual row.
+
+         while True:
+             keyset = tuple([col for col in all_column_names if col in row])
+             if keyset != current_keyset or len(batch) >= batch_size:
+                 # send current batch:
+                 run_batch_and_isolate_failures(cursor, query, batch)
+
+                 batch = list()
+                 query = make_upsert_query(keyset)
+                 current_keyset = keyset
+
+             batch.append(row)
+             row = yield
+
+     except GeneratorExit:
+         if not query:
+             # we never got any rows
+             logger.warning(f"No rows to upsert into table '{table_name}'")
+             return ""
+
+         # Insert any remaining data in the last batch
+         run_batch_and_isolate_failures(cursor, query, batch)
+         # Commit the changes to the database
+         conn.commit()
+         return table_name
+     finally:
+         if cursor:
+             cursor.close()
+
+
+ def mappings(
+     conn: Connection,
+     table_name: str,
+     rows: ty.Iterable[ty.Mapping[str, ty.Any]],
+     *,
+     batch_size: int = 1000,
+ ) -> None:
+     """Write rows to a table, upserting on the primary keys. Will not overwrite existing
+     values that are not contained within the provided mappings.
+
+     Note that core.sqlite.write.write_mappings is likely to be significantly faster than
+     this if your rows have homogeneous keys (e.g. if you're writing the full row for each
+     mapping), because this routine needs to generate a specific SQL statement for every
+     unique combination of keys it sees (and so needs to examine the keys for every row).
+     """
+     generators.iterator_sender(_make_upsert_writer(conn, table_name, batch_size=batch_size), rows)
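To make the partial-row behavior concrete, a minimal sketch; the in-memory table and its columns are hypothetical:

import sqlite3

from thds.core.sqlite.upsert import mappings

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE kv (k TEXT PRIMARY KEY, v TEXT, note TEXT)")
conn.execute("INSERT INTO kv VALUES ('a', 'old', 'keep me')")

# Row 'a' provides only `v`, so the existing `note` survives instead of being
# overwritten with NULL; row 'b' is inserted fresh.
mappings(conn, "kv", [{"k": "a", "v": "new"}, {"k": "b", "v": "fresh"}])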
thds/core/sqlite/write.py ADDED
@@ -0,0 +1,99 @@
+ import typing as ty
+ from sqlite3 import Connection
+
+ from thds.core import generators, log
+
+ logger = log.getLogger(__name__)
+
+
+ def run_batch_and_isolate_failures(cursor, query: str, batch: ty.List[ty.Any]):
+     if not batch:
+         return
+     assert cursor, "cursor must exist"
+     assert query, "query must be non-empty"
+     try:
+         cursor.executemany(query, batch)
+     except Exception:
+         if len(batch) >= 2:
+             # Bisect the batch recursively so we can pinpoint the failing row(s).
+             run_batch_and_isolate_failures(cursor, query, batch[: len(batch) // 2])
+             run_batch_and_isolate_failures(cursor, query, batch[len(batch) // 2 :])
+         else:
+             bad_data = ""
+             for i, col in enumerate(batch[0], 1):
+                 bad_data += f"COL {i}: {col}\n  {type(col)}\n"
+             logger.exception(
+                 f"Failed during insertion; ***QUERY***\n{query} \n***FAILED-DATA***\n{bad_data}"
+             )
+             raise
+
+
+ def make_mapping_writer(
+     conn: Connection,
+     table_name: str,
+     *,
+     batch_size: int = 1000,
+     replace: bool = False,
+ ) -> ty.Generator[None, ty.Mapping[str, ty.Any], str]:
+     """Return a generator that accepts mappings via send() and writes them in batches
+     to the database. The generator itself yields nothing, but will return the table_name
+     once it has been close()d.
+
+     The reason for having a generator do this rather than a function that consumes an
+     iterator is that the former is fundamentally more flexible - a 'power user' could
+     consume multiple iterators and write to multiple of these generator writers in
+     parallel without the need for threading.
+     """
+
+     def make_query(first_row) -> str:
+         columns = ",\n  ".join(first_row.keys())
+         # Create a list of placeholders
+         placeholders = ", ".join(["?"] * len(first_row))
+
+         rpl = "OR REPLACE" if replace else ""
+         # Construct the SQL query for the batch
+         return f"INSERT {rpl} INTO {table_name} (\n  {columns}\n) VALUES ({placeholders})"
+
+     query = ""
+     cursor = None
+     batch = list()
+
+     try:
+         while True:
+             row = yield
+             if not query:
+                 query = make_query(row)
+                 cursor = conn.cursor()
+
+             batch.append(tuple(row.values()))
+
+             if len(batch) >= batch_size:
+                 run_batch_and_isolate_failures(cursor, query, batch)
+                 batch = list()
+
+     except GeneratorExit:
+         if not query:
+             # we never got any rows
+             logger.warning(f"No rows to write into table '{table_name}'")
+             return ""
+
+         # Insert any remaining data in the last batch
+         run_batch_and_isolate_failures(cursor, query, batch)
+         # Commit the changes to the database
+         conn.commit()
+         return table_name
+     finally:
+         if cursor:
+             cursor.close()
+
+
+ def write_mappings(
+     conn: Connection,
+     table_name: str,
+     rows: ty.Iterable[ty.Mapping[str, ty.Any]],
+     *,
+     batch_size: int = 1000,
+     replace: bool = False,
+ ) -> None:
+     generators.iterator_sender(
+         make_mapping_writer(conn, table_name, batch_size=batch_size, replace=replace), rows
+     )
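For illustration, a minimal sketch of driving the writer generator by hand, which is exactly what `write_mappings` does via `generators.iterator_sender`; the table is hypothetical:

import sqlite3

from thds.core.sqlite.write import make_mapping_writer

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE points (x INTEGER, y INTEGER)")

writer = make_mapping_writer(conn, "points", batch_size=2)
next(writer)  # prime the generator so it can receive send()
for row in ({"x": 1, "y": 2}, {"x": 3, "y": 4}, {"x": 5, "y": 6}):
    writer.send(row)
writer.close()  # flushes the final partial batch and commits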
thds/core/stack_context.py ADDED
@@ -0,0 +1,41 @@
+ """Allows avoiding prop-drilling of contextual information.
+ Instead of threading an argument through many layers of functions,
+ create a global StackContext, and use a with statement to set its
+ value for everything below the current place on the stack.
+ Only affects your thread/green thread (works with async).
+ """
+ import contextlib as cl
+ import contextvars as cv
+ import typing as ty
+
+ T = ty.TypeVar("T")
+ F = ty.TypeVar("F", bound=ty.Callable)
+
+
+ @cl.contextmanager
+ def stack_context(contextvar: cv.ContextVar[T], value: T) -> ty.Iterator[T]:
+     token = contextvar.set(value)
+     try:
+         yield value
+     finally:
+         contextvar.reset(token)
+
+
+ class StackContext(ty.Generic[T]):
+     """A thin wrapper around a ContextVar that requires it to be set in a
+     stack-frame-limited manner.
+
+     These should only be created at a module/global level, just like the
+     underlying ContextVar.
+     """
+
+     def __init__(self, debug_name: str, default: T):
+         """The name passed in here is only for debugging purposes, as per the
+         documentation for ContextVar.
+         """
+         self._contextvar = cv.ContextVar(debug_name, default=default)
+
+     def set(self, value: T) -> ty.ContextManager[T]:
+         return stack_context(self._contextvar, value)
+
+     def __call__(self) -> T:
+         return self._contextvar.get()
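A minimal sketch of the intended usage; the names are illustrative:

from thds.core.stack_context import StackContext

REQUEST_ID = StackContext("request_id", default="<none>")


def deep_in_the_call_stack() -> str:
    return REQUEST_ID()  # no request_id parameter threaded through


def handle_request() -> str:
    with REQUEST_ID.set("req-123"):
        return deep_in_the_call_stack()  # -> "req-123"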
thds/core/thunks.py ADDED
@@ -0,0 +1,40 @@
+ import sys
+ import typing as ty
+ from dataclasses import dataclass
+
+ if sys.version_info >= (3, 10):
+     from typing import ParamSpec
+ else:
+     from typing_extensions import ParamSpec
+
+ P = ParamSpec("P")
+ R = ty.TypeVar("R")
+
+
+ @dataclass
+ class Thunk(ty.Generic[R]):
+     """Result-typed callable with arguments partially applied beforehand."""
+
+     func: ty.Callable
+     args: P.args
+     kwargs: P.kwargs
+
+     def __init__(self, func: ty.Callable[P, R], *args: P.args, **kwargs: P.kwargs):
+         self.func = func
+         self.args = args
+         self.kwargs = kwargs
+
+     def __call__(self) -> R:
+         return ty.cast(R, self.func(*self.args, **self.kwargs))
+
+
+ def thunking(func: ty.Callable[P, R]) -> ty.Callable[P, Thunk[R]]:
+     """Converts a standard function into a function that accepts the
+     exact same arguments but returns a Thunk - something ready to be
+     executed but whose execution is deferred.
+     """
+
+     def wrapper(*args: P.args, **kwargs: P.kwargs) -> Thunk[R]:
+         return Thunk(func, *args, **kwargs)
+
+     return wrapper
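A short sketch of `thunking` in use; the `add` function is hypothetical:

from thds.core.thunks import thunking


@thunking
def add(a: int, b: int) -> int:
    return a + b


pending = add(2, 3)  # nothing executes yet; `pending` is a Thunk[int]
assert pending() == 5  # execution happens here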