thds.core 0.0.1__py3-none-any.whl → 1.31.20250123022540__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of thds.core might be problematic. Click here for more details.

Files changed (70)
  1. thds/core/__init__.py +48 -0
  2. thds/core/ansi_esc.py +46 -0
  3. thds/core/cache.py +201 -0
  4. thds/core/calgitver.py +82 -0
  5. thds/core/concurrency.py +100 -0
  6. thds/core/config.py +250 -0
  7. thds/core/decos.py +55 -0
  8. thds/core/dict_utils.py +188 -0
  9. thds/core/env.py +40 -0
  10. thds/core/exit_after.py +121 -0
  11. thds/core/files.py +125 -0
  12. thds/core/fretry.py +115 -0
  13. thds/core/generators.py +56 -0
  14. thds/core/git.py +81 -0
  15. thds/core/hash_cache.py +86 -0
  16. thds/core/hashing.py +106 -0
  17. thds/core/home.py +15 -0
  18. thds/core/hostname.py +10 -0
  19. thds/core/imports.py +17 -0
  20. thds/core/inspect.py +58 -0
  21. thds/core/iterators.py +9 -0
  22. thds/core/lazy.py +83 -0
  23. thds/core/link.py +153 -0
  24. thds/core/log/__init__.py +29 -0
  25. thds/core/log/basic_config.py +171 -0
  26. thds/core/log/json_formatter.py +43 -0
  27. thds/core/log/kw_formatter.py +84 -0
  28. thds/core/log/kw_logger.py +93 -0
  29. thds/core/log/logfmt.py +302 -0
  30. thds/core/merge_args.py +168 -0
  31. thds/core/meta.json +8 -0
  32. thds/core/meta.py +518 -0
  33. thds/core/parallel.py +200 -0
  34. thds/core/pickle_visit.py +24 -0
  35. thds/core/prof.py +276 -0
  36. thds/core/progress.py +112 -0
  37. thds/core/protocols.py +17 -0
  38. thds/core/py.typed +0 -0
  39. thds/core/scaling.py +39 -0
  40. thds/core/scope.py +199 -0
  41. thds/core/source.py +238 -0
  42. thds/core/source_serde.py +104 -0
  43. thds/core/sqlite/__init__.py +21 -0
  44. thds/core/sqlite/connect.py +33 -0
  45. thds/core/sqlite/copy.py +35 -0
  46. thds/core/sqlite/ddl.py +4 -0
  47. thds/core/sqlite/functions.py +63 -0
  48. thds/core/sqlite/index.py +22 -0
  49. thds/core/sqlite/insert_utils.py +23 -0
  50. thds/core/sqlite/merge.py +84 -0
  51. thds/core/sqlite/meta.py +190 -0
  52. thds/core/sqlite/read.py +66 -0
  53. thds/core/sqlite/sqlmap.py +179 -0
  54. thds/core/sqlite/structured.py +138 -0
  55. thds/core/sqlite/types.py +64 -0
  56. thds/core/sqlite/upsert.py +139 -0
  57. thds/core/sqlite/write.py +99 -0
  58. thds/core/stack_context.py +41 -0
  59. thds/core/thunks.py +40 -0
  60. thds/core/timer.py +214 -0
  61. thds/core/tmp.py +85 -0
  62. thds/core/types.py +4 -0
  63. thds.core-1.31.20250123022540.dist-info/METADATA +68 -0
  64. thds.core-1.31.20250123022540.dist-info/RECORD +67 -0
  65. {thds.core-0.0.1.dist-info → thds.core-1.31.20250123022540.dist-info}/WHEEL +1 -1
  66. thds.core-1.31.20250123022540.dist-info/entry_points.txt +4 -0
  67. thds.core-1.31.20250123022540.dist-info/top_level.txt +1 -0
  68. thds.core-0.0.1.dist-info/METADATA +0 -8
  69. thds.core-0.0.1.dist-info/RECORD +0 -4
  70. thds.core-0.0.1.dist-info/top_level.txt +0 -1
thds/core/scope.py ADDED
@@ -0,0 +1,199 @@
1
+ """This allows the usage of ContextManagers that cover the entire body
2
+ of a function without requiring invasive (and git-diff-increasing)
3
+ "with" statements.
4
+
5
+ Another way of looking at this is that it is essentially a
6
+ decorator-driven `defer` in Go or `scope` in D. However, the semantics
7
+ are slightly different in that the scope is not necessarily at the
8
+ nearest function call boundary - we use Python's dynamic capabilities
9
+ to look 'up' the stack until we find a scope that is usable, and then
10
+ we embed the ContextManager in that scope.
11
+
12
+ Generally, the usage will look something like this:
13
+
14
+ ```
15
+ @scope.bound # wrap a function with a scope that will exit when it returns
16
+ def do_stuff(...):
17
+ foo = scope.enter(a_context_manager(...)) # enters the context manager via the nearest scope
18
+ # ...do some stuff
19
+ return bar
20
+ # context manager exits when nearest scope exits, which is right after function return.
21
+ ```
22
+
23
+ where the traditional alternative would be:
24
+
25
+ ```
26
+ def do_stuff(...):
27
+ with a_context_manager(...) as foo:
28
+ # ...do the same stuff
29
+ # ...but now your git diff is huge b/c you indented everything,
30
+ return bar
31
+ # context manager exits after `with` block closes, which is when the function returns.
32
+ ```
33
+
34
+ Because we use ContextVar to perform the lookups of the nearest scope,
35
+ this is actually pretty performant. You do pay the cost of the
36
+ wrapping function, which is higher than a `with` statement.
37
+
38
+ """
39
+
40
+ import atexit
41
+ import contextlib
42
+ import inspect
43
+ import sys
44
+ import typing as ty
45
+ from functools import wraps
46
+ from logging import getLogger
47
+ from uuid import uuid4
48
+
49
+ from .inspect import get_caller_info
50
+ from .stack_context import StackContext
51
+
52
# Registry of every Scope's StackContext, keyed by the Scope's unique key.
# All non-nil ExitStacks will be closed at application exit.
_KEYED_SCOPE_CONTEXTS: ty.Dict[str, StackContext[contextlib.ExitStack]] = dict()


def _close_root_scopes_atexit():
    """Best-effort close of every registered root ExitStack at interpreter exit."""
    for name, stack_context in _KEYED_SCOPE_CONTEXTS.items():
        exit_stack = stack_context()
        if not exit_stack:
            continue
        try:
            exit_stack.close()
        except ValueError as ve:
            # stderr, not logging - the logging machinery may already be torn down.
            print(f"Unable to close scope '{name}' at exit because {ve}", file=sys.stderr)


atexit.register(_close_root_scopes_atexit)
67
+
68
+
69
def _init_sc(key: str, val: contextlib.ExitStack):
    """Register the root StackContext for a scope key.

    This should only ever be called at the root of/during import of a
    module. It is _not_ threadsafe.
    """
    # Normally a StackContext should only be created as a module global;
    # stashing it in a module-global dict keyed by scope name is equivalent.
    already_registered = key in _KEYED_SCOPE_CONTEXTS
    if already_registered:
        getLogger(__name__).warning(
            f"Scope {key} already exists! If this is not importlib.reload, you have a problem."
        )
    _KEYED_SCOPE_CONTEXTS[key] = StackContext(key, val)
81
+
82
+
83
# TypeVar preserving the wrapped callable's type through the decorator.
F = ty.TypeVar("F", bound=ty.Callable)


def _bound(key: str, func: F) -> F:
    """A decorator that establishes a scope boundary for context managers
    that can now be `enter`ed, and will then be exited when this
    boundary is returned to.

    Four wrapper flavors are needed so that the scope's ExitStack closes at
    the correct time for sync functions, coroutines, generators, and async
    generators alike.
    """
    if inspect.isgeneratorfunction(func):

        @wraps(func)
        def __scope_boundary_generator_wrap(*args, **kwargs):
            # Lazily create the root StackContext on first use of this key.
            if key not in _KEYED_SCOPE_CONTEXTS:
                _init_sc(key, contextlib.ExitStack())  # this root stack will probably not get used

            # Push a fresh ExitStack for this invocation; anything `enter`ed
            # below this frame lands on it and is closed when we unwind.
            with _KEYED_SCOPE_CONTEXTS[key].set(contextlib.ExitStack()) as scoped_exit_stack:
                with scoped_exit_stack:  # enter and exit the ExitStack itself
                    # `yield from` delegates to the wrapped generator and
                    # captures its return value (StopIteration.value).
                    ret = yield from func(*args, **kwargs)
                    return ret  # weird syntax here, Python...

        return ty.cast(F, __scope_boundary_generator_wrap)

    if inspect.isasyncgenfunction(func):

        @wraps(func)
        async def __scope_boundary_async_generator_wrap(*args, **kwargs):
            if key not in _KEYED_SCOPE_CONTEXTS:
                _init_sc(key, contextlib.ExitStack())

            with _KEYED_SCOPE_CONTEXTS[key].set(contextlib.ExitStack()) as scoped_exit_stack:
                with scoped_exit_stack:
                    # Async generators cannot `yield from`; re-yield each item.
                    async for ret in func(*args, **kwargs):
                        yield ret

        return ty.cast(F, __scope_boundary_async_generator_wrap)

    if inspect.iscoroutinefunction(func):

        @wraps(func)
        async def __scope_boundary_coroutine_wrap(*args, **kwargs):
            if key not in _KEYED_SCOPE_CONTEXTS:
                _init_sc(key, contextlib.ExitStack())

            with _KEYED_SCOPE_CONTEXTS[key].set(contextlib.ExitStack()) as scoped_exit_stack:
                with scoped_exit_stack:
                    return await func(*args, **kwargs)

        return ty.cast(F, __scope_boundary_coroutine_wrap)

    # Plain synchronous function - the common case.
    @wraps(func)
    def __scope_boundary_wrap(*args, **kwargs):
        if key not in _KEYED_SCOPE_CONTEXTS:
            _init_sc(key, contextlib.ExitStack())  # this root stack will probably not get used

        with _KEYED_SCOPE_CONTEXTS[key].set(contextlib.ExitStack()) as scoped_exit_stack:
            with scoped_exit_stack:  # enter and exit the ExitStack itself
                return func(*args, **kwargs)

    return ty.cast(F, __scope_boundary_wrap)
142
+
143
+
144
class NoScopeFound(Exception):
    """Raised when `enter` is called for a key that has no registered scope."""


# TypeVar for the value produced by the entered ContextManager.
M = ty.TypeVar("M")


def _enter(key: str, context: ty.ContextManager[M]) -> M:
    """Call this to enter a ContextManager which will be exited at the
    nearest scope boundary, without needing a with statement.
    """
    # Efficient: no stack-walking - the stack-following ContextVar set up by
    # the boundary wrapper already points at the nearest ExitStack.
    scope_context = _KEYED_SCOPE_CONTEXTS.get(key)
    if not scope_context:
        raise NoScopeFound(f"No scope with the key {key} was found.")
    return scope_context().enter_context(context)
161
+
162
+
163
class Scope:
    """A named, orthogonal scope for deferred ContextManager exits.

    Creating your own Scope isn't often necessary - usually the module-level
    default Scope (created below) is all you need.

    A dedicated Scope is useful when you need orthogonal scopes: contexts
    entered further down the stack that must exit at one precise boundary
    further up, independent of any other scope.

    The key (optional) is combined with the calling module's name; if the
    resulting key was already registered in this application, a warning is
    logged. These should be module-level/global objects under all
    circumstances, as they share an internal global namespace.
    """

    def __init__(self, key: str = ""):
        # Derive a globally-unique key from the caller's module plus either
        # the supplied key or a random suffix.
        caller_info = get_caller_info(skip=1)
        suffix = key or uuid4().hex
        self.key = caller_info.module + "+" + suffix
        _init_sc(self.key, contextlib.ExitStack())  # add root boundary

    def bound(self, func: F) -> F:
        """Decorate `func` with a boundary; Contexts entered below it are
        closed when `func` exits.
        """
        return _bound(self.key, func)

    def enter(self, context: ty.ContextManager[M]) -> M:
        """Enter `context`, deferring its exit to the nearest boundary of this Scope."""
        return _enter(self.key, context)
195
+
196
+
197
# The module-level default Scope; `scope.bound` and `scope.enter` are the
# usual entry points for ordinary usage.
default = Scope("__default_scope_stack")
bound = default.bound
enter = default.enter
thds/core/source.py ADDED
@@ -0,0 +1,238 @@
1
+ """Wrap openable, read-only data that is either locally-present or downloadable,
2
+
3
+ yet will not be downloaded (if non-local) until it is actually opened or unwrapped.
4
+ """
5
+
6
+ import os
7
+ import typing as ty
8
+ from dataclasses import dataclass
9
+ from functools import partial
10
+ from pathlib import Path
11
+
12
+ from . import log
13
+ from .files import is_file_uri, path_from_uri, to_uri
14
+ from .hash_cache import filehash
15
+ from .hashing import Hash
16
+ from .types import StrOrPath
17
+
18
+
19
# Structural (duck-typed) interface: any callable matching __call__ qualifies.
class Downloader(ty.Protocol):
    def __call__(self, hash: ty.Optional[Hash]) -> Path:
        """Closure over a URI that downloads a file to a local path and returns the path.
        The file may be placed anywhere as long as the file will be readable until the
        program exits.

        If the URI points to a missing file, this MUST raise any Exception that the
        underlying implementation desires. It MUST NOT return a Path pointing to a
        non-existent file.

        The Hash may be used to short-circuit a download that would result in downloading
        a file that does not match the expected hash, but the Downloader need not verify
        the Hash of the file downloaded after the fact, as that will be performed by
        default by the Source object.
        """
34
+
35
+
36
# Factory protocol: inspects a URI and either claims it (returns a Downloader)
# or declines it (returns None), letting the registry try the next handler.
class DownloadHandler(ty.Protocol):
    def __call__(self, uri: str) -> ty.Optional[Downloader]:
        """Returns a Downloader containing the URI if this URI can be handled. Returns
        None if this URI cannot be handled.
        """
41
+
42
+
43
def _LocalFileHandler(uri: str) -> ty.Optional[Downloader]:
    """DownloadHandler for file:// URIs; returns None for any other scheme."""
    if not is_file_uri(uri):
        return None

    def download_file(hash: ty.Optional[Hash]) -> Path:
        # 'Downloading' a local file is just resolving and validating it.
        local_path = path_from_uri(uri)
        if not local_path.exists():
            raise FileNotFoundError(local_path)
        if hash:
            _check_hash(hash, local_path)
        return local_path

    return download_file
56
+
57
+
58
def register_download_handler(key: str, handler: DownloadHandler):
    """Register (or replace) the DownloadHandler stored under `key`.

    The key is not currently used for anything other than avoiding having
    duplicates registered for whatever reason.
    """
    _DOWNLOAD_HANDLERS[key] = handler


_DOWNLOAD_HANDLERS: ty.Dict[str, DownloadHandler] = dict()
register_download_handler("local_file", _LocalFileHandler)
66
+
67
+
68
def _get_download_handler(uri: str) -> Downloader:
    """Return the first registered Downloader that accepts `uri`, or raise ValueError."""
    for handler in _DOWNLOAD_HANDLERS.values():
        downloader = handler(uri)
        if downloader:
            return downloader
    raise ValueError(f"No SourcePath download handler for uri: {uri}")
73
+
74
+
75
class SourceHashMismatchError(ValueError):
    """Raised when a file's computed hash differs from the Hash it was expected to have."""


def _check_hash(expected_hash: ty.Optional[Hash], path: Path) -> Hash:
    """Hash `path` and return the computed Hash.

    When `expected_hash` is provided, its algorithm is used and a mismatch
    raises SourceHashMismatchError; otherwise sha256 is computed unverified.
    """
    algo = expected_hash.algo if expected_hash else "sha256"
    with log.logger_context(hash_for=f"source-{algo}"):
        computed = filehash(algo, path)
        mismatch = expected_hash is not None and expected_hash != computed
        if mismatch:
            raise SourceHashMismatchError(
                f"{expected_hash.algo} mismatch for {path};"
                f" got {computed.bytes!r}, expected {expected_hash.bytes!r}"
            )
        return computed
89
+
90
+
91
@dataclass(frozen=True)
class Source(os.PathLike):
    """Source is meant to be a consistent in-memory representation for an abstract,
    **read-only** source of data that may not be present locally when an application
    starts.

    A Source uses `os.PathLike` (`__fspath__`) to support transparent `open(src)` calls,
    so in many cases it will be a drop-in replacement for Path or str filenames. If you
    need an actual Path object, you can call `path()` to get one, but you should prefer to
    defer this until the actual location of use.

    By 'wrapping' read-only data in these objects, we can unify the code around how we
    unwrap and use them, which should allow us to more easily support different execution
    environments and sources of data.

    For instance, a Source could be a file on disk, but it could also be a file in
    ADLS.

    Furthermore, libraries which build on top of this one may use this representation to
    identify opportunities for optimization, by representing the Source in a stable
    and consistent format that allows different underlying data sources to fulfill the
    request for the data based on environmental context. A library could choose to
    transparently transform a local-path-based Source into a Source representing a
    remote file, without changing the semantics of the object as observed by the code.

    One reason a Hash is part of the interface is so that libraries interacting with the
    object can use the hash as a canonical 'name' for the data, if one is available.

    Another reason is that we can add a layer of consistency checking to data we're
    working with, at the cost of a few compute cycles. Since Sources are meant to represent
    read-only data, the Hash is a meaningful and persistent marker of data identity.

    Do not call its constructor in application code. Use `from_file` or `from_uri` instead.
    """

    # Identity of the data; the only fields participating in eq/hash.
    uri: str
    hash: ty.Optional[Hash] = None
    # hash and equality are based only on the _identity_ of the object,
    # not on the other properties that provide some caching functionality.

    @property
    def cached_path(self) -> ty.Optional[Path]:
        """This is part of the public interface as far as checking to see whether a file
        is already present locally, but its existence and value is not part of equality or
        the hash for this class - it exists purely as an optimization.
        """
        # The literal attribute name "__cached_path" is used (via getattr, so no
        # name mangling applies) to keep the attribute out of the dataclass fields.
        return getattr(self, "__cached_path", None)

    def _set_cached_path(self, lpath: ty.Optional[Path]):
        """protected interface for setting a cached Path since the attribute is not
        available via the constructor.
        """
        # object.__setattr__ via super() bypasses the frozen-dataclass guard.
        super().__setattr__("__cached_path", lpath)  # this works around dataclass.frozen.
        # https://noklam.github.io/blog/posts/2022-04-22-python-dataclass-partiala-immutable.html

    def path(self) -> Path:
        """Any Source can be turned into a local file path.

        Remember that the resulting data is meant to be read-only. If you want to mutate
        the data, you should first make a copy.

        If not already present locally, this will incur a one-time download. Then, if the
        Source has a Hash, the Hash will be validated against the downloaded file, and a
        failure will raise SourceHashMismatchError.
        """
        if self.cached_path is None or not self.cached_path.exists():
            lpath = _get_download_handler(self.uri)(self.hash)
            # path() used to be responsible for checking the hash, but since we pass it to the downloader,
            # it really makes more sense to allow the downloader to decide how to verify its own download,
            # and we don't want to duplicate any effort that it may have already put in.
            self._set_cached_path(lpath)

        # NOTE(review): assert is stripped under -O; Downloader implementations
        # promise never to return a non-existent path, so this is a sanity check.
        assert self.cached_path and self.cached_path.exists()
        return self.cached_path

    def __fspath__(self) -> str:
        # os.PathLike support: open(src) triggers a download if needed.
        return os.fspath(self.path())
168
+
169
+
170
+ # Creation from local Files or from remote URIs
171
+
172
+
173
def from_file(filename: StrOrPath, hash: ty.Optional[Hash] = None, uri: str = "") -> Source:
    """Create a read-only Source from a local file that already exists.

    If URI is passed, the local file will be read and hashed, but the final URI in the
    Source will be the one provided explicitly. NO UPLOAD IS PERFORMED. It is your
    responsibility to ensure that your file has been uploaded to the URI you provide.
    """
    if isinstance(filename, str):
        local = path_from_uri(filename)
    else:
        local = filename
    assert isinstance(local, Path)
    if not local.exists():
        raise FileNotFoundError(local)

    verified = _check_hash(hash, local)
    src = from_uri(uri, verified) if uri else Source(to_uri(local), verified)
    src._set_cached_path(local)  # internally, it's okay to hack around immutability.
    return src
191
+
192
+
193
# Structural type: a URI-bound factory that builds a Source, optionally
# consulting an expected Hash.
class FromUri(ty.Protocol):
    def __call__(self, hash: ty.Optional[Hash]) -> Source:
        """Closure over a URI that creates a Source from a URI.

        The Hash may be used to short-circuit creation that would result in creating
        a Source that cannot match the expected Hash, but this is not required,
        and the hash will be included in the Source object regardless, and will
        be validated (if non-nil) at the time of source data access.
        """
202
+
203
+
204
# Factory protocol mirroring DownloadHandler: claim a URI by returning a
# FromUri, or decline it by returning None.
class FromUriHandler(ty.Protocol):
    def __call__(self, uri: str) -> ty.Optional[FromUri]:
        """Returns a FromUri object containing the URI if this URI can be handled. Returns
        None if this URI cannot be handled.
        """
209
+
210
+
211
def register_from_uri_handler(key: str, handler: FromUriHandler):
    """If a library wants to customize how Sources are created from URIs that it handles,
    it can register a handler here.

    The key is not currently used for anything other than avoiding having
    duplicates registered for whatever reason.
    """
    _FROM_URI_HANDLERS[key] = handler


_FROM_URI_HANDLERS: ty.Dict[str, FromUriHandler] = dict()


def _local_file_from_uri(uri: str) -> ty.Optional[FromUri]:
    # file:// URIs are fulfilled by hashing the local file via from_file.
    if not is_file_uri(uri):
        return None
    return partial(from_file, path_from_uri(uri))


register_from_uri_handler("local_file", _local_file_from_uri)
224
+
225
+
226
def from_uri(uri: str, hash: ty.Optional[Hash] = None) -> Source:
    """Create a read-only Source from a URI. The data should already exist at this remote
    URI, although Source itself can make no guarantee that it always represents real data
    - only that any data it does represent is read-only.

    It may be advantageous for a URI-handling library to register a more specific
    implementation of this function, if it is capable of determining a Hash for the blob
    represented by the URI without downloading the blob.
    """
    # First registered handler that claims the URI wins.
    for handler in _FROM_URI_HANDLERS.values():
        make_source = handler(uri)
        if make_source:
            return make_source(hash)
    # No handler claimed it: fall back to a plain, unresolved Source.
    return Source(uri=uri, hash=hash)
@@ -0,0 +1,104 @@
1
+ # this should later get promoted somewhere, probably
2
+ import json
3
+ import typing as ty
4
+ from functools import partial
5
+ from pathlib import Path
6
+
7
+ from thds.core import files, hashing, log, source, types
8
+
9
+ _SHA256_B64 = "sha256b64"
10
+ _MD5_B64 = "md5b64"
11
+
12
+ logger = log.getLogger(__name__)
13
+
14
+
15
def _from_sha256b64(d: dict) -> ty.Optional[hashing.Hash]:
    """Parse a sha256 Hash from the serialized dict, or return None if absent.

    The sha256 digest is stored base64-encoded under the `_SHA256_B64` key.
    """
    # Use the shared key constant for the membership test as well as the
    # lookup - previously the test duplicated the string literal, which could
    # silently drift out of sync with the constant.
    if _SHA256_B64 in d:
        return hashing.Hash(algo="sha256", bytes=hashing.db64(d[_SHA256_B64]))
    return None
19
+
20
+
21
def _from_md5b64(d: dict) -> ty.Optional[hashing.Hash]:
    """Parse an md5 Hash from the serialized dict, or return None if absent.

    The md5 digest is stored base64-encoded under the `_MD5_B64` key.
    """
    # Use the shared key constant for the membership test as well as the
    # lookup - previously the test duplicated the string literal, which could
    # silently drift out of sync with the constant.
    if _MD5_B64 in d:
        return hashing.Hash(algo="md5", bytes=hashing.db64(d[_MD5_B64]))
    return None
25
+
26
+
27
# A HashParser inspects a deserialized dict and returns a Hash, or None if
# its key is not present.
HashParser = ty.Callable[[dict], ty.Optional[hashing.Hash]]
_BASE_PARSERS = (_from_sha256b64, _from_md5b64)


def base_parsers() -> ty.Tuple[HashParser, ...]:
    """Return the built-in hash parsers (sha256b64 first, then md5b64)."""
    return _BASE_PARSERS
33
+
34
+
35
def from_json(
    json_source: str, hash_parsers: ty.Collection[HashParser] = base_parsers()
) -> source.Source:
    """Deserialize a Source from its canonical JSON representation.

    The first parser to produce a Hash wins; if none match, the Source
    carries no Hash.
    """
    doc = json.loads(json_source)
    parsed_hash = None
    for parse in hash_parsers:
        candidate = parse(doc)
        if candidate:
            parsed_hash = candidate
            break
    return source.from_uri(uri=doc["uri"], hash=parsed_hash)
43
+
44
+
45
def _generic_hash_serializer(
    algo: str, stringify_hash: ty.Callable[[bytes], str], keyname: str, hash: hashing.Hash
) -> ty.Optional[dict]:
    """Serialize `hash` to ``{keyname: stringified-digest}`` when its algorithm
    matches `algo`; otherwise return None so the next serializer can try.
    """
    if hash.algo != algo:
        return None
    return {keyname: stringify_hash(hash.bytes)}
51
+
52
+
53
# Concrete serializers for the two supported algorithms, produced by
# partially applying the generic serializer.
_to_sha256b64 = partial(_generic_hash_serializer, "sha256", hashing.b64, _SHA256_B64)
_to_md5b64 = partial(_generic_hash_serializer, "md5", hashing.b64, _MD5_B64)

# A HashSerializer turns a Hash into a one-key dict, or None if it does not
# handle that Hash's algorithm.
HashSerializer = ty.Callable[[hashing.Hash], ty.Optional[dict]]
_BASE_HASH_SERIALIZERS: ty.Tuple[HashSerializer, ...] = (_to_md5b64, _to_sha256b64)  # type: ignore


def base_hash_serializers() -> ty.Tuple[HashSerializer, ...]:
    """Return the built-in hash serializers (md5 first, then sha256)."""
    return _BASE_HASH_SERIALIZERS
62
+
63
+
64
def to_json(
    source: source.Source, hash_serializers: ty.Collection[HashSerializer] = base_hash_serializers()
) -> str:
    """Serialize a Source to its canonical JSON form.

    The first serializer that handles the Source's hash contributes its key;
    a Source without a hash (or with an unhandled algorithm) serializes to
    just its uri.
    """
    hash_dict: dict = dict()
    if source.hash:
        for serialize in hash_serializers:
            maybe = serialize(source.hash)
            if maybe:
                hash_dict = maybe
                break
    return json.dumps(dict(uri=source.uri, **hash_dict))
71
+
72
+
73
def from_unknown_user_path(path: types.StrOrPath, desired_uri: str) -> source.Source:
    """Sometimes you may want to load a Source directly from a Path provided by a user.

    It _might_ represent something loadable as a from_json Source, but it might just be a
    raw file that needs to be loaded with from_file!

    This is a _reasonable_ (but not guaranteed!) way of trying to ascertain which one it
    is, and specifying where it should live 'remotely' if such a thing becomes
    necessary.

    Your application might need to implement something more robust if the
    actual underlying data is likely to be a JSON blob containing the key `uri`, for
    instance.
    """
    with open(path) as readable:
        try:
            # Reading in text mode may raise UnicodeDecodeError for binary
            # files; that (like JSON failure) means "treat it as a raw file".
            head = readable.read(4096)
            return from_json(head)
        except (json.JSONDecodeError, UnicodeDecodeError):
            return source.from_file(path, uri=desired_uri)
92
+
93
+
94
def write_to_json_file(source: source.Source, local_file: Path) -> bool:
    """Write the canonical JSON serialization of the Source to a file.

    Returns True if the file was (re)written, False if it already held the
    identical serialization.
    """
    local_file.parent.mkdir(parents=True, exist_ok=True)
    existing = local_file.read_text() if local_file.exists() else None
    serialized = to_json(source) + "\n"
    if serialized == existing:
        return False  # already up to date; avoid a pointless rewrite
    with files.atomic_text_writer(local_file) as f:
        logger.info(f"Writing {source} to {local_file}")
        f.write(serialized)
    return True
@@ -0,0 +1,21 @@
1
+ from . import connect, copy, ddl, functions, index, read, sqlmap, upsert # noqa: F401
2
+ from .merge import merge_databases # noqa: F401
3
+ from .meta import ( # noqa: F401
4
+ debug_errors,
5
+ list_tables,
6
+ preload_sources,
7
+ primary_key_cols,
8
+ table_name_from_path,
9
+ table_source,
10
+ )
11
+ from .structured import StructTable, struct_table_from_source # noqa: F401
12
+ from .types import ( # noqa: F401
13
+ AnyDbTableSrc,
14
+ DbAndTable,
15
+ DbAndTableP,
16
+ TableMaster,
17
+ TableSource,
18
+ maybe_t,
19
+ resolve_lazy_db_and_table,
20
+ )
21
+ from .write import make_mapping_writer, write_mappings # noqa: F401
@@ -0,0 +1,33 @@
1
+ import contextlib
2
+ import os
3
+ import sqlite3
4
+ import typing as ty
5
+
6
+ from thds.core import scope
7
+
8
+ from .functions import register_functions_on_connection
9
+ from .types import Connectable
10
+
11
+
12
def row_connect(path: ty.Union[str, os.PathLike]) -> sqlite3.Connection:
    """Open a connection to a row database.

    The connection uses isolation_level=None (autocommit), yields
    sqlite3.Row rows, and has the package's custom SQL functions registered.
    """
    connection = sqlite3.connect(os.fspath(path), isolation_level=None)  # autocommit
    connection.row_factory = sqlite3.Row
    return register_functions_on_connection(connection)
17
+
18
+
19
# Connections opened by `autoconnect` are closed when this scope's boundary exits.
autoconn_scope = scope.Scope("sqlite3.autoconn")


def autoconnect(connectable: Connectable) -> sqlite3.Connection:
    """Will automatically commit when it hits the autoconn_scope.bound, but only if
    the connectable was not already a connection.
    """
    if isinstance(connectable, sqlite3.Connection):
        # Caller owns this connection; leave its lifecycle alone.
        return connectable

    # Open a new connection and defer closing it to the nearest scope boundary.
    auto_closing = contextlib.closing(row_connect(os.fspath(connectable)))
    return autoconn_scope.enter(auto_closing)
@@ -0,0 +1,35 @@
1
+ """Utility for copying a table from one connection to another."""
2
+
3
+ from ..log import getLogger
4
+ from .connect import autoconn_scope, autoconnect
5
+ from .types import Connectable
6
+
7
+ logger = getLogger(__name__)
8
+
9
+
10
@autoconn_scope.bound
def table(source: Connectable, table_name: str, dest: Connectable) -> None:
    """Copy a table from one connection to another, including its table definition.

    If you can do this using ATTACH instead, that will be faster because it involves
    no Python code running in the loop.

    Raises ValueError if `table_name` does not exist in the source database.
    """
    source_conn = autoconnect(source)
    dest_conn = autoconnect(dest)

    # Bind table_name as a parameter rather than interpolating it into the SQL
    # (the previous f-string form was injection-prone and broke on quotes).
    row = source_conn.execute(
        "SELECT sql FROM sqlite_master WHERE name = ?", (table_name,)
    ).fetchone()
    if row is None:
        # Previously this crashed with an opaque TypeError ('NoneType' not
        # subscriptable) when the table was missing.
        raise ValueError(f"No table named {table_name!r} in source database")
    dest_conn.execute(row[0])

    # Identifiers cannot be bound as parameters; quote them defensively instead.
    quoted_name = '"' + table_name.replace('"', '""') + '"'
    src_cursor = source_conn.execute(f"SELECT * FROM {quoted_name}")

    dest_conn.execute("BEGIN TRANSACTION;")
    try:
        # Stream rows in batches to bound memory usage.
        while batch := src_cursor.fetchmany(1000):
            placeholders = ", ".join(["?"] * len(batch[0]))
            dest_conn.executemany(f"INSERT INTO {quoted_name} VALUES ({placeholders});", batch)
    except BaseException:
        # Leave the destination clean rather than holding an open transaction.
        dest_conn.execute("ROLLBACK;")
        raise
    dest_conn.execute("COMMIT;")
@@ -0,0 +1,4 @@
1
def drop(full_name: str, is_view: bool = False) -> str:
    """Return the SQL to drop the named table (default) or view, if it exists."""
    object_kind = "VIEW" if is_view else "TABLE"
    return f"DROP {object_kind} IF EXISTS {full_name};"
+ return f"DROP {table_or_view} IF EXISTS {full_name};"