thds.core 0.0.1__py3-none-any.whl → 1.31.20250116223856__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of thds.core might be problematic. Click here for more details.
- thds/core/__init__.py +48 -0
- thds/core/ansi_esc.py +46 -0
- thds/core/cache.py +201 -0
- thds/core/calgitver.py +82 -0
- thds/core/concurrency.py +100 -0
- thds/core/config.py +250 -0
- thds/core/decos.py +55 -0
- thds/core/dict_utils.py +188 -0
- thds/core/env.py +40 -0
- thds/core/exit_after.py +121 -0
- thds/core/files.py +125 -0
- thds/core/fretry.py +115 -0
- thds/core/generators.py +56 -0
- thds/core/git.py +81 -0
- thds/core/hash_cache.py +86 -0
- thds/core/hashing.py +106 -0
- thds/core/home.py +15 -0
- thds/core/hostname.py +10 -0
- thds/core/imports.py +17 -0
- thds/core/inspect.py +58 -0
- thds/core/iterators.py +9 -0
- thds/core/lazy.py +83 -0
- thds/core/link.py +153 -0
- thds/core/log/__init__.py +29 -0
- thds/core/log/basic_config.py +171 -0
- thds/core/log/json_formatter.py +43 -0
- thds/core/log/kw_formatter.py +84 -0
- thds/core/log/kw_logger.py +93 -0
- thds/core/log/logfmt.py +302 -0
- thds/core/merge_args.py +168 -0
- thds/core/meta.json +8 -0
- thds/core/meta.py +518 -0
- thds/core/parallel.py +200 -0
- thds/core/pickle_visit.py +24 -0
- thds/core/prof.py +276 -0
- thds/core/progress.py +112 -0
- thds/core/protocols.py +17 -0
- thds/core/py.typed +0 -0
- thds/core/scaling.py +39 -0
- thds/core/scope.py +199 -0
- thds/core/source.py +238 -0
- thds/core/source_serde.py +104 -0
- thds/core/sqlite/__init__.py +21 -0
- thds/core/sqlite/connect.py +33 -0
- thds/core/sqlite/copy.py +35 -0
- thds/core/sqlite/ddl.py +4 -0
- thds/core/sqlite/functions.py +63 -0
- thds/core/sqlite/index.py +22 -0
- thds/core/sqlite/insert_utils.py +23 -0
- thds/core/sqlite/merge.py +84 -0
- thds/core/sqlite/meta.py +190 -0
- thds/core/sqlite/read.py +66 -0
- thds/core/sqlite/sqlmap.py +179 -0
- thds/core/sqlite/structured.py +138 -0
- thds/core/sqlite/types.py +64 -0
- thds/core/sqlite/upsert.py +139 -0
- thds/core/sqlite/write.py +99 -0
- thds/core/stack_context.py +41 -0
- thds/core/thunks.py +40 -0
- thds/core/timer.py +214 -0
- thds/core/tmp.py +85 -0
- thds/core/types.py +4 -0
- thds.core-1.31.20250116223856.dist-info/METADATA +68 -0
- thds.core-1.31.20250116223856.dist-info/RECORD +67 -0
- {thds.core-0.0.1.dist-info → thds.core-1.31.20250116223856.dist-info}/WHEEL +1 -1
- thds.core-1.31.20250116223856.dist-info/entry_points.txt +4 -0
- thds.core-1.31.20250116223856.dist-info/top_level.txt +1 -0
- thds.core-0.0.1.dist-info/METADATA +0 -8
- thds.core-0.0.1.dist-info/RECORD +0 -4
- thds.core-0.0.1.dist-info/top_level.txt +0 -1
thds/core/scope.py
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
"""This allows the usage of ContextManagers that cover the entire body
|
|
2
|
+
of a function without requiring invasive (and git-diff-increasing)
|
|
3
|
+
"with" statements.
|
|
4
|
+
|
|
5
|
+
Another way of looking at this is that it is essentially a
|
|
6
|
+
decorator-driven `defer` in Go or `scope` in D. However, the semantics
|
|
7
|
+
are slightly different in that the scope is not necessarily at the
|
|
8
|
+
nearest function call boundary - we use Python's dynamic capabilities
|
|
9
|
+
to look 'up' the stack until we find a scope that is usable, and then
|
|
10
|
+
we embed the ContextManager in that scope.
|
|
11
|
+
|
|
12
|
+
Generally, the usage will look something like this:
|
|
13
|
+
|
|
14
|
+
```
|
|
15
|
+
@scope.bound # wrap a function with a scope that will exit when it returns
|
|
16
|
+
def do_stuff(...):
|
|
17
|
+
foo = scope.enter(a_context_manager(...)) # enters the context manager via the nearest scope
|
|
18
|
+
# ...do some stuff
|
|
19
|
+
return bar
|
|
20
|
+
# context manager exits when nearest scope exits, which is right after function return.
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
where the traditional alternative would be:
|
|
24
|
+
|
|
25
|
+
```
|
|
26
|
+
def do_stuff(...):
|
|
27
|
+
with a_context_manager(...) as foo:
|
|
28
|
+
# ...do the same stuff
|
|
29
|
+
# ...but now your git diff is huge b/c you indented everything,
|
|
30
|
+
return bar
|
|
31
|
+
# context manager exits after `with` block closes, which is when the function returns.
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
Because we use ContextVar to perform the lookups of the nearest scope,
|
|
35
|
+
this is actually pretty performant. You do pay the cost of the
|
|
36
|
+
wrapping function, which is higher than a `with` statement.
|
|
37
|
+
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
import atexit
|
|
41
|
+
import contextlib
|
|
42
|
+
import inspect
|
|
43
|
+
import sys
|
|
44
|
+
import typing as ty
|
|
45
|
+
from functools import wraps
|
|
46
|
+
from logging import getLogger
|
|
47
|
+
from uuid import uuid4
|
|
48
|
+
|
|
49
|
+
from .inspect import get_caller_info
|
|
50
|
+
from .stack_context import StackContext
|
|
51
|
+
|
|
52
|
+
_KEYED_SCOPE_CONTEXTS: ty.Dict[str, StackContext[contextlib.ExitStack]] = dict()
|
|
53
|
+
# all non-nil ExitStacks will be closed at application exit
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _close_root_scopes_atexit():
    """Close every registered root ExitStack when the interpreter shuts down.

    Failures are only reported to stderr: exceptions raised inside atexit
    handlers are not actionable and would obscure the real shutdown path.
    """
    for scope_name, stack_context in _KEYED_SCOPE_CONTEXTS.items():
        exit_stack = stack_context()
        if not exit_stack:
            continue
        try:
            exit_stack.close()
        except ValueError as ve:
            print(f"Unable to close scope '{scope_name}' at exit because {ve}", file=sys.stderr)


atexit.register(_close_root_scopes_atexit)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _init_sc(key: str, val: contextlib.ExitStack):
    """Register a new root StackContext under `key`, with `val` as its root stack.

    This should only ever be called at the root of/during import of a
    module. It is _not_ threadsafe.
    """
    # A StackContext normally belongs at module level as a global; keeping it
    # inside this module-level dict is the dynamic equivalent of that rule.
    already_registered = key in _KEYED_SCOPE_CONTEXTS
    if already_registered:
        getLogger(__name__).warning(
            f"Scope {key} already exists! If this is not importlib.reload, you have a problem."
        )
    _KEYED_SCOPE_CONTEXTS[key] = StackContext(key, val)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
F = ty.TypeVar("F", bound=ty.Callable)
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _bound(key: str, func: F) -> F:
    """A decorator that establishes a scope boundary for context managers
    that can now be `enter`ed, and will then be exited when this
    boundary is returned to.

    Four wrapper variants are produced, chosen by introspecting `func`:
    sync generator, async generator, coroutine, and plain callable. Each
    one pushes a fresh ExitStack onto the key's StackContext for the
    duration of the call, so `enter` calls made anywhere below resolve to
    this boundary's stack.
    """
    if inspect.isgeneratorfunction(func):

        @wraps(func)
        def __scope_boundary_generator_wrap(*args, **kwargs):
            # Lazily create the root StackContext on first use of this key.
            if key not in _KEYED_SCOPE_CONTEXTS:
                _init_sc(key, contextlib.ExitStack())  # this root stack will probably not get used

            # Outer `with` scopes the ContextVar to this call; inner `with`
            # actually enters/exits the per-call ExitStack.
            with _KEYED_SCOPE_CONTEXTS[key].set(contextlib.ExitStack()) as scoped_exit_stack:
                with scoped_exit_stack:  # enter and exit the ExitStack itself
                    # Delegate iteration; capture the generator's return value.
                    ret = yield from func(*args, **kwargs)
                    return ret  # weird syntax here, Python...

        return ty.cast(F, __scope_boundary_generator_wrap)

    if inspect.isasyncgenfunction(func):

        @wraps(func)
        async def __scope_boundary_async_generator_wrap(*args, **kwargs):
            if key not in _KEYED_SCOPE_CONTEXTS:
                _init_sc(key, contextlib.ExitStack())

            # Async generators cannot `yield from`; re-yield item by item.
            with _KEYED_SCOPE_CONTEXTS[key].set(contextlib.ExitStack()) as scoped_exit_stack:
                with scoped_exit_stack:
                    async for ret in func(*args, **kwargs):
                        yield ret

        return ty.cast(F, __scope_boundary_async_generator_wrap)

    if inspect.iscoroutinefunction(func):

        @wraps(func)
        async def __scope_boundary_coroutine_wrap(*args, **kwargs):
            if key not in _KEYED_SCOPE_CONTEXTS:
                _init_sc(key, contextlib.ExitStack())

            # The stack closes when the awaited call completes (or raises).
            with _KEYED_SCOPE_CONTEXTS[key].set(contextlib.ExitStack()) as scoped_exit_stack:
                with scoped_exit_stack:
                    return await func(*args, **kwargs)

        return ty.cast(F, __scope_boundary_coroutine_wrap)

    # Plain (synchronous, non-generator) callable.
    @wraps(func)
    def __scope_boundary_wrap(*args, **kwargs):
        if key not in _KEYED_SCOPE_CONTEXTS:
            _init_sc(key, contextlib.ExitStack())  # this root stack will probably not get used

        with _KEYED_SCOPE_CONTEXTS[key].set(contextlib.ExitStack()) as scoped_exit_stack:
            with scoped_exit_stack:  # enter and exit the ExitStack itself
                return func(*args, **kwargs)

    return ty.cast(F, __scope_boundary_wrap)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class NoScopeFound(Exception):
    """Raised by `enter` when no scope has been registered under the requested key."""
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
M = ty.TypeVar("M")
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _enter(key: str, context: ty.ContextManager[M]) -> M:
    """Call this to enter a ContextManager which will be exited at the
    nearest scope boundary, without needing a with statement.
    """
    # Cheap lookup: no stack walking - the stack-following ContextVar that the
    # nearest boundary installed already points at the right ExitStack.
    scope_context = _KEYED_SCOPE_CONTEXTS.get(key)
    if not scope_context:
        raise NoScopeFound(f"No scope with the key {key} was found.")
    return scope_context().enter_context(context)
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
class Scope:
    """Creating your own Scope isn't often necessary - often you just want
    a basic scope around your function, so you can just use the default Scope,
    which is created below.

    However, in case it's important to your use case to be able to
    have orthogonal scopes that can be entered further down the stack
    and exited at a precise point further up, this makes it possible.

    If you provide a key, it must be globally unique, and if it has
    previously been created within the same application, an
    AssertionError will be thrown. You do not need to provide a key.

    These should be module-level/global objects under all
    circumstances, as they share an internal global namespace.

    """

    def __init__(self, key: str = ""):
        # skip=1 so the reported module is the one calling Scope(...).
        # NOTE(review): this is call-depth sensitive - confirm against
        # get_caller_info's contract if Scope() is ever wrapped.
        caller_info = get_caller_info(skip=1)
        # Namespace the key by caller module; a random suffix guarantees
        # uniqueness when no explicit key is given.
        self.key = caller_info.module + "+" + (key or uuid4().hex)
        _init_sc(self.key, contextlib.ExitStack())  # add root boundary

    def bound(self, func: F) -> F:
        """Add a boundary to this function which will close all of the
        Contexts subsequently entered at the time this function exits.
        """
        return _bound(self.key, func)

    def enter(self, context: ty.ContextManager[M]) -> M:
        """Enter the provided Context with a future exit at the nearest boundary for this Scope."""
        return _enter(self.key, context)


# The module-level default scope plus convenience aliases, so most callers can
# just use `scope.bound` / `scope.enter` without constructing a Scope.
default = Scope("__default_scope_stack")
bound = default.bound
enter = default.enter
|
thds/core/source.py
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
"""Wrap openable, read-only data that is either locally-present or downloadable,
|
|
2
|
+
|
|
3
|
+
yet will not be downloaded (if non-local) until it is actually opened or unwrapped.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import os
|
|
7
|
+
import typing as ty
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from functools import partial
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from . import log
|
|
13
|
+
from .files import is_file_uri, path_from_uri, to_uri
|
|
14
|
+
from .hash_cache import filehash
|
|
15
|
+
from .hashing import Hash
|
|
16
|
+
from .types import StrOrPath
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class Downloader(ty.Protocol):
    # Structural type: any callable closed over a URI that materializes the
    # referenced data as a local Path.
    def __call__(self, hash: ty.Optional[Hash]) -> Path:
        """Closure over a URI that downloads a file to a local path and returns the path.

        The file may be placed anywhere as long as the file will be readable until the
        program exits.

        If the URI points to a missing file, this MUST raise any Exception that the
        underlying implementation desires. It MUST NOT return a Path pointing to a
        non-existent file.

        The Hash may be used to short-circuit a download that would result in downloading
        a file that does not match the expected hash, but the Downloader need not verify
        the Hash of the file downloaded after the fact, as that will be performed by
        default by the Source object.
        """
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class DownloadHandler(ty.Protocol):
    # Structural type: given a URI, either claim it (return a Downloader) or
    # decline it (return None) so the next registered handler can be tried.
    def __call__(self, uri: str) -> ty.Optional[Downloader]:
        """Returns a Downloader containing the URI if this URI can be handled. Returns
        None if this URI cannot be handled.
        """
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _LocalFileHandler(uri: str) -> ty.Optional[Downloader]:
    """DownloadHandler for file:// URIs - the 'download' is just a local path lookup."""
    if is_file_uri(uri):

        def download_file(hash: ty.Optional[Hash]) -> Path:
            # No bytes are transferred; we only verify existence (and the
            # expected hash, when one was supplied).
            local_path = path_from_uri(uri)
            if not local_path.exists():
                raise FileNotFoundError(local_path)
            if hash:
                _check_hash(hash, local_path)
            return local_path

        return download_file
    return None
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def register_download_handler(key: str, handler: DownloadHandler):
    """Register (or replace) the DownloadHandler stored under `key`.

    The key is not currently used for anything other than avoiding
    having duplicates registered for whatever reason.
    """
    _DOWNLOAD_HANDLERS[key] = handler
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# Registry of DownloadHandlers, consulted in insertion order by
# _get_download_handler.
_DOWNLOAD_HANDLERS: ty.Dict[str, DownloadHandler] = dict()
register_download_handler("local_file", _LocalFileHandler)  # file:// support is built in
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _get_download_handler(uri: str) -> Downloader:
    """Return the first registered Downloader willing to handle `uri`.

    Handlers are consulted in registration order; raises ValueError when none
    of them claims the URI.
    """
    candidates = (handler(uri) for handler in _DOWNLOAD_HANDLERS.values())
    downloader = next(filter(None, candidates), None)
    if downloader is not None:
        return downloader
    raise ValueError(f"No SourcePath download handler for uri: {uri}")
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class SourceHashMismatchError(ValueError):
    """Raised when a Source's expected Hash does not match the bytes on disk."""
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _check_hash(expected_hash: ty.Optional[Hash], path: Path) -> Hash:
    """Hash `path` and compare against `expected_hash` when one is provided.

    Returns the freshly-computed Hash (sha256 when no expectation is given, so
    the caller still gets a canonical identity for the file); raises
    SourceHashMismatchError on a mismatch.
    """
    algo = expected_hash.algo if expected_hash else "sha256"
    with log.logger_context(hash_for=f"source-{algo}"):
        computed_hash = filehash(algo, path)
    if expected_hash and expected_hash != computed_hash:
        raise SourceHashMismatchError(
            f"{expected_hash.algo} mismatch for {path};"
            f" got {computed_hash.bytes!r}, expected {expected_hash.bytes!r}"
        )
    return computed_hash
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@dataclass(frozen=True)
class Source(os.PathLike):
    """Source is meant to be a consistent in-memory representation for an abstract,
    **read-only** source of data that may not be present locally when an application
    starts.

    A Source uses `os.PathLike` (`__fspath__`) to support transparent `open(src)` calls,
    so in many cases it will be a drop-in replacement for Path or str filenames. If you
    need an actual Path object, you can call `path()` to get one, but you should prefer to
    defer this until the actual location of use.

    By 'wrapping' read-only data in these objects, we can unify the code around how we
    unwrap and use them, which should allow us to more easily support different execution
    environments and sources of data.

    For instance, a Source could be a file on disk, but it could also be a file in
    ADLS.

    Furthermore, libraries which build on top of this one may use this representation to
    identify opportunities for optimization, by representing the Source in a stable
    and consistent format that allows different underlying data sources to fulfill the
    request for the data based on environmental context. A library could choose to
    transparently transform a local-path-based Source into a Source representing a
    remote file, without changing the semantics of the object as observed by the code.

    One reason a Hash is part of the interface is so that libraries interacting with the
    object can use the hash as a canonical 'name' for the data, if one is available.

    Another reason is that we can add a layer of consistency checking to data we're
    working with, at the cost of a few compute cycles. Since Sources are meant to represent
    read-only data, the Hash is a meaningful and persistent marker of data identity.

    Do not call its constructor in application code. Use `from_file` or `from_uri` instead.
    """

    # The canonical location of the data (e.g. a file:// or remote URI).
    uri: str
    # Optional content hash; participates in dataclass equality/hash.
    hash: ty.Optional[Hash] = None
    # hash and equality are based only on the _identity_ of the object,
    # not on the other properties that provide some caching functionality.

    @property
    def cached_path(self) -> ty.Optional[Path]:
        """This is part of the public interface as far as checking to see whether a file
        is already present locally, but its existence and value is not part of equality or
        the hash for this class - it exists purely as an optimization.
        """
        # "__cached_path" is a string literal, so Python's name mangling does
        # not apply - getattr/setattr below agree on the attribute name.
        return getattr(self, "__cached_path", None)

    def _set_cached_path(self, lpath: ty.Optional[Path]):
        """protected interface for setting a cached Path since the attribute is not
        available via the constructor.
        """
        super().__setattr__("__cached_path", lpath)  # this works around dataclass.frozen.
        # https://noklam.github.io/blog/posts/2022-04-22-python-dataclass-partiala-immutable.html

    def path(self) -> Path:
        """Any Source can be turned into a local file path.

        Remember that the resulting data is meant to be read-only. If you want to mutate
        the data, you should first make a copy.

        If not already present locally, this will incur a one-time download. Then, if the
        Source has a Hash, the Hash will be validated against the downloaded file, and a
        failure will raise SourceHashMismatchError.
        """
        # Re-download if we've never fetched, or the cached file has vanished.
        if self.cached_path is None or not self.cached_path.exists():
            lpath = _get_download_handler(self.uri)(self.hash)
            # path() used to be responsible for checking the hash, but since we pass it to the downloader,
            # it really makes more sense to allow the downloader to decide how to verify its own download,
            # and we don't want to duplicate any effort that it may have already put in.
            self._set_cached_path(lpath)

        # Invariant: a Downloader MUST NOT return a non-existent path.
        assert self.cached_path and self.cached_path.exists()
        return self.cached_path

    def __fspath__(self) -> str:
        # os.PathLike support: open(src) triggers the lazy download above.
        return os.fspath(self.path())
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
# Creation from local Files or from remote URIs
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def from_file(filename: StrOrPath, hash: ty.Optional[Hash] = None, uri: str = "") -> Source:
    """Create a read-only Source from a local file that already exists.

    If URI is passed, the local file will be read and hashed, but the final URI in the
    Source will be the one provided explicitly. NO UPLOAD IS PERFORMED. It is your
    responsibility to ensure that your file has been uploaded to the URI you provide.
    """
    if isinstance(filename, str):
        path = path_from_uri(filename)
    else:
        path = filename
    assert isinstance(path, Path)
    if not path.exists():
        raise FileNotFoundError(path)

    # Hash now, both to validate the provided hash (if any) and to embed a
    # canonical identity in the resulting Source.
    verified_hash = _check_hash(hash, path)
    src = from_uri(uri, verified_hash) if uri else Source(to_uri(path), verified_hash)
    src._set_cached_path(path)  # internally, it's okay to hack around immutability.
    return src
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
class FromUri(ty.Protocol):
    # Structural type: a callable closed over a URI that builds a Source for it.
    def __call__(self, hash: ty.Optional[Hash]) -> Source:
        """Closure over a URI that creates a Source from a URI.

        The Hash may be used to short-circuit creation that would result in creating
        a Source that cannot match the expected Hash, but this is not required,
        and the hash will be included in the Source object regardless, and will
        be validated (if non-nil) at the time of source data access.
        """
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
class FromUriHandler(ty.Protocol):
    # Structural type: given a URI, either claim it (return a FromUri) or
    # decline it (return None) so the next registered handler can be tried.
    def __call__(self, uri: str) -> ty.Optional[FromUri]:
        """Returns a FromUri object containing the URI if this URI can be handled. Returns
        None if this URI cannot be handled.
        """
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def register_from_uri_handler(key: str, handler: FromUriHandler):
    """If a library wants to customize how Sources are created from URIs that it handles,
    it can register a handler here.

    The key is not currently used for anything other than avoiding
    having duplicates registered for whatever reason.
    """
    _FROM_URI_HANDLERS[key] = handler
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
# Registry of FromUriHandlers, consulted in insertion order by from_uri.
_FROM_URI_HANDLERS: ty.Dict[str, FromUriHandler] = dict()
# Built-in handler: a file:// URI becomes a from_file Source. The partial binds
# the local path; the remaining positional argument is the optional Hash.
register_from_uri_handler(
    "local_file", lambda uri: partial(from_file, path_from_uri(uri)) if is_file_uri(uri) else None
)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def from_uri(uri: str, hash: ty.Optional[Hash] = None) -> Source:
    """Create a read-only Source from a URI. The data should already exist at this remote
    URI, although Source itself can make no guarantee that it always represents real data
    - only that any data it does represent is read-only.

    It may be advantageous for a URI-handling library to register a more specific
    implementation of this function, if it is capable of determining a Hash for the blob
    represented by the URI without downloading the blob.
    """
    # First registered handler that claims the URI wins.
    for handler in _FROM_URI_HANDLERS.values():
        creator = handler(uri)
        if creator:
            return creator(hash)
    # No handler claimed it: fall back to a plain Source wrapper.
    return Source(uri=uri, hash=hash)
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# this should later get promoted somewhere, probably
|
|
2
|
+
import json
|
|
3
|
+
import typing as ty
|
|
4
|
+
from functools import partial
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from thds.core import files, hashing, log, source, types
|
|
8
|
+
|
|
9
|
+
_SHA256_B64 = "sha256b64"
|
|
10
|
+
_MD5_B64 = "md5b64"
|
|
11
|
+
|
|
12
|
+
logger = log.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _from_sha256b64(d: dict) -> ty.Optional[hashing.Hash]:
    """Parse a sha256 Hash from a serialized Source dict carrying a base64 digest.

    Returns None when the key is absent, so parsers can be chained by from_json.
    """
    # Use the _SHA256_B64 constant for both the membership test and the lookup
    # (previously the test duplicated the literal "sha256b64").
    if _SHA256_B64 in d:
        return hashing.Hash(algo="sha256", bytes=hashing.db64(d[_SHA256_B64]))
    return None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _from_md5b64(d: dict) -> ty.Optional[hashing.Hash]:
    """Parse an md5 Hash from a serialized Source dict carrying a base64 digest.

    Returns None when the key is absent, so parsers can be chained by from_json.
    """
    # Use the _MD5_B64 constant for both the membership test and the lookup
    # (previously the test duplicated the literal "md5b64").
    if _MD5_B64 in d:
        return hashing.Hash(algo="md5", bytes=hashing.db64(d[_MD5_B64]))
    return None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
HashParser = ty.Callable[[dict], ty.Optional[hashing.Hash]]
|
|
28
|
+
_BASE_PARSERS = (_from_sha256b64, _from_md5b64)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def base_parsers() -> ty.Tuple[HashParser, ...]:
    """Return the built-in hash parsers (sha256 then md5) as an immutable tuple."""
    return _BASE_PARSERS
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def from_json(
    json_source: str, hash_parsers: ty.Collection[HashParser] = base_parsers()
) -> source.Source:
    """Deserialize a Source from its canonical JSON representation.

    The first parser that recognizes a hash key in the payload wins; when none
    matches, the Source is created without a hash.
    """
    payload = json.loads(json_source)
    parsed_hashes = (parse(payload) for parse in hash_parsers)
    return source.from_uri(
        uri=payload["uri"],
        hash=next(filter(None, parsed_hashes), None),
    )
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _generic_hash_serializer(
    algo: str, stringify_hash: ty.Callable[[bytes], str], keyname: str, hash: hashing.Hash
) -> ty.Optional[dict]:
    """Serialize `hash` as a single-key dict when its algorithm matches `algo`.

    Returns None on algorithm mismatch, so serializers can be chained; the
    first three parameters are intended to be pre-bound with functools.partial.
    """
    if hash.algo != algo:
        return None
    return {keyname: stringify_hash(hash.bytes)}
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
_to_sha256b64 = partial(_generic_hash_serializer, "sha256", hashing.b64, _SHA256_B64)
|
|
54
|
+
_to_md5b64 = partial(_generic_hash_serializer, "md5", hashing.b64, _MD5_B64)
|
|
55
|
+
|
|
56
|
+
HashSerializer = ty.Callable[[hashing.Hash], ty.Optional[dict]]
|
|
57
|
+
_BASE_HASH_SERIALIZERS: ty.Tuple[HashSerializer, ...] = (_to_md5b64, _to_sha256b64) # type: ignore
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def base_hash_serializers() -> ty.Tuple[HashSerializer, ...]:
    """Return the built-in hash serializers (md5 and sha256) as an immutable tuple."""
    return _BASE_HASH_SERIALIZERS
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def to_json(
    source: source.Source, hash_serializers: ty.Collection[HashSerializer] = base_hash_serializers()
) -> str:
    """Serialize a Source to its canonical JSON string.

    The first serializer that handles the Source's hash algorithm contributes a
    single key to the payload; a hashless Source serializes as just its uri.

    NOTE: the `source` parameter shadows the `source` module inside this body.
    """
    hash_dict: dict = dict()
    if source.hash:
        for serialize in hash_serializers:
            maybe_dict = serialize(source.hash)
            if maybe_dict:
                hash_dict = maybe_dict
                break
    return json.dumps(dict(uri=source.uri, **hash_dict))
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def from_unknown_user_path(path: types.StrOrPath, desired_uri: str) -> source.Source:
    """Sometimes you may want to load a Source directly from a Path provided by a user.

    It _might_ represent something loadable as a from_json Source, but it might just be a
    raw file that needs to be loaded with from_file!

    This is a _reasonable_ (but not guaranteed!) way of trying to ascertain which one it
    is, and specifying where it should live 'remotely' if such a thing becomes
    necessary.

    Your application might need to implement something more robust if the
    actual underlying data is likely to be a JSON blob containing the key `uri`, for
    instance.
    """
    with open(path) as readable:
        try:
            # Sniff only the first 4096 bytes; a canonical Source JSON is tiny.
            # NOTE(review): a valid Source JSON larger than 4096 bytes would fail to
            # parse here and be silently treated as a raw file - confirm acceptable.
            return from_json(readable.read(4096))
        except (json.JSONDecodeError, UnicodeDecodeError):
            # UnicodeDecodeError covers binary files opened in text mode.
            return source.from_file(path, uri=desired_uri)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def write_to_json_file(source: source.Source, local_file: Path) -> bool:
    """Write the canonical JSON serialization of the Source to a file.

    Returns True when the file was (re)written, False when it already held
    exactly the same serialization (avoids churning mtimes/watchers).
    """
    local_file.parent.mkdir(parents=True, exist_ok=True)
    serialized = to_json(source) + "\n"
    on_disk = local_file.read_text() if local_file.exists() else None
    if serialized == on_disk:
        return False
    with files.atomic_text_writer(local_file) as f:
        logger.info(f"Writing {source} to {local_file}")
        f.write(serialized)
    return True
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from . import connect, copy, ddl, functions, index, read, sqlmap, upsert # noqa: F401
|
|
2
|
+
from .merge import merge_databases # noqa: F401
|
|
3
|
+
from .meta import ( # noqa: F401
|
|
4
|
+
debug_errors,
|
|
5
|
+
list_tables,
|
|
6
|
+
preload_sources,
|
|
7
|
+
primary_key_cols,
|
|
8
|
+
table_name_from_path,
|
|
9
|
+
table_source,
|
|
10
|
+
)
|
|
11
|
+
from .structured import StructTable, struct_table_from_source # noqa: F401
|
|
12
|
+
from .types import ( # noqa: F401
|
|
13
|
+
AnyDbTableSrc,
|
|
14
|
+
DbAndTable,
|
|
15
|
+
DbAndTableP,
|
|
16
|
+
TableMaster,
|
|
17
|
+
TableSource,
|
|
18
|
+
maybe_t,
|
|
19
|
+
resolve_lazy_db_and_table,
|
|
20
|
+
)
|
|
21
|
+
from .write import make_mapping_writer, write_mappings # noqa: F401
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import contextlib
|
|
2
|
+
import os
|
|
3
|
+
import sqlite3
|
|
4
|
+
import typing as ty
|
|
5
|
+
|
|
6
|
+
from thds.core import scope
|
|
7
|
+
|
|
8
|
+
from .functions import register_functions_on_connection
|
|
9
|
+
from .types import Connectable
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def row_connect(path: ty.Union[str, os.PathLike]) -> sqlite3.Connection:
    """Get a connection to a row database.

    The connection is opened in autocommit mode and returns sqlite3.Row objects
    (column access by name), with this package's SQL functions registered.
    """
    connection = sqlite3.connect(os.fspath(path), isolation_level=None)  # autocommit
    connection.row_factory = sqlite3.Row
    return register_functions_on_connection(connection)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
autoconn_scope = scope.Scope("sqlite3.autoconn")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def autoconnect(connectable: Connectable) -> sqlite3.Connection:
    """Will automatically commit when it hits the autoconn_scope.bound, but only if
    the connectable was not already a connection.
    """
    # An existing connection is passed through untouched: its lifetime belongs
    # to whoever opened it.
    if isinstance(connectable, sqlite3.Connection):
        return connectable

    fresh_connection = row_connect(os.fspath(connectable))
    # close the connection when we exit the scope
    return autoconn_scope.enter(contextlib.closing(fresh_connection))
|
thds/core/sqlite/copy.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Utility for copying a table from one connection to another."""
|
|
2
|
+
|
|
3
|
+
from ..log import getLogger
|
|
4
|
+
from .connect import autoconn_scope, autoconnect
|
|
5
|
+
from .types import Connectable
|
|
6
|
+
|
|
7
|
+
logger = getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@autoconn_scope.bound
def table(source: Connectable, table_name: str, dest: Connectable) -> None:
    """Copy a table from one connection to another, including its table definition.

    If you can do this using ATTACH instead, that will be faster because it involves
    no Python code running in the loop.

    Raises ValueError if `table_name` does not exist in the source database.
    """
    source_conn = autoconnect(source)
    dest_conn = autoconnect(dest)

    # Bind the name as a parameter instead of interpolating it into the SQL,
    # so names containing quotes cannot break (or inject into) the query.
    row = source_conn.execute(
        "SELECT sql FROM sqlite_master WHERE name = ?", (table_name,)
    ).fetchone()
    if row is None:
        # Previously this crashed with an opaque TypeError on fetchone()[0].
        raise ValueError(f"No table named {table_name!r} in source database")

    dest_conn.execute(row[0])  # recreate the table definition verbatim

    # Identifiers cannot be bound as parameters; quote them defensively instead.
    quoted_name = '"' + table_name.replace('"', '""') + '"'
    src_data = source_conn.execute(f"SELECT * FROM {quoted_name}")

    dest_conn.execute("BEGIN TRANSACTION;")
    while True:
        batch = src_data.fetchmany(1000)  # bounded memory even for large tables
        if not batch:
            break
        placeholders = ", ".join(["?"] * len(batch[0]))
        dest_conn.executemany(f"INSERT INTO {quoted_name} VALUES ({placeholders});", batch)
    dest_conn.execute("COMMIT;")
|
thds/core/sqlite/ddl.py
ADDED