promnesia 1.2.20240810__py3-none-any.whl → 1.4.20250909__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promnesia/__init__.py +18 -4
- promnesia/__main__.py +104 -78
- promnesia/cannon.py +108 -107
- promnesia/common.py +107 -88
- promnesia/compare.py +33 -30
- promnesia/compat.py +10 -10
- promnesia/config.py +37 -34
- promnesia/database/common.py +4 -3
- promnesia/database/dump.py +13 -13
- promnesia/database/load.py +7 -7
- promnesia/extract.py +19 -17
- promnesia/logging.py +27 -15
- promnesia/misc/install_server.py +32 -27
- promnesia/server.py +106 -79
- promnesia/sources/auto.py +104 -77
- promnesia/sources/auto_logseq.py +6 -5
- promnesia/sources/auto_obsidian.py +2 -2
- promnesia/sources/browser.py +20 -10
- promnesia/sources/browser_legacy.py +65 -50
- promnesia/sources/demo.py +7 -8
- promnesia/sources/fbmessenger.py +3 -3
- promnesia/sources/filetypes.py +22 -16
- promnesia/sources/github.py +9 -8
- promnesia/sources/guess.py +6 -2
- promnesia/sources/hackernews.py +7 -9
- promnesia/sources/hpi.py +5 -3
- promnesia/sources/html.py +11 -7
- promnesia/sources/hypothesis.py +3 -2
- promnesia/sources/instapaper.py +3 -2
- promnesia/sources/markdown.py +22 -12
- promnesia/sources/org.py +36 -17
- promnesia/sources/plaintext.py +41 -39
- promnesia/sources/pocket.py +5 -3
- promnesia/sources/reddit.py +24 -26
- promnesia/sources/roamresearch.py +5 -2
- promnesia/sources/rss.py +6 -8
- promnesia/sources/shellcmd.py +21 -11
- promnesia/sources/signal.py +27 -26
- promnesia/sources/smscalls.py +2 -3
- promnesia/sources/stackexchange.py +5 -4
- promnesia/sources/takeout.py +37 -34
- promnesia/sources/takeout_legacy.py +29 -19
- promnesia/sources/telegram.py +18 -12
- promnesia/sources/telegram_legacy.py +22 -11
- promnesia/sources/twitter.py +7 -6
- promnesia/sources/vcs.py +11 -6
- promnesia/sources/viber.py +11 -10
- promnesia/sources/website.py +8 -7
- promnesia/sources/zulip.py +3 -2
- promnesia/sqlite.py +13 -7
- promnesia/tests/common.py +10 -5
- promnesia/tests/server_helper.py +13 -10
- promnesia/tests/sources/test_auto.py +2 -3
- promnesia/tests/sources/test_filetypes.py +11 -8
- promnesia/tests/sources/test_hypothesis.py +10 -6
- promnesia/tests/sources/test_org.py +9 -5
- promnesia/tests/sources/test_plaintext.py +9 -8
- promnesia/tests/sources/test_shellcmd.py +13 -13
- promnesia/tests/sources/test_takeout.py +3 -5
- promnesia/tests/test_cannon.py +256 -239
- promnesia/tests/test_cli.py +12 -8
- promnesia/tests/test_compare.py +17 -13
- promnesia/tests/test_config.py +7 -8
- promnesia/tests/test_db_dump.py +15 -15
- promnesia/tests/test_extract.py +17 -10
- promnesia/tests/test_indexer.py +24 -18
- promnesia/tests/test_server.py +12 -13
- promnesia/tests/test_traverse.py +0 -2
- promnesia/tests/utils.py +3 -7
- promnesia-1.4.20250909.dist-info/METADATA +66 -0
- promnesia-1.4.20250909.dist-info/RECORD +80 -0
- {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info}/WHEEL +1 -2
- promnesia/kjson.py +0 -121
- promnesia/sources/__init__.pyi +0 -0
- promnesia-1.2.20240810.dist-info/METADATA +0 -54
- promnesia-1.2.20240810.dist-info/RECORD +0 -83
- promnesia-1.2.20240810.dist-info/top_level.txt +0 -1
- {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info}/entry_points.txt +0 -0
- {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info/licenses}/LICENSE +0 -0
promnesia/common.py
CHANGED
@@ -1,51 +1,55 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from contextlib import contextmanager
|
4
|
-
from datetime import datetime, date
|
5
|
-
from functools import lru_cache
|
6
|
-
from glob import glob
|
7
3
|
import itertools
|
8
4
|
import logging
|
9
5
|
import os
|
10
|
-
|
6
|
+
import re
|
11
7
|
import shutil
|
12
|
-
|
8
|
+
import tempfile
|
9
|
+
import warnings
|
10
|
+
from collections.abc import Callable, Iterable, Sequence
|
11
|
+
from contextlib import contextmanager
|
12
|
+
from copy import copy
|
13
|
+
from datetime import date, datetime, timezone
|
14
|
+
from functools import lru_cache
|
15
|
+
from glob import glob
|
16
|
+
from pathlib import Path
|
17
|
+
from subprocess import PIPE, Popen, run
|
13
18
|
from timeit import default_timer as timer
|
14
19
|
from types import ModuleType
|
15
|
-
from typing import
|
16
|
-
import
|
20
|
+
from typing import TYPE_CHECKING, NamedTuple, TypeAlias, TypeVar
|
21
|
+
from zoneinfo import ZoneInfo
|
17
22
|
|
23
|
+
import platformdirs
|
18
24
|
from more_itertools import intersperse
|
19
|
-
import pytz
|
20
25
|
|
21
26
|
from .cannon import canonify
|
22
|
-
from .compat import removeprefix
|
23
|
-
|
24
27
|
|
25
28
|
_is_windows = os.name == 'nt'
|
26
29
|
|
27
30
|
T = TypeVar('T')
|
28
|
-
Res =
|
31
|
+
Res: TypeAlias = T | Exception
|
29
32
|
|
30
|
-
PathIsh =
|
33
|
+
PathIsh = str | Path
|
31
34
|
|
32
35
|
Url = str
|
33
36
|
SourceName = str
|
34
|
-
DatetimeIsh =
|
37
|
+
DatetimeIsh = datetime | date
|
35
38
|
Context = str
|
36
39
|
Second = int
|
37
40
|
|
41
|
+
|
38
42
|
# TODO hmm. arguably, source and context are almost same things...
|
39
43
|
class Loc(NamedTuple):
|
40
44
|
title: str
|
41
|
-
href:
|
45
|
+
href: str | None = None
|
42
46
|
|
43
47
|
@classmethod
|
44
|
-
def make(cls, title: str, href:
|
48
|
+
def make(cls, title: str, href: str | None = None) -> Loc:
|
45
49
|
return cls(title=title, href=href)
|
46
50
|
|
47
51
|
@classmethod
|
48
|
-
def file(cls, path: PathIsh, line:
|
52
|
+
def file(cls, path: PathIsh, line: int | None = None, relative_to: Path | None = None) -> Loc:
|
49
53
|
lstr = '' if line is None else f':{line}'
|
50
54
|
# todo loc should be url encoded? dunno.
|
51
55
|
# or use line=? eh. I don't know. Just ask in issues.
|
@@ -53,11 +57,11 @@ class Loc(NamedTuple):
|
|
53
57
|
# todo: handler has to be overridable by config. This is needed for docker, but also for a "as a service" install, where the sources would be available on some remote webserver
|
54
58
|
# maybe it should be treated as a format string, so that {line} may be a part of the result or not.
|
55
59
|
# for local usage, editor:///file:line works, but if the txt file is only available through http, it breaks.
|
56
|
-
#if get_config().MIME_HANDLER:
|
60
|
+
# if get_config().MIME_HANDLER:
|
57
61
|
# handler = get_config().MIME_HANDLER
|
58
|
-
#if True:
|
62
|
+
# if True:
|
59
63
|
# handler = 'editor:///home/koom/promnesia/docker/'
|
60
|
-
#else:
|
64
|
+
# else:
|
61
65
|
handler = _detect_mime_handler()
|
62
66
|
|
63
67
|
rel = Path(path)
|
@@ -65,13 +69,10 @@ class Loc(NamedTuple):
|
|
65
69
|
try:
|
66
70
|
# making it relative is a bit nicer for display
|
67
71
|
rel = rel.relative_to(relative_to)
|
68
|
-
except Exception
|
69
|
-
pass
|
72
|
+
except Exception:
|
73
|
+
pass # todo log/warn?
|
70
74
|
loc = f'{rel}{lstr}'
|
71
|
-
return cls.make(
|
72
|
-
title=loc,
|
73
|
-
href=f'{handler}{path}{lstr}'
|
74
|
-
)
|
75
|
+
return cls.make(title=loc, href=f'{handler}{path}{lstr}')
|
75
76
|
|
76
77
|
# TODO need some uniform way of string conversion
|
77
78
|
# but generally, it will be
|
@@ -87,19 +88,21 @@ def warn_once(message: str) -> None:
|
|
87
88
|
|
88
89
|
|
89
90
|
def _warn_no_xdg_mime() -> None:
|
90
|
-
warn_once(
|
91
|
+
warn_once(
|
92
|
+
"No xdg-mime on your OS! If you're on OSX, perhaps you can help me! https://github.com/karlicoss/open-in-editor/issues/1"
|
93
|
+
)
|
91
94
|
|
92
95
|
|
93
96
|
@lru_cache(1)
|
94
97
|
def _detect_mime_handler() -> str:
|
95
98
|
def exists(what: str) -> bool:
|
96
99
|
try:
|
97
|
-
r = run(f'xdg-mime query default x-scheme-handler/{what}'.split(), stdout=PIPE)
|
100
|
+
r = run(f'xdg-mime query default x-scheme-handler/{what}'.split(), stdout=PIPE, check=False)
|
98
101
|
except (FileNotFoundError, NotADirectoryError): # ugh seems that osx might throw NotADirectory for some reason
|
99
102
|
_warn_no_xdg_mime()
|
100
103
|
return False
|
101
104
|
if r.returncode > 0:
|
102
|
-
warnings.warn('xdg-mime failed')
|
105
|
+
warnings.warn('xdg-mime failed') # hopefully rest is in stderr
|
103
106
|
return False
|
104
107
|
# todo not sure if should check=True or something
|
105
108
|
handler = r.stdout.decode('utf8').strip()
|
@@ -108,11 +111,13 @@ def _detect_mime_handler() -> str:
|
|
108
111
|
# 1. detect legacy 'emacs:' handler (so it doesn't break for existing users)
|
109
112
|
result = None
|
110
113
|
if exists('emacs'):
|
111
|
-
warnings.warn(
|
114
|
+
warnings.warn(
|
115
|
+
'''
|
112
116
|
'emacs:' handler is deprecated!
|
113
117
|
Please use newer version at https://github.com/karlicoss/open-in-editor
|
114
118
|
And remove the old one (most likely, rm ~/.local/share/applications/mimemacs.desktop && update-desktop-database ~/.local/share/applications).
|
115
|
-
'''.rstrip()
|
119
|
+
'''.rstrip()
|
120
|
+
)
|
116
121
|
result = 'emacs:'
|
117
122
|
|
118
123
|
# 2. now try to use newer editor:// thing
|
@@ -120,10 +125,12 @@ def _detect_mime_handler() -> str:
|
|
120
125
|
|
121
126
|
# TODO would be nice to collect warnings and display at the end
|
122
127
|
if not exists('editor'):
|
123
|
-
warnings.warn(
|
128
|
+
warnings.warn(
|
129
|
+
'''
|
124
130
|
You might want to install https://github.com/karlicoss/open-in-editor
|
125
131
|
So you can jump to your text files straight from the browser
|
126
|
-
'''.rstrip()
|
132
|
+
'''.rstrip()
|
133
|
+
)
|
127
134
|
else:
|
128
135
|
result = 'editor://'
|
129
136
|
|
@@ -139,39 +146,41 @@ class Visit(NamedTuple):
|
|
139
146
|
# TODO back to DatetimeIsh, but somehow make compatible to dbcache?
|
140
147
|
dt: datetime
|
141
148
|
locator: Loc
|
142
|
-
context:
|
143
|
-
duration:
|
149
|
+
context: Context | None = None
|
150
|
+
duration: Second | None = None
|
144
151
|
# TODO shit. I need to insert it in chrome db....
|
145
152
|
# TODO gonna be hard to fill retroactively.
|
146
153
|
# spent: Optional[Second] = None
|
147
|
-
debug:
|
154
|
+
debug: str | None = None
|
155
|
+
|
148
156
|
|
149
|
-
Result =
|
157
|
+
Result = Visit | Exception
|
150
158
|
Results = Iterable[Result]
|
151
159
|
Extractor = Callable[[], Results]
|
152
160
|
|
153
161
|
Extraction = Result # TODO deprecate!
|
154
162
|
|
163
|
+
|
155
164
|
class DbVisit(NamedTuple):
|
156
165
|
norm_url: Url
|
157
166
|
orig_url: Url
|
158
167
|
dt: datetime
|
159
168
|
locator: Loc
|
160
|
-
src:
|
161
|
-
context:
|
162
|
-
duration:
|
169
|
+
src: SourceName | None = None
|
170
|
+
context: Context | None = None
|
171
|
+
duration: Second | None = None
|
163
172
|
|
164
173
|
@staticmethod
|
165
|
-
def make(p: Visit, src: SourceName) -> Res[
|
174
|
+
def make(p: Visit, src: SourceName) -> Res[DbVisit]:
|
166
175
|
try:
|
167
176
|
# hmm, mypy gets a bit confused here.. presumably because datetime is always datetime (but date is not datetime)
|
168
177
|
if isinstance(p.dt, datetime):
|
169
178
|
dt = p.dt
|
170
179
|
elif isinstance(p.dt, date):
|
171
180
|
# TODO that won't be with timezone..
|
172
|
-
dt = datetime.combine(p.dt, datetime.min.time())
|
181
|
+
dt = datetime.combine(p.dt, datetime.min.time()) # meh..
|
173
182
|
else:
|
174
|
-
raise
|
183
|
+
raise TypeError(f'unexpected date: {p.dt}, {type(p.dt)}') # noqa: TRY301
|
175
184
|
except Exception as e:
|
176
185
|
return e
|
177
186
|
|
@@ -196,35 +205,37 @@ Filter = Callable[[Url], bool]
|
|
196
205
|
|
197
206
|
|
198
207
|
from .logging import LazyLogger
|
208
|
+
|
199
209
|
logger = LazyLogger('promnesia', level='DEBUG')
|
200
210
|
|
211
|
+
|
201
212
|
def get_logger() -> logging.Logger:
|
202
213
|
# deprecate? no need since logger is lazy already
|
203
214
|
return logger
|
204
215
|
|
205
216
|
|
206
|
-
|
207
|
-
import tempfile
|
208
217
|
# kinda singleton
|
209
218
|
@lru_cache(1)
|
210
219
|
def get_tmpdir() -> tempfile.TemporaryDirectory[str]:
|
211
|
-
# todo use
|
220
|
+
# todo use platformdirs?
|
212
221
|
tdir = tempfile.TemporaryDirectory(suffix="promnesia")
|
213
222
|
return tdir
|
214
223
|
|
224
|
+
|
215
225
|
# TODO use mypy literal?
|
216
226
|
Syntax = str
|
217
227
|
|
218
228
|
|
219
229
|
@lru_cache(None)
|
220
230
|
def _get_urlextractor(syntax: Syntax):
|
221
|
-
from urlextract import URLExtract
|
231
|
+
from urlextract import URLExtract # type: ignore[import-untyped]
|
232
|
+
|
222
233
|
u = URLExtract()
|
223
234
|
# https://github.com/lipoja/URLExtract/issues/13
|
224
|
-
if syntax in {'org', 'orgmode', 'org-mode'}:
|
235
|
+
if syntax in {'org', 'orgmode', 'org-mode'}: # TODO remove hardcoding..
|
225
236
|
# handle org-mode links properly..
|
226
237
|
u._stop_chars_right |= {'[', ']'}
|
227
|
-
u._stop_chars_left
|
238
|
+
u._stop_chars_left |= {'[', ']'}
|
228
239
|
elif syntax in {'md', 'markdown'}:
|
229
240
|
pass
|
230
241
|
# u._stop_chars_right |= {','}
|
@@ -242,19 +253,19 @@ def _sanitize(url: str) -> str:
|
|
242
253
|
return url
|
243
254
|
|
244
255
|
|
245
|
-
def iter_urls(s: str, *, syntax: Syntax='') -> Iterable[Url]:
|
256
|
+
def iter_urls(s: str, *, syntax: Syntax = '') -> Iterable[Url]:
|
246
257
|
urlextractor = _get_urlextractor(syntax=syntax)
|
247
258
|
# note: it also has get_indices, might be useful
|
248
259
|
for u in urlextractor.gen_urls(s):
|
249
260
|
yield _sanitize(u)
|
250
261
|
|
251
262
|
|
252
|
-
def extract_urls(s: str, *, syntax: Syntax='') ->
|
263
|
+
def extract_urls(s: str, *, syntax: Syntax = '') -> list[Url]:
|
253
264
|
return list(iter_urls(s=s, syntax=syntax))
|
254
265
|
|
255
266
|
|
256
267
|
def from_epoch(ts: int) -> datetime:
|
257
|
-
return datetime.fromtimestamp(ts, tz=
|
268
|
+
return datetime.fromtimestamp(ts, tz=timezone.utc)
|
258
269
|
|
259
270
|
|
260
271
|
def join_tags(tags: Iterable[str]) -> str:
|
@@ -274,7 +285,7 @@ class PathWithMtime(NamedTuple):
|
|
274
285
|
mtime: float
|
275
286
|
|
276
287
|
@classmethod
|
277
|
-
def make(cls, p: Path) ->
|
288
|
+
def make(cls, p: Path) -> PathWithMtime:
|
278
289
|
return cls(
|
279
290
|
path=p,
|
280
291
|
mtime=p.stat().st_mtime,
|
@@ -285,10 +296,7 @@ class PathWithMtime(NamedTuple):
|
|
285
296
|
PreExtractor = Callable[..., Results]
|
286
297
|
|
287
298
|
|
288
|
-
PreSource =
|
289
|
-
PreExtractor,
|
290
|
-
ModuleType, # module with 'index' functon defined in it
|
291
|
-
]
|
299
|
+
PreSource = PreExtractor | ModuleType # module with 'index' functon defined in it
|
292
300
|
|
293
301
|
|
294
302
|
# todo not sure about this...
|
@@ -300,7 +308,7 @@ def _guess_name(thing: PreSource) -> str:
|
|
300
308
|
guess = thing.__module__
|
301
309
|
|
302
310
|
dflt = 'promnesia.sources.'
|
303
|
-
guess = removeprefix(
|
311
|
+
guess = guess.removeprefix(dflt)
|
304
312
|
if guess == 'config':
|
305
313
|
# this happens when we define a lambda in config or something without properly wrapping in Source
|
306
314
|
logger.warning(f'Inferred source name "config" for {thing}. This might be misleading TODO')
|
@@ -320,7 +328,7 @@ def _get_index_function(sourceish: PreSource) -> PreExtractor:
|
|
320
328
|
class Source:
|
321
329
|
# TODO make sure it works with empty src?
|
322
330
|
# TODO later, make it properly optional?
|
323
|
-
def __init__(self, ff: PreSource, *args, src: SourceName='', name: SourceName='', **kwargs) -> None:
|
331
|
+
def __init__(self, ff: PreSource, *args, src: SourceName = '', name: SourceName = '', **kwargs) -> None:
|
324
332
|
# NOTE: in principle, would be nice to make the Source countructor to be as dumb as possible
|
325
333
|
# so we could move _get_index_function inside extractor lambda
|
326
334
|
# but that way we get nicer error reporting
|
@@ -354,6 +362,7 @@ class Source:
|
|
354
362
|
# TODO deprecated!
|
355
363
|
return self.name
|
356
364
|
|
365
|
+
|
357
366
|
# TODO deprecated
|
358
367
|
Indexer = Source
|
359
368
|
|
@@ -362,13 +371,15 @@ Indexer = Source
|
|
362
371
|
# NOTE: used in configs...
|
363
372
|
def last(path: PathIsh, *parts: str) -> Path:
|
364
373
|
import os.path
|
365
|
-
pp = os.path.join(str(path), *parts)
|
366
|
-
return Path(max(glob(pp, recursive=True)))
|
367
374
|
|
375
|
+
pp = os.path.join(str(path), *parts) # noqa: PTH118
|
376
|
+
return Path(max(glob(pp, recursive=True))) # noqa: PTH207
|
368
377
|
|
369
|
-
from .logging import setup_logger
|
370
378
|
|
371
|
-
from
|
379
|
+
from .logging import setup_logger # noqa: F401
|
380
|
+
|
381
|
+
|
382
|
+
# TODO get rid of this? not sure if still necessary
|
372
383
|
def echain(ex: Exception, cause: Exception) -> Exception:
|
373
384
|
e = copy(ex)
|
374
385
|
e.__cause__ = cause
|
@@ -382,50 +393,48 @@ def echain(ex: Exception, cause: Exception) -> Exception:
|
|
382
393
|
|
383
394
|
def slugify(x: str) -> str:
|
384
395
|
# https://stackoverflow.com/a/38766141/706389
|
385
|
-
import re
|
386
396
|
valid_file_name = re.sub(r'[^\w_.)( -]', '', x)
|
387
397
|
return valid_file_name
|
388
398
|
|
389
399
|
|
390
400
|
# todo cache?
|
391
|
-
def
|
401
|
+
def _platformdirs() -> platformdirs.PlatformDirs:
|
392
402
|
under_test = os.environ.get('PYTEST_CURRENT_TEST') is not None
|
393
403
|
# todo actually use test name?
|
394
404
|
name = 'promnesia-test' if under_test else 'promnesia'
|
395
|
-
|
396
|
-
return ad.AppDirs(appname=name)
|
405
|
+
return platformdirs.PlatformDirs(appname=name)
|
397
406
|
|
398
407
|
|
399
408
|
def default_output_dir() -> Path:
|
400
409
|
# TODO: on Windows, there are two extra subdirectories (<AppAuthor>\<AppName>)
|
401
410
|
# perhaps makes sense to create it here with parents to avoid issues downstream?
|
402
|
-
return Path(
|
411
|
+
return Path(_platformdirs().user_data_dir)
|
403
412
|
|
404
413
|
|
405
414
|
def default_cache_dir() -> Path:
|
406
|
-
return Path(
|
415
|
+
return Path(_platformdirs().user_cache_dir)
|
407
416
|
|
408
417
|
|
409
418
|
# make it lazy, otherwise it might crash on module import (e.g. on Windows)
|
410
419
|
# ideally would be nice to fix it properly https://github.com/ahupp/python-magic#windows
|
411
420
|
@lru_cache(1)
|
412
|
-
def _magic() -> Callable[[PathIsh],
|
421
|
+
def _magic() -> Callable[[PathIsh], str | None]:
|
413
422
|
logger = get_logger()
|
414
423
|
try:
|
415
|
-
import magic
|
424
|
+
import magic # type: ignore[import-not-found]
|
416
425
|
except Exception as e:
|
417
426
|
logger.exception(e)
|
418
|
-
defensive_msg:
|
427
|
+
defensive_msg: str | None = None
|
419
428
|
if isinstance(e, ModuleNotFoundError) and e.name == 'magic':
|
420
429
|
defensive_msg = "python-magic is not detected. It's recommended for better file type detection (pip3 install --user python-magic). See https://github.com/ahupp/python-magic#installation"
|
421
430
|
elif isinstance(e, ImportError):
|
422
|
-
emsg = getattr(e, 'msg', '')
|
423
|
-
if 'failed to find libmagic' in emsg:
|
431
|
+
emsg = getattr(e, 'msg', '') # make mypy happy
|
432
|
+
if 'failed to find libmagic' in emsg: # probably the actual library is missing?...
|
424
433
|
defensive_msg = "couldn't import magic. See https://github.com/ahupp/python-magic#installation"
|
425
434
|
if defensive_msg is not None:
|
426
435
|
logger.warning(defensive_msg)
|
427
436
|
warnings.warn(defensive_msg)
|
428
|
-
return lambda path: None #
|
437
|
+
return lambda path: None # stub # noqa: ARG005
|
429
438
|
else:
|
430
439
|
raise e
|
431
440
|
else:
|
@@ -437,11 +446,12 @@ def _magic() -> Callable[[PathIsh], Optional[str]]:
|
|
437
446
|
@lru_cache(1)
|
438
447
|
def _mimetypes():
|
439
448
|
import mimetypes
|
449
|
+
|
440
450
|
mimetypes.init()
|
441
451
|
return mimetypes
|
442
452
|
|
443
453
|
|
444
|
-
def mime(path: PathIsh) ->
|
454
|
+
def mime(path: PathIsh) -> str | None:
|
445
455
|
ps = str(path)
|
446
456
|
mimetypes = _mimetypes()
|
447
457
|
# first try mimetypes, it's only using the filename without opening the file
|
@@ -453,7 +463,7 @@ def mime(path: PathIsh) -> Optional[str]:
|
|
453
463
|
return magic(ps)
|
454
464
|
|
455
465
|
|
456
|
-
def find_args(root: Path, follow: bool, ignore:
|
466
|
+
def find_args(root: Path, *, follow: bool, ignore: Sequence[str] = ()) -> list[str]:
|
457
467
|
prune_dir_args = []
|
458
468
|
ignore_file_args = []
|
459
469
|
if ignore:
|
@@ -473,10 +483,10 @@ def find_args(root: Path, follow: bool, ignore: List[str]=[]) -> List[str]:
|
|
473
483
|
*prune_dir_args,
|
474
484
|
'-type', 'f',
|
475
485
|
*ignore_file_args
|
476
|
-
]
|
486
|
+
] # fmt: skip
|
477
487
|
|
478
488
|
|
479
|
-
def fdfind_args(root: Path, follow: bool, ignore:
|
489
|
+
def fdfind_args(root: Path, *, follow: bool, ignore: Sequence[str] = ()) -> list[str]:
|
480
490
|
from .config import extra_fd_args
|
481
491
|
|
482
492
|
ignore_args = []
|
@@ -493,10 +503,10 @@ def fdfind_args(root: Path, follow: bool, ignore: List[str]=[]) -> List[str]:
|
|
493
503
|
'--type', 'f',
|
494
504
|
'.',
|
495
505
|
str(root),
|
496
|
-
]
|
506
|
+
] # fmt: skip
|
497
507
|
|
498
508
|
|
499
|
-
def traverse(root: Path, *, follow: bool=True, ignore:
|
509
|
+
def traverse(root: Path, *, follow: bool = True, ignore: Sequence[str] = ()) -> Iterable[Path]:
|
500
510
|
if not root.is_dir():
|
501
511
|
yield root
|
502
512
|
return
|
@@ -515,12 +525,14 @@ def traverse(root: Path, *, follow: bool=True, ignore: List[str]=[]) -> Iterable
|
|
515
525
|
|
516
526
|
cmd = ['find', *find_args(root, follow=follow, ignore=ignore)]
|
517
527
|
# try to use fd.. it cooperates well with gitignore etc, also faster than find
|
518
|
-
for x in ('fd', 'fd-find', 'fdfind'):
|
528
|
+
for x in ('fd', 'fd-find', 'fdfind'): # has different names on different dists..
|
519
529
|
if shutil.which(x):
|
520
530
|
cmd = [x, *fdfind_args(root, follow=follow, ignore=ignore)]
|
521
531
|
break
|
522
532
|
else:
|
523
|
-
warnings.warn(
|
533
|
+
warnings.warn(
|
534
|
+
"'fdfind' is recommended for the best indexing performance. See https://github.com/sharkdp/fd#installation. Falling back to 'find'"
|
535
|
+
)
|
524
536
|
|
525
537
|
logger.debug('running: %s', cmd)
|
526
538
|
# TODO split by \0?
|
@@ -537,6 +549,7 @@ def traverse(root: Path, *, follow: bool=True, ignore: List[str]=[]) -> Iterable
|
|
537
549
|
def get_system_zone() -> str:
|
538
550
|
try:
|
539
551
|
import tzlocal
|
552
|
+
|
540
553
|
return tzlocal.get_localzone_name()
|
541
554
|
except Exception as e:
|
542
555
|
logger.exception(e)
|
@@ -545,14 +558,15 @@ def get_system_zone() -> str:
|
|
545
558
|
|
546
559
|
|
547
560
|
@lru_cache(1)
|
548
|
-
def get_system_tz() ->
|
561
|
+
def get_system_tz() -> ZoneInfo:
|
549
562
|
zone = get_system_zone()
|
550
563
|
try:
|
551
|
-
return
|
564
|
+
return ZoneInfo(zone)
|
552
565
|
except Exception as e:
|
553
566
|
logger.exception(e)
|
554
567
|
logger.error("Unknown time zone %s. Falling back to UTC. Please report this as a bug!", zone)
|
555
|
-
return
|
568
|
+
return ZoneInfo('UTC')
|
569
|
+
|
556
570
|
|
557
571
|
# used in misc/install_server.py
|
558
572
|
def root() -> Path:
|
@@ -574,7 +588,7 @@ def user_config_file() -> Path:
|
|
574
588
|
if "PROMNESIA_CONFIG" in os.environ:
|
575
589
|
return Path(os.environ["PROMNESIA_CONFIG"])
|
576
590
|
else:
|
577
|
-
return Path(
|
591
|
+
return Path(_platformdirs().user_config_dir) / 'config.py'
|
578
592
|
|
579
593
|
|
580
594
|
def default_config_path() -> Path:
|
@@ -589,7 +603,7 @@ def default_config_path() -> Path:
|
|
589
603
|
|
590
604
|
|
591
605
|
@contextmanager
|
592
|
-
def measure(tag: str='', *, logger: logging.Logger, unit: str='ms'):
|
606
|
+
def measure(tag: str = '', *, logger: logging.Logger, unit: str = 'ms'):
|
593
607
|
before = timer()
|
594
608
|
yield lambda: timer() - before
|
595
609
|
after = timer()
|
@@ -605,3 +619,8 @@ def is_sqlite_db(x: Path) -> bool:
|
|
605
619
|
'application/vnd.sqlite3',
|
606
620
|
# TODO this mime can also match wal files/journals, not sure
|
607
621
|
}
|
622
|
+
|
623
|
+
|
624
|
+
if not TYPE_CHECKING:
|
625
|
+
# todo deprecate properly --just backwards compat
|
626
|
+
from .compat import removeprefix # noqa: F401
|
promnesia/compare.py
CHANGED
@@ -1,69 +1,71 @@
|
|
1
|
-
|
1
|
+
from __future__ import annotations
|
2
|
+
|
2
3
|
# TODO perhaps make it external script?
|
3
4
|
import argparse
|
4
|
-
from pathlib import Path
|
5
5
|
import logging
|
6
6
|
import sys
|
7
|
-
from
|
8
|
-
|
7
|
+
from collections.abc import Iterator, Sequence
|
8
|
+
from pathlib import Path
|
9
|
+
from typing import TypeVar
|
9
10
|
|
10
|
-
from .common import DbVisit,
|
11
|
+
from .common import DbVisit, PathWithMtime, Url
|
11
12
|
from .database.load import row_to_db_visit
|
12
13
|
|
13
14
|
# TODO include latest too?
|
14
15
|
# from cconfig import ignore, filtered
|
15
16
|
|
17
|
+
|
16
18
|
def get_logger():
|
17
19
|
return logging.getLogger('promnesia-db-changes')
|
18
20
|
|
19
|
-
# TODO return error depending on severity?
|
20
|
-
|
21
21
|
|
22
|
-
|
22
|
+
# TODO return error depending on severity?
|
23
23
|
|
24
24
|
|
25
25
|
T = TypeVar('T')
|
26
26
|
|
27
|
+
|
27
28
|
def eliminate_by(sa: Sequence[T], sb: Sequence[T], key):
|
28
|
-
def make_dict(s: Sequence[T]) ->
|
29
|
-
res:
|
29
|
+
def make_dict(s: Sequence[T]) -> dict[str, list[T]]:
|
30
|
+
res: dict[str, list[T]] = {}
|
30
31
|
for a in s:
|
31
32
|
k = key(a)
|
32
|
-
ll = res.get(k
|
33
|
+
ll = res.get(k)
|
33
34
|
if ll is None:
|
34
35
|
ll = []
|
35
36
|
res[k] = ll
|
36
37
|
ll.append(a)
|
37
38
|
return res
|
39
|
+
|
38
40
|
da = make_dict(sa)
|
39
41
|
db = make_dict(sb)
|
40
42
|
ka = set(da.keys())
|
41
43
|
kb = set(db.keys())
|
42
|
-
onlya:
|
43
|
-
common:
|
44
|
-
onlyb:
|
44
|
+
onlya: set[T] = set()
|
45
|
+
common: set[T] = set()
|
46
|
+
onlyb: set[T] = set()
|
45
47
|
for k in ka.union(kb):
|
46
48
|
la = da.get(k, [])
|
47
49
|
lb = db.get(k, [])
|
48
|
-
common.update(la[:min(len(la), len(lb))])
|
50
|
+
common.update(la[: min(len(la), len(lb))])
|
49
51
|
if len(la) > len(lb):
|
50
|
-
onlya.update(la[len(lb):])
|
52
|
+
onlya.update(la[len(lb) :])
|
51
53
|
if len(lb) > len(la):
|
52
|
-
onlyb.update(lb[len(la):])
|
54
|
+
onlyb.update(lb[len(la) :])
|
53
55
|
|
54
56
|
return onlya, common, onlyb
|
55
57
|
|
56
58
|
|
57
|
-
def compare(before:
|
59
|
+
def compare(before: list[DbVisit], after: list[DbVisit], between: str, *, log=True) -> list[DbVisit]:
|
58
60
|
logger = get_logger()
|
59
61
|
logger.info('comparing between: %s', between)
|
60
62
|
|
61
|
-
errors:
|
63
|
+
errors: list[DbVisit] = []
|
62
64
|
|
63
|
-
umap:
|
65
|
+
umap: dict[Url, list[DbVisit]] = {}
|
64
66
|
for a in after:
|
65
67
|
url = a.norm_url
|
66
|
-
xx = umap.get(url, [])
|
68
|
+
xx = umap.get(url, []) # TODO canonify here?
|
67
69
|
xx.append(a)
|
68
70
|
umap[url] = xx
|
69
71
|
|
@@ -71,8 +73,7 @@ def compare(before: List[DbVisit], after: List[DbVisit], between: str, *, log=Tr
|
|
71
73
|
errors.append(b)
|
72
74
|
if log:
|
73
75
|
logger.error('between %s missing %s', between, b)
|
74
|
-
print('ignoreline "
|
75
|
-
|
76
|
+
print('ignoreline "{}", # {} {}'.format('exid', b.norm_url, b.src), file=sys.stderr)
|
76
77
|
|
77
78
|
# the idea is that we eliminate items simultaneously from both sets
|
78
79
|
eliminations = [
|
@@ -80,7 +81,7 @@ def compare(before: List[DbVisit], after: List[DbVisit], between: str, *, log=Tr
|
|
80
81
|
('without dt' , lambda x: x._replace(src='', dt='')),
|
81
82
|
('without context' , lambda x: x._replace(src='', context='', locator='')),
|
82
83
|
('without dt and context' , lambda x: x._replace(src='', dt='', context='', locator='')),
|
83
|
-
]
|
84
|
+
] # fmt: skip
|
84
85
|
for ename, ekey in eliminations:
|
85
86
|
logger.info('eliminating by %s', ename)
|
86
87
|
logger.info('before: %d, after: %d', len(before), len(after))
|
@@ -96,6 +97,7 @@ def compare(before: List[DbVisit], after: List[DbVisit], between: str, *, log=Tr
|
|
96
97
|
|
97
98
|
return errors
|
98
99
|
|
100
|
+
|
99
101
|
def setup_parser(p):
|
100
102
|
# TODO better name?
|
101
103
|
p.add_argument('--intermediate-dir', type=Path)
|
@@ -108,8 +110,8 @@ def get_files(args):
|
|
108
110
|
if len(args.paths) == 0:
|
109
111
|
int_dir = args.intermediate_dir
|
110
112
|
assert int_dir.exists()
|
111
|
-
files =
|
112
|
-
files = files[-args.last:]
|
113
|
+
files = sorted(int_dir.glob('*.sqlite*'))
|
114
|
+
files = files[-args.last :]
|
113
115
|
else:
|
114
116
|
files = [Path(p) for p in args.paths]
|
115
117
|
return files
|
@@ -126,7 +128,7 @@ def main():
|
|
126
128
|
sys.exit(1)
|
127
129
|
|
128
130
|
|
129
|
-
def compare_files(*files: Path, log=True) -> Iterator[
|
131
|
+
def compare_files(*files: Path, log=True) -> Iterator[tuple[str, DbVisit]]:
|
130
132
|
assert len(files) > 0
|
131
133
|
|
132
134
|
logger = get_logger()
|
@@ -137,9 +139,10 @@ def compare_files(*files: Path, log=True) -> Iterator[Tuple[str, DbVisit]]:
|
|
137
139
|
for f in files:
|
138
140
|
logger.info('processing %r', f)
|
139
141
|
name = f.name
|
140
|
-
this_dts = name[0: name.index('.')]
|
142
|
+
this_dts = name[0 : name.index('.')] # can't use stem due to multiple extensions..
|
143
|
+
|
144
|
+
from promnesia.server import _get_stuff # TODO ugh
|
141
145
|
|
142
|
-
from promnesia.server import _get_stuff # TODO ugh
|
143
146
|
engine, table = _get_stuff(PathWithMtime.make(f))
|
144
147
|
|
145
148
|
with engine.connect() as conn:
|
@@ -153,6 +156,6 @@ def compare_files(*files: Path, log=True) -> Iterator[Tuple[str, DbVisit]]:
|
|
153
156
|
last = vis
|
154
157
|
last_dts = this_dts
|
155
158
|
|
159
|
+
|
156
160
|
if __name__ == '__main__':
|
157
161
|
main()
|
158
|
-
|