promnesia 1.3.20241021__py3-none-any.whl → 1.4.20250909__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promnesia/__init__.py +4 -1
- promnesia/__main__.py +72 -59
- promnesia/cannon.py +90 -89
- promnesia/common.py +74 -62
- promnesia/compare.py +15 -10
- promnesia/config.py +22 -17
- promnesia/database/dump.py +1 -2
- promnesia/extract.py +6 -6
- promnesia/logging.py +27 -15
- promnesia/misc/install_server.py +25 -19
- promnesia/server.py +69 -53
- promnesia/sources/auto.py +65 -51
- promnesia/sources/browser.py +7 -2
- promnesia/sources/browser_legacy.py +51 -40
- promnesia/sources/demo.py +0 -1
- promnesia/sources/fbmessenger.py +0 -1
- promnesia/sources/filetypes.py +15 -11
- promnesia/sources/github.py +4 -1
- promnesia/sources/guess.py +4 -1
- promnesia/sources/hackernews.py +5 -7
- promnesia/sources/hpi.py +3 -1
- promnesia/sources/html.py +4 -2
- promnesia/sources/instapaper.py +1 -0
- promnesia/sources/markdown.py +4 -4
- promnesia/sources/org.py +17 -8
- promnesia/sources/plaintext.py +14 -11
- promnesia/sources/pocket.py +2 -1
- promnesia/sources/reddit.py +5 -8
- promnesia/sources/roamresearch.py +3 -1
- promnesia/sources/rss.py +4 -5
- promnesia/sources/shellcmd.py +3 -6
- promnesia/sources/signal.py +14 -14
- promnesia/sources/smscalls.py +0 -1
- promnesia/sources/stackexchange.py +2 -2
- promnesia/sources/takeout.py +14 -21
- promnesia/sources/takeout_legacy.py +16 -10
- promnesia/sources/telegram.py +7 -3
- promnesia/sources/telegram_legacy.py +5 -5
- promnesia/sources/twitter.py +1 -1
- promnesia/sources/vcs.py +6 -3
- promnesia/sources/viber.py +2 -2
- promnesia/sources/website.py +4 -3
- promnesia/sqlite.py +10 -7
- promnesia/tests/common.py +2 -0
- promnesia/tests/server_helper.py +2 -2
- promnesia/tests/sources/test_filetypes.py +9 -7
- promnesia/tests/sources/test_hypothesis.py +7 -3
- promnesia/tests/sources/test_org.py +7 -2
- promnesia/tests/sources/test_plaintext.py +9 -7
- promnesia/tests/sources/test_shellcmd.py +10 -9
- promnesia/tests/test_cannon.py +254 -237
- promnesia/tests/test_cli.py +8 -2
- promnesia/tests/test_compare.py +16 -12
- promnesia/tests/test_db_dump.py +4 -3
- promnesia/tests/test_extract.py +7 -4
- promnesia/tests/test_indexer.py +10 -10
- promnesia/tests/test_server.py +10 -10
- promnesia/tests/utils.py +1 -5
- promnesia-1.4.20250909.dist-info/METADATA +66 -0
- promnesia-1.4.20250909.dist-info/RECORD +80 -0
- {promnesia-1.3.20241021.dist-info → promnesia-1.4.20250909.dist-info}/WHEEL +1 -2
- promnesia/kjson.py +0 -122
- promnesia/sources/__init__.pyi +0 -0
- promnesia-1.3.20241021.dist-info/METADATA +0 -55
- promnesia-1.3.20241021.dist-info/RECORD +0 -83
- promnesia-1.3.20241021.dist-info/top_level.txt +0 -1
- {promnesia-1.3.20241021.dist-info → promnesia-1.4.20250909.dist-info}/entry_points.txt +0 -0
- {promnesia-1.3.20241021.dist-info → promnesia-1.4.20250909.dist-info/licenses}/LICENSE +0 -0
promnesia/common.py
CHANGED
@@ -7,19 +7,20 @@ import re
|
|
7
7
|
import shutil
|
8
8
|
import tempfile
|
9
9
|
import warnings
|
10
|
-
from collections.abc import Iterable, Sequence
|
10
|
+
from collections.abc import Callable, Iterable, Sequence
|
11
11
|
from contextlib import contextmanager
|
12
12
|
from copy import copy
|
13
|
-
from datetime import date, datetime
|
13
|
+
from datetime import date, datetime, timezone
|
14
14
|
from functools import lru_cache
|
15
15
|
from glob import glob
|
16
16
|
from pathlib import Path
|
17
17
|
from subprocess import PIPE, Popen, run
|
18
18
|
from timeit import default_timer as timer
|
19
19
|
from types import ModuleType
|
20
|
-
from typing import TYPE_CHECKING,
|
20
|
+
from typing import TYPE_CHECKING, NamedTuple, TypeAlias, TypeVar
|
21
|
+
from zoneinfo import ZoneInfo
|
21
22
|
|
22
|
-
import
|
23
|
+
import platformdirs
|
23
24
|
from more_itertools import intersperse
|
24
25
|
|
25
26
|
from .cannon import canonify
|
@@ -27,27 +28,28 @@ from .cannon import canonify
|
|
27
28
|
_is_windows = os.name == 'nt'
|
28
29
|
|
29
30
|
T = TypeVar('T')
|
30
|
-
Res =
|
31
|
+
Res: TypeAlias = T | Exception
|
31
32
|
|
32
|
-
PathIsh =
|
33
|
+
PathIsh = str | Path
|
33
34
|
|
34
35
|
Url = str
|
35
36
|
SourceName = str
|
36
|
-
DatetimeIsh =
|
37
|
+
DatetimeIsh = datetime | date
|
37
38
|
Context = str
|
38
39
|
Second = int
|
39
40
|
|
41
|
+
|
40
42
|
# TODO hmm. arguably, source and context are almost same things...
|
41
43
|
class Loc(NamedTuple):
|
42
44
|
title: str
|
43
|
-
href:
|
45
|
+
href: str | None = None
|
44
46
|
|
45
47
|
@classmethod
|
46
|
-
def make(cls, title: str, href: str | None=None) -> Loc:
|
48
|
+
def make(cls, title: str, href: str | None = None) -> Loc:
|
47
49
|
return cls(title=title, href=href)
|
48
50
|
|
49
51
|
@classmethod
|
50
|
-
def file(cls, path: PathIsh, line: int | None=None, relative_to: Path | None=None) -> Loc:
|
52
|
+
def file(cls, path: PathIsh, line: int | None = None, relative_to: Path | None = None) -> Loc:
|
51
53
|
lstr = '' if line is None else f':{line}'
|
52
54
|
# todo loc should be url encoded? dunno.
|
53
55
|
# or use line=? eh. I don't know. Just ask in issues.
|
@@ -55,11 +57,11 @@ class Loc(NamedTuple):
|
|
55
57
|
# todo: handler has to be overridable by config. This is needed for docker, but also for a "as a service" install, where the sources would be available on some remote webserver
|
56
58
|
# maybe it should be treated as a format string, so that {line} may be a part of the result or not.
|
57
59
|
# for local usage, editor:///file:line works, but if the txt file is only available through http, it breaks.
|
58
|
-
#if get_config().MIME_HANDLER:
|
60
|
+
# if get_config().MIME_HANDLER:
|
59
61
|
# handler = get_config().MIME_HANDLER
|
60
|
-
#if True:
|
62
|
+
# if True:
|
61
63
|
# handler = 'editor:///home/koom/promnesia/docker/'
|
62
|
-
#else:
|
64
|
+
# else:
|
63
65
|
handler = _detect_mime_handler()
|
64
66
|
|
65
67
|
rel = Path(path)
|
@@ -67,13 +69,10 @@ class Loc(NamedTuple):
|
|
67
69
|
try:
|
68
70
|
# making it relative is a bit nicer for display
|
69
71
|
rel = rel.relative_to(relative_to)
|
70
|
-
except Exception
|
71
|
-
pass
|
72
|
+
except Exception:
|
73
|
+
pass # todo log/warn?
|
72
74
|
loc = f'{rel}{lstr}'
|
73
|
-
return cls.make(
|
74
|
-
title=loc,
|
75
|
-
href=f'{handler}{path}{lstr}'
|
76
|
-
)
|
75
|
+
return cls.make(title=loc, href=f'{handler}{path}{lstr}')
|
77
76
|
|
78
77
|
# TODO need some uniform way of string conversion
|
79
78
|
# but generally, it will be
|
@@ -89,7 +88,9 @@ def warn_once(message: str) -> None:
|
|
89
88
|
|
90
89
|
|
91
90
|
def _warn_no_xdg_mime() -> None:
|
92
|
-
warn_once(
|
91
|
+
warn_once(
|
92
|
+
"No xdg-mime on your OS! If you're on OSX, perhaps you can help me! https://github.com/karlicoss/open-in-editor/issues/1"
|
93
|
+
)
|
93
94
|
|
94
95
|
|
95
96
|
@lru_cache(1)
|
@@ -101,7 +102,7 @@ def _detect_mime_handler() -> str:
|
|
101
102
|
_warn_no_xdg_mime()
|
102
103
|
return False
|
103
104
|
if r.returncode > 0:
|
104
|
-
warnings.warn('xdg-mime failed')
|
105
|
+
warnings.warn('xdg-mime failed') # hopefully rest is in stderr
|
105
106
|
return False
|
106
107
|
# todo not sure if should check=True or something
|
107
108
|
handler = r.stdout.decode('utf8').strip()
|
@@ -110,11 +111,13 @@ def _detect_mime_handler() -> str:
|
|
110
111
|
# 1. detect legacy 'emacs:' handler (so it doesn't break for existing users)
|
111
112
|
result = None
|
112
113
|
if exists('emacs'):
|
113
|
-
warnings.warn(
|
114
|
+
warnings.warn(
|
115
|
+
'''
|
114
116
|
'emacs:' handler is deprecated!
|
115
117
|
Please use newer version at https://github.com/karlicoss/open-in-editor
|
116
118
|
And remove the old one (most likely, rm ~/.local/share/applications/mimemacs.desktop && update-desktop-database ~/.local/share/applications).
|
117
|
-
'''.rstrip()
|
119
|
+
'''.rstrip()
|
120
|
+
)
|
118
121
|
result = 'emacs:'
|
119
122
|
|
120
123
|
# 2. now try to use newer editor:// thing
|
@@ -122,10 +125,12 @@ def _detect_mime_handler() -> str:
|
|
122
125
|
|
123
126
|
# TODO would be nice to collect warnings and display at the end
|
124
127
|
if not exists('editor'):
|
125
|
-
warnings.warn(
|
128
|
+
warnings.warn(
|
129
|
+
'''
|
126
130
|
You might want to install https://github.com/karlicoss/open-in-editor
|
127
131
|
So you can jump to your text files straight from the browser
|
128
|
-
'''.rstrip()
|
132
|
+
'''.rstrip()
|
133
|
+
)
|
129
134
|
else:
|
130
135
|
result = 'editor://'
|
131
136
|
|
@@ -148,20 +153,22 @@ class Visit(NamedTuple):
|
|
148
153
|
# spent: Optional[Second] = None
|
149
154
|
debug: str | None = None
|
150
155
|
|
151
|
-
|
156
|
+
|
157
|
+
Result = Visit | Exception
|
152
158
|
Results = Iterable[Result]
|
153
159
|
Extractor = Callable[[], Results]
|
154
160
|
|
155
161
|
Extraction = Result # TODO deprecate!
|
156
162
|
|
163
|
+
|
157
164
|
class DbVisit(NamedTuple):
|
158
165
|
norm_url: Url
|
159
166
|
orig_url: Url
|
160
167
|
dt: datetime
|
161
168
|
locator: Loc
|
162
|
-
src:
|
163
|
-
context:
|
164
|
-
duration:
|
169
|
+
src: SourceName | None = None
|
170
|
+
context: Context | None = None
|
171
|
+
duration: Second | None = None
|
165
172
|
|
166
173
|
@staticmethod
|
167
174
|
def make(p: Visit, src: SourceName) -> Res[DbVisit]:
|
@@ -171,9 +178,9 @@ class DbVisit(NamedTuple):
|
|
171
178
|
dt = p.dt
|
172
179
|
elif isinstance(p.dt, date):
|
173
180
|
# TODO that won't be with timezone..
|
174
|
-
dt = datetime.combine(p.dt, datetime.min.time())
|
181
|
+
dt = datetime.combine(p.dt, datetime.min.time()) # meh..
|
175
182
|
else:
|
176
|
-
raise
|
183
|
+
raise TypeError(f'unexpected date: {p.dt}, {type(p.dt)}') # noqa: TRY301
|
177
184
|
except Exception as e:
|
178
185
|
return e
|
179
186
|
|
@@ -201,32 +208,34 @@ from .logging import LazyLogger
|
|
201
208
|
|
202
209
|
logger = LazyLogger('promnesia', level='DEBUG')
|
203
210
|
|
211
|
+
|
204
212
|
def get_logger() -> logging.Logger:
|
205
213
|
# deprecate? no need since logger is lazy already
|
206
214
|
return logger
|
207
215
|
|
208
216
|
|
209
|
-
|
210
217
|
# kinda singleton
|
211
218
|
@lru_cache(1)
|
212
219
|
def get_tmpdir() -> tempfile.TemporaryDirectory[str]:
|
213
|
-
# todo use
|
220
|
+
# todo use platformdirs?
|
214
221
|
tdir = tempfile.TemporaryDirectory(suffix="promnesia")
|
215
222
|
return tdir
|
216
223
|
|
224
|
+
|
217
225
|
# TODO use mypy literal?
|
218
226
|
Syntax = str
|
219
227
|
|
220
228
|
|
221
229
|
@lru_cache(None)
|
222
230
|
def _get_urlextractor(syntax: Syntax):
|
223
|
-
from urlextract import URLExtract # type: ignore
|
231
|
+
from urlextract import URLExtract # type: ignore[import-untyped]
|
232
|
+
|
224
233
|
u = URLExtract()
|
225
234
|
# https://github.com/lipoja/URLExtract/issues/13
|
226
|
-
if syntax in {'org', 'orgmode', 'org-mode'}:
|
235
|
+
if syntax in {'org', 'orgmode', 'org-mode'}: # TODO remove hardcoding..
|
227
236
|
# handle org-mode links properly..
|
228
237
|
u._stop_chars_right |= {'[', ']'}
|
229
|
-
u._stop_chars_left
|
238
|
+
u._stop_chars_left |= {'[', ']'}
|
230
239
|
elif syntax in {'md', 'markdown'}:
|
231
240
|
pass
|
232
241
|
# u._stop_chars_right |= {','}
|
@@ -244,19 +253,19 @@ def _sanitize(url: str) -> str:
|
|
244
253
|
return url
|
245
254
|
|
246
255
|
|
247
|
-
def iter_urls(s: str, *, syntax: Syntax='') -> Iterable[Url]:
|
256
|
+
def iter_urls(s: str, *, syntax: Syntax = '') -> Iterable[Url]:
|
248
257
|
urlextractor = _get_urlextractor(syntax=syntax)
|
249
258
|
# note: it also has get_indices, might be useful
|
250
259
|
for u in urlextractor.gen_urls(s):
|
251
260
|
yield _sanitize(u)
|
252
261
|
|
253
262
|
|
254
|
-
def extract_urls(s: str, *, syntax: Syntax='') -> list[Url]:
|
263
|
+
def extract_urls(s: str, *, syntax: Syntax = '') -> list[Url]:
|
255
264
|
return list(iter_urls(s=s, syntax=syntax))
|
256
265
|
|
257
266
|
|
258
267
|
def from_epoch(ts: int) -> datetime:
|
259
|
-
return datetime.fromtimestamp(ts, tz=
|
268
|
+
return datetime.fromtimestamp(ts, tz=timezone.utc)
|
260
269
|
|
261
270
|
|
262
271
|
def join_tags(tags: Iterable[str]) -> str:
|
@@ -287,10 +296,7 @@ class PathWithMtime(NamedTuple):
|
|
287
296
|
PreExtractor = Callable[..., Results]
|
288
297
|
|
289
298
|
|
290
|
-
PreSource =
|
291
|
-
PreExtractor,
|
292
|
-
ModuleType, # module with 'index' functon defined in it
|
293
|
-
]
|
299
|
+
PreSource = PreExtractor | ModuleType # module with 'index' functon defined in it
|
294
300
|
|
295
301
|
|
296
302
|
# todo not sure about this...
|
@@ -322,7 +328,7 @@ def _get_index_function(sourceish: PreSource) -> PreExtractor:
|
|
322
328
|
class Source:
|
323
329
|
# TODO make sure it works with empty src?
|
324
330
|
# TODO later, make it properly optional?
|
325
|
-
def __init__(self, ff: PreSource, *args, src: SourceName='', name: SourceName='', **kwargs) -> None:
|
331
|
+
def __init__(self, ff: PreSource, *args, src: SourceName = '', name: SourceName = '', **kwargs) -> None:
|
326
332
|
# NOTE: in principle, would be nice to make the Source countructor to be as dumb as possible
|
327
333
|
# so we could move _get_index_function inside extractor lambda
|
328
334
|
# but that way we get nicer error reporting
|
@@ -356,6 +362,7 @@ class Source:
|
|
356
362
|
# TODO deprecated!
|
357
363
|
return self.name
|
358
364
|
|
365
|
+
|
359
366
|
# TODO deprecated
|
360
367
|
Indexer = Source
|
361
368
|
|
@@ -364,6 +371,7 @@ Indexer = Source
|
|
364
371
|
# NOTE: used in configs...
|
365
372
|
def last(path: PathIsh, *parts: str) -> Path:
|
366
373
|
import os.path
|
374
|
+
|
367
375
|
pp = os.path.join(str(path), *parts) # noqa: PTH118
|
368
376
|
return Path(max(glob(pp, recursive=True))) # noqa: PTH207
|
369
377
|
|
@@ -390,22 +398,21 @@ def slugify(x: str) -> str:
|
|
390
398
|
|
391
399
|
|
392
400
|
# todo cache?
|
393
|
-
def
|
401
|
+
def _platformdirs() -> platformdirs.PlatformDirs:
|
394
402
|
under_test = os.environ.get('PYTEST_CURRENT_TEST') is not None
|
395
403
|
# todo actually use test name?
|
396
404
|
name = 'promnesia-test' if under_test else 'promnesia'
|
397
|
-
|
398
|
-
return ad.AppDirs(appname=name)
|
405
|
+
return platformdirs.PlatformDirs(appname=name)
|
399
406
|
|
400
407
|
|
401
408
|
def default_output_dir() -> Path:
|
402
409
|
# TODO: on Windows, there are two extra subdirectories (<AppAuthor>\<AppName>)
|
403
410
|
# perhaps makes sense to create it here with parents to avoid issues downstream?
|
404
|
-
return Path(
|
411
|
+
return Path(_platformdirs().user_data_dir)
|
405
412
|
|
406
413
|
|
407
414
|
def default_cache_dir() -> Path:
|
408
|
-
return Path(
|
415
|
+
return Path(_platformdirs().user_cache_dir)
|
409
416
|
|
410
417
|
|
411
418
|
# make it lazy, otherwise it might crash on module import (e.g. on Windows)
|
@@ -414,15 +421,15 @@ def default_cache_dir() -> Path:
|
|
414
421
|
def _magic() -> Callable[[PathIsh], str | None]:
|
415
422
|
logger = get_logger()
|
416
423
|
try:
|
417
|
-
import magic # type: ignore
|
424
|
+
import magic # type: ignore[import-not-found]
|
418
425
|
except Exception as e:
|
419
426
|
logger.exception(e)
|
420
427
|
defensive_msg: str | None = None
|
421
428
|
if isinstance(e, ModuleNotFoundError) and e.name == 'magic':
|
422
429
|
defensive_msg = "python-magic is not detected. It's recommended for better file type detection (pip3 install --user python-magic). See https://github.com/ahupp/python-magic#installation"
|
423
430
|
elif isinstance(e, ImportError):
|
424
|
-
emsg = getattr(e, 'msg', '')
|
425
|
-
if 'failed to find libmagic' in emsg:
|
431
|
+
emsg = getattr(e, 'msg', '') # make mypy happy
|
432
|
+
if 'failed to find libmagic' in emsg: # probably the actual library is missing?...
|
426
433
|
defensive_msg = "couldn't import magic. See https://github.com/ahupp/python-magic#installation"
|
427
434
|
if defensive_msg is not None:
|
428
435
|
logger.warning(defensive_msg)
|
@@ -439,6 +446,7 @@ def _magic() -> Callable[[PathIsh], str | None]:
|
|
439
446
|
@lru_cache(1)
|
440
447
|
def _mimetypes():
|
441
448
|
import mimetypes
|
449
|
+
|
442
450
|
mimetypes.init()
|
443
451
|
return mimetypes
|
444
452
|
|
@@ -475,7 +483,7 @@ def find_args(root: Path, *, follow: bool, ignore: Sequence[str] = ()) -> list[s
|
|
475
483
|
*prune_dir_args,
|
476
484
|
'-type', 'f',
|
477
485
|
*ignore_file_args
|
478
|
-
]
|
486
|
+
] # fmt: skip
|
479
487
|
|
480
488
|
|
481
489
|
def fdfind_args(root: Path, *, follow: bool, ignore: Sequence[str] = ()) -> list[str]:
|
@@ -495,10 +503,10 @@ def fdfind_args(root: Path, *, follow: bool, ignore: Sequence[str] = ()) -> list
|
|
495
503
|
'--type', 'f',
|
496
504
|
'.',
|
497
505
|
str(root),
|
498
|
-
]
|
506
|
+
] # fmt: skip
|
499
507
|
|
500
508
|
|
501
|
-
def traverse(root: Path, *, follow: bool=True, ignore: Sequence[str] = ()) -> Iterable[Path]:
|
509
|
+
def traverse(root: Path, *, follow: bool = True, ignore: Sequence[str] = ()) -> Iterable[Path]:
|
502
510
|
if not root.is_dir():
|
503
511
|
yield root
|
504
512
|
return
|
@@ -517,12 +525,14 @@ def traverse(root: Path, *, follow: bool=True, ignore: Sequence[str] = ()) -> It
|
|
517
525
|
|
518
526
|
cmd = ['find', *find_args(root, follow=follow, ignore=ignore)]
|
519
527
|
# try to use fd.. it cooperates well with gitignore etc, also faster than find
|
520
|
-
for x in ('fd', 'fd-find', 'fdfind'):
|
528
|
+
for x in ('fd', 'fd-find', 'fdfind'): # has different names on different dists..
|
521
529
|
if shutil.which(x):
|
522
530
|
cmd = [x, *fdfind_args(root, follow=follow, ignore=ignore)]
|
523
531
|
break
|
524
532
|
else:
|
525
|
-
warnings.warn(
|
533
|
+
warnings.warn(
|
534
|
+
"'fdfind' is recommended for the best indexing performance. See https://github.com/sharkdp/fd#installation. Falling back to 'find'"
|
535
|
+
)
|
526
536
|
|
527
537
|
logger.debug('running: %s', cmd)
|
528
538
|
# TODO split by \0?
|
@@ -539,6 +549,7 @@ def traverse(root: Path, *, follow: bool=True, ignore: Sequence[str] = ()) -> It
|
|
539
549
|
def get_system_zone() -> str:
|
540
550
|
try:
|
541
551
|
import tzlocal
|
552
|
+
|
542
553
|
return tzlocal.get_localzone_name()
|
543
554
|
except Exception as e:
|
544
555
|
logger.exception(e)
|
@@ -547,14 +558,15 @@ def get_system_zone() -> str:
|
|
547
558
|
|
548
559
|
|
549
560
|
@lru_cache(1)
|
550
|
-
def get_system_tz() ->
|
561
|
+
def get_system_tz() -> ZoneInfo:
|
551
562
|
zone = get_system_zone()
|
552
563
|
try:
|
553
|
-
return
|
564
|
+
return ZoneInfo(zone)
|
554
565
|
except Exception as e:
|
555
566
|
logger.exception(e)
|
556
567
|
logger.error("Unknown time zone %s. Falling back to UTC. Please report this as a bug!", zone)
|
557
|
-
return
|
568
|
+
return ZoneInfo('UTC')
|
569
|
+
|
558
570
|
|
559
571
|
# used in misc/install_server.py
|
560
572
|
def root() -> Path:
|
@@ -576,7 +588,7 @@ def user_config_file() -> Path:
|
|
576
588
|
if "PROMNESIA_CONFIG" in os.environ:
|
577
589
|
return Path(os.environ["PROMNESIA_CONFIG"])
|
578
590
|
else:
|
579
|
-
return Path(
|
591
|
+
return Path(_platformdirs().user_config_dir) / 'config.py'
|
580
592
|
|
581
593
|
|
582
594
|
def default_config_path() -> Path:
|
@@ -591,7 +603,7 @@ def default_config_path() -> Path:
|
|
591
603
|
|
592
604
|
|
593
605
|
@contextmanager
|
594
|
-
def measure(tag: str='', *, logger: logging.Logger, unit: str='ms'):
|
606
|
+
def measure(tag: str = '', *, logger: logging.Logger, unit: str = 'ms'):
|
595
607
|
before = timer()
|
596
608
|
yield lambda: timer() - before
|
597
609
|
after = timer()
|
promnesia/compare.py
CHANGED
@@ -14,25 +14,29 @@ from .database.load import row_to_db_visit
|
|
14
14
|
# TODO include latest too?
|
15
15
|
# from cconfig import ignore, filtered
|
16
16
|
|
17
|
+
|
17
18
|
def get_logger():
|
18
19
|
return logging.getLogger('promnesia-db-changes')
|
19
20
|
|
21
|
+
|
20
22
|
# TODO return error depending on severity?
|
21
23
|
|
22
24
|
|
23
25
|
T = TypeVar('T')
|
24
26
|
|
27
|
+
|
25
28
|
def eliminate_by(sa: Sequence[T], sb: Sequence[T], key):
|
26
29
|
def make_dict(s: Sequence[T]) -> dict[str, list[T]]:
|
27
30
|
res: dict[str, list[T]] = {}
|
28
31
|
for a in s:
|
29
32
|
k = key(a)
|
30
|
-
ll = res.get(k
|
33
|
+
ll = res.get(k)
|
31
34
|
if ll is None:
|
32
35
|
ll = []
|
33
36
|
res[k] = ll
|
34
37
|
ll.append(a)
|
35
38
|
return res
|
39
|
+
|
36
40
|
da = make_dict(sa)
|
37
41
|
db = make_dict(sb)
|
38
42
|
ka = set(da.keys())
|
@@ -43,11 +47,11 @@ def eliminate_by(sa: Sequence[T], sb: Sequence[T], key):
|
|
43
47
|
for k in ka.union(kb):
|
44
48
|
la = da.get(k, [])
|
45
49
|
lb = db.get(k, [])
|
46
|
-
common.update(la[:min(len(la), len(lb))])
|
50
|
+
common.update(la[: min(len(la), len(lb))])
|
47
51
|
if len(la) > len(lb):
|
48
|
-
onlya.update(la[len(lb):])
|
52
|
+
onlya.update(la[len(lb) :])
|
49
53
|
if len(lb) > len(la):
|
50
|
-
onlyb.update(lb[len(la):])
|
54
|
+
onlyb.update(lb[len(la) :])
|
51
55
|
|
52
56
|
return onlya, common, onlyb
|
53
57
|
|
@@ -61,7 +65,7 @@ def compare(before: list[DbVisit], after: list[DbVisit], between: str, *, log=Tr
|
|
61
65
|
umap: dict[Url, list[DbVisit]] = {}
|
62
66
|
for a in after:
|
63
67
|
url = a.norm_url
|
64
|
-
xx = umap.get(url, [])
|
68
|
+
xx = umap.get(url, []) # TODO canonify here?
|
65
69
|
xx.append(a)
|
66
70
|
umap[url] = xx
|
67
71
|
|
@@ -71,14 +75,13 @@ def compare(before: list[DbVisit], after: list[DbVisit], between: str, *, log=Tr
|
|
71
75
|
logger.error('between %s missing %s', between, b)
|
72
76
|
print('ignoreline "{}", # {} {}'.format('exid', b.norm_url, b.src), file=sys.stderr)
|
73
77
|
|
74
|
-
|
75
78
|
# the idea is that we eliminate items simultaneously from both sets
|
76
79
|
eliminations = [
|
77
80
|
('identity' , lambda x: x),
|
78
81
|
('without dt' , lambda x: x._replace(src='', dt='')),
|
79
82
|
('without context' , lambda x: x._replace(src='', context='', locator='')),
|
80
83
|
('without dt and context' , lambda x: x._replace(src='', dt='', context='', locator='')),
|
81
|
-
]
|
84
|
+
] # fmt: skip
|
82
85
|
for ename, ekey in eliminations:
|
83
86
|
logger.info('eliminating by %s', ename)
|
84
87
|
logger.info('before: %d, after: %d', len(before), len(after))
|
@@ -94,6 +97,7 @@ def compare(before: list[DbVisit], after: list[DbVisit], between: str, *, log=Tr
|
|
94
97
|
|
95
98
|
return errors
|
96
99
|
|
100
|
+
|
97
101
|
def setup_parser(p):
|
98
102
|
# TODO better name?
|
99
103
|
p.add_argument('--intermediate-dir', type=Path)
|
@@ -107,7 +111,7 @@ def get_files(args):
|
|
107
111
|
int_dir = args.intermediate_dir
|
108
112
|
assert int_dir.exists()
|
109
113
|
files = sorted(int_dir.glob('*.sqlite*'))
|
110
|
-
files = files[-args.last:]
|
114
|
+
files = files[-args.last :]
|
111
115
|
else:
|
112
116
|
files = [Path(p) for p in args.paths]
|
113
117
|
return files
|
@@ -135,9 +139,10 @@ def compare_files(*files: Path, log=True) -> Iterator[tuple[str, DbVisit]]:
|
|
135
139
|
for f in files:
|
136
140
|
logger.info('processing %r', f)
|
137
141
|
name = f.name
|
138
|
-
this_dts = name[0: name.index('.')]
|
142
|
+
this_dts = name[0 : name.index('.')] # can't use stem due to multiple extensions..
|
139
143
|
|
140
144
|
from promnesia.server import _get_stuff # TODO ugh
|
145
|
+
|
141
146
|
engine, table = _get_stuff(PathWithMtime.make(f))
|
142
147
|
|
143
148
|
with engine.connect() as conn:
|
@@ -151,6 +156,6 @@ def compare_files(*files: Path, log=True) -> Iterator[tuple[str, DbVisit]]:
|
|
151
156
|
last = vis
|
152
157
|
last_dts = this_dts
|
153
158
|
|
159
|
+
|
154
160
|
if __name__ == '__main__':
|
155
161
|
main()
|
156
|
-
|
promnesia/config.py
CHANGED
@@ -4,10 +4,10 @@ import importlib
|
|
4
4
|
import importlib.util
|
5
5
|
import os
|
6
6
|
import warnings
|
7
|
-
from collections.abc import Iterable
|
7
|
+
from collections.abc import Callable, Iterable
|
8
8
|
from pathlib import Path
|
9
9
|
from types import ModuleType
|
10
|
-
from typing import
|
10
|
+
from typing import NamedTuple
|
11
11
|
|
12
12
|
from .common import DbVisit, PathIsh, Res, Source, default_cache_dir, default_output_dir
|
13
13
|
|
@@ -17,37 +17,37 @@ HookT = Callable[[Res[DbVisit]], Iterable[Res[DbVisit]]]
|
|
17
17
|
ModuleName = str
|
18
18
|
|
19
19
|
# something that can be converted into a proper Source
|
20
|
-
ConfigSource =
|
20
|
+
ConfigSource = Source | ModuleName | ModuleType
|
21
21
|
|
22
22
|
|
23
23
|
class Config(NamedTuple):
|
24
24
|
# TODO remove default from sources once migrated
|
25
|
-
SOURCES: list[ConfigSource] = []
|
25
|
+
SOURCES: list[ConfigSource] = [] # noqa: RUF012
|
26
26
|
|
27
27
|
# if not specified, uses user data dir
|
28
28
|
OUTPUT_DIR: PathIsh | None = None
|
29
29
|
|
30
30
|
CACHE_DIR: PathIsh | None = ''
|
31
|
-
FILTERS: list[str] = []
|
31
|
+
FILTERS: list[str] = [] # noqa: RUF012
|
32
32
|
|
33
33
|
HOOK: HookT | None = None
|
34
34
|
|
35
35
|
#
|
36
36
|
# NOTE: INDEXERS is deprecated, use SOURCES instead
|
37
|
-
INDEXERS: list[ConfigSource] = []
|
38
|
-
#MIME_HANDLER: Optional[str] = None # TODO
|
37
|
+
INDEXERS: list[ConfigSource] = [] # noqa: RUF012
|
38
|
+
# MIME_HANDLER: Optional[str] = None # TODO
|
39
39
|
|
40
40
|
@property
|
41
41
|
def sources(self) -> Iterable[Res[Source]]:
|
42
|
-
idx = self.INDEXERS
|
43
|
-
|
44
42
|
if len(self.INDEXERS) > 0:
|
45
43
|
warnings.warn("'INDEXERS' is deprecated. Please use 'SOURCES'!", DeprecationWarning)
|
46
44
|
|
47
45
|
raw = self.SOURCES + self.INDEXERS
|
48
46
|
|
49
47
|
if len(raw) == 0:
|
50
|
-
raise RuntimeError(
|
48
|
+
raise RuntimeError(
|
49
|
+
"Please specify SOURCES in the config! See https://github.com/karlicoss/promnesia#setup for more information"
|
50
|
+
)
|
51
51
|
|
52
52
|
for r in raw:
|
53
53
|
if isinstance(r, ModuleName):
|
@@ -72,8 +72,8 @@ class Config(NamedTuple):
|
|
72
72
|
cd = self.CACHE_DIR
|
73
73
|
cpath: Path | None
|
74
74
|
if cd is None:
|
75
|
-
cpath = None
|
76
|
-
elif cd == '':
|
75
|
+
cpath = None # means 'disabled' in cachew
|
76
|
+
elif cd == '': # meh.. but need to make it None friendly..
|
77
77
|
cpath = default_cache_dir()
|
78
78
|
else:
|
79
79
|
cpath = Path(cd)
|
@@ -97,12 +97,14 @@ class Config(NamedTuple):
|
|
97
97
|
def hook(self) -> HookT | None:
|
98
98
|
return self.HOOK
|
99
99
|
|
100
|
+
|
100
101
|
instance: Config | None = None
|
101
102
|
|
102
103
|
|
103
104
|
def has() -> bool:
|
104
105
|
return instance is not None
|
105
106
|
|
107
|
+
|
106
108
|
def get() -> Config:
|
107
109
|
assert instance is not None, "Expected config to be set, but it's not"
|
108
110
|
return instance
|
@@ -124,9 +126,12 @@ def import_config(config_file: PathIsh) -> Config:
|
|
124
126
|
|
125
127
|
# todo just exec??
|
126
128
|
name = p.stem
|
127
|
-
spec = importlib.util.spec_from_file_location(name, p)
|
128
|
-
|
129
|
-
|
129
|
+
spec = importlib.util.spec_from_file_location(name, p)
|
130
|
+
assert spec is not None
|
131
|
+
mod = importlib.util.module_from_spec(spec)
|
132
|
+
assert mod is not None
|
133
|
+
loader = spec.loader
|
134
|
+
assert loader is not None
|
130
135
|
loader.exec_module(mod)
|
131
136
|
|
132
137
|
d = {}
|
@@ -148,7 +153,7 @@ def use_cores() -> int | None:
|
|
148
153
|
return None
|
149
154
|
try:
|
150
155
|
return int(cs)
|
151
|
-
except ValueError:
|
156
|
+
except ValueError: # any other value means 'use all
|
152
157
|
return 0
|
153
158
|
|
154
159
|
|
@@ -158,5 +163,5 @@ def extra_fd_args() -> list[str]:
|
|
158
163
|
Can be used to pass --ignore-file parameter
|
159
164
|
'''
|
160
165
|
v = os.environ.get('PROMNESIA_FD_EXTRA_ARGS', '')
|
161
|
-
extra = v.split()
|
166
|
+
extra = v.split() # eh, hopefully splitting that way is ok...
|
162
167
|
return extra
|
promnesia/database/dump.py
CHANGED
@@ -3,7 +3,6 @@ from __future__ import annotations
|
|
3
3
|
import sqlite3
|
4
4
|
from collections.abc import Iterable
|
5
5
|
from pathlib import Path
|
6
|
-
from typing import Optional
|
7
6
|
|
8
7
|
from more_itertools import chunked
|
9
8
|
from sqlalchemy import (
|
@@ -51,7 +50,7 @@ def begin_immediate_transaction(conn):
|
|
51
50
|
conn.exec_driver_sql('BEGIN IMMEDIATE')
|
52
51
|
|
53
52
|
|
54
|
-
Stats = dict[
|
53
|
+
Stats = dict[SourceName | None, int]
|
55
54
|
|
56
55
|
|
57
56
|
# returns critical warnings
|