promnesia 1.2.20240810__py3-none-any.whl → 1.3.20241021__py3-none-any.whl
This diff compares two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in the public registry.
- promnesia/__init__.py +14 -3
- promnesia/__main__.py +38 -25
- promnesia/cannon.py +23 -23
- promnesia/common.py +49 -42
- promnesia/compare.py +18 -20
- promnesia/compat.py +10 -10
- promnesia/config.py +20 -22
- promnesia/database/common.py +4 -3
- promnesia/database/dump.py +14 -13
- promnesia/database/load.py +7 -7
- promnesia/extract.py +13 -11
- promnesia/kjson.py +11 -10
- promnesia/logging.py +1 -1
- promnesia/misc/install_server.py +7 -8
- promnesia/server.py +42 -31
- promnesia/sources/auto.py +43 -30
- promnesia/sources/auto_logseq.py +6 -5
- promnesia/sources/auto_obsidian.py +2 -2
- promnesia/sources/browser.py +14 -9
- promnesia/sources/browser_legacy.py +17 -13
- promnesia/sources/demo.py +7 -7
- promnesia/sources/fbmessenger.py +3 -2
- promnesia/sources/filetypes.py +9 -7
- promnesia/sources/github.py +5 -7
- promnesia/sources/guess.py +2 -1
- promnesia/sources/hackernews.py +2 -2
- promnesia/sources/hpi.py +2 -2
- promnesia/sources/html.py +7 -5
- promnesia/sources/hypothesis.py +3 -2
- promnesia/sources/instapaper.py +2 -2
- promnesia/sources/markdown.py +17 -7
- promnesia/sources/org.py +20 -10
- promnesia/sources/plaintext.py +30 -31
- promnesia/sources/pocket.py +3 -2
- promnesia/sources/reddit.py +19 -18
- promnesia/sources/roamresearch.py +2 -1
- promnesia/sources/rss.py +3 -4
- promnesia/sources/shellcmd.py +19 -6
- promnesia/sources/signal.py +14 -13
- promnesia/sources/smscalls.py +2 -2
- promnesia/sources/stackexchange.py +3 -2
- promnesia/sources/takeout.py +23 -13
- promnesia/sources/takeout_legacy.py +15 -11
- promnesia/sources/telegram.py +13 -11
- promnesia/sources/telegram_legacy.py +18 -7
- promnesia/sources/twitter.py +6 -5
- promnesia/sources/vcs.py +5 -3
- promnesia/sources/viber.py +10 -9
- promnesia/sources/website.py +4 -4
- promnesia/sources/zulip.py +3 -2
- promnesia/sqlite.py +7 -4
- promnesia/tests/common.py +8 -5
- promnesia/tests/server_helper.py +11 -8
- promnesia/tests/sources/test_auto.py +2 -3
- promnesia/tests/sources/test_filetypes.py +2 -1
- promnesia/tests/sources/test_hypothesis.py +3 -3
- promnesia/tests/sources/test_org.py +2 -3
- promnesia/tests/sources/test_plaintext.py +0 -1
- promnesia/tests/sources/test_shellcmd.py +3 -4
- promnesia/tests/sources/test_takeout.py +3 -5
- promnesia/tests/test_cannon.py +5 -5
- promnesia/tests/test_cli.py +4 -6
- promnesia/tests/test_compare.py +1 -1
- promnesia/tests/test_config.py +7 -8
- promnesia/tests/test_db_dump.py +11 -12
- promnesia/tests/test_extract.py +10 -6
- promnesia/tests/test_indexer.py +14 -8
- promnesia/tests/test_server.py +2 -3
- promnesia/tests/test_traverse.py +0 -2
- promnesia/tests/utils.py +4 -4
- {promnesia-1.2.20240810.dist-info → promnesia-1.3.20241021.dist-info}/METADATA +3 -2
- promnesia-1.3.20241021.dist-info/RECORD +83 -0
- {promnesia-1.2.20240810.dist-info → promnesia-1.3.20241021.dist-info}/WHEEL +1 -1
- promnesia-1.2.20240810.dist-info/RECORD +0 -83
- {promnesia-1.2.20240810.dist-info → promnesia-1.3.20241021.dist-info}/LICENSE +0 -0
- {promnesia-1.2.20240810.dist-info → promnesia-1.3.20241021.dist-info}/entry_points.txt +0 -0
- {promnesia-1.2.20240810.dist-info → promnesia-1.3.20241021.dist-info}/top_level.txt +0 -0
promnesia/__init__.py
CHANGED
@@ -1,6 +1,17 @@
-from pathlib import Path
-from .common import PathIsh, Visit, Source, last, Loc, Results, DbVisit, Context, Res
-
 # add deprecation warning so eventually this may converted to a namespace package?
 import warnings
+
+from .common import (  # noqa: F401
+    Context,
+    DbVisit,
+    Loc,
+    PathIsh,
+    Res,
+    Results,
+    Source,
+    Visit,
+    last,
+)
+
+# TODO think again about it -- what are the pros and cons?
 warnings.warn("DEPRECATED! Please import directly from 'promnesia.common', e.g. 'from promnesia.common import Visit, Source, Results'", DeprecationWarning)
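The new module body simply re-exports the public names and emits a DeprecationWarning; per the warning text, downstream configs should import from promnesia.common directly. A minimal illustrative snippet of the two styles (not taken from the package itself):

    # preferred after this release
    from promnesia.common import Source, Visit, Results

    # still works via the re-exports above, but now triggers the DeprecationWarning
    from promnesia import Source, Visit, Results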
promnesia/__main__.py
CHANGED
@@ -5,24 +5,34 @@ import ast
 import importlib
 import inspect
 import os
-
+import shlex
 import shutil
-from subprocess import run, check_call, Popen
 import sys
+from collections.abc import Iterable, Iterator, Sequence
+from pathlib import Path
+from subprocess import Popen, check_call, run
 from tempfile import TemporaryDirectory, gettempdir
-from typing import Callable
-
-
-from . import
-
-
-
-
+from typing import Callable
+
+from . import config, server
+from .common import (
+    DbVisit,
+    Extractor,
+    PathIsh,
+    Res,
+    Source,
+    default_config_path,
+    get_system_tz,
+    get_tmpdir,
+    logger,
+    user_config_file,
+)
 from .database.dump import visits_to_sqlite
 from .extract import extract_visits
+from .misc import install_server


-def iter_all_visits(sources_subset: Iterable[
+def iter_all_visits(sources_subset: Iterable[str | int] = ()) -> Iterator[Res[DbVisit]]:
     cfg = config.get()
     output_dir = cfg.output_dir
     # not sure if belongs here??
@@ -74,7 +84,7 @@ def iter_all_visits(sources_subset: Iterable[Union[str, int]]=()) -> Iterator[Re
         logger.warning("unknown --sources: %s", ", ".join(repr(i) for i in sources_subset))


-def _do_index(dry: bool=False, sources_subset: Iterable[
+def _do_index(*, dry: bool = False, sources_subset: Iterable[str | int] = (), overwrite_db: bool = False) -> Iterable[Exception]:
     # also keep & return errors for further display
     errors: list[Exception] = []
     def it() -> Iterable[Res[DbVisit]]:
@@ -98,9 +108,10 @@ def _do_index(dry: bool=False, sources_subset: Iterable[Union[str, int]]=(), ove

 def do_index(
     config_file: Path,
-
-
-
+    *,
+    dry: bool = False,
+    sources_subset: Iterable[str | int] = (),
+    overwrite_db: bool = False,
 ) -> Sequence[Exception]:
     config.load_from(config_file)  # meh.. should be cleaner
     try:
@@ -120,7 +131,8 @@ def demo_sources() -> dict[str, Callable[[], Extractor]]:
     def lazy(name: str) -> Callable[[], Extractor]:
         # helper to avoid failed imports etc, since people might be lacking necessary dependencies
         def inner() -> Extractor:
-
+            # TODO why this import??
+            from . import sources  # noqa: F401
             module = importlib.import_module(f'promnesia.sources.{name}')
             return getattr(module, 'index')
         return inner
@@ -145,7 +157,7 @@ def do_demo(
     config_file: Path | None,
     dry: bool=False,
     name: str='demo',
-    sources_subset: Iterable[
+    sources_subset: Iterable[str | int]=(),
     overwrite_db: bool=False,
 ) -> None:
     with TemporaryDirectory() as tdir:
@@ -219,9 +231,10 @@ def _config_check(cfg: Path) -> Iterable[Exception]:
     logger.info('config: %s', cfg)

     def check(cmd: list[str | Path], **kwargs) -> Iterable[Exception]:
-        logger.debug(
-        res = run(cmd, **kwargs)
+        logger.debug(shlex.join(map(str, cmd)))
+        res = run(cmd, **kwargs)  # noqa: PLW1510
         if res.returncode > 0:
+            # TODO what's up with empty exception??
             yield Exception()

     logger.info('Checking syntax...')
@@ -239,7 +252,7 @@ def _config_check(cfg: Path) -> Iterable[Exception]:
     # todo not sure if should be more defensive than check_call here
     logger.info('Checking type safety...')
     try:
-        import mypy
+        import mypy  # noqa: F401
     except ImportError:
         logger.warning("mypy not found, can't use it to check config!")
     else:
@@ -291,7 +304,7 @@ def cli_doctor_server(args: argparse.Namespace) -> None:
     logger.info('You should see the database path and version above!')


-def _ordinal_or_name(s: str) ->
+def _ordinal_or_name(s: str) -> str | int:
     try:
         s = int(s)  # type: ignore
     except ValueError:
@@ -328,7 +341,7 @@ def main() -> None:

     F = lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, width=120)
     p = argparse.ArgumentParser(formatter_class=F)
-    subp = p.add_subparsers(dest='mode'
+    subp = p.add_subparsers(dest='mode' )
     ep = subp.add_parser('index', help='Create/update the link database', formatter_class=F)
     add_index_args(ep, default_config_path())
     # TODO use some way to override or provide config only via cmdline?
@@ -348,7 +361,7 @@ def main() -> None:
     ap.add_argument('--no-serve', action='store_const', const=None, dest='port', help='Pass to only index without running server')
     ap.add_argument(
         '--as',
-        choices=
+        choices=sorted(demo_sources().keys()),
         default='guess',
         help='Promnesia source to index as (see https://github.com/karlicoss/promnesia/tree/master/src/promnesia/sources for the full list)',
     )
@@ -359,7 +372,7 @@ def main() -> None:
     install_server.setup_parser(isp)

     cp = subp.add_parser('config', help='Config management')
-    cp.set_defaults(func=lambda *
+    cp.set_defaults(func=lambda *_args: cp.print_help())
     scp = cp.add_subparsers()
     ccp = scp.add_parser('check', help='Check config')
     ccp.set_defaults(func=config_check)
@@ -373,7 +386,7 @@ def main() -> None:

     dp = subp.add_parser('doctor', help='Troubleshooting assistant')
     dp.add_argument('--config', type=Path, default=default_config_path(), help='Config path')
-    dp.set_defaults(func=lambda *
+    dp.set_defaults(func=lambda *_args: dp.print_help())
     sdp = dp.add_subparsers()
     sdp.add_parser('config' , help='Check config' ).set_defaults(func=config_check )
     sdp.add_parser('database', help='Inspect database').set_defaults(func=cli_doctor_db)
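The indexing entry points are now keyword-only: do_index takes the config path positionally and everything else (dry, sources_subset, overwrite_db) by keyword, mirroring CLI options such as --sources. A sketch of a programmatic call, assuming a made-up config file location:

    from pathlib import Path

    from promnesia.__main__ import do_index

    # flags must now be passed by keyword; positional use would raise TypeError
    errors = do_index(
        Path('/path/to/promnesia/config.py'),  # hypothetical config location
        dry=False,
        sources_subset=('demo',),              # names or ordinals of sources to index
        overwrite_db=True,
    )
    for e in errors:
        print('indexing error:', e)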
promnesia/cannon.py
CHANGED
@@ -9,16 +9,17 @@ are same content, but you can't tell that by URL equality. Even canonical urls a

 Also some experiments to establish 'URL hierarchy'.
 """
-
+from __future__ import annotations

-from itertools import chain
 import re
 import typing
-from typing import Iterable, NamedTuple, Set, Optional, List, Sequence, Union, Tuple, Dict, Any, Collection
-
 import urllib.parse
-from
+from collections.abc import Collection, Iterable, Sequence

+# TODO eh?? they fixed mobile.twitter.com?
+from itertools import chain
+from typing import Any, NamedTuple, Union
+from urllib.parse import SplitResult, parse_qsl, urlencode, urlsplit, urlunsplit

 # this has some benchmark, but quite a few librarires seem unmaintained, sadly
 # I guess i'll stick to default for now, until it's a critical bottleneck
@@ -108,11 +109,11 @@ default_qkeep = [

 # TODO perhaps, decide if fragment is meaningful (e.g. wiki) or random sequence of letters?
 class Spec(NamedTuple):
-    qkeep  :
-    qremove:
+    qkeep  : Collection[str] | bool | None = None
+    qremove: set[str] | None = None
     fkeep  : bool = False

-    def keep_query(self, q: str) ->
+    def keep_query(self, q: str) -> int | None:  # returns order
         if self.qkeep is True:
             return 1
         qkeep = {
@@ -134,13 +135,13 @@ class Spec(NamedTuple):
         return None

     @classmethod
-    def make(cls, **kwargs) ->
+    def make(cls, **kwargs) -> Spec:
         return cls(**kwargs)

 S = Spec

 # TODO perhaps these can be machine learnt from large set of urls?
-specs:
+specs: dict[str, Spec] = {
     'youtube.com': S(
         # TODO search_query?
         qkeep=[  # note: experimental.. order matters here
@@ -178,7 +179,6 @@ specs: Dict[str, Spec] = {

             'source', 'tsid', 'refsrc', 'pnref', 'rc', '_rdr', 'src', 'hc_location', 'section', 'permPage', 'soft', 'pn_ref', 'action',
             'ti', 'aref', 'event_time_id', 'action_history', 'filter', 'ref_notif_type', 'has_source', 'source_newsfeed_story_type',
-            'ref_notif_type',
         },
     ),
     'physicstravelguide.com': S(fkeep=True),  # TODO instead, pass fkeep marker object for shorter spec?
@@ -218,10 +218,10 @@ Spec2 = Any # TODO

 # TODO this should be a map
 Frag = Any
-Parts = Sequence[
+Parts = Sequence[tuple[str, str]]


-def _yc(domain: str, path: str, qq: Parts, frag: Frag) ->
+def _yc(domain: str, path: str, qq: Parts, frag: Frag) -> tuple[Any, Any, Parts, Frag]:
     if path[:5] == '/from':
         site = dict(qq).get('site')
         if site is not None:
@@ -232,7 +232,7 @@ def _yc(domain: str, path: str, qq: Parts, frag: Frag) -> Tuple[Any, Any, Parts,
     # TODO this should be in-place? for brevity?
     return (domain, path, qq, frag)

-def get_spec2(dom: str) ->
+def get_spec2(dom: str) -> Spec2 | None:
     return {
         'news.ycombinator.com': _yc,
     }.get(dom)
@@ -285,10 +285,10 @@ def transform_split(split: SplitResult):
     REST = r'(?P<rest>.*)'

     Left = Union[str, Sequence[str]]
-    Right =
+    Right = tuple[str, str, str]
     # the idea is that we can unify certain URLs here and map them to the 'canonical' one
     # this is a dict only for grouping but should be a list really.. todo
-    rules:
+    rules: dict[Left, Right] = {
         # TODO m. handling might be quite common
         # f'm.youtube.com/{REST}': ('youtube.com', '{rest}'),
         (
@@ -322,9 +322,9 @@ def transform_split(split: SplitResult):
             continue
         gd = m.groupdict()
         if len(to) == 2:
-            to = to
+            to = (*to, '')

-        (netloc, path, qq) =
+        (netloc, path, qq) = (t.format(**gd) for t in to)
         qparts.extend(parse_qsl(qq, keep_blank_values=True))  # TODO hacky..
         # TODO eh, qparts should really be a map or something...
         break
@@ -361,7 +361,7 @@ def myunsplit(domain: str, path: str, query: str, fragment: str) -> str:
 # ]
 # for re in regexes:

-def handle_archive_org(url: str) ->
+def handle_archive_org(url: str) -> str | None:
     are = r'web.archive.org/web/(?P<timestamp>\d+)/(?P<rest>.*)'
     m = re.fullmatch(are, url)
     if m is None:
@@ -697,8 +697,8 @@ def groups(it, args): # pragma: no cover
     all_pats = get_patterns()

     from collections import Counter
-    c: typing.Counter[
-    unmatched:
+    c: typing.Counter[str | None] = Counter()
+    unmatched: list[str] = []

     def dump():
         print(c)
@@ -756,10 +756,10 @@ def groups(it, args): # pragma: no cover
 def display(it, args) -> None:  # pragma: no cover
     # TODO better name?
     import difflib
-    # pylint: disable=import-error
-    from termcolor import colored as C  # type: ignore
     from sys import stdout

+    from termcolor import colored as C  # type: ignore
+
     for line in it:
         line = line.strip()
         if args.human:
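Spec (aliased as S) drives per-domain canonicalisation: qkeep lists query parameters worth keeping, qremove names the ones to strip, and fkeep says whether the URL fragment survives. A hypothetical entry in the same shape as the youtube.com and physicstravelguide.com specs above (the domain and parameter names are invented for illustration):

    from promnesia.cannon import Spec

    S = Spec

    example_specs: dict[str, Spec] = {
        'forum.example.com': S(
            qkeep=['thread', 'page'],                     # parameters that identify the page
            qremove={'utm_source', 'utm_medium', 'ref'},  # tracking noise to strip
            fkeep=True,                                   # keep '#...' anchors (e.g. a post id)
        ),
    }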
promnesia/common.py
CHANGED
@@ -1,26 +1,28 @@
 from __future__ import annotations

-from contextlib import contextmanager
-from datetime import datetime, date
-from functools import lru_cache
-from glob import glob
 import itertools
 import logging
 import os
-
+import re
 import shutil
-
+import tempfile
+import warnings
+from collections.abc import Iterable, Sequence
+from contextlib import contextmanager
+from copy import copy
+from datetime import date, datetime
+from functools import lru_cache
+from glob import glob
+from pathlib import Path
+from subprocess import PIPE, Popen, run
 from timeit import default_timer as timer
 from types import ModuleType
-from typing import
-import warnings
+from typing import TYPE_CHECKING, Callable, NamedTuple, Optional, TypeVar, Union

-from more_itertools import intersperse
 import pytz
+from more_itertools import intersperse

 from .cannon import canonify
-from .compat import removeprefix
-

 _is_windows = os.name == 'nt'

@@ -38,14 +40,14 @@ Second = int
 # TODO hmm. arguably, source and context are almost same things...
 class Loc(NamedTuple):
     title: str
-    href: Optional[str]=None
+    href: Optional[str] = None  # noqa: UP007  # looks like hypothesis doesn't like in on python <= 3.9

     @classmethod
-    def make(cls, title: str, href:
+    def make(cls, title: str, href: str | None=None) -> Loc:
         return cls(title=title, href=href)

     @classmethod
-    def file(cls, path: PathIsh, line:
+    def file(cls, path: PathIsh, line: int | None=None, relative_to: Path | None=None) -> Loc:
         lstr = '' if line is None else f':{line}'
         # todo loc should be url encoded? dunno.
         # or use line=? eh. I don't know. Just ask in issues.
@@ -94,7 +96,7 @@ def _warn_no_xdg_mime() -> None:
 def _detect_mime_handler() -> str:
     def exists(what: str) -> bool:
         try:
-            r = run(f'xdg-mime query default x-scheme-handler/{what}'.split(), stdout=PIPE)
+            r = run(f'xdg-mime query default x-scheme-handler/{what}'.split(), stdout=PIPE, check=False)
         except (FileNotFoundError, NotADirectoryError):  # ugh seems that osx might throw NotADirectory for some reason
             _warn_no_xdg_mime()
             return False
@@ -139,12 +141,12 @@ class Visit(NamedTuple):
     # TODO back to DatetimeIsh, but somehow make compatible to dbcache?
     dt: datetime
     locator: Loc
-    context:
-    duration:
+    context: Context | None = None
+    duration: Second | None = None
     # TODO shit. I need to insert it in chrome db....
     # TODO gonna be hard to fill retroactively.
     # spent: Optional[Second] = None
-    debug:
+    debug: str | None = None

 Result = Union[Visit, Exception]
 Results = Iterable[Result]
@@ -157,12 +159,12 @@ class DbVisit(NamedTuple):
     orig_url: Url
     dt: datetime
     locator: Loc
-    src: Optional[SourceName] = None
-    context: Optional[Context] = None
-    duration: Optional[Second] = None
+    src: Optional[SourceName] = None  # noqa: UP007  # looks like hypothesis doesn't like in on python <= 3.9
+    context: Optional[Context] = None  # noqa: UP007  # looks like hypothesis doesn't like in on python <= 3.9
+    duration: Optional[Second] = None  # noqa: UP007  # looks like hypothesis doesn't like in on python <= 3.9

     @staticmethod
-    def make(p: Visit, src: SourceName) -> Res[
+    def make(p: Visit, src: SourceName) -> Res[DbVisit]:
         try:
             # hmm, mypy gets a bit confused here.. presumably because datetime is always datetime (but date is not datetime)
             if isinstance(p.dt, datetime):
@@ -171,7 +173,7 @@ class DbVisit(NamedTuple):
                 # TODO that won't be with timezone..
                 dt = datetime.combine(p.dt, datetime.min.time())  # meh..
             else:
-                raise AssertionError(f'unexpected date: {p.dt}, {type(p.dt)}')
+                raise AssertionError(f'unexpected date: {p.dt}, {type(p.dt)}')  # noqa: TRY301
         except Exception as e:
             return e

@@ -196,6 +198,7 @@ Filter = Callable[[Url], bool]


 from .logging import LazyLogger
+
 logger = LazyLogger('promnesia', level='DEBUG')

 def get_logger() -> logging.Logger:
@@ -204,7 +207,6 @@ def get_logger() -> logging.Logger:



-import tempfile
 # kinda singleton
 @lru_cache(1)
 def get_tmpdir() -> tempfile.TemporaryDirectory[str]:
@@ -218,7 +220,7 @@ Syntax = str

 @lru_cache(None)
 def _get_urlextractor(syntax: Syntax):
-    from urlextract import URLExtract
+    from urlextract import URLExtract  # type: ignore
     u = URLExtract()
     # https://github.com/lipoja/URLExtract/issues/13
     if syntax in {'org', 'orgmode', 'org-mode'}:  # TODO remove hardcoding..
@@ -249,7 +251,7 @@ def iter_urls(s: str, *, syntax: Syntax='') -> Iterable[Url]:
         yield _sanitize(u)


-def extract_urls(s: str, *, syntax: Syntax='') ->
+def extract_urls(s: str, *, syntax: Syntax='') -> list[Url]:
     return list(iter_urls(s=s, syntax=syntax))


@@ -274,7 +276,7 @@ class PathWithMtime(NamedTuple):
     mtime: float

     @classmethod
-    def make(cls, p: Path) ->
+    def make(cls, p: Path) -> PathWithMtime:
         return cls(
             path=p,
             mtime=p.stat().st_mtime,
@@ -300,7 +302,7 @@ def _guess_name(thing: PreSource) -> str:
     guess = thing.__module__

     dflt = 'promnesia.sources.'
-    guess = removeprefix(
+    guess = guess.removeprefix(dflt)
     if guess == 'config':
         # this happens when we define a lambda in config or something without properly wrapping in Source
         logger.warning(f'Inferred source name "config" for {thing}. This might be misleading TODO')
@@ -362,13 +364,14 @@ Indexer = Source
 # NOTE: used in configs...
 def last(path: PathIsh, *parts: str) -> Path:
     import os.path
-    pp = os.path.join(str(path), *parts)
-    return Path(max(glob(pp, recursive=True)))
+    pp = os.path.join(str(path), *parts)  # noqa: PTH118
+    return Path(max(glob(pp, recursive=True)))  # noqa: PTH207


-from .logging import setup_logger
+from .logging import setup_logger  # noqa: F401

-
+
+# TODO get rid of this? not sure if still necessary
 def echain(ex: Exception, cause: Exception) -> Exception:
     e = copy(ex)
     e.__cause__ = cause
@@ -382,7 +385,6 @@ def echain(ex: Exception, cause: Exception) -> Exception:

 def slugify(x: str) -> str:
     # https://stackoverflow.com/a/38766141/706389
-    import re
     valid_file_name = re.sub(r'[^\w_.)( -]', '', x)
     return valid_file_name

@@ -392,7 +394,7 @@ def appdirs():
     under_test = os.environ.get('PYTEST_CURRENT_TEST') is not None
     # todo actually use test name?
     name = 'promnesia-test' if under_test else 'promnesia'
-    import appdirs as ad
+    import appdirs as ad  # type: ignore[import-untyped]
     return ad.AppDirs(appname=name)


@@ -409,13 +411,13 @@ def default_cache_dir() -> Path:
 # make it lazy, otherwise it might crash on module import (e.g. on Windows)
 # ideally would be nice to fix it properly https://github.com/ahupp/python-magic#windows
 @lru_cache(1)
-def _magic() -> Callable[[PathIsh],
+def _magic() -> Callable[[PathIsh], str | None]:
     logger = get_logger()
     try:
-        import magic
+        import magic  # type: ignore
     except Exception as e:
         logger.exception(e)
-        defensive_msg:
+        defensive_msg: str | None = None
         if isinstance(e, ModuleNotFoundError) and e.name == 'magic':
             defensive_msg = "python-magic is not detected. It's recommended for better file type detection (pip3 install --user python-magic). See https://github.com/ahupp/python-magic#installation"
         elif isinstance(e, ImportError):
@@ -425,7 +427,7 @@ def _magic() -> Callable[[PathIsh], Optional[str]]:
         if defensive_msg is not None:
             logger.warning(defensive_msg)
             warnings.warn(defensive_msg)
-            return lambda path: None  #
+            return lambda path: None  # stub  # noqa: ARG005
         else:
             raise e
     else:
@@ -441,7 +443,7 @@ def _mimetypes():
     return mimetypes


-def mime(path: PathIsh) ->
+def mime(path: PathIsh) -> str | None:
     ps = str(path)
     mimetypes = _mimetypes()
     # first try mimetypes, it's only using the filename without opening the file
@@ -453,7 +455,7 @@ def mime(path: PathIsh) -> Optional[str]:
     return magic(ps)


-def find_args(root: Path, follow: bool, ignore:
+def find_args(root: Path, *, follow: bool, ignore: Sequence[str] = ()) -> list[str]:
     prune_dir_args = []
     ignore_file_args = []
     if ignore:
@@ -476,7 +478,7 @@ def find_args(root: Path, follow: bool, ignore: List[str]=[]) -> List[str]:
     ]


-def fdfind_args(root: Path, follow: bool, ignore:
+def fdfind_args(root: Path, *, follow: bool, ignore: Sequence[str] = ()) -> list[str]:
     from .config import extra_fd_args

     ignore_args = []
@@ -496,7 +498,7 @@ def fdfind_args(root: Path, follow: bool, ignore: List[str]=[]) -> List[str]:
     ]


-def traverse(root: Path, *, follow: bool=True, ignore:
+def traverse(root: Path, *, follow: bool=True, ignore: Sequence[str] = ()) -> Iterable[Path]:
     if not root.is_dir():
         yield root
         return
@@ -605,3 +607,8 @@ def is_sqlite_db(x: Path) -> bool:
         'application/vnd.sqlite3',
         # TODO this mime can also match wal files/journals, not sure
     }
+
+
+if not TYPE_CHECKING:
+    # todo deprecate properly --just backwards compat
+    from .compat import removeprefix  # noqa: F401
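Visit, Loc and Results above are the types a source yields during indexing, while DbVisit is what ends up in the database once DbVisit.make attaches the source name. A minimal sketch of a custom source using them; the URL, timestamp and context text are invented:

    from datetime import datetime, timezone

    from promnesia.common import Loc, Results, Visit

    def index() -> Results:
        # hypothetical one-visit source, just to show the shapes involved
        yield Visit(
            url='https://example.org/some/page',
            dt=datetime(2024, 10, 1, 12, 0, tzinfo=timezone.utc),
            locator=Loc.make(title='my notes', href='file:///tmp/notes.txt'),
            context='mentioned in my notes',  # optional, defaults to None
        )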
promnesia/compare.py
CHANGED
@@ -1,13 +1,14 @@
-
+from __future__ import annotations
+
 # TODO perhaps make it external script?
 import argparse
-from pathlib import Path
 import logging
 import sys
-from
-
+from collections.abc import Iterator, Sequence
+from pathlib import Path
+from typing import TypeVar

-from .common import DbVisit,
+from .common import DbVisit, PathWithMtime, Url
 from .database.load import row_to_db_visit

 # TODO include latest too?
@@ -19,14 +20,11 @@ def get_logger():
 # TODO return error depending on severity?


-from typing import TypeVar, Sequence
-
-
 T = TypeVar('T')

 def eliminate_by(sa: Sequence[T], sb: Sequence[T], key):
-    def make_dict(s: Sequence[T]) ->
-        res:
+    def make_dict(s: Sequence[T]) -> dict[str, list[T]]:
+        res: dict[str, list[T]] = {}
         for a in s:
             k = key(a)
             ll = res.get(k, None)
@@ -39,9 +37,9 @@ def eliminate_by(sa: Sequence[T], sb: Sequence[T], key):
     db = make_dict(sb)
     ka = set(da.keys())
     kb = set(db.keys())
-    onlya:
-    common:
-    onlyb:
+    onlya: set[T] = set()
+    common: set[T] = set()
+    onlyb: set[T] = set()
     for k in ka.union(kb):
         la = da.get(k, [])
         lb = db.get(k, [])
@@ -54,13 +52,13 @@ def eliminate_by(sa: Sequence[T], sb: Sequence[T], key):
     return onlya, common, onlyb


-def compare(before:
+def compare(before: list[DbVisit], after: list[DbVisit], between: str, *, log=True) -> list[DbVisit]:
     logger = get_logger()
     logger.info('comparing between: %s', between)

-    errors:
+    errors: list[DbVisit] = []

-    umap:
+    umap: dict[Url, list[DbVisit]] = {}
     for a in after:
         url = a.norm_url
         xx = umap.get(url, [])  # TODO canonify here?
@@ -71,7 +69,7 @@ def compare(before: List[DbVisit], after: List[DbVisit], between: str, *, log=Tr
         errors.append(b)
         if log:
             logger.error('between %s missing %s', between, b)
-            print('ignoreline "
+            print('ignoreline "{}", # {} {}'.format('exid', b.norm_url, b.src), file=sys.stderr)


     # the idea is that we eliminate items simultaneously from both sets
@@ -108,7 +106,7 @@ def get_files(args):
     if len(args.paths) == 0:
         int_dir = args.intermediate_dir
         assert int_dir.exists()
-        files =
+        files = sorted(int_dir.glob('*.sqlite*'))
         files = files[-args.last:]
     else:
         files = [Path(p) for p in args.paths]
@@ -126,7 +124,7 @@ def main():
     sys.exit(1)


-def compare_files(*files: Path, log=True) -> Iterator[
+def compare_files(*files: Path, log=True) -> Iterator[tuple[str, DbVisit]]:
     assert len(files) > 0

     logger = get_logger()
@@ -139,7 +137,7 @@ def compare_files(*files: Path, log=True) -> Iterator[Tuple[str, DbVisit]]:
         name = f.name
         this_dts = name[0: name.index('.')]  # can't use stem due to multiple extensions..

-        from promnesia.server import _get_stuff
+        from promnesia.server import _get_stuff  # TODO ugh
         engine, table = _get_stuff(PathWithMtime.make(f))

         with engine.connect() as conn:
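compare_files loads each database snapshot via _get_stuff and, per the return type above, yields (str, DbVisit) tuples; judging from the surrounding code, the string is the date prefix taken from the file name and the visits are the ones compare() reports as missing. A hypothetical invocation over two intermediate databases (the paths are made up):

    from pathlib import Path

    from promnesia.compare import compare_files

    # made-up paths to two intermediate promnesia databases
    old_db = Path('/tmp/promnesia/20241001.promnesia.sqlite')
    new_db = Path('/tmp/promnesia/20241021.promnesia.sqlite')

    for dts, visit in compare_files(old_db, new_db, log=False):
        print(dts, visit.norm_url)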
|