promnesia 1.2.20230515__py3-none-any.whl → 1.3.20241021__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promnesia/__init__.py +14 -3
- promnesia/__main__.py +60 -35
- promnesia/cannon.py +27 -27
- promnesia/common.py +85 -67
- promnesia/compare.py +21 -22
- promnesia/compat.py +10 -10
- promnesia/config.py +23 -23
- promnesia/database/common.py +67 -0
- promnesia/database/dump.py +188 -0
- promnesia/{read_db.py → database/load.py} +16 -17
- promnesia/extract.py +14 -11
- promnesia/kjson.py +12 -11
- promnesia/logging.py +4 -4
- promnesia/misc/__init__.pyi +0 -0
- promnesia/misc/config_example.py +1 -2
- promnesia/misc/install_server.py +7 -9
- promnesia/server.py +57 -47
- promnesia/sources/__init__.pyi +0 -0
- promnesia/sources/auto.py +50 -35
- promnesia/sources/auto_logseq.py +6 -5
- promnesia/sources/auto_obsidian.py +2 -2
- promnesia/sources/browser.py +14 -9
- promnesia/sources/browser_legacy.py +26 -16
- promnesia/sources/demo.py +19 -3
- promnesia/sources/fbmessenger.py +3 -2
- promnesia/sources/filetypes.py +16 -7
- promnesia/sources/github.py +7 -9
- promnesia/sources/guess.py +2 -1
- promnesia/sources/hackernews.py +2 -2
- promnesia/sources/hpi.py +2 -2
- promnesia/sources/html.py +7 -5
- promnesia/sources/hypothesis.py +4 -3
- promnesia/sources/instapaper.py +2 -2
- promnesia/sources/markdown.py +31 -21
- promnesia/sources/org.py +27 -13
- promnesia/sources/plaintext.py +30 -29
- promnesia/sources/pocket.py +3 -2
- promnesia/sources/reddit.py +20 -19
- promnesia/sources/roamresearch.py +2 -1
- promnesia/sources/rss.py +4 -5
- promnesia/sources/shellcmd.py +19 -6
- promnesia/sources/signal.py +33 -24
- promnesia/sources/smscalls.py +2 -2
- promnesia/sources/stackexchange.py +4 -3
- promnesia/sources/takeout.py +76 -9
- promnesia/sources/takeout_legacy.py +24 -12
- promnesia/sources/telegram.py +13 -11
- promnesia/sources/telegram_legacy.py +18 -7
- promnesia/sources/twitter.py +6 -5
- promnesia/sources/vcs.py +5 -3
- promnesia/sources/viber.py +10 -9
- promnesia/sources/website.py +4 -4
- promnesia/sources/zulip.py +3 -2
- promnesia/sqlite.py +7 -4
- promnesia/tests/__init__.py +0 -0
- promnesia/tests/common.py +140 -0
- promnesia/tests/server_helper.py +67 -0
- promnesia/tests/sources/__init__.py +0 -0
- promnesia/tests/sources/test_auto.py +65 -0
- promnesia/tests/sources/test_filetypes.py +43 -0
- promnesia/tests/sources/test_hypothesis.py +39 -0
- promnesia/tests/sources/test_org.py +64 -0
- promnesia/tests/sources/test_plaintext.py +25 -0
- promnesia/tests/sources/test_shellcmd.py +21 -0
- promnesia/tests/sources/test_takeout.py +56 -0
- promnesia/tests/test_cannon.py +325 -0
- promnesia/tests/test_cli.py +40 -0
- promnesia/tests/test_compare.py +30 -0
- promnesia/tests/test_config.py +289 -0
- promnesia/tests/test_db_dump.py +222 -0
- promnesia/tests/test_extract.py +65 -0
- promnesia/tests/test_extract_urls.py +43 -0
- promnesia/tests/test_indexer.py +251 -0
- promnesia/tests/test_server.py +291 -0
- promnesia/tests/test_traverse.py +39 -0
- promnesia/tests/utils.py +35 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/METADATA +15 -18
- promnesia-1.3.20241021.dist-info/RECORD +83 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/WHEEL +1 -1
- {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/entry_points.txt +0 -1
- promnesia/dump.py +0 -105
- promnesia-1.2.20230515.dist-info/RECORD +0 -58
- {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/LICENSE +0 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/top_level.txt +0 -0
promnesia/common.py
CHANGED
@@ -1,26 +1,29 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from contextlib import contextmanager
|
4
|
-
from datetime import datetime, date
|
5
|
-
from functools import lru_cache
|
6
|
-
from glob import glob
|
7
3
|
import itertools
|
8
4
|
import logging
|
9
5
|
import os
|
10
|
-
|
6
|
+
import re
|
11
7
|
import shutil
|
12
|
-
|
8
|
+
import tempfile
|
9
|
+
import warnings
|
10
|
+
from collections.abc import Iterable, Sequence
|
11
|
+
from contextlib import contextmanager
|
12
|
+
from copy import copy
|
13
|
+
from datetime import date, datetime
|
14
|
+
from functools import lru_cache
|
15
|
+
from glob import glob
|
16
|
+
from pathlib import Path
|
17
|
+
from subprocess import PIPE, Popen, run
|
13
18
|
from timeit import default_timer as timer
|
14
19
|
from types import ModuleType
|
15
|
-
from typing import
|
16
|
-
import warnings
|
20
|
+
from typing import TYPE_CHECKING, Callable, NamedTuple, Optional, TypeVar, Union
|
17
21
|
|
18
|
-
from more_itertools import intersperse
|
19
22
|
import pytz
|
23
|
+
from more_itertools import intersperse
|
20
24
|
|
21
25
|
from .cannon import canonify
|
22
26
|
|
23
|
-
|
24
27
|
_is_windows = os.name == 'nt'
|
25
28
|
|
26
29
|
T = TypeVar('T')
|
@@ -37,14 +40,14 @@ Second = int
|
|
37
40
|
# TODO hmm. arguably, source and context are almost same things...
|
38
41
|
class Loc(NamedTuple):
|
39
42
|
title: str
|
40
|
-
href: Optional[str]=None
|
43
|
+
href: Optional[str] = None # noqa: UP007 # looks like hypothesis doesn't like in on python <= 3.9
|
41
44
|
|
42
45
|
@classmethod
|
43
|
-
def make(cls, title: str, href:
|
46
|
+
def make(cls, title: str, href: str | None=None) -> Loc:
|
44
47
|
return cls(title=title, href=href)
|
45
48
|
|
46
49
|
@classmethod
|
47
|
-
def file(cls, path: PathIsh, line:
|
50
|
+
def file(cls, path: PathIsh, line: int | None=None, relative_to: Path | None=None) -> Loc:
|
48
51
|
lstr = '' if line is None else f':{line}'
|
49
52
|
# todo loc should be url encoded? dunno.
|
50
53
|
# or use line=? eh. I don't know. Just ask in issues.
|
@@ -76,13 +79,26 @@ class Loc(NamedTuple):
|
|
76
79
|
# but generally, it will be
|
77
80
|
# (url|file)(linenumber|json_path|anchor)
|
78
81
|
|
82
|
+
|
83
|
+
@lru_cache(maxsize=None)
def warn_once(message: str) -> None:
    """Emit ``message`` as a warning, at most once per distinct message text.

    The ``warnings`` module on its own does not reliably de-duplicate repeated
    warnings (see
    https://github.com/karlicoss/python_duplicate_warnings_investigation/blob/master/test.py),
    so de-duplication is done here by memoising on the message string.
    """
    # stacklevel=2 attributes the warning to whoever called warn_once
    warnings.warn(message, UserWarning, stacklevel=2)
|
89
|
+
|
90
|
+
|
91
|
+
def _warn_no_xdg_mime() -> None:
    """Warn that ``xdg-mime`` is unavailable on this OS.

    Delegates to ``warn_once``, so repeated calls produce a single warning.
    """
    message = "No xdg-mime on your OS! If you're on OSX, perhaps you can help me! https://github.com/karlicoss/open-in-editor/issues/1"
    warn_once(message)
|
93
|
+
|
94
|
+
|
79
95
|
@lru_cache(1)
|
80
96
|
def _detect_mime_handler() -> str:
|
81
97
|
def exists(what: str) -> bool:
|
82
98
|
try:
|
83
|
-
r = run(f'xdg-mime query default x-scheme-handler/{what}'.split(), stdout=PIPE)
|
84
|
-
except FileNotFoundError:
|
85
|
-
|
99
|
+
r = run(f'xdg-mime query default x-scheme-handler/{what}'.split(), stdout=PIPE, check=False)
|
100
|
+
except (FileNotFoundError, NotADirectoryError): # ugh seems that osx might throw NotADirectory for some reason
|
101
|
+
_warn_no_xdg_mime()
|
86
102
|
return False
|
87
103
|
if r.returncode > 0:
|
88
104
|
warnings.warn('xdg-mime failed') # hopefully rest is in stderr
|
@@ -102,6 +118,7 @@ def _detect_mime_handler() -> str:
|
|
102
118
|
result = 'emacs:'
|
103
119
|
|
104
120
|
# 2. now try to use newer editor:// thing
|
121
|
+
# TODO flip order here? should rely on editor:// first?
|
105
122
|
|
106
123
|
# TODO would be nice to collect warnings and display at the end
|
107
124
|
if not exists('editor'):
|
@@ -124,12 +141,12 @@ class Visit(NamedTuple):
|
|
124
141
|
# TODO back to DatetimeIsh, but somehow make compatible to dbcache?
|
125
142
|
dt: datetime
|
126
143
|
locator: Loc
|
127
|
-
context:
|
128
|
-
duration:
|
144
|
+
context: Context | None = None
|
145
|
+
duration: Second | None = None
|
129
146
|
# TODO shit. I need to insert it in chrome db....
|
130
147
|
# TODO gonna be hard to fill retroactively.
|
131
148
|
# spent: Optional[Second] = None
|
132
|
-
debug:
|
149
|
+
debug: str | None = None
|
133
150
|
|
134
151
|
Result = Union[Visit, Exception]
|
135
152
|
Results = Iterable[Result]
|
@@ -142,12 +159,12 @@ class DbVisit(NamedTuple):
|
|
142
159
|
orig_url: Url
|
143
160
|
dt: datetime
|
144
161
|
locator: Loc
|
145
|
-
src: Optional[SourceName] = None
|
146
|
-
context: Optional[Context] = None
|
147
|
-
duration: Optional[Second] = None
|
162
|
+
src: Optional[SourceName] = None # noqa: UP007 # looks like hypothesis doesn't like in on python <= 3.9
|
163
|
+
context: Optional[Context] = None # noqa: UP007 # looks like hypothesis doesn't like in on python <= 3.9
|
164
|
+
duration: Optional[Second] = None # noqa: UP007 # looks like hypothesis doesn't like in on python <= 3.9
|
148
165
|
|
149
166
|
@staticmethod
|
150
|
-
def make(p: Visit, src: SourceName) -> Res[
|
167
|
+
def make(p: Visit, src: SourceName) -> Res[DbVisit]:
|
151
168
|
try:
|
152
169
|
# hmm, mypy gets a bit confused here.. presumably because datetime is always datetime (but date is not datetime)
|
153
170
|
if isinstance(p.dt, datetime):
|
@@ -156,7 +173,7 @@ class DbVisit(NamedTuple):
|
|
156
173
|
# TODO that won't be with timezone..
|
157
174
|
dt = datetime.combine(p.dt, datetime.min.time()) # meh..
|
158
175
|
else:
|
159
|
-
raise AssertionError(f'unexpected date: {p.dt}, {type(p.dt)}')
|
176
|
+
raise AssertionError(f'unexpected date: {p.dt}, {type(p.dt)}') # noqa: TRY301
|
160
177
|
except Exception as e:
|
161
178
|
return e
|
162
179
|
|
@@ -181,6 +198,7 @@ Filter = Callable[[Url], bool]
|
|
181
198
|
|
182
199
|
|
183
200
|
from .logging import LazyLogger
|
201
|
+
|
184
202
|
logger = LazyLogger('promnesia', level='DEBUG')
|
185
203
|
|
186
204
|
def get_logger() -> logging.Logger:
|
@@ -189,7 +207,6 @@ def get_logger() -> logging.Logger:
|
|
189
207
|
|
190
208
|
|
191
209
|
|
192
|
-
import tempfile
|
193
210
|
# kinda singleton
|
194
211
|
@lru_cache(1)
|
195
212
|
def get_tmpdir() -> tempfile.TemporaryDirectory[str]:
|
@@ -203,7 +220,7 @@ Syntax = str
|
|
203
220
|
|
204
221
|
@lru_cache(None)
|
205
222
|
def _get_urlextractor(syntax: Syntax):
|
206
|
-
from urlextract import URLExtract
|
223
|
+
from urlextract import URLExtract # type: ignore
|
207
224
|
u = URLExtract()
|
208
225
|
# https://github.com/lipoja/URLExtract/issues/13
|
209
226
|
if syntax in {'org', 'orgmode', 'org-mode'}: # TODO remove hardcoding..
|
@@ -234,7 +251,7 @@ def iter_urls(s: str, *, syntax: Syntax='') -> Iterable[Url]:
|
|
234
251
|
yield _sanitize(u)
|
235
252
|
|
236
253
|
|
237
|
-
def extract_urls(s: str, *, syntax: Syntax='') ->
|
254
|
+
def extract_urls(s: str, *, syntax: Syntax='') -> list[Url]:
|
238
255
|
return list(iter_urls(s=s, syntax=syntax))
|
239
256
|
|
240
257
|
|
@@ -259,7 +276,7 @@ class PathWithMtime(NamedTuple):
|
|
259
276
|
mtime: float
|
260
277
|
|
261
278
|
@classmethod
|
262
|
-
def make(cls, p: Path) ->
|
279
|
+
def make(cls, p: Path) -> PathWithMtime:
|
263
280
|
return cls(
|
264
281
|
path=p,
|
265
282
|
mtime=p.stat().st_mtime,
|
@@ -285,9 +302,10 @@ def _guess_name(thing: PreSource) -> str:
|
|
285
302
|
guess = thing.__module__
|
286
303
|
|
287
304
|
dflt = 'promnesia.sources.'
|
288
|
-
|
289
|
-
|
290
|
-
|
305
|
+
guess = guess.removeprefix(dflt)
|
306
|
+
if guess == 'config':
|
307
|
+
# this happens when we define a lambda in config or something without properly wrapping in Source
|
308
|
+
logger.warning(f'Inferred source name "config" for {thing}. This might be misleading TODO')
|
291
309
|
return guess
|
292
310
|
|
293
311
|
|
@@ -297,7 +315,7 @@ def _get_index_function(sourceish: PreSource) -> PreExtractor:
|
|
297
315
|
if hasattr(sourceish, 'index'): # must be a module
|
298
316
|
res = getattr(sourceish, 'index')
|
299
317
|
else:
|
300
|
-
res = sourceish
|
318
|
+
res = sourceish
|
301
319
|
return res
|
302
320
|
|
303
321
|
|
@@ -317,12 +335,17 @@ class Source:
|
|
317
335
|
self.extractor: Extractor = lambda: self.ff(*self.args, **self.kwargs)
|
318
336
|
if src is not None:
|
319
337
|
warnings.warn("'src' argument is deprecated, please use 'name' instead", DeprecationWarning)
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
338
|
+
if name != '':
|
339
|
+
self.name = name
|
340
|
+
elif src != '':
|
341
|
+
self.name = src
|
342
|
+
else:
|
343
|
+
try:
|
344
|
+
name_guess = _guess_name(ff)
|
345
|
+
except:
|
346
|
+
# todo warn?
|
347
|
+
name_guess = ''
|
348
|
+
self.name = name_guess
|
326
349
|
|
327
350
|
@property
|
328
351
|
def description(self) -> str:
|
@@ -341,13 +364,14 @@ Indexer = Source
|
|
341
364
|
# NOTE: used in configs...
|
342
365
|
def last(path: PathIsh, *parts: str) -> Path:
|
343
366
|
import os.path
|
344
|
-
pp = os.path.join(str(path), *parts)
|
345
|
-
return Path(max(glob(pp, recursive=True)))
|
367
|
+
pp = os.path.join(str(path), *parts) # noqa: PTH118
|
368
|
+
return Path(max(glob(pp, recursive=True))) # noqa: PTH207
|
346
369
|
|
347
370
|
|
348
|
-
from .logging import setup_logger
|
371
|
+
from .logging import setup_logger # noqa: F401
|
349
372
|
|
350
|
-
|
373
|
+
|
374
|
+
# TODO get rid of this? not sure if still necessary
|
351
375
|
def echain(ex: Exception, cause: Exception) -> Exception:
|
352
376
|
e = copy(ex)
|
353
377
|
e.__cause__ = cause
|
@@ -361,7 +385,6 @@ def echain(ex: Exception, cause: Exception) -> Exception:
|
|
361
385
|
|
362
386
|
def slugify(x: str) -> str:
|
363
387
|
# https://stackoverflow.com/a/38766141/706389
|
364
|
-
import re
|
365
388
|
valid_file_name = re.sub(r'[^\w_.)( -]', '', x)
|
366
389
|
return valid_file_name
|
367
390
|
|
@@ -371,7 +394,7 @@ def appdirs():
|
|
371
394
|
under_test = os.environ.get('PYTEST_CURRENT_TEST') is not None
|
372
395
|
# todo actually use test name?
|
373
396
|
name = 'promnesia-test' if under_test else 'promnesia'
|
374
|
-
import appdirs as ad
|
397
|
+
import appdirs as ad # type: ignore[import-untyped]
|
375
398
|
return ad.AppDirs(appname=name)
|
376
399
|
|
377
400
|
|
@@ -388,13 +411,13 @@ def default_cache_dir() -> Path:
|
|
388
411
|
# make it lazy, otherwise it might crash on module import (e.g. on Windows)
|
389
412
|
# ideally would be nice to fix it properly https://github.com/ahupp/python-magic#windows
|
390
413
|
@lru_cache(1)
|
391
|
-
def _magic() -> Callable[[PathIsh],
|
414
|
+
def _magic() -> Callable[[PathIsh], str | None]:
|
392
415
|
logger = get_logger()
|
393
416
|
try:
|
394
|
-
import magic
|
417
|
+
import magic # type: ignore
|
395
418
|
except Exception as e:
|
396
419
|
logger.exception(e)
|
397
|
-
defensive_msg:
|
420
|
+
defensive_msg: str | None = None
|
398
421
|
if isinstance(e, ModuleNotFoundError) and e.name == 'magic':
|
399
422
|
defensive_msg = "python-magic is not detected. It's recommended for better file type detection (pip3 install --user python-magic). See https://github.com/ahupp/python-magic#installation"
|
400
423
|
elif isinstance(e, ImportError):
|
@@ -404,7 +427,7 @@ def _magic() -> Callable[[PathIsh], Optional[str]]:
|
|
404
427
|
if defensive_msg is not None:
|
405
428
|
logger.warning(defensive_msg)
|
406
429
|
warnings.warn(defensive_msg)
|
407
|
-
return lambda path: None #
|
430
|
+
return lambda path: None # stub # noqa: ARG005
|
408
431
|
else:
|
409
432
|
raise e
|
410
433
|
else:
|
@@ -420,7 +443,7 @@ def _mimetypes():
|
|
420
443
|
return mimetypes
|
421
444
|
|
422
445
|
|
423
|
-
def mime(path: PathIsh) ->
|
446
|
+
def mime(path: PathIsh) -> str | None:
|
424
447
|
ps = str(path)
|
425
448
|
mimetypes = _mimetypes()
|
426
449
|
# first try mimetypes, it's only using the filename without opening the file
|
@@ -432,7 +455,7 @@ def mime(path: PathIsh) -> Optional[str]:
|
|
432
455
|
return magic(ps)
|
433
456
|
|
434
457
|
|
435
|
-
def find_args(root: Path, follow: bool, ignore:
|
458
|
+
def find_args(root: Path, *, follow: bool, ignore: Sequence[str] = ()) -> list[str]:
|
436
459
|
prune_dir_args = []
|
437
460
|
ignore_file_args = []
|
438
461
|
if ignore:
|
@@ -455,19 +478,19 @@ def find_args(root: Path, follow: bool, ignore: List[str]=[]) -> List[str]:
|
|
455
478
|
]
|
456
479
|
|
457
480
|
|
458
|
-
def fdfind_args(root: Path, follow: bool, ignore:
|
481
|
+
def fdfind_args(root: Path, *, follow: bool, ignore: Sequence[str] = ()) -> list[str]:
|
459
482
|
from .config import extra_fd_args
|
460
483
|
|
461
484
|
ignore_args = []
|
462
485
|
if ignore:
|
463
486
|
# Add a statement that excludes the folder
|
464
|
-
|
487
|
+
_ignore_args = [['--exclude', f'{n}'] for n in ignore]
|
465
488
|
# Flatten the list of lists
|
466
|
-
|
489
|
+
ignore_args = list(itertools.chain(*_ignore_args))
|
467
490
|
|
468
491
|
return [
|
469
492
|
*extra_fd_args(),
|
470
|
-
*
|
493
|
+
*ignore_args,
|
471
494
|
*(['--follow'] if follow else []),
|
472
495
|
'--type', 'f',
|
473
496
|
'.',
|
@@ -475,7 +498,7 @@ def fdfind_args(root: Path, follow: bool, ignore: List[str]=[]) -> List[str]:
|
|
475
498
|
]
|
476
499
|
|
477
500
|
|
478
|
-
def traverse(root: Path, *, follow: bool=True, ignore:
|
501
|
+
def traverse(root: Path, *, follow: bool=True, ignore: Sequence[str] = ()) -> Iterable[Path]:
|
479
502
|
if not root.is_dir():
|
480
503
|
yield root
|
481
504
|
return
|
@@ -516,17 +539,7 @@ def traverse(root: Path, *, follow: bool=True, ignore: List[str]=[]) -> Iterable
|
|
516
539
|
def get_system_zone() -> str:
|
517
540
|
try:
|
518
541
|
import tzlocal
|
519
|
-
|
520
|
-
try:
|
521
|
-
# 4.0 way
|
522
|
-
return tzlocal.get_localzone_name() # type: ignore[attr-defined]
|
523
|
-
except AttributeError as e:
|
524
|
-
# 2.0 way
|
525
|
-
zone = tzlocal.get_localzone().zone # type: ignore[attr-defined]
|
526
|
-
# see https://github.com/python/typeshed/blame/968fd6d01d23470e0c8368e7ee7c43f54aaedc0e/stubs/pytz/pytz/tzinfo.pyi#L6
|
527
|
-
# it says all concrete instances should not be None
|
528
|
-
assert zone is not None
|
529
|
-
return zone
|
542
|
+
return tzlocal.get_localzone_name()
|
530
543
|
except Exception as e:
|
531
544
|
logger.exception(e)
|
532
545
|
logger.error("Couldn't determine system timezone. Falling back to UTC. Please report this as a bug!")
|
@@ -540,7 +553,7 @@ def get_system_tz() -> pytz.BaseTzInfo:
|
|
540
553
|
return pytz.timezone(zone)
|
541
554
|
except Exception as e:
|
542
555
|
logger.exception(e)
|
543
|
-
logger.error(
|
556
|
+
logger.error("Unknown time zone %s. Falling back to UTC. Please report this as a bug!", zone)
|
544
557
|
return pytz.utc
|
545
558
|
|
546
559
|
# used in misc/install_server.py
|
@@ -594,3 +607,8 @@ def is_sqlite_db(x: Path) -> bool:
|
|
594
607
|
'application/vnd.sqlite3',
|
595
608
|
# TODO this mime can also match wal files/journals, not sure
|
596
609
|
}
|
610
|
+
|
611
|
+
|
612
|
+
if not TYPE_CHECKING:
|
613
|
+
# todo deprecate properly --just backwards compat
|
614
|
+
from .compat import removeprefix # noqa: F401
|
promnesia/compare.py
CHANGED
@@ -1,13 +1,15 @@
|
|
1
|
-
|
1
|
+
from __future__ import annotations
|
2
|
+
|
2
3
|
# TODO perhaps make it external script?
|
3
4
|
import argparse
|
4
|
-
from pathlib import Path
|
5
5
|
import logging
|
6
6
|
import sys
|
7
|
-
from
|
8
|
-
|
7
|
+
from collections.abc import Iterator, Sequence
|
8
|
+
from pathlib import Path
|
9
|
+
from typing import TypeVar
|
9
10
|
|
10
|
-
from .common import DbVisit,
|
11
|
+
from .common import DbVisit, PathWithMtime, Url
|
12
|
+
from .database.load import row_to_db_visit
|
11
13
|
|
12
14
|
# TODO include latest too?
|
13
15
|
# from cconfig import ignore, filtered
|
@@ -18,14 +20,11 @@ def get_logger():
|
|
18
20
|
# TODO return error depending on severity?
|
19
21
|
|
20
22
|
|
21
|
-
from typing import TypeVar, Sequence
|
22
|
-
|
23
|
-
|
24
23
|
T = TypeVar('T')
|
25
24
|
|
26
25
|
def eliminate_by(sa: Sequence[T], sb: Sequence[T], key):
|
27
|
-
def make_dict(s: Sequence[T]) ->
|
28
|
-
res:
|
26
|
+
def make_dict(s: Sequence[T]) -> dict[str, list[T]]:
|
27
|
+
res: dict[str, list[T]] = {}
|
29
28
|
for a in s:
|
30
29
|
k = key(a)
|
31
30
|
ll = res.get(k, None)
|
@@ -38,9 +37,9 @@ def eliminate_by(sa: Sequence[T], sb: Sequence[T], key):
|
|
38
37
|
db = make_dict(sb)
|
39
38
|
ka = set(da.keys())
|
40
39
|
kb = set(db.keys())
|
41
|
-
onlya:
|
42
|
-
common:
|
43
|
-
onlyb:
|
40
|
+
onlya: set[T] = set()
|
41
|
+
common: set[T] = set()
|
42
|
+
onlyb: set[T] = set()
|
44
43
|
for k in ka.union(kb):
|
45
44
|
la = da.get(k, [])
|
46
45
|
lb = db.get(k, [])
|
@@ -53,13 +52,13 @@ def eliminate_by(sa: Sequence[T], sb: Sequence[T], key):
|
|
53
52
|
return onlya, common, onlyb
|
54
53
|
|
55
54
|
|
56
|
-
def compare(before:
|
55
|
+
def compare(before: list[DbVisit], after: list[DbVisit], between: str, *, log=True) -> list[DbVisit]:
|
57
56
|
logger = get_logger()
|
58
57
|
logger.info('comparing between: %s', between)
|
59
58
|
|
60
|
-
errors:
|
59
|
+
errors: list[DbVisit] = []
|
61
60
|
|
62
|
-
umap:
|
61
|
+
umap: dict[Url, list[DbVisit]] = {}
|
63
62
|
for a in after:
|
64
63
|
url = a.norm_url
|
65
64
|
xx = umap.get(url, []) # TODO canonify here?
|
@@ -70,7 +69,7 @@ def compare(before: List[DbVisit], after: List[DbVisit], between: str, *, log=Tr
|
|
70
69
|
errors.append(b)
|
71
70
|
if log:
|
72
71
|
logger.error('between %s missing %s', between, b)
|
73
|
-
print('ignoreline "
|
72
|
+
print('ignoreline "{}", # {} {}'.format('exid', b.norm_url, b.src), file=sys.stderr)
|
74
73
|
|
75
74
|
|
76
75
|
# the idea is that we eliminate items simultaneously from both sets
|
@@ -107,7 +106,7 @@ def get_files(args):
|
|
107
106
|
if len(args.paths) == 0:
|
108
107
|
int_dir = args.intermediate_dir
|
109
108
|
assert int_dir.exists()
|
110
|
-
files =
|
109
|
+
files = sorted(int_dir.glob('*.sqlite*'))
|
111
110
|
files = files[-args.last:]
|
112
111
|
else:
|
113
112
|
files = [Path(p) for p in args.paths]
|
@@ -125,7 +124,7 @@ def main():
|
|
125
124
|
sys.exit(1)
|
126
125
|
|
127
126
|
|
128
|
-
def compare_files(*files: Path, log=True) -> Iterator[
|
127
|
+
def compare_files(*files: Path, log=True) -> Iterator[tuple[str, DbVisit]]:
|
129
128
|
assert len(files) > 0
|
130
129
|
|
131
130
|
logger = get_logger()
|
@@ -138,11 +137,11 @@ def compare_files(*files: Path, log=True) -> Iterator[Tuple[str, DbVisit]]:
|
|
138
137
|
name = f.name
|
139
138
|
this_dts = name[0: name.index('.')] # can't use stem due to multiple extensions..
|
140
139
|
|
141
|
-
from promnesia.server import _get_stuff
|
142
|
-
engine,
|
140
|
+
from promnesia.server import _get_stuff # TODO ugh
|
141
|
+
engine, table = _get_stuff(PathWithMtime.make(f))
|
143
142
|
|
144
143
|
with engine.connect() as conn:
|
145
|
-
vis = [
|
144
|
+
vis = [row_to_db_visit(row) for row in conn.execute(table.select())]
|
146
145
|
|
147
146
|
if last is not None:
|
148
147
|
between = f'{last_dts}:{this_dts}'
|
promnesia/compat.py
CHANGED
@@ -1,12 +1,12 @@
|
|
1
|
-
|
2
|
-
## keeping in case any sources depended on compat functions
|
3
|
-
from subprocess import PIPE, run, check_call, check_output, Popen
|
4
|
-
from typing import Protocol, Literal
|
5
|
-
##
|
1
|
+
from typing import TYPE_CHECKING
|
6
2
|
|
3
|
+
if not TYPE_CHECKING:
|
4
|
+
## we used to have compat fixes here for these for python3.7
|
5
|
+
## keeping in case any sources depended on compat functions
|
6
|
+
from subprocess import PIPE, Popen, check_call, check_output, run # noqa: F401
|
7
|
+
from typing import Literal, Protocol # noqa: F401
|
8
|
+
##
|
7
9
|
|
8
|
-
#
|
9
|
-
def removeprefix(text: str, prefix: str) -> str:
|
10
|
-
|
11
|
-
return text[len(prefix):]
|
12
|
-
return text
|
10
|
+
# todo deprecate properly
|
11
|
+
def removeprefix(text: str, prefix: str) -> str:
    """Return ``text`` with a leading ``prefix`` stripped, if it is present.

    Thin backwards-compatibility wrapper around ``str.removeprefix``; when
    ``text`` does not start with ``prefix`` it is returned unchanged.
    """
    stripped = text.removeprefix(prefix)
    return stripped
|
promnesia/config.py
CHANGED
@@ -1,21 +1,19 @@
|
|
1
|
-
from
|
2
|
-
|
3
|
-
from types import ModuleType
|
4
|
-
from typing import List, Optional, Union, NamedTuple, Iterable, Callable
|
1
|
+
from __future__ import annotations
|
2
|
+
|
5
3
|
import importlib
|
6
4
|
import importlib.util
|
5
|
+
import os
|
7
6
|
import warnings
|
7
|
+
from collections.abc import Iterable
|
8
|
+
from pathlib import Path
|
9
|
+
from types import ModuleType
|
10
|
+
from typing import Callable, NamedTuple, Union
|
8
11
|
|
9
|
-
from .common import
|
10
|
-
from .common import Res, Source, DbVisit
|
11
|
-
|
12
|
+
from .common import DbVisit, PathIsh, Res, Source, default_cache_dir, default_output_dir
|
12
13
|
|
13
14
|
HookT = Callable[[Res[DbVisit]], Iterable[Res[DbVisit]]]
|
14
15
|
|
15
16
|
|
16
|
-
from typing import Any
|
17
|
-
|
18
|
-
|
19
17
|
ModuleName = str
|
20
18
|
|
21
19
|
# something that can be converted into a proper Source
|
@@ -24,19 +22,19 @@ ConfigSource = Union[Source, ModuleName, ModuleType]
|
|
24
22
|
|
25
23
|
class Config(NamedTuple):
|
26
24
|
# TODO remove default from sources once migrated
|
27
|
-
SOURCES:
|
25
|
+
SOURCES: list[ConfigSource] = []
|
28
26
|
|
29
27
|
# if not specified, uses user data dir
|
30
|
-
OUTPUT_DIR:
|
28
|
+
OUTPUT_DIR: PathIsh | None = None
|
31
29
|
|
32
|
-
CACHE_DIR:
|
33
|
-
FILTERS:
|
30
|
+
CACHE_DIR: PathIsh | None = ''
|
31
|
+
FILTERS: list[str] = []
|
34
32
|
|
35
|
-
HOOK:
|
33
|
+
HOOK: HookT | None = None
|
36
34
|
|
37
35
|
#
|
38
36
|
# NOTE: INDEXERS is deprecated, use SOURCES instead
|
39
|
-
INDEXERS:
|
37
|
+
INDEXERS: list[ConfigSource] = []
|
40
38
|
#MIME_HANDLER: Optional[str] = None # TODO
|
41
39
|
|
42
40
|
@property
|
@@ -68,9 +66,11 @@ class Config(NamedTuple):
|
|
68
66
|
yield Source(r)
|
69
67
|
|
70
68
|
@property
|
71
|
-
def cache_dir(self) ->
|
69
|
+
def cache_dir(self) -> Path | None:
|
70
|
+
# TODO we used to use this for cachew, but it's best to rely on HPI modules etc to configure this
|
71
|
+
# keeping just in case for now
|
72
72
|
cd = self.CACHE_DIR
|
73
|
-
cpath:
|
73
|
+
cpath: Path | None
|
74
74
|
if cd is None:
|
75
75
|
cpath = None # means 'disabled' in cachew
|
76
76
|
elif cd == '': # meh.. but need to make it None friendly..
|
@@ -94,10 +94,10 @@ class Config(NamedTuple):
|
|
94
94
|
return self.output_dir / 'promnesia.sqlite'
|
95
95
|
|
96
96
|
@property
|
97
|
-
def hook(self) ->
|
97
|
+
def hook(self) -> HookT | None:
|
98
98
|
return self.HOOK
|
99
99
|
|
100
|
-
instance:
|
100
|
+
instance: Config | None = None
|
101
101
|
|
102
102
|
|
103
103
|
def has() -> bool:
|
@@ -127,7 +127,7 @@ def import_config(config_file: PathIsh) -> Config:
|
|
127
127
|
spec = importlib.util.spec_from_file_location(name, p); assert spec is not None
|
128
128
|
mod = importlib.util.module_from_spec(spec); assert mod is not None
|
129
129
|
loader = spec.loader; assert loader is not None
|
130
|
-
loader.exec_module(mod)
|
130
|
+
loader.exec_module(mod)
|
131
131
|
|
132
132
|
d = {}
|
133
133
|
for f in Config._fields:
|
@@ -137,7 +137,7 @@ def import_config(config_file: PathIsh) -> Config:
|
|
137
137
|
|
138
138
|
|
139
139
|
# TODO: ugh. this causes warnings to be repeated multiple times... need to reuse the pool or something..
|
140
|
-
def use_cores() ->
|
140
|
+
def use_cores() -> int | None:
|
141
141
|
'''
|
142
142
|
Somewhat experimental.
|
143
143
|
For now only used in sources.auto, perhaps later will be shared among the other indexers.
|
@@ -152,7 +152,7 @@ def use_cores() -> Optional[int]:
|
|
152
152
|
return 0
|
153
153
|
|
154
154
|
|
155
|
-
def extra_fd_args() ->
|
155
|
+
def extra_fd_args() -> list[str]:
|
156
156
|
'''
|
157
157
|
Not sure where it belongs yet... so via env variable for now
|
158
158
|
Can be used to pass --ignore-file parameter
|
@@ -0,0 +1,67 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from collections.abc import Sequence
|
4
|
+
from datetime import datetime
|
5
|
+
|
6
|
+
from sqlalchemy import (
|
7
|
+
Column,
|
8
|
+
Integer,
|
9
|
+
String,
|
10
|
+
)
|
11
|
+
|
12
|
+
# TODO maybe later move DbVisit here completely?
|
13
|
+
# kinda an issue that it's technically an "api" because hook in config can patch up DbVisit
|
14
|
+
from ..common import DbVisit, Loc
|
15
|
+
|
16
|
+
|
17
|
+
def get_columns() -> Sequence[Column]:
    """Build the table columns used to persist a ``DbVisit``.

    The layout mirrors the ``DbVisit`` fields, except that the locator is
    stored 'flattened' as two columns (``locator_title`` and ``locator_href``).
    A fresh list of ``Column`` objects is created on every call.
    """
    # fmt: off
    layout = (
        ('norm_url'     , String),
        ('orig_url'     , String),
        ('dt'           , String),
        ('locator_title', String),
        ('locator_href' , String),
        ('src'          , String),
        ('context'      , String),
        ('duration'     , Integer),
    )
    # fmt: on
    columns: Sequence[Column] = [Column(col_name, col_type()) for col_name, col_type in layout]
    # +1 because Locator is 'flattened'
    assert len(columns) == len(DbVisit._fields) + 1
    return columns
|
32
|
+
|
33
|
+
|
34
|
+
def db_visit_to_row(v: DbVisit) -> tuple:
    """Flatten a ``DbVisit`` into a tuple of simple values.

    The timestamp is serialised with ``isoformat()`` and the locator is split
    into its ``title`` and ``href`` parts, so the result consists only of
    simple types and can be handed to the db engine directly (hacky, but
    deliberate). Element order:
    ``(norm_url, orig_url, dt, locator_title, locator_href, src, context, duration)``.
    """
    locator = v.locator
    return (
        v.norm_url,
        v.orig_url,
        v.dt.isoformat(),
        locator.title,
        locator.href,
        v.src,
        v.context,
        v.duration,
    )
|
50
|
+
|
51
|
+
|
52
|
+
def row_to_db_visit(row: Sequence) -> DbVisit:
    """Rebuild a ``DbVisit`` from a flat 8-element database row.

    Expects the elements in the order
    ``(norm_url, orig_url, dt, locator_title, locator_href, src, context, duration)``;
    the two locator elements are folded back into a ``Loc``.
    """
    norm_url, orig_url, raw_dt, loc_title, loc_href, src, context, duration = row
    # backwards compatibility: previously it could be a string separated with tz name,
    # so only the first whitespace-delimited token is parsed
    iso_dt = raw_dt.split()[0]
    parsed_dt = datetime.fromisoformat(iso_dt)
    locator = Loc(title=loc_title, href=loc_href)
    return DbVisit(
        norm_url=norm_url,
        orig_url=orig_url,
        dt=parsed_dt,
        locator=locator,
        src=src,
        context=context,
        duration=duration,
    )
|