promnesia 1.2.20230515__py3-none-any.whl → 1.2.20240810__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promnesia/__main__.py +26 -14
- promnesia/cannon.py +4 -4
- promnesia/common.py +39 -28
- promnesia/compare.py +3 -2
- promnesia/config.py +4 -2
- promnesia/database/common.py +66 -0
- promnesia/database/dump.py +187 -0
- promnesia/{read_db.py → database/load.py} +10 -11
- promnesia/extract.py +1 -0
- promnesia/kjson.py +1 -1
- promnesia/logging.py +3 -3
- promnesia/misc/__init__.pyi +0 -0
- promnesia/misc/config_example.py +1 -2
- promnesia/misc/install_server.py +2 -3
- promnesia/server.py +18 -19
- promnesia/sources/__init__.pyi +0 -0
- promnesia/sources/auto.py +9 -7
- promnesia/sources/browser_legacy.py +11 -5
- promnesia/sources/demo.py +18 -2
- promnesia/sources/filetypes.py +7 -0
- promnesia/sources/github.py +2 -2
- promnesia/sources/hypothesis.py +1 -1
- promnesia/sources/markdown.py +15 -15
- promnesia/sources/org.py +7 -3
- promnesia/sources/plaintext.py +3 -1
- promnesia/sources/reddit.py +2 -2
- promnesia/sources/rss.py +1 -1
- promnesia/sources/signal.py +22 -14
- promnesia/sources/stackexchange.py +2 -2
- promnesia/sources/takeout.py +58 -1
- promnesia/sources/takeout_legacy.py +10 -2
- promnesia/tests/__init__.py +0 -0
- promnesia/tests/common.py +137 -0
- promnesia/tests/server_helper.py +64 -0
- promnesia/tests/sources/__init__.py +0 -0
- promnesia/tests/sources/test_auto.py +66 -0
- promnesia/tests/sources/test_filetypes.py +42 -0
- promnesia/tests/sources/test_hypothesis.py +39 -0
- promnesia/tests/sources/test_org.py +65 -0
- promnesia/tests/sources/test_plaintext.py +26 -0
- promnesia/tests/sources/test_shellcmd.py +22 -0
- promnesia/tests/sources/test_takeout.py +58 -0
- promnesia/tests/test_cannon.py +325 -0
- promnesia/tests/test_cli.py +42 -0
- promnesia/tests/test_compare.py +30 -0
- promnesia/tests/test_config.py +290 -0
- promnesia/tests/test_db_dump.py +223 -0
- promnesia/tests/test_extract.py +61 -0
- promnesia/tests/test_extract_urls.py +43 -0
- promnesia/tests/test_indexer.py +245 -0
- promnesia/tests/test_server.py +292 -0
- promnesia/tests/test_traverse.py +41 -0
- promnesia/tests/utils.py +35 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/METADATA +13 -17
- promnesia-1.2.20240810.dist-info/RECORD +83 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/WHEEL +1 -1
- {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/entry_points.txt +0 -1
- promnesia/dump.py +0 -105
- promnesia-1.2.20230515.dist-info/RECORD +0 -58
- {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/LICENSE +0 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/top_level.txt +0 -0
promnesia/__main__.py
CHANGED
@@ -4,11 +4,12 @@ import argparse
|
|
4
4
|
import ast
|
5
5
|
import importlib
|
6
6
|
import inspect
|
7
|
+
import os
|
7
8
|
from pathlib import Path
|
8
9
|
import shutil
|
9
10
|
from subprocess import run, check_call, Popen
|
10
11
|
import sys
|
11
|
-
from tempfile import TemporaryDirectory
|
12
|
+
from tempfile import TemporaryDirectory, gettempdir
|
12
13
|
from typing import Callable, Sequence, Iterable, Iterator, Union
|
13
14
|
|
14
15
|
|
@@ -17,7 +18,7 @@ from . import server
|
|
17
18
|
from .misc import install_server
|
18
19
|
from .common import Extractor, PathIsh, logger, get_tmpdir, DbVisit, Res
|
19
20
|
from .common import Source, get_system_tz, user_config_file, default_config_path
|
20
|
-
from .dump import visits_to_sqlite
|
21
|
+
from .database.dump import visits_to_sqlite
|
21
22
|
from .extract import extract_visits
|
22
23
|
|
23
24
|
|
@@ -96,22 +97,23 @@ def _do_index(dry: bool=False, sources_subset: Iterable[Union[str, int]]=(), ove
|
|
96
97
|
|
97
98
|
|
98
99
|
def do_index(
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
100
|
+
config_file: Path,
|
101
|
+
dry: bool=False,
|
102
|
+
sources_subset: Iterable[Union[str, int]]=(),
|
103
|
+
overwrite_db: bool=False,
|
104
|
+
) -> Sequence[Exception]:
|
104
105
|
config.load_from(config_file) # meh.. should be cleaner
|
105
106
|
try:
|
106
107
|
errors = list(_do_index(dry=dry, sources_subset=sources_subset, overwrite_db=overwrite_db))
|
107
108
|
finally:
|
109
|
+
# this reset is mainly for tests, so we don't end up reusing the same config by accident
|
108
110
|
config.reset()
|
109
111
|
if len(errors) > 0:
|
110
112
|
logger.error('%d errors, printing them out:', len(errors))
|
111
113
|
for e in errors:
|
112
114
|
logger.exception(e)
|
113
115
|
logger.error('%d errors, exit code 1', len(errors))
|
114
|
-
|
116
|
+
return errors
|
115
117
|
|
116
118
|
|
117
119
|
def demo_sources() -> dict[str, Callable[[], Extractor]]:
|
@@ -216,15 +218,23 @@ def config_check(args: argparse.Namespace) -> None:
|
|
216
218
|
def _config_check(cfg: Path) -> Iterable[Exception]:
|
217
219
|
logger.info('config: %s', cfg)
|
218
220
|
|
219
|
-
def check(cmd: list[str | Path]) -> Iterable[Exception]:
|
221
|
+
def check(cmd: list[str | Path], **kwargs) -> Iterable[Exception]:
|
220
222
|
logger.debug(' '.join(map(str, cmd)))
|
221
|
-
res = run(cmd)
|
223
|
+
res = run(cmd, **kwargs)
|
222
224
|
if res.returncode > 0:
|
223
225
|
yield Exception()
|
224
226
|
|
225
227
|
logger.info('Checking syntax...')
|
226
228
|
cmd: list[str | Path] = [sys.executable, '-m', 'compileall', cfg]
|
227
|
-
yield from check(
|
229
|
+
yield from check(
|
230
|
+
cmd,
|
231
|
+
env={
|
232
|
+
**os.environ,
|
233
|
+
# if config is on read only partition, the command would fail due to generated bytecode
|
234
|
+
# so put it in the temporary directory instead
|
235
|
+
'PYTHONPYCACHEPREFIX': gettempdir()
|
236
|
+
},
|
237
|
+
)
|
228
238
|
|
229
239
|
# todo not sure if should be more defensive than check_call here
|
230
240
|
logger.info('Checking type safety...')
|
@@ -317,14 +327,14 @@ def main() -> None:
|
|
317
327
|
)
|
318
328
|
|
319
329
|
F = lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, width=120)
|
320
|
-
p = argparse.ArgumentParser(formatter_class=F)
|
330
|
+
p = argparse.ArgumentParser(formatter_class=F)
|
321
331
|
subp = p.add_subparsers(dest='mode', )
|
322
332
|
ep = subp.add_parser('index', help='Create/update the link database', formatter_class=F)
|
323
333
|
add_index_args(ep, default_config_path())
|
324
334
|
# TODO use some way to override or provide config only via cmdline?
|
325
335
|
ep.add_argument('--intermediate', required=False, help="Used for development, you don't need it")
|
326
336
|
|
327
|
-
sp = subp.add_parser('serve', help='Serve a link database', formatter_class=F)
|
337
|
+
sp = subp.add_parser('serve', help='Serve a link database', formatter_class=F)
|
328
338
|
server.setup_parser(sp)
|
329
339
|
|
330
340
|
ap = subp.add_parser('demo', help='Demo mode: index and serve a directory in single command', formatter_class=F)
|
@@ -388,12 +398,14 @@ def main() -> None:
|
|
388
398
|
|
389
399
|
with get_tmpdir() as tdir: # TODO??
|
390
400
|
if mode == 'index':
|
391
|
-
do_index(
|
401
|
+
errors = do_index(
|
392
402
|
config_file=args.config,
|
393
403
|
dry=args.dry,
|
394
404
|
sources_subset=args.sources,
|
395
405
|
overwrite_db=args.overwrite,
|
396
406
|
)
|
407
|
+
if len(errors) > 0:
|
408
|
+
sys.exit(1)
|
397
409
|
elif mode == 'serve':
|
398
410
|
server.run(args)
|
399
411
|
elif mode == 'demo':
|
promnesia/cannon.py
CHANGED
@@ -422,7 +422,7 @@ def canonify(url: str) -> str:
|
|
422
422
|
qq = [(k, v) for i, k, v in sorted(iqq)]
|
423
423
|
# TODO still not sure what we should do..
|
424
424
|
# quote_plus replaces %20 with +, not sure if we want it...
|
425
|
-
query = urlencode(qq, quote_via=quote_via)
|
425
|
+
query = urlencode(qq, quote_via=quote_via)
|
426
426
|
|
427
427
|
path = _quote_path(path)
|
428
428
|
|
@@ -683,7 +683,7 @@ def domains(it): # pragma: no cover
|
|
683
683
|
try:
|
684
684
|
nurl = canonify(url)
|
685
685
|
except CanonifyException as e:
|
686
|
-
print(f"ERROR while normalising! {
|
686
|
+
print(f"ERROR while normalising! {url} {e}")
|
687
687
|
c['ERROR'] += 1
|
688
688
|
continue
|
689
689
|
else:
|
@@ -718,7 +718,7 @@ def groups(it, args): # pragma: no cover
|
|
718
718
|
try:
|
719
719
|
nurl = canonify(url)
|
720
720
|
except CanonifyException as e:
|
721
|
-
print(f"ERROR while normalising! {
|
721
|
+
print(f"ERROR while normalising! {url} {e}")
|
722
722
|
continue
|
723
723
|
udom = nurl[:nurl.find('/')]
|
724
724
|
usplit = udom.split('.')
|
@@ -818,7 +818,7 @@ def main() -> None: # pragma: no cover
|
|
818
818
|
|
819
819
|
- running comparison
|
820
820
|
sqlite3 promnesia.sqlite 'select distinct orig_url from visits where norm_url like "%twitter%" order by orig_url' | src/promnesia/cannon.py
|
821
|
-
''', formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=100)
|
821
|
+
''', formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=100)
|
822
822
|
)
|
823
823
|
p.add_argument('input', nargs='?')
|
824
824
|
p.add_argument('--human', action='store_true')
|
promnesia/common.py
CHANGED
@@ -19,6 +19,7 @@ from more_itertools import intersperse
|
|
19
19
|
import pytz
|
20
20
|
|
21
21
|
from .cannon import canonify
|
22
|
+
from .compat import removeprefix
|
22
23
|
|
23
24
|
|
24
25
|
_is_windows = os.name == 'nt'
|
@@ -76,13 +77,26 @@ class Loc(NamedTuple):
|
|
76
77
|
# but generally, it will be
|
77
78
|
# (url|file)(linenumber|json_path|anchor)
|
78
79
|
|
80
|
+
|
81
|
+
@lru_cache(None)
|
82
|
+
def warn_once(message: str) -> None:
|
83
|
+
# you'd think that warnings module already logs warnings only once per line..
|
84
|
+
# but sadly it's not the case
|
85
|
+
# see https://github.com/karlicoss/python_duplicate_warnings_investigation/blob/master/test.py
|
86
|
+
warnings.warn(message, stacklevel=2)
|
87
|
+
|
88
|
+
|
89
|
+
def _warn_no_xdg_mime() -> None:
|
90
|
+
warn_once("No xdg-mime on your OS! If you're on OSX, perhaps you can help me! https://github.com/karlicoss/open-in-editor/issues/1")
|
91
|
+
|
92
|
+
|
79
93
|
@lru_cache(1)
|
80
94
|
def _detect_mime_handler() -> str:
|
81
95
|
def exists(what: str) -> bool:
|
82
96
|
try:
|
83
97
|
r = run(f'xdg-mime query default x-scheme-handler/{what}'.split(), stdout=PIPE)
|
84
|
-
except FileNotFoundError:
|
85
|
-
|
98
|
+
except (FileNotFoundError, NotADirectoryError): # ugh seems that osx might throw NotADirectory for some reason
|
99
|
+
_warn_no_xdg_mime()
|
86
100
|
return False
|
87
101
|
if r.returncode > 0:
|
88
102
|
warnings.warn('xdg-mime failed') # hopefully rest is in stderr
|
@@ -102,6 +116,7 @@ def _detect_mime_handler() -> str:
|
|
102
116
|
result = 'emacs:'
|
103
117
|
|
104
118
|
# 2. now try to use newer editor:// thing
|
119
|
+
# TODO flip order here? should rely on editor:// first?
|
105
120
|
|
106
121
|
# TODO would be nice to collect warnings and display at the end
|
107
122
|
if not exists('editor'):
|
@@ -285,9 +300,10 @@ def _guess_name(thing: PreSource) -> str:
|
|
285
300
|
guess = thing.__module__
|
286
301
|
|
287
302
|
dflt = 'promnesia.sources.'
|
288
|
-
|
289
|
-
|
290
|
-
|
303
|
+
guess = removeprefix(guess, prefix=dflt)
|
304
|
+
if guess == 'config':
|
305
|
+
# this happens when we define a lambda in config or something without properly wrapping in Source
|
306
|
+
logger.warning(f'Inferred source name "config" for {thing}. This might be misleading TODO')
|
291
307
|
return guess
|
292
308
|
|
293
309
|
|
@@ -297,7 +313,7 @@ def _get_index_function(sourceish: PreSource) -> PreExtractor:
|
|
297
313
|
if hasattr(sourceish, 'index'): # must be a module
|
298
314
|
res = getattr(sourceish, 'index')
|
299
315
|
else:
|
300
|
-
res = sourceish
|
316
|
+
res = sourceish
|
301
317
|
return res
|
302
318
|
|
303
319
|
|
@@ -317,12 +333,17 @@ class Source:
|
|
317
333
|
self.extractor: Extractor = lambda: self.ff(*self.args, **self.kwargs)
|
318
334
|
if src is not None:
|
319
335
|
warnings.warn("'src' argument is deprecated, please use 'name' instead", DeprecationWarning)
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
336
|
+
if name != '':
|
337
|
+
self.name = name
|
338
|
+
elif src != '':
|
339
|
+
self.name = src
|
340
|
+
else:
|
341
|
+
try:
|
342
|
+
name_guess = _guess_name(ff)
|
343
|
+
except:
|
344
|
+
# todo warn?
|
345
|
+
name_guess = ''
|
346
|
+
self.name = name_guess
|
326
347
|
|
327
348
|
@property
|
328
349
|
def description(self) -> str:
|
@@ -371,7 +392,7 @@ def appdirs():
|
|
371
392
|
under_test = os.environ.get('PYTEST_CURRENT_TEST') is not None
|
372
393
|
# todo actually use test name?
|
373
394
|
name = 'promnesia-test' if under_test else 'promnesia'
|
374
|
-
import appdirs as ad # type: ignore[import]
|
395
|
+
import appdirs as ad # type: ignore[import-untyped]
|
375
396
|
return ad.AppDirs(appname=name)
|
376
397
|
|
377
398
|
|
@@ -461,13 +482,13 @@ def fdfind_args(root: Path, follow: bool, ignore: List[str]=[]) -> List[str]:
|
|
461
482
|
ignore_args = []
|
462
483
|
if ignore:
|
463
484
|
# Add a statement that excludes the folder
|
464
|
-
|
485
|
+
_ignore_args = [['--exclude', f'{n}'] for n in ignore]
|
465
486
|
# Flatten the list of lists
|
466
|
-
|
487
|
+
ignore_args = list(itertools.chain(*_ignore_args))
|
467
488
|
|
468
489
|
return [
|
469
490
|
*extra_fd_args(),
|
470
|
-
*
|
491
|
+
*ignore_args,
|
471
492
|
*(['--follow'] if follow else []),
|
472
493
|
'--type', 'f',
|
473
494
|
'.',
|
@@ -516,17 +537,7 @@ def traverse(root: Path, *, follow: bool=True, ignore: List[str]=[]) -> Iterable
|
|
516
537
|
def get_system_zone() -> str:
|
517
538
|
try:
|
518
539
|
import tzlocal
|
519
|
-
|
520
|
-
try:
|
521
|
-
# 4.0 way
|
522
|
-
return tzlocal.get_localzone_name() # type: ignore[attr-defined]
|
523
|
-
except AttributeError as e:
|
524
|
-
# 2.0 way
|
525
|
-
zone = tzlocal.get_localzone().zone # type: ignore[attr-defined]
|
526
|
-
# see https://github.com/python/typeshed/blame/968fd6d01d23470e0c8368e7ee7c43f54aaedc0e/stubs/pytz/pytz/tzinfo.pyi#L6
|
527
|
-
# it says all concrete instances should not be None
|
528
|
-
assert zone is not None
|
529
|
-
return zone
|
540
|
+
return tzlocal.get_localzone_name()
|
530
541
|
except Exception as e:
|
531
542
|
logger.exception(e)
|
532
543
|
logger.error("Couldn't determine system timezone. Falling back to UTC. Please report this as a bug!")
|
@@ -540,7 +551,7 @@ def get_system_tz() -> pytz.BaseTzInfo:
|
|
540
551
|
return pytz.timezone(zone)
|
541
552
|
except Exception as e:
|
542
553
|
logger.exception(e)
|
543
|
-
logger.error(
|
554
|
+
logger.error("Unknown time zone %s. Falling back to UTC. Please report this as a bug!", zone)
|
544
555
|
return pytz.utc
|
545
556
|
|
546
557
|
# used in misc/install_server.py
|
promnesia/compare.py
CHANGED
@@ -8,6 +8,7 @@ from typing import Dict, List, Any, NamedTuple, Optional, Iterator, Set, Tuple
|
|
8
8
|
|
9
9
|
|
10
10
|
from .common import DbVisit, Url, PathWithMtime # TODO ugh. figure out pythonpath
|
11
|
+
from .database.load import row_to_db_visit
|
11
12
|
|
12
13
|
# TODO include latest too?
|
13
14
|
# from cconfig import ignore, filtered
|
@@ -139,10 +140,10 @@ def compare_files(*files: Path, log=True) -> Iterator[Tuple[str, DbVisit]]:
|
|
139
140
|
this_dts = name[0: name.index('.')] # can't use stem due to multiple extensions..
|
140
141
|
|
141
142
|
from promnesia.server import _get_stuff # TODO ugh
|
142
|
-
engine,
|
143
|
+
engine, table = _get_stuff(PathWithMtime.make(f))
|
143
144
|
|
144
145
|
with engine.connect() as conn:
|
145
|
-
vis = [
|
146
|
+
vis = [row_to_db_visit(row) for row in conn.execute(table.select())]
|
146
147
|
|
147
148
|
if last is not None:
|
148
149
|
between = f'{last_dts}:{this_dts}'
|
promnesia/config.py
CHANGED
@@ -6,7 +6,7 @@ import importlib
|
|
6
6
|
import importlib.util
|
7
7
|
import warnings
|
8
8
|
|
9
|
-
from .common import PathIsh,
|
9
|
+
from .common import PathIsh, default_output_dir, default_cache_dir
|
10
10
|
from .common import Res, Source, DbVisit
|
11
11
|
|
12
12
|
|
@@ -69,6 +69,8 @@ class Config(NamedTuple):
|
|
69
69
|
|
70
70
|
@property
|
71
71
|
def cache_dir(self) -> Optional[Path]:
|
72
|
+
# TODO we used to use this for cachew, but it's best to rely on HPI modules etc to cofigure this
|
73
|
+
# keeping just in case for now
|
72
74
|
cd = self.CACHE_DIR
|
73
75
|
cpath: Optional[Path]
|
74
76
|
if cd is None:
|
@@ -127,7 +129,7 @@ def import_config(config_file: PathIsh) -> Config:
|
|
127
129
|
spec = importlib.util.spec_from_file_location(name, p); assert spec is not None
|
128
130
|
mod = importlib.util.module_from_spec(spec); assert mod is not None
|
129
131
|
loader = spec.loader; assert loader is not None
|
130
|
-
loader.exec_module(mod)
|
132
|
+
loader.exec_module(mod)
|
131
133
|
|
132
134
|
d = {}
|
133
135
|
for f in Config._fields:
|
@@ -0,0 +1,66 @@
|
|
1
|
+
from datetime import datetime
|
2
|
+
from typing import Sequence, Tuple
|
3
|
+
|
4
|
+
from sqlalchemy import (
|
5
|
+
Column,
|
6
|
+
Integer,
|
7
|
+
Row,
|
8
|
+
String,
|
9
|
+
)
|
10
|
+
|
11
|
+
# TODO maybe later move DbVisit here completely?
|
12
|
+
# kinda an issue that it's technically an "api" because hook in config can patch up DbVisit
|
13
|
+
from ..common import DbVisit, Loc
|
14
|
+
|
15
|
+
|
16
|
+
def get_columns() -> Sequence[Column]:
|
17
|
+
# fmt: off
|
18
|
+
res: Sequence[Column] = [
|
19
|
+
Column('norm_url' , String()),
|
20
|
+
Column('orig_url' , String()),
|
21
|
+
Column('dt' , String()),
|
22
|
+
Column('locator_title', String()),
|
23
|
+
Column('locator_href' , String()),
|
24
|
+
Column('src' , String()),
|
25
|
+
Column('context' , String()),
|
26
|
+
Column('duration' , Integer())
|
27
|
+
]
|
28
|
+
# fmt: on
|
29
|
+
assert len(res) == len(DbVisit._fields) + 1 # +1 because Locator is 'flattened'
|
30
|
+
return res
|
31
|
+
|
32
|
+
|
33
|
+
def db_visit_to_row(v: DbVisit) -> Tuple:
|
34
|
+
# ugh, very hacky...
|
35
|
+
# we want to make sure the resulting tuple only consists of simple types
|
36
|
+
# so we can use dbengine directly
|
37
|
+
dt_s = v.dt.isoformat()
|
38
|
+
row = (
|
39
|
+
v.norm_url,
|
40
|
+
v.orig_url,
|
41
|
+
dt_s,
|
42
|
+
v.locator.title,
|
43
|
+
v.locator.href,
|
44
|
+
v.src,
|
45
|
+
v.context,
|
46
|
+
v.duration,
|
47
|
+
)
|
48
|
+
return row
|
49
|
+
|
50
|
+
|
51
|
+
def row_to_db_visit(row: Sequence) -> DbVisit:
|
52
|
+
(norm_url, orig_url, dt_s, locator_title, locator_href, src, context, duration) = row
|
53
|
+
dt_s = dt_s.split()[0] # backwards compatibility: previously it could be a string separated with tz name
|
54
|
+
dt = datetime.fromisoformat(dt_s)
|
55
|
+
return DbVisit(
|
56
|
+
norm_url=norm_url,
|
57
|
+
orig_url=orig_url,
|
58
|
+
dt=dt,
|
59
|
+
locator=Loc(
|
60
|
+
title=locator_title,
|
61
|
+
href=locator_href,
|
62
|
+
),
|
63
|
+
src=src,
|
64
|
+
context=context,
|
65
|
+
duration=duration,
|
66
|
+
)
|
@@ -0,0 +1,187 @@
|
|
1
|
+
from pathlib import Path
|
2
|
+
import sqlite3
|
3
|
+
from typing import Dict, Iterable, List, Optional, Set
|
4
|
+
|
5
|
+
from more_itertools import chunked
|
6
|
+
|
7
|
+
from sqlalchemy import (
|
8
|
+
Engine,
|
9
|
+
MetaData,
|
10
|
+
Table,
|
11
|
+
create_engine,
|
12
|
+
event,
|
13
|
+
exc,
|
14
|
+
func,
|
15
|
+
select,
|
16
|
+
)
|
17
|
+
from sqlalchemy.dialects import sqlite as dialect_sqlite
|
18
|
+
|
19
|
+
from ..common import (
|
20
|
+
DbVisit,
|
21
|
+
Loc,
|
22
|
+
Res,
|
23
|
+
SourceName,
|
24
|
+
get_logger,
|
25
|
+
now_tz,
|
26
|
+
)
|
27
|
+
from .common import get_columns, db_visit_to_row
|
28
|
+
from .. import config
|
29
|
+
|
30
|
+
|
31
|
+
# NOTE: I guess the main performance benefit from this is not creating too many tmp lists and avoiding overhead
|
32
|
+
# since as far as sql is concerned it should all be in the same transaction. only a guess
|
33
|
+
# not sure it's the proper way to handle it
|
34
|
+
# see test_index_many
|
35
|
+
_CHUNK_BY = 10
|
36
|
+
|
37
|
+
# I guess 1 hour is definitely enough
|
38
|
+
_CONNECTION_TIMEOUT_SECONDS = 3600
|
39
|
+
|
40
|
+
SRC_ERROR = 'error'
|
41
|
+
|
42
|
+
|
43
|
+
# using WAL keeps database readable while we're writing in it
|
44
|
+
# this is tested by test_query_while_indexing
|
45
|
+
def enable_wal(dbapi_con, con_record) -> None:
|
46
|
+
dbapi_con.execute('PRAGMA journal_mode = WAL')
|
47
|
+
|
48
|
+
|
49
|
+
def begin_immediate_transaction(conn):
|
50
|
+
conn.exec_driver_sql('BEGIN IMMEDIATE')
|
51
|
+
|
52
|
+
|
53
|
+
Stats = Dict[Optional[SourceName], int]
|
54
|
+
|
55
|
+
|
56
|
+
# returns critical warnings
|
57
|
+
def visits_to_sqlite(
|
58
|
+
vit: Iterable[Res[DbVisit]],
|
59
|
+
*,
|
60
|
+
overwrite_db: bool,
|
61
|
+
_db_path: Optional[Path] = None, # only used in tests
|
62
|
+
) -> List[Exception]:
|
63
|
+
if _db_path is None:
|
64
|
+
db_path = config.get().db
|
65
|
+
else:
|
66
|
+
db_path = _db_path
|
67
|
+
|
68
|
+
logger = get_logger()
|
69
|
+
|
70
|
+
now = now_tz()
|
71
|
+
|
72
|
+
index_stats: Stats = {}
|
73
|
+
|
74
|
+
def vit_ok() -> Iterable[DbVisit]:
|
75
|
+
for v in vit:
|
76
|
+
ev: DbVisit
|
77
|
+
if isinstance(v, DbVisit):
|
78
|
+
ev = v
|
79
|
+
else:
|
80
|
+
# conform to the schema and dump. can't hurt anyway
|
81
|
+
ev = DbVisit(
|
82
|
+
norm_url='<error>',
|
83
|
+
orig_url='<error>',
|
84
|
+
dt=now,
|
85
|
+
locator=Loc.make('<errror>'),
|
86
|
+
src=SRC_ERROR,
|
87
|
+
# todo attach backtrace?
|
88
|
+
context=repr(v),
|
89
|
+
)
|
90
|
+
index_stats[ev.src] = index_stats.get(ev.src, 0) + 1
|
91
|
+
yield ev
|
92
|
+
|
93
|
+
meta = MetaData()
|
94
|
+
table = Table('visits', meta, *get_columns())
|
95
|
+
|
96
|
+
def query_total_stats(conn) -> Stats:
|
97
|
+
query = select(table.c.src, func.count(table.c.src)).select_from(table).group_by(table.c.src)
|
98
|
+
return {src: cnt for (src, cnt) in conn.execute(query).all()}
|
99
|
+
|
100
|
+
def get_engine(*args, **kwargs) -> Engine:
|
101
|
+
# kwargs['echo'] = True # useful for debugging
|
102
|
+
e = create_engine(*args, **kwargs)
|
103
|
+
event.listen(e, 'connect', enable_wal)
|
104
|
+
return e
|
105
|
+
|
106
|
+
### use readonly database just to get stats
|
107
|
+
pengine = get_engine('sqlite://', creator=lambda: sqlite3.connect(f"file:{db_path}?mode=ro", uri=True))
|
108
|
+
stats_before: Stats
|
109
|
+
try:
|
110
|
+
with pengine.begin() as conn:
|
111
|
+
stats_before = query_total_stats(conn)
|
112
|
+
except exc.OperationalError as oe:
|
113
|
+
if oe.code == 'e3q8':
|
114
|
+
# db doesn't exist yet
|
115
|
+
stats_before = {}
|
116
|
+
else:
|
117
|
+
raise oe
|
118
|
+
pengine.dispose()
|
119
|
+
###
|
120
|
+
|
121
|
+
# needtimeout, othewise concurrent indexing might not work
|
122
|
+
# (note that this also requires WAL mode)
|
123
|
+
engine = get_engine(f'sqlite:///{db_path}', connect_args={'timeout': _CONNECTION_TIMEOUT_SECONDS})
|
124
|
+
|
125
|
+
cleared: Set[str] = set()
|
126
|
+
|
127
|
+
# by default, sqlalchemy does some sort of BEGIN (implicit) transaction, which doesn't provide proper isolation??
|
128
|
+
# see https://docs.sqlalchemy.org/en/20/dialects/sqlite.html#serializable-isolation-savepoints-transactional-ddl
|
129
|
+
event.listen(engine, 'begin', begin_immediate_transaction)
|
130
|
+
# TODO to allow more concurrent indexing, maybe could instead write to a temporary table?
|
131
|
+
# or collect visits first and only then start writing to the db to minimize db access window.. not sure
|
132
|
+
|
133
|
+
# engine.begin() starts a transaction
|
134
|
+
# so everything inside this block will be atomic to the outside observers
|
135
|
+
with engine.begin() as conn:
|
136
|
+
table.create(conn, checkfirst=True)
|
137
|
+
|
138
|
+
if overwrite_db:
|
139
|
+
conn.execute(table.delete())
|
140
|
+
|
141
|
+
insert_stmt = table.insert()
|
142
|
+
# using raw statement gives a massive speedup for inserting visits
|
143
|
+
# see test_benchmark_visits_dumping
|
144
|
+
insert_stmt_raw = str(insert_stmt.compile(dialect=dialect_sqlite.dialect(paramstyle='qmark')))
|
145
|
+
|
146
|
+
for chunk in chunked(vit_ok(), n=_CHUNK_BY):
|
147
|
+
srcs = set(v.src or '' for v in chunk)
|
148
|
+
new = srcs.difference(cleared)
|
149
|
+
|
150
|
+
for src in new:
|
151
|
+
conn.execute(table.delete().where(table.c.src == src))
|
152
|
+
cleared.add(src)
|
153
|
+
|
154
|
+
bound = [db_visit_to_row(v) for v in chunk]
|
155
|
+
conn.exec_driver_sql(insert_stmt_raw, bound)
|
156
|
+
|
157
|
+
stats_after = query_total_stats(conn)
|
158
|
+
engine.dispose()
|
159
|
+
|
160
|
+
stats_changes = {}
|
161
|
+
# map str just in case some srcs are None
|
162
|
+
for k in sorted(map(str, {*stats_before.keys(), *stats_after.keys()})):
|
163
|
+
diff = stats_after.get(k, 0) - stats_before.get(k, 0)
|
164
|
+
if diff == 0:
|
165
|
+
continue
|
166
|
+
sdiff = ('+' if diff > 0 else '') + str(diff)
|
167
|
+
stats_changes[k] = sdiff
|
168
|
+
|
169
|
+
action = 'overwritten' if overwrite_db else 'updated'
|
170
|
+
total_indexed = sum(index_stats.values())
|
171
|
+
total_err = index_stats.get(SRC_ERROR, 0)
|
172
|
+
total_ok = total_indexed - total_err
|
173
|
+
logger.info(f'indexed (current run) : total: {total_indexed}, ok: {total_ok}, errors: {total_err} {index_stats}')
|
174
|
+
logger.info(f'database "{db_path}" : {action}')
|
175
|
+
logger.info(f'database stats before : {stats_before}')
|
176
|
+
logger.info(f'database stats after : {stats_after}')
|
177
|
+
|
178
|
+
if len(stats_changes) == 0:
|
179
|
+
logger.info('database stats changes: no changes')
|
180
|
+
else:
|
181
|
+
for k, v in stats_changes.items():
|
182
|
+
logger.info(f'database stats changes: {k} {v}')
|
183
|
+
|
184
|
+
res: List[Exception] = []
|
185
|
+
if total_ok == 0:
|
186
|
+
res.append(RuntimeError('No visits were indexed, something is probably wrong!'))
|
187
|
+
return res
|
@@ -1,32 +1,29 @@
|
|
1
1
|
from pathlib import Path
|
2
2
|
from typing import Tuple, List
|
3
3
|
|
4
|
-
from cachew import NTBinder
|
5
4
|
from sqlalchemy import (
|
6
5
|
create_engine,
|
7
6
|
exc,
|
7
|
+
Engine,
|
8
8
|
MetaData,
|
9
9
|
Index,
|
10
10
|
Table,
|
11
11
|
)
|
12
|
-
from sqlalchemy.engine import Engine
|
13
12
|
|
14
|
-
from .common import DbVisit
|
13
|
+
from .common import DbVisit, get_columns, row_to_db_visit
|
15
14
|
|
16
15
|
|
17
|
-
DbStuff = Tuple[Engine,
|
16
|
+
DbStuff = Tuple[Engine, Table]
|
18
17
|
|
19
18
|
|
20
19
|
def get_db_stuff(db_path: Path) -> DbStuff:
|
21
20
|
assert db_path.exists(), db_path
|
22
21
|
# todo how to open read only?
|
23
22
|
# actually not sure if we can since we are creating an index here
|
24
|
-
engine = create_engine(f'sqlite:///{db_path}')
|
25
|
-
|
26
|
-
binder = NTBinder.make(DbVisit)
|
23
|
+
engine = create_engine(f'sqlite:///{db_path}') # , echo=True)
|
27
24
|
|
28
25
|
meta = MetaData()
|
29
|
-
table = Table('visits', meta, *
|
26
|
+
table = Table('visits', meta, *get_columns())
|
30
27
|
|
31
28
|
idx = Index('index_norm_url', table.c.norm_url)
|
32
29
|
try:
|
@@ -39,13 +36,15 @@ def get_db_stuff(db_path: Path) -> DbStuff:
|
|
39
36
|
raise e
|
40
37
|
|
41
38
|
# NOTE: apparently it's ok to open connection on every request? at least my comparisons didn't show anything
|
42
|
-
return engine,
|
39
|
+
return engine, table
|
43
40
|
|
44
41
|
|
45
42
|
def get_all_db_visits(db_path: Path) -> List[DbVisit]:
|
46
43
|
# NOTE: this is pretty inefficient if the DB is huge
|
47
44
|
# mostly intended for tests
|
48
|
-
engine,
|
45
|
+
engine, table = get_db_stuff(db_path)
|
49
46
|
query = table.select()
|
50
47
|
with engine.connect() as conn:
|
51
|
-
|
48
|
+
res = [row_to_db_visit(row) for row in conn.execute(query)]
|
49
|
+
engine.dispose()
|
50
|
+
return res
|
promnesia/extract.py
CHANGED
promnesia/kjson.py
CHANGED
@@ -74,7 +74,7 @@ def test_json_processor():
|
|
74
74
|
handled = []
|
75
75
|
class Proc(JsonProcessor):
|
76
76
|
def handle_dict(self, value: JDict, path):
|
77
|
-
if 'skipme' in self.kpath(path):
|
77
|
+
if 'skipme' in self.kpath(path): # type: ignore[comparison-overlap]
|
78
78
|
return JsonProcessor.SKIP
|
79
79
|
|
80
80
|
def handle_str(self, value: str, path):
|