promnesia 1.2.20230515__py3-none-any.whl → 1.2.20240810__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. promnesia/__main__.py +26 -14
  2. promnesia/cannon.py +4 -4
  3. promnesia/common.py +39 -28
  4. promnesia/compare.py +3 -2
  5. promnesia/config.py +4 -2
  6. promnesia/database/common.py +66 -0
  7. promnesia/database/dump.py +187 -0
  8. promnesia/{read_db.py → database/load.py} +10 -11
  9. promnesia/extract.py +1 -0
  10. promnesia/kjson.py +1 -1
  11. promnesia/logging.py +3 -3
  12. promnesia/misc/__init__.pyi +0 -0
  13. promnesia/misc/config_example.py +1 -2
  14. promnesia/misc/install_server.py +2 -3
  15. promnesia/server.py +18 -19
  16. promnesia/sources/__init__.pyi +0 -0
  17. promnesia/sources/auto.py +9 -7
  18. promnesia/sources/browser_legacy.py +11 -5
  19. promnesia/sources/demo.py +18 -2
  20. promnesia/sources/filetypes.py +7 -0
  21. promnesia/sources/github.py +2 -2
  22. promnesia/sources/hypothesis.py +1 -1
  23. promnesia/sources/markdown.py +15 -15
  24. promnesia/sources/org.py +7 -3
  25. promnesia/sources/plaintext.py +3 -1
  26. promnesia/sources/reddit.py +2 -2
  27. promnesia/sources/rss.py +1 -1
  28. promnesia/sources/signal.py +22 -14
  29. promnesia/sources/stackexchange.py +2 -2
  30. promnesia/sources/takeout.py +58 -1
  31. promnesia/sources/takeout_legacy.py +10 -2
  32. promnesia/tests/__init__.py +0 -0
  33. promnesia/tests/common.py +137 -0
  34. promnesia/tests/server_helper.py +64 -0
  35. promnesia/tests/sources/__init__.py +0 -0
  36. promnesia/tests/sources/test_auto.py +66 -0
  37. promnesia/tests/sources/test_filetypes.py +42 -0
  38. promnesia/tests/sources/test_hypothesis.py +39 -0
  39. promnesia/tests/sources/test_org.py +65 -0
  40. promnesia/tests/sources/test_plaintext.py +26 -0
  41. promnesia/tests/sources/test_shellcmd.py +22 -0
  42. promnesia/tests/sources/test_takeout.py +58 -0
  43. promnesia/tests/test_cannon.py +325 -0
  44. promnesia/tests/test_cli.py +42 -0
  45. promnesia/tests/test_compare.py +30 -0
  46. promnesia/tests/test_config.py +290 -0
  47. promnesia/tests/test_db_dump.py +223 -0
  48. promnesia/tests/test_extract.py +61 -0
  49. promnesia/tests/test_extract_urls.py +43 -0
  50. promnesia/tests/test_indexer.py +245 -0
  51. promnesia/tests/test_server.py +292 -0
  52. promnesia/tests/test_traverse.py +41 -0
  53. promnesia/tests/utils.py +35 -0
  54. {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/METADATA +13 -17
  55. promnesia-1.2.20240810.dist-info/RECORD +83 -0
  56. {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/WHEEL +1 -1
  57. {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/entry_points.txt +0 -1
  58. promnesia/dump.py +0 -105
  59. promnesia-1.2.20230515.dist-info/RECORD +0 -58
  60. {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/LICENSE +0 -0
  61. {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/top_level.txt +0 -0
promnesia/__main__.py CHANGED
@@ -4,11 +4,12 @@ import argparse
4
4
  import ast
5
5
  import importlib
6
6
  import inspect
7
+ import os
7
8
  from pathlib import Path
8
9
  import shutil
9
10
  from subprocess import run, check_call, Popen
10
11
  import sys
11
- from tempfile import TemporaryDirectory
12
+ from tempfile import TemporaryDirectory, gettempdir
12
13
  from typing import Callable, Sequence, Iterable, Iterator, Union
13
14
 
14
15
 
@@ -17,7 +18,7 @@ from . import server
17
18
  from .misc import install_server
18
19
  from .common import Extractor, PathIsh, logger, get_tmpdir, DbVisit, Res
19
20
  from .common import Source, get_system_tz, user_config_file, default_config_path
20
- from .dump import visits_to_sqlite
21
+ from .database.dump import visits_to_sqlite
21
22
  from .extract import extract_visits
22
23
 
23
24
 
@@ -96,22 +97,23 @@ def _do_index(dry: bool=False, sources_subset: Iterable[Union[str, int]]=(), ove
96
97
 
97
98
 
98
99
  def do_index(
99
- config_file: Path,
100
- dry: bool=False,
101
- sources_subset: Iterable[Union[str, int]]=(),
102
- overwrite_db: bool=False,
103
- ) -> None:
100
+ config_file: Path,
101
+ dry: bool=False,
102
+ sources_subset: Iterable[Union[str, int]]=(),
103
+ overwrite_db: bool=False,
104
+ ) -> Sequence[Exception]:
104
105
  config.load_from(config_file) # meh.. should be cleaner
105
106
  try:
106
107
  errors = list(_do_index(dry=dry, sources_subset=sources_subset, overwrite_db=overwrite_db))
107
108
  finally:
109
+ # this reset is mainly for tests, so we don't end up reusing the same config by accident
108
110
  config.reset()
109
111
  if len(errors) > 0:
110
112
  logger.error('%d errors, printing them out:', len(errors))
111
113
  for e in errors:
112
114
  logger.exception(e)
113
115
  logger.error('%d errors, exit code 1', len(errors))
114
- sys.exit(1)
116
+ return errors
115
117
 
116
118
 
117
119
  def demo_sources() -> dict[str, Callable[[], Extractor]]:
@@ -216,15 +218,23 @@ def config_check(args: argparse.Namespace) -> None:
216
218
  def _config_check(cfg: Path) -> Iterable[Exception]:
217
219
  logger.info('config: %s', cfg)
218
220
 
219
- def check(cmd: list[str | Path]) -> Iterable[Exception]:
221
+ def check(cmd: list[str | Path], **kwargs) -> Iterable[Exception]:
220
222
  logger.debug(' '.join(map(str, cmd)))
221
- res = run(cmd)
223
+ res = run(cmd, **kwargs)
222
224
  if res.returncode > 0:
223
225
  yield Exception()
224
226
 
225
227
  logger.info('Checking syntax...')
226
228
  cmd: list[str | Path] = [sys.executable, '-m', 'compileall', cfg]
227
- yield from check(cmd)
229
+ yield from check(
230
+ cmd,
231
+ env={
232
+ **os.environ,
233
+ # if config is on read only partition, the command would fail due to generated bytecode
234
+ # so put it in the temporary directory instead
235
+ 'PYTHONPYCACHEPREFIX': gettempdir()
236
+ },
237
+ )
228
238
 
229
239
  # todo not sure if should be more defensive than check_call here
230
240
  logger.info('Checking type safety...')
@@ -317,14 +327,14 @@ def main() -> None:
317
327
  )
318
328
 
319
329
  F = lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, width=120)
320
- p = argparse.ArgumentParser(formatter_class=F) # type: ignore
330
+ p = argparse.ArgumentParser(formatter_class=F)
321
331
  subp = p.add_subparsers(dest='mode', )
322
332
  ep = subp.add_parser('index', help='Create/update the link database', formatter_class=F)
323
333
  add_index_args(ep, default_config_path())
324
334
  # TODO use some way to override or provide config only via cmdline?
325
335
  ep.add_argument('--intermediate', required=False, help="Used for development, you don't need it")
326
336
 
327
- sp = subp.add_parser('serve', help='Serve a link database', formatter_class=F) # type: ignore
337
+ sp = subp.add_parser('serve', help='Serve a link database', formatter_class=F)
328
338
  server.setup_parser(sp)
329
339
 
330
340
  ap = subp.add_parser('demo', help='Demo mode: index and serve a directory in single command', formatter_class=F)
@@ -388,12 +398,14 @@ def main() -> None:
388
398
 
389
399
  with get_tmpdir() as tdir: # TODO??
390
400
  if mode == 'index':
391
- do_index(
401
+ errors = do_index(
392
402
  config_file=args.config,
393
403
  dry=args.dry,
394
404
  sources_subset=args.sources,
395
405
  overwrite_db=args.overwrite,
396
406
  )
407
+ if len(errors) > 0:
408
+ sys.exit(1)
397
409
  elif mode == 'serve':
398
410
  server.run(args)
399
411
  elif mode == 'demo':
promnesia/cannon.py CHANGED
@@ -422,7 +422,7 @@ def canonify(url: str) -> str:
422
422
  qq = [(k, v) for i, k, v in sorted(iqq)]
423
423
  # TODO still not sure what we should do..
424
424
  # quote_plus replaces %20 with +, not sure if we want it...
425
- query = urlencode(qq, quote_via=quote_via) # type: ignore[type-var]
425
+ query = urlencode(qq, quote_via=quote_via)
426
426
 
427
427
  path = _quote_path(path)
428
428
 
@@ -683,7 +683,7 @@ def domains(it): # pragma: no cover
683
683
  try:
684
684
  nurl = canonify(url)
685
685
  except CanonifyException as e:
686
- print(f"ERROR while normalising! {nurl} {e}")
686
+ print(f"ERROR while normalising! {url} {e}")
687
687
  c['ERROR'] += 1
688
688
  continue
689
689
  else:
@@ -718,7 +718,7 @@ def groups(it, args): # pragma: no cover
718
718
  try:
719
719
  nurl = canonify(url)
720
720
  except CanonifyException as e:
721
- print(f"ERROR while normalising! {nurl} {e}")
721
+ print(f"ERROR while normalising! {url} {e}")
722
722
  continue
723
723
  udom = nurl[:nurl.find('/')]
724
724
  usplit = udom.split('.')
@@ -818,7 +818,7 @@ def main() -> None: # pragma: no cover
818
818
 
819
819
  - running comparison
820
820
  sqlite3 promnesia.sqlite 'select distinct orig_url from visits where norm_url like "%twitter%" order by orig_url' | src/promnesia/cannon.py
821
- ''', formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=100) # type: ignore
821
+ ''', formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=100)
822
822
  )
823
823
  p.add_argument('input', nargs='?')
824
824
  p.add_argument('--human', action='store_true')
promnesia/common.py CHANGED
@@ -19,6 +19,7 @@ from more_itertools import intersperse
19
19
  import pytz
20
20
 
21
21
  from .cannon import canonify
22
+ from .compat import removeprefix
22
23
 
23
24
 
24
25
  _is_windows = os.name == 'nt'
@@ -76,13 +77,26 @@ class Loc(NamedTuple):
76
77
  # but generally, it will be
77
78
  # (url|file)(linenumber|json_path|anchor)
78
79
 
80
+
81
+ @lru_cache(None)
82
+ def warn_once(message: str) -> None:
83
+ # you'd think that warnings module already logs warnings only once per line..
84
+ # but sadly it's not the case
85
+ # see https://github.com/karlicoss/python_duplicate_warnings_investigation/blob/master/test.py
86
+ warnings.warn(message, stacklevel=2)
87
+
88
+
89
+ def _warn_no_xdg_mime() -> None:
90
+ warn_once("No xdg-mime on your OS! If you're on OSX, perhaps you can help me! https://github.com/karlicoss/open-in-editor/issues/1")
91
+
92
+
79
93
  @lru_cache(1)
80
94
  def _detect_mime_handler() -> str:
81
95
  def exists(what: str) -> bool:
82
96
  try:
83
97
  r = run(f'xdg-mime query default x-scheme-handler/{what}'.split(), stdout=PIPE)
84
- except FileNotFoundError:
85
- warnings.warn("No xdg-mime on your OS! If you're on OSX, perhaps you can help me! https://github.com/karlicoss/open-in-editor/issues/1")
98
+ except (FileNotFoundError, NotADirectoryError): # ugh seems that osx might throw NotADirectory for some reason
99
+ _warn_no_xdg_mime()
86
100
  return False
87
101
  if r.returncode > 0:
88
102
  warnings.warn('xdg-mime failed') # hopefully rest is in stderr
@@ -102,6 +116,7 @@ def _detect_mime_handler() -> str:
102
116
  result = 'emacs:'
103
117
 
104
118
  # 2. now try to use newer editor:// thing
119
+ # TODO flip order here? should rely on editor:// first?
105
120
 
106
121
  # TODO would be nice to collect warnings and display at the end
107
122
  if not exists('editor'):
@@ -285,9 +300,10 @@ def _guess_name(thing: PreSource) -> str:
285
300
  guess = thing.__module__
286
301
 
287
302
  dflt = 'promnesia.sources.'
288
- if guess.startswith(dflt):
289
- # meh
290
- guess = guess[len(dflt):]
303
+ guess = removeprefix(guess, prefix=dflt)
304
+ if guess == 'config':
305
+ # this happens when we define a lambda in config or something without properly wrapping in Source
306
+ logger.warning(f'Inferred source name "config" for {thing}. This might be misleading TODO')
291
307
  return guess
292
308
 
293
309
 
@@ -297,7 +313,7 @@ def _get_index_function(sourceish: PreSource) -> PreExtractor:
297
313
  if hasattr(sourceish, 'index'): # must be a module
298
314
  res = getattr(sourceish, 'index')
299
315
  else:
300
- res = sourceish # type: ignore[assignment]
316
+ res = sourceish
301
317
  return res
302
318
 
303
319
 
@@ -317,12 +333,17 @@ class Source:
317
333
  self.extractor: Extractor = lambda: self.ff(*self.args, **self.kwargs)
318
334
  if src is not None:
319
335
  warnings.warn("'src' argument is deprecated, please use 'name' instead", DeprecationWarning)
320
- try:
321
- name_guess = _guess_name(ff)
322
- except:
323
- # todo warn?
324
- name_guess = ''
325
- self.name = name or src or name_guess
336
+ if name != '':
337
+ self.name = name
338
+ elif src != '':
339
+ self.name = src
340
+ else:
341
+ try:
342
+ name_guess = _guess_name(ff)
343
+ except:
344
+ # todo warn?
345
+ name_guess = ''
346
+ self.name = name_guess
326
347
 
327
348
  @property
328
349
  def description(self) -> str:
@@ -371,7 +392,7 @@ def appdirs():
371
392
  under_test = os.environ.get('PYTEST_CURRENT_TEST') is not None
372
393
  # todo actually use test name?
373
394
  name = 'promnesia-test' if under_test else 'promnesia'
374
- import appdirs as ad # type: ignore[import]
395
+ import appdirs as ad # type: ignore[import-untyped]
375
396
  return ad.AppDirs(appname=name)
376
397
 
377
398
 
@@ -461,13 +482,13 @@ def fdfind_args(root: Path, follow: bool, ignore: List[str]=[]) -> List[str]:
461
482
  ignore_args = []
462
483
  if ignore:
463
484
  # Add a statement that excludes the folder
464
- ignore_args = [['--exclude', f'{n}'] for n in ignore]
485
+ _ignore_args = [['--exclude', f'{n}'] for n in ignore]
465
486
  # Flatten the list of lists
466
- ignore_args_l = list(itertools.chain(*ignore_args))
487
+ ignore_args = list(itertools.chain(*_ignore_args))
467
488
 
468
489
  return [
469
490
  *extra_fd_args(),
470
- *ignore_args_l,
491
+ *ignore_args,
471
492
  *(['--follow'] if follow else []),
472
493
  '--type', 'f',
473
494
  '.',
@@ -516,17 +537,7 @@ def traverse(root: Path, *, follow: bool=True, ignore: List[str]=[]) -> Iterable
516
537
  def get_system_zone() -> str:
517
538
  try:
518
539
  import tzlocal
519
- # note: tzlocal mypy stubs aren't aware of api change yet (see https://github.com/python/typeshed/issues/6038)
520
- try:
521
- # 4.0 way
522
- return tzlocal.get_localzone_name() # type: ignore[attr-defined]
523
- except AttributeError as e:
524
- # 2.0 way
525
- zone = tzlocal.get_localzone().zone # type: ignore[attr-defined]
526
- # see https://github.com/python/typeshed/blame/968fd6d01d23470e0c8368e7ee7c43f54aaedc0e/stubs/pytz/pytz/tzinfo.pyi#L6
527
- # it says all concrete instances should not be None
528
- assert zone is not None
529
- return zone
540
+ return tzlocal.get_localzone_name()
530
541
  except Exception as e:
531
542
  logger.exception(e)
532
543
  logger.error("Couldn't determine system timezone. Falling back to UTC. Please report this as a bug!")
@@ -540,7 +551,7 @@ def get_system_tz() -> pytz.BaseTzInfo:
540
551
  return pytz.timezone(zone)
541
552
  except Exception as e:
542
553
  logger.exception(e)
543
- logger.error(f"Unknown time zone %s. Falling back to UTC. Please report this as a bug!", zone)
554
+ logger.error("Unknown time zone %s. Falling back to UTC. Please report this as a bug!", zone)
544
555
  return pytz.utc
545
556
 
546
557
  # used in misc/install_server.py
promnesia/compare.py CHANGED
@@ -8,6 +8,7 @@ from typing import Dict, List, Any, NamedTuple, Optional, Iterator, Set, Tuple
8
8
 
9
9
 
10
10
  from .common import DbVisit, Url, PathWithMtime # TODO ugh. figure out pythonpath
11
+ from .database.load import row_to_db_visit
11
12
 
12
13
  # TODO include latest too?
13
14
  # from cconfig import ignore, filtered
@@ -139,10 +140,10 @@ def compare_files(*files: Path, log=True) -> Iterator[Tuple[str, DbVisit]]:
139
140
  this_dts = name[0: name.index('.')] # can't use stem due to multiple extensions..
140
141
 
141
142
  from promnesia.server import _get_stuff # TODO ugh
142
- engine, binder, table = _get_stuff(PathWithMtime.make(f))
143
+ engine, table = _get_stuff(PathWithMtime.make(f))
143
144
 
144
145
  with engine.connect() as conn:
145
- vis = [binder.from_row(row) for row in conn.execute(table.select())] # type: ignore[var-annotated]
146
+ vis = [row_to_db_visit(row) for row in conn.execute(table.select())]
146
147
 
147
148
  if last is not None:
148
149
  between = f'{last_dts}:{this_dts}'
promnesia/config.py CHANGED
@@ -6,7 +6,7 @@ import importlib
6
6
  import importlib.util
7
7
  import warnings
8
8
 
9
- from .common import PathIsh, get_tmpdir, appdirs, default_output_dir, default_cache_dir, user_config_file
9
+ from .common import PathIsh, default_output_dir, default_cache_dir
10
10
  from .common import Res, Source, DbVisit
11
11
 
12
12
 
@@ -69,6 +69,8 @@ class Config(NamedTuple):
69
69
 
70
70
  @property
71
71
  def cache_dir(self) -> Optional[Path]:
72
+ # TODO we used to use this for cachew, but it's best to rely on HPI modules etc to cofigure this
73
+ # keeping just in case for now
72
74
  cd = self.CACHE_DIR
73
75
  cpath: Optional[Path]
74
76
  if cd is None:
@@ -127,7 +129,7 @@ def import_config(config_file: PathIsh) -> Config:
127
129
  spec = importlib.util.spec_from_file_location(name, p); assert spec is not None
128
130
  mod = importlib.util.module_from_spec(spec); assert mod is not None
129
131
  loader = spec.loader; assert loader is not None
130
- loader.exec_module(mod) # type: ignore[attr-defined]
132
+ loader.exec_module(mod)
131
133
 
132
134
  d = {}
133
135
  for f in Config._fields:
@@ -0,0 +1,66 @@
1
+ from datetime import datetime
2
+ from typing import Sequence, Tuple
3
+
4
+ from sqlalchemy import (
5
+ Column,
6
+ Integer,
7
+ Row,
8
+ String,
9
+ )
10
+
11
+ # TODO maybe later move DbVisit here completely?
12
+ # kinda an issue that it's technically an "api" because hook in config can patch up DbVisit
13
+ from ..common import DbVisit, Loc
14
+
15
+
16
+ def get_columns() -> Sequence[Column]:
17
+ # fmt: off
18
+ res: Sequence[Column] = [
19
+ Column('norm_url' , String()),
20
+ Column('orig_url' , String()),
21
+ Column('dt' , String()),
22
+ Column('locator_title', String()),
23
+ Column('locator_href' , String()),
24
+ Column('src' , String()),
25
+ Column('context' , String()),
26
+ Column('duration' , Integer())
27
+ ]
28
+ # fmt: on
29
+ assert len(res) == len(DbVisit._fields) + 1 # +1 because Locator is 'flattened'
30
+ return res
31
+
32
+
33
+ def db_visit_to_row(v: DbVisit) -> Tuple:
34
+ # ugh, very hacky...
35
+ # we want to make sure the resulting tuple only consists of simple types
36
+ # so we can use dbengine directly
37
+ dt_s = v.dt.isoformat()
38
+ row = (
39
+ v.norm_url,
40
+ v.orig_url,
41
+ dt_s,
42
+ v.locator.title,
43
+ v.locator.href,
44
+ v.src,
45
+ v.context,
46
+ v.duration,
47
+ )
48
+ return row
49
+
50
+
51
+ def row_to_db_visit(row: Sequence) -> DbVisit:
52
+ (norm_url, orig_url, dt_s, locator_title, locator_href, src, context, duration) = row
53
+ dt_s = dt_s.split()[0] # backwards compatibility: previously it could be a string separated with tz name
54
+ dt = datetime.fromisoformat(dt_s)
55
+ return DbVisit(
56
+ norm_url=norm_url,
57
+ orig_url=orig_url,
58
+ dt=dt,
59
+ locator=Loc(
60
+ title=locator_title,
61
+ href=locator_href,
62
+ ),
63
+ src=src,
64
+ context=context,
65
+ duration=duration,
66
+ )
@@ -0,0 +1,187 @@
1
+ from pathlib import Path
2
+ import sqlite3
3
+ from typing import Dict, Iterable, List, Optional, Set
4
+
5
+ from more_itertools import chunked
6
+
7
+ from sqlalchemy import (
8
+ Engine,
9
+ MetaData,
10
+ Table,
11
+ create_engine,
12
+ event,
13
+ exc,
14
+ func,
15
+ select,
16
+ )
17
+ from sqlalchemy.dialects import sqlite as dialect_sqlite
18
+
19
+ from ..common import (
20
+ DbVisit,
21
+ Loc,
22
+ Res,
23
+ SourceName,
24
+ get_logger,
25
+ now_tz,
26
+ )
27
+ from .common import get_columns, db_visit_to_row
28
+ from .. import config
29
+
30
+
31
+ # NOTE: I guess the main performance benefit from this is not creating too many tmp lists and avoiding overhead
32
+ # since as far as sql is concerned it should all be in the same transaction. only a guess
33
+ # not sure it's the proper way to handle it
34
+ # see test_index_many
35
+ _CHUNK_BY = 10
36
+
37
+ # I guess 1 hour is definitely enough
38
+ _CONNECTION_TIMEOUT_SECONDS = 3600
39
+
40
+ SRC_ERROR = 'error'
41
+
42
+
43
+ # using WAL keeps database readable while we're writing in it
44
+ # this is tested by test_query_while_indexing
45
+ def enable_wal(dbapi_con, con_record) -> None:
46
+ dbapi_con.execute('PRAGMA journal_mode = WAL')
47
+
48
+
49
+ def begin_immediate_transaction(conn):
50
+ conn.exec_driver_sql('BEGIN IMMEDIATE')
51
+
52
+
53
+ Stats = Dict[Optional[SourceName], int]
54
+
55
+
56
+ # returns critical warnings
57
+ def visits_to_sqlite(
58
+ vit: Iterable[Res[DbVisit]],
59
+ *,
60
+ overwrite_db: bool,
61
+ _db_path: Optional[Path] = None, # only used in tests
62
+ ) -> List[Exception]:
63
+ if _db_path is None:
64
+ db_path = config.get().db
65
+ else:
66
+ db_path = _db_path
67
+
68
+ logger = get_logger()
69
+
70
+ now = now_tz()
71
+
72
+ index_stats: Stats = {}
73
+
74
+ def vit_ok() -> Iterable[DbVisit]:
75
+ for v in vit:
76
+ ev: DbVisit
77
+ if isinstance(v, DbVisit):
78
+ ev = v
79
+ else:
80
+ # conform to the schema and dump. can't hurt anyway
81
+ ev = DbVisit(
82
+ norm_url='<error>',
83
+ orig_url='<error>',
84
+ dt=now,
85
+ locator=Loc.make('<errror>'),
86
+ src=SRC_ERROR,
87
+ # todo attach backtrace?
88
+ context=repr(v),
89
+ )
90
+ index_stats[ev.src] = index_stats.get(ev.src, 0) + 1
91
+ yield ev
92
+
93
+ meta = MetaData()
94
+ table = Table('visits', meta, *get_columns())
95
+
96
+ def query_total_stats(conn) -> Stats:
97
+ query = select(table.c.src, func.count(table.c.src)).select_from(table).group_by(table.c.src)
98
+ return {src: cnt for (src, cnt) in conn.execute(query).all()}
99
+
100
+ def get_engine(*args, **kwargs) -> Engine:
101
+ # kwargs['echo'] = True # useful for debugging
102
+ e = create_engine(*args, **kwargs)
103
+ event.listen(e, 'connect', enable_wal)
104
+ return e
105
+
106
+ ### use readonly database just to get stats
107
+ pengine = get_engine('sqlite://', creator=lambda: sqlite3.connect(f"file:{db_path}?mode=ro", uri=True))
108
+ stats_before: Stats
109
+ try:
110
+ with pengine.begin() as conn:
111
+ stats_before = query_total_stats(conn)
112
+ except exc.OperationalError as oe:
113
+ if oe.code == 'e3q8':
114
+ # db doesn't exist yet
115
+ stats_before = {}
116
+ else:
117
+ raise oe
118
+ pengine.dispose()
119
+ ###
120
+
121
+ # needtimeout, othewise concurrent indexing might not work
122
+ # (note that this also requires WAL mode)
123
+ engine = get_engine(f'sqlite:///{db_path}', connect_args={'timeout': _CONNECTION_TIMEOUT_SECONDS})
124
+
125
+ cleared: Set[str] = set()
126
+
127
+ # by default, sqlalchemy does some sort of BEGIN (implicit) transaction, which doesn't provide proper isolation??
128
+ # see https://docs.sqlalchemy.org/en/20/dialects/sqlite.html#serializable-isolation-savepoints-transactional-ddl
129
+ event.listen(engine, 'begin', begin_immediate_transaction)
130
+ # TODO to allow more concurrent indexing, maybe could instead write to a temporary table?
131
+ # or collect visits first and only then start writing to the db to minimize db access window.. not sure
132
+
133
+ # engine.begin() starts a transaction
134
+ # so everything inside this block will be atomic to the outside observers
135
+ with engine.begin() as conn:
136
+ table.create(conn, checkfirst=True)
137
+
138
+ if overwrite_db:
139
+ conn.execute(table.delete())
140
+
141
+ insert_stmt = table.insert()
142
+ # using raw statement gives a massive speedup for inserting visits
143
+ # see test_benchmark_visits_dumping
144
+ insert_stmt_raw = str(insert_stmt.compile(dialect=dialect_sqlite.dialect(paramstyle='qmark')))
145
+
146
+ for chunk in chunked(vit_ok(), n=_CHUNK_BY):
147
+ srcs = set(v.src or '' for v in chunk)
148
+ new = srcs.difference(cleared)
149
+
150
+ for src in new:
151
+ conn.execute(table.delete().where(table.c.src == src))
152
+ cleared.add(src)
153
+
154
+ bound = [db_visit_to_row(v) for v in chunk]
155
+ conn.exec_driver_sql(insert_stmt_raw, bound)
156
+
157
+ stats_after = query_total_stats(conn)
158
+ engine.dispose()
159
+
160
+ stats_changes = {}
161
+ # map str just in case some srcs are None
162
+ for k in sorted(map(str, {*stats_before.keys(), *stats_after.keys()})):
163
+ diff = stats_after.get(k, 0) - stats_before.get(k, 0)
164
+ if diff == 0:
165
+ continue
166
+ sdiff = ('+' if diff > 0 else '') + str(diff)
167
+ stats_changes[k] = sdiff
168
+
169
+ action = 'overwritten' if overwrite_db else 'updated'
170
+ total_indexed = sum(index_stats.values())
171
+ total_err = index_stats.get(SRC_ERROR, 0)
172
+ total_ok = total_indexed - total_err
173
+ logger.info(f'indexed (current run) : total: {total_indexed}, ok: {total_ok}, errors: {total_err} {index_stats}')
174
+ logger.info(f'database "{db_path}" : {action}')
175
+ logger.info(f'database stats before : {stats_before}')
176
+ logger.info(f'database stats after : {stats_after}')
177
+
178
+ if len(stats_changes) == 0:
179
+ logger.info('database stats changes: no changes')
180
+ else:
181
+ for k, v in stats_changes.items():
182
+ logger.info(f'database stats changes: {k} {v}')
183
+
184
+ res: List[Exception] = []
185
+ if total_ok == 0:
186
+ res.append(RuntimeError('No visits were indexed, something is probably wrong!'))
187
+ return res
@@ -1,32 +1,29 @@
1
1
  from pathlib import Path
2
2
  from typing import Tuple, List
3
3
 
4
- from cachew import NTBinder
5
4
  from sqlalchemy import (
6
5
  create_engine,
7
6
  exc,
7
+ Engine,
8
8
  MetaData,
9
9
  Index,
10
10
  Table,
11
11
  )
12
- from sqlalchemy.engine import Engine
13
12
 
14
- from .common import DbVisit
13
+ from .common import DbVisit, get_columns, row_to_db_visit
15
14
 
16
15
 
17
- DbStuff = Tuple[Engine, NTBinder, Table]
16
+ DbStuff = Tuple[Engine, Table]
18
17
 
19
18
 
20
19
  def get_db_stuff(db_path: Path) -> DbStuff:
21
20
  assert db_path.exists(), db_path
22
21
  # todo how to open read only?
23
22
  # actually not sure if we can since we are creating an index here
24
- engine = create_engine(f'sqlite:///{db_path}') # , echo=True)
25
-
26
- binder = NTBinder.make(DbVisit)
23
+ engine = create_engine(f'sqlite:///{db_path}') # , echo=True)
27
24
 
28
25
  meta = MetaData()
29
- table = Table('visits', meta, *binder.columns)
26
+ table = Table('visits', meta, *get_columns())
30
27
 
31
28
  idx = Index('index_norm_url', table.c.norm_url)
32
29
  try:
@@ -39,13 +36,15 @@ def get_db_stuff(db_path: Path) -> DbStuff:
39
36
  raise e
40
37
 
41
38
  # NOTE: apparently it's ok to open connection on every request? at least my comparisons didn't show anything
42
- return engine, binder, table
39
+ return engine, table
43
40
 
44
41
 
45
42
  def get_all_db_visits(db_path: Path) -> List[DbVisit]:
46
43
  # NOTE: this is pretty inefficient if the DB is huge
47
44
  # mostly intended for tests
48
- engine, binder, table = get_db_stuff(db_path)
45
+ engine, table = get_db_stuff(db_path)
49
46
  query = table.select()
50
47
  with engine.connect() as conn:
51
- return [binder.from_row(row) for row in conn.execute(query)]
48
+ res = [row_to_db_visit(row) for row in conn.execute(query)]
49
+ engine.dispose()
50
+ return res
promnesia/extract.py CHANGED
@@ -28,6 +28,7 @@ DEFAULT_FILTERS = (
28
28
  )
29
29
 
30
30
 
31
+ # TODO maybe move these to configs?
31
32
  @lru_cache(1) #meh, not sure what would happen under tests?
32
33
  def filters() -> Sequence[Filter]:
33
34
  from . import config
promnesia/kjson.py CHANGED
@@ -74,7 +74,7 @@ def test_json_processor():
74
74
  handled = []
75
75
  class Proc(JsonProcessor):
76
76
  def handle_dict(self, value: JDict, path):
77
- if 'skipme' in self.kpath(path):
77
+ if 'skipme' in self.kpath(path): # type: ignore[comparison-overlap]
78
78
  return JsonProcessor.SKIP
79
79
 
80
80
  def handle_str(self, value: str, path):