PyPI - promnesia - Versions diffs - 1.2.20230515__py3-none-any.whl → 1.2.20240810__py3-none-any.whl - Mend

promnesia 1.2.20230515__py3-none-any.whl → 1.2.20240810__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61) hide show

promnesia/__main__.py +26 -14
promnesia/cannon.py +4 -4
promnesia/common.py +39 -28
promnesia/compare.py +3 -2
promnesia/config.py +4 -2
promnesia/database/common.py +66 -0
promnesia/database/dump.py +187 -0
promnesia/{read_db.py → database/load.py} +10 -11
promnesia/extract.py +1 -0
promnesia/kjson.py +1 -1
promnesia/logging.py +3 -3
promnesia/misc/__init__.pyi +0 -0
promnesia/misc/config_example.py +1 -2
promnesia/misc/install_server.py +2 -3
promnesia/server.py +18 -19
promnesia/sources/__init__.pyi +0 -0
promnesia/sources/auto.py +9 -7
promnesia/sources/browser_legacy.py +11 -5
promnesia/sources/demo.py +18 -2
promnesia/sources/filetypes.py +7 -0
promnesia/sources/github.py +2 -2
promnesia/sources/hypothesis.py +1 -1
promnesia/sources/markdown.py +15 -15
promnesia/sources/org.py +7 -3
promnesia/sources/plaintext.py +3 -1
promnesia/sources/reddit.py +2 -2
promnesia/sources/rss.py +1 -1
promnesia/sources/signal.py +22 -14
promnesia/sources/stackexchange.py +2 -2
promnesia/sources/takeout.py +58 -1
promnesia/sources/takeout_legacy.py +10 -2
promnesia/tests/__init__.py +0 -0
promnesia/tests/common.py +137 -0
promnesia/tests/server_helper.py +64 -0
promnesia/tests/sources/__init__.py +0 -0
promnesia/tests/sources/test_auto.py +66 -0
promnesia/tests/sources/test_filetypes.py +42 -0
promnesia/tests/sources/test_hypothesis.py +39 -0
promnesia/tests/sources/test_org.py +65 -0
promnesia/tests/sources/test_plaintext.py +26 -0
promnesia/tests/sources/test_shellcmd.py +22 -0
promnesia/tests/sources/test_takeout.py +58 -0
promnesia/tests/test_cannon.py +325 -0
promnesia/tests/test_cli.py +42 -0
promnesia/tests/test_compare.py +30 -0
promnesia/tests/test_config.py +290 -0
promnesia/tests/test_db_dump.py +223 -0
promnesia/tests/test_extract.py +61 -0
promnesia/tests/test_extract_urls.py +43 -0
promnesia/tests/test_indexer.py +245 -0
promnesia/tests/test_server.py +292 -0
promnesia/tests/test_traverse.py +41 -0
promnesia/tests/utils.py +35 -0
{promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/METADATA +13 -17
promnesia-1.2.20240810.dist-info/RECORD +83 -0
{promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/WHEEL +1 -1
{promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/entry_points.txt +0 -1
promnesia/dump.py +0 -105
promnesia-1.2.20230515.dist-info/RECORD +0 -58
{promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/LICENSE +0 -0
{promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/top_level.txt +0 -0

promnesia/__main__.py CHANGED Viewed

@@ -4,11 +4,12 @@ import argparse
 import ast
 import importlib
 import inspect
+import os
 from pathlib import Path
 import shutil
 from subprocess import run, check_call, Popen
 import sys
-from tempfile import TemporaryDirectory
+from tempfile import TemporaryDirectory, gettempdir
 from typing import Callable, Sequence, Iterable, Iterator, Union
@@ -17,7 +18,7 @@ from . import server
 from .misc import install_server
 from .common import Extractor, PathIsh, logger, get_tmpdir, DbVisit, Res
 from .common import Source, get_system_tz, user_config_file, default_config_path
-from .dump import visits_to_sqlite
+from .database.dump import visits_to_sqlite
 from .extract import extract_visits
@@ -96,22 +97,23 @@ def _do_index(dry: bool=False, sources_subset: Iterable[Union[str, int]]=(), ove
 def do_index(
-        config_file: Path,
-        dry: bool=False,
-        sources_subset: Iterable[Union[str, int]]=(),
-        overwrite_db: bool=False,
-    ) -> None:
+    config_file: Path,
+    dry: bool=False,
+    sources_subset: Iterable[Union[str, int]]=(),
+    overwrite_db: bool=False,
+) -> Sequence[Exception]:
     config.load_from(config_file) # meh.. should be cleaner
     try:
         errors = list(_do_index(dry=dry, sources_subset=sources_subset, overwrite_db=overwrite_db))
     finally:
+        # this reset is mainly for tests, so we don't end up reusing the same config by accident
         config.reset()
     if len(errors) > 0:
         logger.error('%d errors, printing them out:', len(errors))
         for e in errors:
             logger.exception(e)
         logger.error('%d errors, exit code 1', len(errors))
-        sys.exit(1)
+    return errors
 def demo_sources() -> dict[str, Callable[[], Extractor]]:
@@ -216,15 +218,23 @@ def config_check(args: argparse.Namespace) -> None:
 def _config_check(cfg: Path) -> Iterable[Exception]:
     logger.info('config: %s', cfg)
-    def check(cmd: list[str | Path]) -> Iterable[Exception]:
+    def check(cmd: list[str | Path], **kwargs) -> Iterable[Exception]:
         logger.debug(' '.join(map(str, cmd)))
-        res = run(cmd)
+        res = run(cmd, **kwargs)
         if res.returncode > 0:
             yield Exception()
     logger.info('Checking syntax...')
     cmd: list[str | Path] = [sys.executable, '-m', 'compileall', cfg]
-    yield from check(cmd)
+    yield from check(
+        cmd,
+        env={
+            **os.environ,
+            # if config is on read only partition, the command would fail due to generated bytecode
+            # so put it in the temporary directory instead
+            'PYTHONPYCACHEPREFIX': gettempdir()
+        },
+    )
     # todo not sure if should be more defensive than check_call here
     logger.info('Checking type safety...')
@@ -317,14 +327,14 @@ def main() -> None:
         )
     F = lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, width=120)
-    p = argparse.ArgumentParser(formatter_class=F) # type: ignore
+    p = argparse.ArgumentParser(formatter_class=F)
     subp = p.add_subparsers(dest='mode', )
     ep = subp.add_parser('index', help='Create/update the link database', formatter_class=F)
     add_index_args(ep, default_config_path())
     # TODO use some way to override or provide config only via cmdline?
     ep.add_argument('--intermediate', required=False, help="Used for development, you don't need it")
-    sp = subp.add_parser('serve', help='Serve a link database', formatter_class=F) # type: ignore
+    sp = subp.add_parser('serve', help='Serve a link database', formatter_class=F)
     server.setup_parser(sp)
     ap = subp.add_parser('demo', help='Demo mode: index and serve a directory in single command', formatter_class=F)
@@ -388,12 +398,14 @@ def main() -> None:
     with get_tmpdir() as tdir: # TODO??
         if mode == 'index':
-            do_index(
+            errors = do_index(
                 config_file=args.config,
                 dry=args.dry,
                 sources_subset=args.sources,
                 overwrite_db=args.overwrite,
             )
+            if len(errors) > 0:
+                sys.exit(1)
         elif mode == 'serve':
             server.run(args)
         elif mode == 'demo':

promnesia/cannon.py CHANGED Viewed

@@ -422,7 +422,7 @@ def canonify(url: str) -> str:
     qq = [(k, v) for i, k, v in sorted(iqq)]
     # TODO still not sure what we should do..
     # quote_plus replaces %20 with +, not sure if we want it...
-    query = urlencode(qq, quote_via=quote_via) # type: ignore[type-var]
+    query = urlencode(qq, quote_via=quote_via)
     path = _quote_path(path)
@@ -683,7 +683,7 @@ def domains(it): # pragma: no cover
         try:
             nurl = canonify(url)
         except CanonifyException as e:
-            print(f"ERROR while normalising! {nurl} {e}")
+            print(f"ERROR while normalising! {url} {e}")
             c['ERROR'] += 1
             continue
         else:
@@ -718,7 +718,7 @@ def groups(it, args): # pragma: no cover
         try:
             nurl = canonify(url)
         except CanonifyException as e:
-            print(f"ERROR while normalising! {nurl} {e}")
+            print(f"ERROR while normalising! {url} {e}")
             continue
         udom = nurl[:nurl.find('/')]
         usplit = udom.split('.')
@@ -818,7 +818,7 @@ def main() -> None: # pragma: no cover
 - running comparison
   sqlite3 promnesia.sqlite 'select distinct orig_url from visits where norm_url like "%twitter%" order by orig_url' | src/promnesia/cannon.py
-''', formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=100) # type: ignore
+''', formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=100)
     )
     p.add_argument('input', nargs='?')
     p.add_argument('--human', action='store_true')

promnesia/common.py CHANGED Viewed

@@ -19,6 +19,7 @@ from more_itertools import intersperse
 import pytz
 from .cannon import canonify
+from .compat import removeprefix
 _is_windows = os.name == 'nt'
@@ -76,13 +77,26 @@ class Loc(NamedTuple):
     # but generally, it will be
     # (url|file)(linenumber|json_path|anchor)
+@lru_cache(None)
+def warn_once(message: str) -> None:
+    # you'd think that warnings module already logs warnings only once per line..
+    # but sadly it's not the case
+    # see https://github.com/karlicoss/python_duplicate_warnings_investigation/blob/master/test.py
+    warnings.warn(message, stacklevel=2)
+def _warn_no_xdg_mime() -> None:
+    warn_once("No xdg-mime on your OS! If you're on OSX, perhaps you can help me! https://github.com/karlicoss/open-in-editor/issues/1")
 @lru_cache(1)
 def _detect_mime_handler() -> str:
     def exists(what: str) -> bool:
         try:
             r = run(f'xdg-mime query default x-scheme-handler/{what}'.split(), stdout=PIPE)
-        except FileNotFoundError:
-            warnings.warn("No xdg-mime on your OS! If you're on OSX, perhaps you can help me! https://github.com/karlicoss/open-in-editor/issues/1")
+        except (FileNotFoundError, NotADirectoryError):  # ugh seems that osx might throw NotADirectory for some reason
+            _warn_no_xdg_mime()
             return False
         if r.returncode > 0:
             warnings.warn('xdg-mime failed') # hopefully rest is in stderr
@@ -102,6 +116,7 @@ def _detect_mime_handler() -> str:
         result = 'emacs:'
     # 2. now try to use newer editor:// thing
+    # TODO flip order here? should rely on editor:// first?
     # TODO would be nice to collect warnings and display at the end
     if not exists('editor'):
@@ -285,9 +300,10 @@ def _guess_name(thing: PreSource) -> str:
         guess = thing.__module__
     dflt = 'promnesia.sources.'
-    if guess.startswith(dflt):
-        # meh
-        guess = guess[len(dflt):]
+    guess = removeprefix(guess, prefix=dflt)
+    if guess == 'config':
+        # this happens when we define a lambda in config or something without properly wrapping in Source
+        logger.warning(f'Inferred source name "config" for {thing}. This might be misleading TODO')
     return guess
@@ -297,7 +313,7 @@ def _get_index_function(sourceish: PreSource) -> PreExtractor:
     if hasattr(sourceish, 'index'):  # must be a module
         res = getattr(sourceish, 'index')
     else:
-        res = sourceish  # type: ignore[assignment]
+        res = sourceish
     return res
@@ -317,12 +333,17 @@ class Source:
         self.extractor: Extractor = lambda: self.ff(*self.args, **self.kwargs)
         if src is not None:
             warnings.warn("'src' argument is deprecated, please use 'name' instead", DeprecationWarning)
-        try:
-            name_guess = _guess_name(ff)
-        except:
-            # todo warn?
-            name_guess = ''
-        self.name = name or src or name_guess
+        if name != '':
+            self.name = name
+        elif src != '':
+            self.name = src
+        else:
+            try:
+                name_guess = _guess_name(ff)
+            except:
+                # todo warn?
+                name_guess = ''
+            self.name = name_guess
     @property
     def description(self) -> str:
@@ -371,7 +392,7 @@ def appdirs():
     under_test = os.environ.get('PYTEST_CURRENT_TEST') is not None
     # todo actually use test name?
     name = 'promnesia-test' if under_test else 'promnesia'
-    import appdirs as ad # type: ignore[import]
+    import appdirs as ad # type: ignore[import-untyped]
     return ad.AppDirs(appname=name)
@@ -461,13 +482,13 @@ def fdfind_args(root: Path, follow: bool, ignore: List[str]=[]) -> List[str]:
     ignore_args = []
     if ignore:
         # Add a statement that excludes the folder
-        ignore_args = [['--exclude', f'{n}'] for n in ignore]
+        _ignore_args = [['--exclude', f'{n}'] for n in ignore]
         # Flatten the list of lists
-        ignore_args_l = list(itertools.chain(*ignore_args))
+        ignore_args = list(itertools.chain(*_ignore_args))
     return [
         *extra_fd_args(),
-        *ignore_args_l,
+        *ignore_args,
         *(['--follow'] if follow else []),
         '--type', 'f',
         '.',
@@ -516,17 +537,7 @@ def traverse(root: Path, *, follow: bool=True, ignore: List[str]=[]) -> Iterable
 def get_system_zone() -> str:
     try:
         import tzlocal
-        # note: tzlocal mypy stubs aren't aware of api change yet (see https://github.com/python/typeshed/issues/6038)
-        try:
-            # 4.0 way
-            return tzlocal.get_localzone_name() # type: ignore[attr-defined]
-        except AttributeError as e:
-            # 2.0 way
-            zone = tzlocal.get_localzone().zone  # type: ignore[attr-defined]
-            # see https://github.com/python/typeshed/blame/968fd6d01d23470e0c8368e7ee7c43f54aaedc0e/stubs/pytz/pytz/tzinfo.pyi#L6
-            # it says all concrete instances should not be None
-            assert zone is not None
-            return zone
+        return tzlocal.get_localzone_name()
     except Exception as e:
         logger.exception(e)
         logger.error("Couldn't determine system timezone. Falling back to UTC. Please report this as a bug!")
@@ -540,7 +551,7 @@ def get_system_tz() -> pytz.BaseTzInfo:
         return pytz.timezone(zone)
     except Exception as e:
         logger.exception(e)
-        logger.error(f"Unknown time zone %s. Falling back to UTC. Please report this as a bug!", zone)
+        logger.error("Unknown time zone %s. Falling back to UTC. Please report this as a bug!", zone)
         return pytz.utc
 # used in misc/install_server.py

promnesia/compare.py CHANGED Viewed

@@ -8,6 +8,7 @@ from typing import Dict, List, Any, NamedTuple, Optional, Iterator, Set, Tuple
 from .common import DbVisit, Url, PathWithMtime # TODO ugh. figure out pythonpath
+from .database.load import row_to_db_visit
 # TODO include latest too?
 # from cconfig import ignore, filtered
@@ -139,10 +140,10 @@ def compare_files(*files: Path, log=True) -> Iterator[Tuple[str, DbVisit]]:
         this_dts = name[0: name.index('.')] # can't use stem due to multiple extensions..
         from promnesia.server import _get_stuff # TODO ugh
-        engine, binder, table = _get_stuff(PathWithMtime.make(f))
+        engine, table = _get_stuff(PathWithMtime.make(f))
         with engine.connect() as conn:
-            vis = [binder.from_row(row) for row in conn.execute(table.select())]  # type: ignore[var-annotated]
+            vis = [row_to_db_visit(row) for row in conn.execute(table.select())]
         if last is not None:
             between = f'{last_dts}:{this_dts}'

promnesia/config.py CHANGED Viewed

@@ -6,7 +6,7 @@ import importlib
 import importlib.util
 import warnings
-from .common import PathIsh, get_tmpdir, appdirs, default_output_dir, default_cache_dir, user_config_file
+from .common import PathIsh, default_output_dir, default_cache_dir
 from .common import Res, Source, DbVisit
@@ -69,6 +69,8 @@ class Config(NamedTuple):
     @property
     def cache_dir(self) -> Optional[Path]:
+        # TODO we used to use this for cachew, but it's best to rely on HPI modules etc to cofigure this
+        # keeping just in case for now
         cd = self.CACHE_DIR
         cpath: Optional[Path]
         if cd is None:
@@ -127,7 +129,7 @@ def import_config(config_file: PathIsh) -> Config:
     spec = importlib.util.spec_from_file_location(name, p); assert spec is not None
     mod = importlib.util.module_from_spec(spec); assert mod is not None
     loader = spec.loader; assert loader is not None
-    loader.exec_module(mod) # type: ignore[attr-defined]
+    loader.exec_module(mod)
     d = {}
     for f in Config._fields:

promnesia/database/common.py ADDED Viewed

@@ -0,0 +1,66 @@
+from datetime import datetime
+from typing import Sequence, Tuple
+from sqlalchemy import (
+    Column,
+    Integer,
+    Row,
+    String,
+)
+# TODO maybe later move DbVisit here completely?
+# kinda an issue that it's technically an "api" because hook in config can patch up DbVisit
+from ..common import DbVisit, Loc
+def get_columns() -> Sequence[Column]:
+    # fmt: off
+    res: Sequence[Column] = [
+        Column('norm_url'     , String()),
+        Column('orig_url'     , String()),
+        Column('dt'           , String()),
+        Column('locator_title', String()),
+        Column('locator_href' , String()),
+        Column('src'          , String()),
+        Column('context'      , String()),
+        Column('duration'     , Integer())
+    ]
+    # fmt: on
+    assert len(res) == len(DbVisit._fields) + 1  # +1 because Locator is 'flattened'
+    return res
+def db_visit_to_row(v: DbVisit) -> Tuple:
+    # ugh, very hacky...
+    # we want to make sure the resulting tuple only consists of simple types
+    # so we can use dbengine directly
+    dt_s = v.dt.isoformat()
+    row = (
+        v.norm_url,
+        v.orig_url,
+        dt_s,
+        v.locator.title,
+        v.locator.href,
+        v.src,
+        v.context,
+        v.duration,
+    )
+    return row
+def row_to_db_visit(row: Sequence) -> DbVisit:
+    (norm_url, orig_url, dt_s, locator_title, locator_href, src, context, duration) = row
+    dt_s = dt_s.split()[0]  # backwards compatibility: previously it could be a string separated with tz name
+    dt = datetime.fromisoformat(dt_s)
+    return DbVisit(
+        norm_url=norm_url,
+        orig_url=orig_url,
+        dt=dt,
+        locator=Loc(
+            title=locator_title,
+            href=locator_href,
+        ),
+        src=src,
+        context=context,
+        duration=duration,
+    )

promnesia/database/dump.py ADDED Viewed

@@ -0,0 +1,187 @@
+from pathlib import Path
+import sqlite3
+from typing import Dict, Iterable, List, Optional, Set
+from more_itertools import chunked
+from sqlalchemy import (
+    Engine,
+    MetaData,
+    Table,
+    create_engine,
+    event,
+    exc,
+    func,
+    select,
+)
+from sqlalchemy.dialects import sqlite as dialect_sqlite
+from ..common import (
+    DbVisit,
+    Loc,
+    Res,
+    SourceName,
+    get_logger,
+    now_tz,
+)
+from .common import get_columns, db_visit_to_row
+from .. import config
+# NOTE: I guess the main performance benefit from this is not creating too many tmp lists and avoiding overhead
+# since as far as sql is concerned it should all be in the same transaction. only a guess
+# not sure it's the proper way to handle it
+# see test_index_many
+_CHUNK_BY = 10
+# I guess 1 hour is definitely enough
+_CONNECTION_TIMEOUT_SECONDS = 3600
+SRC_ERROR = 'error'
+# using WAL keeps database readable while we're writing in it
+# this is tested by test_query_while_indexing
+def enable_wal(dbapi_con, con_record) -> None:
+    dbapi_con.execute('PRAGMA journal_mode = WAL')
+def begin_immediate_transaction(conn):
+    conn.exec_driver_sql('BEGIN IMMEDIATE')
+Stats = Dict[Optional[SourceName], int]
+# returns critical warnings
+def visits_to_sqlite(
+    vit: Iterable[Res[DbVisit]],
+    *,
+    overwrite_db: bool,
+    _db_path: Optional[Path] = None,  # only used in tests
+) -> List[Exception]:
+    if _db_path is None:
+        db_path = config.get().db
+    else:
+        db_path = _db_path
+    logger = get_logger()
+    now = now_tz()
+    index_stats: Stats = {}
+    def vit_ok() -> Iterable[DbVisit]:
+        for v in vit:
+            ev: DbVisit
+            if isinstance(v, DbVisit):
+                ev = v
+            else:
+                # conform to the schema and dump. can't hurt anyway
+                ev = DbVisit(
+                    norm_url='<error>',
+                    orig_url='<error>',
+                    dt=now,
+                    locator=Loc.make('<errror>'),
+                    src=SRC_ERROR,
+                    # todo attach backtrace?
+                    context=repr(v),
+                )
+            index_stats[ev.src] = index_stats.get(ev.src, 0) + 1
+            yield ev
+    meta = MetaData()
+    table = Table('visits', meta, *get_columns())
+    def query_total_stats(conn) -> Stats:
+        query = select(table.c.src, func.count(table.c.src)).select_from(table).group_by(table.c.src)
+        return {src: cnt for (src, cnt) in conn.execute(query).all()}
+    def get_engine(*args, **kwargs) -> Engine:
+        # kwargs['echo'] = True  # useful for debugging
+        e = create_engine(*args, **kwargs)
+        event.listen(e, 'connect', enable_wal)
+        return e
+    ### use readonly database just to get stats
+    pengine = get_engine('sqlite://', creator=lambda: sqlite3.connect(f"file:{db_path}?mode=ro", uri=True))
+    stats_before: Stats
+    try:
+        with pengine.begin() as conn:
+            stats_before = query_total_stats(conn)
+    except exc.OperationalError as oe:
+        if oe.code == 'e3q8':
+            # db doesn't exist yet
+            stats_before = {}
+        else:
+            raise oe
+    pengine.dispose()
+    ###
+    # needtimeout, othewise concurrent indexing might not work
+    # (note that this also requires WAL mode)
+    engine = get_engine(f'sqlite:///{db_path}', connect_args={'timeout': _CONNECTION_TIMEOUT_SECONDS})
+    cleared: Set[str] = set()
+    # by default, sqlalchemy does some sort of BEGIN (implicit) transaction, which doesn't provide proper isolation??
+    # see https://docs.sqlalchemy.org/en/20/dialects/sqlite.html#serializable-isolation-savepoints-transactional-ddl
+    event.listen(engine, 'begin', begin_immediate_transaction)
+    # TODO to allow more concurrent indexing, maybe could instead write to a temporary table?
+    # or collect visits first and only then start writing to the db to minimize db access window.. not sure
+    # engine.begin() starts a transaction
+    # so everything inside this block will be atomic to the outside observers
+    with engine.begin() as conn:
+        table.create(conn, checkfirst=True)
+        if overwrite_db:
+            conn.execute(table.delete())
+        insert_stmt = table.insert()
+        # using raw statement gives a massive speedup for inserting visits
+        # see test_benchmark_visits_dumping
+        insert_stmt_raw = str(insert_stmt.compile(dialect=dialect_sqlite.dialect(paramstyle='qmark')))
+        for chunk in chunked(vit_ok(), n=_CHUNK_BY):
+            srcs = set(v.src or '' for v in chunk)
+            new = srcs.difference(cleared)
+            for src in new:
+                conn.execute(table.delete().where(table.c.src == src))
+                cleared.add(src)
+            bound = [db_visit_to_row(v) for v in chunk]
+            conn.exec_driver_sql(insert_stmt_raw, bound)
+        stats_after = query_total_stats(conn)
+    engine.dispose()
+    stats_changes = {}
+    # map str just in case some srcs are None
+    for k in sorted(map(str, {*stats_before.keys(), *stats_after.keys()})):
+        diff = stats_after.get(k, 0) - stats_before.get(k, 0)
+        if diff == 0:
+            continue
+        sdiff = ('+' if diff > 0 else '') + str(diff)
+        stats_changes[k] = sdiff
+    action = 'overwritten' if overwrite_db else 'updated'
+    total_indexed = sum(index_stats.values())
+    total_err = index_stats.get(SRC_ERROR, 0)
+    total_ok = total_indexed - total_err
+    logger.info(f'indexed (current run) : total: {total_indexed}, ok: {total_ok}, errors: {total_err} {index_stats}')
+    logger.info(f'database "{db_path}" : {action}')
+    logger.info(f'database stats before : {stats_before}')
+    logger.info(f'database stats after  : {stats_after}')
+    if len(stats_changes) == 0:
+        logger.info('database stats changes: no changes')
+    else:
+        for k, v in stats_changes.items():
+            logger.info(f'database stats changes: {k} {v}')
+    res: List[Exception] = []
+    if total_ok == 0:
+        res.append(RuntimeError('No visits were indexed, something is probably wrong!'))
+    return res

promnesia/{read_db.py → database/load.py} RENAMED Viewed

@@ -1,32 +1,29 @@
 from pathlib import Path
 from typing import Tuple, List
-from cachew import NTBinder
 from sqlalchemy import (
     create_engine,
     exc,
+    Engine,
     MetaData,
     Index,
     Table,
 )
-from sqlalchemy.engine import Engine
-from .common import DbVisit
+from .common import DbVisit, get_columns, row_to_db_visit
-DbStuff = Tuple[Engine, NTBinder, Table]
+DbStuff = Tuple[Engine, Table]
 def get_db_stuff(db_path: Path) -> DbStuff:
     assert db_path.exists(), db_path
     # todo how to open read only?
     # actually not sure if we can since we are creating an index here
-    engine = create_engine(f'sqlite:///{db_path}') # , echo=True)
-    binder = NTBinder.make(DbVisit)
+    engine = create_engine(f'sqlite:///{db_path}')  # , echo=True)
     meta = MetaData()
-    table = Table('visits', meta, *binder.columns)
+    table = Table('visits', meta, *get_columns())
     idx = Index('index_norm_url', table.c.norm_url)
     try:
@@ -39,13 +36,15 @@ def get_db_stuff(db_path: Path) -> DbStuff:
             raise e
     # NOTE: apparently it's ok to open connection on every request? at least my comparisons didn't show anything
-    return engine, binder, table
+    return engine, table
 def get_all_db_visits(db_path: Path) -> List[DbVisit]:
     # NOTE: this is pretty inefficient if the DB is huge
     # mostly intended for tests
-    engine, binder, table = get_db_stuff(db_path)
+    engine, table = get_db_stuff(db_path)
     query = table.select()
     with engine.connect() as conn:
-        return [binder.from_row(row) for row in conn.execute(query)]
+        res = [row_to_db_visit(row) for row in conn.execute(query)]
+    engine.dispose()
+    return res

promnesia/extract.py CHANGED Viewed

@@ -28,6 +28,7 @@ DEFAULT_FILTERS = (
 )
+# TODO maybe move these to configs?
 @lru_cache(1) #meh, not sure what would happen under tests?
 def filters() -> Sequence[Filter]:
     from . import config

promnesia/kjson.py CHANGED Viewed

@@ -74,7 +74,7 @@ def test_json_processor():
     handled = []
     class Proc(JsonProcessor):
         def handle_dict(self, value: JDict, path):
-            if 'skipme' in self.kpath(path):
+            if 'skipme' in self.kpath(path):  # type: ignore[comparison-overlap]
                 return JsonProcessor.SKIP
         def handle_str(self, value: str, path):

promnesia 1.2.20230515__py3-none-any.whl → 1.2.20240810__py3-none-any.whl

promnesia 1.2.20230515__py3-none-any.whl → 1.2.20240810__py3-none-any.whl