PyPI - promnesia - Versions diffs - 1.2.20230515__py3-none-any.whl → 1.3.20241021__py3-none-any.whl - Mend

promnesia 1.2.20230515__py3-none-any.whl → 1.3.20241021__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (84) hide show

promnesia/__init__.py +14 -3
promnesia/__main__.py +60 -35
promnesia/cannon.py +27 -27
promnesia/common.py +85 -67
promnesia/compare.py +21 -22
promnesia/compat.py +10 -10
promnesia/config.py +23 -23
promnesia/database/common.py +67 -0
promnesia/database/dump.py +188 -0
promnesia/{read_db.py → database/load.py} +16 -17
promnesia/extract.py +14 -11
promnesia/kjson.py +12 -11
promnesia/logging.py +4 -4
promnesia/misc/__init__.pyi +0 -0
promnesia/misc/config_example.py +1 -2
promnesia/misc/install_server.py +7 -9
promnesia/server.py +57 -47
promnesia/sources/__init__.pyi +0 -0
promnesia/sources/auto.py +50 -35
promnesia/sources/auto_logseq.py +6 -5
promnesia/sources/auto_obsidian.py +2 -2
promnesia/sources/browser.py +14 -9
promnesia/sources/browser_legacy.py +26 -16
promnesia/sources/demo.py +19 -3
promnesia/sources/fbmessenger.py +3 -2
promnesia/sources/filetypes.py +16 -7
promnesia/sources/github.py +7 -9
promnesia/sources/guess.py +2 -1
promnesia/sources/hackernews.py +2 -2
promnesia/sources/hpi.py +2 -2
promnesia/sources/html.py +7 -5
promnesia/sources/hypothesis.py +4 -3
promnesia/sources/instapaper.py +2 -2
promnesia/sources/markdown.py +31 -21
promnesia/sources/org.py +27 -13
promnesia/sources/plaintext.py +30 -29
promnesia/sources/pocket.py +3 -2
promnesia/sources/reddit.py +20 -19
promnesia/sources/roamresearch.py +2 -1
promnesia/sources/rss.py +4 -5
promnesia/sources/shellcmd.py +19 -6
promnesia/sources/signal.py +33 -24
promnesia/sources/smscalls.py +2 -2
promnesia/sources/stackexchange.py +4 -3
promnesia/sources/takeout.py +76 -9
promnesia/sources/takeout_legacy.py +24 -12
promnesia/sources/telegram.py +13 -11
promnesia/sources/telegram_legacy.py +18 -7
promnesia/sources/twitter.py +6 -5
promnesia/sources/vcs.py +5 -3
promnesia/sources/viber.py +10 -9
promnesia/sources/website.py +4 -4
promnesia/sources/zulip.py +3 -2
promnesia/sqlite.py +7 -4
promnesia/tests/__init__.py +0 -0
promnesia/tests/common.py +140 -0
promnesia/tests/server_helper.py +67 -0
promnesia/tests/sources/__init__.py +0 -0
promnesia/tests/sources/test_auto.py +65 -0
promnesia/tests/sources/test_filetypes.py +43 -0
promnesia/tests/sources/test_hypothesis.py +39 -0
promnesia/tests/sources/test_org.py +64 -0
promnesia/tests/sources/test_plaintext.py +25 -0
promnesia/tests/sources/test_shellcmd.py +21 -0
promnesia/tests/sources/test_takeout.py +56 -0
promnesia/tests/test_cannon.py +325 -0
promnesia/tests/test_cli.py +40 -0
promnesia/tests/test_compare.py +30 -0
promnesia/tests/test_config.py +289 -0
promnesia/tests/test_db_dump.py +222 -0
promnesia/tests/test_extract.py +65 -0
promnesia/tests/test_extract_urls.py +43 -0
promnesia/tests/test_indexer.py +251 -0
promnesia/tests/test_server.py +291 -0
promnesia/tests/test_traverse.py +39 -0
promnesia/tests/utils.py +35 -0
{promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/METADATA +15 -18
promnesia-1.3.20241021.dist-info/RECORD +83 -0
{promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/WHEEL +1 -1
{promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/entry_points.txt +0 -1
promnesia/dump.py +0 -105
promnesia-1.2.20230515.dist-info/RECORD +0 -58
{promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/LICENSE +0 -0
{promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/top_level.txt +0 -0

promnesia/__init__.py CHANGED Viewed

@@ -1,6 +1,17 @@
-from pathlib import Path
-from .common import PathIsh, Visit, Source, last, Loc, Results, DbVisit, Context, Res
 # add deprecation warning so eventually this may converted to a namespace package?
 import warnings
+from .common import (  # noqa: F401
+    Context,
+    DbVisit,
+    Loc,
+    PathIsh,
+    Res,
+    Results,
+    Source,
+    Visit,
+    last,
+)
+# TODO think again about it -- what are the pros and cons?
 warnings.warn("DEPRECATED! Please import directly from 'promnesia.common', e.g. 'from promnesia.common import Visit, Source, Results'", DeprecationWarning)

promnesia/__main__.py CHANGED Viewed

@@ -4,24 +4,35 @@ import argparse
 import ast
 import importlib
 import inspect
-from pathlib import Path
+import os
+import shlex
 import shutil
-from subprocess import run, check_call, Popen
 import sys
-from tempfile import TemporaryDirectory
-from typing import Callable, Sequence, Iterable, Iterator, Union
-from . import config
-from . import server
-from .misc import install_server
-from .common import Extractor, PathIsh, logger, get_tmpdir, DbVisit, Res
-from .common import Source, get_system_tz, user_config_file, default_config_path
-from .dump import visits_to_sqlite
+from collections.abc import Iterable, Iterator, Sequence
+from pathlib import Path
+from subprocess import Popen, check_call, run
+from tempfile import TemporaryDirectory, gettempdir
+from typing import Callable
+from . import config, server
+from .common import (
+    DbVisit,
+    Extractor,
+    PathIsh,
+    Res,
+    Source,
+    default_config_path,
+    get_system_tz,
+    get_tmpdir,
+    logger,
+    user_config_file,
+)
+from .database.dump import visits_to_sqlite
 from .extract import extract_visits
+from .misc import install_server
-def iter_all_visits(sources_subset: Iterable[Union[str, int]]=()) -> Iterator[Res[DbVisit]]:
+def iter_all_visits(sources_subset: Iterable[str | int] = ()) -> Iterator[Res[DbVisit]]:
     cfg = config.get()
     output_dir = cfg.output_dir
     # not sure if belongs here??
@@ -73,7 +84,7 @@ def iter_all_visits(sources_subset: Iterable[Union[str, int]]=()) -> Iterator[Re
         logger.warning("unknown --sources: %s", ", ".join(repr(i) for i in sources_subset))
-def _do_index(dry: bool=False, sources_subset: Iterable[Union[str, int]]=(), overwrite_db: bool=False) -> Iterable[Exception]:
+def _do_index(*, dry: bool = False, sources_subset: Iterable[str | int] = (), overwrite_db: bool = False) -> Iterable[Exception]:
     # also keep & return errors for further display
     errors: list[Exception] = []
     def it() -> Iterable[Res[DbVisit]]:
@@ -96,29 +107,32 @@ def _do_index(dry: bool=False, sources_subset: Iterable[Union[str, int]]=(), ove
 def do_index(
-        config_file: Path,
-        dry: bool=False,
-        sources_subset: Iterable[Union[str, int]]=(),
-        overwrite_db: bool=False,
-    ) -> None:
+    config_file: Path,
+    *,
+    dry: bool = False,
+    sources_subset: Iterable[str | int] = (),
+    overwrite_db: bool = False,
+) -> Sequence[Exception]:
     config.load_from(config_file) # meh.. should be cleaner
     try:
         errors = list(_do_index(dry=dry, sources_subset=sources_subset, overwrite_db=overwrite_db))
     finally:
+        # this reset is mainly for tests, so we don't end up reusing the same config by accident
         config.reset()
     if len(errors) > 0:
         logger.error('%d errors, printing them out:', len(errors))
         for e in errors:
             logger.exception(e)
         logger.error('%d errors, exit code 1', len(errors))
-        sys.exit(1)
+    return errors
 def demo_sources() -> dict[str, Callable[[], Extractor]]:
     def lazy(name: str) -> Callable[[], Extractor]:
         # helper to avoid failed imports etc, since people might be lacking necessary dependencies
         def inner() -> Extractor:
-            from . import sources
+            # TODO why this import??
+            from . import sources  # noqa: F401
             module = importlib.import_module(f'promnesia.sources.{name}')
             return getattr(module, 'index')
         return inner
@@ -143,7 +157,7 @@ def do_demo(
         config_file: Path | None,
         dry: bool=False,
         name: str='demo',
-        sources_subset: Iterable[Union[str, int]]=(),
+        sources_subset: Iterable[str | int]=(),
         overwrite_db: bool=False,
     ) -> None:
     with TemporaryDirectory() as tdir:
@@ -216,20 +230,29 @@ def config_check(args: argparse.Namespace) -> None:
 def _config_check(cfg: Path) -> Iterable[Exception]:
     logger.info('config: %s', cfg)
-    def check(cmd: list[str | Path]) -> Iterable[Exception]:
-        logger.debug(' '.join(map(str, cmd)))
-        res = run(cmd)
+    def check(cmd: list[str | Path], **kwargs) -> Iterable[Exception]:
+        logger.debug(shlex.join(map(str, cmd)))
+        res = run(cmd, **kwargs)  # noqa: PLW1510
         if res.returncode > 0:
+            # TODO what's up with empty exception??
             yield Exception()
     logger.info('Checking syntax...')
     cmd: list[str | Path] = [sys.executable, '-m', 'compileall', cfg]
-    yield from check(cmd)
+    yield from check(
+        cmd,
+        env={
+            **os.environ,
+            # if config is on read only partition, the command would fail due to generated bytecode
+            # so put it in the temporary directory instead
+            'PYTHONPYCACHEPREFIX': gettempdir()
+        },
+    )
     # todo not sure if should be more defensive than check_call here
     logger.info('Checking type safety...')
     try:
-        import mypy
+        import mypy  # noqa: F401
     except ImportError:
         logger.warning("mypy not found, can't use it to check config!")
     else:
@@ -281,7 +304,7 @@ def cli_doctor_server(args: argparse.Namespace) -> None:
     logger.info('You should see the database path and version above!')
-def _ordinal_or_name(s: str) -> Union[str, int]:
+def _ordinal_or_name(s: str) -> str | int:
     try:
         s = int(s)  # type: ignore
     except ValueError:
@@ -317,14 +340,14 @@ def main() -> None:
         )
     F = lambda prog: argparse.ArgumentDefaultsHelpFormatter(prog, width=120)
-    p = argparse.ArgumentParser(formatter_class=F) # type: ignore
-    subp = p.add_subparsers(dest='mode', )
+    p = argparse.ArgumentParser(formatter_class=F)
+    subp = p.add_subparsers(dest='mode' )
     ep = subp.add_parser('index', help='Create/update the link database', formatter_class=F)
     add_index_args(ep, default_config_path())
     # TODO use some way to override or provide config only via cmdline?
     ep.add_argument('--intermediate', required=False, help="Used for development, you don't need it")
-    sp = subp.add_parser('serve', help='Serve a link database', formatter_class=F) # type: ignore
+    sp = subp.add_parser('serve', help='Serve a link database', formatter_class=F)
     server.setup_parser(sp)
     ap = subp.add_parser('demo', help='Demo mode: index and serve a directory in single command', formatter_class=F)
@@ -338,7 +361,7 @@ def main() -> None:
     ap.add_argument('--no-serve', action='store_const', const=None, dest='port', help='Pass to only index without running server')
     ap.add_argument(
         '--as',
-        choices=list(sorted(demo_sources().keys())),
+        choices=sorted(demo_sources().keys()),
         default='guess',
         help='Promnesia source to index as (see https://github.com/karlicoss/promnesia/tree/master/src/promnesia/sources for the full list)',
     )
@@ -349,7 +372,7 @@ def main() -> None:
     install_server.setup_parser(isp)
     cp = subp.add_parser('config', help='Config management')
-    cp.set_defaults(func=lambda *args: cp.print_help())
+    cp.set_defaults(func=lambda *_args: cp.print_help())
     scp = cp.add_subparsers()
     ccp = scp.add_parser('check', help='Check config')
     ccp.set_defaults(func=config_check)
@@ -363,7 +386,7 @@ def main() -> None:
     dp = subp.add_parser('doctor', help='Troubleshooting assistant')
     dp.add_argument('--config', type=Path, default=default_config_path(), help='Config path')
-    dp.set_defaults(func=lambda *args: dp.print_help())
+    dp.set_defaults(func=lambda *_args: dp.print_help())
     sdp = dp.add_subparsers()
     sdp.add_parser('config'  , help='Check config'    ).set_defaults(func=config_check )
     sdp.add_parser('database', help='Inspect database').set_defaults(func=cli_doctor_db)
@@ -388,12 +411,14 @@ def main() -> None:
     with get_tmpdir() as tdir: # TODO??
         if mode == 'index':
-            do_index(
+            errors = do_index(
                 config_file=args.config,
                 dry=args.dry,
                 sources_subset=args.sources,
                 overwrite_db=args.overwrite,
             )
+            if len(errors) > 0:
+                sys.exit(1)
         elif mode == 'serve':
             server.run(args)
         elif mode == 'demo':

promnesia/cannon.py CHANGED Viewed

@@ -9,16 +9,17 @@ are same content, but you can't tell that by URL equality. Even canonical urls a
 Also some experiments to establish 'URL hierarchy'.
 """
-# TODO eh?? they fixed mobile.twitter.com?
+from __future__ import annotations
-from itertools import chain
 import re
 import typing
-from typing import Iterable, NamedTuple, Set, Optional, List, Sequence, Union, Tuple, Dict, Any, Collection
 import urllib.parse
-from urllib.parse import urlsplit, parse_qsl, urlunsplit, parse_qs, urlencode, SplitResult
+from collections.abc import Collection, Iterable, Sequence
+# TODO eh?? they fixed mobile.twitter.com?
+from itertools import chain
+from typing import Any, NamedTuple, Union
+from urllib.parse import SplitResult, parse_qsl, urlencode, urlsplit, urlunsplit
 # this has some benchmark, but quite a few librarires seem unmaintained, sadly
 # I guess i'll stick to default for now, until it's a critical bottleneck
@@ -108,11 +109,11 @@ default_qkeep = [
 # TODO perhaps, decide if fragment is meaningful (e.g. wiki) or random sequence of letters?
 class Spec(NamedTuple):
-    qkeep  : Optional[Union[Collection[str], bool]] = None
-    qremove: Optional[Set[str]] = None
+    qkeep  : Collection[str] | bool | None = None
+    qremove: set[str] | None = None
     fkeep  : bool = False
-    def keep_query(self, q: str) -> Optional[int]: # returns order
+    def keep_query(self, q: str) -> int | None: # returns order
         if self.qkeep is True:
             return 1
         qkeep = {
@@ -134,13 +135,13 @@ class Spec(NamedTuple):
         return None
     @classmethod
-    def make(cls, **kwargs) -> 'Spec':
+    def make(cls, **kwargs) -> Spec:
         return cls(**kwargs)
 S = Spec
 # TODO perhaps these can be machine learnt from large set of urls?
-specs: Dict[str, Spec] = {
+specs: dict[str, Spec] = {
     'youtube.com': S(
         # TODO search_query?
         qkeep=[ # note: experimental.. order matters here
@@ -178,7 +179,6 @@ specs: Dict[str, Spec] = {
             'source', 'tsid', 'refsrc', 'pnref', 'rc', '_rdr', 'src', 'hc_location', 'section', 'permPage', 'soft', 'pn_ref', 'action',
             'ti', 'aref', 'event_time_id', 'action_history', 'filter', 'ref_notif_type', 'has_source', 'source_newsfeed_story_type',
-            'ref_notif_type',
         },
     ),
     'physicstravelguide.com': S(fkeep=True), # TODO instead, pass fkeep marker object for shorter spec?
@@ -218,10 +218,10 @@ Spec2 = Any # TODO
 # TODO this should be a map
 Frag = Any
-Parts = Sequence[Tuple[str, str]]
+Parts = Sequence[tuple[str, str]]
-def _yc(domain: str, path: str, qq: Parts, frag: Frag) -> Tuple[Any, Any, Parts, Frag]:
+def _yc(domain: str, path: str, qq: Parts, frag: Frag) -> tuple[Any, Any, Parts, Frag]:
     if path[:5] == '/from':
         site = dict(qq).get('site')
         if site is not None:
@@ -232,7 +232,7 @@ def _yc(domain: str, path: str, qq: Parts, frag: Frag) -> Tuple[Any, Any, Parts,
     # TODO this should be in-place? for brevity?
     return (domain, path, qq, frag)
-def get_spec2(dom: str) -> Optional[Spec2]:
+def get_spec2(dom: str) -> Spec2 | None:
     return {
         'news.ycombinator.com': _yc,
     }.get(dom)
@@ -285,10 +285,10 @@ def transform_split(split: SplitResult):
     REST = r'(?P<rest>.*)'
     Left = Union[str, Sequence[str]]
-    Right = Tuple[str, str, str]
+    Right = tuple[str, str, str]
     # the idea is that we can unify certain URLs here and map them to the 'canonical' one
     # this is a dict only for grouping but should be a list really.. todo
-    rules: Dict[Left, Right] = {
+    rules: dict[Left, Right] = {
         # TODO m. handling might be quite common
         # f'm.youtube.com/{REST}': ('youtube.com', '{rest}'),
         (
@@ -322,9 +322,9 @@ def transform_split(split: SplitResult):
             continue
         gd = m.groupdict()
         if len(to) == 2:
-            to = to + ('', )
+            to = (*to, '')
-        (netloc, path, qq) = [t.format(**gd) for t in to]
+        (netloc, path, qq) = (t.format(**gd) for t in to)
         qparts.extend(parse_qsl(qq, keep_blank_values=True)) # TODO hacky..
         # TODO eh, qparts should really be a map or something...
         break
@@ -361,7 +361,7 @@ def myunsplit(domain: str, path: str, query: str, fragment: str) -> str:
 #     ]
 #     for re in regexes:
-def handle_archive_org(url: str) -> Optional[str]:
+def handle_archive_org(url: str) -> str | None:
     are = r'web.archive.org/web/(?P<timestamp>\d+)/(?P<rest>.*)'
     m = re.fullmatch(are, url)
     if m is None:
@@ -422,7 +422,7 @@ def canonify(url: str) -> str:
     qq = [(k, v) for i, k, v in sorted(iqq)]
     # TODO still not sure what we should do..
     # quote_plus replaces %20 with +, not sure if we want it...
-    query = urlencode(qq, quote_via=quote_via) # type: ignore[type-var]
+    query = urlencode(qq, quote_via=quote_via)
     path = _quote_path(path)
@@ -683,7 +683,7 @@ def domains(it): # pragma: no cover
         try:
             nurl = canonify(url)
         except CanonifyException as e:
-            print(f"ERROR while normalising! {nurl} {e}")
+            print(f"ERROR while normalising! {url} {e}")
             c['ERROR'] += 1
             continue
         else:
@@ -697,8 +697,8 @@ def groups(it, args): # pragma: no cover
     all_pats = get_patterns()
     from collections import Counter
-    c: typing.Counter[Optional[str]] = Counter()
-    unmatched: List[str] = []
+    c: typing.Counter[str | None] = Counter()
+    unmatched: list[str] = []
     def dump():
         print(c)
@@ -718,7 +718,7 @@ def groups(it, args): # pragma: no cover
         try:
             nurl = canonify(url)
         except CanonifyException as e:
-            print(f"ERROR while normalising! {nurl} {e}")
+            print(f"ERROR while normalising! {url} {e}")
             continue
         udom = nurl[:nurl.find('/')]
         usplit = udom.split('.')
@@ -756,10 +756,10 @@ def groups(it, args): # pragma: no cover
 def display(it, args) -> None: # pragma: no cover
     # TODO better name?
     import difflib
-    # pylint: disable=import-error
-    from termcolor import colored as C # type: ignore
     from sys import stdout
+    from termcolor import colored as C  # type: ignore
     for line in it:
         line = line.strip()
         if args.human:
@@ -818,7 +818,7 @@ def main() -> None: # pragma: no cover
 - running comparison
   sqlite3 promnesia.sqlite 'select distinct orig_url from visits where norm_url like "%twitter%" order by orig_url' | src/promnesia/cannon.py
-''', formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=100) # type: ignore
+''', formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=100)
     )
     p.add_argument('input', nargs='?')
     p.add_argument('--human', action='store_true')

promnesia 1.2.20230515__py3-none-any.whl → 1.3.20241021__py3-none-any.whl

promnesia 1.2.20230515__py3-none-any.whl → 1.3.20241021__py3-none-any.whl