promnesia-1.2.20240810-py3-none-any.whl → promnesia-1.4.20250909-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promnesia/__init__.py +18 -4
- promnesia/__main__.py +104 -78
- promnesia/cannon.py +108 -107
- promnesia/common.py +107 -88
- promnesia/compare.py +33 -30
- promnesia/compat.py +10 -10
- promnesia/config.py +37 -34
- promnesia/database/common.py +4 -3
- promnesia/database/dump.py +13 -13
- promnesia/database/load.py +7 -7
- promnesia/extract.py +19 -17
- promnesia/logging.py +27 -15
- promnesia/misc/install_server.py +32 -27
- promnesia/server.py +106 -79
- promnesia/sources/auto.py +104 -77
- promnesia/sources/auto_logseq.py +6 -5
- promnesia/sources/auto_obsidian.py +2 -2
- promnesia/sources/browser.py +20 -10
- promnesia/sources/browser_legacy.py +65 -50
- promnesia/sources/demo.py +7 -8
- promnesia/sources/fbmessenger.py +3 -3
- promnesia/sources/filetypes.py +22 -16
- promnesia/sources/github.py +9 -8
- promnesia/sources/guess.py +6 -2
- promnesia/sources/hackernews.py +7 -9
- promnesia/sources/hpi.py +5 -3
- promnesia/sources/html.py +11 -7
- promnesia/sources/hypothesis.py +3 -2
- promnesia/sources/instapaper.py +3 -2
- promnesia/sources/markdown.py +22 -12
- promnesia/sources/org.py +36 -17
- promnesia/sources/plaintext.py +41 -39
- promnesia/sources/pocket.py +5 -3
- promnesia/sources/reddit.py +24 -26
- promnesia/sources/roamresearch.py +5 -2
- promnesia/sources/rss.py +6 -8
- promnesia/sources/shellcmd.py +21 -11
- promnesia/sources/signal.py +27 -26
- promnesia/sources/smscalls.py +2 -3
- promnesia/sources/stackexchange.py +5 -4
- promnesia/sources/takeout.py +37 -34
- promnesia/sources/takeout_legacy.py +29 -19
- promnesia/sources/telegram.py +18 -12
- promnesia/sources/telegram_legacy.py +22 -11
- promnesia/sources/twitter.py +7 -6
- promnesia/sources/vcs.py +11 -6
- promnesia/sources/viber.py +11 -10
- promnesia/sources/website.py +8 -7
- promnesia/sources/zulip.py +3 -2
- promnesia/sqlite.py +13 -7
- promnesia/tests/common.py +10 -5
- promnesia/tests/server_helper.py +13 -10
- promnesia/tests/sources/test_auto.py +2 -3
- promnesia/tests/sources/test_filetypes.py +11 -8
- promnesia/tests/sources/test_hypothesis.py +10 -6
- promnesia/tests/sources/test_org.py +9 -5
- promnesia/tests/sources/test_plaintext.py +9 -8
- promnesia/tests/sources/test_shellcmd.py +13 -13
- promnesia/tests/sources/test_takeout.py +3 -5
- promnesia/tests/test_cannon.py +256 -239
- promnesia/tests/test_cli.py +12 -8
- promnesia/tests/test_compare.py +17 -13
- promnesia/tests/test_config.py +7 -8
- promnesia/tests/test_db_dump.py +15 -15
- promnesia/tests/test_extract.py +17 -10
- promnesia/tests/test_indexer.py +24 -18
- promnesia/tests/test_server.py +12 -13
- promnesia/tests/test_traverse.py +0 -2
- promnesia/tests/utils.py +3 -7
- promnesia-1.4.20250909.dist-info/METADATA +66 -0
- promnesia-1.4.20250909.dist-info/RECORD +80 -0
- {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info}/WHEEL +1 -2
- promnesia/kjson.py +0 -121
- promnesia/sources/__init__.pyi +0 -0
- promnesia-1.2.20240810.dist-info/METADATA +0 -54
- promnesia-1.2.20240810.dist-info/RECORD +0 -83
- promnesia-1.2.20240810.dist-info/top_level.txt +0 -1
- {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info}/entry_points.txt +0 -0
- {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info/licenses}/LICENSE +0 -0
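
Below is the full diff for promnesia/cannon.py, one of the largest changes outside the test suite. Most of it is mechanical modernisation: from __future__ import annotations plus builtin generics (dict[str, Spec], str | None) replace the typing.Dict/Optional spellings, hand-aligned literals gain "# fmt: skip" markers so the formatter leaves them alone, and the hand-rolled try_cutl/try_cutr helpers are dropped in favour of the str.removeprefix/str.removesuffix builtins available since Python 3.9. A minimal sketch of that last swap (reconstructed from the removed lines below, not code shipped in either wheel):

    # the removed helpers, condensed; same semantics as in the old cannon.py
    def try_cutl(prefix: str, s: str) -> str:
        return s[len(prefix):] if s.startswith(prefix) else s

    def try_cutr(suffix: str, s: str) -> str:
        return s[:-len(suffix)] if s.endswith(suffix) else s

    # the 3.9+ builtins behave the same way, including the no-op case
    assert 'www.example.com'.removeprefix('www.') == try_cutl('www.', 'www.example.com')
    assert 'example.com/'.removesuffix('/') == try_cutr('/', 'example.com/')
    assert 'example.com'.removeprefix('www.') == 'example.com'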
promnesia/cannon.py
CHANGED
@@ -9,16 +9,18 @@ are same content, but you can't tell that by URL equality. Even canonical urls a
 
 Also some experiments to establish 'URL hierarchy'.
 """
-# TODO eh?? they fixed mobile.twitter.com?
 
-from itertools import chain
+from __future__ import annotations
+
 import re
 import typing
-from typing import Iterable, NamedTuple, Set, Optional, List, Sequence, Union, Tuple, Dict, Any, Collection
-
 import urllib.parse
-from urllib.parse import urlsplit, parse_qsl, urlunsplit, parse_qs, urlencode, SplitResult
+from collections.abc import Collection, Iterable, Sequence
 
+# TODO eh?? they fixed mobile.twitter.com?
+from itertools import chain
+from typing import Any, NamedTuple
+from urllib.parse import SplitResult, parse_qsl, urlencode, urlsplit, urlunsplit
 
 # this has some benchmark, but quite a few librarires seem unmaintained, sadly
 # I guess i'll stick to default for now, until it's a critical bottleneck
@@ -27,17 +29,6 @@ from urllib.parse import urlsplit, parse_qsl, urlunsplit, parse_qs, urlencode, S
 
 # TODO perhaps archive.org contributes to both?
 
-def try_cutl(prefix: str, s: str) -> str:
-    if s.startswith(prefix):
-        return s[len(prefix):]
-    else:
-        return s
-
-def try_cutr(suffix: str, s: str) -> str:
-    if s.endswith(suffix):
-        return s[:-len(suffix)]
-    else:
-        return s
 
 # TODO move this to site-specific normalisers?
 dom_subst = [
@@ -58,22 +49,22 @@ dom_subst = [
     # app.getpocket.com is the canonical domain in the JSON returned by
     # https://github.com/karlicoss/pockexport, so let's canonicalize to that.
    ('getpocket.' , 'app.getpocket.'),
-]
+]  # fmt: skip
+
 
 def canonify_domain(dom: str) -> str:
     # TODO perhaps not necessary now that I'm checking suffixes??
     for st in ('www.', 'amp.'):
-        dom = try_cutl(st, dom)
+        dom = dom.removeprefix(st)
 
     for start, repl in dom_subst:
         if dom.startswith(start):
-            dom = repl + dom[len(start):]
+            dom = repl + dom[len(start) :]
             break
 
     return dom
 
 
-
 default_qremove = {
     'utm_source',
     'utm_campaign',
@@ -92,7 +83,7 @@ default_qremove = {
 
     # e.g. on github
     'utf8',
-}
+}  # fmt: skip
 
 default_qkeep = [
     # ok, various BBS have it, hackernews has it etc?
@@ -104,25 +95,22 @@ default_qkeep = [
 
     # common to some sites.., usually 'post'
    'p',
-]
+]  # fmt: skip
+
 
 # TODO perhaps, decide if fragment is meaningful (e.g. wiki) or random sequence of letters?
 class Spec(NamedTuple):
-    qkeep  : Optional[Union[Collection[str], bool]] = None
-    qremove: Optional[Set[str]] = None
-    fkeep  : bool = False
+    qkeep: Collection[str] | bool | None = None
+    qremove: set[str] | None = None
+    fkeep: bool = False
 
-    def keep_query(self, q: str) -> Optional[int]: # returns order
+    def keep_query(self, q: str) -> int | None:  # returns order
         if self.qkeep is True:
             return 1
-        qkeep = {
-            q: i for i, q in enumerate(chain(default_qkeep, self.qkeep or []))
-        }
+        qkeep = {q: i for i, q in enumerate(chain(default_qkeep, self.qkeep or []))}
         qremove = default_qremove.union(self.qremove or {})
         # I suppose 'remove' is only useful for logging. we remove by default anyway
 
-        keep = False
-        remove = False
         qk = qkeep.get(q)
         if qk is not None:
             return qk
@@ -134,13 +122,14 @@ class Spec(NamedTuple):
         return None
 
     @classmethod
-    def make(cls, **kwargs) -> 'Spec':
+    def make(cls, **kwargs) -> Spec:
         return cls(**kwargs)
 
+
 S = Spec
 
 # TODO perhaps these can be machine learnt from large set of urls?
-specs: Dict[str, Spec] = {
+specs: dict[str, Spec] = {
     'youtube.com': S(
         # TODO search_query?
         qkeep=[ # note: experimental.. order matters here
@@ -178,7 +167,6 @@ specs: Dict[str, Spec] = {
 
             'source', 'tsid', 'refsrc', 'pnref', 'rc', '_rdr', 'src', 'hc_location', 'section', 'permPage', 'soft', 'pn_ref', 'action',
             'ti', 'aref', 'event_time_id', 'action_history', 'filter', 'ref_notif_type', 'has_source', 'source_newsfeed_story_type',
-            'ref_notif_type',
         },
     ),
     'physicstravelguide.com': S(fkeep=True), # TODO instead, pass fkeep marker object for shorter spec?
@@ -189,9 +177,11 @@ specs: Dict[str, Spec] = {
     'play.google.com' : S(qkeep={'id'}),
     'answers.yahoo.com' : S(qkeep={'qid'}),
     'isfdb.org': S(qkeep=True),
-}
+}  # fmt: skip
 
 _def_spec = S()
+
+
 # TODO use cache?
 def get_spec(dom: str) -> Spec:
     # ugh. a bit ugly way of getting stuff without subdomain...
@@ -208,20 +198,19 @@ def get_spec(dom: str) -> Spec:
     return _def_spec
 
 
-
 # ideally we'd just be able to reference the domain name and use it in the subst?
 # some sort of hierarchical matchers? not sure what's got better performance..
 # if 'from?site=' in url:
 #     return p.query('site')
 
-Spec2 = Any
+Spec2 = Any  # TODO
 
 # TODO this should be a map
 Frag = Any
-Parts = Sequence[Tuple[str, str]]
+Parts = Sequence[tuple[str, str]]
 
 
-def _yc(domain: str, path: str, qq: Parts, frag: Frag) -> Tuple[Any, Any, Parts, Frag]:
+def _yc(domain: str, path: str, qq: Parts, frag: Frag) -> tuple[Any, Any, Parts, Frag]:
     if path[:5] == '/from':
         site = dict(qq).get('site')
         if site is not None:
@@ -232,7 +221,8 @@ def _yc(domain: str, path: str, qq: Parts, frag: Frag) -> Tuple[Any, Any, Parts,
     # TODO this should be in-place? for brevity?
     return (domain, path, qq, frag)
 
-def get_spec2(dom: str) -> Optional[Spec2]:
+
+def get_spec2(dom: str) -> Spec2 | None:
     return {
         'news.ycombinator.com': _yc,
     }.get(dom)
@@ -241,8 +231,9 @@ def get_spec2(dom: str) -> Optional[Spec2]:
 class CanonifyException(Exception):
     pass
 
+
 # TODO not so sure if it's better to quote or not?
-quote_via   = urllib.parse.quote
+quote_via = urllib.parse.quote
 unquote_via = urllib.parse.unquote
 
 
@@ -251,7 +242,9 @@ def _quote_path(path: str) -> str:
     nparts = []
     for p in parts:
         # TODO maybe re.match?
-        if '%' in p or '+' in p: # some urls are partially encoded... perhaps canonify needs hints indicating if url needs normalising or not
+        if (
+            '%' in p or '+' in p
+        ):  # some urls are partially encoded... perhaps canonify needs hints indicating if url needs normalising or not
             p = unquote_via(p)
         # TODO safe argumnet?
         nparts.append(quote_via(p))
@@ -269,44 +262,46 @@ def _prenormalise(url: str) -> str:
     # not sure how safe it in general...
     first_q = url.find('&')
     if first_q != -1:
-        return url[:first_q] + '?' + url[first_q + 1:]
+        return url[:first_q] + '?' + url[first_q + 1 :]
     return url
 
 
+Left = str | Sequence[str]
+Right = tuple[str, str, str]
+
+
 def transform_split(split: SplitResult):
     netloc = canonify_domain(split.netloc)
 
-    path     = split.path
-    qparts   = parse_qsl(split.query, keep_blank_values=True)
+    path = split.path
+    qparts = parse_qsl(split.query, keep_blank_values=True)
 
     fragment = split.fragment
 
-    ID   = r'(?P<id>[^/]+)'
-    REST = r'(?P<rest>.*)'
+    ID = r'(?P<id>[^/]+)'
+    # REST = r'(?P<rest>.*)'
 
-    Left = Union[str, Sequence[str]]
-    Right = Tuple[str, str, str]
     # the idea is that we can unify certain URLs here and map them to the 'canonical' one
     # this is a dict only for grouping but should be a list really.. todo
-    rules: Dict[Left, Right] = {
+    rules: dict[Left, Right] = {
         # TODO m. handling might be quite common
        # f'm.youtube.com/{REST}': ('youtube.com', '{rest}'),
        (
            f'youtu.be/{ID}',
            f'youtube.com/embed/{ID}',
-        ) : ('youtube.com', '/watch', 'v={id}'),
+        ): ('youtube.com', '/watch', 'v={id}'),
        # TODO wonder if there is a better candidate for canonical video link?
        # {DOMAIN} pattern? implicit?
        (
            'twitter.com/home',
            'twitter.com/explore',
-        ) : ('twitter.com', '', ''),
+        ): ('twitter.com', '', ''),
     }
 
     def iter_rules():
         for fr, to in rules.items():
             if isinstance(fr, str):
-                fr = (fr, )
+                fr = (fr,)
             for f in fr:
                 yield f, to
 
@@ -322,29 +317,28 @@ def transform_split(split: SplitResult):
             continue
         gd = m.groupdict()
         if len(to) == 2:
-            to = to + ('', )
+            to = (*to, '')
 
-        (netloc, path, qq) = [t.format(**gd) for t in to]
-        qparts.extend(parse_qsl(qq, keep_blank_values=True))
+        (netloc, path, qq) = (t.format(**gd) for t in to)
+        qparts.extend(parse_qsl(qq, keep_blank_values=True))  # TODO hacky..
         # TODO eh, qparts should really be a map or something...
         break
 
-
     return netloc, path, qparts, fragment
 
 
-
 def myunsplit(domain: str, path: str, query: str, fragment: str) -> str:
-    uns = urlunsplit((
-        '', # dummy protocol
-        domain,
-        path,
-        query,
-        fragment,
-    ))
-    uns = try_cutl('//', uns) # // due to dummy protocol
-    return uns
-
+    uns = urlunsplit(
+        (
+            '',  # dummy protocol
+            domain,
+            path,
+            query,
+            fragment,
+        )
+    )
+    uns = uns.removeprefix('//')  # // due to dummy protocol
+    return uns  # ty: ignore[invalid-return-type]  # see https://github.com/astral-sh/ty/issues/733
 
 
 #
@@ -361,7 +355,8 @@ def myunsplit(domain: str, path: str, query: str, fragment: str) -> str:
 # ]
 # for re in regexes:
 
-def handle_archive_org(url: str) -> Optional[str]:
+
+def handle_archive_org(url: str) -> str | None:
     are = r'web.archive.org/web/(?P<timestamp>\d+)/(?P<rest>.*)'
     m = re.fullmatch(are, url)
     if m is None:
@@ -397,7 +392,7 @@ def canonify(url: str) -> str:
 
     res = handle_archive_org(no_protocol)
     if res is not None:
-        assert len(res) < len(no_protocol)
+        assert len(res) < len(no_protocol)  # just a paranoia to avoid infinite recursion...
         return canonify(res)
 
     domain, path, qq, _frag = transform_split(parts)
@@ -407,7 +402,6 @@ def canonify(url: str) -> str:
         # meh
         domain, path, qq, _frag = spec2(domain, path, qq, _frag)
 
-
     spec = get_spec(domain)
 
     # TODO FIXME turn this logic back on?
@@ -427,11 +421,11 @@ def canonify(url: str) -> str:
     path = _quote_path(path)
 
     uns = myunsplit(domain, path, query, frag)
-    uns = try_cutr('/', uns) # not sure if there is a better way
+    uns = uns.removesuffix('/')  # not sure if there is a better way
     return uns
 
 
-
+# TODO wonder if lisp could be convenient for this. lol
 TW_PATTERNS = [
     {
         'U': r'[\w-]+',
@@ -469,7 +463,7 @@ TW_PATTERNS = [
     r'twitter.com/i/events/\d+',
 
     r'(dev|api|analytics|developer|help|support|blog|anywhere|careers|pic).twitter.com/.*',
-]
+]  # fmt: skip
 
 RD_PATTERNS = [
     {
@@ -500,7 +494,7 @@ RD_PATTERNS = [
     r'reddit.com/dev/api',
     r'reddit.com/api/v1/authorize',
     r'reddit.com/domain/.*',
-]
+]  # fmt: skip
 
 GH_PATTERNS = [
     {
@@ -543,7 +537,7 @@ GH_PATTERNS = [
     # TODO FIXME no canonical here
     # https://gist.github.com/dneto/2258454
     # same as https://gist.github.com/2258454
-]
+]  # fmt: skip
 
 YT_PATTERNS = [
     {
@@ -566,7 +560,7 @@ YT_PATTERNS = [
     r'youtube.com/feed/(subscriptions|library|trending|history)',
     r'youtube.com',
     r'youtube.com/(post_login|upload)',
-]
+]  # fmt: skip
 
 SOP = r'(^|\w+\.)stackoverflow.com'
 
@@ -589,7 +583,7 @@ SO_PATTERNS = [
     SOP + r'/users/UI',
     SOP + r'/users/UI/U',
     SOP,
-]
+]  # fmt: skip
 
 WKP = r'(^|.+\.)wikipedia.org'
 
@@ -599,7 +593,7 @@ WK_PATTERNS = [
     },
     WKP + '/wiki/AN',
     WKP,
-]
+]  # fmt: skip
 
 FB_PATTERNS = [
     {
@@ -627,7 +621,7 @@ FB_PATTERNS = [
     r'F/pages/U/P',
     r'F/stories/I',
     r'F/notes/U/P',
-]
+]  # fmt: skip
 
 PKP = r'^(app)?\.getpocket\.com'
 
@@ -636,7 +630,7 @@ PK_PATTERNS = [
         'ID': r'\d+',
     },
     PKP + '/read/ID',
-]
+]  # fmt: skip
 
 # NOTE: right, I think this is just for analysis so far... not actually used
 PATTERNS = {
@@ -649,7 +643,7 @@ PATTERNS = {
     'wikipedia' : WK_PATTERNS,
     'pocket'    : PK_PATTERNS,
     # 'news.ycombinator.com': YC_PATTERNS,
-}
+}  # fmt: skip
 
 
 def get_patterns():  # pragma: no cover
@@ -672,11 +666,13 @@ def get_patterns(): # pragma: no cover
         [rdict] = repls
         for p in pats:
             yield repl(p, rdict)
+
     return {k: list(handle(v)) for k, v in PATTERNS.items()}
 
 
-def domains(it):
+def domains(it):  # pragma: no cover
     from collections import Counter
+
     c: typing.Counter[str] = Counter()
     for line in it:
         url = line.strip()
@@ -687,18 +683,20 @@ def domains(it): # pragma: no cover
             c['ERROR'] += 1
             continue
         else:
-            udom = nurl[:nurl.find('/')]
+            udom = nurl[: nurl.find('/')]
             c[udom] += 1
     from pprint import pprint
+
     pprint(c.most_common(20))
 
 
-def groups(it, args):
+def groups(it, args):  # pragma: no cover
     all_pats = get_patterns()
 
     from collections import Counter
-    c: typing.Counter[Optional[str]] = Counter()
-    unmatched: List[str] = []
+
+    c: typing.Counter[str | None] = Counter()
+    unmatched: list[str] = []
 
     def dump():
         print(c)
@@ -709,7 +707,6 @@ def groups(it, args): # pragma: no cover
         if pat is None:
             unmatched.append(nurl)
 
-
     for i, line in enumerate(it):
         if i % 10000 == 0:
             pass
@@ -720,12 +717,12 @@ def groups(it, args): # pragma: no cover
         except CanonifyException as e:
             print(f"ERROR while normalising! {url} {e}")
             continue
-        udom = nurl[:nurl.find('/')]
+        udom = nurl[: nurl.find('/')]
         usplit = udom.split('.')
         patterns = None
         for dom, pats in all_pats.items():
             dsplit = dom.split('.')
-            if '$'.join(dsplit) in '$'.join(usplit):
+            if '$'.join(dsplit) in '$'.join(usplit):  # meh
                 patterns = pats
                 break
         else:
@@ -749,17 +746,17 @@ def groups(it, args): # pragma: no cover
     print(f"Unmatched: {nones / sum(c.values()) * 100:.1f}%")
     uc = Counter([u.split('/')[:2][-1] for u in unmatched]).most_common(10)
     from pprint import pprint
-    pprint(uc)
 
+    pprint(uc)
 
 
-def display(it, args) -> None:
+def display(it, args) -> None:  # pragma: no cover
     # TODO better name?
     import difflib
-    # pylint: disable=import-error
-    from termcolor import colored as C # type: ignore
     from sys import stdout
 
+    from termcolor import colored as C  # type: ignore[import-not-found]
+
     for line in it:
         line = line.strip()
         if args.human:
@@ -773,15 +770,16 @@ def display(it, args) -> None: # pragma: no cover
         can_ = ""
 
         pr = False
+
         def delete(x):
             nonlocal pr
             if x in (
-                    'https://www.',
-                    'http://www.',
-                    'http://',
-                    'https://',
-                    'file://',
-                    '/',
+                'https://www.',
+                'http://www.',
+                'http://',
+                'https://',
+                'file://',
+                '/',
             ):
                 col = None
             else:
@@ -800,9 +798,8 @@ def display(it, args) -> None: # pragma: no cover
                 fn = lambda x: C(x, color='cyan')
             # TODO exclude certain items from comparison?
 
-            org_ += fn(line[ff: tt])
-
-            can_ += fn(can[ff2: tt2])
+            org_ += fn(line[ff:tt])
+            can_ += fn(can[ff2:tt2])
             cl = max(len(org_), len(can_))
             org_ += ' ' * (cl - len(org_))
             can_ += ' ' * (cl - len(can_))
@@ -811,14 +808,17 @@ def display(it, args) -> None: # pragma: no cover
         stdout.write(f'{org_}\n{can_}\n---\n')
 
 
-def main() -> None:
+def main() -> None:  # pragma: no cover
     import argparse
-    p = argparse.ArgumentParser(epilog='''
+
+    p = argparse.ArgumentParser(
+        epilog='''
 - sqlite3 promnesia.sqlite 'select distinct orig_url from visits' | cannon.py --domains
 
 - running comparison
   sqlite3 promnesia.sqlite 'select distinct orig_url from visits where norm_url like "%twitter%" order by orig_url' | src/promnesia/cannon.py
-''', formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=100),
+''',
+        formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=100),
     )
     p.add_argument('input', nargs='?')
     p.add_argument('--human', action='store_true')
@@ -829,6 +829,7 @@ def main() -> None: # pragma: no cover
     it: Iterable[str]
     if args.input is None:
         import sys
+
         it = sys.stdin
     else:
         it = [args.input]
@@ -842,7 +843,7 @@ def main() -> None: # pragma: no cover
 
 
 if __name__ == '__main__':
-    main()
+    main()  # pragma: no cover
 
 # TODO hmm, it's actually sort of fingerprinter... so maybe that's what I should call it
 
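
For orientation, a hedged usage sketch of the canonify() entry point whose internals changed above; the expected outputs are inferred from the module docstring and the rules table, not verified against this exact build:

    from promnesia.cannon import canonify

    # the docstring's motivating pair should collapse to a single canonical form,
    # with the scheme and the mobile. subdomain stripped:
    print(canonify('https://mobile.twitter.com/demarionunn/status/928409560548769792'))
    print(canonify('https://twitter.com/demarionunn/status/928409560548769792'))
    # both are expected to print: twitter.com/demarionunn/status/928409560548769792

    # the rules table maps youtu.be/{ID} onto the canonical watch URL:
    print(canonify('https://youtu.be/dQw4w9WgXcQ'))
    # expected: youtube.com/watch?v=dQw4w9WgXcQ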