promnesia 1.1.20230129__py3-none-any.whl → 1.2.20240810__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (72)
  1. promnesia/__main__.py +58 -50
  2. promnesia/cannon.py +4 -4
  3. promnesia/common.py +57 -38
  4. promnesia/compare.py +3 -2
  5. promnesia/compat.py +6 -65
  6. promnesia/config.py +4 -2
  7. promnesia/database/common.py +66 -0
  8. promnesia/database/dump.py +187 -0
  9. promnesia/{read_db.py → database/load.py} +10 -11
  10. promnesia/extract.py +1 -0
  11. promnesia/kjson.py +1 -1
  12. promnesia/logging.py +14 -14
  13. promnesia/misc/__init__.pyi +0 -0
  14. promnesia/misc/config_example.py +1 -2
  15. promnesia/misc/install_server.py +5 -4
  16. promnesia/server.py +24 -24
  17. promnesia/sources/__init__.pyi +0 -0
  18. promnesia/sources/auto.py +12 -7
  19. promnesia/sources/browser.py +80 -293
  20. promnesia/sources/browser_legacy.py +298 -0
  21. promnesia/sources/demo.py +18 -2
  22. promnesia/sources/filetypes.py +8 -0
  23. promnesia/sources/github.py +2 -2
  24. promnesia/sources/hackernews.py +1 -2
  25. promnesia/sources/hypothesis.py +1 -1
  26. promnesia/sources/markdown.py +15 -15
  27. promnesia/sources/org.py +7 -3
  28. promnesia/sources/plaintext.py +3 -1
  29. promnesia/sources/reddit.py +2 -2
  30. promnesia/sources/rss.py +5 -1
  31. promnesia/sources/shellcmd.py +6 -2
  32. promnesia/sources/signal.py +29 -20
  33. promnesia/sources/smscalls.py +8 -1
  34. promnesia/sources/stackexchange.py +2 -2
  35. promnesia/sources/takeout.py +132 -12
  36. promnesia/sources/takeout_legacy.py +10 -2
  37. promnesia/sources/telegram.py +79 -123
  38. promnesia/sources/telegram_legacy.py +117 -0
  39. promnesia/sources/vcs.py +1 -1
  40. promnesia/sources/viber.py +6 -15
  41. promnesia/sources/website.py +1 -1
  42. promnesia/sqlite.py +42 -0
  43. promnesia/tests/__init__.py +0 -0
  44. promnesia/tests/common.py +137 -0
  45. promnesia/tests/server_helper.py +64 -0
  46. promnesia/tests/sources/__init__.py +0 -0
  47. promnesia/tests/sources/test_auto.py +66 -0
  48. promnesia/tests/sources/test_filetypes.py +42 -0
  49. promnesia/tests/sources/test_hypothesis.py +39 -0
  50. promnesia/tests/sources/test_org.py +65 -0
  51. promnesia/tests/sources/test_plaintext.py +26 -0
  52. promnesia/tests/sources/test_shellcmd.py +22 -0
  53. promnesia/tests/sources/test_takeout.py +58 -0
  54. promnesia/tests/test_cannon.py +325 -0
  55. promnesia/tests/test_cli.py +42 -0
  56. promnesia/tests/test_compare.py +30 -0
  57. promnesia/tests/test_config.py +290 -0
  58. promnesia/tests/test_db_dump.py +223 -0
  59. promnesia/tests/test_extract.py +61 -0
  60. promnesia/tests/test_extract_urls.py +43 -0
  61. promnesia/tests/test_indexer.py +245 -0
  62. promnesia/tests/test_server.py +292 -0
  63. promnesia/tests/test_traverse.py +41 -0
  64. promnesia/tests/utils.py +35 -0
  65. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/METADATA +14 -19
  66. promnesia-1.2.20240810.dist-info/RECORD +83 -0
  67. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/WHEEL +1 -1
  68. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/entry_points.txt +0 -1
  69. promnesia/dump.py +0 -105
  70. promnesia-1.1.20230129.dist-info/RECORD +0 -55
  71. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/LICENSE +0 -0
  72. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/top_level.txt +0 -0
promnesia/sources/auto.py CHANGED
@@ -1,6 +1,9 @@
 """
 - discovers files recursively
 - guesses the format (orgmode/markdown/json/etc) by the extension/MIME type
+- can index most of plaintext files, including source code!
+- autodetects Obsidian vault and adds `obsidian://` app protocol support [[file:../src/promnesia/sources/obsidian.py][promnesia.sources.obsidian]]
+- autodetects Logseq graph and adds `logseq://` app protocol support [[file:../src/promnesia/sources/logseq.py][promnesia.sources.logseq]]
 """
 
 import csv
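
The docstring additions above describe the new auto-detection features. For context, this source is typically wired into a promnesia config roughly like the sketch below (`~/notes` is a placeholder path):

    # minimal promnesia config sketch using the auto source
    from promnesia.common import Source
    from promnesia.sources import auto

    SOURCES = [
        # recursively discovers plaintext/org/markdown files;
        # Obsidian vaults and Logseq graphs are now autodetected
        Source(auto.index, '~/notes'),
    ]
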
@@ -19,17 +22,18 @@ import warnings
 import pytz
 
 from ..common import Visit, Url, PathIsh, get_logger, Loc, get_tmpdir, extract_urls, Extraction, Result, Results, mime, traverse, file_mtime, echain, logger
+from ..common import warn_once
 from ..config import use_cores
 
 
-from .filetypes import EUrl
+from .filetypes import EUrl, Ctx
 from .auto_obsidian import obsidian_replacer
 from .auto_logseq import logseq_replacer
 
 
 def _collect(thing, path: List[str], result: List[EUrl]) -> None:
     if isinstance(thing, str):
-        ctx: Ctx = tuple(path)  # type: ignore
+        ctx: Ctx = tuple(path)
         result.extend([EUrl(url=u, ctx=ctx) for u in extract_urls(thing)])
     elif isinstance(thing, list):
         path.append('[]')
@@ -164,7 +168,7 @@ for t in CODE:
 Replacer = Optional[Callable[[str, str], str]]
 
 def index(
-        *paths: Union[PathIsh],
+        *paths: PathIsh,
         ignored: Union[Sequence[str], str]=(),
         follow: bool=True,
         replacer: Replacer=None,
@@ -279,6 +283,8 @@ def by_path(pp: Path) -> Tuple[Optional[Ex], Optional[Mime]]:
 
 def _index_file(pp: Path, opts: Options) -> Results:
     logger = get_logger()
+    # TODO need to keep debug logs here...
+    # logger.info(f"indexing {pp}")
     # TODO use kompress?
     # TODO not even sure if it's used...
     suf = pp.suffix.lower()
@@ -304,10 +310,9 @@ def _index_file(pp: Path, opts: Options) -> Results:
 
     ip, pm = by_path(pp)
     if ip is None:
-        # TODO use warning (with mime/ext as key?)
-        # TODO only log once? # hmm..
+        # todo not really sure about using warnings vs yielding error here?
         msg = f'No extractor for suffix {suf}, mime {pm}'
-        warnings.warn(msg)
+        warn_once(msg)
         yield echain(ex, RuntimeError(msg))
         return
 
@@ -315,7 +320,7 @@ def _index_file(pp: Path, opts: Options) -> Results:
 
     def indexer() -> Union[Urls, Results]:
         # eh, annoying.. need to make more generic..
-        idx = ip(pp)  # type: ignore
+        idx = ip(pp)
         try:
             yield from idx
         except Exception as e:
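
The main behavioural change in this file: repeated "No extractor" warnings are now deduplicated via `warn_once` from `promnesia.common` instead of emitted anew by `warnings.warn` for every file. The implementation is not part of this diff; a minimal sketch of the idea, assuming deduplication is keyed on the message text:

    # hypothetical sketch of warn_once-style deduplication
    # (not the actual promnesia.common implementation)
    import functools
    import warnings

    @functools.lru_cache(maxsize=None)
    def warn_once(msg: str) -> None:
        # lru_cache makes repeated calls with the same message no-ops
        warnings.warn(msg)
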
promnesia/sources/browser.py CHANGED
@@ -1,302 +1,89 @@
-from datetime import datetime
-from pathlib import Path
-from urllib.parse import unquote
-import sqlite3
-from typing import List, Set
+'''
+Uses [[https://github.com/karlicoss/HPI][HPI]] for visits from web browsers.
+'''
 
-import pytz
+import re
+from typing import Optional, Iterator, Any, TYPE_CHECKING
+import warnings
 
-from ..common import PathIsh, Results, Visit, Loc, get_logger, Second, mime
-from .. import config
+from promnesia.common import Results, Visit, Loc, Second, PathIsh, logger, is_sqlite_db
 
-# todo mcachew?
-from cachew import cachew
 
-logger = get_logger()
+def index(p: Optional[PathIsh]=None) -> Results:
+    from . import hpi
 
-
-def index(p: PathIsh) -> Results:
-    pp = Path(p)
-    assert pp.exists(), pp  # just in case of broken symlinks
-
-    # is_file check because it also returns dirs
-    # TODO hmm, not sure what I meant here -- which dirs? behind symlinks?
-    is_db = lambda x: x.is_file() and mime(x) in {
-        'application/x-sqlite3',
-        'application/vnd.sqlite3',
-        # TODO this mime can also match wal files/journals, not sure
-    }
-
-    # todo warn if filtered out too many?
-    # todo wonder how quickly mimes can be computed?
-    # todo ugh, dunno, maybe this really belongs to hpi?? need get_files etc...
-    dbs = [p for p in sorted(pp.rglob('*')) if is_db(p)]
-
-    assert len(dbs) > 0, pp
-    logger.info('processing %d databases', len(dbs))
-    cname = str('_'.join(pp.parts[1:]))  # meh
-    yield from _index_dbs(dbs, cachew_name=cname)
-
-
-
-def _index_dbs(dbs: List[Path], cachew_name: str):
-    # TODO right... not ideal, need to think how to handle it properly...
-    import sys
-    sys.setrecursionlimit(5000)
-
-    cache_dir = config.get().cache_dir
-    cpath = None if cache_dir is None else cache_dir / cachew_name
-    emitted: Set = set()
-    yield from _index_dbs_aux(cpath, dbs, emitted=emitted)
-
-
-# todo wow, stack traces are ridiculous here...
-# todo hmm, feels like it should be a class or something?
-@cachew(lambda cp, dbs, emitted: cp, depends_on=lambda cp, dbs, emitted: dbs)  # , logger=logger)
-def _index_dbs_aux(cache_path: Path, dbs: List[Path], emitted: Set) -> Results:
-    if len(dbs) == 0:
+    if p is None:
+        from my.browser.all import history
+        yield from _index_new(history())
         return
 
-    xs = dbs[:-1]
-    x = dbs[-1:]
-
-    xs_res = _index_dbs_aux(cache_path, xs, emitted)
-    xs_was_cached = False
-    for r in xs_res:
-        # if it was cached, emitted would be empty
-        if len(emitted) == 0:
-            xs_was_cached = True
-            logger.debug('seems that %d first items were previously cached', len(xs))
-        if xs_was_cached:
-            key = (r.url, r.dt)
-            assert key not in emitted, key  # todo not sure if this assert is necessary?
-            # hmm ok it might happen if we messed up with indexing individual db?
-            # alternatively, could abuse it to avoid messing with 'emitted' in _index_db?
-            emitted.add(key)
-        yield r  # todo not sure about exceptions?
-
-    for db in x:
-        yield from _index_db(db, emitted=emitted)
-
-
-def _index_db(db: Path, emitted: Set):
-    logger.info('processing %s', db)  # debug level?
-
-    # todo schema check (not so critical for cachew though)
-    total = 0
-    new = 0
-    loc = Loc.file(db)  # todo possibly needs to be optimized -- moving from within the loop considerably speeds everything up
-    with sqlite3.connect(f'file:{db}?immutable=1', uri=True) as c:
-        browser = None
-        for b in [Chrome, Firefox, FirefoxPhone, Safari]:
-            try:
-                c.execute(f'SELECT * FROM {b.detector}')
-            except sqlite3.OperationalError:  # not sure if the right kind?
-                pass
-            else:
-                browser = b
-                break
-        assert browser is not None
-
-        proj = ', '.join(c for c, _ in browser.schema.cols)
-        query = browser.query.replace('chunk.', '')
-
-        c.row_factory = sqlite3.Row
-        for r in c.execute(f'select {proj} {query}'):
-            v = browser.row2visit(r, loc)
-            total += 1
-
-            key = (v.url, v.dt)
-            # todo how to keep keys compatible?
-            if key in emitted:
-                continue
-            yield v
-            emitted.add(key)
-            new += 1
-
-    # eh, ok, almost 2x faster if I don't construct Visit first
-    # maybe it's Loc.file that's too slow?
-    # yeah, seems like it, 4.1 s after computing it only once
-
-    logger.info('%s: %d/%d new visits', db, new, total)
-
-
-Col = str
-ColType = str
-
-
-from typing import Any, NamedTuple, Tuple, Union, Sequence, Optional
-
-class Schema(NamedTuple):
-    cols: Sequence[Tuple[Col, ColType]]
-    key: Sequence[str]
-
-
-SchemaCheck = Tuple[str, Union[str, Sequence[str]]]  # todo Union: meh
-
-from dataclasses import dataclass
-
-# todo protocol?
-@dataclass
-class Extr:
-    detector: str
-    schema_check: SchemaCheck
-    schema: Schema
-    query: str
-
-    # todo calllable?
-    @staticmethod
-    def row2visit(row: sqlite3.Row, loc: Loc) -> Visit:
-        raise NotImplementedError
-
-
-class Chrome(Extr):
-    detector='keyword_search_terms'
-    schema_check=(
-        'visits', [
-            'visits', "id, url, visit_time, from_visit, transition, segment_id, visit_duration, incremented_omnibox_typed_score",
-            'visits', "id, url, visit_time, from_visit, transition, segment_id, visit_duration"
-        ]
-    )
-    schema=Schema(cols=[
-        ('U.url' , 'TEXT' ),
-
-        # while these two are not very useful, might be good to have just in case for some debugging
-        ('U.id AS urlid' , 'INTEGER'),
-        ('V.id AS vid' , 'INTEGER'),
-
-        ('V.visit_time' , 'INTEGER NOT NULL'),
-        ('V.from_visit' , 'INTEGER' ),
-        ('V.transition' , 'INTEGER NOT NULL'),
-        # V.segment_id looks useless
-        ('V.visit_duration' , 'INTEGER NOT NULL'),
-        # V.omnibox thing looks useless
-    ], key=('url', 'visit_time', 'vid', 'urlid'))
-    query='FROM chunk.visits as V, chunk.urls as U WHERE V.url = U.id'
-
-    @staticmethod
-    def row2visit(row: sqlite3.Row, loc: Loc) -> Visit:
-        url = row['url']
-        ts = row['visit_time']
-        durs = row['visit_duration']
-
-        dt = chrome_time_to_utc(int(ts))
-        url = unquote(url)  # chrome urls are all quoted
-        dd = int(durs)
-        dur: Optional[Second] = None if dd == 0 else dd // 1_000_000
-        return Visit(
-            url=url,
-            dt=dt,
-            locator=loc,
-            duration=dur,
-        )
-
-
-# should be utc? https://stackoverflow.com/a/26226771/706389
-# yep, tested it and looks like utc
-def chrome_time_to_utc(chrome_time: int) -> datetime:
-    epoch = (chrome_time / 1_000_000) - 11644473600
-    return datetime.fromtimestamp(epoch, pytz.utc)
-
-
-def _row2visit_firefox(row: sqlite3.Row, loc: Loc) -> Visit:
-    url = row['url']
-    ts = float(row['visit_date'])
-    # ok, looks like it's unix epoch
-    # https://stackoverflow.com/a/19430099/706389
-
-    # NOTE: ugh. on Fenix (experimental Android version) it uses milliseconds, not nanos...
-    # about year 2001... if someone has browser history exports before that -- please let me know, I'm impressed
-    threshold = 1000000000
-    if ts > threshold * 1_000_000:
-        # presumably it's in microseconds
-        ts /= 1_000_000
-    else:
-        # milliseconds
-        ts /= 1_000
-    dt = datetime.fromtimestamp(ts, pytz.utc)
-    url = unquote(url)  # firefox urls are all quoted
-    return Visit(
-        url=url,
-        dt=dt,
-        locator=loc,
-    )
-
-# https://web.archive.org/web/20201026130310/http://fileformats.archiveteam.org/wiki/History.db
-class Safari(Extr):
-    detector='history_tombstones'
-    schema_check=(
-        'history_visits', [
-            'history_visits', "id, history_item, visit_time",
-            'history_items', "id, url"
-        ]
+    warnings.warn(
+        f'Passing paths to promnesia.sources.browser is deprecated, you should setup my.browser.export instead. '
+        f'See https://github.com/seanbreckenridge/browserexport#hpi .'
+        f'Will try to hack path to browser databases {p} into HPI config.'
     )
-    schema=Schema(cols=[
-        ('U.url' , 'TEXT' ),
-
-        # while these two are not very useful, might be good to have just in case for some debugging
-        ('U.id AS urlid' , 'INTEGER'),
-        ('V.id AS vid' , 'INTEGER'),
-
-        ('V.visit_time' , 'INTEGER NOT NULL'),
-        # ('V.from_visit' , 'INTEGER' ),
-        # ('V.transition' , 'INTEGER NOT NULL'),
-        # V.segment_id looks useless
-        # ('V.visit_duration' , 'INTEGER NOT NULL'),
-        # V.omnibox thing looks useless
-    ], key=('url', 'visit_time', 'vid', 'urlid'))
-    query='FROM chunk.history_visits as V, chunk.history_items as U WHERE V.history_item = U.id'
-
-    @staticmethod
-    def row2visit(row: sqlite3.Row, loc: Loc) -> Visit:
-        url = row['url']
-        ts = row['visit_time'] + 978307200  # https://stackoverflow.com/a/34546556/16645
-        dt = datetime.fromtimestamp(ts, pytz.utc)
-
-        return Visit(
-            url=url,
-            dt=dt,
-            locator=loc,
+    try:
+        yield from _index_new_with_adhoc_config(path=p)
+        return
+    except Exception as e:
+        logger.exception(e)
+        warnings.warn("Hacking my.config.browser.export didn't work. You probably need to update HPI.")
+
+    logger.warning("Falling back onto legacy promnesia.sources.browser_legacy module")
+    yield from _index_old(path=p)
+
+
+def _index_old(*, path: PathIsh) -> Results:
+    from . import browser_legacy
+    yield from browser_legacy.index(path)
+
+
+def _index_new_with_adhoc_config(*, path: PathIsh) -> Results:
+    from . import hpi
+
+    ## previously, it was possible to index be called with multiple different db search paths
+    ## this would result in each subsequent call to my.browser.export.history to invalidate cache every time
+    ## so we hack cachew path so it's different for each call
+    from my.core.core_config import config as hpi_core_config
+    hpi_cache_dir = hpi_core_config.get_cache_dir()
+    sanitized_path = re.sub(r'\W', '_', str(path))
+    cache_override = None if hpi_cache_dir is None else hpi_cache_dir / sanitized_path
+    ##
+
+    from my.core.common import classproperty, Paths, get_files
+    class config:
+        class core:
+            cache_dir = cache_override
+
+        class browser:
+            class export:
+                @classproperty
+                def export_path(cls) -> Paths:
+                    return tuple([f for f in get_files(path, glob='**/*') if is_sqlite_db(f)])
+
+    from my.core.cfg import tmp_config
+    with tmp_config(modules='my.browser.export|my.core.core_config', config=config):
+        from my.browser.export import history
+        yield from _index_new(history())
+
+
+if TYPE_CHECKING:
+    from browserexport.merge import Visit as BrowserMergeVisit
+else:
+    BrowserMergeVisit = Any
+
+
+def _index_new(history: Iterator[BrowserMergeVisit]) -> Results:
+    for v in history:
+        desc: Optional[str] = None
+        duration: Optional[Second] = None
+        metadata = v.metadata
+        if metadata is not None:
+            desc = metadata.title
+            duration = metadata.duration
+        yield Visit(
+            url=v.url,
+            dt=v.dt,
+            locator=Loc(title=desc or v.url, href=v.url),
+            duration=duration,
         )
-
-# https://web.archive.org/web/20190730231715/https://www.forensicswiki.org/wiki/Mozilla_Firefox_3_History_File_Format#moz_historyvisits
-class Firefox(Extr):
-    detector='moz_meta'
-    schema_check=('moz_historyvisits', "id, from_visit, place_id, visit_date, visit_type")
-    schema=Schema(cols=[
-        ('P.url' , 'TEXT'),
-
-        ('P.id AS pid' , 'INTEGER'),
-        ('V.id AS vid' , 'INTEGER'),
-
-        ('V.from_visit', 'INTEGER'),
-        ('V.visit_date', 'INTEGER'),
-        ('V.visit_type', 'INTEGER'),
-
-        # not sure what session is form but could be useful?..
-        # NOTE(20210410): for now, commented it out since some older databases from phone have this column commented?
-        # needs to be defensive
-        # ('V.session' , 'INTEGER'),
-    ], key=('url', 'visit_date', 'vid', 'pid'))
-    query='FROM chunk.moz_historyvisits as V, chunk.moz_places as P WHERE V.place_id = P.id'
-
-    row2visit = _row2visit_firefox
-
-
-class FirefoxPhone(Extr):
-    detector='remote_devices'
-    schema_check=('visits', "_id, history_guid, visit_type, date, is_local")
-    schema=Schema(cols=[
-        ('H.url' , 'TEXT NOT NULL' ),
-
-        ('H.guid AS guid' , 'TEXT' ),
-        ('H._id AS hid' , 'INTEGER' ),
-        ('V._id AS vid' , 'INTEGER' ),
-
-        ('V.visit_type' , 'INTEGER NOT NULL'),
-        ('V.date as visit_date', 'INTEGER NOT NULL'),
-        # ('is_local' , 'INTEGER NOT NULL'),
-    ], key=('url', 'date', 'vid', 'hid'))
-    query='FROM chunk.visits as V, chunk.history as H WHERE V.history_guid = H.guid'
-
-    row2visit = _row2visit_firefox
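
The ad-hoc `class config` in `_index_new_with_adhoc_config` mirrors the shape HPI expects from a permanent setup, which is what the deprecation warning asks users to do instead of passing paths to this source. A hypothetical static config might look like the sketch below (the config module location and the export path depend on your HPI setup):

    # in your HPI config (my.config) -- illustrative sketch only
    class browser:
        class export:
            # sqlite databases created by https://github.com/seanbreckenridge/browserexport
            export_path = '~/data/browsing/*.sqlite'
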