promnesia 1.2.20230515__py3-none-any.whl → 1.2.20240810__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their respective public registries. It is provided for informational purposes only.
- promnesia/__main__.py +26 -14
- promnesia/cannon.py +4 -4
- promnesia/common.py +39 -28
- promnesia/compare.py +3 -2
- promnesia/config.py +4 -2
- promnesia/database/common.py +66 -0
- promnesia/database/dump.py +187 -0
- promnesia/{read_db.py → database/load.py} +10 -11
- promnesia/extract.py +1 -0
- promnesia/kjson.py +1 -1
- promnesia/logging.py +3 -3
- promnesia/misc/__init__.pyi +0 -0
- promnesia/misc/config_example.py +1 -2
- promnesia/misc/install_server.py +2 -3
- promnesia/server.py +18 -19
- promnesia/sources/__init__.pyi +0 -0
- promnesia/sources/auto.py +9 -7
- promnesia/sources/browser_legacy.py +11 -5
- promnesia/sources/demo.py +18 -2
- promnesia/sources/filetypes.py +7 -0
- promnesia/sources/github.py +2 -2
- promnesia/sources/hypothesis.py +1 -1
- promnesia/sources/markdown.py +15 -15
- promnesia/sources/org.py +7 -3
- promnesia/sources/plaintext.py +3 -1
- promnesia/sources/reddit.py +2 -2
- promnesia/sources/rss.py +1 -1
- promnesia/sources/signal.py +22 -14
- promnesia/sources/stackexchange.py +2 -2
- promnesia/sources/takeout.py +58 -1
- promnesia/sources/takeout_legacy.py +10 -2
- promnesia/tests/__init__.py +0 -0
- promnesia/tests/common.py +137 -0
- promnesia/tests/server_helper.py +64 -0
- promnesia/tests/sources/__init__.py +0 -0
- promnesia/tests/sources/test_auto.py +66 -0
- promnesia/tests/sources/test_filetypes.py +42 -0
- promnesia/tests/sources/test_hypothesis.py +39 -0
- promnesia/tests/sources/test_org.py +65 -0
- promnesia/tests/sources/test_plaintext.py +26 -0
- promnesia/tests/sources/test_shellcmd.py +22 -0
- promnesia/tests/sources/test_takeout.py +58 -0
- promnesia/tests/test_cannon.py +325 -0
- promnesia/tests/test_cli.py +42 -0
- promnesia/tests/test_compare.py +30 -0
- promnesia/tests/test_config.py +290 -0
- promnesia/tests/test_db_dump.py +223 -0
- promnesia/tests/test_extract.py +61 -0
- promnesia/tests/test_extract_urls.py +43 -0
- promnesia/tests/test_indexer.py +245 -0
- promnesia/tests/test_server.py +292 -0
- promnesia/tests/test_traverse.py +41 -0
- promnesia/tests/utils.py +35 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/METADATA +13 -17
- promnesia-1.2.20240810.dist-info/RECORD +83 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/WHEEL +1 -1
- {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/entry_points.txt +0 -1
- promnesia/dump.py +0 -105
- promnesia-1.2.20230515.dist-info/RECORD +0 -58
- {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/LICENSE +0 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/top_level.txt +0 -0
promnesia/logging.py
CHANGED
```diff
@@ -61,7 +61,7 @@ _init_done = 'lazylogger_init_done'
 def setup_logger(logger: logging.Logger, level: LevelIsh) -> None:
     lvl = mklevel(level)
     try:
-        import logzero  # type: ignore[import]
+        import logzero  # type: ignore[import-not-found]
         formatter = logzero.LogFormatter(
             fmt=FORMAT_COLOR,
             datefmt=DATEFMT,
@@ -75,7 +75,7 @@ def setup_logger(logger: logging.Logger, level: LevelIsh) -> None:
     logger.addFilter(AddExceptionTraceback())
     if use_logzero and not COLLAPSE_DEBUG_LOGS:  # all set, nothing to do
         # 'simple' setup
-        logzero.setup_logger(logger.name, level=lvl, formatter=formatter)
+        logzero.setup_logger(logger.name, level=lvl, formatter=formatter)  # type: ignore[possibly-undefined]
         return

     h = CollapseDebugHandler() if COLLAPSE_DEBUG_LOGS else logging.StreamHandler()
@@ -101,7 +101,7 @@ class LazyLogger(logging.Logger):
         # oh god.. otherwise might go into an inf loop
         if not hasattr(logger, _init_done):
             setattr(logger, _init_done, False)  # will setup on the first call
-            logger.isEnabledFor = isEnabledFor_lazyinit  # type: ignore[assignment]
+            logger.isEnabledFor = isEnabledFor_lazyinit  # type: ignore[method-assign]
         return cast(LazyLogger, logger)
```
promnesia/misc/config_example.py
CHANGED
promnesia/misc/install_server.py
CHANGED
```diff
@@ -7,6 +7,7 @@ import sys
 import time
 from pathlib import Path
 import platform
+import shutil
 from subprocess import check_call, run
 from typing import List

@@ -118,9 +119,7 @@ def install(args: argparse.Namespace) -> None:
     if os.environ.get('DIRTY_RUN') is not None:
         launcher = str(root() / 'scripts/promnesia')
     else:
-
-        import distutils.spawn
-        exe = distutils.spawn.find_executable('promnesia'); assert exe is not None
+        exe = shutil.which('promnesia'); assert exe is not None
         launcher = exe  # older systemd wants absolute paths..

     db = args.db
```
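The `install_server` change swaps `distutils.spawn.find_executable` for `shutil.which`, ahead of distutils' removal in Python 3.12. A minimal sketch of the replacement:

```python
import shutil

# shutil.which is the stdlib replacement for the deprecated
# distutils.spawn.find_executable: it returns the absolute path of the
# first matching executable on PATH, or None if nothing matches.
exe = shutil.which('promnesia')
assert exe is not None, 'promnesia executable not found on PATH'
print(exe)
```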
promnesia/server.py
CHANGED
```diff
@@ -1,12 +1,11 @@
 #!/usr/bin/python3
 from __future__ import annotations

-__package__ = 'promnesia'  # ugh. hacky way to make wsgi runner work properly...
-
 import argparse
 from dataclasses import dataclass
 from datetime import timedelta
 from functools import lru_cache
+import importlib.metadata
 import json
 import logging
 import os
@@ -19,7 +18,7 @@ from pytz import BaseTzInfo

 import fastapi

-from sqlalchemy import
+from sqlalchemy import literal, between, or_, and_, exc, select
 from sqlalchemy import Column, Table, func, types
 from sqlalchemy.sql.elements import ColumnElement
 from sqlalchemy.sql import text
@@ -27,6 +26,7 @@ from sqlalchemy.sql import text

 from .common import PathWithMtime, DbVisit, Url, setup_logger, default_output_dir, get_system_tz
 from .cannon import canonify
+from .database.load import DbStuff, get_db_stuff, row_to_db_visit


 Json = Dict[str, Any]
@@ -51,8 +51,7 @@ def get_logger() -> logging.Logger:


 def get_version() -> str:
-    from pkg_resources import get_distribution
-    return get_distribution(__package__).version
+    return importlib.metadata.version(__package__)


 class ServerConfig(NamedTuple):
@@ -119,8 +118,6 @@ def get_db_path(check: bool=True) -> Path:
     return db


-from .read_db import DbStuff, get_db_stuff
-
 @lru_cache(1)
 # PathWithMtime aids lru_cache in reloading the sqlalchemy binder
 def _get_stuff(db_path: PathWithMtime) -> DbStuff:
@@ -136,7 +133,7 @@ def get_stuff(db_path: Optional[Path]=None) -> DbStuff:  # TODO better name


 def db_stats(db_path: Path) -> Json:
-    engine,
+    engine, table = get_stuff(db_path)
     query = select(func.count()).select_from(table)
     with engine.connect() as conn:
         total = list(conn.execute(query))[0][0]
@@ -151,8 +148,8 @@ class Where(Protocol):

 @dataclass
 class VisitsResponse:
-    original_url:
-    normalised_url:
+    original_url: str
+    normalised_url: str
     visits: Any


@@ -167,7 +164,7 @@ def search_common(url: str, where: Where) -> VisitsResponse:
     url = original_url
     logger.info('normalised url: %s', url)

-    engine,
+    engine, table = get_stuff()

     query = table.select().where(where(table=table, url=url))
     logger.debug('query: %s', query)
@@ -175,7 +172,7 @@ def search_common(url: str, where: Where) -> VisitsResponse:
     with engine.connect() as conn:
         try:
             # TODO make more defensive here
-            visits: List[DbVisit] = [
+            visits: List[DbVisit] = [row_to_db_visit(row) for row in conn.execute(query)]
         except exc.OperationalError as e:
             if getattr(e, 'msg', None) == 'no such table: visits':
                 logger.warn('you may have to run indexer first!')
@@ -232,6 +229,7 @@ def status() -> Json:
     try:
         version = get_version()
     except Exception as e:
+        logger.exception(e)
         version = None

     return {
@@ -241,10 +239,9 @@ def status() -> Json:
     }


-from dataclasses import dataclass
 @dataclass
 class VisitsRequest:
-    url:
+    url: str

 @app.get ('/visits', response_model=VisitsResponse)
 @app.post('/visits', response_model=VisitsResponse)
@@ -255,15 +252,17 @@ def visits(request: VisitsRequest) -> VisitsResponse:
         url=url,
         # odd, doesn't work just with: x or (y and z)
         where=lambda table, url: or_(
-
-
+            # exact match
+            table.c.norm_url == url,
+            # + child visits, but only 'interesting' ones
+            and_(table.c.context != None, table.c.norm_url.startswith(url, autoescape=True))  # noqa: E711
         ),
     )


 @dataclass
 class SearchRequest:
-    url:
+    url: str

 @app.get ('/search', response_model=VisitsResponse)
 @app.post('/search', response_model=VisitsResponse)
@@ -361,7 +360,7 @@ def visited(request: VisitedRequest) -> VisitedResponse:
     if len(snurls) == 0:
         return []

-    engine,
+    engine, table = get_stuff()

     # sqlalchemy doesn't seem to support SELECT FROM (VALUES (...)) in its api
     # also doesn't support array binding...
@@ -389,7 +388,7 @@ SELECT queried, visits.*
     # brings down large queries to 50ms...
     with engine.connect() as conn:
         res = list(conn.execute(query))
-    present: Dict[str, Any] = {row[0]:
+    present: Dict[str, Any] = {row[0]: row_to_db_visit(row[1:]) for row in res}
     results = []
     for nu in nurls:
         r = present.get(nu, None)
```
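`get_version` now reads the installed distribution's metadata via the stdlib `importlib.metadata` (available since Python 3.8) instead of the deprecated `pkg_resources`. A small sketch of the equivalence:

```python
import importlib.metadata

# modern stdlib lookup, as used by the new get_version()
version = importlib.metadata.version('promnesia')

# the pkg_resources spelling it replaces:
#   from pkg_resources import get_distribution
#   version = get_distribution('promnesia').version
print(version)
```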
promnesia/sources/auto.py
CHANGED
```diff
@@ -22,17 +22,18 @@ import warnings
 import pytz

 from ..common import Visit, Url, PathIsh, get_logger, Loc, get_tmpdir, extract_urls, Extraction, Result, Results, mime, traverse, file_mtime, echain, logger
+from ..common import warn_once
 from ..config import use_cores


-from .filetypes import EUrl
+from .filetypes import EUrl, Ctx
 from .auto_obsidian import obsidian_replacer
 from .auto_logseq import logseq_replacer


 def _collect(thing, path: List[str], result: List[EUrl]) -> None:
     if isinstance(thing, str):
-        ctx: Ctx = tuple(path)
+        ctx: Ctx = tuple(path)
         result.extend([EUrl(url=u, ctx=ctx) for u in extract_urls(thing)])
     elif isinstance(thing, list):
         path.append('[]')
@@ -167,7 +168,7 @@ for t in CODE:
 Replacer = Optional[Callable[[str, str], str]]

 def index(
-    *paths:
+    *paths: PathIsh,
     ignored: Union[Sequence[str], str]=(),
     follow: bool=True,
     replacer: Replacer=None,
@@ -282,6 +283,8 @@ def by_path(pp: Path) -> Tuple[Optional[Ex], Optional[Mime]]:

 def _index_file(pp: Path, opts: Options) -> Results:
     logger = get_logger()
+    # TODO need to keep debug logs here...
+    # logger.info(f"indexing {pp}")
     # TODO use kompress?
     # TODO not even sure if it's used...
     suf = pp.suffix.lower()
@@ -307,10 +310,9 @@ def _index_file(pp: Path, opts: Options) -> Results:
     ip, pm = by_path(pp)
     if ip is None:
-        #
-        # TODO only log once? # hmm..
+        # todo not really sure about using warnings vs yielding error here?
         msg = f'No extractor for suffix {suf}, mime {pm}'
-
+        warn_once(msg)
         yield echain(ex, RuntimeError(msg))
         return

@@ -318,7 +320,7 @@ def _index_file(pp: Path, opts: Options) -> Results:

     def indexer() -> Union[Urls, Results]:
         # eh, annoying.. need to make more generic..
-        idx = ip(pp)
+        idx = ip(pp)
         try:
             yield from idx
         except Exception as e:
```
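`warn_once` is imported from `promnesia.common`, which isn't shown in this diff. A plausible sketch of such a helper (hypothetical — the real implementation may differ) deduplicates repeated messages so a directory full of unindexable files warns only once per distinct message:

```python
import warnings
from functools import lru_cache

@lru_cache(maxsize=None)
def warn_once(msg: str) -> None:
    # lru_cache keys on the message, so each distinct warning is
    # emitted at most once per process (hypothetical sketch)
    warnings.warn(msg, stacklevel=2)

warn_once('No extractor for suffix .xyz, mime None')
warn_once('No extractor for suffix .xyz, mime None')  # suppressed by the cache
```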
promnesia/sources/browser_legacy.py
CHANGED
```diff
@@ -2,15 +2,21 @@ from datetime import datetime
 from pathlib import Path
 from urllib.parse import unquote
 import sqlite3
-from typing import List, Set
+from typing import List, Set, Optional

 import pytz

 from ..common import PathIsh, Results, Visit, Loc, logger, Second, is_sqlite_db
 from .. import config

-
-from cachew import cachew
+try:
+    from cachew import cachew
+except ModuleNotFoundError as me:
+    if me.name != 'cachew':
+        raise me
+    # this module is legacy anyway, so just make it defensive
+    def cachew(*args, **kwargs):  # type: ignore[no-redef]
+        return lambda f: f


 def index(p: PathIsh) -> Results:
@@ -43,7 +49,7 @@ def _index_dbs(dbs: List[Path], cachew_name: str):
     # todo wow, stack traces are ridiculous here...
     # todo hmm, feels like it should be a class or something?
     @cachew(lambda cp, dbs, emitted: cp, depends_on=lambda cp, dbs, emitted: dbs)  # , logger=logger)
-    def _index_dbs_aux(cache_path: Path, dbs: List[Path], emitted: Set) -> Results:
+    def _index_dbs_aux(cache_path: Optional[Path], dbs: List[Path], emitted: Set) -> Results:
         if len(dbs) == 0:
             return

@@ -58,7 +64,7 @@ def _index_dbs_aux(cache_path: Path, dbs: List[Path], emitted: Set) -> Results:
                 xs_was_cached = True
                 logger.debug('seems that %d first items were previously cached', len(xs))
             if xs_was_cached:
-                key = (r.url, r.dt)
+                key = str(r) if isinstance(r, Exception) else (r.url, r.dt)
                 assert key not in emitted, key  # todo not sure if this assert is necessary?
                 # hmm ok it might happen if we messed up with indexing individual db?
                 # alternatively, could abuse it to avoid messing with 'emitted' in _index_db?
```
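The hunk above makes the `cachew` dependency optional by falling back to a no-op decorator factory when the module is missing. A self-contained demo of the fallback path (pretending `cachew` is absent):

```python
def cachew(*args, **kwargs):
    # no-op stand-in: accept whatever arguments the real cachew would,
    # then return the decorated function unchanged
    return lambda f: f

@cachew(lambda cp: cp, depends_on=lambda cp: cp)
def _index(cp):
    return [cp]

assert _index('history.sqlite') == ['history.sqlite']  # behaves as if undecorated
```

Note the `me.name != 'cachew'` guard: it re-raises when the `ModuleNotFoundError` comes from one of cachew's own missing dependencies rather than cachew itself, so unrelated breakage isn't silently masked.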
promnesia/sources/demo.py
CHANGED
```diff
@@ -4,17 +4,33 @@ Generates a sequence of fake evenly separated visits
 '''

 from datetime import datetime, timedelta
+from typing import Union

 from ..common import Results, Visit, Loc


-def index(count: int=100) -> Results:
+IsoFormatDt = str
+Seconds = int
+
+
+# TODO allow passing isoformat string as base_dt?
+# and maybe something similar as delta? start with seconds maybe
+def index(
+    count: int=100,
+    *,
+    base_dt: Union[datetime, IsoFormatDt] = datetime.min + timedelta(days=5000),
+    delta: Union[timedelta, Seconds] = timedelta(hours=1),
+) -> Results:
+
+    base_dt_ = base_dt if isinstance(base_dt, datetime) else datetime.fromisoformat(base_dt)
+    delta_ = delta if isinstance(delta, timedelta) else timedelta(seconds=delta)
+
     # todo with some errors too?
     # todo use data generation library suggested for HPI?
     for i in range(count):
         yield Visit(
             url=f'https://demo.com/page{i}.html',
-            dt=
+            dt=base_dt_ + delta_ * i,
             locator=Loc.make('demo'),
         )
         # todo add context?
```
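Per the new signature, `base_dt` accepts either a `datetime` or an ISO-format string, and `delta` either a `timedelta` or a number of seconds. A usage sketch (assuming promnesia is installed):

```python
from promnesia.sources import demo

# three visits, an hour apart, starting at midnight on 2024-01-01
for visit in demo.index(count=3, base_dt='2024-01-01T00:00:00', delta=3600):
    print(visit.url, visit.dt)
```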
promnesia/sources/filetypes.py
CHANGED
```diff
@@ -67,6 +67,7 @@ CODE = {
     'text/vnd.graphviz',
     'text/x-diff',  # patch files
     'text/x-php',
+    'text/x-lilypond',

     # these didn't have a mime type, or were mistyped?
     'css',
@@ -115,6 +116,12 @@ TYPE2IDX.update({
     '.vcf'          : ignore,
     'message/rfc822': ignore,  # ??

+    # todo ignore all fonts?
+    'font/woff2': ignore,
+    'font/woff': ignore,
+    'text/x-Algol68': ignore,  # ugh some license file had this?? maybe always index text/ as text?
+    'text/x-bytecode.python': ignore,  # todo ignore all x-bytecode?
+
     # TODO not sure what to do about these..
     'application/octet-stream': handle_later,
     'application/zip'         : handle_later,
```
promnesia/sources/github.py
CHANGED
```diff
@@ -31,7 +31,7 @@ def index(*, render_markdown: bool = False) -> Results:
         # if enabled, convert the (markdown) body to HTML
         context: Optional[str] = e.body
         if e.body is not None and render_markdown:
-            context = TextParser(e.body)._doc_ashtml()
+            context = TextParser(e.body)._doc_ashtml()  # type: ignore[possibly-undefined]

         # locator should link back to this event
         loc = Loc.make(title=e.summary, href=e.link)
@@ -74,7 +74,7 @@ def index(*, render_markdown: bool = False) -> Results:
         # extract from markdown links like [link text](https://...)
         # incase URLExtract missed any somehow
         if render_markdown:
-            for res in extract_from_text(e.body):
+            for res in extract_from_text(e.body):  # type: ignore[possibly-undefined]
                 if isinstance(res, Exception):
                     yield res
                     continue
```
promnesia/sources/hypothesis.py
CHANGED
promnesia/sources/markdown.py
CHANGED
```diff
@@ -1,13 +1,13 @@
 from pathlib import Path
 from typing import Iterator, NamedTuple, Optional

-from ..common import
+from ..common import Extraction, Url, PathIsh, Res, Visit, Loc, file_mtime, logger


-import mistletoe
-from mistletoe.span_token import AutoLink, Link
-import mistletoe.block_token as BT
-from mistletoe.html_renderer import HTMLRenderer
+import mistletoe  # type: ignore
+from mistletoe.span_token import AutoLink, Link  # type: ignore
+import mistletoe.block_token as BT  # type: ignore
+from mistletoe.html_renderer import HTMLRenderer  # type: ignore


 renderer = HTMLRenderer()
@@ -42,7 +42,7 @@ HTML_MARKER = '!html '
 def _ashtml(block) -> str:
     res = renderer.render(block)
     if res.startswith('<p>') and res.endswith('</p>'):
-        res = res[3
+        res = res[3:-4]  # meh, but for now fine
     return res


@@ -62,7 +62,6 @@ class Parser:
             context = None if last_block is None else HTML_MARKER + _ashtml(last_block)
             yield Parsed(url=url, context=context)

-
     def _walk(self, cur, last_block) -> Iterator[Result]:
         if isinstance(cur, block_tokens):
             last_block = cur
@@ -73,12 +72,14 @@ class Parser:
             logger.exception(e)
             yield e

-
+        # keeping getattr for compatibility in older versions of mistletoe, it was optional
+        children = getattr(cur, 'children', None)
+        if children is None:
+            return
         for c in children:
             yield from self._walk(c, last_block=last_block)

-
-    def walk(self):
+    def walk(self) -> Iterator[Result]:
         yield from self._walk(self.doc, last_block=None)


@@ -94,7 +95,7 @@ def extract_from_file(fname: PathIsh) -> Iterator[Extraction]:
         yield Visit(
             url=r.url,
             dt=fallback_dt,
-            locator=Loc.file(fname),
+            locator=Loc.file(fname),  # TODO line number
             context=r.context,
         )

@@ -105,9 +106,9 @@ class TextParser(Parser):
     Instead of chunking blocks like for files, this returns the entire
     message rendered as the context
     '''
-    def __init__(self, text: str):
-        self.doc = mistletoe.Document(text)

+    def __init__(self, text: str) -> None:
+        self.doc = mistletoe.Document(text)

     def _doc_ashtml(self):
         '''
@@ -117,8 +118,7 @@ class TextParser(Parser):
         self._html = HTML_MARKER + _ashtml(self.doc)
         return self._html

-
-    def _extract(self, cur, last_block = None) -> Iterator[Parsed]:
+    def _extract(self, cur, last_block=None) -> Iterator[Parsed]:
         if not isinstance(cur, (AutoLink, Link)):
             return
```
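The `_walk` change probes for `children` with `getattr` because, per the comment in the hunk, the attribute was optional in older mistletoe versions. The pattern in isolation:

```python
def walk(node):
    # older mistletoe tokens may lack `.children` entirely,
    # so probe with getattr instead of direct attribute access
    yield node
    children = getattr(node, 'children', None)
    if children is None:
        return
    for child in children:
        yield from walk(child)
```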
promnesia/sources/org.py
CHANGED
```diff
@@ -57,8 +57,12 @@ def _parse_node(n: OrgNode) -> Parsed:
     # todo a bit hacky..
     heading = heading.replace(createds + ' ', '')
     if createds is not None:
-        [odt] = OrgDate.list_from_str(createds)
-        dt = odt.start
+        if '<%%' in createds:
+            # sexp date, not supported
+            dt = None
+        else:
+            [odt] = OrgDate.list_from_str(createds)
+            dt = odt.start
     else:
         dt = None
     return Parsed(dt=dt, heading=heading)
@@ -80,7 +84,7 @@ def walk_node(*, node: OrgNode, dt: datetime) -> Iterator[Res[Tuple[Parsed, OrgNode]]]:
         parsed = parsed._replace(dt=dt)
     else:
         dt = parsed.dt
-
+    yield parsed, node

     for c in node.children:
         yield from walk_node(node=c, dt=dt)
```
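Org sexp timestamps (e.g. `<%%(diary-float t 4 2)>`) can't be evaluated by orgparse's `OrgDate`, hence the `<%%` guard. The guard in isolation (assuming the `orgparse` package, which this source already uses):

```python
from orgparse.date import OrgDate

def parse_created(createds: str):
    # sexp (diary-style) timestamps can't be evaluated by orgparse
    if '<%%' in createds:
        return None
    [odt] = OrgDate.list_from_str(createds)  # expect exactly one timestamp
    return odt.start
```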
promnesia/sources/plaintext.py
CHANGED
```diff
@@ -98,8 +98,10 @@ def extract_from_path(path: PathIsh) -> Command:
         '.gz',
         '.zip',
     )):
-        logger.info(f"Extracting from compressed file {path}")
+        # todo should be debug?
+        # or should delete it completely, feels like unpacking archives here is a bit too much
         raise RuntimeError(f"Archives aren't supported yet: {path}")
+        logger.info(f"Extracting from compressed file {path}")
     import lzma
     from tempfile import NamedTemporaryFile
     # TODO hopefully, no collisions
```
promnesia/sources/reddit.py
CHANGED
```diff
@@ -16,7 +16,7 @@ def index(*, render_markdown: bool = False, renderer: Optional[Type['RedditRenderer']] = None) -> Results:
         if "No module named 'my.reddit.all'" in str(e):
             import warnings
             warnings.warn("DEPRECATED/reddit: Using an old version of HPI, please update")
-            from my.reddit import submissions, comments, saved, upvoted
+            from my.reddit import submissions, comments, saved, upvoted
         else:
             raise e

@@ -95,7 +95,7 @@ class RedditRenderer:

     def _from_upvote(self, i: 'Upvote') -> Results:
         locator = Loc.make(
-            title=
+            title='Reddit upvote',
             href=i.url,
         )
         yield from self._from_common(i, locator=locator)
```
promnesia/sources/rss.py
CHANGED
promnesia/sources/signal.py
CHANGED
```diff
@@ -63,6 +63,8 @@ def index(
     logger.debug("Paths to harvest: %s", db_paths)
     if not http_only:
         sql_query = f"{messages_query}\nWHERE body LIKE '%http%'"
+    else:
+        sql_query = messages_query

     for db_path in resolved_db_paths:
         logger.info("Ciphered db to harvest %s", db_path)
@@ -106,12 +108,18 @@ messages_query = dedent(
     SELECT
         id,
         type,
-        coalesce(
+        coalesce(
+            profileFullName,
+            profileName,
+            name,
+            profileFamilyName,
+            e164
+        ) as aname,
         name,
         profileName,
         profileFamilyName,
         e164,
-
+        serviceId
     FROM conversations
     ),
     Msgs AS (
@@ -123,8 +131,8 @@ messages_query = dedent(
             M.received_at,
             M.sent_at
         ) AS timestamp,
-        IIF(M.type =
-
+        IIF(M.type = 'outgoing',
+            'Me (' || C2.aname || ')',
             C2.aname
         ) AS sender,
         M.conversationId AS cid,
@@ -138,7 +146,7 @@ messages_query = dedent(
     INNER JOIN Cons AS C1
         ON M.conversationId = C1.id
     INNER JOIN Cons AS C2
-        ON M.
+        ON M.sourceServiceId = C2.serviceId
 )
 SELECT id, timestamp, sender, cid, chatname, body
 FROM Msgs
@@ -188,8 +196,8 @@ def _expand_path(path_pattern: PathIsh) -> Iterable[Path]:

 def _expand_paths(paths: PathIshes) -> Iterable[Path]:
     if _is_pathish(paths):
-        paths = [paths]  # type: ignore[
-    return [pp.resolve() for p in paths for pp in _expand_path(p)]  # type: ignore[union-attr
+        paths = [paths]  # type: ignore[list-item]
+    return [pp.resolve() for p in paths for pp in _expand_path(p)]  # type: ignore[union-attr]


 def collect_db_paths(*db_paths: PathIsh, append: bool = False) -> Iterable[Path]:
@@ -236,7 +244,7 @@ def collect_db_paths(*db_paths: PathIsh, append: bool = False) -> Iterable[Path]:
     )

     if db_paths and append:
-        db_paths = [  # type: ignore[
+        db_paths = [  # type: ignore[assignment]
             *([db_paths] if _is_pathish(db_paths) else db_paths),
             plat_paths,
         ]
@@ -310,8 +318,8 @@ def connect_db(
         sql_cmds.extend(
             [
                 f"ATTACH DATABASE '{decrypted_file}' AS plaintext KEY '';",
-
-
+                "SELECT sqlcipher_export('plaintext');",
+                "DETACH DATABASE plaintext;",
             ]
         )
         sql = "\n".join(sql_cmds)
@@ -320,7 +328,7 @@ def connect_db(
             "Decrypting db '%s' with cmd: %s <<<EOF\n%s\nEOF", db_path, cmd, sql
         )
         try:
-            sbp.run(
+            sbp.run(
                 cmd,
                 check=True,
                 input=sql,
@@ -335,7 +343,7 @@ def connect_db(
         ) from None
         db = sqlite3.connect(f"file:{decrypted_file}?mode=ro", uri=True)
     else:
-        from sqlcipher3 import dbapi2  # type: ignore[import]
+        from sqlcipher3 import dbapi2  # type: ignore[import-not-found]

         db = dbapi2.connect(f"file:{db_path}?mode=ro", uri=True)
         # Param-binding doesn't work for pragmas, so use a direct string concat.
@@ -419,9 +427,9 @@ def _harvest_db(

     with connect_db(db_path, key, decrypt_db=decrypt_db, **decryption_pragmas) as db:
         for mid, tstamp, sender, cid, chatname, text in db.execute(messages_query):
+            tstamp = from_epoch(tstamp / 1000.0)
+            row = (mid, tstamp, sender, cid, chatname, text)
             try:
-                tstamp = from_epoch(tstamp / 1000.0)
-                row = (mid, tstamp, sender, cid, chatname, text)
                 yield from _handle_row(row, db_path, locator_schema)
             except Exception as ex:
                 # TODO: also insert errors in db
```
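Signal's database stores timestamps as Unix epoch milliseconds, which is why the hunk above divides by 1000 before calling `from_epoch` (a promnesia helper). A stdlib sketch of the equivalent conversion:

```python
from datetime import datetime, timezone

def from_epoch(seconds: float) -> datetime:
    # build a timezone-aware UTC datetime from Unix seconds
    return datetime.fromtimestamp(seconds, tz=timezone.utc)

# Signal stores milliseconds, so divide by 1000 first
tstamp = from_epoch(1_700_000_000_000 / 1000.0)
print(tstamp.isoformat())  # 2023-11-14T22:13:20+00:00
```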
promnesia/sources/stackexchange.py
CHANGED
```diff
@@ -2,12 +2,12 @@
 Uses [[https://github.com/karlicoss/HPI][HPI]] for Stackexchange data.
 '''

-from ..common import Results, Visit, Loc
+from ..common import Results, Visit, Loc


 def index() -> Results:
     from . import hpi
-    import my.stackexchange.gdpr as G
+    import my.stackexchange.gdpr as G
     for v in G.votes():
         if isinstance(v, Exception):
             yield v
```