promnesia 1.2.20230515__py3-none-any.whl → 1.2.20240810__py3-none-any.whl

This diff represents the changes between publicly available package versions as published to their respective public registries, and is provided for informational purposes only.
Files changed (61)
  1. promnesia/__main__.py +26 -14
  2. promnesia/cannon.py +4 -4
  3. promnesia/common.py +39 -28
  4. promnesia/compare.py +3 -2
  5. promnesia/config.py +4 -2
  6. promnesia/database/common.py +66 -0
  7. promnesia/database/dump.py +187 -0
  8. promnesia/{read_db.py → database/load.py} +10 -11
  9. promnesia/extract.py +1 -0
  10. promnesia/kjson.py +1 -1
  11. promnesia/logging.py +3 -3
  12. promnesia/misc/__init__.pyi +0 -0
  13. promnesia/misc/config_example.py +1 -2
  14. promnesia/misc/install_server.py +2 -3
  15. promnesia/server.py +18 -19
  16. promnesia/sources/__init__.pyi +0 -0
  17. promnesia/sources/auto.py +9 -7
  18. promnesia/sources/browser_legacy.py +11 -5
  19. promnesia/sources/demo.py +18 -2
  20. promnesia/sources/filetypes.py +7 -0
  21. promnesia/sources/github.py +2 -2
  22. promnesia/sources/hypothesis.py +1 -1
  23. promnesia/sources/markdown.py +15 -15
  24. promnesia/sources/org.py +7 -3
  25. promnesia/sources/plaintext.py +3 -1
  26. promnesia/sources/reddit.py +2 -2
  27. promnesia/sources/rss.py +1 -1
  28. promnesia/sources/signal.py +22 -14
  29. promnesia/sources/stackexchange.py +2 -2
  30. promnesia/sources/takeout.py +58 -1
  31. promnesia/sources/takeout_legacy.py +10 -2
  32. promnesia/tests/__init__.py +0 -0
  33. promnesia/tests/common.py +137 -0
  34. promnesia/tests/server_helper.py +64 -0
  35. promnesia/tests/sources/__init__.py +0 -0
  36. promnesia/tests/sources/test_auto.py +66 -0
  37. promnesia/tests/sources/test_filetypes.py +42 -0
  38. promnesia/tests/sources/test_hypothesis.py +39 -0
  39. promnesia/tests/sources/test_org.py +65 -0
  40. promnesia/tests/sources/test_plaintext.py +26 -0
  41. promnesia/tests/sources/test_shellcmd.py +22 -0
  42. promnesia/tests/sources/test_takeout.py +58 -0
  43. promnesia/tests/test_cannon.py +325 -0
  44. promnesia/tests/test_cli.py +42 -0
  45. promnesia/tests/test_compare.py +30 -0
  46. promnesia/tests/test_config.py +290 -0
  47. promnesia/tests/test_db_dump.py +223 -0
  48. promnesia/tests/test_extract.py +61 -0
  49. promnesia/tests/test_extract_urls.py +43 -0
  50. promnesia/tests/test_indexer.py +245 -0
  51. promnesia/tests/test_server.py +292 -0
  52. promnesia/tests/test_traverse.py +41 -0
  53. promnesia/tests/utils.py +35 -0
  54. {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/METADATA +13 -17
  55. promnesia-1.2.20240810.dist-info/RECORD +83 -0
  56. {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/WHEEL +1 -1
  57. {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/entry_points.txt +0 -1
  58. promnesia/dump.py +0 -105
  59. promnesia-1.2.20230515.dist-info/RECORD +0 -58
  60. {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/LICENSE +0 -0
  61. {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/top_level.txt +0 -0
promnesia/logging.py CHANGED
@@ -61,7 +61,7 @@ _init_done = 'lazylogger_init_done'
 def setup_logger(logger: logging.Logger, level: LevelIsh) -> None:
     lvl = mklevel(level)
     try:
-        import logzero  # type: ignore[import]
+        import logzero  # type: ignore[import-not-found]
         formatter = logzero.LogFormatter(
             fmt=FORMAT_COLOR,
             datefmt=DATEFMT,
@@ -75,7 +75,7 @@ def setup_logger(logger: logging.Logger, level: LevelIsh) -> None:
     logger.addFilter(AddExceptionTraceback())
     if use_logzero and not COLLAPSE_DEBUG_LOGS:  # all set, nothing to do
         # 'simple' setup
-        logzero.setup_logger(logger.name, level=lvl, formatter=formatter)
+        logzero.setup_logger(logger.name, level=lvl, formatter=formatter)  # type: ignore[possibly-undefined]
         return

     h = CollapseDebugHandler() if COLLAPSE_DEBUG_LOGS else logging.StreamHandler()
@@ -101,7 +101,7 @@ class LazyLogger(logging.Logger):
         # oh god.. otherwise might go into an inf loop
         if not hasattr(logger, _init_done):
            setattr(logger, _init_done, False)  # will setup on the first call
-            logger.isEnabledFor = isEnabledFor_lazyinit  # type: ignore[assignment]
+            logger.isEnabledFor = isEnabledFor_lazyinit  # type: ignore[method-assign]
        return cast(LazyLogger, logger)
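Note: these changes swap blanket or stale `type: ignore` comments for the specific error codes newer mypy emits. A minimal sketch of why the scoped form is preferable (module choice arbitrary):

```python
try:
    # only silences the "module/stubs not found" error; other mistakes
    # on this line (e.g. a typo'd name) would still be reported by mypy
    import logzero  # type: ignore[import-not-found]
    has_logzero = True
except ModuleNotFoundError:
    has_logzero = False
```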
promnesia/misc/config_example.py CHANGED
@@ -11,7 +11,6 @@ SOURCES = [
     Source(
         auto.index,
         # just some arbitrary directory with plaintext files
-        '/usr/include/c++/',
-        '/usr/local/include/c++/',  # on apple they are here apparently..
+        '/usr/share/vim/',
     )
 ]
promnesia/misc/install_server.py CHANGED
@@ -7,6 +7,7 @@ import sys
 import time
 from pathlib import Path
 import platform
+import shutil
 from subprocess import check_call, run
 from typing import List

@@ -118,9 +119,7 @@ def install(args: argparse.Namespace) -> None:
     if os.environ.get('DIRTY_RUN') is not None:
         launcher = str(root() / 'scripts/promnesia')
     else:
-        # must be installed, so available in PATH
-        import distutils.spawn
-        exe = distutils.spawn.find_executable('promnesia'); assert exe is not None
+        exe = shutil.which('promnesia'); assert exe is not None
         launcher = exe  # older systemd wants absolute paths..

     db = args.db
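Note: `distutils` was deprecated by PEP 632 and removed in Python 3.12; `shutil.which` is the stdlib replacement for `distutils.spawn.find_executable`. A minimal sketch of the new lookup:

```python
import shutil

# returns the absolute path of the executable, or None if it isn't on PATH
exe = shutil.which('promnesia')
assert exe is not None, 'promnesia must be installed and on PATH'
launcher = exe  # systemd unit files want absolute paths
```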
promnesia/server.py CHANGED
@@ -1,12 +1,11 @@
 #!/usr/bin/python3
 from __future__ import annotations

-__package__ = 'promnesia'  # ugh. hacky way to make wsgi runner work properly...
-
 import argparse
 from dataclasses import dataclass
 from datetime import timedelta
 from functools import lru_cache
+import importlib.metadata
 import json
 import logging
 import os
@@ -19,7 +18,7 @@ from pytz import BaseTzInfo

 import fastapi

-from sqlalchemy import MetaData, exists, literal, between, or_, and_, exc, select
+from sqlalchemy import literal, between, or_, and_, exc, select
 from sqlalchemy import Column, Table, func, types
 from sqlalchemy.sql.elements import ColumnElement
 from sqlalchemy.sql import text
@@ -27,6 +26,7 @@ from sqlalchemy.sql import text

 from .common import PathWithMtime, DbVisit, Url, setup_logger, default_output_dir, get_system_tz
 from .cannon import canonify
+from .database.load import DbStuff, get_db_stuff, row_to_db_visit


 Json = Dict[str, Any]
@@ -51,8 +51,7 @@ def get_logger() -> logging.Logger:


 def get_version() -> str:
-    from pkg_resources import get_distribution
-    return get_distribution(__package__).version
+    return importlib.metadata.version(__package__)


 class ServerConfig(NamedTuple):
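Note: `pkg_resources` is deprecated in favour of `importlib.metadata`, which has been in the stdlib since Python 3.8 and avoids importing setuptools at runtime. A sketch of the equivalent call:

```python
import importlib.metadata

def get_version() -> str:
    # reads the version recorded in the installed distribution's metadata
    return importlib.metadata.version('promnesia')
```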
@@ -119,8 +118,6 @@ def get_db_path(check: bool=True) -> Path:
     return db


-from .read_db import DbStuff, get_db_stuff
-
 @lru_cache(1)
 # PathWithMtime aids lru_cache in reloading the sqlalchemy binder
 def _get_stuff(db_path: PathWithMtime) -> DbStuff:
@@ -136,7 +133,7 @@ def get_stuff(db_path: Optional[Path]=None) -> DbStuff:  # TODO better name


 def db_stats(db_path: Path) -> Json:
-    engine, binder, table = get_stuff(db_path)
+    engine, table = get_stuff(db_path)
     query = select(func.count()).select_from(table)
     with engine.connect() as conn:
         total = list(conn.execute(query))[0][0]
@@ -151,8 +148,8 @@ class Where(Protocol):

 @dataclass
 class VisitsResponse:
-    original_url: Url
-    normalised_url: Url
+    original_url: str
+    normalised_url: str
     visits: Any

@@ -167,7 +164,7 @@ def search_common(url: str, where: Where) -> VisitsResponse:
     url = original_url
     logger.info('normalised url: %s', url)

-    engine, binder, table = get_stuff()
+    engine, table = get_stuff()

     query = table.select().where(where(table=table, url=url))
     logger.debug('query: %s', query)
@@ -175,7 +172,7 @@ def search_common(url: str, where: Where) -> VisitsResponse:
     with engine.connect() as conn:
         try:
             # TODO make more defensive here
-            visits: List[DbVisit] = [binder.from_row(row) for row in conn.execute(query)]
+            visits: List[DbVisit] = [row_to_db_visit(row) for row in conn.execute(query)]
         except exc.OperationalError as e:
             if getattr(e, 'msg', None) == 'no such table: visits':
                 logger.warn('you may have to run indexer first!')
@@ -232,6 +229,7 @@ def status() -> Json:
     try:
         version = get_version()
     except Exception as e:
+        logger.exception(e)
         version = None

     return {
@@ -241,10 +239,9 @@ def status() -> Json:
     }


-from dataclasses import dataclass
 @dataclass
 class VisitsRequest:
-    url: Url
+    url: str

 @app.get ('/visits', response_model=VisitsResponse)
 @app.post('/visits', response_model=VisitsResponse)
@@ -255,15 +252,17 @@ def visits(request: VisitsRequest) -> VisitsResponse:
         url=url,
         # odd, doesn't work just with: x or (y and z)
         where=lambda table, url: or_(
-            table.c.norm_url == url,  # exact match
-            and_(table.c.context != None, table.c.norm_url.startswith(url, autoescape=True))  # + child visits, but only 'interesting' ones
+            # exact match
+            table.c.norm_url == url,
+            # + child visits, but only 'interesting' ones
+            and_(table.c.context != None, table.c.norm_url.startswith(url, autoescape=True))  # noqa: E711
         ),
     )


 @dataclass
 class SearchRequest:
-    url: Url
+    url: str

 @app.get ('/search', response_model=VisitsResponse)
 @app.post('/search', response_model=VisitsResponse)
@@ -361,7 +360,7 @@ def visited(request: VisitedRequest) -> VisitedResponse:
     if len(snurls) == 0:
         return []

-    engine, binder, table = get_stuff()
+    engine, table = get_stuff()

     # sqlalchemy doesn't seem to support SELECT FROM (VALUES (...)) in its api
     # also doesn't support array binding...
@@ -389,7 +388,7 @@ SELECT queried, visits.*
     # brings down large queries to 50ms...
     with engine.connect() as conn:
         res = list(conn.execute(query))
-    present: Dict[str, Any] = {row[0]: binder.from_row(row[1:]) for row in res}
+    present: Dict[str, Any] = {row[0]: row_to_db_visit(row[1:]) for row in res}
     results = []
     for nu in nurls:
         r = present.get(nu, None)
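Note: for context on the `VALUES` trick this code relies on, joining the queried URLs against a `VALUES (...)` literal answers the whole batch in one round trip. A self-contained sqlite3 sketch of the same idea (table and data hypothetical):

```python
import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE visits (norm_url TEXT)')
conn.executemany('INSERT INTO visits VALUES (?)', [('a.com',), ('b.com',)])

queried = ['a.com', 'c.com']
placeholders = ','.join('(?)' for _ in queried)
rows = conn.execute(f'''
    WITH queried(url) AS (VALUES {placeholders})
    SELECT queried.url, visits.norm_url
    FROM queried LEFT JOIN visits ON visits.norm_url = queried.url
''', queried).fetchall()
# e.g. [('a.com', 'a.com'), ('c.com', None)] -- None marks never-visited urls
```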
promnesia/sources/auto.py CHANGED
@@ -22,17 +22,18 @@ import warnings
 import pytz

 from ..common import Visit, Url, PathIsh, get_logger, Loc, get_tmpdir, extract_urls, Extraction, Result, Results, mime, traverse, file_mtime, echain, logger
+from ..common import warn_once
 from ..config import use_cores


-from .filetypes import EUrl
+from .filetypes import EUrl, Ctx
 from .auto_obsidian import obsidian_replacer
 from .auto_logseq import logseq_replacer


 def _collect(thing, path: List[str], result: List[EUrl]) -> None:
     if isinstance(thing, str):
-        ctx: Ctx = tuple(path)  # type: ignore
+        ctx: Ctx = tuple(path)
         result.extend([EUrl(url=u, ctx=ctx) for u in extract_urls(thing)])
     elif isinstance(thing, list):
         path.append('[]')
@@ -167,7 +168,7 @@ for t in CODE:
 Replacer = Optional[Callable[[str, str], str]]

 def index(
-    *paths: Union[PathIsh],
+    *paths: PathIsh,
     ignored: Union[Sequence[str], str]=(),
     follow: bool=True,
     replacer: Replacer=None,
@@ -282,6 +283,8 @@ def by_path(pp: Path) -> Tuple[Optional[Ex], Optional[Mime]]:

 def _index_file(pp: Path, opts: Options) -> Results:
     logger = get_logger()
+    # TODO need to keep debug logs here...
+    # logger.info(f"indexing {pp}")
     # TODO use kompress?
     # TODO not even sure if it's used...
     suf = pp.suffix.lower()
@@ -307,10 +310,9 @@ def _index_file(pp: Path, opts: Options) -> Results:

     ip, pm = by_path(pp)
     if ip is None:
-        # TODO use warning (with mime/ext as key?)
-        # TODO only log once? # hmm..
+        # todo not really sure about using warnings vs yielding error here?
         msg = f'No extractor for suffix {suf}, mime {pm}'
-        warnings.warn(msg)
+        warn_once(msg)
         yield echain(ex, RuntimeError(msg))
         return
@@ -318,7 +320,7 @@ def _index_file(pp: Path, opts: Options) -> Results:

     def indexer() -> Union[Urls, Results]:
         # eh, annoying.. need to make more generic..
-        idx = ip(pp)  # type: ignore
+        idx = ip(pp)
         try:
             yield from idx
         except Exception as e:
promnesia/sources/browser_legacy.py CHANGED
@@ -2,15 +2,21 @@ from datetime import datetime
 from pathlib import Path
 from urllib.parse import unquote
 import sqlite3
-from typing import List, Set
+from typing import List, Set, Optional

 import pytz

 from ..common import PathIsh, Results, Visit, Loc, logger, Second, is_sqlite_db
 from .. import config

-# todo mcachew?
-from cachew import cachew
+try:
+    from cachew import cachew
+except ModuleNotFoundError as me:
+    if me.name != 'cachew':
+        raise me
+    # this module is legacy anyway, so just make it defensive
+    def cachew(*args, **kwargs):  # type: ignore[no-redef]
+        return lambda f: f


 def index(p: PathIsh) -> Results:
@@ -43,7 +49,7 @@ def _index_dbs(dbs: List[Path], cachew_name: str):
     # todo wow, stack traces are ridiculous here...
     # todo hmm, feels like it should be a class or something?
     @cachew(lambda cp, dbs, emitted: cp, depends_on=lambda cp, dbs, emitted: dbs)  # , logger=logger)
-    def _index_dbs_aux(cache_path: Path, dbs: List[Path], emitted: Set) -> Results:
+    def _index_dbs_aux(cache_path: Optional[Path], dbs: List[Path], emitted: Set) -> Results:
         if len(dbs) == 0:
             return

@@ -58,7 +64,7 @@ def _index_dbs_aux(cache_path: Optional[Path], dbs: List[Path], emitted: Set) -> Results:
             xs_was_cached = True
             logger.debug('seems that %d first items were previously cached', len(xs))
         if xs_was_cached:
-            key = (r.url, r.dt)
+            key = str(r) if isinstance(r, Exception) else (r.url, r.dt)
             assert key not in emitted, key  # todo not sure if this assert is necessary?
             # hmm ok it might happen if we messed up with indexing individual db?
             # alternatively, could abuse it to avoid messing with 'emitted' in _index_db?
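Note: the import fallback above is a general pattern for optional dependencies: swallow the error only when the missing module is the one being imported, then substitute a no-op decorator factory. The same pattern in isolation:

```python
try:
    from cachew import cachew  # optional caching layer
except ModuleNotFoundError as e:
    if e.name != 'cachew':
        # a module *inside* cachew failed to import -- a real bug, don't mask it
        raise
    def cachew(*args, **kwargs):
        # pass-through decorator factory: decorated functions just run uncached
        return lambda f: f
```

Checking `e.name` is what distinguishes "cachew is not installed" from "cachew is installed but broken".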
promnesia/sources/demo.py CHANGED
@@ -4,17 +4,33 @@ Generates a sequence of fake evenly separated visits
 '''

 from datetime import datetime, timedelta
+from typing import Union

 from ..common import Results, Visit, Loc


-def index(count: int=100, *, base_dt: datetime=datetime.min + timedelta(days=5000), delta: timedelta=timedelta(hours=1)) -> Results:
+IsoFormatDt = str
+Seconds = int
+
+
+# TODO allow passing isoformat string as base_dt?
+# and maybe something similar as delta? start with seconds maybe
+def index(
+    count: int=100,
+    *,
+    base_dt: Union[datetime, IsoFormatDt] = datetime.min + timedelta(days=5000),
+    delta: Union[timedelta, Seconds] = timedelta(hours=1),
+) -> Results:
+
+    base_dt_ = base_dt if isinstance(base_dt, datetime) else datetime.fromisoformat(base_dt)
+    delta_ = delta if isinstance(delta, timedelta) else timedelta(seconds=delta)
+
     # todo with some errors too?
     # todo use data generation library suggested for HPI?
     for i in range(count):
         yield Visit(
             url=f'https://demo.com/page{i}.html',
-            dt=base_dt + delta * i,
+            dt=base_dt_ + delta_ * i,
             locator=Loc.make('demo'),
         )
     # todo add context?
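Note: with the widened signature, config files can pass plain strings and integers instead of constructing `datetime`/`timedelta` objects. A usage sketch:

```python
from promnesia.sources import demo

# ISO string and seconds are coerced internally to datetime/timedelta
visits = list(demo.index(count=3, base_dt='2024-01-01T00:00:00', delta=3600))
assert len(visits) == 3  # one visit per hour, starting at midnight
```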
promnesia/sources/filetypes.py CHANGED
@@ -67,6 +67,7 @@ CODE = {
     'text/vnd.graphviz',
     'text/x-diff',  # patch files
     'text/x-php',
+    'text/x-lilypond',

     # these didn't have a mime type, or were mistyped?
     'css',
@@ -115,6 +116,12 @@ TYPE2IDX.update({
     '.vcf'          : ignore,
     'message/rfc822': ignore,  # ??

+    # todo ignore all fonts?
+    'font/woff2': ignore,
+    'font/woff': ignore,
+    'text/x-Algol68': ignore,  # ugh some license file had this?? maybe always index text/ as text?
+    'text/x-bytecode.python': ignore,  # todo ignore all x-bytecode?
+
     # TODO not sure what to do about these..
     'application/octet-stream': handle_later,
     'application/zip'         : handle_later,
promnesia/sources/github.py CHANGED
@@ -31,7 +31,7 @@ def index(*, render_markdown: bool = False) -> Results:
         # if enabled, convert the (markdown) body to HTML
         context: Optional[str] = e.body
         if e.body is not None and render_markdown:
-            context = TextParser(e.body)._doc_ashtml()
+            context = TextParser(e.body)._doc_ashtml()  # type: ignore[possibly-undefined]

         # locator should link back to this event
         loc = Loc.make(title=e.summary, href=e.link)
@@ -74,7 +74,7 @@ def index(*, render_markdown: bool = False) -> Results:
         # extract from markdown links like [link text](https://...)
         # incase URLExtract missed any somehow
         if render_markdown:
-            for res in extract_from_text(e.body):
+            for res in extract_from_text(e.body):  # type: ignore[possibly-undefined]
                 if isinstance(res, Exception):
                     yield res
                     continue
promnesia/sources/hypothesis.py CHANGED
@@ -8,7 +8,7 @@ def index() -> Results:
     from . import hpi
     import my.hypothesis as hyp

-    for h in hyp.get_highlights():
+    for h in hyp.highlights():
         if isinstance(h, Exception):
             yield h
             continue
promnesia/sources/markdown.py CHANGED
@@ -1,13 +1,13 @@
 from pathlib import Path
 from typing import Iterator, NamedTuple, Optional

-from ..common import get_logger, Extraction, Url, PathIsh, Res, Visit, Loc, file_mtime, logger
+from ..common import Extraction, Url, PathIsh, Res, Visit, Loc, file_mtime, logger


-import mistletoe # type: ignore
-from mistletoe.span_token import AutoLink, Link # type: ignore
-import mistletoe.block_token as BT # type: ignore
-from mistletoe.html_renderer import HTMLRenderer # type: ignore
+import mistletoe  # type: ignore
+from mistletoe.span_token import AutoLink, Link  # type: ignore
+import mistletoe.block_token as BT  # type: ignore
+from mistletoe.html_renderer import HTMLRenderer  # type: ignore


 renderer = HTMLRenderer()
@@ -42,7 +42,7 @@ HTML_MARKER = '!html '
 def _ashtml(block) -> str:
     res = renderer.render(block)
     if res.startswith('<p>') and res.endswith('</p>'):
-        res = res[3: -4]  # meh, but for now fine
+        res = res[3:-4]  # meh, but for now fine
     return res

@@ -62,7 +62,6 @@ class Parser:
         context = None if last_block is None else HTML_MARKER + _ashtml(last_block)
         yield Parsed(url=url, context=context)

-
     def _walk(self, cur, last_block) -> Iterator[Result]:
         if isinstance(cur, block_tokens):
             last_block = cur
@@ -73,12 +72,14 @@ class Parser:
             logger.exception(e)
             yield e

-        children = getattr(cur, 'children', [])
+        # keeping getattr for compatibility, in older versions of mistletoe it was optional
+        children = getattr(cur, 'children', None)
+        if children is None:
+            return
         for c in children:
             yield from self._walk(c, last_block=last_block)

-
-    def walk(self):
+    def walk(self) -> Iterator[Result]:
         yield from self._walk(self.doc, last_block=None)

@@ -94,7 +95,7 @@ def extract_from_file(fname: PathIsh) -> Iterator[Extraction]:
         yield Visit(
             url=r.url,
             dt=fallback_dt,
-            locator=Loc.file(fname), # TODO line number
+            locator=Loc.file(fname),  # TODO line number
             context=r.context,
         )
@@ -105,9 +106,9 @@ class TextParser(Parser):
     Instead of chunking blocks like for files, this returns the entire
     message rendered as the context
     '''
-    def __init__(self, text: str):
-        self.doc = mistletoe.Document(text)

+    def __init__(self, text: str) -> None:
+        self.doc = mistletoe.Document(text)

     def _doc_ashtml(self):
         '''
@@ -117,8 +118,7 @@ class TextParser(Parser):
         self._html = HTML_MARKER + _ashtml(self.doc)
         return self._html

-
-    def _extract(self, cur, last_block = None) -> Iterator[Parsed]:
+    def _extract(self, cur, last_block=None) -> Iterator[Parsed]:
         if not isinstance(cur, (AutoLink, Link)):
             return
promnesia/sources/org.py CHANGED
@@ -57,8 +57,12 @@ def _parse_node(n: OrgNode) -> Parsed:
     # todo a bit hacky..
     heading = heading.replace(createds + ' ', '')
     if createds is not None:
-        [odt] = OrgDate.list_from_str(createds)
-        dt = odt.start
+        if '<%%' in createds:
+            # sexp date, not supported
+            dt = None
+        else:
+            [odt] = OrgDate.list_from_str(createds)
+            dt = odt.start
     else:
         dt = None
     return Parsed(dt=dt, heading=heading)
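Note: the new branch guards against org-mode sexp timestamps such as `<%%(diary-float t 4 2)>` ("second Thursday of every month"), which describe recurring dates via elisp and can't be turned into a concrete datetime. A standalone sketch of the same guard (sample string hypothetical):

```python
from orgparse.date import OrgDate

createds = '<%%(diary-float t 4 2)>'  # sexp timestamp, no single concrete date
if '<%%' in createds:
    dt = None  # nothing parseable, skip rather than crash
else:
    [odt] = OrgDate.list_from_str(createds)
    dt = odt.start
```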
@@ -80,7 +84,7 @@ def walk_node(*, node: OrgNode, dt: datetime) -> Iterator[Res[Tuple[Parsed, OrgNode]]]:
         parsed = parsed._replace(dt=dt)
     else:
         dt = parsed.dt
-        yield parsed, node
+    yield parsed, node

     for c in node.children:
         yield from walk_node(node=c, dt=dt)
promnesia/sources/plaintext.py CHANGED
@@ -98,8 +98,10 @@ def extract_from_path(path: PathIsh) -> Command:
         '.gz',
         '.zip',
     )):
-        logger.info(f"Extracting from compressed file {path}")
+        # todo should be debug?
+        # or should delete it completely, feels like unpacking archives here is a bit too much
         raise RuntimeError(f"Archives aren't supported yet: {path}")
+    logger.info(f"Extracting from compressed file {path}")
     import lzma
     from tempfile import NamedTemporaryFile
     # TODO hopefully, no collisions
promnesia/sources/reddit.py CHANGED
@@ -16,7 +16,7 @@ def index(*, render_markdown: bool = False, renderer: Optional[Type['RedditRenderer']] = None) -> Results:
         if "No module named 'my.reddit.all'" in str(e):
             import warnings
             warnings.warn("DEPRECATED/reddit: Using an old version of HPI, please update")
-            from my.reddit import submissions, comments, saved, upvoted  # type: ignore[no-redef]
+            from my.reddit import submissions, comments, saved, upvoted
         else:
             raise e

@@ -95,7 +95,7 @@ class RedditRenderer:

     def _from_upvote(self, i: 'Upvote') -> Results:
         locator = Loc.make(
-            title=f'Reddit upvote',
+            title='Reddit upvote',
             href=i.url,
         )
         yield from self._from_common(i, locator=locator)
promnesia/sources/rss.py CHANGED
@@ -23,6 +23,6 @@ def index() -> Results:
         yield Visit(
             url=feed.url,
             dt=feed.created_at or default_datetime,
-            context=f'RSS subscription',  # TODO use 'provider', etc?
+            context='RSS subscription',  # TODO use 'provider', etc?
             locator=locator,
         )
promnesia/sources/signal.py CHANGED
@@ -63,6 +63,8 @@ def index(
     logger.debug("Paths to harvest: %s", db_paths)
     if not http_only:
         sql_query = f"{messages_query}\nWHERE body LIKE '%http%'"
+    else:
+        sql_query = messages_query

     for db_path in resolved_db_paths:
         logger.info("Ciphered db to harvest %s", db_path)
@@ -106,12 +108,18 @@ messages_query = dedent(
     SELECT
         id,
         type,
-        coalesce(name, profileName, profileFamilyName, e164) as aname,
+        coalesce(
+            profileFullName,
+            profileName,
+            name,
+            profileFamilyName,
+            e164
+        ) as aname,
         name,
         profileName,
         profileFamilyName,
         e164,
-        uuid
+        serviceId
     FROM conversations
 ),
 Msgs AS (
@@ -123,8 +131,8 @@ messages_query = dedent(
         M.received_at,
         M.sent_at
     ) AS timestamp,
-    IIF(M.type = "outgoing",
-        "Me (" || C2.aname || ")",
+    IIF(M.type = 'outgoing',
+        'Me (' || C2.aname || ')',
         C2.aname
     ) AS sender,
     M.conversationId AS cid,
@@ -138,7 +146,7 @@ messages_query = dedent(
     INNER JOIN Cons AS C1
         ON M.conversationId = C1.id
     INNER JOIN Cons AS C2
-        ON M.sourceUuid = C2.uuid
+        ON M.sourceServiceId = C2.serviceId
 )
 SELECT id, timestamp, sender, cid, chatname, body
 FROM Msgs
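Note: the quoting fix matters because standard SQL uses single quotes for string literals, while double quotes denote identifiers; SQLite only falls back to treating a double-quoted token as a string when no matching column exists, so `"outgoing"` silently changes meaning once a column by that name appears. A small demonstration:

```python
import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE t (type TEXT, outgoing TEXT)')
conn.execute("INSERT INTO t VALUES ('outgoing', 'oops')")

# double quotes resolve to the *column* named outgoing: compares 'outgoing' to 'oops'
print(conn.execute('SELECT type = "outgoing" FROM t').fetchone())  # (0,)
# single quotes are a proper string literal: the intended comparison
print(conn.execute("SELECT type = 'outgoing' FROM t").fetchone())  # (1,)
```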
@@ -188,8 +196,8 @@ def _expand_path(path_pattern: PathIsh) -> Iterable[Path]:

 def _expand_paths(paths: PathIshes) -> Iterable[Path]:
     if _is_pathish(paths):
-        paths = [paths]  # type: ignore[assignment,list-item]
-    return [pp.resolve() for p in paths for pp in _expand_path(p)]  # type: ignore[union-attr,list-item]
+        paths = [paths]  # type: ignore[list-item]
+    return [pp.resolve() for p in paths for pp in _expand_path(p)]  # type: ignore[union-attr]


 def collect_db_paths(*db_paths: PathIsh, append: bool = False) -> Iterable[Path]:
@@ -236,7 +244,7 @@ def collect_db_paths(*db_paths: PathIsh, append: bool = False) -> Iterable[Path]:
     )

     if db_paths and append:
-        db_paths = [  # type: ignore[misc,assignment]
+        db_paths = [  # type: ignore[assignment]
             *([db_paths] if _is_pathish(db_paths) else db_paths),
             plat_paths,
         ]
@@ -310,8 +318,8 @@ def connect_db(
         sql_cmds.extend(
             [
                 f"ATTACH DATABASE '{decrypted_file}' AS plaintext KEY '';",
-                f"SELECT sqlcipher_export('plaintext');",
-                f"DETACH DATABASE plaintext;",
+                "SELECT sqlcipher_export('plaintext');",
+                "DETACH DATABASE plaintext;",
             ]
         )
         sql = "\n".join(sql_cmds)
@@ -320,7 +328,7 @@ def connect_db(
             "Decrypting db '%s' with cmd: %s <<<EOF\n%s\nEOF", db_path, cmd, sql
         )
         try:
-            sbp.run(  # type: ignore[call-overload]
+            sbp.run(
                cmd,
                check=True,
                input=sql,
@@ -335,7 +343,7 @@ def connect_db(
             ) from None
         db = sqlite3.connect(f"file:{decrypted_file}?mode=ro", uri=True)
     else:
-        from sqlcipher3 import dbapi2  # type: ignore[import]
+        from sqlcipher3 import dbapi2  # type: ignore[import-not-found]

         db = dbapi2.connect(f"file:{db_path}?mode=ro", uri=True)
         # Param-binding doesn't work for pragmas, so use a direct string concat.
@@ -419,9 +427,9 @@ def _harvest_db(

     with connect_db(db_path, key, decrypt_db=decrypt_db, **decryption_pragmas) as db:
         for mid, tstamp, sender, cid, chatname, text in db.execute(messages_query):
+            tstamp = from_epoch(tstamp / 1000.0)
+            row = (mid, tstamp, sender, cid, chatname, text)
             try:
-                tstamp = from_epoch(tstamp / 1000.0)
-                row = (mid, tstamp, sender, cid, chatname, text)
                 yield from _handle_row(row, db_path, locator_schema)
             except Exception as ex:
                 # TODO: also insert errors in db
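Note: hoisting the conversion above the `try` means the `except` branch only guards `_handle_row`, and `row` is always bound by the time an error needs reporting. A minimal sketch of the reordered loop (helpers simplified, data hypothetical):

```python
from datetime import datetime, timezone

def from_epoch(ts: float) -> datetime:
    # simplified stand-in for promnesia's helper
    return datetime.fromtimestamp(ts, tz=timezone.utc)

rows = [(1, 1700000000000, 'alice', 'cid1', 'chat', 'hi https://example.com')]
for mid, tstamp, sender, cid, chatname, text in rows:
    # conversion outside the try: a bad timestamp fails fast instead of
    # being swallowed, and 'row' below is guaranteed to exist
    tstamp = from_epoch(tstamp / 1000.0)
    row = (mid, tstamp, sender, cid, chatname, text)
    try:
        print(row)  # stands in for: yield from _handle_row(row, ...)
    except Exception as ex:
        print(f'skipping row {row}: {ex}')
```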
@@ -2,12 +2,12 @@
2
2
  Uses [[https://github.com/karlicoss/HPI][HPI]] for Stackexchange data.
3
3
  '''
4
4
 
5
- from ..common import Results, Visit, Loc, extract_urls
5
+ from ..common import Results, Visit, Loc
6
6
 
7
7
 
8
8
  def index() -> Results:
9
9
  from . import hpi
10
- import my.stackexchange.gdpr as G # type: ignore[import] # TODO eh, not sure if should run against pypi or not...
10
+ import my.stackexchange.gdpr as G
11
11
  for v in G.votes():
12
12
  if isinstance(v, Exception):
13
13
  yield v