promnesia 1.2.20240810__py3-none-any.whl → 1.4.20250909__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. promnesia/__init__.py +18 -4
  2. promnesia/__main__.py +104 -78
  3. promnesia/cannon.py +108 -107
  4. promnesia/common.py +107 -88
  5. promnesia/compare.py +33 -30
  6. promnesia/compat.py +10 -10
  7. promnesia/config.py +37 -34
  8. promnesia/database/common.py +4 -3
  9. promnesia/database/dump.py +13 -13
  10. promnesia/database/load.py +7 -7
  11. promnesia/extract.py +19 -17
  12. promnesia/logging.py +27 -15
  13. promnesia/misc/install_server.py +32 -27
  14. promnesia/server.py +106 -79
  15. promnesia/sources/auto.py +104 -77
  16. promnesia/sources/auto_logseq.py +6 -5
  17. promnesia/sources/auto_obsidian.py +2 -2
  18. promnesia/sources/browser.py +20 -10
  19. promnesia/sources/browser_legacy.py +65 -50
  20. promnesia/sources/demo.py +7 -8
  21. promnesia/sources/fbmessenger.py +3 -3
  22. promnesia/sources/filetypes.py +22 -16
  23. promnesia/sources/github.py +9 -8
  24. promnesia/sources/guess.py +6 -2
  25. promnesia/sources/hackernews.py +7 -9
  26. promnesia/sources/hpi.py +5 -3
  27. promnesia/sources/html.py +11 -7
  28. promnesia/sources/hypothesis.py +3 -2
  29. promnesia/sources/instapaper.py +3 -2
  30. promnesia/sources/markdown.py +22 -12
  31. promnesia/sources/org.py +36 -17
  32. promnesia/sources/plaintext.py +41 -39
  33. promnesia/sources/pocket.py +5 -3
  34. promnesia/sources/reddit.py +24 -26
  35. promnesia/sources/roamresearch.py +5 -2
  36. promnesia/sources/rss.py +6 -8
  37. promnesia/sources/shellcmd.py +21 -11
  38. promnesia/sources/signal.py +27 -26
  39. promnesia/sources/smscalls.py +2 -3
  40. promnesia/sources/stackexchange.py +5 -4
  41. promnesia/sources/takeout.py +37 -34
  42. promnesia/sources/takeout_legacy.py +29 -19
  43. promnesia/sources/telegram.py +18 -12
  44. promnesia/sources/telegram_legacy.py +22 -11
  45. promnesia/sources/twitter.py +7 -6
  46. promnesia/sources/vcs.py +11 -6
  47. promnesia/sources/viber.py +11 -10
  48. promnesia/sources/website.py +8 -7
  49. promnesia/sources/zulip.py +3 -2
  50. promnesia/sqlite.py +13 -7
  51. promnesia/tests/common.py +10 -5
  52. promnesia/tests/server_helper.py +13 -10
  53. promnesia/tests/sources/test_auto.py +2 -3
  54. promnesia/tests/sources/test_filetypes.py +11 -8
  55. promnesia/tests/sources/test_hypothesis.py +10 -6
  56. promnesia/tests/sources/test_org.py +9 -5
  57. promnesia/tests/sources/test_plaintext.py +9 -8
  58. promnesia/tests/sources/test_shellcmd.py +13 -13
  59. promnesia/tests/sources/test_takeout.py +3 -5
  60. promnesia/tests/test_cannon.py +256 -239
  61. promnesia/tests/test_cli.py +12 -8
  62. promnesia/tests/test_compare.py +17 -13
  63. promnesia/tests/test_config.py +7 -8
  64. promnesia/tests/test_db_dump.py +15 -15
  65. promnesia/tests/test_extract.py +17 -10
  66. promnesia/tests/test_indexer.py +24 -18
  67. promnesia/tests/test_server.py +12 -13
  68. promnesia/tests/test_traverse.py +0 -2
  69. promnesia/tests/utils.py +3 -7
  70. promnesia-1.4.20250909.dist-info/METADATA +66 -0
  71. promnesia-1.4.20250909.dist-info/RECORD +80 -0
  72. {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info}/WHEEL +1 -2
  73. promnesia/kjson.py +0 -121
  74. promnesia/sources/__init__.pyi +0 -0
  75. promnesia-1.2.20240810.dist-info/METADATA +0 -54
  76. promnesia-1.2.20240810.dist-info/RECORD +0 -83
  77. promnesia-1.2.20240810.dist-info/top_level.txt +0 -1
  78. {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info}/entry_points.txt +0 -0
  79. {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info/licenses}/LICENSE +0 -0
promnesia/server.py CHANGED
@@ -1,38 +1,48 @@
1
- #!/usr/bin/python3
2
1
  from __future__ import annotations
3
2
 
4
3
  import argparse
5
- from dataclasses import dataclass
6
- from datetime import timedelta
7
- from functools import lru_cache
8
4
  import importlib.metadata
9
5
  import json
10
6
  import logging
11
7
  import os
8
+ from dataclasses import dataclass
9
+ from datetime import timedelta
10
+ from functools import lru_cache
12
11
  from pathlib import Path
13
- from typing import List, NamedTuple, Dict, Optional, Any, Tuple, Protocol
14
-
15
-
16
- import pytz
17
- from pytz import BaseTzInfo
12
+ from typing import Any, NamedTuple, Protocol
13
+ from zoneinfo import ZoneInfo
18
14
 
19
15
  import fastapi
20
-
21
- from sqlalchemy import literal, between, or_, and_, exc, select
22
- from sqlalchemy import Column, Table, func, types
23
- from sqlalchemy.sql.elements import ColumnElement
16
+ from sqlalchemy import (
17
+ Column,
18
+ Table,
19
+ and_,
20
+ between,
21
+ exc,
22
+ func,
23
+ literal,
24
+ or_,
25
+ select,
26
+ types,
27
+ )
24
28
  from sqlalchemy.sql import text
29
+ from sqlalchemy.sql.elements import ColumnElement
25
30
 
26
-
27
- from .common import PathWithMtime, DbVisit, Url, setup_logger, default_output_dir, get_system_tz
28
31
  from .cannon import canonify
32
+ from .common import (
33
+ DbVisit,
34
+ PathWithMtime,
35
+ default_output_dir,
36
+ get_system_tz,
37
+ setup_logger,
38
+ )
29
39
  from .database.load import DbStuff, get_db_stuff, row_to_db_visit
30
40
 
31
-
32
- Json = Dict[str, Any]
41
+ Json = dict[str, Any]
33
42
 
34
43
  app = fastapi.FastAPI()
35
44
 
45
+
36
46
  # meh. need this since I don't have hooks in hug to initialize logging properly..
37
47
  @lru_cache(1)
38
48
  def get_logger() -> logging.Logger:
@@ -51,26 +61,26 @@ def get_logger() -> logging.Logger:
51
61
 
52
62
 
53
63
  def get_version() -> str:
64
+ assert __package__ is not None # make type checker happy
54
65
  return importlib.metadata.version(__package__)
55
66
 
56
67
 
57
68
  class ServerConfig(NamedTuple):
58
69
  db: Path
59
- timezone: BaseTzInfo
70
+ timezone: ZoneInfo
60
71
 
61
72
  def as_str(self) -> str:
62
- return json.dumps({
63
- 'timezone': self.timezone.zone,
64
- 'db' : str(self.db),
65
- })
73
+ return json.dumps(
74
+ {
75
+ 'timezone': self.timezone.key,
76
+ 'db': str(self.db),
77
+ }
78
+ )
66
79
 
67
80
  @classmethod
68
- def from_str(cls, cfgs: str) -> 'ServerConfig':
81
+ def from_str(cls, cfgs: str) -> ServerConfig:
69
82
  d = json.loads(cfgs)
70
- return cls(
71
- db =Path (d['db']),
72
- timezone=pytz.timezone(d['timezone'])
73
- )
83
+ return cls(db=Path(d['db']), timezone=ZoneInfo(d['timezone']))
74
84
 
75
85
 
76
86
  class EnvConfig:
@@ -88,8 +98,10 @@ class EnvConfig:
88
98
  def set(cfg: ServerConfig) -> None:
89
99
  os.environ[EnvConfig.KEY] = cfg.as_str()
90
100
 
101
+
91
102
  # todo how to return exception in error?
92
103
 
104
+
93
105
  def as_json(v: DbVisit) -> Json:
94
106
  # yep, this is NOT %Y-%m-%d as is seems to be the only format with timezone that Date.parse in JS accepts. Just forget it.
95
107
  dts = v.dt.strftime('%d %b %Y %H:%M:%S %z')
@@ -104,14 +116,14 @@ def as_json(v: DbVisit) -> Json:
104
116
  'duration': v.duration,
105
117
  'locator': {
106
118
  'title': loc.title,
107
- 'href' : loc.href,
119
+ 'href': loc.href,
108
120
  },
109
- 'original_url' : v.orig_url,
121
+ 'original_url': v.orig_url,
110
122
  'normalised_url': v.norm_url,
111
123
  }
112
124
 
113
125
 
114
- def get_db_path(check: bool=True) -> Path:
126
+ def get_db_path(*, check: bool = True) -> Path:
115
127
  db = EnvConfig.get().db
116
128
  if check:
117
129
  assert db.exists(), db
@@ -125,7 +137,7 @@ def _get_stuff(db_path: PathWithMtime) -> DbStuff:
125
137
  return get_db_stuff(db_path=db_path.path)
126
138
 
127
139
 
128
- def get_stuff(db_path: Optional[Path]=None) -> DbStuff: # TODO better name
140
+ def get_stuff(db_path: Path | None = None) -> DbStuff: # TODO better name
129
141
  # ok, it will always load from the same db file; but intermediate would be kinda an optional dump.
130
142
  if db_path is None:
131
143
  db_path = get_db_path()
@@ -136,15 +148,15 @@ def db_stats(db_path: Path) -> Json:
136
148
  engine, table = get_stuff(db_path)
137
149
  query = select(func.count()).select_from(table)
138
150
  with engine.connect() as conn:
139
- total = list(conn.execute(query))[0][0]
151
+ [(total,)] = conn.execute(query)
140
152
  return {
141
153
  'total_visits': total,
142
154
  }
143
155
 
144
156
 
145
157
  class Where(Protocol):
146
- def __call__(self, table: Table, url: str) -> ColumnElement[bool]:
147
- ...
158
+ def __call__(self, table: Table, url: str) -> ColumnElement[bool]: ...
159
+
148
160
 
149
161
  @dataclass
150
162
  class VisitsResponse:
@@ -172,22 +184,21 @@ def search_common(url: str, where: Where) -> VisitsResponse:
172
184
  with engine.connect() as conn:
173
185
  try:
174
186
  # TODO make more defensive here
175
- visits: List[DbVisit] = [row_to_db_visit(row) for row in conn.execute(query)]
187
+ visits: list[DbVisit] = [row_to_db_visit(row) for row in conn.execute(query)]
176
188
  except exc.OperationalError as e:
177
189
  if getattr(e, 'msg', None) == 'no such table: visits':
178
- logger.warn('you may have to run indexer first!')
179
- #result['visits'] = [{an error with a msg}] # TODO
180
- #return result
190
+ logger.warning('you may have to run indexer first!')
191
+ # result['visits'] = [{an error with a msg}] # TODO
192
+ # return result
181
193
  raise
182
194
 
183
195
  logger.debug('got %d visits from db', len(visits))
184
196
 
185
- vlist: List[DbVisit] = []
197
+ vlist: list[DbVisit] = []
186
198
  for vis in visits:
187
199
  dt = vis.dt
188
- if dt.tzinfo is None: # FIXME need this for /visits endpoint as well?
189
- tz = config.timezone
190
- dt = tz.localize(dt)
200
+ if dt.tzinfo is None: # FIXME need this for /visits endpoint as well?
201
+ dt = dt.replace(tzinfo=config.timezone)
191
202
  vis = vis._replace(dt=dt)
192
203
  vlist.append(vis)
193
204
 
@@ -202,8 +213,8 @@ def search_common(url: str, where: Where) -> VisitsResponse:
202
213
 
203
214
  # TODO hmm, seems that the extension is using post for all requests??
204
215
  # perhasp should switch to get for most endpoint
205
- @app.get ('/status', response_model=Json)
206
- @app.post('/status', response_model=Json)
216
+ @app.get ('/status', response_model=Json) # fmt: skip
217
+ @app.post('/status', response_model=Json) # fmt: skip
207
218
  def status() -> Json:
208
219
  '''
209
220
  Ideally, status will always respond, regardless the internal state of the backend?
@@ -225,7 +236,7 @@ def status() -> Json:
225
236
  logger.exception(e)
226
237
  stats = {'ERROR': str(e)}
227
238
 
228
- version: Optional[str]
239
+ version: str | None
229
240
  try:
230
241
  version = get_version()
231
242
  except Exception as e:
@@ -236,15 +247,16 @@ def status() -> Json:
236
247
  'version': version,
237
248
  'db' : db_path,
238
249
  'stats' : stats,
239
- }
250
+ } # fmt: skip
240
251
 
241
252
 
242
253
  @dataclass
243
254
  class VisitsRequest:
244
255
  url: str
245
256
 
246
- @app.get ('/visits', response_model=VisitsResponse)
247
- @app.post('/visits', response_model=VisitsResponse)
257
+
258
+ @app.get ('/visits', response_model=VisitsResponse) # fmt: skip
259
+ @app.post('/visits', response_model=VisitsResponse) # fmt: skip
248
260
  def visits(request: VisitsRequest) -> VisitsResponse:
249
261
  url = request.url
250
262
  get_logger().info('/visited %s', url)
@@ -255,7 +267,7 @@ def visits(request: VisitsRequest) -> VisitsResponse:
255
267
  # exact match
256
268
  table.c.norm_url == url,
257
269
  # + child visits, but only 'interesting' ones
258
- and_(table.c.context != None, table.c.norm_url.startswith(url, autoescape=True)) # noqa: E711
270
+ and_(table.c.context != None, table.c.norm_url.startswith(url, autoescape=True)), # noqa: E711
259
271
  ),
260
272
  )
261
273
 
@@ -264,11 +276,13 @@ def visits(request: VisitsRequest) -> VisitsResponse:
264
276
  class SearchRequest:
265
277
  url: str
266
278
 
267
- @app.get ('/search', response_model=VisitsResponse)
268
- @app.post('/search', response_model=VisitsResponse)
279
+
280
+ @app.get ('/search', response_model=VisitsResponse) # fmt: skip
281
+ @app.post('/search', response_model=VisitsResponse) # fmt: skip
269
282
  def search(request: SearchRequest) -> VisitsResponse:
270
283
  url = request.url
271
284
  get_logger().info('/search %s', url)
285
+ # fmt: off
272
286
  return search_common(
273
287
  url=url,
274
288
  where=lambda table, url: or_(
@@ -279,71 +293,79 @@ def search(request: SearchRequest) -> VisitsResponse:
279
293
  table.c.locator_title.contains(url, autoescape=True),
280
294
  ),
281
295
  )
296
+ # fmt: on
282
297
 
283
298
 
284
299
  @dataclass
285
300
  class SearchAroundRequest:
286
301
  timestamp: float
287
302
 
288
- @app.get ('/search_around', response_model=VisitsResponse)
289
- @app.post('/search_around', response_model=VisitsResponse)
303
+
304
+ @app.get ('/search_around', response_model=VisitsResponse) # fmt: skip
305
+ @app.post('/search_around', response_model=VisitsResponse) # fmt: skip
290
306
  def search_around(request: SearchAroundRequest) -> VisitsResponse:
291
307
  timestamp = request.timestamp
292
308
  get_logger().info('/search_around %s', timestamp)
293
- utc_timestamp = timestamp # old 'timestamp' name is legacy
309
+ utc_timestamp = timestamp # old 'timestamp' name is legacy
294
310
 
295
311
  # TODO meh. use count/pagination instead?
296
- delta_back = timedelta(hours=3 ).total_seconds()
312
+ delta_back = timedelta(hours=3).total_seconds()
297
313
  delta_front = timedelta(minutes=2).total_seconds()
298
314
  # TODO not sure about delta_front.. but it also serves as quick hack to accommodate for all the truncations etc
299
315
 
300
316
  return search_common(
301
- url='http://dummy.org', # NOTE: not used in the where query (below).. perhaps need to get rid of this
302
- where=lambda table, url: between(
317
+ url='http://dummy.org', # NOTE: not used in the where query (below).. perhaps need to get rid of this
318
+ where=lambda table, url: between( # noqa: ARG005
303
319
  func.strftime(
304
- '%s', # NOTE: it's tz aware, e.g. would distinguish +05:00 vs -03:00
320
+ '%s', # NOTE: it's tz aware, e.g. would distinguish +05:00 vs -03:00
305
321
  # this is a bit fragile, relies on cachew internal timestamp format, e.g.
306
322
  # 2020-11-10T06:13:03.196376+00:00 Europe/London
307
323
  func.substr(
308
324
  table.c.dt,
309
- 1, # substr is 1-indexed
325
+ 1, # substr is 1-indexed
310
326
  # instr finds the first match, but if not found it defaults to 0.. which we hack by concatting with ' '
311
327
  func.instr(func.cast(table.c.dt, types.Unicode).op('||')(' '), ' ') - 1,
312
328
  # for fucks sake.. seems that cast is necessary otherwise it tries to treat ' ' as datetime???
313
- )
314
- ) - literal(utc_timestamp),
329
+ ),
330
+ )
331
+ - literal(utc_timestamp),
315
332
  literal(-delta_back),
316
333
  literal(delta_front),
317
334
  ),
318
335
  )
319
336
 
337
+
320
338
  # before 0.11.14 (including), extension didn't share the version
321
339
  # so if it's not shared, assume that version
322
340
  _NO_VERSION = (0, 11, 14)
323
341
  _LATEST = (9999, 9999, 9999)
324
342
 
325
- def as_version(version: str) -> Tuple[int, int, int]:
343
+
344
+ def as_version(version: str) -> tuple[int, int, int]:
326
345
  if version == '':
327
346
  return _NO_VERSION
328
347
  try:
329
348
  [v1, v2, v3] = map(int, version.split('.'))
330
- return (v1, v2, v3)
331
349
  except Exception as e:
332
350
  logger = get_logger()
333
351
  logger.error('error while parsing version %s', version)
334
352
  logger.exception(e)
335
353
  return _LATEST
354
+ else:
355
+ return (v1, v2, v3)
336
356
 
337
357
 
338
358
  @dataclass
339
359
  class VisitedRequest:
340
- urls: List[str]
360
+ urls: list[str]
341
361
  client_version: str = ''
342
362
 
343
- VisitedResponse = List[Optional[Json]]
344
363
 
345
- @app.get ('/visited', response_model=VisitedResponse)
346
- @app.post('/visited', response_model=VisitedResponse)
364
+ VisitedResponse = list[Json | None]
365
+
366
+
367
+ @app.get ('/visited', response_model=VisitedResponse) # fmt: skip
368
+ @app.post('/visited', response_model=VisitedResponse) # fmt: skip
347
369
  def visited(request: VisitedRequest) -> VisitedResponse:
348
370
  # TODO instead switch logging to fastapi
349
371
  urls = request.urls
@@ -352,10 +374,10 @@ def visited(request: VisitedRequest) -> VisitedResponse:
352
374
  logger = get_logger()
353
375
  logger.info('/visited %s %s', urls, client_version)
354
376
 
355
- version = as_version(client_version)
377
+ _version = as_version(client_version) # todo use it?
356
378
 
357
379
  nurls = [canonify(u) for u in urls]
358
- snurls = list(sorted(set(nurls)))
380
+ snurls = sorted(set(nurls))
359
381
 
360
382
  if len(snurls) == 0:
361
383
  return []
@@ -365,10 +387,11 @@ def visited(request: VisitedRequest) -> VisitedResponse:
365
387
  # sqlalchemy doesn't seem to support SELECT FROM (VALUES (...)) in its api
366
388
  # also doesn't support array binding...
367
389
  # https://stackoverflow.com/questions/13190392/how-can-i-bind-a-list-to-a-parameter-in-a-custom-query-in-sqlalchemy
368
- bstring = ','.join(f'(:b{i})' for i, _ in enumerate(snurls))
369
- bdict = { f'b{i}': v for i, v in enumerate(snurls)}
390
+ bstring = ','.join(f'(:b{i})' for i, _ in enumerate(snurls)) # fmt: skip
391
+ bdict = { f'b{i}': v for i, v in enumerate(snurls)} # fmt: skip
370
392
  # TODO hopefully, visits.* thing only returns one visit??
371
- query = text(f"""
393
+ query = (
394
+ text(f"""
372
395
  WITH cte(queried) AS (SELECT * FROM (values {bstring}))
373
396
  SELECT queried, visits.*
374
397
  FROM cte JOIN visits
@@ -378,9 +401,12 @@ SELECT queried, visits.*
378
401
  but somehow DESC is the one that actually works..
379
402
  */
380
403
  ORDER BY visits.context IS NULL DESC
381
- """).bindparams(**bdict).columns(
382
- Column('match', types.Unicode),
383
- *table.columns,
404
+ """)
405
+ .bindparams(**bdict)
406
+ .columns(
407
+ Column('match', types.Unicode),
408
+ *table.columns,
409
+ )
384
410
  )
385
411
  # TODO might be very beneficial for performance to have an intermediate table
386
412
  # SELECT visits.* FROM visits GROUP BY visits.norm_url ORDER BY visits.context IS NULL DESC
@@ -388,10 +414,10 @@ SELECT queried, visits.*
388
414
  # brings down large queries to 50ms...
389
415
  with engine.connect() as conn:
390
416
  res = list(conn.execute(query))
391
- present: Dict[str, Any] = {row[0]: row_to_db_visit(row[1:]) for row in res}
417
+ present: dict[str, Any] = {row[0]: row_to_db_visit(row[1:]) for row in res}
392
418
  results = []
393
419
  for nu in nurls:
394
- r = present.get(nu, None)
420
+ r = present.get(nu)
395
421
  results.append(None if r is None else as_json(r))
396
422
 
397
423
  # no need for it anymore, extension has been updated since
@@ -411,6 +437,7 @@ def _run(*, host: str, port: str, quiet: bool, config: ServerConfig) -> None:
411
437
  EnvConfig.set(config)
412
438
 
413
439
  import uvicorn
440
+
414
441
  uvicorn.run('promnesia.server:app', host=host, port=int(port), log_level='debug')
415
442
 
416
443
 
@@ -422,7 +449,7 @@ def run(args: argparse.Namespace) -> None:
422
449
  config=ServerConfig(
423
450
  db=args.db,
424
451
  timezone=args.timezone,
425
- )
452
+ ),
426
453
  )
427
454
 
428
455
 
@@ -464,7 +491,7 @@ def setup_parser(p: argparse.ArgumentParser) -> None:
464
491
 
465
492
  p.add_argument(
466
493
  '--timezone',
467
- type=pytz.timezone,
494
+ type=ZoneInfo,
468
495
  default=get_system_tz(),
469
496
  help='Fallback timezone, defaults to the system timezone if not specified',
470
497
  )