promnesia 1.2.20230515__py3-none-any.whl → 1.3.20241021__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries, and is provided for informational purposes only; it reflects the changes between the two versions as they appear in their public registry. Lines whose content was cut off by the diff viewer are marked with `…`.
- promnesia/__init__.py +14 -3
- promnesia/__main__.py +60 -35
- promnesia/cannon.py +27 -27
- promnesia/common.py +85 -67
- promnesia/compare.py +21 -22
- promnesia/compat.py +10 -10
- promnesia/config.py +23 -23
- promnesia/database/common.py +67 -0
- promnesia/database/dump.py +188 -0
- promnesia/{read_db.py → database/load.py} +16 -17
- promnesia/extract.py +14 -11
- promnesia/kjson.py +12 -11
- promnesia/logging.py +4 -4
- promnesia/misc/__init__.pyi +0 -0
- promnesia/misc/config_example.py +1 -2
- promnesia/misc/install_server.py +7 -9
- promnesia/server.py +57 -47
- promnesia/sources/__init__.pyi +0 -0
- promnesia/sources/auto.py +50 -35
- promnesia/sources/auto_logseq.py +6 -5
- promnesia/sources/auto_obsidian.py +2 -2
- promnesia/sources/browser.py +14 -9
- promnesia/sources/browser_legacy.py +26 -16
- promnesia/sources/demo.py +19 -3
- promnesia/sources/fbmessenger.py +3 -2
- promnesia/sources/filetypes.py +16 -7
- promnesia/sources/github.py +7 -9
- promnesia/sources/guess.py +2 -1
- promnesia/sources/hackernews.py +2 -2
- promnesia/sources/hpi.py +2 -2
- promnesia/sources/html.py +7 -5
- promnesia/sources/hypothesis.py +4 -3
- promnesia/sources/instapaper.py +2 -2
- promnesia/sources/markdown.py +31 -21
- promnesia/sources/org.py +27 -13
- promnesia/sources/plaintext.py +30 -29
- promnesia/sources/pocket.py +3 -2
- promnesia/sources/reddit.py +20 -19
- promnesia/sources/roamresearch.py +2 -1
- promnesia/sources/rss.py +4 -5
- promnesia/sources/shellcmd.py +19 -6
- promnesia/sources/signal.py +33 -24
- promnesia/sources/smscalls.py +2 -2
- promnesia/sources/stackexchange.py +4 -3
- promnesia/sources/takeout.py +76 -9
- promnesia/sources/takeout_legacy.py +24 -12
- promnesia/sources/telegram.py +13 -11
- promnesia/sources/telegram_legacy.py +18 -7
- promnesia/sources/twitter.py +6 -5
- promnesia/sources/vcs.py +5 -3
- promnesia/sources/viber.py +10 -9
- promnesia/sources/website.py +4 -4
- promnesia/sources/zulip.py +3 -2
- promnesia/sqlite.py +7 -4
- promnesia/tests/__init__.py +0 -0
- promnesia/tests/common.py +140 -0
- promnesia/tests/server_helper.py +67 -0
- promnesia/tests/sources/__init__.py +0 -0
- promnesia/tests/sources/test_auto.py +65 -0
- promnesia/tests/sources/test_filetypes.py +43 -0
- promnesia/tests/sources/test_hypothesis.py +39 -0
- promnesia/tests/sources/test_org.py +64 -0
- promnesia/tests/sources/test_plaintext.py +25 -0
- promnesia/tests/sources/test_shellcmd.py +21 -0
- promnesia/tests/sources/test_takeout.py +56 -0
- promnesia/tests/test_cannon.py +325 -0
- promnesia/tests/test_cli.py +40 -0
- promnesia/tests/test_compare.py +30 -0
- promnesia/tests/test_config.py +289 -0
- promnesia/tests/test_db_dump.py +222 -0
- promnesia/tests/test_extract.py +65 -0
- promnesia/tests/test_extract_urls.py +43 -0
- promnesia/tests/test_indexer.py +251 -0
- promnesia/tests/test_server.py +291 -0
- promnesia/tests/test_traverse.py +39 -0
- promnesia/tests/utils.py +35 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/METADATA +15 -18
- promnesia-1.3.20241021.dist-info/RECORD +83 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/WHEEL +1 -1
- {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/entry_points.txt +0 -1
- promnesia/dump.py +0 -105
- promnesia-1.2.20230515.dist-info/RECORD +0 -58
- {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/LICENSE +0 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/top_level.txt +0 -0
promnesia/server.py
CHANGED
```diff
@@ -1,35 +1,45 @@
-#!/usr/bin/python3
 from __future__ import annotations
 
-__package__ = 'promnesia'  # ugh. hacky way to make wsgi runner work properly...
-
 import argparse
-
-from datetime import timedelta
-from functools import lru_cache
+import importlib.metadata
 import json
 import logging
 import os
+from dataclasses import dataclass
+from datetime import timedelta
+from functools import lru_cache
 from pathlib import Path
-from typing import …
-
+from typing import Any, NamedTuple, Optional, Protocol
 
+import fastapi
 import pytz
 from pytz import BaseTzInfo
-…
-…
-…
-…
-…
-…
+from sqlalchemy import (
+    Column,
+    Table,
+    and_,
+    between,
+    exc,
+    func,
+    literal,
+    or_,
+    select,
+    types,
+)
 from sqlalchemy.sql import text
+from sqlalchemy.sql.elements import ColumnElement
 
-
-from .common import PathWithMtime, DbVisit, Url, setup_logger, default_output_dir, get_system_tz
 from .cannon import canonify
+from .common import (
+    DbVisit,
+    PathWithMtime,
+    default_output_dir,
+    get_system_tz,
+    setup_logger,
+)
+from .database.load import DbStuff, get_db_stuff, row_to_db_visit
 
-
-Json = Dict[str, Any]
+Json = dict[str, Any]
 
 app = fastapi.FastAPI()
 
@@ -51,8 +61,7 @@ def get_logger() -> logging.Logger:
 
 
 def get_version() -> str:
-    …
-    return get_distribution(__package__).version
+    return importlib.metadata.version(__package__)
 
 
 class ServerConfig(NamedTuple):
@@ -66,7 +75,7 @@ class ServerConfig(NamedTuple):
         })
 
     @classmethod
-    def from_str(cls, cfgs: str) -> …
+    def from_str(cls, cfgs: str) -> ServerConfig:
        d = json.loads(cfgs)
        return cls(
            db=Path(d['db']),
@@ -112,15 +121,13 @@ def as_json(v: DbVisit) -> Json:
     }
 
 
-def get_db_path(check: bool=True) -> Path:
+def get_db_path(*, check: bool=True) -> Path:
     db = EnvConfig.get().db
     if check:
         assert db.exists(), db
     return db
 
 
-from .read_db import DbStuff, get_db_stuff
-
 @lru_cache(1)
 # PathWithMtime aids lru_cache in reloading the sqlalchemy binder
 def _get_stuff(db_path: PathWithMtime) -> DbStuff:
@@ -128,7 +135,7 @@ def _get_stuff(db_path: PathWithMtime) -> DbStuff:
     return get_db_stuff(db_path=db_path.path)
 
 
-def get_stuff(db_path: …
+def get_stuff(db_path: Path | None=None) -> DbStuff:  # TODO better name
     # ok, it will always load from the same db file; but intermediate would be kinda an optional dump.
     if db_path is None:
         db_path = get_db_path()
@@ -136,10 +143,10 @@ def get_stuff(db_path: Optional[Path]=None) -> DbStuff: # TODO better name
 
 
 def db_stats(db_path: Path) -> Json:
-    engine, …
+    engine, table = get_stuff(db_path)
     query = select(func.count()).select_from(table)
     with engine.connect() as conn:
-        total = …
+        [(total,)] = conn.execute(query)
     return {
         'total_visits': total,
     }
@@ -151,8 +158,8 @@ class Where(Protocol):
 
 @dataclass
 class VisitsResponse:
-    original_url: …
-    normalised_url: …
+    original_url: str
+    normalised_url: str
     visits: Any
 
 
@@ -167,7 +174,7 @@ def search_common(url: str, where: Where) -> VisitsResponse:
     url = original_url
     logger.info('normalised url: %s', url)
 
-    engine, …
+    engine, table = get_stuff()
 
     query = table.select().where(where(table=table, url=url))
     logger.debug('query: %s', query)
@@ -175,17 +182,17 @@ def search_common(url: str, where: Where) -> VisitsResponse:
     with engine.connect() as conn:
         try:
             # TODO make more defensive here
-            visits: …
+            visits: list[DbVisit] = [row_to_db_visit(row) for row in conn.execute(query)]
         except exc.OperationalError as e:
             if getattr(e, 'msg', None) == 'no such table: visits':
-                logger.…
+                logger.warning('you may have to run indexer first!')
                 #result['visits'] = [{an error with a msg}] # TODO
                 #return result
             raise
 
     logger.debug('got %d visits from db', len(visits))
 
-    vlist: …
+    vlist: list[DbVisit] = []
     for vis in visits:
         dt = vis.dt
         if dt.tzinfo is None:  # FIXME need this for /visits endpoint as well?
@@ -228,10 +235,11 @@ def status() -> Json:
         logger.exception(e)
         stats = {'ERROR': str(e)}
 
-    version: …
+    version: str | None
    try:
        version = get_version()
    except Exception as e:
+        logger.exception(e)
        version = None
 
    return {
@@ -241,10 +249,9 @@ def status() -> Json:
    }
 
 
-from dataclasses import dataclass
 @dataclass
 class VisitsRequest:
-    url: …
+    url: str
 
 @app.get ('/visits', response_model=VisitsResponse)
 @app.post('/visits', response_model=VisitsResponse)
@@ -255,15 +262,17 @@ def visits(request: VisitsRequest) -> VisitsResponse:
         url=url,
         # odd, doesn't work just with: x or (y and z)
         where=lambda table, url: or_(
-            …
-            …
+            # exact match
+            table.c.norm_url == url,
+            # + child visits, but only 'interesting' ones
+            and_(table.c.context != None, table.c.norm_url.startswith(url, autoescape=True))  # noqa: E711
         ),
     )
 
 
 @dataclass
 class SearchRequest:
-    url: …
+    url: str
 
 @app.get ('/search', response_model=VisitsResponse)
 @app.post('/search', response_model=VisitsResponse)
@@ -300,7 +309,7 @@ def search_around(request: SearchAroundRequest) -> VisitsResponse:
 
     return search_common(
         url='http://dummy.org',  # NOTE: not used in the where query (below).. perhaps need to get rid of this
-        where=lambda table, url: between(
+        where=lambda table, url: between(  # noqa: ARG005
            func.strftime(
                '%s',  # NOTE: it's tz aware, e.g. would distinguish +05:00 vs -03:00
                # this is a bit fragile, relies on cachew internal timestamp format, e.g.
@@ -323,25 +332,26 @@ def search_around(request: SearchAroundRequest) -> VisitsResponse:
 _NO_VERSION = (0, 11, 14)
 _LATEST = (9999, 9999, 9999)
 
-def as_version(version: str) -> …
+def as_version(version: str) -> tuple[int, int, int]:
     if version == '':
         return _NO_VERSION
     try:
         [v1, v2, v3] = map(int, version.split('.'))
-        return (v1, v2, v3)
     except Exception as e:
         logger = get_logger()
         logger.error('error while parsing version %s', version)
         logger.exception(e)
         return _LATEST
+    else:
+        return (v1, v2, v3)
 
 
 @dataclass
 class VisitedRequest:
-    urls: …
+    urls: list[str]
     client_version: str = ''
 
-VisitedResponse = …
+VisitedResponse = list[Optional[Json]]
 
 @app.get ('/visited', response_model=VisitedResponse)
 @app.post('/visited', response_model=VisitedResponse)
@@ -356,12 +366,12 @@ def visited(request: VisitedRequest) -> VisitedResponse:
     version = as_version(client_version)
 
     nurls = [canonify(u) for u in urls]
-    snurls = …
+    snurls = sorted(set(nurls))
 
     if len(snurls) == 0:
         return []
 
-    engine, …
+    engine, table = get_stuff()
 
     # sqlalchemy doesn't seem to support SELECT FROM (VALUES (...)) in its api
     # also doesn't support array binding...
@@ -389,7 +399,7 @@ SELECT queried, visits.*
     # brings down large queries to 50ms...
     with engine.connect() as conn:
         res = list(conn.execute(query))
-        present: …
+        present: dict[str, Any] = {row[0]: row_to_db_visit(row[1:]) for row in res}
     results = []
     for nu in nurls:
         r = present.get(nu, None)
```
promnesia/sources/auto.py
CHANGED
```diff
@@ -5,34 +5,46 @@
 - autodetects Obsidian vault and adds `obsidian://` app protocol support [[file:../src/promnesia/sources/obsidian.py][promnesia.sources.obsidian]]
 - autodetects Logseq graph and adds `logseq://` app protocol support [[file:../src/promnesia/sources/logseq.py][promnesia.sources.logseq]]
 """
+from __future__ import annotations
 
 import csv
-from concurrent.futures import ProcessPoolExecutor as Pool
-from contextlib import nullcontext
-from datetime import datetime
 import itertools
 import json
 import os
-from …
+from collections.abc import Iterable, Iterator, Sequence
+from concurrent.futures import ProcessPoolExecutor as Pool
+from contextlib import nullcontext
 from fnmatch import fnmatch
+from functools import wraps
 from pathlib import Path
-from …
-…
-…
-…
-…
-…
+from typing import Any, Callable, NamedTuple, Optional
+
+from promnesia.common import (
+    Loc,
+    PathIsh,
+    Result,
+    Results,
+    Visit,
+    echain,
+    extract_urls,
+    file_mtime,
+    get_logger,
+    get_tmpdir,
+    logger,
+    mime,
+    traverse,
+    warn_once,
+)
+from promnesia.config import use_cores
 
-
-from .filetypes import EUrl
-from .auto_obsidian import obsidian_replacer
 from .auto_logseq import logseq_replacer
+from .auto_obsidian import obsidian_replacer
+from .filetypes import Ctx, EUrl
 
 
-def _collect(thing, path: …
+def _collect(thing, path: list[str], result: list[EUrl]) -> None:
     if isinstance(thing, str):
-        ctx: Ctx = tuple(path)
+        ctx: Ctx = tuple(path)
         result.extend([EUrl(url=u, ctx=ctx) for u in extract_urls(thing)])
     elif isinstance(thing, list):
         path.append('[]')
@@ -50,9 +62,9 @@ def _collect(thing, path: List[str], result: List[EUrl]) -> None:
 
 
 # TODO mm. okay, I suppose could use kython consuming thingy?..
-def collect_from(thing) -> …
-    uuu: …
-    path: …
+def collect_from(thing) -> list[EUrl]:
+    uuu: list[EUrl] = []
+    path: list[str] = []
     _collect(thing, path, uuu)
     return uuu
 
@@ -84,7 +96,7 @@ def _plaintext(path: Path) -> Results:
 def fallback(ex):
     """Falls back to plaintext in case of issues"""
 
-    fallback_active: …
+    fallback_active: dict[Any, bool] = {}
     @wraps(ex)
     def wrapped(path: Path):
         nonlocal fallback_active
@@ -98,7 +110,7 @@ def fallback(ex):
         except ModuleNotFoundError as me:
             logger = get_logger()
             logger.exception(me)
-            logger.…
+            logger.warning('%s: %s not found, falling back to grep! "pip3 install --user %s" for better support!', path, me.name, me.name)
             yield me
             fallback_active[ex] = True
             do_fallback = True
@@ -125,7 +137,7 @@ def _org(path: Path) -> Results:
     return org.extract_from_file(path)
 
 
-from .filetypes import …
+from .filetypes import CODE, IGNORE, TYPE2IDX, type2idx
 
 TYPE2IDX.update({
     'application/json': _json,
@@ -167,8 +179,8 @@ for t in CODE:
 Replacer = Optional[Callable[[str, str], str]]
 
 def index(
-        *paths: …
-        ignored: …
+        *paths: PathIsh,
+        ignored: Sequence[str] | str=(),
         follow: bool=True,
         replacer: Replacer=None,
 ) -> Results:
@@ -209,10 +221,10 @@ class Options(NamedTuple):
     # TODO option to add ignores? not sure..
     # TODO I don't like this replacer thing... think about removing it
     replacer: Replacer
-    root: …
+    root: Path | None=None
 
 
-def _index_file_aux(path: Path, opts: Options) -> …
+def _index_file_aux(path: Path, opts: Options) -> Exception | list[Result]:
     # just a helper for the concurrent version (the generator isn't picklable)
     try:
         return list(_index_file(path, opts=opts))
@@ -247,7 +259,7 @@ def _index(path: Path, opts: Options) -> Results:
             continue
 
         p = p.resolve()
-        if not os.path.exists(p):
+        if not os.path.exists(p):  # noqa: PTH110
             logger.debug('ignoring %s: broken symlink?', p)
             continue
 
@@ -265,8 +277,10 @@ def _index(path: Path, opts: Options) -> Results:
 
 
 Mime = str
-from .filetypes import Ex
-…
+from .filetypes import Ex  # meh
+
+
+def by_path(pp: Path) -> tuple[Ex | None, Mime | None]:
     suf = pp.suffix.lower()
     # firt check suffixes, it's faster
     s = type2idx(suf)
@@ -282,6 +296,8 @@ def by_path(pp: Path) -> Tuple[Optional[Ex], Optional[Mime]]:
 
 def _index_file(pp: Path, opts: Options) -> Results:
     logger = get_logger()
+    # TODO need to keep debug logs here...
+    # logger.info(f"indexing {pp}")
     # TODO use kompress?
     # TODO not even sure if it's used...
     suf = pp.suffix.lower()
@@ -307,18 +323,17 @@ def _index_file(pp: Path, opts: Options) -> Results:
 
     ip, pm = by_path(pp)
     if ip is None:
-        # …
-        # TODO only log once? # hmm..
+        # todo not really sure about using warnings vs yielding error here?
         msg = f'No extractor for suffix {suf}, mime {pm}'
-        …
+        warn_once(msg)
         yield echain(ex, RuntimeError(msg))
         return
 
     logger.debug('indexing via %s: %s', ip.__name__, pp)
 
-    def indexer() -> …
+    def indexer() -> Urls | Results:
         # eh, annoying.. need to make more generic..
-        idx = ip(pp)
+        idx = ip(pp)
         try:
             yield from idx
         except Exception as e:
@@ -351,7 +366,7 @@ def _index_file(pp: Path, opts: Options) -> Results:
             v = v._replace(locator=loc)
 
         if replacer is not None and root is not None:
-            upd: …
+            upd: dict[str, Any] = {}
             href = v.locator.href
             if href is not None:
                 upd['locator'] = v.locator._replace(href=replacer(href, str(root)), title=replacer(v.locator.title, str(root)))
```
promnesia/sources/auto_logseq.py
CHANGED
```diff
@@ -1,14 +1,15 @@
 import os.path
 import urllib.parse
 
+
 def logseq_replacer(path: str, root: str) -> str:
-    if not path.startswith("editor://") or not (path.endswith(…
+    if not path.startswith("editor://") or not (path.endswith((".md", ".org"))):
         return path
-
-    graph = os.path.basename(root)
-    page_name = os.path.basename(path).rsplit('.', 1)[0]
+
+    graph = os.path.basename(root)  # noqa: PTH119
+    page_name = os.path.basename(path).rsplit('.', 1)[0]  # noqa: PTH119
     encoded_page_name = urllib.parse.quote(page_name)
-
+
     uri = f"logseq://graph/{graph}?page={encoded_page_name}"
 
     return uri
```
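Roughly what the rewritten replacer does (the paths below are invented for illustration): an `editor://` URL pointing at an `.md`/`.org` page inside the graph becomes a `logseq://` deep link, and everything else passes through unchanged. The `path.endswith((".md", ".org"))` form works because `str.endswith` accepts a tuple of suffixes:

```python
logseq_replacer("editor:///home/user/notes/pages/my page.md", "/home/user/notes")
# -> 'logseq://graph/notes?page=my%20page'

logseq_replacer("editor:///home/user/notes/assets/image.png", "/home/user/notes")
# -> returned unchanged: the extension is neither .md nor .org
```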
promnesia/sources/browser.py
CHANGED
```diff
@@ -2,15 +2,18 @@
 Uses [[https://github.com/karlicoss/HPI][HPI]] for visits from web browsers.
 '''
 
+from __future__ import annotations
+
 import re
-from typing import Optional, Iterator, Any, TYPE_CHECKING
 import warnings
+from collections.abc import Iterator
+from typing import TYPE_CHECKING, Any
 
-from promnesia.common import …
+from promnesia.common import Loc, PathIsh, Results, Second, Visit, is_sqlite_db, logger
 
 
-def index(p: …
-    from . import hpi
+def index(p: PathIsh | None = None) -> Results:
+    from . import hpi  # noqa: F401,I001
 
     if p is None:
         from my.browser.all import history
@@ -24,10 +27,11 @@ def index(p: Optional[PathIsh]=None) -> Results:
     )
     try:
         yield from _index_new_with_adhoc_config(path=p)
-        return
     except Exception as e:
         logger.exception(e)
         warnings.warn("Hacking my.config.browser.export didn't work. You probably need to update HPI.")
+    else:
+        return
 
     logger.warning("Falling back onto legacy promnesia.sources.browser_legacy module")
     yield from _index_old(path=p)
@@ -35,11 +39,12 @@ def index(p: Optional[PathIsh]=None) -> Results:
 
 def _index_old(*, path: PathIsh) -> Results:
     from . import browser_legacy
+
     yield from browser_legacy.index(path)
 
 
 def _index_new_with_adhoc_config(*, path: PathIsh) -> Results:
-    from . import hpi
+    from . import hpi  # noqa: F401,I001
 
     ## previously, it was possible to index be called with multiple different db search paths
     ## this would result in each subsequent call to my.browser.export.history to invalidate cache every time
@@ -50,7 +55,7 @@ def _index_new_with_adhoc_config(*, path: PathIsh) -> Results:
     cache_override = None if hpi_cache_dir is None else hpi_cache_dir / sanitized_path
     ##
 
-    from my.core.common import …
+    from my.core.common import Paths, classproperty, get_files
     class config:
         class core:
             cache_dir = cache_override
@@ -75,8 +80,8 @@ else:
 
 def _index_new(history: Iterator[BrowserMergeVisit]) -> Results:
     for v in history:
-        desc: …
-        duration: …
+        desc: str | None = None
+        duration: Second | None = None
         metadata = v.metadata
         if metadata is not None:
             desc = metadata.title
```
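The `index()` change moves the early `return` from the `try` body into an `else` clause. Behaviour is the same, but the intent is clearer: the legacy fallback runs exactly when `_index_new_with_adhoc_config` raised, and the happy path stays out of the guarded region (the shape ruff's TRY300 rule suggests). A toy model of the same control flow:

```python
def fetch(primary, fallback):
    try:
        result = primary()
    except Exception:
        pass  # fall through to the fallback below
    else:
        return result  # runs only if primary() did not raise
    return fallback()


assert fetch(lambda: "new", lambda: "legacy") == "new"
assert fetch(lambda: 1 / 0, lambda: "legacy") == "legacy"
```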
promnesia/sources/browser_legacy.py
CHANGED
```diff
@@ -1,16 +1,23 @@
+from __future__ import annotations
+
+import sqlite3
 from datetime import datetime
 from pathlib import Path
 from urllib.parse import unquote
-import sqlite3
-from typing import List, Set
 
 import pytz
 
-from …
-from …
+from promnesia import config
+from promnesia.common import Loc, PathIsh, Results, Second, Visit, is_sqlite_db, logger
 
-
-from cachew import cachew
+try:
+    from cachew import cachew
+except ModuleNotFoundError as me:
+    if me.name != 'cachew':
+        raise me
+    # this module is legacy anyway, so just make it defensive
+    def cachew(*args, **kwargs):  # type: ignore[no-redef]
+        return lambda f: f
 
 
 def index(p: PathIsh) -> Results:
@@ -29,21 +36,21 @@ def index(p: PathIsh) -> Results:
 
 
 
-def _index_dbs(dbs: …
+def _index_dbs(dbs: list[Path], cachew_name: str):
     # TODO right... not ideal, need to think how to handle it properly...
     import sys
     sys.setrecursionlimit(5000)
 
     cache_dir = config.get().cache_dir
     cpath = None if cache_dir is None else cache_dir / cachew_name
-    emitted: …
+    emitted: set = set()
     yield from _index_dbs_aux(cpath, dbs, emitted=emitted)
 
 
 # todo wow, stack traces are ridiculous here...
 # todo hmm, feels like it should be a class or something?
-@cachew(lambda cp, dbs, emitted: cp, depends_on=lambda cp, dbs, emitted: dbs)  # , logger=logger)
-def _index_dbs_aux(cache_path: Path, dbs: …
+@cachew(lambda cp, dbs, emitted: cp, depends_on=lambda cp, dbs, emitted: dbs)  # , logger=logger) # noqa: ARG005
+def _index_dbs_aux(cache_path: Path | None, dbs: list[Path], emitted: set) -> Results:
     if len(dbs) == 0:
         return
 
@@ -58,7 +65,7 @@ def _index_dbs_aux(cache_path: Path, dbs: List[Path], emitted: Set) -> Results:
             xs_was_cached = True
             logger.debug('seems that %d first items were previously cached', len(xs))
         if xs_was_cached:
-            key = (r.url, r.dt)
+            key = str(r) if isinstance(r, Exception) else (r.url, r.dt)
             assert key not in emitted, key  # todo not sure if this assert is necessary?
             # hmm ok it might happen if we messed up with indexing individual db?
             # alternatively, could abuse it to avoid messing with 'emitted' in _index_db?
@@ -69,7 +76,7 @@ def _index_dbs_aux(cache_path: Path, dbs: List[Path], emitted: Set) -> Results:
         yield from _index_db(db, emitted=emitted)
 
 
-def _index_db(db: Path, emitted: …
+def _index_db(db: Path, emitted: set):
     logger.info('processing %s', db)  # debug level?
 
     # todo schema check (not so critical for cachew though)
@@ -115,17 +122,20 @@ Col = str
 ColType = str
 
 
-from …
+from collections.abc import Sequence
+from typing import NamedTuple, Union
+
 
 class Schema(NamedTuple):
-    cols: Sequence[…
+    cols: Sequence[tuple[Col, ColType]]
     key: Sequence[str]
 
 
-SchemaCheck = …
+SchemaCheck = tuple[str, Union[str, Sequence[str]]]  # todo Union: meh
 
 from dataclasses import dataclass
 
+
 # todo protocol?
 @dataclass
 class Extr:
@@ -173,7 +183,7 @@ class Chrome(Extr):
         dt = chrome_time_to_utc(int(ts))
         url = unquote(url)  # chrome urls are all quoted
         dd = int(durs)
-        dur: …
+        dur: Second | None = None if dd == 0 else dd // 1_000_000
         return Visit(
             url=url,
             dt=dt,
```
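The guarded `cachew` import at the top of this file keeps every `@cachew(...)` call site working when the library isn't installed: the stand-in must be a decorator factory, i.e. when called with arguments it returns a decorator that hands the function back untouched. A quick check of that shape (the arguments here are placeholders):

```python
def cachew(*args, **kwargs):  # no-op stand-in with the same call shape
    return lambda f: f


@cachew("cache-path", depends_on=lambda x: x)
def expensive(x: int) -> int:
    return x * 2


assert expensive(21) == 42  # same behaviour, just no caching
```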
|