promnesia 1.2.20240810__py3-none-any.whl → 1.4.20250909__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promnesia/__init__.py +18 -4
- promnesia/__main__.py +104 -78
- promnesia/cannon.py +108 -107
- promnesia/common.py +107 -88
- promnesia/compare.py +33 -30
- promnesia/compat.py +10 -10
- promnesia/config.py +37 -34
- promnesia/database/common.py +4 -3
- promnesia/database/dump.py +13 -13
- promnesia/database/load.py +7 -7
- promnesia/extract.py +19 -17
- promnesia/logging.py +27 -15
- promnesia/misc/install_server.py +32 -27
- promnesia/server.py +106 -79
- promnesia/sources/auto.py +104 -77
- promnesia/sources/auto_logseq.py +6 -5
- promnesia/sources/auto_obsidian.py +2 -2
- promnesia/sources/browser.py +20 -10
- promnesia/sources/browser_legacy.py +65 -50
- promnesia/sources/demo.py +7 -8
- promnesia/sources/fbmessenger.py +3 -3
- promnesia/sources/filetypes.py +22 -16
- promnesia/sources/github.py +9 -8
- promnesia/sources/guess.py +6 -2
- promnesia/sources/hackernews.py +7 -9
- promnesia/sources/hpi.py +5 -3
- promnesia/sources/html.py +11 -7
- promnesia/sources/hypothesis.py +3 -2
- promnesia/sources/instapaper.py +3 -2
- promnesia/sources/markdown.py +22 -12
- promnesia/sources/org.py +36 -17
- promnesia/sources/plaintext.py +41 -39
- promnesia/sources/pocket.py +5 -3
- promnesia/sources/reddit.py +24 -26
- promnesia/sources/roamresearch.py +5 -2
- promnesia/sources/rss.py +6 -8
- promnesia/sources/shellcmd.py +21 -11
- promnesia/sources/signal.py +27 -26
- promnesia/sources/smscalls.py +2 -3
- promnesia/sources/stackexchange.py +5 -4
- promnesia/sources/takeout.py +37 -34
- promnesia/sources/takeout_legacy.py +29 -19
- promnesia/sources/telegram.py +18 -12
- promnesia/sources/telegram_legacy.py +22 -11
- promnesia/sources/twitter.py +7 -6
- promnesia/sources/vcs.py +11 -6
- promnesia/sources/viber.py +11 -10
- promnesia/sources/website.py +8 -7
- promnesia/sources/zulip.py +3 -2
- promnesia/sqlite.py +13 -7
- promnesia/tests/common.py +10 -5
- promnesia/tests/server_helper.py +13 -10
- promnesia/tests/sources/test_auto.py +2 -3
- promnesia/tests/sources/test_filetypes.py +11 -8
- promnesia/tests/sources/test_hypothesis.py +10 -6
- promnesia/tests/sources/test_org.py +9 -5
- promnesia/tests/sources/test_plaintext.py +9 -8
- promnesia/tests/sources/test_shellcmd.py +13 -13
- promnesia/tests/sources/test_takeout.py +3 -5
- promnesia/tests/test_cannon.py +256 -239
- promnesia/tests/test_cli.py +12 -8
- promnesia/tests/test_compare.py +17 -13
- promnesia/tests/test_config.py +7 -8
- promnesia/tests/test_db_dump.py +15 -15
- promnesia/tests/test_extract.py +17 -10
- promnesia/tests/test_indexer.py +24 -18
- promnesia/tests/test_server.py +12 -13
- promnesia/tests/test_traverse.py +0 -2
- promnesia/tests/utils.py +3 -7
- promnesia-1.4.20250909.dist-info/METADATA +66 -0
- promnesia-1.4.20250909.dist-info/RECORD +80 -0
- {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info}/WHEEL +1 -2
- promnesia/kjson.py +0 -121
- promnesia/sources/__init__.pyi +0 -0
- promnesia-1.2.20240810.dist-info/METADATA +0 -54
- promnesia-1.2.20240810.dist-info/RECORD +0 -83
- promnesia-1.2.20240810.dist-info/top_level.txt +0 -1
- {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info}/entry_points.txt +0 -0
- {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info/licenses}/LICENSE +0 -0
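Most of the churn in this release follows one modernization pattern: relative imports such as "from ..common import ..." become absolute ("from promnesia.common import ..."), and typing.List/Optional-style annotations become built-in generics and unions, enabled on older Python 3.x by "from __future__ import annotations". A minimal sketch of the new style (the signature is adapted from the browser_legacy diff below, not copied verbatim):

from __future__ import annotations  # makes list[...] and X | None legal in annotations

from pathlib import Path

# previously something like: def _index_dbs(dbs: List[Path], ...) -> Optional[Path]
def _index_dbs(dbs: list[Path], cachew_name: str) -> Path | None:
    ...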
promnesia/sources/browser_legacy.py
CHANGED
@@ -1,19 +1,19 @@
-from
+from __future__ import annotations
+
+import sqlite3
+from datetime import datetime, timezone
 from pathlib import Path
 from urllib.parse import unquote
-import sqlite3
-from typing import List, Set, Optional
 
-import
-
-from ..common import PathIsh, Results, Visit, Loc, logger, Second, is_sqlite_db
-from .. import config
+from promnesia import config
+from promnesia.common import Loc, PathIsh, Results, Second, Visit, is_sqlite_db, logger
 
 try:
     from cachew import cachew
 except ModuleNotFoundError as me:
     if me.name != 'cachew':
         raise me
+
     # this module is legacy anyway, so just make it defensive
     def cachew(*args, **kwargs):  # type: ignore[no-redef]
         return lambda f: f
@@ -21,7 +21,7 @@ except ModuleNotFoundError as me:
 
 def index(p: PathIsh) -> Results:
     pp = Path(p)
-    assert pp.exists(), pp
+    assert pp.exists(), pp  # just in case of broken symlinks
 
     # todo warn if filtered out too many?
     # todo wonder how quickly mimes can be computed?
@@ -30,31 +30,31 @@ def index(p: PathIsh) -> Results:
 
     assert len(dbs) > 0, pp
     logger.info('processing %d databases', len(dbs))
-    cname = str('_'.join(pp.parts[1:]))
+    cname = str('_'.join(pp.parts[1:]))  # meh
     yield from _index_dbs(dbs, cachew_name=cname)
 
 
-
-def _index_dbs(dbs: List[Path], cachew_name: str):
+def _index_dbs(dbs: list[Path], cachew_name: str):
     # TODO right... not ideal, need to think how to handle it properly...
     import sys
+
     sys.setrecursionlimit(5000)
 
     cache_dir = config.get().cache_dir
     cpath = None if cache_dir is None else cache_dir / cachew_name
-    emitted:
+    emitted: set = set()
     yield from _index_dbs_aux(cpath, dbs, emitted=emitted)
 
 
 # todo wow, stack traces are ridiculous here...
 # todo hmm, feels like it should be a class or something?
-@cachew(lambda cp, dbs, emitted: cp, depends_on=lambda cp, dbs, emitted: dbs)
-def _index_dbs_aux(cache_path:
+@cachew(lambda cp, dbs, emitted: cp, depends_on=lambda cp, dbs, emitted: dbs)  # , logger=logger) # noqa: ARG005
+def _index_dbs_aux(cache_path: Path | None, dbs: list[Path], emitted: set) -> Results:
     if len(dbs) == 0:
         return
 
     xs = dbs[:-1]
-    x
+    x = dbs[-1:]
 
     xs_res = _index_dbs_aux(cache_path, xs, emitted)
     xs_was_cached = False
@@ -65,36 +65,38 @@ def _index_dbs_aux(cache_path: Optional[Path], dbs: List[Path], emitted: Set) ->
             logger.debug('seems that %d first items were previously cached', len(xs))
         if xs_was_cached:
             key = str(r) if isinstance(r, Exception) else (r.url, r.dt)
-            assert key not in emitted, key
+            assert key not in emitted, key  # todo not sure if this assert is necessary?
             # hmm ok it might happen if we messed up with indexing individual db?
             # alternatively, could abuse it to avoid messing with 'emitted' in _index_db?
             emitted.add(key)
-        yield r
+        yield r  # todo not sure about exceptions?
 
     for db in x:
         yield from _index_db(db, emitted=emitted)
 
 
-def _index_db(db: Path, emitted:
-    logger.info('processing %s', db)
+def _index_db(db: Path, emitted: set):
+    logger.info('processing %s', db)  # debug level?
 
     # todo schema check (not so critical for cachew though)
     total = 0
-    new
-    loc = Loc.file(
+    new = 0
+    loc = Loc.file(
+        db
+    )  # todo possibly needs to be optimized -- moving from within the loop considerably speeds everything up
     with sqlite3.connect(f'file:{db}?immutable=1', uri=True) as c:
         browser = None
         for b in [Chrome, Firefox, FirefoxPhone, Safari]:
             try:
                 c.execute(f'SELECT * FROM {b.detector}')
-            except sqlite3.OperationalError:
+            except sqlite3.OperationalError:  # not sure if the right kind?
                 pass
             else:
                 browser = b
                 break
         assert browser is not None
 
-        proj
+        proj = ', '.join(c for c, _ in browser.schema.cols)
         query = browser.query.replace('chunk.', '')
 
         c.row_factory = sqlite3.Row
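A note on the connection string in _index_db above: immutable=1 in an SQLite URI tells SQLite the file cannot change, so it can read a history database the browser may still hold locked, skipping journal and locking checks. A standalone sketch (the path is hypothetical):

import sqlite3

# read-only, lock-free access; only safe while the file really is not being modified
conn = sqlite3.connect('file:/path/to/History.db?immutable=1', uri=True)
try:
    conn.row_factory = sqlite3.Row  # rows become addressable by column name, as in the diff
    for row in conn.execute('SELECT name FROM sqlite_master LIMIT 5'):
        print(row['name'])
finally:
    conn.close()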
@@ -121,17 +123,20 @@ Col = str
 ColType = str
 
 
-from
+from collections.abc import Sequence
+from typing import NamedTuple
+
 
 class Schema(NamedTuple):
-    cols: Sequence[
+    cols: Sequence[tuple[Col, ColType]]
     key: Sequence[str]
 
 
-SchemaCheck =
+SchemaCheck = tuple[str, str | Sequence[str]]  # todo Union: meh
 
 from dataclasses import dataclass
 
+
 # todo protocol?
 @dataclass
 class Extr:
@@ -147,14 +152,15 @@ class Extr:
 
 
 class Chrome(Extr):
-    detector='keyword_search_terms'
+    detector = 'keyword_search_terms'
+    # fmt: off
     schema_check=(
         'visits', [
             'visits', "id, url, visit_time, from_visit, transition, segment_id, visit_duration, incremented_omnibox_typed_score",
             'visits', "id, url, visit_time, from_visit, transition, segment_id, visit_duration"
         ]
     )
-    schema=Schema(cols=[
+    schema = Schema(cols=[
         ('U.url'            , 'TEXT'   ),
 
         # while these two are not very useful, might be good to have just in case for some debugging
@@ -168,18 +174,19 @@ class Chrome(Extr):
         ('V.visit_duration' , 'INTEGER NOT NULL'),
         # V.omnibox thing looks useless
     ], key=('url', 'visit_time', 'vid', 'urlid'))
-
+    # fmt: on
+
     query = 'FROM chunk.visits as V, chunk.urls as U WHERE V.url = U.id'
 
     @staticmethod
     def row2visit(row: sqlite3.Row, loc: Loc) -> Visit:
-        url
-        ts
+        url = row['url']
+        ts = row['visit_time']
         durs = row['visit_duration']
 
         dt = chrome_time_to_utc(int(ts))
-        url = unquote(url)
+        url = unquote(url)  # chrome urls are all quoted
         dd = int(durs)
-        dur:
+        dur: Second | None = None if dd == 0 else dd // 1_000_000
         return Visit(
             url=url,
             dt=dt,
@@ -192,12 +199,12 @@ class Chrome(Extr):
 # yep, tested it and looks like utc
 def chrome_time_to_utc(chrome_time: int) -> datetime:
     epoch = (chrome_time / 1_000_000) - 11644473600
-    return datetime.fromtimestamp(epoch,
+    return datetime.fromtimestamp(epoch, timezone.utc)
 
 
 def _row2visit_firefox(row: sqlite3.Row, loc: Loc) -> Visit:
     url = row['url']
-    ts
+    ts = float(row['visit_date'])
     # ok, looks like it's unix epoch
     # https://stackoverflow.com/a/19430099/706389
 
@@ -210,17 +217,19 @@ def _row2visit_firefox(row: sqlite3.Row, loc: Loc) -> Visit:
     else:
         # milliseconds
         ts /= 1_000
-    dt = datetime.fromtimestamp(ts,
-    url = unquote(url)
+    dt = datetime.fromtimestamp(ts, timezone.utc)
+    url = unquote(url)  # firefox urls are all quoted
     return Visit(
         url=url,
         dt=dt,
         locator=loc,
     )
 
+
 # https://web.archive.org/web/20201026130310/http://fileformats.archiveteam.org/wiki/History.db
 class Safari(Extr):
-    detector='history_tombstones'
+    detector = 'history_tombstones'
+    # fmt: off
     schema_check=(
         'history_visits', [
             'history_visits', "id, history_item, visit_time",
@@ -241,13 +250,14 @@ class Safari(Extr):
         # ('V.visit_duration' , 'INTEGER NOT NULL'),
         # V.omnibox thing looks useless
     ], key=('url', 'visit_time', 'vid', 'urlid'))
-
+    # fmt: on
+
     query = 'FROM chunk.history_visits as V, chunk.history_items as U WHERE V.history_item = U.id'
 
     @staticmethod
     def row2visit(row: sqlite3.Row, loc: Loc) -> Visit:
-        url
-        ts
-        dt = datetime.fromtimestamp(ts,
+        url = row['url']
+        ts = row['visit_time'] + 978307200  # https://stackoverflow.com/a/34546556/16645
+        dt = datetime.fromtimestamp(ts, timezone.utc)
 
         return Visit(
             url=url,
@@ -255,10 +265,12 @@ class Safari(Extr):
             locator=loc,
         )
 
+
 # https://web.archive.org/web/20190730231715/https://www.forensicswiki.org/wiki/Mozilla_Firefox_3_History_File_Format#moz_historyvisits
 class Firefox(Extr):
-    detector='moz_meta'
-    schema_check=('moz_historyvisits', "id, from_visit, place_id, visit_date, visit_type")
+    detector = 'moz_meta'
+    schema_check = ('moz_historyvisits', "id, from_visit, place_id, visit_date, visit_type")
+    # fmt: off
     schema=Schema(cols=[
         ('P.url' , 'TEXT'),
 
@@ -274,14 +286,16 @@ class Firefox(Extr):
         # needs to be defensive
         # ('V.session' , 'INTEGER'),
     ], key=('url', 'visit_date', 'vid', 'pid'))
-
+    # fmt: on
+
     query = 'FROM chunk.moz_historyvisits as V, chunk.moz_places as P WHERE V.place_id = P.id'
-    row2visit = _row2visit_firefox
+    row2visit = _row2visit_firefox  # type: ignore[assignment]
 
 
 class FirefoxPhone(Extr):
-    detector='remote_devices'
-    schema_check=('visits', "_id, history_guid, visit_type, date, is_local")
+    detector = 'remote_devices'
+    schema_check = ('visits', "_id, history_guid, visit_type, date, is_local")
+    # fmt: off
     schema=Schema(cols=[
         ('H.url' , 'TEXT NOT NULL' ),
 
@@ -293,6 +307,7 @@ class FirefoxPhone(Extr):
         ('V.date as visit_date', 'INTEGER NOT NULL'),
         # ('is_local' , 'INTEGER NOT NULL'),
     ], key=('url', 'date', 'vid', 'hid'))
-
+    # fmt: on
+
     query = 'FROM chunk.visits as V, chunk.history as H WHERE V.history_guid = H.guid'
-    row2visit = _row2visit_firefox
+    row2visit = _row2visit_firefox  # type: ignore[assignment]
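The epoch conversions touched above differ only in their reference points: Chrome stores microseconds since 1601-01-01 (hence the 11644473600-second offset), Safari stores seconds since 2001-01-01 (hence +978307200), and Firefox stores Unix-epoch values in micro- or milliseconds. A sketch using the constants from the diff (safari_time_to_utc is an illustrative name, not the module's):

from datetime import datetime, timezone

def chrome_time_to_utc(chrome_time: int) -> datetime:
    # microseconds since 1601-01-01 -> seconds since 1970-01-01
    epoch = (chrome_time / 1_000_000) - 11644473600
    return datetime.fromtimestamp(epoch, timezone.utc)

def safari_time_to_utc(visit_time: float) -> datetime:
    # seconds since 2001-01-01 -> seconds since 1970-01-01
    return datetime.fromtimestamp(visit_time + 978307200, timezone.utc)

assert safari_time_to_utc(0.0) == datetime(2001, 1, 1, tzinfo=timezone.utc)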
promnesia/sources/demo.py
CHANGED
@@ -3,11 +3,11 @@ A dummy source, used for testing
 Generates a sequence of fake evenly separated visits
 '''
 
-from
-from typing import Union
+from __future__ import annotations
 
-from
+from datetime import datetime, timedelta
 
+from promnesia.common import Loc, Results, Visit
 
 IsoFormatDt = str
 Seconds = int
@@ -16,12 +16,11 @@ Seconds = int
 # TODO allow passing isoformat string as base_dt?
 # and maybe something similar as delta? start with seconds maybe
 def index(
-
-
-
-
+    count: int = 100,
+    *,
+    base_dt: datetime | IsoFormatDt = datetime.min + timedelta(days=5000),
+    delta: timedelta | Seconds = timedelta(hours=1),
 ) -> Results:
-
     base_dt_ = base_dt if isinstance(base_dt, datetime) else datetime.fromisoformat(base_dt)
     delta_ = delta if isinstance(delta, timedelta) else timedelta(seconds=delta)
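The reworked index signature makes base_dt and delta keyword-only; per the isinstance fallbacks above, base_dt also accepts an ISO-format string and delta an integer number of seconds. A usage sketch:

from promnesia.sources import demo

# three fake visits starting at 2020-01-01, spaced 30 seconds apart
for visit in demo.index(3, base_dt='2020-01-01', delta=30):
    print(visit.dt, visit.url)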
promnesia/sources/fbmessenger.py
CHANGED
@@ -2,12 +2,13 @@
 Uses [[https://github.com/karlicoss/HPI][HPI]] for the messages data.
 '''
 
-from
+from promnesia.common import Loc, Results, Visit, extract_urls
 
 
 def index() -> Results:
-    from . import hpi
+    from . import hpi  # noqa: F401,I001
     from my.fbmessenger import messages
+
     for m in messages():
         if isinstance(m, Exception):
             yield m
@@ -32,4 +33,3 @@ def index() -> Results:
             context=m.text,
             locator=loc,
         )
-
promnesia/sources/filetypes.py
CHANGED
@@ -1,38 +1,42 @@
-
+from __future__ import annotations
+
+from collections.abc import Callable, Iterable, Sequence
 from functools import lru_cache
 from pathlib import Path
-from typing import
+from typing import NamedTuple
 
 from ..common import Results, Url
 
-
 # TODO doesn't really belong here...
 Ctx = Sequence[str]
 
+
 class EUrl(NamedTuple):
     url: Url
-    ctx: Ctx
+    ctx: Ctx  # TODO ctx here is more like a Loc
+
+
 ###
 
 
 # keys are mime types + extensions
-Ex = Callable[[Path],
+Ex = Callable[[Path], Results | Iterable[EUrl]]
 # None means unhandled
-TYPE2IDX:
+TYPE2IDX: dict[str, Ex | None] = {}
 # NOTE: there are some types in auto.py at the moment... it's a bit messy
 
 
 # TYPE2IDX only contains the 'prefixes', to speed up the lookup we are using cache..
 @lru_cache(None)
-def type2idx(t: str) ->
+def type2idx(t: str) -> Ex | None:
     if len(t) == 0:
-        return None
+        return None  # just in case?
     # first try exact match
-    e = TYPE2IDX.get(t
+    e = TYPE2IDX.get(t)
     if e is not None:
         return e
     t = t.strip('.')
-    e = TYPE2IDX.get(t
+    e = TYPE2IDX.get(t)
     if e is not None:
         return e
     # otherwise, try prefixes?
@@ -41,6 +45,7 @@ def type2idx(t: str) -> Optional[Ex]:
         return v
     return None
 
+
 # for now source code just indexed with grep, not sure if it's good enough?
 # if not, some fanceir library could be used...
 # e.g. https://github.com/karlicoss/promnesia/pull/152/commits/c2f00eb4ee4018b02c9bf3966a036db69a43373d
@@ -81,7 +86,7 @@ CODE = {
 
     '.ts',  # most likely typescript.. otherwise determined as text/vnd.trolltech.linguist mime
     '.js',
-}
+}  # fmt: skip
 # TODO discover more extensions with mimetypes library?
 
 
@@ -97,9 +102,10 @@ audio/
 video/
 '''
 
-handle_later = lambda *
+handle_later = lambda *_args, **_kwargs: ()
 
-
+
+def ignore(*_args, **_kwargs):
     # TODO log (once?)
     yield from ()
 
@@ -121,13 +127,14 @@ TYPE2IDX.update({
     'font/woff': ignore,
     'text/x-Algol68': ignore,  # ugh some license file had this?? maybe always index text/ as text?
    'text/x-bytecode.python': ignore,  # todo ignore all x-bytecode?
+    'text/calendar': ignore,
 
     # TODO not sure what to do about these..
     'application/octet-stream': handle_later,
     'application/zip'         : handle_later,
     'application/x-tar'       : handle_later,
     'application/gzip'        : handle_later,
-})
+})  # fmt: skip
 
 
 # TODO use some existing file for initial gitignore..
@@ -146,5 +153,4 @@ IGNORE = [
     # TODO not sure about these:
     '.gitignore',
     '.babelrc',
-]
-
+]  # fmt: skip
promnesia/sources/github.py
CHANGED
@@ -2,15 +2,14 @@
 Uses [[https://github.com/karlicoss/HPI][HPI]] github module
 '''
 
-
-
-from typing import Optional, Set
+from __future__ import annotations
 
-
+# Note: requires the 'mistletoe' module if you enable render_markdown
+from promnesia.common import Loc, Results, Visit, iter_urls, logger
 
 
 def index(*, render_markdown: bool = False) -> Results:
-    from . import hpi
+    from . import hpi  # noqa: F401,I001
     from my.github.all import events
 
     if render_markdown:
@@ -18,7 +17,9 @@ def index(*, render_markdown: bool = False) -> Results:
             from .markdown import TextParser, extract_from_text
         except ImportError as import_err:
             logger.exception(import_err)
-            logger.critical(
+            logger.critical(
+                "Could not import markdown module to render github body markdown. Try 'python3 -m pip install mistletoe'"
+            )
             render_markdown = False
 
     for e in events():
@@ -29,7 +30,7 @@ def index(*, render_markdown: bool = False) -> Results:
             continue
 
         # if enabled, convert the (markdown) body to HTML
-        context:
+        context: str | None = e.body
         if e.body is not None and render_markdown:
             context = TextParser(e.body)._doc_ashtml()  # type: ignore[possibly-undefined]
 
@@ -59,7 +60,7 @@ def index(*, render_markdown: bool = False) -> Results:
         #
         # Note: this set gets reset every event, is here to
         # prevent duplicates between URLExtract and the markdown parser
-        emitted:
+        emitted: set[str] = set()
         for url in iter_urls(e.body):
             if url in emitted:
                 continue
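render_markdown stays opt-in and needs mistletoe, as the critical message above spells out. In a promnesia config this would look roughly like the following sketch (untested; assumes HPI's my.github is configured):

from promnesia.common import Source
from promnesia.sources import github

SOURCES = [
    # event bodies get converted from markdown to HTML for the sidebar context
    Source(github.index, render_markdown=True),
]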
promnesia/sources/guess.py
CHANGED
@@ -1,6 +1,7 @@
 # TODO eh. confusing how guess and auto are different...
 # maybe merge them later?
-from
+from collections.abc import Iterable
+from typing import Any
 
 from ..common import Extraction, PathIsh
 
@@ -21,14 +22,17 @@ def index(path: PathIsh, *args, **kwargs) -> Iterable[Extraction]:
     ps = str(path)
     # TODO better url detection
 
-    index_: Any
+    index_: Any  # meh
     if is_git_repo(ps):
         from . import vcs
+
         index_ = vcs.index
     elif is_website(ps):
         from . import website
+
         index_ = website.index
     else:
         from . import auto
+
         index_ = auto.index
     yield from index_(path, *args, **kwargs)
promnesia/sources/hackernews.py
CHANGED
@@ -4,11 +4,11 @@ Uses [[https://github.com/karlicoss/HPI][HPI]] dogsheep module to import HackerN
 
 import textwrap
 
-from promnesia.common import
+from promnesia.common import Loc, Results, Visit
 
 
 def index() -> Results:
-    from . import hpi
+    from . import hpi  # noqa: F401,I001
     from my.hackernews import dogsheep
 
     for item in dogsheep.items():
@@ -21,9 +21,7 @@ def index() -> Results:
             title = item.title
         elif item.text_html:
             title = item.text_html
-        title = textwrap.shorten(
-            title, width=79, placeholder="…",
-            break_long_words=True)
+        title = textwrap.shorten(title, width=79, placeholder="…", break_long_words=True)
         # The locator is always the HN story. If the story is a link (as
         # opposed to a text post), we insert a visit such that the link
         # will point back to the corresponding HN story.
@@ -33,8 +31,8 @@ def index() -> Results:
             urls.append(item.url)
         for url in urls:
             yield Visit(
-
-
-
-
+                url=url,
+                dt=item.created,
+                locator=loc,
+                context=title,
             )
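The textwrap.shorten call that was collapsed onto one line keeps titles sidebar-sized: it normalizes whitespace and truncates at a word boundary, appending the placeholder. For instance:

import textwrap

title = 'Show HN: a very long   and rambling story title that goes on and on'
short = textwrap.shorten(title, width=40, placeholder='…', break_long_words=True)
# runs of whitespace are collapsed and the result ends in '…' if it was truncated
print(short)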
promnesia/sources/hpi.py
CHANGED
@@ -2,10 +2,12 @@
 Just a helper for a more humane error message when importing my.* dependencies
 '''
 
-from
+from promnesia.common import logger
 
 try:
-    import my
+    import my  # noqa: F401
 except ImportError as e:
     logger.exception(e)
-    logger.critical(
+    logger.critical(
+        "Failed during 'import my'. You probably need to install & configure HPI package first (see 'https://github.com/karlicoss/HPI/blob/master/doc/SETUP.org')"
+    )
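Every HPI-backed source in this diff (fbmessenger, github, hackernews, hypothesis, instapaper) imports this module purely for its side effect: if 'import my' fails, the user gets the setup hint above before the real import error surfaces. Schematically (assumes promnesia and a configured HPI):

from promnesia.sources import hpi  # noqa: F401  -- logs the setup hint if HPI is missing
import my.hypothesis as hyp        # would raise ImportError without a working HPI install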
promnesia/sources/html.py
CHANGED
@@ -2,19 +2,21 @@
 Extracts links from HTML files
 '''
 
-from
-from typing import Iterator, Tuple
+from __future__ import annotations
 
-from
+from collections.abc import Iterator
+from pathlib import Path
 
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, Tag
 
+from promnesia.common import Loc, PathIsh, Results, Visit, file_mtime
 
-# TODO present error summary in the very end; import errors -- makes sense to show 
+# TODO present error summary in the very end; import errors -- makes sense to show
 # TODO on some exceptions, request a fallback to text?
 
 
-Url =
+Url = tuple[str, str]
+
 
 def extract_urls_from_html(s: str) -> Iterator[Url]:
     """
@@ -23,11 +25,13 @@ def extract_urls_from_html(s: str) -> Iterator[Url]:
     """
     soup = BeautifulSoup(s, 'lxml')
     for a in soup.find_all('a'):
+        assert isinstance(a, Tag), a  # make mypy happy
         href = a.attrs.get('href')
         if href is None or ('://' not in href):
             # second condition means relative link
             continue
-
+        assert isinstance(href, str), href  # make mypy happy
+
         text: str = a.text
         yield (href, text)
 
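A quick check of the behaviour encoded above (needs bs4 plus the lxml parser): absolute links come back as (href, text) pairs, while anything without '://' is treated as relative and skipped.

from promnesia.sources.html import extract_urls_from_html

html = '<a href="https://example.com">example</a> <a href="/docs">relative, skipped</a>'
assert list(extract_urls_from_html(html)) == [('https://example.com', 'example')]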
promnesia/sources/hypothesis.py
CHANGED
@@ -1,11 +1,12 @@
 """
 Uses HPI [[https://github.com/karlicoss/HPI/blob/master/doc/MODULES.org#myhypothesis][hypothesis]] module
 """
-
+
+from promnesia.common import Loc, Results, Visit, extract_urls, join_tags
 
 
 def index() -> Results:
-    from . import hpi
+    from . import hpi  # noqa: F401,I001
     import my.hypothesis as hyp
 
     for h in hyp.highlights():
promnesia/sources/instapaper.py
CHANGED
@@ -1,11 +1,12 @@
 '''
 Uses HPI [[https://github.com/karlicoss/HPI/blob/master/doc/MODULES.org#myinstapaper][instapaper]] module
 '''
-
+
+from promnesia.common import Loc, Results, Visit
 
 
 def index() -> Results:
-    from . import hpi
+    from . import hpi  # noqa: F401,I001
     import my.instapaper as ip
 
     for p in ip.pages():
|