promnesia 1.1.20230129__py3-none-any.whl → 1.2.20240810__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promnesia/__main__.py +58 -50
- promnesia/cannon.py +4 -4
- promnesia/common.py +57 -38
- promnesia/compare.py +3 -2
- promnesia/compat.py +6 -65
- promnesia/config.py +4 -2
- promnesia/database/common.py +66 -0
- promnesia/database/dump.py +187 -0
- promnesia/{read_db.py → database/load.py} +10 -11
- promnesia/extract.py +1 -0
- promnesia/kjson.py +1 -1
- promnesia/logging.py +14 -14
- promnesia/misc/__init__.pyi +0 -0
- promnesia/misc/config_example.py +1 -2
- promnesia/misc/install_server.py +5 -4
- promnesia/server.py +24 -24
- promnesia/sources/__init__.pyi +0 -0
- promnesia/sources/auto.py +12 -7
- promnesia/sources/browser.py +80 -293
- promnesia/sources/browser_legacy.py +298 -0
- promnesia/sources/demo.py +18 -2
- promnesia/sources/filetypes.py +8 -0
- promnesia/sources/github.py +2 -2
- promnesia/sources/hackernews.py +1 -2
- promnesia/sources/hypothesis.py +1 -1
- promnesia/sources/markdown.py +15 -15
- promnesia/sources/org.py +7 -3
- promnesia/sources/plaintext.py +3 -1
- promnesia/sources/reddit.py +2 -2
- promnesia/sources/rss.py +5 -1
- promnesia/sources/shellcmd.py +6 -2
- promnesia/sources/signal.py +29 -20
- promnesia/sources/smscalls.py +8 -1
- promnesia/sources/stackexchange.py +2 -2
- promnesia/sources/takeout.py +132 -12
- promnesia/sources/takeout_legacy.py +10 -2
- promnesia/sources/telegram.py +79 -123
- promnesia/sources/telegram_legacy.py +117 -0
- promnesia/sources/vcs.py +1 -1
- promnesia/sources/viber.py +6 -15
- promnesia/sources/website.py +1 -1
- promnesia/sqlite.py +42 -0
- promnesia/tests/__init__.py +0 -0
- promnesia/tests/common.py +137 -0
- promnesia/tests/server_helper.py +64 -0
- promnesia/tests/sources/__init__.py +0 -0
- promnesia/tests/sources/test_auto.py +66 -0
- promnesia/tests/sources/test_filetypes.py +42 -0
- promnesia/tests/sources/test_hypothesis.py +39 -0
- promnesia/tests/sources/test_org.py +65 -0
- promnesia/tests/sources/test_plaintext.py +26 -0
- promnesia/tests/sources/test_shellcmd.py +22 -0
- promnesia/tests/sources/test_takeout.py +58 -0
- promnesia/tests/test_cannon.py +325 -0
- promnesia/tests/test_cli.py +42 -0
- promnesia/tests/test_compare.py +30 -0
- promnesia/tests/test_config.py +290 -0
- promnesia/tests/test_db_dump.py +223 -0
- promnesia/tests/test_extract.py +61 -0
- promnesia/tests/test_extract_urls.py +43 -0
- promnesia/tests/test_indexer.py +245 -0
- promnesia/tests/test_server.py +292 -0
- promnesia/tests/test_traverse.py +41 -0
- promnesia/tests/utils.py +35 -0
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/METADATA +14 -19
- promnesia-1.2.20240810.dist-info/RECORD +83 -0
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/WHEEL +1 -1
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/entry_points.txt +0 -1
- promnesia/dump.py +0 -105
- promnesia-1.1.20230129.dist-info/RECORD +0 -55
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/LICENSE +0 -0
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/top_level.txt +0 -0
promnesia/sources/browser_legacy.py ADDED
@@ -0,0 +1,298 @@
+from datetime import datetime
+from pathlib import Path
+from urllib.parse import unquote
+import sqlite3
+from typing import List, Set, Optional
+
+import pytz
+
+from ..common import PathIsh, Results, Visit, Loc, logger, Second, is_sqlite_db
+from .. import config
+
+try:
+    from cachew import cachew
+except ModuleNotFoundError as me:
+    if me.name != 'cachew':
+        raise me
+    # this module is legacy anyway, so just make it defensive
+    def cachew(*args, **kwargs):  # type: ignore[no-redef]
+        return lambda f: f
+
+
+def index(p: PathIsh) -> Results:
+    pp = Path(p)
+    assert pp.exists(), pp  # just in case of broken symlinks
+
+    # todo warn if filtered out too many?
+    # todo wonder how quickly mimes can be computed?
+    # todo ugh, dunno, maybe this really belongs to hpi?? need get_files etc...
+    dbs = [p for p in sorted(pp.rglob('*')) if is_sqlite_db(p)]
+
+    assert len(dbs) > 0, pp
+    logger.info('processing %d databases', len(dbs))
+    cname = str('_'.join(pp.parts[1:]))  # meh
+    yield from _index_dbs(dbs, cachew_name=cname)
+
+
+
+def _index_dbs(dbs: List[Path], cachew_name: str):
+    # TODO right... not ideal, need to think how to handle it properly...
+    import sys
+    sys.setrecursionlimit(5000)
+
+    cache_dir = config.get().cache_dir
+    cpath = None if cache_dir is None else cache_dir / cachew_name
+    emitted: Set = set()
+    yield from _index_dbs_aux(cpath, dbs, emitted=emitted)
+
+
+# todo wow, stack traces are ridiculous here...
+# todo hmm, feels like it should be a class or something?
+@cachew(lambda cp, dbs, emitted: cp, depends_on=lambda cp, dbs, emitted: dbs)  # , logger=logger)
+def _index_dbs_aux(cache_path: Optional[Path], dbs: List[Path], emitted: Set) -> Results:
+    if len(dbs) == 0:
+        return
+
+    xs = dbs[:-1]
+    x = dbs[-1:]
+
+    xs_res = _index_dbs_aux(cache_path, xs, emitted)
+    xs_was_cached = False
+    for r in xs_res:
+        # if it was cached, emitted would be empty
+        if len(emitted) == 0:
+            xs_was_cached = True
+            logger.debug('seems that %d first items were previously cached', len(xs))
+        if xs_was_cached:
+            key = str(r) if isinstance(r, Exception) else (r.url, r.dt)
+            assert key not in emitted, key  # todo not sure if this assert is necessary?
+            # hmm ok it might happen if we messed up with indexing individual db?
+            # alternatively, could abuse it to avoid messing with 'emitted' in _index_db?
+            emitted.add(key)
+        yield r  # todo not sure about exceptions?
+
+    for db in x:
+        yield from _index_db(db, emitted=emitted)
+
+
+def _index_db(db: Path, emitted: Set):
+    logger.info('processing %s', db)  # debug level?
+
+    # todo schema check (not so critical for cachew though)
+    total = 0
+    new = 0
+    loc = Loc.file(db)  # todo possibly needs to be optimized -- moving from within the loop considerably speeds everything up
+    with sqlite3.connect(f'file:{db}?immutable=1', uri=True) as c:
+        browser = None
+        for b in [Chrome, Firefox, FirefoxPhone, Safari]:
+            try:
+                c.execute(f'SELECT * FROM {b.detector}')
+            except sqlite3.OperationalError:  # not sure if the right kind?
+                pass
+            else:
+                browser = b
+                break
+        assert browser is not None
+
+        proj = ', '.join(c for c, _ in browser.schema.cols)
+        query = browser.query.replace('chunk.', '')
+
+        c.row_factory = sqlite3.Row
+        for r in c.execute(f'select {proj} {query}'):
+            v = browser.row2visit(r, loc)
+            total += 1
+
+            key = (v.url, v.dt)
+            # todo how to keep keys compatible?
+            if key in emitted:
+                continue
+            yield v
+            emitted.add(key)
+            new += 1
+
+        # eh, ok, almost 2x faster if I don't construct Visit first
+        # maybe it's Loc.file that's too slow?
+        # yeah, seems like it, 4.1 s after computing it only once
+
+    logger.info('%s: %d/%d new visits', db, new, total)
+
+
+Col = str
+ColType = str
+
+
+from typing import Any, NamedTuple, Tuple, Union, Sequence, Optional
+
+class Schema(NamedTuple):
+    cols: Sequence[Tuple[Col, ColType]]
+    key: Sequence[str]
+
+
+SchemaCheck = Tuple[str, Union[str, Sequence[str]]]  # todo Union: meh
+
+from dataclasses import dataclass
+
+# todo protocol?
+@dataclass
+class Extr:
+    detector: str
+    schema_check: SchemaCheck
+    schema: Schema
+    query: str
+
+    # todo calllable?
+    @staticmethod
+    def row2visit(row: sqlite3.Row, loc: Loc) -> Visit:
+        raise NotImplementedError
+
+
+class Chrome(Extr):
+    detector='keyword_search_terms'
+    schema_check=(
+        'visits', [
+            'visits', "id, url, visit_time, from_visit, transition, segment_id, visit_duration, incremented_omnibox_typed_score",
+            'visits', "id, url, visit_time, from_visit, transition, segment_id, visit_duration"
+        ]
+    )
+    schema=Schema(cols=[
+        ('U.url'            , 'TEXT'   ),
+
+        # while these two are not very useful, might be good to have just in case for some debugging
+        ('U.id AS urlid'    , 'INTEGER'),
+        ('V.id AS vid'      , 'INTEGER'),
+
+        ('V.visit_time'     , 'INTEGER NOT NULL'),
+        ('V.from_visit'     , 'INTEGER'         ),
+        ('V.transition'     , 'INTEGER NOT NULL'),
+        # V.segment_id looks useless
+        ('V.visit_duration' , 'INTEGER NOT NULL'),
+        # V.omnibox thing looks useless
+    ], key=('url', 'visit_time', 'vid', 'urlid'))
+    query='FROM chunk.visits as V, chunk.urls as U WHERE V.url = U.id'
+
+    @staticmethod
+    def row2visit(row: sqlite3.Row, loc: Loc) -> Visit:
+        url = row['url']
+        ts = row['visit_time']
+        durs = row['visit_duration']
+
+        dt = chrome_time_to_utc(int(ts))
+        url = unquote(url)  # chrome urls are all quoted
+        dd = int(durs)
+        dur: Optional[Second] = None if dd == 0 else dd // 1_000_000
+        return Visit(
+            url=url,
+            dt=dt,
+            locator=loc,
+            duration=dur,
+        )
+
+
+# should be utc? https://stackoverflow.com/a/26226771/706389
+# yep, tested it and looks like utc
+def chrome_time_to_utc(chrome_time: int) -> datetime:
+    epoch = (chrome_time / 1_000_000) - 11644473600
+    return datetime.fromtimestamp(epoch, pytz.utc)
+
+
+def _row2visit_firefox(row: sqlite3.Row, loc: Loc) -> Visit:
+    url = row['url']
+    ts = float(row['visit_date'])
+    # ok, looks like it's unix epoch
+    # https://stackoverflow.com/a/19430099/706389
+
+    # NOTE: ugh. on Fenix (experimental Android version) it uses milliseconds, not nanos...
+    # about year 2001... if someone has browser history exports before that -- please let me know, I'm impressed
+    threshold = 1000000000
+    if ts > threshold * 1_000_000:
+        # presumably it's in microseconds
+        ts /= 1_000_000
+    else:
+        # milliseconds
+        ts /= 1_000
+    dt = datetime.fromtimestamp(ts, pytz.utc)
+    url = unquote(url)  # firefox urls are all quoted
+    return Visit(
+        url=url,
+        dt=dt,
+        locator=loc,
+    )
+
+# https://web.archive.org/web/20201026130310/http://fileformats.archiveteam.org/wiki/History.db
+class Safari(Extr):
+    detector='history_tombstones'
+    schema_check=(
+        'history_visits', [
+            'history_visits', "id, history_item, visit_time",
+            'history_items', "id, url"
+        ]
+    )
+    schema=Schema(cols=[
+        ('U.url'            , 'TEXT'   ),
+
+        # while these two are not very useful, might be good to have just in case for some debugging
+        ('U.id AS urlid'    , 'INTEGER'),
+        ('V.id AS vid'      , 'INTEGER'),
+
+        ('V.visit_time'     , 'INTEGER NOT NULL'),
+        # ('V.from_visit'     , 'INTEGER'         ),
+        # ('V.transition'     , 'INTEGER NOT NULL'),
+        # V.segment_id looks useless
+        # ('V.visit_duration' , 'INTEGER NOT NULL'),
+        # V.omnibox thing looks useless
+    ], key=('url', 'visit_time', 'vid', 'urlid'))
+    query='FROM chunk.history_visits as V, chunk.history_items as U WHERE V.history_item = U.id'
+
+    @staticmethod
+    def row2visit(row: sqlite3.Row, loc: Loc) -> Visit:
+        url = row['url']
+        ts = row['visit_time'] + 978307200  # https://stackoverflow.com/a/34546556/16645
+        dt = datetime.fromtimestamp(ts, pytz.utc)
+
+        return Visit(
+            url=url,
+            dt=dt,
+            locator=loc,
+        )
+
+# https://web.archive.org/web/20190730231715/https://www.forensicswiki.org/wiki/Mozilla_Firefox_3_History_File_Format#moz_historyvisits
+class Firefox(Extr):
+    detector='moz_meta'
+    schema_check=('moz_historyvisits', "id, from_visit, place_id, visit_date, visit_type")
+    schema=Schema(cols=[
+        ('P.url'       , 'TEXT'),
+
+        ('P.id AS pid' , 'INTEGER'),
+        ('V.id AS vid' , 'INTEGER'),
+
+        ('V.from_visit', 'INTEGER'),
+        ('V.visit_date', 'INTEGER'),
+        ('V.visit_type', 'INTEGER'),
+
+        # not sure what session is form but could be useful?..
+        # NOTE(20210410): for now, commented it out since some older databases from phone have this column commented?
+        # needs to be defensive
+        # ('V.session'   , 'INTEGER'),
+    ], key=('url', 'visit_date', 'vid', 'pid'))
+    query='FROM chunk.moz_historyvisits as V, chunk.moz_places as P WHERE V.place_id = P.id'
+
+    row2visit = _row2visit_firefox
+
+
+class FirefoxPhone(Extr):
+    detector='remote_devices'
+    schema_check=('visits', "_id, history_guid, visit_type, date, is_local")
+    schema=Schema(cols=[
+        ('H.url'               , 'TEXT NOT NULL'   ),
+
+        ('H.guid AS guid'      , 'TEXT'            ),
+        ('H._id AS hid'        , 'INTEGER'         ),
+        ('V._id AS vid'        , 'INTEGER'         ),
+
+        ('V.visit_type'        , 'INTEGER NOT NULL'),
+        ('V.date as visit_date', 'INTEGER NOT NULL'),
+        # ('is_local'            , 'INTEGER NOT NULL'),
+    ], key=('url', 'date', 'vid', 'hid'))
+    query='FROM chunk.visits as V, chunk.history as H WHERE V.history_guid = H.guid'
+
+    row2visit = _row2visit_firefox
promnesia/sources/demo.py CHANGED
@@ -4,17 +4,33 @@ Generates a sequence of fake evenly separated visits
 '''

 from datetime import datetime, timedelta
+from typing import Union

 from ..common import Results, Visit, Loc


-
+IsoFormatDt = str
+Seconds = int
+
+
+# TODO allow passing isoformat string as base_dt?
+# and maybe something similar as delta? start with seconds maybe
+def index(
+    count: int=100,
+    *,
+    base_dt: Union[datetime, IsoFormatDt] = datetime.min + timedelta(days=5000),
+    delta: Union[timedelta, Seconds] = timedelta(hours=1),
+) -> Results:
+
+    base_dt_ = base_dt if isinstance(base_dt, datetime) else datetime.fromisoformat(base_dt)
+    delta_ = delta if isinstance(delta, timedelta) else timedelta(seconds=delta)
+
     # todo with some errors too?
     # todo use data generation library suggested for HPI?
     for i in range(count):
         yield Visit(
             url=f'https://demo.com/page{i}.html',
-            dt=
+            dt=base_dt_ + delta_ * i,
             locator=Loc.make('demo'),
         )
         # todo add context?
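
The reworked demo.index keeps the old zero-argument call working, while the new keyword-only parameters accept an ISO string for base_dt and plain seconds for delta. A quick sketch of the call shapes this enables (values arbitrary):

    from promnesia.sources import demo

    # old style still works: 100 visits, one hour apart
    visits = list(demo.index())

    # new style: ISO-format string for base_dt, integer seconds for delta
    for v in demo.index(3, base_dt='2023-01-01T00:00:00', delta=30):
        print(v.dt, v.url)  # 00:00:00, 00:00:30, 00:01:00 on 2023-01-01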
promnesia/sources/filetypes.py CHANGED
@@ -66,6 +66,8 @@ CODE = {
     'text/x-lisp',
     'text/vnd.graphviz',
     'text/x-diff',  # patch files
+    'text/x-php',
+    'text/x-lilypond',

     # these didn't have a mime type, or were mistyped?
     'css',
@@ -114,6 +116,12 @@ TYPE2IDX.update({
     '.vcf'          : ignore,
     'message/rfc822': ignore,  # ??

+    # todo ignore all fonts?
+    'font/woff2': ignore,
+    'font/woff': ignore,
+    'text/x-Algol68': ignore,  # ugh some license file had this?? maybe always index text/ as text?
+    'text/x-bytecode.python': ignore,  # todo ignore all x-bytecode?
+
     # TODO not sure what to do about these..
     'application/octet-stream': handle_later,
     'application/zip'         : handle_later,
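
For context, labels like 'text/x-php' and 'font/woff2' are MIME types produced by content detection rather than taken from the file extension; one way to see what a given file resolves to is python-magic (assumed here for illustration, the exact detection stack promnesia uses may differ):

    import magic  # python-magic, bindings for libmagic

    mime = magic.from_file('index.php', mime=True)
    print(mime)  # e.g. 'text/x-php', which is now in CODE and therefore indexed as text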
promnesia/sources/github.py CHANGED
@@ -31,7 +31,7 @@ def index(*, render_markdown: bool = False) -> Results:
         # if enabled, convert the (markdown) body to HTML
         context: Optional[str] = e.body
         if e.body is not None and render_markdown:
-            context = TextParser(e.body)._doc_ashtml()
+            context = TextParser(e.body)._doc_ashtml()  # type: ignore[possibly-undefined]

         # locator should link back to this event
         loc = Loc.make(title=e.summary, href=e.link)
@@ -74,7 +74,7 @@ def index(*, render_markdown: bool = False) -> Results:
         # extract from markdown links like [link text](https://...)
         # incase URLExtract missed any somehow
         if render_markdown:
-            for res in extract_from_text(e.body):
+            for res in extract_from_text(e.body):  # type: ignore[possibly-undefined]
                 if isinstance(res, Exception):
                     yield res
                     continue
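
The added ignores target mypy's optional possibly-undefined error code, which flags a name bound on only one branch even when a matching condition guards its use; a minimal illustration of the pattern (unrelated to promnesia's actual code):

    def render(body: str, render_markdown: bool) -> str:
        if render_markdown:
            prefix = '<p>'  # bound only on this branch
        if render_markdown:
            # flagged under mypy --enable-error-code possibly-undefined,
            # even though the two conditions are logically linked
            return prefix + body
        return body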
promnesia/sources/hackernews.py CHANGED
promnesia/sources/hypothesis.py CHANGED
promnesia/sources/markdown.py CHANGED
@@ -1,13 +1,13 @@
 from pathlib import Path
 from typing import Iterator, NamedTuple, Optional

-from ..common import
+from ..common import Extraction, Url, PathIsh, Res, Visit, Loc, file_mtime, logger


-import mistletoe
-from mistletoe.span_token import AutoLink, Link
-import mistletoe.block_token as BT
-from mistletoe.html_renderer import HTMLRenderer
+import mistletoe  # type: ignore
+from mistletoe.span_token import AutoLink, Link  # type: ignore
+import mistletoe.block_token as BT  # type: ignore
+from mistletoe.html_renderer import HTMLRenderer  # type: ignore


 renderer = HTMLRenderer()
@@ -42,7 +42,7 @@ HTML_MARKER = '!html '
 def _ashtml(block) -> str:
     res = renderer.render(block)
     if res.startswith('<p>') and res.endswith('</p>'):
-        res = res[3
+        res = res[3:-4]  # meh, but for now fine
     return res


@@ -62,7 +62,6 @@ class Parser:
         context = None if last_block is None else HTML_MARKER + _ashtml(last_block)
         yield Parsed(url=url, context=context)

-
     def _walk(self, cur, last_block) -> Iterator[Result]:
         if isinstance(cur, block_tokens):
             last_block = cur
@@ -73,12 +72,14 @@ class Parser:
             logger.exception(e)
             yield e

-
+        # keeping getattr for compatibility in older versions of mistletoe, it was optional
+        children = getattr(cur, 'children', None)
+        if children is None:
+            return
         for c in children:
             yield from self._walk(c, last_block=last_block)

-
-    def walk(self):
+    def walk(self) -> Iterator[Result]:
         yield from self._walk(self.doc, last_block=None)


@@ -94,7 +95,7 @@ def extract_from_file(fname: PathIsh) -> Iterator[Extraction]:
         yield Visit(
             url=r.url,
             dt=fallback_dt,
-            locator=Loc.file(fname),
+            locator=Loc.file(fname),  # TODO line number
             context=r.context,
         )

@@ -105,9 +106,9 @@ class TextParser(Parser):
     Instead of chunking blocks like for files, this returns the entire
     message rendered as the context
     '''
-    def __init__(self, text: str):
-        self.doc = mistletoe.Document(text)

+    def __init__(self, text: str) -> None:
+        self.doc = mistletoe.Document(text)

     def _doc_ashtml(self):
         '''
@@ -117,8 +118,7 @@ class TextParser(Parser):
         self._html = HTML_MARKER + _ashtml(self.doc)
         return self._html

-
-    def _extract(self, cur, last_block = None) -> Iterator[Parsed]:
+    def _extract(self, cur, last_block=None) -> Iterator[Parsed]:
         if not isinstance(cur, (AutoLink, Link)):
             return

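
The getattr guard makes the tree walk robust to mistletoe tokens that lack a children attribute (leaf tokens, at least in older mistletoe versions); the same defensive recursion in isolation, with a toy node type instead of mistletoe:

    from typing import Any, Iterator

    def walk(tok: Any) -> Iterator[Any]:
        yield tok
        # a leaf token may simply not define `children`; treat that as empty
        for child in getattr(tok, 'children', None) or ():
            yield from walk(child)

    class Node:
        def __init__(self, *children: 'Node') -> None:
            self.children = children

    assert sum(1 for _ in walk(Node(Node(), Node(Node())))) == 4  # root + 2 children + 1 grandchild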
promnesia/sources/org.py CHANGED
@@ -57,8 +57,12 @@ def _parse_node(n: OrgNode) -> Parsed:
     # todo a bit hacky..
     heading = heading.replace(createds + ' ', '')
     if createds is not None:
-
-
+        if '<%%' in createds:
+            # sexp date, not supported
+            dt = None
+        else:
+            [odt] = OrgDate.list_from_str(createds)
+            dt = odt.start
     else:
         dt = None
     return Parsed(dt=dt, heading=heading)
@@ -80,7 +84,7 @@ def walk_node(*, node: OrgNode, dt: datetime) -> Iterator[Res[Tuple[Parsed, OrgN
         parsed = parsed._replace(dt=dt)
     else:
         dt = parsed.dt
-
+    yield parsed, node

     for c in node.children:
         yield from walk_node(node=c, dt=dt)
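
The <%% guard skips org-mode 'sexp' diary timestamps, e.g. <%%(diary-float t 4 2)> (roughly "second Thursday of every month"), which have no single concrete datetime; regular timestamps still go through orgparse. The same logic in isolation (a sketch):

    from orgparse.date import OrgDate

    def createds_to_dt(createds: str):
        if '<%%' in createds:
            return None  # diary sexp: nothing concrete to extract
        [odt] = OrgDate.list_from_str(createds)
        return odt.start

    print(createds_to_dt('<2024-08-10 Sat>'))         # 2024-08-10
    print(createds_to_dt('<%%(diary-float t 4 2)>'))  # None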
promnesia/sources/plaintext.py CHANGED
@@ -98,8 +98,10 @@ def extract_from_path(path: PathIsh) -> Command:
         '.gz',
         '.zip',
     )):
-
+        # todo should be debug?
+        # or should delete it completely, feels like unpacking archives here is a bit too much
         raise RuntimeError(f"Archives aren't supported yet: {path}")
+        logger.info(f"Extracting from compressed file {path}")
         import lzma
         from tempfile import NamedTemporaryFile
         # TODO hopefully, no collisions
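
The archive branch still raises before the new logging line is reachable, but the imports below it hint at the intended approach: decompress into a temporary file and grep that instead. A rough sketch of that idea (hypothetical helper, .xz only):

    import lzma
    from tempfile import NamedTemporaryFile

    def unpack_xz_to_tempfile(path: str) -> str:
        # decompress an .xz archive to plain text so a grep-based extractor could run on it
        with lzma.open(path, 'rb') as f, NamedTemporaryFile(delete=False, suffix='.txt') as tmp:
            tmp.write(f.read())
            return tmp.name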
promnesia/sources/reddit.py CHANGED
@@ -16,7 +16,7 @@ def index(*, render_markdown: bool = False, renderer: Optional[Type['RedditRende
         if "No module named 'my.reddit.all'" in str(e):
             import warnings
             warnings.warn("DEPRECATED/reddit: Using an old version of HPI, please update")
-            from my.reddit import submissions, comments, saved, upvoted
+            from my.reddit import submissions, comments, saved, upvoted
         else:
             raise e

@@ -95,7 +95,7 @@ class RedditRenderer:

     def _from_upvote(self, i: 'Upvote') -> Results:
         locator = Loc.make(
-            title=
+            title='Reddit upvote',
             href=i.url,
         )
         yield from self._from_common(i, locator=locator)
promnesia/sources/rss.py CHANGED
@@ -1,3 +1,7 @@
+'''
+Uses [[https://github.com/karlicoss/HPI][HPI]] for RSS data.
+'''
+
 from itertools import chain

 from ..common import Visit, Loc, extract_urls, Results, get_logger
@@ -19,6 +23,6 @@ def index() -> Results:
         yield Visit(
             url=feed.url,
             dt=feed.created_at or default_datetime,
-            context=
+            context='RSS subscription',  # TODO use 'provider', etc?
             locator=locator,
         )
promnesia/sources/shellcmd.py CHANGED
@@ -1,10 +1,14 @@
+"""
+Greps out URLs from an arbitrary shell command results.
+"""
+
 from datetime import datetime
 import os
 import re
-from
+from subprocess import run, PIPE
+from typing import Union, Sequence
 import warnings

-from ..compat import run, PIPE
 from ..common import Visit, Loc, Results, extract_urls, file_mtime, get_system_tz, now_tz, _is_windows, PathIsh
 from .plaintext import _has_grep

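
With the promnesia.compat re-exports gone (promnesia/compat.py is down 65 lines in this diff), run and PIPE now come straight from the standard library; roughly the kind of invocation this module wraps (hypothetical command and paths):

    from subprocess import run, PIPE

    res = run(['grep', '-rhoE', r'https?://\S+', 'notes/'], stdout=PIPE, check=False)
    for url in res.stdout.decode('utf-8', errors='replace').splitlines():
        print(url)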