promnesia 1.3.20241021__py3-none-any.whl → 1.4.20250909__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- promnesia/__init__.py +4 -1
- promnesia/__main__.py +72 -59
- promnesia/cannon.py +90 -89
- promnesia/common.py +74 -62
- promnesia/compare.py +15 -10
- promnesia/config.py +22 -17
- promnesia/database/dump.py +1 -2
- promnesia/extract.py +6 -6
- promnesia/logging.py +27 -15
- promnesia/misc/install_server.py +25 -19
- promnesia/server.py +69 -53
- promnesia/sources/auto.py +65 -51
- promnesia/sources/browser.py +7 -2
- promnesia/sources/browser_legacy.py +51 -40
- promnesia/sources/demo.py +0 -1
- promnesia/sources/fbmessenger.py +0 -1
- promnesia/sources/filetypes.py +15 -11
- promnesia/sources/github.py +4 -1
- promnesia/sources/guess.py +4 -1
- promnesia/sources/hackernews.py +5 -7
- promnesia/sources/hpi.py +3 -1
- promnesia/sources/html.py +4 -2
- promnesia/sources/instapaper.py +1 -0
- promnesia/sources/markdown.py +4 -4
- promnesia/sources/org.py +17 -8
- promnesia/sources/plaintext.py +14 -11
- promnesia/sources/pocket.py +2 -1
- promnesia/sources/reddit.py +5 -8
- promnesia/sources/roamresearch.py +3 -1
- promnesia/sources/rss.py +4 -5
- promnesia/sources/shellcmd.py +3 -6
- promnesia/sources/signal.py +14 -14
- promnesia/sources/smscalls.py +0 -1
- promnesia/sources/stackexchange.py +2 -2
- promnesia/sources/takeout.py +14 -21
- promnesia/sources/takeout_legacy.py +16 -10
- promnesia/sources/telegram.py +7 -3
- promnesia/sources/telegram_legacy.py +5 -5
- promnesia/sources/twitter.py +1 -1
- promnesia/sources/vcs.py +6 -3
- promnesia/sources/viber.py +2 -2
- promnesia/sources/website.py +4 -3
- promnesia/sqlite.py +10 -7
- promnesia/tests/common.py +2 -0
- promnesia/tests/server_helper.py +2 -2
- promnesia/tests/sources/test_filetypes.py +9 -7
- promnesia/tests/sources/test_hypothesis.py +7 -3
- promnesia/tests/sources/test_org.py +7 -2
- promnesia/tests/sources/test_plaintext.py +9 -7
- promnesia/tests/sources/test_shellcmd.py +10 -9
- promnesia/tests/test_cannon.py +254 -237
- promnesia/tests/test_cli.py +8 -2
- promnesia/tests/test_compare.py +16 -12
- promnesia/tests/test_db_dump.py +4 -3
- promnesia/tests/test_extract.py +7 -4
- promnesia/tests/test_indexer.py +10 -10
- promnesia/tests/test_server.py +10 -10
- promnesia/tests/utils.py +1 -5
- promnesia-1.4.20250909.dist-info/METADATA +66 -0
- promnesia-1.4.20250909.dist-info/RECORD +80 -0
- {promnesia-1.3.20241021.dist-info → promnesia-1.4.20250909.dist-info}/WHEEL +1 -2
- promnesia/kjson.py +0 -122
- promnesia/sources/__init__.pyi +0 -0
- promnesia-1.3.20241021.dist-info/METADATA +0 -55
- promnesia-1.3.20241021.dist-info/RECORD +0 -83
- promnesia-1.3.20241021.dist-info/top_level.txt +0 -1
- {promnesia-1.3.20241021.dist-info → promnesia-1.4.20250909.dist-info}/entry_points.txt +0 -0
- {promnesia-1.3.20241021.dist-info → promnesia-1.4.20250909.dist-info/licenses}/LICENSE +0 -0
promnesia/sources/auto.py
CHANGED
@@ -2,22 +2,23 @@
 - discovers files recursively
 - guesses the format (orgmode/markdown/json/etc) by the extension/MIME type
 - can index most of plaintext files, including source code!
-- autodetects Obsidian vault and adds `obsidian://` app protocol support [[file:../src/promnesia/sources/
-- autodetects Logseq graph and adds `logseq://` app protocol support [[file:../src/promnesia/sources/
+- autodetects Obsidian vault and adds `obsidian://` app protocol support [[file:../src/promnesia/sources/auto_obsidian.py][promnesia.sources.obsidian]]
+- autodetects Logseq graph and adds `logseq://` app protocol support [[file:../src/promnesia/sources/auto_logseq.py][promnesia.sources.logseq]]
 """
+
 from __future__ import annotations
 
 import csv
 import itertools
 import json
 import os
-from collections.abc import Iterable, Iterator, Sequence
+from collections.abc import Callable, Iterable, Iterator, Sequence
 from concurrent.futures import ProcessPoolExecutor as Pool
 from contextlib import nullcontext
 from fnmatch import fnmatch
 from functools import wraps
 from pathlib import Path
-from typing import Any,
+from typing import Any, NamedTuple
 
 from promnesia.common import (
     Loc,
@@ -71,6 +72,7 @@ def collect_from(thing) -> list[EUrl]:
 
 Urls = Iterator[EUrl]
 
+
 def _csv(path: Path) -> Urls:
     # TODO these could also have Loc to be fair..
     with path.open() as fo:
@@ -88,6 +90,7 @@ def _json(path: Path) -> Urls:
 def _plaintext(path: Path) -> Results:
     from . import shellcmd
     from .plaintext import extract_from_path
+
     yield from shellcmd.index(extract_from_path(path))
 
 
@@ -97,6 +100,7 @@ def fallback(ex):
     """Falls back to plaintext in case of issues"""
 
     fallback_active: dict[Any, bool] = {}
+
    @wraps(ex)
     def wrapped(path: Path):
         nonlocal fallback_active
@@ -110,79 +114,83 @@ def fallback(ex):
         except ModuleNotFoundError as me:
             logger = get_logger()
             logger.exception(me)
-            logger.warning(
+            logger.warning(
+                '%s: %s not found, falling back to grep! "pip3 install --user %s" for better support!',
+                path,
+                me.name,
+                me.name,
+            )
             yield me
             fallback_active[ex] = True
             do_fallback = True
         if do_fallback:
             yield from _plaintext(path)
+
     return wrapped
 
 
 @fallback
 def _markdown(path: Path) -> Results:
     from . import markdown
+
     yield from markdown.extract_from_file(path)
 
 
 @fallback
 def _html(path: Path) -> Results:
     from . import html
+
     yield from html.extract_from_file(path)
 
 
 @fallback
 def _org(path: Path) -> Results:
     from . import org
+
     return org.extract_from_file(path)
 
 
 from .filetypes import CODE, IGNORE, TYPE2IDX, type2idx
 
-TYPE2IDX.update(
-    …
-    '.html' : _html,
-    'text/html': _html,
-    'text/xml' : _plaintext,
-
-    'text/x-po': _plaintext, # some translation files
-})
+TYPE2IDX.update(
+    {
+        'application/json': _json,
+        '.json': _json,
+        '.ipynb': _json,
+        '.csv': _csv,
+        'application/csv': _csv,
+        '.org': _org,
+        '.org_archive': _org,
+        '.md': _markdown,
+        '.markdown': _markdown,
+        'text/plain': _plaintext,
+        '.txt': _plaintext,
+        '.page': _plaintext,
+        '.rst': _plaintext,
+        # TODO doesn't work that great; weird stuff like
+        # builtins.ImportError.name|2019-07-10T12:12:35.584510+00:00|names::ImportError::node::names::name::node::fullname
+        # TODO could have stricter url extraction for that; always using http/https?
+        # '.ipynb' : _json,
+        '.html': _html,
+        'text/html': _html,
+        'text/xml': _plaintext,
+        'text/x-po': _plaintext,  # some translation files
+    }
+)
 
 for t in CODE:
     TYPE2IDX[t] = _plaintext
 # TODO ok, mime doesn't really tell between org/markdown/etc anyway
 
 
-Replacer =
+Replacer = Callable[[str, str], str] | None
+
 
 def index(
-    …
+    *paths: PathIsh,
+    ignored: Sequence[str] | str = (),
+    follow: bool = True,
+    replacer: Replacer = None,
 ) -> Results:
     '''
     path : a path or list of paths to recursively index
@@ -215,13 +223,14 @@ def index(
     )
     yield from _index(apath, opts=opts)
 
+
 class Options(NamedTuple):
     ignored: Sequence[str]
     follow: bool
     # TODO option to add ignores? not sure..
     # TODO I don't like this replacer thing... think about removing it
     replacer: Replacer
-    root: Path | None=None
+    root: Path | None = None
 
 
 def _index_file_aux(path: Path, opts: Options) -> Exception | list[Result]:
@@ -237,14 +246,14 @@ def _index(path: Path, opts: Options) -> Results:
     logger = get_logger()
 
     cores = use_cores()
-    if cores is None:
+    if cores is None:  # do not use cores
         # todo use ExitStack instead?
         pool = nullcontext()
-        mapper = map
+        mapper = map  # dummy pool
     else:
         workers = None if cores == 0 else cores
-        pool = Pool(workers)
-        mapper = pool.map
+        pool = Pool(workers)  # type: ignore[assignment]
+        mapper = pool.map  # type: ignore[attr-defined]
 
     # iterate over resolved paths, to avoid duplicates
     def rit() -> Iterable[Path]:
@@ -254,7 +263,7 @@ def _index(path: Path, opts: Options) -> Results:
             # TODO not sure if should log here... might end up with quite a bit of logs
             logger.debug('ignoring %s: user ignore rules', p)
             continue
-        if any(i in p.parts for i in IGNORE):
+        if any(i in p.parts for i in IGNORE):  # meh, not very efficient.. pass to traverse??
             logger.debug('ignoring %s: default ignore rules', p)
             continue
 
@@ -266,6 +275,7 @@ def _index(path: Path, opts: Options) -> Results:
             yield p
 
     from more_itertools import unique_everseen
+
     it = unique_everseen(rit())
 
     with pool:
@@ -302,9 +312,10 @@ def _index_file(pp: Path, opts: Options) -> Results:
     # TODO not even sure if it's used...
     suf = pp.suffix.lower()
 
-    if suf == '.xz':
+    if suf == '.xz':  # TODO zstd?
         import lzma
-        …
+
+        uname = pp.name[: -len('.xz')]  # chop off suffix, so the downstream indexer can handle it
 
     assert pp.is_absolute(), pp
     # make sure to keep hierarchy, otherwise might end up with some name conflicts if filenames clash
@@ -359,7 +370,8 @@ def _index_file(pp: Path, opts: Options) -> Results:
         v = r
 
         loc = v.locator
-        …
+        # FIXME double checke that v.locator indeed can't be none and remove the check?
+        if loc is not None and root is not None:  # type: ignore[redundant-expr]
             # meh. but it works
             # todo potentially, just use dataclasses instead...
             loc = loc._replace(title=loc.title.replace(str(root) + os.sep, ''))
@@ -369,7 +381,9 @@ def _index_file(pp: Path, opts: Options) -> Results:
            upd: dict[str, Any] = {}
            href = v.locator.href
            if href is not None:
-                upd['locator'] = v.locator._replace(
+                upd['locator'] = v.locator._replace(
+                    href=replacer(href, str(root)), title=replacer(v.locator.title, str(root))
+                )
            ctx = v.context
            if ctx is not None:
                # TODO in context, http is unnecessary
promnesia/sources/browser.py
CHANGED
@@ -13,16 +13,17 @@ from promnesia.common import Loc, PathIsh, Results, Second, Visit, is_sqlite_db, logger
 
 
 def index(p: PathIsh | None = None) -> Results:
-    from . import hpi # noqa: F401
+    from . import hpi  # noqa: F401
 
     if p is None:
         from my.browser.all import history
+
         yield from _index_new(history())
         return
 
     warnings.warn(
         f'Passing paths to promnesia.sources.browser is deprecated, you should setup my.browser.export instead. '
-        f'See https://github.com/
+        f'See https://github.com/purarue/browserexport#hpi .'
         f'Will try to hack path to browser databases {p} into HPI config.'
     )
     try:
@@ -50,12 +51,14 @@ def _index_new_with_adhoc_config(*, path: PathIsh) -> Results:
     ## this would result in each subsequent call to my.browser.export.history to invalidate cache every time
     ## so we hack cachew path so it's different for each call
     from my.core.core_config import config as hpi_core_config
+
     hpi_cache_dir = hpi_core_config.get_cache_dir()
     sanitized_path = re.sub(r'\W', '_', str(path))
     cache_override = None if hpi_cache_dir is None else hpi_cache_dir / sanitized_path
     ##
 
     from my.core.common import Paths, classproperty, get_files
+
     class config:
         class core:
             cache_dir = cache_override
@@ -67,8 +70,10 @@ def _index_new_with_adhoc_config(*, path: PathIsh) -> Results:
             return tuple([f for f in get_files(path, glob='**/*') if is_sqlite_db(f)])
 
     from my.core.cfg import tmp_config
+
     with tmp_config(modules='my.browser.export|my.core.core_config', config=config):
         from my.browser.export import history
+
         yield from _index_new(history())
 
 
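The deprecation path shown above still works: a path argument emits the warning and gets hacked into a temporary HPI config via `my.core.cfg.tmp_config`, with a per-path cachew cache dir. Roughly, the two supported call styles look like this (a sketch; the database path is made up):

```python
# Sketch of the two call styles distinguished above; the path is illustrative.
from promnesia.common import Source
from promnesia.sources import browser

SOURCES = [
    # preferred: no argument -- visits come from HPI's my.browser.all.history(),
    # configured via my.browser.export (https://github.com/purarue/browserexport#hpi)
    Source(browser.index),
    # deprecated: a path triggers the warning above, then gets wrapped into an
    # ad-hoc HPI config under a per-path cachew cache dir
    # Source(browser.index, '/backups/browser/databases'),
]
```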
promnesia/sources/browser_legacy.py
CHANGED
@@ -1,12 +1,10 @@
 from __future__ import annotations
 
 import sqlite3
-from datetime import datetime
+from datetime import datetime, timezone
 from pathlib import Path
 from urllib.parse import unquote
 
-import pytz
-
 from promnesia import config
 from promnesia.common import Loc, PathIsh, Results, Second, Visit, is_sqlite_db, logger
 
@@ -15,6 +13,7 @@ try:
 except ModuleNotFoundError as me:
     if me.name != 'cachew':
         raise me
+
     # this module is legacy anyway, so just make it defensive
     def cachew(*args, **kwargs):  # type: ignore[no-redef]
         return lambda f: f
@@ -22,7 +21,7 @@ except ModuleNotFoundError as me:
 
 def index(p: PathIsh) -> Results:
     pp = Path(p)
-    assert pp.exists(), pp
+    assert pp.exists(), pp  # just in case of broken symlinks
 
     # todo warn if filtered out too many?
     # todo wonder how quickly mimes can be computed?
@@ -31,14 +30,14 @@ def index(p: PathIsh) -> Results:
 
     assert len(dbs) > 0, pp
     logger.info('processing %d databases', len(dbs))
-    cname = str('_'.join(pp.parts[1:]))
+    cname = str('_'.join(pp.parts[1:]))  # meh
     yield from _index_dbs(dbs, cachew_name=cname)
 
 
-
 def _index_dbs(dbs: list[Path], cachew_name: str):
     # TODO right... not ideal, need to think how to handle it properly...
     import sys
+
     sys.setrecursionlimit(5000)
 
     cache_dir = config.get().cache_dir
@@ -49,13 +48,13 @@ def _index_dbs(dbs: list[Path], cachew_name: str):
 
     # todo wow, stack traces are ridiculous here...
     # todo hmm, feels like it should be a class or something?
-    @cachew(lambda cp, dbs, emitted: cp, depends_on=lambda cp, dbs, emitted: dbs)
+    @cachew(lambda cp, dbs, emitted: cp, depends_on=lambda cp, dbs, emitted: dbs)  # , logger=logger)  # noqa: ARG005
     def _index_dbs_aux(cache_path: Path | None, dbs: list[Path], emitted: set) -> Results:
         if len(dbs) == 0:
             return
 
         xs = dbs[:-1]
-        x
+        x = dbs[-1:]
 
         xs_res = _index_dbs_aux(cache_path, xs, emitted)
         xs_was_cached = False
@@ -66,36 +65,38 @@ def _index_dbs_aux(cache_path: Path | None, dbs: list[Path], emitted: set) -> Results:
             logger.debug('seems that %d first items were previously cached', len(xs))
         if xs_was_cached:
             key = str(r) if isinstance(r, Exception) else (r.url, r.dt)
-            assert key not in emitted, key
+            assert key not in emitted, key  # todo not sure if this assert is necessary?
             # hmm ok it might happen if we messed up with indexing individual db?
             # alternatively, could abuse it to avoid messing with 'emitted' in _index_db?
             emitted.add(key)
-        yield r
+        yield r  # todo not sure about exceptions?
 
     for db in x:
         yield from _index_db(db, emitted=emitted)
 
 
 def _index_db(db: Path, emitted: set):
-    logger.info('processing %s', db)
+    logger.info('processing %s', db)  # debug level?
 
     # todo schema check (not so critical for cachew though)
     total = 0
-    new
-    loc = Loc.file(
+    new = 0
+    loc = Loc.file(
+        db
+    )  # todo possibly needs to be optimized -- moving from within the loop considerably speeds everything up
     with sqlite3.connect(f'file:{db}?immutable=1', uri=True) as c:
         browser = None
         for b in [Chrome, Firefox, FirefoxPhone, Safari]:
             try:
                 c.execute(f'SELECT * FROM {b.detector}')
-            except sqlite3.OperationalError:
+            except sqlite3.OperationalError:  # not sure if the right kind?
                 pass
             else:
                 browser = b
                 break
         assert browser is not None
 
-        proj
+        proj = ', '.join(c for c, _ in browser.schema.cols)
         query = browser.query.replace('chunk.', '')
 
         c.row_factory = sqlite3.Row
@@ -123,7 +124,7 @@ ColType = str
 
 
 from collections.abc import Sequence
-from typing import NamedTuple
+from typing import NamedTuple
 
 
 class Schema(NamedTuple):
@@ -131,7 +132,7 @@ class Schema(NamedTuple):
     key: Sequence[str]
 
 
-SchemaCheck = tuple[str,
+SchemaCheck = tuple[str, str | Sequence[str]]  # todo Union: meh
 
 from dataclasses import dataclass
 
@@ -151,14 +152,15 @@ class Extr:
 
 
 class Chrome(Extr):
-    detector='keyword_search_terms'
+    detector = 'keyword_search_terms'
+    # fmt: off
     schema_check=(
         'visits', [
             'visits', "id, url, visit_time, from_visit, transition, segment_id, visit_duration, incremented_omnibox_typed_score",
             'visits', "id, url, visit_time, from_visit, transition, segment_id, visit_duration"
         ]
     )
-    schema=Schema(cols=[
+    schema = Schema(cols=[
         ('U.url' , 'TEXT' ),
 
         # while these two are not very useful, might be good to have just in case for some debugging
@@ -172,16 +174,17 @@ class Chrome(Extr):
         ('V.visit_duration' , 'INTEGER NOT NULL'),
         # V.omnibox thing looks useless
     ], key=('url', 'visit_time', 'vid', 'urlid'))
-    …
+    # fmt: on
+    query = 'FROM chunk.visits as V, chunk.urls as U WHERE V.url = U.id'
 
     @staticmethod
     def row2visit(row: sqlite3.Row, loc: Loc) -> Visit:
-        url
-        ts
+        url = row['url']
+        ts = row['visit_time']
         durs = row['visit_duration']
 
         dt = chrome_time_to_utc(int(ts))
-        url = unquote(url)
+        url = unquote(url)  # chrome urls are all quoted
         dd = int(durs)
         dur: Second | None = None if dd == 0 else dd // 1_000_000
         return Visit(
@@ -196,12 +199,12 @@
 # yep, tested it and looks like utc
 def chrome_time_to_utc(chrome_time: int) -> datetime:
     epoch = (chrome_time / 1_000_000) - 11644473600
-    return datetime.fromtimestamp(epoch,
+    return datetime.fromtimestamp(epoch, timezone.utc)
 
 
 def _row2visit_firefox(row: sqlite3.Row, loc: Loc) -> Visit:
     url = row['url']
-    ts
+    ts = float(row['visit_date'])
     # ok, looks like it's unix epoch
     # https://stackoverflow.com/a/19430099/706389
 
@@ -214,17 +217,19 @@ def _row2visit_firefox(row: sqlite3.Row, loc: Loc) -> Visit:
     else:
         # milliseconds
         ts /= 1_000
-    dt = datetime.fromtimestamp(ts,
-    url = unquote(url)
+    dt = datetime.fromtimestamp(ts, timezone.utc)
+    url = unquote(url)  # firefox urls are all quoted
     return Visit(
         url=url,
         dt=dt,
         locator=loc,
     )
 
+
 # https://web.archive.org/web/20201026130310/http://fileformats.archiveteam.org/wiki/History.db
 class Safari(Extr):
-    detector='history_tombstones'
+    detector = 'history_tombstones'
+    # fmt: off
     schema_check=(
         'history_visits', [
             'history_visits', "id, history_item, visit_time",
@@ -245,13 +250,14 @@ class Safari(Extr):
         # ('V.visit_duration' , 'INTEGER NOT NULL'),
         # V.omnibox thing looks useless
     ], key=('url', 'visit_time', 'vid', 'urlid'))
-    …
+    # fmt: on
+    query = 'FROM chunk.history_visits as V, chunk.history_items as U WHERE V.history_item = U.id'
 
     @staticmethod
     def row2visit(row: sqlite3.Row, loc: Loc) -> Visit:
-        url
-        ts
-        dt = datetime.fromtimestamp(ts,
+        url = row['url']
+        ts = row['visit_time'] + 978307200  # https://stackoverflow.com/a/34546556/16645
+        dt = datetime.fromtimestamp(ts, timezone.utc)
 
         return Visit(
             url=url,
@@ -259,10 +265,12 @@ class Safari(Extr):
             locator=loc,
         )
 
+
 # https://web.archive.org/web/20190730231715/https://www.forensicswiki.org/wiki/Mozilla_Firefox_3_History_File_Format#moz_historyvisits
 class Firefox(Extr):
-    detector='moz_meta'
-    schema_check=('moz_historyvisits', "id, from_visit, place_id, visit_date, visit_type")
+    detector = 'moz_meta'
+    schema_check = ('moz_historyvisits', "id, from_visit, place_id, visit_date, visit_type")
+    # fmt: off
     schema=Schema(cols=[
         ('P.url' , 'TEXT'),
 
@@ -278,14 +286,16 @@ class Firefox(Extr):
         # needs to be defensive
         # ('V.session' , 'INTEGER'),
     ], key=('url', 'visit_date', 'vid', 'pid'))
-    …
+    # fmt: on
+    query = 'FROM chunk.moz_historyvisits as V, chunk.moz_places as P WHERE V.place_id = P.id'
 
-    row2visit = _row2visit_firefox
+    row2visit = _row2visit_firefox  # type: ignore[assignment]
 
 
 class FirefoxPhone(Extr):
-    detector='remote_devices'
-    schema_check=('visits', "_id, history_guid, visit_type, date, is_local")
+    detector = 'remote_devices'
+    schema_check = ('visits', "_id, history_guid, visit_type, date, is_local")
+    # fmt: off
     schema=Schema(cols=[
         ('H.url' , 'TEXT NOT NULL' ),
 
@@ -297,6 +307,7 @@ class FirefoxPhone(Extr):
         ('V.date as visit_date', 'INTEGER NOT NULL'),
         # ('is_local' , 'INTEGER NOT NULL'),
     ], key=('url', 'date', 'vid', 'hid'))
-    …
+    # fmt: on
+    query = 'FROM chunk.visits as V, chunk.history as H WHERE V.history_guid = H.guid'
 
-    row2visit = _row2visit_firefox
+    row2visit = _row2visit_firefox  # type: ignore[assignment]
promnesia/sources/demo.py
CHANGED
@@ -21,7 +21,6 @@ def index(
     base_dt: datetime | IsoFormatDt = datetime.min + timedelta(days=5000),
     delta: timedelta | Seconds = timedelta(hours=1),
 ) -> Results:
-
     base_dt_ = base_dt if isinstance(base_dt, datetime) else datetime.fromisoformat(base_dt)
     delta_ = delta if isinstance(delta, timedelta) else timedelta(seconds=delta)
 
promnesia/sources/fbmessenger.py
CHANGED
promnesia/sources/filetypes.py
CHANGED
@@ -1,23 +1,26 @@
 from __future__ import annotations
 
-from collections.abc import Iterable, Sequence
+from collections.abc import Callable, Iterable, Sequence
 from functools import lru_cache
 from pathlib import Path
-from typing import
+from typing import NamedTuple
 
 from ..common import Results, Url
 
 # TODO doesn't really belong here...
 Ctx = Sequence[str]
 
+
 class EUrl(NamedTuple):
     url: Url
-    ctx: Ctx
+    ctx: Ctx  # TODO ctx here is more like a Loc
+
+
 ###
 
 
 # keys are mime types + extensions
-Ex = Callable[[Path],
+Ex = Callable[[Path], Results | Iterable[EUrl]]
 # None means unhandled
 TYPE2IDX: dict[str, Ex | None] = {}
 # NOTE: there are some types in auto.py at the moment... it's a bit messy
@@ -27,13 +30,13 @@ TYPE2IDX: dict[str, Ex | None] = {}
 @lru_cache(None)
 def type2idx(t: str) -> Ex | None:
     if len(t) == 0:
-        return None
+        return None  # just in case?
     # first try exact match
-    e = TYPE2IDX.get(t
+    e = TYPE2IDX.get(t)
     if e is not None:
         return e
     t = t.strip('.')
-    e = TYPE2IDX.get(t
+    e = TYPE2IDX.get(t)
     if e is not None:
         return e
     # otherwise, try prefixes?
@@ -42,6 +45,7 @@ def type2idx(t: str) -> Ex | None:
         return v
     return None
 
+
 # for now source code just indexed with grep, not sure if it's good enough?
 # if not, some fanceir library could be used...
 # e.g. https://github.com/karlicoss/promnesia/pull/152/commits/c2f00eb4ee4018b02c9bf3966a036db69a43373d
@@ -82,7 +86,7 @@ CODE = {
 
     '.ts', # most likely typescript.. otherwise determined as text/vnd.trolltech.linguist mime
     '.js',
-}
+}  # fmt: skip
 # TODO discover more extensions with mimetypes library?
 
 
@@ -100,6 +104,7 @@ video/
 
 handle_later = lambda *_args, **_kwargs: ()
 
+
 def ignore(*_args, **_kwargs):
     # TODO log (once?)
     yield from ()
@@ -129,7 +134,7 @@ TYPE2IDX.update({
     'application/zip' : handle_later,
     'application/x-tar' : handle_later,
     'application/gzip' : handle_later,
-})
+})  # fmt: skip
 
 
 # TODO use some existing file for initial gitignore..
@@ -148,5 +153,4 @@ IGNORE = [
     # TODO not sure about these:
     '.gitignore',
     '.babelrc',
-]
-
+]  # fmt: skip