promnesia 1.2.20240810__py3-none-any.whl → 1.4.20250909__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promnesia/__init__.py +18 -4
- promnesia/__main__.py +104 -78
- promnesia/cannon.py +108 -107
- promnesia/common.py +107 -88
- promnesia/compare.py +33 -30
- promnesia/compat.py +10 -10
- promnesia/config.py +37 -34
- promnesia/database/common.py +4 -3
- promnesia/database/dump.py +13 -13
- promnesia/database/load.py +7 -7
- promnesia/extract.py +19 -17
- promnesia/logging.py +27 -15
- promnesia/misc/install_server.py +32 -27
- promnesia/server.py +106 -79
- promnesia/sources/auto.py +104 -77
- promnesia/sources/auto_logseq.py +6 -5
- promnesia/sources/auto_obsidian.py +2 -2
- promnesia/sources/browser.py +20 -10
- promnesia/sources/browser_legacy.py +65 -50
- promnesia/sources/demo.py +7 -8
- promnesia/sources/fbmessenger.py +3 -3
- promnesia/sources/filetypes.py +22 -16
- promnesia/sources/github.py +9 -8
- promnesia/sources/guess.py +6 -2
- promnesia/sources/hackernews.py +7 -9
- promnesia/sources/hpi.py +5 -3
- promnesia/sources/html.py +11 -7
- promnesia/sources/hypothesis.py +3 -2
- promnesia/sources/instapaper.py +3 -2
- promnesia/sources/markdown.py +22 -12
- promnesia/sources/org.py +36 -17
- promnesia/sources/plaintext.py +41 -39
- promnesia/sources/pocket.py +5 -3
- promnesia/sources/reddit.py +24 -26
- promnesia/sources/roamresearch.py +5 -2
- promnesia/sources/rss.py +6 -8
- promnesia/sources/shellcmd.py +21 -11
- promnesia/sources/signal.py +27 -26
- promnesia/sources/smscalls.py +2 -3
- promnesia/sources/stackexchange.py +5 -4
- promnesia/sources/takeout.py +37 -34
- promnesia/sources/takeout_legacy.py +29 -19
- promnesia/sources/telegram.py +18 -12
- promnesia/sources/telegram_legacy.py +22 -11
- promnesia/sources/twitter.py +7 -6
- promnesia/sources/vcs.py +11 -6
- promnesia/sources/viber.py +11 -10
- promnesia/sources/website.py +8 -7
- promnesia/sources/zulip.py +3 -2
- promnesia/sqlite.py +13 -7
- promnesia/tests/common.py +10 -5
- promnesia/tests/server_helper.py +13 -10
- promnesia/tests/sources/test_auto.py +2 -3
- promnesia/tests/sources/test_filetypes.py +11 -8
- promnesia/tests/sources/test_hypothesis.py +10 -6
- promnesia/tests/sources/test_org.py +9 -5
- promnesia/tests/sources/test_plaintext.py +9 -8
- promnesia/tests/sources/test_shellcmd.py +13 -13
- promnesia/tests/sources/test_takeout.py +3 -5
- promnesia/tests/test_cannon.py +256 -239
- promnesia/tests/test_cli.py +12 -8
- promnesia/tests/test_compare.py +17 -13
- promnesia/tests/test_config.py +7 -8
- promnesia/tests/test_db_dump.py +15 -15
- promnesia/tests/test_extract.py +17 -10
- promnesia/tests/test_indexer.py +24 -18
- promnesia/tests/test_server.py +12 -13
- promnesia/tests/test_traverse.py +0 -2
- promnesia/tests/utils.py +3 -7
- promnesia-1.4.20250909.dist-info/METADATA +66 -0
- promnesia-1.4.20250909.dist-info/RECORD +80 -0
- {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info}/WHEEL +1 -2
- promnesia/kjson.py +0 -121
- promnesia/sources/__init__.pyi +0 -0
- promnesia-1.2.20240810.dist-info/METADATA +0 -54
- promnesia-1.2.20240810.dist-info/RECORD +0 -83
- promnesia-1.2.20240810.dist-info/top_level.txt +0 -1
- {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info}/entry_points.txt +0 -0
- {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info/licenses}/LICENSE +0 -0
promnesia/sources/signal.py
CHANGED
@@ -2,34 +2,33 @@
 Collects visits from Signal Desktop's encrypted SQLIite db(s).
 """
 
+from __future__ import annotations
+
 # Functions get their defaults from module-data.
 #
 # * Open-ciphered-db adapted from:
 #   https://github.com/carderne/signal-export/commit/2284c8f4
 # * Copyright (c) 2019 Chris Arderne, 2020 Kostis Anagnostopoulos
-
-
 import json
 import logging
 import platform
 import sqlite3
 import subprocess as sbp
+from collections.abc import Iterable, Iterator, Mapping
 from contextlib import contextmanager
 from pathlib import Path
 from textwrap import dedent, indent
-from typing import Any
+from typing import Any
 
 from ..common import Loc, PathIsh, Results, Visit, extract_urls, from_epoch
 
-PathIshes = Union[PathIsh, Iterable[PathIsh]]
-
 
 def index(
     *db_paths: PathIsh,
     http_only: bool = False,
-    locator_schema: str="editor",
+    locator_schema: str = "editor",
     append_platform_path: bool = False,
-    override_key:
+    override_key: str | None = None,
 ) -> Results:
     """
     :param db_paths:
@@ -51,8 +50,7 @@ def index(
     otherwise, this same key is used for harvesting all db-files.
     """
     logger.debug(
-        "http_only?(%s), locator_schema?(%s), append_platform_path?(%s), "
-        "overide_key given?(%s), db_paths: %s",
+        "http_only?(%s), locator_schema?(%s), append_platform_path?(%s), overide_key given?(%s), db_paths: %s",
        http_only,
        locator_schema,
        append_platform_path,
@@ -109,10 +107,10 @@ messages_query = dedent(
        id,
        type,
        coalesce(
-          profileFullName,
-          profileName,
+          profileFullName,
+          profileName,
           name,
-          profileFamilyName,
+          profileFamilyName,
           e164
        ) as aname,
        name,
@@ -171,7 +169,10 @@ def _expand_path(path_pattern: PathIsh) -> Iterable[Path]:
 
     Expansion code adapted from https://stackoverflow.com/a/51108375/548792
     to handle also degenerate cases (``'', '.', '/'``):
+    """
 
+    # NOTE: suppressing doctest from github actions
+    """
     >>> str(next(iter(_get_files('/'))))
     '/'
 
@@ -194,7 +195,7 @@ def _expand_path(path_pattern: PathIsh) -> Iterable[Path]:
     return path.glob(str(Path(*parts))) if parts else [path]
 
 
-def _expand_paths(paths:
+def _expand_paths(paths: PathIsh | Iterable[PathIsh]) -> Iterable[Path]:
     if _is_pathish(paths):
         paths = [paths]  # type: ignore[list-item]
     return [pp.resolve() for p in paths for pp in _expand_path(p)]  # type: ignore[union-attr]
@@ -214,7 +215,10 @@ def collect_db_paths(*db_paths: PathIsh, append: bool = False) -> Iterable[Path]
     one or more pathish
 
     Note: needed `append` here, to resolve paths.
+    """
 
+    # NOTE: suppressing doctest from running on Github actions
+    """
     >>> bool(collect_db_paths())  # my home-path
     True
     >>> collect_db_paths(None)
@@ -237,11 +241,10 @@ def collect_db_paths(*db_paths: PathIsh, append: bool = False) -> Iterable[Path]
     platform_name = platform.system()
     try:
         plat_paths = platform_db_paths[platform_name]
-    except LookupError:
+    except LookupError as le:
         raise ValueError(
-            f"Unknown platform({platform_name}
-
-        )
+            f"Unknown platform({platform_name}!\n  Expected one of {list(platform_db_paths.keys())}."
+        ) from le
 
     if db_paths and append:
         db_paths = [  # type: ignore[assignment]
@@ -261,7 +264,7 @@ def _config_for_dbfile(db_path: Path, default_key=None) -> Path:
 
 
 def _key_from_config(signal_desktop_config_path: PathIsh) -> str:
-    with
+    with Path(signal_desktop_config_path).open() as conf:
         return json.load(conf)["key"]
 
 
@@ -269,6 +272,7 @@ def _key_from_config(signal_desktop_config_path: PathIsh) -> str:
 def connect_db(
     db_path: Path,
     key,
+    *,
     decrypt_db: bool = False,
     sqlcipher_exe: PathIsh = "sqlcipher",
     **decryption_pragmas: Mapping[str, Any],
@@ -324,16 +328,14 @@ def connect_db(
         )
         sql = "\n".join(sql_cmds)
         cmd = [sqlcipher_exe, str(db_path)]
-        logger.debug(
-            "Decrypting db '%s' with cmd: %s <<<EOF\n%s\nEOF", db_path, cmd, sql
-        )
+        logger.debug("Decrypting db '%s' with cmd: %s <<<EOF\n%s\nEOF", db_path, cmd, sql)
         try:
             sbp.run(
                 cmd,
                 check=True,
                 input=sql,
                 capture_output=True,
-
+                text=True,
             )
         except sbp.CalledProcessError as ex:
             prefix = " " * 4
@@ -357,12 +359,11 @@ def connect_db(
             yield db
     finally:
         try:
-            if db:
+            if db is not None:
                 db.close()
         finally:
             if decrypted_file and decrypted_file.exists():
                 try:
-
                     logger.debug("Deleting temporary decrypted db: %s", decrypted_file)
                     decrypted_file.unlink()
                 except Exception as ex:
@@ -380,7 +381,7 @@ def _handle_row(row: tuple, db_path: PathIsh, locator_schema: str) -> Results:
     if not urls:
         return
 
-    assert (
+    assert (  # noqa: PT018
         text and mid and sender and chatname
     ), f"should have eliminated messages without 'http' or missing ids: {row}"
 
@@ -400,7 +401,7 @@ def _harvest_db(
     db_path: Path,
     messages_query: str,
     *,
-    override_key:
+    override_key: str | None = None,
     locator_schema: str = "editor",
     decrypt_db: bool = False,
     **decryption_pragmas,
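The reworked signature of `signal.index()` above (keyword spacing per PEP 8, `override_key: str | None = None` made possible by the new `from __future__ import annotations`) is the public entry point of this source. Below is a minimal sketch of wiring it into a promnesia config; the `Source`/`SOURCES` layout is the usual promnesia convention rather than something introduced by this diff, and the keyword arguments shown are illustrative defaults.

    # hypothetical config.py snippet -- not part of this diff
    from promnesia.common import Source
    from promnesia.sources import signal

    SOURCES = [
        # extra keyword arguments are forwarded to signal.index() (see its docstring parameters above)
        Source(signal.index, locator_schema="editor", append_platform_path=False),
    ]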
promnesia/sources/smscalls.py
CHANGED
@@ -2,15 +2,14 @@
 Uses [[https://github.com/karlicoss/HPI][HPI]] smscalls module
 '''
 
-from promnesia.common import
+from promnesia.common import Loc, Results, Visit, extract_urls
 
 
 def index() -> Results:
-    from . import hpi
+    from . import hpi  # noqa: F401,I001
     from my.smscalls import messages
 
     for m in messages():
-
         if isinstance(m, Exception):
             yield m
             continue
promnesia/sources/stackexchange.py
CHANGED
@@ -2,12 +2,13 @@
 Uses [[https://github.com/karlicoss/HPI][HPI]] for Stackexchange data.
 '''
 
-from
+from promnesia.common import Loc, Results, Visit
 
 
 def index() -> Results:
-    from . import hpi
+    from . import hpi  # noqa: F401,I001
     import my.stackexchange.gdpr as G
+
     for v in G.votes():
         if isinstance(v, Exception):
             yield v
@@ -15,7 +16,7 @@ def index() -> Results:
         yield Visit(
             url=v.link,
             dt=v.when,
-            context='voted',
+            context='voted',  # todo use the votetype? although maybe worth ignoring downvotes
             # or, downvotes could have 'negative' ranking or something
-            locator=Loc.make(title='voted', href=v.link)
+            locator=Loc.make(title='voted', href=v.link),
         )
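Both small sources above share one shape that recurs across this release: `index()` is a generator of `Visit` objects, the `from . import hpi` line is kept purely for its import-time side effect (hence the new `# noqa: F401,I001` suppressions), and upstream failures are yielded instead of raised so a single bad record does not abort the whole indexing run. A condensed sketch of that pattern, with a hypothetical `fetch_items()` standing in for the HPI call:

    from promnesia.common import Loc, Results, Visit

    def index() -> Results:
        for item in fetch_items():           # hypothetical stand-in for e.g. my.smscalls.messages()
            if isinstance(item, Exception):  # HPI modules may hand back errors inline
                yield item                   # surface the error and keep going
                continue
            yield Visit(
                url=item.link,
                dt=item.when,
                context='voted',
                locator=Loc.make(title='voted', href=item.link),
            )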
promnesia/sources/takeout.py
CHANGED
@@ -1,11 +1,14 @@
 '''
 Uses HPI [[https://github.com/karlicoss/HPI/blob/master/doc/MODULES.org#mygoogletakeoutpaths][google.takeout]] module
 '''
-
+
+from __future__ import annotations
+
 import warnings
+from collections.abc import Iterable
+from typing import Any, NamedTuple
 
-from
-from ..compat import removeprefix
+from promnesia.common import Loc, Results, Visit, logger
 
 
 # incase user is using an old version of google_takeout_parser
@@ -14,25 +17,34 @@ class YoutubeCSVStub(NamedTuple):
 
 
 def index() -> Results:
-    from . import hpi
-    import json
+    from . import hpi  # noqa: F401
 
     try:
+        from google_takeout_parser.models import (
+            Activity,
+            ChromeHistory,
+            LikedYoutubeVideo,
+            YoutubeComment,
+        )
+        from google_takeout_parser.parse_csv import (
+            extract_comment_links,
+            reconstruct_comment_content,
+        )
         from my.google.takeout.parser import events
-        from google_takeout_parser.models import Activity, YoutubeComment, LikedYoutubeVideo, ChromeHistory
-        from google_takeout_parser.parse_csv import reconstruct_comment_content, extract_comment_links
     except ModuleNotFoundError as ex:
         logger.exception(ex)
         yield ex
 
-        warnings.warn(
+        warnings.warn(
+            "Please set up my.google.takeout.parser module for better takeout support. Falling back to legacy implementation."
+        )
 
         from . import takeout_legacy
+
         yield from takeout_legacy.index()
         return
 
-
-    _seen: Set[str] = {
+    _seen: set[str] = {
         # these are definitely not useful for promnesia
         'Location',
         'PlaceVisit',
@@ -42,10 +54,13 @@ def index() -> Results:
     imported_yt_csv_models = False
     try:
         from google_takeout_parser.models import CSVYoutubeComment, CSVYoutubeLiveChat
+
         imported_yt_csv_models = True
     except ImportError:
         # warn user to upgrade google_takeout_parser
-        warnings.warn(
+        warnings.warn(
+            "Please upgrade google_takeout_parser (`pip install -U google_takeout_parser`) to support the new format for youtube comments"
+        )
         CSVYoutubeComment = YoutubeCSVStub  # type: ignore[misc,assignment]
         CSVYoutubeLiveChat = YoutubeCSVStub  # type: ignore[misc,assignment]
 
@@ -54,7 +69,7 @@ def index() -> Results:
         if et_name in _seen:
             return
         _seen.add(et_name)
-        yield RuntimeError(f"Unhandled event {
+        yield RuntimeError(f"Unhandled event {type(e)!r}: {e}")
 
     for e in events():
         if isinstance(e, Exception):
@@ -67,13 +82,13 @@ def index() -> Results:
             # when you follow something from search the actual url goes after this
             # e.g. https://www.google.com/url?q=https://en.wikipedia.org/wiki/Clapham
             # note: also title usually starts with 'Visited ', in such case but perhaps fine to keep it
-            url = removeprefix(
+            url = url.removeprefix("https://www.google.com/url?q=")
             title = e.title
 
             if e.header == 'Chrome':
                 # title contains 'Visited <page title>' in this case
                 context = None
-                title = removeprefix(
+                title = title.removeprefix('Visited ')
             elif e.header in _CLEAR_CONTEXT_FOR_HEADERS:
                 # todo perhaps could add to some sort of metadata?
                 # only useful for debugging really
@@ -120,18 +135,14 @@ def index() -> Results:
         elif isinstance(e, LikedYoutubeVideo):
             # TODO not sure if desc makes sense here since it's not user produced data
             # it's just a part of video meta?
-            yield Visit(
-                url=e.link, dt=e.dt, context=e.desc, locator=Loc(title=e.title, href=e.link)
-            )
+            yield Visit(url=e.link, dt=e.dt, context=e.desc, locator=Loc(title=e.title, href=e.link))
         elif isinstance(e, YoutubeComment):
             for url in e.urls:
                 # todo: use url_metadata to improve locator?
                 # or maybe just extract first sentence?
-                yield Visit(
-                    url=url, dt=e.dt, context=e.content, locator=Loc(title=e.content, href=url)
-                )
+                yield Visit(url=url, dt=e.dt, context=e.content, locator=Loc(title=e.content, href=url))
         elif imported_yt_csv_models and isinstance(e, CSVYoutubeComment):
-            contentJSON =
+            contentJSON = e.contentJSON
             content = reconstruct_comment_content(contentJSON, format='text')
             if isinstance(content, Exception):
                 yield content
@@ -142,14 +153,10 @@ def index() -> Results:
                 continue
             context = f"Commented on {e.video_url}"
             for url in links:
-                yield Visit(
-
-                )
-            yield Visit(
-                url=e.video_url, dt=e.dt, context=content, locator=Loc(title=context, href=e.video_url)
-            )
+                yield Visit(url=url, dt=e.dt, context=content, locator=Loc(title=context, href=url))
+            yield Visit(url=e.video_url, dt=e.dt, context=content, locator=Loc(title=context, href=e.video_url))
         elif imported_yt_csv_models and isinstance(e, CSVYoutubeLiveChat):
-            contentJSON =
+            contentJSON = e.contentJSON
             content = reconstruct_comment_content(contentJSON, format='text')
             if isinstance(content, Exception):
                 yield content
@@ -160,12 +167,8 @@ def index() -> Results:
                 continue
             context = f"Commented on livestream {e.video_url}"
             for url in links:
-                yield Visit(
-
-                )
-                yield Visit(
-                    url=e.video_url, dt=e.dt, context=content, locator=Loc(title=context, href=e.video_url)
-                )
+                yield Visit(url=url, dt=e.dt, context=content, locator=Loc(title=context, href=url))
+            yield Visit(url=e.video_url, dt=e.dt, context=content, locator=Loc(title=context, href=e.video_url))
         else:
             yield from warn_once_if_not_seen(e)
 
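The CSV comment handling above hinges on a feature-detection import: when the installed google_takeout_parser is too old to provide the CSV models, the names are rebound to the local `YoutubeCSVStub` so the later `isinstance` checks simply never match. A condensed sketch of the pattern; the stub's field list here is illustrative rather than copied from the module:

    import warnings
    from typing import NamedTuple

    class YoutubeCSVStub(NamedTuple):  # placeholder type; the real stub lives in takeout.py
        contentJSON: str

    imported_yt_csv_models = False
    try:
        from google_takeout_parser.models import CSVYoutubeComment, CSVYoutubeLiveChat
        imported_yt_csv_models = True
    except ImportError:
        warnings.warn("Please upgrade google_takeout_parser (`pip install -U google_takeout_parser`)")
        CSVYoutubeComment = YoutubeCSVStub   # type: ignore[misc,assignment]
        CSVYoutubeLiveChat = YoutubeCSVStub  # type: ignore[misc,assignment]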
promnesia/sources/takeout_legacy.py
CHANGED
@@ -1,17 +1,23 @@
-from
+from __future__ import annotations
+
+from promnesia.common import Loc, Results, Visit, logger
+
 
 # TODO make an iterator, insert in db as we go? handle errors gracefully?
 def index() -> Results:
-    from . import hpi
+    from . import hpi  # noqa: F401,I001
     from my.google.takeout.paths import get_takeouts
+
     takeouts = list(get_takeouts())
     # TODO if no takeouts, raise?
     # although could raise a warning on top level, when source emitted no takeouts
 
     # TODO youtube?
+    # fmt: off
     google_activities = [read_google_activity(t) for t in takeouts]
     search_activities = [read_search_activity(t) for t in takeouts]
     browser_histories = [read_browser_history_json(t) for t in takeouts]
+    # fmt: on
 
     key = lambda v: (v.dt, v.url)
     return chain(
@@ -21,25 +27,22 @@ def index() -> Results:
     )
 
 
-
-import
+import json
+from collections.abc import Iterable
+from datetime import datetime, timezone
 from itertools import chain
-from datetime import datetime
-from typing import List, Optional, Iterable, TYPE_CHECKING
 from pathlib import Path
-import json
-
-
-from .. import config
-
 
 from more_itertools import unique_everseen
 
+from promnesia import config
+
 try:
     from cachew import cachew
 except ModuleNotFoundError as me:
     if me.name != 'cachew':
         raise me
+
     # this module is legacy anyway, so just make it defensive
     def cachew(*args, **kwargs):  # type: ignore[no-redef]
         return lambda f: f
@@ -50,7 +53,9 @@ TakeoutPath = Path
 
 
 def _read_myactivity_html(takeout: TakeoutPath, kind: str) -> Iterable[Visit]:
-
+    # FIXME switch to actual kompress? and use CPath?
+    from my.core.kompress import kexists  # type: ignore[attr-defined]
+
     # TODO glob
     # TODO not sure about windows path separators??
     spath = 'Takeout/My Activity/' + kind
@@ -61,7 +66,8 @@ def _read_myactivity_html(takeout: TakeoutPath, kind: str) -> Iterable[Visit]:
 
     locator = Loc.file(spath)
     from my.google.takeout.html import read_html
-
+
+    for dt, url, _title in read_html(takeout, spath):
         yield Visit(
             url=url,
             dt=dt,
@@ -69,6 +75,7 @@ def _read_myactivity_html(takeout: TakeoutPath, kind: str) -> Iterable[Visit]:
             debug=kind,
         )
 
+
 def _cpath(suffix: str):
     def fun(takeout: TakeoutPath):
         cache_dir = config.get().cache_dir
@@ -76,23 +83,27 @@ def _cpath(suffix: str):
             return None
         # doesn't need a nontrivial hash function, timestsamp is encoded in name
         return cache_dir / (takeout.name + '_' + suffix + '.cache')
+
     return fun
 
 
 # todo caching should this be HPI responsibility?
 # todo set global cachew logging on init?
-@cachew(cache_path=_cpath('google_activity')
+@cachew(cache_path=_cpath('google_activity'), logger=logger)
 def read_google_activity(takeout: TakeoutPath) -> Iterable[Visit]:
     return _read_myactivity_html(takeout, 'Chrome/MyActivity.html')
 
-
+
+@cachew(cache_path=_cpath('search_activity'), logger=logger)
 def read_search_activity(takeout: TakeoutPath) -> Iterable[Visit]:
     return _read_myactivity_html(takeout, 'Search/MyActivity.html')
 
+
 # TODO add this to tests?
 @cachew(cache_path=_cpath('browser_activity'), logger=logger)
 def read_browser_history_json(takeout: TakeoutPath) -> Iterable[Visit]:
-    from my.core.kompress import kexists, kopen
+    from my.core.kompress import kexists, kopen  # type: ignore[attr-defined]
+
     # not sure if this deserves moving to HPI? it's pretty trivial for now
     spath = 'Takeout/Chrome/BrowserHistory.json'
 
@@ -107,13 +118,13 @@ def read_browser_history_json(takeout: TakeoutPath) -> Iterable[Visit]:
     # TODO this should be supported by HPI now?
 
     j = None
-    with kopen(takeout, spath) as fo:
+    with kopen(takeout, spath) as fo:  # TODO iterative parser?
         j = json.load(fo)
 
     hist = j['Browser History']
     for item in hist:
         url = item['url']
-        time = datetime.fromtimestamp(item['time_usec'] / 10
+        time = datetime.fromtimestamp(item['time_usec'] / 10**6, tz=timezone.utc)
         # TODO any more interesitng info?
         yield Visit(
             url=url,
@@ -121,4 +132,3 @@ def read_browser_history_json(takeout: TakeoutPath) -> Iterable[Visit]:
             locator=locator,
             debug='Chrome/BrowserHistory.json',
         )
-
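Two small but load-bearing changes in this pair of files: the old `promnesia.compat.removeprefix` shim is replaced by the built-in `str.removeprefix` (available since Python 3.9), and the legacy Chrome history parser now produces timezone-aware datetimes. Both are plain standard-library behaviours, illustrated briefly:

    from datetime import datetime, timezone

    # str.removeprefix strips the prefix only when present, otherwise returns the string unchanged
    url = "https://www.google.com/url?q=https://en.wikipedia.org/wiki/Clapham"
    print(url.removeprefix("https://www.google.com/url?q="))  # https://en.wikipedia.org/wiki/Clapham

    # BrowserHistory.json stores microsecond epoch timestamps; tz= makes the result UTC-aware
    time_usec = 1_600_000_000_000_000  # illustrative value
    print(datetime.fromtimestamp(time_usec / 10**6, tz=timezone.utc))  # 2020-09-13 12:26:40+00:00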
promnesia/sources/telegram.py
CHANGED
@@ -1,11 +1,12 @@
-from
-
+from __future__ import annotations
+
 import warnings
+from urllib.parse import unquote  # TODO mm, make it easier to rememember to use...
 
-from promnesia.common import
+from promnesia.common import Loc, PathIsh, Results, Visit, extract_urls, logger
 
 
-def index(database:
+def index(database: PathIsh | None = None, *, http_only: bool = False, with_extra_media_info: bool = False) -> Results:
     if database is None:
         # fully relying on HPI
         yield from _index_new(http_only=http_only, with_extra_media_info=with_extra_media_info)
@@ -16,11 +17,14 @@ def index(database: Optional[PathIsh]=None, *, http_only: bool=False, with_extra
             f'Will try to hack database path {database} into HPI config.'
         )
         try:
-            yield from _index_new_with_adhoc_config(
-
+            yield from _index_new_with_adhoc_config(
+                database=database, http_only=http_only, with_extra_media_info=with_extra_media_info
+            )
         except Exception as e:
             logger.exception(e)
             warnings.warn("Hacking my.config.telegram.telegram_backup didn't work. You probably need to update HPI.")
+        else:
+            return
 
     logger.warning("Falling back onto promnesia.sources.telegram_legacy module")
     yield from _index_legacy(database=database, http_only=http_only)
@@ -28,11 +32,12 @@ def index(database: Optional[PathIsh]=None, *, http_only: bool=False, with_extra
 
 def _index_legacy(*, database: PathIsh, http_only: bool) -> Results:
     from . import telegram_legacy
+
     yield from telegram_legacy.index(database=database, http_only=http_only)
 
 
 def _index_new_with_adhoc_config(*, database: PathIsh, http_only: bool, with_extra_media_info: bool) -> Results:
-    from . import hpi
+    from . import hpi  # noqa: F401
 
     class config:
         class telegram:
@@ -40,19 +45,20 @@ def _index_new_with_adhoc_config(*, database: PathIsh, http_only: bool, with_ext
             export_path: PathIsh = database
 
     from my.core.cfg import tmp_config
+
     with tmp_config(modules='my.telegram.telegram_backup', config=config):
         yield from _index_new(http_only=http_only, with_extra_media_info=with_extra_media_info)
 
 
 def _index_new(*, http_only: bool, with_extra_media_info: bool) -> Results:
-    from . import hpi
+    from . import hpi  # noqa: F401,I001
     from my.telegram.telegram_backup import messages
 
     extra_where = "(has_media == 1 OR text LIKE '%http%')" if http_only else None
-    for
-
-
-    )
+    for m in messages(
+        with_extra_media_info=with_extra_media_info,
+        extra_where=extra_where,
+    ):
         text = m.text
 
         urls = extract_urls(text)
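The `else: return` added to `index()` above changes the control flow: the `else` clause of a `try` statement runs only when no exception escaped the `try` body, so a successful ad-hoc-config run now stops there instead of also falling through to the legacy indexer. A minimal stand-alone illustration of the construct (the function names are illustrative, not taken from the diff):

    def attempt(primary, fallback):
        try:
            result = primary()
        except Exception:
            pass               # swallow the failure; fall through to the fallback below
        else:
            return result      # primary() succeeded, so the fallback is skipped
        return fallback()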
promnesia/sources/telegram_legacy.py
CHANGED
@@ -2,34 +2,42 @@
 Uses [[https://github.com/fabianonline/telegram_backup#readme][telegram_backup]] database for messages data
 '''
 
-from
+from __future__ import annotations
+
 import sqlite3
+from pathlib import Path
 from textwrap import dedent
-from typing import
-from urllib.parse import unquote
+from typing import TypeVar
+from urllib.parse import unquote  # TODO mm, make it easier to rememember to use...
+
+from promnesia.common import (
+    Loc,
+    PathIsh,
+    Results,
+    Visit,
+    echain,
+    extract_urls,
+    from_epoch,
+)
 
-from ..common import PathIsh, Visit, get_logger, Loc, extract_urls, from_epoch, Results, echain
 from ..sqlite import sqlite_connection
 
 T = TypeVar("T")
 
 
-def unwrap(res:
+def unwrap(res: T | Exception) -> T:
     if isinstance(res, Exception):
         raise res
-
-    return res
+    return res
 
 
-def index(database: PathIsh, *, http_only: bool=False) -> Results:
+def index(database: PathIsh, *, http_only: bool = False) -> Results:
     """
     :param database:
         the path of the sqlite generated by the _telegram_backup_ java program
     :param http_only:
         when true, do not collect IP-addresses and `python.py` strings
     """
-    logger = get_logger()
-
     path = Path(database)
     assert path.is_file(), path
 
@@ -66,7 +74,8 @@ def index(database: PathIsh, *, http_only: bool=False) -> Results:
             M.message_type NOT IN ('service_message', 'empty_message')
             {extra_criteria}
         ORDER BY time;
-    """
+        """
+    )
 
     with sqlite_connection(path, immutable=True, row_factory='row') as db:
         # TODO yield error if chatname or chat or smth else is null?
@@ -94,6 +103,7 @@ def _handle_row(row: sqlite3.Row) -> Results:
     urls = extract_urls(text)
     if len(urls) == 0:
         return
+    # fmt: off
     dt = from_epoch(row['time'])
     mid: str = unwrap(row['mid'])
 
@@ -101,6 +111,7 @@ def _handle_row(row: sqlite3.Row) -> Results:
     sender: str = unwrap(row['sender'])
    chatname: str = unwrap(row['chatname'])
    chat: str = unwrap(row['chat'])
+    # fmt: on
 
     in_context = f'https://t.me/{chat}/{mid}'
     for u in urls: