promnesia 1.1.20230129__py3-none-any.whl → 1.2.20240810__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as published.
- promnesia/__main__.py +58 -50
- promnesia/cannon.py +4 -4
- promnesia/common.py +57 -38
- promnesia/compare.py +3 -2
- promnesia/compat.py +6 -65
- promnesia/config.py +4 -2
- promnesia/database/common.py +66 -0
- promnesia/database/dump.py +187 -0
- promnesia/{read_db.py → database/load.py} +10 -11
- promnesia/extract.py +1 -0
- promnesia/kjson.py +1 -1
- promnesia/logging.py +14 -14
- promnesia/misc/__init__.pyi +0 -0
- promnesia/misc/config_example.py +1 -2
- promnesia/misc/install_server.py +5 -4
- promnesia/server.py +24 -24
- promnesia/sources/__init__.pyi +0 -0
- promnesia/sources/auto.py +12 -7
- promnesia/sources/browser.py +80 -293
- promnesia/sources/browser_legacy.py +298 -0
- promnesia/sources/demo.py +18 -2
- promnesia/sources/filetypes.py +8 -0
- promnesia/sources/github.py +2 -2
- promnesia/sources/hackernews.py +1 -2
- promnesia/sources/hypothesis.py +1 -1
- promnesia/sources/markdown.py +15 -15
- promnesia/sources/org.py +7 -3
- promnesia/sources/plaintext.py +3 -1
- promnesia/sources/reddit.py +2 -2
- promnesia/sources/rss.py +5 -1
- promnesia/sources/shellcmd.py +6 -2
- promnesia/sources/signal.py +29 -20
- promnesia/sources/smscalls.py +8 -1
- promnesia/sources/stackexchange.py +2 -2
- promnesia/sources/takeout.py +132 -12
- promnesia/sources/takeout_legacy.py +10 -2
- promnesia/sources/telegram.py +79 -123
- promnesia/sources/telegram_legacy.py +117 -0
- promnesia/sources/vcs.py +1 -1
- promnesia/sources/viber.py +6 -15
- promnesia/sources/website.py +1 -1
- promnesia/sqlite.py +42 -0
- promnesia/tests/__init__.py +0 -0
- promnesia/tests/common.py +137 -0
- promnesia/tests/server_helper.py +64 -0
- promnesia/tests/sources/__init__.py +0 -0
- promnesia/tests/sources/test_auto.py +66 -0
- promnesia/tests/sources/test_filetypes.py +42 -0
- promnesia/tests/sources/test_hypothesis.py +39 -0
- promnesia/tests/sources/test_org.py +65 -0
- promnesia/tests/sources/test_plaintext.py +26 -0
- promnesia/tests/sources/test_shellcmd.py +22 -0
- promnesia/tests/sources/test_takeout.py +58 -0
- promnesia/tests/test_cannon.py +325 -0
- promnesia/tests/test_cli.py +42 -0
- promnesia/tests/test_compare.py +30 -0
- promnesia/tests/test_config.py +290 -0
- promnesia/tests/test_db_dump.py +223 -0
- promnesia/tests/test_extract.py +61 -0
- promnesia/tests/test_extract_urls.py +43 -0
- promnesia/tests/test_indexer.py +245 -0
- promnesia/tests/test_server.py +292 -0
- promnesia/tests/test_traverse.py +41 -0
- promnesia/tests/utils.py +35 -0
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/METADATA +14 -19
- promnesia-1.2.20240810.dist-info/RECORD +83 -0
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/WHEEL +1 -1
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/entry_points.txt +0 -1
- promnesia/dump.py +0 -105
- promnesia-1.1.20230129.dist-info/RECORD +0 -55
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/LICENSE +0 -0
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/top_level.txt +0 -0
promnesia/sources/signal.py
CHANGED
@@ -1,12 +1,13 @@
 """
 Collects visits from Signal Desktop's encrypted SQLIite db(s).
-
-Functions get their defaults from module-data.
-
-* Open-ciphered-db adapted from:
-  https://github.com/carderne/signal-export/commit/2284c8f4
-* Copyright (c) 2019 Chris Arderne, 2020 Kostis Anagnostopoulos
 """
+
+# Functions get their defaults from module-data.
+#
+# * Open-ciphered-db adapted from:
+#   https://github.com/carderne/signal-export/commit/2284c8f4
+# * Copyright (c) 2019 Chris Arderne, 2020 Kostis Anagnostopoulos
+
 
 import json
 import logging
@@ -62,6 +63,8 @@ def index(
     logger.debug("Paths to harvest: %s", db_paths)
     if not http_only:
         sql_query = f"{messages_query}\nWHERE body LIKE '%http%'"
+    else:
+        sql_query = messages_query
 
     for db_path in resolved_db_paths:
         logger.info("Ciphered db to harvest %s", db_path)
@@ -105,12 +108,18 @@ messages_query = dedent(
         SELECT
             id,
             type,
-            coalesce(
+            coalesce(
+                profileFullName,
+                profileName,
+                name,
+                profileFamilyName,
+                e164
+            ) as aname,
             name,
             profileName,
             profileFamilyName,
             e164,
-
+            serviceId
         FROM conversations
     ),
     Msgs AS (
@@ -122,8 +131,8 @@ messages_query = dedent(
                 M.received_at,
                 M.sent_at
             ) AS timestamp,
-            IIF(M.type =
-
+            IIF(M.type = 'outgoing',
+                'Me (' || C2.aname || ')',
                 C2.aname
             ) AS sender,
             M.conversationId AS cid,
@@ -137,7 +146,7 @@ messages_query = dedent(
         INNER JOIN Cons AS C1
             ON M.conversationId = C1.id
         INNER JOIN Cons AS C2
-            ON M.
+            ON M.sourceServiceId = C2.serviceId
     )
     SELECT id, timestamp, sender, cid, chatname, body
     FROM Msgs
@@ -187,8 +196,8 @@ def _expand_path(path_pattern: PathIsh) -> Iterable[Path]:
 
 def _expand_paths(paths: PathIshes) -> Iterable[Path]:
     if _is_pathish(paths):
-        paths = [paths]  # type: ignore[
-    return [pp.resolve() for p in paths for pp in _expand_path(p)]  # type: ignore[union-attr
+        paths = [paths]  # type: ignore[list-item]
+    return [pp.resolve() for p in paths for pp in _expand_path(p)]  # type: ignore[union-attr]
 
 
 def collect_db_paths(*db_paths: PathIsh, append: bool = False) -> Iterable[Path]:
@@ -235,7 +244,7 @@ def collect_db_paths(*db_paths: PathIsh, append: bool = False) -> Iterable[Path]
     )
 
     if db_paths and append:
-        db_paths = [  # type: ignore[
+        db_paths = [  # type: ignore[assignment]
            *([db_paths] if _is_pathish(db_paths) else db_paths),
            plat_paths,
        ]
@@ -309,8 +318,8 @@ def connect_db(
        sql_cmds.extend(
            [
                f"ATTACH DATABASE '{decrypted_file}' AS plaintext KEY '';",
-
-
+                "SELECT sqlcipher_export('plaintext');",
+                "DETACH DATABASE plaintext;",
            ]
        )
        sql = "\n".join(sql_cmds)
@@ -319,7 +328,7 @@ def connect_db(
            "Decrypting db '%s' with cmd: %s <<<EOF\n%s\nEOF", db_path, cmd, sql
        )
        try:
-            sbp.run(
+            sbp.run(
                cmd,
                check=True,
                input=sql,
@@ -334,7 +343,7 @@ def connect_db(
        ) from None
        db = sqlite3.connect(f"file:{decrypted_file}?mode=ro", uri=True)
    else:
-        from sqlcipher3 import dbapi2  # type: ignore[import]
+        from sqlcipher3 import dbapi2  # type: ignore[import-not-found]
 
        db = dbapi2.connect(f"file:{db_path}?mode=ro", uri=True)
        # Param-binding doesn't work for pragmas, so use a direct string concat.
@@ -418,9 +427,9 @@ def _harvest_db(
 
    with connect_db(db_path, key, decrypt_db=decrypt_db, **decryption_pragmas) as db:
        for mid, tstamp, sender, cid, chatname, text in db.execute(messages_query):
+            tstamp = from_epoch(tstamp / 1000.0)
+            row = (mid, tstamp, sender, cid, chatname, text)
            try:
-                tstamp = from_epoch(tstamp / 1000.0)
-                row = (mid, tstamp, sender, cid, chatname, text)
                yield from _handle_row(row, db_path, locator_schema)
            except Exception as ex:
                # TODO: also insert errors in db
promnesia/sources/smscalls.py
CHANGED
@@ -11,11 +11,18 @@ def index() -> Results:
 
     for m in messages():
 
+        if isinstance(m, Exception):
+            yield m
+            continue
+
         urls = extract_urls(m.message)
         if len(urls) == 0:
             continue
 
-
+        if m.who is None:
+            loc = Loc(title=f"SMS with {m.phone_number}")
+        else:
+            loc = Loc(title=f"SMS with {m.who} ({m.phone_number})")
 
         for u in urls:
             yield Visit(
promnesia/sources/stackexchange.py
CHANGED
@@ -2,12 +2,12 @@
 Uses [[https://github.com/karlicoss/HPI][HPI]] for Stackexchange data.
 '''
 
-from ..common import Results, Visit, Loc
+from ..common import Results, Visit, Loc
 
 
 def index() -> Results:
     from . import hpi
-    import my.stackexchange.gdpr as G
+    import my.stackexchange.gdpr as G
     for v in G.votes():
         if isinstance(v, Exception):
             yield v
promnesia/sources/takeout.py
CHANGED
@@ -1,19 +1,26 @@
 '''
 Uses HPI [[https://github.com/karlicoss/HPI/blob/master/doc/MODULES.org#mygoogletakeoutpaths][google.takeout]] module
 '''
-from typing import Iterable, Set,
+from typing import Iterable, Set, Any, NamedTuple
 import warnings
 
 from ..common import Visit, Loc, Results, logger
 from ..compat import removeprefix
 
 
+# incase user is using an old version of google_takeout_parser
+class YoutubeCSVStub(NamedTuple):
+    contentJSON: str
+
+
 def index() -> Results:
     from . import hpi
+    import json
 
     try:
         from my.google.takeout.parser import events
-        from google_takeout_parser.models import Activity, YoutubeComment, LikedYoutubeVideo, ChromeHistory
+        from google_takeout_parser.models import Activity, YoutubeComment, LikedYoutubeVideo, ChromeHistory
+        from google_takeout_parser.parse_csv import reconstruct_comment_content, extract_comment_links
     except ModuleNotFoundError as ex:
         logger.exception(ex)
         yield ex
@@ -24,17 +31,30 @@ def index() -> Results:
         yield from takeout_legacy.index()
         return
 
-
+
+    _seen: Set[str] = {
         # these are definitely not useful for promnesia
-        Location,
-
+        'Location',
+        'PlaceVisit',
+        'PlayStoreAppInstall',
     }
-
-
-
+
+    imported_yt_csv_models = False
+    try:
+        from google_takeout_parser.models import CSVYoutubeComment, CSVYoutubeLiveChat
+        imported_yt_csv_models = True
+    except ImportError:
+        # warn user to upgrade google_takeout_parser
+        warnings.warn("Please upgrade google_takeout_parser (`pip install -U google_takeout_parser`) to support the new format for youtube comments")
+        CSVYoutubeComment = YoutubeCSVStub  # type: ignore[misc,assignment]
+        CSVYoutubeLiveChat = YoutubeCSVStub  # type: ignore[misc,assignment]
+
+    def warn_once_if_not_seen(e: Any) -> Iterable[Exception]:
+        et_name = type(e).__name__
+        if et_name in _seen:
             return
-            _seen.add(
-            yield RuntimeError(f"Unhandled event {
+        _seen.add(et_name)
+        yield RuntimeError(f"Unhandled event {repr(type(e))}: {e}")
 
     for e in events():
         if isinstance(e, Exception):
@@ -48,12 +68,37 @@ def index() -> Results:
             # e.g. https://www.google.com/url?q=https://en.wikipedia.org/wiki/Clapham
             # note: also title usually starts with 'Visited ', in such case but perhaps fine to keep it
             url = removeprefix(url, "https://www.google.com/url?q=")
+            title = e.title
 
+            if e.header == 'Chrome':
+                # title contains 'Visited <page title>' in this case
+                context = None
+                title = removeprefix(title, 'Visited ')
+            elif e.header in _CLEAR_CONTEXT_FOR_HEADERS:
+                # todo perhaps could add to some sort of metadata?
+                # only useful for debugging really
+                context = None
+            elif e.header in url:
+                # stuff like News only has domain name in the header -- completely useless for promnesia
+                context = None
+            elif e.title == f'Used {e.header}':
+                # app usage tracking -- using app name as context is useless here
+                context = None
+            elif e.products == ['Android']:
+                # seems to be coming from in-app browser, header contains app name in this case
+                context = None
+            elif e.products == ['Ads']:
+                # header contains some weird internal ad id in this case
+                context = None
+            else:
+                context = None
+            # NOTE: at this point seems that context always ends up as None (at least for @karlicoss as of 20230131)
+            # so alternatively could just force it to be None instead of manual dispatching :shrug:
             yield Visit(
                 url=url,
                 dt=e.time,
-                context=
-                locator=Loc(title=
+                context=context,
+                locator=Loc(title=title, href=url),
             )
             for s in e.subtitles:
                 surl = s[1]
@@ -73,6 +118,8 @@ def index() -> Results:
                 locator=Loc(title=e.title, href=e.url),
             )
         elif isinstance(e, LikedYoutubeVideo):
+            # TODO not sure if desc makes sense here since it's not user produced data
+            # it's just a part of video meta?
             yield Visit(
                 url=e.link, dt=e.dt, context=e.desc, locator=Loc(title=e.title, href=e.link)
             )
@@ -83,5 +130,78 @@ def index() -> Results:
             yield Visit(
                 url=url, dt=e.dt, context=e.content, locator=Loc(title=e.content, href=url)
             )
+        elif imported_yt_csv_models and isinstance(e, CSVYoutubeComment):
+            contentJSON = json.loads(e.contentJSON)
+            content = reconstruct_comment_content(contentJSON, format='text')
+            if isinstance(content, Exception):
+                yield content
+                continue
+            links = extract_comment_links(contentJSON)
+            if isinstance(links, Exception):
+                yield links
+                continue
+            context = f"Commented on {e.video_url}"
+            for url in links:
+                yield Visit(
+                    url=url, dt=e.dt, context=content, locator=Loc(title=context, href=url)
+                )
+            yield Visit(
+                url=e.video_url, dt=e.dt, context=content, locator=Loc(title=context, href=e.video_url)
+            )
+        elif imported_yt_csv_models and isinstance(e, CSVYoutubeLiveChat):
+            contentJSON = json.loads(e.contentJSON)
+            content = reconstruct_comment_content(contentJSON, format='text')
+            if isinstance(content, Exception):
+                yield content
+                continue
+            links = extract_comment_links(contentJSON)
+            if isinstance(links, Exception):
+                yield links
+                continue
+            context = f"Commented on livestream {e.video_url}"
+            for url in links:
+                yield Visit(
+                    url=url, dt=e.dt, context=content, locator=Loc(title=context, href=url)
+                )
+            yield Visit(
+                url=e.video_url, dt=e.dt, context=content, locator=Loc(title=context, href=e.video_url)
+            )
         else:
             yield from warn_once_if_not_seen(e)
+
+
+_CLEAR_CONTEXT_FOR_HEADERS = {
+    'Google Cloud',
+    'Travel',
+    'Google Arts & Culture',
+    'Drive',
+    'Calendar',
+    'Google Store',
+    'Shopping',
+    'News',
+    'Help',
+    'Books',
+    'Google My Business',
+    'Google Play Movies & TV',
+    'Developers',
+    'YouTube',
+    'Gmail',
+    'Video Search',
+    'Google Apps',
+    'Google Translate',
+    'Ads',
+    'Image Search',
+    'Assistant',
+    'Google Play Store',
+    'Android',
+    'Maps',
+    'Search',
+    'Google App',
+    'in_app_display_context_client',
+    'Play Music',
+    'Maps - Navigate & Explore',
+    'Google Maps',
+    'google.com',
+    'Google Play Books',
+    'Maps - Navigation & Transit',
+}
promnesia/sources/takeout_legacy.py
CHANGED
@@ -34,7 +34,15 @@ from .. import config
 
 
 from more_itertools import unique_everseen
-
+
+try:
+    from cachew import cachew
+except ModuleNotFoundError as me:
+    if me.name != 'cachew':
+        raise me
+    # this module is legacy anyway, so just make it defensive
+    def cachew(*args, **kwargs):  # type: ignore[no-redef]
+        return lambda f: f
 
 
 # TODO use CPath? Could encapsulate a path within an archive *or* within a directory
@@ -105,7 +113,7 @@ def read_browser_history_json(takeout: TakeoutPath) -> Iterable[Visit]:
     hist = j['Browser History']
     for item in hist:
         url = item['url']
-        time = datetime.
+        time = datetime.fromtimestamp(item['time_usec'] / 10 ** 6, tz=pytz.utc)
         # TODO any more interesitng info?
         yield Visit(
             url=url,
promnesia/sources/telegram.py
CHANGED
@@ -1,128 +1,84 @@
-'''
-Uses [[https://github.com/fabianonline/telegram_backup#readme][telegram_backup]] database for messages data
-'''
-
-from pathlib import Path
-from textwrap import dedent
-from typing import Optional, Union, TypeVar
+from typing import Optional
 from urllib.parse import unquote  # TODO mm, make it easier to rememember to use...
+import warnings
 
-from
-
-
-
-
-
-
-def unwrap(res: Union[T, Exception]) -> T:
-    if isinstance(res, Exception):
-        raise res
-    else:
-        return res
-
-
-# TODO move to common?
-def dataset_readonly(db: Path):
-    import dataset  # type: ignore
-    # see https://github.com/pudo/dataset/issues/136#issuecomment-128693122
-    import sqlite3
-    creator = lambda: sqlite3.connect(f'file:{db}?immutable=1', uri=True)
-    return dataset.connect('sqlite:///' , engine_kwargs={'creator': creator})
-
-
-def index(database: PathIsh, *, http_only: bool=False) -> Results:
-    """
-    :param database:
-        the path of the sqlite generated by the _telegram_backup_ java program
-    :param http_only:
-        when true, do not collect IP-addresses and `python.py` strings
-    """
-    logger = get_logger()
-
-    path = Path(database)
-    assert path.is_file(), path  # TODO could check is_file inside `dataset_readonly()`
-
-    def make_query(text_query: str) -> str:
-        extra_criteria = "AND (M.has_media == 1 OR text LIKE '%http%')" if http_only else ""
-        return dedent(
-            f"""
-            WITH entities AS (
-                SELECT 'dialog' as type
-                    , id
-                    , coalesce(username, id) as handle
-                    , coalesce(first_name || " " || last_name
-                        , username
-                        , id
-                    ) as display_name FROM users
-                UNION
-                SELECT 'group' as type
-                    , id
-                    , id as handle
-                    , coalesce(name, id) as display_name FROM chats
-            )
-            SELECT src.display_name AS chatname
-                , src.handle AS chat
-                , snd.display_name AS sender
-                , M.time AS time
-                , {text_query} AS text
-                , M.id AS mid
-            FROM messages AS M
-            /* chat types are 'dialog' (1-1), 'group' and 'supergroup' */
-            /* this is abit hacky way to handle all groups in one go */
-            LEFT JOIN entities AS src ON M.source_id = src.id AND src.type = (CASE M.source_type WHEN 'supergroup' THEN 'group' ELSE M.source_type END)
-            LEFT JOIN entities AS snd ON M.sender_id = snd.id AND snd.type = 'dialog'
-            WHERE
-                M.message_type NOT IN ('service_message', 'empty_message')
-                {extra_criteria}
-            ORDER BY time;
-            """)
-
-    # TODO context manager?
-    with dataset_readonly(path) as db:
-
-        # TODO yield error if chatname or chat or smth else is null?
-        for row in db.query(make_query('M.text')):
-            try:
-                yield from _handle_row(row)
-            except Exception as ex:
-                yield echain(RuntimeError(f'While handling {row}'), ex)
-                # , None, sys.exc_info()[2]
-                # TODO hmm. traceback isn't preserved; wonder if that's because it's too heavy to attach to every single exception object..
-
-        # old (also 'stable') version doesn't have 'json' column yet...
-        if 'json' in db['messages'].columns:
-            for row in db.query(make_query("json_extract(json, '$.media.webpage.description')")):
-                try:
-                    yield from _handle_row(row)
-                except Exception as ex:
-                    yield echain(RuntimeError(f'While handling {row}'), ex)
-
-
-def _handle_row(row) -> Results:
-    text = row['text']
-    if text is None:
+from promnesia.common import Results, logger, extract_urls, Visit, Loc, PathIsh
+
+
+def index(database: Optional[PathIsh]=None, *, http_only: bool=False, with_extra_media_info: bool=False) -> Results:
+    if database is None:
+        # fully relying on HPI
+        yield from _index_new(http_only=http_only, with_extra_media_info=with_extra_media_info)
         return
-
-
+
+    warnings.warn(
+        f'Passing paths to promnesia.sources.telegram is deprecated, you should setup my.telegram.telegram_backup instead. '
+        f'Will try to hack database path {database} into HPI config.'
+    )
+    try:
+        yield from _index_new_with_adhoc_config(database=database, http_only=http_only, with_extra_media_info=with_extra_media_info)
         return
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    except Exception as e:
+        logger.exception(e)
+        warnings.warn("Hacking my.config.telegram.telegram_backup didn't work. You probably need to update HPI.")
+
+    logger.warning("Falling back onto promnesia.sources.telegram_legacy module")
+    yield from _index_legacy(database=database, http_only=http_only)
+
+
+def _index_legacy(*, database: PathIsh, http_only: bool) -> Results:
+    from . import telegram_legacy
+    yield from telegram_legacy.index(database=database, http_only=http_only)
+
+
+def _index_new_with_adhoc_config(*, database: PathIsh, http_only: bool, with_extra_media_info: bool) -> Results:
+    from . import hpi
+
+    class config:
+        class telegram:
+            class telegram_backup:
+                export_path: PathIsh = database
+
+    from my.core.cfg import tmp_config
+    with tmp_config(modules='my.telegram.telegram_backup', config=config):
+        yield from _index_new(http_only=http_only, with_extra_media_info=with_extra_media_info)
+
+
+def _index_new(*, http_only: bool, with_extra_media_info: bool) -> Results:
+    from . import hpi
+    from my.telegram.telegram_backup import messages
+
+    extra_where = "(has_media == 1 OR text LIKE '%http%')" if http_only else None
+    for i, m in enumerate(messages(
+        with_extra_media_info=with_extra_media_info,
+        extra_where=extra_where,
+    )):
+        text = m.text
+
+        urls = extract_urls(text)
+        extra_media_info = m.extra_media_info
+        if extra_media_info is not None:
+            urls.extend(extract_urls(extra_media_info))
+
+        if len(urls) == 0:
+            continue
+
+        dt = m.time
+        sender = m.sender.name
+        chat = m.chat
+
+        cname = chat.name if chat.name is not None else str(chat.id)
+
+        locator = Loc.make(
+            title=f"chat with {cname}",
+            href=m.permalink,
         )
+        context = f'{sender}: {text}'
+
+        for u in urls:
+            yield Visit(
+                url=unquote(u),
+                dt=dt,
+                context=context,
+                locator=locator,
+            )