promnesia 1.2.20230515__py3-none-any.whl → 1.3.20241021__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promnesia/__init__.py +14 -3
- promnesia/__main__.py +60 -35
- promnesia/cannon.py +27 -27
- promnesia/common.py +85 -67
- promnesia/compare.py +21 -22
- promnesia/compat.py +10 -10
- promnesia/config.py +23 -23
- promnesia/database/common.py +67 -0
- promnesia/database/dump.py +188 -0
- promnesia/{read_db.py → database/load.py} +16 -17
- promnesia/extract.py +14 -11
- promnesia/kjson.py +12 -11
- promnesia/logging.py +4 -4
- promnesia/misc/__init__.pyi +0 -0
- promnesia/misc/config_example.py +1 -2
- promnesia/misc/install_server.py +7 -9
- promnesia/server.py +57 -47
- promnesia/sources/__init__.pyi +0 -0
- promnesia/sources/auto.py +50 -35
- promnesia/sources/auto_logseq.py +6 -5
- promnesia/sources/auto_obsidian.py +2 -2
- promnesia/sources/browser.py +14 -9
- promnesia/sources/browser_legacy.py +26 -16
- promnesia/sources/demo.py +19 -3
- promnesia/sources/fbmessenger.py +3 -2
- promnesia/sources/filetypes.py +16 -7
- promnesia/sources/github.py +7 -9
- promnesia/sources/guess.py +2 -1
- promnesia/sources/hackernews.py +2 -2
- promnesia/sources/hpi.py +2 -2
- promnesia/sources/html.py +7 -5
- promnesia/sources/hypothesis.py +4 -3
- promnesia/sources/instapaper.py +2 -2
- promnesia/sources/markdown.py +31 -21
- promnesia/sources/org.py +27 -13
- promnesia/sources/plaintext.py +30 -29
- promnesia/sources/pocket.py +3 -2
- promnesia/sources/reddit.py +20 -19
- promnesia/sources/roamresearch.py +2 -1
- promnesia/sources/rss.py +4 -5
- promnesia/sources/shellcmd.py +19 -6
- promnesia/sources/signal.py +33 -24
- promnesia/sources/smscalls.py +2 -2
- promnesia/sources/stackexchange.py +4 -3
- promnesia/sources/takeout.py +76 -9
- promnesia/sources/takeout_legacy.py +24 -12
- promnesia/sources/telegram.py +13 -11
- promnesia/sources/telegram_legacy.py +18 -7
- promnesia/sources/twitter.py +6 -5
- promnesia/sources/vcs.py +5 -3
- promnesia/sources/viber.py +10 -9
- promnesia/sources/website.py +4 -4
- promnesia/sources/zulip.py +3 -2
- promnesia/sqlite.py +7 -4
- promnesia/tests/__init__.py +0 -0
- promnesia/tests/common.py +140 -0
- promnesia/tests/server_helper.py +67 -0
- promnesia/tests/sources/__init__.py +0 -0
- promnesia/tests/sources/test_auto.py +65 -0
- promnesia/tests/sources/test_filetypes.py +43 -0
- promnesia/tests/sources/test_hypothesis.py +39 -0
- promnesia/tests/sources/test_org.py +64 -0
- promnesia/tests/sources/test_plaintext.py +25 -0
- promnesia/tests/sources/test_shellcmd.py +21 -0
- promnesia/tests/sources/test_takeout.py +56 -0
- promnesia/tests/test_cannon.py +325 -0
- promnesia/tests/test_cli.py +40 -0
- promnesia/tests/test_compare.py +30 -0
- promnesia/tests/test_config.py +289 -0
- promnesia/tests/test_db_dump.py +222 -0
- promnesia/tests/test_extract.py +65 -0
- promnesia/tests/test_extract_urls.py +43 -0
- promnesia/tests/test_indexer.py +251 -0
- promnesia/tests/test_server.py +291 -0
- promnesia/tests/test_traverse.py +39 -0
- promnesia/tests/utils.py +35 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/METADATA +15 -18
- promnesia-1.3.20241021.dist-info/RECORD +83 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/WHEEL +1 -1
- {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/entry_points.txt +0 -1
- promnesia/dump.py +0 -105
- promnesia-1.2.20230515.dist-info/RECORD +0 -58
- {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/LICENSE +0 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/top_level.txt +0 -0
promnesia/sources/roamresearch.py  CHANGED
@@ -2,7 +2,7 @@
 Uses [[https://github.com/karlicoss/HPI][HPI]] for Roam Research data
 '''

-from
+from promnesia.common import Loc, Results, Visit, extract_urls


 def index() -> Results:
@@ -43,6 +43,7 @@ def _collect(node: 'RoamNode') -> Results:


 import typing
+
 if typing.TYPE_CHECKING:
     import my.roamresearch as RR
     RoamNode = RR.Node
promnesia/sources/rss.py  CHANGED
@@ -2,14 +2,12 @@
 Uses [[https://github.com/karlicoss/HPI][HPI]] for RSS data.
 '''

-from itertools import chain
-
-from ..common import Visit, Loc, extract_urls, Results, get_logger
-
 from datetime import datetime

 import pytz

+from promnesia.common import Loc, Results, Visit
+
 # arbitrary, 2011-11-04 00:05:23.283+00:00
 default_datetime = datetime.fromtimestamp(1320365123, tz=pytz.utc)
 # TODO FIXME allow for visit not to have datetime?
@@ -17,12 +15,13 @@ default_datetime = datetime.fromtimestamp(1320365123, tz=pytz.utc)

 def index() -> Results:
     from my.rss.all import subscriptions
+
     for feed in subscriptions():
         # TODO locator should be optional too? although could use direct link in the rss reader interface
         locator = Loc.make(title='my.rss')
         yield Visit(
             url=feed.url,
             dt=feed.created_at or default_datetime,
-            context=
+            context='RSS subscription',  # TODO use 'provider', etc?
             locator=locator,
         )
promnesia/sources/shellcmd.py  CHANGED
@@ -2,18 +2,31 @@
 Greps out URLs from an arbitrary shell command results.
 """

-from
+from __future__ import annotations
+
 import os
 import re
-from subprocess import run, PIPE
-from typing import Union, Sequence
 import warnings
+from collections.abc import Sequence
+from datetime import datetime
+from subprocess import PIPE, run
+
+from promnesia.common import (
+    Loc,
+    PathIsh,
+    Results,
+    Visit,
+    _is_windows,
+    extract_urls,
+    file_mtime,
+    get_system_tz,
+    now_tz,
+)

-from ..common import Visit, Loc, Results, extract_urls, file_mtime, get_system_tz, now_tz, _is_windows, PathIsh
 from .plaintext import _has_grep


-def index(command: Union[str, Sequence[PathIsh]]) -> Results:
+def index(command: str | Sequence[PathIsh]) -> Results:
     cmd: Sequence[PathIsh]
     cmds: str
     if isinstance(command, str):
@@ -71,7 +84,7 @@ def index(command: Union[str, Sequence[PathIsh]]) -> Results:
                 context=context,
             )

-    r = run(cmd, stdout=PIPE)
+    r = run(cmd, stdout=PIPE, check=False)
     if r.returncode > 0:
         if not (cmd[0] in {'grep', 'findstr'} and r.returncode == 1):  # ugh. grep returns 1 on no matches...
             r.check_returncode()
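
Note on the hunk above: run() now gets check=False and the return code is checked by hand, because grep (and findstr) exit with status 1 when nothing matched, which is not an error for indexing purposes. A minimal standalone sketch of that pattern — the command and path are made-up placeholders, not promnesia code:

    from subprocess import PIPE, run

    cmd = ['grep', '-rI', 'http', '/tmp/notes']  # hypothetical command and path
    r = run(cmd, stdout=PIPE, check=False)       # don't raise automatically
    if r.returncode > 0:
        # exit code 1 from grep/findstr just means "no matches"; only real failures should raise
        if not (cmd[0] in {'grep', 'findstr'} and r.returncode == 1):
            r.check_returncode()
    for line in r.stdout.decode(errors='replace').splitlines():
        print(line)
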
promnesia/sources/signal.py  CHANGED
@@ -1,23 +1,23 @@
 """
 Collects visits from Signal Desktop's encrypted SQLIite db(s).
 """
+from __future__ import annotations

 # Functions get their defaults from module-data.
 #
 # * Open-ciphered-db adapted from:
 # https://github.com/carderne/signal-export/commit/2284c8f4
 # * Copyright (c) 2019 Chris Arderne, 2020 Kostis Anagnostopoulos
-
-
 import json
 import logging
 import platform
 import sqlite3
 import subprocess as sbp
+from collections.abc import Iterable, Iterator, Mapping
 from contextlib import contextmanager
 from pathlib import Path
 from textwrap import dedent, indent
-from typing import Any,
+from typing import Any, Union

 from ..common import Loc, PathIsh, Results, Visit, extract_urls, from_epoch

@@ -29,7 +29,7 @@ def index(
     http_only: bool = False,
     locator_schema: str="editor",
     append_platform_path: bool = False,
-    override_key:
+    override_key: str | None = None,
 ) -> Results:
     """
     :param db_paths:
@@ -63,6 +63,8 @@ def index(
     logger.debug("Paths to harvest: %s", db_paths)
     if not http_only:
         sql_query = f"{messages_query}\nWHERE body LIKE '%http%'"
+    else:
+        sql_query = messages_query

     for db_path in resolved_db_paths:
         logger.info("Ciphered db to harvest %s", db_path)
@@ -106,12 +108,18 @@ messages_query = dedent(
         SELECT
             id,
             type,
-            coalesce(
+            coalesce(
+                profileFullName,
+                profileName,
+                name,
+                profileFamilyName,
+                e164
+            ) as aname,
             name,
             profileName,
             profileFamilyName,
             e164,
-
+            serviceId
         FROM conversations
     ),
     Msgs AS (
@@ -123,8 +131,8 @@ messages_query = dedent(
                 M.received_at,
                 M.sent_at
             ) AS timestamp,
-            IIF(M.type =
-
+            IIF(M.type = 'outgoing',
+                'Me (' || C2.aname || ')',
                 C2.aname
             ) AS sender,
             M.conversationId AS cid,
@@ -138,7 +146,7 @@ messages_query = dedent(
         INNER JOIN Cons AS C1
             ON M.conversationId = C1.id
         INNER JOIN Cons AS C2
-            ON M.
+            ON M.sourceServiceId = C2.serviceId
     )
     SELECT id, timestamp, sender, cid, chatname, body
     FROM Msgs
@@ -188,8 +196,8 @@ def _expand_path(path_pattern: PathIsh) -> Iterable[Path]:

 def _expand_paths(paths: PathIshes) -> Iterable[Path]:
     if _is_pathish(paths):
-        paths = [paths]  # type: ignore[
-    return [pp.resolve() for p in paths for pp in _expand_path(p)]  # type: ignore[union-attr
+        paths = [paths]  # type: ignore[list-item]
+    return [pp.resolve() for p in paths for pp in _expand_path(p)]  # type: ignore[union-attr]


 def collect_db_paths(*db_paths: PathIsh, append: bool = False) -> Iterable[Path]:
@@ -229,14 +237,14 @@ def collect_db_paths(*db_paths: PathIsh, append: bool = False) -> Iterable[Path]
     platform_name = platform.system()
     try:
         plat_paths = platform_db_paths[platform_name]
-    except LookupError:
+    except LookupError as le:
         raise ValueError(
             f"Unknown platform({platform_name}!"
             f"\n Expected one of {list(platform_db_paths.keys())}."
-        )
+        ) from le

     if db_paths and append:
-        db_paths = [  # type: ignore[
+        db_paths = [  # type: ignore[assignment]
             *([db_paths] if _is_pathish(db_paths) else db_paths),
             plat_paths,
         ]
@@ -253,7 +261,7 @@ def _config_for_dbfile(db_path: Path, default_key=None) -> Path:


 def _key_from_config(signal_desktop_config_path: PathIsh) -> str:
-    with
+    with Path(signal_desktop_config_path).open() as conf:
         return json.load(conf)["key"]


@@ -261,6 +269,7 @@ def _key_from_config(signal_desktop_config_path: PathIsh) -> str:
 def connect_db(
     db_path: Path,
     key,
+    *,
     decrypt_db: bool = False,
     sqlcipher_exe: PathIsh = "sqlcipher",
     **decryption_pragmas: Mapping[str, Any],
@@ -310,8 +319,8 @@ def connect_db(
         sql_cmds.extend(
             [
                 f"ATTACH DATABASE '{decrypted_file}' AS plaintext KEY '';",
-
-
+                "SELECT sqlcipher_export('plaintext');",
+                "DETACH DATABASE plaintext;",
             ]
         )
         sql = "\n".join(sql_cmds)
@@ -320,12 +329,12 @@ def connect_db(
             "Decrypting db '%s' with cmd: %s <<<EOF\n%s\nEOF", db_path, cmd, sql
         )
        try:
-            sbp.run(
+            sbp.run(
                 cmd,
                 check=True,
                 input=sql,
                 capture_output=True,
-
+                text=True,
             )
         except sbp.CalledProcessError as ex:
             prefix = " " * 4
@@ -335,7 +344,7 @@ def connect_db(
             ) from None
         db = sqlite3.connect(f"file:{decrypted_file}?mode=ro", uri=True)
     else:
-        from sqlcipher3 import dbapi2  # type: ignore[import]
+        from sqlcipher3 import dbapi2  # type: ignore[import-not-found]

         db = dbapi2.connect(f"file:{db_path}?mode=ro", uri=True)
         # Param-binding doesn't work for pragmas, so use a direct string concat.
@@ -372,7 +381,7 @@ def _handle_row(row: tuple, db_path: PathIsh, locator_schema: str) -> Results:
     if not urls:
         return

-    assert (
+    assert (  # noqa: PT018
         text and mid and sender and chatname
     ), f"should have eliminated messages without 'http' or missing ids: {row}"

@@ -392,7 +401,7 @@ def _harvest_db(
     db_path: Path,
     messages_query: str,
     *,
-    override_key:
+    override_key: str | None = None,
     locator_schema: str = "editor",
     decrypt_db: bool = False,
     **decryption_pragmas,
@@ -419,9 +428,9 @@ def _harvest_db(

     with connect_db(db_path, key, decrypt_db=decrypt_db, **decryption_pragmas) as db:
         for mid, tstamp, sender, cid, chatname, text in db.execute(messages_query):
+            tstamp = from_epoch(tstamp / 1000.0)
+            row = (mid, tstamp, sender, cid, chatname, text)
             try:
-                tstamp = from_epoch(tstamp / 1000.0)
-                row = (mid, tstamp, sender, cid, chatname, text)
                 yield from _handle_row(row, db_path, locator_schema)
             except Exception as ex:
                 # TODO: also insert errors in db
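
For orientation, the decryption branch of connect_db() shown above pipes SQL into the external sqlcipher shell to export a plaintext copy of Signal Desktop's ciphered database, then opens that copy read-only with sqlite3. A rough standalone sketch of that flow, with placeholder paths and key (the real code reads the key from Signal Desktop's config.json and assembles the pragma list dynamically; the PRAGMA key line here is an assumption based on standard SQLCipher syntax, not copied from the module):

    import sqlite3
    import subprocess as sbp

    db_path = '/path/to/Signal/sql/db.sqlite'        # placeholder
    decrypted_file = '/tmp/signal-plaintext.sqlite'  # placeholder
    key = '0123abcd'                                 # placeholder; normally taken from config.json

    sql = "\n".join([
        f"PRAGMA key = \"x'{key}'\";",  # assumption: raw hex key, standard SQLCipher pragma
        f"ATTACH DATABASE '{decrypted_file}' AS plaintext KEY '';",
        "SELECT sqlcipher_export('plaintext');",
        "DETACH DATABASE plaintext;",
    ])
    # same invocation style as the hunk above: feed the SQL to the sqlcipher CLI via stdin
    sbp.run(['sqlcipher', db_path], check=True, input=sql, capture_output=True, text=True)
    db = sqlite3.connect(f"file:{decrypted_file}?mode=ro", uri=True)
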
promnesia/sources/smscalls.py  CHANGED
@@ -2,11 +2,11 @@
 Uses [[https://github.com/karlicoss/HPI][HPI]] smscalls module
 '''

-from promnesia.common import
+from promnesia.common import Loc, Results, Visit, extract_urls


 def index() -> Results:
-    from . import hpi
+    from . import hpi  # noqa: F401,I001
     from my.smscalls import messages

     for m in messages():
promnesia/sources/stackexchange.py  CHANGED
@@ -2,12 +2,13 @@
 Uses [[https://github.com/karlicoss/HPI][HPI]] for Stackexchange data.
 '''

-from
+from promnesia.common import Loc, Results, Visit


 def index() -> Results:
-    from . import hpi
-    import my.stackexchange.gdpr as G
+    from . import hpi  # noqa: F401,I001
+    import my.stackexchange.gdpr as G
+
     for v in G.votes():
         if isinstance(v, Exception):
             yield v
promnesia/sources/takeout.py  CHANGED
@@ -1,19 +1,36 @@
 '''
 Uses HPI [[https://github.com/karlicoss/HPI/blob/master/doc/MODULES.org#mygoogletakeoutpaths][google.takeout]] module
 '''
-
+
+from __future__ import annotations
+
 import warnings
+from collections.abc import Iterable
+from typing import Any, NamedTuple
+
+from promnesia.common import Loc, Results, Visit, logger

-
-
+
+# incase user is using an old version of google_takeout_parser
+class YoutubeCSVStub(NamedTuple):
+    contentJSON: str


 def index() -> Results:
-    from . import hpi
+    from . import hpi  # noqa: F401

     try:
+        from google_takeout_parser.models import (
+            Activity,
+            ChromeHistory,
+            LikedYoutubeVideo,
+            YoutubeComment,
+        )
+        from google_takeout_parser.parse_csv import (
+            extract_comment_links,
+            reconstruct_comment_content,
+        )
         from my.google.takeout.parser import events
-        from google_takeout_parser.models import Activity, YoutubeComment, LikedYoutubeVideo, ChromeHistory
     except ModuleNotFoundError as ex:
         logger.exception(ex)
         yield ex
@@ -24,18 +41,30 @@ def index() -> Results:
         yield from takeout_legacy.index()
         return

-
+
+    _seen: set[str] = {
         # these are definitely not useful for promnesia
         'Location',
         'PlaceVisit',
         'PlayStoreAppInstall',
     }
+
+    imported_yt_csv_models = False
+    try:
+        from google_takeout_parser.models import CSVYoutubeComment, CSVYoutubeLiveChat
+        imported_yt_csv_models = True
+    except ImportError:
+        # warn user to upgrade google_takeout_parser
+        warnings.warn("Please upgrade google_takeout_parser (`pip install -U google_takeout_parser`) to support the new format for youtube comments")
+        CSVYoutubeComment = YoutubeCSVStub  # type: ignore[misc,assignment]
+        CSVYoutubeLiveChat = YoutubeCSVStub  # type: ignore[misc,assignment]
+
     def warn_once_if_not_seen(e: Any) -> Iterable[Exception]:
         et_name = type(e).__name__
         if et_name in _seen:
             return
         _seen.add(et_name)
-        yield RuntimeError(f"Unhandled event {
+        yield RuntimeError(f"Unhandled event {type(e)!r}: {e}")

     for e in events():
         if isinstance(e, Exception):
@@ -48,13 +77,13 @@ def index() -> Results:
             # when you follow something from search the actual url goes after this
             # e.g. https://www.google.com/url?q=https://en.wikipedia.org/wiki/Clapham
             # note: also title usually starts with 'Visited ', in such case but perhaps fine to keep it
-            url = removeprefix(
+            url = url.removeprefix("https://www.google.com/url?q=")
             title = e.title

             if e.header == 'Chrome':
                 # title contains 'Visited <page title>' in this case
                 context = None
-                title = removeprefix(
+                title = title.removeprefix('Visited ')
             elif e.header in _CLEAR_CONTEXT_FOR_HEADERS:
                 # todo perhaps could add to some sort of metadata?
                 # only useful for debugging really
@@ -71,6 +100,8 @@ def index() -> Results:
             elif e.products == ['Ads']:
                 # header contains some weird internal ad id in this case
                 context = None
+            else:
+                context = None
             # NOTE: at this point seems that context always ends up as None (at least for @karlicoss as of 20230131)
             # so alternatively could just force it to be None instead of manual dispatching :shrug:
             yield Visit(
@@ -109,6 +140,42 @@ def index() -> Results:
             yield Visit(
                 url=url, dt=e.dt, context=e.content, locator=Loc(title=e.content, href=url)
             )
+        elif imported_yt_csv_models and isinstance(e, CSVYoutubeComment):
+            contentJSON = e.contentJSON
+            content = reconstruct_comment_content(contentJSON, format='text')
+            if isinstance(content, Exception):
+                yield content
+                continue
+            links = extract_comment_links(contentJSON)
+            if isinstance(links, Exception):
+                yield links
+                continue
+            context = f"Commented on {e.video_url}"
+            for url in links:
+                yield Visit(
+                    url=url, dt=e.dt, context=content, locator=Loc(title=context, href=url)
+                )
+            yield Visit(
+                url=e.video_url, dt=e.dt, context=content, locator=Loc(title=context, href=e.video_url)
+            )
+        elif imported_yt_csv_models and isinstance(e, CSVYoutubeLiveChat):
+            contentJSON = e.contentJSON
+            content = reconstruct_comment_content(contentJSON, format='text')
+            if isinstance(content, Exception):
+                yield content
+                continue
+            links = extract_comment_links(contentJSON)
+            if isinstance(links, Exception):
+                yield links
+                continue
+            context = f"Commented on livestream {e.video_url}"
+            for url in links:
+                yield Visit(
+                    url=url, dt=e.dt, context=content, locator=Loc(title=context, href=url)
+                )
+            yield Visit(
+                url=e.video_url, dt=e.dt, context=content, locator=Loc(title=context, href=e.video_url)
+            )
         else:
             yield from warn_once_if_not_seen(e)

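
The CSV-comment handling added above depends on models that only newer google_takeout_parser releases ship, so the module degrades gracefully: if the import fails it warns and binds the names to a stub NamedTuple, which makes the later isinstance() checks simply never match. A condensed, generic sketch of that fallback-import pattern (trimmed from the hunk above, not a complete copy of it):

    import warnings
    from typing import NamedTuple

    class YoutubeCSVStub(NamedTuple):
        contentJSON: str

    try:
        from google_takeout_parser.models import CSVYoutubeComment  # needs a recent release
    except ImportError:
        warnings.warn("Please upgrade google_takeout_parser (`pip install -U google_takeout_parser`)")
        CSVYoutubeComment = YoutubeCSVStub  # type: ignore[misc,assignment]

    def handles(e: object) -> bool:
        # with the stub bound, this is always False on old installs
        return isinstance(e, CSVYoutubeComment)
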
promnesia/sources/takeout_legacy.py  CHANGED
@@ -1,9 +1,13 @@
-from
+from __future__ import annotations
+
+from promnesia.common import Loc, Results, Visit, logger
+

 # TODO make an iterator, insert in db as we go? handle errors gracefully?
 def index() -> Results:
-    from . import hpi
+    from . import hpi  # noqa: F401,I001
     from my.google.takeout.paths import get_takeouts
+
     takeouts = list(get_takeouts())
     # TODO if no takeouts, raise?
     # although could raise a warning on top level, when source emitted no takeouts
@@ -22,19 +26,25 @@ def index() -> Results:



-import
-from
+import json
+from collections.abc import Iterable
 from datetime import datetime
-from
+from itertools import chain
 from pathlib import Path
-import json
-

-
+import pytz
+from more_itertools import unique_everseen

+from promnesia import config

-
-from cachew import cachew
+try:
+    from cachew import cachew
+except ModuleNotFoundError as me:
+    if me.name != 'cachew':
+        raise me
+    # this module is legacy anyway, so just make it defensive
+    def cachew(*args, **kwargs):  # type: ignore[no-redef]
+        return lambda f: f


 # TODO use CPath? Could encapsulate a path within an archive *or* within a directory
@@ -42,7 +52,9 @@ TakeoutPath = Path


 def _read_myactivity_html(takeout: TakeoutPath, kind: str) -> Iterable[Visit]:
+    # FIXME switch to actual kompress? and use CPath?
     from my.core.kompress import kexists
+
     # TODO glob
     # TODO not sure about windows path separators??
     spath = 'Takeout/My Activity/' + kind
@@ -53,7 +65,7 @@ def _read_myactivity_html(takeout: TakeoutPath, kind: str) -> Iterable[Visit]:

     locator = Loc.file(spath)
     from my.google.takeout.html import read_html
-    for dt, url,
+    for dt, url, _title in read_html(takeout, spath):
         yield Visit(
             url=url,
             dt=dt,
@@ -105,7 +117,7 @@ def read_browser_history_json(takeout: TakeoutPath) -> Iterable[Visit]:
     hist = j['Browser History']
     for item in hist:
         url = item['url']
-        time = datetime.
+        time = datetime.fromtimestamp(item['time_usec'] / 10 ** 6, tz=pytz.utc)
         # TODO any more interesitng info?
         yield Visit(
             url=url,
promnesia/sources/telegram.py  CHANGED
@@ -1,11 +1,12 @@
-from
-
+from __future__ import annotations
+
 import warnings
+from urllib.parse import unquote  # TODO mm, make it easier to rememember to use...

-from promnesia.common import
+from promnesia.common import Loc, PathIsh, Results, Visit, extract_urls, logger


-def index(database:
+def index(database: PathIsh | None=None, *, http_only: bool=False, with_extra_media_info: bool=False) -> Results:
     if database is None:
         # fully relying on HPI
         yield from _index_new(http_only=http_only, with_extra_media_info=with_extra_media_info)
@@ -17,10 +18,11 @@ def index(database: Optional[PathIsh]=None, *, http_only: bool=False, with_extra
     )
     try:
         yield from _index_new_with_adhoc_config(database=database, http_only=http_only, with_extra_media_info=with_extra_media_info)
-        return
     except Exception as e:
         logger.exception(e)
         warnings.warn("Hacking my.config.telegram.telegram_backup didn't work. You probably need to update HPI.")
+    else:
+        return

     logger.warning("Falling back onto promnesia.sources.telegram_legacy module")
     yield from _index_legacy(database=database, http_only=http_only)
@@ -32,7 +34,7 @@ def _index_legacy(*, database: PathIsh, http_only: bool) -> Results:


 def _index_new_with_adhoc_config(*, database: PathIsh, http_only: bool, with_extra_media_info: bool) -> Results:
-    from . import hpi
+    from . import hpi  # noqa: F401,I001

     class config:
         class telegram:
@@ -45,14 +47,14 @@ def _index_new_with_adhoc_config(*, database: PathIsh, http_only: bool, with_ext


 def _index_new(*, http_only: bool, with_extra_media_info: bool) -> Results:
-    from . import hpi
+    from . import hpi  # noqa: F401,I001
     from my.telegram.telegram_backup import messages

     extra_where = "(has_media == 1 OR text LIKE '%http%')" if http_only else None
-    for
-
-
-    )
+    for m in messages(
+        with_extra_media_info=with_extra_media_info,
+        extra_where=extra_where,
+    ):
         text = m.text

         urls = extract_urls(text)
promnesia/sources/telegram_legacy.py  CHANGED
@@ -2,23 +2,34 @@
 Uses [[https://github.com/fabianonline/telegram_backup#readme][telegram_backup]] database for messages data
 '''

-from
+from __future__ import annotations
+
 import sqlite3
+from pathlib import Path
 from textwrap import dedent
-from typing import
-from urllib.parse import unquote
+from typing import TypeVar
+from urllib.parse import unquote  # TODO mm, make it easier to rememember to use...
+
+from promnesia.common import (
+    Loc,
+    PathIsh,
+    Results,
+    Visit,
+    echain,
+    extract_urls,
+    from_epoch,
+    get_logger,
+)

-from ..common import PathIsh, Visit, get_logger, Loc, extract_urls, from_epoch, Results, echain
 from ..sqlite import sqlite_connection

 T = TypeVar("T")


-def unwrap(res:
+def unwrap(res: T | Exception) -> T:
     if isinstance(res, Exception):
         raise res
-
-    return res
+    return res


 def index(database: PathIsh, *, http_only: bool=False) -> Results:
promnesia/sources/twitter.py  CHANGED
@@ -1,18 +1,19 @@
 '''
 Uses [[https://github.com/karlicoss/HPI][HPI]] for Twitter data.
 '''
-from typing import Iterable

-from
+from collections.abc import Iterable
+
+from promnesia.common import Loc, Res, Results, Visit, extract_urls, logger


 def index() -> Results:
-    from . import hpi
+    from . import hpi  # noqa: F401,I001
     import my.twitter.all as tw
+    from my.twitter.archive import Tweet  # todo extract to common or something?
+
     # TODO hmm. tweets themselves are sort of visits? not sure if they should contribute..
     processed = 0
-
-    from my.twitter.archive import Tweet  # todo extract to common or something?
     tweets: Iterable[Res[Tweet]] = tw.tweets()
     for t in tweets:
         if isinstance(t, Exception):
promnesia/sources/vcs.py  CHANGED
@@ -1,12 +1,14 @@
 '''
 Clones & indexes Git repositories (via sources.auto)
 '''
-
+from __future__ import annotations

-from pathlib import Path
 import re
+from collections.abc import Iterable
+
+# TODO not sure if worth exposing... could be just handled by auto or something?)
+from pathlib import Path
 from subprocess import check_call
-from typing import Iterable

 from ..common import Extraction, PathIsh, get_tmpdir, slugify
