promnesia 1.1.20230129__py3-none-any.whl → 1.2.20240810__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (72)
  1. promnesia/__main__.py +58 -50
  2. promnesia/cannon.py +4 -4
  3. promnesia/common.py +57 -38
  4. promnesia/compare.py +3 -2
  5. promnesia/compat.py +6 -65
  6. promnesia/config.py +4 -2
  7. promnesia/database/common.py +66 -0
  8. promnesia/database/dump.py +187 -0
  9. promnesia/{read_db.py → database/load.py} +10 -11
  10. promnesia/extract.py +1 -0
  11. promnesia/kjson.py +1 -1
  12. promnesia/logging.py +14 -14
  13. promnesia/misc/__init__.pyi +0 -0
  14. promnesia/misc/config_example.py +1 -2
  15. promnesia/misc/install_server.py +5 -4
  16. promnesia/server.py +24 -24
  17. promnesia/sources/__init__.pyi +0 -0
  18. promnesia/sources/auto.py +12 -7
  19. promnesia/sources/browser.py +80 -293
  20. promnesia/sources/browser_legacy.py +298 -0
  21. promnesia/sources/demo.py +18 -2
  22. promnesia/sources/filetypes.py +8 -0
  23. promnesia/sources/github.py +2 -2
  24. promnesia/sources/hackernews.py +1 -2
  25. promnesia/sources/hypothesis.py +1 -1
  26. promnesia/sources/markdown.py +15 -15
  27. promnesia/sources/org.py +7 -3
  28. promnesia/sources/plaintext.py +3 -1
  29. promnesia/sources/reddit.py +2 -2
  30. promnesia/sources/rss.py +5 -1
  31. promnesia/sources/shellcmd.py +6 -2
  32. promnesia/sources/signal.py +29 -20
  33. promnesia/sources/smscalls.py +8 -1
  34. promnesia/sources/stackexchange.py +2 -2
  35. promnesia/sources/takeout.py +132 -12
  36. promnesia/sources/takeout_legacy.py +10 -2
  37. promnesia/sources/telegram.py +79 -123
  38. promnesia/sources/telegram_legacy.py +117 -0
  39. promnesia/sources/vcs.py +1 -1
  40. promnesia/sources/viber.py +6 -15
  41. promnesia/sources/website.py +1 -1
  42. promnesia/sqlite.py +42 -0
  43. promnesia/tests/__init__.py +0 -0
  44. promnesia/tests/common.py +137 -0
  45. promnesia/tests/server_helper.py +64 -0
  46. promnesia/tests/sources/__init__.py +0 -0
  47. promnesia/tests/sources/test_auto.py +66 -0
  48. promnesia/tests/sources/test_filetypes.py +42 -0
  49. promnesia/tests/sources/test_hypothesis.py +39 -0
  50. promnesia/tests/sources/test_org.py +65 -0
  51. promnesia/tests/sources/test_plaintext.py +26 -0
  52. promnesia/tests/sources/test_shellcmd.py +22 -0
  53. promnesia/tests/sources/test_takeout.py +58 -0
  54. promnesia/tests/test_cannon.py +325 -0
  55. promnesia/tests/test_cli.py +42 -0
  56. promnesia/tests/test_compare.py +30 -0
  57. promnesia/tests/test_config.py +290 -0
  58. promnesia/tests/test_db_dump.py +223 -0
  59. promnesia/tests/test_extract.py +61 -0
  60. promnesia/tests/test_extract_urls.py +43 -0
  61. promnesia/tests/test_indexer.py +245 -0
  62. promnesia/tests/test_server.py +292 -0
  63. promnesia/tests/test_traverse.py +41 -0
  64. promnesia/tests/utils.py +35 -0
  65. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/METADATA +14 -19
  66. promnesia-1.2.20240810.dist-info/RECORD +83 -0
  67. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/WHEEL +1 -1
  68. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/entry_points.txt +0 -1
  69. promnesia/dump.py +0 -105
  70. promnesia-1.1.20230129.dist-info/RECORD +0 -55
  71. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/LICENSE +0 -0
  72. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/top_level.txt +0 -0
promnesia/sources/signal.py

@@ -1,12 +1,13 @@
 """
-Harvest visits from Signal Desktop's chiphered SQLIite db(s).
+Collects visits from Signal Desktop's encrypted SQLIite db(s).
+"""

-Functions get their defaults from module-data.
+# Functions get their defaults from module-data.
+#
+# * Open-ciphered-db adapted from:
+#   https://github.com/carderne/signal-export/commit/2284c8f4
+# * Copyright (c) 2019 Chris Arderne, 2020 Kostis Anagnostopoulos

-* Open-ciphered-db adapted from:
-  https://github.com/carderne/signal-export/commit/2284c8f4
-* Copyright (c) 2019 Chris Arderne, 2020 Kostis Anagnostopoulos
-"""

 import json
 import logging
@@ -62,6 +63,8 @@ def index(
     logger.debug("Paths to harvest: %s", db_paths)
     if not http_only:
         sql_query = f"{messages_query}\nWHERE body LIKE '%http%'"
+    else:
+        sql_query = messages_query

     for db_path in resolved_db_paths:
         logger.info("Ciphered db to harvest %s", db_path)
@@ -105,12 +108,18 @@ messages_query = dedent(
         SELECT
             id,
             type,
-            coalesce(name, profileName, profileFamilyName, e164) as aname,
+            coalesce(
+                profileFullName,
+                profileName,
+                name,
+                profileFamilyName,
+                e164
+            ) as aname,
             name,
             profileName,
             profileFamilyName,
             e164,
-            uuid
+            serviceId
         FROM conversations
     ),
     Msgs AS (
@@ -122,8 +131,8 @@ messages_query = dedent(
                 M.received_at,
                 M.sent_at
             ) AS timestamp,
-            IIF(M.type = "outgoing",
-                "Me (" || C2.aname || ")",
+            IIF(M.type = 'outgoing',
+                'Me (' || C2.aname || ')',
                 C2.aname
             ) AS sender,
             M.conversationId AS cid,
@@ -137,7 +146,7 @@ messages_query = dedent(
         INNER JOIN Cons AS C1
             ON M.conversationId = C1.id
         INNER JOIN Cons AS C2
-            ON M.sourceUuid = C2.uuid
+            ON M.sourceServiceId = C2.serviceId
     )
     SELECT id, timestamp, sender, cid, chatname, body
     FROM Msgs
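The quote change in `IIF(...)` is a correctness fix, not cosmetics: standard SQL reserves double quotes for identifiers, and SQLite accepts double-quoted string literals only as a legacy misfeature (it first tries to resolve them as column names, and the fallback can be compiled out). A quick demonstration with Python's bundled sqlite3:

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE t (type TEXT)")
conn.execute("INSERT INTO t VALUES ('outgoing')")

# single quotes: always a string literal
print(conn.execute("SELECT count(*) FROM t WHERE type = 'outgoing'").fetchone())  # (1,)

# double quotes: treated as an identifier first; only the legacy fallback
# (enabled in default builds) makes this behave like a string, and it
# silently masks typos in real column names
print(conn.execute('SELECT count(*) FROM t WHERE type = "outgoing"').fetchone())  # (1,)
```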
@@ -187,8 +196,8 @@ def _expand_path(path_pattern: PathIsh) -> Iterable[Path]:

 def _expand_paths(paths: PathIshes) -> Iterable[Path]:
     if _is_pathish(paths):
-        paths = [paths]  # type: ignore[assignment,list-item]
-    return [pp.resolve() for p in paths for pp in _expand_path(p)]  # type: ignore[union-attr,list-item]
+        paths = [paths]  # type: ignore[list-item]
+    return [pp.resolve() for p in paths for pp in _expand_path(p)]  # type: ignore[union-attr]


 def collect_db_paths(*db_paths: PathIsh, append: bool = False) -> Iterable[Path]:
@@ -235,7 +244,7 @@ def collect_db_paths(*db_paths: PathIsh, append: bool = False) -> Iterable[Path]
     )

     if db_paths and append:
-        db_paths = [  # type: ignore[misc,assignment]
+        db_paths = [  # type: ignore[assignment]
             *([db_paths] if _is_pathish(db_paths) else db_paths),
             plat_paths,
         ]
@@ -309,8 +318,8 @@ def connect_db(
         sql_cmds.extend(
             [
                 f"ATTACH DATABASE '{decrypted_file}' AS plaintext KEY '';",
-                f"SELECT sqlcipher_export('plaintext');",
-                f"DETACH DATABASE plaintext;",
+                "SELECT sqlcipher_export('plaintext');",
+                "DETACH DATABASE plaintext;",
             ]
         )
         sql = "\n".join(sql_cmds)
@@ -319,7 +328,7 @@ def connect_db(
             "Decrypting db '%s' with cmd: %s <<<EOF\n%s\nEOF", db_path, cmd, sql
         )
         try:
-            sbp.run(  # type: ignore[call-overload]
+            sbp.run(
                 cmd,
                 check=True,
                 input=sql,
@@ -334,7 +343,7 @@ def connect_db(
             ) from None
         db = sqlite3.connect(f"file:{decrypted_file}?mode=ro", uri=True)
     else:
-        from sqlcipher3 import dbapi2  # type: ignore[import]
+        from sqlcipher3 import dbapi2  # type: ignore[import-not-found]

         db = dbapi2.connect(f"file:{db_path}?mode=ro", uri=True)
     # Param-binding doesn't work for pragmas, so use a direct string concat.
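Context for these hunks: the CLI decryption path follows the standard SQLCipher export recipe — open the ciphered database, ATTACH a fresh database with an empty key (making it plaintext), copy everything over with sqlcipher_export(), then detach. A minimal sketch of the same recipe driven through the `sqlcipher` shell via subprocess; the paths and the raw-hex-key pragma form are illustrative, not promnesia's exact invocation:

```python
import subprocess

def export_plaintext(ciphered_db: str, decrypted_file: str, hex_key: str) -> None:
    # standard SQLCipher export recipe: an attached db with an empty key
    # is plaintext; sqlcipher_export() copies the full schema and data
    sql = "\n".join([
        f"PRAGMA key = \"x'{hex_key}'\";",
        f"ATTACH DATABASE '{decrypted_file}' AS plaintext KEY '';",
        "SELECT sqlcipher_export('plaintext');",
        "DETACH DATABASE plaintext;",
    ])
    subprocess.run(["sqlcipher", ciphered_db], check=True, input=sql, text=True)
```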
@@ -418,9 +427,9 @@ def _harvest_db(

     with connect_db(db_path, key, decrypt_db=decrypt_db, **decryption_pragmas) as db:
         for mid, tstamp, sender, cid, chatname, text in db.execute(messages_query):
+            tstamp = from_epoch(tstamp / 1000.0)
+            row = (mid, tstamp, sender, cid, chatname, text)
             try:
-                tstamp = from_epoch(tstamp / 1000.0)
-                row = (mid, tstamp, sender, cid, chatname, text)
                 yield from _handle_row(row, db_path, locator_schema)
             except Exception as ex:
                 # TODO: also insert errors in db
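Hoisting `tstamp` and `row` above the `try` guarantees both names are bound by the time the `except` handler runs, so the error report can safely mention the row that failed. The surrounding loop is promnesia's usual error-as-value pattern: yield the exception instead of raising, so one bad row doesn't abort the whole index. A self-contained sketch of that pattern (toy data, not promnesia code):

```python
from typing import Iterator, Union

Result = Union[int, Exception]

def harvest(raw_rows: list) -> Iterator[Result]:
    for raw in raw_rows:
        # bind everything the error path needs *before* the try block,
        # so the except handler can always reference it
        row = str(raw).strip()
        try:
            yield int(row)
        except Exception as ex:
            # yield, don't raise: one bad row shouldn't kill the harvest
            yield RuntimeError(f"while handling {row!r}: {ex}")

print(list(harvest(["1", "oops", "3"])))  # [1, RuntimeError(...), 3]
```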
promnesia/sources/smscalls.py

@@ -11,11 +11,18 @@ def index() -> Results:

     for m in messages():

+        if isinstance(m, Exception):
+            yield m
+            continue
+
         urls = extract_urls(m.message)
         if len(urls) == 0:
             continue

-        loc = Loc(title=f"SMS with {m.who} ({m.phone_number})")
+        if m.who is None:
+            loc = Loc(title=f"SMS with {m.phone_number}")
+        else:
+            loc = Loc(title=f"SMS with {m.who} ({m.phone_number})")

         for u in urls:
             yield Visit(
promnesia/sources/stackexchange.py

@@ -2,12 +2,12 @@
 Uses [[https://github.com/karlicoss/HPI][HPI]] for Stackexchange data.
 '''

-from ..common import Results, Visit, Loc, extract_urls
+from ..common import Results, Visit, Loc


 def index() -> Results:
     from . import hpi
-    import my.stackexchange.gdpr as G  # type: ignore[import]  # TODO eh, not sure if should run against pypi or not...
+    import my.stackexchange.gdpr as G
     for v in G.votes():
         if isinstance(v, Exception):
             yield v
promnesia/sources/takeout.py

@@ -1,19 +1,26 @@
 '''
 Uses HPI [[https://github.com/karlicoss/HPI/blob/master/doc/MODULES.org#mygoogletakeoutpaths][google.takeout]] module
 '''
-from typing import Iterable, Set, Type
+from typing import Iterable, Set, Any, NamedTuple
 import warnings

 from ..common import Visit, Loc, Results, logger
 from ..compat import removeprefix


+# incase user is using an old version of google_takeout_parser
+class YoutubeCSVStub(NamedTuple):
+    contentJSON: str
+
+
 def index() -> Results:
     from . import hpi
+    import json

     try:
         from my.google.takeout.parser import events
-        from google_takeout_parser.models import Activity, YoutubeComment, LikedYoutubeVideo, ChromeHistory, PlayStoreAppInstall, Location
+        from google_takeout_parser.models import Activity, YoutubeComment, LikedYoutubeVideo, ChromeHistory
+        from google_takeout_parser.parse_csv import reconstruct_comment_content, extract_comment_links
     except ModuleNotFoundError as ex:
         logger.exception(ex)
         yield ex
@@ -24,17 +31,30 @@ def index() -> Results:
         yield from takeout_legacy.index()
         return

-    _seen: Set[Type] = {
+
+    _seen: Set[str] = {
         # these are definitely not useful for promnesia
-        Location,
-        PlayStoreAppInstall,
+        'Location',
+        'PlaceVisit',
+        'PlayStoreAppInstall',
     }
-    def warn_once_if_not_seen(e) -> Iterable[Exception]:
-        et = type(e)
-        if et in _seen:
+
+    imported_yt_csv_models = False
+    try:
+        from google_takeout_parser.models import CSVYoutubeComment, CSVYoutubeLiveChat
+        imported_yt_csv_models = True
+    except ImportError:
+        # warn user to upgrade google_takeout_parser
+        warnings.warn("Please upgrade google_takeout_parser (`pip install -U google_takeout_parser`) to support the new format for youtube comments")
+        CSVYoutubeComment = YoutubeCSVStub  # type: ignore[misc,assignment]
+        CSVYoutubeLiveChat = YoutubeCSVStub  # type: ignore[misc,assignment]
+
+    def warn_once_if_not_seen(e: Any) -> Iterable[Exception]:
+        et_name = type(e).__name__
+        if et_name in _seen:
             return
-        _seen.add(et)
-        yield RuntimeError(f"Unhandled event {et}: {e}")
+        _seen.add(et_name)
+        yield RuntimeError(f"Unhandled event {repr(type(e))}: {e}")

     for e in events():
         if isinstance(e, Exception):
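The stub shuffle above is a compatibility idiom worth naming: try to import the real model classes, and on ImportError substitute a NamedTuple with the same field so that later annotations and isinstance dispatch still resolve, while a boolean flag keeps the stub branches from ever matching. A condensed sketch of the idiom (`some_parser` is a hypothetical module):

```python
from typing import NamedTuple
import warnings

# stand-in with the same field, only there so the names below resolve
class CSVStub(NamedTuple):
    contentJSON: str

have_csv_models = False
try:
    from some_parser.models import CSVComment  # hypothetical upstream module
    have_csv_models = True
except ImportError:
    warnings.warn("please upgrade some_parser for the new CSV format")
    CSVComment = CSVStub  # type: ignore[misc,assignment]

def handle(e: object) -> str:
    # the flag guard means the stub can never be matched by accident
    if have_csv_models and isinstance(e, CSVComment):
        return "new-style CSV comment"
    return "unhandled"
```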
@@ -48,12 +68,37 @@ def index() -> Results:
             # e.g. https://www.google.com/url?q=https://en.wikipedia.org/wiki/Clapham
             # note: also title usually starts with 'Visited ', in such case but perhaps fine to keep it
             url = removeprefix(url, "https://www.google.com/url?q=")
+            title = e.title

+            if e.header == 'Chrome':
+                # title contains 'Visited <page title>' in this case
+                context = None
+                title = removeprefix(title, 'Visited ')
+            elif e.header in _CLEAR_CONTEXT_FOR_HEADERS:
+                # todo perhaps could add to some sort of metadata?
+                # only useful for debugging really
+                context = None
+            elif e.header in url:
+                # stuff like News only has domain name in the header -- completely useless for promnesia
+                context = None
+            elif e.title == f'Used {e.header}':
+                # app usage tracking -- using app name as context is useless here
+                context = None
+            elif e.products == ['Android']:
+                # seems to be coming from in-app browser, header contains app name in this case
+                context = None
+            elif e.products == ['Ads']:
+                # header contains some weird internal ad id in this case
+                context = None
+            else:
+                context = None
+                # NOTE: at this point seems that context always ends up as None (at least for @karlicoss as of 20230131)
+                # so alternatively could just force it to be None instead of manual dispatching :shrug:
             yield Visit(
                 url=url,
                 dt=e.time,
-                context=e.header,
-                locator=Loc(title=e.title, href=url),
+                context=context,
+                locator=Loc(title=title, href=url),
             )
         for s in e.subtitles:
             surl = s[1]
@@ -73,6 +118,8 @@ def index() -> Results:
                 locator=Loc(title=e.title, href=e.url),
             )
         elif isinstance(e, LikedYoutubeVideo):
+            # TODO not sure if desc makes sense here since it's not user produced data
+            # it's just a part of video meta?
             yield Visit(
                 url=e.link, dt=e.dt, context=e.desc, locator=Loc(title=e.title, href=e.link)
             )
@@ -83,5 +130,78 @@ def index() -> Results:
             yield Visit(
                 url=url, dt=e.dt, context=e.content, locator=Loc(title=e.content, href=url)
             )
+        elif imported_yt_csv_models and isinstance(e, CSVYoutubeComment):
+            contentJSON = json.loads(e.contentJSON)
+            content = reconstruct_comment_content(contentJSON, format='text')
+            if isinstance(content, Exception):
+                yield content
+                continue
+            links = extract_comment_links(contentJSON)
+            if isinstance(links, Exception):
+                yield links
+                continue
+            context = f"Commented on {e.video_url}"
+            for url in links:
+                yield Visit(
+                    url=url, dt=e.dt, context=content, locator=Loc(title=context, href=url)
+                )
+            yield Visit(
+                url=e.video_url, dt=e.dt, context=content, locator=Loc(title=context, href=e.video_url)
+            )
+        elif imported_yt_csv_models and isinstance(e, CSVYoutubeLiveChat):
+            contentJSON = json.loads(e.contentJSON)
+            content = reconstruct_comment_content(contentJSON, format='text')
+            if isinstance(content, Exception):
+                yield content
+                continue
+            links = extract_comment_links(contentJSON)
+            if isinstance(links, Exception):
+                yield links
+                continue
+            context = f"Commented on livestream {e.video_url}"
+            for url in links:
+                yield Visit(
+                    url=url, dt=e.dt, context=content, locator=Loc(title=context, href=url)
+                )
+            yield Visit(
+                url=e.video_url, dt=e.dt, context=content, locator=Loc(title=context, href=e.video_url)
+            )
         else:
             yield from warn_once_if_not_seen(e)
+
+
+_CLEAR_CONTEXT_FOR_HEADERS = {
+    'Google Cloud',
+    'Travel',
+    'Google Arts & Culture',
+    'Drive',
+    'Calendar',
+    'Google Store',
+    'Shopping',
+    'News',
+    'Help',
+    'Books',
+    'Google My Business',
+    'Google Play Movies & TV',
+    'Developers',
+    'YouTube',
+    'Gmail',
+    'Video Search',
+    'Google Apps',
+    'Google Translate',
+    'Ads',
+    'Image Search',
+    'Assistant',
+    'Google Play Store',
+    'Android',
+    'Maps',
+    'Search',
+    'Google App',
+    'in_app_display_context_client',
+    'Play Music',
+    'Maps - Navigate & Explore',
+    'Google Maps',
+    'google.com',
+    'Google Play Books',
+    'Maps - Navigation & Transit',
+}
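The two CSV branches are identical except for the context label, which suggests a small helper; a hedged refactoring sketch (`_handle_yt_csv` is hypothetical, not in this release, and reuses only the calls visible above):

```python
import json

from google_takeout_parser.parse_csv import extract_comment_links, reconstruct_comment_content

from promnesia.common import Loc, Results, Visit


def _handle_yt_csv(e, *, label: str) -> Results:
    # shared body of the CSVYoutubeComment / CSVYoutubeLiveChat branches
    contentJSON = json.loads(e.contentJSON)
    content = reconstruct_comment_content(contentJSON, format='text')
    if isinstance(content, Exception):
        yield content
        return
    links = extract_comment_links(contentJSON)
    if isinstance(links, Exception):
        yield links
        return
    context = f"{label} {e.video_url}"
    for url in links:
        yield Visit(url=url, dt=e.dt, context=content, locator=Loc(title=context, href=url))
    yield Visit(url=e.video_url, dt=e.dt, context=content, locator=Loc(title=context, href=e.video_url))
```

With that, the two elif bodies would collapse to `yield from _handle_yt_csv(e, label='Commented on')` and `yield from _handle_yt_csv(e, label='Commented on livestream')`.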
promnesia/sources/takeout_legacy.py

@@ -34,7 +34,15 @@ from .. import config


 from more_itertools import unique_everseen
-from cachew import cachew
+
+try:
+    from cachew import cachew
+except ModuleNotFoundError as me:
+    if me.name != 'cachew':
+        raise me
+    # this module is legacy anyway, so just make it defensive
+    def cachew(*args, **kwargs):  # type: ignore[no-redef]
+        return lambda f: f


 # TODO use CPath? Could encapsulate a path within an archive *or* within a directory
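The fallback preserves cachew's decorator-factory interface: when the module is missing, `cachew(...)` returns a decorator that hands the function back unchanged, so existing `@cachew(...)` call sites keep working. A runnable sketch (the cache path is illustrative):

```python
from typing import Iterator

try:
    from cachew import cachew
except ModuleNotFoundError as me:
    if me.name != 'cachew':
        raise me
    # no-op stand-in with the same decorator-factory call shape
    def cachew(*args, **kwargs):  # type: ignore[no-redef]
        return lambda f: f


@cachew(cache_path='/tmp/example-cache')  # cached iff cachew is installed
def squares(n: int) -> Iterator[int]:
    yield from (i * i for i in range(n))


print(list(squares(5)))  # [0, 1, 4, 9, 16]
```

Note that the no-op stub only covers the factory form `@cachew(...)`; a bare `@cachew` decoration would need a different stub.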
@@ -105,7 +113,7 @@ def read_browser_history_json(takeout: TakeoutPath) -> Iterable[Visit]:
     hist = j['Browser History']
     for item in hist:
         url = item['url']
-        time = datetime.utcfromtimestamp(item['time_usec'] / 10 ** 6).replace(tzinfo=pytz.utc)
+        time = datetime.fromtimestamp(item['time_usec'] / 10 ** 6, tz=pytz.utc)
         # TODO any more interesitng info?
         yield Visit(
             url=url,
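The replaced call matters beyond style: `datetime.utcfromtimestamp()` returns a naive datetime (and is deprecated as of Python 3.12), while `datetime.fromtimestamp(..., tz=...)` produces the same instant as an aware object in one step. The two spellings agree:

```python
from datetime import datetime

import pytz  # already used by this module

usec = 1_700_000_000_000_000  # example time_usec value

old = datetime.utcfromtimestamp(usec / 10**6).replace(tzinfo=pytz.utc)  # deprecated since 3.12
new = datetime.fromtimestamp(usec / 10**6, tz=pytz.utc)

assert old == new
print(new.isoformat())  # 2023-11-14T22:13:20+00:00
```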
promnesia/sources/telegram.py

@@ -1,128 +1,84 @@
-'''
-Uses [[https://github.com/fabianonline/telegram_backup#readme][telegram_backup]] database for messages data
-'''
-
-from pathlib import Path
-from textwrap import dedent
-from typing import Optional, Union, TypeVar
+from typing import Optional
 from urllib.parse import unquote  # TODO mm, make it easier to rememember to use...
+import warnings

-from ..common import PathIsh, Visit, get_logger, Loc, extract_urls, from_epoch, Results, echain
-
-# TODO potentially, belongs to my. package
-
-T = TypeVar("T")
-
-
-def unwrap(res: Union[T, Exception]) -> T:
-    if isinstance(res, Exception):
-        raise res
-    else:
-        return res
-
-
-# TODO move to common?
-def dataset_readonly(db: Path):
-    import dataset  # type: ignore
-    # see https://github.com/pudo/dataset/issues/136#issuecomment-128693122
-    import sqlite3
-    creator = lambda: sqlite3.connect(f'file:{db}?immutable=1', uri=True)
-    return dataset.connect('sqlite:///' , engine_kwargs={'creator': creator})
-
-
-def index(database: PathIsh, *, http_only: bool=False) -> Results:
-    """
-    :param database:
-        the path of the sqlite generated by the _telegram_backup_ java program
-    :param http_only:
-        when true, do not collect IP-addresses and `python.py` strings
-    """
-    logger = get_logger()
-
-    path = Path(database)
-    assert path.is_file(), path  # TODO could check is_file inside `dataset_readonly()`
-
-    def make_query(text_query: str) -> str:
-        extra_criteria = "AND (M.has_media == 1 OR text LIKE '%http%')" if http_only else ""
-        return dedent(
-            f"""
-            WITH entities AS (
-                SELECT 'dialog' as type
-                    , id
-                    , coalesce(username, id) as handle
-                    , coalesce(first_name || " " || last_name
-                        , username
-                        , id
-                    ) as display_name FROM users
-                UNION
-                SELECT 'group' as type
-                    , id
-                    , id as handle
-                    , coalesce(name, id) as display_name FROM chats
-            )
-            SELECT src.display_name AS chatname
-                , src.handle AS chat
-                , snd.display_name AS sender
-                , M.time AS time
-                , {text_query} AS text
-                , M.id AS mid
-            FROM messages AS M
-            /* chat types are 'dialog' (1-1), 'group' and 'supergroup' */
-            /* this is abit hacky way to handle all groups in one go */
-            LEFT JOIN entities AS src ON M.source_id = src.id AND src.type = (CASE M.source_type WHEN 'supergroup' THEN 'group' ELSE M.source_type END)
-            LEFT JOIN entities AS snd ON M.sender_id = snd.id AND snd.type = 'dialog'
-            WHERE
-                M.message_type NOT IN ('service_message', 'empty_message')
-                {extra_criteria}
-            ORDER BY time;
-            """)
-
-    # TODO context manager?
-    with dataset_readonly(path) as db:
-
-        # TODO yield error if chatname or chat or smth else is null?
-        for row in db.query(make_query('M.text')):
-            try:
-                yield from _handle_row(row)
-            except Exception as ex:
-                yield echain(RuntimeError(f'While handling {row}'), ex)
-                # , None, sys.exc_info()[2]
-                # TODO hmm. traceback isn't preserved; wonder if that's because it's too heavy to attach to every single exception object..
-
-        # old (also 'stable') version doesn't have 'json' column yet...
-        if 'json' in db['messages'].columns:
-            for row in db.query(make_query("json_extract(json, '$.media.webpage.description')")):
-                try:
-                    yield from _handle_row(row)
-                except Exception as ex:
-                    yield echain(RuntimeError(f'While handling {row}'), ex)
-
-
-def _handle_row(row) -> Results:
-    text = row['text']
-    if text is None:
+from promnesia.common import Results, logger, extract_urls, Visit, Loc, PathIsh
+
+
+def index(database: Optional[PathIsh]=None, *, http_only: bool=False, with_extra_media_info: bool=False) -> Results:
+    if database is None:
+        # fully relying on HPI
+        yield from _index_new(http_only=http_only, with_extra_media_info=with_extra_media_info)
         return
-    urls = extract_urls(text)
-    if len(urls) == 0:
+
+    warnings.warn(
+        f'Passing paths to promnesia.sources.telegram is deprecated, you should setup my.telegram.telegram_backup instead. '
+        f'Will try to hack database path {database} into HPI config.'
+    )
+    try:
+        yield from _index_new_with_adhoc_config(database=database, http_only=http_only, with_extra_media_info=with_extra_media_info)
         return
-    dt = from_epoch(row['time'])
-    mid: str = unwrap(row['mid'])
-
-    # TODO perhaps we could be defensive with null sender/chat etc and still emit the Visit
-    sender: str = unwrap(row['sender'])
-    chatname: str = unwrap(row['chatname'])
-    chat: str = unwrap(row['chat'])
-
-    in_context = f'https://t.me/{chat}/{mid}'
-    for u in urls:
-        # https://www.reddit.com/r/Telegram/comments/6ufwi3/link_to_a_specific_message_in_a_channel_possible/
-        # hmm, only seems to work on mobile app, but better than nothing...
-        yield Visit(
-            url=unquote(u),
-            dt=dt,
-            context=f"{sender}: {text}",
-            locator=Loc.make(
-                title=f"chat with {chatname}",
-                href=in_context,
-            ),
+    except Exception as e:
+        logger.exception(e)
+        warnings.warn("Hacking my.config.telegram.telegram_backup didn't work. You probably need to update HPI.")
+
+    logger.warning("Falling back onto promnesia.sources.telegram_legacy module")
+    yield from _index_legacy(database=database, http_only=http_only)
+
+
+def _index_legacy(*, database: PathIsh, http_only: bool) -> Results:
+    from . import telegram_legacy
+    yield from telegram_legacy.index(database=database, http_only=http_only)
+
+
+def _index_new_with_adhoc_config(*, database: PathIsh, http_only: bool, with_extra_media_info: bool) -> Results:
+    from . import hpi
+
+    class config:
+        class telegram:
+            class telegram_backup:
+                export_path: PathIsh = database
+
+    from my.core.cfg import tmp_config
+    with tmp_config(modules='my.telegram.telegram_backup', config=config):
+        yield from _index_new(http_only=http_only, with_extra_media_info=with_extra_media_info)
+
+
+def _index_new(*, http_only: bool, with_extra_media_info: bool) -> Results:
+    from . import hpi
+    from my.telegram.telegram_backup import messages
+
+    extra_where = "(has_media == 1 OR text LIKE '%http%')" if http_only else None
+    for i, m in enumerate(messages(
+        with_extra_media_info=with_extra_media_info,
+        extra_where=extra_where,
+    )):
+        text = m.text
+
+        urls = extract_urls(text)
+        extra_media_info = m.extra_media_info
+        if extra_media_info is not None:
+            urls.extend(extract_urls(extra_media_info))
+
+        if len(urls) == 0:
+            continue
+
+        dt = m.time
+        sender = m.sender.name
+        chat = m.chat
+
+        cname = chat.name if chat.name is not None else str(chat.id)
+
+        locator = Loc.make(
+            title=f"chat with {cname}",
+            href=m.permalink,
         )
+        context = f'{sender}: {text}'
+
+        for u in urls:
+            yield Visit(
+                url=unquote(u),
+                dt=dt,
+                context=context,
+                locator=locator,
+            )
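The rewrite turns index() into a three-tier dispatch: with no argument it trusts the user's HPI my.telegram.telegram_backup config; with a path it first tries to graft that path into HPI via my.core.cfg.tmp_config (the nested config class mirrors the my.config.telegram.telegram_backup attribute path); and only if that fails does it fall back to the bundled telegram_legacy module. A sketch of the two supported call shapes from a promnesia config (the database path is illustrative):

```python
# promnesia config sketch using the new entry points
from promnesia.common import Source
from promnesia.sources import telegram

SOURCES = [
    # preferred: no arguments; my.telegram.telegram_backup is configured in HPI
    Source(telegram.index),

    # deprecated but still accepted: pass the telegram_backup sqlite directly,
    # which goes through the ad-hoc tmp_config shim, then telegram_legacy
    Source(telegram.index, database='/data/telegram/database.sqlite'),
]
```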