promnesia 1.1.20230129__py3-none-any.whl → 1.2.20240810__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (72)
  1. promnesia/__main__.py +58 -50
  2. promnesia/cannon.py +4 -4
  3. promnesia/common.py +57 -38
  4. promnesia/compare.py +3 -2
  5. promnesia/compat.py +6 -65
  6. promnesia/config.py +4 -2
  7. promnesia/database/common.py +66 -0
  8. promnesia/database/dump.py +187 -0
  9. promnesia/{read_db.py → database/load.py} +10 -11
  10. promnesia/extract.py +1 -0
  11. promnesia/kjson.py +1 -1
  12. promnesia/logging.py +14 -14
  13. promnesia/misc/__init__.pyi +0 -0
  14. promnesia/misc/config_example.py +1 -2
  15. promnesia/misc/install_server.py +5 -4
  16. promnesia/server.py +24 -24
  17. promnesia/sources/__init__.pyi +0 -0
  18. promnesia/sources/auto.py +12 -7
  19. promnesia/sources/browser.py +80 -293
  20. promnesia/sources/browser_legacy.py +298 -0
  21. promnesia/sources/demo.py +18 -2
  22. promnesia/sources/filetypes.py +8 -0
  23. promnesia/sources/github.py +2 -2
  24. promnesia/sources/hackernews.py +1 -2
  25. promnesia/sources/hypothesis.py +1 -1
  26. promnesia/sources/markdown.py +15 -15
  27. promnesia/sources/org.py +7 -3
  28. promnesia/sources/plaintext.py +3 -1
  29. promnesia/sources/reddit.py +2 -2
  30. promnesia/sources/rss.py +5 -1
  31. promnesia/sources/shellcmd.py +6 -2
  32. promnesia/sources/signal.py +29 -20
  33. promnesia/sources/smscalls.py +8 -1
  34. promnesia/sources/stackexchange.py +2 -2
  35. promnesia/sources/takeout.py +132 -12
  36. promnesia/sources/takeout_legacy.py +10 -2
  37. promnesia/sources/telegram.py +79 -123
  38. promnesia/sources/telegram_legacy.py +117 -0
  39. promnesia/sources/vcs.py +1 -1
  40. promnesia/sources/viber.py +6 -15
  41. promnesia/sources/website.py +1 -1
  42. promnesia/sqlite.py +42 -0
  43. promnesia/tests/__init__.py +0 -0
  44. promnesia/tests/common.py +137 -0
  45. promnesia/tests/server_helper.py +64 -0
  46. promnesia/tests/sources/__init__.py +0 -0
  47. promnesia/tests/sources/test_auto.py +66 -0
  48. promnesia/tests/sources/test_filetypes.py +42 -0
  49. promnesia/tests/sources/test_hypothesis.py +39 -0
  50. promnesia/tests/sources/test_org.py +65 -0
  51. promnesia/tests/sources/test_plaintext.py +26 -0
  52. promnesia/tests/sources/test_shellcmd.py +22 -0
  53. promnesia/tests/sources/test_takeout.py +58 -0
  54. promnesia/tests/test_cannon.py +325 -0
  55. promnesia/tests/test_cli.py +42 -0
  56. promnesia/tests/test_compare.py +30 -0
  57. promnesia/tests/test_config.py +290 -0
  58. promnesia/tests/test_db_dump.py +223 -0
  59. promnesia/tests/test_extract.py +61 -0
  60. promnesia/tests/test_extract_urls.py +43 -0
  61. promnesia/tests/test_indexer.py +245 -0
  62. promnesia/tests/test_server.py +292 -0
  63. promnesia/tests/test_traverse.py +41 -0
  64. promnesia/tests/utils.py +35 -0
  65. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/METADATA +14 -19
  66. promnesia-1.2.20240810.dist-info/RECORD +83 -0
  67. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/WHEEL +1 -1
  68. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/entry_points.txt +0 -1
  69. promnesia/dump.py +0 -105
  70. promnesia-1.1.20230129.dist-info/RECORD +0 -55
  71. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/LICENSE +0 -0
  72. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/top_level.txt +0 -0
promnesia/sources/auto.py CHANGED
@@ -1,6 +1,9 @@
 """
 - discovers files recursively
 - guesses the format (orgmode/markdown/json/etc) by the extension/MIME type
+- can index most of plaintext files, including source code!
+- autodetects Obsidian vault and adds `obsidian://` app protocol support [[file:../src/promnesia/sources/obsidian.py][promnesia.sources.obsidian]]
+- autodetects Logseq graph and adds `logseq://` app protocol support [[file:../src/promnesia/sources/logseq.py][promnesia.sources.logseq]]
 """
 
 import csv
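
The docstring additions above describe the new auto-detection features. For context, this source is typically wired into a promnesia config roughly like the sketch below (`~/notes` is a placeholder path):

    # minimal promnesia config sketch using the auto source
    from promnesia.common import Source
    from promnesia.sources import auto

    SOURCES = [
        # recursively discovers plaintext/org/markdown files;
        # Obsidian vaults and Logseq graphs are now autodetected
        Source(auto.index, '~/notes'),
    ]
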
@@ -19,17 +22,18 @@ import warnings
 import pytz
 
 from ..common import Visit, Url, PathIsh, get_logger, Loc, get_tmpdir, extract_urls, Extraction, Result, Results, mime, traverse, file_mtime, echain, logger
+from ..common import warn_once
 from ..config import use_cores
 
 
-from .filetypes import EUrl
+from .filetypes import EUrl, Ctx
 from .auto_obsidian import obsidian_replacer
 from .auto_logseq import logseq_replacer
 
 
 def _collect(thing, path: List[str], result: List[EUrl]) -> None:
     if isinstance(thing, str):
-        ctx: Ctx = tuple(path)  # type: ignore
+        ctx: Ctx = tuple(path)
         result.extend([EUrl(url=u, ctx=ctx) for u in extract_urls(thing)])
     elif isinstance(thing, list):
         path.append('[]')
@@ -164,7 +168,7 @@ for t in CODE:
 Replacer = Optional[Callable[[str, str], str]]
 
 def index(
-        *paths: Union[PathIsh],
+        *paths: PathIsh,
         ignored: Union[Sequence[str], str]=(),
         follow: bool=True,
         replacer: Replacer=None,
@@ -279,6 +283,8 @@ def by_path(pp: Path) -> Tuple[Optional[Ex], Optional[Mime]]:
 
 def _index_file(pp: Path, opts: Options) -> Results:
     logger = get_logger()
+    # TODO need to keep debug logs here...
+    # logger.info(f"indexing {pp}")
     # TODO use kompress?
     # TODO not even sure if it's used...
     suf = pp.suffix.lower()
@@ -304,10 +310,9 @@ def _index_file(pp: Path, opts: Options) -> Results:
 
     ip, pm = by_path(pp)
     if ip is None:
-        # TODO use warning (with mime/ext as key?)
-        # TODO only log once? # hmm..
+        # todo not really sure about using warnings vs yielding error here?
         msg = f'No extractor for suffix {suf}, mime {pm}'
-        warnings.warn(msg)
+        warn_once(msg)
         yield echain(ex, RuntimeError(msg))
         return
 
@@ -315,7 +320,7 @@ def _index_file(pp: Path, opts: Options) -> Results:
 
     def indexer() -> Union[Urls, Results]:
         # eh, annoying.. need to make more generic..
-        idx = ip(pp)  # type: ignore
+        idx = ip(pp)
         try:
             yield from idx
         except Exception as e:
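
The main behavioural change in this file: repeated "No extractor" warnings are now deduplicated via `warn_once` from `promnesia.common` instead of emitted anew by `warnings.warn` for every file. The implementation is not part of this diff; a minimal sketch of the idea, assuming deduplication is keyed on the message text:

    # hypothetical sketch of warn_once-style deduplication
    # (not the actual promnesia.common implementation)
    import functools
    import warnings

    @functools.lru_cache(maxsize=None)
    def warn_once(msg: str) -> None:
        # lru_cache makes repeated calls with the same message no-ops
        warnings.warn(msg)
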
promnesia/sources/browser.py CHANGED
@@ -1,302 +1,89 @@
-from datetime import datetime
-from pathlib import Path
-from urllib.parse import unquote
-import sqlite3
-from typing import List, Set
+'''
+Uses [[https://github.com/karlicoss/HPI][HPI]] for visits from web browsers.
+'''
 
-import pytz
+import re
+from typing import Optional, Iterator, Any, TYPE_CHECKING
+import warnings
 
-from ..common import PathIsh, Results, Visit, Loc, get_logger, Second, mime
-from .. import config
+from promnesia.common import Results, Visit, Loc, Second, PathIsh, logger, is_sqlite_db
 
-# todo mcachew?
-from cachew import cachew
 
-logger = get_logger()
+def index(p: Optional[PathIsh]=None) -> Results:
+    from . import hpi
 
-
-def index(p: PathIsh) -> Results:
-    pp = Path(p)
-    assert pp.exists(), pp  # just in case of broken symlinks
-
-    # is_file check because it also returns dirs
-    # TODO hmm, not sure what I meant here -- which dirs? behind symlinks?
-    is_db = lambda x: x.is_file() and mime(x) in {
-        'application/x-sqlite3',
-        'application/vnd.sqlite3',
-        # TODO this mime can also match wal files/journals, not sure
-    }
-
-    # todo warn if filtered out too many?
-    # todo wonder how quickly mimes can be computed?
-    # todo ugh, dunno, maybe this really belongs to hpi?? need get_files etc...
-    dbs = [p for p in sorted(pp.rglob('*')) if is_db(p)]
-
-    assert len(dbs) > 0, pp
-    logger.info('processing %d databases', len(dbs))
-    cname = str('_'.join(pp.parts[1:]))  # meh
-    yield from _index_dbs(dbs, cachew_name=cname)
-
-
-
-def _index_dbs(dbs: List[Path], cachew_name: str):
-    # TODO right... not ideal, need to think how to handle it properly...
-    import sys
-    sys.setrecursionlimit(5000)
-
-    cache_dir = config.get().cache_dir
-    cpath = None if cache_dir is None else cache_dir / cachew_name
-    emitted: Set = set()
-    yield from _index_dbs_aux(cpath, dbs, emitted=emitted)
-
-
-# todo wow, stack traces are ridiculous here...
-# todo hmm, feels like it should be a class or something?
-@cachew(lambda cp, dbs, emitted: cp, depends_on=lambda cp, dbs, emitted: dbs)  # , logger=logger)
-def _index_dbs_aux(cache_path: Path, dbs: List[Path], emitted: Set) -> Results:
-    if len(dbs) == 0:
+    if p is None:
+        from my.browser.all import history
+        yield from _index_new(history())
         return
 
-    xs = dbs[:-1]
-    x = dbs[-1:]
-
-    xs_res = _index_dbs_aux(cache_path, xs, emitted)
-    xs_was_cached = False
-    for r in xs_res:
-        # if it was cached, emitted would be empty
-        if len(emitted) == 0:
-            xs_was_cached = True
-            logger.debug('seems that %d first items were previously cached', len(xs))
-        if xs_was_cached:
-            key = (r.url, r.dt)
-            assert key not in emitted, key  # todo not sure if this assert is necessary?
-            # hmm ok it might happen if we messed up with indexing individual db?
-            # alternatively, could abuse it to avoid messing with 'emitted' in _index_db?
-            emitted.add(key)
-        yield r  # todo not sure about exceptions?
-
-    for db in x:
-        yield from _index_db(db, emitted=emitted)
-
-
-def _index_db(db: Path, emitted: Set):
-    logger.info('processing %s', db)  # debug level?
-
-    # todo schema check (not so critical for cachew though)
-    total = 0
-    new = 0
-    loc = Loc.file(db)  # todo possibly needs to be optimized -- moving from within the loop considerably speeds everything up
-    with sqlite3.connect(f'file:{db}?immutable=1', uri=True) as c:
-        browser = None
-        for b in [Chrome, Firefox, FirefoxPhone, Safari]:
-            try:
-                c.execute(f'SELECT * FROM {b.detector}')
-            except sqlite3.OperationalError:  # not sure if the right kind?
-                pass
-            else:
-                browser = b
-                break
-        assert browser is not None
-
-        proj = ', '.join(c for c, _ in browser.schema.cols)
-        query = browser.query.replace('chunk.', '')
-
-        c.row_factory = sqlite3.Row
-        for r in c.execute(f'select {proj} {query}'):
-            v = browser.row2visit(r, loc)
-            total += 1
-
-            key = (v.url, v.dt)
-            # todo how to keep keys compatible?
-            if key in emitted:
-                continue
-            yield v
-            emitted.add(key)
-            new += 1
-
-    # eh, ok, almost 2x faster if I don't construct Visit first
-    # maybe it's Loc.file that's too slow?
-    # yeah, seems like it, 4.1 s after computing it only once
-
-    logger.info('%s: %d/%d new visits', db, new, total)
-
-
-Col = str
-ColType = str
-
-
-from typing import Any, NamedTuple, Tuple, Union, Sequence, Optional
-
-class Schema(NamedTuple):
-    cols: Sequence[Tuple[Col, ColType]]
-    key: Sequence[str]
-
-
-SchemaCheck = Tuple[str, Union[str, Sequence[str]]]  # todo Union: meh
-
-from dataclasses import dataclass
-
-# todo protocol?
-@dataclass
-class Extr:
-    detector: str
-    schema_check: SchemaCheck
-    schema: Schema
-    query: str
-
-    # todo calllable?
-    @staticmethod
-    def row2visit(row: sqlite3.Row, loc: Loc) -> Visit:
-        raise NotImplementedError
-
-
-class Chrome(Extr):
-    detector='keyword_search_terms'
-    schema_check=(
-        'visits', [
-            'visits', "id, url, visit_time, from_visit, transition, segment_id, visit_duration, incremented_omnibox_typed_score",
-            'visits', "id, url, visit_time, from_visit, transition, segment_id, visit_duration"
-        ]
-    )
-    schema=Schema(cols=[
-        ('U.url' , 'TEXT' ),
-
-        # while these two are not very useful, might be good to have just in case for some debugging
-        ('U.id AS urlid' , 'INTEGER'),
-        ('V.id AS vid' , 'INTEGER'),
-
-        ('V.visit_time' , 'INTEGER NOT NULL'),
-        ('V.from_visit' , 'INTEGER' ),
-        ('V.transition' , 'INTEGER NOT NULL'),
-        # V.segment_id looks useless
-        ('V.visit_duration' , 'INTEGER NOT NULL'),
-        # V.omnibox thing looks useless
-    ], key=('url', 'visit_time', 'vid', 'urlid'))
-    query='FROM chunk.visits as V, chunk.urls as U WHERE V.url = U.id'
-
-    @staticmethod
-    def row2visit(row: sqlite3.Row, loc: Loc) -> Visit:
-        url = row['url']
-        ts = row['visit_time']
-        durs = row['visit_duration']
-
-        dt = chrome_time_to_utc(int(ts))
-        url = unquote(url)  # chrome urls are all quoted
-        dd = int(durs)
-        dur: Optional[Second] = None if dd == 0 else dd // 1_000_000
-        return Visit(
-            url=url,
-            dt=dt,
-            locator=loc,
-            duration=dur,
-        )
-
-
-# should be utc? https://stackoverflow.com/a/26226771/706389
-# yep, tested it and looks like utc
-def chrome_time_to_utc(chrome_time: int) -> datetime:
-    epoch = (chrome_time / 1_000_000) - 11644473600
-    return datetime.fromtimestamp(epoch, pytz.utc)
-
-
-def _row2visit_firefox(row: sqlite3.Row, loc: Loc) -> Visit:
-    url = row['url']
-    ts = float(row['visit_date'])
-    # ok, looks like it's unix epoch
-    # https://stackoverflow.com/a/19430099/706389
-
-    # NOTE: ugh. on Fenix (experimental Android version) it uses milliseconds, not nanos...
-    # about year 2001... if someone has browser history exports before that -- please let me know, I'm impressed
-    threshold = 1000000000
-    if ts > threshold * 1_000_000:
-        # presumably it's in microseconds
-        ts /= 1_000_000
-    else:
-        # milliseconds
-        ts /= 1_000
-    dt = datetime.fromtimestamp(ts, pytz.utc)
-    url = unquote(url)  # firefox urls are all quoted
-    return Visit(
-        url=url,
-        dt=dt,
-        locator=loc,
-    )
-
-# https://web.archive.org/web/20201026130310/http://fileformats.archiveteam.org/wiki/History.db
-class Safari(Extr):
-    detector='history_tombstones'
-    schema_check=(
-        'history_visits', [
-            'history_visits', "id, history_item, visit_time",
-            'history_items', "id, url"
-        ]
+    warnings.warn(
+        f'Passing paths to promnesia.sources.browser is deprecated, you should setup my.browser.export instead. '
+        f'See https://github.com/seanbreckenridge/browserexport#hpi .'
+        f'Will try to hack path to browser databases {p} into HPI config.'
     )
-    schema=Schema(cols=[
-        ('U.url' , 'TEXT' ),
-
-        # while these two are not very useful, might be good to have just in case for some debugging
-        ('U.id AS urlid' , 'INTEGER'),
-        ('V.id AS vid' , 'INTEGER'),
-
-        ('V.visit_time' , 'INTEGER NOT NULL'),
-        # ('V.from_visit' , 'INTEGER' ),
-        # ('V.transition' , 'INTEGER NOT NULL'),
-        # V.segment_id looks useless
-        # ('V.visit_duration' , 'INTEGER NOT NULL'),
-        # V.omnibox thing looks useless
-    ], key=('url', 'visit_time', 'vid', 'urlid'))
-    query='FROM chunk.history_visits as V, chunk.history_items as U WHERE V.history_item = U.id'
-
-    @staticmethod
-    def row2visit(row: sqlite3.Row, loc: Loc) -> Visit:
-        url = row['url']
-        ts = row['visit_time'] + 978307200  # https://stackoverflow.com/a/34546556/16645
-        dt = datetime.fromtimestamp(ts, pytz.utc)
-
-        return Visit(
-            url=url,
-            dt=dt,
-            locator=loc,
+    try:
+        yield from _index_new_with_adhoc_config(path=p)
+        return
+    except Exception as e:
+        logger.exception(e)
+        warnings.warn("Hacking my.config.browser.export didn't work. You probably need to update HPI.")
+
+    logger.warning("Falling back onto legacy promnesia.sources.browser_legacy module")
+    yield from _index_old(path=p)
+
+
+def _index_old(*, path: PathIsh) -> Results:
+    from . import browser_legacy
+    yield from browser_legacy.index(path)
+
+
+def _index_new_with_adhoc_config(*, path: PathIsh) -> Results:
+    from . import hpi
+
+    ## previously, it was possible to index be called with multiple different db search paths
+    ## this would result in each subsequent call to my.browser.export.history to invalidate cache every time
+    ## so we hack cachew path so it's different for each call
+    from my.core.core_config import config as hpi_core_config
+    hpi_cache_dir = hpi_core_config.get_cache_dir()
+    sanitized_path = re.sub(r'\W', '_', str(path))
+    cache_override = None if hpi_cache_dir is None else hpi_cache_dir / sanitized_path
+    ##
+
+    from my.core.common import classproperty, Paths, get_files
+    class config:
+        class core:
+            cache_dir = cache_override
+
+        class browser:
+            class export:
+                @classproperty
+                def export_path(cls) -> Paths:
+                    return tuple([f for f in get_files(path, glob='**/*') if is_sqlite_db(f)])
+
+    from my.core.cfg import tmp_config
+    with tmp_config(modules='my.browser.export|my.core.core_config', config=config):
+        from my.browser.export import history
+        yield from _index_new(history())
+
+
+if TYPE_CHECKING:
+    from browserexport.merge import Visit as BrowserMergeVisit
+else:
+    BrowserMergeVisit = Any
+
+
+def _index_new(history: Iterator[BrowserMergeVisit]) -> Results:
+    for v in history:
+        desc: Optional[str] = None
+        duration: Optional[Second] = None
+        metadata = v.metadata
+        if metadata is not None:
+            desc = metadata.title
+            duration = metadata.duration
+        yield Visit(
+            url=v.url,
+            dt=v.dt,
+            locator=Loc(title=desc or v.url, href=v.url),
+            duration=duration,
         )
-
-# https://web.archive.org/web/20190730231715/https://www.forensicswiki.org/wiki/Mozilla_Firefox_3_History_File_Format#moz_historyvisits
-class Firefox(Extr):
-    detector='moz_meta'
-    schema_check=('moz_historyvisits', "id, from_visit, place_id, visit_date, visit_type")
-    schema=Schema(cols=[
-        ('P.url' , 'TEXT'),
-
-        ('P.id AS pid' , 'INTEGER'),
-        ('V.id AS vid' , 'INTEGER'),
-
-        ('V.from_visit', 'INTEGER'),
-        ('V.visit_date', 'INTEGER'),
-        ('V.visit_type', 'INTEGER'),
-
-        # not sure what session is form but could be useful?..
-        # NOTE(20210410): for now, commented it out since some older databases from phone have this column commented?
-        # needs to be defensive
-        # ('V.session' , 'INTEGER'),
-    ], key=('url', 'visit_date', 'vid', 'pid'))
-    query='FROM chunk.moz_historyvisits as V, chunk.moz_places as P WHERE V.place_id = P.id'
-
-    row2visit = _row2visit_firefox
-
-
-class FirefoxPhone(Extr):
-    detector='remote_devices'
-    schema_check=('visits', "_id, history_guid, visit_type, date, is_local")
-    schema=Schema(cols=[
-        ('H.url' , 'TEXT NOT NULL' ),
-
-        ('H.guid AS guid' , 'TEXT' ),
-        ('H._id AS hid' , 'INTEGER' ),
-        ('V._id AS vid' , 'INTEGER' ),
-
-        ('V.visit_type' , 'INTEGER NOT NULL'),
-        ('V.date as visit_date', 'INTEGER NOT NULL'),
-        # ('is_local' , 'INTEGER NOT NULL'),
-    ], key=('url', 'date', 'vid', 'hid'))
-    query='FROM chunk.visits as V, chunk.history as H WHERE V.history_guid = H.guid'
-
-    row2visit = _row2visit_firefox
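
The ad-hoc `class config` in `_index_new_with_adhoc_config` mirrors the shape HPI expects from a permanent setup, which is what the deprecation warning asks users to do instead of passing paths to this source. A hypothetical static config might look like the sketch below (the config module location and the export path depend on your HPI setup):

    # in your HPI config (my.config) -- illustrative sketch only
    class browser:
        class export:
            # sqlite databases created by https://github.com/seanbreckenridge/browserexport
            export_path = '~/data/browsing/*.sqlite'
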