promnesia 1.3.20241021__py3-none-any.whl → 1.4.20250909__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. promnesia/__init__.py +4 -1
  2. promnesia/__main__.py +72 -59
  3. promnesia/cannon.py +90 -89
  4. promnesia/common.py +74 -62
  5. promnesia/compare.py +15 -10
  6. promnesia/config.py +22 -17
  7. promnesia/database/dump.py +1 -2
  8. promnesia/extract.py +6 -6
  9. promnesia/logging.py +27 -15
  10. promnesia/misc/install_server.py +25 -19
  11. promnesia/server.py +69 -53
  12. promnesia/sources/auto.py +65 -51
  13. promnesia/sources/browser.py +7 -2
  14. promnesia/sources/browser_legacy.py +51 -40
  15. promnesia/sources/demo.py +0 -1
  16. promnesia/sources/fbmessenger.py +0 -1
  17. promnesia/sources/filetypes.py +15 -11
  18. promnesia/sources/github.py +4 -1
  19. promnesia/sources/guess.py +4 -1
  20. promnesia/sources/hackernews.py +5 -7
  21. promnesia/sources/hpi.py +3 -1
  22. promnesia/sources/html.py +4 -2
  23. promnesia/sources/instapaper.py +1 -0
  24. promnesia/sources/markdown.py +4 -4
  25. promnesia/sources/org.py +17 -8
  26. promnesia/sources/plaintext.py +14 -11
  27. promnesia/sources/pocket.py +2 -1
  28. promnesia/sources/reddit.py +5 -8
  29. promnesia/sources/roamresearch.py +3 -1
  30. promnesia/sources/rss.py +4 -5
  31. promnesia/sources/shellcmd.py +3 -6
  32. promnesia/sources/signal.py +14 -14
  33. promnesia/sources/smscalls.py +0 -1
  34. promnesia/sources/stackexchange.py +2 -2
  35. promnesia/sources/takeout.py +14 -21
  36. promnesia/sources/takeout_legacy.py +16 -10
  37. promnesia/sources/telegram.py +7 -3
  38. promnesia/sources/telegram_legacy.py +5 -5
  39. promnesia/sources/twitter.py +1 -1
  40. promnesia/sources/vcs.py +6 -3
  41. promnesia/sources/viber.py +2 -2
  42. promnesia/sources/website.py +4 -3
  43. promnesia/sqlite.py +10 -7
  44. promnesia/tests/common.py +2 -0
  45. promnesia/tests/server_helper.py +2 -2
  46. promnesia/tests/sources/test_filetypes.py +9 -7
  47. promnesia/tests/sources/test_hypothesis.py +7 -3
  48. promnesia/tests/sources/test_org.py +7 -2
  49. promnesia/tests/sources/test_plaintext.py +9 -7
  50. promnesia/tests/sources/test_shellcmd.py +10 -9
  51. promnesia/tests/test_cannon.py +254 -237
  52. promnesia/tests/test_cli.py +8 -2
  53. promnesia/tests/test_compare.py +16 -12
  54. promnesia/tests/test_db_dump.py +4 -3
  55. promnesia/tests/test_extract.py +7 -4
  56. promnesia/tests/test_indexer.py +10 -10
  57. promnesia/tests/test_server.py +10 -10
  58. promnesia/tests/utils.py +1 -5
  59. promnesia-1.4.20250909.dist-info/METADATA +66 -0
  60. promnesia-1.4.20250909.dist-info/RECORD +80 -0
  61. {promnesia-1.3.20241021.dist-info → promnesia-1.4.20250909.dist-info}/WHEEL +1 -2
  62. promnesia/kjson.py +0 -122
  63. promnesia/sources/__init__.pyi +0 -0
  64. promnesia-1.3.20241021.dist-info/METADATA +0 -55
  65. promnesia-1.3.20241021.dist-info/RECORD +0 -83
  66. promnesia-1.3.20241021.dist-info/top_level.txt +0 -1
  67. {promnesia-1.3.20241021.dist-info → promnesia-1.4.20250909.dist-info}/entry_points.txt +0 -0
  68. {promnesia-1.3.20241021.dist-info → promnesia-1.4.20250909.dist-info/licenses}/LICENSE +0 -0
promnesia/sources/auto.py CHANGED
@@ -2,22 +2,23 @@
   - discovers files recursively
   - guesses the format (orgmode/markdown/json/etc) by the extension/MIME type
   - can index most of plaintext files, including source code!
- - autodetects Obsidian vault and adds `obsidian://` app protocol support [[file:../src/promnesia/sources/obsidian.py][promnesia.sources.obsidian]]
- - autodetects Logseq graph and adds `logseq://` app protocol support [[file:../src/promnesia/sources/logseq.py][promnesia.sources.logseq]]
+ - autodetects Obsidian vault and adds `obsidian://` app protocol support [[file:../src/promnesia/sources/auto_obsidian.py][promnesia.sources.obsidian]]
+ - autodetects Logseq graph and adds `logseq://` app protocol support [[file:../src/promnesia/sources/auto_logseq.py][promnesia.sources.logseq]]
  """
+
  from __future__ import annotations

  import csv
  import itertools
  import json
  import os
- from collections.abc import Iterable, Iterator, Sequence
+ from collections.abc import Callable, Iterable, Iterator, Sequence
  from concurrent.futures import ProcessPoolExecutor as Pool
  from contextlib import nullcontext
  from fnmatch import fnmatch
  from functools import wraps
  from pathlib import Path
- from typing import Any, Callable, NamedTuple, Optional
+ from typing import Any, NamedTuple

  from promnesia.common import (
      Loc,
@@ -71,6 +72,7 @@ def collect_from(thing) -> list[EUrl]:

  Urls = Iterator[EUrl]

+
  def _csv(path: Path) -> Urls:
      # TODO these could also have Loc to be fair..
      with path.open() as fo:
@@ -88,6 +90,7 @@ def _json(path: Path) -> Urls:
  def _plaintext(path: Path) -> Results:
      from . import shellcmd
      from .plaintext import extract_from_path
+
      yield from shellcmd.index(extract_from_path(path))


@@ -97,6 +100,7 @@ def fallback(ex):
      """Falls back to plaintext in case of issues"""

      fallback_active: dict[Any, bool] = {}
+
      @wraps(ex)
      def wrapped(path: Path):
          nonlocal fallback_active
@@ -110,79 +114,83 @@ def fallback(ex):
          except ModuleNotFoundError as me:
              logger = get_logger()
              logger.exception(me)
-             logger.warning('%s: %s not found, falling back to grep! "pip3 install --user %s" for better support!', path, me.name, me.name)
+             logger.warning(
+                 '%s: %s not found, falling back to grep! "pip3 install --user %s" for better support!',
+                 path,
+                 me.name,
+                 me.name,
+             )
              yield me
              fallback_active[ex] = True
              do_fallback = True
          if do_fallback:
              yield from _plaintext(path)
+
      return wrapped


  @fallback
  def _markdown(path: Path) -> Results:
      from . import markdown
+
      yield from markdown.extract_from_file(path)


  @fallback
  def _html(path: Path) -> Results:
      from . import html
+
      yield from html.extract_from_file(path)


  @fallback
  def _org(path: Path) -> Results:
      from . import org
+
      return org.extract_from_file(path)


  from .filetypes import CODE, IGNORE, TYPE2IDX, type2idx

- TYPE2IDX.update({
-     'application/json': _json,
-     '.json' : _json,
-     '.ipynb' : _json,
-
-     '.csv' : _csv,
-     'application/csv': _csv,
-
-     '.org' : _org,
-     '.org_archive': _org,
-
-     '.md' : _markdown,
-     '.markdown' : _markdown,
-
-     'text/plain' : _plaintext,
-     '.txt' : _plaintext,
-     '.page' : _plaintext,
-     '.rst' : _plaintext,
-
-
-     # TODO doesn't work that great; weird stuff like
-     # builtins.ImportError.name|2019-07-10T12:12:35.584510+00:00|names::ImportError::node::names::name::node::fullname
-     # TODO could have stricter url extraction for that; always using http/https?
-     # '.ipynb' : _json,
-
-     '.html' : _html,
-     'text/html': _html,
-     'text/xml' : _plaintext,
-
-     'text/x-po': _plaintext, # some translation files
- })
+ TYPE2IDX.update(
+     {
+         'application/json': _json,
+         '.json': _json,
+         '.ipynb': _json,
+         '.csv': _csv,
+         'application/csv': _csv,
+         '.org': _org,
+         '.org_archive': _org,
+         '.md': _markdown,
+         '.markdown': _markdown,
+         'text/plain': _plaintext,
+         '.txt': _plaintext,
+         '.page': _plaintext,
+         '.rst': _plaintext,
+         # TODO doesn't work that great; weird stuff like
+         # builtins.ImportError.name|2019-07-10T12:12:35.584510+00:00|names::ImportError::node::names::name::node::fullname
+         # TODO could have stricter url extraction for that; always using http/https?
+         # '.ipynb' : _json,
+         '.html': _html,
+         'text/html': _html,
+         'text/xml': _plaintext,
+         'text/x-po': _plaintext,  # some translation files
+     }
+ )

  for t in CODE:
      TYPE2IDX[t] = _plaintext
  # TODO ok, mime doesn't really tell between org/markdown/etc anyway


- Replacer = Optional[Callable[[str, str], str]]
+ Replacer = Callable[[str, str], str] | None
+

  def index(
-     *paths: PathIsh,
-     ignored: Sequence[str] | str=(),
-     follow: bool=True,
-     replacer: Replacer=None,
+     *paths: PathIsh,
+     ignored: Sequence[str] | str = (),
+     follow: bool = True,
+     replacer: Replacer = None,
  ) -> Results:
      '''
      path : a path or list of paths to recursively index
@@ -215,13 +223,14 @@ def index(
      )
      yield from _index(apath, opts=opts)

+
  class Options(NamedTuple):
      ignored: Sequence[str]
      follow: bool
      # TODO option to add ignores? not sure..
      # TODO I don't like this replacer thing... think about removing it
      replacer: Replacer
-     root: Path | None=None
+     root: Path | None = None


  def _index_file_aux(path: Path, opts: Options) -> Exception | list[Result]:
@@ -237,14 +246,14 @@ def _index(path: Path, opts: Options) -> Results:
      logger = get_logger()

      cores = use_cores()
-     if cores is None: # do not use cores
+     if cores is None:  # do not use cores
          # todo use ExitStack instead?
          pool = nullcontext()
-         mapper = map # dummy pool
+         mapper = map  # dummy pool
      else:
          workers = None if cores == 0 else cores
-         pool = Pool(workers) # type: ignore
-         mapper = pool.map # type: ignore
+         pool = Pool(workers)  # type: ignore[assignment]
+         mapper = pool.map  # type: ignore[attr-defined]

      # iterate over resolved paths, to avoid duplicates
      def rit() -> Iterable[Path]:
@@ -254,7 +263,7 @@ def _index(path: Path, opts: Options) -> Results:
              # TODO not sure if should log here... might end up with quite a bit of logs
              logger.debug('ignoring %s: user ignore rules', p)
              continue
-         if any(i in p.parts for i in IGNORE): # meh, not very efficient.. pass to traverse??
+         if any(i in p.parts for i in IGNORE):  # meh, not very efficient.. pass to traverse??
              logger.debug('ignoring %s: default ignore rules', p)
              continue

@@ -266,6 +275,7 @@ def _index(path: Path, opts: Options) -> Results:
          yield p

      from more_itertools import unique_everseen
+
      it = unique_everseen(rit())

      with pool:
@@ -302,9 +312,10 @@ def _index_file(pp: Path, opts: Options) -> Results:
      # TODO not even sure if it's used...
      suf = pp.suffix.lower()

-     if suf == '.xz': # TODO zstd?
+     if suf == '.xz':  # TODO zstd?
          import lzma
-         uname = pp.name[:-len('.xz')] # chop off suffix, so the downstream indexer can handle it
+
+         uname = pp.name[: -len('.xz')]  # chop off suffix, so the downstream indexer can handle it

      assert pp.is_absolute(), pp
      # make sure to keep hierarchy, otherwise might end up with some name conflicts if filenames clash
@@ -359,7 +370,8 @@ def _index_file(pp: Path, opts: Options) -> Results:
          v = r

      loc = v.locator
-     if loc is not None and root is not None:
+     # FIXME double checke that v.locator indeed can't be none and remove the check?
+     if loc is not None and root is not None:  # type: ignore[redundant-expr]
          # meh. but it works
          # todo potentially, just use dataclasses instead...
          loc = loc._replace(title=loc.title.replace(str(root) + os.sep, ''))
@@ -369,7 +381,9 @@ def _index_file(pp: Path, opts: Options) -> Results:
      upd: dict[str, Any] = {}
      href = v.locator.href
      if href is not None:
-         upd['locator'] = v.locator._replace(href=replacer(href, str(root)), title=replacer(v.locator.title, str(root)))
+         upd['locator'] = v.locator._replace(
+             href=replacer(href, str(root)), title=replacer(v.locator.title, str(root))
+         )
      ctx = v.context
      if ctx is not None:
          # TODO in context, http is unnecessary
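
Aside on the `fallback` pattern reformatted above: extractors like `_markdown` import their optional dependency lazily, and the decorator degrades to the grep-based plaintext indexer when that import fails. A minimal standalone sketch of the idea (simplified — it omits promnesia's `fallback_active` caching, and the names below are illustrative, not promnesia's actual API):

```python
from collections.abc import Callable, Iterator
from functools import wraps
from pathlib import Path

Results = Iterator[str]  # stand-in for promnesia's richer Result type


def _plaintext(path: Path) -> Results:
    # crude fallback: emit any line that looks like it contains a URL
    for line in path.read_text(errors='replace').splitlines():
        if 'http://' in line or 'https://' in line:
            yield line.strip()


def fallback(ex: Callable[[Path], Results]) -> Callable[[Path], Results]:
    # if the rich extractor raises ModuleNotFoundError (optional dependency
    # missing), fall back to the plaintext scan instead of failing the index
    @wraps(ex)
    def wrapped(path: Path) -> Results:
        try:
            yield from ex(path)
        except ModuleNotFoundError:
            yield from _plaintext(path)

    return wrapped


@fallback
def _markdown(path: Path) -> Results:
    import mistletoe  # noqa: F401  # hypothetical optional dependency

    yield f'parsed {path} as markdown'


# suffix/MIME -> handler registry, consulted for each discovered file,
# mirroring the role of TYPE2IDX above
TYPE2IDX: dict[str, Callable[[Path], Results]] = {
    '.md': _markdown,
    '.txt': _plaintext,
}
```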
promnesia/sources/browser.py CHANGED
@@ -13,16 +13,17 @@ from promnesia.common import Loc, PathIsh, Results, Second, Visit, is_sqlite_db,


  def index(p: PathIsh | None = None) -> Results:
-     from . import hpi # noqa: F401,I001
+     from . import hpi  # noqa: F401

      if p is None:
          from my.browser.all import history
+
          yield from _index_new(history())
          return

      warnings.warn(
          f'Passing paths to promnesia.sources.browser is deprecated, you should setup my.browser.export instead. '
-         f'See https://github.com/seanbreckenridge/browserexport#hpi .'
+         f'See https://github.com/purarue/browserexport#hpi .'
          f'Will try to hack path to browser databases {p} into HPI config.'
      )
      try:
@@ -50,12 +51,14 @@ def _index_new_with_adhoc_config(*, path: PathIsh) -> Results:
      ## this would result in each subsequent call to my.browser.export.history to invalidate cache every time
      ## so we hack cachew path so it's different for each call
      from my.core.core_config import config as hpi_core_config
+
      hpi_cache_dir = hpi_core_config.get_cache_dir()
      sanitized_path = re.sub(r'\W', '_', str(path))
      cache_override = None if hpi_cache_dir is None else hpi_cache_dir / sanitized_path
      ##

      from my.core.common import Paths, classproperty, get_files
+
      class config:
          class core:
              cache_dir = cache_override
@@ -67,8 +70,10 @@ def _index_new_with_adhoc_config(*, path: PathIsh) -> Results:
              return tuple([f for f in get_files(path, glob='**/*') if is_sqlite_db(f)])

      from my.core.cfg import tmp_config
+
      with tmp_config(modules='my.browser.export|my.core.core_config', config=config):
          from my.browser.export import history
+
          yield from _index_new(history())


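The `## so we hack cachew path ...` comment above is the crux of `_index_new_with_adhoc_config`: the cachew cache directory is keyed on the sanitized input path, so different ad-hoc paths don't invalidate each other's cache. For illustration (the example path is ours, not from the source):

```python
import re

path = '/home/user/backups/browser-dbs'  # hypothetical input path
sanitized = re.sub(r'\W', '_', path)     # replace every non-word character
assert sanitized == '_home_user_backups_browser_dbs'
# the per-path cache dir then becomes hpi_cache_dir / sanitized
```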
promnesia/sources/browser_legacy.py CHANGED
@@ -1,12 +1,10 @@
  from __future__ import annotations

  import sqlite3
- from datetime import datetime
+ from datetime import datetime, timezone
  from pathlib import Path
  from urllib.parse import unquote

- import pytz
-
  from promnesia import config
  from promnesia.common import Loc, PathIsh, Results, Second, Visit, is_sqlite_db, logger

@@ -15,6 +13,7 @@ try:
  except ModuleNotFoundError as me:
      if me.name != 'cachew':
          raise me
+
      # this module is legacy anyway, so just make it defensive
      def cachew(*args, **kwargs): # type: ignore[no-redef]
          return lambda f: f
@@ -22,7 +21,7 @@ except ModuleNotFoundError as me:

  def index(p: PathIsh) -> Results:
      pp = Path(p)
-     assert pp.exists(), pp # just in case of broken symlinks
+     assert pp.exists(), pp  # just in case of broken symlinks

      # todo warn if filtered out too many?
      # todo wonder how quickly mimes can be computed?
@@ -31,14 +30,14 @@ def index(p: PathIsh) -> Results:

      assert len(dbs) > 0, pp
      logger.info('processing %d databases', len(dbs))
-     cname = str('_'.join(pp.parts[1:])) # meh
+     cname = str('_'.join(pp.parts[1:]))  # meh
      yield from _index_dbs(dbs, cachew_name=cname)


-
  def _index_dbs(dbs: list[Path], cachew_name: str):
      # TODO right... not ideal, need to think how to handle it properly...
      import sys
+
      sys.setrecursionlimit(5000)

      cache_dir = config.get().cache_dir
@@ -49,13 +48,13 @@ def _index_dbs(dbs: list[Path], cachew_name: str):

  # todo wow, stack traces are ridiculous here...
  # todo hmm, feels like it should be a class or something?
- @cachew(lambda cp, dbs, emitted: cp, depends_on=lambda cp, dbs, emitted: dbs) # , logger=logger) # noqa: ARG005
+ @cachew(lambda cp, dbs, emitted: cp, depends_on=lambda cp, dbs, emitted: dbs)  # , logger=logger)  # noqa: ARG005
  def _index_dbs_aux(cache_path: Path | None, dbs: list[Path], emitted: set) -> Results:
      if len(dbs) == 0:
          return

      xs = dbs[:-1]
-     x = dbs[-1:]
+     x = dbs[-1:]

      xs_res = _index_dbs_aux(cache_path, xs, emitted)
      xs_was_cached = False
@@ -66,36 +65,38 @@ def _index_dbs_aux(cache_path: Path | None, dbs: list[Path], emitted: set) -> Re
              logger.debug('seems that %d first items were previously cached', len(xs))
          if xs_was_cached:
              key = str(r) if isinstance(r, Exception) else (r.url, r.dt)
-             assert key not in emitted, key # todo not sure if this assert is necessary?
+             assert key not in emitted, key  # todo not sure if this assert is necessary?
              # hmm ok it might happen if we messed up with indexing individual db?
              # alternatively, could abuse it to avoid messing with 'emitted' in _index_db?
              emitted.add(key)
-         yield r # todo not sure about exceptions?
+         yield r  # todo not sure about exceptions?

      for db in x:
          yield from _index_db(db, emitted=emitted)


  def _index_db(db: Path, emitted: set):
-     logger.info('processing %s', db) # debug level?
+     logger.info('processing %s', db)  # debug level?

      # todo schema check (not so critical for cachew though)
      total = 0
-     new = 0
-     loc = Loc.file(db) # todo possibly needs to be optimized -- moving from within the loop considerably speeds everything up
+     new = 0
+     loc = Loc.file(
+         db
+     )  # todo possibly needs to be optimized -- moving from within the loop considerably speeds everything up
      with sqlite3.connect(f'file:{db}?immutable=1', uri=True) as c:
          browser = None
          for b in [Chrome, Firefox, FirefoxPhone, Safari]:
              try:
                  c.execute(f'SELECT * FROM {b.detector}')
-             except sqlite3.OperationalError: # not sure if the right kind?
+             except sqlite3.OperationalError:  # not sure if the right kind?
                  pass
              else:
                  browser = b
                  break
          assert browser is not None

-         proj = ', '.join(c for c, _ in browser.schema.cols)
+         proj = ', '.join(c for c, _ in browser.schema.cols)
          query = browser.query.replace('chunk.', '')

          c.row_factory = sqlite3.Row
@@ -123,7 +124,7 @@ ColType = str


  from collections.abc import Sequence
- from typing import NamedTuple, Union
+ from typing import NamedTuple


  class Schema(NamedTuple):
@@ -131,7 +132,7 @@ class Schema(NamedTuple):
      key: Sequence[str]


- SchemaCheck = tuple[str, Union[str, Sequence[str]]] # todo Union: meh
+ SchemaCheck = tuple[str, str | Sequence[str]]  # todo Union: meh

  from dataclasses import dataclass

@@ -151,14 +152,15 @@ class Extr:


  class Chrome(Extr):
-     detector='keyword_search_terms'
+     detector = 'keyword_search_terms'
+     # fmt: off
      schema_check=(
          'visits', [
              'visits', "id, url, visit_time, from_visit, transition, segment_id, visit_duration, incremented_omnibox_typed_score",
              'visits', "id, url, visit_time, from_visit, transition, segment_id, visit_duration"
          ]
      )
-     schema=Schema(cols=[
+     schema = Schema(cols=[
          ('U.url' , 'TEXT' ),

          # while these two are not very useful, might be good to have just in case for some debugging
@@ -172,16 +174,17 @@ class Chrome(Extr):
          ('V.visit_duration' , 'INTEGER NOT NULL'),
          # V.omnibox thing looks useless
      ], key=('url', 'visit_time', 'vid', 'urlid'))
-     query='FROM chunk.visits as V, chunk.urls as U WHERE V.url = U.id'
+     # fmt: on
+     query = 'FROM chunk.visits as V, chunk.urls as U WHERE V.url = U.id'

      @staticmethod
      def row2visit(row: sqlite3.Row, loc: Loc) -> Visit:
-         url = row['url']
-         ts = row['visit_time']
+         url = row['url']
+         ts = row['visit_time']
          durs = row['visit_duration']

          dt = chrome_time_to_utc(int(ts))
-         url = unquote(url) # chrome urls are all quoted
+         url = unquote(url)  # chrome urls are all quoted
          dd = int(durs)
          dur: Second | None = None if dd == 0 else dd // 1_000_000
          return Visit(
@@ -196,12 +199,12 @@ class Chrome(Extr):
  # yep, tested it and looks like utc
  def chrome_time_to_utc(chrome_time: int) -> datetime:
      epoch = (chrome_time / 1_000_000) - 11644473600
-     return datetime.fromtimestamp(epoch, pytz.utc)
+     return datetime.fromtimestamp(epoch, timezone.utc)


  def _row2visit_firefox(row: sqlite3.Row, loc: Loc) -> Visit:
      url = row['url']
-     ts = float(row['visit_date'])
+     ts = float(row['visit_date'])
      # ok, looks like it's unix epoch
      # https://stackoverflow.com/a/19430099/706389

@@ -214,17 +217,19 @@ def _row2visit_firefox(row: sqlite3.Row, loc: Loc) -> Visit:
      else:
          # milliseconds
          ts /= 1_000
-     dt = datetime.fromtimestamp(ts, pytz.utc)
-     url = unquote(url) # firefox urls are all quoted
+     dt = datetime.fromtimestamp(ts, timezone.utc)
+     url = unquote(url)  # firefox urls are all quoted
      return Visit(
          url=url,
          dt=dt,
          locator=loc,
      )

+
  # https://web.archive.org/web/20201026130310/http://fileformats.archiveteam.org/wiki/History.db
  class Safari(Extr):
-     detector='history_tombstones'
+     detector = 'history_tombstones'
+     # fmt: off
      schema_check=(
          'history_visits', [
              'history_visits', "id, history_item, visit_time",
@@ -245,13 +250,14 @@ class Safari(Extr):
          # ('V.visit_duration' , 'INTEGER NOT NULL'),
          # V.omnibox thing looks useless
      ], key=('url', 'visit_time', 'vid', 'urlid'))
-     query='FROM chunk.history_visits as V, chunk.history_items as U WHERE V.history_item = U.id'
+     # fmt: on
+     query = 'FROM chunk.history_visits as V, chunk.history_items as U WHERE V.history_item = U.id'

      @staticmethod
      def row2visit(row: sqlite3.Row, loc: Loc) -> Visit:
-         url = row['url']
-         ts = row['visit_time'] + 978307200 # https://stackoverflow.com/a/34546556/16645
-         dt = datetime.fromtimestamp(ts, pytz.utc)
+         url = row['url']
+         ts = row['visit_time'] + 978307200  # https://stackoverflow.com/a/34546556/16645
+         dt = datetime.fromtimestamp(ts, timezone.utc)

          return Visit(
              url=url,
@@ -259,10 +265,12 @@ class Safari(Extr):
              locator=loc,
          )

+
  # https://web.archive.org/web/20190730231715/https://www.forensicswiki.org/wiki/Mozilla_Firefox_3_History_File_Format#moz_historyvisits
  class Firefox(Extr):
-     detector='moz_meta'
-     schema_check=('moz_historyvisits', "id, from_visit, place_id, visit_date, visit_type")
+     detector = 'moz_meta'
+     schema_check = ('moz_historyvisits', "id, from_visit, place_id, visit_date, visit_type")
+     # fmt: off
      schema=Schema(cols=[
          ('P.url' , 'TEXT'),

@@ -278,14 +286,16 @@ class Firefox(Extr):
          # needs to be defensive
          # ('V.session' , 'INTEGER'),
      ], key=('url', 'visit_date', 'vid', 'pid'))
-     query='FROM chunk.moz_historyvisits as V, chunk.moz_places as P WHERE V.place_id = P.id'
+     # fmt: on
+     query = 'FROM chunk.moz_historyvisits as V, chunk.moz_places as P WHERE V.place_id = P.id'

-     row2visit = _row2visit_firefox
+     row2visit = _row2visit_firefox  # type: ignore[assignment]


  class FirefoxPhone(Extr):
-     detector='remote_devices'
-     schema_check=('visits', "_id, history_guid, visit_type, date, is_local")
+     detector = 'remote_devices'
+     schema_check = ('visits', "_id, history_guid, visit_type, date, is_local")
+     # fmt: off
      schema=Schema(cols=[
          ('H.url' , 'TEXT NOT NULL' ),

@@ -297,6 +307,7 @@ class FirefoxPhone(Extr):
          ('V.date as visit_date', 'INTEGER NOT NULL'),
          # ('is_local' , 'INTEGER NOT NULL'),
      ], key=('url', 'date', 'vid', 'hid'))
-     query='FROM chunk.visits as V, chunk.history as H WHERE V.history_guid = H.guid'
+     # fmt: on
+     query = 'FROM chunk.visits as V, chunk.history as H WHERE V.history_guid = H.guid'

-     row2visit = _row2visit_firefox
+     row2visit = _row2visit_firefox  # type: ignore[assignment]
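
The main functional change in this file is dropping the `pytz` dependency: `datetime.fromtimestamp(ts, pytz.utc)` and `datetime.fromtimestamp(ts, timezone.utc)` produce identical aware datetimes, so the stdlib spelling is a drop-in replacement. As a worked example of the Chrome conversion above (the function is copied from the diff; the sample tick value is ours):

```python
from datetime import datetime, timezone


def chrome_time_to_utc(chrome_time: int) -> datetime:
    # Chrome stores visit_time in microseconds since 1601-01-01 (the Windows
    # FILETIME epoch); 11644473600 s separates that epoch from the Unix epoch
    epoch = (chrome_time / 1_000_000) - 11644473600
    return datetime.fromtimestamp(epoch, timezone.utc)


# 2021-01-01T00:00:00Z expressed as Chrome ticks (1609459200 is its Unix time)
ticks = (11644473600 + 1609459200) * 1_000_000
assert chrome_time_to_utc(ticks) == datetime(2021, 1, 1, tzinfo=timezone.utc)
```

Safari's `+ 978307200` offset in its `row2visit` plays the same role: Safari timestamps count seconds from 2001-01-01 (Apple's reference date), and that constant shifts them onto the Unix epoch.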
promnesia/sources/demo.py CHANGED
@@ -21,7 +21,6 @@ def index(
      base_dt: datetime | IsoFormatDt = datetime.min + timedelta(days=5000),
      delta: timedelta | Seconds = timedelta(hours=1),
  ) -> Results:
-
      base_dt_ = base_dt if isinstance(base_dt, datetime) else datetime.fromisoformat(base_dt)
      delta_ = delta if isinstance(delta, timedelta) else timedelta(seconds=delta)

promnesia/sources/fbmessenger.py CHANGED
@@ -33,4 +33,3 @@ def index() -> Results:
              context=m.text,
              locator=loc,
          )
-
promnesia/sources/filetypes.py CHANGED
@@ -1,23 +1,26 @@
  from __future__ import annotations

- from collections.abc import Iterable, Sequence
+ from collections.abc import Callable, Iterable, Sequence
  from functools import lru_cache
  from pathlib import Path
- from typing import Callable, NamedTuple, Union
+ from typing import NamedTuple

  from ..common import Results, Url

  # TODO doesn't really belong here...
  Ctx = Sequence[str]

+
  class EUrl(NamedTuple):
      url: Url
-     ctx: Ctx # TODO ctx here is more like a Loc
+     ctx: Ctx  # TODO ctx here is more like a Loc
+
+
  ###


  # keys are mime types + extensions
- Ex = Callable[[Path], Union[Results, Iterable[EUrl]]]
+ Ex = Callable[[Path], Results | Iterable[EUrl]]
  # None means unhandled
  TYPE2IDX: dict[str, Ex | None] = {}
  # NOTE: there are some types in auto.py at the moment... it's a bit messy
@@ -27,13 +30,13 @@ TYPE2IDX: dict[str, Ex | None] = {}
  @lru_cache(None)
  def type2idx(t: str) -> Ex | None:
      if len(t) == 0:
-         return None # just in case?
+         return None  # just in case?
      # first try exact match
-     e = TYPE2IDX.get(t, None)
+     e = TYPE2IDX.get(t)
      if e is not None:
          return e
      t = t.strip('.')
-     e = TYPE2IDX.get(t, None)
+     e = TYPE2IDX.get(t)
      if e is not None:
          return e
      # otherwise, try prefixes?
@@ -42,6 +45,7 @@ def type2idx(t: str) -> Ex | None:
          return v
      return None

+
  # for now source code just indexed with grep, not sure if it's good enough?
  # if not, some fanceir library could be used...
  # e.g. https://github.com/karlicoss/promnesia/pull/152/commits/c2f00eb4ee4018b02c9bf3966a036db69a43373d
@@ -82,7 +86,7 @@ CODE = {

      '.ts', # most likely typescript.. otherwise determined as text/vnd.trolltech.linguist mime
      '.js',
- }
+ }  # fmt: skip
  # TODO discover more extensions with mimetypes library?


@@ -100,6 +104,7 @@ video/

  handle_later = lambda *_args, **_kwargs: ()

+
  def ignore(*_args, **_kwargs):
      # TODO log (once?)
      yield from ()
@@ -129,7 +134,7 @@ TYPE2IDX.update({
      'application/zip' : handle_later,
      'application/x-tar' : handle_later,
      'application/gzip' : handle_later,
- })
+ })  # fmt: skip


  # TODO use some existing file for initial gitignore..
@@ -148,5 +153,4 @@ IGNORE = [
      # TODO not sure about these:
      '.gitignore',
      '.babelrc',
- ]
-
+ ]  # fmt: skip
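
A recurring theme across these hunks (`auto.py`, `browser_legacy.py`, `filetypes.py`) is the typing cleanup: `Optional`/`Union` become PEP 604 `|` unions and `Callable` moves from `typing` to `collections.abc`. A minimal sketch of the equivalence, with one runtime caveat worth knowing:

```python
from __future__ import annotations  # makes annotations lazy, but not aliases

from collections.abc import Callable, Sequence

# old spellings, removed in this release:
#   from typing import Callable, Optional, Union
#   Replacer = Optional[Callable[[str, str], str]]
#   SchemaCheck = tuple[str, Union[str, Sequence[str]]]

# new spellings, as in the diffs above; since these aliases are evaluated at
# runtime (they are assignments, not mere annotations), the `X | Y` form
# requires Python >= 3.10 even with the __future__ import in place
Replacer = Callable[[str, str], str] | None
SchemaCheck = tuple[str, str | Sequence[str]]
```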