promnesia 1.1.20230129__py3-none-any.whl → 1.2.20240810__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (72)
  1. promnesia/__main__.py +58 -50
  2. promnesia/cannon.py +4 -4
  3. promnesia/common.py +57 -38
  4. promnesia/compare.py +3 -2
  5. promnesia/compat.py +6 -65
  6. promnesia/config.py +4 -2
  7. promnesia/database/common.py +66 -0
  8. promnesia/database/dump.py +187 -0
  9. promnesia/{read_db.py → database/load.py} +10 -11
  10. promnesia/extract.py +1 -0
  11. promnesia/kjson.py +1 -1
  12. promnesia/logging.py +14 -14
  13. promnesia/misc/__init__.pyi +0 -0
  14. promnesia/misc/config_example.py +1 -2
  15. promnesia/misc/install_server.py +5 -4
  16. promnesia/server.py +24 -24
  17. promnesia/sources/__init__.pyi +0 -0
  18. promnesia/sources/auto.py +12 -7
  19. promnesia/sources/browser.py +80 -293
  20. promnesia/sources/browser_legacy.py +298 -0
  21. promnesia/sources/demo.py +18 -2
  22. promnesia/sources/filetypes.py +8 -0
  23. promnesia/sources/github.py +2 -2
  24. promnesia/sources/hackernews.py +1 -2
  25. promnesia/sources/hypothesis.py +1 -1
  26. promnesia/sources/markdown.py +15 -15
  27. promnesia/sources/org.py +7 -3
  28. promnesia/sources/plaintext.py +3 -1
  29. promnesia/sources/reddit.py +2 -2
  30. promnesia/sources/rss.py +5 -1
  31. promnesia/sources/shellcmd.py +6 -2
  32. promnesia/sources/signal.py +29 -20
  33. promnesia/sources/smscalls.py +8 -1
  34. promnesia/sources/stackexchange.py +2 -2
  35. promnesia/sources/takeout.py +132 -12
  36. promnesia/sources/takeout_legacy.py +10 -2
  37. promnesia/sources/telegram.py +79 -123
  38. promnesia/sources/telegram_legacy.py +117 -0
  39. promnesia/sources/vcs.py +1 -1
  40. promnesia/sources/viber.py +6 -15
  41. promnesia/sources/website.py +1 -1
  42. promnesia/sqlite.py +42 -0
  43. promnesia/tests/__init__.py +0 -0
  44. promnesia/tests/common.py +137 -0
  45. promnesia/tests/server_helper.py +64 -0
  46. promnesia/tests/sources/__init__.py +0 -0
  47. promnesia/tests/sources/test_auto.py +66 -0
  48. promnesia/tests/sources/test_filetypes.py +42 -0
  49. promnesia/tests/sources/test_hypothesis.py +39 -0
  50. promnesia/tests/sources/test_org.py +65 -0
  51. promnesia/tests/sources/test_plaintext.py +26 -0
  52. promnesia/tests/sources/test_shellcmd.py +22 -0
  53. promnesia/tests/sources/test_takeout.py +58 -0
  54. promnesia/tests/test_cannon.py +325 -0
  55. promnesia/tests/test_cli.py +42 -0
  56. promnesia/tests/test_compare.py +30 -0
  57. promnesia/tests/test_config.py +290 -0
  58. promnesia/tests/test_db_dump.py +223 -0
  59. promnesia/tests/test_extract.py +61 -0
  60. promnesia/tests/test_extract_urls.py +43 -0
  61. promnesia/tests/test_indexer.py +245 -0
  62. promnesia/tests/test_server.py +292 -0
  63. promnesia/tests/test_traverse.py +41 -0
  64. promnesia/tests/utils.py +35 -0
  65. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/METADATA +14 -19
  66. promnesia-1.2.20240810.dist-info/RECORD +83 -0
  67. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/WHEEL +1 -1
  68. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/entry_points.txt +0 -1
  69. promnesia/dump.py +0 -105
  70. promnesia-1.1.20230129.dist-info/RECORD +0 -55
  71. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/LICENSE +0 -0
  72. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,298 @@
+ from datetime import datetime
+ from pathlib import Path
+ from urllib.parse import unquote
+ import sqlite3
+ from typing import List, Set, Optional
+
+ import pytz
+
+ from ..common import PathIsh, Results, Visit, Loc, logger, Second, is_sqlite_db
+ from .. import config
+
+ try:
+     from cachew import cachew
+ except ModuleNotFoundError as me:
+     if me.name != 'cachew':
+         raise me
+     # this module is legacy anyway, so just make it defensive
+     def cachew(*args, **kwargs): # type: ignore[no-redef]
+         return lambda f: f
+
+
+ def index(p: PathIsh) -> Results:
+     pp = Path(p)
+     assert pp.exists(), pp # just in case of broken symlinks
+
+     # todo warn if filtered out too many?
+     # todo wonder how quickly mimes can be computed?
+     # todo ugh, dunno, maybe this really belongs to hpi?? need get_files etc...
+     dbs = [p for p in sorted(pp.rglob('*')) if is_sqlite_db(p)]
+
+     assert len(dbs) > 0, pp
+     logger.info('processing %d databases', len(dbs))
+     cname = str('_'.join(pp.parts[1:])) # meh
+     yield from _index_dbs(dbs, cachew_name=cname)
+
+
+
+ def _index_dbs(dbs: List[Path], cachew_name: str):
+     # TODO right... not ideal, need to think how to handle it properly...
+     import sys
+     sys.setrecursionlimit(5000)
+
+     cache_dir = config.get().cache_dir
+     cpath = None if cache_dir is None else cache_dir / cachew_name
+     emitted: Set = set()
+     yield from _index_dbs_aux(cpath, dbs, emitted=emitted)
+
+
+ # todo wow, stack traces are ridiculous here...
+ # todo hmm, feels like it should be a class or something?
+ @cachew(lambda cp, dbs, emitted: cp, depends_on=lambda cp, dbs, emitted: dbs) # , logger=logger)
+ def _index_dbs_aux(cache_path: Optional[Path], dbs: List[Path], emitted: Set) -> Results:
+     if len(dbs) == 0:
+         return
+
+     xs = dbs[:-1]
+     x = dbs[-1:]
+
+     xs_res = _index_dbs_aux(cache_path, xs, emitted)
+     xs_was_cached = False
+     for r in xs_res:
+         # if it was cached, emitted would be empty
+         if len(emitted) == 0:
+             xs_was_cached = True
+             logger.debug('seems that %d first items were previously cached', len(xs))
+         if xs_was_cached:
+             key = str(r) if isinstance(r, Exception) else (r.url, r.dt)
+             assert key not in emitted, key # todo not sure if this assert is necessary?
+             # hmm ok it might happen if we messed up with indexing individual db?
+             # alternatively, could abuse it to avoid messing with 'emitted' in _index_db?
+             emitted.add(key)
+         yield r # todo not sure about exceptions?
+
+     for db in x:
+         yield from _index_db(db, emitted=emitted)
+
+
+ def _index_db(db: Path, emitted: Set):
+     logger.info('processing %s', db) # debug level?
+
+     # todo schema check (not so critical for cachew though)
+     total = 0
+     new = 0
+     loc = Loc.file(db) # todo possibly needs to be optimized -- moving from within the loop considerably speeds everything up
+     with sqlite3.connect(f'file:{db}?immutable=1', uri=True) as c:
+         browser = None
+         for b in [Chrome, Firefox, FirefoxPhone, Safari]:
+             try:
+                 c.execute(f'SELECT * FROM {b.detector}')
+             except sqlite3.OperationalError: # not sure if the right kind?
+                 pass
+             else:
+                 browser = b
+                 break
+         assert browser is not None
+
+         proj = ', '.join(c for c, _ in browser.schema.cols)
+         query = browser.query.replace('chunk.', '')
+
+         c.row_factory = sqlite3.Row
+         for r in c.execute(f'select {proj} {query}'):
+             v = browser.row2visit(r, loc)
+             total += 1
+
+             key = (v.url, v.dt)
+             # todo how to keep keys compatible?
+             if key in emitted:
+                 continue
+             yield v
+             emitted.add(key)
+             new += 1
+
+     # eh, ok, almost 2x faster if I don't construct Visit first
+     # maybe it's Loc.file that's too slow?
+     # yeah, seems like it, 4.1 s after computing it only once
+
+     logger.info('%s: %d/%d new visits', db, new, total)
+
+
+ Col = str
+ ColType = str
+
+
+ from typing import Any, NamedTuple, Tuple, Union, Sequence, Optional
+
+ class Schema(NamedTuple):
+     cols: Sequence[Tuple[Col, ColType]]
+     key: Sequence[str]
+
+
+ SchemaCheck = Tuple[str, Union[str, Sequence[str]]] # todo Union: meh
+
+ from dataclasses import dataclass
+
+ # todo protocol?
+ @dataclass
+ class Extr:
+     detector: str
+     schema_check: SchemaCheck
+     schema: Schema
+     query: str
+
+     # todo calllable?
+     @staticmethod
+     def row2visit(row: sqlite3.Row, loc: Loc) -> Visit:
+         raise NotImplementedError
+
+
+ class Chrome(Extr):
+     detector='keyword_search_terms'
+     schema_check=(
+         'visits', [
+             'visits', "id, url, visit_time, from_visit, transition, segment_id, visit_duration, incremented_omnibox_typed_score",
+             'visits', "id, url, visit_time, from_visit, transition, segment_id, visit_duration"
+         ]
+     )
+     schema=Schema(cols=[
+         ('U.url' , 'TEXT' ),
+
+         # while these two are not very useful, might be good to have just in case for some debugging
+         ('U.id AS urlid' , 'INTEGER'),
+         ('V.id AS vid' , 'INTEGER'),
+
+         ('V.visit_time' , 'INTEGER NOT NULL'),
+         ('V.from_visit' , 'INTEGER' ),
+         ('V.transition' , 'INTEGER NOT NULL'),
+         # V.segment_id looks useless
+         ('V.visit_duration' , 'INTEGER NOT NULL'),
+         # V.omnibox thing looks useless
+     ], key=('url', 'visit_time', 'vid', 'urlid'))
+     query='FROM chunk.visits as V, chunk.urls as U WHERE V.url = U.id'
+
+     @staticmethod
+     def row2visit(row: sqlite3.Row, loc: Loc) -> Visit:
+         url = row['url']
+         ts = row['visit_time']
+         durs = row['visit_duration']
+
+         dt = chrome_time_to_utc(int(ts))
+         url = unquote(url) # chrome urls are all quoted
+         dd = int(durs)
+         dur: Optional[Second] = None if dd == 0 else dd // 1_000_000
+         return Visit(
+             url=url,
+             dt=dt,
+             locator=loc,
+             duration=dur,
+         )
+
+
+ # should be utc? https://stackoverflow.com/a/26226771/706389
+ # yep, tested it and looks like utc
+ def chrome_time_to_utc(chrome_time: int) -> datetime:
+     epoch = (chrome_time / 1_000_000) - 11644473600
+     return datetime.fromtimestamp(epoch, pytz.utc)
+
+
+ def _row2visit_firefox(row: sqlite3.Row, loc: Loc) -> Visit:
+     url = row['url']
+     ts = float(row['visit_date'])
+     # ok, looks like it's unix epoch
+     # https://stackoverflow.com/a/19430099/706389
+
+     # NOTE: ugh. on Fenix (experimental Android version) it uses milliseconds, not nanos...
+     # about year 2001... if someone has browser history exports before that -- please let me know, I'm impressed
+     threshold = 1000000000
+     if ts > threshold * 1_000_000:
+         # presumably it's in microseconds
+         ts /= 1_000_000
+     else:
+         # milliseconds
+         ts /= 1_000
+     dt = datetime.fromtimestamp(ts, pytz.utc)
+     url = unquote(url) # firefox urls are all quoted
+     return Visit(
+         url=url,
+         dt=dt,
+         locator=loc,
+     )
+
+ # https://web.archive.org/web/20201026130310/http://fileformats.archiveteam.org/wiki/History.db
+ class Safari(Extr):
+     detector='history_tombstones'
+     schema_check=(
+         'history_visits', [
+             'history_visits', "id, history_item, visit_time",
+             'history_items', "id, url"
+         ]
+     )
+     schema=Schema(cols=[
+         ('U.url' , 'TEXT' ),
+
+         # while these two are not very useful, might be good to have just in case for some debugging
+         ('U.id AS urlid' , 'INTEGER'),
+         ('V.id AS vid' , 'INTEGER'),
+
+         ('V.visit_time' , 'INTEGER NOT NULL'),
+         # ('V.from_visit' , 'INTEGER' ),
+         # ('V.transition' , 'INTEGER NOT NULL'),
+         # V.segment_id looks useless
+         # ('V.visit_duration' , 'INTEGER NOT NULL'),
+         # V.omnibox thing looks useless
+     ], key=('url', 'visit_time', 'vid', 'urlid'))
+     query='FROM chunk.history_visits as V, chunk.history_items as U WHERE V.history_item = U.id'
+
+     @staticmethod
+     def row2visit(row: sqlite3.Row, loc: Loc) -> Visit:
+         url = row['url']
+         ts = row['visit_time'] + 978307200 # https://stackoverflow.com/a/34546556/16645
+         dt = datetime.fromtimestamp(ts, pytz.utc)
+
+         return Visit(
+             url=url,
+             dt=dt,
+             locator=loc,
+         )
+
+ # https://web.archive.org/web/20190730231715/https://www.forensicswiki.org/wiki/Mozilla_Firefox_3_History_File_Format#moz_historyvisits
+ class Firefox(Extr):
+     detector='moz_meta'
+     schema_check=('moz_historyvisits', "id, from_visit, place_id, visit_date, visit_type")
+     schema=Schema(cols=[
+         ('P.url' , 'TEXT'),
+
+         ('P.id AS pid' , 'INTEGER'),
+         ('V.id AS vid' , 'INTEGER'),
+
+         ('V.from_visit', 'INTEGER'),
+         ('V.visit_date', 'INTEGER'),
+         ('V.visit_type', 'INTEGER'),
+
+         # not sure what session is form but could be useful?..
+         # NOTE(20210410): for now, commented it out since some older databases from phone have this column commented?
+         # needs to be defensive
+         # ('V.session' , 'INTEGER'),
+     ], key=('url', 'visit_date', 'vid', 'pid'))
+     query='FROM chunk.moz_historyvisits as V, chunk.moz_places as P WHERE V.place_id = P.id'
+
+     row2visit = _row2visit_firefox
+
+
+ class FirefoxPhone(Extr):
+     detector='remote_devices'
+     schema_check=('visits', "_id, history_guid, visit_type, date, is_local")
+     schema=Schema(cols=[
+         ('H.url' , 'TEXT NOT NULL' ),
+
+         ('H.guid AS guid' , 'TEXT' ),
+         ('H._id AS hid' , 'INTEGER' ),
+         ('V._id AS vid' , 'INTEGER' ),
+
+         ('V.visit_type' , 'INTEGER NOT NULL'),
+         ('V.date as visit_date', 'INTEGER NOT NULL'),
+         # ('is_local' , 'INTEGER NOT NULL'),
+     ], key=('url', 'date', 'vid', 'hid'))
+     query='FROM chunk.visits as V, chunk.history as H WHERE V.history_guid = H.guid'
+
+     row2visit = _row2visit_firefox
promnesia/sources/demo.py CHANGED
@@ -4,17 +4,33 @@ Generates a sequence of fake evenly separated visits
  '''

  from datetime import datetime, timedelta
+ from typing import Union

  from ..common import Results, Visit, Loc


- def index(count: int=100, *, base_dt: datetime=datetime.min + timedelta(days=5000), delta: timedelta=timedelta(hours=1)) -> Results:
+ IsoFormatDt = str
+ Seconds = int
+
+
+ # TODO allow passing isoformat string as base_dt?
+ # and maybe something similar as delta? start with seconds maybe
+ def index(
+     count: int=100,
+     *,
+     base_dt: Union[datetime, IsoFormatDt] = datetime.min + timedelta(days=5000),
+     delta: Union[timedelta, Seconds] = timedelta(hours=1),
+ ) -> Results:
+
+     base_dt_ = base_dt if isinstance(base_dt, datetime) else datetime.fromisoformat(base_dt)
+     delta_ = delta if isinstance(delta, timedelta) else timedelta(seconds=delta)
+
      # todo with some errors too?
      # todo use data generation library suggested for HPI?
      for i in range(count):
          yield Visit(
              url=f'https://demo.com/page{i}.html',
-             dt=base_dt + delta * i,
+             dt=base_dt_ + delta_ * i,
              locator=Loc.make('demo'),
          )
          # todo add context?
@@ -66,6 +66,8 @@ CODE = {
      'text/x-lisp',
      'text/vnd.graphviz',
      'text/x-diff', # patch files
+     'text/x-php',
+     'text/x-lilypond',

      # these didn't have a mime type, or were mistyped?
      'css',
@@ -114,6 +116,12 @@ TYPE2IDX.update({
      '.vcf' : ignore,
      'message/rfc822': ignore, # ??

+     # todo ignore all fonts?
+     'font/woff2': ignore,
+     'font/woff': ignore,
+     'text/x-Algol68': ignore, # ugh some license file had this?? maybe always index text/ as text?
+     'text/x-bytecode.python': ignore, # todo ignore all x-bytecode?
+
      # TODO not sure what to do about these..
      'application/octet-stream': handle_later,
      'application/zip' : handle_later,
@@ -31,7 +31,7 @@ def index(*, render_markdown: bool = False) -> Results:
          # if enabled, convert the (markdown) body to HTML
          context: Optional[str] = e.body
          if e.body is not None and render_markdown:
-             context = TextParser(e.body)._doc_ashtml()
+             context = TextParser(e.body)._doc_ashtml() # type: ignore[possibly-undefined]

          # locator should link back to this event
          loc = Loc.make(title=e.summary, href=e.link)
@@ -74,7 +74,7 @@ def index(*, render_markdown: bool = False) -> Results:
          # extract from markdown links like [link text](https://...)
          # incase URLExtract missed any somehow
          if render_markdown:
-             for res in extract_from_text(e.body):
+             for res in extract_from_text(e.body): # type: ignore[possibly-undefined]
                  if isinstance(res, Exception):
                      yield res
                      continue
@@ -1,6 +1,5 @@
  '''
- Uses [[https://github.com/karlicoss/HPI][HPI]] dogsheep module to import
- Hacker News items.
+ Uses [[https://github.com/karlicoss/HPI][HPI]] dogsheep module to import HackerNews items.
  '''

  import textwrap
@@ -8,7 +8,7 @@ def index() -> Results:
      from . import hpi
      import my.hypothesis as hyp

-     for h in hyp.get_highlights():
+     for h in hyp.highlights():
          if isinstance(h, Exception):
              yield h
              continue
@@ -1,13 +1,13 @@
  from pathlib import Path
  from typing import Iterator, NamedTuple, Optional

- from ..common import get_logger, Extraction, Url, PathIsh, Res, Visit, Loc, file_mtime, logger
+ from ..common import Extraction, Url, PathIsh, Res, Visit, Loc, file_mtime, logger


- import mistletoe # type: ignore
- from mistletoe.span_token import AutoLink, Link # type: ignore
- import mistletoe.block_token as BT # type: ignore
- from mistletoe.html_renderer import HTMLRenderer # type: ignore
+ import mistletoe # type: ignore
+ from mistletoe.span_token import AutoLink, Link # type: ignore
+ import mistletoe.block_token as BT # type: ignore
+ from mistletoe.html_renderer import HTMLRenderer # type: ignore


  renderer = HTMLRenderer()
@@ -42,7 +42,7 @@ HTML_MARKER = '!html '
  def _ashtml(block) -> str:
      res = renderer.render(block)
      if res.startswith('<p>') and res.endswith('</p>'):
-         res = res[3: -4] # meh, but for now fine
+         res = res[3:-4] # meh, but for now fine
      return res


@@ -62,7 +62,6 @@ class Parser:
          context = None if last_block is None else HTML_MARKER + _ashtml(last_block)
          yield Parsed(url=url, context=context)

-
      def _walk(self, cur, last_block) -> Iterator[Result]:
          if isinstance(cur, block_tokens):
              last_block = cur
@@ -73,12 +72,14 @@ class Parser:
              logger.exception(e)
              yield e

-         children = getattr(cur, 'children', [])
+         # keeping getattr for compatibility in older versions of mistletoe, it was optional
+         children = getattr(cur, 'children', None)
+         if children is None:
+             return
          for c in children:
              yield from self._walk(c, last_block=last_block)

-
-     def walk(self):
+     def walk(self) -> Iterator[Result]:
          yield from self._walk(self.doc, last_block=None)


@@ -94,7 +95,7 @@ def extract_from_file(fname: PathIsh) -> Iterator[Extraction]:
          yield Visit(
              url=r.url,
              dt=fallback_dt,
-             locator=Loc.file(fname), # TODO line number
+             locator=Loc.file(fname), # TODO line number
              context=r.context,
          )

@@ -105,9 +106,9 @@ class TextParser(Parser):
      Instead of chunking blocks like for files, this returns the entire
      message rendered as the context
      '''
-     def __init__(self, text: str):
-         self.doc = mistletoe.Document(text)

+     def __init__(self, text: str) -> None:
+         self.doc = mistletoe.Document(text)

      def _doc_ashtml(self):
          '''
@@ -117,8 +118,7 @@ class TextParser(Parser):
          self._html = HTML_MARKER + _ashtml(self.doc)
          return self._html

-
-     def _extract(self, cur, last_block = None) -> Iterator[Parsed]:
+     def _extract(self, cur, last_block=None) -> Iterator[Parsed]:
          if not isinstance(cur, (AutoLink, Link)):
              return

promnesia/sources/org.py CHANGED
@@ -57,8 +57,12 @@ def _parse_node(n: OrgNode) -> Parsed:
              # todo a bit hacky..
              heading = heading.replace(createds + ' ', '')
      if createds is not None:
-         [odt] = OrgDate.list_from_str(createds)
-         dt = odt.start
+         if '<%%' in createds:
+             # sexp date, not supported
+             dt = None
+         else:
+             [odt] = OrgDate.list_from_str(createds)
+             dt = odt.start
      else:
          dt = None
      return Parsed(dt=dt, heading=heading)
@@ -80,7 +84,7 @@ def walk_node(*, node: OrgNode, dt: datetime) -> Iterator[Res[Tuple[Parsed, OrgN
              parsed = parsed._replace(dt=dt)
          else:
              dt = parsed.dt
-         yield parsed, node
+         yield parsed, node

      for c in node.children:
          yield from walk_node(node=c, dt=dt)
@@ -98,8 +98,10 @@ def extract_from_path(path: PathIsh) -> Command:
              '.gz',
              '.zip',
      )):
-         logger.info(f"Extracting from compressed file {path}")
+         # todo should be debug?
+         # or should delete it completely, feels like unpacking archives here is a bit too much
          raise RuntimeError(f"Archives aren't supported yet: {path}")
+         logger.info(f"Extracting from compressed file {path}")
          import lzma
          from tempfile import NamedTemporaryFile
          # TODO hopefully, no collisions
@@ -16,7 +16,7 @@ def index(*, render_markdown: bool = False, renderer: Optional[Type['RedditRende
          if "No module named 'my.reddit.all'" in str(e):
              import warnings
              warnings.warn("DEPRECATED/reddit: Using an old version of HPI, please update")
-             from my.reddit import submissions, comments, saved, upvoted # type: ignore[no-redef]
+             from my.reddit import submissions, comments, saved, upvoted
          else:
              raise e

@@ -95,7 +95,7 @@ class RedditRenderer:

      def _from_upvote(self, i: 'Upvote') -> Results:
          locator = Loc.make(
-             title=f'Reddit upvote',
+             title='Reddit upvote',
              href=i.url,
          )
          yield from self._from_common(i, locator=locator)
promnesia/sources/rss.py CHANGED
@@ -1,3 +1,7 @@
+ '''
+ Uses [[https://github.com/karlicoss/HPI][HPI]] for RSS data.
+ '''
+
  from itertools import chain

  from ..common import Visit, Loc, extract_urls, Results, get_logger
@@ -19,6 +23,6 @@ def index() -> Results:
          yield Visit(
              url=feed.url,
              dt=feed.created_at or default_datetime,
-             context=f'RSS subscription', # TODO use 'provider', etc?
+             context='RSS subscription', # TODO use 'provider', etc?
              locator=locator,
          )
@@ -1,10 +1,14 @@
+ """
+ Greps out URLs from an arbitrary shell command results.
+ """
+
  from datetime import datetime
  import os
  import re
- from typing import Optional, Union, Sequence
+ from subprocess import run, PIPE
+ from typing import Union, Sequence
  import warnings

- from ..compat import run, PIPE
  from ..common import Visit, Loc, Results, extract_urls, file_mtime, get_system_tz, now_tz, _is_windows, PathIsh
