promnesia 1.1.20230129__py3-none-any.whl → 1.2.20240810__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (72)
  1. promnesia/__main__.py +58 -50
  2. promnesia/cannon.py +4 -4
  3. promnesia/common.py +57 -38
  4. promnesia/compare.py +3 -2
  5. promnesia/compat.py +6 -65
  6. promnesia/config.py +4 -2
  7. promnesia/database/common.py +66 -0
  8. promnesia/database/dump.py +187 -0
  9. promnesia/{read_db.py → database/load.py} +10 -11
  10. promnesia/extract.py +1 -0
  11. promnesia/kjson.py +1 -1
  12. promnesia/logging.py +14 -14
  13. promnesia/misc/__init__.pyi +0 -0
  14. promnesia/misc/config_example.py +1 -2
  15. promnesia/misc/install_server.py +5 -4
  16. promnesia/server.py +24 -24
  17. promnesia/sources/__init__.pyi +0 -0
  18. promnesia/sources/auto.py +12 -7
  19. promnesia/sources/browser.py +80 -293
  20. promnesia/sources/browser_legacy.py +298 -0
  21. promnesia/sources/demo.py +18 -2
  22. promnesia/sources/filetypes.py +8 -0
  23. promnesia/sources/github.py +2 -2
  24. promnesia/sources/hackernews.py +1 -2
  25. promnesia/sources/hypothesis.py +1 -1
  26. promnesia/sources/markdown.py +15 -15
  27. promnesia/sources/org.py +7 -3
  28. promnesia/sources/plaintext.py +3 -1
  29. promnesia/sources/reddit.py +2 -2
  30. promnesia/sources/rss.py +5 -1
  31. promnesia/sources/shellcmd.py +6 -2
  32. promnesia/sources/signal.py +29 -20
  33. promnesia/sources/smscalls.py +8 -1
  34. promnesia/sources/stackexchange.py +2 -2
  35. promnesia/sources/takeout.py +132 -12
  36. promnesia/sources/takeout_legacy.py +10 -2
  37. promnesia/sources/telegram.py +79 -123
  38. promnesia/sources/telegram_legacy.py +117 -0
  39. promnesia/sources/vcs.py +1 -1
  40. promnesia/sources/viber.py +6 -15
  41. promnesia/sources/website.py +1 -1
  42. promnesia/sqlite.py +42 -0
  43. promnesia/tests/__init__.py +0 -0
  44. promnesia/tests/common.py +137 -0
  45. promnesia/tests/server_helper.py +64 -0
  46. promnesia/tests/sources/__init__.py +0 -0
  47. promnesia/tests/sources/test_auto.py +66 -0
  48. promnesia/tests/sources/test_filetypes.py +42 -0
  49. promnesia/tests/sources/test_hypothesis.py +39 -0
  50. promnesia/tests/sources/test_org.py +65 -0
  51. promnesia/tests/sources/test_plaintext.py +26 -0
  52. promnesia/tests/sources/test_shellcmd.py +22 -0
  53. promnesia/tests/sources/test_takeout.py +58 -0
  54. promnesia/tests/test_cannon.py +325 -0
  55. promnesia/tests/test_cli.py +42 -0
  56. promnesia/tests/test_compare.py +30 -0
  57. promnesia/tests/test_config.py +290 -0
  58. promnesia/tests/test_db_dump.py +223 -0
  59. promnesia/tests/test_extract.py +61 -0
  60. promnesia/tests/test_extract_urls.py +43 -0
  61. promnesia/tests/test_indexer.py +245 -0
  62. promnesia/tests/test_server.py +292 -0
  63. promnesia/tests/test_traverse.py +41 -0
  64. promnesia/tests/utils.py +35 -0
  65. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/METADATA +14 -19
  66. promnesia-1.2.20240810.dist-info/RECORD +83 -0
  67. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/WHEEL +1 -1
  68. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/entry_points.txt +0 -1
  69. promnesia/dump.py +0 -105
  70. promnesia-1.1.20230129.dist-info/RECORD +0 -55
  71. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/LICENSE +0 -0
  72. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/top_level.txt +0 -0
promnesia/tests/test_indexer.py
@@ -0,0 +1,245 @@
+ from collections import Counter
+ from pathlib import Path
+ from subprocess import check_call, Popen
+
+ from ..__main__ import do_index, read_example_config
+ from ..common import DbVisit, _is_windows
+ from ..database.load import get_all_db_visits
+
+ import pytest
+
+ from .common import get_testdata, promnesia_bin, reset_filters, write_config
+
+
+ def get_stats(tmp_path: Path) -> Counter:
+     visits = get_all_db_visits(tmp_path / 'promnesia.sqlite')
+     return Counter(v.src for v in visits)
+
+
+ @pytest.mark.parametrize('mode', ['update', 'overwrite'])
+ def test_indexing_mode(tmp_path: Path, mode: str) -> None:
+     # ugh. we modify the config very fast during tests
+     # and pycache distinguishes identical filenames based on int mtime in seconds
+     # so best to use different names to prevent undesired caching
+     # https://github.com/python/cpython/blob/fb202af4470d6051a69bb9d2f44d7e8a1c99eb4f/Lib/importlib/_bootstrap_external.py#L714-L739
+     # TODO could probably relax that if we switch from importlib config loading to exec()?
+
+     def cfg1() -> None:
+         from promnesia.common import Source
+         from promnesia.sources import demo
+
+         SOURCES = [
+             Source(demo.index, count=10, base_dt='2000-01-01', delta=30, name='demo1'),
+             Source(demo.index, count=20, base_dt='2001-01-01', delta=30, name='demo2'),
+         ]
+
+     cfg_path = tmp_path / 'config1.py'
+     write_config(cfg_path, cfg1)
+     do_index(cfg_path)
+
+     stats = get_stats(tmp_path)
+     assert stats == {'demo1': 10, 'demo2': 20}
+
+     def cfg2() -> None:
+         from promnesia.common import Source
+         from promnesia.sources import demo
+
+         SOURCES = [
+             Source(demo.index, count=30, base_dt='2005-01-01', delta=30, name='demo2'),
+             Source(demo.index, count=40, base_dt='2010-01-01', delta=30, name='demo3'),
+         ]
+
+     cfg_path = tmp_path / 'config2.py'
+     write_config(cfg_path, cfg2)
+     do_index(cfg_path, overwrite_db={'overwrite': True, 'update': False}[mode])
+     # TODO use some sort of test helper?
+     stats = get_stats(tmp_path)
+
+     if mode == 'update':
+         # should keep the original visits too!
+         assert stats == {'demo1': 10, 'demo2': 30, 'demo3': 40}
+     else:
+         # should overwrite with newly indexed visits
+         assert stats == {'demo2': 30, 'demo3': 40}
+
+
+ # TODO check both modes?
+ def test_concurrent_indexing(tmp_path: Path) -> None:
+     def cfg_fast() -> None:
+         from promnesia.common import Source
+         from promnesia.sources import demo
+
+         SOURCES = [Source(demo.index, count=10)]
+
+     cfg_fast_path = tmp_path / 'cfg_fast.py'
+     write_config(cfg_fast_path, cfg_fast)
+
+     def cfg_slow() -> None:
+         from promnesia.common import Source
+         from promnesia.sources import demo
+
+         SOURCES = [Source(demo.index, count=100_000)]
+
+     cfg_slow_path = tmp_path / 'cfg_slow.py'
+     write_config(cfg_slow_path, cfg_slow)
+
+     # init it first, to create the database
+     # TODO ideally this shouldn't be necessary, but it's reasonable that people would already have the index
+     # otherwise it would fail at db creation point.. which is kinda annoying to work around
+     # todo in principle could work around it the same way as in cachew, by having a loop around the PRAGMA WAL command?
+     check_call(promnesia_bin('index', '--config', cfg_fast_path, '--overwrite'))
+
+     total_runs = 0
+     # run in the background
+     with Popen(promnesia_bin('index', '--config', cfg_slow_path, '--overwrite')) as slow_indexer:
+         while slow_indexer.poll() is None:
+             # create a bunch of 'smaller' indexers running in parallel
+             fasts = [
+                 Popen(promnesia_bin('index', '--config', cfg_fast_path, '--overwrite'))
+                 for _ in range(10)
+             ]
+             for fast in fasts:
+                 assert fast.wait() == 0, fast  # should succeed
+                 total_runs += 1
+         assert slow_indexer.poll() == 0, slow_indexer
+
+     # FIXME need to uncomment this once proper concurrent indexing is supported
+     # if it fails, the slow indexer is too fast, so crank up its count
+     # assert total_runs > 20
+
+
+ def test_filter(tmp_path: Path, reset_filters) -> None:
+     domain_to_filter = 'some-weird-domain.xyz'
+     testdata = get_testdata('custom')
+     assert any(domain_to_filter in p.read_text() for p in testdata.glob('*.txt'))  # precondition
+
+     def cfg(testdata, domain_to_filter) -> None:
+         from promnesia.common import Source
+         from promnesia.sources import shellcmd
+         from promnesia.sources.plaintext import extract_from_path
+
+         FILTERS = [
+             domain_to_filter,
+         ]
+
+         SOURCES = [Source(shellcmd.index, extract_from_path(testdata))]
+
+     cfg_path = tmp_path / 'config.py'
+     write_config(cfg_path, cfg, testdata=testdata, domain_to_filter=domain_to_filter)
+     do_index(cfg_path)
+
+     visits = get_all_db_visits(tmp_path / 'promnesia.sqlite')
+     urls = {v.orig_url for v in visits}
+     assert not any(domain_to_filter in u for u in urls), urls
+     assert len(visits) == 4  # just in case
+
+
+ def test_weird_urls(tmp_path: Path) -> None:
+     # specifically test this here (rather than in cannon)
+     # to make sure it's not messed up when we insert into/extract from sqlite
+
+     def cfg(testdata: str) -> None:
+         from promnesia.common import Source
+         from promnesia.sources import shellcmd
+         from promnesia.sources.plaintext import extract_from_path
+
+         SOURCES = [Source(shellcmd.index, extract_from_path(testdata))]
+
+     cfg_path = tmp_path / 'config.py'
+     write_config(cfg_path, cfg, testdata=get_testdata('weird.txt'))
+     do_index(cfg_path)
+
+     [v1, v2] = get_all_db_visits(tmp_path / 'promnesia.sqlite')
+
+     assert v1.norm_url == "urbandictionary.com/define.php?term=Belgian%20Whistle"
+
+     assert v2.norm_url == "en.wikipedia.org/wiki/Dinic%27s_algorithm"
+     assert v2.locator.title.endswith('weird.txt:2')
+     assert v2.context == 'right, so https://en.wikipedia.org/wiki/Dinic%27s_algorithm can be used for max flow'
+
+
+ def test_errors_during_indexing(tmp_path: Path) -> None:
+     def cfg() -> None:
+         from promnesia.common import Source
+         from promnesia.sources import demo
+
+         def indexer1():
+             visits = list(demo.index(count=10))
+             yield from visits[:5]
+             yield RuntimeError("some error during visits extraction")
+             yield from visits[5:]
+
+         def indexer2():
+             raise RuntimeError("in this case indexer itself crashed")
+
+         SOURCES = [Source(indexer1), Source(indexer2)]
+
+     cfg_path = tmp_path / 'config.py'
+     write_config(cfg_path, cfg)
+     do_index(cfg_path)
+
+     stats = get_stats(tmp_path)
+     assert stats == {
+         'error': 2,
+         'config': 10,
+     }
+
+
+ def test_hook(tmp_path: Path) -> None:
+     def cfg() -> None:
+         from promnesia.common import Source
+         from promnesia.sources import demo
+
+         SOURCES = [Source(demo.index, count=7, name='somename')]
+
+         from typing import cast, Iterator
+         from promnesia.common import DbVisit, Loc, Res
+         from promnesia.sources import demo
+
+         def HOOK(visit: Res[DbVisit]) -> Iterator[Res[DbVisit]]:
+             visit = cast(DbVisit, visit)
+
+             # NOTE: might be a good idea to check that the visit is an exception first and yield it intact?
+             nurl = visit.norm_url
+             if 'page1' in nurl:
+                 yield visit._replace(norm_url='patched.com')
+             elif 'page2' in nurl:
+                 raise Exception('boom')  # deliberately crash
+             elif 'page3' in nurl:
+                 # just don't yield anything! it will be omitted
+                 pass
+             elif 'page4' in nurl:
+                 # can emit multiple!
+                 yield visit
+                 yield visit
+             elif 'page6' in nurl:
+                 # patch locator
+                 yield visit._replace(locator=Loc.make(title='some custom title', href='/can/replace/original/path'))
+             else:
+                 yield visit
+
+     cfg_path = tmp_path / 'config.py'
+     write_config(cfg_path, cfg)
+     do_index(cfg_path)
+
+     [p0, p1, e2, p41, p42, p5, p6] = get_all_db_visits(tmp_path / 'promnesia.sqlite')
+     assert p0.norm_url == 'demo.com/page0.html'
+     assert p1.norm_url == 'patched.com'
+     assert e2.norm_url == '<error>'
+     assert p41 == p42
+     assert isinstance(p6, DbVisit)
+     assert p6.locator is not None
+
+
+ def test_example_config(tmp_path: Path) -> None:
+     if _is_windows:
+         pytest.skip("doesn't work on Windows: example config references /usr/include paths")
+
+     config = read_example_config() + '\n' + f'OUTPUT_DIR = "{str(tmp_path)}"'
+     cfg_path = tmp_path / 'example_config.py'
+     cfg_path.write_text(config)
+
+     do_index(cfg_path)
+
+     visits = [v for v in get_all_db_visits(tmp_path / 'promnesia.sqlite') if v.src != 'error']
+     assert len(visits) > 50  # random sanity check
promnesia/tests/test_server.py
@@ -0,0 +1,292 @@
+ from datetime import datetime
+ from pathlib import Path
+ from subprocess import Popen
+
+ import pytest
+
+ from ..__main__ import do_index
+
+ from .common import promnesia_bin, write_config
+ from .server_helper import run_server
+
+
+ def test_status_error() -> None:
+     """
+     If the DB doesn't exist, the server should handle it gracefully and respond with an error
+     """
+     with run_server(db='/does/not/exist') as server:
+         response = server.post('/status')
+
+         # TODO ugh, currently returns 200? maybe should return a proper error, but need to handle it in the extension
+         # assert response.status_code == 404
+
+         body = response.json()
+
+         version = body['version']
+         assert version is not None
+         assert len(version.split('.')) >= 2  # random check..
+
+         assert 'ERROR' in body['db']  # defensive, it doesn't exist
+
+
+ def test_status_ok(tmp_path: Path) -> None:
+     def cfg() -> None:
+         from promnesia.common import Source
+         from promnesia.sources import demo
+
+         SOURCES = [Source(demo.index, count=10)]
+
+     cfg_path = tmp_path / 'config.py'
+     write_config(cfg_path, cfg)
+     do_index(cfg_path)
+
+     db_path = tmp_path / 'promnesia.sqlite'
+     with run_server(db=db_path, timezone='America/New_York') as server:
+         r = server.post('/status').json()
+         version = r['version']
+         assert version is not None
+         assert len(version.split('.')) >= 2  # random check..
+
+         assert r['db'] == str(db_path)
+
+         assert r['stats'] == {'total_visits': 10}
+
+
+ def test_visits(tmp_path: Path) -> None:
+     def cfg() -> None:
+         from promnesia.common import Source
+         from promnesia.sources import demo
+
+         SOURCES = [Source(demo.index, base_dt='2000-01-01', delta=30 * 60)]
+
+     cfg_path = tmp_path / 'config.py'
+     write_config(cfg_path, cfg)
+     do_index(cfg_path)
+
+     # force timezone here, otherwise the response varies depending on the test env
+     with run_server(db=tmp_path / 'promnesia.sqlite', timezone='America/New_York') as server:
+         r = server.post('/visits', json={'url': 'whatever'}).json()
+         assert r['visits'] == []
+
+         r = server.post('/visits', json={'url': 'https://demo.com/page0.html'})
+         rj = r.json()
+         assert rj['normalised_url'] == 'demo.com/page0.html'
+         [v] = rj['visits']
+         assert v['src'] == 'demo'
+         assert v['locator']['title'] == 'demo'
+
+         assert v['dt'] == '01 Jan 2000 00:00:00 -0500'
+
+
+ def test_visits_hierarchy(tmp_path: Path) -> None:
+     def cfg() -> None:
+         from datetime import datetime
+
+         from promnesia.common import Source, Visit, Loc
+         from promnesia.sources import demo
+
+         def indexer():
+             visits = list(demo.index(count=6))
+             yield Visit(
+                 url='https://reddit.com/post1',
+                 dt=datetime.fromisoformat('2023-12-04'),
+                 locator=Loc.make('reddit'),
+             )
+             yield Visit(
+                 url='https://reddit.com/post1/comment2',
+                 dt=datetime.fromisoformat('2023-12-02'),
+                 locator=Loc.make('reddit'),
+                 context='I am comment 2',
+             )
+             yield from visits[:3]
+             yield Visit(
+                 url='https://reddit.com/post2',
+                 dt=datetime.fromisoformat('2023-12-05'),
+                 locator=Loc.make('reddit'),
+             )
+             yield from visits[3:]
+             yield Visit(
+                 url='https://reddit.com/post1/ihavenocontext',
+                 dt=datetime.fromisoformat('2023-12-06'),
+                 locator=Loc.make('reddit'),
+             )
+             yield Visit(
+                 url='https://reddit.com/post1/comment1',
+                 dt=datetime.fromisoformat('2023-12-06'),
+                 locator=Loc.make('reddit'),
+                 context='I am comment 1',
+             )
+
+         SOURCES = [Source(indexer)]
+
+     cfg_path = tmp_path / 'config.py'
+     write_config(cfg_path, cfg)
+     do_index(cfg_path)
+
+     # force timezone here, otherwise the response varies depending on the test env
+     with run_server(db=tmp_path / 'promnesia.sqlite', timezone='America/New_York') as server:
+         r = server.post('/visits', json={'url': 'https://reddit.com/post1'}).json()
+         # returns the exact match + 'child' visits that are interesting (e.g. have context)
+         assert {v['original_url'] for v in r['visits']} == {
+             'https://reddit.com/post1',
+             'https://reddit.com/post1/comment1',
+             'https://reddit.com/post1/comment2',
+         }
+
+
+ def test_visited(tmp_path: Path) -> None:
+     def cfg() -> None:
+         from promnesia.common import Source
+         from promnesia.sources import demo
+
+         SOURCES = [Source(demo.index, base_dt='2000-01-01', delta=30 * 60)]
+
+     cfg_path = tmp_path / 'config.py'
+     write_config(cfg_path, cfg)
+     do_index(cfg_path)
+
+     test_url = 'https://demo.com/page5.html'
+
+     # force timezone here, otherwise the response varies depending on the test env
+     with run_server(db=tmp_path / 'promnesia.sqlite', timezone='America/New_York') as server:
+         r = server.post('/visited', json={'urls': []}).json()
+         assert r == []
+
+         r = server.post('/visited', json={'urls': [test_url, 'http://badurl.org']}).json()
+         [r1, r2] = r
+         assert r1['original_url'] == test_url
+         assert r2 is None
+
+
+ def test_search(tmp_path: Path) -> None:
+     # TODO not sure if we should index at all here or just insert DbVisits directly?
+     def cfg() -> None:
+         from datetime import datetime
+
+         from promnesia.common import Source, Visit, Loc
+         from promnesia.sources import demo
+
+         def indexer():
+             visits = list(demo.index(count=6))
+             yield Visit(
+                 url='https://someone.org/something',
+                 dt=datetime.fromisoformat('2023-12-04T11:12:13+03:00'),
+                 locator=Loc.make('whatever'),
+             )
+             yield from visits[:3]
+             yield Visit(
+                 url='https://wiki.termux.com/wiki/Termux-setup-storage',
+                 locator=Loc.make(
+                     title='Reddit comment',
+                     href='https://reddit.com/r/termux/comments/m4qrxt/cant_open_storageshared_in_termux/gso0kak/',
+                 ),
+                 dt=datetime.fromisoformat('2023-12-02'),
+                 context='perhaps it will help someone else https://wiki.termux.com/wiki/Termux-setup-storage',
+             )
+             yield from visits[3:]
+
+         SOURCES = [Source(indexer)]
+
+     cfg_path = tmp_path / 'config.py'
+     write_config(cfg_path, cfg)
+     do_index(cfg_path)
+
+     with run_server(db=tmp_path / 'promnesia.sqlite', timezone='America/New_York') as server:
+         # FIXME 'url' is actually kinda misleading -- it can be any text
+         rj = server.post('/search', json={'url': 'someone'}).json()
+         # TODO maybe return in chronological order or something? not sure
+         [v1, v2] = sorted(rj['visits'], key=lambda j: j['dt'])
+
+         assert v1['context'] == 'perhaps it will help someone else https://wiki.termux.com/wiki/Termux-setup-storage'
+         assert v1['dt'] == '02 Dec 2023 00:00:00 -0500'  # uses server timezone (original visit didn't have it)
+
+         assert v2['normalised_url'] == 'someone.org/something'
+         assert v2['dt'] == '04 Dec 2023 11:12:13 +0300'  # uses original visit timezone
+
+         rj = server.post('/search', json={'url': 'comment'}).json()
+         [v] = rj['visits']
+         assert v['context'] == 'perhaps it will help someone else https://wiki.termux.com/wiki/Termux-setup-storage'
+
+
+ def test_search_around(tmp_path: Path) -> None:
+     # this should return visits up to 3 hours in the past
+     def cfg() -> None:
+         from promnesia.common import Source
+         from promnesia.sources import demo
+
+         # generates 60 visits within 10 mins of each other -- so spanning over 10 hours
+         SOURCES = [Source(demo.index, count=60, base_dt='2000-01-01T00:00:00+03:00', delta=10 * 60)]
+
+     cfg_path = tmp_path / 'config.py'
+     write_config(cfg_path, cfg)
+     do_index(cfg_path)
+
+     # TODO hmm. perhaps it makes more sense to run the query in a different process and the server in the main process for testing??
+     with run_server(db=tmp_path / 'promnesia.sqlite') as server:
+         rj = server.post(
+             '/search_around',
+             json={'timestamp': datetime.fromisoformat('2005-01-01T00:00:00+06:00').timestamp()},
+         ).json()
+         assert rj['visits'] == []
+
+         rj = server.post(
+             '/search_around',
+             json={'timestamp': datetime.fromisoformat('2000-01-01T07:55:00+06:00').timestamp()},
+         ).json()
+         visits = rj['visits']
+         assert len(visits) == 18  # 6 per hour * 3
+         assert visits[0 ]['dt'] == '01 Jan 2000 02:00:00 +0300'
+         assert visits[-1]['dt'] == '01 Jan 2000 04:50:00 +0300'
+
+
+ @pytest.mark.parametrize('mode', ['update', 'overwrite'])
+ def test_query_while_indexing(tmp_path: Path, mode: str) -> None:
+     overwrite = mode == 'overwrite'
+     moverwrite = ['--overwrite'] if overwrite else []
+
+     def _index(run_id: str) -> Popen:
+         def cfg(run_id: str) -> None:
+             from promnesia.common import Source
+             from promnesia.sources import demo
+
+             SOURCES = [Source(demo.index, count=1_000, name=run_id)]
+
+         cfg_path = tmp_path / f'config{run_id}.py'
+         write_config(cfg_path, cfg, run_id=run_id)
+
+         return Popen(promnesia_bin('index', '--config', cfg_path, *moverwrite))
+
+     # trigger initial indexing
+     with _index(run_id='0'):
+         pass
+
+     with run_server(db=tmp_path / 'promnesia.sqlite') as server:
+         rj = server.post(
+             '/search_around',
+             json={'timestamp': datetime.fromisoformat('2005-01-01T00:00:00+06:00').timestamp()},
+         ).json()
+         assert rj['visits'] == []
+
+         for run_id in range(1, 5):
+             with _index(run_id=str(run_id)) as indexer:
+                 # hammer the backend to increase the likelihood of a race condition
+                 while indexer.poll() is None:
+                     stats = server.post('/status').json()['stats']
+                     total_visits = stats['total_visits']
+                     if overwrite:
+                         assert total_visits >= 1_000
+                     else:
+                         assert total_visits >= 1_000 * run_id
+
+
+ # TODO could also check server methods directly?
+ # via something like this... but not sure if it really makes much difference
+ # import promnesia.server as S
+ # S.EnvConfig.set(S.ServerConfig(
+ #     # TODO populate with test db and benchmark properly...
+ #     db=Path('/todo'),
+ #     timezone=pytz.utc,
+ # ))
+ # links = [f'https://reddit.com/whatever{i}.html' for i in range(count)]
+ # res = S.visited(links)
+ # assert len(res) == len(links)
promnesia/tests/test_traverse.py
@@ -0,0 +1,41 @@
+ from unittest.mock import patch
+
+ from ..common import traverse
+
+ from .common import get_testdata
+
+
+ testDataPath = get_testdata('traverse')
+
+
+ # Patch shutil.which so it always returns False (when trying to which fdfind, etc.)
+ # so that it falls back to find
+ @patch('promnesia.common.shutil.which', return_value=False)
+ def test_traverse_ignore_find(patched) -> None:
+     '''
+     traverse() with `find`, but ignore some stuff
+     '''
+     paths = set(traverse(testDataPath, ignore=['ignoreme.txt', 'ignoreme2']))
+
+     assert paths == {testDataPath / 'imhere2/real.txt', testDataPath / 'imhere.txt'}
+
+
+ def test_traverse_ignore_fdfind():
+     '''
+     traverse() with `fdfind`, but ignore some stuff
+     '''
+     paths = set(traverse(testDataPath, ignore=['ignoreme.txt', 'ignoreme2']))
+
+     assert paths == {testDataPath / 'imhere.txt', testDataPath / 'imhere2/real.txt'}
+
+
+ # TODO: It would be nice to test the implementation directly without having to do this
+ # weird patching in the future
+ @patch('promnesia.common._is_windows', new_callable=lambda: True)
+ def test_traverse_ignore_windows(patched) -> None:
+     '''
+     traverse() with python when _is_windows is true, but ignore some stuff
+     '''
+     paths = set(traverse(testDataPath, ignore=['ignoreme.txt', 'ignoreme2']))
+
+     assert paths == {testDataPath / 'imhere.txt', testDataPath / 'imhere2/real.txt'}
promnesia/tests/utils.py
@@ -0,0 +1,35 @@
+ from datetime import datetime, timedelta
+ from pathlib import Path
+ from typing import Mapping, Optional, Sequence, Tuple, Union
+
+ from ..common import Source, Loc, Visit
+ from ..database.dump import visits_to_sqlite
+ from ..extract import extract_visits
+
+
+ # TODO a bit shit... why did I make it a dict at first??
+ Urls = Union[
+     Mapping[str, Optional[str]],
+     Sequence[Tuple[str, Optional[str]]],
+ ]
+
+
+ def index_urls(urls: Urls, *, source_name: str = 'test'):
+     uuu = list(urls.items()) if isinstance(urls, dict) else urls
+
+     def idx(tmp_path: Path) -> None:
+         def indexer():
+             for i, (url, ctx) in enumerate(uuu):
+                 yield Visit(
+                     url=url,
+                     dt=datetime.min + timedelta(days=5000) + timedelta(hours=i),
+                     locator=Loc.make('test'),
+                     context=ctx,
+                 )
+
+         db_visits = extract_visits(source=Source(indexer), src=source_name)
+         errors = visits_to_sqlite(vit=db_visits, overwrite_db=True, _db_path=tmp_path / 'promnesia.sqlite')
+
+         assert len(errors) == 0, errors
+
+     return idx
{promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/METADATA
@@ -1,19 +1,17 @@
  Metadata-Version: 2.1
  Name: promnesia
- Version: 1.1.20230129
+ Version: 1.2.20240810
  Summary: Enhancement of your browsing history
  Home-page: https://github.com/karlicoss/promnesia
  Author: Dmitrii Gerasimov
  Author-email: karlicoss@gmail.com
- License: UNKNOWN
- Platform: UNKNOWN
- Requires-Python: >=3.7
+ Requires-Python: >=3.8
+ License-File: LICENSE
  Requires-Dist: appdirs
  Requires-Dist: tzlocal
  Requires-Dist: more-itertools
  Requires-Dist: pytz
- Requires-Dist: sqlalchemy
- Requires-Dist: cachew (>=0.8.0)
+ Requires-Dist: sqlalchemy >=2.0
  Requires-Dist: urlextract
  Requires-Dist: fastapi
  Requires-Dist: uvicorn[standard]
@@ -26,34 +24,31 @@ Requires-Dist: HPI ; extra == 'all'
  Requires-Dist: beautifulsoup4 ; extra == 'all'
  Requires-Dist: lxml ; extra == 'all'
  Requires-Dist: mistletoe ; extra == 'all'
- Requires-Dist: orgparse (>=0.3.0) ; extra == 'all'
- Requires-Dist: dataset ; extra == 'all'
+ Requires-Dist: orgparse >=0.3.0 ; extra == 'all'
  Provides-Extra: html
  Requires-Dist: beautifulsoup4 ; extra == 'html'
  Requires-Dist: lxml ; extra == 'html'
- Provides-Extra: linting
- Requires-Dist: pytest ; extra == 'linting'
- Requires-Dist: mypy ; extra == 'linting'
- Requires-Dist: lxml ; extra == 'linting'
  Provides-Extra: markdown
  Requires-Dist: mistletoe ; extra == 'markdown'
  Provides-Extra: optional
  Requires-Dist: logzero ; extra == 'optional'
  Requires-Dist: python-magic ; extra == 'optional'
  Provides-Extra: org
- Requires-Dist: orgparse (>=0.3.0) ; extra == 'org'
+ Requires-Dist: orgparse >=0.3.0 ; extra == 'org'
  Provides-Extra: telegram
- Requires-Dist: dataset ; extra == 'telegram'
  Provides-Extra: testing
  Requires-Dist: pytest ; extra == 'testing'
  Requires-Dist: pytest-timeout ; extra == 'testing'
  Requires-Dist: pytest-xdist ; extra == 'testing'
+ Requires-Dist: hypothesis ; extra == 'testing'
  Requires-Dist: psutil ; extra == 'testing'
- Requires-Dist: httpie ; extra == 'testing'
+ Requires-Dist: requests ; extra == 'testing'
  Requires-Dist: selenium ; extra == 'testing'
  Requires-Dist: click ; extra == 'testing'
- Requires-Dist: pyautogui ; extra == 'testing'
-
- UNKNOWN
-
+ Requires-Dist: ruff ; extra == 'testing'
+ Requires-Dist: mypy ; extra == 'testing'
+ Requires-Dist: lxml ; extra == 'testing'
+ Requires-Dist: loguru ; extra == 'testing'
+ Provides-Extra: testing-gui
+ Requires-Dist: pyautogui ; extra == 'testing-gui'
 