promnesia 1.1.20230129__py3-none-any.whl → 1.2.20240810__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promnesia/__main__.py +58 -50
- promnesia/cannon.py +4 -4
- promnesia/common.py +57 -38
- promnesia/compare.py +3 -2
- promnesia/compat.py +6 -65
- promnesia/config.py +4 -2
- promnesia/database/common.py +66 -0
- promnesia/database/dump.py +187 -0
- promnesia/{read_db.py → database/load.py} +10 -11
- promnesia/extract.py +1 -0
- promnesia/kjson.py +1 -1
- promnesia/logging.py +14 -14
- promnesia/misc/__init__.pyi +0 -0
- promnesia/misc/config_example.py +1 -2
- promnesia/misc/install_server.py +5 -4
- promnesia/server.py +24 -24
- promnesia/sources/__init__.pyi +0 -0
- promnesia/sources/auto.py +12 -7
- promnesia/sources/browser.py +80 -293
- promnesia/sources/browser_legacy.py +298 -0
- promnesia/sources/demo.py +18 -2
- promnesia/sources/filetypes.py +8 -0
- promnesia/sources/github.py +2 -2
- promnesia/sources/hackernews.py +1 -2
- promnesia/sources/hypothesis.py +1 -1
- promnesia/sources/markdown.py +15 -15
- promnesia/sources/org.py +7 -3
- promnesia/sources/plaintext.py +3 -1
- promnesia/sources/reddit.py +2 -2
- promnesia/sources/rss.py +5 -1
- promnesia/sources/shellcmd.py +6 -2
- promnesia/sources/signal.py +29 -20
- promnesia/sources/smscalls.py +8 -1
- promnesia/sources/stackexchange.py +2 -2
- promnesia/sources/takeout.py +132 -12
- promnesia/sources/takeout_legacy.py +10 -2
- promnesia/sources/telegram.py +79 -123
- promnesia/sources/telegram_legacy.py +117 -0
- promnesia/sources/vcs.py +1 -1
- promnesia/sources/viber.py +6 -15
- promnesia/sources/website.py +1 -1
- promnesia/sqlite.py +42 -0
- promnesia/tests/__init__.py +0 -0
- promnesia/tests/common.py +137 -0
- promnesia/tests/server_helper.py +64 -0
- promnesia/tests/sources/__init__.py +0 -0
- promnesia/tests/sources/test_auto.py +66 -0
- promnesia/tests/sources/test_filetypes.py +42 -0
- promnesia/tests/sources/test_hypothesis.py +39 -0
- promnesia/tests/sources/test_org.py +65 -0
- promnesia/tests/sources/test_plaintext.py +26 -0
- promnesia/tests/sources/test_shellcmd.py +22 -0
- promnesia/tests/sources/test_takeout.py +58 -0
- promnesia/tests/test_cannon.py +325 -0
- promnesia/tests/test_cli.py +42 -0
- promnesia/tests/test_compare.py +30 -0
- promnesia/tests/test_config.py +290 -0
- promnesia/tests/test_db_dump.py +223 -0
- promnesia/tests/test_extract.py +61 -0
- promnesia/tests/test_extract_urls.py +43 -0
- promnesia/tests/test_indexer.py +245 -0
- promnesia/tests/test_server.py +292 -0
- promnesia/tests/test_traverse.py +41 -0
- promnesia/tests/utils.py +35 -0
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/METADATA +14 -19
- promnesia-1.2.20240810.dist-info/RECORD +83 -0
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/WHEEL +1 -1
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/entry_points.txt +0 -1
- promnesia/dump.py +0 -105
- promnesia-1.1.20230129.dist-info/RECORD +0 -55
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/LICENSE +0 -0
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,245 @@
|
|
1
|
+
from collections import Counter
|
2
|
+
from pathlib import Path
|
3
|
+
from subprocess import check_call, Popen
|
4
|
+
|
5
|
+
from ..__main__ import do_index, read_example_config
|
6
|
+
from ..common import DbVisit, _is_windows
|
7
|
+
from ..database.load import get_all_db_visits
|
8
|
+
|
9
|
+
import pytest
|
10
|
+
|
11
|
+
from .common import get_testdata, promnesia_bin, reset_filters, write_config
|
12
|
+
|
13
|
+
|
14
|
+
def get_stats(tmp_path: Path) -> Counter:
|
15
|
+
visits = get_all_db_visits(tmp_path / 'promnesia.sqlite')
|
16
|
+
return Counter(v.src for v in visits)
|
17
|
+
|
18
|
+
|
19
|
+
@pytest.mark.parametrize('mode', ['update', 'overwrite'])
|
20
|
+
def test_indexing_mode(tmp_path: Path, mode: str) -> None:
|
21
|
+
# ugh. we modify the config very fast during tests
|
22
|
+
# and pycache distinguishes identical filenames based on int mtime in seconds
|
23
|
+
# so best to use different names to prevent undesired caching
|
24
|
+
# https://github.com/python/cpython/blob/fb202af4470d6051a69bb9d2f44d7e8a1c99eb4f/Lib/importlib/_bootstrap_external.py#L714-L739
|
25
|
+
# TODO could probably relax that if we switch from importlib config loading to exec()?
|
26
|
+
|
27
|
+
def cfg1() -> None:
|
28
|
+
from promnesia.common import Source
|
29
|
+
from promnesia.sources import demo
|
30
|
+
|
31
|
+
SOURCES = [
|
32
|
+
Source(demo.index, count=10, base_dt='2000-01-01', delta=30, name='demo1'),
|
33
|
+
Source(demo.index, count=20, base_dt='2001-01-01', delta=30, name='demo2'),
|
34
|
+
]
|
35
|
+
|
36
|
+
cfg_path = tmp_path / 'config1.py'
|
37
|
+
write_config(cfg_path, cfg1)
|
38
|
+
do_index(cfg_path)
|
39
|
+
|
40
|
+
stats = get_stats(tmp_path)
|
41
|
+
assert stats == {'demo1': 10, 'demo2': 20}
|
42
|
+
|
43
|
+
def cfg2() -> None:
|
44
|
+
from promnesia.common import Source
|
45
|
+
from promnesia.sources import demo
|
46
|
+
|
47
|
+
SOURCES = [
|
48
|
+
Source(demo.index, count=30, base_dt='2005-01-01', delta=30, name='demo2'),
|
49
|
+
Source(demo.index, count=40, base_dt='2010-01-01', delta=30, name='demo3'),
|
50
|
+
]
|
51
|
+
|
52
|
+
cfg_path = tmp_path / 'config2.py'
|
53
|
+
write_config(cfg_path, cfg2)
|
54
|
+
do_index(cfg_path, overwrite_db={'overwrite': True, 'update': False}[mode])
|
55
|
+
# TODO use some sort of test helper?
|
56
|
+
stats = get_stats(tmp_path)
|
57
|
+
|
58
|
+
if mode == 'update':
|
59
|
+
# should keep the original visits too!
|
60
|
+
assert stats == {'demo1': 10, 'demo2': 30, 'demo3': 40}
|
61
|
+
else:
|
62
|
+
# should overwrite with newly indexed visits
|
63
|
+
assert stats == {'demo2': 30, 'demo3': 40}
|
64
|
+
|
65
|
+
|
66
|
+
# TODO check both modes?
|
67
|
+
def test_concurrent_indexing(tmp_path: Path) -> None:
|
68
|
+
def cfg_fast() -> None:
|
69
|
+
from promnesia.common import Source
|
70
|
+
from promnesia.sources import demo
|
71
|
+
|
72
|
+
SOURCES = [Source(demo.index, count=10)]
|
73
|
+
|
74
|
+
cfg_fast_path = tmp_path / 'cfg_fast.py'
|
75
|
+
write_config(cfg_fast_path, cfg_fast)
|
76
|
+
|
77
|
+
def cfg_slow() -> None:
|
78
|
+
from promnesia.common import Source
|
79
|
+
from promnesia.sources import demo
|
80
|
+
|
81
|
+
SOURCES = [Source(demo.index, count=100_000)]
|
82
|
+
|
83
|
+
cfg_slow_path = tmp_path / 'cfg_slow.py'
|
84
|
+
write_config(cfg_slow_path, cfg_slow)
|
85
|
+
|
86
|
+
# init it first, to create the database
|
87
|
+
# TODO ideally this shouldn't be necessary but it's reasonable that people would already have the index
|
88
|
+
# otherwise it would fail at db creation point.. which is kinda annoying to work around
|
89
|
+
# todo in principle can work around same way as in cachew, by having a loop around PRAGMA WAL command?
|
90
|
+
check_call(promnesia_bin('index', '--config', cfg_fast_path, '--overwrite'))
|
91
|
+
|
92
|
+
total_runs = 0
|
93
|
+
# run in the background
|
94
|
+
with Popen(promnesia_bin('index', '--config', cfg_slow_path, '--overwrite')) as slow_indexer:
|
95
|
+
while slow_indexer.poll() is None:
|
96
|
+
# create a bunch of 'smaller' indexers running in parallel
|
97
|
+
fasts = [
|
98
|
+
Popen(promnesia_bin('index', '--config', cfg_fast_path, '--overwrite'))
|
99
|
+
for _ in range(10)
|
100
|
+
]
|
101
|
+
for fast in fasts:
|
102
|
+
assert fast.wait() == 0, fast # should succeed
|
103
|
+
total_runs += 1
|
104
|
+
assert slow_indexer.poll() == 0, slow_indexer
|
105
|
+
|
106
|
+
# FIXME ok, need to uncomment this once proper concurrent indexing is supported
|
107
|
+
# if not, slow indexer is too fast, so crank up the count in it
|
108
|
+
# assert total_runs > 20
|
109
|
+
|
110
|
+
|
111
|
+
def test_filter(tmp_path: Path, reset_filters) -> None:
|
112
|
+
domain_to_filter = 'some-weird-domain.xyz'
|
113
|
+
testdata = get_testdata('custom')
|
114
|
+
assert any(domain_to_filter in p.read_text() for p in testdata.glob('*.txt')) # precondition
|
115
|
+
|
116
|
+
def cfg(testdata, domain_to_filter) -> None:
|
117
|
+
from promnesia.common import Source
|
118
|
+
from promnesia.sources import shellcmd
|
119
|
+
from promnesia.sources.plaintext import extract_from_path
|
120
|
+
|
121
|
+
FILTERS = [
|
122
|
+
domain_to_filter,
|
123
|
+
]
|
124
|
+
|
125
|
+
SOURCES = [Source(shellcmd.index, extract_from_path(testdata))]
|
126
|
+
|
127
|
+
cfg_path = tmp_path / 'config.py'
|
128
|
+
write_config(cfg_path, cfg, testdata=testdata, domain_to_filter=domain_to_filter)
|
129
|
+
do_index(cfg_path)
|
130
|
+
|
131
|
+
visits = get_all_db_visits(tmp_path / 'promnesia.sqlite')
|
132
|
+
urls = {v.orig_url for v in visits}
|
133
|
+
assert not any(domain_to_filter in u for u in urls), urls
|
134
|
+
assert len(visits) == 4 # just in case
|
135
|
+
|
136
|
+
|
137
|
+
def test_weird_urls(tmp_path: Path) -> None:
|
138
|
+
# specifically test this here (rather than in cannon)
|
139
|
+
# to make sure it's not messed up when we insert/extract from sqlite
|
140
|
+
|
141
|
+
def cfg(testdata: str) -> None:
|
142
|
+
from promnesia.common import Source
|
143
|
+
from promnesia.sources import shellcmd
|
144
|
+
from promnesia.sources.plaintext import extract_from_path
|
145
|
+
|
146
|
+
SOURCES = [Source(shellcmd.index, extract_from_path(testdata))]
|
147
|
+
|
148
|
+
cfg_path = tmp_path / 'config.py'
|
149
|
+
write_config(cfg_path, cfg, testdata=get_testdata('weird.txt'))
|
150
|
+
do_index(cfg_path)
|
151
|
+
|
152
|
+
[v1, v2] = get_all_db_visits(tmp_path / 'promnesia.sqlite')
|
153
|
+
|
154
|
+
assert v1.norm_url == "urbandictionary.com/define.php?term=Belgian%20Whistle"
|
155
|
+
|
156
|
+
assert v2.norm_url == "en.wikipedia.org/wiki/Dinic%27s_algorithm"
|
157
|
+
assert v2.locator.title.endswith('weird.txt:2')
|
158
|
+
assert v2.context == 'right, so https://en.wikipedia.org/wiki/Dinic%27s_algorithm can be used for max flow'
|
159
|
+
|
160
|
+
|
161
|
+
def test_errors_during_indexing(tmp_path: Path) -> None:
|
162
|
+
def cfg() -> None:
|
163
|
+
from promnesia.common import Source
|
164
|
+
from promnesia.sources import demo
|
165
|
+
|
166
|
+
def indexer1():
|
167
|
+
visits = list(demo.index(count=10))
|
168
|
+
yield from visits[:5]
|
169
|
+
yield RuntimeError("some error during visits extraction")
|
170
|
+
yield from visits[5:]
|
171
|
+
|
172
|
+
def indexer2():
|
173
|
+
raise RuntimeError("in this case indexer itself crashed")
|
174
|
+
|
175
|
+
SOURCES = [Source(indexer1), Source(indexer2)]
|
176
|
+
|
177
|
+
cfg_path = tmp_path / 'config.py'
|
178
|
+
write_config(cfg_path, cfg)
|
179
|
+
do_index(cfg_path)
|
180
|
+
|
181
|
+
stats = get_stats(tmp_path)
|
182
|
+
assert stats == {
|
183
|
+
'error': 2,
|
184
|
+
'config': 10,
|
185
|
+
}
|
186
|
+
|
187
|
+
|
188
|
+
def test_hook(tmp_path: Path) -> None:
|
189
|
+
def cfg() -> None:
|
190
|
+
from promnesia.common import Source
|
191
|
+
from promnesia.sources import demo
|
192
|
+
|
193
|
+
SOURCES = [Source(demo.index, count=7, name='somename')]
|
194
|
+
|
195
|
+
from typing import cast, Iterator
|
196
|
+
from promnesia.common import DbVisit, Loc, Res
|
197
|
+
from promnesia.sources import demo
|
198
|
+
|
199
|
+
def HOOK(visit: Res[DbVisit]) -> Iterator[Res[DbVisit]]:
|
200
|
+
visit = cast(DbVisit, visit)
|
201
|
+
|
202
|
+
# NOTE: might be a good idea to check that the visit is an exception first and yield it intact?
|
203
|
+
nurl = visit.norm_url
|
204
|
+
if 'page1' in nurl:
|
205
|
+
yield visit._replace(norm_url='patched.com')
|
206
|
+
elif 'page2' in nurl:
|
207
|
+
raise Exception('boom') # deliberately crash
|
208
|
+
elif 'page3' in nurl:
|
209
|
+
# just don't yield anything! it will be omitted
|
210
|
+
pass
|
211
|
+
elif 'page4' in nurl:
|
212
|
+
# can emit multiple!
|
213
|
+
yield visit
|
214
|
+
yield visit
|
215
|
+
elif 'page6' in nurl:
|
216
|
+
# patch locator
|
217
|
+
yield visit._replace(locator=Loc.make(title='some custom timte', href='/can/replace/original/path'))
|
218
|
+
else:
|
219
|
+
yield visit
|
220
|
+
|
221
|
+
cfg_path = tmp_path / 'config.py'
|
222
|
+
write_config(cfg_path, cfg)
|
223
|
+
do_index(cfg_path)
|
224
|
+
|
225
|
+
[p0, p1, e2, p41, p42, p5, p6] = get_all_db_visits(tmp_path / 'promnesia.sqlite')
|
226
|
+
assert p0.norm_url == 'demo.com/page0.html'
|
227
|
+
assert p1.norm_url == 'patched.com'
|
228
|
+
assert e2.norm_url == '<error>'
|
229
|
+
assert p41 == p42
|
230
|
+
assert isinstance(p6, DbVisit)
|
231
|
+
assert p6.locator is not None
|
232
|
+
|
233
|
+
|
234
|
+
def test_example_config(tmp_path: Path) -> None:
|
235
|
+
if _is_windows:
|
236
|
+
pytest.skip("doesn't work on Windows: example config references /usr/include paths")
|
237
|
+
|
238
|
+
config = read_example_config() + '\n' + f'OUTPUT_DIR = "{str(tmp_path)}"'
|
239
|
+
cfg_path = tmp_path / 'example_config.py'
|
240
|
+
cfg_path.write_text(config)
|
241
|
+
|
242
|
+
do_index(cfg_path)
|
243
|
+
|
244
|
+
visits = [v for v in get_all_db_visits(tmp_path / 'promnesia.sqlite') if v.src != 'error']
|
245
|
+
assert len(visits) > 50 # random sanity check
|
@@ -0,0 +1,292 @@
|
|
1
|
+
from datetime import datetime
|
2
|
+
from pathlib import Path
|
3
|
+
from subprocess import Popen
|
4
|
+
|
5
|
+
import pytest
|
6
|
+
|
7
|
+
from ..__main__ import do_index
|
8
|
+
|
9
|
+
from .common import promnesia_bin, write_config
|
10
|
+
from .server_helper import run_server
|
11
|
+
|
12
|
+
|
13
|
+
def test_status_error() -> None:
|
14
|
+
"""
|
15
|
+
If DB doesn't exist, server should handle it gracefully and respond with error
|
16
|
+
"""
|
17
|
+
with run_server(db='/does/not/exist') as server:
|
18
|
+
response = server.post('/status')
|
19
|
+
|
20
|
+
# TODO ugh currently returns 200? maybe should return proper error, but need to handle in extension
|
21
|
+
# assert response.status_code == 404
|
22
|
+
|
23
|
+
body = response.json()
|
24
|
+
|
25
|
+
version = body['version']
|
26
|
+
assert version is not None
|
27
|
+
assert len(version.split('.')) >= 2 # random check..
|
28
|
+
|
29
|
+
assert 'ERROR' in body['db'] # defensive, it doesn't exist
|
30
|
+
|
31
|
+
|
32
|
+
def test_status_ok(tmp_path: Path) -> None:
|
33
|
+
def cfg() -> None:
|
34
|
+
from promnesia.common import Source
|
35
|
+
from promnesia.sources import demo
|
36
|
+
|
37
|
+
SOURCES = [Source(demo.index, count=10)]
|
38
|
+
|
39
|
+
cfg_path = tmp_path / 'config.py'
|
40
|
+
write_config(cfg_path, cfg)
|
41
|
+
do_index(cfg_path)
|
42
|
+
|
43
|
+
db_path = tmp_path / 'promnesia.sqlite'
|
44
|
+
with run_server(db=db_path, timezone='America/New_York') as server:
|
45
|
+
r = server.post('/status').json()
|
46
|
+
version = r['version']
|
47
|
+
assert version is not None
|
48
|
+
assert len(version.split('.')) >= 2 # random check..
|
49
|
+
|
50
|
+
assert r['db'] == str(db_path)
|
51
|
+
|
52
|
+
assert r['stats'] == {'total_visits': 10}
|
53
|
+
|
54
|
+
|
55
|
+
def test_visits(tmp_path: Path) -> None:
|
56
|
+
def cfg() -> None:
|
57
|
+
from promnesia.common import Source
|
58
|
+
from promnesia.sources import demo
|
59
|
+
|
60
|
+
SOURCES = [Source(demo.index, base_dt='2000-01-01', delta=30 * 60)]
|
61
|
+
|
62
|
+
cfg_path = tmp_path / 'config.py'
|
63
|
+
write_config(cfg_path, cfg)
|
64
|
+
do_index(cfg_path)
|
65
|
+
|
66
|
+
# force timezone here, otherwise the response varies depending on the test env
|
67
|
+
with run_server(db=tmp_path / 'promnesia.sqlite', timezone='America/New_York') as server:
|
68
|
+
r = server.post('/visits', json={'url': 'whatever'}).json()
|
69
|
+
assert r['visits'] == []
|
70
|
+
|
71
|
+
r = server.post('/visits', json={'url': 'https://demo.com/page0.html'})
|
72
|
+
rj = r.json()
|
73
|
+
assert rj['normalised_url'] == 'demo.com/page0.html'
|
74
|
+
[v] = rj['visits']
|
75
|
+
assert v['src'] == 'demo'
|
76
|
+
assert v['locator']['title'] == 'demo'
|
77
|
+
|
78
|
+
assert v['dt'] == '01 Jan 2000 00:00:00 -0500'
|
79
|
+
|
80
|
+
|
81
|
+
def test_visits_hierarchy(tmp_path: Path) -> None:
|
82
|
+
def cfg() -> None:
|
83
|
+
from datetime import datetime
|
84
|
+
|
85
|
+
from promnesia.common import Source, Visit, Loc
|
86
|
+
from promnesia.sources import demo
|
87
|
+
|
88
|
+
def indexer():
|
89
|
+
visits = list(demo.index(count=6))
|
90
|
+
yield Visit(
|
91
|
+
url='https://reddit.com/post1',
|
92
|
+
dt=datetime.fromisoformat('2023-12-04'),
|
93
|
+
locator=Loc.make('reddit'),
|
94
|
+
)
|
95
|
+
yield Visit(
|
96
|
+
url='https://reddit.com/post1/comment2',
|
97
|
+
dt=datetime.fromisoformat('2023-12-02'),
|
98
|
+
locator=Loc.make('reddit'),
|
99
|
+
context='I am comment 2',
|
100
|
+
)
|
101
|
+
yield from visits[:3]
|
102
|
+
yield Visit(
|
103
|
+
url='https://reddit.com/post2',
|
104
|
+
dt=datetime.fromisoformat('2023-12-05'),
|
105
|
+
locator=Loc.make('reddit'),
|
106
|
+
)
|
107
|
+
yield from visits[3:]
|
108
|
+
yield Visit(
|
109
|
+
url='https://reddit.com/post1/ihavenocontext',
|
110
|
+
dt=datetime.fromisoformat('2023-12-06'),
|
111
|
+
locator=Loc.make('reddit'),
|
112
|
+
)
|
113
|
+
yield Visit(
|
114
|
+
url='https://reddit.com/post1/comment1',
|
115
|
+
dt=datetime.fromisoformat('2023-12-06'),
|
116
|
+
locator=Loc.make('reddit'),
|
117
|
+
context='I am comment 1',
|
118
|
+
)
|
119
|
+
|
120
|
+
SOURCES = [Source(indexer)]
|
121
|
+
|
122
|
+
cfg_path = tmp_path / 'config.py'
|
123
|
+
write_config(cfg_path, cfg)
|
124
|
+
do_index(cfg_path)
|
125
|
+
|
126
|
+
# force timezone here, otherwise the response varies depending on the test env
|
127
|
+
with run_server(db=tmp_path / 'promnesia.sqlite', timezone='America/New_York') as server:
|
128
|
+
r = server.post('/visits', json={'url': 'https://reddit.com/post1'}).json()
|
129
|
+
# returns exact match + 'child' visits that are interesting (e.g. have context)
|
130
|
+
assert {v['original_url'] for v in r['visits']} == {
|
131
|
+
'https://reddit.com/post1',
|
132
|
+
'https://reddit.com/post1/comment1',
|
133
|
+
'https://reddit.com/post1/comment2',
|
134
|
+
}
|
135
|
+
|
136
|
+
|
137
|
+
def test_visited(tmp_path: Path) -> None:
|
138
|
+
def cfg() -> None:
|
139
|
+
from promnesia.common import Source
|
140
|
+
from promnesia.sources import demo
|
141
|
+
|
142
|
+
SOURCES = [Source(demo.index, base_dt='2000-01-01', delta=30 * 60)]
|
143
|
+
|
144
|
+
cfg_path = tmp_path / 'config.py'
|
145
|
+
write_config(cfg_path, cfg)
|
146
|
+
do_index(cfg_path)
|
147
|
+
|
148
|
+
test_url = 'https://demo.com/page5.html'
|
149
|
+
|
150
|
+
# force timezone here, otherwise the response varies depending on the test env
|
151
|
+
with run_server(db=tmp_path / 'promnesia.sqlite', timezone='America/New_York') as server:
|
152
|
+
r = server.post('/visited', json={'urls': []}).json()
|
153
|
+
assert r == []
|
154
|
+
|
155
|
+
r = server.post('/visited', json={'urls': [test_url, 'http://badurl.org']}).json()
|
156
|
+
[r1, r2] = r
|
157
|
+
assert r1['original_url'] == test_url
|
158
|
+
assert r2 is None
|
159
|
+
|
160
|
+
|
161
|
+
def test_search(tmp_path: Path) -> None:
|
162
|
+
# TODO not sure if should index at all here or just insert DbVisits directly?
|
163
|
+
def cfg() -> None:
|
164
|
+
from datetime import datetime
|
165
|
+
|
166
|
+
from promnesia.common import Source, Visit, Loc
|
167
|
+
from promnesia.sources import demo
|
168
|
+
|
169
|
+
def indexer():
|
170
|
+
visits = list(demo.index(count=6))
|
171
|
+
yield Visit(
|
172
|
+
url='https://someone.org/something',
|
173
|
+
dt=datetime.fromisoformat('2023-12-04T11:12:13+03:00'),
|
174
|
+
locator=Loc.make('whatever'),
|
175
|
+
)
|
176
|
+
yield from visits[:3]
|
177
|
+
yield Visit(
|
178
|
+
url='https://wiki.termux.com/wiki/Termux-setup-storage',
|
179
|
+
locator=Loc.make(
|
180
|
+
title='Reddit comment',
|
181
|
+
href='https://reddit.com/r/termux/comments/m4qrxt/cant_open_storageshared_in_termux/gso0kak/',
|
182
|
+
),
|
183
|
+
dt=datetime.fromisoformat('2023-12-02'),
|
184
|
+
context='perhaps it will help someone else https://wiki.termux.com/wiki/Termux-setup-storage',
|
185
|
+
)
|
186
|
+
yield from visits[3:]
|
187
|
+
|
188
|
+
SOURCES = [Source(indexer)]
|
189
|
+
|
190
|
+
cfg_path = tmp_path / 'config.py'
|
191
|
+
write_config(cfg_path, cfg)
|
192
|
+
do_index(cfg_path)
|
193
|
+
|
194
|
+
with run_server(db=tmp_path / 'promnesia.sqlite', timezone='America/New_York') as server:
|
195
|
+
# FIXME 'url' is actually kinda misleading -- it can be any text
|
196
|
+
rj = server.post('/search', json={'url': 'someone'}).json()
|
197
|
+
# TODO maybe return in chronological order or something? not sure
|
198
|
+
[v1, v2] = sorted(rj['visits'], key=lambda j: j['dt'])
|
199
|
+
|
200
|
+
assert v1['context'] == 'perhaps it will help someone else https://wiki.termux.com/wiki/Termux-setup-storage'
|
201
|
+
assert v1['dt'] == '02 Dec 2023 00:00:00 -0500' # uses server timezone (original visit didn't have it)
|
202
|
+
|
203
|
+
assert v2['normalised_url'] == 'someone.org/something'
|
204
|
+
assert v2['dt'] == '04 Dec 2023 11:12:13 +0300' # uses original visit timezone
|
205
|
+
|
206
|
+
rj = server.post('/search', json={'url': 'comment'}).json()
|
207
|
+
[v] = rj['visits']
|
208
|
+
assert v['context'] == 'perhaps it will help someone else https://wiki.termux.com/wiki/Termux-setup-storage'
|
209
|
+
|
210
|
+
|
211
|
+
def test_search_around(tmp_path: Path) -> None:
|
212
|
+
# this should return visits up to 3 hours in the past
|
213
|
+
def cfg() -> None:
|
214
|
+
from promnesia.common import Source
|
215
|
+
from promnesia.sources import demo
|
216
|
+
|
217
|
+
# generates 60 visits within 10 mins of each other -- so spanning over 10 hours
|
218
|
+
SOURCES = [Source(demo.index, count=60, base_dt='2000-01-01T00:00:00+03:00', delta=10 * 60)]
|
219
|
+
|
220
|
+
cfg_path = tmp_path / 'config.py'
|
221
|
+
write_config(cfg_path, cfg)
|
222
|
+
do_index(cfg_path)
|
223
|
+
|
224
|
+
# TODO hmm. perhaps it makes more sense to run query in different process and server in main process for testing??
|
225
|
+
with run_server(db=tmp_path / 'promnesia.sqlite') as server:
|
226
|
+
rj = server.post(
|
227
|
+
'/search_around',
|
228
|
+
json={'timestamp': datetime.fromisoformat('2005-01-01T00:00:00+06:00').timestamp()},
|
229
|
+
).json()
|
230
|
+
assert rj['visits'] == []
|
231
|
+
|
232
|
+
rj = server.post(
|
233
|
+
'/search_around',
|
234
|
+
json={'timestamp': datetime.fromisoformat('2000-01-01T07:55:00+06:00').timestamp()},
|
235
|
+
).json()
|
236
|
+
visits = rj['visits']
|
237
|
+
assert len(visits) == 18 # 6 per hour * 3
|
238
|
+
assert visits[0 ]['dt'] == '01 Jan 2000 02:00:00 +0300'
|
239
|
+
assert visits[-1]['dt'] == '01 Jan 2000 04:50:00 +0300'
|
240
|
+
|
241
|
+
|
242
|
+
@pytest.mark.parametrize('mode', ['update', 'overwrite'])
|
243
|
+
def test_query_while_indexing(tmp_path: Path, mode: str) -> None:
|
244
|
+
overwrite = mode == 'overwrite'
|
245
|
+
moverwrite = ['--overwrite'] if overwrite else []
|
246
|
+
|
247
|
+
def _index(run_id: str) -> Popen:
|
248
|
+
def cfg(run_id: str) -> None:
|
249
|
+
from promnesia.common import Source
|
250
|
+
from promnesia.sources import demo
|
251
|
+
|
252
|
+
SOURCES = [Source(demo.index, count=1_000, name=run_id)]
|
253
|
+
|
254
|
+
cfg_path = tmp_path / f'config{run_id}.py'
|
255
|
+
write_config(cfg_path, cfg, run_id=run_id)
|
256
|
+
|
257
|
+
return Popen(promnesia_bin('index', '--config', cfg_path, *moverwrite))
|
258
|
+
|
259
|
+
# trigger initial indexing
|
260
|
+
with _index(run_id='0'):
|
261
|
+
pass
|
262
|
+
|
263
|
+
with run_server(db=tmp_path / 'promnesia.sqlite') as server:
|
264
|
+
rj = server.post(
|
265
|
+
'/search_around',
|
266
|
+
json={'timestamp': datetime.fromisoformat('2005-01-01T00:00:00+06:00').timestamp()},
|
267
|
+
).json()
|
268
|
+
assert rj['visits'] == []
|
269
|
+
|
270
|
+
for run_id in range(1, 5):
|
271
|
+
with _index(run_id=str(run_id)) as indexer:
|
272
|
+
# hammer the backend to increase likelihood of race condition
|
273
|
+
while indexer.poll() is None:
|
274
|
+
stats = server.post('/status').json()['stats']
|
275
|
+
total_visits = stats['total_visits']
|
276
|
+
if overwrite:
|
277
|
+
assert total_visits >= 1_000
|
278
|
+
else:
|
279
|
+
assert total_visits >= 1_000 * run_id
|
280
|
+
|
281
|
+
|
282
|
+
# TODO also could check server methods directly?
|
283
|
+
# via something like this... but not sure if really makes much difference
|
284
|
+
# import promnesia.server as S
|
285
|
+
# S.EnvConfig.set(S.ServerConfig(
|
286
|
+
# # TODO populate with test db and benchmark properly...
|
287
|
+
# db=Path('/todo'),
|
288
|
+
# timezone=pytz.utc,
|
289
|
+
# ))
|
290
|
+
# links = [f'https://reddit.com/whatever{i}.html' for i in range(count)]
|
291
|
+
# res = S.visited(links)
|
292
|
+
# assert len(res) == len(links)
|
@@ -0,0 +1,41 @@
|
|
1
|
+
from unittest.mock import patch
|
2
|
+
|
3
|
+
from ..common import traverse
|
4
|
+
|
5
|
+
from .common import get_testdata
|
6
|
+
|
7
|
+
|
8
|
+
testDataPath = get_testdata('traverse')
|
9
|
+
|
10
|
+
|
11
|
+
# Patch shutil.which so it always returns false (when trying to which fdfind, etc)
|
12
|
+
# so that it falls back to find
|
13
|
+
@patch('promnesia.common.shutil.which', return_value=False)
|
14
|
+
def test_traverse_ignore_find(patched) -> None:
|
15
|
+
'''
|
16
|
+
traverse() with `find` but ignore some stuff
|
17
|
+
'''
|
18
|
+
paths = set(traverse(testDataPath, ignore=['ignoreme.txt', 'ignoreme2']))
|
19
|
+
|
20
|
+
assert paths == {testDataPath / 'imhere2/real.txt', testDataPath / 'imhere.txt'}
|
21
|
+
|
22
|
+
|
23
|
+
def test_traverse_ignore_fdfind():
|
24
|
+
'''
|
25
|
+
traverse() with `fdfind` but ignore some stuff
|
26
|
+
'''
|
27
|
+
paths = set(traverse(testDataPath, ignore=['ignoreme.txt', 'ignoreme2']))
|
28
|
+
|
29
|
+
assert paths == {testDataPath / 'imhere.txt', testDataPath / 'imhere2/real.txt'}
|
30
|
+
|
31
|
+
|
32
|
+
# TODO: It would be nice to test the implementation directly without having to do this
|
33
|
+
# weird patching in the future
|
34
|
+
@patch('promnesia.common._is_windows', new_callable=lambda: True)
|
35
|
+
def test_traverse_ignore_windows(patched) -> None:
|
36
|
+
'''
|
37
|
+
traverse() with python when _is_windows is true but ignore some stuff
|
38
|
+
'''
|
39
|
+
paths = set(traverse(testDataPath, ignore=['ignoreme.txt', 'ignoreme2']))
|
40
|
+
|
41
|
+
assert paths == {testDataPath / 'imhere.txt', testDataPath / 'imhere2/real.txt'}
|
promnesia/tests/utils.py
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
from datetime import datetime, timedelta
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Mapping, Optional, Sequence, Tuple, Union
|
4
|
+
|
5
|
+
from ..common import Source, Loc, Visit
|
6
|
+
from ..database.dump import visits_to_sqlite
|
7
|
+
from ..extract import extract_visits
|
8
|
+
|
9
|
+
|
10
|
+
# TODO a bit shit... why did I make it dict at first??
|
11
|
+
Urls = Union[
|
12
|
+
Mapping[str, Optional[str]],
|
13
|
+
Sequence[Tuple[str, Optional[str]]],
|
14
|
+
]
|
15
|
+
|
16
|
+
|
17
|
+
def index_urls(urls: Urls, *, source_name: str = 'test'):
|
18
|
+
uuu = list(urls.items()) if isinstance(urls, dict) else urls
|
19
|
+
|
20
|
+
def idx(tmp_path: Path) -> None:
|
21
|
+
def indexer():
|
22
|
+
for i, (url, ctx) in enumerate(uuu):
|
23
|
+
yield Visit(
|
24
|
+
url=url,
|
25
|
+
dt=datetime.min + timedelta(days=5000) + timedelta(hours=i),
|
26
|
+
locator=Loc.make('test'),
|
27
|
+
context=ctx,
|
28
|
+
)
|
29
|
+
|
30
|
+
db_visits = extract_visits(source=Source(indexer), src=source_name)
|
31
|
+
errors = visits_to_sqlite(vit=db_visits, overwrite_db=True, _db_path=tmp_path / 'promnesia.sqlite')
|
32
|
+
|
33
|
+
assert len(errors) == 0, errors
|
34
|
+
|
35
|
+
return idx
|
@@ -1,19 +1,17 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: promnesia
|
3
|
-
Version: 1.1.20230129
|
3
|
+
Version: 1.2.20240810
|
4
4
|
Summary: Enhancement of your browsing history
|
5
5
|
Home-page: https://github.com/karlicoss/promnesia
|
6
6
|
Author: Dmitrii Gerasimov
|
7
7
|
Author-email: karlicoss@gmail.com
|
8
|
-
|
9
|
-
|
10
|
-
Requires-Python: >=3.7
|
8
|
+
Requires-Python: >=3.8
|
9
|
+
License-File: LICENSE
|
11
10
|
Requires-Dist: appdirs
|
12
11
|
Requires-Dist: tzlocal
|
13
12
|
Requires-Dist: more-itertools
|
14
13
|
Requires-Dist: pytz
|
15
|
-
Requires-Dist: sqlalchemy
|
16
|
-
Requires-Dist: cachew (>=0.8.0)
|
14
|
+
Requires-Dist: sqlalchemy >=2.0
|
17
15
|
Requires-Dist: urlextract
|
18
16
|
Requires-Dist: fastapi
|
19
17
|
Requires-Dist: uvicorn[standard]
|
@@ -26,34 +24,31 @@ Requires-Dist: HPI ; extra == 'all'
|
|
26
24
|
Requires-Dist: beautifulsoup4 ; extra == 'all'
|
27
25
|
Requires-Dist: lxml ; extra == 'all'
|
28
26
|
Requires-Dist: mistletoe ; extra == 'all'
|
29
|
-
Requires-Dist: orgparse
|
30
|
-
Requires-Dist: dataset ; extra == 'all'
|
27
|
+
Requires-Dist: orgparse >=0.3.0 ; extra == 'all'
|
31
28
|
Provides-Extra: html
|
32
29
|
Requires-Dist: beautifulsoup4 ; extra == 'html'
|
33
30
|
Requires-Dist: lxml ; extra == 'html'
|
34
|
-
Provides-Extra: linting
|
35
|
-
Requires-Dist: pytest ; extra == 'linting'
|
36
|
-
Requires-Dist: mypy ; extra == 'linting'
|
37
|
-
Requires-Dist: lxml ; extra == 'linting'
|
38
31
|
Provides-Extra: markdown
|
39
32
|
Requires-Dist: mistletoe ; extra == 'markdown'
|
40
33
|
Provides-Extra: optional
|
41
34
|
Requires-Dist: logzero ; extra == 'optional'
|
42
35
|
Requires-Dist: python-magic ; extra == 'optional'
|
43
36
|
Provides-Extra: org
|
44
|
-
Requires-Dist: orgparse
|
37
|
+
Requires-Dist: orgparse >=0.3.0 ; extra == 'org'
|
45
38
|
Provides-Extra: telegram
|
46
|
-
Requires-Dist: dataset ; extra == 'telegram'
|
47
39
|
Provides-Extra: testing
|
48
40
|
Requires-Dist: pytest ; extra == 'testing'
|
49
41
|
Requires-Dist: pytest-timeout ; extra == 'testing'
|
50
42
|
Requires-Dist: pytest-xdist ; extra == 'testing'
|
43
|
+
Requires-Dist: hypothesis ; extra == 'testing'
|
51
44
|
Requires-Dist: psutil ; extra == 'testing'
|
52
|
-
Requires-Dist:
|
45
|
+
Requires-Dist: requests ; extra == 'testing'
|
53
46
|
Requires-Dist: selenium ; extra == 'testing'
|
54
47
|
Requires-Dist: click ; extra == 'testing'
|
55
|
-
Requires-Dist:
|
56
|
-
|
57
|
-
|
58
|
-
|
48
|
+
Requires-Dist: ruff ; extra == 'testing'
|
49
|
+
Requires-Dist: mypy ; extra == 'testing'
|
50
|
+
Requires-Dist: lxml ; extra == 'testing'
|
51
|
+
Requires-Dist: loguru ; extra == 'testing'
|
52
|
+
Provides-Extra: testing-gui
|
53
|
+
Requires-Dist: pyautogui ; extra == 'testing-gui'
|
59
54
|
|