promnesia 1.2.20230515__py3-none-any.whl → 1.2.20240810__py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- promnesia/__main__.py +26 -14
- promnesia/cannon.py +4 -4
- promnesia/common.py +39 -28
- promnesia/compare.py +3 -2
- promnesia/config.py +4 -2
- promnesia/database/common.py +66 -0
- promnesia/database/dump.py +187 -0
- promnesia/{read_db.py → database/load.py} +10 -11
- promnesia/extract.py +1 -0
- promnesia/kjson.py +1 -1
- promnesia/logging.py +3 -3
- promnesia/misc/__init__.pyi +0 -0
- promnesia/misc/config_example.py +1 -2
- promnesia/misc/install_server.py +2 -3
- promnesia/server.py +18 -19
- promnesia/sources/__init__.pyi +0 -0
- promnesia/sources/auto.py +9 -7
- promnesia/sources/browser_legacy.py +11 -5
- promnesia/sources/demo.py +18 -2
- promnesia/sources/filetypes.py +7 -0
- promnesia/sources/github.py +2 -2
- promnesia/sources/hypothesis.py +1 -1
- promnesia/sources/markdown.py +15 -15
- promnesia/sources/org.py +7 -3
- promnesia/sources/plaintext.py +3 -1
- promnesia/sources/reddit.py +2 -2
- promnesia/sources/rss.py +1 -1
- promnesia/sources/signal.py +22 -14
- promnesia/sources/stackexchange.py +2 -2
- promnesia/sources/takeout.py +58 -1
- promnesia/sources/takeout_legacy.py +10 -2
- promnesia/tests/__init__.py +0 -0
- promnesia/tests/common.py +137 -0
- promnesia/tests/server_helper.py +64 -0
- promnesia/tests/sources/__init__.py +0 -0
- promnesia/tests/sources/test_auto.py +66 -0
- promnesia/tests/sources/test_filetypes.py +42 -0
- promnesia/tests/sources/test_hypothesis.py +39 -0
- promnesia/tests/sources/test_org.py +65 -0
- promnesia/tests/sources/test_plaintext.py +26 -0
- promnesia/tests/sources/test_shellcmd.py +22 -0
- promnesia/tests/sources/test_takeout.py +58 -0
- promnesia/tests/test_cannon.py +325 -0
- promnesia/tests/test_cli.py +42 -0
- promnesia/tests/test_compare.py +30 -0
- promnesia/tests/test_config.py +290 -0
- promnesia/tests/test_db_dump.py +223 -0
- promnesia/tests/test_extract.py +61 -0
- promnesia/tests/test_extract_urls.py +43 -0
- promnesia/tests/test_indexer.py +245 -0
- promnesia/tests/test_server.py +292 -0
- promnesia/tests/test_traverse.py +41 -0
- promnesia/tests/utils.py +35 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/METADATA +13 -17
- promnesia-1.2.20240810.dist-info/RECORD +83 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/WHEEL +1 -1
- {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/entry_points.txt +0 -1
- promnesia/dump.py +0 -105
- promnesia-1.2.20230515.dist-info/RECORD +0 -58
- {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/LICENSE +0 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/top_level.txt +0 -0
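
The most significant structural change in this release is the new promnesia/database/ subpackage: promnesia/dump.py is removed, its role taken over by promnesia/database/dump.py, and promnesia/read_db.py is renamed to promnesia/database/load.py. For third-party code importing these modules directly, the update is a path change; a minimal sketch, assuming the symbols carried over under the same names (the new paths are confirmed by the imports in the added test files below; the old paths are inferred from the removed/renamed files):

    # before (1.2.20230515) -- assumed old import paths:
    # from promnesia.dump import visits_to_sqlite
    # from promnesia.read_db import get_all_db_visits

    # after (1.2.20240810) -- paths as used by promnesia/tests/test_db_dump.py below:
    from promnesia.database.dump import visits_to_sqlite
    from promnesia.database.load import get_all_db_visits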
promnesia/tests/test_db_dump.py
@@ -0,0 +1,223 @@
+from __future__ import annotations
+
+from concurrent.futures import ProcessPoolExecutor
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import Any, Iterable
+
+
+from hypothesis import settings, given
+from hypothesis.strategies import from_type
+# NOTE: pytest ... -s --hypothesis-verbosity=debug is useful for seeing what hypothesis is doing
+import pytest
+import pytz
+
+
+from ..common import Loc
+from ..database.common import DbVisit
+from ..database.dump import visits_to_sqlite
+from ..database.load import get_all_db_visits
+from ..sqlite import sqlite_connection
+
+from .common import gc_control, running_on_ci
+
+
+HSETTINGS: dict[str, Any] = dict(
+    derandomize=True,
+    deadline=timedelta(seconds=2),  # sometimes slow on ci
+)
+
+
+def test_no_visits(tmp_path: Path) -> None:
+    visits: list[DbVisit] = []
+
+    db = tmp_path / 'db.sqlite'
+    errors = visits_to_sqlite(
+        vit=visits,
+        overwrite_db=True,
+        _db_path=db,
+    )
+    assert db.exists()
+    [err] = errors
+    assert 'No visits were indexed' in str(err)
+
+
+def test_one_visit(tmp_path: Path) -> None:
+    dt = datetime.fromisoformat('2023-11-14T23:11:01')
+    dt = pytz.timezone('Europe/Warsaw').localize(dt)
+    visit = DbVisit(
+        norm_url='google.com',
+        orig_url='https://google.com',
+        dt=dt,
+        locator=Loc.make(title='title', href='https://whatever.com'),
+        duration=123,
+        src='whatever',
+    )
+
+    visits = [visit]
+
+    db = tmp_path / 'db.sqlite'
+    errors = visits_to_sqlite(
+        vit=visits,
+        overwrite_db=True,
+        _db_path=db,
+    )
+    assert len(errors) == 0
+    assert db.exists()
+
+    with sqlite_connection(db, row_factory='dict') as conn:
+        [sqlite_visit] = conn.execute('SELECT * FROM visits')
+
+    assert sqlite_visit == {
+        'context': None,
+        'dt': '2023-11-14T23:11:01+01:00',
+        'duration': 123,
+        'locator_href': 'https://whatever.com',
+        'locator_title': 'title',
+        'norm_url': 'google.com',
+        'orig_url': 'https://google.com',
+        'src': 'whatever',
+    }
+
+    visits_in_db = get_all_db_visits(db)
+    assert visits_in_db == [visit]
+
+
+def test_read_db_visits(tmp_path: Path) -> None:
+    """
+    Deliberately test against "hardcoded" database to check for backwards compatibility
+    """
+    db = tmp_path / 'db.sqlite'
+    with sqlite_connection(db) as conn:
+        conn.execute(
+            '''
+            CREATE TABLE visits (
+                norm_url VARCHAR,
+                orig_url VARCHAR,
+                dt VARCHAR,
+                locator_title VARCHAR,
+                locator_href VARCHAR,
+                src VARCHAR,
+                context VARCHAR,
+                duration INTEGER
+            );
+            '''
+        )
+        # this dt format (zone name after iso timestamp) might occur in legacy databases
+        # (that were created when promnesia was using cachew NTBinder)
+        conn.execute(
+            '''
+            INSERT INTO visits VALUES(
+                'i.redd.it/alala.jpg',
+                'https://i.redd.it/alala.jpg',
+                '2019-04-13T11:55:09-04:00 America/New_York',
+                'Reddit save',
+                'https://reddit.com/r/whatever',
+                'reddit',
+                '',
+                NULL
+            );
+            '''
+        )
+    [visit_in_db] = get_all_db_visits(db)
+    assert visit_in_db == DbVisit(
+        norm_url='i.redd.it/alala.jpg',
+        orig_url='https://i.redd.it/alala.jpg',
+        dt=datetime(2019, 4, 13, 11, 55, 9, tzinfo=timezone(timedelta(hours=-4))),
+        locator=Loc.make(title='Reddit save', href='https://reddit.com/r/whatever'),
+        src='reddit',
+        context='',
+    )
+
+
+def _test_random_visit_aux(visit: DbVisit, tmp_path: Path) -> None:
+    db = tmp_path / 'db.sqlite'
+    errors = visits_to_sqlite(
+        vit=[visit],
+        overwrite_db=True,
+        _db_path=db,
+    )
+    assert db.exists()
+    assert len(errors) == 0, errors
+    # TODO query the db?
+
+
+@given(
+    visit=from_type(DbVisit).filter(
+        # if duration is too big it fails to insert in sqlite
+        lambda v: (v.duration is None or 0 <= v.duration <= 10**5)
+    )
+)
+@settings(**HSETTINGS, max_examples=100)
+def test_random_visit(visit: DbVisit) -> None:
+    with TemporaryDirectory() as tdir:
+        tmp_path = Path(tdir)
+        _test_random_visit_aux(visit=visit, tmp_path=tmp_path)
+
+
+_dt_naive = datetime.fromisoformat('2023-11-14T23:11:01')
+_dt_aware = pytz.timezone('America/New_York').localize(_dt_naive)
+
+def make_testvisit(i: int) -> DbVisit:
+    return DbVisit(
+        norm_url=f'google.com/{i}',
+        orig_url=f'https://google.com/{i}',
+        dt=(_dt_naive if i % 2 == 0 else _dt_aware) + timedelta(seconds=i),
+        locator=Loc.make(title=f'title{i}', href=f'https://whatever.com/{i}'),
+        duration=i,
+        src='whatever',
+    )
+
+
+@pytest.mark.parametrize('count', [99, 100_000, 1_000_000])
+@pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
+def test_benchmark_visits_dumping(count: int, gc_control, tmp_path: Path) -> None:
+    # [20231212] testing different CHUNK_BY values with 1_000_000 visits on @karlicoss desktop pc
+    # 1: 25s (perhaps most overhead is from temporary lists?)
+    # 10 (current default): 8s
+    # 100: 6s
+    # 1000: 6s
+    # TODO maybe consider changing default to 100?
+    if count > 99 and running_on_ci:
+        pytest.skip("test would be too slow on CI, only meant to run manually")
+
+    visits = (make_testvisit(i) for i in range(count))
+    db = tmp_path / 'db.sqlite'
+    errors = visits_to_sqlite(  # TODO maybe this method should return db stats? would make testing easier
+        vit=visits,
+        overwrite_db=True,
+        _db_path=db,
+    )
+    assert db.exists()
+    assert len(errors) == 0, errors
+
+
+def _populate_db(db_path: Path, *, overwrite_db: bool, count: int) -> None:
+    visits = [make_testvisit(i) for i in range(count)]
+    errors = visits_to_sqlite(visits, _db_path=db_path, overwrite_db=overwrite_db)
+    assert len(errors) == 0
+
+
+@pytest.mark.parametrize('mode', ['update', 'overwrite'])
+def test_concurrent(tmp_path: Path, mode: str) -> None:
+    overwrite_db = {'overwrite': True, 'update': False}[mode]
+
+    db_path = tmp_path / 'db.sqlite'
+    # do initial indexing to initialize the db
+    _populate_db(db_path, overwrite_db=True, count=1)
+    assert db_path.exists()  # just in case
+
+    # this simply tests correctness by running many concurrent indexers
+    parallel = 100  # 100 indexers
+    with ProcessPoolExecutor(max_workers=8) as pool:
+        futures = []
+        for _ in range(parallel):
+            futures.append(pool.submit(_populate_db, db_path, overwrite_db=overwrite_db, count=1_000))
+        for f in futures:
+            f.result()
+    assert db_path.exists()  # just in case
+
+
+# TODO test to make sure db is readable while we're indexing?
+# kinda nicer version of test_query_while_indexing
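
test_read_db_visits above pins down a legacy timestamp format: an ISO timestamp followed by a space and a zone name ('2019-04-13T11:55:09-04:00 America/New_York'), left over from databases created while promnesia still used cachew's NTBinder. This is not promnesia's actual parsing code (parse_legacy_dt is a made-up helper), just a sketch of what reading that format back involves; the ISO part already carries the UTC offset, so the trailing zone name can be dropped:

    from datetime import datetime

    def parse_legacy_dt(s: str) -> datetime:
        # split off the optional trailing zone name; keep the ISO part
        iso, _, _zone_name = s.partition(' ')
        return datetime.fromisoformat(iso)

    dt = parse_legacy_dt('2019-04-13T11:55:09-04:00 America/New_York')
    assert dt.utcoffset() is not None
    assert dt.utcoffset().total_seconds() == -4 * 3600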
promnesia/tests/test_extract.py
@@ -0,0 +1,61 @@
+from datetime import datetime, timezone
+
+from ..common import Visit, DbVisit, Loc, Source
+from ..extract import extract_visits
+
+from .common import get_testdata, unwrap, running_on_ci, gc_control
+
+from more_itertools import ilen
+import pytest
+
+
+def test_with_error() -> None:
+    class ExtractionError(Exception):
+        pass
+
+    def indexer():
+        yield Visit(url='http://test1', dt=datetime.fromtimestamp(0, tz=timezone.utc), locator=Loc.make('whatever'))
+        yield ExtractionError()
+        yield Visit(url='http://test2', dt=datetime.fromtimestamp(0, tz=timezone.utc), locator=Loc.make('whatever'))
+
+    [v1, e, v2] = extract_visits(source=Source(indexer), src='whatever')
+    assert isinstance(v1, DbVisit)
+    assert isinstance(e, Exception)
+    assert isinstance(v2, DbVisit)
+
+
+def test_urls_are_normalised() -> None:
+    # generally this stuff is covered by cannon tests, but good to check it's actually inserted in the db
+    # TODO maybe this should be a separate test which takes DbVisit.make separately?
+    # especially to decouple from shellcmd source
+    from ..sources import shellcmd
+    from ..sources.plaintext import extract_from_path
+
+    visits = list(extract_visits(
+        source=Source(shellcmd.index, extract_from_path(get_testdata('normalise'))),
+        src='whatever',
+    ))
+    assert len(visits) == 7
+
+    assert {unwrap(v).norm_url for v in visits} == {
+        'hi.com',
+        'reddit.com/post',
+        'argos.co.uk/webapp/wcs/stores/servlet/OrderItemDisplay',
+        'youtube.com/watch?v=XXlZfc1TrD0',
+        'youtube.com/watch?v=XXlZfc1Tr11',
+    }
+
+
+@pytest.mark.parametrize('count', [99, 100_000, 1_000_000])
+@pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
+def test_benchmark(count: int, gc_control) -> None:
+    # NOTE: at the moment most time is spent canonifying urls, so not much point optimizing this in isolation
+    # TODO maybe could specify custom cannonifying strategy that doesn't do anything to isolate benchmark
+    if count > 99 and running_on_ci:
+        pytest.skip("test would be too slow on CI, only meant to run manually")
+
+    from ..sources import demo
+    source = Source(demo.index, count=count)
+
+    total = ilen(extract_visits(source=source, src='whatever'))
+    assert total == count  # sanity check
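
test_with_error above exercises promnesia's errors-as-values convention: an indexer may yield Exception instances alongside Visit objects, and extract_visits forwards them downstream instead of aborting the whole stream. A self-contained sketch of the pattern, independent of promnesia's types (Res here just mirrors the shape of promnesia's Res[T], i.e. Union[T, Exception]):

    from typing import Iterator, Union

    Res = Union[str, Exception]  # stand-in for promnesia's Res[T]

    def indexer() -> Iterator[Res]:
        yield 'http://test1'
        yield RuntimeError('one entry failed')  # yielded as a value, not raised
        yield 'http://test2'

    # consumers branch on isinstance instead of wrapping the loop in try/except
    ok = [r for r in indexer() if not isinstance(r, Exception)]
    assert ok == ['http://test1', 'http://test2']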
promnesia/tests/test_extract_urls.py
@@ -0,0 +1,43 @@
+from ..common import extract_urls
+
+
+def test_extract_simple() -> None:
+    lines = """
+I've enjoyed [Chandler Carruth's _There Are No Zero-cost Abstractions_](
+https://www.youtube.com/watch?v=rHIkrotSwcc) very much.
+""".strip()
+    assert set(extract_urls(lines)) == {'https://www.youtube.com/watch?v=rHIkrotSwcc'}
+
+
+def test_extract_2() -> None:
+    text = '''♂️ Чтобы снизить вероятность ошибиться, важно знать про когнитивные искажения.
+Если для вас это новое словосочетание, начните с книжки
+"Гарри Поттер и Методы рационального мышления" - http://hpmor.ru/, если вы знакомы с понятием - читайте цепочки на сайтах
+lesswrong.ru и lesswrong.com, книжку Даниэля Канемана "Thinking, fast and slow" и канал Пион https://t.me/ontologics
+'''
+    assert set(extract_urls(text)) == {'http://hpmor.ru/', 'lesswrong.ru', 'lesswrong.com', 'https://t.me/ontologics'}
+
+
+def test_extract_md() -> None:
+    lines = '''
+Hey, I recently implemented a new extension for that [addons.mozilla.org](https://addons.mozilla.org/en-US/firefox/addon/org-grasp-for-org-capture/), [github](https://github.com/karlicoss/grasp), perhaps it could be useful for you!
+'''
+    assert set(extract_urls(lines)) == {
+        'addons.mozilla.org',
+        'https://addons.mozilla.org/en-US/firefox/addon/org-grasp-for-org-capture/',
+        'https://github.com/karlicoss/grasp',
+    }
+
+
+# just random links to test multiline/whitespace behaviour
+def test_extract_3() -> None:
+    lines = '''
+python.org/one.html ?? https://python.org/two.html some extra text
+
+whatever.org
+'''
+    assert set(extract_urls(lines, syntax='org')) == {
+        'python.org/one.html',
+        'https://python.org/two.html',
+        'whatever.org',
+    }
promnesia/tests/test_indexer.py
@@ -0,0 +1,245 @@
+from collections import Counter
+from pathlib import Path
+from subprocess import check_call, Popen
+
+from ..__main__ import do_index, read_example_config
+from ..common import DbVisit, _is_windows
+from ..database.load import get_all_db_visits
+
+import pytest
+
+from .common import get_testdata, promnesia_bin, reset_filters, write_config
+
+
+def get_stats(tmp_path: Path) -> Counter:
+    visits = get_all_db_visits(tmp_path / 'promnesia.sqlite')
+    return Counter(v.src for v in visits)
+
+
+@pytest.mark.parametrize('mode', ['update', 'overwrite'])
+def test_indexing_mode(tmp_path: Path, mode: str) -> None:
+    # ugh. we modify the config very fast during tests
+    # and pycache distinguishes identical filenames based on int mtime in seconds
+    # so best to use different names to prevent undesired caching
+    # https://github.com/python/cpython/blob/fb202af4470d6051a69bb9d2f44d7e8a1c99eb4f/Lib/importlib/_bootstrap_external.py#L714-L739
+    # TODO could probably relax that if we switch from importlib config loading to exec()?
+
+    def cfg1() -> None:
+        from promnesia.common import Source
+        from promnesia.sources import demo
+
+        SOURCES = [
+            Source(demo.index, count=10, base_dt='2000-01-01', delta=30, name='demo1'),
+            Source(demo.index, count=20, base_dt='2001-01-01', delta=30, name='demo2'),
+        ]
+
+    cfg_path = tmp_path / 'config1.py'
+    write_config(cfg_path, cfg1)
+    do_index(cfg_path)
+
+    stats = get_stats(tmp_path)
+    assert stats == {'demo1': 10, 'demo2': 20}
+
+    def cfg2() -> None:
+        from promnesia.common import Source
+        from promnesia.sources import demo
+
+        SOURCES = [
+            Source(demo.index, count=30, base_dt='2005-01-01', delta=30, name='demo2'),
+            Source(demo.index, count=40, base_dt='2010-01-01', delta=30, name='demo3'),
+        ]
+
+    cfg_path = tmp_path / 'config2.py'
+    write_config(cfg_path, cfg2)
+    do_index(cfg_path, overwrite_db={'overwrite': True, 'update': False}[mode])
+    # TODO use some sort of test helper?
+    stats = get_stats(tmp_path)
+
+    if mode == 'update':
+        # should keep the original visits too!
+        assert stats == {'demo1': 10, 'demo2': 30, 'demo3': 40}
+    else:
+        # should overwrite with newly indexed visits
+        assert stats == {'demo2': 30, 'demo3': 40}
+
+
+# TODO check both modes?
+def test_concurrent_indexing(tmp_path: Path) -> None:
+    def cfg_fast() -> None:
+        from promnesia.common import Source
+        from promnesia.sources import demo
+
+        SOURCES = [Source(demo.index, count=10)]
+
+    cfg_fast_path = tmp_path / 'cfg_fast.py'
+    write_config(cfg_fast_path, cfg_fast)
+
+    def cfg_slow() -> None:
+        from promnesia.common import Source
+        from promnesia.sources import demo
+
+        SOURCES = [Source(demo.index, count=100_000)]
+
+    cfg_slow_path = tmp_path / 'cfg_slow.py'
+    write_config(cfg_slow_path, cfg_slow)
+
+    # init it first, to create the database
+    # TODO ideally this shouldn't be necessary but it's reasonable that people would already have the index
+    # otherwise it would fail at db creation point.. which is kinda annoying to work around
+    # todo in principle can work around same way as in cachew, by having a loop around PRAGMA WAL command?
+    check_call(promnesia_bin('index', '--config', cfg_fast_path, '--overwrite'))
+
+    total_runs = 0
+    # run in the background
+    with Popen(promnesia_bin('index', '--config', cfg_slow_path, '--overwrite')) as slow_indexer:
+        while slow_indexer.poll() is None:
+            # create a bunch of 'smaller' indexers running in parallel
+            fasts = [
+                Popen(promnesia_bin('index', '--config', cfg_fast_path, '--overwrite'))
+                for _ in range(10)
+            ]
+            for fast in fasts:
+                assert fast.wait() == 0, fast  # should succeed
+                total_runs += 1
+        assert slow_indexer.poll() == 0, slow_indexer
+
+    # FIXME ok, need to uncomment this once proper concurrent indexing is supported
+    # if not, slow indexer is too fast, so crank up the count in it
+    # assert total_runs > 20
+
+
+def test_filter(tmp_path: Path, reset_filters) -> None:
+    domain_to_filter = 'some-weird-domain.xyz'
+    testdata = get_testdata('custom')
+    assert any(domain_to_filter in p.read_text() for p in testdata.glob('*.txt'))  # precondition
+
+    def cfg(testdata, domain_to_filter) -> None:
+        from promnesia.common import Source
+        from promnesia.sources import shellcmd
+        from promnesia.sources.plaintext import extract_from_path
+
+        FILTERS = [
+            domain_to_filter,
+        ]
+
+        SOURCES = [Source(shellcmd.index, extract_from_path(testdata))]
+
+    cfg_path = tmp_path / 'config.py'
+    write_config(cfg_path, cfg, testdata=testdata, domain_to_filter=domain_to_filter)
+    do_index(cfg_path)
+
+    visits = get_all_db_visits(tmp_path / 'promnesia.sqlite')
+    urls = {v.orig_url for v in visits}
+    assert not any(domain_to_filter in u for u in urls), urls
+    assert len(visits) == 4  # just in case
+
+
+def test_weird_urls(tmp_path: Path) -> None:
+    # specifically test this here (rather than in cannon)
+    # to make sure it's not messed up when we insert/extract from sqlite
+
+    def cfg(testdata: str) -> None:
+        from promnesia.common import Source
+        from promnesia.sources import shellcmd
+        from promnesia.sources.plaintext import extract_from_path
+
+        SOURCES = [Source(shellcmd.index, extract_from_path(testdata))]
+
+    cfg_path = tmp_path / 'config.py'
+    write_config(cfg_path, cfg, testdata=get_testdata('weird.txt'))
+    do_index(cfg_path)
+
+    [v1, v2] = get_all_db_visits(tmp_path / 'promnesia.sqlite')
+
+    assert v1.norm_url == "urbandictionary.com/define.php?term=Belgian%20Whistle"
+
+    assert v2.norm_url == "en.wikipedia.org/wiki/Dinic%27s_algorithm"
+    assert v2.locator.title.endswith('weird.txt:2')
+    assert v2.context == 'right, so https://en.wikipedia.org/wiki/Dinic%27s_algorithm can be used for max flow'
+
+
+def test_errors_during_indexing(tmp_path: Path) -> None:
+    def cfg() -> None:
+        from promnesia.common import Source
+        from promnesia.sources import demo
+
+        def indexer1():
+            visits = list(demo.index(count=10))
+            yield from visits[:5]
+            yield RuntimeError("some error during visits extraction")
+            yield from visits[5:]
+
+        def indexer2():
+            raise RuntimeError("in this case indexer itself crashed")
+
+        SOURCES = [Source(indexer1), Source(indexer2)]
+
+    cfg_path = tmp_path / 'config.py'
+    write_config(cfg_path, cfg)
+    do_index(cfg_path)
+
+    stats = get_stats(tmp_path)
+    assert stats == {
+        'error': 2,
+        'config': 10,
+    }
+
+
+def test_hook(tmp_path: Path) -> None:
+    def cfg() -> None:
+        from promnesia.common import Source
+        from promnesia.sources import demo
+
+        SOURCES = [Source(demo.index, count=7, name='somename')]
+
+        from typing import cast, Iterator
+        from promnesia.common import DbVisit, Loc, Res
+        from promnesia.sources import demo
+
+        def HOOK(visit: Res[DbVisit]) -> Iterator[Res[DbVisit]]:
+            visit = cast(DbVisit, visit)
+
+            # NOTE: might be a good idea to check that the visit is an exception first and yield it intact?
+            nurl = visit.norm_url
+            if 'page1' in nurl:
+                yield visit._replace(norm_url='patched.com')
+            elif 'page2' in nurl:
+                raise Exception('boom')  # deliberately crash
+            elif 'page3' in nurl:
+                # just don't yield anything! it will be omitted
+                pass
+            elif 'page4' in nurl:
+                # can emit multiple!
+                yield visit
+                yield visit
+            elif 'page6' in nurl:
+                # patch locator
+                yield visit._replace(locator=Loc.make(title='some custom title', href='/can/replace/original/path'))
+            else:
+                yield visit
+
+    cfg_path = tmp_path / 'config.py'
+    write_config(cfg_path, cfg)
+    do_index(cfg_path)
+
+    [p0, p1, e2, p41, p42, p5, p6] = get_all_db_visits(tmp_path / 'promnesia.sqlite')
+    assert p0.norm_url == 'demo.com/page0.html'
+    assert p1.norm_url == 'patched.com'
+    assert e2.norm_url == '<error>'
+    assert p41 == p42
+    assert isinstance(p6, DbVisit)
+    assert p6.locator is not None
+
+
+def test_example_config(tmp_path: Path) -> None:
+    if _is_windows:
+        pytest.skip("doesn't work on Windows: example config references /usr/include paths")
+
+    config = read_example_config() + '\n' + f'OUTPUT_DIR = "{str(tmp_path)}"'
+    cfg_path = tmp_path / 'example_config.py'
+    cfg_path.write_text(config)
+
+    do_index(cfg_path)
+
+    visits = [v for v in get_all_db_visits(tmp_path / 'promnesia.sqlite') if v.src != 'error']
+    assert len(visits) > 50  # random sanity check