promnesia 1.2.20230515__py3-none-any.whl → 1.2.20240810__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. promnesia/__main__.py +26 -14
  2. promnesia/cannon.py +4 -4
  3. promnesia/common.py +39 -28
  4. promnesia/compare.py +3 -2
  5. promnesia/config.py +4 -2
  6. promnesia/database/common.py +66 -0
  7. promnesia/database/dump.py +187 -0
  8. promnesia/{read_db.py → database/load.py} +10 -11
  9. promnesia/extract.py +1 -0
  10. promnesia/kjson.py +1 -1
  11. promnesia/logging.py +3 -3
  12. promnesia/misc/__init__.pyi +0 -0
  13. promnesia/misc/config_example.py +1 -2
  14. promnesia/misc/install_server.py +2 -3
  15. promnesia/server.py +18 -19
  16. promnesia/sources/__init__.pyi +0 -0
  17. promnesia/sources/auto.py +9 -7
  18. promnesia/sources/browser_legacy.py +11 -5
  19. promnesia/sources/demo.py +18 -2
  20. promnesia/sources/filetypes.py +7 -0
  21. promnesia/sources/github.py +2 -2
  22. promnesia/sources/hypothesis.py +1 -1
  23. promnesia/sources/markdown.py +15 -15
  24. promnesia/sources/org.py +7 -3
  25. promnesia/sources/plaintext.py +3 -1
  26. promnesia/sources/reddit.py +2 -2
  27. promnesia/sources/rss.py +1 -1
  28. promnesia/sources/signal.py +22 -14
  29. promnesia/sources/stackexchange.py +2 -2
  30. promnesia/sources/takeout.py +58 -1
  31. promnesia/sources/takeout_legacy.py +10 -2
  32. promnesia/tests/__init__.py +0 -0
  33. promnesia/tests/common.py +137 -0
  34. promnesia/tests/server_helper.py +64 -0
  35. promnesia/tests/sources/__init__.py +0 -0
  36. promnesia/tests/sources/test_auto.py +66 -0
  37. promnesia/tests/sources/test_filetypes.py +42 -0
  38. promnesia/tests/sources/test_hypothesis.py +39 -0
  39. promnesia/tests/sources/test_org.py +65 -0
  40. promnesia/tests/sources/test_plaintext.py +26 -0
  41. promnesia/tests/sources/test_shellcmd.py +22 -0
  42. promnesia/tests/sources/test_takeout.py +58 -0
  43. promnesia/tests/test_cannon.py +325 -0
  44. promnesia/tests/test_cli.py +42 -0
  45. promnesia/tests/test_compare.py +30 -0
  46. promnesia/tests/test_config.py +290 -0
  47. promnesia/tests/test_db_dump.py +223 -0
  48. promnesia/tests/test_extract.py +61 -0
  49. promnesia/tests/test_extract_urls.py +43 -0
  50. promnesia/tests/test_indexer.py +245 -0
  51. promnesia/tests/test_server.py +292 -0
  52. promnesia/tests/test_traverse.py +41 -0
  53. promnesia/tests/utils.py +35 -0
  54. {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/METADATA +13 -17
  55. promnesia-1.2.20240810.dist-info/RECORD +83 -0
  56. {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/WHEEL +1 -1
  57. {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/entry_points.txt +0 -1
  58. promnesia/dump.py +0 -105
  59. promnesia-1.2.20230515.dist-info/RECORD +0 -58
  60. {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/LICENSE +0 -0
  61. {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,223 @@
1
+ from __future__ import annotations
2
+
3
+ from concurrent.futures import ProcessPoolExecutor
4
+ from datetime import datetime, timedelta, timezone
5
+ from pathlib import Path
6
+ from tempfile import TemporaryDirectory
7
+ from typing import Any, Iterable
8
+
9
+
10
+ from hypothesis import settings, given
11
+ from hypothesis.strategies import from_type
12
+ # NOTE: pytest ... -s --hypothesis-verbosity=debug is useful for seeing what hypothesis is doing
13
+ import pytest
14
+ import pytz
15
+
16
+
17
+ from ..common import Loc
18
+ from ..database.common import DbVisit
19
+ from ..database.dump import visits_to_sqlite
20
+ from ..database.load import get_all_db_visits
21
+ from ..sqlite import sqlite_connection
22
+
23
+ from .common import gc_control, running_on_ci
24
+
25
+
26
# Shared hypothesis settings for the property-based tests in this module.
HSETTINGS: dict[str, Any] = {
    'derandomize': True,
    'deadline': timedelta(seconds=2),  # sometimes slow on ci
}
30
+
31
+
32
def test_no_visits(tmp_path: Path) -> None:
    """Dumping an empty visit list still creates the db file but reports one error."""
    visits: list[DbVisit] = []

    db = tmp_path / 'db.sqlite'
    errors = visits_to_sqlite(
        vit=visits,
        overwrite_db=True,
        _db_path=db,
    )
    assert db.exists()
    # exactly one error is expected for an empty index
    # (was `[err] = [errors]`, which wrapped the whole list in another list --
    # the unpack always succeeded and never actually checked the error count)
    [err] = errors
    assert 'No visits were indexed' in str(err)
44
+
45
+
46
def test_one_visit(tmp_path: Path) -> None:
    """
    Roundtrip a single visit: dump to sqlite, check the raw row, then load it back.
    """
    dt = datetime.fromisoformat('2023-11-14T23:11:01')
    dt = pytz.timezone('Europe/Warsaw').localize(dt)  # attach a timezone to the naive dt
    visit = DbVisit(
        norm_url='google.com',
        orig_url='https://google.com',
        dt=dt,
        locator=Loc.make(title='title', href='https://whatever.com'),
        duration=123,
        src='whatever',
    )

    visits = [visit]

    db = tmp_path / 'db.sqlite'
    errors = visits_to_sqlite(
        vit=visits,
        overwrite_db=True,
        _db_path=db,
    )
    assert len(errors) == 0
    assert db.exists()

    # check the raw row as stored in sqlite -- dt is serialized as isoformat with utc offset
    with sqlite_connection(db, row_factory='dict') as conn:
        [sqlite_visit] = conn.execute('SELECT * FROM visits')

    assert sqlite_visit == {
        'context': None,
        'dt': '2023-11-14T23:11:01+01:00',
        'duration': 123,
        'locator_href': 'https://whatever.com',
        'locator_title': 'title',
        'norm_url': 'google.com',
        'orig_url': 'https://google.com',
        'src': 'whatever',
    }

    # loading back via the promnesia API should reconstruct an equal DbVisit
    visits_in_db = get_all_db_visits(db)
    assert visits_in_db == [visit]
85
+
86
+
87
def test_read_db_visits(tmp_path: Path) -> None:
    """
    Deliberately test against "hardcoded" database to check for backwards compatibility
    """
    db = tmp_path / 'db.sqlite'
    with sqlite_connection(db) as conn:
        # recreate the legacy schema by hand rather than via current promnesia code
        conn.execute(
            '''
            CREATE TABLE visits (
                norm_url VARCHAR,
                orig_url VARCHAR,
                dt VARCHAR,
                locator_title VARCHAR,
                locator_href VARCHAR,
                src VARCHAR,
                context VARCHAR,
                duration INTEGER
            );
            '''
        )
        # this dt format (zone name after iso timestamp) might occur in legacy databases
        # (that were created when promnesia was using cachew NTBinder)
        conn.execute(
            '''
            INSERT INTO visits VALUES(
                'i.redd.it/alala.jpg',
                'https://i.redd.it/alala.jpg',
                '2019-04-13T11:55:09-04:00 America/New_York',
                'Reddit save',
                'https://reddit.com/r/whatever',
                'reddit',
                '',
                NULL
            );
            '''
        )
    [visit_in_db] = get_all_db_visits(db)
    # the trailing zone name should be handled; the loaded dt keeps only the utc offset
    assert visit_in_db == DbVisit(
        norm_url='i.redd.it/alala.jpg',
        orig_url='https://i.redd.it/alala.jpg',
        dt=datetime(2019, 4, 13, 11, 55, 9, tzinfo=timezone(timedelta(hours=-4))),
        locator=Loc.make(title='Reddit save', href='https://reddit.com/r/whatever'),
        src='reddit',
        context='',
    )
132
+
133
+
134
def _test_random_visit_aux(visit: DbVisit, tmp_path: Path) -> None:
    """Helper for the hypothesis test: dump a single visit and check it succeeds."""
    db_file = tmp_path / 'db.sqlite'
    errors = visits_to_sqlite(vit=[visit], overwrite_db=True, _db_path=db_file)
    assert db_file.exists()
    assert len(errors) == 0, errors
    # TODO query the db?
144
+
145
+
146
@given(
    visit=from_type(DbVisit).filter(
        # if duration is too big it fails to insert in sqlite
        lambda v: (v.duration is None or 0 <= v.duration <= 10**5)
    )
)
@settings(**HSETTINGS, max_examples=100)
def test_random_visit(visit: DbVisit) -> None:
    """Property test: any hypothesis-generated DbVisit can be dumped without errors."""
    # NOTE(review): uses TemporaryDirectory rather than the tmp_path fixture --
    # presumably because @given re-runs the body many times per test while
    # function-scoped fixtures are set up only once; confirm before changing
    with TemporaryDirectory() as tdir:
        tmp_path = Path(tdir)
        _test_random_visit_aux(visit=visit, tmp_path=tmp_path)
157
+
158
+
159
# Fixed reference datetimes for synthetic test visits: one naive, one tz-aware.
_dt_naive = datetime.fromisoformat('2023-11-14T23:11:01')
_dt_aware = pytz.timezone('America/New_York').localize(_dt_naive)
161
+
162
def make_testvisit(i: int) -> DbVisit:
    """Deterministic synthetic visit; even i gets a naive dt, odd i a tz-aware one."""
    visit_dt = (_dt_naive if i % 2 == 0 else _dt_aware) + timedelta(seconds=i)
    return DbVisit(
        norm_url=f'google.com/{i}',
        orig_url=f'https://google.com/{i}',
        dt=visit_dt,
        locator=Loc.make(title=f'title{i}', href=f'https://whatever.com/{i}'),
        duration=i,
        src='whatever',
    )
171
+
172
+
173
@pytest.mark.parametrize('count', [99, 100_000, 1_000_000])
@pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
def test_benchmark_visits_dumping(count: int, gc_control, tmp_path: Path) -> None:
    """Benchmark dumping `count` synthetic visits to sqlite.

    NOTE(review): the gc_on parameter has no matching argument here -- presumably
    it's consumed by the gc_control fixture; confirm against tests/common.py.
    """
    # [20231212] testing different CHUNK_BY values with 1_000_000 visits on @karlicoss desktop pc
    # 1: 25s (perhaps most overhead is from temporary lists?)
    # 10 (current default): 8s
    # 100: 6s
    # 1000: 6s
    # TODO maybe consider changing default to 100?
    if count > 99 and running_on_ci:
        pytest.skip("test would be too slow on CI, only meant to run manually")

    # generator rather than list -- avoids holding all visits in memory at once
    visits = (make_testvisit(i) for i in range(count))
    db = tmp_path / 'db.sqlite'
    errors = visits_to_sqlite(  # TODO maybe this method should return db stats? would make testing easier
        vit=visits,
        overwrite_db=True,
        _db_path=db,
    )
    assert db.exists()
    assert len(errors) == 0, errors
194
+
195
+
196
def _populate_db(db_path: Path, *, overwrite_db: bool, count: int) -> None:
    """Index `count` synthetic visits into the database at `db_path`."""
    batch = [make_testvisit(i) for i in range(count)]
    errors = visits_to_sqlite(batch, _db_path=db_path, overwrite_db=overwrite_db)
    assert len(errors) == 0
200
+
201
+
202
@pytest.mark.parametrize('mode', ['update', 'overwrite'])
def test_concurrent(tmp_path: Path, mode: str) -> None:
    """Many indexer processes writing to the same db concurrently should all succeed."""
    overwrite_db = {'overwrite': True, 'update': False}[mode]

    db_path = tmp_path / 'db.sqlite'
    # do initial indexing to initialize the db
    _populate_db(db_path, overwrite_db=True, count=1)
    assert db_path.exists()  # just in case

    # this simply tests correctness by running many concurrent indexers
    parallel = 100  # 100 indexers
    with ProcessPoolExecutor(max_workers=8) as pool:
        futures = []
        for _ in range(parallel):
            futures.append(pool.submit(_populate_db, db_path, overwrite_db=overwrite_db, count=1_000))
        for f in futures:
            f.result()  # re-raises here if the worker process failed
    assert db_path.exists()  # just in case
220
+
221
+
222
+ # TODO test to make sure db is readable while we're indexing?
223
+ # kinda nicer version of test_query_while_indexing
@@ -0,0 +1,61 @@
1
+ from datetime import datetime, timezone
2
+
3
+ from ..common import Visit, DbVisit, Loc, Source
4
+ from ..extract import extract_visits
5
+
6
+ from .common import get_testdata, unwrap, running_on_ci, gc_control
7
+
8
+ from more_itertools import ilen
9
+ import pytest
10
+
11
+
12
def test_with_error() -> None:
    """An exception yielded mid-stream is passed through; visits around it survive."""
    class ExtractionError(Exception):
        pass

    epoch = datetime.fromtimestamp(0, tz=timezone.utc)

    def indexer():
        yield Visit(url='http://test1', dt=epoch, locator=Loc.make('whatever'))
        yield ExtractionError()
        yield Visit(url='http://test2', dt=epoch, locator=Loc.make('whatever'))

    [v1, e, v2] = extract_visits(source=Source(indexer), src='whatever')
    assert isinstance(v1, DbVisit)
    assert isinstance(e, Exception)
    assert isinstance(v2, DbVisit)
25
+
26
+
27
def test_urls_are_normalised() -> None:
    """Extracted visits should carry canonicalized norm_url values."""
    # generally this stuff is covered by cannon tests, but good to check it's actually inserted in the db
    # TODO maybe this should be a separate test which takes DbVisit.make separately?
    # especially to decouple from shellcmd source
    from ..sources import shellcmd
    from ..sources.plaintext import extract_from_path

    visits = list(extract_visits(
        source=Source(shellcmd.index, extract_from_path(get_testdata('normalise'))),
        src='whatever',
    ))
    assert len(visits) == 7

    # 7 visits, but several normalise to the same url -- hence only 5 distinct norm_urls
    assert {unwrap(v).norm_url for v in visits} == {
        'hi.com',
        'reddit.com/post',
        'argos.co.uk/webapp/wcs/stores/servlet/OrderItemDisplay',
        'youtube.com/watch?v=XXlZfc1TrD0',
        'youtube.com/watch?v=XXlZfc1Tr11',
    }
47
+
48
+
49
@pytest.mark.parametrize('count', [99, 100_000, 1_000_000])
@pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
def test_benchmark(count: int, gc_control) -> None:
    """Benchmark extract_visits throughput over the demo source."""
    # NOTE: at the moment most time is spent canonifying urls, so not much point optimizing this in isolation
    # TODO maybe could specify custom cannonifying strategy that doesn't do anything to isolate benchmark
    if count > 99 and running_on_ci:
        pytest.skip("test would be too slow on CI, only meant to run manually")

    from ..sources import demo
    source = Source(demo.index, count=count)

    # ilen consumes the iterator without materializing all visits
    total = ilen(extract_visits(source=source, src='whatever'))
    assert total == count  # sanity check
@@ -0,0 +1,43 @@
1
+ from ..common import extract_urls
2
+
3
+
4
def test_extract_simple() -> None:
    """A markdown link whose url continues on the next line is still extracted."""
    text = """
    I've enjoyed [Chandler Carruth's _There Are No Zero-cost Abstractions_](
    https://www.youtube.com/watch?v=rHIkrotSwcc) very much.
    """.strip()
    expected = {'https://www.youtube.com/watch?v=rHIkrotSwcc'}
    assert set(extract_urls(text)) == expected
10
+
11
+
12
def test_extract_2() -> None:
    """Urls embedded in non-latin (Russian) prose are extracted, both with and without scheme."""
    text = '''‍♂️ Чтобы снизить вероятность ошибиться, важно знать про когнитивные искажения.
    Если для вас это новое словосочетание, начните с книжки
    "Гарри Поттер и Методы рационального мышления" - http://hpmor.ru/, если вы знакомы с понятием - читайте цепочки на сайтах
    lesswrong.ru и lesswrong.com, книжку Даниэля Канемана "Thinking, fast and slow" и канал Пион https://t.me/ontologics
    '''
    assert set(extract_urls(text)) == {'http://hpmor.ru/', 'lesswrong.ru', 'lesswrong.com', 'https://t.me/ontologics'}
19
+
20
+
21
def test_extract_md() -> None:
    """Both the markdown link text (bare domain) and its target url are extracted."""
    text = '''
    Hey, I recently implemented a new extension for that [addons.mozilla.org](https://addons.mozilla.org/en-US/firefox/addon/org-grasp-for-org-capture/), [github](https://github.com/karlicoss/grasp), perhaps it could be useful for you!
    '''
    expected = {
        'addons.mozilla.org',
        'https://addons.mozilla.org/en-US/firefox/addon/org-grasp-for-org-capture/',
        'https://github.com/karlicoss/grasp',
    }
    assert set(extract_urls(text)) == expected
30
+
31
+
32
# just random links to test multiline/whitespace behaviour
def test_extract_3() -> None:
    """Org-syntax extraction across multiple lines and surrounding junk."""
    text = '''
    python.org/one.html ?? https://python.org/two.html some extra text

    whatever.org
    '''
    expected = {
        'python.org/one.html',
        'https://python.org/two.html',
        'whatever.org',
    }
    assert set(extract_urls(text, syntax='org')) == expected
@@ -0,0 +1,245 @@
1
+ from collections import Counter
2
+ from pathlib import Path
3
+ from subprocess import check_call, Popen
4
+
5
+ from ..__main__ import do_index, read_example_config
6
+ from ..common import DbVisit, _is_windows
7
+ from ..database.load import get_all_db_visits
8
+
9
+ import pytest
10
+
11
+ from .common import get_testdata, promnesia_bin, reset_filters, write_config
12
+
13
+
14
+ def get_stats(tmp_path: Path) -> Counter:
15
+ visits = get_all_db_visits(tmp_path / 'promnesia.sqlite')
16
+ return Counter(v.src for v in visits)
17
+
18
+
19
@pytest.mark.parametrize('mode', ['update', 'overwrite'])
def test_indexing_mode(tmp_path: Path, mode: str) -> None:
    """Reindexing in 'update' mode keeps old visits; 'overwrite' mode replaces them."""
    # ugh. we modify the config very fast during tests
    # and pycache distinguishes identical filenames based on int mtime in seconds
    # so best to use different names to prevent undesired caching
    # https://github.com/python/cpython/blob/fb202af4470d6051a69bb9d2f44d7e8a1c99eb4f/Lib/importlib/_bootstrap_external.py#L714-L739
    # TODO could probably relax that if we switch from importlib config loading to exec()?

    # NOTE: cfg1/cfg2 bodies are serialized to a config file via write_config,
    # so their local SOURCES assignments are not dead code
    def cfg1() -> None:
        from promnesia.common import Source
        from promnesia.sources import demo

        SOURCES = [
            Source(demo.index, count=10, base_dt='2000-01-01', delta=30, name='demo1'),
            Source(demo.index, count=20, base_dt='2001-01-01', delta=30, name='demo2'),
        ]

    cfg_path = tmp_path / 'config1.py'
    write_config(cfg_path, cfg1)
    do_index(cfg_path)

    stats = get_stats(tmp_path)
    assert stats == {'demo1': 10, 'demo2': 20}

    def cfg2() -> None:
        from promnesia.common import Source
        from promnesia.sources import demo

        SOURCES = [
            Source(demo.index, count=30, base_dt='2005-01-01', delta=30, name='demo2'),
            Source(demo.index, count=40, base_dt='2010-01-01', delta=30, name='demo3'),
        ]

    cfg_path = tmp_path / 'config2.py'
    write_config(cfg_path, cfg2)
    do_index(cfg_path, overwrite_db={'overwrite': True, 'update': False}[mode])
    # TODO use some sort of test helper?
    stats = get_stats(tmp_path)

    if mode == 'update':
        # should keep the original visits too!
        assert stats == {'demo1': 10, 'demo2': 30, 'demo3': 40}
    else:
        # should overwrite with newly indexed visits
        assert stats == {'demo2': 30, 'demo3': 40}
64
+
65
+
66
+ # TODO check both modes?
67
+ def test_concurrent_indexing(tmp_path: Path) -> None:
68
+ def cfg_fast() -> None:
69
+ from promnesia.common import Source
70
+ from promnesia.sources import demo
71
+
72
+ SOURCES = [Source(demo.index, count=10)]
73
+
74
+ cfg_fast_path = tmp_path / 'cfg_fast.py'
75
+ write_config(cfg_fast_path, cfg_fast)
76
+
77
+ def cfg_slow() -> None:
78
+ from promnesia.common import Source
79
+ from promnesia.sources import demo
80
+
81
+ SOURCES = [Source(demo.index, count=100_000)]
82
+
83
+ cfg_slow_path = tmp_path / 'cfg_slow.py'
84
+ write_config(cfg_slow_path, cfg_slow)
85
+
86
+ # init it first, to create the database
87
+ # TODO ideally this shouldn't be necessary but it's reasonable that people would already have the index
88
+ # otherwise it would fail at db creation point.. which is kinda annoying to work around
89
+ # todo in principle can work around same way as in cachew, by having a loop around PRAGMA WAL command?
90
+ check_call(promnesia_bin('index', '--config', cfg_fast_path, '--overwrite'))
91
+
92
+ total_runs = 0
93
+ # run in the background
94
+ with Popen(promnesia_bin('index', '--config', cfg_slow_path, '--overwrite')) as slow_indexer:
95
+ while slow_indexer.poll() is None:
96
+ # create a bunch of 'smaller' indexers running in parallel
97
+ fasts = [
98
+ Popen(promnesia_bin('index', '--config', cfg_fast_path, '--overwrite'))
99
+ for _ in range(10)
100
+ ]
101
+ for fast in fasts:
102
+ assert fast.wait() == 0, fast # should succeed
103
+ total_runs += 1
104
+ assert slow_indexer.poll() == 0, slow_indexer
105
+
106
+ # FIXME ok, need to uncomment this once proper concurrent indexing is supported
107
+ # if not, slow indexer is too fast, so crank up the count in it
108
+ # assert total_runs > 20
109
+
110
+
111
def test_filter(tmp_path: Path, reset_filters) -> None:
    """FILTERS in the config should exclude matching urls from the index."""
    domain_to_filter = 'some-weird-domain.xyz'
    testdata = get_testdata('custom')
    assert any(domain_to_filter in p.read_text() for p in testdata.glob('*.txt'))  # precondition

    # serialized to a config file via write_config (with the kwargs below substituted)
    def cfg(testdata, domain_to_filter) -> None:
        from promnesia.common import Source
        from promnesia.sources import shellcmd
        from promnesia.sources.plaintext import extract_from_path

        FILTERS = [
            domain_to_filter,
        ]

        SOURCES = [Source(shellcmd.index, extract_from_path(testdata))]

    cfg_path = tmp_path / 'config.py'
    write_config(cfg_path, cfg, testdata=testdata, domain_to_filter=domain_to_filter)
    do_index(cfg_path)

    visits = get_all_db_visits(tmp_path / 'promnesia.sqlite')
    urls = {v.orig_url for v in visits}
    assert not any(domain_to_filter in u for u in urls), urls
    assert len(visits) == 4  # just in case
135
+
136
+
137
def test_weird_urls(tmp_path: Path) -> None:
    """Urls with percent-encoding survive the sqlite roundtrip intact."""
    # specifically test this here (rather than in cannon)
    # to make sure it's not messed up when we insert/extract from sqlite

    def cfg(testdata: str) -> None:
        from promnesia.common import Source
        from promnesia.sources import shellcmd
        from promnesia.sources.plaintext import extract_from_path

        SOURCES = [Source(shellcmd.index, extract_from_path(testdata))]

    cfg_path = tmp_path / 'config.py'
    write_config(cfg_path, cfg, testdata=get_testdata('weird.txt'))
    do_index(cfg_path)

    [v1, v2] = get_all_db_visits(tmp_path / 'promnesia.sqlite')

    # %20 (space) must not be decoded or double-encoded
    assert v1.norm_url == "urbandictionary.com/define.php?term=Belgian%20Whistle"

    # %27 (apostrophe) must survive sqlite quoting
    assert v2.norm_url == "en.wikipedia.org/wiki/Dinic%27s_algorithm"
    assert v2.locator.title.endswith('weird.txt:2')
    assert v2.context == 'right, so https://en.wikipedia.org/wiki/Dinic%27s_algorithm can be used for max flow'
159
+
160
+
161
def test_errors_during_indexing(tmp_path: Path) -> None:
    """Both yielded errors and crashing indexers end up as 'error' visits in the db."""
    def cfg() -> None:
        from promnesia.common import Source
        from promnesia.sources import demo

        # yields an error in the middle of otherwise good visits
        def indexer1():
            visits = list(demo.index(count=10))
            yield from visits[:5]
            yield RuntimeError("some error during visits extraction")
            yield from visits[5:]

        # crashes before yielding anything at all
        def indexer2():
            raise RuntimeError("in this case indexer itself crashed")

        SOURCES = [Source(indexer1), Source(indexer2)]

    cfg_path = tmp_path / 'config.py'
    write_config(cfg_path, cfg)
    do_index(cfg_path)

    stats = get_stats(tmp_path)
    # one 'error' visit per failure; the 10 good visits are indexed under 'config'
    assert stats == {
        'error': 2,
        'config': 10,
    }
186
+
187
+
188
def test_hook(tmp_path: Path) -> None:
    """HOOK in the config can patch, drop, multiply, or fail individual visits."""
    def cfg() -> None:
        from promnesia.common import Source
        from promnesia.sources import demo

        SOURCES = [Source(demo.index, count=7, name='somename')]

        from typing import cast, Iterator
        from promnesia.common import DbVisit, Loc, Res

        def HOOK(visit: Res[DbVisit]) -> Iterator[Res[DbVisit]]:
            visit = cast(DbVisit, visit)

            # NOTE: might be a good idea to check that the visit is an exception first and yield it intact?
            nurl = visit.norm_url
            if 'page1' in nurl:
                yield visit._replace(norm_url='patched.com')
            elif 'page2' in nurl:
                raise Exception('boom')  # deliberately crash
            elif 'page3' in nurl:
                # just don't yield anything! it will be omitted
                pass
            elif 'page4' in nurl:
                # can emit multiple!
                yield visit
                yield visit
            elif 'page6' in nurl:
                # patch locator
                yield visit._replace(locator=Loc.make(title='some custom timte', href='/can/replace/original/path'))
            else:
                yield visit

    cfg_path = tmp_path / 'config.py'
    write_config(cfg_path, cfg)
    do_index(cfg_path)

    # 7 demo visits -> page2 becomes an error, page3 is dropped, page4 is doubled
    [p0, p1, e2, p41, p42, p5, p6] = get_all_db_visits(tmp_path / 'promnesia.sqlite')
    assert p0.norm_url == 'demo.com/page0.html'
    assert p1.norm_url == 'patched.com'
    assert e2.norm_url == '<error>'
    assert p41 == p42
    assert isinstance(p6, DbVisit)
    assert p6.locator is not None
232
+
233
+
234
def test_example_config(tmp_path: Path) -> None:
    """The bundled example config should index successfully end to end."""
    if _is_windows:
        pytest.skip("doesn't work on Windows: example config references /usr/include paths")

    cfg_path = tmp_path / 'example_config.py'
    cfg_text = read_example_config() + '\n' + f'OUTPUT_DIR = "{str(tmp_path)}"'
    cfg_path.write_text(cfg_text)

    do_index(cfg_path)

    indexed = get_all_db_visits(tmp_path / 'promnesia.sqlite')
    visits = [v for v in indexed if v.src != 'error']
    assert len(visits) > 50  # random sanity check