promnesia 1.2.20230515__py3-none-any.whl → 1.3.20241021__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. promnesia/__init__.py +14 -3
  2. promnesia/__main__.py +60 -35
  3. promnesia/cannon.py +27 -27
  4. promnesia/common.py +85 -67
  5. promnesia/compare.py +21 -22
  6. promnesia/compat.py +10 -10
  7. promnesia/config.py +23 -23
  8. promnesia/database/common.py +67 -0
  9. promnesia/database/dump.py +188 -0
  10. promnesia/{read_db.py → database/load.py} +16 -17
  11. promnesia/extract.py +14 -11
  12. promnesia/kjson.py +12 -11
  13. promnesia/logging.py +4 -4
  14. promnesia/misc/__init__.pyi +0 -0
  15. promnesia/misc/config_example.py +1 -2
  16. promnesia/misc/install_server.py +7 -9
  17. promnesia/server.py +57 -47
  18. promnesia/sources/__init__.pyi +0 -0
  19. promnesia/sources/auto.py +50 -35
  20. promnesia/sources/auto_logseq.py +6 -5
  21. promnesia/sources/auto_obsidian.py +2 -2
  22. promnesia/sources/browser.py +14 -9
  23. promnesia/sources/browser_legacy.py +26 -16
  24. promnesia/sources/demo.py +19 -3
  25. promnesia/sources/fbmessenger.py +3 -2
  26. promnesia/sources/filetypes.py +16 -7
  27. promnesia/sources/github.py +7 -9
  28. promnesia/sources/guess.py +2 -1
  29. promnesia/sources/hackernews.py +2 -2
  30. promnesia/sources/hpi.py +2 -2
  31. promnesia/sources/html.py +7 -5
  32. promnesia/sources/hypothesis.py +4 -3
  33. promnesia/sources/instapaper.py +2 -2
  34. promnesia/sources/markdown.py +31 -21
  35. promnesia/sources/org.py +27 -13
  36. promnesia/sources/plaintext.py +30 -29
  37. promnesia/sources/pocket.py +3 -2
  38. promnesia/sources/reddit.py +20 -19
  39. promnesia/sources/roamresearch.py +2 -1
  40. promnesia/sources/rss.py +4 -5
  41. promnesia/sources/shellcmd.py +19 -6
  42. promnesia/sources/signal.py +33 -24
  43. promnesia/sources/smscalls.py +2 -2
  44. promnesia/sources/stackexchange.py +4 -3
  45. promnesia/sources/takeout.py +76 -9
  46. promnesia/sources/takeout_legacy.py +24 -12
  47. promnesia/sources/telegram.py +13 -11
  48. promnesia/sources/telegram_legacy.py +18 -7
  49. promnesia/sources/twitter.py +6 -5
  50. promnesia/sources/vcs.py +5 -3
  51. promnesia/sources/viber.py +10 -9
  52. promnesia/sources/website.py +4 -4
  53. promnesia/sources/zulip.py +3 -2
  54. promnesia/sqlite.py +7 -4
  55. promnesia/tests/__init__.py +0 -0
  56. promnesia/tests/common.py +140 -0
  57. promnesia/tests/server_helper.py +67 -0
  58. promnesia/tests/sources/__init__.py +0 -0
  59. promnesia/tests/sources/test_auto.py +65 -0
  60. promnesia/tests/sources/test_filetypes.py +43 -0
  61. promnesia/tests/sources/test_hypothesis.py +39 -0
  62. promnesia/tests/sources/test_org.py +64 -0
  63. promnesia/tests/sources/test_plaintext.py +25 -0
  64. promnesia/tests/sources/test_shellcmd.py +21 -0
  65. promnesia/tests/sources/test_takeout.py +56 -0
  66. promnesia/tests/test_cannon.py +325 -0
  67. promnesia/tests/test_cli.py +40 -0
  68. promnesia/tests/test_compare.py +30 -0
  69. promnesia/tests/test_config.py +289 -0
  70. promnesia/tests/test_db_dump.py +222 -0
  71. promnesia/tests/test_extract.py +65 -0
  72. promnesia/tests/test_extract_urls.py +43 -0
  73. promnesia/tests/test_indexer.py +251 -0
  74. promnesia/tests/test_server.py +291 -0
  75. promnesia/tests/test_traverse.py +39 -0
  76. promnesia/tests/utils.py +35 -0
  77. {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/METADATA +15 -18
  78. promnesia-1.3.20241021.dist-info/RECORD +83 -0
  79. {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/WHEEL +1 -1
  80. {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/entry_points.txt +0 -1
  81. promnesia/dump.py +0 -105
  82. promnesia-1.2.20230515.dist-info/RECORD +0 -58
  83. {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/LICENSE +0 -0
  84. {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/top_level.txt +0 -0
promnesia/tests/test_db_dump.py (new file)
@@ -0,0 +1,222 @@
+ from __future__ import annotations
+
+ from concurrent.futures import ProcessPoolExecutor
+ from datetime import datetime, timedelta, timezone
+ from pathlib import Path
+ from tempfile import TemporaryDirectory
+ from typing import Any
+
+ # NOTE: pytest ... -s --hypothesis-verbosity=debug is useful for seeing what hypothesis is doing
+ import pytest
+ import pytz
+ from hypothesis import given, settings
+ from hypothesis.strategies import from_type
+
+ from ..common import Loc
+ from ..database.common import DbVisit
+ from ..database.dump import visits_to_sqlite
+ from ..database.load import get_all_db_visits
+ from ..sqlite import sqlite_connection
+ from .common import (
+     gc_control,  # noqa: F401
+     running_on_ci,
+ )
+
+ HSETTINGS: dict[str, Any] = {
+     'derandomize': True,
+     'deadline': timedelta(seconds=2),  # sometimes slow on ci
+ }
+
+
+ def test_no_visits(tmp_path: Path) -> None:
+     visits: list[DbVisit] = []
+
+     db = tmp_path / 'db.sqlite'
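+     # visits_to_sqlite reports problems as a list of errors instead of raising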
+     errors = visits_to_sqlite(
+         vit=visits,
+         overwrite_db=True,
+         _db_path=db,
+     )
+     assert db.exists()
+     [err] = errors
+     assert 'No visits were indexed' in str(err)
+
+
+ def test_one_visit(tmp_path: Path) -> None:
+     dt = datetime.fromisoformat('2023-11-14T23:11:01')
+     dt = pytz.timezone('Europe/Warsaw').localize(dt)
+     visit = DbVisit(
+         norm_url='google.com',
+         orig_url='https://google.com',
+         dt=dt,
+         locator=Loc.make(title='title', href='https://whatever.com'),
+         duration=123,
+         src='whatever',
+     )
+
+     visits = [visit]
+
+     db = tmp_path / 'db.sqlite'
+     errors = visits_to_sqlite(
+         vit=visits,
+         overwrite_db=True,
+         _db_path=db,
+     )
+     assert len(errors) == 0
+     assert db.exists()
+
+     with sqlite_connection(db, row_factory='dict') as conn:
+         [sqlite_visit] = conn.execute('SELECT * FROM visits')
+
+     assert sqlite_visit == {
+         'context': None,
+         'dt': '2023-11-14T23:11:01+01:00',
+         'duration': 123,
+         'locator_href': 'https://whatever.com',
+         'locator_title': 'title',
+         'norm_url': 'google.com',
+         'orig_url': 'https://google.com',
+         'src': 'whatever',
+     }
+
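+     # the visit should also survive the write/read round trip unchanged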
+     visits_in_db = get_all_db_visits(db)
+     assert visits_in_db == [visit]
+
+
+
+ def test_read_db_visits(tmp_path: Path) -> None:
+     """
+     Deliberately test against a "hardcoded" database to check for backwards compatibility
+     """
+     db = tmp_path / 'db.sqlite'
+     with sqlite_connection(db) as conn:
+         conn.execute(
+             '''
+             CREATE TABLE visits (
+                 norm_url VARCHAR,
+                 orig_url VARCHAR,
+                 dt VARCHAR,
+                 locator_title VARCHAR,
+                 locator_href VARCHAR,
+                 src VARCHAR,
+                 context VARCHAR,
+                 duration INTEGER
+             );
+             '''
+         )
+         # this dt format (zone name after the iso timestamp) might occur in legacy databases
+         # (that were created when promnesia was using cachew NTBinder)
+         conn.execute(
+             '''
+             INSERT INTO visits VALUES(
+                 'i.redd.it/alala.jpg',
+                 'https://i.redd.it/alala.jpg',
+                 '2019-04-13T11:55:09-04:00 America/New_York',
+                 'Reddit save',
+                 'https://reddit.com/r/whatever',
+                 'reddit',
+                 '',
+                 NULL
+             );
+             '''
+         )
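+     # note how the zone name doesn't survive the load: only the fixed utc offset does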
+     [visit_in_db] = get_all_db_visits(db)
+     assert visit_in_db == DbVisit(
+         norm_url='i.redd.it/alala.jpg',
+         orig_url='https://i.redd.it/alala.jpg',
+         dt=datetime(2019, 4, 13, 11, 55, 9, tzinfo=timezone(timedelta(hours=-4))),
+         locator=Loc.make(title='Reddit save', href='https://reddit.com/r/whatever'),
+         src='reddit',
+         context='',
+     )
+
+
+ def _test_random_visit_aux(visit: DbVisit, tmp_path: Path) -> None:
+     db = tmp_path / 'db.sqlite'
+     errors = visits_to_sqlite(
+         vit=[visit],
+         overwrite_db=True,
+         _db_path=db,
+     )
+     assert db.exists()
+     assert len(errors) == 0, errors
+     # TODO query the db?
+
+
+ @given(
+     visit=from_type(DbVisit).filter(
+         # if duration is too big it fails to insert in sqlite
+         lambda v: (v.duration is None or 0 <= v.duration <= 10**5)
+     )
+ )
+ @settings(**HSETTINGS, max_examples=100)
+ def test_random_visit(visit: DbVisit) -> None:
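+     # not using the tmp_path fixture here: it's function-scoped, i.e. created once
+     # per test rather than once per hypothesis example, so we roll our own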
+     with TemporaryDirectory() as tdir:
+         tmp_path = Path(tdir)
+         _test_random_visit_aux(visit=visit, tmp_path=tmp_path)
+
+
+ _dt_naive = datetime.fromisoformat('2023-11-14T23:11:01')
+ _dt_aware = pytz.timezone('America/New_York').localize(_dt_naive)
+
+
+ def make_testvisit(i: int) -> DbVisit:
+     return DbVisit(
+         norm_url=f'google.com/{i}',
+         orig_url=f'https://google.com/{i}',
+         dt=(_dt_naive if i % 2 == 0 else _dt_aware) + timedelta(seconds=i),
+         locator=Loc.make(title=f'title{i}', href=f'https://whatever.com/{i}'),
+         duration=i,
+         src='whatever',
+     )
+
+
+ @pytest.mark.parametrize('count', [99, 100_000, 1_000_000])
+ @pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
+ def test_benchmark_visits_dumping(count: int, gc_control, tmp_path: Path) -> None:
+     # [20231212] testing different CHUNK_BY values with 1_000_000 visits on @karlicoss desktop pc
+     # 1: 25s (perhaps most overhead is from temporary lists?)
+     # 10 (current default): 8s
+     # 100: 6s
+     # 1000: 6s
+     # TODO maybe consider changing default to 100?
+     if count > 99 and running_on_ci:
+         pytest.skip("test would be too slow on CI, only meant to run manually")
+
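+     # note: a generator, so visits are created lazily as they are dumped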
+     visits = (make_testvisit(i) for i in range(count))
+     db = tmp_path / 'db.sqlite'
+     errors = visits_to_sqlite(  # TODO maybe this method should return db stats? would make testing easier
+         vit=visits,
+         overwrite_db=True,
+         _db_path=db,
+     )
+     assert db.exists()
+     assert len(errors) == 0, errors
+
+
+ def _populate_db(db_path: Path, *, overwrite_db: bool, count: int) -> None:
+     visits = [make_testvisit(i) for i in range(count)]
+     errors = visits_to_sqlite(visits, _db_path=db_path, overwrite_db=overwrite_db)
+     assert len(errors) == 0
+
+
+ @pytest.mark.parametrize('mode', ['update', 'overwrite'])
+ def test_concurrent(tmp_path: Path, mode: str) -> None:
+     overwrite_db = {'overwrite': True, 'update': False}[mode]
+
+     db_path = tmp_path / 'db.sqlite'
+     # do initial indexing to initialize the db
+     _populate_db(db_path, overwrite_db=True, count=1)
+     assert db_path.exists()  # just in case
+
+     # this simply tests correctness by running many concurrent indexers
+     parallel = 100  # 100 indexers
+     with ProcessPoolExecutor(max_workers=8) as pool:
+         futures = []
+         for _ in range(parallel):
+             futures.append(pool.submit(_populate_db, db_path, overwrite_db=overwrite_db, count=1_000))
+         for f in futures:
+             f.result()
+     assert db_path.exists()  # just in case
+
+
+ # TODO test to make sure db is readable while we're indexing?
+ # kinda nicer version of test_query_while_indexing
promnesia/tests/test_extract.py (new file)
@@ -0,0 +1,65 @@
+ from datetime import datetime, timezone
+
+ import pytest
+ from more_itertools import ilen
+
+ from ..common import DbVisit, Loc, Source, Visit
+ from ..extract import extract_visits
+ from .common import (
+     gc_control,  # noqa: F401
+     get_testdata,
+     running_on_ci,
+     unwrap,
+ )
+
+
+ def test_with_error() -> None:
+     class ExtractionError(Exception):
+         pass
+
+     def indexer():
+         yield Visit(url='http://test1', dt=datetime.fromtimestamp(0, tz=timezone.utc), locator=Loc.make('whatever'))
+         yield ExtractionError()
+         yield Visit(url='http://test2', dt=datetime.fromtimestamp(0, tz=timezone.utc), locator=Loc.make('whatever'))
+
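+     # extract_visits passes exceptions through as values (the Res pattern),
+     # so one bad entry doesn't abort the rest of the extraction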
+     [v1, e, v2] = extract_visits(source=Source(indexer), src='whatever')
+     assert isinstance(v1, DbVisit)
+     assert isinstance(e, Exception)
+     assert isinstance(v2, DbVisit)
+
+
+ def test_urls_are_normalised() -> None:
+     # generally this stuff is covered by cannon tests, but good to check it's actually inserted in the db
+     # TODO maybe this should be a separate test which takes DbVisit.make separately?
+     # especially to decouple from shellcmd source
+     from ..sources import shellcmd
+     from ..sources.plaintext import extract_from_path
+
+     visits = list(extract_visits(
+         source=Source(shellcmd.index, extract_from_path(get_testdata('normalise'))),
+         src='whatever',
+     ))
+     assert len(visits) == 7
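+     # 7 raw visits, but they collapse into just 5 distinct normalised urls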
+
+     assert {unwrap(v).norm_url for v in visits} == {
+         'hi.com',
+         'reddit.com/post',
+         'argos.co.uk/webapp/wcs/stores/servlet/OrderItemDisplay',
+         'youtube.com/watch?v=XXlZfc1TrD0',
+         'youtube.com/watch?v=XXlZfc1Tr11',
+     }
+
+
+ @pytest.mark.parametrize('count', [99, 100_000, 1_000_000])
+ @pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
+ def test_benchmark(count: int, gc_control) -> None:
+     # NOTE: at the moment most time is spent canonifying urls, so not much point optimizing this in isolation
+     # TODO maybe could specify custom cannonifying strategy that doesn't do anything to isolate benchmark
+     if count > 99 and running_on_ci:
+         pytest.skip("test would be too slow on CI, only meant to run manually")
+
+     from ..sources import demo
+     source = Source(demo.index, count=count)
+
+     total = ilen(extract_visits(source=source, src='whatever'))
+     assert total == count  # sanity check
promnesia/tests/test_extract_urls.py (new file)
@@ -0,0 +1,43 @@
+ from ..common import extract_urls
+
+
+ def test_extract_simple() -> None:
+     lines = """
+     I've enjoyed [Chandler Carruth's _There Are No Zero-cost Abstractions_](
+     https://www.youtube.com/watch?v=rHIkrotSwcc) very much.
+     """.strip()
+     assert set(extract_urls(lines)) == {'https://www.youtube.com/watch?v=rHIkrotSwcc'}
+
+
+ def test_extract_2() -> None:
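+     # Russian text: checks that both bare domains and full urls are extracted
+     # from non-latin prose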
+     text = '''♂️ Чтобы снизить вероятность ошибиться, важно знать про когнитивные искажения.
+     Если для вас это новое словосочетание, начните с книжки
+     "Гарри Поттер и Методы рационального мышления" - http://hpmor.ru/, если вы знакомы с понятием - читайте цепочки на сайтах
+     lesswrong.ru и lesswrong.com, книжку Даниэля Канемана "Thinking, fast and slow" и канал Пион https://t.me/ontologics
+     '''
+     assert set(extract_urls(text)) == {'http://hpmor.ru/', 'lesswrong.ru', 'lesswrong.com', 'https://t.me/ontologics'}
+
+
+ def test_extract_md() -> None:
+     lines = '''
+     Hey, I recently implemented a new extension for that [addons.mozilla.org](https://addons.mozilla.org/en-US/firefox/addon/org-grasp-for-org-capture/), [github](https://github.com/karlicoss/grasp), perhaps it could be useful for you!
+     '''
+     assert set(extract_urls(lines)) == {
+         'addons.mozilla.org',
+         'https://addons.mozilla.org/en-US/firefox/addon/org-grasp-for-org-capture/',
+         'https://github.com/karlicoss/grasp',
+     }
+
+
+ # just random links to test multiline/whitespace behaviour
+ def test_extract_3() -> None:
+     lines = '''
+     python.org/one.html ?? https://python.org/two.html some extra text
+
+     whatever.org
+     '''
+     assert set(extract_urls(lines, syntax='org')) == {
+         'python.org/one.html',
+         'https://python.org/two.html',
+         'whatever.org',
+     }
promnesia/tests/test_indexer.py (new file)
@@ -0,0 +1,251 @@
+ from collections import Counter
+ from pathlib import Path
+ from subprocess import Popen, check_call
+
+ import pytest
+
+ from ..__main__ import do_index, read_example_config
+ from ..common import DbVisit, _is_windows
+ from ..database.load import get_all_db_visits
+ from .common import (
+     get_testdata,
+     promnesia_bin,
+     reset_filters,  # noqa: F401
+     write_config,
+ )
+
+
+ def get_stats(tmp_path: Path) -> Counter:
+     visits = get_all_db_visits(tmp_path / 'promnesia.sqlite')
+     return Counter(v.src for v in visits)
+
+
+ @pytest.mark.parametrize('mode', ['update', 'overwrite'])
+ def test_indexing_mode(tmp_path: Path, mode: str) -> None:
+     # ugh. we modify the config very fast during tests
+     # and pycache distinguishes identical filenames based on int mtime in seconds
+     # so best to use different names to prevent undesired caching
+     # https://github.com/python/cpython/blob/fb202af4470d6051a69bb9d2f44d7e8a1c99eb4f/Lib/importlib/_bootstrap_external.py#L714-L739
+     # TODO could probably relax that if we switch from importlib config loading to exec()?
+
+     def cfg1() -> None:
+         from promnesia.common import Source
+         from promnesia.sources import demo
+
+         SOURCES = [
+             Source(demo.index, count=10, base_dt='2000-01-01', delta=30, name='demo1'),
+             Source(demo.index, count=20, base_dt='2001-01-01', delta=30, name='demo2'),
+         ]
+
+     cfg_path = tmp_path / 'config1.py'
+     write_config(cfg_path, cfg1)
+     do_index(cfg_path)
+
+     stats = get_stats(tmp_path)
+     assert stats == {'demo1': 10, 'demo2': 20}
+
+     def cfg2() -> None:
+         from promnesia.common import Source
+         from promnesia.sources import demo
+
+         SOURCES = [
+             Source(demo.index, count=30, base_dt='2005-01-01', delta=30, name='demo2'),
+             Source(demo.index, count=40, base_dt='2010-01-01', delta=30, name='demo3'),
+         ]
+
+     cfg_path = tmp_path / 'config2.py'
+     write_config(cfg_path, cfg2)
+     do_index(cfg_path, overwrite_db={'overwrite': True, 'update': False}[mode])
+     # TODO use some sort of test helper?
+     stats = get_stats(tmp_path)
+
+     if mode == 'update':
+         # should keep the original visits too!
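+         # (demo2 goes 20 -> 30: update replaces visits per source rather than appending)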
+         assert stats == {'demo1': 10, 'demo2': 30, 'demo3': 40}
+     else:
+         # should overwrite with newly indexed visits
+         assert stats == {'demo2': 30, 'demo3': 40}
+
+
+ # TODO check both modes?
+ def test_concurrent_indexing(tmp_path: Path) -> None:
+     def cfg_fast() -> None:
+         from promnesia.common import Source
+         from promnesia.sources import demo
+
+         SOURCES = [Source(demo.index, count=10)]
+
+     cfg_fast_path = tmp_path / 'cfg_fast.py'
+     write_config(cfg_fast_path, cfg_fast)
+
+     def cfg_slow() -> None:
+         from promnesia.common import Source
+         from promnesia.sources import demo
+
+         SOURCES = [Source(demo.index, count=100_000)]
+
+     cfg_slow_path = tmp_path / 'cfg_slow.py'
+     write_config(cfg_slow_path, cfg_slow)
+
+     # init it first, to create the database
+     # TODO ideally this shouldn't be necessary but it's reasonable that people would already have the index
+     # otherwise it would fail at db creation point.. which is kinda annoying to work around
+     # todo in principle can work around same way as in cachew, by having a loop around PRAGMA WAL command?
+     check_call(promnesia_bin('index', '--config', cfg_fast_path, '--overwrite'))
+
+     total_runs = 0
+     # run in the background
+     with Popen(promnesia_bin('index', '--config', cfg_slow_path, '--overwrite')) as slow_indexer:
+         while slow_indexer.poll() is None:
+             # create a bunch of 'smaller' indexers running in parallel
+             fasts = [
+                 Popen(promnesia_bin('index', '--config', cfg_fast_path, '--overwrite'))
+                 for _ in range(10)
+             ]
+             for fast in fasts:
+                 assert fast.wait() == 0, fast  # should succeed
+                 total_runs += 1
+         assert slow_indexer.poll() == 0, slow_indexer
+
+     # FIXME ok, need to uncomment this once proper concurrent indexing is supported
+     # if not, slow indexer is too fast, so crank up the count in it
+     # assert total_runs > 20
+
+
+ def test_filter(tmp_path: Path, reset_filters) -> None:
+     domain_to_filter = 'some-weird-domain.xyz'
+     testdata = get_testdata('custom')
+     assert any(domain_to_filter in p.read_text() for p in testdata.glob('*.txt'))  # precondition
+
+     def cfg(testdata, domain_to_filter) -> None:
+         from promnesia.common import Source
+         from promnesia.sources import shellcmd
+         from promnesia.sources.plaintext import extract_from_path
+
+         FILTERS = [
+             domain_to_filter,
+         ]
+
+         SOURCES = [Source(shellcmd.index, extract_from_path(testdata))]
+
+     cfg_path = tmp_path / 'config.py'
+     write_config(cfg_path, cfg, testdata=testdata, domain_to_filter=domain_to_filter)
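+     # (write_config presumably forwards the extra kwargs so the generated
+     # config can refer to them; the helper itself lives in tests/common.py)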
+     do_index(cfg_path)
+
+     visits = get_all_db_visits(tmp_path / 'promnesia.sqlite')
+     urls = {v.orig_url for v in visits}
+     assert not any(domain_to_filter in u for u in urls), urls
+     assert len(visits) == 4  # just in case
+
+
+ def test_weird_urls(tmp_path: Path) -> None:
+     # specifically test this here (rather than in cannon)
+     # to make sure it's not messed up when we insert/extract from sqlite
+
+     def cfg(testdata: str) -> None:
+         from promnesia.common import Source
+         from promnesia.sources import shellcmd
+         from promnesia.sources.plaintext import extract_from_path
+
+         SOURCES = [Source(shellcmd.index, extract_from_path(testdata))]
+
+     cfg_path = tmp_path / 'config.py'
+     write_config(cfg_path, cfg, testdata=get_testdata('weird.txt'))
+     do_index(cfg_path)
+
+     [v1, v2] = get_all_db_visits(tmp_path / 'promnesia.sqlite')
+
+     assert v1.norm_url == "urbandictionary.com/define.php?term=Belgian%20Whistle"
+
+     assert v2.norm_url == "en.wikipedia.org/wiki/Dinic%27s_algorithm"
+     assert v2.locator.title.endswith('weird.txt:2')
+     assert v2.context == 'right, so https://en.wikipedia.org/wiki/Dinic%27s_algorithm can be used for max flow'
+
+
+ def test_errors_during_indexing(tmp_path: Path) -> None:
+     def cfg() -> None:
+         from promnesia.common import Source
+         from promnesia.sources import demo
+
+         def indexer1():
+             visits = list(demo.index(count=10))
+             yield from visits[:5]
+             yield RuntimeError("some error during visits extraction")
+             yield from visits[5:]
+
+         def indexer2():
+             raise RuntimeError("in this case indexer itself crashed")
+
+         SOURCES = [Source(indexer1), Source(indexer2)]
+
+     cfg_path = tmp_path / 'config.py'
+     write_config(cfg_path, cfg)
+     do_index(cfg_path)
+
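+     # both errors are recorded under the special 'error' src,
+     # while the 10 successfully extracted visits still make it in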
+     stats = get_stats(tmp_path)
+     assert stats == {
+         'error': 2,
+         'config': 10,
+     }
+
+
+ def test_hook(tmp_path: Path) -> None:
+     def cfg() -> None:
+         from promnesia.common import Source
+         from promnesia.sources import demo
+
+         SOURCES = [Source(demo.index, count=7, name='somename')]
+
+         from collections.abc import Iterator
+         from typing import cast
+
+         from promnesia.common import DbVisit, Loc, Res
+         from promnesia.sources import demo
+
+         def HOOK(visit: Res[DbVisit]) -> Iterator[Res[DbVisit]]:
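+             # HOOK runs on every extracted visit and can rewrite it, drop it,
+             # emit several copies, or raise, as exercised below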
+             visit = cast(DbVisit, visit)
+
+             # NOTE: might be a good idea to check that the visit is an exception first and yield it intact?
+             nurl = visit.norm_url
+             if 'page1' in nurl:
+                 yield visit._replace(norm_url='patched.com')
+             elif 'page2' in nurl:
+                 raise RuntimeError('boom')  # deliberately crash
+             elif 'page3' in nurl:
+                 # just don't yield anything! it will be omitted
+                 pass
+             elif 'page4' in nurl:
+                 # can emit multiple!
+                 yield visit
+                 yield visit
+             elif 'page6' in nurl:
+                 # patch locator
+                 yield visit._replace(locator=Loc.make(title='some custom title', href='/can/replace/original/path'))
+             else:
+                 yield visit
+
+     cfg_path = tmp_path / 'config.py'
+     write_config(cfg_path, cfg)
+     do_index(cfg_path)
+
+     [p0, p1, e2, p41, p42, p5, p6] = get_all_db_visits(tmp_path / 'promnesia.sqlite')
+     assert p0.norm_url == 'demo.com/page0.html'
+     assert p1.norm_url == 'patched.com'
+     assert e2.norm_url == '<error>'
+     assert p41 == p42
+     assert isinstance(p6, DbVisit)
+     assert p6.locator is not None
+
+
+ def test_example_config(tmp_path: Path) -> None:
+     if _is_windows:
+         pytest.skip("doesn't work on Windows: example config references /usr/include paths")
+
+     config = read_example_config() + '\n' + f'OUTPUT_DIR = "{tmp_path!s}"'
+     cfg_path = tmp_path / 'example_config.py'
+     cfg_path.write_text(config)
+
+     do_index(cfg_path)
+
+     visits = [v for v in get_all_db_visits(tmp_path / 'promnesia.sqlite') if v.src != 'error']
+     assert len(visits) > 50  # random sanity check