promnesia 1.1.20230129__py3-none-any.whl → 1.2.20240810__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
- promnesia/__main__.py +58 -50
- promnesia/cannon.py +4 -4
- promnesia/common.py +57 -38
- promnesia/compare.py +3 -2
- promnesia/compat.py +6 -65
- promnesia/config.py +4 -2
- promnesia/database/common.py +66 -0
- promnesia/database/dump.py +187 -0
- promnesia/{read_db.py → database/load.py} +10 -11
- promnesia/extract.py +1 -0
- promnesia/kjson.py +1 -1
- promnesia/logging.py +14 -14
- promnesia/misc/__init__.pyi +0 -0
- promnesia/misc/config_example.py +1 -2
- promnesia/misc/install_server.py +5 -4
- promnesia/server.py +24 -24
- promnesia/sources/__init__.pyi +0 -0
- promnesia/sources/auto.py +12 -7
- promnesia/sources/browser.py +80 -293
- promnesia/sources/browser_legacy.py +298 -0
- promnesia/sources/demo.py +18 -2
- promnesia/sources/filetypes.py +8 -0
- promnesia/sources/github.py +2 -2
- promnesia/sources/hackernews.py +1 -2
- promnesia/sources/hypothesis.py +1 -1
- promnesia/sources/markdown.py +15 -15
- promnesia/sources/org.py +7 -3
- promnesia/sources/plaintext.py +3 -1
- promnesia/sources/reddit.py +2 -2
- promnesia/sources/rss.py +5 -1
- promnesia/sources/shellcmd.py +6 -2
- promnesia/sources/signal.py +29 -20
- promnesia/sources/smscalls.py +8 -1
- promnesia/sources/stackexchange.py +2 -2
- promnesia/sources/takeout.py +132 -12
- promnesia/sources/takeout_legacy.py +10 -2
- promnesia/sources/telegram.py +79 -123
- promnesia/sources/telegram_legacy.py +117 -0
- promnesia/sources/vcs.py +1 -1
- promnesia/sources/viber.py +6 -15
- promnesia/sources/website.py +1 -1
- promnesia/sqlite.py +42 -0
- promnesia/tests/__init__.py +0 -0
- promnesia/tests/common.py +137 -0
- promnesia/tests/server_helper.py +64 -0
- promnesia/tests/sources/__init__.py +0 -0
- promnesia/tests/sources/test_auto.py +66 -0
- promnesia/tests/sources/test_filetypes.py +42 -0
- promnesia/tests/sources/test_hypothesis.py +39 -0
- promnesia/tests/sources/test_org.py +65 -0
- promnesia/tests/sources/test_plaintext.py +26 -0
- promnesia/tests/sources/test_shellcmd.py +22 -0
- promnesia/tests/sources/test_takeout.py +58 -0
- promnesia/tests/test_cannon.py +325 -0
- promnesia/tests/test_cli.py +42 -0
- promnesia/tests/test_compare.py +30 -0
- promnesia/tests/test_config.py +290 -0
- promnesia/tests/test_db_dump.py +223 -0
- promnesia/tests/test_extract.py +61 -0
- promnesia/tests/test_extract_urls.py +43 -0
- promnesia/tests/test_indexer.py +245 -0
- promnesia/tests/test_server.py +292 -0
- promnesia/tests/test_traverse.py +41 -0
- promnesia/tests/utils.py +35 -0
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/METADATA +14 -19
- promnesia-1.2.20240810.dist-info/RECORD +83 -0
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/WHEEL +1 -1
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/entry_points.txt +0 -1
- promnesia/dump.py +0 -105
- promnesia-1.1.20230129.dist-info/RECORD +0 -55
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/LICENSE +0 -0
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/top_level.txt +0 -0
promnesia/tests/test_config.py
@@ -0,0 +1,290 @@
+from contextlib import contextmanager
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import Union, List
+
+from ..common import Source
+from ..config import import_config, Config
+
+
+from more_itertools import ilen
+import pytest
+
+from .common import throw
+
+
+def make(body: str) -> Config:
+    with TemporaryDirectory() as td:
+        tdir = Path(td)
+        cp = tdir / 'cfg.py'
+        cp.write_text(body)
+        return import_config(cp)
+
+
+@contextmanager
+def with_config(cfg: Union[str, Config]):
+    from .. import config as C
+
+    assert not C.has()
+    cfg2: Config = make(cfg) if isinstance(cfg, str) else cfg
+    try:
+        C.instance = cfg2
+        assert C.has()
+        yield
+    finally:
+        C.reset()
+
+
+def index(cfg: Union[str, Config], check=True) -> List[Exception]:
+    from ..__main__ import _do_index
+
+    with with_config(cfg):
+        errors = list(_do_index())
+    if check:
+        assert len(errors) == 0, errors
+    # visits = cfg.output_dir / 'promnesia.sqlite'
+    # TODO query visit count too
+    return errors
+
+
+def test_minimal() -> None:
+    '''
+    Example of a smallest possible config, using a 'demo' source
+    '''
+    # import directly from promnesia, not promnesia.common
+    cfg = make(
+        '''
+from promnesia import Source
+from promnesia.sources import demo
+
+SOURCES = [
+    Source(demo.index),
+]
+'''
+    )
+    assert ilen(cfg.sources) == 1
+    assert all(isinstance(s, Source) for s in cfg.sources)
+    # todo output dirs?
+    index(cfg)
+
+
+def test_sources_style_1() -> None:
+    '''
+    Testing 'styles' of specifying sources
+    '''
+    cfg = make(
+        '''
+from promnesia.common import Source
+from promnesia.sources import demo
+
+SOURCES = [
+    # you can pass arguments to index functions
+    Source(demo.index, count=10, name='explicit name'),
+
+    # or rely on the default argument!
+    Source(demo.index, name='another name'),
+
+    # or rely on default source name name (will be guessed as 'demo')
+    Source(demo.index),
+
+    # rely on default index function
+    Source(demo),
+
+    # no need for Source() either!
+    demo.index,
+    demo,
+
+    # I guess this is as simple as it possibly gets...
+    'promnesia.sources.demo',
+
+    # just in case, test lambdas
+    # with list
+    lambda: list(demo.index()),
+
+    # with generator
+    lambda: iter(list(demo.index())),
+
+    # example of lazy source
+    # useful when arguments are somehow computed dynamically in config
+    Source(lambda: demo.index(count=10), name='lazy'),
+]
+'''
+    )
+
+    srcs = [s if isinstance(s, Source) else throw(s) for s in cfg.sources]
+
+    [s1, s2, s3, s4, s5, s55, s6, s7, s77, s777] = srcs
+
+    # just a quick check to make sure tests import promnesia package correctly
+    # (depends on conftests settings)
+    assert type(srcs[0]).__module__ == 'promnesia.common', srcs
+    assert s1.name == 'explicit name'
+    assert s2.name == 'another name'
+    assert s3.name == 'demo'
+    assert s4.name == 'demo'
+    assert s5.name == 'demo'
+    assert s55.name == 'demo'
+    assert s6.name == 'demo'
+
+    # can't say 'cfg' as name is intended here but anyway
+    assert s7.name == 'cfg'
+    assert s77.name == 'cfg'
+    assert s777.name == 'lazy'
+
+    index(cfg)
+    # TODO assert on results count?
+
+
+# TODO ugh. allow not to have locator
+# ideally you can construct a visit with a link and that's it
+def test_sources_style_2() -> None:
+    '''
+    Now, sources are not magic -- they are just functions emitting visits
+    '''
+    cfg = make(
+        '''
+from typing import Iterable
+from promnesia.common import Visit, Source, Loc
+
+def my_indexer() -> Iterable[Visit]:
+    from datetime import datetime
+    for link in ['reddit.com', 'beepb00p.xyz']:
+        yield Visit(
+            url=link,
+            dt=datetime.min,
+            locator=Loc.make('test'),
+        )
+
+SOURCES = [
+    # you can just pass the function name here
+    my_indexer,
+
+    # or give it an explicit name (instead of a guess)
+    Source(my_indexer, name='nice name'),
+]
+
+
+class MyIndexer:
+    def index():
+        from promnesia.sources import demo
+        return list(demo.index())
+
+SOURCES.append(
+    MyIndexer,
+)
+
+'''
+    )
+    [s1, s2, s3] = [s if isinstance(s, Source) else throw(s) for s in cfg.sources]
+
+    assert s1.name == 'cfg'  # TODO would be nice to guess 'my_indexer' instead...
+    assert s2.name == 'nice name'
+    assert s3.name == 'cfg'  # TODO fix it, make MyIndexer?
+
+    index(cfg)
+
+
+def test_sources_lazy():
+    '''
+    Demonstration of ways to return 'lazy' and generally more advanced sources
+
+    Lazy sources could be useful to do some conditional magic or make more defensive against imports, excra configuration. You'll know when you need it ;)
+    '''
+
+    cfg = make(
+        '''
+from promnesia.common import Source
+
+def lazy():
+    from promnesia.sources import demo
+    print("Hello, I'm so lazy...")
+    yield from demo.index()
+
+SOURCES = [
+    lazy,
+]
+'''
+    )
+    srcs = [s if isinstance(s, Source) else throw(s) for s in cfg.sources]
+    [s] = srcs
+
+    assert s.name == 'cfg'  # TODO this should be fixed... but not a big deal
+
+    index(cfg)
+
+
+# TODO later
+# or like that:
+# (i for i in lazy()),
+
+# TODO later, support stuff that returns sources lazily? e.g. lambda: Source(...)
+# not sure if it's very useful
+
+
+def test_sources_errors() -> None:
+    '''
+    Testing defensiveness of config against various errors
+    '''
+    cfg = make(
+        '''
+SOURCES = [
+    'non.existing.module',
+
+    lambda: bad.attribute,
+
+    'promnesia.sources.demo',
+]
+'''
+    )
+
+    # nothing fails so far! It's defensive!
+    srcs = list(cfg.sources)
+
+    [e1, s1, s2] = srcs
+
+    assert isinstance(e1, Exception)
+    assert isinstance(s1, Source)
+    assert isinstance(s2, Source)
+
+    errors = index(cfg, check=False)
+    assert len(errors) == 2  # errors simply propagate
+
+
+def test_no_sources() -> None:
+    cfg = make(
+        '''
+'''
+    )
+    # raises because no SOURCES
+    with pytest.raises(RuntimeError):
+        list(cfg.sources)
+
+
+def test_empty_sources() -> None:
+    cfg = make(
+        '''
+SOURCES = []
+'''
+    )
+    # raises because empty SOURCES
+    with pytest.raises(RuntimeError):
+        list(cfg.sources)
+
+
+def test_legacy() -> None:
+    cfg = make(
+        '''
+from promnesia.common import Source
+from promnesia.sources import demo
+INDEXERS = [
+    Source(demo.index, src='legacy name'),
+]
+'''
+    )
+
+    [s1] = cfg.sources
+    assert isinstance(s1, Source)
+
+    assert s1.name == 'legacy name'
+
+    index(cfg)
promnesia/tests/test_db_dump.py
@@ -0,0 +1,223 @@
+from __future__ import annotations
+
+from concurrent.futures import ProcessPoolExecutor
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import Any, Iterable
+
+
+from hypothesis import settings, given
+from hypothesis.strategies import from_type
+# NOTE: pytest ... -s --hypothesis-verbosity=debug is useful for seeing what hypothesis is doing
+import pytest
+import pytz
+
+
+from ..common import Loc
+from ..database.common import DbVisit
+from ..database.dump import visits_to_sqlite
+from ..database.load import get_all_db_visits
+from ..sqlite import sqlite_connection
+
+from .common import gc_control, running_on_ci
+
+
+HSETTINGS: dict[str, Any] = dict(
+    derandomize=True,
+    deadline=timedelta(seconds=2),  # sometimes slow on ci
+)
+
+
+def test_no_visits(tmp_path: Path) -> None:
+    visits: list[DbVisit] = []
+
+    db = tmp_path / 'db.sqlite'
+    errors = visits_to_sqlite(
+        vit=visits,
+        overwrite_db=True,
+        _db_path=db,
+    )
+    assert db.exists()
+    [err] = [errors]
+    assert 'No visits were indexed' in str(err)
+
+
+def test_one_visit(tmp_path: Path) -> None:
+    dt = datetime.fromisoformat('2023-11-14T23:11:01')
+    dt = pytz.timezone('Europe/Warsaw').localize(dt)
+    visit = DbVisit(
+        norm_url='google.com',
+        orig_url='https://google.com',
+        dt=dt,
+        locator=Loc.make(title='title', href='https://whatever.com'),
+        duration=123,
+        src='whatever',
+    )
+
+    visits = [visit]
+
+    db = tmp_path / 'db.sqlite'
+    errors = visits_to_sqlite(
+        vit=visits,
+        overwrite_db=True,
+        _db_path=db,
+    )
+    assert len(errors) == 0
+    assert db.exists()
+
+    with sqlite_connection(db, row_factory='dict') as conn:
+        [sqlite_visit] = conn.execute('SELECT * FROM visits')
+
+    assert sqlite_visit == {
+        'context': None,
+        'dt': '2023-11-14T23:11:01+01:00',
+        'duration': 123,
+        'locator_href': 'https://whatever.com',
+        'locator_title': 'title',
+        'norm_url': 'google.com',
+        'orig_url': 'https://google.com',
+        'src': 'whatever',
+    }
+
+    visits_in_db = get_all_db_visits(db)
+    assert visits_in_db == [visit]
+
+
+def test_read_db_visits(tmp_path: Path) -> None:
+    """
+    Deliberately test against "hardcoded" database to check for backwards compatibility
+    """
+    db = tmp_path / 'db.sqlite'
+    with sqlite_connection(db) as conn:
+        conn.execute(
+            '''
+            CREATE TABLE visits (
+                norm_url VARCHAR,
+                orig_url VARCHAR,
+                dt VARCHAR,
+                locator_title VARCHAR,
+                locator_href VARCHAR,
+                src VARCHAR,
+                context VARCHAR,
+                duration INTEGER
+            );
+            '''
+        )
+        # this dt format (zone name after iso timestap) might occur in legacy databases
+        # (that were created when promnesia was using cachew NTBinder)
+        conn.execute(
+            '''
+            INSERT INTO visits VALUES(
+                'i.redd.it/alala.jpg',
+                'https://i.redd.it/alala.jpg',
+                '2019-04-13T11:55:09-04:00 America/New_York',
+                'Reddit save',
+                'https://reddit.com/r/whatever',
+                'reddit',
+                '',
+                NULL
+            );
+            '''
+        )
+    [visit_in_db] = get_all_db_visits(db)
+    assert visit_in_db == DbVisit(
+        norm_url='i.redd.it/alala.jpg',
+        orig_url='https://i.redd.it/alala.jpg',
+        dt=datetime(2019, 4, 13, 11, 55, 9, tzinfo=timezone(timedelta(hours=-4))),
+        locator=Loc.make(title='Reddit save', href='https://reddit.com/r/whatever'),
+        src='reddit',
+        context='',
+    )
+
+
+def _test_random_visit_aux(visit: DbVisit, tmp_path: Path) -> None:
+    db = tmp_path / 'db.sqlite'
+    errors = visits_to_sqlite(
+        vit=[visit],
+        overwrite_db=True,
+        _db_path=db,
+    )
+    assert db.exists()
+    assert len(errors) == 0, errors
+    # TODO query the db?
+
+
+@given(
+    visit=from_type(DbVisit).filter(
+        # if duration is too big it fails to insert in sqlite
+        lambda v: (v.duration is None or 0 <= v.duration <= 10**5)
+    )
+)
+@settings(**HSETTINGS, max_examples=100)
+def test_random_visit(visit: DbVisit) -> None:
+    with TemporaryDirectory() as tdir:
+        tmp_path = Path(tdir)
+        _test_random_visit_aux(visit=visit, tmp_path=tmp_path)
+
+
+_dt_naive = datetime.fromisoformat('2023-11-14T23:11:01')
+_dt_aware = pytz.timezone('America/New_York').localize(_dt_naive)
+
+def make_testvisit(i: int) -> DbVisit:
+    return DbVisit(
+        norm_url=f'google.com/{i}',
+        orig_url=f'https://google.com/{i}',
+        dt=(_dt_naive if i % 2 == 0 else _dt_aware) + timedelta(seconds=i),
+        locator=Loc.make(title=f'title{i}', href=f'https://whatever.com/{i}'),
+        duration=i,
+        src='whatever',
+    )
+
+
+@pytest.mark.parametrize('count', [99, 100_000, 1_000_000])
+@pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
+def test_benchmark_visits_dumping(count: int, gc_control, tmp_path: Path) -> None:
+    # [20231212] testing differernt CHUNK_BY values with 1_000_000 visits on @karlicoss desktop pc
+    # 1: 25s (perhaps most overhead is from temporary lists?)
+    # 10 (current default): 8s
+    # 100: 6s
+    # 1000: 6s
+    # TODO maybe consider changing default to 100?
+    if count > 99 and running_on_ci:
+        pytest.skip("test would be too slow on CI, only meant to run manually")
+
+    visits = (make_testvisit(i) for i in range(count))
+    db = tmp_path / 'db.sqlite'
+    errors = visits_to_sqlite(  # TODO maybe this method should return db stats? would make testing easier
+        vit=visits,
+        overwrite_db=True,
+        _db_path=db,
+    )
+    assert db.exists()
+    assert len(errors) == 0, errors
+
+
+def _populate_db(db_path: Path, *, overwrite_db: bool, count: int) -> None:
+    visits = [make_testvisit(i) for i in range(count)]
+    errors = visits_to_sqlite(visits, _db_path=db_path, overwrite_db=overwrite_db)
+    assert len(errors) == 0
+
+
+@pytest.mark.parametrize('mode', ['update', 'overwrite'])
+def test_concurrent(tmp_path: Path, mode: str) -> None:
+    overwrite_db = {'overwrite': True, 'update': False}[mode]
+
+    db_path = tmp_path / 'db.sqlite'
+    # do initial indexing to initialize the db
+    _populate_db(db_path, overwrite_db=True, count=1)
+    assert db_path.exists()  # just in case
+
+    # this simply tests correctness by running many concurrent indexers
+    parallel = 100  # 100 indexers
+    with ProcessPoolExecutor(max_workers=8) as pool:
+        futures = []
+        for _ in range(parallel):
+            futures.append(pool.submit(_populate_db, db_path, overwrite_db=overwrite_db, count=1_000))
+        for f in futures:
+            f.result()
+    assert db_path.exists()  # just in case
+
+
+# TODO test to make sure db is readable while we're indexing?
+# kinda nicer version of test_query_while_indexing
promnesia/tests/test_extract.py
@@ -0,0 +1,61 @@
+from datetime import datetime, timezone
+
+from ..common import Visit, DbVisit, Loc, Source
+from ..extract import extract_visits
+
+from .common import get_testdata, unwrap, running_on_ci, gc_control
+
+from more_itertools import ilen
+import pytest
+
+
+def test_with_error() -> None:
+    class ExtractionError(Exception):
+        pass
+
+    def indexer():
+        yield Visit(url='http://test1', dt=datetime.fromtimestamp(0, tz=timezone.utc), locator=Loc.make('whatever'))
+        yield ExtractionError()
+        yield Visit(url='http://test2', dt=datetime.fromtimestamp(0, tz=timezone.utc), locator=Loc.make('whatever'))
+
+    [v1, e, v2] = extract_visits(source=Source(indexer), src='whatever')
+    assert isinstance(v1, DbVisit)
+    assert isinstance(e, Exception)
+    assert isinstance(v2, DbVisit)
+
+
+def test_urls_are_normalised() -> None:
+    # generally this stuff is covered by cannon tests, but good to check it's actually inserted in the db
+    # TODO maybe this should be a separate test which takes DbVisit.make separately?
+    # especially to decouple from shellcmd source
+    from ..sources import shellcmd
+    from ..sources.plaintext import extract_from_path
+
+    visits = list(extract_visits(
+        source=Source(shellcmd.index, extract_from_path(get_testdata('normalise'))),
+        src='whatever',
+    ))
+    assert len(visits) == 7
+
+    assert {unwrap(v).norm_url for v in visits} == {
+        'hi.com',
+        'reddit.com/post',
+        'argos.co.uk/webapp/wcs/stores/servlet/OrderItemDisplay',
+        'youtube.com/watch?v=XXlZfc1TrD0',
+        'youtube.com/watch?v=XXlZfc1Tr11',
+    }
+
+
+@pytest.mark.parametrize('count', [99, 100_000, 1_000_000])
+@pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
+def test_benchmark(count: int, gc_control) -> None:
+    # NOTE: at the moment most time is spent canonifying urls, so not much point optimizing this in isolation
+    # TODO maybe could specify custom cannonifying strategy that doesn't do anything to isolate benchmark
+    if count > 99 and running_on_ci:
+        pytest.skip("test would be too slow on CI, only meant to run manually")
+
+    from ..sources import demo
+    source = Source(demo.index, count=count)
+
+    total = ilen(extract_visits(source=source, src='whatever'))
+    assert total == count  # sanity check
promnesia/tests/test_extract_urls.py
@@ -0,0 +1,43 @@
+from ..common import extract_urls
+
+
+def test_extract_simple() -> None:
+    lines = """
+I've enjoyed [Chandler Carruth's _There Are No Zero-cost Abstractions_](
+https://www.youtube.com/watch?v=rHIkrotSwcc) very much.
+""".strip()
+    assert set(extract_urls(lines)) == {'https://www.youtube.com/watch?v=rHIkrotSwcc'}
+
+
+def test_extract_2() -> None:
+    text = '''♂️ Чтобы снизить вероятность ошибиться, важно знать про когнитивные искажения.
+Если для вас это новое словосочетание, начните с книжки
+"Гарри Поттер и Методы рационального мышления" - http://hpmor.ru/, если вы знакомы с понятием - читайте цепочки на сайтах
+lesswrong.ru и lesswrong.com, книжку Даниэля Канемана "Thinking, fast and slow" и канал Пион https://t.me/ontologics
+'''
+    assert set(extract_urls(text)) == {'http://hpmor.ru/', 'lesswrong.ru', 'lesswrong.com', 'https://t.me/ontologics'}
+
+
+def test_extract_md() -> None:
+    lines = '''
+Hey, I recently implemented a new extension for that [addons.mozilla.org](https://addons.mozilla.org/en-US/firefox/addon/org-grasp-for-org-capture/), [github](https://github.com/karlicoss/grasp), perhaps it could be useful for you!
+'''
+    assert set(extract_urls(lines)) == {
+        'addons.mozilla.org',
+        'https://addons.mozilla.org/en-US/firefox/addon/org-grasp-for-org-capture/',
+        'https://github.com/karlicoss/grasp',
+    }
+
+
+# just random links to test multiline/whitespace behaviour
+def test_extract_3() -> None:
+    lines = '''
+python.org/one.html ?? https://python.org/two.html some extra text
+
+whatever.org
+'''
+    assert set(extract_urls(lines, syntax='org')) == {
+        'python.org/one.html',
+        'https://python.org/two.html',
+        'whatever.org',
+    }