promnesia 1.1.20230129__py3-none-any.whl → 1.2.20240810__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. promnesia/__main__.py +58 -50
  2. promnesia/cannon.py +4 -4
  3. promnesia/common.py +57 -38
  4. promnesia/compare.py +3 -2
  5. promnesia/compat.py +6 -65
  6. promnesia/config.py +4 -2
  7. promnesia/database/common.py +66 -0
  8. promnesia/database/dump.py +187 -0
  9. promnesia/{read_db.py → database/load.py} +10 -11
  10. promnesia/extract.py +1 -0
  11. promnesia/kjson.py +1 -1
  12. promnesia/logging.py +14 -14
  13. promnesia/misc/__init__.pyi +0 -0
  14. promnesia/misc/config_example.py +1 -2
  15. promnesia/misc/install_server.py +5 -4
  16. promnesia/server.py +24 -24
  17. promnesia/sources/__init__.pyi +0 -0
  18. promnesia/sources/auto.py +12 -7
  19. promnesia/sources/browser.py +80 -293
  20. promnesia/sources/browser_legacy.py +298 -0
  21. promnesia/sources/demo.py +18 -2
  22. promnesia/sources/filetypes.py +8 -0
  23. promnesia/sources/github.py +2 -2
  24. promnesia/sources/hackernews.py +1 -2
  25. promnesia/sources/hypothesis.py +1 -1
  26. promnesia/sources/markdown.py +15 -15
  27. promnesia/sources/org.py +7 -3
  28. promnesia/sources/plaintext.py +3 -1
  29. promnesia/sources/reddit.py +2 -2
  30. promnesia/sources/rss.py +5 -1
  31. promnesia/sources/shellcmd.py +6 -2
  32. promnesia/sources/signal.py +29 -20
  33. promnesia/sources/smscalls.py +8 -1
  34. promnesia/sources/stackexchange.py +2 -2
  35. promnesia/sources/takeout.py +132 -12
  36. promnesia/sources/takeout_legacy.py +10 -2
  37. promnesia/sources/telegram.py +79 -123
  38. promnesia/sources/telegram_legacy.py +117 -0
  39. promnesia/sources/vcs.py +1 -1
  40. promnesia/sources/viber.py +6 -15
  41. promnesia/sources/website.py +1 -1
  42. promnesia/sqlite.py +42 -0
  43. promnesia/tests/__init__.py +0 -0
  44. promnesia/tests/common.py +137 -0
  45. promnesia/tests/server_helper.py +64 -0
  46. promnesia/tests/sources/__init__.py +0 -0
  47. promnesia/tests/sources/test_auto.py +66 -0
  48. promnesia/tests/sources/test_filetypes.py +42 -0
  49. promnesia/tests/sources/test_hypothesis.py +39 -0
  50. promnesia/tests/sources/test_org.py +65 -0
  51. promnesia/tests/sources/test_plaintext.py +26 -0
  52. promnesia/tests/sources/test_shellcmd.py +22 -0
  53. promnesia/tests/sources/test_takeout.py +58 -0
  54. promnesia/tests/test_cannon.py +325 -0
  55. promnesia/tests/test_cli.py +42 -0
  56. promnesia/tests/test_compare.py +30 -0
  57. promnesia/tests/test_config.py +290 -0
  58. promnesia/tests/test_db_dump.py +223 -0
  59. promnesia/tests/test_extract.py +61 -0
  60. promnesia/tests/test_extract_urls.py +43 -0
  61. promnesia/tests/test_indexer.py +245 -0
  62. promnesia/tests/test_server.py +292 -0
  63. promnesia/tests/test_traverse.py +41 -0
  64. promnesia/tests/utils.py +35 -0
  65. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/METADATA +14 -19
  66. promnesia-1.2.20240810.dist-info/RECORD +83 -0
  67. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/WHEEL +1 -1
  68. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/entry_points.txt +0 -1
  69. promnesia/dump.py +0 -105
  70. promnesia-1.1.20230129.dist-info/RECORD +0 -55
  71. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/LICENSE +0 -0
  72. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,290 @@
1
+ from contextlib import contextmanager
2
+ from pathlib import Path
3
+ from tempfile import TemporaryDirectory
4
+ from typing import Union, List
5
+
6
+ from ..common import Source
7
+ from ..config import import_config, Config
8
+
9
+
10
+ from more_itertools import ilen
11
+ import pytest
12
+
13
+ from .common import throw
14
+
15
+
16
+ def make(body: str) -> Config:
17
+ with TemporaryDirectory() as td:
18
+ tdir = Path(td)
19
+ cp = tdir / 'cfg.py'
20
+ cp.write_text(body)
21
+ return import_config(cp)
22
+
23
+
24
+ @contextmanager
25
+ def with_config(cfg: Union[str, Config]):
26
+ from .. import config as C
27
+
28
+ assert not C.has()
29
+ cfg2: Config = make(cfg) if isinstance(cfg, str) else cfg
30
+ try:
31
+ C.instance = cfg2
32
+ assert C.has()
33
+ yield
34
+ finally:
35
+ C.reset()
36
+
37
+
38
+ def index(cfg: Union[str, Config], check=True) -> List[Exception]:
39
+ from ..__main__ import _do_index
40
+
41
+ with with_config(cfg):
42
+ errors = list(_do_index())
43
+ if check:
44
+ assert len(errors) == 0, errors
45
+ # visits = cfg.output_dir / 'promnesia.sqlite'
46
+ # TODO query visit count too
47
+ return errors
48
+
49
+
50
+ def test_minimal() -> None:
51
+ '''
52
+ Example of the smallest possible config, using a 'demo' source
53
+ '''
54
+ # import directly from promnesia, not promnesia.common
55
+ cfg = make(
56
+ '''
57
+ from promnesia import Source
58
+ from promnesia.sources import demo
59
+
60
+ SOURCES = [
61
+ Source(demo.index),
62
+ ]
63
+ '''
64
+ )
65
+ assert ilen(cfg.sources) == 1
66
+ assert all(isinstance(s, Source) for s in cfg.sources)
67
+ # todo output dirs?
68
+ index(cfg)
69
+
70
+
71
+ def test_sources_style_1() -> None:
72
+ '''
73
+ Testing 'styles' of specifying sources
74
+ '''
75
+ cfg = make(
76
+ '''
77
+ from promnesia.common import Source
78
+ from promnesia.sources import demo
79
+
80
+ SOURCES = [
81
+ # you can pass arguments to index functions
82
+ Source(demo.index, count=10, name='explicit name'),
83
+
84
+ # or rely on the default argument!
85
+ Source(demo.index, name='another name'),
86
+
87
+ # or rely on default source name name (will be guessed as 'demo')
88
+ Source(demo.index),
89
+
90
+ # rely on default index function
91
+ Source(demo),
92
+
93
+ # no need for Source() either!
94
+ demo.index,
95
+ demo,
96
+
97
+ # I guess this is as simple as it possibly gets...
98
+ 'promnesia.sources.demo',
99
+
100
+ # just in case, test lambdas
101
+ # with list
102
+ lambda: list(demo.index()),
103
+
104
+ # with generator
105
+ lambda: iter(list(demo.index())),
106
+
107
+ # example of lazy source
108
+ # useful when arguments are somehow computed dynamically in config
109
+ Source(lambda: demo.index(count=10), name='lazy'),
110
+ ]
111
+ '''
112
+ )
113
+
114
+ srcs = [s if isinstance(s, Source) else throw(s) for s in cfg.sources]
115
+
116
+ [s1, s2, s3, s4, s5, s55, s6, s7, s77, s777] = srcs
117
+
118
+ # just a quick check to make sure tests import promnesia package correctly
119
+ # (depends on conftests settings)
120
+ assert type(srcs[0]).__module__ == 'promnesia.common', srcs
121
+ assert s1.name == 'explicit name'
122
+ assert s2.name == 'another name'
123
+ assert s3.name == 'demo'
124
+ assert s4.name == 'demo'
125
+ assert s5.name == 'demo'
126
+ assert s55.name == 'demo'
127
+ assert s6.name == 'demo'
128
+
129
+ # can't say 'cfg' as name is intended here but anyway
130
+ assert s7.name == 'cfg'
131
+ assert s77.name == 'cfg'
132
+ assert s777.name == 'lazy'
133
+
134
+ index(cfg)
135
+ # TODO assert on results count?
136
+
137
+
138
+ # TODO ugh. allow not to have locator
139
+ # ideally you can construct a visit with a link and that's it
140
+ def test_sources_style_2() -> None:
141
+ '''
142
+ Now, sources are not magic -- they are just functions emitting visits
143
+ '''
144
+ cfg = make(
145
+ '''
146
+ from typing import Iterable
147
+ from promnesia.common import Visit, Source, Loc
148
+
149
+ def my_indexer() -> Iterable[Visit]:
150
+ from datetime import datetime
151
+ for link in ['reddit.com', 'beepb00p.xyz']:
152
+ yield Visit(
153
+ url=link,
154
+ dt=datetime.min,
155
+ locator=Loc.make('test'),
156
+ )
157
+
158
+ SOURCES = [
159
+ # you can just pass the function name here
160
+ my_indexer,
161
+
162
+ # or give it an explicit name (instead of a guess)
163
+ Source(my_indexer, name='nice name'),
164
+ ]
165
+
166
+
167
+ class MyIndexer:
168
+ def index():
169
+ from promnesia.sources import demo
170
+ return list(demo.index())
171
+
172
+ SOURCES.append(
173
+ MyIndexer,
174
+ )
175
+
176
+ '''
177
+ )
178
+ [s1, s2, s3] = [s if isinstance(s, Source) else throw(s) for s in cfg.sources]
179
+
180
+ assert s1.name == 'cfg' # TODO would be nice to guess 'my_indexer' instead...
181
+ assert s2.name == 'nice name'
182
+ assert s3.name == 'cfg' # TODO fix it, make MyIndexer?
183
+
184
+ index(cfg)
185
+
186
+
187
+ def test_sources_lazy():
188
+ '''
189
+ Demonstration of ways to return 'lazy' and generally more advanced sources
190
+
191
+ Lazy sources could be useful to do some conditional magic or make more defensive against imports, extra configuration. You'll know when you need it ;)
192
+ '''
193
+
194
+ cfg = make(
195
+ '''
196
+ from promnesia.common import Source
197
+
198
+ def lazy():
199
+ from promnesia.sources import demo
200
+ print("Hello, I'm so lazy...")
201
+ yield from demo.index()
202
+
203
+ SOURCES = [
204
+ lazy,
205
+ ]
206
+ '''
207
+ )
208
+ srcs = [s if isinstance(s, Source) else throw(s) for s in cfg.sources]
209
+ [s] = srcs
210
+
211
+ assert s.name == 'cfg' # TODO this should be fixed... but not a big deal
212
+
213
+ index(cfg)
214
+
215
+
216
+ # TODO later
217
+ # or like that:
218
+ # (i for i in lazy()),
219
+
220
+ # TODO later, support stuff that returns sources lazily? e.g. lambda: Source(...)
221
+ # not sure if it's very useful
222
+
223
+
224
+ def test_sources_errors() -> None:
225
+ '''
226
+ Testing defensiveness of config against various errors
227
+ '''
228
+ cfg = make(
229
+ '''
230
+ SOURCES = [
231
+ 'non.existing.module',
232
+
233
+ lambda: bad.attribute,
234
+
235
+ 'promnesia.sources.demo',
236
+ ]
237
+ '''
238
+ )
239
+
240
+ # nothing fails so far! It's defensive!
241
+ srcs = list(cfg.sources)
242
+
243
+ [e1, s1, s2] = srcs
244
+
245
+ assert isinstance(e1, Exception)
246
+ assert isinstance(s1, Source)
247
+ assert isinstance(s2, Source)
248
+
249
+ errors = index(cfg, check=False)
250
+ assert len(errors) == 2 # errors simply propagate
251
+
252
+
253
+ def test_no_sources() -> None:
254
+ cfg = make(
255
+ '''
256
+ '''
257
+ )
258
+ # raises because no SOURCES
259
+ with pytest.raises(RuntimeError):
260
+ list(cfg.sources)
261
+
262
+
263
+ def test_empty_sources() -> None:
264
+ cfg = make(
265
+ '''
266
+ SOURCES = []
267
+ '''
268
+ )
269
+ # raises because empty SOURCES
270
+ with pytest.raises(RuntimeError):
271
+ list(cfg.sources)
272
+
273
+
274
+ def test_legacy() -> None:
275
+ cfg = make(
276
+ '''
277
+ from promnesia.common import Source
278
+ from promnesia.sources import demo
279
+ INDEXERS = [
280
+ Source(demo.index, src='legacy name'),
281
+ ]
282
+ '''
283
+ )
284
+
285
+ [s1] = cfg.sources
286
+ assert isinstance(s1, Source)
287
+
288
+ assert s1.name == 'legacy name'
289
+
290
+ index(cfg)
@@ -0,0 +1,223 @@
1
+ from __future__ import annotations
2
+
3
+ from concurrent.futures import ProcessPoolExecutor
4
+ from datetime import datetime, timedelta, timezone
5
+ from pathlib import Path
6
+ from tempfile import TemporaryDirectory
7
+ from typing import Any, Iterable
8
+
9
+
10
+ from hypothesis import settings, given
11
+ from hypothesis.strategies import from_type
12
+ # NOTE: pytest ... -s --hypothesis-verbosity=debug is useful for seeing what hypothesis is doing
13
+ import pytest
14
+ import pytz
15
+
16
+
17
+ from ..common import Loc
18
+ from ..database.common import DbVisit
19
+ from ..database.dump import visits_to_sqlite
20
+ from ..database.load import get_all_db_visits
21
+ from ..sqlite import sqlite_connection
22
+
23
+ from .common import gc_control, running_on_ci
24
+
25
+
26
+ HSETTINGS: dict[str, Any] = dict(
27
+ derandomize=True,
28
+ deadline=timedelta(seconds=2), # sometimes slow on ci
29
+ )
30
+
31
+
32
+ def test_no_visits(tmp_path: Path) -> None:
33
+ visits: list[DbVisit] = []
34
+
35
+ db = tmp_path / 'db.sqlite'
36
+ errors = visits_to_sqlite(
37
+ vit=visits,
38
+ overwrite_db=True,
39
+ _db_path=db,
40
+ )
41
+ assert db.exists()
42
+ [err] = [errors]
43
+ assert 'No visits were indexed' in str(err)
44
+
45
+
46
+ def test_one_visit(tmp_path: Path) -> None:
47
+ dt = datetime.fromisoformat('2023-11-14T23:11:01')
48
+ dt = pytz.timezone('Europe/Warsaw').localize(dt)
49
+ visit = DbVisit(
50
+ norm_url='google.com',
51
+ orig_url='https://google.com',
52
+ dt=dt,
53
+ locator=Loc.make(title='title', href='https://whatever.com'),
54
+ duration=123,
55
+ src='whatever',
56
+ )
57
+
58
+ visits = [visit]
59
+
60
+ db = tmp_path / 'db.sqlite'
61
+ errors = visits_to_sqlite(
62
+ vit=visits,
63
+ overwrite_db=True,
64
+ _db_path=db,
65
+ )
66
+ assert len(errors) == 0
67
+ assert db.exists()
68
+
69
+ with sqlite_connection(db, row_factory='dict') as conn:
70
+ [sqlite_visit] = conn.execute('SELECT * FROM visits')
71
+
72
+ assert sqlite_visit == {
73
+ 'context': None,
74
+ 'dt': '2023-11-14T23:11:01+01:00',
75
+ 'duration': 123,
76
+ 'locator_href': 'https://whatever.com',
77
+ 'locator_title': 'title',
78
+ 'norm_url': 'google.com',
79
+ 'orig_url': 'https://google.com',
80
+ 'src': 'whatever',
81
+ }
82
+
83
+ visits_in_db = get_all_db_visits(db)
84
+ assert visits_in_db == [visit]
85
+
86
+
87
+ def test_read_db_visits(tmp_path: Path) -> None:
88
+ """
89
+ Deliberately test against "hardcoded" database to check for backwards compatibility
90
+ """
91
+ db = tmp_path / 'db.sqlite'
92
+ with sqlite_connection(db) as conn:
93
+ conn.execute(
94
+ '''
95
+ CREATE TABLE visits (
96
+ norm_url VARCHAR,
97
+ orig_url VARCHAR,
98
+ dt VARCHAR,
99
+ locator_title VARCHAR,
100
+ locator_href VARCHAR,
101
+ src VARCHAR,
102
+ context VARCHAR,
103
+ duration INTEGER
104
+ );
105
+ '''
106
+ )
107
+ # this dt format (zone name after iso timestamp) might occur in legacy databases
108
+ # (that were created when promnesia was using cachew NTBinder)
109
+ conn.execute(
110
+ '''
111
+ INSERT INTO visits VALUES(
112
+ 'i.redd.it/alala.jpg',
113
+ 'https://i.redd.it/alala.jpg',
114
+ '2019-04-13T11:55:09-04:00 America/New_York',
115
+ 'Reddit save',
116
+ 'https://reddit.com/r/whatever',
117
+ 'reddit',
118
+ '',
119
+ NULL
120
+ );
121
+ '''
122
+ )
123
+ [visit_in_db] = get_all_db_visits(db)
124
+ assert visit_in_db == DbVisit(
125
+ norm_url='i.redd.it/alala.jpg',
126
+ orig_url='https://i.redd.it/alala.jpg',
127
+ dt=datetime(2019, 4, 13, 11, 55, 9, tzinfo=timezone(timedelta(hours=-4))),
128
+ locator=Loc.make(title='Reddit save', href='https://reddit.com/r/whatever'),
129
+ src='reddit',
130
+ context='',
131
+ )
132
+
133
+
134
+ def _test_random_visit_aux(visit: DbVisit, tmp_path: Path) -> None:
135
+ db = tmp_path / 'db.sqlite'
136
+ errors = visits_to_sqlite(
137
+ vit=[visit],
138
+ overwrite_db=True,
139
+ _db_path=db,
140
+ )
141
+ assert db.exists()
142
+ assert len(errors) == 0, errors
143
+ # TODO query the db?
144
+
145
+
146
+ @given(
147
+ visit=from_type(DbVisit).filter(
148
+ # if duration is too big it fails to insert in sqlite
149
+ lambda v: (v.duration is None or 0 <= v.duration <= 10**5)
150
+ )
151
+ )
152
+ @settings(**HSETTINGS, max_examples=100)
153
+ def test_random_visit(visit: DbVisit) -> None:
154
+ with TemporaryDirectory() as tdir:
155
+ tmp_path = Path(tdir)
156
+ _test_random_visit_aux(visit=visit, tmp_path=tmp_path)
157
+
158
+
159
+ _dt_naive = datetime.fromisoformat('2023-11-14T23:11:01')
160
+ _dt_aware = pytz.timezone('America/New_York').localize(_dt_naive)
161
+
162
+ def make_testvisit(i: int) -> DbVisit:
163
+ return DbVisit(
164
+ norm_url=f'google.com/{i}',
165
+ orig_url=f'https://google.com/{i}',
166
+ dt=(_dt_naive if i % 2 == 0 else _dt_aware) + timedelta(seconds=i),
167
+ locator=Loc.make(title=f'title{i}', href=f'https://whatever.com/{i}'),
168
+ duration=i,
169
+ src='whatever',
170
+ )
171
+
172
+
173
+ @pytest.mark.parametrize('count', [99, 100_000, 1_000_000])
174
+ @pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
175
+ def test_benchmark_visits_dumping(count: int, gc_control, tmp_path: Path) -> None:
176
+ # [20231212] testing different CHUNK_BY values with 1_000_000 visits on @karlicoss desktop pc
177
+ # 1: 25s (perhaps most overhead is from temporary lists?)
178
+ # 10 (current default): 8s
179
+ # 100: 6s
180
+ # 1000: 6s
181
+ # TODO maybe consider changing default to 100?
182
+ if count > 99 and running_on_ci:
183
+ pytest.skip("test would be too slow on CI, only meant to run manually")
184
+
185
+ visits = (make_testvisit(i) for i in range(count))
186
+ db = tmp_path / 'db.sqlite'
187
+ errors = visits_to_sqlite( # TODO maybe this method should return db stats? would make testing easier
188
+ vit=visits,
189
+ overwrite_db=True,
190
+ _db_path=db,
191
+ )
192
+ assert db.exists()
193
+ assert len(errors) == 0, errors
194
+
195
+
196
+ def _populate_db(db_path: Path, *, overwrite_db: bool, count: int) -> None:
197
+ visits = [make_testvisit(i) for i in range(count)]
198
+ errors = visits_to_sqlite(visits, _db_path=db_path, overwrite_db=overwrite_db)
199
+ assert len(errors) == 0
200
+
201
+
202
+ @pytest.mark.parametrize('mode', ['update', 'overwrite'])
203
+ def test_concurrent(tmp_path: Path, mode: str) -> None:
204
+ overwrite_db = {'overwrite': True, 'update': False}[mode]
205
+
206
+ db_path = tmp_path / 'db.sqlite'
207
+ # do initial indexing to initialize the db
208
+ _populate_db(db_path, overwrite_db=True, count=1)
209
+ assert db_path.exists() # just in case
210
+
211
+ # this simply tests correctness by running many concurrent indexers
212
+ parallel = 100 # 100 indexers
213
+ with ProcessPoolExecutor(max_workers=8) as pool:
214
+ futures = []
215
+ for _ in range(parallel):
216
+ futures.append(pool.submit(_populate_db, db_path, overwrite_db=overwrite_db, count=1_000))
217
+ for f in futures:
218
+ f.result()
219
+ assert db_path.exists() # just in case
220
+
221
+
222
+ # TODO test to make sure db is readable while we're indexing?
223
+ # kinda nicer version of test_query_while_indexing
@@ -0,0 +1,61 @@
1
+ from datetime import datetime, timezone
2
+
3
+ from ..common import Visit, DbVisit, Loc, Source
4
+ from ..extract import extract_visits
5
+
6
+ from .common import get_testdata, unwrap, running_on_ci, gc_control
7
+
8
+ from more_itertools import ilen
9
+ import pytest
10
+
11
+
12
+ def test_with_error() -> None:
13
+ class ExtractionError(Exception):
14
+ pass
15
+
16
+ def indexer():
17
+ yield Visit(url='http://test1', dt=datetime.fromtimestamp(0, tz=timezone.utc), locator=Loc.make('whatever'))
18
+ yield ExtractionError()
19
+ yield Visit(url='http://test2', dt=datetime.fromtimestamp(0, tz=timezone.utc), locator=Loc.make('whatever'))
20
+
21
+ [v1, e, v2] = extract_visits(source=Source(indexer), src='whatever')
22
+ assert isinstance(v1, DbVisit)
23
+ assert isinstance(e, Exception)
24
+ assert isinstance(v2, DbVisit)
25
+
26
+
27
+ def test_urls_are_normalised() -> None:
28
+ # generally this stuff is covered by cannon tests, but good to check it's actually inserted in the db
29
+ # TODO maybe this should be a separate test which takes DbVisit.make separately?
30
+ # especially to decouple from shellcmd source
31
+ from ..sources import shellcmd
32
+ from ..sources.plaintext import extract_from_path
33
+
34
+ visits = list(extract_visits(
35
+ source=Source(shellcmd.index, extract_from_path(get_testdata('normalise'))),
36
+ src='whatever',
37
+ ))
38
+ assert len(visits) == 7
39
+
40
+ assert {unwrap(v).norm_url for v in visits} == {
41
+ 'hi.com',
42
+ 'reddit.com/post',
43
+ 'argos.co.uk/webapp/wcs/stores/servlet/OrderItemDisplay',
44
+ 'youtube.com/watch?v=XXlZfc1TrD0',
45
+ 'youtube.com/watch?v=XXlZfc1Tr11',
46
+ }
47
+
48
+
49
+ @pytest.mark.parametrize('count', [99, 100_000, 1_000_000])
50
+ @pytest.mark.parametrize('gc_on', [True, False], ids=['gc_on', 'gc_off'])
51
+ def test_benchmark(count: int, gc_control) -> None:
52
+ # NOTE: at the moment most time is spent canonifying urls, so not much point optimizing this in isolation
53
+ # TODO maybe could specify custom cannonifying strategy that doesn't do anything to isolate benchmark
54
+ if count > 99 and running_on_ci:
55
+ pytest.skip("test would be too slow on CI, only meant to run manually")
56
+
57
+ from ..sources import demo
58
+ source = Source(demo.index, count=count)
59
+
60
+ total = ilen(extract_visits(source=source, src='whatever'))
61
+ assert total == count # sanity check
@@ -0,0 +1,43 @@
1
+ from ..common import extract_urls
2
+
3
+
4
+ def test_extract_simple() -> None:
5
+ lines = """
6
+ I've enjoyed [Chandler Carruth's _There Are No Zero-cost Abstractions_](
7
+ https://www.youtube.com/watch?v=rHIkrotSwcc) very much.
8
+ """.strip()
9
+ assert set(extract_urls(lines)) == {'https://www.youtube.com/watch?v=rHIkrotSwcc'}
10
+
11
+
12
+ def test_extract_2() -> None:
13
+ text = '''‍♂️ Чтобы снизить вероятность ошибиться, важно знать про когнитивные искажения.
14
+ Если для вас это новое словосочетание, начните с книжки
15
+ "Гарри Поттер и Методы рационального мышления" - http://hpmor.ru/, если вы знакомы с понятием - читайте цепочки на сайтах
16
+ lesswrong.ru и lesswrong.com, книжку Даниэля Канемана "Thinking, fast and slow" и канал Пион https://t.me/ontologics
17
+ '''
18
+ assert set(extract_urls(text)) == {'http://hpmor.ru/', 'lesswrong.ru', 'lesswrong.com', 'https://t.me/ontologics'}
19
+
20
+
21
+ def test_extract_md() -> None:
22
+ lines = '''
23
+ Hey, I recently implemented a new extension for that [addons.mozilla.org](https://addons.mozilla.org/en-US/firefox/addon/org-grasp-for-org-capture/), [github](https://github.com/karlicoss/grasp), perhaps it could be useful for you!
24
+ '''
25
+ assert set(extract_urls(lines)) == {
26
+ 'addons.mozilla.org',
27
+ 'https://addons.mozilla.org/en-US/firefox/addon/org-grasp-for-org-capture/',
28
+ 'https://github.com/karlicoss/grasp',
29
+ }
30
+
31
+
32
+ # just random links to test multiline/whitespace behaviour
33
+ def test_extract_3() -> None:
34
+ lines = '''
35
+ python.org/one.html ?? https://python.org/two.html some extra text
36
+
37
+ whatever.org
38
+ '''
39
+ assert set(extract_urls(lines, syntax='org')) == {
40
+ 'python.org/one.html',
41
+ 'https://python.org/two.html',
42
+ 'whatever.org',
43
+ }