promnesia 1.2.20230515__py3-none-any.whl → 1.2.20240810__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. promnesia/__main__.py +26 -14
  2. promnesia/cannon.py +4 -4
  3. promnesia/common.py +39 -28
  4. promnesia/compare.py +3 -2
  5. promnesia/config.py +4 -2
  6. promnesia/database/common.py +66 -0
  7. promnesia/database/dump.py +187 -0
  8. promnesia/{read_db.py → database/load.py} +10 -11
  9. promnesia/extract.py +1 -0
  10. promnesia/kjson.py +1 -1
  11. promnesia/logging.py +3 -3
  12. promnesia/misc/__init__.pyi +0 -0
  13. promnesia/misc/config_example.py +1 -2
  14. promnesia/misc/install_server.py +2 -3
  15. promnesia/server.py +18 -19
  16. promnesia/sources/__init__.pyi +0 -0
  17. promnesia/sources/auto.py +9 -7
  18. promnesia/sources/browser_legacy.py +11 -5
  19. promnesia/sources/demo.py +18 -2
  20. promnesia/sources/filetypes.py +7 -0
  21. promnesia/sources/github.py +2 -2
  22. promnesia/sources/hypothesis.py +1 -1
  23. promnesia/sources/markdown.py +15 -15
  24. promnesia/sources/org.py +7 -3
  25. promnesia/sources/plaintext.py +3 -1
  26. promnesia/sources/reddit.py +2 -2
  27. promnesia/sources/rss.py +1 -1
  28. promnesia/sources/signal.py +22 -14
  29. promnesia/sources/stackexchange.py +2 -2
  30. promnesia/sources/takeout.py +58 -1
  31. promnesia/sources/takeout_legacy.py +10 -2
  32. promnesia/tests/__init__.py +0 -0
  33. promnesia/tests/common.py +137 -0
  34. promnesia/tests/server_helper.py +64 -0
  35. promnesia/tests/sources/__init__.py +0 -0
  36. promnesia/tests/sources/test_auto.py +66 -0
  37. promnesia/tests/sources/test_filetypes.py +42 -0
  38. promnesia/tests/sources/test_hypothesis.py +39 -0
  39. promnesia/tests/sources/test_org.py +65 -0
  40. promnesia/tests/sources/test_plaintext.py +26 -0
  41. promnesia/tests/sources/test_shellcmd.py +22 -0
  42. promnesia/tests/sources/test_takeout.py +58 -0
  43. promnesia/tests/test_cannon.py +325 -0
  44. promnesia/tests/test_cli.py +42 -0
  45. promnesia/tests/test_compare.py +30 -0
  46. promnesia/tests/test_config.py +290 -0
  47. promnesia/tests/test_db_dump.py +223 -0
  48. promnesia/tests/test_extract.py +61 -0
  49. promnesia/tests/test_extract_urls.py +43 -0
  50. promnesia/tests/test_indexer.py +245 -0
  51. promnesia/tests/test_server.py +292 -0
  52. promnesia/tests/test_traverse.py +41 -0
  53. promnesia/tests/utils.py +35 -0
  54. {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/METADATA +13 -17
  55. promnesia-1.2.20240810.dist-info/RECORD +83 -0
  56. {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/WHEEL +1 -1
  57. {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/entry_points.txt +0 -1
  58. promnesia/dump.py +0 -105
  59. promnesia-1.2.20230515.dist-info/RECORD +0 -58
  60. {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/LICENSE +0 -0
  61. {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,325 @@
1
+ from typing import cast
2
+
3
+ import pytest
4
+
5
+ from ..cannon import canonify, CanonifyException
6
+
7
+ # TODO should actually understand 'sequences'?
8
+ # e.g.
9
+ # https://www.scottaaronson.com/blog/?p=3167#comment-1731882 is kinda hierarchy of scottaaronson.com, post 3167 and comment 1731882
10
+ # but when working with it from server, would be easier to just do multiple queries I guess..
11
+ # https://www.scottaaronson.com/blog/?p=3167 is kinda hierarchy of scottaaronson.com ;
12
+
13
+
14
+ param = pytest.mark.parametrize
15
+
16
+
17
+ # mark stuff that is interesting as a testcase, but I'm not sure about yet
18
+ TODO = cast(str, object())
19
+
20
+
21
def check(url, expected):
    """Assert that ``url`` canonifies to ``expected``.

    When ``expected`` is the ``TODO`` sentinel the test is skipped instead of
    failing: such cases record interesting URLs whose canonical form has not
    been decided yet.
    """
    undecided = expected is TODO
    if undecided:
        pytest.skip(f"'{url}' will be handled later")
    actual = canonify(url)
    assert actual == expected
25
+
26
+
27
+ # TODO assume spaces are not meaningful??
28
+ # then could align URLs etc?
29
+
30
@param('url,expected', [
    (
        'https://www.youtube.com/watch?t=491s&v=1NHbPN9pNPM&index=63&list=WL',
        # NOTE: t= reordered, makes it more hierarchical
        # list as well, I guess makes the most sense to keep it at the very end.. since lists are more like tags
        'youtube.com/watch?v=1NHbPN9pNPM&t=491s&list=WL',
    ),
    (
        'youtube.com/watch?v=wHrCkyoe72U&feature=share&time_continue=6',
        'youtube.com/watch?v=wHrCkyoe72U',
    ),
    (
        'youtube.com/embed/nyc6RJEEe0U?feature=oembed',
        'youtube.com/watch?v=nyc6RJEEe0U',
    ),
    (
        'https://youtu.be/iCvmsMzlF7o?list=WL',
        'youtube.com/watch?v=iCvmsMzlF7o&list=WL',
    ),
    # TODO can even be like that or contain timestamp (&t=)
    # TODO warn if param already present? shouldn't happen..

    # TODO could be interesting to do automatic rule extraction by querying one representative and then extracting canonical

    # TODO national domains don't matter for youtube

    # [*, 'youtube', ANY_DOMAIN] / 'embed' -> 'youtube.com/watch'
    # TODO use regex backrefs?
    #
    (
        'm.youtube.com/watch?v=Zn6gV2sdl38',
        'youtube.com/watch?v=Zn6gV2sdl38',
    ),

    # ( "https//youtube.com/playlist?list=PLeOfc0M-50LmJtZwyOfw6aVopmIbU1t7t"
    # , "youtube.com/playlist?list=PLeOfc0M-50LmJtZwyOfw6aVopmIbU1t7t"
    # ),
    # TODO perhaps it should result in video link + sibling link?
    # when exploring other people's playlists this could be quite useful?

    # ( "https://www.youtube.com/watch?v=1NHbPN9pNPM&index=63&list=WL&t=491s"
    # , "youtube.com/watch?v=1NHbPN9pNPM&list=WL" # TODO not so sure about &t, it's sort of useful
    # ),
    # TODO
    # youtube.com/user/magauchsein/playlists?sort=dd&view=50&shelf_id=14
    # youtube.com/user/TheChemlife/videos?view=0&sort=p&flow=grid
])
def test_youtube(url, expected):
    """YouTube links (watch/embed/youtu.be/mobile) canonify to the expected ``youtube.com/watch`` form."""
    actual = canonify(url)
    assert actual == expected
75
+
76
+
77
@param('url,expected', [
    (
        'https://web.archive.org/web/20090902224414/http://reason.com/news/show/119237.html',
        'reason.com/news/show/119237.html',
    ),
])
def test_archiveorg(url, expected):
    """A web.archive.org snapshot URL canonifies to the archived page's own URL."""
    actual = canonify(url)
    assert actual == expected
83
+
84
+
85
+ # ugh. good example of motivation for cannon.py?
86
@param('url,expected', [
    (
        'https://news.ycombinator.com/from?site=jacopo.io',
        'jacopo.io',
    ),
    (
        'https://news.ycombinator.com/item?id=25099862',
        'news.ycombinator.com/item?id=25099862',
    ),
    (
        'https://news.ycombinator.com/reply?id=25100035&goto=item%3Fid%3D25099862%2325100035',
        TODO,
    ),
])
def test_hackernews(url, expected):
    """Hacker News URLs; the case marked ``TODO`` is skipped by ``check``."""
    check(url, expected)
98
+
99
+
100
@param('url, expected', [
    (
        'https://www.reddit.com/r/firefox/comments/bbugc5/firefox_bans_free_speech_commenting_plugin/?ref=readnext',
        'reddit.com/r/firefox/comments/bbugc5/firefox_bans_free_speech_commenting_plugin',
    ),
    (
        'https://www.reddit.com/r/selfhosted/comments/8j8mo3/what_are_you_self_hosting/dz19gh9/?utm_content=permalink&utm_medium=user&utm_source=reddit&utm_name=u_karlicoss',
        'reddit.com/r/selfhosted/comments/8j8mo3/what_are_you_self_hosting/dz19gh9',
    ),
    # TODO hmm. parent relationship can just rely on urls for reddit
    # just need to support it in server I suppose

    # TODO search queries?
    # https://www.reddit.com/search?q=AutoValue

    # TODO def need better markdown handling
    # https://reddit.com/r/intj/comments/cmof04/me_irl/ew4a3dw/][ Me_irl]
    # reddit.com/r/intj/comments/cmof04/me_irl/ew4a3dw/%5D%5BMe_irl%5D
])
def test_reddit(url, expected):
    """Reddit post/comment links lose tracking query parameters and the trailing slash."""
    actual = canonify(url)
    assert actual == expected
123
+
124
+ # ugh. good example of motivation for cannon.py?
125
@param('url,expected', [
    (
        'https://app.getpocket.com/read/3479402594',
        'app.getpocket.com/read/3479402594',
    ),
    (
        'https://getpocket.com/read/3479402594',
        'app.getpocket.com/read/3479402594',
    ),
])
def test_pocket(url, expected):
    """Both Pocket host variants canonify to the ``app.getpocket.com`` form."""
    actual = canonify(url)
    assert actual == expected
136
+
137
@pytest.mark.parametrize("url,expected", [
    # TODO ?? 'https://groups.google.com/a/list.hypothes.is/forum/#!topic/dev/kcmS7H8ssis',
    #
    # TODO FIXME fragment handling
    # ( "https://www.scottaaronson.com/blog/?p=3167#comment-1731882"
    # , "scottaaronson.com/blog/?p=3167#comment-1731882"
    # ),

    # TODO FIXME fragment handling
    # ( "https://en.wikipedia.org/wiki/tendon#cite_note-14"
    # , "en.wikipedia.org/wiki/tendon#cite_note-14"
    # ),

    # TODO FIXME fragment handling
    # ( "https://physicstravelguide.com/experiments/aharonov-bohm#tab__concrete"
    # , "physicstravelguide.com/experiments/aharonov-bohm#tab__concrete"
    # ),

    (
        "https://github.com/search?o=asc&q=track&s=stars&type=Repositories",
        "github.com/search?q=track",
    ),
    (
        "https://80000hours.org/career-decision/article/?utm_source=The+EA+Newsletter&utm_campaign=04ca3c2244-EMAIL_CAMPAIGN_2019_04_03_04_26&utm_medium=email&utm_term=0_51c1df13ac-04ca3c2244-318697649",
        "80000hours.org/career-decision/article",
    ),
    (
        "https://www.facebook.com/photo.php?fbid=24147689823424326&set=pcb.2414778905423667&type=3&theater",
        "facebook.com/photo.php?fbid=24147689823424326",
    ),
    (
        "https://play.google.com/store/apps/details?id=com.faultexception.reader&hl=en",
        "play.google.com/store/apps/details?id=com.faultexception.reader",
    ),
    # TODO it also got &p= parameter, which refers to page... not sure how to handle this
    # news.ycombinator.com/item?id=15451442&p=2
    (
        "https://news.ycombinator.com/item?id=12172351",
        "news.ycombinator.com/item?id=12172351",
    ),
    (
        "https://urbandictionary.com/define.php?term=Belgian%20Whistle",
        "urbandictionary.com/define.php?term=Belgian%20Whistle",
    ),
    (
        "https://en.wikipedia.org/wiki/Dinic%27s_algorithm",
        "en.wikipedia.org/wiki/Dinic%27s_algorithm",
    ),

    (
        "zoopla.co.uk/to-rent/details/42756337#D0zlBWeD4X85odsR.97",
        "zoopla.co.uk/to-rent/details/42756337",
    ),

    (
        "withouthspec.co.uk/rooms/16867952?guests=2&adults=2&location=Berlin%2C+Germany&check_in=2017-08-16&check_out=2017-08-20",
        "withouthspec.co.uk/rooms/16867952",
    ),

    (
        "amp.theguardian.com/technology/2017/oct/09/mark-zuckerberg-facebook-puerto-rico-virtual-reality",
        "theguardian.com/technology/2017/oct/09/mark-zuckerberg-facebook-puerto-rico-virtual-reality",
    ),

    (
        "https://answers.yahoo.com/question/index?qid=20071101131442AAk9bGp",
        "answers.yahoo.com/question/index?qid=20071101131442AAk9bGp",
    ),
    (
        "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%e2%80%93-2010",
        "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%E2%80%93-2010",
    ),
    (
        "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-–-2010",
        "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%E2%80%93-2010",
    ),

    (
        "https://spoonuniversity.com/lifestyle/marmite-ways-to-eat-it&usg=AFQjCNH4s1SOEjlpENlfPV5nuvADZpSdow",
        "spoonuniversity.com/lifestyle/marmite-ways-to-eat-it",
    ),

    (
        'https://google.co.uk/amp/s/amp.reddit.com/r/androidapps/comments/757e2t/swiftkey_or_gboard',
        'reddit.com/r/androidapps/comments/757e2t/swiftkey_or_gboard',
    ),

    # should sort query params
    (
        'https://www.youtube.com/watch?v=hvoQiF0kBI8&list=WL&index=2',
        'youtube.com/watch?v=hvoQiF0kBI8&list=WL',
    ),
    (
        'https://www.youtube.com/watch?list=WL&v=hvoQiF0kBI8&index=2',
        'youtube.com/watch?v=hvoQiF0kBI8&list=WL',
    ),

    # TODO def need to allow the _user_ to define the rules.
    # no way I can predict everything
    # basically, allow *interactively* select
    # also allow introspection, which rule matched?
    (
        'https://bbs.archlinux.org/viewtopic.php?id=212740',
        'bbs.archlinux.org/viewtopic.php?id=212740',
    ),

    (
        'https://ubuntuforums.org/showthread.php?t=1403470&s=0dd67bdb12559c22e73a220752db50c7&p=8806195#post8806195',
        'ubuntuforums.org/showthread.php?t=1403470&p=8806195',
    ),

    (
        'https://arstechnica.com/?p=1371299',
        'arstechnica.com/?p=1371299',
        # eh. it's a redirect to https://arstechnica.com/information-technology/2018/09/dozens-of-ios-apps-surreptitiously-share-user-location-data-with-tracking-firms/
        # however in the page body there is <link rel="shorturl" href="https://arstechnica.com/?p=1371299"> ...
    ),

    # ( "gwern.net/DNB+FAQ"
    # , "TODO" # ???
    # ),

    # TODO shit. is that normal??? perhaps need to manually move fragment?
    # SplitResult(scheme='https', netloc='unix.stackexchange.com', path='/questions/171603/convert-file-contents-to-lower-case/171708', query='', fragment='171708&usg=AFQjCNEFCGqCAa4P4Zlu2x11bThJispNxQ')
    # ( "https://unix.stackexchange.com/questions/171603/convert-file-contents-to-lower-case/171708#171708&usg=AFQjCNEFCGqCAa4P4Zlu2x11bThJispNxQ"
    # , "unix.stackexchange.com/questions/171603/convert-file-contents-to-lower-case/171708#171708"
    # )
])
def test(url, expected):
    """Grab-bag of canonification cases across many sites (tracking params, AMP, fragments, escaping)."""
    actual = canonify(url)
    assert actual == expected
248
+ # TODO github queries
249
+ # github.com/search?l=Python&q=reddit+backup
250
+ # github.com/search?p=3&q=ipynb+language%3AHaskell
251
+ # github.com/search?q=kobo+ExtraData
252
+ # github.com/search?q=what-universal-human-experiences-are-you-missing-without-realizing-it
253
+
254
+ # TODO git+https://github.com/expectocode/telegram-export@master
255
+ # TODO again, for that actually sequence would be good...
256
+
257
+ # TODO "https://twitter.com/search?q=pinboard search&src=typd"
258
+
259
+ # TODO https://www.zalando-lounge.ch/#/
260
+ # TODO m.facebook.com
261
+ # TODO [R('^(youtube|urbandictionary|tesco|scottaaronson|answers.yahoo.com|code.google.com)') , None],
262
+
263
+
264
+
265
+ # TODO
266
+ # amazon.co.uk/gp/offer-listing/B00525XKL4/ref=dp_olp_new
267
+ # amazon.co.uk/gp/offer-listing/B00525XKL4/ref=olp_twister_child
268
+
269
+ # TODO
270
+ # en.wikipedia.org/wiki/S&P_500_Index
271
+
272
+
273
+ # TODO
274
+ # google.co.uk/maps/place/Hackney+Bureau/@51.5293789,-0.0527919,16.88z/data=!bla-bla!-bla
275
+
276
+
277
+ # TODO
278
+ # perhaps, disable utf8 everywhere?
279
+ # github.com/search?utf8=%E2%9C%93&q=%22My+Clippings.txt%22
280
+
281
+ # TODO FIXME fragment handling
282
+ # ( "https://www.scottaaronson.com/blog/?p=3167#comment-1731882"
283
+ # , "scottaaronson.com/blog/?p=3167#comment-1731882"
284
+ # ),
285
+
286
@pytest.mark.parametrize("urls", [
    {
        "launchpad.net/ubuntu/%2Bsource/okular",
        "launchpad.net/ubuntu/+source/okular",
    },
    {
        "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-–-2010",
        "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%e2%80%93-2010",
        "https://flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%e2%80%93-2010/&usg=AFQjCNEZsEGz9rqpWqlFXR5Tc7pkCKY5sQ",
    },
])
def test_same_norm(urls):
    """Every URL in a group must canonify to the same string as the group's first (sorted) URL."""
    # sets have no stable order; sort so the reference URL is deterministic
    ordered = sorted(urls)
    reference = ordered[0]
    reference_canon = canonify(reference)
    for other in ordered[1:]:
        other_canon = canonify(other)
        assert reference_canon == other_canon, f'Expected {reference} and {other} to be same canonically; got {reference_canon} and {other_canon} instead'
304
+
305
def test_error():
    """A URL with a fullwidth '#' in front of the '@' must raise ``CanonifyException``."""
    # canonify('  +74Zo535, fewfwf@gmail.com') # -- apparently was patched in some python3.7 versions

    # borrowed from https://bugs.mageia.org/show_bug.cgi?id=24640#c7
    malformed = 'https://example.com\uFF03@bing.com'
    with pytest.raises(CanonifyException):
        canonify(malformed)
310
+
311
@pytest.mark.parametrize("url,expected", [
    (
        'https://news.ycombinator.com/item?id=',
        'news.ycombinator.com/item?id=',
    ),
    (
        'https://www.youtube.com/watch?v=hvoQiF0kBI8&list&index=2',
        'youtube.com/watch?v=hvoQiF0kBI8&list=',
    ),
])
def test_empty_query_parameter(url, expected):
    """Query parameters with an empty or missing value are kept, rendered as ``key=``."""
    actual = canonify(url)
    assert actual == expected
318
+
319
@pytest.mark.parametrize("url,expected", [
    (
        'http://www.isfdb.org/cgi-bin/title.cgi?2172',
        'isfdb.org/cgi-bin/title.cgi?2172=',
    ),
    (
        'http://www.isfdb.org/cgi-bin/title.cgi?2172+1',
        'isfdb.org/cgi-bin/title.cgi?2172%201=',
    ),
    (
        'http://www.isfdb.org/cgi-bin/title.cgi?2172&foo=bar&baz&quux',
        'isfdb.org/cgi-bin/title.cgi?2172=&baz=&foo=bar&quux=',
    ),
])
def test_qkeep_true(url, expected):
    """isfdb.org URLs keep all their query parameters, including value-less ones."""
    actual = canonify(url)
    assert actual == expected
@@ -0,0 +1,42 @@
1
+ import os
2
+ import time
3
+
4
+ from ..common import _is_windows
5
+
6
+ from .common import get_testdata, promnesia_bin, tmp_popen
7
+
8
+ import pytest
9
+ import requests
10
+
11
+
12
+ ox_hugo_data = get_testdata('ox-hugo/test/site')
13
+
14
+
15
def test_demo() -> None:
    """Smoke-test the ``demo`` CLI command end to end.

    Starts ``promnesia demo`` on port 16789 over the ox-hugo test data, polls
    the ``/search`` endpoint until it answers (up to 30 attempts, one second
    apart), and then checks the visits returned for the ox-hugo issues URL.

    Raises:
        RuntimeError: if the server never produced a JSON answer.
    """
    if _is_windows:
        # for some reason fails to connect to server..
        # not sure maybe something with port choice idk
        pytest.skip("TODO broken on Windows")

    with tmp_popen(promnesia_bin('demo', '--port', '16789', ox_hugo_data)):
        # TODO why does it want post??
        time.sleep(2)  # meh.. need a generic helper to wait till ready...
        res = {}
        last_error = None
        for _attempt in range(30):
            time.sleep(1)
            try:
                res = requests.post(
                    "http://localhost:16789/search",
                    json=dict(url="https://github.com/kaushalmodi/ox-hugo/issues"),
                ).json()
                break
            except (requests.exceptions.RequestException, ValueError) as e:
                # Only retry on "server not ready yet" style failures (connection
                # refused, non-JSON body). A bare except here would also swallow
                # KeyboardInterrupt/SystemExit and make the test impossible to abort.
                last_error = e
                continue
        else:
            # keep the underlying failure visible instead of discarding it
            raise RuntimeError("Couldn't connect to the server") from last_error
        vis = res['visits']
        assert len(vis) > 50, vis
        # locator titles are filesystem paths, so use the platform separator
        mds = [x for x in vis if x['locator']['title'] == 'content/posts/citations-example-toml.md'.replace('/', os.sep)]
        orgs = [x for x in vis if x['locator']['title'].startswith('content-org/single-posts/empty_tag.org'.replace('/', os.sep))]
        assert len(mds) == 1
        assert len(orgs) == 1
@@ -0,0 +1,30 @@
1
+ from pathlib import Path
2
+ import shutil
3
+
4
+ from ..compare import compare_files
5
+ from .utils import index_urls
6
+
7
+
8
def test_compare(tmp_path: Path) -> None:
    """Index two URL sets into the same directory and compare the resulting databases.

    The second index is a superset of the first (one extra reddit link), so
    comparing old -> new reports nothing while new -> old reports one entry.
    """
    new_db = tmp_path / 'promnesia.sqlite'
    old_db = tmp_path / 'promnesia-old.sqlite'

    index_smaller = index_urls({
        'https://example.com': None,
        'https://en.wikipedia.org/wiki/Saturn_V': None,
        'https://plato.stanford.edu/entries/qualia': None,
    })
    index_smaller(tmp_path)
    # park the first database aside so the second run can write a fresh one
    shutil.move(str(new_db), str(old_db))

    index_larger = index_urls({
        'https://example.com': None,
        'https://www.reddit.com/r/explainlikeimfive/comments/1ev6e0/eli5entropy': None,
        'https://en.wikipedia.org/wiki/Saturn_V': None,
        'https://plato.stanford.edu/entries/qualia': None,
    })
    index_larger(tmp_path)

    # should not crash, as there are more links in the new database
    old_vs_new = list(compare_files(old_db, new_db))
    assert len(old_vs_new) == 0

    new_vs_old = list(compare_files(new_db, old_db))
    assert len(new_vs_old) == 1
@@ -0,0 +1,290 @@
1
+ from contextlib import contextmanager
2
+ from pathlib import Path
3
+ from tempfile import TemporaryDirectory
4
+ from typing import Union, List
5
+
6
+ from ..common import Source
7
+ from ..config import import_config, Config
8
+
9
+
10
+ from more_itertools import ilen
11
+ import pytest
12
+
13
+ from .common import throw
14
+
15
+
16
def make(body: str) -> Config:
    """Write ``body`` to a throwaway ``cfg.py`` and load it with ``import_config``.

    The temporary directory is removed once the config has been imported.
    """
    with TemporaryDirectory() as tmp:
        config_path = Path(tmp) / 'cfg.py'
        config_path.write_text(body)
        return import_config(config_path)
22
+
23
+
24
@contextmanager
def with_config(cfg: Union[str, Config]):
    """Install ``cfg`` as the active promnesia config for the duration of the block.

    ``cfg`` may be an already-built ``Config`` or the source text of one (built
    via ``make``). No config may be active on entry; the config module is reset
    on exit, whether or not the body raised.
    """
    from .. import config as C

    assert not C.has()
    if isinstance(cfg, str):
        active: Config = make(cfg)
    else:
        active = cfg
    try:
        C.instance = active
        assert C.has()
        yield
    finally:
        C.reset()
36
+
37
+
38
def index(cfg: Union[str, Config], check=True) -> List[Exception]:
    """Run the indexer under ``cfg`` and return the errors it yielded.

    With ``check`` true (the default) the run must be error-free, otherwise
    the assertion fails and shows the collected errors.
    """
    from ..__main__ import _do_index

    with with_config(cfg):
        collected = list(_do_index())
        if check:
            assert len(collected) == 0, collected
    # visits = cfg.output_dir / 'promnesia.sqlite'
    # TODO query visit count too
    return collected
48
+
49
+
50
def test_minimal() -> None:
    '''
    The smallest config that works: a single 'demo' source
    '''
    # import directly from promnesia, not promnesia.common
    config = make(
        '''
from promnesia import Source
from promnesia.sources import demo

SOURCES = [
    Source(demo.index),
]
'''
    )
    source_count = ilen(config.sources)
    assert source_count == 1
    assert all(isinstance(src, Source) for src in config.sources)
    # todo output dirs?
    index(config)
69
+
70
+
71
def test_sources_style_1() -> None:
    '''
    Exercises the different 'styles' in which sources can be listed in a config
    '''
    config = make(
        '''
from promnesia.common import Source
from promnesia.sources import demo

SOURCES = [
    # you can pass arguments to index functions
    Source(demo.index, count=10, name='explicit name'),

    # or rely on the default argument!
    Source(demo.index, name='another name'),

    # or rely on default source name name (will be guessed as 'demo')
    Source(demo.index),

    # rely on default index function
    Source(demo),

    # no need for Source() either!
    demo.index,
    demo,

    # I guess this is as simple as it possibly gets...
    'promnesia.sources.demo',

    # just in case, test lambdas
    # with list
    lambda: list(demo.index()),

    # with generator
    lambda: iter(list(demo.index())),

    # example of lazy source
    # useful when arguments are somehow computed dynamically in config
    Source(lambda: demo.index(count=10), name='lazy'),
]
'''
    )

    # anything that is not a Source here is an error and gets re-raised
    sources = [src if isinstance(src, Source) else throw(src) for src in config.sources]

    # one variable per SOURCES entry, in declaration order
    [
        explicit,
        another,
        from_func,
        from_module,
        bare_func,
        bare_module,
        from_string,
        lambda_list,
        lambda_iter,
        lazy,
    ] = sources

    # just a quick check to make sure tests import promnesia package correctly
    # (depends on conftests settings)
    assert type(sources[0]).__module__ == 'promnesia.common', sources
    assert explicit.name == 'explicit name'
    assert another.name == 'another name'
    assert from_func.name == 'demo'
    assert from_module.name == 'demo'
    assert bare_func.name == 'demo'
    assert bare_module.name == 'demo'
    assert from_string.name == 'demo'

    # can't say 'cfg' as name is intended here but anyway
    assert lambda_list.name == 'cfg'
    assert lambda_iter.name == 'cfg'
    assert lazy.name == 'lazy'

    index(config)
    # TODO assert on results count?
136
+
137
+
138
+ # TODO ugh. allow not to have locator
139
+ # ideally you can construct a visit with a link and that's it
140
def test_sources_style_2() -> None:
    '''
    Sources are nothing special: any callable that emits visits will do
    '''
    config = make(
        '''
from typing import Iterable
from promnesia.common import Visit, Source, Loc

def my_indexer() -> Iterable[Visit]:
    from datetime import datetime
    for link in ['reddit.com', 'beepb00p.xyz']:
        yield Visit(
            url=link,
            dt=datetime.min,
            locator=Loc.make('test'),
        )

SOURCES = [
    # you can just pass the function name here
    my_indexer,

    # or give it an explicit name (instead of a guess)
    Source(my_indexer, name='nice name'),
]


class MyIndexer:
    def index():
        from promnesia.sources import demo
        return list(demo.index())

SOURCES.append(
    MyIndexer,
)

'''
    )
    # anything that is not a Source here is an error and gets re-raised
    sources = [src if isinstance(src, Source) else throw(src) for src in config.sources]
    [bare_function, named, from_class] = sources

    assert bare_function.name == 'cfg'  # TODO would be nice to guess 'my_indexer' instead...
    assert named.name == 'nice name'
    assert from_class.name == 'cfg'  # TODO fix it, make MyIndexer?

    index(config)
185
+
186
+
187
def test_sources_lazy() -> None:
    '''
    Demonstration of ways to return 'lazy' and generally more advanced sources

    Lazy sources could be useful to do some conditional magic or make more defensive against imports, extra configuration. You'll know when you need it ;)
    '''

    cfg = make(
        '''
from promnesia.common import Source

def lazy():
    from promnesia.sources import demo
    print("Hello, I'm so lazy...")
    yield from demo.index()

SOURCES = [
    lazy,
]
'''
    )
    # anything that is not a Source here is an error and gets re-raised
    srcs = [s if isinstance(s, Source) else throw(s) for s in cfg.sources]
    [s] = srcs

    assert s.name == 'cfg'  # TODO this should be fixed... but not a big deal

    index(cfg)
214
+
215
+
216
+ # TODO later
217
+ # or like that:
218
+ # (i for i in lazy()),
219
+
220
+ # TODO later, support stuff that returns sources lazily? e.g. lambda: Source(...)
221
+ # not sure if it's very useful
222
+
223
+
224
def test_sources_errors() -> None:
    '''
    Broken source entries must not break config loading; they surface as errors instead
    '''
    config = make(
        '''
SOURCES = [
    'non.existing.module',

    lambda: bad.attribute,

    'promnesia.sources.demo',
]
'''
    )

    # nothing fails so far! It's defensive!
    loaded = list(config.sources)

    [missing_module, bad_lambda, demo_source] = loaded

    assert isinstance(missing_module, Exception)
    assert isinstance(bad_lambda, Source)
    assert isinstance(demo_source, Source)

    failures = index(config, check=False)
    assert len(failures) == 2  # errors simply propagate
251
+
252
+
253
def test_no_sources() -> None:
    """A config that defines no SOURCES at all is rejected when sources are read."""
    config = make(
        '''
'''
    )
    # raises because no SOURCES
    with pytest.raises(RuntimeError):
        list(config.sources)
261
+
262
+
263
def test_empty_sources() -> None:
    """A config whose SOURCES list is empty is rejected when sources are read."""
    config = make(
        '''
SOURCES = []
'''
    )
    # raises because empty SOURCES
    with pytest.raises(RuntimeError):
        list(config.sources)
272
+
273
+
274
def test_legacy() -> None:
    """The legacy spelling (``INDEXERS`` list, ``src=`` keyword) still yields a named Source."""
    config = make(
        '''
from promnesia.common import Source
from promnesia.sources import demo
INDEXERS = [
    Source(demo.index, src='legacy name'),
]
'''
    )

    [legacy_source] = config.sources
    assert isinstance(legacy_source, Source)

    assert legacy_source.name == 'legacy name'

    index(config)