promnesia 1.2.20230515__py3-none-any.whl → 1.3.20241021__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. promnesia/__init__.py +14 -3
  2. promnesia/__main__.py +60 -35
  3. promnesia/cannon.py +27 -27
  4. promnesia/common.py +85 -67
  5. promnesia/compare.py +21 -22
  6. promnesia/compat.py +10 -10
  7. promnesia/config.py +23 -23
  8. promnesia/database/common.py +67 -0
  9. promnesia/database/dump.py +188 -0
  10. promnesia/{read_db.py → database/load.py} +16 -17
  11. promnesia/extract.py +14 -11
  12. promnesia/kjson.py +12 -11
  13. promnesia/logging.py +4 -4
  14. promnesia/misc/__init__.pyi +0 -0
  15. promnesia/misc/config_example.py +1 -2
  16. promnesia/misc/install_server.py +7 -9
  17. promnesia/server.py +57 -47
  18. promnesia/sources/__init__.pyi +0 -0
  19. promnesia/sources/auto.py +50 -35
  20. promnesia/sources/auto_logseq.py +6 -5
  21. promnesia/sources/auto_obsidian.py +2 -2
  22. promnesia/sources/browser.py +14 -9
  23. promnesia/sources/browser_legacy.py +26 -16
  24. promnesia/sources/demo.py +19 -3
  25. promnesia/sources/fbmessenger.py +3 -2
  26. promnesia/sources/filetypes.py +16 -7
  27. promnesia/sources/github.py +7 -9
  28. promnesia/sources/guess.py +2 -1
  29. promnesia/sources/hackernews.py +2 -2
  30. promnesia/sources/hpi.py +2 -2
  31. promnesia/sources/html.py +7 -5
  32. promnesia/sources/hypothesis.py +4 -3
  33. promnesia/sources/instapaper.py +2 -2
  34. promnesia/sources/markdown.py +31 -21
  35. promnesia/sources/org.py +27 -13
  36. promnesia/sources/plaintext.py +30 -29
  37. promnesia/sources/pocket.py +3 -2
  38. promnesia/sources/reddit.py +20 -19
  39. promnesia/sources/roamresearch.py +2 -1
  40. promnesia/sources/rss.py +4 -5
  41. promnesia/sources/shellcmd.py +19 -6
  42. promnesia/sources/signal.py +33 -24
  43. promnesia/sources/smscalls.py +2 -2
  44. promnesia/sources/stackexchange.py +4 -3
  45. promnesia/sources/takeout.py +76 -9
  46. promnesia/sources/takeout_legacy.py +24 -12
  47. promnesia/sources/telegram.py +13 -11
  48. promnesia/sources/telegram_legacy.py +18 -7
  49. promnesia/sources/twitter.py +6 -5
  50. promnesia/sources/vcs.py +5 -3
  51. promnesia/sources/viber.py +10 -9
  52. promnesia/sources/website.py +4 -4
  53. promnesia/sources/zulip.py +3 -2
  54. promnesia/sqlite.py +7 -4
  55. promnesia/tests/__init__.py +0 -0
  56. promnesia/tests/common.py +140 -0
  57. promnesia/tests/server_helper.py +67 -0
  58. promnesia/tests/sources/__init__.py +0 -0
  59. promnesia/tests/sources/test_auto.py +65 -0
  60. promnesia/tests/sources/test_filetypes.py +43 -0
  61. promnesia/tests/sources/test_hypothesis.py +39 -0
  62. promnesia/tests/sources/test_org.py +64 -0
  63. promnesia/tests/sources/test_plaintext.py +25 -0
  64. promnesia/tests/sources/test_shellcmd.py +21 -0
  65. promnesia/tests/sources/test_takeout.py +56 -0
  66. promnesia/tests/test_cannon.py +325 -0
  67. promnesia/tests/test_cli.py +40 -0
  68. promnesia/tests/test_compare.py +30 -0
  69. promnesia/tests/test_config.py +289 -0
  70. promnesia/tests/test_db_dump.py +222 -0
  71. promnesia/tests/test_extract.py +65 -0
  72. promnesia/tests/test_extract_urls.py +43 -0
  73. promnesia/tests/test_indexer.py +251 -0
  74. promnesia/tests/test_server.py +291 -0
  75. promnesia/tests/test_traverse.py +39 -0
  76. promnesia/tests/utils.py +35 -0
  77. {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/METADATA +15 -18
  78. promnesia-1.3.20241021.dist-info/RECORD +83 -0
  79. {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/WHEEL +1 -1
  80. {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/entry_points.txt +0 -1
  81. promnesia/dump.py +0 -105
  82. promnesia-1.2.20230515.dist-info/RECORD +0 -58
  83. {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/LICENSE +0 -0
  84. {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,325 @@
1
+ from typing import cast
2
+
3
+ import pytest
4
+
5
+ from ..cannon import CanonifyException, canonify
6
+
7
+ # TODO should actually understand 'sequences'?
8
+ # e.g.
9
+ # https://www.scottaaronson.com/blog/?p=3167#comment-1731882 is kinda hierarchy of scottaaronson.com, post 3167 and comment 1731882
10
+ # but when working with it from server, would be easier to just do multiple queries I guess..
11
+ # https://www.scottaaronson.com/blog/?p=3167 is kind of a hierarchy of scottaaronson.com ;
12
+
13
+
14
+ param = pytest.mark.parametrize
15
+
16
+
17
+ # mark stuff that is interesting as a testcase, but that I'm not sure about yet
18
+ TODO = cast(str, object())
19
+
20
+
21
def check(url, expected):
    """Assert that ``url`` canonifies to ``expected``.

    When ``expected`` is the module-level ``TODO`` sentinel the case is
    skipped instead of asserted.
    """
    undecided = expected is TODO
    if undecided:
        pytest.skip(f"'{url}' will be handled later")
    canonical = canonify(url)
    assert canonical == expected
25
+
26
+
27
+ # TODO assume spaces are not meaningful??
28
+ # then could align URLs etc?
29
+
30
@param('url,expected', [
    (
        'https://www.youtube.com/watch?t=491s&v=1NHbPN9pNPM&index=63&list=WL',
        # NOTE: t= reordered, makes it more hierarchical
        # list as well, I guess makes the most sense to keep it at the very end.. since lists are more like tags
        'youtube.com/watch?v=1NHbPN9pNPM&t=491s&list=WL',
    ),
    (
        'youtube.com/watch?v=wHrCkyoe72U&feature=share&time_continue=6',
        'youtube.com/watch?v=wHrCkyoe72U',
    ),
    (
        'youtube.com/embed/nyc6RJEEe0U?feature=oembed',
        'youtube.com/watch?v=nyc6RJEEe0U',
    ),
    (
        'https://youtu.be/iCvmsMzlF7o?list=WL',
        'youtube.com/watch?v=iCvmsMzlF7o&list=WL',
    ),
    # TODO can even be like that or contain timestamp (&t=)
    # TODO warn if param already present? shouldn't happen..

    # TODO could be interesting to do automatic rule extraction by querying one representative and then extracting canonical

    # TODO national domains don't matter for youtube

    # [*, 'youtube', ANY_DOMAIN] / 'embed' -> 'youtube.com/watch'
    # TODO use regex backrefs?
    #
    (
        'm.youtube.com/watch?v=Zn6gV2sdl38',
        'youtube.com/watch?v=Zn6gV2sdl38',
    ),

    # ( "https//youtube.com/playlist?list=PLeOfc0M-50LmJtZwyOfw6aVopmIbU1t7t"
    # , "youtube.com/playlist?list=PLeOfc0M-50LmJtZwyOfw6aVopmIbU1t7t"
    # ),
    # TODO perhaps it should result in video link + sibling link?
    # when exploring other people's playlists this could be quite useful?

    # ( "https://www.youtube.com/watch?v=1NHbPN9pNPM&index=63&list=WL&t=491s"
    # , "youtube.com/watch?v=1NHbPN9pNPM&list=WL" # TODO not so sure about &t, it's sort of useful
    # ),
    # TODO
    # youtube.com/user/magauchsein/playlists?sort=dd&view=50&shelf_id=14
    # youtube.com/user/TheChemlife/videos?view=0&sort=p&flow=grid
])
def test_youtube(url, expected):
    """Each YouTube URL variant must canonify to the expected ``youtube.com/watch`` form."""
    canonical = canonify(url)
    assert canonical == expected
75
+
76
+
77
@param('url,expected', [
    (
        'https://web.archive.org/web/20090902224414/http://reason.com/news/show/119237.html',
        'reason.com/news/show/119237.html',
    ),
])
def test_archiveorg(url, expected):
    """A web.archive.org snapshot URL is expected to canonify to the archived page's own URL."""
    canonical = canonify(url)
    assert canonical == expected
83
+
84
+
85
+ # ugh. good example of motivation for cannon.py?
86
@param('url,expected', [
    (
        'https://news.ycombinator.com/from?site=jacopo.io',
        'jacopo.io',
    ),
    (
        'https://news.ycombinator.com/item?id=25099862',
        'news.ycombinator.com/item?id=25099862',
    ),
    (
        # expectation not decided yet; check() skips cases marked with TODO
        'https://news.ycombinator.com/reply?id=25100035&goto=item%3Fid%3D25099862%2325100035',
        TODO,
    ),
])
def test_hackernews(url, expected):
    """Hacker News URLs; delegates to ``check`` so TODO-marked cases are skipped."""
    check(url, expected)
98
+
99
+
100
@param('url, expected', [
    (
        'https://www.reddit.com/r/firefox/comments/bbugc5/firefox_bans_free_speech_commenting_plugin/?ref=readnext',
        'reddit.com/r/firefox/comments/bbugc5/firefox_bans_free_speech_commenting_plugin',
    ),
    (
        'https://www.reddit.com/r/selfhosted/comments/8j8mo3/what_are_you_self_hosting/dz19gh9/?utm_content=permalink&utm_medium=user&utm_source=reddit&utm_name=u_karlicoss',
        'reddit.com/r/selfhosted/comments/8j8mo3/what_are_you_self_hosting/dz19gh9',
    ),
    # TODO hmm. parent relationship can just rely on urls for reddit
    # just need to support it in server I suppose

    # TODO search queries?
    # https://www.reddit.com/search?q=AutoValue

    # TODO def need better markdown handling
    # https://reddit.com/r/intj/comments/cmof04/me_irl/ew4a3dw/][ Me_irl]
    # reddit.com/r/intj/comments/cmof04/me_irl/ew4a3dw/%5D%5BMe_irl%5D
])
def test_reddit(url, expected):
    """Reddit post/comment URLs are expected to lose their query string and trailing slash."""
    canonical = canonify(url)
    assert canonical == expected
123
+
124
+ # ugh. good example of motivation for cannon.py?
125
@param('url,expected', [
    (
        'https://app.getpocket.com/read/3479402594',
        'app.getpocket.com/read/3479402594',
    ),
    (
        'https://getpocket.com/read/3479402594',
        'app.getpocket.com/read/3479402594',
    ),
])
def test_pocket(url, expected):
    """Both Pocket hostnames are expected to canonify to the ``app.getpocket.com`` form."""
    canonical = canonify(url)
    assert canonical == expected
136
+
137
@pytest.mark.parametrize(("url", "expected"), [
    # TODO ?? 'https://groups.google.com/a/list.hypothes.is/forum/#!topic/dev/kcmS7H8ssis',
    #
    # TODO FIXME fragment handling
    # ( "https://www.scottaaronson.com/blog/?p=3167#comment-1731882"
    # , "scottaaronson.com/blog/?p=3167#comment-1731882"
    # ),

    # TODO FIXME fragment handling
    # ( "https://en.wikipedia.org/wiki/tendon#cite_note-14"
    # , "en.wikipedia.org/wiki/tendon#cite_note-14"
    # ),

    # TODO FIXME fragment handling
    # ( "https://physicstravelguide.com/experiments/aharonov-bohm#tab__concrete"
    # , "physicstravelguide.com/experiments/aharonov-bohm#tab__concrete"
    # ),

    (
        "https://github.com/search?o=asc&q=track&s=stars&type=Repositories",
        "github.com/search?q=track",
    ),
    (
        "https://80000hours.org/career-decision/article/?utm_source=The+EA+Newsletter&utm_campaign=04ca3c2244-EMAIL_CAMPAIGN_2019_04_03_04_26&utm_medium=email&utm_term=0_51c1df13ac-04ca3c2244-318697649",
        "80000hours.org/career-decision/article",
    ),
    (
        "https://www.facebook.com/photo.php?fbid=24147689823424326&set=pcb.2414778905423667&type=3&theater",
        "facebook.com/photo.php?fbid=24147689823424326",
    ),
    (
        "https://play.google.com/store/apps/details?id=com.faultexception.reader&hl=en",
        "play.google.com/store/apps/details?id=com.faultexception.reader",
    ),
    # TODO it also got &p= parameter, which refers to page... not sure how to handle this
    # news.ycombinator.com/item?id=15451442&p=2
    (
        "https://news.ycombinator.com/item?id=12172351",
        "news.ycombinator.com/item?id=12172351",
    ),
    (
        "https://urbandictionary.com/define.php?term=Belgian%20Whistle",
        "urbandictionary.com/define.php?term=Belgian%20Whistle",
    ),
    (
        "https://en.wikipedia.org/wiki/Dinic%27s_algorithm",
        "en.wikipedia.org/wiki/Dinic%27s_algorithm",
    ),

    (
        "zoopla.co.uk/to-rent/details/42756337#D0zlBWeD4X85odsR.97",
        "zoopla.co.uk/to-rent/details/42756337",
    ),

    (
        "withouthspec.co.uk/rooms/16867952?guests=2&adults=2&location=Berlin%2C+Germany&check_in=2017-08-16&check_out=2017-08-20",
        "withouthspec.co.uk/rooms/16867952",
    ),

    (
        "amp.theguardian.com/technology/2017/oct/09/mark-zuckerberg-facebook-puerto-rico-virtual-reality",
        "theguardian.com/technology/2017/oct/09/mark-zuckerberg-facebook-puerto-rico-virtual-reality",
    ),

    (
        "https://answers.yahoo.com/question/index?qid=20071101131442AAk9bGp",
        "answers.yahoo.com/question/index?qid=20071101131442AAk9bGp",
    ),
    (
        "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%e2%80%93-2010",
        "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%E2%80%93-2010",
    ),
    (
        "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-–-2010",
        "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%E2%80%93-2010",
    ),

    (
        "https://spoonuniversity.com/lifestyle/marmite-ways-to-eat-it&usg=AFQjCNH4s1SOEjlpENlfPV5nuvADZpSdow",
        "spoonuniversity.com/lifestyle/marmite-ways-to-eat-it",
    ),

    (
        'https://google.co.uk/amp/s/amp.reddit.com/r/androidapps/comments/757e2t/swiftkey_or_gboard',
        'reddit.com/r/androidapps/comments/757e2t/swiftkey_or_gboard',
    ),

    # should sort query params
    (
        'https://www.youtube.com/watch?v=hvoQiF0kBI8&list=WL&index=2',
        'youtube.com/watch?v=hvoQiF0kBI8&list=WL',
    ),
    (
        'https://www.youtube.com/watch?list=WL&v=hvoQiF0kBI8&index=2',
        'youtube.com/watch?v=hvoQiF0kBI8&list=WL',
    ),

    # TODO def need to allow the _user_ to define the rules.
    # no way I can predict everything
    # basically, allow *interactively* select
    # also allow introspection, which rule matched?
    (
        'https://bbs.archlinux.org/viewtopic.php?id=212740',
        'bbs.archlinux.org/viewtopic.php?id=212740',
    ),

    (
        'https://ubuntuforums.org/showthread.php?t=1403470&s=0dd67bdb12559c22e73a220752db50c7&p=8806195#post8806195',
        'ubuntuforums.org/showthread.php?t=1403470&p=8806195',
    ),

    (
        'https://arstechnica.com/?p=1371299',
        'arstechnica.com/?p=1371299',
        # eh. it's a redirect to https://arstechnica.com/information-technology/2018/09/dozens-of-ios-apps-surreptitiously-share-user-location-data-with-tracking-firms/
        # however in the page body there is <link rel="shorturl" href="https://arstechnica.com/?p=1371299"> ...
    ),

    # ( "gwern.net/DNB+FAQ"
    # , "TODO" # ???
    # ),

    # TODO shit. is that normal??? perhaps need to manually move fragment?
    # SplitResult(scheme='https', netloc='unix.stackexchange.com', path='/questions/171603/convert-file-contents-to-lower-case/171708', query='', fragment='171708&usg=AFQjCNEFCGqCAa4P4Zlu2x11bThJispNxQ')
    # ( "https://unix.stackexchange.com/questions/171603/convert-file-contents-to-lower-case/171708#171708&usg=AFQjCNEFCGqCAa4P4Zlu2x11bThJispNxQ"
    # , "unix.stackexchange.com/questions/171603/convert-file-contents-to-lower-case/171708#171708"
    # )
])
def test(url, expected):
    """Assorted real-world URLs paired with the canonical form ``canonify`` must produce."""
    canonical = canonify(url)
    assert canonical == expected
248
+ # TODO github queries
249
+ # github.com/search?l=Python&q=reddit+backup
250
+ # github.com/search?p=3&q=ipynb+language%3AHaskell
251
+ # github.com/search?q=kobo+ExtraData
252
+ # github.com/search?q=what-universal-human-experiences-are-you-missing-without-realizing-it
253
+
254
+ # TODO git+https://github.com/expectocode/telegram-export@master
255
+ # TODO again, for that actually sequence would be good...
256
+
257
+ # TODO "https://twitter.com/search?q=pinboard search&src=typd"
258
+
259
+ # TODO https://www.zalando-lounge.ch/#/
260
+ # TODO m.facebook.com
261
+ # TODO [R('^(youtube|urbandictionary|tesco|scottaaronson|answers.yahoo.com|code.google.com)') , None],
262
+
263
+
264
+
265
+ # TODO
266
+ # amazon.co.uk/gp/offer-listing/B00525XKL4/ref=dp_olp_new
267
+ # amazon.co.uk/gp/offer-listing/B00525XKL4/ref=olp_twister_child
268
+
269
+ # TODO
270
+ # en.wikipedia.org/wiki/S&P_500_Index
271
+
272
+
273
+ # TODO
274
+ # google.co.uk/maps/place/Hackney+Bureau/@51.5293789,-0.0527919,16.88z/data=!bla-bla!-bla
275
+
276
+
277
+ # TODO
278
+ # perhaps, disable utf8 everywhere?
279
+ # github.com/search?utf8=%E2%9C%93&q=%22My+Clippings.txt%22
280
+
281
+ # TODO FIXME fragment handling
282
+ # ( "https://www.scottaaronson.com/blog/?p=3167#comment-1731882"
283
+ # , "scottaaronson.com/blog/?p=3167#comment-1731882"
284
+ # ),
285
+
286
@pytest.mark.parametrize("urls", [
    {
        "launchpad.net/ubuntu/%2Bsource/okular",
        "launchpad.net/ubuntu/+source/okular",
    },
    {
        "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-–-2010",
        "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%e2%80%93-2010",
        "https://flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%e2%80%93-2010/&usg=AFQjCNEZsEGz9rqpWqlFXR5Tc7pkCKY5sQ",
    },
])
def test_same_norm(urls):
    """Every URL within one set must canonify to the same string.

    The set is sorted first so the reference URL (the smallest one) is
    picked deterministically.
    """
    u0, *others = sorted(urls)
    c0 = canonify(u0)
    for u in others:
        c = canonify(u)
        assert c0 == c, f'Expected {u0} and {u} to be same canonically; got {c0} and {c} instead'
304
+
305
def test_error():
    """A malformed URL must make ``canonify`` raise ``CanonifyException``."""
    # canonify('  +74Zo535, fewfwf@gmail.com') # -- apparently was patched in some python3.7 versions

    # borrowed from https://bugs.mageia.org/show_bug.cgi?id=24640#c7
    malformed = 'https://example.com\uFF03@bing.com'
    with pytest.raises(CanonifyException):
        canonify(malformed)
310
+
311
@pytest.mark.parametrize(("url", "expected"), [
    (
        'https://news.ycombinator.com/item?id=',
        'news.ycombinator.com/item?id=',
    ),
    (
        'https://www.youtube.com/watch?v=hvoQiF0kBI8&list&index=2',
        'youtube.com/watch?v=hvoQiF0kBI8&list=',
    ),
])
def test_empty_query_parameter(url, expected):
    """Query parameters with an empty or missing value are kept, rendered as ``key=``."""
    canonical = canonify(url)
    assert canonical == expected
318
+
319
@pytest.mark.parametrize(("url", "expected"), [
    (
        'http://www.isfdb.org/cgi-bin/title.cgi?2172',
        'isfdb.org/cgi-bin/title.cgi?2172=',
    ),
    (
        'http://www.isfdb.org/cgi-bin/title.cgi?2172+1',
        'isfdb.org/cgi-bin/title.cgi?2172%201=',
    ),
    (
        'http://www.isfdb.org/cgi-bin/title.cgi?2172&foo=bar&baz&quux',
        'isfdb.org/cgi-bin/title.cgi?2172=&baz=&foo=bar&quux=',
    ),
])
def test_qkeep_true(url, expected):
    """isfdb.org URLs: all query keys are expected to survive, sorted and rendered as ``key=value``."""
    canonical = canonify(url)
    assert canonical == expected
@@ -0,0 +1,40 @@
1
+ import os
2
+ import time
3
+
4
+ import pytest
5
+ import requests
6
+
7
+ from ..common import _is_windows
8
+ from .common import get_testdata, promnesia_bin, tmp_popen
9
+
10
+ ox_hugo_data = get_testdata('ox-hugo/test/site')
11
+
12
+
13
def test_demo() -> None:
    """Start ``promnesia demo`` on the ox-hugo test data and query its ``/search`` endpoint.

    The server is polled for up to 30 attempts (one second apart) until it
    answers; the response must contain more than 50 visits, with exactly one
    hit for the expected markdown file and one for the expected org file.

    Raises:
        RuntimeError: if the server never answered within the retry budget.
    """
    if _is_windows:
        # for some reason fails to connect to server..
        # not sure maybe something with port choice idk
        pytest.skip("TODO broken on Windows")

    with tmp_popen(promnesia_bin('demo', '--port', '16789', ox_hugo_data)):
        # TODO why does it want post??
        time.sleep(2)  # meh.. need a generic helper to wait till ready...
        res = {}
        for _attempt in range(30):
            time.sleep(1)
            try:
                res = requests.post(
                    "http://localhost:16789/search",
                    json={'url': "https://github.com/kaushalmodi/ox-hugo/issues"},
                ).json()
                break
            except (requests.exceptions.RequestException, ValueError):
                # Server not reachable yet, or it answered with a non-JSON body: retry.
                # Deliberately not a bare ``except:`` so that KeyboardInterrupt and
                # SystemExit still abort the test instead of being swallowed.
                continue
        else:
            raise RuntimeError("Couldn't connect to the server")
        vis = res['visits']
        assert len(vis) > 50, vis
        # locator titles use the platform path separator, hence the replace()
        mds = [x for x in vis if x['locator']['title'] == 'content/posts/citations-example-toml.md'.replace('/', os.sep)]
        orgs = [x for x in vis if x['locator']['title'].startswith('content-org/single-posts/empty_tag.org'.replace('/', os.sep))]
        assert len(mds) == 1
        assert len(orgs) == 1
@@ -0,0 +1,30 @@
1
+ import shutil
2
+ from pathlib import Path
3
+
4
+ from ..compare import compare_files
5
+ from .utils import index_urls
6
+
7
+
8
def test_compare(tmp_path: Path) -> None:
    """Index two URL sets into ``tmp_path`` and compare the resulting databases in both directions."""
    first_urls = {
        'https://example.com': None,
        'https://en.wikipedia.org/wiki/Saturn_V': None,
        'https://plato.stanford.edu/entries/qualia': None,
    }
    index_urls(first_urls)(tmp_path)

    db = tmp_path / 'promnesia.sqlite'
    old_db = tmp_path / 'promnesia-old.sqlite'
    # keep the first index around under a different name before re-indexing
    shutil.move(str(db), str(old_db))

    second_urls = {
        'https://example.com': None,
        'https://www.reddit.com/r/explainlikeimfive/comments/1ev6e0/eli5entropy': None,
        'https://en.wikipedia.org/wiki/Saturn_V': None,
        'https://plato.stanford.edu/entries/qualia': None,
    }
    index_urls(second_urls)(tmp_path)

    # should not crash, as there are more links in the new database
    forward = list(compare_files(old_db, db))
    assert len(forward) == 0

    backward = list(compare_files(db, old_db))
    assert len(backward) == 1
@@ -0,0 +1,289 @@
1
+ from __future__ import annotations
2
+
3
+ from contextlib import contextmanager
4
+ from pathlib import Path
5
+ from tempfile import TemporaryDirectory
6
+
7
+ import pytest
8
+ from more_itertools import ilen
9
+
10
+ from ..common import Source
11
+ from ..config import Config, import_config
12
+ from .common import throw
13
+
14
+
15
def make(body: str) -> Config:
    """Write ``body`` to a ``cfg.py`` inside a temporary directory and return ``import_config`` of that file."""
    with TemporaryDirectory() as td:
        config_path = Path(td) / 'cfg.py'
        config_path.write_text(body)
        return import_config(config_path)
21
+
22
+
23
@contextmanager
def with_config(cfg: str | Config):
    """Install ``cfg`` as the config instance for the duration of the ``with`` body.

    ``cfg`` may be config source text (built via ``make``) or a ready
    ``Config``. No config may be installed on entry; the config module is
    reset on exit, even if the body raises.
    """
    from .. import config as C

    assert not C.has()
    cfg2: Config
    if isinstance(cfg, str):
        cfg2 = make(cfg)
    else:
        cfg2 = cfg
    try:
        C.instance = cfg2
        assert C.has()
        yield
    finally:
        C.reset()
35
+
36
+
37
def index(cfg: str | Config, *, check: bool = True) -> list[Exception]:
    """Run ``_do_index`` under ``cfg`` and return the errors it yielded.

    With ``check=True`` (the default) the run must produce no errors.
    """
    from ..__main__ import _do_index

    with with_config(cfg):
        errors = list(_do_index())
        if check:
            assert len(errors) == 0, errors
    # visits = cfg.output_dir / 'promnesia.sqlite'
    # TODO query visit count too
    return errors
47
+
48
+
49
def test_minimal() -> None:
    '''
    Smallest possible config: a single 'demo' source.
    '''
    # import directly from promnesia, not promnesia.common
    cfg = make(
        '''
from promnesia import Source
from promnesia.sources import demo

SOURCES = [
    Source(demo.index),
]
'''
    )
    assert ilen(cfg.sources) == 1
    for s in cfg.sources:
        assert isinstance(s, Source)
    # todo output dirs?
    index(cfg)
68
+
69
+
70
def test_sources_style_1() -> None:
    '''
    Exercise the different 'styles' in which sources can be listed in a config.
    '''
    cfg = make(
        '''
from promnesia.common import Source
from promnesia.sources import demo

SOURCES = [
    # you can pass arguments to index functions
    Source(demo.index, count=10, name='explicit name'),

    # or rely on the default argument!
    Source(demo.index, name='another name'),

    # or rely on default source name name (will be guessed as 'demo')
    Source(demo.index),

    # rely on default index function
    Source(demo),

    # no need for Source() either!
    demo.index,
    demo,

    # I guess this is as simple as it possibly gets...
    'promnesia.sources.demo',

    # just in case, test lambdas
    # with list
    lambda: list(demo.index()),

    # with generator
    lambda: iter(list(demo.index())),

    # example of lazy source
    # useful when arguments are somehow computed dynamically in config
    Source(lambda: demo.index(count=10), name='lazy'),
]
'''
    )

    # anything that did not resolve to a Source is handed to throw()
    srcs = [s if isinstance(s, Source) else throw(s) for s in cfg.sources]

    s1, s2, s3, s4, s5, s55, s6, s7, s77, s777 = srcs

    # just a quick check to make sure tests import promnesia package correctly
    # (depends on conftests settings)
    assert type(srcs[0]).__module__ == 'promnesia.common', srcs

    named = [
        (s1, 'explicit name'),
        (s2, 'another name'),
        (s3, 'demo'),
        (s4, 'demo'),
        (s5, 'demo'),
        (s55, 'demo'),
        (s6, 'demo'),
        # can't say 'cfg' as name is intended here but anyway
        (s7, 'cfg'),
        (s77, 'cfg'),
        (s777, 'lazy'),
    ]
    for src, expected_name in named:
        assert src.name == expected_name

    index(cfg)
    # TODO assert on results count?
134
+ # TODO assert on results count?
135
+
136
+
137
+ # TODO ugh. allow not to have locator
138
+ # ideally you can construct a visit with a link and that's it
139
def test_sources_style_2() -> None:
    '''
    Sources are not magic -- they are just functions emitting visits.
    '''
    cfg = make(
        '''
from typing import Iterable
from promnesia.common import Visit, Source, Loc

def my_indexer() -> Iterable[Visit]:
    from datetime import datetime
    for link in ['reddit.com', 'beepb00p.xyz']:
        yield Visit(
            url=link,
            dt=datetime.min,
            locator=Loc.make('test'),
        )

SOURCES = [
    # you can just pass the function name here
    my_indexer,

    # or give it an explicit name (instead of a guess)
    Source(my_indexer, name='nice name'),
]


class MyIndexer:
    def index():
        from promnesia.sources import demo
        return list(demo.index())

SOURCES.append(
    MyIndexer,
)

'''
    )
    # anything that did not resolve to a Source is handed to throw()
    resolved = [s if isinstance(s, Source) else throw(s) for s in cfg.sources]
    s1, s2, s3 = resolved

    assert s1.name == 'cfg'  # TODO would be nice to guess 'my_indexer' instead...
    assert s2.name == 'nice name'
    assert s3.name == 'cfg'  # TODO fix it, make MyIndexer?

    index(cfg)
184
+
185
+
186
def test_sources_lazy():
    '''
    Demonstration of ways to return 'lazy' and generally more advanced sources.

    Lazy sources could be useful to do some conditional magic or to be more defensive
    against imports or extra configuration. You'll know when you need it ;)
    '''

    cfg = make(
        '''
from promnesia.common import Source

def lazy():
    from promnesia.sources import demo
    print("Hello, I'm so lazy...")
    yield from demo.index()

SOURCES = [
    lazy,
]
'''
    )
    # anything that did not resolve to a Source is handed to throw()
    resolved = [s if isinstance(s, Source) else throw(s) for s in cfg.sources]
    (s,) = resolved

    assert s.name == 'cfg'  # TODO this should be fixed... but not a big deal

    index(cfg)
213
+
214
+
215
+ # TODO later
216
+ # or like that:
217
+ # (i for i in lazy()),
218
+
219
+ # TODO later, support stuff that returns sources lazily? e.g. lambda: Source(...)
220
+ # not sure if it's very useful
221
+
222
+
223
def test_sources_errors() -> None:
    '''
    Check that a config with broken sources is handled defensively.
    '''
    cfg = make(
        '''
SOURCES = [
    'non.existing.module',

    lambda: bad.attribute,

    'promnesia.sources.demo',
]
'''
    )

    # nothing fails so far! It's defensive!
    e1, s1, s2 = list(cfg.sources)

    assert isinstance(e1, Exception)
    for src in (s1, s2):
        assert isinstance(src, Source)

    errors = index(cfg, check=False)
    assert len(errors) == 2  # errors simply propagate
250
+
251
+
252
def test_no_sources() -> None:
    '''
    A config that defines no SOURCES at all must fail when its sources are read.
    '''
    cfg = make(
        '''
'''
    )
    # raises because no SOURCES
    with pytest.raises(RuntimeError):
        for _ in cfg.sources:
            pass
260
+
261
+
262
def test_empty_sources() -> None:
    '''
    A config whose SOURCES list is empty must fail when its sources are read.
    '''
    cfg = make(
        '''
SOURCES = []
'''
    )
    # raises because empty SOURCES
    with pytest.raises(RuntimeError):
        for _ in cfg.sources:
            pass
271
+
272
+
273
def test_legacy() -> None:
    '''
    Legacy config spelling: an INDEXERS list and the ``src=`` keyword for the source name.
    '''
    cfg = make(
        '''
from promnesia.common import Source
from promnesia.sources import demo
INDEXERS = [
    Source(demo.index, src='legacy name'),
]
'''
    )

    (only,) = cfg.sources
    assert isinstance(only, Source)

    assert only.name == 'legacy name'

    index(cfg)