promnesia 1.2.20230515__py3-none-any.whl → 1.3.20241021__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promnesia/__init__.py +14 -3
- promnesia/__main__.py +60 -35
- promnesia/cannon.py +27 -27
- promnesia/common.py +85 -67
- promnesia/compare.py +21 -22
- promnesia/compat.py +10 -10
- promnesia/config.py +23 -23
- promnesia/database/common.py +67 -0
- promnesia/database/dump.py +188 -0
- promnesia/{read_db.py → database/load.py} +16 -17
- promnesia/extract.py +14 -11
- promnesia/kjson.py +12 -11
- promnesia/logging.py +4 -4
- promnesia/misc/__init__.pyi +0 -0
- promnesia/misc/config_example.py +1 -2
- promnesia/misc/install_server.py +7 -9
- promnesia/server.py +57 -47
- promnesia/sources/__init__.pyi +0 -0
- promnesia/sources/auto.py +50 -35
- promnesia/sources/auto_logseq.py +6 -5
- promnesia/sources/auto_obsidian.py +2 -2
- promnesia/sources/browser.py +14 -9
- promnesia/sources/browser_legacy.py +26 -16
- promnesia/sources/demo.py +19 -3
- promnesia/sources/fbmessenger.py +3 -2
- promnesia/sources/filetypes.py +16 -7
- promnesia/sources/github.py +7 -9
- promnesia/sources/guess.py +2 -1
- promnesia/sources/hackernews.py +2 -2
- promnesia/sources/hpi.py +2 -2
- promnesia/sources/html.py +7 -5
- promnesia/sources/hypothesis.py +4 -3
- promnesia/sources/instapaper.py +2 -2
- promnesia/sources/markdown.py +31 -21
- promnesia/sources/org.py +27 -13
- promnesia/sources/plaintext.py +30 -29
- promnesia/sources/pocket.py +3 -2
- promnesia/sources/reddit.py +20 -19
- promnesia/sources/roamresearch.py +2 -1
- promnesia/sources/rss.py +4 -5
- promnesia/sources/shellcmd.py +19 -6
- promnesia/sources/signal.py +33 -24
- promnesia/sources/smscalls.py +2 -2
- promnesia/sources/stackexchange.py +4 -3
- promnesia/sources/takeout.py +76 -9
- promnesia/sources/takeout_legacy.py +24 -12
- promnesia/sources/telegram.py +13 -11
- promnesia/sources/telegram_legacy.py +18 -7
- promnesia/sources/twitter.py +6 -5
- promnesia/sources/vcs.py +5 -3
- promnesia/sources/viber.py +10 -9
- promnesia/sources/website.py +4 -4
- promnesia/sources/zulip.py +3 -2
- promnesia/sqlite.py +7 -4
- promnesia/tests/__init__.py +0 -0
- promnesia/tests/common.py +140 -0
- promnesia/tests/server_helper.py +67 -0
- promnesia/tests/sources/__init__.py +0 -0
- promnesia/tests/sources/test_auto.py +65 -0
- promnesia/tests/sources/test_filetypes.py +43 -0
- promnesia/tests/sources/test_hypothesis.py +39 -0
- promnesia/tests/sources/test_org.py +64 -0
- promnesia/tests/sources/test_plaintext.py +25 -0
- promnesia/tests/sources/test_shellcmd.py +21 -0
- promnesia/tests/sources/test_takeout.py +56 -0
- promnesia/tests/test_cannon.py +325 -0
- promnesia/tests/test_cli.py +40 -0
- promnesia/tests/test_compare.py +30 -0
- promnesia/tests/test_config.py +289 -0
- promnesia/tests/test_db_dump.py +222 -0
- promnesia/tests/test_extract.py +65 -0
- promnesia/tests/test_extract_urls.py +43 -0
- promnesia/tests/test_indexer.py +251 -0
- promnesia/tests/test_server.py +291 -0
- promnesia/tests/test_traverse.py +39 -0
- promnesia/tests/utils.py +35 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/METADATA +15 -18
- promnesia-1.3.20241021.dist-info/RECORD +83 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/WHEEL +1 -1
- {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/entry_points.txt +0 -1
- promnesia/dump.py +0 -105
- promnesia-1.2.20230515.dist-info/RECORD +0 -58
- {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/LICENSE +0 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,325 @@
|
|
1
|
+
from typing import cast
|
2
|
+
|
3
|
+
import pytest
|
4
|
+
|
5
|
+
from ..cannon import CanonifyException, canonify
|
6
|
+
|
7
|
+
# TODO should actually understand 'sequences'?
|
8
|
+
# e.g.
|
9
|
+
# https://www.scottaaronson.com/blog/?p=3167#comment-1731882 is kinda hierarchy of scottaaronson.com, post 3167 and comment 1731882
|
10
|
+
# but when working with it from server, would be easier to just do multiple queries I guess..
|
11
|
+
# https://www.scottaaronson.com/blog/?p=3167 is kinda hierarchy of scottaaronson.com ;
|
12
|
+
|
13
|
+
|
14
|
+
param = pytest.mark.parametrize  # short alias used by the decorators in this module
|
15
|
+
|
16
|
+
|
17
|
+
# mark stuff that is interesting as a testcase, but I'm not sure about yet
|
18
|
+
TODO = cast(str, object())  # unique sentinel, typed as str via cast; check() compares it by identity
|
19
|
+
|
20
|
+
|
21
|
+
def check(url, expected):
    """Assert that ``url`` canonifies to ``expected``.

    When ``expected`` is the ``TODO`` sentinel the calling test is skipped
    instead, since the desired canonical form has not been decided yet.
    """
    undecided = expected is TODO
    if undecided:
        pytest.skip(f"'{url}' will be handled later")

    actual = canonify(url)
    assert actual == expected
|
25
|
+
|
26
|
+
|
27
|
+
# TODO assume spaces are not meaningful??
|
28
|
+
# then could align URLs etc?
|
29
|
+
|
30
|
+
@param('url,expected', [
    (
        'https://www.youtube.com/watch?t=491s&v=1NHbPN9pNPM&index=63&list=WL',
        # NOTE: t= reordered, makes it more hierarchical
        # list as well, I guess makes the most sense to keep it at the very end.. since lists are more like tags
        'youtube.com/watch?v=1NHbPN9pNPM&t=491s&list=WL',
    ),
    (
        'youtube.com/watch?v=wHrCkyoe72U&feature=share&time_continue=6',
        'youtube.com/watch?v=wHrCkyoe72U',
    ),
    (
        'youtube.com/embed/nyc6RJEEe0U?feature=oembed',
        'youtube.com/watch?v=nyc6RJEEe0U',
    ),
    (
        'https://youtu.be/iCvmsMzlF7o?list=WL',
        'youtube.com/watch?v=iCvmsMzlF7o&list=WL',
    ),
    # TODO can even be like that or contain timestamp (&t=)
    # TODO warn if param already present? shouldn't happen..

    # TODO could be interesting to do automatic rule extraction by querying one representative and then extracting canonical

    # TODO national domains don't matter for youtube

    # [*, 'youtube', ANY_DOMAIN] / 'embed' -> 'youtube.com/watch'
    # TODO use regex backrefs?
    #
    (
        'm.youtube.com/watch?v=Zn6gV2sdl38',
        'youtube.com/watch?v=Zn6gV2sdl38',
    ),

    # ( "https//youtube.com/playlist?list=PLeOfc0M-50LmJtZwyOfw6aVopmIbU1t7t"
    # , "youtube.com/playlist?list=PLeOfc0M-50LmJtZwyOfw6aVopmIbU1t7t"
    # ),
    # TODO perhaps it should result in video link + sibling link?
    # when exploring other people's playlists this could be quite useful?

    # ( "https://www.youtube.com/watch?v=1NHbPN9pNPM&index=63&list=WL&t=491s"
    # , "youtube.com/watch?v=1NHbPN9pNPM&list=WL" # TODO not so sure about &t, it's sort of useful
    # ),
    # TODO
    # youtube.com/user/magauchsein/playlists?sort=dd&view=50&shelf_id=14
    # youtube.com/user/TheChemlife/videos?view=0&sort=p&flow=grid
])
def test_youtube(url, expected):
    """YouTube links (watch, embed, youtu.be, mobile) canonify to the expected form."""
    actual = canonify(url)
    assert actual == expected
|
75
|
+
|
76
|
+
|
77
|
+
@param('url,expected', [
    (
        'https://web.archive.org/web/20090902224414/http://reason.com/news/show/119237.html',
        'reason.com/news/show/119237.html',
    ),
])
def test_archiveorg(url, expected):
    """A web.archive.org snapshot link canonifies to the archived page's own URL."""
    actual = canonify(url)
    assert actual == expected
|
83
|
+
|
84
|
+
|
85
|
+
# ugh. good example of motivation for cannon.py?
|
86
|
+
@param('url,expected', [
    (
        'https://news.ycombinator.com/from?site=jacopo.io',
        'jacopo.io',
    ),
    (
        'https://news.ycombinator.com/item?id=25099862',
        'news.ycombinator.com/item?id=25099862',
    ),
    (
        'https://news.ycombinator.com/reply?id=25100035&goto=item%3Fid%3D25099862%2325100035',
        TODO,
    ),
])
def test_hackernews(url, expected):
    """Hacker News links; cases whose expectation is still TODO are skipped by check()."""
    check(url, expected)
|
98
|
+
|
99
|
+
|
100
|
+
@param('url, expected', [
    (
        'https://www.reddit.com/r/firefox/comments/bbugc5/firefox_bans_free_speech_commenting_plugin/?ref=readnext',
        'reddit.com/r/firefox/comments/bbugc5/firefox_bans_free_speech_commenting_plugin',
    ),
    (
        'https://www.reddit.com/r/selfhosted/comments/8j8mo3/what_are_you_self_hosting/dz19gh9/?utm_content=permalink&utm_medium=user&utm_source=reddit&utm_name=u_karlicoss',
        'reddit.com/r/selfhosted/comments/8j8mo3/what_are_you_self_hosting/dz19gh9',
    ),
    # TODO hmm. parent relationship can just rely on urls for reddit
    # just need to support it in server I suppose

    # TODO search queries?
    # https://www.reddit.com/search?q=AutoValue

    # TODO def need better markdown handling
    # https://reddit.com/r/intj/comments/cmof04/me_irl/ew4a3dw/][ Me_irl]
    # reddit.com/r/intj/comments/cmof04/me_irl/ew4a3dw/%5D%5BMe_irl%5D
])
def test_reddit(url, expected):
    """Reddit post and comment links canonify to the expected form (no ref/utm_* query)."""
    actual = canonify(url)
    assert actual == expected
|
123
|
+
|
124
|
+
# ugh. good example of motivation for cannon.py?
|
125
|
+
@param('url,expected', [
    (
        'https://app.getpocket.com/read/3479402594',
        'app.getpocket.com/read/3479402594',
    ),
    (
        'https://getpocket.com/read/3479402594',
        'app.getpocket.com/read/3479402594',
    ),
])
def test_pocket(url, expected):
    """Both Pocket hostnames are expected to canonify to the app.getpocket.com form."""
    actual = canonify(url)
    assert actual == expected
|
136
|
+
|
137
|
+
@pytest.mark.parametrize(("url", "expected"), [
    # TODO ?? 'https://groups.google.com/a/list.hypothes.is/forum/#!topic/dev/kcmS7H8ssis',
    #
    # TODO FIXME fragment handling
    # ( "https://www.scottaaronson.com/blog/?p=3167#comment-1731882"
    # , "scottaaronson.com/blog/?p=3167#comment-1731882"
    # ),

    # TODO FIXME fragment handling
    # ( "https://en.wikipedia.org/wiki/tendon#cite_note-14"
    # , "en.wikipedia.org/wiki/tendon#cite_note-14"
    # ),

    # TODO FIXME fragment handling
    # ( "https://physicstravelguide.com/experiments/aharonov-bohm#tab__concrete"
    # , "physicstravelguide.com/experiments/aharonov-bohm#tab__concrete"
    # ),

    (
        "https://github.com/search?o=asc&q=track&s=stars&type=Repositories",
        "github.com/search?q=track",
    ),
    (
        "https://80000hours.org/career-decision/article/?utm_source=The+EA+Newsletter&utm_campaign=04ca3c2244-EMAIL_CAMPAIGN_2019_04_03_04_26&utm_medium=email&utm_term=0_51c1df13ac-04ca3c2244-318697649",
        "80000hours.org/career-decision/article",
    ),
    (
        "https://www.facebook.com/photo.php?fbid=24147689823424326&set=pcb.2414778905423667&type=3&theater",
        "facebook.com/photo.php?fbid=24147689823424326",
    ),
    (
        "https://play.google.com/store/apps/details?id=com.faultexception.reader&hl=en",
        "play.google.com/store/apps/details?id=com.faultexception.reader",
    ),
    # TODO it also got &p= parameter, which refers to page... not sure how to handle this
    # news.ycombinator.com/item?id=15451442&p=2
    (
        "https://news.ycombinator.com/item?id=12172351",
        "news.ycombinator.com/item?id=12172351",
    ),
    (
        "https://urbandictionary.com/define.php?term=Belgian%20Whistle",
        "urbandictionary.com/define.php?term=Belgian%20Whistle",
    ),
    (
        "https://en.wikipedia.org/wiki/Dinic%27s_algorithm",
        "en.wikipedia.org/wiki/Dinic%27s_algorithm",
    ),

    (
        "zoopla.co.uk/to-rent/details/42756337#D0zlBWeD4X85odsR.97",
        "zoopla.co.uk/to-rent/details/42756337",
    ),

    (
        "withouthspec.co.uk/rooms/16867952?guests=2&adults=2&location=Berlin%2C+Germany&check_in=2017-08-16&check_out=2017-08-20",
        "withouthspec.co.uk/rooms/16867952",
    ),

    (
        "amp.theguardian.com/technology/2017/oct/09/mark-zuckerberg-facebook-puerto-rico-virtual-reality",
        "theguardian.com/technology/2017/oct/09/mark-zuckerberg-facebook-puerto-rico-virtual-reality",
    ),

    (
        "https://answers.yahoo.com/question/index?qid=20071101131442AAk9bGp",
        "answers.yahoo.com/question/index?qid=20071101131442AAk9bGp",
    ),
    (
        "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%e2%80%93-2010",
        "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%E2%80%93-2010",
    ),
    (
        "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-–-2010",
        "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%E2%80%93-2010",
    ),

    (
        "https://spoonuniversity.com/lifestyle/marmite-ways-to-eat-it&usg=AFQjCNH4s1SOEjlpENlfPV5nuvADZpSdow",
        "spoonuniversity.com/lifestyle/marmite-ways-to-eat-it",
    ),

    (
        'https://google.co.uk/amp/s/amp.reddit.com/r/androidapps/comments/757e2t/swiftkey_or_gboard',
        'reddit.com/r/androidapps/comments/757e2t/swiftkey_or_gboard',
    ),

    # should sort query params
    (
        'https://www.youtube.com/watch?v=hvoQiF0kBI8&list=WL&index=2',
        'youtube.com/watch?v=hvoQiF0kBI8&list=WL',
    ),
    (
        'https://www.youtube.com/watch?list=WL&v=hvoQiF0kBI8&index=2',
        'youtube.com/watch?v=hvoQiF0kBI8&list=WL',
    ),

    # TODO def need to allow the _user_ to define the rules.
    # no way I can predict everything
    # basically, allow *interactively* select
    # also allow introspection, which rule matched?
    (
        'https://bbs.archlinux.org/viewtopic.php?id=212740',
        'bbs.archlinux.org/viewtopic.php?id=212740',
    ),

    (
        'https://ubuntuforums.org/showthread.php?t=1403470&s=0dd67bdb12559c22e73a220752db50c7&p=8806195#post8806195',
        'ubuntuforums.org/showthread.php?t=1403470&p=8806195',
    ),

    (
        'https://arstechnica.com/?p=1371299',
        'arstechnica.com/?p=1371299',
        # eh. it's a redirect to https://arstechnica.com/information-technology/2018/09/dozens-of-ios-apps-surreptitiously-share-user-location-data-with-tracking-firms/
        # however in the page body there is <link rel="shorturl" href="https://arstechnica.com/?p=1371299"> ...
    ),

    # ( "gwern.net/DNB+FAQ"
    # , "TODO" # ???
    # ),

    # TODO shit. is that normal??? perhaps need to manually move fragment?
    # SplitResult(scheme='https', netloc='unix.stackexchange.com', path='/questions/171603/convert-file-contents-to-lower-case/171708', query='', fragment='171708&usg=AFQjCNEFCGqCAa4P4Zlu2x11bThJispNxQ')
    # ( "https://unix.stackexchange.com/questions/171603/convert-file-contents-to-lower-case/171708#171708&usg=AFQjCNEFCGqCAa4P4Zlu2x11bThJispNxQ"
    # , "unix.stackexchange.com/questions/171603/convert-file-contents-to-lower-case/171708#171708"
    # )
])
def test(url, expected):
    """Catch-all table of miscellaneous URLs and their expected canonical forms."""
    actual = canonify(url)
    assert actual == expected
|
248
|
+
# TODO github queries
|
249
|
+
# github.com/search?l=Python&q=reddit+backup
|
250
|
+
# github.com/search?p=3&q=ipynb+language%3AHaskell
|
251
|
+
# github.com/search?q=kobo+ExtraData
|
252
|
+
# github.com/search?q=what-universal-human-experiences-are-you-missing-without-realizing-it
|
253
|
+
|
254
|
+
# TODO git+https://github.com/expectocode/telegram-export@master
|
255
|
+
# TODO again, for that actually sequence would be good...
|
256
|
+
|
257
|
+
# TODO "https://twitter.com/search?q=pinboard search&src=typd"
|
258
|
+
|
259
|
+
# TODO https://www.zalando-lounge.ch/#/
|
260
|
+
# TODO m.facebook.com
|
261
|
+
# TODO [R('^(youtube|urbandictionary|tesco|scottaaronson|answers.yahoo.com|code.google.com)') , None],
|
262
|
+
|
263
|
+
|
264
|
+
|
265
|
+
# TODO
|
266
|
+
# amazon.co.uk/gp/offer-listing/B00525XKL4/ref=dp_olp_new
|
267
|
+
# amazon.co.uk/gp/offer-listing/B00525XKL4/ref=olp_twister_child
|
268
|
+
|
269
|
+
# TODO
|
270
|
+
# en.wikipedia.org/wiki/S&P_500_Index
|
271
|
+
|
272
|
+
|
273
|
+
# TODO
|
274
|
+
# google.co.uk/maps/place/Hackney+Bureau/@51.5293789,-0.0527919,16.88z/data=!bla-bla!-bla
|
275
|
+
|
276
|
+
|
277
|
+
# TODO
|
278
|
+
# perhaps, disable utf8 everywhere?
|
279
|
+
# github.com/search?utf8=%E2%9C%93&q=%22My+Clippings.txt%22
|
280
|
+
|
281
|
+
# TODO FIXME fragment handling
|
282
|
+
# ( "https://www.scottaaronson.com/blog/?p=3167#comment-1731882"
|
283
|
+
# , "scottaaronson.com/blog/?p=3167#comment-1731882"
|
284
|
+
# ),
|
285
|
+
|
286
|
+
@pytest.mark.parametrize("urls", [
    {
        "launchpad.net/ubuntu/%2Bsource/okular",
        "launchpad.net/ubuntu/+source/okular",
    },
    {
        "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-–-2010",
        "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%e2%80%93-2010",
        "https://flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%e2%80%93-2010/&usg=AFQjCNEZsEGz9rqpWqlFXR5Tc7pkCKY5sQ",
    },
])
def test_same_norm(urls):
    """Every URL within one set must canonify to the same string."""
    # sets are unordered; sort so the reference URL is deterministic
    ordered = sorted(urls)
    reference = ordered[0]
    reference_canon = canonify(reference)
    for other in ordered[1:]:
        other_canon = canonify(other)
        assert reference_canon == other_canon, f'Expected {reference} and {other} to be same canonically; got {reference_canon} and {other_canon} instead'
|
304
|
+
|
305
|
+
def test_error():
    """A URL that cannot be canonified must raise CanonifyException."""
    # canonify(' +74Zo535, fewfwf@gmail.com') # -- apparently was patched in some python3.7 versions

    # borrowed from https://bugs.mageia.org/show_bug.cgi?id=24640#c7
    malformed = 'https://example.com\uFF03@bing.com'
    with pytest.raises(CanonifyException):
        canonify(malformed)
|
310
|
+
|
311
|
+
@pytest.mark.parametrize(("url", "expected"), [
    (
        'https://news.ycombinator.com/item?id=',
        'news.ycombinator.com/item?id=',
    ),
    (
        'https://www.youtube.com/watch?v=hvoQiF0kBI8&list&index=2',
        'youtube.com/watch?v=hvoQiF0kBI8&list=',
    ),
])
def test_empty_query_parameter(url, expected):
    """Query parameters with an empty or missing value are kept as ``key=``."""
    actual = canonify(url)
    assert actual == expected
|
318
|
+
|
319
|
+
@pytest.mark.parametrize(("url", "expected"), [
    (
        'http://www.isfdb.org/cgi-bin/title.cgi?2172',
        'isfdb.org/cgi-bin/title.cgi?2172=',
    ),
    (
        'http://www.isfdb.org/cgi-bin/title.cgi?2172+1',
        'isfdb.org/cgi-bin/title.cgi?2172%201=',
    ),
    (
        'http://www.isfdb.org/cgi-bin/title.cgi?2172&foo=bar&baz&quux',
        'isfdb.org/cgi-bin/title.cgi?2172=&baz=&foo=bar&quux=',
    ),
])
def test_qkeep_true(url, expected):
    """isfdb.org links are expected to keep their whole query, including value-less keys."""
    actual = canonify(url)
    assert actual == expected
|
@@ -0,0 +1,40 @@
|
|
1
|
+
import os
|
2
|
+
import time
|
3
|
+
|
4
|
+
import pytest
|
5
|
+
import requests
|
6
|
+
|
7
|
+
from ..common import _is_windows
|
8
|
+
from .common import get_testdata, promnesia_bin, tmp_popen
|
9
|
+
|
10
|
+
ox_hugo_data = get_testdata('ox-hugo/test/site')  # test data location passed to the `demo` command in test_demo
|
11
|
+
|
12
|
+
|
13
|
+
def test_demo() -> None:
    """Start `promnesia demo` on a fixed port and query its /search endpoint.

    The server is polled once a second (up to 30 attempts) until it answers;
    the response is then checked for the expected visits.

    Raises:
        RuntimeError: if no attempt produced a JSON response.
    """
    if _is_windows:
        # for some reason fails to connect to server..
        # not sure maybe something with port choice idk
        pytest.skip("TODO broken on Windows")

    with tmp_popen(promnesia_bin('demo', '--port', '16789', ox_hugo_data)):
        # TODO why does it want post??
        time.sleep(2)  # meh.. need a generic helper to wait till ready...
        res = {}
        for _attempt in range(30):
            time.sleep(1)
            try:
                res = requests.post(
                    "http://localhost:16789/search",
                    json={'url': "https://github.com/kaushalmodi/ox-hugo/issues"},
                ).json()
                break
            except (requests.exceptions.RequestException, ValueError):
                # Server not accepting connections yet, or it answered with
                # something that isn't JSON -- retry. Deliberately NOT a bare
                # `except:` so Ctrl-C / SystemExit and genuine bugs surface.
                continue
        else:
            raise RuntimeError("Couldn't connect to the server")
        vis = res['visits']
        assert len(vis) > 50, vis
        # locator titles use the platform path separator
        mds = [x for x in vis if x['locator']['title'] == 'content/posts/citations-example-toml.md'.replace('/', os.sep)]
        orgs = [x for x in vis if x['locator']['title'].startswith('content-org/single-posts/empty_tag.org'.replace('/', os.sep))]
        assert len(mds) == 1
        assert len(orgs) == 1
|
@@ -0,0 +1,30 @@
|
|
1
|
+
import shutil
|
2
|
+
from pathlib import Path
|
3
|
+
|
4
|
+
from ..compare import compare_files
|
5
|
+
from .utils import index_urls
|
6
|
+
|
7
|
+
|
8
|
+
def test_compare(tmp_path: Path) -> None:
    """Index two URL sets in turn and compare the resulting databases both ways."""
    old_urls = {
        'https://example.com': None,
        'https://en.wikipedia.org/wiki/Saturn_V': None,
        'https://plato.stanford.edu/entries/qualia': None,
    }
    new_urls = {
        'https://example.com': None,
        'https://www.reddit.com/r/explainlikeimfive/comments/1ev6e0/eli5entropy': None,
        'https://en.wikipedia.org/wiki/Saturn_V': None,
        'https://plato.stanford.edu/entries/qualia': None,
    }
    db = tmp_path / 'promnesia.sqlite'
    old_db = tmp_path / 'promnesia-old.sqlite'

    # first run; move the result aside so the second run can write `db` again
    index_old = index_urls(old_urls)
    index_old(tmp_path)
    shutil.move(str(db), str(old_db))

    index_new = index_urls(new_urls)
    index_new(tmp_path)

    # should not crash, as there are more links in the new database
    forward = list(compare_files(old_db, db))
    assert len(forward) == 0

    backward = list(compare_files(db, old_db))
    assert len(backward) == 1
|
@@ -0,0 +1,289 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from contextlib import contextmanager
|
4
|
+
from pathlib import Path
|
5
|
+
from tempfile import TemporaryDirectory
|
6
|
+
|
7
|
+
import pytest
|
8
|
+
from more_itertools import ilen
|
9
|
+
|
10
|
+
from ..common import Source
|
11
|
+
from ..config import Config, import_config
|
12
|
+
from .common import throw
|
13
|
+
|
14
|
+
|
15
|
+
def make(body: str) -> Config:
    """Write ``body`` to ``cfg.py`` in a temporary directory and import it as a Config.

    The temporary directory is removed when the ``with`` block exits, i.e.
    right after ``import_config`` has returned.
    """
    with TemporaryDirectory() as tmp:
        config_path = Path(tmp, 'cfg.py')
        config_path.write_text(body)
        return import_config(config_path)
|
21
|
+
|
22
|
+
|
23
|
+
@contextmanager
def with_config(cfg: str | Config):
    """Context manager that sets ``cfg`` as the config module's ``instance``.

    ``cfg`` is either a ready Config or config source text (built via ``make``).
    Asserts that no config is set on entry; the config module is reset on exit,
    even if the body raised.
    """
    from .. import config as C

    assert not C.has()
    if isinstance(cfg, str):
        cfg2: Config = make(cfg)
    else:
        cfg2 = cfg

    try:
        C.instance = cfg2
        assert C.has()
        yield
    finally:
        C.reset()
|
35
|
+
|
36
|
+
|
37
|
+
def index(cfg: str | Config, *, check: bool = True) -> list[Exception]:
    """Run ``_do_index`` with ``cfg`` active and return everything it yielded.

    With ``check=True`` (the default) the run must yield nothing, otherwise
    the assertion fails and shows the collected items.
    """
    from ..__main__ import _do_index

    with with_config(cfg):
        errors: list[Exception] = []
        errors.extend(_do_index())
        if check:
            assert len(errors) == 0, errors
        # visits = cfg.output_dir / 'promnesia.sqlite'
        # TODO query visit count too
        return errors
|
47
|
+
|
48
|
+
|
49
|
+
def test_minimal() -> None:
    '''
    The smallest possible config: a single 'demo' source
    '''
    # import directly from promnesia, not promnesia.common
    config_body = '''
from promnesia import Source
from promnesia.sources import demo

SOURCES = [
    Source(demo.index),
]
'''
    cfg = make(config_body)

    assert ilen(cfg.sources) == 1
    for s in cfg.sources:
        assert isinstance(s, Source)
    # todo output dirs?
    index(cfg)
|
68
|
+
|
69
|
+
|
70
|
+
def test_sources_style_1() -> None:
    '''
    The different 'styles' in which sources can be listed in a config
    '''
    config_body = '''
from promnesia.common import Source
from promnesia.sources import demo

SOURCES = [
    # you can pass arguments to index functions
    Source(demo.index, count=10, name='explicit name'),

    # or rely on the default argument!
    Source(demo.index, name='another name'),

    # or rely on default source name name (will be guessed as 'demo')
    Source(demo.index),

    # rely on default index function
    Source(demo),

    # no need for Source() either!
    demo.index,
    demo,

    # I guess this is as simple as it possibly gets...
    'promnesia.sources.demo',

    # just in case, test lambdas
    # with list
    lambda: list(demo.index()),

    # with generator
    lambda: iter(list(demo.index())),

    # example of lazy source
    # useful when arguments are somehow computed dynamically in config
    Source(lambda: demo.index(count=10), name='lazy'),
]
'''
    cfg = make(config_body)

    # anything that is not a Source goes through throw()
    srcs = []
    for s in cfg.sources:
        srcs.append(s if isinstance(s, Source) else throw(s))

    # unpacking doubles as a check that exactly ten sources came out
    [s1, s2, s3, s4, s5, s55, s6, s7, s77, s777] = srcs

    # just a quick check to make sure tests import promnesia package correctly
    # (depends on conftests settings)
    assert type(srcs[0]).__module__ == 'promnesia.common', srcs

    assert s1.name == 'explicit name'
    assert s2.name == 'another name'
    for guessed in (s3, s4, s5, s55, s6):
        assert guessed.name == 'demo'

    # can't say 'cfg' as name is intended here but anyway
    for from_lambda in (s7, s77):
        assert from_lambda.name == 'cfg'
    assert s777.name == 'lazy'

    index(cfg)
    # TODO assert on results count?
|
134
|
+
# TODO assert on results count?
|
135
|
+
|
136
|
+
|
137
|
+
# TODO ugh. allow not to have locator
|
138
|
+
# ideally you can construct a visit with a link and that's it
|
139
|
+
def test_sources_style_2() -> None:
    '''
    Sources are nothing special -- plain functions that emit visits
    '''
    config_body = '''
from typing import Iterable
from promnesia.common import Visit, Source, Loc

def my_indexer() -> Iterable[Visit]:
    from datetime import datetime
    for link in ['reddit.com', 'beepb00p.xyz']:
        yield Visit(
            url=link,
            dt=datetime.min,
            locator=Loc.make('test'),
        )

SOURCES = [
    # you can just pass the function name here
    my_indexer,

    # or give it an explicit name (instead of a guess)
    Source(my_indexer, name='nice name'),
]


class MyIndexer:
    def index():
        from promnesia.sources import demo
        return list(demo.index())

SOURCES.append(
    MyIndexer,
)

'''
    cfg = make(config_body)

    # anything that is not a Source goes through throw()
    srcs = []
    for s in cfg.sources:
        srcs.append(s if isinstance(s, Source) else throw(s))
    [s1, s2, s3] = srcs

    assert s1.name == 'cfg'  # TODO would be nice to guess 'my_indexer' instead...
    assert s2.name == 'nice name'
    assert s3.name == 'cfg'  # TODO fix it, make MyIndexer?

    index(cfg)
|
184
|
+
|
185
|
+
|
186
|
+
def test_sources_lazy() -> None:
    '''
    Demonstration of ways to return 'lazy' and generally more advanced sources

    Lazy sources could be useful to do some conditional magic or make more defensive against imports, extra configuration. You'll know when you need it ;)
    '''
    # NOTE: `-> None` matches the other tests in this module; without a
    # return annotation type checkers skip the function body by default.

    cfg = make(
        '''
from promnesia.common import Source

def lazy():
    from promnesia.sources import demo
    print("Hello, I'm so lazy...")
    yield from demo.index()

SOURCES = [
    lazy,
]
'''
    )
    # anything that is not a Source goes through throw()
    srcs = [s if isinstance(s, Source) else throw(s) for s in cfg.sources]
    [s] = srcs

    assert s.name == 'cfg'  # TODO this should be fixed... but not a big deal

    index(cfg)
|
213
|
+
|
214
|
+
|
215
|
+
# TODO later
|
216
|
+
# or like that:
|
217
|
+
# (i for i in lazy()),
|
218
|
+
|
219
|
+
# TODO later, support stuff that returns sources lazily? e.g. lambda: Source(...)
|
220
|
+
# not sure if it's very useful
|
221
|
+
|
222
|
+
|
223
|
+
def test_sources_errors() -> None:
    '''
    Broken entries in SOURCES must not break config loading
    '''
    config_body = '''
SOURCES = [
    'non.existing.module',

    lambda: bad.attribute,

    'promnesia.sources.demo',
]
'''
    cfg = make(config_body)

    # nothing fails so far! It's defensive!
    srcs = list(cfg.sources)

    # the first entry is expected to come back as an Exception, the rest as Sources
    e1, s1, s2 = srcs
    assert isinstance(e1, Exception)
    for src in (s1, s2):
        assert isinstance(src, Source)

    errors = index(cfg, check=False)
    assert len(errors) == 2  # errors simply propagate
|
250
|
+
|
251
|
+
|
252
|
+
def test_no_sources() -> None:
    """A config that defines no SOURCES raises once its sources are read."""
    config_body = '''
'''
    cfg = make(config_body)

    # raises because no SOURCES
    with pytest.raises(RuntimeError):
        list(cfg.sources)
|
260
|
+
|
261
|
+
|
262
|
+
def test_empty_sources() -> None:
    """A config with an empty SOURCES list raises once its sources are read."""
    config_body = '''
SOURCES = []
'''
    cfg = make(config_body)

    # raises because empty SOURCES
    with pytest.raises(RuntimeError):
        list(cfg.sources)
|
271
|
+
|
272
|
+
|
273
|
+
def test_legacy() -> None:
    """Legacy spelling (INDEXERS list, ``src=`` keyword) still yields a named Source."""
    config_body = '''
from promnesia.common import Source
from promnesia.sources import demo
INDEXERS = [
    Source(demo.index, src='legacy name'),
]
'''
    cfg = make(config_body)

    # exactly one source expected
    (s1,) = cfg.sources
    assert isinstance(s1, Source)

    assert s1.name == 'legacy name'

    index(cfg)
|