promnesia 1.2.20240810-py3-none-any.whl → 1.4.20250909-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. promnesia/__init__.py +18 -4
  2. promnesia/__main__.py +104 -78
  3. promnesia/cannon.py +108 -107
  4. promnesia/common.py +107 -88
  5. promnesia/compare.py +33 -30
  6. promnesia/compat.py +10 -10
  7. promnesia/config.py +37 -34
  8. promnesia/database/common.py +4 -3
  9. promnesia/database/dump.py +13 -13
  10. promnesia/database/load.py +7 -7
  11. promnesia/extract.py +19 -17
  12. promnesia/logging.py +27 -15
  13. promnesia/misc/install_server.py +32 -27
  14. promnesia/server.py +106 -79
  15. promnesia/sources/auto.py +104 -77
  16. promnesia/sources/auto_logseq.py +6 -5
  17. promnesia/sources/auto_obsidian.py +2 -2
  18. promnesia/sources/browser.py +20 -10
  19. promnesia/sources/browser_legacy.py +65 -50
  20. promnesia/sources/demo.py +7 -8
  21. promnesia/sources/fbmessenger.py +3 -3
  22. promnesia/sources/filetypes.py +22 -16
  23. promnesia/sources/github.py +9 -8
  24. promnesia/sources/guess.py +6 -2
  25. promnesia/sources/hackernews.py +7 -9
  26. promnesia/sources/hpi.py +5 -3
  27. promnesia/sources/html.py +11 -7
  28. promnesia/sources/hypothesis.py +3 -2
  29. promnesia/sources/instapaper.py +3 -2
  30. promnesia/sources/markdown.py +22 -12
  31. promnesia/sources/org.py +36 -17
  32. promnesia/sources/plaintext.py +41 -39
  33. promnesia/sources/pocket.py +5 -3
  34. promnesia/sources/reddit.py +24 -26
  35. promnesia/sources/roamresearch.py +5 -2
  36. promnesia/sources/rss.py +6 -8
  37. promnesia/sources/shellcmd.py +21 -11
  38. promnesia/sources/signal.py +27 -26
  39. promnesia/sources/smscalls.py +2 -3
  40. promnesia/sources/stackexchange.py +5 -4
  41. promnesia/sources/takeout.py +37 -34
  42. promnesia/sources/takeout_legacy.py +29 -19
  43. promnesia/sources/telegram.py +18 -12
  44. promnesia/sources/telegram_legacy.py +22 -11
  45. promnesia/sources/twitter.py +7 -6
  46. promnesia/sources/vcs.py +11 -6
  47. promnesia/sources/viber.py +11 -10
  48. promnesia/sources/website.py +8 -7
  49. promnesia/sources/zulip.py +3 -2
  50. promnesia/sqlite.py +13 -7
  51. promnesia/tests/common.py +10 -5
  52. promnesia/tests/server_helper.py +13 -10
  53. promnesia/tests/sources/test_auto.py +2 -3
  54. promnesia/tests/sources/test_filetypes.py +11 -8
  55. promnesia/tests/sources/test_hypothesis.py +10 -6
  56. promnesia/tests/sources/test_org.py +9 -5
  57. promnesia/tests/sources/test_plaintext.py +9 -8
  58. promnesia/tests/sources/test_shellcmd.py +13 -13
  59. promnesia/tests/sources/test_takeout.py +3 -5
  60. promnesia/tests/test_cannon.py +256 -239
  61. promnesia/tests/test_cli.py +12 -8
  62. promnesia/tests/test_compare.py +17 -13
  63. promnesia/tests/test_config.py +7 -8
  64. promnesia/tests/test_db_dump.py +15 -15
  65. promnesia/tests/test_extract.py +17 -10
  66. promnesia/tests/test_indexer.py +24 -18
  67. promnesia/tests/test_server.py +12 -13
  68. promnesia/tests/test_traverse.py +0 -2
  69. promnesia/tests/utils.py +3 -7
  70. promnesia-1.4.20250909.dist-info/METADATA +66 -0
  71. promnesia-1.4.20250909.dist-info/RECORD +80 -0
  72. {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info}/WHEEL +1 -2
  73. promnesia/kjson.py +0 -121
  74. promnesia/sources/__init__.pyi +0 -0
  75. promnesia-1.2.20240810.dist-info/METADATA +0 -54
  76. promnesia-1.2.20240810.dist-info/RECORD +0 -83
  77. promnesia-1.2.20240810.dist-info/top_level.txt +0 -1
  78. {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info}/entry_points.txt +0 -0
  79. {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info/licenses}/LICENSE +0 -0
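The wheel archives themselves are the easiest way to double-check this listing: wheels are plain zip files, so the per-file additions and removals can be reproduced with the standard library alone. A minimal sketch; it assumes both wheels were downloaded locally under their conventional filenames, which are not part of this diff:

    from zipfile import ZipFile

    # filenames below follow the standard wheel naming convention (assumed, not shown in this diff)
    old = set(ZipFile('promnesia-1.2.20240810-py3-none-any.whl').namelist())
    new = set(ZipFile('promnesia-1.4.20250909-py3-none-any.whl').namelist())

    # files only present in the old wheel, e.g. promnesia/kjson.py (entry 73 above)
    print('removed:', sorted(old - new))
    # files only present in the new wheel, e.g. the relocated dist-info entries
    print('added:  ', sorted(new - old))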
@@ -2,7 +2,7 @@ from typing import cast

  import pytest

- from ..cannon import canonify, CanonifyException
+ from ..cannon import CanonifyException, canonify

  # TODO should actually understand 'sequences'?
  # e.g.
@@ -27,299 +27,316 @@ def check(url, expected):
  # TODO assume spaces are not meaninfgul??
  # then could align URLs etc?

- @param('url,expected', [(
- 'https://www.youtube.com/watch?t=491s&v=1NHbPN9pNPM&index=63&list=WL',
- # NOTE: t= reordered, makes it more hierarchical
- # list as well, I guess makes the most sense to keep it at the very end.. since lists are more like tags
- 'youtube.com/watch?v=1NHbPN9pNPM&t=491s&list=WL'
- ), (
- 'youtube.com/watch?v=wHrCkyoe72U&feature=share&time_continue=6',
- 'youtube.com/watch?v=wHrCkyoe72U'
- ), (
- 'youtube.com/embed/nyc6RJEEe0U?feature=oembed',
- 'youtube.com/watch?v=nyc6RJEEe0U'
- ), (
- 'https://youtu.be/iCvmsMzlF7o?list=WL',
- 'youtube.com/watch?v=iCvmsMzlF7o&list=WL'
- ),
- # TODO can even be like that or contain timestamp (&t=)
- # TODO warn if param already present? shouldn't happen..
-
- # TODO could be interesting to do automatic rule extraction by querying one represnetative and then extracting canonical
-
- # TODO national domains don't matter for youtube
-
- # [*, 'youtube', ANY_DOMAIN] / 'embed' -> 'youtube.com/watch'
- # TODO use regex backrefs?
- #
- (
- 'm.youtube.com/watch?v=Zn6gV2sdl38',
- 'youtube.com/watch?v=Zn6gV2sdl38'
- ),
-
- # ( "https//youtube.com/playlist?list=PLeOfc0M-50LmJtZwyOfw6aVopmIbU1t7t"
- # , "youtube.com/playlist?list=PLeOfc0M-50LmJtZwyOfw6aVopmIbU1t7t"
- # ),
- # TODO perhaps it should result in video link + sibling link?
- # when exploring other people's playlists this could be quite useful?
-
- # ( "https://www.youtube.com/watch?v=1NHbPN9pNPM&index=63&list=WL&t=491s"
- # , "youtube.com/watch?v=1NHbPN9pNPM&list=WL" # TODO not so sure about &t, it's sort of useful
- # ),
- # TODO
- # youtube.com/user/magauchsein/playlists?sort=dd&view=50&shelf_id=14
- # youtube.com/user/TheChemlife/videos?view=0&sort=p&flow=grid
- ])
+
+ @param(
+ 'url,expected',
+ [
+ (
+ 'https://www.youtube.com/watch?t=491s&v=1NHbPN9pNPM&index=63&list=WL',
+ # NOTE: t= reordered, makes it more hierarchical
+ # list as well, I guess makes the most sense to keep it at the very end.. since lists are more like tags
+ 'youtube.com/watch?v=1NHbPN9pNPM&t=491s&list=WL',
+ ),
+ ('youtube.com/watch?v=wHrCkyoe72U&feature=share&time_continue=6', 'youtube.com/watch?v=wHrCkyoe72U'),
+ ('youtube.com/embed/nyc6RJEEe0U?feature=oembed', 'youtube.com/watch?v=nyc6RJEEe0U'),
+ ('https://youtu.be/iCvmsMzlF7o?list=WL', 'youtube.com/watch?v=iCvmsMzlF7o&list=WL'),
+ # TODO can even be like that or contain timestamp (&t=)
+ # TODO warn if param already present? shouldn't happen..
+ # TODO could be interesting to do automatic rule extraction by querying one represnetative and then extracting canonical
+ # TODO national domains don't matter for youtube
+ #
+ # [*, 'youtube', ANY_DOMAIN] / 'embed' -> 'youtube.com/watch'
+ # TODO use regex backrefs?
+ #
+ ('m.youtube.com/watch?v=Zn6gV2sdl38', 'youtube.com/watch?v=Zn6gV2sdl38'),
+ # ( "https//youtube.com/playlist?list=PLeOfc0M-50LmJtZwyOfw6aVopmIbU1t7t"
+ # , "youtube.com/playlist?list=PLeOfc0M-50LmJtZwyOfw6aVopmIbU1t7t"
+ # ),
+ # TODO perhaps it should result in video link + sibling link?
+ # when exploring other people's playlists this could be quite useful?
+ #
+ # ( "https://www.youtube.com/watch?v=1NHbPN9pNPM&index=63&list=WL&t=491s"
+ # , "youtube.com/watch?v=1NHbPN9pNPM&list=WL" # TODO not so sure about &t, it's sort of useful
+ # ),
+ # TODO
+ # youtube.com/user/magauchsein/playlists?sort=dd&view=50&shelf_id=14
+ # youtube.com/user/TheChemlife/videos?view=0&sort=p&flow=grid
+ ],
+ )
  def test_youtube(url, expected):
  assert canonify(url) == expected


- @param('url,expected', [(
- 'https://web.archive.org/web/20090902224414/http://reason.com/news/show/119237.html',
- 'reason.com/news/show/119237.html',
- )])
+ @param(
+ 'url,expected',
+ [
+ (
+ 'https://web.archive.org/web/20090902224414/http://reason.com/news/show/119237.html',
+ 'reason.com/news/show/119237.html',
+ )
+ ],
+ )
  def test_archiveorg(url, expected):
  assert canonify(url) == expected


  # ugh. good example of motication for cannon.py?
- @param('url,expected', [(
- 'https://news.ycombinator.com/from?site=jacopo.io',
- 'jacopo.io',
- ), (
- 'https://news.ycombinator.com/item?id=25099862',
- 'news.ycombinator.com/item?id=25099862',
- ), (
- 'https://news.ycombinator.com/reply?id=25100035&goto=item%3Fid%3D25099862%2325100035',
- TODO,
- )])
+ @param(
+ 'url,expected',
+ [
+ (
+ 'https://news.ycombinator.com/from?site=jacopo.io',
+ 'jacopo.io',
+ ),
+ (
+ 'https://news.ycombinator.com/item?id=25099862',
+ 'news.ycombinator.com/item?id=25099862',
+ ),
+ (
+ 'https://news.ycombinator.com/reply?id=25100035&goto=item%3Fid%3D25099862%2325100035',
+ TODO,
+ ),
+ ],
+ )
  def test_hackernews(url, expected):
  check(url, expected)


- @param('url, expected', [
- ( 'https://www.reddit.com/r/firefox/comments/bbugc5/firefox_bans_free_speech_commenting_plugin/?ref=readnext'
- , 'reddit.com/r/firefox/comments/bbugc5/firefox_bans_free_speech_commenting_plugin',
- ),
-
- ( 'https://www.reddit.com/r/selfhosted/comments/8j8mo3/what_are_you_self_hosting/dz19gh9/?utm_content=permalink&utm_medium=user&utm_source=reddit&utm_name=u_karlicoss'
- , 'reddit.com/r/selfhosted/comments/8j8mo3/what_are_you_self_hosting/dz19gh9',
- )
- # TODO hmm. parent relationship can just rely on urls for reddit
- # just need to support it in server I suppose
-
- # TODO search queries?
- # https://www.reddit.com/search?q=AutoValue
-
- # TODO def need better markdown handling
- # https://reddit.com/r/intj/comments/cmof04/me_irl/ew4a3dw/][ Me_irl]
- # reddit.com/r/intj/comments/cmof04/me_irl/ew4a3dw/%5D%5BMe_irl%5D
-
-
-
- ])
+ @param(
+ 'url, expected',
+ [
+ (
+ 'https://www.reddit.com/r/firefox/comments/bbugc5/firefox_bans_free_speech_commenting_plugin/?ref=readnext',
+ 'reddit.com/r/firefox/comments/bbugc5/firefox_bans_free_speech_commenting_plugin',
+ ),
+ (
+ 'https://www.reddit.com/r/selfhosted/comments/8j8mo3/what_are_you_self_hosting/dz19gh9/?utm_content=permalink&utm_medium=user&utm_source=reddit&utm_name=u_karlicoss',
+ 'reddit.com/r/selfhosted/comments/8j8mo3/what_are_you_self_hosting/dz19gh9',
+ ),
+ # TODO hmm. parent relationship can just rely on urls for reddit
+ # just need to support it in server I suppose
+ #
+ # TODO search queries?
+ # https://www.reddit.com/search?q=AutoValue
+ #
+ # TODO def need better markdown handling
+ # https://reddit.com/r/intj/comments/cmof04/me_irl/ew4a3dw/][ Me_irl]
+ # reddit.com/r/intj/comments/cmof04/me_irl/ew4a3dw/%5D%5BMe_irl%5D
+ ],
+ )
  def test_reddit(url, expected):
  assert canonify(url) == expected

+
  # ugh. good example of motication for cannon.py?
- @param('url,expected', [
- ( 'https://app.getpocket.com/read/3479402594'
- , 'app.getpocket.com/read/3479402594'
- ),
-
- ( 'https://getpocket.com/read/3479402594'
- , 'app.getpocket.com/read/3479402594'
- ),
- ])
+ @param(
+ 'url,expected',
+ [
+ ('https://app.getpocket.com/read/3479402594', 'app.getpocket.com/read/3479402594'),
+ ('https://getpocket.com/read/3479402594', 'app.getpocket.com/read/3479402594'),
+ ],
+ )
  def test_pocket(url, expected):
  assert canonify(url) == expected

- @pytest.mark.parametrize("url,expected", [
- # TODO ?? 'https://groups.google.com/a/list.hypothes.is/forum/#!topic/dev/kcmS7H8ssis',
- #
- # TODO FIXME fragment handling
- # ( "https://www.scottaaronson.com/blog/?p=3167#comment-1731882"
- # , "scottaaronson.com/blog/?p=3167#comment-1731882"
- # ),
-
-
- # TODO FIXME fragment handling
- # ( "https://en.wikipedia.org/wiki/tendon#cite_note-14"
- # , "en.wikipedia.org/wiki/tendon#cite_note-14"
- # ),
-
- # TODO FIXME fragment handling
- # ( "https://physicstravelguide.com/experiments/aharonov-bohm#tab__concrete"
- # , "physicstravelguide.com/experiments/aharonov-bohm#tab__concrete"
- # ),
-
- ( "https://github.com/search?o=asc&q=track&s=stars&type=Repositories"
- , "github.com/search?q=track"
- ),
- ( "https://80000hours.org/career-decision/article/?utm_source=The+EA+Newsletter&utm_campaign=04ca3c2244-EMAIL_CAMPAIGN_2019_04_03_04_26&utm_medium=email&utm_term=0_51c1df13ac-04ca3c2244-318697649"
- , "80000hours.org/career-decision/article"
- ),
- ( "https://www.facebook.com/photo.php?fbid=24147689823424326&set=pcb.2414778905423667&type=3&theater"
- , "facebook.com/photo.php?fbid=24147689823424326"
- ),
- ( "https://play.google.com/store/apps/details?id=com.faultexception.reader&hl=en"
- , "play.google.com/store/apps/details?id=com.faultexception.reader"
- ),
- # TODO it also got &p= parameter, which refers to page... not sure how to handle this
- # news.ycombinator.com/item?id=15451442&p=2
- ( "https://news.ycombinator.com/item?id=12172351"
- , "news.ycombinator.com/item?id=12172351"
- ),
- ( "https://urbandictionary.com/define.php?term=Belgian%20Whistle"
- , "urbandictionary.com/define.php?term=Belgian%20Whistle"
- ),
- ( "https://en.wikipedia.org/wiki/Dinic%27s_algorithm"
- , "en.wikipedia.org/wiki/Dinic%27s_algorithm"
- ),
-
- ( "zoopla.co.uk/to-rent/details/42756337#D0zlBWeD4X85odsR.97"
- , "zoopla.co.uk/to-rent/details/42756337"
- ),
-
- ( "withouthspec.co.uk/rooms/16867952?guests=2&adults=2&location=Berlin%2C+Germany&check_in=2017-08-16&check_out=2017-08-20"
- , "withouthspec.co.uk/rooms/16867952"
- ),
-
- ( "amp.theguardian.com/technology/2017/oct/09/mark-zuckerberg-facebook-puerto-rico-virtual-reality"
- , "theguardian.com/technology/2017/oct/09/mark-zuckerberg-facebook-puerto-rico-virtual-reality",
- ),
-
- ( "https://answers.yahoo.com/question/index?qid=20071101131442AAk9bGp"
- , "answers.yahoo.com/question/index?qid=20071101131442AAk9bGp"
- ),
- ( "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%e2%80%93-2010"
- , "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%E2%80%93-2010"
- ),
- ( "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-–-2010"
- , "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%E2%80%93-2010"
- ),
-
- ( "https://spoonuniversity.com/lifestyle/marmite-ways-to-eat-it&usg=AFQjCNH4s1SOEjlpENlfPV5nuvADZpSdow"
- , "spoonuniversity.com/lifestyle/marmite-ways-to-eat-it"
- ),
-
- ( 'https://google.co.uk/amp/s/amp.reddit.com/r/androidapps/comments/757e2t/swiftkey_or_gboard'
- , 'reddit.com/r/androidapps/comments/757e2t/swiftkey_or_gboard'
- ),
-
- # should sort query params
- ( 'https://www.youtube.com/watch?v=hvoQiF0kBI8&list=WL&index=2'
- , 'youtube.com/watch?v=hvoQiF0kBI8&list=WL',
- ),
- ( 'https://www.youtube.com/watch?list=WL&v=hvoQiF0kBI8&index=2'
- , 'youtube.com/watch?v=hvoQiF0kBI8&list=WL',
- ),
-
- # TODO def need to allow the _user_ to define the rules.
- # no way I can predict everything
- # basically, allow *interactively* select
- # also allow introspection, which rule matched?
- ( 'https://bbs.archlinux.org/viewtopic.php?id=212740'
- , 'bbs.archlinux.org/viewtopic.php?id=212740',
- ),
-
- ( 'https://ubuntuforums.org/showthread.php?t=1403470&s=0dd67bdb12559c22e73a220752db50c7&p=8806195#post8806195'
- , 'ubuntuforums.org/showthread.php?t=1403470&p=8806195',
- ),
-
- ( 'https://arstechnica.com/?p=1371299',
- 'arstechnica.com/?p=1371299',
- # eh. it's a redirect to https://arstechnica.com/information-technology/2018/09/dozens-of-ios-apps-surreptitiously-share-user-location-data-with-tracking-firms/
- # however in the page body there is <link rel="shorturl" href="https://arstechnica.com/?p=1371299"> ...
- ),
-
- # ( "gwern.net/DNB+FAQ"
- # , "TODO" # ???
- # ),
-
- # TODO shit. is that normal??? perhaps need to manually move fragment?
- # SplitResult(scheme='https', netloc='unix.stackexchange.com', path='/questions/171603/convert-file-contents-to-lower-case/171708', query='', fragment='171708&usg=AFQjCNEFCGqCAa4P4Zlu2x11bThJispNxQ')
- # ( "https://unix.stackexchange.com/questions/171603/convert-file-contents-to-lower-case/171708#171708&usg=AFQjCNEFCGqCAa4P4Zlu2x11bThJispNxQ"
- # , "unix.stackexchange.com/questions/171603/convert-file-contents-to-lower-case/171708#171708"
- # )
- ])
+
+ @pytest.mark.parametrize(
+ ("url", "expected"),
+ [
+ # TODO ?? 'https://groups.google.com/a/list.hypothes.is/forum/#!topic/dev/kcmS7H8ssis',
+ #
+ # TODO FIXME fragment handling
+ # ( "https://www.scottaaronson.com/blog/?p=3167#comment-1731882"
+ # , "scottaaronson.com/blog/?p=3167#comment-1731882"
+ # ),
+ # TODO FIXME fragment handling
+ # ( "https://en.wikipedia.org/wiki/tendon#cite_note-14"
+ # , "en.wikipedia.org/wiki/tendon#cite_note-14"
+ # ),
+ # TODO FIXME fragment handling
+ # ( "https://physicstravelguide.com/experiments/aharonov-bohm#tab__concrete"
+ # , "physicstravelguide.com/experiments/aharonov-bohm#tab__concrete"
+ # ),
+ ("https://github.com/search?o=asc&q=track&s=stars&type=Repositories", "github.com/search?q=track"),
+ (
+ "https://80000hours.org/career-decision/article/?utm_source=The+EA+Newsletter&utm_campaign=04ca3c2244-EMAIL_CAMPAIGN_2019_04_03_04_26&utm_medium=email&utm_term=0_51c1df13ac-04ca3c2244-318697649",
+ "80000hours.org/career-decision/article",
+ ),
+ (
+ "https://www.facebook.com/photo.php?fbid=24147689823424326&set=pcb.2414778905423667&type=3&theater",
+ "facebook.com/photo.php?fbid=24147689823424326",
+ ),
+ (
+ "https://play.google.com/store/apps/details?id=com.faultexception.reader&hl=en",
+ "play.google.com/store/apps/details?id=com.faultexception.reader",
+ ),
+ # TODO it also got &p= parameter, which refers to page... not sure how to handle this
+ # news.ycombinator.com/item?id=15451442&p=2
+ ("https://news.ycombinator.com/item?id=12172351", "news.ycombinator.com/item?id=12172351"),
+ (
+ "https://urbandictionary.com/define.php?term=Belgian%20Whistle",
+ "urbandictionary.com/define.php?term=Belgian%20Whistle",
+ ),
+ ("https://en.wikipedia.org/wiki/Dinic%27s_algorithm", "en.wikipedia.org/wiki/Dinic%27s_algorithm"),
+ ("zoopla.co.uk/to-rent/details/42756337#D0zlBWeD4X85odsR.97", "zoopla.co.uk/to-rent/details/42756337"),
+ (
+ "withouthspec.co.uk/rooms/16867952?guests=2&adults=2&location=Berlin%2C+Germany&check_in=2017-08-16&check_out=2017-08-20",
+ "withouthspec.co.uk/rooms/16867952",
+ ),
+ (
+ "amp.theguardian.com/technology/2017/oct/09/mark-zuckerberg-facebook-puerto-rico-virtual-reality",
+ "theguardian.com/technology/2017/oct/09/mark-zuckerberg-facebook-puerto-rico-virtual-reality",
+ ),
+ (
+ "https://answers.yahoo.com/question/index?qid=20071101131442AAk9bGp",
+ "answers.yahoo.com/question/index?qid=20071101131442AAk9bGp",
+ ),
+ (
+ "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%e2%80%93-2010",
+ "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%E2%80%93-2010",
+ ),
+ (
+ "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-–-2010",
+ "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%E2%80%93-2010",
+ ),
+ (
+ "https://spoonuniversity.com/lifestyle/marmite-ways-to-eat-it&usg=AFQjCNH4s1SOEjlpENlfPV5nuvADZpSdow",
+ "spoonuniversity.com/lifestyle/marmite-ways-to-eat-it",
+ ),
+ (
+ 'https://google.co.uk/amp/s/amp.reddit.com/r/androidapps/comments/757e2t/swiftkey_or_gboard',
+ 'reddit.com/r/androidapps/comments/757e2t/swiftkey_or_gboard',
+ ),
+ # should sort query params
+ (
+ 'https://www.youtube.com/watch?v=hvoQiF0kBI8&list=WL&index=2',
+ 'youtube.com/watch?v=hvoQiF0kBI8&list=WL',
+ ),
+ (
+ 'https://www.youtube.com/watch?list=WL&v=hvoQiF0kBI8&index=2',
+ 'youtube.com/watch?v=hvoQiF0kBI8&list=WL',
+ ),
+ # TODO def need to allow the _user_ to define the rules.
+ # no way I can predict everything
+ # basically, allow *interactively* select
+ # also allow introspection, which rule matched?
+ (
+ 'https://bbs.archlinux.org/viewtopic.php?id=212740',
+ 'bbs.archlinux.org/viewtopic.php?id=212740',
+ ),
+ (
+ 'https://ubuntuforums.org/showthread.php?t=1403470&s=0dd67bdb12559c22e73a220752db50c7&p=8806195#post8806195',
+ 'ubuntuforums.org/showthread.php?t=1403470&p=8806195',
+ ),
+ (
+ 'https://arstechnica.com/?p=1371299',
+ 'arstechnica.com/?p=1371299',
+ # eh. it's a redirect to https://arstechnica.com/information-technology/2018/09/dozens-of-ios-apps-surreptitiously-share-user-location-data-with-tracking-firms/
+ # however in the page body there is <link rel="shorturl" href="https://arstechnica.com/?p=1371299"> ...
+ ),
+ # ( "gwern.net/DNB+FAQ"
+ # , "TODO" # ???
+ # ),
+ # TODO shit. is that normal??? perhaps need to manually move fragment?
+ # SplitResult(scheme='https', netloc='unix.stackexchange.com', path='/questions/171603/convert-file-contents-to-lower-case/171708', query='', fragment='171708&usg=AFQjCNEFCGqCAa4P4Zlu2x11bThJispNxQ')
+ # ( "https://unix.stackexchange.com/questions/171603/convert-file-contents-to-lower-case/171708#171708&usg=AFQjCNEFCGqCAa4P4Zlu2x11bThJispNxQ"
+ # , "unix.stackexchange.com/questions/171603/convert-file-contents-to-lower-case/171708#171708"
+ # )
+ ],
+ )
  def test(url, expected):
  assert canonify(url) == expected
  # TODO github queries
+
+
  # github.com/search?l=Python&q=reddit+backup
  # github.com/search?p=3&q=ipynb+language%3AHaskell
  # github.com/search?q=kobo+ExtraData
  # github.com/search?q=what-universal-human-experiences-are-you-missing-without-realizing-it

- # TODO git+https://github.com/expectocode/telegram-export@master
- # TODO again, for that actually sequence would be good...
-
- # TODO "https://twitter.com/search?q=pinboard search&src=typd"
+ # TODO git+https://github.com/expectocode/telegram-export@master
+ # TODO again, for that actually sequence would be good...

- # TODO https://www.zalando-lounge.ch/#/
- # TODO m.facebook.com
- # TODO [R('^(youtube|urbandictionary|tesco|scottaaronson|answers.yahoo.com|code.google.com)') , None],
+ # TODO "https://twitter.com/search?q=pinboard search&src=typd"

+ # TODO https://www.zalando-lounge.ch/#/
+ # TODO m.facebook.com
+ # TODO [R('^(youtube|urbandictionary|tesco|scottaaronson|answers.yahoo.com|code.google.com)') , None],


- # TODO
+ # TODO
  # amazon.co.uk/gp/offer-listing/B00525XKL4/ref=dp_olp_new
  # amazon.co.uk/gp/offer-listing/B00525XKL4/ref=olp_twister_child

- # TODO
- # en.wikipedia.org/wiki/S&P_500_Index
+ # TODO
+ # en.wikipedia.org/wiki/S&P_500_Index


- # TODO
- # google.co.uk/maps/place/Hackney+Bureau/@51.5293789,-0.0527919,16.88z/data=!bla-bla!-bla
+ # TODO
+ # google.co.uk/maps/place/Hackney+Bureau/@51.5293789,-0.0527919,16.88z/data=!bla-bla!-bla


- # TODO
- # perhaps, disable utf8 everywhere?
- # github.com/search?utf8=%E2%9C%93&q=%22My+Clippings.txt%22
+ # TODO
+ # perhaps, disable utf8 everywhere?
+ # github.com/search?utf8=%E2%9C%93&q=%22My+Clippings.txt%22

- # TODO FIXME fragment handling
- # ( "https://www.scottaaronson.com/blog/?p=3167#comment-1731882"
- # , "scottaaronson.com/blog/?p=3167#comment-1731882"
- # ),
+ # TODO FIXME fragment handling
+ # ( "https://www.scottaaronson.com/blog/?p=3167#comment-1731882"
+ # , "scottaaronson.com/blog/?p=3167#comment-1731882"
+ # ),

- @pytest.mark.parametrize("urls", [
- {
- "launchpad.net/ubuntu/%2Bsource/okular",
- "launchpad.net/ubuntu/+source/okular",
- },
- {
- "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-–-2010",
- "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%e2%80%93-2010",
- "https://flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%e2%80%93-2010/&usg=AFQjCNEZsEGz9rqpWqlFXR5Tc7pkCKY5sQ",
- },
- ])
+
+ @pytest.mark.parametrize(
+ "urls",
+ [
+ {
+ "launchpad.net/ubuntu/%2Bsource/okular",
+ "launchpad.net/ubuntu/+source/okular",
+ },
+ {
+ "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-–-2010",
+ "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%e2%80%93-2010",
+ "https://flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%e2%80%93-2010/&usg=AFQjCNEZsEGz9rqpWqlFXR5Tc7pkCKY5sQ",
+ },
+ ],
+ )
  def test_same_norm(urls):
- urls = list(sorted(urls))
+ urls = sorted(urls)
  u0 = urls[0]
  c0 = canonify(u0)
  for u in urls[1:]:
  c = canonify(u)
  assert c0 == c, f'Expected {u0} and {u} to be same canonically; got {c0} and {c} instead'

+
  def test_error():
  # canonify('  +74Zo535, fewfwf@gmail.com') # -- apparently was patched in some python3.7 versions
  with pytest.raises(CanonifyException):
  # borrowed from https://bugs.mageia.org/show_bug.cgi?id=24640#c7
- canonify('https://example.com\uFF03@bing.com')
+ canonify('https://example.com\uff03@bing.com')

- @pytest.mark.parametrize("url,expected", [
- ('https://news.ycombinator.com/item?id=', 'news.ycombinator.com/item?id='),
- ('https://www.youtube.com/watch?v=hvoQiF0kBI8&list&index=2',
- 'youtube.com/watch?v=hvoQiF0kBI8&list='),
- ])
+
+ @pytest.mark.parametrize(
+ ("url", "expected"),
+ [
+ ('https://news.ycombinator.com/item?id=', 'news.ycombinator.com/item?id='),
+ ('https://www.youtube.com/watch?v=hvoQiF0kBI8&list&index=2', 'youtube.com/watch?v=hvoQiF0kBI8&list='),
+ ],
+ )
  def test_empty_query_parameter(url, expected):
  assert canonify(url) == expected

- @pytest.mark.parametrize("url,expected", [
- ('http://www.isfdb.org/cgi-bin/title.cgi?2172', 'isfdb.org/cgi-bin/title.cgi?2172='),
- ('http://www.isfdb.org/cgi-bin/title.cgi?2172+1', 'isfdb.org/cgi-bin/title.cgi?2172%201='),
- ('http://www.isfdb.org/cgi-bin/title.cgi?2172&foo=bar&baz&quux', 'isfdb.org/cgi-bin/title.cgi?2172=&baz=&foo=bar&quux='),
- ])
+
+ @pytest.mark.parametrize(
+ ("url", "expected"),
+ [
+ ('http://www.isfdb.org/cgi-bin/title.cgi?2172', 'isfdb.org/cgi-bin/title.cgi?2172='),
+ ('http://www.isfdb.org/cgi-bin/title.cgi?2172+1', 'isfdb.org/cgi-bin/title.cgi?2172%201='),
+ (
+ 'http://www.isfdb.org/cgi-bin/title.cgi?2172&foo=bar&baz&quux',
+ 'isfdb.org/cgi-bin/title.cgi?2172=&baz=&foo=bar&quux=',
+ ),
+ ],
+ )
  def test_qkeep_true(url, expected):
  assert canonify(url) == expected
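The parametrized cases above double as a compact specification of promnesia's URL canonicalisation. A minimal usage sketch against the installed package; the import path follows promnesia/cannon.py from the file list, and the expected values are taken verbatim from the tests:

    from promnesia.cannon import CanonifyException, canonify

    # tracking noise and parameter ordering are normalised into a stable canonical form
    assert canonify(
        'https://www.youtube.com/watch?t=491s&v=1NHbPN9pNPM&index=63&list=WL'
    ) == 'youtube.com/watch?v=1NHbPN9pNPM&t=491s&list=WL'

    assert canonify(
        'https://news.ycombinator.com/item?id=12172351'
    ) == 'news.ycombinator.com/item?id=12172351'

    # malformed URLs raise CanonifyException (see test_error above)
    try:
        canonify('https://example.com\uff03@bing.com')
    except CanonifyException:
        pass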
@@ -1,13 +1,11 @@
  import os
  import time

- from ..common import _is_windows
-
- from .common import get_testdata, promnesia_bin, tmp_popen
-
  import pytest
  import requests

+ from ..common import _is_windows
+ from .common import get_testdata, promnesia_bin, tmp_popen

  ox_hugo_data = get_testdata('ox-hugo/test/site')
@@ -22,12 +20,12 @@ def test_demo() -> None:
  # TODO why does it want post??
  time.sleep(2) # meh.. need a generic helper to wait till ready...
  res = {}
- for attempt in range(30):
+ for _attempt in range(30):
  time.sleep(1)
  try:
  res = requests.post(
  "http://localhost:16789/search",
- json=dict(url="https://github.com/kaushalmodi/ox-hugo/issues"),
+ json={'url': "https://github.com/kaushalmodi/ox-hugo/issues"},
  ).json()
  break
  except:
@@ -36,7 +34,13 @@ def test_demo() -> None:
  raise RuntimeError("Couldn't connect to the server")
  vis = res['visits']
  assert len(vis) > 50, vis
- mds = [x for x in vis if x['locator']['title'] == 'content/posts/citations-example-toml.md'.replace('/', os.sep)]
- orgs = [x for x in vis if x['locator']['title'].startswith('content-org/single-posts/empty_tag.org'.replace('/', os.sep))]
+ mds = [
+ x for x in vis if x['locator']['title'] == 'content/posts/citations-example-toml.md'.replace('/', os.sep)
+ ]
+ orgs = [
+ x
+ for x in vis
+ if x['locator']['title'].startswith('content-org/single-posts/empty_tag.org'.replace('/', os.sep))
+ ]
  assert len(mds) == 1
  assert len(orgs) == 1
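For reference, the search endpoint polled by test_demo above can be queried by hand in the same way. A rough sketch, assuming a local promnesia server is already listening on localhost:16789 as in the test; the request body and response keys are the ones used there:

    import requests

    # mirrors the request issued in test_demo above
    res = requests.post(
        'http://localhost:16789/search',
        json={'url': 'https://github.com/kaushalmodi/ox-hugo/issues'},
    ).json()

    for visit in res['visits'][:5]:
        # each visit carries a locator pointing back at the indexed source file
        print(visit['locator']['title'])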