promnesia 1.1.20230129__py3-none-any.whl → 1.2.20240810__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promnesia/__main__.py +58 -50
- promnesia/cannon.py +4 -4
- promnesia/common.py +57 -38
- promnesia/compare.py +3 -2
- promnesia/compat.py +6 -65
- promnesia/config.py +4 -2
- promnesia/database/common.py +66 -0
- promnesia/database/dump.py +187 -0
- promnesia/{read_db.py → database/load.py} +10 -11
- promnesia/extract.py +1 -0
- promnesia/kjson.py +1 -1
- promnesia/logging.py +14 -14
- promnesia/misc/__init__.pyi +0 -0
- promnesia/misc/config_example.py +1 -2
- promnesia/misc/install_server.py +5 -4
- promnesia/server.py +24 -24
- promnesia/sources/__init__.pyi +0 -0
- promnesia/sources/auto.py +12 -7
- promnesia/sources/browser.py +80 -293
- promnesia/sources/browser_legacy.py +298 -0
- promnesia/sources/demo.py +18 -2
- promnesia/sources/filetypes.py +8 -0
- promnesia/sources/github.py +2 -2
- promnesia/sources/hackernews.py +1 -2
- promnesia/sources/hypothesis.py +1 -1
- promnesia/sources/markdown.py +15 -15
- promnesia/sources/org.py +7 -3
- promnesia/sources/plaintext.py +3 -1
- promnesia/sources/reddit.py +2 -2
- promnesia/sources/rss.py +5 -1
- promnesia/sources/shellcmd.py +6 -2
- promnesia/sources/signal.py +29 -20
- promnesia/sources/smscalls.py +8 -1
- promnesia/sources/stackexchange.py +2 -2
- promnesia/sources/takeout.py +132 -12
- promnesia/sources/takeout_legacy.py +10 -2
- promnesia/sources/telegram.py +79 -123
- promnesia/sources/telegram_legacy.py +117 -0
- promnesia/sources/vcs.py +1 -1
- promnesia/sources/viber.py +6 -15
- promnesia/sources/website.py +1 -1
- promnesia/sqlite.py +42 -0
- promnesia/tests/__init__.py +0 -0
- promnesia/tests/common.py +137 -0
- promnesia/tests/server_helper.py +64 -0
- promnesia/tests/sources/__init__.py +0 -0
- promnesia/tests/sources/test_auto.py +66 -0
- promnesia/tests/sources/test_filetypes.py +42 -0
- promnesia/tests/sources/test_hypothesis.py +39 -0
- promnesia/tests/sources/test_org.py +65 -0
- promnesia/tests/sources/test_plaintext.py +26 -0
- promnesia/tests/sources/test_shellcmd.py +22 -0
- promnesia/tests/sources/test_takeout.py +58 -0
- promnesia/tests/test_cannon.py +325 -0
- promnesia/tests/test_cli.py +42 -0
- promnesia/tests/test_compare.py +30 -0
- promnesia/tests/test_config.py +290 -0
- promnesia/tests/test_db_dump.py +223 -0
- promnesia/tests/test_extract.py +61 -0
- promnesia/tests/test_extract_urls.py +43 -0
- promnesia/tests/test_indexer.py +245 -0
- promnesia/tests/test_server.py +292 -0
- promnesia/tests/test_traverse.py +41 -0
- promnesia/tests/utils.py +35 -0
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/METADATA +14 -19
- promnesia-1.2.20240810.dist-info/RECORD +83 -0
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/WHEEL +1 -1
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/entry_points.txt +0 -1
- promnesia/dump.py +0 -105
- promnesia-1.1.20230129.dist-info/RECORD +0 -55
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/LICENSE +0 -0
- {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,22 @@
|
|
1
|
+
from ...common import _is_windows, Source
|
2
|
+
from ...extract import extract_visits
|
3
|
+
from ...sources import shellcmd
|
4
|
+
|
5
|
+
import pytest
|
6
|
+
|
7
|
+
from ..common import get_testdata
|
8
|
+
|
9
|
+
|
10
|
+
@pytest.mark.skipif(_is_windows, reason="no grep on windows")
|
11
|
+
def test_via_grep() -> None:
|
12
|
+
|
13
|
+
visits = list(extract_visits(
|
14
|
+
Source(
|
15
|
+
shellcmd.index,
|
16
|
+
# meh. maybe should deprecate plain string here...
|
17
|
+
r"""grep -Eo -r --no-filename (http|https)://\S+ """ + str(get_testdata('custom')),
|
18
|
+
),
|
19
|
+
src='whatever',
|
20
|
+
))
|
21
|
+
# TODO I guess filtering of equivalent urls should rather be tested on something having context (e.g. org mode)
|
22
|
+
assert len(visits) == 5
|
@@ -0,0 +1,58 @@
|
|
1
|
+
from datetime import datetime, timezone
|
2
|
+
|
3
|
+
from ...common import Source
|
4
|
+
from ...extract import extract_visits
|
5
|
+
from ...sources import takeout
|
6
|
+
|
7
|
+
import pytest
|
8
|
+
|
9
|
+
from ..common import get_testdata, unwrap
|
10
|
+
|
11
|
+
from my.core.cfg import tmp_config
|
12
|
+
|
13
|
+
|
14
|
+
# TODO apply in conftest so it's used in all tests?
|
15
|
+
@pytest.fixture
|
16
|
+
def no_cachew():
|
17
|
+
from my.core.cachew import disabled_cachew
|
18
|
+
|
19
|
+
with disabled_cachew():
|
20
|
+
yield
|
21
|
+
|
22
|
+
|
23
|
+
# todo testing this logic probably belongs to hpi or google_takeout_export, but whatever
|
24
|
+
def test_takeout_directory(no_cachew) -> None:
|
25
|
+
class config:
|
26
|
+
class google:
|
27
|
+
takeout_path = get_testdata('takeout')
|
28
|
+
|
29
|
+
with tmp_config(modules='my.google.takeout.*', config=config):
|
30
|
+
visits = list(extract_visits(Source(takeout.index), src='takeout'))
|
31
|
+
|
32
|
+
assert len(visits) == 3
|
33
|
+
assert all(unwrap(v).dt.tzinfo is not None for v in visits)
|
34
|
+
|
35
|
+
|
36
|
+
def test_takeout_zip(no_cachew) -> None:
|
37
|
+
class config:
|
38
|
+
class google:
|
39
|
+
takeout_path = get_testdata('takeout-20150518T000000Z.zip')
|
40
|
+
|
41
|
+
with tmp_config(modules='my.google.takeout.*', config=config):
|
42
|
+
visits = list(extract_visits(Source(takeout.index), src='takeout'))
|
43
|
+
|
44
|
+
assert len(visits) == 3
|
45
|
+
assert all(unwrap(v).dt.tzinfo is not None for v in visits)
|
46
|
+
|
47
|
+
[vis] = [v for v in visits if unwrap(v).norm_url == 'takeout.google.com/settings/takeout']
|
48
|
+
|
49
|
+
edt = datetime(
|
50
|
+
year=2018,
|
51
|
+
month=9,
|
52
|
+
day=18,
|
53
|
+
hour=5,
|
54
|
+
minute=48,
|
55
|
+
second=23,
|
56
|
+
tzinfo=timezone.utc,
|
57
|
+
)
|
58
|
+
assert unwrap(vis).dt == edt
|
@@ -0,0 +1,325 @@
|
|
1
|
+
from typing import cast
|
2
|
+
|
3
|
+
import pytest
|
4
|
+
|
5
|
+
from ..cannon import canonify, CanonifyException
|
6
|
+
|
7
|
+
# TODO should actually understand 'sequences'?
|
8
|
+
# e.g.
|
9
|
+
# https://www.scottaaronson.com/blog/?p=3167#comment-1731882 is kinda hierarchy of scottaaronson.com, post 3167 and comment 1731882
|
10
|
+
# but when working with it from server, would be easier to just do multiple queries I guess..
|
11
|
+
# https://www.scottaaronson.com/blog/?p=3167 is kinda hierarchy of scottaaronson.com ;
|
12
|
+
|
13
|
+
|
14
|
+
param = pytest.mark.parametrize
|
15
|
+
|
16
|
+
|
17
|
+
# mark stuff that is interesting as a testcase, but I'm not sure about yet
|
18
|
+
TODO = cast(str, object())
|
19
|
+
|
20
|
+
|
21
|
+
def check(url, expected):
|
22
|
+
if expected is TODO:
|
23
|
+
pytest.skip(f"'{url}' will be handled later")
|
24
|
+
assert canonify(url) == expected
|
25
|
+
|
26
|
+
|
27
|
+
# TODO assume spaces are not meaningful??
|
28
|
+
# then could align URLs etc?
|
29
|
+
|
30
|
+
@param('url,expected', [(
|
31
|
+
'https://www.youtube.com/watch?t=491s&v=1NHbPN9pNPM&index=63&list=WL',
|
32
|
+
# NOTE: t= reordered, makes it more hierarchical
|
33
|
+
# list as well, I guess makes the most sense to keep it at the very end.. since lists are more like tags
|
34
|
+
'youtube.com/watch?v=1NHbPN9pNPM&t=491s&list=WL'
|
35
|
+
), (
|
36
|
+
'youtube.com/watch?v=wHrCkyoe72U&feature=share&time_continue=6',
|
37
|
+
'youtube.com/watch?v=wHrCkyoe72U'
|
38
|
+
), (
|
39
|
+
'youtube.com/embed/nyc6RJEEe0U?feature=oembed',
|
40
|
+
'youtube.com/watch?v=nyc6RJEEe0U'
|
41
|
+
), (
|
42
|
+
'https://youtu.be/iCvmsMzlF7o?list=WL',
|
43
|
+
'youtube.com/watch?v=iCvmsMzlF7o&list=WL'
|
44
|
+
),
|
45
|
+
# TODO can even be like that or contain timestamp (&t=)
|
46
|
+
# TODO warn if param already present? shouldn't happen..
|
47
|
+
|
48
|
+
# TODO could be interesting to do automatic rule extraction by querying one representative and then extracting canonical
|
49
|
+
|
50
|
+
# TODO national domains don't matter for youtube
|
51
|
+
|
52
|
+
# [*, 'youtube', ANY_DOMAIN] / 'embed' -> 'youtube.com/watch'
|
53
|
+
# TODO use regex backrefs?
|
54
|
+
#
|
55
|
+
(
|
56
|
+
'm.youtube.com/watch?v=Zn6gV2sdl38',
|
57
|
+
'youtube.com/watch?v=Zn6gV2sdl38'
|
58
|
+
),
|
59
|
+
|
60
|
+
# ( "https//youtube.com/playlist?list=PLeOfc0M-50LmJtZwyOfw6aVopmIbU1t7t"
|
61
|
+
# , "youtube.com/playlist?list=PLeOfc0M-50LmJtZwyOfw6aVopmIbU1t7t"
|
62
|
+
# ),
|
63
|
+
# TODO perhaps it should result in video link + sibling link?
|
64
|
+
# when exploring other people's playlists this could be quite useful?
|
65
|
+
|
66
|
+
# ( "https://www.youtube.com/watch?v=1NHbPN9pNPM&index=63&list=WL&t=491s"
|
67
|
+
# , "youtube.com/watch?v=1NHbPN9pNPM&list=WL" # TODO not so sure about &t, it's sort of useful
|
68
|
+
# ),
|
69
|
+
# TODO
|
70
|
+
# youtube.com/user/magauchsein/playlists?sort=dd&view=50&shelf_id=14
|
71
|
+
# youtube.com/user/TheChemlife/videos?view=0&sort=p&flow=grid
|
72
|
+
])
|
73
|
+
def test_youtube(url, expected):
|
74
|
+
assert canonify(url) == expected
|
75
|
+
|
76
|
+
|
77
|
+
@param('url,expected', [(
|
78
|
+
'https://web.archive.org/web/20090902224414/http://reason.com/news/show/119237.html',
|
79
|
+
'reason.com/news/show/119237.html',
|
80
|
+
)])
|
81
|
+
def test_archiveorg(url, expected):
|
82
|
+
assert canonify(url) == expected
|
83
|
+
|
84
|
+
|
85
|
+
# ugh. good example of motivation for cannon.py?
|
86
|
+
@param('url,expected', [(
|
87
|
+
'https://news.ycombinator.com/from?site=jacopo.io',
|
88
|
+
'jacopo.io',
|
89
|
+
), (
|
90
|
+
'https://news.ycombinator.com/item?id=25099862',
|
91
|
+
'news.ycombinator.com/item?id=25099862',
|
92
|
+
), (
|
93
|
+
'https://news.ycombinator.com/reply?id=25100035&goto=item%3Fid%3D25099862%2325100035',
|
94
|
+
TODO,
|
95
|
+
)])
|
96
|
+
def test_hackernews(url, expected):
|
97
|
+
check(url, expected)
|
98
|
+
|
99
|
+
|
100
|
+
@param('url, expected', [
|
101
|
+
( 'https://www.reddit.com/r/firefox/comments/bbugc5/firefox_bans_free_speech_commenting_plugin/?ref=readnext'
|
102
|
+
, 'reddit.com/r/firefox/comments/bbugc5/firefox_bans_free_speech_commenting_plugin',
|
103
|
+
),
|
104
|
+
|
105
|
+
( 'https://www.reddit.com/r/selfhosted/comments/8j8mo3/what_are_you_self_hosting/dz19gh9/?utm_content=permalink&utm_medium=user&utm_source=reddit&utm_name=u_karlicoss'
|
106
|
+
, 'reddit.com/r/selfhosted/comments/8j8mo3/what_are_you_self_hosting/dz19gh9',
|
107
|
+
)
|
108
|
+
# TODO hmm. parent relationship can just rely on urls for reddit
|
109
|
+
# just need to support it in server I suppose
|
110
|
+
|
111
|
+
# TODO search queries?
|
112
|
+
# https://www.reddit.com/search?q=AutoValue
|
113
|
+
|
114
|
+
# TODO def need better markdown handling
|
115
|
+
# https://reddit.com/r/intj/comments/cmof04/me_irl/ew4a3dw/][ Me_irl]
|
116
|
+
# reddit.com/r/intj/comments/cmof04/me_irl/ew4a3dw/%5D%5BMe_irl%5D
|
117
|
+
|
118
|
+
|
119
|
+
|
120
|
+
])
|
121
|
+
def test_reddit(url, expected):
|
122
|
+
assert canonify(url) == expected
|
123
|
+
|
124
|
+
# ugh. good example of motivation for cannon.py?
|
125
|
+
@param('url,expected', [
|
126
|
+
( 'https://app.getpocket.com/read/3479402594'
|
127
|
+
, 'app.getpocket.com/read/3479402594'
|
128
|
+
),
|
129
|
+
|
130
|
+
( 'https://getpocket.com/read/3479402594'
|
131
|
+
, 'app.getpocket.com/read/3479402594'
|
132
|
+
),
|
133
|
+
])
|
134
|
+
def test_pocket(url, expected):
|
135
|
+
assert canonify(url) == expected
|
136
|
+
|
137
|
+
@pytest.mark.parametrize("url,expected", [
|
138
|
+
# TODO ?? 'https://groups.google.com/a/list.hypothes.is/forum/#!topic/dev/kcmS7H8ssis',
|
139
|
+
#
|
140
|
+
# TODO FIXME fragment handling
|
141
|
+
# ( "https://www.scottaaronson.com/blog/?p=3167#comment-1731882"
|
142
|
+
# , "scottaaronson.com/blog/?p=3167#comment-1731882"
|
143
|
+
# ),
|
144
|
+
|
145
|
+
|
146
|
+
# TODO FIXME fragment handling
|
147
|
+
# ( "https://en.wikipedia.org/wiki/tendon#cite_note-14"
|
148
|
+
# , "en.wikipedia.org/wiki/tendon#cite_note-14"
|
149
|
+
# ),
|
150
|
+
|
151
|
+
# TODO FIXME fragment handling
|
152
|
+
# ( "https://physicstravelguide.com/experiments/aharonov-bohm#tab__concrete"
|
153
|
+
# , "physicstravelguide.com/experiments/aharonov-bohm#tab__concrete"
|
154
|
+
# ),
|
155
|
+
|
156
|
+
( "https://github.com/search?o=asc&q=track&s=stars&type=Repositories"
|
157
|
+
, "github.com/search?q=track"
|
158
|
+
),
|
159
|
+
( "https://80000hours.org/career-decision/article/?utm_source=The+EA+Newsletter&utm_campaign=04ca3c2244-EMAIL_CAMPAIGN_2019_04_03_04_26&utm_medium=email&utm_term=0_51c1df13ac-04ca3c2244-318697649"
|
160
|
+
, "80000hours.org/career-decision/article"
|
161
|
+
),
|
162
|
+
( "https://www.facebook.com/photo.php?fbid=24147689823424326&set=pcb.2414778905423667&type=3&theater"
|
163
|
+
, "facebook.com/photo.php?fbid=24147689823424326"
|
164
|
+
),
|
165
|
+
( "https://play.google.com/store/apps/details?id=com.faultexception.reader&hl=en"
|
166
|
+
, "play.google.com/store/apps/details?id=com.faultexception.reader"
|
167
|
+
),
|
168
|
+
# TODO it also got &p= parameter, which refers to page... not sure how to handle this
|
169
|
+
# news.ycombinator.com/item?id=15451442&p=2
|
170
|
+
( "https://news.ycombinator.com/item?id=12172351"
|
171
|
+
, "news.ycombinator.com/item?id=12172351"
|
172
|
+
),
|
173
|
+
( "https://urbandictionary.com/define.php?term=Belgian%20Whistle"
|
174
|
+
, "urbandictionary.com/define.php?term=Belgian%20Whistle"
|
175
|
+
),
|
176
|
+
( "https://en.wikipedia.org/wiki/Dinic%27s_algorithm"
|
177
|
+
, "en.wikipedia.org/wiki/Dinic%27s_algorithm"
|
178
|
+
),
|
179
|
+
|
180
|
+
( "zoopla.co.uk/to-rent/details/42756337#D0zlBWeD4X85odsR.97"
|
181
|
+
, "zoopla.co.uk/to-rent/details/42756337"
|
182
|
+
),
|
183
|
+
|
184
|
+
( "withouthspec.co.uk/rooms/16867952?guests=2&adults=2&location=Berlin%2C+Germany&check_in=2017-08-16&check_out=2017-08-20"
|
185
|
+
, "withouthspec.co.uk/rooms/16867952"
|
186
|
+
),
|
187
|
+
|
188
|
+
( "amp.theguardian.com/technology/2017/oct/09/mark-zuckerberg-facebook-puerto-rico-virtual-reality"
|
189
|
+
, "theguardian.com/technology/2017/oct/09/mark-zuckerberg-facebook-puerto-rico-virtual-reality",
|
190
|
+
),
|
191
|
+
|
192
|
+
( "https://answers.yahoo.com/question/index?qid=20071101131442AAk9bGp"
|
193
|
+
, "answers.yahoo.com/question/index?qid=20071101131442AAk9bGp"
|
194
|
+
),
|
195
|
+
( "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%e2%80%93-2010"
|
196
|
+
, "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%E2%80%93-2010"
|
197
|
+
),
|
198
|
+
( "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-–-2010"
|
199
|
+
, "flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%E2%80%93-2010"
|
200
|
+
),
|
201
|
+
|
202
|
+
( "https://spoonuniversity.com/lifestyle/marmite-ways-to-eat-it&usg=AFQjCNH4s1SOEjlpENlfPV5nuvADZpSdow"
|
203
|
+
, "spoonuniversity.com/lifestyle/marmite-ways-to-eat-it"
|
204
|
+
),
|
205
|
+
|
206
|
+
( 'https://google.co.uk/amp/s/amp.reddit.com/r/androidapps/comments/757e2t/swiftkey_or_gboard'
|
207
|
+
, 'reddit.com/r/androidapps/comments/757e2t/swiftkey_or_gboard'
|
208
|
+
),
|
209
|
+
|
210
|
+
# should sort query params
|
211
|
+
( 'https://www.youtube.com/watch?v=hvoQiF0kBI8&list=WL&index=2'
|
212
|
+
, 'youtube.com/watch?v=hvoQiF0kBI8&list=WL',
|
213
|
+
),
|
214
|
+
( 'https://www.youtube.com/watch?list=WL&v=hvoQiF0kBI8&index=2'
|
215
|
+
, 'youtube.com/watch?v=hvoQiF0kBI8&list=WL',
|
216
|
+
),
|
217
|
+
|
218
|
+
# TODO def need to allow the _user_ to define the rules.
|
219
|
+
# no way I can predict everything
|
220
|
+
# basically, allow *interactively* select
|
221
|
+
# also allow introspection, which rule matched?
|
222
|
+
( 'https://bbs.archlinux.org/viewtopic.php?id=212740'
|
223
|
+
, 'bbs.archlinux.org/viewtopic.php?id=212740',
|
224
|
+
),
|
225
|
+
|
226
|
+
( 'https://ubuntuforums.org/showthread.php?t=1403470&s=0dd67bdb12559c22e73a220752db50c7&p=8806195#post8806195'
|
227
|
+
, 'ubuntuforums.org/showthread.php?t=1403470&p=8806195',
|
228
|
+
),
|
229
|
+
|
230
|
+
( 'https://arstechnica.com/?p=1371299',
|
231
|
+
'arstechnica.com/?p=1371299',
|
232
|
+
# eh. it's a redirect to https://arstechnica.com/information-technology/2018/09/dozens-of-ios-apps-surreptitiously-share-user-location-data-with-tracking-firms/
|
233
|
+
# however in the page body there is <link rel="shorturl" href="https://arstechnica.com/?p=1371299"> ...
|
234
|
+
),
|
235
|
+
|
236
|
+
# ( "gwern.net/DNB+FAQ"
|
237
|
+
# , "TODO" # ???
|
238
|
+
# ),
|
239
|
+
|
240
|
+
# TODO shit. is that normal??? perhaps need to manually move fragment?
|
241
|
+
# SplitResult(scheme='https', netloc='unix.stackexchange.com', path='/questions/171603/convert-file-contents-to-lower-case/171708', query='', fragment='171708&usg=AFQjCNEFCGqCAa4P4Zlu2x11bThJispNxQ')
|
242
|
+
# ( "https://unix.stackexchange.com/questions/171603/convert-file-contents-to-lower-case/171708#171708&usg=AFQjCNEFCGqCAa4P4Zlu2x11bThJispNxQ"
|
243
|
+
# , "unix.stackexchange.com/questions/171603/convert-file-contents-to-lower-case/171708#171708"
|
244
|
+
# )
|
245
|
+
])
|
246
|
+
def test(url, expected):
|
247
|
+
assert canonify(url) == expected
|
248
|
+
# TODO github queries
|
249
|
+
# github.com/search?l=Python&q=reddit+backup
|
250
|
+
# github.com/search?p=3&q=ipynb+language%3AHaskell
|
251
|
+
# github.com/search?q=kobo+ExtraData
|
252
|
+
# github.com/search?q=what-universal-human-experiences-are-you-missing-without-realizing-it
|
253
|
+
|
254
|
+
# TODO git+https://github.com/expectocode/telegram-export@master
|
255
|
+
# TODO again, for that actually sequence would be good...
|
256
|
+
|
257
|
+
# TODO "https://twitter.com/search?q=pinboard search&src=typd"
|
258
|
+
|
259
|
+
# TODO https://www.zalando-lounge.ch/#/
|
260
|
+
# TODO m.facebook.com
|
261
|
+
# TODO [R('^(youtube|urbandictionary|tesco|scottaaronson|answers.yahoo.com|code.google.com)') , None],
|
262
|
+
|
263
|
+
|
264
|
+
|
265
|
+
# TODO
|
266
|
+
# amazon.co.uk/gp/offer-listing/B00525XKL4/ref=dp_olp_new
|
267
|
+
# amazon.co.uk/gp/offer-listing/B00525XKL4/ref=olp_twister_child
|
268
|
+
|
269
|
+
# TODO
|
270
|
+
# en.wikipedia.org/wiki/S&P_500_Index
|
271
|
+
|
272
|
+
|
273
|
+
# TODO
|
274
|
+
# google.co.uk/maps/place/Hackney+Bureau/@51.5293789,-0.0527919,16.88z/data=!bla-bla!-bla
|
275
|
+
|
276
|
+
|
277
|
+
# TODO
|
278
|
+
# perhaps, disable utf8 everywhere?
|
279
|
+
# github.com/search?utf8=%E2%9C%93&q=%22My+Clippings.txt%22
|
280
|
+
|
281
|
+
# TODO FIXME fragment handling
|
282
|
+
# ( "https://www.scottaaronson.com/blog/?p=3167#comment-1731882"
|
283
|
+
# , "scottaaronson.com/blog/?p=3167#comment-1731882"
|
284
|
+
# ),
|
285
|
+
|
286
|
+
@pytest.mark.parametrize("urls", [
|
287
|
+
{
|
288
|
+
"launchpad.net/ubuntu/%2Bsource/okular",
|
289
|
+
"launchpad.net/ubuntu/+source/okular",
|
290
|
+
},
|
291
|
+
{
|
292
|
+
"flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-–-2010",
|
293
|
+
"flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%e2%80%93-2010",
|
294
|
+
"https://flowingdata.com/2010/12/14/10-best-data-visualization-projects-of-the-year-%e2%80%93-2010/&usg=AFQjCNEZsEGz9rqpWqlFXR5Tc7pkCKY5sQ",
|
295
|
+
},
|
296
|
+
])
|
297
|
+
def test_same_norm(urls):
|
298
|
+
urls = list(sorted(urls))
|
299
|
+
u0 = urls[0]
|
300
|
+
c0 = canonify(u0)
|
301
|
+
for u in urls[1:]:
|
302
|
+
c = canonify(u)
|
303
|
+
assert c0 == c, f'Expected {u0} and {u} to be same canonically; got {c0} and {c} instead'
|
304
|
+
|
305
|
+
def test_error():
|
306
|
+
# canonify(' +74Zo535, fewfwf@gmail.com') # -- apparently was patched in some python3.7 versions
|
307
|
+
with pytest.raises(CanonifyException):
|
308
|
+
# borrowed from https://bugs.mageia.org/show_bug.cgi?id=24640#c7
|
309
|
+
canonify('https://example.com\uFF03@bing.com')
|
310
|
+
|
311
|
+
@pytest.mark.parametrize("url,expected", [
|
312
|
+
('https://news.ycombinator.com/item?id=', 'news.ycombinator.com/item?id='),
|
313
|
+
('https://www.youtube.com/watch?v=hvoQiF0kBI8&list&index=2',
|
314
|
+
'youtube.com/watch?v=hvoQiF0kBI8&list='),
|
315
|
+
])
|
316
|
+
def test_empty_query_parameter(url, expected):
|
317
|
+
assert canonify(url) == expected
|
318
|
+
|
319
|
+
@pytest.mark.parametrize("url,expected", [
|
320
|
+
('http://www.isfdb.org/cgi-bin/title.cgi?2172', 'isfdb.org/cgi-bin/title.cgi?2172='),
|
321
|
+
('http://www.isfdb.org/cgi-bin/title.cgi?2172+1', 'isfdb.org/cgi-bin/title.cgi?2172%201='),
|
322
|
+
('http://www.isfdb.org/cgi-bin/title.cgi?2172&foo=bar&baz&quux', 'isfdb.org/cgi-bin/title.cgi?2172=&baz=&foo=bar&quux='),
|
323
|
+
])
|
324
|
+
def test_qkeep_true(url, expected):
|
325
|
+
assert canonify(url) == expected
|
@@ -0,0 +1,42 @@
|
|
1
|
+
import os
|
2
|
+
import time
|
3
|
+
|
4
|
+
from ..common import _is_windows
|
5
|
+
|
6
|
+
from .common import get_testdata, promnesia_bin, tmp_popen
|
7
|
+
|
8
|
+
import pytest
|
9
|
+
import requests
|
10
|
+
|
11
|
+
|
12
|
+
ox_hugo_data = get_testdata('ox-hugo/test/site')
|
13
|
+
|
14
|
+
|
15
|
+
def test_demo() -> None:
|
16
|
+
if _is_windows:
|
17
|
+
# for some reason fails to connect to server..
|
18
|
+
# not sure maybe something with port choice idk
|
19
|
+
pytest.skip("TODO broken on Windows")
|
20
|
+
|
21
|
+
with tmp_popen(promnesia_bin('demo', '--port', '16789', ox_hugo_data)):
|
22
|
+
# TODO why does it want post??
|
23
|
+
time.sleep(2) # meh.. need a generic helper to wait till ready...
|
24
|
+
res = {}
|
25
|
+
for attempt in range(30):
|
26
|
+
time.sleep(1)
|
27
|
+
try:
|
28
|
+
res = requests.post(
|
29
|
+
"http://localhost:16789/search",
|
30
|
+
json=dict(url="https://github.com/kaushalmodi/ox-hugo/issues"),
|
31
|
+
).json()
|
32
|
+
break
|
33
|
+
except:
|
34
|
+
continue
|
35
|
+
else:
|
36
|
+
raise RuntimeError("Couldn't connect to the server")
|
37
|
+
vis = res['visits']
|
38
|
+
assert len(vis) > 50, vis
|
39
|
+
mds = [x for x in vis if x['locator']['title'] == 'content/posts/citations-example-toml.md'.replace('/', os.sep)]
|
40
|
+
orgs = [x for x in vis if x['locator']['title'].startswith('content-org/single-posts/empty_tag.org'.replace('/', os.sep))]
|
41
|
+
assert len(mds) == 1
|
42
|
+
assert len(orgs) == 1
|
@@ -0,0 +1,30 @@
|
|
1
|
+
from pathlib import Path
|
2
|
+
import shutil
|
3
|
+
|
4
|
+
from ..compare import compare_files
|
5
|
+
from .utils import index_urls
|
6
|
+
|
7
|
+
|
8
|
+
def test_compare(tmp_path: Path) -> None:
|
9
|
+
idx = index_urls({
|
10
|
+
'https://example.com': None,
|
11
|
+
'https://en.wikipedia.org/wiki/Saturn_V': None,
|
12
|
+
'https://plato.stanford.edu/entries/qualia': None,
|
13
|
+
})
|
14
|
+
idx(tmp_path)
|
15
|
+
db = tmp_path / 'promnesia.sqlite'
|
16
|
+
old_db = tmp_path / 'promnesia-old.sqlite'
|
17
|
+
shutil.move(str(db), str(old_db))
|
18
|
+
|
19
|
+
idx2 = index_urls({
|
20
|
+
'https://example.com': None,
|
21
|
+
'https://www.reddit.com/r/explainlikeimfive/comments/1ev6e0/eli5entropy': None,
|
22
|
+
'https://en.wikipedia.org/wiki/Saturn_V': None,
|
23
|
+
'https://plato.stanford.edu/entries/qualia': None,
|
24
|
+
})
|
25
|
+
idx2(tmp_path)
|
26
|
+
|
27
|
+
# should not crash, as there are more links in the new database
|
28
|
+
assert len(list(compare_files(old_db, db))) == 0
|
29
|
+
|
30
|
+
assert len(list(compare_files(db, old_db))) == 1
|