promnesia 1.2.20230515__py3-none-any.whl → 1.2.20240810__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry they were published to. It is provided for informational purposes only.
- promnesia/__main__.py +26 -14
- promnesia/cannon.py +4 -4
- promnesia/common.py +39 -28
- promnesia/compare.py +3 -2
- promnesia/config.py +4 -2
- promnesia/database/common.py +66 -0
- promnesia/database/dump.py +187 -0
- promnesia/{read_db.py → database/load.py} +10 -11
- promnesia/extract.py +1 -0
- promnesia/kjson.py +1 -1
- promnesia/logging.py +3 -3
- promnesia/misc/__init__.pyi +0 -0
- promnesia/misc/config_example.py +1 -2
- promnesia/misc/install_server.py +2 -3
- promnesia/server.py +18 -19
- promnesia/sources/__init__.pyi +0 -0
- promnesia/sources/auto.py +9 -7
- promnesia/sources/browser_legacy.py +11 -5
- promnesia/sources/demo.py +18 -2
- promnesia/sources/filetypes.py +7 -0
- promnesia/sources/github.py +2 -2
- promnesia/sources/hypothesis.py +1 -1
- promnesia/sources/markdown.py +15 -15
- promnesia/sources/org.py +7 -3
- promnesia/sources/plaintext.py +3 -1
- promnesia/sources/reddit.py +2 -2
- promnesia/sources/rss.py +1 -1
- promnesia/sources/signal.py +22 -14
- promnesia/sources/stackexchange.py +2 -2
- promnesia/sources/takeout.py +58 -1
- promnesia/sources/takeout_legacy.py +10 -2
- promnesia/tests/__init__.py +0 -0
- promnesia/tests/common.py +137 -0
- promnesia/tests/server_helper.py +64 -0
- promnesia/tests/sources/__init__.py +0 -0
- promnesia/tests/sources/test_auto.py +66 -0
- promnesia/tests/sources/test_filetypes.py +42 -0
- promnesia/tests/sources/test_hypothesis.py +39 -0
- promnesia/tests/sources/test_org.py +65 -0
- promnesia/tests/sources/test_plaintext.py +26 -0
- promnesia/tests/sources/test_shellcmd.py +22 -0
- promnesia/tests/sources/test_takeout.py +58 -0
- promnesia/tests/test_cannon.py +325 -0
- promnesia/tests/test_cli.py +42 -0
- promnesia/tests/test_compare.py +30 -0
- promnesia/tests/test_config.py +290 -0
- promnesia/tests/test_db_dump.py +223 -0
- promnesia/tests/test_extract.py +61 -0
- promnesia/tests/test_extract_urls.py +43 -0
- promnesia/tests/test_indexer.py +245 -0
- promnesia/tests/test_server.py +292 -0
- promnesia/tests/test_traverse.py +41 -0
- promnesia/tests/utils.py +35 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/METADATA +13 -17
- promnesia-1.2.20240810.dist-info/RECORD +83 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/WHEEL +1 -1
- {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/entry_points.txt +0 -1
- promnesia/dump.py +0 -105
- promnesia-1.2.20230515.dist-info/RECORD +0 -58
- {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/LICENSE +0 -0
- {promnesia-1.2.20230515.dist-info → promnesia-1.2.20240810.dist-info}/top_level.txt +0 -0
promnesia/sources/takeout.py
CHANGED
@@ -1,19 +1,26 @@
 '''
 Uses HPI [[https://github.com/karlicoss/HPI/blob/master/doc/MODULES.org#mygoogletakeoutpaths][google.takeout]] module
 '''
-from typing import Iterable, Set, Any
+from typing import Iterable, Set, Any, NamedTuple
 import warnings
 
 from ..common import Visit, Loc, Results, logger
 from ..compat import removeprefix
 
 
+# incase user is using an old version of google_takeout_parser
+class YoutubeCSVStub(NamedTuple):
+    contentJSON: str
+
+
 def index() -> Results:
     from . import hpi
+    import json
 
     try:
         from my.google.takeout.parser import events
         from google_takeout_parser.models import Activity, YoutubeComment, LikedYoutubeVideo, ChromeHistory
+        from google_takeout_parser.parse_csv import reconstruct_comment_content, extract_comment_links
     except ModuleNotFoundError as ex:
         logger.exception(ex)
         yield ex
@@ -24,12 +31,24 @@ def index() -> Results:
         yield from takeout_legacy.index()
         return
 
+
     _seen: Set[str] = {
         # these are definitely not useful for promnesia
         'Location',
         'PlaceVisit',
         'PlayStoreAppInstall',
     }
+
+    imported_yt_csv_models = False
+    try:
+        from google_takeout_parser.models import CSVYoutubeComment, CSVYoutubeLiveChat
+        imported_yt_csv_models = True
+    except ImportError:
+        # warn user to upgrade google_takeout_parser
+        warnings.warn("Please upgrade google_takeout_parser (`pip install -U google_takeout_parser`) to support the new format for youtube comments")
+        CSVYoutubeComment = YoutubeCSVStub  # type: ignore[misc,assignment]
+        CSVYoutubeLiveChat = YoutubeCSVStub  # type: ignore[misc,assignment]
+
     def warn_once_if_not_seen(e: Any) -> Iterable[Exception]:
         et_name = type(e).__name__
         if et_name in _seen:
@@ -71,6 +90,8 @@ def index() -> Results:
             elif e.products == ['Ads']:
                 # header contains some weird internal ad id in this case
                 context = None
+            else:
+                context = None
             # NOTE: at this point seems that context always ends up as None (at least for @karlicoss as of 20230131)
             # so alternatively could just force it to be None instead of manual dispatching :shrug:
             yield Visit(
@@ -109,6 +130,42 @@ def index() -> Results:
                 yield Visit(
                     url=url, dt=e.dt, context=e.content, locator=Loc(title=e.content, href=url)
                 )
+        elif imported_yt_csv_models and isinstance(e, CSVYoutubeComment):
+            contentJSON = json.loads(e.contentJSON)
+            content = reconstruct_comment_content(contentJSON, format='text')
+            if isinstance(content, Exception):
+                yield content
+                continue
+            links = extract_comment_links(contentJSON)
+            if isinstance(links, Exception):
+                yield links
+                continue
+            context = f"Commented on {e.video_url}"
+            for url in links:
+                yield Visit(
+                    url=url, dt=e.dt, context=content, locator=Loc(title=context, href=url)
+                )
+            yield Visit(
+                url=e.video_url, dt=e.dt, context=content, locator=Loc(title=context, href=e.video_url)
+            )
+        elif imported_yt_csv_models and isinstance(e, CSVYoutubeLiveChat):
+            contentJSON = json.loads(e.contentJSON)
+            content = reconstruct_comment_content(contentJSON, format='text')
+            if isinstance(content, Exception):
+                yield content
+                continue
+            links = extract_comment_links(contentJSON)
+            if isinstance(links, Exception):
+                yield links
+                continue
+            context = f"Commented on livestream {e.video_url}"
+            for url in links:
+                yield Visit(
+                    url=url, dt=e.dt, context=content, locator=Loc(title=context, href=url)
+                )
+            yield Visit(
+                url=e.video_url, dt=e.dt, context=content, locator=Loc(title=context, href=e.video_url)
+            )
         else:
             yield from warn_once_if_not_seen(e)
 
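The pattern running through these hunks: import the new `CSVYoutubeComment`/`CSVYoutubeLiveChat` models if the installed `google_takeout_parser` provides them, otherwise bind those names to a `NamedTuple` stub so the later `isinstance` dispatch still works. A minimal standalone sketch of the same idea (the module and class names below are illustrative, not from promnesia):

```python
from typing import NamedTuple
import warnings


class _Stub(NamedTuple):
    # same field as the real model, so downstream attribute access stays valid
    contentJSON: str


have_new_models = False
try:
    # hypothetical optional dependency standing in for google_takeout_parser
    from some_optional_lib import NewModel  # type: ignore
    have_new_models = True
except ImportError:
    warnings.warn("some_optional_lib missing/too old; new-style records will be skipped")
    NewModel = _Stub  # type: ignore[misc,assignment]


def handle(e: object) -> str:
    # guarding with the flag means the stub never accidentally matches
    if have_new_models and isinstance(e, NewModel):
        return 'new-style record'
    return 'fallback handling'


print(handle(_Stub(contentJSON='{}')))  # -> 'fallback handling' without the dependency
```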
promnesia/sources/takeout_legacy.py
CHANGED
@@ -34,7 +34,15 @@ from .. import config
 
 
 from more_itertools import unique_everseen
-
+
+try:
+    from cachew import cachew
+except ModuleNotFoundError as me:
+    if me.name != 'cachew':
+        raise me
+    # this module is legacy anyway, so just make it defensive
+    def cachew(*args, **kwargs):  # type: ignore[no-redef]
+        return lambda f: f
 
 
 # TODO use CPath? Could encapsulate a path within an archive *or* within a directory
@@ -105,7 +113,7 @@ def read_browser_history_json(takeout: TakeoutPath) -> Iterable[Visit]:
     hist = j['Browser History']
     for item in hist:
         url = item['url']
-        time = datetime.
+        time = datetime.fromtimestamp(item['time_usec'] / 10 ** 6, tz=pytz.utc)
         # TODO any more interesitng info?
         yield Visit(
             url=url,
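Note the shape of the cachew fallback in the first hunk above: because `cachew` is used as a decorator factory (`@cachew(...)`), the no-op replacement must be a function that returns a decorator, not the identity function itself. A quick runnable check of that design choice:

```python
# mirrors the fallback from the hunk above: cachew is called first with
# its arguments, and the result is then applied to the target function
def cachew(*args, **kwargs):
    return lambda f: f


@cachew(cache_path='/does/not/matter')  # arguments are simply discarded
def plus_one(x: int) -> int:
    return x + 1


assert plus_one(41) == 42  # behaves exactly like the undecorated function
```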
promnesia/tests/__init__.py
File without changes
promnesia/tests/common.py
ADDED
@@ -0,0 +1,137 @@
+from contextlib import closing, contextmanager
+import gc
+import inspect
+import os
+from pathlib import Path
+import socket
+import sys
+from textwrap import dedent
+from typing import Iterator, NoReturn, TypeVar
+
+import pytest
+
+from ..common import _is_windows, Res
+
+
+def under_ci() -> bool:
+    return 'CI' in os.environ
+
+
+def throw(x: Exception) -> NoReturn:
+    '''
+    like raise, but can be an expression...
+    '''
+    raise x
+
+
+@pytest.fixture
+def gc_control(gc_on: bool):
+    if gc_on:
+        # no need to do anything, should be on by default
+        yield
+        return
+
+    gc.disable()
+    try:
+        yield
+    finally:
+        gc.enable()
+
+
+running_on_ci = 'CI' in os.environ
+
+
+GIT_ROOT = Path(__file__).absolute().parent.parent.parent.parent
+TESTDATA = GIT_ROOT / 'tests/testdata'
+
+
+def get_testdata(path: str) -> Path:
+    assert TESTDATA.is_dir()
+    res = TESTDATA / path
+    if not res.exists():
+        raise RuntimeError(f"'{res}' not found! You propably need to run 'git submodule update --init --recursive'")
+    return TESTDATA / path
+
+
+@contextmanager
+def tmp_popen(*args, **kwargs):
+    import psutil
+    with psutil.Popen(*args, **kwargs) as p:
+        try:
+            yield p
+        finally:
+            for c in p.children(recursive=True):
+                c.kill()
+            p.kill()
+            p.wait()
+
+
+# meh
+def promnesia_bin(*args):
+    # not sure it's a good idea to diverge, but not sure if there's a better way either?
+    # ugh. on windows there is no bash so can't use the script
+    # whatever...
+    if under_ci() or _is_windows:
+        # should be able to use the installed version
+        return [sys.executable, '-m', 'promnesia', *args]
+    else:
+        # use version from the repository
+        root = Path(__file__).parent.parent.parent.parent
+        pm = root / 'scripts/promnesia'
+        return [pm, *args]
+
+
+# meh... not great
+@pytest.fixture
+def reset_filters():
+    from .. import extract
+
+    extract.filters.cache_clear()
+    try:
+        yield
+    finally:
+        extract.filters.cache_clear()
+
+
+# TODO could be a TypeGuard from 3.10
+V = TypeVar('V')
+
+def unwrap(r: Res[V]) -> V:
+    assert not isinstance(r, Exception), r
+    return r
+
+
+def write_config(path: Path, gen, **kwargs) -> None:
+    output_dir = path.parent
+    cfg_src = dedent('\n'.join(inspect.getsource(gen).splitlines()[1:])) + f"\nOUTPUT_DIR = r'{output_dir}'"
+    for k, v in kwargs.items():
+        assert k in cfg_src, k
+        cfg_src = cfg_src.replace(k, repr(str(v)))  # meh
+    path.write_text(cfg_src)
+
+
+@contextmanager
+def free_port() -> Iterator[int]:
+    # this is a generator to make sure there are no race conditions between the time we call this and launch program
+    #
+    # also some relevant articles about this 'technique'
+    # - https://eklitzke.org/binding-on-port-zero
+    # - https://idea.popcount.org/2014-04-03-bind-before-connect
+    # - https://blog.cloudflare.com/the-quantum-state-of-a-tcp-port
+    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+        if sys.platform == 'linux':
+            # Ok, so from what I've been reading, SO_REUSEADDR should only be necessary in the program that reuses the port
+            # However, this answer (or man socket) claims we need it on both sites in Linux? see https://superuser.com/a/587955/300795
+            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+            # also not sure where REUSEADDR is set in uvicorn (e.g. here reuse_address isn't passed?)
+            # https://github.com/encode/uvicorn/blob/6d666d99a285153bc4613e811543c39eca57054a/uvicorn/server.py#L162C37-L162C50
+            # but from strace looks like it is called somewhere :shrug:
+
+        # assign euphemeral port
+        # see table in
+        # https://stackoverflow.com/questions/14388706/how-do-so-reuseaddr-and-so-reuseport-differ/14388707#14388707
+        # we rely on server binding to localhost later (or anything except 0.0.0.0 really)
+        s.bind(('', 0))
+
+        port = s.getsockname()[1]
+        yield port
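`free_port` relies on a standard trick: binding to port 0 makes the kernel assign a free ephemeral port, which `getsockname` then reveals. A bare-bones demonstration of just that mechanism (the fixture additionally keeps the socket open while yielding, and sets `SO_REUSEADDR` on Linux so the server can bind the same port):

```python
import socket
from contextlib import closing

with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
    s.bind(('', 0))            # port 0 => kernel picks an ephemeral free port
    port = s.getsockname()[1]  # read back which port was assigned
    print(f'kernel assigned port {port}')
```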
promnesia/tests/server_helper.py
ADDED
@@ -0,0 +1,64 @@
+from contextlib import contextmanager
+from dataclasses import dataclass
+from pathlib import Path
+import sys
+import time
+from typing import Any, Dict, Iterator, Optional
+
+import psutil
+import requests
+
+from ..common import PathIsh
+from .common import tmp_popen, promnesia_bin, free_port
+
+
+@dataclass
+class Helper:
+    host: str
+    port: str
+    process: psutil.Popen
+
+    def get(self, path: str, *args):
+        # check it's alive first so the error is cleaner
+        assert self.process.poll() is None, self.process
+        return requests.get(f'http://{self.host}:{self.port}' + path)
+
+    def post(self, path: str, *, json: Optional[Dict[str, Any]] = None):
+        assert self.process.poll() is None, self.process
+        return requests.post(f'http://{self.host}:{self.port}' + path, json=json)
+
+
+@contextmanager
+def run_server(db: Optional[PathIsh] = None, *, timezone: Optional[str] = None) -> Iterator[Helper]:
+    # TODO not sure, perhaps best to use a thread or something?
+    # but for some tests makes more sense to test in a separate process
+    with free_port() as pp:
+        # ugh. under docker 'localhost' tries to bind it to ipv6 (::1) for some reason???
+        host = '0.0.0.0' if Path('/.dockerenv').exists() else 'localhost'
+        port = str(pp)
+        args = [
+            'serve',
+            '--host', host,
+            '--quiet',
+            '--port', port,
+            *([] if timezone is None else ['--timezone', timezone]),
+            *([] if db is None else ['--db' , str(db)]),
+        ]
+        with tmp_popen(promnesia_bin(*args)) as server_process:
+            server = Helper(host=host, port=port, process=server_process)
+
+            # wait till ready
+            for _ in range(50):
+                try:
+                    server.get('/status').json()
+                    break
+                except:
+                    time.sleep(0.1)
+            else:
+                raise RuntimeError("Cooldn't connect to '{st}' after 50 attempts")
+            print("Started server up, db: {db}".format(db=db), file=sys.stderr)
+
+            yield server
+
+            # TODO use logger!
+            print("Done with the server", file=sys.stderr)
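Tests then drive the server through this helper; a hypothetical usage sketch (the db path is made up, and `/status` is the same endpoint the readiness loop polls):

```python
from promnesia.tests.server_helper import run_server

# hypothetical database path -- point at a real promnesia.sqlite
with run_server(db='/tmp/promnesia.sqlite', timezone='Europe/Moscow') as server:
    r = server.get('/status')
    assert r.status_code == 200
    print(r.json())
# on exit tmp_popen kills the server process and any children it spawned
```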
promnesia/tests/sources/__init__.py
File without changes
promnesia/tests/sources/test_auto.py
ADDED
@@ -0,0 +1,66 @@
+from itertools import groupby
+import os
+
+from ...sources import auto
+
+from ..common import get_testdata, throw
+
+sa2464 = 'https://www.scottaaronson.com/blog/?p=2464'
+
+_JSON_URLS = {
+    'https://johncarlosbaez.wordpress.com/2016/09/09/struggles-with-the-continuum-part-2/',
+    sa2464,
+}
+
+
+def makemap(visits):
+    key = lambda v: v.url
+
+    def it():
+        vit = (throw(v) if isinstance(v, Exception) else v for v in visits)
+        for k, g in groupby(sorted(vit, key=key), key=key):
+            yield k, list(sorted(g))
+
+    return dict(it())
+
+
+def test_json() -> None:
+    mm = makemap(auto.index(get_testdata('auto'), ignored='*/orgs/*'))
+    assert mm.keys() == _JSON_URLS
+
+    # TODO not sure if they deserve separate visits..
+    [v1, v2] = mm[sa2464]
+    assert v1.context == 'list::yyy::given_url'
+    # todo not sure if editor:// work on Windows
+    assert v1.locator.href.startswith('editor://')
+    assert v1.locator.href.endswith('pocket.json')
+    # TODO line number?
+
+
+def test_auto() -> None:
+    mm = makemap(auto.index(get_testdata('auto')))
+    org_link = 'https://www.youtube.com/watch?v=rHIkrotSwcc'
+    assert {
+        *_JSON_URLS,
+        org_link,
+    }.issubset(mm.keys())
+
+    [v] = mm[org_link]
+    assert v.locator.title == 'orgs' + os.sep + 'file.org:14'  # meh
+    assert v.locator.href.endswith('file.org:14')
+    assert "xxx /r/cpp" in v.context
+    assert "I've enjoyed [Chandler Carruth's" in v.context
+
+
+def test_obsidian() -> None:
+    mm = makemap(auto.index(get_testdata('obsidian-vault')))
+    example_url = 'https://example.com'
+    [v] = mm[example_url]
+    assert v.locator.href.startswith('obsidian://')
+
+
+def test_logseq() -> None:
+    mm = makemap(auto.index(get_testdata('logseq-graph')))
+    example_url = 'https://example.com'
+    [v] = mm[example_url]
+    assert v.locator.href.startswith('logseq://')
promnesia/tests/sources/test_filetypes.py
ADDED
@@ -0,0 +1,42 @@
+from pathlib import Path
+
+from ...common import PathIsh, _is_windows as windows
+from ...sources.auto import by_path
+
+
+def handled(p: PathIsh) -> bool:
+    idx, m = by_path(Path(p))
+    return idx is not None
+    # ideally these won't hit libmagic path (would try to open the file and cause FileNotFoundError)
+
+
+def test_filetypes() -> None:
+    # test media
+    for ext in 'avi mp4 mp3 webm'.split() + ([] if windows else 'mkv'.split()):
+        assert handled('file.' + ext)
+
+    # images
+    for ext in 'gif jpg png jpeg'.split():
+        assert handled('file.' + ext)
+
+    # TODO more granual checks that these are ignored?
+    # binaries
+    for ext in 'o sqlite'.split() + ([] if windows else 'class jar'.split()):
+        assert handled('file.' + ext)
+
+    # these might have potentially some links
+    for ext in [
+        'svg',
+        'pdf', 'epub', 'ps',
+        'doc', 'ppt', 'xsl',
+        # seriously, windows doesn't know about docx???
+        *([] if windows else 'docx pptx xlsx'.split()),
+        *([] if windows else 'ods odt rtf'.split()),
+    ] + ([] if windows else 'djvu'.split()):
+        assert handled('file.' + ext)
+
+    # source code
+    for ext in 'rs tex el js sh hs pl h py hpp c go css'.split() + ([] if windows else 'java cpp'.split()):
+        assert handled('file.' + ext)
+
+    assert handled('x.html')
promnesia/tests/sources/test_hypothesis.py
ADDED
@@ -0,0 +1,39 @@
+from pathlib import Path
+
+from ..common import write_config, get_testdata
+from ...__main__ import do_index
+from ...database.load import get_all_db_visits
+
+from my.core.cfg import tmp_config
+
+
+def index_hypothesis(tmp_path: Path) -> None:
+    def cfg() -> None:
+        from promnesia.common import Source
+        from promnesia.sources import hypothesis
+
+        SOURCES = [Source(hypothesis.index, name='hyp')]
+
+    cfg_path = tmp_path / 'config.py'
+    write_config(cfg_path, cfg)
+
+    class hpi_config:
+        class hypothesis:
+            export_path = get_testdata('hypexport/testdata') / 'netrights-dashboard-mockup/data/*.json'
+
+    with tmp_config(modules='my.hypothesis', config=hpi_config):
+        do_index(cfg_path)
+
+
+def test_hypothesis(tmp_path: Path) -> None:
+    index_hypothesis(tmp_path)
+
+    visits = get_all_db_visits(tmp_path / 'promnesia.sqlite')
+    assert len(visits) > 100
+
+    [vis] = [x for x in visits if 'fundamental fact of evolution' in (x.context or '')]
+
+    assert vis.norm_url == 'wired.com/2017/04/the-myth-of-a-superhuman-ai'
+    assert vis.orig_url == 'https://www.wired.com/2017/04/the-myth-of-a-superhuman-ai/'
+    assert vis.locator.href == 'https://hyp.is/_Z9ccmVZEeexBOO7mToqdg/www.wired.com/2017/04/the-myth-of-a-superhuman-ai/'
+    assert 'misconception about evolution is fueling misconception about AI' in (vis.context or '')  # contains notes as well
promnesia/tests/sources/test_org.py
ADDED
@@ -0,0 +1,65 @@
+from typing import Optional
+
+from ...common import Visit
+from ...sources.org import extract_from_file
+
+from ..common import get_testdata, throw
+
+
+def delrf(s: Optional[str]) -> Optional[str]:
+    if s is None:
+        return None
+    # meh.. not sure how ot handle this properly, ideally should be via pytest?
+    # not sure if should just do it in the indexer? e.g. extension might not like it
+    return s.replace('\r', '')
+
+
+def test_org_indexer() -> None:
+    [_, cpp, cozy] = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file.org'))]
+
+    assert cpp.url == 'https://www.youtube.com/watch?v=rHIkrotSwcc'
+    # TODO not sure about filetags?
+    exp = '''
+xxx /r/cpp :cpp:programming:
+I've enjoyed [Chandler Carruth's _There Are No Zero-cost Abstractions_](
+https://www.youtube.com/watch?v=rHIkrotSwcc) very much.
+
+'''.lstrip()
+    assert delrf(cpp.context) == exp
+
+    assert cozy.url == 'https://twitter.com/Mappletons/status/1255221220263563269'
+
+
+def test_org_indexer_2() -> None:
+    items = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file3.org'))]
+
+    assert len(items) == 6
+    assert items[0].url == 'https://www.reddit.com/r/androidapps/comments/4i36z9/how_you_use_your_android_to_the_maximum/d2uq24i'
+    assert items[1].url == 'https://link.com'
+    assert items[-2].url == 'https://en.wikipedia.org/wiki/Resilio_Sync'
+    # TODO shit def need org specific url extractor (and then extract from everything remaining)
+    # assert results[-1].url == 'https://en.wikipedia.org/wiki/InterPlanetary_File_System'
+
+
+def test_heading() -> None:
+    items = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file2.org'))]
+    assert {i.url for i in items} == {
+        'https://en.wikipedia.org/wiki/Computational_topology',
+        'http://graphics.stanford.edu/courses/cs468-09-fall/',
+        'https://en.wikipedia.org/wiki/Triangulation_(topology)',
+        'https://en.wikipedia.org/wiki/Digital_manifold',
+    }
+
+
+def test_url_in_properties() -> None:
+    items = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file4.org'))]
+
+    assert len(items) == 2, items
+    assert items[0].url == 'https://example.org/ref_example'
+    assert items[1].url == 'http://example.org/a_test'
+
+
+def test_5() -> None:
+    items = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file5.org'))]
+
+    assert len(items) == 0  # shouldn't crash at least
promnesia/tests/sources/test_plaintext.py
ADDED
@@ -0,0 +1,26 @@
+from ...common import Source
+from ...extract import extract_visits
+from ...sources import plaintext, shellcmd
+
+from ..common import get_testdata, unwrap
+
+
+def test_plaintext_path_extractor() -> None:
+    visits = list(extract_visits(
+        Source(
+            shellcmd.index,
+            plaintext.extract_from_path(get_testdata('custom')),
+        ),
+        src='whatever',
+    ))
+    assert {unwrap(v).orig_url for v in visits} == {
+        'http://google.com',
+        'http://google.com/',
+        'http://some-weird-domain.xyz/whatever',
+        'https://google.com',
+        'http://what.about.this.link',
+    }
+
+    [wa] = [v for v in visits if unwrap(v).orig_url == 'http://what.about.this.link']
+    f2 = get_testdata('custom') / 'file2.txt'
+    assert unwrap(wa).locator.href == f'editor://{f2}:3'  # occurs line 3
promnesia/tests/sources/test_shellcmd.py
ADDED
@@ -0,0 +1,22 @@
+from ...common import _is_windows, Source
+from ...extract import extract_visits
+from ...sources import shellcmd
+
+import pytest
+
+from ..common import get_testdata
+
+
+@pytest.mark.skipif(_is_windows, reason="no grep on windows")
+def test_via_grep() -> None:
+
+    visits = list(extract_visits(
+        Source(
+            shellcmd.index,
+            # meh. maybe should deprecate plain string here...
+            r"""grep -Eo -r --no-filename (http|https)://\S+ """ + str(get_testdata('custom')),
+        ),
+        src='whatever',
+    ))
+    # TODO I guess filtering of equivalent urls should rather be tested on something having context (e.g. org mode)
+    assert len(visits) == 5
promnesia/tests/sources/test_takeout.py
ADDED
@@ -0,0 +1,58 @@
+from datetime import datetime, timezone
+
+from ...common import Source
+from ...extract import extract_visits
+from ...sources import takeout
+
+import pytest
+
+from ..common import get_testdata, unwrap
+
+from my.core.cfg import tmp_config
+
+
+# TODO apply in conftest so it's used in all tests?
+@pytest.fixture
+def no_cachew():
+    from my.core.cachew import disabled_cachew
+
+    with disabled_cachew():
+        yield
+
+
+# todo testing this logic probably belongs to hpi or google_takeout_export, but whatever
+def test_takeout_directory(no_cachew) -> None:
+    class config:
+        class google:
+            takeout_path = get_testdata('takeout')
+
+    with tmp_config(modules='my.google.takeout.*', config=config):
+        visits = list(extract_visits(Source(takeout.index), src='takeout'))
+
+    assert len(visits) == 3
+    assert all(unwrap(v).dt.tzinfo is not None for v in visits)
+
+
+def test_takeout_zip(no_cachew) -> None:
+    class config:
+        class google:
+            takeout_path = get_testdata('takeout-20150518T000000Z.zip')
+
+    with tmp_config(modules='my.google.takeout.*', config=config):
+        visits = list(extract_visits(Source(takeout.index), src='takeout'))
+
+    assert len(visits) == 3
+    assert all(unwrap(v).dt.tzinfo is not None for v in visits)
+
+    [vis] = [v for v in visits if unwrap(v).norm_url == 'takeout.google.com/settings/takeout']
+
+    edt = datetime(
+        year=2018,
+        month=9,
+        day=18,
+        hour=5,
+        minute=48,
+        second=23,
+        tzinfo=timezone.utc,
+    )
+    assert unwrap(vis).dt == edt