promnesia 1.1.20230129__py3-none-any.whl → 1.2.20240810__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
Files changed (72)
  1. promnesia/__main__.py +58 -50
  2. promnesia/cannon.py +4 -4
  3. promnesia/common.py +57 -38
  4. promnesia/compare.py +3 -2
  5. promnesia/compat.py +6 -65
  6. promnesia/config.py +4 -2
  7. promnesia/database/common.py +66 -0
  8. promnesia/database/dump.py +187 -0
  9. promnesia/{read_db.py → database/load.py} +10 -11
  10. promnesia/extract.py +1 -0
  11. promnesia/kjson.py +1 -1
  12. promnesia/logging.py +14 -14
  13. promnesia/misc/__init__.pyi +0 -0
  14. promnesia/misc/config_example.py +1 -2
  15. promnesia/misc/install_server.py +5 -4
  16. promnesia/server.py +24 -24
  17. promnesia/sources/__init__.pyi +0 -0
  18. promnesia/sources/auto.py +12 -7
  19. promnesia/sources/browser.py +80 -293
  20. promnesia/sources/browser_legacy.py +298 -0
  21. promnesia/sources/demo.py +18 -2
  22. promnesia/sources/filetypes.py +8 -0
  23. promnesia/sources/github.py +2 -2
  24. promnesia/sources/hackernews.py +1 -2
  25. promnesia/sources/hypothesis.py +1 -1
  26. promnesia/sources/markdown.py +15 -15
  27. promnesia/sources/org.py +7 -3
  28. promnesia/sources/plaintext.py +3 -1
  29. promnesia/sources/reddit.py +2 -2
  30. promnesia/sources/rss.py +5 -1
  31. promnesia/sources/shellcmd.py +6 -2
  32. promnesia/sources/signal.py +29 -20
  33. promnesia/sources/smscalls.py +8 -1
  34. promnesia/sources/stackexchange.py +2 -2
  35. promnesia/sources/takeout.py +132 -12
  36. promnesia/sources/takeout_legacy.py +10 -2
  37. promnesia/sources/telegram.py +79 -123
  38. promnesia/sources/telegram_legacy.py +117 -0
  39. promnesia/sources/vcs.py +1 -1
  40. promnesia/sources/viber.py +6 -15
  41. promnesia/sources/website.py +1 -1
  42. promnesia/sqlite.py +42 -0
  43. promnesia/tests/__init__.py +0 -0
  44. promnesia/tests/common.py +137 -0
  45. promnesia/tests/server_helper.py +64 -0
  46. promnesia/tests/sources/__init__.py +0 -0
  47. promnesia/tests/sources/test_auto.py +66 -0
  48. promnesia/tests/sources/test_filetypes.py +42 -0
  49. promnesia/tests/sources/test_hypothesis.py +39 -0
  50. promnesia/tests/sources/test_org.py +65 -0
  51. promnesia/tests/sources/test_plaintext.py +26 -0
  52. promnesia/tests/sources/test_shellcmd.py +22 -0
  53. promnesia/tests/sources/test_takeout.py +58 -0
  54. promnesia/tests/test_cannon.py +325 -0
  55. promnesia/tests/test_cli.py +42 -0
  56. promnesia/tests/test_compare.py +30 -0
  57. promnesia/tests/test_config.py +290 -0
  58. promnesia/tests/test_db_dump.py +223 -0
  59. promnesia/tests/test_extract.py +61 -0
  60. promnesia/tests/test_extract_urls.py +43 -0
  61. promnesia/tests/test_indexer.py +245 -0
  62. promnesia/tests/test_server.py +292 -0
  63. promnesia/tests/test_traverse.py +41 -0
  64. promnesia/tests/utils.py +35 -0
  65. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/METADATA +14 -19
  66. promnesia-1.2.20240810.dist-info/RECORD +83 -0
  67. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/WHEEL +1 -1
  68. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/entry_points.txt +0 -1
  69. promnesia/dump.py +0 -105
  70. promnesia-1.1.20230129.dist-info/RECORD +0 -55
  71. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/LICENSE +0 -0
  72. {promnesia-1.1.20230129.dist-info → promnesia-1.2.20240810.dist-info}/top_level.txt +0 -0
promnesia/sources/telegram_legacy.py ADDED
@@ -0,0 +1,117 @@
+ '''
+ Uses [[https://github.com/fabianonline/telegram_backup#readme][telegram_backup]] database for messages data
+ '''
+
+ from pathlib import Path
+ import sqlite3
+ from textwrap import dedent
+ from typing import Union, TypeVar
+ from urllib.parse import unquote  # TODO mm, make it easier to remember to use...
+
+ from ..common import PathIsh, Visit, get_logger, Loc, extract_urls, from_epoch, Results, echain
+ from ..sqlite import sqlite_connection
+
+ T = TypeVar("T")
+
+
+ def unwrap(res: Union[T, Exception]) -> T:
+     if isinstance(res, Exception):
+         raise res
+     else:
+         return res
+
+
+ def index(database: PathIsh, *, http_only: bool=False) -> Results:
+     """
+     :param database:
+         the path of the sqlite generated by the _telegram_backup_ java program
+     :param http_only:
+         when true, do not collect IP-addresses and `python.py` strings
+     """
+     logger = get_logger()
+
+     path = Path(database)
+     assert path.is_file(), path
+
+     def make_query(text_query: str) -> str:
+         extra_criteria = "AND (M.has_media == 1 OR text LIKE '%http%')" if http_only else ""
+         return dedent(
+             f"""
+             WITH entities AS (
+                 SELECT 'dialog' as type
+                      , id
+                      , coalesce(username, id) as handle
+                      , coalesce(first_name || " " || last_name
+                               , username
+                               , id
+                        ) as display_name FROM users
+                 UNION
+                 SELECT 'group' as type
+                      , id
+                      , id as handle
+                      , coalesce(name, id) as display_name FROM chats
+             )
+             SELECT src.display_name AS chatname
+                  , src.handle AS chat
+                  , snd.display_name AS sender
+                  , M.time AS time
+                  , {text_query} AS text
+                  , M.message_id AS mid
+             FROM messages AS M
+             /* chat types are 'dialog' (1-1), 'group' and 'supergroup' */
+             /* this is a bit of a hacky way to handle all groups in one go */
+             LEFT JOIN entities AS src ON M.source_id = src.id AND src.type = (CASE M.source_type WHEN 'supergroup' THEN 'group' ELSE M.source_type END)
+             LEFT JOIN entities AS snd ON M.sender_id = snd.id AND snd.type = 'dialog'
+             WHERE
+                 M.message_type NOT IN ('service_message', 'empty_message')
+                 {extra_criteria}
+             ORDER BY time;
+             """)
+
+     with sqlite_connection(path, immutable=True, row_factory='row') as db:
+         # TODO yield error if chatname or chat or smth else is null?
+         for row in db.execute(make_query('M.text')):
+             try:
+                 yield from _handle_row(row)
+             except Exception as ex:
+                 yield echain(RuntimeError(f'While handling {row}'), ex)
+
+         # old (also 'stable') version doesn't have 'json' column yet...
+         messages_columns = [d[0] for d in db.execute('SELECT * FROM messages').description]
+         # todo hmm what is 'markup_json'??
+         if 'json' in messages_columns:
+             for row in db.execute(make_query("json_extract(json, '$.media.webpage.description')")):
+                 try:
+                     yield from _handle_row(row)
+                 except Exception as ex:
+                     yield echain(RuntimeError(f'While handling {row}'), ex)
+
+
+ def _handle_row(row: sqlite3.Row) -> Results:
+     text = row['text']
+     if text is None:
+         return
+     urls = extract_urls(text)
+     if len(urls) == 0:
+         return
+     dt = from_epoch(row['time'])
+     mid: str = unwrap(row['mid'])
+
+     # TODO perhaps we could be defensive with null sender/chat etc and still emit the Visit
+     sender: str = unwrap(row['sender'])
+     chatname: str = unwrap(row['chatname'])
+     chat: str = unwrap(row['chat'])
+
+     in_context = f'https://t.me/{chat}/{mid}'
+     for u in urls:
+         # https://www.reddit.com/r/Telegram/comments/6ufwi3/link_to_a_specific_message_in_a_channel_possible/
+         # hmm, only seems to work on mobile app, but better than nothing...
+         yield Visit(
+             url=unquote(u),
+             dt=dt,
+             context=f"{sender}: {text}",
+             locator=Loc.make(
+                 title=f"chat with {chatname}",
+                 href=in_context,
+             ),
+         )
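
For reference, a minimal config sketch showing how this source could be wired up, following the Source(...) pattern used by promnesia configs elsewhere in this diff (the database path is hypothetical):

    # config.py sketch -- the path to the telegram_backup sqlite is made up
    from promnesia.common import Source
    from promnesia.sources import telegram_legacy

    SOURCES = [
        Source(telegram_legacy.index, '/backups/telegram/database.sqlite', name='telegram'),
    ]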
promnesia/sources/vcs.py CHANGED
@@ -5,10 +5,10 @@ Clones & indexes Git repositories (via sources.auto)

  from pathlib import Path
  import re
+ from subprocess import check_call
  from typing import Iterable

  from ..common import Extraction, PathIsh, get_tmpdir, slugify
- from ..compat import check_call


  def index(path: PathIsh, *args, **kwargs) -> Iterable[Extraction]:
promnesia/sources/viber.py CHANGED
@@ -1,14 +1,16 @@
  """
- Adapted from `telegram.py` to read from `~/.ViberPC/XYZ123/viber.db`
+ Collects visits from Viber desktop app (e.g. `~/.ViberPC/XYZ123/viber.db`)
  """

  import logging
  import textwrap
  from os import PathLike
  from pathlib import Path
+ import sqlite3
  from typing import Iterable, Optional

  from ..common import Loc, PathIsh, Results, Visit, extract_urls, from_epoch, join_tags
+ from ..sqlite import sqlite_connection


  logger = logging.getLogger(__name__)
@@ -37,17 +39,6 @@ def index(
      yield from _harvest_db(db_path, msgs_query, locator_schema)


- # TODO move to common?
- def _dataset_readonly(db: Path):
-     # see https://github.com/pudo/dataset/issues/136#issuecomment-128693122
-     import sqlite3
-
-     import dataset  # type: ignore
-
-     creator = lambda: sqlite3.connect(f"file:{db}?immutable=1", uri=True)
-     return dataset.connect("sqlite:///", engine_kwargs={"creator": creator})
-
-
  def messages_query(http_only: Optional[bool]) -> str:
      """
      An SQL-query returning 1 row for each message
@@ -118,7 +109,7 @@ def messages_query(http_only: Optional[bool]) -> str:
      )


- def _handle_row(row: dict, db_path: PathLike, locator_schema: str) -> Results:
+ def _handle_row(row: sqlite3.Row, db_path: PathLike, locator_schema: str) -> Results:
      text = row["text"]
      urls = extract_urls(text)
      if not urls:
@@ -173,8 +164,8 @@ def _harvest_db(db_path: PathIsh, msgs_query: str, locator_schema: str) -> Resul
      # but it's safer for debugging resolved.
      db_path = Path(db_path).resolve()

-     with _dataset_readonly(db_path) as db:
-         for row in db.query(msgs_query):
+     with sqlite_connection(db_path, immutable=True, row_factory='row') as db:
+         for row in db.execute(msgs_query):
              try:
                  yield from _handle_row(row, db_path, locator_schema)
              except Exception as ex:
promnesia/sources/website.py CHANGED
@@ -4,7 +4,7 @@ Clones a website with wget and indexes via sources.auto

  from pathlib import Path
  import re
- from ..compat import check_call, run
+ from subprocess import run
  from typing import Iterable

  from ..common import Extraction, PathIsh, get_tmpdir, slugify, get_logger
promnesia/sqlite.py ADDED
@@ -0,0 +1,42 @@
+ from contextlib import contextmanager
+ import sqlite3
+ from typing import Callable, Optional, Any, Iterator, Union, Literal
+
+ from .common import PathIsh
+
+ # NOTE: copy pasted from HPI
+
+ SqliteRowFactory = Callable[[sqlite3.Cursor, sqlite3.Row], Any]
+
+ def dict_factory(cursor, row):
+     fields = [column[0] for column in cursor.description]
+     return {key: value for key, value in zip(fields, row)}
+
+
+ Factory = Union[SqliteRowFactory, Literal['row', 'dict']]
+
+ @contextmanager
+ def sqlite_connection(db: PathIsh, *, immutable: bool=False, row_factory: Optional[Factory]=None) -> Iterator[sqlite3.Connection]:
+     dbp = f'file:{db}'
+     # https://www.sqlite.org/draft/uri.html#uriimmutable
+     if immutable:
+         dbp = f'{dbp}?immutable=1'
+     row_factory_: Any = None
+     if row_factory is not None:
+         if callable(row_factory):
+             row_factory_ = row_factory
+         elif row_factory == 'row':
+             row_factory_ = sqlite3.Row
+         elif row_factory == 'dict':
+             row_factory_ = dict_factory
+         else:
+             raise RuntimeError("should not happen")
+
+     conn = sqlite3.connect(dbp, uri=True)
+     try:
+         conn.row_factory = row_factory_
+         with conn:
+             yield conn
+     finally:
+         # the Connection context manager doesn't actually close the connection, it only manages the transaction
+         conn.close()
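
To illustrate how the sources above consume this helper, here is a minimal usage sketch (the database path and the `messages`/`text` table and column names are made up for the example):

    # open a database read-only via SQLite's immutable URI flag,
    # so indexing can't accidentally modify or lock the live file;
    # row_factory='row' allows access by column name via sqlite3.Row
    from promnesia.sqlite import sqlite_connection

    with sqlite_connection('/path/to/some.db', immutable=True, row_factory='row') as db:
        for row in db.execute('SELECT text FROM messages LIMIT 5'):
            print(row['text'])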
promnesia/tests/__init__.py File without changes
promnesia/tests/common.py ADDED
@@ -0,0 +1,137 @@
+ from contextlib import closing, contextmanager
+ import gc
+ import inspect
+ import os
+ from pathlib import Path
+ import socket
+ import sys
+ from textwrap import dedent
+ from typing import Iterator, NoReturn, TypeVar
+
+ import pytest
+
+ from ..common import _is_windows, Res
+
+
+ def under_ci() -> bool:
+     return 'CI' in os.environ
+
+
+ def throw(x: Exception) -> NoReturn:
+     '''
+     like raise, but can be an expression...
+     '''
+     raise x
+
+
+ @pytest.fixture
+ def gc_control(gc_on: bool):
+     if gc_on:
+         # no need to do anything, should be on by default
+         yield
+         return
+
+     gc.disable()
+     try:
+         yield
+     finally:
+         gc.enable()
+
+
+ running_on_ci = 'CI' in os.environ
+
+
+ GIT_ROOT = Path(__file__).absolute().parent.parent.parent.parent
+ TESTDATA = GIT_ROOT / 'tests/testdata'
+
+
+ def get_testdata(path: str) -> Path:
+     assert TESTDATA.is_dir()
+     res = TESTDATA / path
+     if not res.exists():
+         raise RuntimeError(f"'{res}' not found! You probably need to run 'git submodule update --init --recursive'")
+     return TESTDATA / path
+
+
+ @contextmanager
+ def tmp_popen(*args, **kwargs):
+     import psutil
+     with psutil.Popen(*args, **kwargs) as p:
+         try:
+             yield p
+         finally:
+             for c in p.children(recursive=True):
+                 c.kill()
+             p.kill()
+             p.wait()
+
+
+ # meh
+ def promnesia_bin(*args):
+     # not sure it's a good idea to diverge, but not sure if there's a better way either?
+     # ugh. on windows there is no bash so can't use the script
+     # whatever...
+     if under_ci() or _is_windows:
+         # should be able to use the installed version
+         return [sys.executable, '-m', 'promnesia', *args]
+     else:
+         # use version from the repository
+         root = Path(__file__).parent.parent.parent.parent
+         pm = root / 'scripts/promnesia'
+         return [pm, *args]
+
+
+ # meh... not great
+ @pytest.fixture
+ def reset_filters():
+     from .. import extract
+
+     extract.filters.cache_clear()
+     try:
+         yield
+     finally:
+         extract.filters.cache_clear()
+
+
+ # TODO could be a TypeGuard from 3.10
+ V = TypeVar('V')
+
+ def unwrap(r: Res[V]) -> V:
+     assert not isinstance(r, Exception), r
+     return r
+
+
+ def write_config(path: Path, gen, **kwargs) -> None:
+     output_dir = path.parent
+     cfg_src = dedent('\n'.join(inspect.getsource(gen).splitlines()[1:])) + f"\nOUTPUT_DIR = r'{output_dir}'"
+     for k, v in kwargs.items():
+         assert k in cfg_src, k
+         cfg_src = cfg_src.replace(k, repr(str(v)))  # meh
+     path.write_text(cfg_src)
+
+
+ @contextmanager
+ def free_port() -> Iterator[int]:
+     # this is a generator to make sure there are no race conditions between the time we call this and launch the program
+     #
+     # also some relevant articles about this 'technique'
+     # - https://eklitzke.org/binding-on-port-zero
+     # - https://idea.popcount.org/2014-04-03-bind-before-connect
+     # - https://blog.cloudflare.com/the-quantum-state-of-a-tcp-port
+     with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+         if sys.platform == 'linux':
+             # Ok, so from what I've been reading, SO_REUSEADDR should only be necessary in the program that reuses the port
+             # However, this answer (or man socket) claims we need it on both sides on Linux? see https://superuser.com/a/587955/300795
+             s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+             # also not sure where REUSEADDR is set in uvicorn (e.g. here reuse_address isn't passed?)
+             # https://github.com/encode/uvicorn/blob/6d666d99a285153bc4613e811543c39eca57054a/uvicorn/server.py#L162C37-L162C50
+             # but from strace looks like it is called somewhere :shrug:
+
+         # assign ephemeral port
+         # see table in
+         # https://stackoverflow.com/questions/14388706/how-do-so-reuseaddr-and-so-reuseport-differ/14388707#14388707
+         # we rely on server binding to localhost later (or anything except 0.0.0.0 really)
+         s.bind(('', 0))
+
+         port = s.getsockname()[1]
+         yield port
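
A sketch of how a test could use this helper directly, under the assumption that the process under test binds the port itself (run_server in server_helper.py below is the real consumer in this package):

    # grab an ephemeral port, then launch the process while the reserving
    # socket is still bound (SO_REUSEADDR makes the re-bind work on Linux)
    import subprocess

    with free_port() as port:
        proc = subprocess.Popen(promnesia_bin('serve', '--port', str(port)))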
promnesia/tests/server_helper.py ADDED
@@ -0,0 +1,64 @@
+ from contextlib import contextmanager
+ from dataclasses import dataclass
+ from pathlib import Path
+ import sys
+ import time
+ from typing import Any, Dict, Iterator, Optional
+
+ import psutil
+ import requests
+
+ from ..common import PathIsh
+ from .common import tmp_popen, promnesia_bin, free_port
+
+
+ @dataclass
+ class Helper:
+     host: str
+     port: str
+     process: psutil.Popen
+
+     def get(self, path: str, *args):
+         # check it's alive first so the error is cleaner
+         assert self.process.poll() is None, self.process
+         return requests.get(f'http://{self.host}:{self.port}' + path)
+
+     def post(self, path: str, *, json: Optional[Dict[str, Any]] = None):
+         assert self.process.poll() is None, self.process
+         return requests.post(f'http://{self.host}:{self.port}' + path, json=json)
+
+
+ @contextmanager
+ def run_server(db: Optional[PathIsh] = None, *, timezone: Optional[str] = None) -> Iterator[Helper]:
+     # TODO not sure, perhaps best to use a thread or something?
+     # but for some tests makes more sense to test in a separate process
+     with free_port() as pp:
+         # ugh. under docker 'localhost' tries to bind it to ipv6 (::1) for some reason???
+         host = '0.0.0.0' if Path('/.dockerenv').exists() else 'localhost'
+         port = str(pp)
+         args = [
+             'serve',
+             '--host', host,
+             '--quiet',
+             '--port', port,
+             *([] if timezone is None else ['--timezone', timezone]),
+             *([] if db is None else ['--db', str(db)]),
+         ]
+         with tmp_popen(promnesia_bin(*args)) as server_process:
+             server = Helper(host=host, port=port, process=server_process)
+
+             # wait till ready
+             for _ in range(50):
+                 try:
+                     server.get('/status').json()
+                     break
+                 except:
+                     time.sleep(0.1)
+             else:
+                 raise RuntimeError(f"Couldn't connect to 'http://{host}:{port}' after 50 attempts")
+             print("Started server up, db: {db}".format(db=db), file=sys.stderr)
+
+             yield server
+
+             # TODO use logger!
+             print("Done with the server", file=sys.stderr)
promnesia/tests/sources/__init__.py File without changes
promnesia/tests/sources/test_auto.py ADDED
@@ -0,0 +1,66 @@
+ from itertools import groupby
+ import os
+
+ from ...sources import auto
+
+ from ..common import get_testdata, throw
+
+ sa2464 = 'https://www.scottaaronson.com/blog/?p=2464'
+
+ _JSON_URLS = {
+     'https://johncarlosbaez.wordpress.com/2016/09/09/struggles-with-the-continuum-part-2/',
+     sa2464,
+ }
+
+
+ def makemap(visits):
+     key = lambda v: v.url
+
+     def it():
+         vit = (throw(v) if isinstance(v, Exception) else v for v in visits)
+         for k, g in groupby(sorted(vit, key=key), key=key):
+             yield k, list(sorted(g))
+
+     return dict(it())
+
+
+ def test_json() -> None:
+     mm = makemap(auto.index(get_testdata('auto'), ignored='*/orgs/*'))
+     assert mm.keys() == _JSON_URLS
+
+     # TODO not sure if they deserve separate visits..
+     [v1, v2] = mm[sa2464]
+     assert v1.context == 'list::yyy::given_url'
+     # todo not sure if editor:// works on Windows
+     assert v1.locator.href.startswith('editor://')
+     assert v1.locator.href.endswith('pocket.json')
+     # TODO line number?
+
+
+ def test_auto() -> None:
+     mm = makemap(auto.index(get_testdata('auto')))
+     org_link = 'https://www.youtube.com/watch?v=rHIkrotSwcc'
+     assert {
+         *_JSON_URLS,
+         org_link,
+     }.issubset(mm.keys())
+
+     [v] = mm[org_link]
+     assert v.locator.title == 'orgs' + os.sep + 'file.org:14'  # meh
+     assert v.locator.href.endswith('file.org:14')
+     assert "xxx /r/cpp" in v.context
+     assert "I've enjoyed [Chandler Carruth's" in v.context
+
+
+ def test_obsidian() -> None:
+     mm = makemap(auto.index(get_testdata('obsidian-vault')))
+     example_url = 'https://example.com'
+     [v] = mm[example_url]
+     assert v.locator.href.startswith('obsidian://')
+
+
+ def test_logseq() -> None:
+     mm = makemap(auto.index(get_testdata('logseq-graph')))
+     example_url = 'https://example.com'
+     [v] = mm[example_url]
+     assert v.locator.href.startswith('logseq://')
promnesia/tests/sources/test_filetypes.py ADDED
@@ -0,0 +1,42 @@
+ from pathlib import Path
+
+ from ...common import PathIsh, _is_windows as windows
+ from ...sources.auto import by_path
+
+
+ def handled(p: PathIsh) -> bool:
+     idx, m = by_path(Path(p))
+     return idx is not None
+     # ideally these won't hit the libmagic path (which would try to open the file and cause FileNotFoundError)
+
+
+ def test_filetypes() -> None:
+     # test media
+     for ext in 'avi mp4 mp3 webm'.split() + ([] if windows else 'mkv'.split()):
+         assert handled('file.' + ext)
+
+     # images
+     for ext in 'gif jpg png jpeg'.split():
+         assert handled('file.' + ext)
+
+     # TODO more granular checks that these are ignored?
+     # binaries
+     for ext in 'o sqlite'.split() + ([] if windows else 'class jar'.split()):
+         assert handled('file.' + ext)
+
+     # these might potentially have some links
+     for ext in [
+         'svg',
+         'pdf', 'epub', 'ps',
+         'doc', 'ppt', 'xsl',
+         # seriously, windows doesn't know about docx???
+         *([] if windows else 'docx pptx xlsx'.split()),
+         *([] if windows else 'ods odt rtf'.split()),
+     ] + ([] if windows else 'djvu'.split()):
+         assert handled('file.' + ext)
+
+     # source code
+     for ext in 'rs tex el js sh hs pl h py hpp c go css'.split() + ([] if windows else 'java cpp'.split()):
+         assert handled('file.' + ext)
+
+     assert handled('x.html')
promnesia/tests/sources/test_hypothesis.py ADDED
@@ -0,0 +1,39 @@
+ from pathlib import Path
+
+ from ..common import write_config, get_testdata
+ from ...__main__ import do_index
+ from ...database.load import get_all_db_visits
+
+ from my.core.cfg import tmp_config
+
+
+ def index_hypothesis(tmp_path: Path) -> None:
+     def cfg() -> None:
+         from promnesia.common import Source
+         from promnesia.sources import hypothesis
+
+         SOURCES = [Source(hypothesis.index, name='hyp')]
+
+     cfg_path = tmp_path / 'config.py'
+     write_config(cfg_path, cfg)
+
+     class hpi_config:
+         class hypothesis:
+             export_path = get_testdata('hypexport/testdata') / 'netrights-dashboard-mockup/data/*.json'
+
+     with tmp_config(modules='my.hypothesis', config=hpi_config):
+         do_index(cfg_path)
+
+
+ def test_hypothesis(tmp_path: Path) -> None:
+     index_hypothesis(tmp_path)
+
+     visits = get_all_db_visits(tmp_path / 'promnesia.sqlite')
+     assert len(visits) > 100
+
+     [vis] = [x for x in visits if 'fundamental fact of evolution' in (x.context or '')]
+
+     assert vis.norm_url == 'wired.com/2017/04/the-myth-of-a-superhuman-ai'
+     assert vis.orig_url == 'https://www.wired.com/2017/04/the-myth-of-a-superhuman-ai/'
+     assert vis.locator.href == 'https://hyp.is/_Z9ccmVZEeexBOO7mToqdg/www.wired.com/2017/04/the-myth-of-a-superhuman-ai/'
+     assert 'misconception about evolution is fueling misconception about AI' in (vis.context or '')  # contains notes as well
promnesia/tests/sources/test_org.py ADDED
@@ -0,0 +1,65 @@
+ from typing import Optional
+
+ from ...common import Visit
+ from ...sources.org import extract_from_file
+
+ from ..common import get_testdata, throw
+
+
+ def delrf(s: Optional[str]) -> Optional[str]:
+     if s is None:
+         return None
+     # meh.. not sure how to handle this properly, ideally should be via pytest?
+     # not sure if should just do it in the indexer? e.g. extension might not like it
+     return s.replace('\r', '')
+
+
+ def test_org_indexer() -> None:
+     [_, cpp, cozy] = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file.org'))]
+
+     assert cpp.url == 'https://www.youtube.com/watch?v=rHIkrotSwcc'
+     # TODO not sure about filetags?
+     exp = '''
+ xxx /r/cpp :cpp:programming:
+ I've enjoyed [Chandler Carruth's _There Are No Zero-cost Abstractions_](
+ https://www.youtube.com/watch?v=rHIkrotSwcc) very much.
+
+ '''.lstrip()
+     assert delrf(cpp.context) == exp
+
+     assert cozy.url == 'https://twitter.com/Mappletons/status/1255221220263563269'
+
+
+ def test_org_indexer_2() -> None:
+     items = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file3.org'))]
+
+     assert len(items) == 6
+     assert items[0].url == 'https://www.reddit.com/r/androidapps/comments/4i36z9/how_you_use_your_android_to_the_maximum/d2uq24i'
+     assert items[1].url == 'https://link.com'
+     assert items[-2].url == 'https://en.wikipedia.org/wiki/Resilio_Sync'
+     # TODO shit def need org specific url extractor (and then extract from everything remaining)
+     # assert results[-1].url == 'https://en.wikipedia.org/wiki/InterPlanetary_File_System'
+
+
+ def test_heading() -> None:
+     items = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file2.org'))]
+     assert {i.url for i in items} == {
+         'https://en.wikipedia.org/wiki/Computational_topology',
+         'http://graphics.stanford.edu/courses/cs468-09-fall/',
+         'https://en.wikipedia.org/wiki/Triangulation_(topology)',
+         'https://en.wikipedia.org/wiki/Digital_manifold',
+     }
+
+
+ def test_url_in_properties() -> None:
+     items = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file4.org'))]
+
+     assert len(items) == 2, items
+     assert items[0].url == 'https://example.org/ref_example'
+     assert items[1].url == 'http://example.org/a_test'
+
+
+ def test_5() -> None:
+     items = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file5.org'))]
+
+     assert len(items) == 0  # shouldn't crash at least
promnesia/tests/sources/test_plaintext.py ADDED
@@ -0,0 +1,26 @@
+ from ...common import Source
+ from ...extract import extract_visits
+ from ...sources import plaintext, shellcmd
+
+ from ..common import get_testdata, unwrap
+
+
+ def test_plaintext_path_extractor() -> None:
+     visits = list(extract_visits(
+         Source(
+             shellcmd.index,
+             plaintext.extract_from_path(get_testdata('custom')),
+         ),
+         src='whatever',
+     ))
+     assert {unwrap(v).orig_url for v in visits} == {
+         'http://google.com',
+         'http://google.com/',
+         'http://some-weird-domain.xyz/whatever',
+         'https://google.com',
+         'http://what.about.this.link',
+     }
+
+     [wa] = [v for v in visits if unwrap(v).orig_url == 'http://what.about.this.link']
+     f2 = get_testdata('custom') / 'file2.txt'
+     assert unwrap(wa).locator.href == f'editor://{f2}:3'  # occurs on line 3