promnesia 1.2.20230515__py3-none-any.whl → 1.3.20241021__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (84)
  1. promnesia/__init__.py +14 -3
  2. promnesia/__main__.py +60 -35
  3. promnesia/cannon.py +27 -27
  4. promnesia/common.py +85 -67
  5. promnesia/compare.py +21 -22
  6. promnesia/compat.py +10 -10
  7. promnesia/config.py +23 -23
  8. promnesia/database/common.py +67 -0
  9. promnesia/database/dump.py +188 -0
  10. promnesia/{read_db.py → database/load.py} +16 -17
  11. promnesia/extract.py +14 -11
  12. promnesia/kjson.py +12 -11
  13. promnesia/logging.py +4 -4
  14. promnesia/misc/__init__.pyi +0 -0
  15. promnesia/misc/config_example.py +1 -2
  16. promnesia/misc/install_server.py +7 -9
  17. promnesia/server.py +57 -47
  18. promnesia/sources/__init__.pyi +0 -0
  19. promnesia/sources/auto.py +50 -35
  20. promnesia/sources/auto_logseq.py +6 -5
  21. promnesia/sources/auto_obsidian.py +2 -2
  22. promnesia/sources/browser.py +14 -9
  23. promnesia/sources/browser_legacy.py +26 -16
  24. promnesia/sources/demo.py +19 -3
  25. promnesia/sources/fbmessenger.py +3 -2
  26. promnesia/sources/filetypes.py +16 -7
  27. promnesia/sources/github.py +7 -9
  28. promnesia/sources/guess.py +2 -1
  29. promnesia/sources/hackernews.py +2 -2
  30. promnesia/sources/hpi.py +2 -2
  31. promnesia/sources/html.py +7 -5
  32. promnesia/sources/hypothesis.py +4 -3
  33. promnesia/sources/instapaper.py +2 -2
  34. promnesia/sources/markdown.py +31 -21
  35. promnesia/sources/org.py +27 -13
  36. promnesia/sources/plaintext.py +30 -29
  37. promnesia/sources/pocket.py +3 -2
  38. promnesia/sources/reddit.py +20 -19
  39. promnesia/sources/roamresearch.py +2 -1
  40. promnesia/sources/rss.py +4 -5
  41. promnesia/sources/shellcmd.py +19 -6
  42. promnesia/sources/signal.py +33 -24
  43. promnesia/sources/smscalls.py +2 -2
  44. promnesia/sources/stackexchange.py +4 -3
  45. promnesia/sources/takeout.py +76 -9
  46. promnesia/sources/takeout_legacy.py +24 -12
  47. promnesia/sources/telegram.py +13 -11
  48. promnesia/sources/telegram_legacy.py +18 -7
  49. promnesia/sources/twitter.py +6 -5
  50. promnesia/sources/vcs.py +5 -3
  51. promnesia/sources/viber.py +10 -9
  52. promnesia/sources/website.py +4 -4
  53. promnesia/sources/zulip.py +3 -2
  54. promnesia/sqlite.py +7 -4
  55. promnesia/tests/__init__.py +0 -0
  56. promnesia/tests/common.py +140 -0
  57. promnesia/tests/server_helper.py +67 -0
  58. promnesia/tests/sources/__init__.py +0 -0
  59. promnesia/tests/sources/test_auto.py +65 -0
  60. promnesia/tests/sources/test_filetypes.py +43 -0
  61. promnesia/tests/sources/test_hypothesis.py +39 -0
  62. promnesia/tests/sources/test_org.py +64 -0
  63. promnesia/tests/sources/test_plaintext.py +25 -0
  64. promnesia/tests/sources/test_shellcmd.py +21 -0
  65. promnesia/tests/sources/test_takeout.py +56 -0
  66. promnesia/tests/test_cannon.py +325 -0
  67. promnesia/tests/test_cli.py +40 -0
  68. promnesia/tests/test_compare.py +30 -0
  69. promnesia/tests/test_config.py +289 -0
  70. promnesia/tests/test_db_dump.py +222 -0
  71. promnesia/tests/test_extract.py +65 -0
  72. promnesia/tests/test_extract_urls.py +43 -0
  73. promnesia/tests/test_indexer.py +251 -0
  74. promnesia/tests/test_server.py +291 -0
  75. promnesia/tests/test_traverse.py +39 -0
  76. promnesia/tests/utils.py +35 -0
  77. {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/METADATA +15 -18
  78. promnesia-1.3.20241021.dist-info/RECORD +83 -0
  79. {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/WHEEL +1 -1
  80. {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/entry_points.txt +0 -1
  81. promnesia/dump.py +0 -105
  82. promnesia-1.2.20230515.dist-info/RECORD +0 -58
  83. {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/LICENSE +0 -0
  84. {promnesia-1.2.20230515.dist-info → promnesia-1.3.20241021.dist-info}/top_level.txt +0 -0
promnesia/sources/viber.py CHANGED
@@ -2,17 +2,18 @@
 Collects visits from Viber desktop app (e.g. `~/.ViberPC/XYZ123/viber.db`)
 """
 
+from __future__ import annotations
+
 import logging
+import sqlite3
 import textwrap
+from collections.abc import Iterable
 from os import PathLike
 from pathlib import Path
-import sqlite3
-from typing import Iterable, Optional
 
 from ..common import Loc, PathIsh, Results, Visit, extract_urls, from_epoch, join_tags
 from ..sqlite import sqlite_connection
 
-
 logger = logging.getLogger(__name__)
 
 
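The import churn above follows a pattern applied across the whole release: typing.Optional and typing.Iterable give way to PEP 604 unions and collections.abc, with `from __future__ import annotations` added so the new syntax keeps working on older interpreters. A minimal sketch (hypothetical function, not from the package) of why the future-import matters:

    from __future__ import annotations  # PEP 563: annotations are stored as strings, not evaluated

    # without the future-import, `bool | None` in an annotation raises TypeError on Python < 3.10
    def f(http_only: bool | None = None) -> str:
        return 'https' if http_only else 'any'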
@@ -34,12 +35,12 @@ def index(
 
     msgs_query = messages_query(http_only)
 
-    for db_path in _get_files(db_path):
-        assert db_path.is_file(), f"Is it a (Viber-desktop sqlite) file? {db_path}"
-        yield from _harvest_db(db_path, msgs_query, locator_schema)
+    for db in _get_files(db_path):
+        assert db.is_file(), f"Is it a (Viber-desktop sqlite) file? {db}"
+        yield from _harvest_db(db, msgs_query, locator_schema)
 
 
-def messages_query(http_only: Optional[bool]) -> str:
+def messages_query(http_only: bool | None) -> str:
     """
     An SQL-query returning 1 row for each message
 
@@ -123,7 +124,7 @@ def _handle_row(row: sqlite3.Row, db_path: PathLike, locator_schema: str) -> Res
     tags: str = row["tags"]
     url_title: str = row["url_title"]
 
-    assert (
+    assert (  # noqa: PT018
         text and mid and sender and chatname
     ), f"sql-query should eliminate messages without 'http' or missing ids: {row}"
 
@@ -154,7 +155,7 @@ def _get_files(path: PathIsh) -> Iterable[Path]:
     """
     path = Path(path).expanduser()
     parts = path.parts[1:] if path.is_absolute() else path.parts
-    return Path(path.root).glob(str(Path("").joinpath(*parts)))
+    return Path(path.root).glob(str(Path("").joinpath(*parts)))  # noqa: PTH201
 
 
 def _harvest_db(db_path: PathIsh, msgs_query: str, locator_schema: str) -> Results:
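The glob that gained the PTH201 suppression is what lets index() accept wildcard paths. A rough sketch of what _get_files does with one (the input path is illustrative, borrowed from the module docstring):

    from pathlib import Path

    path = Path('~/.ViberPC/*/viber.db').expanduser()
    parts = path.parts[1:] if path.is_absolute() else path.parts
    # globbing relative to the filesystem root is what makes absolute wildcard patterns work;
    # Path("") is the construct the PTH201 lint complains about
    matches = list(Path(path.root).glob(str(Path('').joinpath(*parts))))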
promnesia/sources/website.py CHANGED
@@ -2,12 +2,12 @@
 Clones a website with wget and indexes via sources.auto
 '''
 
-from pathlib import Path
 import re
+from collections.abc import Iterable
+from pathlib import Path
 from subprocess import run
-from typing import Iterable
 
-from ..common import Extraction, PathIsh, get_tmpdir, slugify, get_logger
+from promnesia.common import Extraction, PathIsh, get_logger, get_tmpdir, slugify
 
 
 def index(path: PathIsh, *args, **kwargs) -> Iterable[Extraction]:
@@ -30,7 +30,7 @@ def index(path: PathIsh, *args, **kwargs) -> Iterable[Extraction]:
     ]
     # TODO follow sitemap? e.g. gwern
     logger.info(' '.join(cmd))
-    res = run(cmd)
+    res = run(cmd, check=False)
 
     if res.returncode == 8:
         # man wget: 8 means server error (e.g. broken link)
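The added check=False only spells out subprocess.run's default behaviour (likely to satisfy a linter rule such as ruff's PLW1510): no exception on a non-zero exit, so index() can special-case wget's exit code 8 itself. A self-contained sketch of the two styles, using a stand-in command:

    import sys
    from subprocess import CalledProcessError, run

    cmd = [sys.executable, '-c', 'raise SystemExit(8)']  # simulates wget's "server error" exit code

    res = run(cmd, check=False)  # never raises; the caller inspects returncode itself
    assert res.returncode == 8

    try:
        run(cmd, check=True)  # the alternative: raise on any non-zero exit
    except CalledProcessError as e:
        assert e.returncode == 8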
promnesia/sources/zulip.py CHANGED
@@ -2,12 +2,13 @@
 Uses [[https://github.com/karlicoss/HPI][HPI]] for Zulip data.
 '''
 
-from ..common import Results, Visit, Loc, iter_urls
+from promnesia.common import Loc, Results, Visit, iter_urls
 
 
 def index() -> Results:
-    from . import hpi
+    from . import hpi  # noqa: F401,I001
     import my.zulip.organization as Z
+
     for m in Z.messages():
         if isinstance(m, Exception):
             yield m
promnesia/sqlite.py CHANGED
@@ -1,6 +1,9 @@
-from contextlib import contextmanager
+from __future__ import annotations
+
 import sqlite3
-from typing import Callable, Optional, Any, Iterator, Union, Literal
+from collections.abc import Iterator
+from contextlib import contextmanager
+from typing import Any, Callable, Literal, Union
 
 from .common import PathIsh
 
@@ -10,13 +13,13 @@ SqliteRowFactory = Callable[[sqlite3.Cursor, sqlite3.Row], Any]
 
 def dict_factory(cursor, row):
     fields = [column[0] for column in cursor.description]
-    return {key: value for key, value in zip(fields, row)}
+    return dict(zip(fields, row))
 
 
 Factory = Union[SqliteRowFactory, Literal['row', 'dict']]
 
 @contextmanager
-def sqlite_connection(db: PathIsh, *, immutable: bool=False, row_factory: Optional[Factory]=None) -> Iterator[sqlite3.Connection]:
+def sqlite_connection(db: PathIsh, *, immutable: bool=False, row_factory: Factory | None=None) -> Iterator[sqlite3.Connection]:
     dbp = f'file:{db}'
     # https://www.sqlite.org/draft/uri.html#uriimmutable
     if immutable:
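A hedged usage sketch for the updated signature (the database file and table here are hypothetical): row_factory='dict' routes rows through dict_factory above, and immutable=True opens the database via SQLite's immutable URI flag:

    from promnesia.sqlite import sqlite_connection

    with sqlite_connection('visits.db', immutable=True, row_factory='dict') as conn:
        for row in conn.execute('SELECT url, dt FROM visits'):
            print(row['url'], row['dt'])  # rows arrive as plain dicts keyed by column name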
promnesia/tests/__init__.py ADDED
File without changes
promnesia/tests/common.py ADDED
@@ -0,0 +1,140 @@
+from __future__ import annotations
+
+import gc
+import inspect
+import os
+import socket
+import sys
+from collections.abc import Iterator
+from contextlib import closing, contextmanager
+from pathlib import Path
+from textwrap import dedent
+from typing import NoReturn, TypeVar
+
+import pytest
+
+from ..common import Res, _is_windows
+
+
+def under_ci() -> bool:
+    return 'CI' in os.environ
+
+
+def throw(x: Exception) -> NoReturn:
+    '''
+    like raise, but can be an expression...
+    '''
+    raise x
+
+
+@pytest.fixture
+def gc_control(*, gc_on: bool):
+    if gc_on:
+        # no need to do anything, should be on by default
+        yield
+        return
+
+    gc.disable()
+    try:
+        yield
+    finally:
+        gc.enable()
+
+
+running_on_ci = 'CI' in os.environ
+
+
+GIT_ROOT = Path(__file__).absolute().parent.parent.parent.parent
+TESTDATA = GIT_ROOT / 'tests/testdata'
+
+
+def get_testdata(path: str) -> Path:
+    assert TESTDATA.is_dir()
+    res = TESTDATA / path
+    if not res.exists():
+        raise RuntimeError(f"'{res}' not found! You probably need to run 'git submodule update --init --recursive'")
+    return TESTDATA / path
+
+
+@contextmanager
+def tmp_popen(*args, **kwargs):
+    import psutil
+    with psutil.Popen(*args, **kwargs) as p:
+        try:
+            yield p
+        finally:
+            for c in p.children(recursive=True):
+                c.kill()
+            p.kill()
+            p.wait()
+
+
+# meh
+def promnesia_bin(*args):
+    # not sure it's a good idea to diverge, but not sure if there's a better way either?
+    # ugh. on windows there is no bash so can't use the script
+    # whatever...
+    if under_ci() or _is_windows:
+        # should be able to use the installed version
+        return [sys.executable, '-m', 'promnesia', *args]
+    else:
+        # use version from the repository
+        root = Path(__file__).parent.parent.parent.parent
+        pm = root / 'scripts/promnesia'
+        return [pm, *args]
+
+
+# meh... not great
+@pytest.fixture
+def reset_filters():
+    from .. import extract
+
+    extract.filters.cache_clear()
+    try:
+        yield
+    finally:
+        extract.filters.cache_clear()
+
+
+# TODO could be a TypeGuard from 3.10
+V = TypeVar('V')
+
+def unwrap(r: Res[V]) -> V:
+    assert not isinstance(r, Exception), r
+    return r
+
+
+def write_config(path: Path, gen, **kwargs) -> None:
+    output_dir = path.parent
+    cfg_src = dedent('\n'.join(inspect.getsource(gen).splitlines()[1:])) + f"\nOUTPUT_DIR = r'{output_dir}'"
+    for k, v in kwargs.items():
+        assert k in cfg_src, k
+        cfg_src = cfg_src.replace(k, repr(str(v)))  # meh
+    path.write_text(cfg_src)
+
+
+@contextmanager
+def free_port() -> Iterator[int]:
+    # this is a generator to make sure there are no race conditions between the time we call this and launch program
+    #
+    # also some relevant articles about this 'technique'
+    # - https://eklitzke.org/binding-on-port-zero
+    # - https://idea.popcount.org/2014-04-03-bind-before-connect
+    # - https://blog.cloudflare.com/the-quantum-state-of-a-tcp-port
+    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
+        if sys.platform == 'linux':
+            # Ok, so from what I've been reading, SO_REUSEADDR should only be necessary in the program that reuses the port
+            # However, this answer (or man socket) claims we need it on both sides in Linux? see https://superuser.com/a/587955/300795
+            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+            # also not sure where REUSEADDR is set in uvicorn (e.g. here reuse_address isn't passed?)
+            # https://github.com/encode/uvicorn/blob/6d666d99a285153bc4613e811543c39eca57054a/uvicorn/server.py#L162C37-L162C50
+            # but from strace looks like it is called somewhere :shrug:
+
+        # assign ephemeral port
+        # see table in
+        # https://stackoverflow.com/questions/14388706/how-do-so-reuseaddr-and-so-reuseport-differ/14388707#14388707
+        # we rely on server binding to localhost later (or anything except 0.0.0.0 really)
+        s.bind(('', 0))
+
+        port = s.getsockname()[1]
+        yield port
promnesia/tests/server_helper.py ADDED
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+import sys
+import time
+from collections.abc import Iterator
+from contextlib import contextmanager
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+import psutil
+import requests
+
+from ..common import PathIsh
+from .common import free_port, promnesia_bin, tmp_popen
+
+
+@dataclass
+class Helper:
+    host: str
+    port: str
+    process: psutil.Popen
+
+    def get(self, path: str):
+        # check it's alive first so the error is cleaner
+        assert self.process.poll() is None, self.process
+        return requests.get(f'http://{self.host}:{self.port}' + path)
+
+    def post(self, path: str, *, json: dict[str, Any] | None = None):
+        assert self.process.poll() is None, self.process
+        return requests.post(f'http://{self.host}:{self.port}' + path, json=json)
+
+
+@contextmanager
+def run_server(db: PathIsh | None = None, *, timezone: str | None = None) -> Iterator[Helper]:
+    # TODO not sure, perhaps best to use a thread or something?
+    # but for some tests makes more sense to test in a separate process
+    with free_port() as pp:
+        # ugh. under docker 'localhost' tries to bind it to ipv6 (::1) for some reason???
+        host = '0.0.0.0' if Path('/.dockerenv').exists() else 'localhost'
+        port = str(pp)
+        args = [
+            'serve',
+            '--host', host,
+            '--quiet',
+            '--port', port,
+            *([] if timezone is None else ['--timezone', timezone]),
+            *([] if db is None else ['--db', str(db)]),
+        ]
+        with tmp_popen(promnesia_bin(*args)) as server_process:
+            server = Helper(host=host, port=port, process=server_process)
+
+            # wait till ready
+            for _ in range(50):
+                try:
+                    server.get('/status').json()
+                    break
+                except:
+                    time.sleep(0.1)
+            else:
+                raise RuntimeError("Couldn't connect to the server after 50 attempts")
+            print(f"Started server up, db: {db}", file=sys.stderr)
+
+            yield server
+
+            # TODO use logger!
+            print("Done with the server", file=sys.stderr)
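A sketch of how a test might drive this helper ('/status' is the endpoint the readiness loop above polls; the test itself is hypothetical):

    from promnesia.tests.server_helper import run_server

    def test_server_is_alive() -> None:
        # starts `promnesia serve` on a free port and waits until it responds
        with run_server() as server:
            assert server.get('/status').status_code == 200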
promnesia/tests/sources/__init__.py ADDED
File without changes
promnesia/tests/sources/test_auto.py ADDED
@@ -0,0 +1,65 @@
+import os
+from itertools import groupby
+
+from ...sources import auto
+from ..common import get_testdata, throw
+
+sa2464 = 'https://www.scottaaronson.com/blog/?p=2464'
+
+_JSON_URLS = {
+    'https://johncarlosbaez.wordpress.com/2016/09/09/struggles-with-the-continuum-part-2/',
+    sa2464,
+}
+
+
+def makemap(visits):
+    key = lambda v: v.url
+
+    def it():
+        vit = (throw(v) if isinstance(v, Exception) else v for v in visits)
+        for k, g in groupby(sorted(vit, key=key), key=key):
+            yield k, sorted(g)
+
+    return dict(it())
+
+
+def test_json() -> None:
+    mm = makemap(auto.index(get_testdata('auto'), ignored='*/orgs/*'))
+    assert mm.keys() == _JSON_URLS
+
+    # TODO not sure if they deserve separate visits..
+    [v1, v2] = mm[sa2464]
+    assert v1.context == 'list::yyy::given_url'
+    # todo not sure if editor:// work on Windows
+    assert v1.locator.href.startswith('editor://')
+    assert v1.locator.href.endswith('pocket.json')
+    # TODO line number?
+
+
+def test_auto() -> None:
+    mm = makemap(auto.index(get_testdata('auto')))
+    org_link = 'https://www.youtube.com/watch?v=rHIkrotSwcc'
+    assert {
+        *_JSON_URLS,
+        org_link,
+    }.issubset(mm.keys())
+
+    [v] = mm[org_link]
+    assert v.locator.title == 'orgs' + os.sep + 'file.org:14'  # meh
+    assert v.locator.href.endswith('file.org:14')
+    assert "xxx /r/cpp" in v.context
+    assert "I've enjoyed [Chandler Carruth's" in v.context
+
+
+def test_obsidian() -> None:
+    mm = makemap(auto.index(get_testdata('obsidian-vault')))
+    example_url = 'https://example.com'
+    [v] = mm[example_url]
+    assert v.locator.href.startswith('obsidian://')
+
+
+def test_logseq() -> None:
+    mm = makemap(auto.index(get_testdata('logseq-graph')))
+    example_url = 'https://example.com'
+    [v] = mm[example_url]
+    assert v.locator.href.startswith('logseq://')
promnesia/tests/sources/test_filetypes.py ADDED
@@ -0,0 +1,43 @@
+from pathlib import Path
+
+from ...common import PathIsh
+from ...common import _is_windows as windows
+from ...sources.auto import by_path
+
+
+def handled(p: PathIsh) -> bool:
+    idx, m = by_path(Path(p))
+    return idx is not None
+    # ideally these won't hit libmagic path (would try to open the file and cause FileNotFoundError)
+
+
+def test_filetypes() -> None:
+    # test media
+    for ext in 'avi mp4 mp3 webm'.split() + ([] if windows else 'mkv'.split()):
+        assert handled('file.' + ext)
+
+    # images
+    for ext in 'gif jpg png jpeg'.split():
+        assert handled('file.' + ext)
+
+    # TODO more granular checks that these are ignored?
+    # binaries
+    for ext in 'o sqlite'.split() + ([] if windows else 'class jar'.split()):
+        assert handled('file.' + ext)
+
+    # these might have potentially some links
+    for ext in [
+        'svg',
+        'pdf', 'epub', 'ps',
+        'doc', 'ppt', 'xsl',
+        # seriously, windows doesn't know about docx???
+        *([] if windows else 'docx pptx xlsx'.split()),
+        *([] if windows else 'ods odt rtf'.split()),
+    ] + ([] if windows else 'djvu'.split()):
+        assert handled('file.' + ext)
+
+    # source code
+    for ext in 'rs tex el js sh hs pl h py hpp c go css'.split() + ([] if windows else 'java cpp'.split()):
+        assert handled('file.' + ext)
+
+    assert handled('x.html')
promnesia/tests/sources/test_hypothesis.py ADDED
@@ -0,0 +1,39 @@
+from pathlib import Path
+
+from my.core.cfg import tmp_config
+
+from ...__main__ import do_index
+from ...database.load import get_all_db_visits
+from ..common import get_testdata, write_config
+
+
+def index_hypothesis(tmp_path: Path) -> None:
+    def cfg() -> None:
+        from promnesia.common import Source
+        from promnesia.sources import hypothesis
+
+        SOURCES = [Source(hypothesis.index, name='hyp')]
+
+    cfg_path = tmp_path / 'config.py'
+    write_config(cfg_path, cfg)
+
+    class hpi_config:
+        class hypothesis:
+            export_path = get_testdata('hypexport/testdata') / 'netrights-dashboard-mockup/data/*.json'
+
+    with tmp_config(modules='my.hypothesis', config=hpi_config):
+        do_index(cfg_path)
+
+
+def test_hypothesis(tmp_path: Path) -> None:
+    index_hypothesis(tmp_path)
+
+    visits = get_all_db_visits(tmp_path / 'promnesia.sqlite')
+    assert len(visits) > 100
+
+    [vis] = [x for x in visits if 'fundamental fact of evolution' in (x.context or '')]
+
+    assert vis.norm_url == 'wired.com/2017/04/the-myth-of-a-superhuman-ai'
+    assert vis.orig_url == 'https://www.wired.com/2017/04/the-myth-of-a-superhuman-ai/'
+    assert vis.locator.href == 'https://hyp.is/_Z9ccmVZEeexBOO7mToqdg/www.wired.com/2017/04/the-myth-of-a-superhuman-ai/'
+    assert 'misconception about evolution is fueling misconception about AI' in (vis.context or '')  # contains notes as well
promnesia/tests/sources/test_org.py ADDED
@@ -0,0 +1,64 @@
+from __future__ import annotations
+
+from ...common import Visit
+from ...sources.org import extract_from_file
+from ..common import get_testdata, throw
+
+
+def delrf(s: str | None) -> str | None:
+    if s is None:
+        return None
+    # meh.. not sure how to handle this properly, ideally should be via pytest?
+    # not sure if should just do it in the indexer? e.g. extension might not like it
+    return s.replace('\r', '')
+
+
+def test_org_indexer() -> None:
+    [_, cpp, cozy] = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file.org'))]
+
+    assert cpp.url == 'https://www.youtube.com/watch?v=rHIkrotSwcc'
+    # TODO not sure about filetags?
+    exp = '''
+xxx /r/cpp :cpp:programming:
+I've enjoyed [Chandler Carruth's _There Are No Zero-cost Abstractions_](
+https://www.youtube.com/watch?v=rHIkrotSwcc) very much.
+
+'''.lstrip()
+    assert delrf(cpp.context) == exp
+
+    assert cozy.url == 'https://twitter.com/Mappletons/status/1255221220263563269'
+
+
+def test_org_indexer_2() -> None:
+    items = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file3.org'))]
+
+    assert len(items) == 6
+    assert items[0].url == 'https://www.reddit.com/r/androidapps/comments/4i36z9/how_you_use_your_android_to_the_maximum/d2uq24i'
+    assert items[1].url == 'https://link.com'
+    assert items[-2].url == 'https://en.wikipedia.org/wiki/Resilio_Sync'
+    # TODO shit def need org specific url extractor (and then extract from everything remaining)
+    # assert results[-1].url == 'https://en.wikipedia.org/wiki/InterPlanetary_File_System'
+
+
+def test_heading() -> None:
+    items = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file2.org'))]
+    assert {i.url for i in items} == {
+        'https://en.wikipedia.org/wiki/Computational_topology',
+        'http://graphics.stanford.edu/courses/cs468-09-fall/',
+        'https://en.wikipedia.org/wiki/Triangulation_(topology)',
+        'https://en.wikipedia.org/wiki/Digital_manifold',
+    }
+
+
+def test_url_in_properties() -> None:
+    items = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file4.org'))]
+
+    assert len(items) == 2, items
+    assert items[0].url == 'https://example.org/ref_example'
+    assert items[1].url == 'http://example.org/a_test'
+
+
+def test_5() -> None:
+    items = [v if isinstance(v, Visit) else throw(v) for v in extract_from_file(get_testdata('auto/orgs/file5.org'))]
+
+    assert len(items) == 0  # shouldn't crash at least
promnesia/tests/sources/test_plaintext.py ADDED
@@ -0,0 +1,25 @@
+from ...common import Source
+from ...extract import extract_visits
+from ...sources import plaintext, shellcmd
+from ..common import get_testdata, unwrap
+
+
+def test_plaintext_path_extractor() -> None:
+    visits = list(extract_visits(
+        Source(
+            shellcmd.index,
+            plaintext.extract_from_path(get_testdata('custom')),
+        ),
+        src='whatever',
+    ))
+    assert {unwrap(v).orig_url for v in visits} == {
+        'http://google.com',
+        'http://google.com/',
+        'http://some-weird-domain.xyz/whatever',
+        'https://google.com',
+        'http://what.about.this.link',
+    }
+
+    [wa] = [v for v in visits if unwrap(v).orig_url == 'http://what.about.this.link']
+    f2 = get_testdata('custom') / 'file2.txt'
+    assert unwrap(wa).locator.href == f'editor://{f2}:3'  # occurs line 3
promnesia/tests/sources/test_shellcmd.py ADDED
@@ -0,0 +1,21 @@
+import pytest
+
+from ...common import Source, _is_windows
+from ...extract import extract_visits
+from ...sources import shellcmd
+from ..common import get_testdata
+
+
+@pytest.mark.skipif(_is_windows, reason="no grep on windows")
+def test_via_grep() -> None:
+
+    visits = list(extract_visits(
+        Source(
+            shellcmd.index,
+            # meh. maybe should deprecate plain string here...
+            r"""grep -Eo -r --no-filename (http|https)://\S+ """ + str(get_testdata('custom')),
+        ),
+        src='whatever',
+    ))
+    # TODO I guess filtering of equivalent urls should rather be tested on something having context (e.g. org mode)
+    assert len(visits) == 5
promnesia/tests/sources/test_takeout.py ADDED
@@ -0,0 +1,56 @@
+from datetime import datetime, timezone
+
+import pytest
+from my.core.cfg import tmp_config
+
+from ...common import Source
+from ...extract import extract_visits
+from ...sources import takeout
+from ..common import get_testdata, unwrap
+
+
+# TODO apply in conftest so it's used in all tests?
+@pytest.fixture
+def no_cachew():
+    from my.core.cachew import disabled_cachew
+
+    with disabled_cachew():
+        yield
+
+
+# todo testing this logic probably belongs to hpi or google_takeout_export, but whatever
+def test_takeout_directory(no_cachew) -> None:
+    class config:
+        class google:
+            takeout_path = get_testdata('takeout')
+
+    with tmp_config(modules='my.google.takeout.*', config=config):
+        visits = list(extract_visits(Source(takeout.index), src='takeout'))
+
+    assert len(visits) == 3
+    assert all(unwrap(v).dt.tzinfo is not None for v in visits)
+
+
+def test_takeout_zip(no_cachew) -> None:
+    class config:
+        class google:
+            takeout_path = get_testdata('takeout-20150518T000000Z.zip')
+
+    with tmp_config(modules='my.google.takeout.*', config=config):
+        visits = list(extract_visits(Source(takeout.index), src='takeout'))
+
+    assert len(visits) == 3
+    assert all(unwrap(v).dt.tzinfo is not None for v in visits)
+
+    [vis] = [v for v in visits if unwrap(v).norm_url == 'takeout.google.com/settings/takeout']
+
+    edt = datetime(
+        year=2018,
+        month=9,
+        day=18,
+        hour=5,
+        minute=48,
+        second=23,
+        tzinfo=timezone.utc,
+    )
+    assert unwrap(vis).dt == edt