promnesia 1.2.20240810__py3-none-any.whl → 1.4.20250909__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. promnesia/__init__.py +18 -4
  2. promnesia/__main__.py +104 -78
  3. promnesia/cannon.py +108 -107
  4. promnesia/common.py +107 -88
  5. promnesia/compare.py +33 -30
  6. promnesia/compat.py +10 -10
  7. promnesia/config.py +37 -34
  8. promnesia/database/common.py +4 -3
  9. promnesia/database/dump.py +13 -13
  10. promnesia/database/load.py +7 -7
  11. promnesia/extract.py +19 -17
  12. promnesia/logging.py +27 -15
  13. promnesia/misc/install_server.py +32 -27
  14. promnesia/server.py +106 -79
  15. promnesia/sources/auto.py +104 -77
  16. promnesia/sources/auto_logseq.py +6 -5
  17. promnesia/sources/auto_obsidian.py +2 -2
  18. promnesia/sources/browser.py +20 -10
  19. promnesia/sources/browser_legacy.py +65 -50
  20. promnesia/sources/demo.py +7 -8
  21. promnesia/sources/fbmessenger.py +3 -3
  22. promnesia/sources/filetypes.py +22 -16
  23. promnesia/sources/github.py +9 -8
  24. promnesia/sources/guess.py +6 -2
  25. promnesia/sources/hackernews.py +7 -9
  26. promnesia/sources/hpi.py +5 -3
  27. promnesia/sources/html.py +11 -7
  28. promnesia/sources/hypothesis.py +3 -2
  29. promnesia/sources/instapaper.py +3 -2
  30. promnesia/sources/markdown.py +22 -12
  31. promnesia/sources/org.py +36 -17
  32. promnesia/sources/plaintext.py +41 -39
  33. promnesia/sources/pocket.py +5 -3
  34. promnesia/sources/reddit.py +24 -26
  35. promnesia/sources/roamresearch.py +5 -2
  36. promnesia/sources/rss.py +6 -8
  37. promnesia/sources/shellcmd.py +21 -11
  38. promnesia/sources/signal.py +27 -26
  39. promnesia/sources/smscalls.py +2 -3
  40. promnesia/sources/stackexchange.py +5 -4
  41. promnesia/sources/takeout.py +37 -34
  42. promnesia/sources/takeout_legacy.py +29 -19
  43. promnesia/sources/telegram.py +18 -12
  44. promnesia/sources/telegram_legacy.py +22 -11
  45. promnesia/sources/twitter.py +7 -6
  46. promnesia/sources/vcs.py +11 -6
  47. promnesia/sources/viber.py +11 -10
  48. promnesia/sources/website.py +8 -7
  49. promnesia/sources/zulip.py +3 -2
  50. promnesia/sqlite.py +13 -7
  51. promnesia/tests/common.py +10 -5
  52. promnesia/tests/server_helper.py +13 -10
  53. promnesia/tests/sources/test_auto.py +2 -3
  54. promnesia/tests/sources/test_filetypes.py +11 -8
  55. promnesia/tests/sources/test_hypothesis.py +10 -6
  56. promnesia/tests/sources/test_org.py +9 -5
  57. promnesia/tests/sources/test_plaintext.py +9 -8
  58. promnesia/tests/sources/test_shellcmd.py +13 -13
  59. promnesia/tests/sources/test_takeout.py +3 -5
  60. promnesia/tests/test_cannon.py +256 -239
  61. promnesia/tests/test_cli.py +12 -8
  62. promnesia/tests/test_compare.py +17 -13
  63. promnesia/tests/test_config.py +7 -8
  64. promnesia/tests/test_db_dump.py +15 -15
  65. promnesia/tests/test_extract.py +17 -10
  66. promnesia/tests/test_indexer.py +24 -18
  67. promnesia/tests/test_server.py +12 -13
  68. promnesia/tests/test_traverse.py +0 -2
  69. promnesia/tests/utils.py +3 -7
  70. promnesia-1.4.20250909.dist-info/METADATA +66 -0
  71. promnesia-1.4.20250909.dist-info/RECORD +80 -0
  72. {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info}/WHEEL +1 -2
  73. promnesia/kjson.py +0 -121
  74. promnesia/sources/__init__.pyi +0 -0
  75. promnesia-1.2.20240810.dist-info/METADATA +0 -54
  76. promnesia-1.2.20240810.dist-info/RECORD +0 -83
  77. promnesia-1.2.20240810.dist-info/top_level.txt +0 -1
  78. {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info}/entry_points.txt +0 -0
  79. {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info/licenses}/LICENSE +0 -0
promnesia/cannon.py CHANGED
@@ -9,16 +9,18 @@ are same content, but you can't tell that by URL equality. Even canonical urls a
 
 Also some experiments to establish 'URL hierarchy'.
 """
-# TODO eh?? they fixed mobile.twitter.com?
 
-from itertools import chain
+from __future__ import annotations
+
 import re
 import typing
-from typing import Iterable, NamedTuple, Set, Optional, List, Sequence, Union, Tuple, Dict, Any, Collection
-
 import urllib.parse
-from urllib.parse import urlsplit, parse_qsl, urlunsplit, parse_qs, urlencode, SplitResult
+from collections.abc import Collection, Iterable, Sequence
 
+# TODO eh?? they fixed mobile.twitter.com?
+from itertools import chain
+from typing import Any, NamedTuple
+from urllib.parse import SplitResult, parse_qsl, urlencode, urlsplit, urlunsplit
 
 # this has some benchmark, but quite a few librarires seem unmaintained, sadly
 # I guess i'll stick to default for now, until it's a critical bottleneck
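Note: the rewritten import block relies on `from __future__ import annotations` (PEP 563), which defers annotation evaluation, so the `X | None` and `dict[...]` spellings used throughout the new module stay valid in annotations even on interpreters without runtime support for them. A minimal, illustrative-only sketch (names below are not from the package):

    from __future__ import annotations  # PEP 563: annotations are not evaluated at runtime

    def first_visit(visits: dict[str, int], url: str) -> int | None:
        # `dict[str, int]` and `int | None` are fine here under the future import,
        # even where these forms are not valid runtime expressions
        return visits.get(url)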
@@ -27,17 +29,6 @@ from urllib.parse import urlsplit, parse_qsl, urlunsplit, parse_qs, urlencode, S
 
 # TODO perhaps archive.org contributes to both?
 
-def try_cutl(prefix: str, s: str) -> str:
-    if s.startswith(prefix):
-        return s[len(prefix):]
-    else:
-        return s
-
-def try_cutr(suffix: str, s: str) -> str:
-    if s.endswith(suffix):
-        return s[:-len(suffix)]
-    else:
-        return s
 
 # TODO move this to site-specific normalisers?
 dom_subst = [
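Note: the deleted try_cutl/try_cutr helpers are replaced later in this diff by the built-in str.removeprefix/str.removesuffix methods (Python 3.9+). A quick sketch of the equivalence, with illustrative strings only:

    # str.removeprefix / str.removesuffix behave like the removed helpers
    assert 'www.example.com'.removeprefix('www.') == 'example.com'      # try_cutl('www.', ...)
    assert 'example.com/path/'.removesuffix('/') == 'example.com/path'  # try_cutr('/', ...)
    # and, like the helpers, they are no-ops when the affix is absent
    assert 'example.com'.removeprefix('www.') == 'example.com'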
@@ -58,22 +49,22 @@ dom_subst = [
     # app.getpocket.com is the canonical domain in the JSON returned by
     # https://github.com/karlicoss/pockexport, so let's canonicalize to that.
     ('getpocket.' , 'app.getpocket.'),
-]
+] # fmt: skip
+
 
 def canonify_domain(dom: str) -> str:
     # TODO perhaps not necessary now that I'm checking suffixes??
     for st in ('www.', 'amp.'):
-        dom = try_cutl(st, dom)
+        dom = dom.removeprefix(st)
 
     for start, repl in dom_subst:
         if dom.startswith(start):
-            dom = repl + dom[len(start):]
+            dom = repl + dom[len(start) :]
             break
     return dom
 
 
-
 default_qremove = {
     'utm_source',
     'utm_campaign',
 
@@ -92,7 +83,7 @@ default_qremove = {
 
     # e.g. on github
     'utf8',
-}
+} # fmt: skip
 
 default_qkeep = [
     # ok, various BBS have it, hackernews has it etc?
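Note: putting the preceding hunks together, canonify_domain() strips the 'www.'/'amp.' prefixes and then applies dom_subst, while parameters in default_qremove are dropped during canonification. A rough illustration, with expected values inferred from the rules shown in this diff rather than taken from the package's tests:

    canonify_domain('www.getpocket.com')   # -> 'app.getpocket.com' (dom_subst entry above)
    canonify_domain('amp.example.com')     # -> 'example.com'
    # query parameters listed in default_qremove (utm_source, utm_campaign, utf8, ...)
    # are dropped unless a site-specific spec explicitly keeps them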
@@ -104,25 +95,22 @@ default_qkeep = [
 
     # common to some sites.., usually 'post'
     'p',
-]
+] # fmt: skip
+
 
 # TODO perhaps, decide if fragment is meaningful (e.g. wiki) or random sequence of letters?
 class Spec(NamedTuple):
-    qkeep : Optional[Union[Collection[str], bool]] = None
-    qremove: Optional[Set[str]] = None
-    fkeep : bool = False
+    qkeep: Collection[str] | bool | None = None
+    qremove: set[str] | None = None
+    fkeep: bool = False
 
-    def keep_query(self, q: str) -> Optional[int]: # returns order
+    def keep_query(self, q: str) -> int | None:  # returns order
         if self.qkeep is True:
             return 1
-        qkeep = {
-            q: i for i, q in enumerate(chain(default_qkeep, self.qkeep or []))
-        }
+        qkeep = {q: i for i, q in enumerate(chain(default_qkeep, self.qkeep or []))}
         qremove = default_qremove.union(self.qremove or {})
         # I suppose 'remove' is only useful for logging. we remove by default anyway
 
-        keep = False
-        remove = False
         qk = qkeep.get(q)
         if qk is not None:
             return qk
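Note: keep_query() returns an ordering index for query parameters that should survive canonification and None for the rest; qkeep=True keeps everything. A rough sketch of the behaviour implied by the code above (parameter names taken from the visible default_qkeep/default_qremove entries):

    spec = Spec.make(qkeep=['v'])
    spec.keep_query('p')            # 'p' is in default_qkeep, so it gets a small index
    spec.keep_query('v')            # enumerated after the defaults, so a larger index
    spec.keep_query('utm_source')   # listed in default_qremove -> None, i.e. dropped
    Spec.make(qkeep=True).keep_query('anything')  # -> 1, everything is kept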
@@ -134,13 +122,14 @@ class Spec(NamedTuple):
         return None
 
     @classmethod
-    def make(cls, **kwargs) -> 'Spec':
+    def make(cls, **kwargs) -> Spec:
         return cls(**kwargs)
 
+
 S = Spec
 
 # TODO perhaps these can be machine learnt from large set of urls?
-specs: Dict[str, Spec] = {
+specs: dict[str, Spec] = {
     'youtube.com': S(
         # TODO search_query?
         qkeep=[ # note: experimental.. order matters here
@@ -178,7 +167,6 @@ specs: Dict[str, Spec] = {
 
             'source', 'tsid', 'refsrc', 'pnref', 'rc', '_rdr', 'src', 'hc_location', 'section', 'permPage', 'soft', 'pn_ref', 'action',
             'ti', 'aref', 'event_time_id', 'action_history', 'filter', 'ref_notif_type', 'has_source', 'source_newsfeed_story_type',
-            'ref_notif_type',
         },
     ),
     'physicstravelguide.com': S(fkeep=True), # TODO instead, pass fkeep marker object for shorter spec?
@@ -189,9 +177,11 @@ specs: Dict[str, Spec] = {
     'play.google.com' : S(qkeep={'id'}),
     'answers.yahoo.com' : S(qkeep={'qid'}),
     'isfdb.org': S(qkeep=True),
-}
+} # fmt: skip
 
 _def_spec = S()
+
+
 # TODO use cache?
 def get_spec(dom: str) -> Spec:
     # ugh. a bit ugly way of getting stuff without subdomain...
@@ -208,20 +198,19 @@ def get_spec(dom: str) -> Spec:
     return _def_spec
 
 
-
 # ideally we'd just be able to reference the domain name and use it in the subst?
 # some sort of hierarchical matchers? not sure what's got better performance..
 # if 'from?site=' in url:
 #     return p.query('site')
 
-Spec2 = Any # TODO
+Spec2 = Any  # TODO
 
 # TODO this should be a map
 Frag = Any
-Parts = Sequence[Tuple[str, str]]
+Parts = Sequence[tuple[str, str]]
 
 
-def _yc(domain: str, path: str, qq: Parts, frag: Frag) -> Tuple[Any, Any, Parts, Frag]:
+def _yc(domain: str, path: str, qq: Parts, frag: Frag) -> tuple[Any, Any, Parts, Frag]:
     if path[:5] == '/from':
         site = dict(qq).get('site')
         if site is not None:
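Note: get_spec2() (next hunk) attaches a per-domain rewrite hook, and _yc is the only one registered; from the visible lines it resolves Hacker News 'from?site=...' links to the submitted site. A rough, illustrative sketch of that intent (the full function body is not shown in this hunk):

    qq = [('site', 'example.com')]     # query parts as (key, value) pairs
    path = '/from'
    if path[:5] == '/from':
        site = dict(qq).get('site')    # -> 'example.com'
        # ...the hook can then rewrite the visit to point at the submitted site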
@@ -232,7 +221,8 @@ def _yc(domain: str, path: str, qq: Parts, frag: Frag) -> Tuple[Any, Any, Parts,
     # TODO this should be in-place? for brevity?
     return (domain, path, qq, frag)
 
-def get_spec2(dom: str) -> Optional[Spec2]:
+
+def get_spec2(dom: str) -> Spec2 | None:
     return {
         'news.ycombinator.com': _yc,
     }.get(dom)
@@ -241,8 +231,9 @@ def get_spec2(dom: str) -> Optional[Spec2]:
 class CanonifyException(Exception):
     pass
 
+
 # TODO not so sure if it's better to quote or not?
-quote_via   = urllib.parse.quote
+quote_via = urllib.parse.quote
 unquote_via = urllib.parse.unquote
 
 
@@ -251,7 +242,9 @@ def _quote_path(path: str) -> str:
     nparts = []
     for p in parts:
         # TODO maybe re.match?
-        if '%' in p or '+' in p: # some urls are partially encoded... perhaps canonify needs hints indicating if url needs normalising or not
+        if (
+            '%' in p or '+' in p
+        ):  # some urls are partially encoded... perhaps canonify needs hints indicating if url needs normalising or not
             p = unquote_via(p)
             # TODO safe argumnet?
         nparts.append(quote_via(p))
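Note: the check above normalises partially encoded path segments by decoding and re-encoding them, so raw and already-encoded inputs converge on the same form. A small standalone illustration with urllib.parse (not from the package):

    from urllib.parse import quote, unquote

    # already-encoded and raw segments end up identical after unquote -> quote
    assert quote(unquote('foo%20bar')) == 'foo%20bar'
    assert quote(unquote('foo bar')) == 'foo%20bar'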
@@ -269,44 +262,46 @@ def _prenormalise(url: str) -> str:
     # not sure how safe it in general...
     first_q = url.find('&')
     if first_q != -1:
-        return url[:first_q] + '?' + url[first_q + 1:]
+        return url[:first_q] + '?' + url[first_q + 1 :]
     return url
 
 
+Left = str | Sequence[str]
+Right = tuple[str, str, str]
+
+
 def transform_split(split: SplitResult):
     netloc = canonify_domain(split.netloc)
 
-    path   = split.path
-    qparts = parse_qsl(split.query, keep_blank_values=True)
+    path = split.path
+    qparts = parse_qsl(split.query, keep_blank_values=True)
 
     fragment = split.fragment
 
-    ID   = r'(?P<id>[^/]+)'
-    REST = r'(?P<rest>.*)'
+    ID = r'(?P<id>[^/]+)'
+    # REST = r'(?P<rest>.*)'
 
-    Left = Union[str, Sequence[str]]
-    Right = Tuple[str, str, str]
     # the idea is that we can unify certain URLs here and map them to the 'canonical' one
     # this is a dict only for grouping but should be a list really.. todo
-    rules: Dict[Left, Right] = {
+    rules: dict[Left, Right] = {
         # TODO m. handling might be quite common
        # f'm.youtube.com/{REST}': ('youtube.com', '{rest}'),
         (
             f'youtu.be/{ID}',
             f'youtube.com/embed/{ID}',
-        ) : ('youtube.com', '/watch', 'v={id}'),
+        ): ('youtube.com', '/watch', 'v={id}'),
         # TODO wonder if there is a better candidate for canonical video link?
         # {DOMAIN} pattern? implicit?
         (
             'twitter.com/home',
             'twitter.com/explore',
-        ) : ('twitter.com', '', ''),
+        ): ('twitter.com', '', ''),
     }
 
     def iter_rules():
         for fr, to in rules.items():
             if isinstance(fr, str):
-                fr = (fr, )
+                fr = (fr,)
             for f in fr:
                 yield f, to
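Note: the rules table above drives a regex rewrite; roughly, each left-hand pattern is matched with re.fullmatch and its named groups are substituted into the right-hand (netloc, path, query) template, as the following hunk shows. An illustrative standalone sketch of that mechanism:

    import re

    ID = r'(?P<id>[^/]+)'
    fr, to = f'youtu.be/{ID}', ('youtube.com', '/watch', 'v={id}')
    m = re.fullmatch(fr, 'youtu.be/dQw4w9WgXcQ')
    if m is not None:
        netloc, path, qq = (t.format(**m.groupdict()) for t in to)
        # -> ('youtube.com', '/watch', 'v=dQw4w9WgXcQ')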
 
@@ -322,29 +317,28 @@ def transform_split(split: SplitResult):
             continue
         gd = m.groupdict()
         if len(to) == 2:
-            to = to + ('', )
+            to = (*to, '')
 
-        (netloc, path, qq) = [t.format(**gd) for t in to]
-        qparts.extend(parse_qsl(qq, keep_blank_values=True)) # TODO hacky..
+        (netloc, path, qq) = (t.format(**gd) for t in to)
+        qparts.extend(parse_qsl(qq, keep_blank_values=True))  # TODO hacky..
         # TODO eh, qparts should really be a map or something...
         break
 
-
     return netloc, path, qparts, fragment
 
 
-
 def myunsplit(domain: str, path: str, query: str, fragment: str) -> str:
-    uns = urlunsplit((
-        '', # dummy protocol
-        domain,
-        path,
-        query,
-        fragment,
-    ))
-    uns = try_cutl('//', uns) # // due to dummy protocol
-    return uns
-
+    uns = urlunsplit(
+        (
+            '',  # dummy protocol
+            domain,
+            path,
+            query,
+            fragment,
+        )
+    )
+    uns = uns.removeprefix('//')  # // due to dummy protocol
+    return uns  # ty: ignore[invalid-return-type] # see https://github.com/astral-sh/ty/issues/733
 
 
 #
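Note: myunsplit() builds a scheme-less URL, and urlunsplit() with an empty scheme prepends '//', hence the removeprefix('//') call. A standalone illustration:

    from urllib.parse import urlunsplit

    uns = urlunsplit(('', 'example.com', '/path', 'q=1', ''))
    assert uns == '//example.com/path?q=1'            # '//' comes from the empty scheme
    assert uns.removeprefix('//') == 'example.com/path?q=1'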
@@ -361,7 +355,8 @@ def myunsplit(domain: str, path: str, query: str, fragment: str) -> str:
 # ]
 # for re in regexes:
 
-def handle_archive_org(url: str) -> Optional[str]:
+
+def handle_archive_org(url: str) -> str | None:
     are = r'web.archive.org/web/(?P<timestamp>\d+)/(?P<rest>.*)'
     m = re.fullmatch(are, url)
     if m is None:
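Note: the pattern extracts the archived timestamp and the original URL so that canonify() can recurse on the latter (see the next hunk). A quick standalone check of the regex:

    import re

    are = r'web.archive.org/web/(?P<timestamp>\d+)/(?P<rest>.*)'
    m = re.fullmatch(are, 'web.archive.org/web/20200101000000/https://example.com/page')
    assert m is not None
    assert m.group('rest') == 'https://example.com/page'  # re-canonified by the caller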
@@ -397,7 +392,7 @@ def canonify(url: str) -> str:
 
     res = handle_archive_org(no_protocol)
     if res is not None:
-        assert len(res) < len(no_protocol) # just a paranoia to avoid infinite recursion...
+        assert len(res) < len(no_protocol)  # just a paranoia to avoid infinite recursion...
         return canonify(res)
 
     domain, path, qq, _frag = transform_split(parts)
@@ -407,7 +402,6 @@ def canonify(url: str) -> str:
         # meh
         domain, path, qq, _frag = spec2(domain, path, qq, _frag)
 
-
     spec = get_spec(domain)
 
     # TODO FIXME turn this logic back on?
@@ -427,11 +421,11 @@ def canonify(url: str) -> str:
     path = _quote_path(path)
 
     uns = myunsplit(domain, path, query, frag)
-    uns = try_cutr('/', uns) # not sure if there is a better way
+    uns = uns.removesuffix('/')  # not sure if there is a better way
     return uns
 
 
-# TODO wonder if lisp could be convenient for this. lol
+# TODO wonder if lisp could be convenient for this. lol
 TW_PATTERNS = [
     {
         'U': r'[\w-]+',
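Note: for a feel of the end result, canonify() strips the scheme, applies the domain and rule rewrites, filters query parameters through the spec, and drops a trailing slash. The expected values below are inferred from the rules visible in this diff and are indicative only:

    canonify('https://youtu.be/dQw4w9WgXcQ')  # -> 'youtube.com/watch?v=dQw4w9WgXcQ'
    canonify('https://twitter.com/explore')   # -> 'twitter.com'  (rule above, then trailing-slash cleanup)
    canonify('https://www.reddit.com/')       # -> 'reddit.com'   (www. stripped, trailing '/' removed)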
@@ -469,7 +463,7 @@ TW_PATTERNS = [
     r'twitter.com/i/events/\d+',
 
     r'(dev|api|analytics|developer|help|support|blog|anywhere|careers|pic).twitter.com/.*',
-]
+] # fmt: skip
 
 RD_PATTERNS = [
     {
@@ -500,7 +494,7 @@ RD_PATTERNS = [
     r'reddit.com/dev/api',
     r'reddit.com/api/v1/authorize',
     r'reddit.com/domain/.*',
-]
+] # fmt: skip
 
 GH_PATTERNS = [
     {
@@ -543,7 +537,7 @@ GH_PATTERNS = [
     # TODO FIXME no canonical here
     # https://gist.github.com/dneto/2258454
     # same as https://gist.github.com/2258454
-]
+] # fmt: skip
 
 YT_PATTERNS = [
     {
@@ -566,7 +560,7 @@ YT_PATTERNS = [
     r'youtube.com/feed/(subscriptions|library|trending|history)',
     r'youtube.com',
     r'youtube.com/(post_login|upload)',
-]
+] # fmt: skip
 
 SOP = r'(^|\w+\.)stackoverflow.com'
 
@@ -589,7 +583,7 @@ SO_PATTERNS = [
     SOP + r'/users/UI',
     SOP + r'/users/UI/U',
     SOP,
-]
+] # fmt: skip
 
 WKP = r'(^|.+\.)wikipedia.org'
 
@@ -599,7 +593,7 @@ WK_PATTERNS = [
     },
     WKP + '/wiki/AN',
     WKP,
-]
+] # fmt: skip
 
 FB_PATTERNS = [
     {
@@ -627,7 +621,7 @@ FB_PATTERNS = [
     r'F/pages/U/P',
     r'F/stories/I',
     r'F/notes/U/P',
-]
+] # fmt: skip
 
 PKP = r'^(app)?\.getpocket\.com'
 
@@ -636,7 +630,7 @@ PK_PATTERNS = [
         'ID': r'\d+',
     },
     PKP + '/read/ID',
-]
+] # fmt: skip
 
 # NOTE: right, I think this is just for analysis so far... not actually used
 PATTERNS = {
@@ -649,7 +643,7 @@ PATTERNS = {
     'wikipedia' : WK_PATTERNS,
     'pocket' : PK_PATTERNS,
     # 'news.ycombinator.com': YC_PATTERNS,
-}
+} # fmt: skip
 
 
 def get_patterns(): # pragma: no cover
@@ -672,11 +666,13 @@ def get_patterns(): # pragma: no cover
         [rdict] = repls
         for p in pats:
             yield repl(p, rdict)
+
     return {k: list(handle(v)) for k, v in PATTERNS.items()}
 
 
-def domains(it): # pragma: no cover
+def domains(it):  # pragma: no cover
     from collections import Counter
+
     c: typing.Counter[str] = Counter()
     for line in it:
         url = line.strip()
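Note: domains() tallies how often each normalised domain occurs, taking everything before the first '/' of the canonical URL as the domain. A self-contained illustration of that counting pattern:

    from collections import Counter

    c: Counter[str] = Counter()
    for nurl in ['youtube.com/watch?v=a', 'youtube.com/watch?v=b', 'twitter.com/i/events/1']:
        c[nurl[: nurl.find('/')]] += 1
    assert c.most_common(2) == [('youtube.com', 2), ('twitter.com', 1)]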
@@ -687,18 +683,20 @@ def domains(it): # pragma: no cover
             c['ERROR'] += 1
             continue
         else:
-            udom = nurl[:nurl.find('/')]
+            udom = nurl[: nurl.find('/')]
             c[udom] += 1
     from pprint import pprint
+
     pprint(c.most_common(20))
 
 
-def groups(it, args): # pragma: no cover
+def groups(it, args):  # pragma: no cover
     all_pats = get_patterns()
 
     from collections import Counter
-    c: typing.Counter[Optional[str]] = Counter()
-    unmatched: List[str] = []
+
+    c: typing.Counter[str | None] = Counter()
+    unmatched: list[str] = []
 
     def dump():
         print(c)
@@ -709,7 +707,6 @@ def groups(it, args): # pragma: no cover
         if pat is None:
             unmatched.append(nurl)
 
-
     for i, line in enumerate(it):
         if i % 10000 == 0:
             pass
@@ -720,12 +717,12 @@ def groups(it, args): # pragma: no cover
         except CanonifyException as e:
             print(f"ERROR while normalising! {url} {e}")
             continue
-        udom = nurl[:nurl.find('/')]
+        udom = nurl[: nurl.find('/')]
         usplit = udom.split('.')
         patterns = None
         for dom, pats in all_pats.items():
             dsplit = dom.split('.')
-            if '$'.join(dsplit) in '$'.join(usplit): # meh
+            if '$'.join(dsplit) in '$'.join(usplit):  # meh
                 patterns = pats
                 break
         else:
@@ -749,17 +746,17 @@ def groups(it, args): # pragma: no cover
     print(f"Unmatched: {nones / sum(c.values()) * 100:.1f}%")
     uc = Counter([u.split('/')[:2][-1] for u in unmatched]).most_common(10)
     from pprint import pprint
-    pprint(uc)
 
+    pprint(uc)
 
 
-def display(it, args) -> None: # pragma: no cover
+def display(it, args) -> None:  # pragma: no cover
     # TODO better name?
     import difflib
-    # pylint: disable=import-error
-    from termcolor import colored as C # type: ignore
     from sys import stdout
 
+    from termcolor import colored as C  # type: ignore[import-not-found]
+
     for line in it:
         line = line.strip()
         if args.human:
@@ -773,15 +770,16 @@ def display(it, args) -> None: # pragma: no cover
         can_ = ""
 
         pr = False
+
         def delete(x):
             nonlocal pr
             if x in (
-                    'https://www.',
-                    'http://www.',
-                    'http://',
-                    'https://',
-                    'file://',
-                    '/',
+                'https://www.',
+                'http://www.',
+                'http://',
+                'https://',
+                'file://',
+                '/',
             ):
                 col = None
             else:
@@ -800,9 +798,8 @@ def display(it, args) -> None: # pragma: no cover
             fn = lambda x: C(x, color='cyan')
             # TODO exclude certain items from comparison?
 
-
-            org_ += fn(line[ff: tt])
-            can_ += fn(can[ff2: tt2])
+            org_ += fn(line[ff:tt])
+            can_ += fn(can[ff2:tt2])
             cl = max(len(org_), len(can_))
             org_ += ' ' * (cl - len(org_))
             can_ += ' ' * (cl - len(can_))
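Note: the ff/tt/ff2/tt2 slices are consistent with difflib.SequenceMatcher.get_opcodes(), which display() appears to use to colour the parts of the original URL that survive canonification. A minimal standalone sketch of that API:

    import difflib

    a, b = 'https://www.youtube.com/watch?v=x', 'youtube.com/watch?v=x'
    sm = difflib.SequenceMatcher(None, a, b)
    for tag, ff, tt, ff2, tt2 in sm.get_opcodes():
        # 'equal' spans are shared; 'delete' spans (e.g. 'https://www.') were stripped
        print(tag, repr(a[ff:tt]), repr(b[ff2:tt2]))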
@@ -811,14 +808,17 @@ def display(it, args) -> None: # pragma: no cover
         stdout.write(f'{org_}\n{can_}\n---\n')
 
 
-def main() -> None: # pragma: no cover
+def main() -> None:  # pragma: no cover
     import argparse
-    p = argparse.ArgumentParser(epilog='''
+
+    p = argparse.ArgumentParser(
+        epilog='''
 - sqlite3 promnesia.sqlite 'select distinct orig_url from visits' | cannon.py --domains
 
 - running comparison
 sqlite3 promnesia.sqlite 'select distinct orig_url from visits where norm_url like "%twitter%" order by orig_url' | src/promnesia/cannon.py
-''', formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=100)
+''',
+        formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=100),
     )
     p.add_argument('input', nargs='?')
     p.add_argument('--human', action='store_true')
@@ -829,6 +829,7 @@ def main() -> None: # pragma: no cover
     it: Iterable[str]
     if args.input is None:
         import sys
+
         it = sys.stdin
     else:
         it = [args.input]
@@ -842,7 +843,7 @@ def main() -> None: # pragma: no cover
 
 
 if __name__ == '__main__':
-    main() # pragma: no cover
+    main()  # pragma: no cover
 
 # TODO hmm, it's actually sort of fingerprinter... so maybe that's what I should call it
 