promnesia-1.2.20240810-py3-none-any.whl → promnesia-1.4.20250909-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- promnesia/__init__.py +18 -4
- promnesia/__main__.py +104 -78
- promnesia/cannon.py +108 -107
- promnesia/common.py +107 -88
- promnesia/compare.py +33 -30
- promnesia/compat.py +10 -10
- promnesia/config.py +37 -34
- promnesia/database/common.py +4 -3
- promnesia/database/dump.py +13 -13
- promnesia/database/load.py +7 -7
- promnesia/extract.py +19 -17
- promnesia/logging.py +27 -15
- promnesia/misc/install_server.py +32 -27
- promnesia/server.py +106 -79
- promnesia/sources/auto.py +104 -77
- promnesia/sources/auto_logseq.py +6 -5
- promnesia/sources/auto_obsidian.py +2 -2
- promnesia/sources/browser.py +20 -10
- promnesia/sources/browser_legacy.py +65 -50
- promnesia/sources/demo.py +7 -8
- promnesia/sources/fbmessenger.py +3 -3
- promnesia/sources/filetypes.py +22 -16
- promnesia/sources/github.py +9 -8
- promnesia/sources/guess.py +6 -2
- promnesia/sources/hackernews.py +7 -9
- promnesia/sources/hpi.py +5 -3
- promnesia/sources/html.py +11 -7
- promnesia/sources/hypothesis.py +3 -2
- promnesia/sources/instapaper.py +3 -2
- promnesia/sources/markdown.py +22 -12
- promnesia/sources/org.py +36 -17
- promnesia/sources/plaintext.py +41 -39
- promnesia/sources/pocket.py +5 -3
- promnesia/sources/reddit.py +24 -26
- promnesia/sources/roamresearch.py +5 -2
- promnesia/sources/rss.py +6 -8
- promnesia/sources/shellcmd.py +21 -11
- promnesia/sources/signal.py +27 -26
- promnesia/sources/smscalls.py +2 -3
- promnesia/sources/stackexchange.py +5 -4
- promnesia/sources/takeout.py +37 -34
- promnesia/sources/takeout_legacy.py +29 -19
- promnesia/sources/telegram.py +18 -12
- promnesia/sources/telegram_legacy.py +22 -11
- promnesia/sources/twitter.py +7 -6
- promnesia/sources/vcs.py +11 -6
- promnesia/sources/viber.py +11 -10
- promnesia/sources/website.py +8 -7
- promnesia/sources/zulip.py +3 -2
- promnesia/sqlite.py +13 -7
- promnesia/tests/common.py +10 -5
- promnesia/tests/server_helper.py +13 -10
- promnesia/tests/sources/test_auto.py +2 -3
- promnesia/tests/sources/test_filetypes.py +11 -8
- promnesia/tests/sources/test_hypothesis.py +10 -6
- promnesia/tests/sources/test_org.py +9 -5
- promnesia/tests/sources/test_plaintext.py +9 -8
- promnesia/tests/sources/test_shellcmd.py +13 -13
- promnesia/tests/sources/test_takeout.py +3 -5
- promnesia/tests/test_cannon.py +256 -239
- promnesia/tests/test_cli.py +12 -8
- promnesia/tests/test_compare.py +17 -13
- promnesia/tests/test_config.py +7 -8
- promnesia/tests/test_db_dump.py +15 -15
- promnesia/tests/test_extract.py +17 -10
- promnesia/tests/test_indexer.py +24 -18
- promnesia/tests/test_server.py +12 -13
- promnesia/tests/test_traverse.py +0 -2
- promnesia/tests/utils.py +3 -7
- promnesia-1.4.20250909.dist-info/METADATA +66 -0
- promnesia-1.4.20250909.dist-info/RECORD +80 -0
- {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info}/WHEEL +1 -2
- promnesia/kjson.py +0 -121
- promnesia/sources/__init__.pyi +0 -0
- promnesia-1.2.20240810.dist-info/METADATA +0 -54
- promnesia-1.2.20240810.dist-info/RECORD +0 -83
- promnesia-1.2.20240810.dist-info/top_level.txt +0 -1
- {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info}/entry_points.txt +0 -0
- {promnesia-1.2.20240810.dist-info → promnesia-1.4.20250909.dist-info/licenses}/LICENSE +0 -0
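
Below is the full diff for promnesia/cannon.py, one of the largest changes outside the test suite. Most of it is mechanical modernisation: from __future__ import annotations plus builtin generics (dict[str, Spec], str | None) replace the typing.Dict/Optional spellings, hand-aligned literals gain "# fmt: skip" markers so the formatter leaves them alone, and the hand-rolled try_cutl/try_cutr helpers are dropped in favour of the str.removeprefix/str.removesuffix builtins available since Python 3.9. A minimal sketch of that last swap (reconstructed from the removed lines below, not code shipped in either wheel):

    # the removed helpers, condensed; same semantics as in the old cannon.py
    def try_cutl(prefix: str, s: str) -> str:
        return s[len(prefix):] if s.startswith(prefix) else s

    def try_cutr(suffix: str, s: str) -> str:
        return s[:-len(suffix)] if s.endswith(suffix) else s

    # the 3.9+ builtins behave the same way, including the no-op case
    assert 'www.example.com'.removeprefix('www.') == try_cutl('www.', 'www.example.com')
    assert 'example.com/'.removesuffix('/') == try_cutr('/', 'example.com/')
    assert 'example.com'.removeprefix('www.') == 'example.com'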
promnesia/cannon.py
CHANGED
@@ -9,16 +9,18 @@ are same content, but you can't tell that by URL equality. Even canonical urls a
 
 Also some experiments to establish 'URL hierarchy'.
 """
-# TODO eh?? they fixed mobile.twitter.com?
 
-from itertools import chain
+from __future__ import annotations
+
 import re
 import typing
-from typing import Iterable, NamedTuple, Set, Optional, List, Sequence, Union, Tuple, Dict, Any, Collection
-
 import urllib.parse
-from urllib.parse import urlsplit, parse_qsl, urlunsplit, parse_qs, urlencode, SplitResult
+from collections.abc import Collection, Iterable, Sequence
 
+# TODO eh?? they fixed mobile.twitter.com?
+from itertools import chain
+from typing import Any, NamedTuple
+from urllib.parse import SplitResult, parse_qsl, urlencode, urlsplit, urlunsplit
 
 # this has some benchmark, but quite a few librarires seem unmaintained, sadly
 # I guess i'll stick to default for now, until it's a critical bottleneck
@@ -27,17 +29,6 @@ from urllib.parse import urlsplit, parse_qsl, urlunsplit, parse_qs, urlencode, S
 
 # TODO perhaps archive.org contributes to both?
 
-def try_cutl(prefix: str, s: str) -> str:
-    if s.startswith(prefix):
-        return s[len(prefix):]
-    else:
-        return s
-
-def try_cutr(suffix: str, s: str) -> str:
-    if s.endswith(suffix):
-        return s[:-len(suffix)]
-    else:
-        return s
 
 # TODO move this to site-specific normalisers?
 dom_subst = [
@@ -58,22 +49,22 @@ dom_subst = [
     # app.getpocket.com is the canonical domain in the JSON returned by
     # https://github.com/karlicoss/pockexport, so let's canonicalize to that.
    ('getpocket.' , 'app.getpocket.'),
-]
+]  # fmt: skip
+
 
 def canonify_domain(dom: str) -> str:
     # TODO perhaps not necessary now that I'm checking suffixes??
     for st in ('www.', 'amp.'):
-        dom = try_cutl(st, dom)
+        dom = dom.removeprefix(st)
 
     for start, repl in dom_subst:
         if dom.startswith(start):
-            dom = repl + dom[len(start):]
+            dom = repl + dom[len(start) :]
             break
 
     return dom
 
 
-
 default_qremove = {
     'utm_source',
     'utm_campaign',
@@ -92,7 +83,7 @@ default_qremove = {
 
     # e.g. on github
     'utf8',
-}
+}  # fmt: skip
 
 default_qkeep = [
     # ok, various BBS have it, hackernews has it etc?
@@ -104,25 +95,22 @@ default_qkeep = [
 
     # common to some sites.., usually 'post'
    'p',
-]
+]  # fmt: skip
+
 
 # TODO perhaps, decide if fragment is meaningful (e.g. wiki) or random sequence of letters?
 class Spec(NamedTuple):
-    qkeep  : Optional[Union[Collection[str], bool]] = None
-    qremove: Optional[Set[str]] = None
-    fkeep  : bool = False
+    qkeep: Collection[str] | bool | None = None
+    qremove: set[str] | None = None
+    fkeep: bool = False
 
-    def keep_query(self, q: str) -> Optional[int]: # returns order
+    def keep_query(self, q: str) -> int | None:  # returns order
         if self.qkeep is True:
             return 1
-        qkeep = {
-            q: i for i, q in enumerate(chain(default_qkeep, self.qkeep or []))
-        }
+        qkeep = {q: i for i, q in enumerate(chain(default_qkeep, self.qkeep or []))}
         qremove = default_qremove.union(self.qremove or {})
         # I suppose 'remove' is only useful for logging. we remove by default anyway
 
-        keep = False
-        remove = False
         qk = qkeep.get(q)
         if qk is not None:
             return qk
@@ -134,13 +122,14 @@ class Spec(NamedTuple):
         return None
 
     @classmethod
-    def make(cls, **kwargs) -> 'Spec':
+    def make(cls, **kwargs) -> Spec:
         return cls(**kwargs)
 
+
 S = Spec
 
 # TODO perhaps these can be machine learnt from large set of urls?
-specs: Dict[str, Spec] = {
+specs: dict[str, Spec] = {
     'youtube.com': S(
         # TODO search_query?
         qkeep=[ # note: experimental.. order matters here
@@ -178,7 +167,6 @@ specs: Dict[str, Spec] = {
 
             'source', 'tsid', 'refsrc', 'pnref', 'rc', '_rdr', 'src', 'hc_location', 'section', 'permPage', 'soft', 'pn_ref', 'action',
             'ti', 'aref', 'event_time_id', 'action_history', 'filter', 'ref_notif_type', 'has_source', 'source_newsfeed_story_type',
-            'ref_notif_type',
         },
     ),
     'physicstravelguide.com': S(fkeep=True), # TODO instead, pass fkeep marker object for shorter spec?
@@ -189,9 +177,11 @@ specs: Dict[str, Spec] = {
     'play.google.com' : S(qkeep={'id'}),
     'answers.yahoo.com' : S(qkeep={'qid'}),
     'isfdb.org': S(qkeep=True),
-}
+}  # fmt: skip
 
 _def_spec = S()
+
+
 # TODO use cache?
 def get_spec(dom: str) -> Spec:
     # ugh. a bit ugly way of getting stuff without subdomain...
@@ -208,20 +198,19 @@ def get_spec(dom: str) -> Spec:
     return _def_spec
 
 
-
 # ideally we'd just be able to reference the domain name and use it in the subst?
 # some sort of hierarchical matchers? not sure what's got better performance..
 # if 'from?site=' in url:
 #     return p.query('site')
 
-Spec2 = Any
+Spec2 = Any  # TODO
 
 # TODO this should be a map
 Frag = Any
-Parts = Sequence[Tuple[str, str]]
+Parts = Sequence[tuple[str, str]]
 
 
-def _yc(domain: str, path: str, qq: Parts, frag: Frag) -> Tuple[Any, Any, Parts, Frag]:
+def _yc(domain: str, path: str, qq: Parts, frag: Frag) -> tuple[Any, Any, Parts, Frag]:
     if path[:5] == '/from':
         site = dict(qq).get('site')
         if site is not None:
@@ -232,7 +221,8 @@ def _yc(domain: str, path: str, qq: Parts, frag: Frag) -> Tuple[Any, Any, Parts,
     # TODO this should be in-place? for brevity?
     return (domain, path, qq, frag)
 
-def get_spec2(dom: str) -> Optional[Spec2]:
+
+def get_spec2(dom: str) -> Spec2 | None:
     return {
         'news.ycombinator.com': _yc,
     }.get(dom)
@@ -241,8 +231,9 @@ def get_spec2(dom: str) -> Optional[Spec2]:
 class CanonifyException(Exception):
     pass
 
+
 # TODO not so sure if it's better to quote or not?
-quote_via   = urllib.parse.quote
+quote_via = urllib.parse.quote
 unquote_via = urllib.parse.unquote
 
 
@@ -251,7 +242,9 @@ def _quote_path(path: str) -> str:
     nparts = []
     for p in parts:
         # TODO maybe re.match?
-        if '%' in p or '+' in p: # some urls are partially encoded... perhaps canonify needs hints indicating if url needs normalising or not
+        if (
+            '%' in p or '+' in p
+        ):  # some urls are partially encoded... perhaps canonify needs hints indicating if url needs normalising or not
             p = unquote_via(p)
         # TODO safe argumnet?
         nparts.append(quote_via(p))
@@ -269,44 +262,46 @@ def _prenormalise(url: str) -> str:
     # not sure how safe it in general...
     first_q = url.find('&')
     if first_q != -1:
-        return url[:first_q] + '?' + url[first_q + 1:]
+        return url[:first_q] + '?' + url[first_q + 1 :]
     return url
 
 
+Left = str | Sequence[str]
+Right = tuple[str, str, str]
+
+
 def transform_split(split: SplitResult):
     netloc = canonify_domain(split.netloc)
 
-    path     = split.path
-    qparts   = parse_qsl(split.query, keep_blank_values=True)
+    path = split.path
+    qparts = parse_qsl(split.query, keep_blank_values=True)
 
     fragment = split.fragment
 
-    ID   = r'(?P<id>[^/]+)'
-    REST = r'(?P<rest>.*)'
+    ID = r'(?P<id>[^/]+)'
+    # REST = r'(?P<rest>.*)'
 
-    Left = Union[str, Sequence[str]]
-    Right = Tuple[str, str, str]
     # the idea is that we can unify certain URLs here and map them to the 'canonical' one
     # this is a dict only for grouping but should be a list really.. todo
-    rules: Dict[Left, Right] = {
+    rules: dict[Left, Right] = {
         # TODO m. handling might be quite common
        # f'm.youtube.com/{REST}': ('youtube.com', '{rest}'),
        (
            f'youtu.be/{ID}',
            f'youtube.com/embed/{ID}',
-        ) : ('youtube.com', '/watch', 'v={id}'),
+        ): ('youtube.com', '/watch', 'v={id}'),
        # TODO wonder if there is a better candidate for canonical video link?
        # {DOMAIN} pattern? implicit?
        (
            'twitter.com/home',
            'twitter.com/explore',
-        ) : ('twitter.com', '', ''),
+        ): ('twitter.com', '', ''),
     }
 
     def iter_rules():
         for fr, to in rules.items():
             if isinstance(fr, str):
-                fr = (fr, )
+                fr = (fr,)
             for f in fr:
                 yield f, to
 
@@ -322,29 +317,28 @@ def transform_split(split: SplitResult):
             continue
         gd = m.groupdict()
         if len(to) == 2:
-            to = to + ('', )
+            to = (*to, '')
 
-        (netloc, path, qq) = [t.format(**gd) for t in to]
-        qparts.extend(parse_qsl(qq, keep_blank_values=True))
+        (netloc, path, qq) = (t.format(**gd) for t in to)
+        qparts.extend(parse_qsl(qq, keep_blank_values=True))  # TODO hacky..
         # TODO eh, qparts should really be a map or something...
         break
 
-
     return netloc, path, qparts, fragment
 
 
-
 def myunsplit(domain: str, path: str, query: str, fragment: str) -> str:
-    uns = urlunsplit((
-        '', # dummy protocol
-        domain,
-        path,
-        query,
-        fragment,
-    ))
-    uns = try_cutl('//', uns) # // due to dummy protocol
-    return uns
-
+    uns = urlunsplit(
+        (
+            '',  # dummy protocol
+            domain,
+            path,
+            query,
+            fragment,
+        )
+    )
+    uns = uns.removeprefix('//')  # // due to dummy protocol
+    return uns  # ty: ignore[invalid-return-type]  # see https://github.com/astral-sh/ty/issues/733
 
 
 #
@@ -361,7 +355,8 @@ def myunsplit(domain: str, path: str, query: str, fragment: str) -> str:
 # ]
 # for re in regexes:
 
-def handle_archive_org(url: str) -> Optional[str]:
+
+def handle_archive_org(url: str) -> str | None:
     are = r'web.archive.org/web/(?P<timestamp>\d+)/(?P<rest>.*)'
     m = re.fullmatch(are, url)
     if m is None:
@@ -397,7 +392,7 @@ def canonify(url: str) -> str:
 
     res = handle_archive_org(no_protocol)
     if res is not None:
-        assert len(res) < len(no_protocol)
+        assert len(res) < len(no_protocol)  # just a paranoia to avoid infinite recursion...
         return canonify(res)
 
     domain, path, qq, _frag = transform_split(parts)
@@ -407,7 +402,6 @@ def canonify(url: str) -> str:
         # meh
         domain, path, qq, _frag = spec2(domain, path, qq, _frag)
 
-
     spec = get_spec(domain)
 
     # TODO FIXME turn this logic back on?
@@ -427,11 +421,11 @@ def canonify(url: str) -> str:
     path = _quote_path(path)
 
     uns = myunsplit(domain, path, query, frag)
-    uns = try_cutr('/', uns) # not sure if there is a better way
+    uns = uns.removesuffix('/')  # not sure if there is a better way
     return uns
 
 
-
+# TODO wonder if lisp could be convenient for this. lol
 TW_PATTERNS = [
     {
         'U': r'[\w-]+',
@@ -469,7 +463,7 @@ TW_PATTERNS = [
     r'twitter.com/i/events/\d+',
 
     r'(dev|api|analytics|developer|help|support|blog|anywhere|careers|pic).twitter.com/.*',
-]
+]  # fmt: skip
 
 RD_PATTERNS = [
     {
@@ -500,7 +494,7 @@ RD_PATTERNS = [
     r'reddit.com/dev/api',
     r'reddit.com/api/v1/authorize',
     r'reddit.com/domain/.*',
-]
+]  # fmt: skip
 
 GH_PATTERNS = [
     {
@@ -543,7 +537,7 @@ GH_PATTERNS = [
     # TODO FIXME no canonical here
     # https://gist.github.com/dneto/2258454
     # same as https://gist.github.com/2258454
-]
+]  # fmt: skip
 
 YT_PATTERNS = [
     {
@@ -566,7 +560,7 @@ YT_PATTERNS = [
     r'youtube.com/feed/(subscriptions|library|trending|history)',
     r'youtube.com',
     r'youtube.com/(post_login|upload)',
-]
+]  # fmt: skip
 
 SOP = r'(^|\w+\.)stackoverflow.com'
 
@@ -589,7 +583,7 @@ SO_PATTERNS = [
     SOP + r'/users/UI',
     SOP + r'/users/UI/U',
     SOP,
-]
+]  # fmt: skip
 
 WKP = r'(^|.+\.)wikipedia.org'
 
@@ -599,7 +593,7 @@ WK_PATTERNS = [
     },
     WKP + '/wiki/AN',
     WKP,
-]
+]  # fmt: skip
 
 FB_PATTERNS = [
     {
@@ -627,7 +621,7 @@ FB_PATTERNS = [
     r'F/pages/U/P',
     r'F/stories/I',
     r'F/notes/U/P',
-]
+]  # fmt: skip
 
 PKP = r'^(app)?\.getpocket\.com'
 
@@ -636,7 +630,7 @@ PK_PATTERNS = [
         'ID': r'\d+',
     },
     PKP + '/read/ID',
-]
+]  # fmt: skip
 
 # NOTE: right, I think this is just for analysis so far... not actually used
 PATTERNS = {
@@ -649,7 +643,7 @@ PATTERNS = {
     'wikipedia' : WK_PATTERNS,
     'pocket'    : PK_PATTERNS,
     # 'news.ycombinator.com': YC_PATTERNS,
-}
+}  # fmt: skip
 
 
 def get_patterns():  # pragma: no cover
@@ -672,11 +666,13 @@ def get_patterns(): # pragma: no cover
         [rdict] = repls
         for p in pats:
             yield repl(p, rdict)
+
     return {k: list(handle(v)) for k, v in PATTERNS.items()}
 
 
-def domains(it):
+def domains(it):  # pragma: no cover
     from collections import Counter
+
     c: typing.Counter[str] = Counter()
     for line in it:
         url = line.strip()
@@ -687,18 +683,20 @@ def domains(it): # pragma: no cover
             c['ERROR'] += 1
             continue
         else:
-            udom = nurl[:nurl.find('/')]
+            udom = nurl[: nurl.find('/')]
             c[udom] += 1
     from pprint import pprint
+
     pprint(c.most_common(20))
 
 
-def groups(it, args):
+def groups(it, args):  # pragma: no cover
     all_pats = get_patterns()
 
     from collections import Counter
-    c: typing.Counter[Optional[str]] = Counter()
-    unmatched: List[str] = []
+
+    c: typing.Counter[str | None] = Counter()
+    unmatched: list[str] = []
 
     def dump():
         print(c)
@@ -709,7 +707,6 @@ def groups(it, args): # pragma: no cover
         if pat is None:
             unmatched.append(nurl)
 
-
     for i, line in enumerate(it):
         if i % 10000 == 0:
             pass
@@ -720,12 +717,12 @@ def groups(it, args): # pragma: no cover
         except CanonifyException as e:
             print(f"ERROR while normalising! {url} {e}")
             continue
-        udom = nurl[:nurl.find('/')]
+        udom = nurl[: nurl.find('/')]
         usplit = udom.split('.')
         patterns = None
         for dom, pats in all_pats.items():
             dsplit = dom.split('.')
-            if '$'.join(dsplit) in '$'.join(usplit):
+            if '$'.join(dsplit) in '$'.join(usplit):  # meh
                 patterns = pats
                 break
         else:
@@ -749,17 +746,17 @@ def groups(it, args): # pragma: no cover
     print(f"Unmatched: {nones / sum(c.values()) * 100:.1f}%")
     uc = Counter([u.split('/')[:2][-1] for u in unmatched]).most_common(10)
     from pprint import pprint
-    pprint(uc)
 
+    pprint(uc)
 
 
-def display(it, args) -> None:
+def display(it, args) -> None:  # pragma: no cover
     # TODO better name?
     import difflib
-    # pylint: disable=import-error
-    from termcolor import colored as C # type: ignore
     from sys import stdout
 
+    from termcolor import colored as C  # type: ignore[import-not-found]
+
     for line in it:
         line = line.strip()
         if args.human:
@@ -773,15 +770,16 @@ def display(it, args) -> None: # pragma: no cover
         can_ = ""
 
         pr = False
+
         def delete(x):
             nonlocal pr
             if x in (
-                    'https://www.',
-                    'http://www.',
-                    'http://',
-                    'https://',
-                    'file://',
-                    '/',
+                'https://www.',
+                'http://www.',
+                'http://',
+                'https://',
+                'file://',
+                '/',
             ):
                 col = None
             else:
@@ -800,9 +798,8 @@ def display(it, args) -> None: # pragma: no cover
                 fn = lambda x: C(x, color='cyan')
             # TODO exclude certain items from comparison?
 
-            org_ += fn(line[ff: tt])
-
-            can_ += fn(can[ff2: tt2])
+            org_ += fn(line[ff:tt])
+            can_ += fn(can[ff2:tt2])
             cl = max(len(org_), len(can_))
             org_ += ' ' * (cl - len(org_))
             can_ += ' ' * (cl - len(can_))
@@ -811,14 +808,17 @@ def display(it, args) -> None: # pragma: no cover
         stdout.write(f'{org_}\n{can_}\n---\n')
 
 
-def main() -> None:
+def main() -> None:  # pragma: no cover
     import argparse
-    p = argparse.ArgumentParser(epilog='''
+
+    p = argparse.ArgumentParser(
+        epilog='''
 - sqlite3 promnesia.sqlite 'select distinct orig_url from visits' | cannon.py --domains
 
 - running comparison
   sqlite3 promnesia.sqlite 'select distinct orig_url from visits where norm_url like "%twitter%" order by orig_url' | src/promnesia/cannon.py
-''', formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=100),
+''',
+        formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=100),
     )
     p.add_argument('input', nargs='?')
     p.add_argument('--human', action='store_true')
@@ -829,6 +829,7 @@ def main() -> None: # pragma: no cover
     it: Iterable[str]
     if args.input is None:
         import sys
+
         it = sys.stdin
     else:
         it = [args.input]
@@ -842,7 +843,7 @@ def main() -> None: # pragma: no cover
 
 
 if __name__ == '__main__':
-    main()
+    main()  # pragma: no cover
 
 # TODO hmm, it's actually sort of fingerprinter... so maybe that's what I should call it
 
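
For orientation, a hedged usage sketch of the canonify() entry point whose internals changed above; the expected outputs are inferred from the module docstring and the rules table, not verified against this exact build:

    from promnesia.cannon import canonify

    # the docstring's motivating pair should collapse to a single canonical form,
    # with the scheme and the mobile. subdomain stripped:
    print(canonify('https://mobile.twitter.com/demarionunn/status/928409560548769792'))
    print(canonify('https://twitter.com/demarionunn/status/928409560548769792'))
    # both are expected to print: twitter.com/demarionunn/status/928409560548769792

    # the rules table maps youtu.be/{ID} onto the canonical watch URL:
    print(canonify('https://youtu.be/dQw4w9WgXcQ'))
    # expected: youtube.com/watch?v=dQw4w9WgXcQ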