logslop 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: logslop
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Log deduplication: strip repeated lines so readers and agents can focus on key events.
|
|
5
|
+
Author: Advanced Micro Devices, Inc.
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: homepage, https://github.com/amd/logslop
|
|
8
|
+
Project-URL: repository, https://github.com/amd/logslop
|
|
9
|
+
Project-URL: issues, https://github.com/amd/logslop/issues
|
|
10
|
+
Requires-Python: >=3.8
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Dynamic: license-file
|
|
14
|
+
|
|
15
|
+
# LogsLop - Log Deduplication Tool
|
|
16
|
+
|
|
17
|
+
LogsLop is a log summarization tool that removes repeated messages from log files, enabling readers and agents to home in quickly on key message events.
|
|
18
|
+
|
|
19
|
+
In progress for public open source release.
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
logslop.py,sha256=MGwYyXf7F2vjtk86bpuWDmN96G2eQNpCeY8DiS5hK-w,4358
|
|
2
|
+
logslop-0.1.0.dist-info/licenses/LICENSE,sha256=UziuL9rXkIgS-RiFkwTp8KnFqQlyydMVYsE2nD8357k,1072
|
|
3
|
+
logslop-0.1.0.dist-info/METADATA,sha256=pQFNlwot06Iz1lbemDe6EIt7LGM7OticoLC33NG8nIg,708
|
|
4
|
+
logslop-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
5
|
+
logslop-0.1.0.dist-info/entry_points.txt,sha256=fFIZM5ziy26gBOVP_-ooNAbO9FVoG4VUKCzrYqAjCeg,41
|
|
6
|
+
logslop-0.1.0.dist-info/top_level.txt,sha256=UaZeEUOkXk--fH-0XKPKpolfyHeG_S3Di0-qa5XDons,8
|
|
7
|
+
logslop-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright (c) 2026 Advanced Micro Devices, Inc.
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
logslop
|
logslop.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# Copyright (C) 2025 Advanced Micro Devices, Inc.
|
|
3
|
+
# Use of this source code is governed by an MIT-style license that can be
|
|
4
|
+
# found in the LICENSE file or at https://opensource.org/licenses/MIT.
|
|
5
|
+
"""LogsLop Standalone - Log Deduplication Script
|
|
6
|
+
|
|
7
|
+
Large log files are often full of repeated or near-duplicate lines.
|
|
8
|
+
LogsLop strips that redundancy, so you get one representative per message type.
|
|
9
|
+
Particularly useful when logs are too big to send or process as-is (e.g. into LLM context windows).
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
python3 logslop.py (reads stdin)
|
|
13
|
+
your-command 2>&1 | python3 logslop.py (pipes stderr to stdout and both through logslop)
|
|
14
|
+
|
|
15
|
+
Examples:
|
|
16
|
+
journalctl --no-pager | python3 logslop.py
|
|
17
|
+
python3 logslop.py < your_log.txt
|
|
18
|
+
|
|
19
|
+
To add to PATH: copy to e.g. ~/.local/bin, then export PATH="$HOME/.local/bin:$PATH" (add to ~/.bashrc to persist).
|
|
20
|
+
|
|
21
|
+
Options (argparse, all optional):
|
|
22
|
+
-n 5000 max clusters to track
|
|
23
|
+
-t 0.6 Jaccard similarity threshold (0-1, lower = more aggressive deduping)
|
|
24
|
+
--no-normalize-digits disable digit/hex normalization (rare)
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
import argparse
|
|
28
|
+
import re
|
|
29
|
+
import sys
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Stand-in character that replaces every run of digits when normalization is on.
_DIGIT_PLACEHOLDER = '\u0001'
_DIGIT_RUN_RE = re.compile(r'\d+')
# A token is either a run of word characters or one punctuation character.
# `\w` (rather than an ASCII-only class) keeps non-ASCII letters in the tokens
# instead of silently dropping them.
_RAW_TOKEN_RE = re.compile(r'\w+|[^\w\s]')
_NORMALIZED_TOKEN_RE = re.compile(r'[\w' + _DIGIT_PLACEHOLDER + r']+|[^\w\s]')
# Words made only of placeholder / hex letters / 'x' (e.g. "0xdeadbeef", "a1f3").
_HEX_LIKE_RE = re.compile(r'^[' + _DIGIT_PLACEHOLDER + r'a-fA-FxX]+$')


def tokenize(line: str, normalize_digits: bool) -> "list[str]":
    """Split a log line into word and punctuation tokens.

    Args:
        line: The text to tokenize (whitespace only separates tokens).
        normalize_digits: When true, every run of digits is replaced by a
            placeholder character before tokenizing, and any resulting token
            that contains the placeholder and otherwise consists only of hex
            letters or ``x`` collapses to the bare placeholder, so values like
            ``42``, ``0xdeadbeef`` and ``a1f3`` all yield the same token.

    Returns:
        The tokens in order of appearance; an empty list for a blank line.
    """
    # NOTE: the return annotation is quoted because subscripting the builtin
    # ``list`` at definition time raises TypeError on Python 3.8.
    if not normalize_digits:
        return _RAW_TOKEN_RE.findall(line)

    collapsed = _DIGIT_RUN_RE.sub(_DIGIT_PLACEHOLDER, line)
    return [
        _DIGIT_PLACEHOLDER
        if _DIGIT_PLACEHOLDER in tok and _HEX_LIKE_RE.match(tok)
        else tok
        for tok in _NORMALIZED_TOKEN_RE.findall(collapsed)
    ]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def jaccard(a: "list[str]", b: "list[str]") -> float:
    """Return the Jaccard similarity of the distinct tokens in ``a`` and ``b``.

    Both inputs are reduced to sets, so repeated tokens count once. The result
    is ``|A & B| / |A | B|``; two empty inputs are defined as identical (1.0).
    """
    # NOTE: annotations are quoted because subscripting the builtin ``list``
    # at definition time raises TypeError on Python 3.8.
    sa, sb = set(a), set(b)
    union = len(sa | sb)
    if not union:
        return 1.0
    return len(sa & sb) / union
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def process(lines, *, max_clusters=5000, threshold=0.6, normalize_digits=True):
    """Yield each input line that starts a new cluster, dropping near-duplicates.

    A line joins an existing cluster when the Jaccard similarity between its
    tokens and that cluster's exemplar tokens is ``>= threshold``; such lines
    are omitted. Each run of omitted lines is reported as a `` ...N...M``
    marker (1-based, inclusive line numbers) emitted just before the next kept
    line, or at end of input for a trailing run.

    Args:
        lines: Iterable of text lines; trailing ``\\n``/``\\r`` are stripped.
        max_clusters: Maximum number of clusters remembered. When the limit is
            reached the least recently matched cluster is evicted. ``0``,
            ``None`` or a negative value means no limit.
        threshold: Minimum Jaccard similarity for a line to count as a match.
        normalize_digits: Passed through to ``tokenize``.

    Yields:
        Kept lines (without line terminators) and omission markers.
    """
    # A negative limit used to satisfy ``len(clusters) >= max_clusters`` on an
    # empty list and crash in ``pop()``; treat any non-positive limit as unbounded.
    bounded = bool(max_clusters) and max_clusters > 0

    # [(exemplar_tokens, line_text), ...], most recently matched first.
    # The exemplar is fixed at cluster creation; replacing it on match causes drift.
    clusters = []
    skipped = 0
    lineno = 0
    for lineno, raw in enumerate(lines, start=1):
        line = raw.rstrip('\n\r')
        tokens = tokenize(line, normalize_digits)
        for i, (ex_tokens, _ex_line) in enumerate(clusters):
            if jaccard(tokens, ex_tokens) >= threshold:
                # Move the matched cluster to the front (most recently used).
                clusters.insert(0, clusters.pop(i))
                skipped += 1
                break
        else:
            # No cluster matched: report any pending omitted run, keep the line.
            if skipped > 0:
                yield f" ...{lineno - skipped}...{lineno - 1}"
                skipped = 0
            yield line
            if bounded and len(clusters) >= max_clusters:
                clusters.pop()  # evict the least recently matched cluster
            clusters.insert(0, (tokens, line))
    if skipped > 0:
        # Trailing omitted run: the last omitted line is the final input line.
        yield f" ...{lineno - skipped + 1}...{lineno}"
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def main():
    """Command-line entry point: deduplicate stdin and stream the result to stdout.

    Parses ``-n/--max-clusters``, ``-t/--threshold`` and
    ``--no-normalize-digits``, prints a two-line header explaining the output
    format (coloured when stdout is a terminal), then prints every line yielded
    by ``process``. Output is flushed per line so it streams through pipelines.

    Exits with a usage error for a negative ``--max-clusters``, and exits
    quietly with status 1 if the downstream reader closes the pipe early
    (e.g. ``... | logslop | head``).
    """
    import os  # local import: only needed for the broken-pipe cleanup below

    p = argparse.ArgumentParser(description='Reduce log redundancy: print first occurrence of each pattern.')
    p.add_argument('-n', '--max-clusters', type=int, default=5000, metavar='N', help='Max clusters to track (default: 5000)')
    p.add_argument('-t', '--threshold', type=float, default=0.6, help='Jaccard threshold for match (default: 0.6)')
    p.add_argument('--no-normalize-digits', action='store_true', help='Disable digit normalization')
    args = p.parse_args()
    if args.max_clusters < 0:
        p.error('--max-clusters must be >= 0')

    c0, c1 = ("\033[36m", "\033[0m") if sys.stdout.isatty() else ("", "")
    try:
        print(f"{c0}# LogsLop: near-duplicate input lines removed.{c1}", flush=True)
        print(f"{c0}# ...N...M = omitted lines N through M in the original input (1-based).{c1}", flush=True)
        kept = process(
            sys.stdin,
            max_clusters=args.max_clusters,
            threshold=args.threshold,
            normalize_digits=not args.no_normalize_digits,
        )
        for line in kept:
            print(line, flush=True)
    except BrokenPipeError:
        # The reader went away. Point stdout at devnull so the interpreter's
        # final flush at shutdown does not raise a second BrokenPipeError.
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        sys.exit(1)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|