logslop 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- logslop-0.1.0/.github/workflows/release-trusted-publisher.yml +41 -0
- logslop-0.1.0/LICENSE +7 -0
- logslop-0.1.0/PKG-INFO +19 -0
- logslop-0.1.0/README.md +5 -0
- logslop-0.1.0/logslop.egg-info/PKG-INFO +19 -0
- logslop-0.1.0/logslop.egg-info/SOURCES.txt +11 -0
- logslop-0.1.0/logslop.egg-info/dependency_links.txt +1 -0
- logslop-0.1.0/logslop.egg-info/entry_points.txt +2 -0
- logslop-0.1.0/logslop.egg-info/top_level.txt +1 -0
- logslop-0.1.0/logslop.py +102 -0
- logslop-0.1.0/logslop_test.py +81 -0
- logslop-0.1.0/pyproject.toml +26 -0
- logslop-0.1.0/setup.cfg +4 -0
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
name: Release (Trusted Publisher)
|
|
2
|
+
|
|
3
|
+
permissions:
|
|
4
|
+
contents: write
|
|
5
|
+
id-token: write
|
|
6
|
+
|
|
7
|
+
on:
|
|
8
|
+
push:
|
|
9
|
+
tags:
|
|
10
|
+
- 'v*'
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
release:
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
with:
|
|
18
|
+
fetch-depth: 0
|
|
19
|
+
|
|
20
|
+
- uses: actions/setup-python@v4
|
|
21
|
+
with:
|
|
22
|
+
python-version: '3.11'
|
|
23
|
+
|
|
24
|
+
- name: Build package
|
|
25
|
+
run: |
|
|
26
|
+
python -m pip install --upgrade pip build
|
|
27
|
+
python -m build
|
|
28
|
+
|
|
29
|
+
- name: Upload to PyPI using Trusted Publisher
|
|
30
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
31
|
+
with:
|
|
32
|
+
packages-dir: dist/
|
|
33
|
+
|
|
34
|
+
- name: Create GitHub Release
|
|
35
|
+
env:
|
|
36
|
+
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
37
|
+
run: |
|
|
38
|
+
gh release create "${GITHUB_REF_NAME}" \
|
|
39
|
+
--title "${GITHUB_REF_NAME}" \
|
|
40
|
+
--generate-notes \
|
|
41
|
+
dist/*
|
logslop-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
Copyright (c) 2026 Advanced Micro Devices, Inc.
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
logslop-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: logslop
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Log deduplication: strip repeated lines so readers and agents can focus on key events.
|
|
5
|
+
Author: Advanced Micro Devices, Inc.
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: homepage, https://github.com/amd/logslop
|
|
8
|
+
Project-URL: repository, https://github.com/amd/logslop
|
|
9
|
+
Project-URL: issues, https://github.com/amd/logslop/issues
|
|
10
|
+
Requires-Python: >=3.8
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Dynamic: license-file
|
|
14
|
+
|
|
15
|
+
# LogsLop - Log Deduplication Tool
|
|
16
|
+
|
|
17
|
+
LogsLop is a log summarization tool that removes repeated messages from log files, enabling readers and agents to home in quickly on key message events.
|
|
18
|
+
|
|
19
|
+
In progress for public open source release.
|
logslop-0.1.0/README.md
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: logslop
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Log deduplication: strip repeated lines so readers and agents can focus on key events.
|
|
5
|
+
Author: Advanced Micro Devices, Inc.
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: homepage, https://github.com/amd/logslop
|
|
8
|
+
Project-URL: repository, https://github.com/amd/logslop
|
|
9
|
+
Project-URL: issues, https://github.com/amd/logslop/issues
|
|
10
|
+
Requires-Python: >=3.8
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Dynamic: license-file
|
|
14
|
+
|
|
15
|
+
# LogsLop - Log Deduplication Tool
|
|
16
|
+
|
|
17
|
+
LogsLop is a log summarization tool that removes repeated messages from log files, enabling readers and agents to home in quickly on key message events.
|
|
18
|
+
|
|
19
|
+
In progress for public open source release.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
logslop.py
|
|
4
|
+
logslop_test.py
|
|
5
|
+
pyproject.toml
|
|
6
|
+
.github/workflows/release-trusted-publisher.yml
|
|
7
|
+
logslop.egg-info/PKG-INFO
|
|
8
|
+
logslop.egg-info/SOURCES.txt
|
|
9
|
+
logslop.egg-info/dependency_links.txt
|
|
10
|
+
logslop.egg-info/entry_points.txt
|
|
11
|
+
logslop.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
logslop
|
logslop-0.1.0/logslop.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# Copyright (C) 2025 Advanced Micro Devices, Inc.
|
|
3
|
+
# Use of this source code is governed by an MIT-style license that can be
|
|
4
|
+
# found in the LICENSE file or at https://opensource.org/licenses/MIT.
|
|
5
|
+
"""LogsLop Standalone - Log Deduplication Script
|
|
6
|
+
|
|
7
|
+
Large log files are often full of repeated or near-duplicate lines.
|
|
8
|
+
LogsLop strips that redundancy, so you get one representative per message type.
|
|
9
|
+
Particularly useful when logs are too big to send or process as-is (e.g. into LLM context windows).
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
python3 logslop.py (reads stdin)
|
|
13
|
+
your-command 2>&1 | python3 logslop.py (pipes stderr to stdout and both through logslop)
|
|
14
|
+
|
|
15
|
+
Examples:
|
|
16
|
+
journalctl --no-pager | python3 logslop.py
|
|
17
|
+
python3 logslop.py < your_log.txt
|
|
18
|
+
|
|
19
|
+
To add to PATH: copy to e.g. ~/.local/bin, then export PATH="$HOME/.local/bin:$PATH" (add to ~/.bashrc to persist).
|
|
20
|
+
|
|
21
|
+
Options (argparse, all optional):
|
|
22
|
+
-n 5000 max clusters to track
|
|
23
|
+
-t 0.6 Jaccard similarity threshold (0-1, lower = more aggressive deduping)
|
|
24
|
+
--no-normalize-digits disable digit/hex normalization (rare)
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
import argparse
|
|
28
|
+
import re
|
|
29
|
+
import sys
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Stand-in for a normalized digit run; a control character that should not
# occur in ordinary log text.
_PLACEHOLDER = '\u0001'
_DIGIT_RUN = re.compile(r'\d+')
# A token is either a run of word characters or one punctuation character.
# ``\w`` (Unicode-aware for str patterns) is used for the word class so that it
# is the exact complement of the punctuation class ``[^\w\s]``; an ASCII-only
# word class would make non-ASCII letters match neither alternative and vanish.
_TOKEN_RAW = re.compile(r'\w+|[^\w\s]')
_TOKEN_NORMALIZED = re.compile(rf'[\w{_PLACEHOLDER}]+|[^\w\s]')
# Tokens made only of placeholders, hex letters and x/X (e.g. a normalized
# ``0x413d12000``) are treated as one number.
_HEX_LIKE = re.compile(rf'[{_PLACEHOLDER}a-fA-FxX]+')


def tokenize(line: str, normalize_digits: bool) -> "list[str]":
    """Split *line* into word and punctuation tokens.

    Whitespace separates tokens and is discarded. Each run of word characters
    is one token; every other non-space character is a token of its own.

    Args:
        line: The text to tokenize.
        normalize_digits: When true, every run of digits is replaced by a
            placeholder before splitting, and any resulting token that
            contains a placeholder and otherwise consists only of hex letters
            and ``x``/``X`` collapses to the bare placeholder. Lines differing
            only in decimal or hex numbers then produce identical tokens.

    Returns:
        The tokens in order of appearance.

    The return annotation is quoted because subscripting ``list`` at
    definition time fails on Python 3.8.
    """
    if not normalize_digits:
        return _TOKEN_RAW.findall(line)
    tokens = _TOKEN_NORMALIZED.findall(_DIGIT_RUN.sub(_PLACEHOLDER, line))
    return [
        _PLACEHOLDER if _PLACEHOLDER in tok and _HEX_LIKE.fullmatch(tok) else tok
        for tok in tokens
    ]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def jaccard(a: "list[str]", b: "list[str]") -> float:
    """Return the Jaccard similarity of two token lists, compared as sets.

    Duplicates and ordering within each list are ignored: the result is the
    number of distinct tokens common to both divided by the number of distinct
    tokens in either.

    Args:
        a: First token list.
        b: Second token list.

    Returns:
        A value between 0.0 and 1.0. Two empty inputs count as identical and
        give 1.0.

    The annotations are quoted because subscripting ``list`` at definition
    time fails on Python 3.8.
    """
    sa, sb = set(a), set(b)
    union = len(sa | sb)
    if not union:
        # Both inputs are empty; avoid 0/0 and treat them as a perfect match.
        return 1.0
    return len(sa & sb) / union
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def process(lines, *, max_clusters=5000, threshold=0.6, normalize_digits=True):
    """Yield the first line of each cluster of similar lines, plus skip markers.

    Each input line is stripped of trailing newline/carriage-return characters,
    tokenized, and compared against the stored cluster exemplars, most recently
    used first. A line whose Jaccard similarity to an exemplar is at least
    *threshold* is omitted; otherwise it is yielded and becomes a new exemplar.

    Runs of omitted lines are reported as `` ...N...M`` markers (1-based
    original line numbers, inclusive), emitted just before the next kept line,
    or at end of input for a trailing run.

    Args:
        lines: Iterable of input lines.
        max_clusters: Maximum number of exemplars to remember; when full, the
            least recently used one is dropped. Zero or a negative value means
            no limit.
        threshold: Minimum Jaccard similarity for a line to count as a
            duplicate of an exemplar.
        normalize_digits: Passed to ``tokenize``.

    Yields:
        Kept lines and skip markers, in input order.
    """
    # Non-positive means "no cap". A negative cap previously satisfied
    # ``len(clusters) >= max_clusters`` on an empty list and crashed in pop().
    cap = max_clusters if max_clusters and max_clusters > 0 else 0
    # [(exemplar_tokens, exemplar_line), ...], most recently used first. An
    # exemplar is fixed at creation and never replaced by a later match, so a
    # cluster cannot drift away from its original line.
    clusters = []
    skipped = 0
    lineno = 0
    for lineno, raw in enumerate(lines, start=1):
        line = raw.rstrip('\n\r')
        tokens = tokenize(line, normalize_digits)
        hit = next(
            (i for i, (ex_tokens, _) in enumerate(clusters) if jaccard(tokens, ex_tokens) >= threshold),
            None,
        )
        if hit is not None:
            # Duplicate: move its cluster to the front and count the omission.
            clusters.insert(0, clusters.pop(hit))
            skipped += 1
            continue
        if skipped:
            # The omitted run ended on the line just before this one.
            yield f" ...{lineno - skipped}...{lineno - 1}"
            skipped = 0
        yield line
        if cap and len(clusters) >= cap:
            clusters.pop()  # evict the least recently used exemplar
        clusters.insert(0, (tokens, line))
    if skipped:
        # Trailing run: the last omitted line is the last input line.
        yield f" ...{lineno - skipped + 1}...{lineno}"
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def main():
    """Command-line entry point: filter stdin to stdout, omitting similar lines.

    Parses the optional ``-n/--max-clusters``, ``-t/--threshold`` and
    ``--no-normalize-digits`` arguments, prints a two-line header explaining
    the skip markers (coloured only when stdout is a terminal), then prints
    each item yielded by ``process`` for ``sys.stdin``. Every line is flushed
    immediately so output streams while the input is still being produced.

    If the reader of stdout goes away (for example ``... | logslop | head``),
    the process exits with status 1 instead of printing a traceback.
    """
    import os  # function-scope import: only the broken-pipe path below needs it

    p = argparse.ArgumentParser(description='Reduce log redundancy: print first occurrence of each pattern.')
    p.add_argument('-n', '--max-clusters', type=int, default=5000, metavar='N', help='Max clusters to track (default: 5000)')
    p.add_argument('-t', '--threshold', type=float, default=0.6, help='Jaccard threshold for match (default: 0.6)')
    p.add_argument('--no-normalize-digits', action='store_true', help='Disable digit normalization')
    args = p.parse_args()
    c0, c1 = ("\033[36m", "\033[0m") if sys.stdout.isatty() else ("", "")
    try:
        print(f"{c0}# LogsLop: near-duplicate input lines removed.{c1}", flush=True)
        print(f"{c0}# ...N...M = omitted lines N through M in the original input (1-based).{c1}", flush=True)
        for line in process(sys.stdin, max_clusters=args.max_clusters, threshold=args.threshold, normalize_digits=not args.no_normalize_digits):
            print(line, flush=True)
    except BrokenPipeError:
        # Downstream closed the pipe. Point stdout at devnull so the
        # interpreter's flush at shutdown does not raise a second time.
        devnull = os.open(os.devnull, os.O_WRONLY)
        os.dup2(devnull, sys.stdout.fileno())
        sys.exit(1)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
if __name__ == '__main__':
|
|
102
|
+
main()
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# Copyright (C) 2025 Advanced Micro Devices, Inc.
|
|
2
|
+
# Use of this source code is governed by an MIT-style license that can be
|
|
3
|
+
# found in the LICENSE file or at https://opensource.org/licenses/MIT.
|
|
4
|
+
"""Unit tests for standalone logslop."""
|
|
5
|
+
|
|
6
|
+
import importlib.util
|
|
7
|
+
import unittest
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
_spec = importlib.util.spec_from_file_location("standalone_logslop", Path(__file__).parent / "logslop.py")
|
|
11
|
+
_standalone = importlib.util.module_from_spec(_spec)
|
|
12
|
+
_spec.loader.exec_module(_standalone)
|
|
13
|
+
tokenize = _standalone.tokenize
|
|
14
|
+
jaccard = _standalone.jaccard
|
|
15
|
+
process = _standalone.process
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TestTokenize(unittest.TestCase):
    """Tests for tokenize() with and without digit/hex normalization."""

    def test_normalize(self):
        """Normalized digit runs compare equal; raw tokens differ from normalized ones."""
        normalized_a = tokenize("foo 123 bar", True)
        normalized_b = tokenize("foo 456 bar", True)
        self.assertEqual(normalized_a, normalized_b)
        raw = tokenize("foo 123 bar", False)
        self.assertNotEqual(tokenize("foo 123 bar", True), raw)

    def test_no_normalize(self):
        """Without normalization the digits survive as their own token."""
        expected = ["foo", "123", "bar"]
        self.assertEqual(tokenize("foo 123 bar", False), expected)

    def test_hex_normalization(self):
        """Hex sequences (0x...) collapse to single placeholder like digit sequences."""
        first, second = (
            tokenize(text, True)
            for text in (
                "amd_diag_apply_pgprot: 0x413d12000",
                "amd_diag_apply_pgprot: 0x413d22000",
            )
        )
        self.assertEqual(first, second, "hex addresses should normalize to same tokens")

    def test_hex_and_digits_same_structure(self):
        """Different hex values produce identical token sequences."""
        left = tokenize("pre_set_pgprot: phy_base -- 0x413d12000 and phy_type -- 0x4", True)
        right = tokenize("pre_set_pgprot: phy_base -- 0x413d22000 and phy_type -- 0x5", True)
        self.assertEqual(left, right)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class TestJaccard(unittest.TestCase):
    """Tests for jaccard() on identical, overlapping and empty token lists."""

    def test_identical(self):
        """Equal token lists score exactly 1.0."""
        tokens = ["a", "b"]
        self.assertEqual(jaccard(tokens, list(tokens)), 1.0)

    def test_partial(self):
        """One shared token out of three distinct tokens scores 1/3."""
        score = jaccard(["a", "b"], ["a", "c"])
        self.assertEqual(score, 1 / 3)

    def test_empty(self):
        """Two empty token lists are treated as identical."""
        score = jaccard([], [])
        self.assertEqual(score, 1.0)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class TestProcess(unittest.TestCase):
    """End-to-end tests for process(): kept lines and skip markers."""

    def test_dedupes_similar(self):
        """Similar lines are dropped and reported as 1-based skip ranges."""
        lines = ["foo 123 bar", "foo 456 bar", "foo 789 bar", "baz qux", "foo 999 bar"]
        expected = ["foo 123 bar", " ...2...3", "baz qux", " ...5...5"]
        self.assertEqual(list(process(lines)), expected)

    def test_no_normalize_keeps_all(self):
        """With digit normalization off, these lines are all kept."""
        lines = ["line 1", "line 2", "line 3"]
        kept = list(process(lines, normalize_digits=False))
        self.assertEqual(kept, lines)

    def test_threshold_stricter(self):
        """A high threshold keeps every line; a low one collapses them."""
        lines = ["foo bar baz", "foo bar qux", "foo bar"]
        strict = list(process(lines, threshold=0.9))
        self.assertEqual(strict, lines)
        loose = list(process(lines, threshold=0.5))
        self.assertEqual(loose, ["foo bar baz", " ...2...3"])

    def test_max_clusters(self):
        """A small cluster cap neither drops unique lines nor breaks deduping."""
        lines = [f"unique_{i}" for i in range(5)]
        kept = list(process(lines, max_clusters=2, normalize_digits=False))
        self.assertEqual(len(kept), 5)
        numbered = [f"x {i}" for i in range(5)]
        self.assertEqual(list(process(numbered, max_clusters=2)), ["x 0", " ...2...5"])

    def test_hex_lines_deduped(self):
        """Lines differing only in hex addresses are deduped."""
        lines = [
            "amd_diag_apply_pgprot: 0x413d12000",
            "amd_diag_apply_pgprot: 0x413d22000",
            "amd_diag_apply_pgprot: 0x413d72000",
        ]
        expected = ["amd_diag_apply_pgprot: 0x413d12000", " ...2...3"]
        self.assertEqual(list(process(lines)), expected)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
if __name__ == "__main__":
|
|
81
|
+
unittest.main()
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "setuptools-scm>=8"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "logslop"
|
|
7
|
+
dynamic = ["version"]
|
|
8
|
+
description = "Log deduplication: strip repeated lines so readers and agents can focus on key events."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Advanced Micro Devices, Inc." }]
|
|
13
|
+
|
|
14
|
+
[project.urls]
|
|
15
|
+
homepage = "https://github.com/amd/logslop"
|
|
16
|
+
repository = "https://github.com/amd/logslop"
|
|
17
|
+
issues = "https://github.com/amd/logslop/issues"
|
|
18
|
+
|
|
19
|
+
[project.scripts]
|
|
20
|
+
logslop = "logslop:main"
|
|
21
|
+
|
|
22
|
+
[tool.setuptools]
|
|
23
|
+
py-modules = ["logslop"]
|
|
24
|
+
|
|
25
|
+
[tool.setuptools_scm]
|
|
26
|
+
version_scheme = "post-release"
|
logslop-0.1.0/setup.cfg
ADDED