logdetective 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- logdetective-0.2.2/PKG-INFO +188 -0
- logdetective-0.2.2/README.md +159 -0
- logdetective-0.2.2/logdetective/__init__.py +0 -0
- logdetective-0.2.2/logdetective/constants.py +27 -0
- logdetective-0.2.2/logdetective/extractors.py +89 -0
- logdetective-0.2.2/logdetective/logdetective.py +76 -0
- logdetective-0.2.2/logdetective/server.py +56 -0
- logdetective-0.2.2/logdetective/utils.py +99 -0
- {logdetective-0.2.0 → logdetective-0.2.2}/pyproject.toml +8 -3
- logdetective-0.2.0/PKG-INFO +0 -90
- logdetective-0.2.0/README.md +0 -61
- logdetective-0.2.0/logdetective/logdetective.py +0 -275
- {logdetective-0.2.0 → logdetective-0.2.2}/LICENSE +0 -0
- {logdetective-0.2.0 → logdetective-0.2.2}/logdetective/drain3.ini +0 -0
logdetective-0.2.2/PKG-INFO (new file)
@@ -0,0 +1,188 @@
Metadata-Version: 2.1
Name: logdetective
Version: 0.2.2
Summary: Log using LLM AI to search for build/test failures and provide ideas for fixing these.
License: Apache-2.0
Author: Jiri Podivin
Author-email: jpodivin@gmail.com
Requires-Python: >=3.11,<4.0
Classifier: Development Status :: 4 - Beta
Classifier: Environment :: Console
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Natural Language :: English
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Internet :: Log Analysis
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Debuggers
Provides-Extra: server
Requires-Dist: drain3 (>=0.9.11,<0.10.0)
Requires-Dist: huggingface-hub (>=0.23.2,<0.24.0)
Requires-Dist: llama-cpp-python (>=0.2.56,<0.3.0)
Requires-Dist: requests (>=2.31.0,<3.0.0)
Project-URL: homepage, https://github.com/fedora-copr/logdetective
Project-URL: issues, https://github.com/fedora-copr/logdetective/issues
Description-Content-Type: text/markdown

Log Detective
=============

A Python tool to analyze logs using a Language Model (LLM) and Drain template miner.

Installation
------------

The logdetective project is published on the [PyPI repository](https://pypi.org/project/logdetective/). The `pip` tool can be used for installation.

First, ensure that the necessary dependencies for the `llama-cpp-python` project are installed. For Fedora, install `gcc-c++`:

    # for Fedora it will be:
    dnf install gcc-c++

**From PyPI repository**

Then, install the `logdetective` project using pip:

    # then install logdetective project
    pip install logdetective

**Local repository install**

    pip install .

Usage
-----

To analyze a log file, run the script with the following command line arguments:
- `url` (required): The URL of the log file to be analyzed.
- `--model` (optional, default: "Mistral-7B-Instruct-v0.2-GGUF"): The path or URL of the language model used for analysis. Since llama.cpp is used, the model must be in `gguf` format. You can pass a download link to the model here; if the model is already on your machine, the download is skipped.
- `--summarizer` (optional, default: "drain"): Choose between an LLM and the Drain template miner as the log summarizer. You can also provide the path to an existing language model file instead of a URL.
- `--n_lines` (optional, default: 8): The number of lines per chunk for LLM analysis. This only applies when you are summarizing with an LLM.
- `--n_clusters` (optional, default: 8): Number of clusters for Drain to organize log chunks into. This only applies when you are summarizing with Drain.

Example usage:

    logdetective https://example.com/logs.txt

Or if the log file is stored locally:

    logdetective ./data/logs.txt

Example if you want to use a different model:

    logdetective https://example.com/logs.txt --model https://huggingface.co/QuantFactory/Meta-Llama-3-8B-Instruct-GGUF/resolve/main/Meta-Llama-3-8B-Instruct.Q5_K_S.gguf?download=true


Real Example
------------
Let's have a look at a real-world example. Log Detective can work with any logs, though it is optimized for build logs.

We're going to analyze a failed build of a Python-based library that happened in the Fedora Koji build system:
```
$ logdetective https://kojipkgs.fedoraproject.org//work/tasks/8157/117788157/build.log
Explanation:
[Child return code was: 0] : The rpm build process executed successfully without any errors until the 'check' phase.

[wamp/test/test_wamp_component_aio.py::test_asyncio_component] : Pytest found
two tests marked with '@pytest.mark.asyncio' but they are not async functions.
This warning can be ignored unless the tests are intended to be run
asynchronously.

[wamp/test/test_wamp_component_aio.py::test_asyncio_component_404] : Another
Pytest warning for the same issue as test_asyncio_component.

[-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html] :
This line is not related to the error, but it is a reminder to refer to Pytest
documentation for handling warnings.

[=========================== short test summary info
============================] : This section shows the summary of tests that
were executed. It shows the number of tests passed, failed, skipped,
deselected, and warnings.

[FAILED wamp/test/test_wamp_cryptosign.py::TestSigVectors::test_vectors] : A
failed test is reported with the name of the test file, the name of the test
method, and the name of the test case that failed. In this case,
TestSigVectors::test_vectors failed.

[FAILED
websocket/test/test_websocket_protocol.py::WebSocketClientProtocolTests::test_auto_ping]
: Another failed test is reported with the same format as the previous test. In
this case, it is WebSocketClientProtocolTests::test_auto_ping that failed.

[FAILED websocket/test/test_websocket_protocol.py::WebSocketServerProtocolTests::test_interpolate_server_status_template]
: A third failed test is reported with the same format as the previous tests.
In this case, it is
WebSocketServerProtocolTests::test_interpolate_server_status_template that
failed.

[FAILED websocket/test/test_websocket_protocol.py::WebSocketServerProtocolTests::test_sendClose_reason_with_no_code]
: Another failed test is reported. This time it is
WebSocketServerProtocolTests::test_sendClose_reason_with_no_code.

[FAILED websocket/test/test_websocket_protocol.py::WebSocketServerProtocolTests::test_sendClose_str_reason]
: Another failed test is reported with the same test file and test method name,
but a different test case name: test_sendClose_str_reason.

[==== 13 failed, 195 passed, 64 skipped, 13 deselected, 2 warnings in 6.55s
=====] : This is the summary of all tests that were executed, including the
number of tests that passed, failed, were skipped, deselected, or produced
warnings. In this case, there were 13 failed tests among a total of 211 tests.

[error: Bad exit status from /var/tmp/rpm-tmp.8C0L25 (%check)] : An error
message is reported indicating that the 'check' phase of the rpm build process
failed with a bad exit status.
```

It looks like a wall of text, similar to any log. The main difference is that the most significant lines of the log file are wrapped in `[ ] : ` and followed by a textual explanation of the log text produced by Mistral 7B.


Contributing
------------

Contributions are welcome! Please submit a pull request if you have any improvements or new features to add. Make sure your changes pass all existing tests before submitting.

To develop logdetective, you should fork this repository, clone your fork, and install dependencies using pip:

    git clone https://github.com/yourusername/logdetective.git
    cd logdetective
    pip install .

Make changes to the code as needed and run pre-commit.

Tests
-----

[tox](https://github.com/tox-dev/tox) is used to manage tests. Please install the `tox` package from your distribution and run:

    tox

This will create a virtual environment with dependencies and run all the tests. For more information, see the tox help.

To run only a specific test, execute:

    tox run -e style # to run flake8

or

    tox run -e lint # to run pylint

Server
------

A FastAPI-based server is implemented in `logdetective/server.py`. To run it in development mode,
start the llama-cpp-python server with your chosen model, as described in the llama-cpp-python [docs](https://llama-cpp-python.readthedocs.io/en/latest/server/#running-the-server).

Afterwards, start the logdetective server with `fastapi dev logdetective/server.py --port 8080`.
Requests can then be made via POST, for example:

    curl --header "Content-Type: application/json" --request POST --data '{"url":"<YOUR_URL_HERE>"}' http://localhost:8080/analyze


License
-------

This project is licensed under the Apache-2.0 License - see the LICENSE file for details.
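For illustration, the options listed in the Usage section above can be combined; a hypothetical run on a local log with the Drain summarizer limited to four clusters could look like this (the log path is an example):

    logdetective ./data/build.log --summarizer drain --n_clusters 4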
logdetective-0.2.2/README.md (new file)
@@ -0,0 +1,159 @@
(The body of this file is identical to the project description embedded in logdetective-0.2.2/PKG-INFO above, from "Log Detective" through the License section.)
logdetective-0.2.2/logdetective/__init__.py — file without changes
logdetective-0.2.2/logdetective/constants.py (new file)
@@ -0,0 +1,27 @@
```python

# pylint: disable=line-too-long
DEFAULT_ADVISOR = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"

PROMPT_TEMPLATE = """
Given following log snippets, and nothing else, explain what failure, if any, occured during build of this package.

{}

Analysis of the failure must be in a format of [X] : [Y], where [X] is a log snippet, and [Y] is the explanation.

Finally, drawing on information from all snippets, provide complete explanation of the issue.

Analysis:

"""

SUMMARIZE_PROMPT_TEMPLATE = """
Does following log contain error or issue?

Log:

{}

Answer:

"""
```
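For illustration, both templates above are plain Python format strings with a single `{}` placeholder; elsewhere in the package they are filled with the extracted log summary. The snippet value below is only an example:

```python
from logdetective.constants import PROMPT_TEMPLATE, SUMMARIZE_PROMPT_TEMPLATE

# Example snippet; in the real flow this is the summary produced by an extractor.
log_summary = "error: Bad exit status from /var/tmp/rpm-tmp.8C0L25 (%check)"

advisor_prompt = PROMPT_TEMPLATE.format(log_summary)            # full analysis prompt
rating_prompt = SUMMARIZE_PROMPT_TEMPLATE.format(log_summary)   # Yes/No rating prompt
```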
logdetective-0.2.2/logdetective/extractors.py (new file)
@@ -0,0 +1,89 @@
```python
import os
import logging

import drain3
from drain3.template_miner_config import TemplateMinerConfig
from llama_cpp import Llama, LlamaGrammar

from logdetective.constants import SUMMARIZE_PROMPT_TEMPLATE
from logdetective.utils import get_chunks

LOG = logging.getLogger("logdetective")


class LLMExtractor:
    """
    A class that extracts relevant information from logs using a language model.
    """
    def __init__(self, model: Llama, n_lines: int = 2):
        self.model = model
        self.n_lines = n_lines
        self.grammar = LlamaGrammar.from_string(
            "root ::= (\"Yes\" | \"No\")", verbose=False)

    def __call__(self, log: str, n_lines: int = 2, neighbors: bool = False) -> str:
        chunks = self.rate_chunks(log)
        out = self.create_extract(chunks, neighbors)
        return out

    def rate_chunks(self, log: str) -> list[tuple]:
        """Scan log by the model and store results.

        :param log: log file content
        """
        results = []
        log_lines = log.split("\n")

        for i in range(0, len(log_lines), self.n_lines):
            block = '\n'.join(log_lines[i:i + self.n_lines])
            prompt = SUMMARIZE_PROMPT_TEMPLATE.format(log)
            out = self.model(prompt, max_tokens=7, grammar=self.grammar)
            out = f"{out['choices'][0]['text']}\n"
            results.append((block, out))

        return results

    def create_extract(self, chunks: list[tuple], neighbors: bool = False) -> str:
        """Extract interesting chunks from the model processing.
        """
        interesting = []
        summary = ""
        # pylint: disable=consider-using-enumerate
        for i in range(len(chunks)):
            if chunks[i][1].startswith("Yes"):
                interesting.append(i)
                if neighbors:
                    interesting.extend([max(i - 1, 0), min(i + 1, len(chunks) - 1)])

        interesting = set(interesting)

        for i in interesting:
            summary += chunks[i][0] + "\n"

        return summary


class DrainExtractor:
    """A class that extracts information from logs using a template miner algorithm.
    """
    def __init__(self, verbose: bool = False, context: bool = False, max_clusters=8):
        config = TemplateMinerConfig()
        config.load(f"{os.path.dirname(__file__)}/drain3.ini")
        config.profiling_enabled = verbose
        config.drain_max_clusters = max_clusters
        self.miner = drain3.TemplateMiner(config=config)
        self.verbose = verbose
        self.context = context

    def __call__(self, log: str) -> str:
        out = ""
        for chunk in get_chunks(log):
            processed_line = self.miner.add_log_message(chunk)
            LOG.debug(processed_line)
        sorted_clusters = sorted(self.miner.drain.clusters, key=lambda it: it.size, reverse=True)
        for chunk in get_chunks(log):
            cluster = self.miner.match(chunk, "always")
            if cluster in sorted_clusters:
                out += f"{chunk}\n"
                sorted_clusters.remove(cluster)
        return out
```
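A minimal sketch of driving the Drain extractor above directly from Python; the log path is hypothetical and the constructor arguments mirror the defaults used by the server module:

```python
from logdetective.extractors import DrainExtractor

with open("./data/build.log") as log_file:   # hypothetical local log file
    log = log_file.read()

extractor = DrainExtractor(verbose=True, context=True, max_clusters=8)
summary = extractor(log)   # one representative chunk per Drain cluster
print(summary)
```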
logdetective-0.2.2/logdetective/logdetective.py (new file)
@@ -0,0 +1,76 @@
```python
import argparse
import logging
import sys

from logdetective.constants import DEFAULT_ADVISOR
from logdetective.utils import process_log, initialize_model, retrieve_log_content
from logdetective.extractors import LLMExtractor, DrainExtractor

LOG = logging.getLogger("logdetective")


def main():
    """Main execution function."""
    parser = argparse.ArgumentParser("logdetective")
    parser.add_argument("file", type=str,
                        default="", help="The URL or path to the log file to be analyzed.")
    parser.add_argument("-M", "--model",
                        help="The path or Hugging Face name of the language model for analysis.",
                        type=str, default=DEFAULT_ADVISOR)
    parser.add_argument("-F", "--filename_suffix",
                        help="Suffix of the model file name to be retrieved from Hugging Face.\
                            Makes sense only if the model is specified with Hugging Face name.",
                        default="Q4_K_S.gguf")
    parser.add_argument("-S", "--summarizer", type=str, default="drain",
                        help="Choose between LLM and Drain template miner as the log summarizer.\
                            LLM must be specified as path to a model, URL or local file.")
    parser.add_argument("-N", "--n_lines", type=int,
                        default=8, help="The number of lines per chunk for LLM analysis.\
                            This only makes sense when you are summarizing with LLM.")
    parser.add_argument("-C", "--n_clusters", type=int, default=8,
                        help="Number of clusters for Drain to organize log chunks into.\
                            This only makes sense when you are summarizing with Drain")
    parser.add_argument("-v", "--verbose", action='count', default=0)
    parser.add_argument("-q", "--quiet", action='store_true')
    args = parser.parse_args()

    if args.verbose and args.quiet:
        sys.stderr.write("Error: --quiet and --verbose is mutually exclusive.\n")
        sys.exit(2)

    # Logging facility setup
    log_level = logging.INFO
    if args.verbose >= 1:
        log_level = logging.DEBUG
    if args.quiet:
        log_level = 0

    logging.basicConfig(stream=sys.stdout)
    LOG.setLevel(log_level)

    # Primary model initialization
    model = initialize_model(args.model, filename_suffix=args.filename_suffix,
                             verbose=args.verbose > 2)

    # Log file summarizer selection and initialization
    if args.summarizer == "drain":
        extractor = DrainExtractor(args.verbose > 1, context=True, max_clusters=args.n_clusters)
    else:
        summarizer_model = initialize_model(args.summarizer, verbose=args.verbose > 2)
        extractor = LLMExtractor(summarizer_model, args.verbose > 1)

    LOG.info("Getting summary")

    log = retrieve_log_content(args.file)
    log_summary = extractor(log)

    ratio = len(log_summary.split('\n')) / len(log.split('\n'))
    LOG.debug("Log summary: \n %s", log_summary)
    LOG.info("Compression ratio: %s", ratio)

    LOG.info("Analyzing the text")
    print(f"Explanation: \n{process_log(log_summary, model)}")


if __name__ == "__main__":
    main()
```
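The parser above also registers short option names; a hypothetical invocation using them follows. The Hugging Face repository shown is the default advisor, and the file suffix is only an example of selecting a different quantization:

    logdetective ./data/build.log -M TheBloke/Mistral-7B-Instruct-v0.2-GGUF -F Q5_K_S.gguf -vv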
logdetective-0.2.2/logdetective/server.py (new file)
@@ -0,0 +1,56 @@
```python
import logging
import os
import json

from fastapi import FastAPI
from pydantic import BaseModel

import requests

from logdetective.constants import PROMPT_TEMPLATE
from logdetective.extractors import DrainExtractor


class BuildLog(BaseModel):
    """Model of data submitted to API.
    """
    url: str

LOG = logging.getLogger("logdetective")

app = FastAPI()

LLM_CPP_SERVER_ADDRESS = os.environ.get("LLAMA_CPP_SERVER", " http://localhost")
LLM_CPP_SERVER_PORT = os.environ.get("LLAMA_CPP_SERVER_PORT", 8000)
LLM_CPP_SERVER_TIMEOUT = os.environ.get("LLAMA_CPP_SERVER_TIMEOUT", 600)
LOG_SOURCE_REQUEST_TIMEOUT = os.environ.get("LOG_SOURCE_REQUEST_TIMEOUT", 60)

@app.post("/analyze", )
async def analyze_log(build_log: BuildLog):
    """Provide endpoint for log file submission and analysis.
    Request must be in form {"url":"<YOUR_URL_HERE>"}.
    """
    extractor = DrainExtractor(verbose=True, context=True, max_clusters=8)

    LOG.info("Getting summary")

    log = requests.get(build_log.url, timeout=int(LOG_SOURCE_REQUEST_TIMEOUT)).text
    log_summary = extractor(log)

    ratio = len(log_summary.split('\n')) / len(log.split('\n'))
    LOG.debug("Log summary: \n %s", log_summary)
    LOG.info("Compression ratio: %s", ratio)

    LOG.info("Analyzing the text")
    data = {
        "prompt": PROMPT_TEMPLATE.format(log_summary),
        "max_tokens": "0"}

    # Expects llama-cpp server to run on LLM_CPP_SERVER_ADDRESS:LLM_CPP_SERVER_PORT
    response = requests.post(
        f"{LLM_CPP_SERVER_ADDRESS}:{LLM_CPP_SERVER_PORT}/v1/completions",
        headers={"Content-Type":"application/json"},
        data=json.dumps(data),
        timeout=int(LLM_CPP_SERVER_TIMEOUT))

    return response.text
```
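A rough sketch of a local development setup, based on the environment variables read above and the Server section of the README; the model path and ports are examples, not values shipped with the package:

    # terminal 1: start the llama-cpp-python inference backend (model path is an example)
    python -m llama_cpp.server --model ./mistral-7b-instruct-v0.2.Q4_K_S.gguf --port 8000
    # terminal 2: point logdetective at it and start the API
    export LLAMA_CPP_SERVER=http://localhost
    export LLAMA_CPP_SERVER_PORT=8000
    fastapi dev logdetective/server.py --port 8080
    # then submit a log URL
    curl --header "Content-Type: application/json" --request POST --data '{"url":"<YOUR_URL_HERE>"}' http://localhost:8080/analyze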
logdetective-0.2.2/logdetective/utils.py (new file)
@@ -0,0 +1,99 @@
```python
import logging
import os
from urllib.parse import urlparse

import requests

from llama_cpp import Llama
from logdetective.constants import PROMPT_TEMPLATE


LOG = logging.getLogger("logdetective")


def chunk_continues(text: str, index: int) -> bool:
    """Set of heuristics for determining whether or not
    does the current chunk of log text continue on next line.
    """
    conditionals = [
        lambda i, string: string[i + 1].isspace(),
        lambda i, string: string[i - 1] == "\\"
    ]

    for c in conditionals:
        y = c(index, text)
        if y:
            return True

    return False


def get_chunks(text: str):
    """Split log into chunks according to heuristic
    based on whitespace and backslash presence.
    """
    text_len = len(text)
    i = 0
    chunk = ""
    while i < text_len:
        chunk += text[i]
        if text[i] == '\n':
            if i + 1 < text_len and chunk_continues(text, i):
                i += 1
                continue
            yield chunk
            chunk = ""
        i += 1


def initialize_model(model_pth: str, filename_suffix: str = ".gguf", verbose: bool = False) -> Llama:
    """Initialize Llama class for inference.
    Args:
        model_pth (str): path to gguf model file or Hugging Face name
        filename_suffix (str): suffix of the model file name to be pulled from Hugging Face
        verbose (bool): level of verbosity for llamacpp
    """
    if os.path.isfile(model_pth):
        model = Llama(
            model_path=model_pth,
            n_ctx=0,  # Maximum context for the model
            verbose=verbose)
    else:
        model = Llama.from_pretrained(
            model_pth,
            f"*{filename_suffix}",
            n_ctx=0,  # Maximum context for the model
            verbose=verbose)

    return model


def process_log(log: str, model: Llama) -> str:
    """
    Processes a given log using the provided language model and returns its summary.

    Args:
        log (str): The input log to be processed.
        model (Llama): The language model used for processing the log.

    Returns:
        str: The summary of the given log generated by the language model.
    """
    return model(PROMPT_TEMPLATE.format(log), max_tokens=0)["choices"][0]["text"]

def retrieve_log_content(log_path: str) -> str:
    """Get content of the file on the log_path path."""
    parsed_url = urlparse(log_path)
    log = ""

    if not parsed_url.scheme:
        if not os.path.exists(log_path):
            raise ValueError(f"Local log {log_path} doesn't exist!")

        with open(log_path, "rt") as f:
            log = f.read()

    else:
        log = requests.get(log_path, timeout=60).text

    return log
```
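To show what the chunking heuristics above do, here is a small, self-contained illustration: a line that ends with a backslash or is followed by an indented line stays in the same chunk as its continuation.

```python
from logdetective.utils import get_chunks

log = "gcc -o foo \\\n  foo.c\nerror: foo.c: No such file\n"
for chunk in get_chunks(log):
    print(repr(chunk))
# Expected output (two chunks):
# 'gcc -o foo \\\n  foo.c\n'
# 'error: foo.c: No such file\n'
```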
{logdetective-0.2.0 → logdetective-0.2.2}/pyproject.toml (modified)
```diff
@@ -1,11 +1,14 @@
 [tool.poetry]
 name = "logdetective"
-version = "0.2.0"
+version = "0.2.2"
 description = "Log using LLM AI to search for build/test failures and provide ideas for fixing these."
 authors = ["Jiri Podivin <jpodivin@gmail.com>"]
 license = "Apache-2.0"
 readme = "README.md"
 include = ["logdetective/drain3.ini"]
+packages = [
+    { include = "logdetective" }
+]
 classifiers = [
     "Development Status :: 4 - Beta",
     "Environment :: Console",
@@ -26,9 +29,8 @@ issues = "https://github.com/fedora-copr/logdetective/issues"
 python = "^3.11"
 requests = "^2.31.0"
 llama-cpp-python = "^0.2.56"
-tiktoken = "^0.6.0"
 drain3 = "^0.9.11"
-
+huggingface-hub = "^0.23.2"
 
 [build-system]
 requires = ["poetry-core"]
@@ -37,6 +39,9 @@ build-backend = "poetry.core.masonry.api"
 [tool.poetry.scripts]
 logdetective = 'logdetective.logdetective:main'
 
+[tool.poetry.extras]
+server = ["fastapi", "pydantic"]
+
 [tool.pylint]
 disable = [
     "inconsistent-return-statements",
```
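A note on the new extras group: with the `server` extra declared above, the FastAPI dependencies could presumably be pulled in at install time using the standard extras syntax, assuming fastapi and pydantic resolve in your environment:

    pip install "logdetective[server]"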
logdetective-0.2.0/PKG-INFO (deleted)
@@ -1,90 +0,0 @@
Metadata-Version: 2.1
Name: logdetective
Version: 0.2.0
Summary: Log using LLM AI to search for build/test failures and provide ideas for fixing these.
License: Apache-2.0
Author: Jiri Podivin
Author-email: jpodivin@gmail.com
Requires-Python: >=3.11,<4.0
Classifier: Development Status :: 4 - Beta
Classifier: Environment :: Console
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Natural Language :: English
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Topic :: Internet :: Log Analysis
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Debuggers
Requires-Dist: drain3 (>=0.9.11,<0.10.0)
Requires-Dist: llama-cpp-python (>=0.2.56,<0.3.0)
Requires-Dist: progressbar2 (>=4.0.0,<5.0.0)
Requires-Dist: requests (>=2.31.0,<3.0.0)
Requires-Dist: tiktoken (>=0.6.0,<0.7.0)
Project-URL: homepage, https://github.com/fedora-copr/logdetective
Project-URL: issues, https://github.com/fedora-copr/logdetective/issues
Description-Content-Type: text/markdown

Log Detective
=============

A Python tool to analyze logs using a Language Model (LLM) and Drain template miner.

Installation
------------

    # optionaly when you prefer system packages
    dnf install python3-jsonpickle python3-tiktoken
    # install all remaining packages
    pip install .

Usage
-----

To analyze a log file, run the script with the following command line arguments:
- `url` (required): The URL of the log file to be analyzed.
- `--model` (optional, default: "Mistral-7B-Instruct-v0.2-GGUF"): The path or URL of the language model for analysis.
- `--summarizer` (optional, default: "drain"): Choose between LLM and Drain template miner as the log summarizer. You can also provide the path to an existing language model file instead of using a URL.
- `--n_lines` (optional, default: 5): The number of lines per chunk for LLM analysis. This only makes sense when you are summarizing with LLM.

Example usage:

    ~/.local/bin/logdetective https://example.com/logs.txt


Contributing
------------

Contributions are welcome! Please submit a pull request if you have any improvements or new features to add. Make sure your changes pass all existing tests before submitting.

To develop logdetective, you should fork this repository, clone your fork, and install dependencies using pip:

    git clone https://github.com/yourusername/logdetective.git
    cd logdetective
    pip install .

Make changes to the code as needed and run pre-commit.

Tests
-----

The [tox](https://github.com/tox-dev/tox) is used to manage tests. Please install `tox` package into your distribution and run:

    tox

This will create a virtual environment with dependencies and run all the tests. For more information follow the tox help.

To run only a specific test execute this:

    tox run -e style # to run flake8

or

    tox run -e lint # to run pylint

License
-------

This project is licensed under the Apache-2.0 License - see the LICENSE file for details.
logdetective-0.2.0/README.md (deleted)
@@ -1,61 +0,0 @@
(The body of this file was identical to the project description embedded in logdetective-0.2.0/PKG-INFO above, from "Log Detective" through the License section.)
logdetective-0.2.0/logdetective/logdetective.py (deleted)
@@ -1,275 +0,0 @@
```python
import argparse
import logging
import os
import sys
from urllib.request import urlretrieve

import drain3
import numpy as np
import progressbar
import requests
from drain3.template_miner_config import TemplateMinerConfig
from llama_cpp import Llama, LlamaGrammar

# pylint: disable=line-too-long
DEFAULT_ADVISOR = "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q4_K_S.gguf?download=true"

# pylint: disable=line-too-long
DEFAULT_LLM_RATER = "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_K_S.gguf?download=true"

PROMPT_TEMPLATE = """
Given following log snippets, and nothing else, explain what failure, if any occured during build of this package.
Ignore strings wrapped in <: :>, such as <:*:>.

{}

Analysis of the failure must be in a format of [X] : [Y], where [X] is a log snippet, and [Y] is the explanation.

Finally, drawing on information from all snippets, provide complete explanation of the issue.

Analysis:

"""

SUMMARIZE_PROPT_TEMPLATE = """
Does following log contain error or issue?

Log:

{}

Answer:

"""

CACHE_LOC = "~/.cache/logdetective/"

LOG = logging.getLogger("logdetective")


class MyProgressBar():
    """Show progress when downloading model."""
    def __init__(self):
        self.pbar = None

    def __call__(self, block_num, block_size, total_size):
        if not self.pbar:
            self.pbar = progressbar.ProgressBar(maxval=total_size)
            self.pbar.start()

        downloaded = block_num * block_size
        if downloaded < total_size:
            self.pbar.update(downloaded)
        else:
            self.pbar.finish()


def chunk_continues(text: str, index: int) -> bool:
    """Set of heuristics for determining whether or not
    does the current chunk of log text continue on next line.
    """
    conditionals = [
        lambda i, string: string[i + 1].isspace(),
        lambda i, string: string[i - 1] == "\\"
    ]

    for c in conditionals:
        y = c(index, text)
        if y:
            return True

    return False


def get_chunks(text: str):
    """Split log into chunks according to heuristic
    based on whitespace and backslash presence.
    """
    text_len = len(text)
    i = 0
    chunk = ""
    while i < text_len:
        chunk += text[i]
        if text[i] == '\n':
            if i + 1 < text_len and chunk_continues(text, i):
                i += 1
                continue
            yield chunk
            chunk = ""
        i += 1


class LLMExtractor:
    """
    A class that extracts relevant information from logs using a language model.
    """
    def __init__(self, model_path: str, verbose: bool):
        self.model = Llama(
            model_path=model_path,
            n_ctx=0,
            verbose=verbose)
        self.grammar = LlamaGrammar.from_string(
            "root ::= (\"Yes\" | \"No\")", verbose=False)

    def __call__(self, log: str, n_lines: int = 2, neighbors: bool = False) -> str:
        chunks = self.rate_chunks(log, n_lines)
        out = self.create_extract(chunks, neighbors)
        return out

    def rate_chunks(self, log: str, n_lines: int = 2) -> list[tuple]:
        """Scan log by the model and store results.

        :param log: log file content
        :param n_lines: How many lines should the model take into consideration
        """
        results = []
        log_lines = log.split("\n")

        for i in range(0, len(log_lines), n_lines):
            block = '\n'.join(log_lines[i:i + n_lines])
            prompt = SUMMARIZE_PROPT_TEMPLATE.format(log)
            out = self.model(prompt, max_tokens=7, grammar=self.grammar)
            out = f"{out['choices'][0]['text']}\n"
            results.append((block, out))

        return results

    def create_extract(self, chunks: list[tuple], neighbors: bool = False) -> str:
        """Extract interesting chunks from the model processing.
        """
        interesting = []
        summary = ""
        # pylint: disable=consider-using-enumerate
        for i in range(len(chunks)):
            if chunks[i][1].startswith("Yes"):
                interesting.append(i)
                if neighbors:
                    interesting.extend([max(i - 1, 0), min(i + 1, len(chunks) - 1)])

        interesting = np.unique(interesting)

        for i in interesting:
            summary += chunks[i][0] + "\n"

        return summary


class DrainExtractor:
    """A class that extracts information from logs using a template miner algorithm.
    """
    def __init__(self, verbose: bool = False, context: bool = False):
        config = TemplateMinerConfig()
        config.load(f"{os.path.dirname(__file__)}/drain3.ini")
        config.profiling_enabled = verbose
        self.miner = drain3.TemplateMiner(config=config)
        self.verbose = verbose
        self.context = context

    def __call__(self, log: str) -> str:
        out = ""
        for chunk in get_chunks(log):
            procesed_line = self.miner.add_log_message(chunk)
            LOG.debug(procesed_line)
        sorted_clusters = sorted(self.miner.drain.clusters, key=lambda it: it.size, reverse=True)
        for chunk in get_chunks(log):
            cluster = self.miner.match(chunk, "always")
            if cluster in sorted_clusters:
                out += f"{chunk}\n"
                sorted_clusters.remove(cluster)
        return out


def download_model(url: str, verbose: bool = False) -> str:
    """ Downloads a language model from a given URL and saves it to the cache directory.

    Args:
        url (str): The URL of the language model to be downloaded.

    Returns:
        str: The local file path of the downloaded language model.
    """
    path = os.path.join(
        os.path.expanduser(CACHE_LOC), url.split('/')[-1])

    LOG.info("Downloading model from %s to %s", url, path)
    if not os.path.exists(path):
        if verbose:
            path, _status = urlretrieve(url, path, MyProgressBar())
        else:
            path, _status = urlretrieve(url, path)

    return path


def process_log(log: str, model: Llama) -> str:
    """
    Processes a given log using the provided language model and returns its summary.

    Args:
        log (str): The input log to be processed.
        model (Llama): The language model used for processing the log.

    Returns:
        str: The summary of the given log generated by the language model.
    """
    return model(PROMPT_TEMPLATE.format(log), max_tokens=0)["choices"][0]["text"]


def main():
    """Main execution function."""
    parser = argparse.ArgumentParser("logdetective")
    parser.add_argument("url", type=str, default="")
    parser.add_argument("-M", "--model", type=str, default=DEFAULT_ADVISOR)
    parser.add_argument("-S", "--summarizer", type=str, default="drain")
    parser.add_argument("-N", "--n_lines", type=int, default=5)
    parser.add_argument("-v", "--verbose", action='count', default=0)
    parser.add_argument("-q", "--quiet", action='store_true')

    args = parser.parse_args()

    if args.verbose and args.quiet:
        sys.stderr.write("Error: --quiet and --verbose is mutually exclusive.\n")
        sys.exit(2)
    log_level = logging.INFO
    if args.verbose >= 1:
        log_level = logging.DEBUG
    if args.quiet:
        log_level = 0
    logging.basicConfig(stream=sys.stdout)
    LOG.setLevel(log_level)

    if not os.path.exists(CACHE_LOC):
        os.makedirs(os.path.expanduser(CACHE_LOC), exist_ok=True)

    if not os.path.isfile(args.model):
        model_pth = download_model(args.model, not args.quiet)
    else:
        model_pth = args.model

    if args.summarizer == "drain":
        extractor = DrainExtractor(args.verbose > 1, context=True)
    elif os.path.isfile(args.summarizer):
        extractor = LLMExtractor(args.summarizer, args.verbose > 1)
    else:
        summarizer_pth = download_model(args.summarizer, not args.quiet)
        extractor = LLMExtractor(summarizer_pth, args.verbose > 1)

    LOG.info("Getting summary")
    model = Llama(
        model_path=model_pth,
        n_ctx=0,
        verbose=args.verbose > 2)

    log = requests.get(args.url, timeout=60).text
    log_summary = extractor(log)

    ratio = len(log_summary.split('\n')) / len(log.split('\n'))
    LOG.debug("Log summary: \n %s", log_summary)
    LOG.info("Compression ratio: %s", ratio)

    LOG.info("Analyzing the text")
    print(f"Explanation: \n{process_log(log_summary, model)}")


if __name__ == "__main__":
    main()
```
{logdetective-0.2.0 → logdetective-0.2.2}/LICENSE — file without changes
{logdetective-0.2.0 → logdetective-0.2.2}/logdetective/drain3.ini — file without changes