thrash-protect 0.0.0.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- thrash_protect-0.0.0.dev0/.github/workflows/ci.yml +50 -0
- thrash_protect-0.0.0.dev0/.github/workflows/release.yml +48 -0
- thrash_protect-0.0.0.dev0/.gitignore +28 -0
- thrash_protect-0.0.0.dev0/.pre-commit-config.yaml +15 -0
- thrash_protect-0.0.0.dev0/.travis.yml +10 -0
- thrash_protect-0.0.0.dev0/AUTHORS +15 -0
- thrash_protect-0.0.0.dev0/ChangeLog +150 -0
- thrash_protect-0.0.0.dev0/INSTALL.md +117 -0
- thrash_protect-0.0.0.dev0/LICENSE +674 -0
- thrash_protect-0.0.0.dev0/MANIFEST.in +1 -0
- thrash_protect-0.0.0.dev0/Makefile +60 -0
- thrash_protect-0.0.0.dev0/PKG-INFO +490 -0
- thrash_protect-0.0.0.dev0/README.rst +451 -0
- thrash_protect-0.0.0.dev0/_build/thrash_protect.egg-info/PKG-INFO +490 -0
- thrash_protect-0.0.0.dev0/_build/thrash_protect.egg-info/SOURCES.txt +54 -0
- thrash_protect-0.0.0.dev0/_build/thrash_protect.egg-info/dependency_links.txt +1 -0
- thrash_protect-0.0.0.dev0/_build/thrash_protect.egg-info/entry_points.txt +2 -0
- thrash_protect-0.0.0.dev0/_build/thrash_protect.egg-info/not-zip-safe +1 -0
- thrash_protect-0.0.0.dev0/_build/thrash_protect.egg-info/requires.txt +6 -0
- thrash_protect-0.0.0.dev0/_build/thrash_protect.egg-info/top_level.txt +1 -0
- thrash_protect-0.0.0.dev0/_build/thrash_protect.py +732 -0
- thrash_protect-0.0.0.dev0/archlinux/Makefile +13 -0
- thrash_protect-0.0.0.dev0/archlinux/PKGBUILD_ +21 -0
- thrash_protect-0.0.0.dev0/changelog-0.14.2.md +15 -0
- thrash_protect-0.0.0.dev0/changelog-0.14.md +37 -0
- thrash_protect-0.0.0.dev0/debian/changelog +11 -0
- thrash_protect-0.0.0.dev0/debian/compat +1 -0
- thrash_protect-0.0.0.dev0/debian/control +19 -0
- thrash_protect-0.0.0.dev0/debian/copyright +27 -0
- thrash_protect-0.0.0.dev0/debian/dirs +2 -0
- thrash_protect-0.0.0.dev0/debian/docs +3 -0
- thrash_protect-0.0.0.dev0/debian/rules +13 -0
- thrash_protect-0.0.0.dev0/debian/source/format +1 -0
- thrash_protect-0.0.0.dev0/debian/thrash-protect.init +153 -0
- thrash_protect-0.0.0.dev0/docs/CODE_REVIEW.md +183 -0
- thrash_protect-0.0.0.dev0/docs/TODO.md +95 -0
- thrash_protect-0.0.0.dev0/openrc/thrash-protect +12 -0
- thrash_protect-0.0.0.dev0/puppet/templates/nrpe.cfg.erb +4 -0
- thrash_protect-0.0.0.dev0/puppet/thrash_protect/manifests/init.pp +31 -0
- thrash_protect-0.0.0.dev0/puppet/thrash_protect/templates/nrpe.cfg.erb +4 -0
- thrash_protect-0.0.0.dev0/pyproject.toml +104 -0
- thrash_protect-0.0.0.dev0/rpm/Makefile +7 -0
- thrash_protect-0.0.0.dev0/rpm/thrash-protect.rhel6.spec +40 -0
- thrash_protect-0.0.0.dev0/rpm/thrash-protect.rhel7.spec +40 -0
- thrash_protect-0.0.0.dev0/rpm/thrash-protect.spec +40 -0
- thrash_protect-0.0.0.dev0/setup.cfg +4 -0
- thrash_protect-0.0.0.dev0/setup.py +84 -0
- thrash_protect-0.0.0.dev0/systemd/thrash-protect.service +11 -0
- thrash_protect-0.0.0.dev0/systemv/thrash-protect +54 -0
- thrash_protect-0.0.0.dev0/tests/README.md +73 -0
- thrash_protect-0.0.0.dev0/tests/test_thrash_protect.py +254 -0
- thrash_protect-0.0.0.dev0/tests/thrash-bot.py +17 -0
- thrash_protect-0.0.0.dev0/tests/thrash-bot2.py +25 -0
- thrash_protect-0.0.0.dev0/thrash-protect.py +732 -0
- thrash_protect-0.0.0.dev0/thrash_protect.py +732 -0
- thrash_protect-0.0.0.dev0/upstart/thrash-protect.conf +16 -0
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [master, main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [master, main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
lint:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
- name: Set up Python
|
|
15
|
+
uses: actions/setup-python@v5
|
|
16
|
+
with:
|
|
17
|
+
python-version: "3.12"
|
|
18
|
+
- name: Install ruff
|
|
19
|
+
run: pip install ruff
|
|
20
|
+
- name: Run ruff check
|
|
21
|
+
run: ruff check .
|
|
22
|
+
- name: Run ruff format check
|
|
23
|
+
run: ruff format --check .
|
|
24
|
+
|
|
25
|
+
test:
|
|
26
|
+
runs-on: ubuntu-latest
|
|
27
|
+
strategy:
|
|
28
|
+
matrix:
|
|
29
|
+
python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
|
|
30
|
+
steps:
|
|
31
|
+
- uses: actions/checkout@v4
|
|
32
|
+
with:
|
|
33
|
+
fetch-depth: 0 # Needed for setuptools-scm
|
|
34
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
35
|
+
uses: actions/setup-python@v5
|
|
36
|
+
with:
|
|
37
|
+
python-version: ${{ matrix.python-version }}
|
|
38
|
+
- name: Install dependencies
|
|
39
|
+
run: |
|
|
40
|
+
python -m pip install --upgrade pip
|
|
41
|
+
pip install pytest pytest-cov
|
|
42
|
+
pip install -e .
|
|
43
|
+
- name: Run tests
|
|
44
|
+
run: pytest tests/ -v --cov=thrash_protect --cov-report=xml
|
|
45
|
+
- name: Upload coverage
|
|
46
|
+
uses: codecov/codecov-action@v3
|
|
47
|
+
if: matrix.python-version == '3.12'
|
|
48
|
+
with:
|
|
49
|
+
files: ./coverage.xml
|
|
50
|
+
fail_ci_if_error: false
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
name: Release to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- 'v*'
|
|
7
|
+
- '[0-9]+.*'
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
build:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
with:
|
|
15
|
+
fetch-depth: 0 # Needed for setuptools-scm
|
|
16
|
+
|
|
17
|
+
- name: Set up Python
|
|
18
|
+
uses: actions/setup-python@v5
|
|
19
|
+
with:
|
|
20
|
+
python-version: "3.12"
|
|
21
|
+
|
|
22
|
+
- name: Install build dependencies
|
|
23
|
+
run: pip install build
|
|
24
|
+
|
|
25
|
+
- name: Build package
|
|
26
|
+
run: python -m build
|
|
27
|
+
|
|
28
|
+
- name: Upload artifacts
|
|
29
|
+
uses: actions/upload-artifact@v4
|
|
30
|
+
with:
|
|
31
|
+
name: dist
|
|
32
|
+
path: dist/
|
|
33
|
+
|
|
34
|
+
publish:
|
|
35
|
+
needs: build
|
|
36
|
+
runs-on: ubuntu-latest
|
|
37
|
+
environment: pypi
|
|
38
|
+
permissions:
|
|
39
|
+
id-token: write # Required for trusted publishing
|
|
40
|
+
steps:
|
|
41
|
+
- name: Download artifacts
|
|
42
|
+
uses: actions/download-artifact@v4
|
|
43
|
+
with:
|
|
44
|
+
name: dist
|
|
45
|
+
path: dist/
|
|
46
|
+
|
|
47
|
+
- name: Publish to PyPI
|
|
48
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.pyc
|
|
4
|
+
*.pyo
|
|
5
|
+
*.egg
|
|
6
|
+
*.egg-info/
|
|
7
|
+
/build/
|
|
8
|
+
/dist/
|
|
9
|
+
|
|
10
|
+
# Make-generated files
|
|
11
|
+
ChangeLog.recent
|
|
12
|
+
.tag.*
|
|
13
|
+
/*.tar.gz
|
|
14
|
+
|
|
15
|
+
# archlinux/Makefile generated
|
|
16
|
+
archlinux/PKGBUILD
|
|
17
|
+
archlinux/.SRCINFO
|
|
18
|
+
archlinux/thrash-protect/
|
|
19
|
+
archlinux/*.tar.gz
|
|
20
|
+
|
|
21
|
+
# rpm/Makefile generated
|
|
22
|
+
rpm/*.spec.bak
|
|
23
|
+
|
|
24
|
+
# IDE/editor
|
|
25
|
+
.vscode/
|
|
26
|
+
.idea/
|
|
27
|
+
*.swp
|
|
28
|
+
*~
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
3
|
+
rev: v4.5.0
|
|
4
|
+
hooks:
|
|
5
|
+
- id: trailing-whitespace
|
|
6
|
+
- id: end-of-file-fixer
|
|
7
|
+
- id: check-yaml
|
|
8
|
+
- id: check-added-large-files
|
|
9
|
+
|
|
10
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
11
|
+
rev: v0.1.9
|
|
12
|
+
hooks:
|
|
13
|
+
- id: ruff
|
|
14
|
+
args: [--fix, --exit-non-zero-on-fix]
|
|
15
|
+
- id: ruff-format
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
language: python
|
|
2
|
+
# should work in 2.5 as well, but the test code depends on unittest library from python3.3
|
|
3
|
+
# we're attempting to mock up open() - for some reason it fails on python3.3 and python3.4
|
|
4
|
+
python:
|
|
5
|
+
- "3.5"
|
|
6
|
+
- "nightly" # currently points to 3.6-dev
|
|
7
|
+
# command to install dependencies
|
|
8
|
+
#install: "..." ## no dependencies
|
|
9
|
+
# command to run tests
|
|
10
|
+
script: nosetests
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
Thrash-protect was initially written in late 2013 by Tobias Brox while
|
|
2
|
+
working at Redpill-Linpro.
|
|
3
|
+
|
|
4
|
+
The PRIMARY AUTHORS are (and/or have been):
|
|
5
|
+
|
|
6
|
+
* Tobias Brox
|
|
7
|
+
|
|
8
|
+
Other significant contributors:
|
|
9
|
+
|
|
10
|
+
* Bjørn Pettersen - quick code review and constructive criticism
|
|
11
|
+
* Xavier Martin - done some work on prettify the logging a bit
|
|
12
|
+
* wyg3958 - code cleanup
|
|
13
|
+
* Riccardo Balbo - bugfixing, ref https://github.com/tobixen/thrash-protect/pull/7
|
|
14
|
+
* Unnamed colleague - for setting up swap and not editing the MaxClients setting under apache on a production server running some php application, such that we got downtime due to thrashing - wouldn't have made thrash-protect without this one.
|
|
15
|
+
* Unnamed employee at an unnamed customer - for bypassing our job-scheduling system and running some very memory-intensive processes directly on some of the computing nodes, repeatedly causing nodes to go down due to thrashing. (thrash-protect eventually did allow his processes to complete, without downtime or further manual work).
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
2025-12-16 Tobias Brox <tobias@redpill-linpro.com>
|
|
2
|
+
* thrash-protect.py: Removed Python 2 compatibility code (Python 3.9+ required now)
|
|
3
|
+
* thrash-protect.py: Applied ruff linting and formatting, removed unused imports
|
|
4
|
+
* tests/test_thrash_protect.py: Migrated from nose to pytest
|
|
5
|
+
* pyproject.toml: New modern build configuration (replaces setup.py)
|
|
6
|
+
* .github/workflows/ci.yml: Added GitHub Actions for linting and testing
|
|
7
|
+
* .github/workflows/release.yml: Added automatic PyPI release on tags
|
|
8
|
+
* .pre-commit-config.yaml: Added pre-commit hooks for ruff
|
|
9
|
+
* docs/CODE_REVIEW.md: Code review with improvement recommendations
|
|
10
|
+
* docs/TODO.md: TODO list including SSD default tuning issue
|
|
11
|
+
|
|
12
|
+
2022-01-31 Tobias <tobias@redpill-linpro.com>
|
|
13
|
+
* thrash-protect.py: terrible, embarrassing bugfix (revealing a lack of code coverage in tests). Bumped version to 0.14.3.
|
|
14
|
+
* Makefile: worked a bit more on rpm releases
|
|
15
|
+
|
|
16
|
+
2021-09-01 Tobias <tobias@redpill-linpro.com>
|
|
17
|
+
* thrash-protect.py: bugfixing and adding debug code (see changelog-0.14.2.md)
|
|
18
|
+
* INSTALL.md: information about configuration options
|
|
19
|
+
* README.md: more information about the job control issue
|
|
20
|
+
|
|
21
|
+
2020-04-08 Niektory <niektory@gmail.com>
|
|
22
|
+
* thrash-protect.py: OOMScoreProcessSelector should be used before PageFaultingProcessSelector, ref https://github.com/tobixen/thrash-protect/issues/31
|
|
23
|
+
|
|
24
|
+
2019-08-29 Misc authors <tobias@redpill-linpro.com>
|
|
25
|
+
* thrash-protect.py: Seems like I've forgotten to maintain the ChangeLog for more than a year. Misc tweaks have been done, catching up on feedback and pull requests coming through github. Bumping to 0.13.
|
|
26
|
+
* README.md: various documentation tweaks
|
|
27
|
+
|
|
28
|
+
2018-08-23 Tobias Brox <tobias@redpill-linpro.com>
|
|
29
|
+
* README.md: we don't need this file anymore, since we have README.rst
|
|
30
|
+
* README.rst: merged latest changes from README.md
|
|
31
|
+
* thrash-protect.py: more information in an error message. Bumping 0.12.1.
|
|
32
|
+
|
|
33
|
+
2018-03-21 Tobias Brox <tobias@redpill-linpro.com>
|
|
34
|
+
* thrash-protect.py: bugfix in logging, when parent pids are suspended they should also be logged
|
|
35
|
+
* thrash-protect.py: Trying yet another time to fix the thrash-detecting algorithm; apparently the December bugfix was not well enough tested or thought through.
|
|
36
|
+
* ChangeLog: updated, preparing for new release 0.12.
|
|
37
|
+
|
|
38
|
+
2018-02-08 Tobias Brox <tobias@redpill-linpro.com>
|
|
39
|
+
* thrash-protect.py: Silly-stupid bugfix for the THRASH_PROTECT_WHITELIST configuration environmental variable
|
|
40
|
+
* README: doc tweaks
|
|
41
|
+
|
|
42
|
+
2017-12-08 Tobias Brox <tobias@redpill-linpro.com>
|
|
43
|
+
* thrash-protect.py: a bug in the thrash-detecting algorithm; the threshold for detecting thrashing has effectively been 1 all the time, even though it's defaulted to 512 in the config section. Fixed the bug and lowered the default to 4, not to change the behaviour too drastically. This will cause thrash-protect to be less aggressive.
|
|
44
|
+
* ChangeLog: updated, preparing for new release 0.11.6 ... 0.11.7 even
|
|
45
|
+
* Makefile: latest changes did not work well for the archlinux install
|
|
46
|
+
|
|
47
|
+
2017-10-30 Tobias Brox <tobias@redpill-linpro.com>
|
|
48
|
+
* README: documentation tweaks and new thoughts
|
|
49
|
+
* thrash-protect.py: sudo is added to the list of processes where it may be important to suspend the parent process
|
|
50
|
+
|
|
51
|
+
2017-10-11 Tobias Brox <tobias@redpill-linpro.com>
|
|
52
|
+
* Makefile: attempting to get directories right for other distros - credits to @anophelesgreyhoe@github
|
|
53
|
+
|
|
54
|
+
2017-02-23 Chris Coetzee <chriscz93@gmail.com>
|
|
55
|
+
* setup.py: pypi-compatible setup script
|
|
56
|
+
* thrash-protect.py: moving logics to a main sub - minor refactoring to follow best pypi-practice
|
|
57
|
+
|
|
58
|
+
2017-02-20 Tobias Brox <tobias@redpill-linpro.com>
|
|
59
|
+
* rpm/thrash-protect.spec: bugfixing the rpm build
|
|
60
|
+
|
|
61
|
+
2017-02-17 Tobias Brox <tobias@redpill-linpro.com>
|
|
62
|
+
* rpm/thrash-protect.spec: bugfix, trying to generate an rpm for centos7
|
|
63
|
+
|
|
64
|
+
2017-02-11 Riccardo Balbo <riccardo@forkforge.net>
|
|
65
|
+
* thrash-protect: bugfix: processes with spaces or parentheses in the name were not properly handled
|
|
66
|
+
|
|
67
|
+
2016-11-21 Tobias Brox <tobias@redpill-linpro.com>
|
|
68
|
+
* puppet/templates/nrpe.cfg.erb: puppet/nrpe tuning; stuck frozen pid list is not allowed to be more than 3s old (was 5s).
|
|
69
|
+
* archlinux/Makefile: bugfix: "make archlinux" should be idempotent
|
|
70
|
+
* AUTHOR: added other contributors
|
|
71
|
+
* README: added info on a theoretical problem that thrash-protect will be suspending processes "unfairly", removed donation section
|
|
72
|
+
|
|
73
|
+
2016-11-15 Tobias Brox <tobias@redpill-linpro.com>
|
|
74
|
+
* thrash-protect.py: the script broke on python 2.6. Done a quick and dirty workaround, the extended logging information just won't work under older python. Released 0.11.4 (0.11.2 and 0.11.3 was redundant releases - I've completely forgotten the prerequisites for a "make release")
|
|
75
|
+
|
|
76
|
+
2016-02-22 Tobias Brox <tobias@redpill-linpro.com>
|
|
77
|
+
* archlinux/ The AUR setup has been changed quite a bit, the PKGBUILD was buggy. Version 0.11.1 only contains changes to the arch linux package management
|
|
78
|
+
|
|
79
|
+
2016-02-19 Tobias Brox <tobias@redpill-linpro.com>
|
|
80
|
+
* thrash-protect.py: misc minor stuff since last release, including some code cleanup, some error handling, better logging; much of it pulled in from other forks (sorry, I'm too lazy to do a proper job creating the missing ChangeLog entries).
|
|
81
|
+
|
|
82
|
+
2014-12-18 Tobias Brox <tobias@redpill-linpro.com>
|
|
83
|
+
* thrash-protect.py: some refactoring, plus now it will STOP a parent bash process before stopping the child, not to disrupt the job control
|
|
84
|
+
* tests/test_thrash_protect.py: more unit tests and a small functional test
|
|
85
|
+
* tests/thrash_bot2.py: now with argparse (though, that requires 2.7) and made compatible with python3.
|
|
86
|
+
|
|
87
|
+
2014-12-09 Tobias Brox <tobias@redpill-linpro.com>
|
|
88
|
+
* thrash-protect.py: started work on solving the annoying bash job control interference ... if suspending a bash child, make sure the parent process is already suspended.
|
|
89
|
+
* test/test_thrash_protect: first unit tests are coming up
|
|
90
|
+
|
|
91
|
+
2014-11-20 Tobias Brox <tobias@redpill-linpro.com>
|
|
92
|
+
* thrash-protect.py: major refactoring, cleaning up quite some mess and getting rid of quite some globals
|
|
93
|
+
* thrash-protect.py: fixed a bug - when thrash_protect kicks in, it will at first almost always freeze a pid and then unfreeze it immediately afterwards without any sleep
|
|
94
|
+
|
|
95
|
+
2014-03-10 Tobias Brox <tobias@redpill-linpro.com>
|
|
96
|
+
* thrash-protect.py: found a silly print statement left behind from earlier debug rounds.
|
|
97
|
+
* thrash-protect.py: bumped the version (patchlevel)
|
|
98
|
+
|
|
99
|
+
2014-02-26 Tobias Brox <tobias@redpill-linpro.com>
|
|
100
|
+
* README.md - elaborated a bit on the drawbacks/problems observed
|
|
101
|
+
* thrash-protect.py (thrash_protect): doing a best-effort on running mlockall()
|
|
102
|
+
|
|
103
|
+
2014-01-14 Tobias Brox <tobias@redpill-linpro.com>
|
|
104
|
+
* general: version number bump - there shouldn't have been any 0.6.4, it should have been released as 0.7.
|
|
105
|
+
* README.md: some minor docfixes
|
|
106
|
+
* tests/thrash-bot.py: a bit of confusion on what version of this script I actually used when I managed to thrash the system. At least now it seems to be eating up memory very fast instead of just spinning CPU.
|
|
107
|
+
|
|
108
|
+
2014-01-13 Tobias Brox <tobias@redpill-linpro.com>
|
|
109
|
+
* general: bump of version number to 0.6.4 (post-script: this was a mistake - correct version number should have been 0.7).
|
|
110
|
+
* tests/: new directory, unfortunately rather empty
|
|
111
|
+
* tests/thrash-bot.py: improved the thrash-bot, it consumes far less CPU and gobbles far more memory now
|
|
112
|
+
* thrash-protect.py (general): tweaks and bugfixes to handle the new thrash_bot.py. unfortunately I added lot more complexity. :-(
|
|
113
|
+
* thrash-protect.py (general): debug hooks for randomly stopping processes even if there is no thrashing going on
|
|
114
|
+
* thrash-protect.py (scan_processes): modified from logic to find the process with most pagefaults to "meta-logic" selecting an algorithm for finding which process to stop.
|
|
115
|
+
* thrash-protect.py (scan_processes_pagefaults): old logics from scan_processes
|
|
116
|
+
* thrash-protect.py (scan_processes_pagefaults): various bugfixes
|
|
117
|
+
* thrash-protect.py (scan_processes_pagefaults): added a threshold to make it less likely whitelisted processes will be stopped
|
|
118
|
+
* thrash-protect.py (scan_processes_oom_score): new function to select the process with highest oom_score
|
|
119
|
+
* thrash-protect.py (find_last_unfrozen_process): returns the pid of the last unfrozen process, if it's still running
|
|
120
|
+
|
|
121
|
+
2013-12-26 Tobias Brox <tobias@redpill-linpro.com>
|
|
122
|
+
* general: bump of version number to 0.6.3; only "meta"-changes (Makefile, etc)
|
|
123
|
+
* general: successfully created a .deb-package that has been deployed on ubuntu precise
|
|
124
|
+
* INSTALL.md: new file - extracted the practical information out from all the blah-blah in README.md
|
|
125
|
+
* README.md: even more blah-blah - now information on the alternatives to thrash-protect
|
|
126
|
+
* Makefile: quite some bugfixes and work
|
|
127
|
+
* debian/*: debianization.
|
|
128
|
+
* systemv/thrash-protect: bugfix to get correct return code from "status"
|
|
129
|
+
* puppet/: puppet setup
|
|
130
|
+
|
|
131
|
+
2013-12-20 Tobias Brox <tobias@redpill-linpro.com>
|
|
132
|
+
* general: bumping version number to 0.6
|
|
133
|
+
* thrash-protect.py: Major fix - changed from using the "major page fault"-counter to using the "page swap in" and "page swap out" counters. Bidirectional swap or more than 100 pages in/out during a one second interval will trigger process suspension.
|
|
134
|
+
* thrash-protect.py: Made symmetry between the blacklisting and whitelisting; it's done by weighting instead of "stop this process first" or "don't stop this process".
|
|
135
|
+
* thrash-protect.py: Minor bugfix - unfrozen processes didn't get logged properly due to a ProcessLookupFailure when running CONT on the session or group id.
|
|
136
|
+
* puppet/: includes an erb-template for a nrpe config file, though might not work outside our puppet environment (depends on $nagios::nrpe::nagios_plugin_dir to be set)
|
|
137
|
+
|
|
138
|
+
2013-12-19 Tobias Brox <tobias@redpill-linpro.com>
|
|
139
|
+
* thrash-protect.py: made support for --version at python2.7 and up, and split out the main logic into a subroutine.
|
|
140
|
+
* rpm: work in progress to build rpm files
|
|
141
|
+
* systemv: work in progress to build a working init-script
|
|
142
|
+
* archlinux/Makefile: allow for automatic build of PKGBUILD and upload to AUR
|
|
143
|
+
* Makefile: Working on release-management through Makefile; ideally all the work involved with making a release should be handled by a simple make command. Pardon for all the stupid mini-releases, need to do quite some testing here.
|
|
144
|
+
* ChangeLog: Now with a ChangeLog file. The entries will be used for the commit message when doing releases.
|
|
145
|
+
|
|
146
|
+
2013-09-06 Tobias Brox <tobias@redpill-linpro.com>
|
|
147
|
+
* thrash-protect.py: Got a mostly working prototype here - it has been tested in production, and it seems to do what it promises, and I'm happy with it, maybe the initial plan rewriting it in C is not really needed.
|
|
148
|
+
|
|
149
|
+
2013-07-29 Tobias Brox <tobias@redpill-linpro.com>
|
|
150
|
+
* General: initial commit
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
Installation and usage
|
|
2
|
+
======================
|
|
3
|
+
|
|
4
|
+
Requirements
|
|
5
|
+
------------
|
|
6
|
+
|
|
7
|
+
This will only work on linux, it depends on reading stats from the
|
|
8
|
+
/proc directory, and it depends on python 3.9 or newer (older releases of the script worked with python 2.5 or higher - and an old version of the script was backported to python 2.4).
|
|
9
|
+
|
|
10
|
+
No other dependencies.
|
|
11
|
+
|
|
12
|
+
The box or VM running thrash-protect needs to be set up with swap, or
|
|
13
|
+
thrash-protect won't do anything useful (even if thrash-like situations
|
|
14
|
+
can happen without swap installed). A reasonably large swap partition
|
|
15
|
+
is recommended, possibly twice as much swap as physical memory, though
|
|
16
|
+
YMMV, and even a very small swap partition is enough for
|
|
17
|
+
thrash-protect to do useful work.
|
|
18
|
+
|
|
19
|
+
My original idea was to make a rapid prototype in python, and then
|
|
20
|
+
port it over to C for a smaller memory- and CPU footprint; while
|
|
21
|
+
thrash-protect has successfully been running on single-CPU instances
|
|
22
|
+
with 512M RAM, it's probably best suited on systems with at least 1GB
|
|
23
|
+
RAM and multiple CPUs (or CPU cores) due to the overhead costs.
|
|
24
|
+
|
|
25
|
+
Compile and Install
|
|
26
|
+
-------------------
|
|
27
|
+
|
|
28
|
+
As it's in python, no compilation is needed.
|
|
29
|
+
|
|
30
|
+
"make install" will hopefully do the right thing and install the
|
|
31
|
+
script as a service.
|
|
32
|
+
|
|
33
|
+
Archlinux users may also install through AUR. rpm and deb packages
|
|
34
|
+
will be made available on request. There is some logic in the Makefile for creating such packages, but it's poorly tested.
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
Usage
|
|
38
|
+
-----
|
|
39
|
+
|
|
40
|
+
The service will need to be started and/or set up to start at boot.
|
|
41
|
+
|
|
42
|
+
If everything else fails, just run the script as root and do whatever
|
|
43
|
+
is necessary to ensure it will be started again after next reboot.
|
|
44
|
+
|
|
45
|
+
While it should be possible to adjust configuration through
|
|
46
|
+
environment variables, best practice is probably to run it without any
|
|
47
|
+
configuration.
|
|
48
|
+
|
|
49
|
+
The System V init file is so far quite redhat-specific and may need
|
|
50
|
+
tuning for usage with other distributions.
|
|
51
|
+
|
|
52
|
+
Configuration
|
|
53
|
+
-------------
|
|
54
|
+
|
|
55
|
+
It should be possible to configure the script through environment
|
|
56
|
+
variables, though this is poorly tested - the default configuration
|
|
57
|
+
has mostly been working out for me. However, the defaults were set in 2013 and may possibly need a bit of tweaking for state-of-the-art equipment.
|
|
58
|
+
|
|
59
|
+
Configuration environment variables that may need tweaking:
|
|
60
|
+
|
|
61
|
+
* THRASH_PROTECT_CMD_WHITELIST - a list of processes that you rather don't want thrash-protect to touch (no guarantees - it just adds a weight). Defaults to "sshd bash xinit X spectrwm screen SCREEN mutt ssh xterm rxvt urxvt Xorg.bin Xorg systemd-journal". Can most likely be trimmed down, particularly on servers. On desktop systems you may want to add more processes, depending on your desktop system.
|
|
62
|
+
* THRASH_PROTECT_CMD_BLACKLIST - opposite of whitelist - processes thrash-protect should prioritize to stop. Defaults to ''.
|
|
63
|
+
* THRASH_PROTECT_CMD_JOBCTRLLIST - processes that may be confused if the child process gets suspended. Defaults to "bash sudo". You may want to do some research if you use another shell, run bash under some pseudonym, or have other job control systems or experience problems with other processes. (See the README for details).
|
|
64
|
+
* THRASH_PROTECT_INTERVAL - thrash protect is set to sleep for 0.5s between each normal iteration, as long as no thrashing is detected. This default was set in 2013, perhaps it can be tuned down on modern hardware.
|
|
65
|
+
* SWAP_PAGE_THRESHOLD - defaults to 4. If there are 4 pages swapped in and 4 pages swapped out during the interval, the script will be triggered. There is also a hard coded constant 10x for single-direction swapping during the interval, so if 40 pages are swapped in or out, the algorithm will also trigger. The default was set in 2013, maybe it should be adjusted upwards on swap media with high bandwidth, to prevent thrash-protect from suspending processes when it's not needed.
|
|
66
|
+
* THRASH_PROTECT_UNFREEZE_POP_RATIO - default 5. TLDR: should probably be lowered on interactive desktops and increased on servers doing only batch processing. All suspended processes are put in a double ended queue (a double ended queue behaves both as a queue and a stack - so the pid is placed at the end of the queue or at the top of the stack according to how you look at it). If the host has stopped thrashing, the "fair" thing to do would be to always resume the process at the front of the queue (unfreeze_pop_ratio set to 1), but the most effective thing to do is probably to resume and suspend the same process over and over again (unfreeze_pop_ratio set to MAXINT). When set to five it will pop four processes from the top of the stack before it pulls out one process from the front of the queue.
|
|
67
|
+
* THRASH_PROTECT_BLACKLIST_SCORE_MULTIPLIER - default 16. A blacklisted job will be 16 times more likely to be picked up for suspension than a non-blacklisted job.
|
|
68
|
+
* THRASH_PROTECT_WHITELIST_SCORE_MULTIPLIER - default 4 times the blacklist score multiplier. A non-whitelisted job will by default be 64 times more likely to be chosen for suspension than a whitelisted job.
|
|
69
|
+
* THRASH_PROTECT_LOG_USER_DATA_ON_FREEZE - we may log extra process data when freezing processes. The current code forks up a `ps` subprocess (should be rewritten to just check up /proc/stat). Since the system may be critically overloaded when we want to freeze a process, it's considered that we probably don't want to do this, so it's defaulted to false. Note that this is about "hard" logging and the log location is hard coded to /var/log/thrash-protect.log (should probably be consolidated with logging done through the logging module).
|
|
70
|
+
* THRASH_PROTECT_LOG_USER_DATA_ON_UNFREEZE - much the same as the former. Since the system is probably not critically overloaded when we want to unfreeze a process, it's considered that we probably do want this logging, so default is set to true.
|
|
71
|
+
* THRASH_PROTECT_DEBUG_LOGGING - leave it turned off, or thrash-protect will log a lot to stderr (through the logging module).
|
|
72
|
+
* THRASH_PROTECT_DEBUG_CHECKSTATE - will log warnings (through the logging module) if processes are in unexpected states, e.g. because two instances of the script are running at the same time.
|
|
73
|
+
* THRASH_PROTECT_DATE_HUMAN_READABLE - the early versions of the script logged timestamps in unix format (long int). Set to 0 if you prefer such timestamps.
|
|
74
|
+
* THRASH_PROTECT_PGMAJFAULT_SCAN_THRESHOLD - the script maintains a list of processes and amount of "major page faults" every process has done. This is a bit expensive process hence it's only done when the global major page fault counter has passed some threshold. Default set to swap_page_threshold*4. Can probably be left where it is.
|
|
75
|
+
* THRASH_PROTECT_TEST_MODE - pretend the system is thrashed every now and then, for testing purposes. This hasn't been exercised for quite some years, should probably be removed.
|
|
76
|
+
|
|
77
|
+
Monitoring
|
|
78
|
+
----------
|
|
79
|
+
|
|
80
|
+
thrash-protect may relatively safely live its own life, users will
|
|
81
|
+
only notice some delays and slowness, and bad situations will
|
|
82
|
+
autorecover (i.e. the resource-consuming process will stop by itself,
|
|
83
|
+
or the kernel will finally run out of swap and the OOM-killer will
|
|
84
|
+
kill the rogue process).
|
|
85
|
+
|
|
86
|
+
For production servers, thrash-protect should ideally only be latent,
|
|
87
|
+
only occasionally stop something very briefly, if it becomes active a
|
|
88
|
+
system administrator should manually inspect the box and deal with the
|
|
89
|
+
situation, and eventually order more memory.
|
|
90
|
+
|
|
91
|
+
There are three useful ways to monitor:
|
|
92
|
+
|
|
93
|
+
* Monitoring the number of suspended processes. This will possibly
|
|
94
|
+
catch situations where thrash-protect itself has gone haywire,
|
|
95
|
+
suspending processes but unable to reanimate them. Unfortunately it
|
|
96
|
+
may also cause false alarms on systems where processes are being
|
|
97
|
+
suspended legitimately outside thrash-protect (i.e. due to some
|
|
98
|
+
sysadmin pressing ^Z).
|
|
99
|
+
|
|
100
|
+
* Monitoring the /tmp/thrash-protect-frozen-pid-list file. It should
|
|
101
|
+
only exist briefly.
|
|
102
|
+
|
|
103
|
+
* Age of the /tmp/thrash-protect-frozen-pid-list file; if it exists
|
|
104
|
+
and is old, most likely thrash-protect is not running anymore.
|
|
105
|
+
|
|
106
|
+
nrpe-scripts and icinga-configuration may be made available on request.
|
|
107
|
+
|
|
108
|
+
Subdirectories
|
|
109
|
+
--------------
|
|
110
|
+
|
|
111
|
+
The subdirectories contain various logic for deploying the script:
|
|
112
|
+
|
|
113
|
+
* archlinux - contains logic for submitting to AUR for Arch Linux
|
|
114
|
+
* systemv - contains a traditional init-script, though it may be rather RedHat-specific as for now
|
|
115
|
+
* systemd - contains a service config file for running the script under systemd
|
|
116
|
+
* upstart - contains the config file for starting up the script under the (Ubuntu) upstart system
|
|
117
|
+
* debian - contains files necessary for "debianization" and creating .deb-packages for ubuntu and debian
|