glm-factor-optimizer 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- glm_factor_optimizer-0.1.0/.github/dependabot.yml +13 -0
- glm_factor_optimizer-0.1.0/.github/workflows/ci.yml +99 -0
- glm_factor_optimizer-0.1.0/.github/workflows/release.yml +60 -0
- glm_factor_optimizer-0.1.0/.github/workflows/security.yml +103 -0
- glm_factor_optimizer-0.1.0/.gitignore +15 -0
- glm_factor_optimizer-0.1.0/CONTRIBUTING.md +67 -0
- glm_factor_optimizer-0.1.0/LICENSE +21 -0
- glm_factor_optimizer-0.1.0/PKG-INFO +267 -0
- glm_factor_optimizer-0.1.0/README.md +234 -0
- glm_factor_optimizer-0.1.0/docs/explanation/architecture.md +121 -0
- glm_factor_optimizer-0.1.0/docs/explanation/modeling_principles.md +109 -0
- glm_factor_optimizer-0.1.0/docs/how-to/rank_candidate_factors.md +61 -0
- glm_factor_optimizer-0.1.0/docs/how-to/refine_factors.md +70 -0
- glm_factor_optimizer-0.1.0/docs/how-to/run_automatic_workflow.md +72 -0
- glm_factor_optimizer-0.1.0/docs/how-to/save_and_audit.md +71 -0
- glm_factor_optimizer-0.1.0/docs/how-to/test_interactions.md +78 -0
- glm_factor_optimizer-0.1.0/docs/index.md +76 -0
- glm_factor_optimizer-0.1.0/docs/reference/api.md +179 -0
- glm_factor_optimizer-0.1.0/docs/reference/specs.md +117 -0
- glm_factor_optimizer-0.1.0/docs/reference/validation_outputs.md +112 -0
- glm_factor_optimizer-0.1.0/docs/tutorials/notebook_study_workflow.md +290 -0
- glm_factor_optimizer-0.1.0/examples/data.py +47 -0
- glm_factor_optimizer-0.1.0/examples/run_binning_optimization.py +71 -0
- glm_factor_optimizer-0.1.0/examples/run_frequency_model.py +9 -0
- glm_factor_optimizer-0.1.0/examples/run_gamma_severity_model.py +63 -0
- glm_factor_optimizer-0.1.0/examples/run_manual_count_rate_model.py +58 -0
- glm_factor_optimizer-0.1.0/examples/run_spark_backend.py +81 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/__init__.py +83 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/aggregation.py +136 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/bins.py +257 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/core.py +330 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/diagnostics.py +236 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/factor.py +354 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/metrics.py +334 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/model.py +269 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/optimize.py +391 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/penalties.py +167 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/runs.py +192 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/sampling.py +173 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/screening.py +341 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/spark/__init__.py +30 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/spark/_deps.py +72 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/spark/aggregation.py +143 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/spark/bins.py +247 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/spark/core.py +240 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/spark/metrics.py +165 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/spark/model.py +185 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/spark/optimize.py +382 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/spark/split.py +67 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/spark/workflow.py +203 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/split.py +58 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/study.py +1096 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/validation.py +387 -0
- glm_factor_optimizer-0.1.0/glm_factor_optimizer/workflow.py +455 -0
- glm_factor_optimizer-0.1.0/pyproject.toml +62 -0
- glm_factor_optimizer-0.1.0/tests/test_api.py +86 -0
- glm_factor_optimizer-0.1.0/tests/test_binning.py +52 -0
- glm_factor_optimizer-0.1.0/tests/test_docs.py +127 -0
- glm_factor_optimizer-0.1.0/tests/test_helpers.py +120 -0
- glm_factor_optimizer-0.1.0/tests/test_metrics.py +41 -0
- glm_factor_optimizer-0.1.0/tests/test_optimize.py +135 -0
- glm_factor_optimizer-0.1.0/tests/test_screening.py +72 -0
- glm_factor_optimizer-0.1.0/tests/test_spark_backend.py +152 -0
- glm_factor_optimizer-0.1.0/tests/test_split.py +23 -0
- glm_factor_optimizer-0.1.0/tests/test_study.py +161 -0
- glm_factor_optimizer-0.1.0/tests/test_workflow.py +59 -0
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
pull_request:
|
|
5
|
+
push:
|
|
6
|
+
branches:
|
|
7
|
+
- main
|
|
8
|
+
workflow_dispatch:
|
|
9
|
+
|
|
10
|
+
permissions:
|
|
11
|
+
contents: read
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
tests:
|
|
15
|
+
name: Tests (Python ${{ matrix.python-version }})
|
|
16
|
+
runs-on: ubuntu-latest
|
|
17
|
+
strategy:
|
|
18
|
+
fail-fast: false
|
|
19
|
+
matrix:
|
|
20
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
21
|
+
steps:
|
|
22
|
+
- name: Check out repository
|
|
23
|
+
uses: actions/checkout@v6
|
|
24
|
+
|
|
25
|
+
- name: Set up Python
|
|
26
|
+
uses: actions/setup-python@v6
|
|
27
|
+
with:
|
|
28
|
+
python-version: ${{ matrix.python-version }}
|
|
29
|
+
cache: pip
|
|
30
|
+
|
|
31
|
+
- name: Install project and test dependencies
|
|
32
|
+
run: |
|
|
33
|
+
python -m pip install --upgrade pip
|
|
34
|
+
python -m pip install -e .[dev]
|
|
35
|
+
|
|
36
|
+
- name: Run unit tests
|
|
37
|
+
run: python -m unittest discover -s tests -v
|
|
38
|
+
|
|
39
|
+
coverage:
|
|
40
|
+
name: Coverage
|
|
41
|
+
runs-on: ubuntu-latest
|
|
42
|
+
steps:
|
|
43
|
+
- name: Check out repository
|
|
44
|
+
uses: actions/checkout@v6
|
|
45
|
+
|
|
46
|
+
- name: Set up Python
|
|
47
|
+
uses: actions/setup-python@v6
|
|
48
|
+
with:
|
|
49
|
+
python-version: "3.11"
|
|
50
|
+
cache: pip
|
|
51
|
+
|
|
52
|
+
- name: Install project and coverage dependencies
|
|
53
|
+
run: |
|
|
54
|
+
python -m pip install --upgrade pip
|
|
55
|
+
python -m pip install -e .[dev]
|
|
56
|
+
|
|
57
|
+
- name: Run coverage
|
|
58
|
+
run: python -m coverage run -m unittest discover -s tests -v
|
|
59
|
+
|
|
60
|
+
- name: Enforce coverage threshold
|
|
61
|
+
run: |
|
|
62
|
+
python -m coverage report --fail-under=70
|
|
63
|
+
python -m coverage xml
|
|
64
|
+
|
|
65
|
+
- name: Upload coverage artifact
|
|
66
|
+
uses: actions/upload-artifact@v7
|
|
67
|
+
with:
|
|
68
|
+
name: coverage-xml
|
|
69
|
+
path: coverage.xml
|
|
70
|
+
|
|
71
|
+
package:
|
|
72
|
+
name: Build package
|
|
73
|
+
runs-on: ubuntu-latest
|
|
74
|
+
steps:
|
|
75
|
+
- name: Check out repository
|
|
76
|
+
uses: actions/checkout@v6
|
|
77
|
+
|
|
78
|
+
- name: Set up Python
|
|
79
|
+
uses: actions/setup-python@v6
|
|
80
|
+
with:
|
|
81
|
+
python-version: "3.11"
|
|
82
|
+
cache: pip
|
|
83
|
+
|
|
84
|
+
- name: Install packaging tools
|
|
85
|
+
run: |
|
|
86
|
+
python -m pip install --upgrade pip
|
|
87
|
+
python -m pip install build twine
|
|
88
|
+
|
|
89
|
+
- name: Build distributions
|
|
90
|
+
run: python -m build
|
|
91
|
+
|
|
92
|
+
- name: Validate distributions
|
|
93
|
+
run: python -m twine check dist/*
|
|
94
|
+
|
|
95
|
+
- name: Upload distribution artifacts
|
|
96
|
+
uses: actions/upload-artifact@v7
|
|
97
|
+
with:
|
|
98
|
+
name: python-package-distributions
|
|
99
|
+
path: dist/*
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- "v*"
|
|
7
|
+
workflow_dispatch:
|
|
8
|
+
|
|
9
|
+
permissions:
|
|
10
|
+
contents: read
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
build:
|
|
14
|
+
name: Build release artifacts
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
steps:
|
|
17
|
+
- name: Check out repository
|
|
18
|
+
uses: actions/checkout@v6
|
|
19
|
+
|
|
20
|
+
- name: Set up Python
|
|
21
|
+
uses: actions/setup-python@v6
|
|
22
|
+
with:
|
|
23
|
+
python-version: "3.11"
|
|
24
|
+
cache: pip
|
|
25
|
+
|
|
26
|
+
- name: Install packaging tools
|
|
27
|
+
run: |
|
|
28
|
+
python -m pip install --upgrade pip
|
|
29
|
+
python -m pip install build twine
|
|
30
|
+
|
|
31
|
+
- name: Build distributions
|
|
32
|
+
run: python -m build
|
|
33
|
+
|
|
34
|
+
- name: Validate distributions
|
|
35
|
+
run: python -m twine check dist/*
|
|
36
|
+
|
|
37
|
+
- name: Upload distribution artifacts
|
|
38
|
+
uses: actions/upload-artifact@v7
|
|
39
|
+
with:
|
|
40
|
+
name: python-package-distributions
|
|
41
|
+
path: dist/*
|
|
42
|
+
|
|
43
|
+
publish:
|
|
44
|
+
name: Publish to PyPI
|
|
45
|
+
needs: build
|
|
46
|
+
runs-on: ubuntu-latest
|
|
47
|
+
permissions:
|
|
48
|
+
id-token: write
|
|
49
|
+
environment:
|
|
50
|
+
name: pypi
|
|
51
|
+
url: https://pypi.org/p/glm-factor-optimizer
|
|
52
|
+
steps:
|
|
53
|
+
- name: Download distribution artifacts
|
|
54
|
+
uses: actions/download-artifact@v8
|
|
55
|
+
with:
|
|
56
|
+
name: python-package-distributions
|
|
57
|
+
path: dist
|
|
58
|
+
|
|
59
|
+
- name: Publish package
|
|
60
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
name: Security
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
pull_request:
|
|
5
|
+
push:
|
|
6
|
+
branches:
|
|
7
|
+
- main
|
|
8
|
+
schedule:
|
|
9
|
+
- cron: "17 4 * * 1"
|
|
10
|
+
workflow_dispatch:
|
|
11
|
+
|
|
12
|
+
permissions:
|
|
13
|
+
contents: read
|
|
14
|
+
|
|
15
|
+
jobs:
|
|
16
|
+
dependency-audit:
|
|
17
|
+
name: Dependency Audit
|
|
18
|
+
runs-on: ubuntu-latest
|
|
19
|
+
steps:
|
|
20
|
+
- name: Check out repository
|
|
21
|
+
uses: actions/checkout@v6
|
|
22
|
+
|
|
23
|
+
- name: Set up Python
|
|
24
|
+
uses: actions/setup-python@v6
|
|
25
|
+
with:
|
|
26
|
+
python-version: "3.11"
|
|
27
|
+
cache: pip
|
|
28
|
+
|
|
29
|
+
- name: Install project and audit tooling
|
|
30
|
+
run: |
|
|
31
|
+
python -m pip install --upgrade pip setuptools
|
|
32
|
+
python -m pip install -e .[dev] pip-audit
|
|
33
|
+
|
|
34
|
+
- name: Run pip-audit
|
|
35
|
+
run: python -m pip_audit
|
|
36
|
+
|
|
37
|
+
bandit:
|
|
38
|
+
name: Bandit
|
|
39
|
+
runs-on: ubuntu-latest
|
|
40
|
+
steps:
|
|
41
|
+
- name: Check out repository
|
|
42
|
+
uses: actions/checkout@v6
|
|
43
|
+
|
|
44
|
+
- name: Set up Python
|
|
45
|
+
uses: actions/setup-python@v6
|
|
46
|
+
with:
|
|
47
|
+
python-version: "3.11"
|
|
48
|
+
cache: pip
|
|
49
|
+
|
|
50
|
+
- name: Install Bandit
|
|
51
|
+
run: |
|
|
52
|
+
python -m pip install --upgrade pip
|
|
53
|
+
python -m pip install bandit[toml]
|
|
54
|
+
|
|
55
|
+
- name: Run Bandit
|
|
56
|
+
run: bandit -q -r glm_factor_optimizer examples
|
|
57
|
+
|
|
58
|
+
gitleaks:
|
|
59
|
+
name: Gitleaks
|
|
60
|
+
runs-on: ubuntu-latest
|
|
61
|
+
steps:
|
|
62
|
+
- name: Check out repository
|
|
63
|
+
uses: actions/checkout@v6
|
|
64
|
+
with:
|
|
65
|
+
fetch-depth: 0
|
|
66
|
+
|
|
67
|
+
- name: Install gitleaks
|
|
68
|
+
run: |
|
|
69
|
+
curl -sSL https://github.com/gitleaks/gitleaks/releases/download/v8.24.3/gitleaks_8.24.3_linux_x64.tar.gz -o gitleaks.tar.gz
|
|
70
|
+
tar -xzf gitleaks.tar.gz gitleaks
|
|
71
|
+
sudo install -m 755 gitleaks /usr/local/bin/gitleaks
|
|
72
|
+
|
|
73
|
+
- name: Run gitleaks
|
|
74
|
+
run: gitleaks detect --source . --redact --exit-code 1
|
|
75
|
+
|
|
76
|
+
codeql:
|
|
77
|
+
name: CodeQL
|
|
78
|
+
runs-on: ubuntu-latest
|
|
79
|
+
permissions:
|
|
80
|
+
actions: read
|
|
81
|
+
contents: read
|
|
82
|
+
security-events: write
|
|
83
|
+
steps:
|
|
84
|
+
- name: Check out repository
|
|
85
|
+
uses: actions/checkout@v6
|
|
86
|
+
|
|
87
|
+
- name: Set up Python
|
|
88
|
+
uses: actions/setup-python@v6
|
|
89
|
+
with:
|
|
90
|
+
python-version: "3.11"
|
|
91
|
+
|
|
92
|
+
- name: Initialize CodeQL
|
|
93
|
+
uses: github/codeql-action/init@v4
|
|
94
|
+
with:
|
|
95
|
+
languages: python
|
|
96
|
+
|
|
97
|
+
- name: Install project dependencies
|
|
98
|
+
run: |
|
|
99
|
+
python -m pip install --upgrade pip
|
|
100
|
+
python -m pip install -e .[dev]
|
|
101
|
+
|
|
102
|
+
- name: Analyze
|
|
103
|
+
uses: github/codeql-action/analyze@v4
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# Contributing
|
|
2
|
+
|
|
3
|
+
Contributions are welcome. Keep changes focused, add or update tests for behavioral changes, and update the public documentation when the user-facing API changes.
|
|
4
|
+
|
|
5
|
+
## Local Setup
|
|
6
|
+
|
|
7
|
+
Create a virtual environment and install the project in editable mode with development dependencies:
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
python -m venv .venv
|
|
11
|
+
. .venv/bin/activate
|
|
12
|
+
python -m pip install --upgrade pip
|
|
13
|
+
python -m pip install -e .[dev]
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
On Windows PowerShell, activate the environment with:
|
|
17
|
+
|
|
18
|
+
```powershell
|
|
19
|
+
.venv\Scripts\Activate.ps1
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Test And Coverage
|
|
23
|
+
|
|
24
|
+
Run the full test suite:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
python -m unittest discover -s tests -v
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
Generate a local coverage report:
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
python -m coverage run -m unittest discover -s tests -v
|
|
34
|
+
python -m coverage report
|
|
35
|
+
python -m coverage xml
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
The default coverage target tracks the core package. The optional Spark backend
|
|
39
|
+
is validated separately because its runtime integration test is gated behind an
|
|
40
|
+
environment flag and extra dependencies.
|
|
41
|
+
|
|
42
|
+
The optional Spark integration test is disabled by default. To run it locally, install the Spark extra and set the integration-test flag:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
python -m pip install -e .[dev,spark]
|
|
46
|
+
GLM_FACTOR_OPTIMIZER_RUN_SPARK_TESTS=1 python -m unittest tests.test_spark_backend -v
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Packaging Checks
|
|
50
|
+
|
|
51
|
+
Before opening a release-oriented pull request, verify the package builds cleanly:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
python -m build
|
|
55
|
+
python -m twine check dist/*
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Pull Requests
|
|
59
|
+
|
|
60
|
+
- Keep each pull request scoped to one change set.
|
|
61
|
+
- Add regression tests for bug fixes and new features.
|
|
62
|
+
- Prefer neutral, domain-free examples in public docs and examples.
|
|
63
|
+
- Avoid committing generated artifacts, virtual environments, or local secrets.
|
|
64
|
+
|
|
65
|
+
## Release Process
|
|
66
|
+
|
|
67
|
+
Create and push a version tag like `v0.1.0` after the main branch is ready. The release workflow builds the distributions, runs a final Twine check, and publishes to PyPI via GitHub trusted publishing once the repository is configured as a trusted publisher in PyPI.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 GLM Factor Optimizer contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: glm-factor-optimizer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Domain-free GLM factor binning, grouping, and model design tools.
|
|
5
|
+
Project-URL: Homepage, https://github.com/csabar/glm-factor-optimizer
|
|
6
|
+
Project-URL: Repository, https://github.com/csabar/glm-factor-optimizer
|
|
7
|
+
Project-URL: Documentation, https://github.com/csabar/glm-factor-optimizer/blob/main/docs/index.md
|
|
8
|
+
Project-URL: Issues, https://github.com/csabar/glm-factor-optimizer/issues
|
|
9
|
+
Author: GLM Factor Optimizer contributors
|
|
10
|
+
License: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Requires-Dist: numpy>=1.23
|
|
19
|
+
Requires-Dist: optuna>=3.0
|
|
20
|
+
Requires-Dist: pandas>=1.5
|
|
21
|
+
Requires-Dist: statsmodels>=0.14
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: build>=1.2; extra == 'dev'
|
|
24
|
+
Requires-Dist: coverage[toml]>=7.6; extra == 'dev'
|
|
25
|
+
Requires-Dist: hatchling>=1.21; extra == 'dev'
|
|
26
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
27
|
+
Requires-Dist: twine>=5.1; extra == 'dev'
|
|
28
|
+
Provides-Extra: mlflow
|
|
29
|
+
Requires-Dist: mlflow>=2.0; extra == 'mlflow'
|
|
30
|
+
Provides-Extra: spark
|
|
31
|
+
Requires-Dist: pyspark>=3.5; extra == 'spark'
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
# glm-factor-optimizer
|
|
35
|
+
|
|
36
|
+
[![CI](https://github.com/csabar/glm-factor-optimizer/actions/workflows/ci.yml/badge.svg)](https://github.com/csabar/glm-factor-optimizer/actions/workflows/ci.yml)
|
|
37
|
+
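[![Security](https://github.com/csabar/glm-factor-optimizer/actions/workflows/security.yml/badge.svg)](https://github.com/csabar/glm-factor-optimizer/actions/workflows/security.yml)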
|
|
38
|
+
[License: MIT](LICENSE)
|
|
39
|
+
[Python 3.10+](pyproject.toml)
|
|
40
|
+
|
|
41
|
+
Simple GLM tools for factor binning, grouping, model screening, and workflow
|
|
42
|
+
automation. The package is domain-free: it works for count-rate, positive
|
|
43
|
+
continuous, and other small GLM modeling problems.
|
|
44
|
+
|
|
45
|
+
Full project documentation is included in the source distribution under `docs/`
|
|
46
|
+
and browsable on GitHub at
|
|
47
|
+
[docs/index.md](https://github.com/csabar/glm-factor-optimizer/blob/main/docs/index.md).
|
|
48
|
+
The docs are organized as tutorials, how-to guides, reference, and explanation.
|
|
49
|
+
|
|
50
|
+
Use `RateGLM` for count-rate models:
|
|
51
|
+
|
|
52
|
+
- count target, like `events`
|
|
53
|
+
- exposure column, like `hours`
|
|
54
|
+
- numeric or categorical factors
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from glm_factor_optimizer import RateGLM, split
|
|
58
|
+
|
|
59
|
+
train, valid, holdout = split(df)
|
|
60
|
+
|
|
61
|
+
glm = RateGLM(target="events", exposure="hours")
|
|
62
|
+
|
|
63
|
+
score_spec = glm.bins(train, "score", bins=5)
|
|
64
|
+
train = glm.apply(train, score_spec)
|
|
65
|
+
valid = glm.apply(valid, score_spec)
|
|
66
|
+
|
|
67
|
+
model = glm.fit(train, factors=[score_spec["output"], "segment"])
|
|
68
|
+
valid = glm.predict(valid, model)
|
|
69
|
+
|
|
70
|
+
report = glm.report(valid)
|
|
71
|
+
print(report["summary"])
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Use `GLM` for other families, such as Gamma cost or duration models:
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from glm_factor_optimizer import GLM
|
|
78
|
+
|
|
79
|
+
glm = GLM(target="severity", family="gamma", prediction="predicted_severity")
|
|
80
|
+
|
|
81
|
+
age_spec = glm.bins(train, "machine_age", bins=6)
|
|
82
|
+
train = glm.apply(train, age_spec)
|
|
83
|
+
valid = glm.apply(valid, age_spec)
|
|
84
|
+
|
|
85
|
+
model = glm.fit(train, factors=[age_spec["output"], "equipment_type"])
|
|
86
|
+
valid = glm.predict(valid, model)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Optimize one factor manually:
|
|
90
|
+
|
|
91
|
+
```python
|
|
92
|
+
result = glm.optimize(
|
|
93
|
+
train,
|
|
94
|
+
valid,
|
|
95
|
+
"score",
|
|
96
|
+
fixed=["segment"],
|
|
97
|
+
trials=50,
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
train = glm.apply(train, result.spec)
|
|
101
|
+
valid = glm.apply(valid, result.spec)
|
|
102
|
+
model = glm.fit(train, factors=[result.output, "segment"])
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
The same optimizer is also exposed as `optimize_bins`:
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from glm_factor_optimizer import optimize_bins
|
|
109
|
+
|
|
110
|
+
result = optimize_bins(
|
|
111
|
+
train,
|
|
112
|
+
valid,
|
|
113
|
+
target="events",
|
|
114
|
+
exposure="hours",
|
|
115
|
+
factor="score",
|
|
116
|
+
)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
Add custom penalties with lambdas or named functions:
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
from glm_factor_optimizer import small_bin_size_penalty, small_count_penalty
|
|
123
|
+
|
|
124
|
+
result = glm.optimize(
|
|
125
|
+
train,
|
|
126
|
+
valid,
|
|
127
|
+
"score",
|
|
128
|
+
penalties={
|
|
129
|
+
"small_count": small_count_penalty(min_count=5, penalty=0.02),
|
|
130
|
+
"many_bins": lambda c: 0.01 * max(c["bin_count"] - 6, 0),
|
|
131
|
+
"gap": lambda c: max(c["validation_deviance"] - c["train_deviance"], 0),
|
|
132
|
+
},
|
|
133
|
+
)
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Penalty callables receive a context dictionary with the selected `spec`, the
|
|
137
|
+
training `bin_table`, train/validation deviance, predictions, transformed
|
|
138
|
+
dataframes, factor name, kind, and fixed factors.
|
|
139
|
+
|
|
140
|
+
Rank candidate factors before detailed optimization:
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
ranking = glm.rank(
|
|
144
|
+
train,
|
|
145
|
+
valid,
|
|
146
|
+
["score", "segment", "region"],
|
|
147
|
+
factor_kinds={"segment": "categorical", "region": "categorical"},
|
|
148
|
+
)
|
|
149
|
+
print(ranking[["factor", "deviance_improvement"]])
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
Run a higher-level sequential workflow with optional ranking, logging, and
|
|
153
|
+
interaction diagnostics:
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
from glm_factor_optimizer import GLMWorkflow
|
|
157
|
+
|
|
158
|
+
workflow = GLMWorkflow(
|
|
159
|
+
target="events",
|
|
160
|
+
family="poisson",
|
|
161
|
+
exposure="hours",
|
|
162
|
+
factor_kinds={"segment": "categorical"},
|
|
163
|
+
trials=50,
|
|
164
|
+
rank_candidates=True,
|
|
165
|
+
top_n=5,
|
|
166
|
+
interaction_diagnostics=True,
|
|
167
|
+
output_dir="runs",
|
|
168
|
+
)
|
|
169
|
+
|
|
170
|
+
result = workflow.fit(df, factors=["score", "segment"])
|
|
171
|
+
print(result.validation_report["summary"])
|
|
172
|
+
print(result.coefficients)
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
For notebook-style iterative model design, use `GLMStudy`:
|
|
176
|
+
|
|
177
|
+
```python
|
|
178
|
+
from glm_factor_optimizer import GLMStudy
|
|
179
|
+
|
|
180
|
+
study = GLMStudy(
|
|
181
|
+
df,
|
|
182
|
+
target="events",
|
|
183
|
+
exposure="hours",
|
|
184
|
+
prediction="predicted_count",
|
|
185
|
+
factor_kinds={"segment": "categorical"},
|
|
186
|
+
)
|
|
187
|
+
|
|
188
|
+
study.split(seed=42)
|
|
189
|
+
ranking = study.rank_candidates(["score", "segment", "region"])
|
|
190
|
+
|
|
191
|
+
score = study.factor("score")
|
|
192
|
+
score.coarse_bins(bins=10)
|
|
193
|
+
score.optimize(trials=100, max_bins=6)
|
|
194
|
+
score.compare()
|
|
195
|
+
score.accept(comment="stable score shape")
|
|
196
|
+
|
|
197
|
+
study.fit_main_effects()
|
|
198
|
+
study.validation_report()
|
|
199
|
+
|
|
200
|
+
refined = study.refine_factor("score", trials=200)
|
|
201
|
+
refined.accept(comment="full-model refinement")
|
|
202
|
+
|
|
203
|
+
study.find_interactions()
|
|
204
|
+
study.finalize()
|
|
205
|
+
study.save("runs")
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
Useful helper modules are available for manual workflows:
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
from glm_factor_optimizer.aggregation import aggregate_rate_table
|
|
212
|
+
from glm_factor_optimizer.diagnostics import find_interactions
|
|
213
|
+
from glm_factor_optimizer.runs import RunLogger
|
|
214
|
+
from glm_factor_optimizer.sampling import stratified_sample
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
Example synthetic datasets live under `examples/` and are not part of the
|
|
218
|
+
installable package API. The examples cover general event-rate, severity, and
|
|
219
|
+
Spark-style workflows across operational and service settings.
|
|
220
|
+
|
|
221
|
+
Use the optional Spark backend in PySpark environments:
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
from glm_factor_optimizer.spark import SparkGLM, SparkGLMWorkflow
|
|
225
|
+
|
|
226
|
+
glm = SparkGLM(
|
|
227
|
+
target="events",
|
|
228
|
+
family="poisson",
|
|
229
|
+
exposure="hours",
|
|
230
|
+
prediction="predicted_count",
|
|
231
|
+
)
|
|
232
|
+
|
|
233
|
+
score_spec = glm.bins(train_sdf, "score", bins=8)
|
|
234
|
+
train_sdf = glm.apply(train_sdf, score_spec)
|
|
235
|
+
valid_sdf = glm.apply(valid_sdf, score_spec)
|
|
236
|
+
|
|
237
|
+
model = glm.fit(train_sdf, factors=[score_spec["output"], "segment"])
|
|
238
|
+
valid_sdf = glm.predict(valid_sdf, model)
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
Spark Optuna optimization runs Optuna on the driver and launches Spark GLM jobs inside
|
|
242
|
+
each trial:
|
|
243
|
+
|
|
244
|
+
```python
|
|
245
|
+
result = glm.optimize(
|
|
246
|
+
train_sdf,
|
|
247
|
+
valid_sdf,
|
|
248
|
+
"score",
|
|
249
|
+
fixed=["segment"],
|
|
250
|
+
trials=30,
|
|
251
|
+
cache_input=True,
|
|
252
|
+
cache_trials=False,
|
|
253
|
+
)
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
Install the package with Spark support using:
|
|
257
|
+
|
|
258
|
+
```bash
|
|
259
|
+
pip install "glm-factor-optimizer[spark]"
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
All binning and grouping specs are plain JSON-serializable dictionaries.
|
|
263
|
+
|
|
264
|
+
## Contributing
|
|
265
|
+
|
|
266
|
+
Development setup, test commands, coverage, and release notes are documented in
|
|
267
|
+
[`CONTRIBUTING.md`](CONTRIBUTING.md).
|