glm-factor-optimizer 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. glm_factor_optimizer-0.1.0/.github/dependabot.yml +13 -0
  2. glm_factor_optimizer-0.1.0/.github/workflows/ci.yml +99 -0
  3. glm_factor_optimizer-0.1.0/.github/workflows/release.yml +60 -0
  4. glm_factor_optimizer-0.1.0/.github/workflows/security.yml +103 -0
  5. glm_factor_optimizer-0.1.0/.gitignore +15 -0
  6. glm_factor_optimizer-0.1.0/CONTRIBUTING.md +67 -0
  7. glm_factor_optimizer-0.1.0/LICENSE +21 -0
  8. glm_factor_optimizer-0.1.0/PKG-INFO +267 -0
  9. glm_factor_optimizer-0.1.0/README.md +234 -0
  10. glm_factor_optimizer-0.1.0/docs/explanation/architecture.md +121 -0
  11. glm_factor_optimizer-0.1.0/docs/explanation/modeling_principles.md +109 -0
  12. glm_factor_optimizer-0.1.0/docs/how-to/rank_candidate_factors.md +61 -0
  13. glm_factor_optimizer-0.1.0/docs/how-to/refine_factors.md +70 -0
  14. glm_factor_optimizer-0.1.0/docs/how-to/run_automatic_workflow.md +72 -0
  15. glm_factor_optimizer-0.1.0/docs/how-to/save_and_audit.md +71 -0
  16. glm_factor_optimizer-0.1.0/docs/how-to/test_interactions.md +78 -0
  17. glm_factor_optimizer-0.1.0/docs/index.md +76 -0
  18. glm_factor_optimizer-0.1.0/docs/reference/api.md +179 -0
  19. glm_factor_optimizer-0.1.0/docs/reference/specs.md +117 -0
  20. glm_factor_optimizer-0.1.0/docs/reference/validation_outputs.md +112 -0
  21. glm_factor_optimizer-0.1.0/docs/tutorials/notebook_study_workflow.md +290 -0
  22. glm_factor_optimizer-0.1.0/examples/data.py +47 -0
  23. glm_factor_optimizer-0.1.0/examples/run_binning_optimization.py +71 -0
  24. glm_factor_optimizer-0.1.0/examples/run_frequency_model.py +9 -0
  25. glm_factor_optimizer-0.1.0/examples/run_gamma_severity_model.py +63 -0
  26. glm_factor_optimizer-0.1.0/examples/run_manual_count_rate_model.py +58 -0
  27. glm_factor_optimizer-0.1.0/examples/run_spark_backend.py +81 -0
  28. glm_factor_optimizer-0.1.0/glm_factor_optimizer/__init__.py +83 -0
  29. glm_factor_optimizer-0.1.0/glm_factor_optimizer/aggregation.py +136 -0
  30. glm_factor_optimizer-0.1.0/glm_factor_optimizer/bins.py +257 -0
  31. glm_factor_optimizer-0.1.0/glm_factor_optimizer/core.py +330 -0
  32. glm_factor_optimizer-0.1.0/glm_factor_optimizer/diagnostics.py +236 -0
  33. glm_factor_optimizer-0.1.0/glm_factor_optimizer/factor.py +354 -0
  34. glm_factor_optimizer-0.1.0/glm_factor_optimizer/metrics.py +334 -0
  35. glm_factor_optimizer-0.1.0/glm_factor_optimizer/model.py +269 -0
  36. glm_factor_optimizer-0.1.0/glm_factor_optimizer/optimize.py +391 -0
  37. glm_factor_optimizer-0.1.0/glm_factor_optimizer/penalties.py +167 -0
  38. glm_factor_optimizer-0.1.0/glm_factor_optimizer/runs.py +192 -0
  39. glm_factor_optimizer-0.1.0/glm_factor_optimizer/sampling.py +173 -0
  40. glm_factor_optimizer-0.1.0/glm_factor_optimizer/screening.py +341 -0
  41. glm_factor_optimizer-0.1.0/glm_factor_optimizer/spark/__init__.py +30 -0
  42. glm_factor_optimizer-0.1.0/glm_factor_optimizer/spark/_deps.py +72 -0
  43. glm_factor_optimizer-0.1.0/glm_factor_optimizer/spark/aggregation.py +143 -0
  44. glm_factor_optimizer-0.1.0/glm_factor_optimizer/spark/bins.py +247 -0
  45. glm_factor_optimizer-0.1.0/glm_factor_optimizer/spark/core.py +240 -0
  46. glm_factor_optimizer-0.1.0/glm_factor_optimizer/spark/metrics.py +165 -0
  47. glm_factor_optimizer-0.1.0/glm_factor_optimizer/spark/model.py +185 -0
  48. glm_factor_optimizer-0.1.0/glm_factor_optimizer/spark/optimize.py +382 -0
  49. glm_factor_optimizer-0.1.0/glm_factor_optimizer/spark/split.py +67 -0
  50. glm_factor_optimizer-0.1.0/glm_factor_optimizer/spark/workflow.py +203 -0
  51. glm_factor_optimizer-0.1.0/glm_factor_optimizer/split.py +58 -0
  52. glm_factor_optimizer-0.1.0/glm_factor_optimizer/study.py +1096 -0
  53. glm_factor_optimizer-0.1.0/glm_factor_optimizer/validation.py +387 -0
  54. glm_factor_optimizer-0.1.0/glm_factor_optimizer/workflow.py +455 -0
  55. glm_factor_optimizer-0.1.0/pyproject.toml +62 -0
  56. glm_factor_optimizer-0.1.0/tests/test_api.py +86 -0
  57. glm_factor_optimizer-0.1.0/tests/test_binning.py +52 -0
  58. glm_factor_optimizer-0.1.0/tests/test_docs.py +127 -0
  59. glm_factor_optimizer-0.1.0/tests/test_helpers.py +120 -0
  60. glm_factor_optimizer-0.1.0/tests/test_metrics.py +41 -0
  61. glm_factor_optimizer-0.1.0/tests/test_optimize.py +135 -0
  62. glm_factor_optimizer-0.1.0/tests/test_screening.py +72 -0
  63. glm_factor_optimizer-0.1.0/tests/test_spark_backend.py +152 -0
  64. glm_factor_optimizer-0.1.0/tests/test_split.py +23 -0
  65. glm_factor_optimizer-0.1.0/tests/test_study.py +161 -0
  66. glm_factor_optimizer-0.1.0/tests/test_workflow.py +59 -0
@@ -0,0 +1,13 @@
1
+ version: 2
2
+ updates:
3
+ - package-ecosystem: "pip"
4
+ directory: "/"
5
+ schedule:
6
+ interval: weekly
7
+ open-pull-requests-limit: 10
8
+
9
+ - package-ecosystem: "github-actions"
10
+ directory: "/"
11
+ schedule:
12
+ interval: weekly
13
+ open-pull-requests-limit: 10
@@ -0,0 +1,99 @@
1
+ name: CI
2
+
3
+ on:
4
+ pull_request:
5
+ push:
6
+ branches:
7
+ - main
8
+ workflow_dispatch:
9
+
10
+ permissions:
11
+ contents: read
12
+
13
+ jobs:
14
+ tests:
15
+ name: Tests (Python ${{ matrix.python-version }})
16
+ runs-on: ubuntu-latest
17
+ strategy:
18
+ fail-fast: false
19
+ matrix:
20
+ python-version: ["3.10", "3.11", "3.12"]
21
+ steps:
22
+ - name: Check out repository
23
+ uses: actions/checkout@v6
24
+
25
+ - name: Set up Python
26
+ uses: actions/setup-python@v6
27
+ with:
28
+ python-version: ${{ matrix.python-version }}
29
+ cache: pip
30
+
31
+ - name: Install project and test dependencies
32
+ run: |
33
+ python -m pip install --upgrade pip
34
+ python -m pip install -e .[dev]
35
+
36
+ - name: Run unit tests
37
+ run: python -m unittest discover -s tests -v
38
+
39
+ coverage:
40
+ name: Coverage
41
+ runs-on: ubuntu-latest
42
+ steps:
43
+ - name: Check out repository
44
+ uses: actions/checkout@v6
45
+
46
+ - name: Set up Python
47
+ uses: actions/setup-python@v6
48
+ with:
49
+ python-version: "3.11"
50
+ cache: pip
51
+
52
+ - name: Install project and coverage dependencies
53
+ run: |
54
+ python -m pip install --upgrade pip
55
+ python -m pip install -e .[dev]
56
+
57
+ - name: Run coverage
58
+ run: python -m coverage run -m unittest discover -s tests -v
59
+
60
+ - name: Enforce coverage threshold
61
+ run: |
62
+ python -m coverage report --fail-under=70
63
+ python -m coverage xml
64
+
65
+ - name: Upload coverage artifact
66
+ uses: actions/upload-artifact@v7
67
+ with:
68
+ name: coverage-xml
69
+ path: coverage.xml
70
+
71
+ package:
72
+ name: Build package
73
+ runs-on: ubuntu-latest
74
+ steps:
75
+ - name: Check out repository
76
+ uses: actions/checkout@v6
77
+
78
+ - name: Set up Python
79
+ uses: actions/setup-python@v6
80
+ with:
81
+ python-version: "3.11"
82
+ cache: pip
83
+
84
+ - name: Install packaging tools
85
+ run: |
86
+ python -m pip install --upgrade pip
87
+ python -m pip install build twine
88
+
89
+ - name: Build distributions
90
+ run: python -m build
91
+
92
+ - name: Validate distributions
93
+ run: python -m twine check dist/*
94
+
95
+ - name: Upload distribution artifacts
96
+ uses: actions/upload-artifact@v7
97
+ with:
98
+ name: python-package-distributions
99
+ path: dist/*
@@ -0,0 +1,60 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - "v*"
7
+ workflow_dispatch:
8
+
9
+ permissions:
10
+ contents: read
11
+
12
+ jobs:
13
+ build:
14
+ name: Build release artifacts
15
+ runs-on: ubuntu-latest
16
+ steps:
17
+ - name: Check out repository
18
+ uses: actions/checkout@v6
19
+
20
+ - name: Set up Python
21
+ uses: actions/setup-python@v6
22
+ with:
23
+ python-version: "3.11"
24
+ cache: pip
25
+
26
+ - name: Install packaging tools
27
+ run: |
28
+ python -m pip install --upgrade pip
29
+ python -m pip install build twine
30
+
31
+ - name: Build distributions
32
+ run: python -m build
33
+
34
+ - name: Validate distributions
35
+ run: python -m twine check dist/*
36
+
37
+ - name: Upload distribution artifacts
38
+ uses: actions/upload-artifact@v7
39
+ with:
40
+ name: python-package-distributions
41
+ path: dist/*
42
+
43
+ publish:
44
+ name: Publish to PyPI
45
+ needs: build
46
+ runs-on: ubuntu-latest
47
+ permissions:
48
+ id-token: write
49
+ environment:
50
+ name: pypi
51
+ url: https://pypi.org/p/glm-factor-optimizer
52
+ steps:
53
+ - name: Download distribution artifacts
54
+ uses: actions/download-artifact@v8
55
+ with:
56
+ name: python-package-distributions
57
+ path: dist
58
+
59
+ - name: Publish package
60
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,103 @@
1
+ name: Security
2
+
3
+ on:
4
+ pull_request:
5
+ push:
6
+ branches:
7
+ - main
8
+ schedule:
9
+ - cron: "17 4 * * 1"
10
+ workflow_dispatch:
11
+
12
+ permissions:
13
+ contents: read
14
+
15
+ jobs:
16
+ dependency-audit:
17
+ name: Dependency Audit
18
+ runs-on: ubuntu-latest
19
+ steps:
20
+ - name: Check out repository
21
+ uses: actions/checkout@v6
22
+
23
+ - name: Set up Python
24
+ uses: actions/setup-python@v6
25
+ with:
26
+ python-version: "3.11"
27
+ cache: pip
28
+
29
+ - name: Install project and audit tooling
30
+ run: |
31
+ python -m pip install --upgrade pip setuptools
32
+ python -m pip install -e .[dev] pip-audit
33
+
34
+ - name: Run pip-audit
35
+ run: python -m pip_audit
36
+
37
+ bandit:
38
+ name: Bandit
39
+ runs-on: ubuntu-latest
40
+ steps:
41
+ - name: Check out repository
42
+ uses: actions/checkout@v6
43
+
44
+ - name: Set up Python
45
+ uses: actions/setup-python@v6
46
+ with:
47
+ python-version: "3.11"
48
+ cache: pip
49
+
50
+ - name: Install Bandit
51
+ run: |
52
+ python -m pip install --upgrade pip
53
+ python -m pip install bandit[toml]
54
+
55
+ - name: Run Bandit
56
+ run: bandit -q -r glm_factor_optimizer examples
57
+
58
+ gitleaks:
59
+ name: Gitleaks
60
+ runs-on: ubuntu-latest
61
+ steps:
62
+ - name: Check out repository
63
+ uses: actions/checkout@v6
64
+ with:
65
+ fetch-depth: 0
66
+
67
+ - name: Install gitleaks
68
+ run: |
69
+ curl -sSL https://github.com/gitleaks/gitleaks/releases/download/v8.24.3/gitleaks_8.24.3_linux_x64.tar.gz -o gitleaks.tar.gz
70
+ tar -xzf gitleaks.tar.gz gitleaks
71
+ sudo install -m 755 gitleaks /usr/local/bin/gitleaks
72
+
73
+ - name: Run gitleaks
74
+ run: gitleaks detect --source . --redact --exit-code 1
75
+
76
+ codeql:
77
+ name: CodeQL
78
+ runs-on: ubuntu-latest
79
+ permissions:
80
+ actions: read
81
+ contents: read
82
+ security-events: write
83
+ steps:
84
+ - name: Check out repository
85
+ uses: actions/checkout@v6
86
+
87
+ - name: Set up Python
88
+ uses: actions/setup-python@v6
89
+ with:
90
+ python-version: "3.11"
91
+
92
+ - name: Initialize CodeQL
93
+ uses: github/codeql-action/init@v4
94
+ with:
95
+ languages: python
96
+
97
+ - name: Install project dependencies
98
+ run: |
99
+ python -m pip install --upgrade pip
100
+ python -m pip install -e .[dev]
101
+
102
+ - name: Analyze
103
+ uses: github/codeql-action/analyze@v4
@@ -0,0 +1,15 @@
1
+ **/__pycache__/
2
+ **/*.pyc
3
+ .coverage
4
+ .coverage.*
5
+ .env
6
+ .env.*
7
+ .pytest_cache/
8
+ .ruff_cache/
9
+ .venv/
10
+ runs/
11
+ dist/**
12
+ build/**
13
+ *.egg-info/
14
+ coverage.xml
15
+ htmlcov/
@@ -0,0 +1,67 @@
1
+ # Contributing
2
+
3
+ Contributions are welcome. Keep changes focused, add or update tests for behavioral changes, and update the public documentation when the user-facing API changes.
4
+
5
+ ## Local Setup
6
+
7
+ Create a virtual environment and install the project in editable mode with development dependencies:
8
+
9
+ ```bash
10
+ python -m venv .venv
11
+ . .venv/bin/activate
12
+ python -m pip install --upgrade pip
13
+ python -m pip install -e .[dev]
14
+ ```
15
+
16
+ On Windows PowerShell, activate the environment with:
17
+
18
+ ```powershell
19
+ .venv\Scripts\Activate.ps1
20
+ ```
21
+
22
+ ## Tests And Coverage
23
+
24
+ Run the full test suite:
25
+
26
+ ```bash
27
+ python -m unittest discover -s tests -v
28
+ ```
29
+
30
+ Generate a local coverage report:
31
+
32
+ ```bash
33
+ python -m coverage run -m unittest discover -s tests -v
34
+ python -m coverage report
35
+ python -m coverage xml
36
+ ```
37
+
38
+ The default coverage target tracks the core package. The optional Spark backend
39
+ is validated separately because its runtime integration test is gated behind an
40
+ environment flag and extra dependencies.
41
+
42
+ The optional Spark integration test is disabled by default. To run it locally, install the Spark extra and set the integration-test flag:
43
+
44
+ ```bash
45
+ python -m pip install -e .[dev,spark]
46
+ GLM_FACTOR_OPTIMIZER_RUN_SPARK_TESTS=1 python -m unittest tests.test_spark_backend -v
47
+ ```
48
+
49
+ ## Packaging Checks
50
+
51
+ Before opening a release-oriented pull request, verify the package builds cleanly:
52
+
53
+ ```bash
54
+ python -m build
55
+ python -m twine check dist/*
56
+ ```
57
+
58
+ ## Pull Requests
59
+
60
+ - Keep each pull request scoped to one change set.
61
+ - Add regression tests for bug fixes and new features.
62
+ - Prefer neutral, domain-free examples in public docs and examples.
63
+ - Avoid committing generated artifacts, virtual environments, or local secrets.
64
+
65
+ ## Release Process
66
+
67
+ Create and push a version tag like `v0.1.0` after the main branch is ready. The release workflow builds the distributions, runs a final Twine check, and publishes to PyPI via GitHub trusted publishing once the repository is configured as a trusted publisher on PyPI.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 GLM Factor Optimizer contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,267 @@
1
+ Metadata-Version: 2.4
2
+ Name: glm-factor-optimizer
3
+ Version: 0.1.0
4
+ Summary: Domain-free GLM factor binning, grouping, and model design tools.
5
+ Project-URL: Homepage, https://github.com/csabar/glm-factor-optimizer
6
+ Project-URL: Repository, https://github.com/csabar/glm-factor-optimizer
7
+ Project-URL: Documentation, https://github.com/csabar/glm-factor-optimizer/blob/main/docs/index.md
8
+ Project-URL: Issues, https://github.com/csabar/glm-factor-optimizer/issues
9
+ Author: GLM Factor Optimizer contributors
10
+ License: MIT
11
+ License-File: LICENSE
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3 :: Only
17
+ Requires-Python: >=3.10
18
+ Requires-Dist: numpy>=1.23
19
+ Requires-Dist: optuna>=3.0
20
+ Requires-Dist: pandas>=1.5
21
+ Requires-Dist: statsmodels>=0.14
22
+ Provides-Extra: dev
23
+ Requires-Dist: build>=1.2; extra == 'dev'
24
+ Requires-Dist: coverage[toml]>=7.6; extra == 'dev'
25
+ Requires-Dist: hatchling>=1.21; extra == 'dev'
26
+ Requires-Dist: pytest>=8.0; extra == 'dev'
27
+ Requires-Dist: twine>=5.1; extra == 'dev'
28
+ Provides-Extra: mlflow
29
+ Requires-Dist: mlflow>=2.0; extra == 'mlflow'
30
+ Provides-Extra: spark
31
+ Requires-Dist: pyspark>=3.5; extra == 'spark'
32
+ Description-Content-Type: text/markdown
33
+
34
+ # glm-factor-optimizer
35
+
36
+ [![CI](https://github.com/csabar/glm-factor-optimizer/actions/workflows/ci.yml/badge.svg)](https://github.com/csabar/glm-factor-optimizer/actions/workflows/ci.yml)
37
+ [![Security](https://github.com/csabar/glm-factor-optimizer/actions/workflows/security.yml/badge.svg)](https://github.com/csabar/glm-factor-optimizer/actions/workflows/security.yml)
38
+ [![License](https://img.shields.io/github/license/csabar/glm-factor-optimizer)](LICENSE)
39
+ [![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue)](pyproject.toml)
40
+
41
+ Simple GLM tools for factor binning, grouping, model screening, and workflow
42
+ automation. The package is domain-free: it works for count-rate, positive
43
+ continuous, and other small GLM modeling problems.
44
+
45
+ Full project documentation is included in the source distribution under `docs/`
46
+ and browsable on GitHub at
47
+ [docs/index.md](https://github.com/csabar/glm-factor-optimizer/blob/main/docs/index.md).
48
+ The docs are organized as tutorials, how-to guides, reference, and explanation.
49
+
50
+ Use `RateGLM` for count-rate models:
51
+
52
+ - count target, like `events`
53
+ - exposure column, like `hours`
54
+ - numeric or categorical factors
55
+
56
+ ```python
57
+ from glm_factor_optimizer import RateGLM, split
58
+
59
+ train, valid, holdout = split(df)
60
+
61
+ glm = RateGLM(target="events", exposure="hours")
62
+
63
+ score_spec = glm.bins(train, "score", bins=5)
64
+ train = glm.apply(train, score_spec)
65
+ valid = glm.apply(valid, score_spec)
66
+
67
+ model = glm.fit(train, factors=[score_spec["output"], "segment"])
68
+ valid = glm.predict(valid, model)
69
+
70
+ report = glm.report(valid)
71
+ print(report["summary"])
72
+ ```
73
+
74
+ Use `GLM` for other families, such as Gamma cost or duration models:
75
+
76
+ ```python
77
+ from glm_factor_optimizer import GLM
78
+
79
+ glm = GLM(target="severity", family="gamma", prediction="predicted_severity")
80
+
81
+ age_spec = glm.bins(train, "machine_age", bins=6)
82
+ train = glm.apply(train, age_spec)
83
+ valid = glm.apply(valid, age_spec)
84
+
85
+ model = glm.fit(train, factors=[age_spec["output"], "equipment_type"])
86
+ valid = glm.predict(valid, model)
87
+ ```
88
+
89
+ Optimize one factor manually:
90
+
91
+ ```python
92
+ result = glm.optimize(
93
+ train,
94
+ valid,
95
+ "score",
96
+ fixed=["segment"],
97
+ trials=50,
98
+ )
99
+
100
+ train = glm.apply(train, result.spec)
101
+ valid = glm.apply(valid, result.spec)
102
+ model = glm.fit(train, factors=[result.output, "segment"])
103
+ ```
104
+
105
+ The same optimizer is also exposed as `optimize_bins`:
106
+
107
+ ```python
108
+ from glm_factor_optimizer import optimize_bins
109
+
110
+ result = optimize_bins(
111
+ train,
112
+ valid,
113
+ target="events",
114
+ exposure="hours",
115
+ factor="score",
116
+ )
117
+ ```
118
+
119
+ Add custom penalties with lambdas or named functions:
120
+
121
+ ```python
122
+ from glm_factor_optimizer import small_bin_size_penalty, small_count_penalty
123
+
124
+ result = glm.optimize(
125
+ train,
126
+ valid,
127
+ "score",
128
+ penalties={
129
+ "small_count": small_count_penalty(min_count=5, penalty=0.02),
130
+ "many_bins": lambda c: 0.01 * max(c["bin_count"] - 6, 0),
131
+ "gap": lambda c: max(c["validation_deviance"] - c["train_deviance"], 0),
132
+ },
133
+ )
134
+ ```
135
+
136
+ Penalty callables receive a context dictionary with the selected `spec`, the
137
+ training `bin_table`, train/validation deviance, predictions, transformed
138
+ dataframes, factor name, kind, and fixed factors.
139
+
140
+ Rank candidate factors before detailed optimization:
141
+
142
+ ```python
143
+ ranking = glm.rank(
144
+ train,
145
+ valid,
146
+ ["score", "segment", "region"],
147
+ factor_kinds={"segment": "categorical", "region": "categorical"},
148
+ )
149
+ print(ranking[["factor", "deviance_improvement"]])
150
+ ```
151
+
152
+ Run a higher-level sequential workflow with optional ranking, logging, and
153
+ interaction diagnostics:
154
+
155
+ ```python
156
+ from glm_factor_optimizer import GLMWorkflow
157
+
158
+ workflow = GLMWorkflow(
159
+ target="events",
160
+ family="poisson",
161
+ exposure="hours",
162
+ factor_kinds={"segment": "categorical"},
163
+ trials=50,
164
+ rank_candidates=True,
165
+ top_n=5,
166
+ interaction_diagnostics=True,
167
+ output_dir="runs",
168
+ )
169
+
170
+ result = workflow.fit(df, factors=["score", "segment"])
171
+ print(result.validation_report["summary"])
172
+ print(result.coefficients)
173
+ ```
174
+
175
+ For notebook-style iterative model design, use `GLMStudy`:
176
+
177
+ ```python
178
+ from glm_factor_optimizer import GLMStudy
179
+
180
+ study = GLMStudy(
181
+ df,
182
+ target="events",
183
+ exposure="hours",
184
+ prediction="predicted_count",
185
+ factor_kinds={"segment": "categorical"},
186
+ )
187
+
188
+ study.split(seed=42)
189
+ ranking = study.rank_candidates(["score", "segment", "region"])
190
+
191
+ score = study.factor("score")
192
+ score.coarse_bins(bins=10)
193
+ score.optimize(trials=100, max_bins=6)
194
+ score.compare()
195
+ score.accept(comment="stable score shape")
196
+
197
+ study.fit_main_effects()
198
+ study.validation_report()
199
+
200
+ refined = study.refine_factor("score", trials=200)
201
+ refined.accept(comment="full-model refinement")
202
+
203
+ study.find_interactions()
204
+ study.finalize()
205
+ study.save("runs")
206
+ ```
207
+
208
+ Useful helper modules are available for manual workflows:
209
+
210
+ ```python
211
+ from glm_factor_optimizer.aggregation import aggregate_rate_table
212
+ from glm_factor_optimizer.diagnostics import find_interactions
213
+ from glm_factor_optimizer.runs import RunLogger
214
+ from glm_factor_optimizer.sampling import stratified_sample
215
+ ```
216
+
217
+ Example synthetic datasets live under `examples/` and are not part of the
218
+ installable package API. The examples cover general event-rate, severity, and
219
+ Spark-style workflows across operational and service settings.
220
+
221
+ Use the optional Spark backend in PySpark environments:
222
+
223
+ ```python
224
+ from glm_factor_optimizer.spark import SparkGLM, SparkGLMWorkflow
225
+
226
+ glm = SparkGLM(
227
+ target="events",
228
+ family="poisson",
229
+ exposure="hours",
230
+ prediction="predicted_count",
231
+ )
232
+
233
+ score_spec = glm.bins(train_sdf, "score", bins=8)
234
+ train_sdf = glm.apply(train_sdf, score_spec)
235
+ valid_sdf = glm.apply(valid_sdf, score_spec)
236
+
237
+ model = glm.fit(train_sdf, factors=[score_spec["output"], "segment"])
238
+ valid_sdf = glm.predict(valid_sdf, model)
239
+ ```
240
+
241
+ Spark Optuna optimization runs Optuna on the driver and executes Spark GLM jobs inside
242
+ each trial:
243
+
244
+ ```python
245
+ result = glm.optimize(
246
+ train_sdf,
247
+ valid_sdf,
248
+ "score",
249
+ fixed=["segment"],
250
+ trials=30,
251
+ cache_input=True,
252
+ cache_trials=False,
253
+ )
254
+ ```
255
+
256
+ Install locally with Spark support using:
257
+
258
+ ```bash
259
+ pip install "glm-factor-optimizer[spark]"
260
+ ```
261
+
262
+ All binning and grouping specs are plain JSON-serializable dictionaries.
263
+
264
+ ## Contributing
265
+
266
+ Development setup, test commands, coverage, and release notes are documented in
267
+ [`CONTRIBUTING.md`](CONTRIBUTING.md).