dlm-murabei-package 1.0.0b0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dlm_murabei_package-1.0.0b0/PKG-INFO +142 -0
- dlm_murabei_package-1.0.0b0/README.md +122 -0
- dlm_murabei_package-1.0.0b0/pyproject.toml +74 -0
- dlm_murabei_package-1.0.0b0/src/dlm/__init__.py +0 -0
- dlm_murabei_package-1.0.0b0/src/dlm/data/__init__.py +0 -0
- dlm_murabei_package-1.0.0b0/src/dlm/data/aux.py +287 -0
- dlm_murabei_package-1.0.0b0/src/dlm/model.py +1311 -0
- dlm_murabei_package-1.0.0b0/src/dlm/update/__init__.py +0 -0
- dlm_murabei_package-1.0.0b0/src/dlm/update/intervention.py +173 -0
- dlm_murabei_package-1.0.0b0/src/dlm/utils/__init__.py +0 -0
- dlm_murabei_package-1.0.0b0/src/dlm/utils/summary.py +114 -0
- dlm_murabei_package-1.0.0b0/src/dlm/zzz__model_example.py +57 -0
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dlm-murabei-package
|
|
3
|
+
Version: 1.0.0b0
|
|
4
|
+
Summary:
|
|
5
|
+
Author: Áurea Fonseca
|
|
6
|
+
Author-email: aurea.fonseca@murabei.com
|
|
7
|
+
Requires-Python: >=3.12
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
12
|
+
Requires-Dist: numpy (==2.3.2)
|
|
13
|
+
Requires-Dist: pandas (==2.3.1)
|
|
14
|
+
Requires-Dist: pyarrow (==21.0.0)
|
|
15
|
+
Requires-Dist: scipy (==1.16.1)
|
|
16
|
+
Requires-Dist: statsmodels (==0.14.5)
|
|
17
|
+
Requires-Dist: tqdm (==4.67.1)
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# murabei_models — Dynamic Linear Models (DLM)
|
|
21
|
+
Pacote Python para modelos DLM com regressoras exógenas, incluindo ajuste, previsão, atualização online e backtesting, voltado para séries temporais de negócios.
|
|
22
|
+
|
|
23
|
+
## Instalação
|
|
24
|
+
- Fonte local (desenvolvimento):
|
|
25
|
+
- git clone <URL_DO_REPO>
|
|
26
|
+
- cd <PASTA_DO_REPO>
|
|
27
|
+
- pip install -r src/requirements/python_requirements.txt
|
|
28
|
+
- bash build.bash
|
|
29
|
+
- pip install dist/dlm_murabei_package-<VERSION>.tar.gz
|
|
30
|
+
|
|
31
|
+
- Requisitos principais (definidos no pyproject.toml e requirements):
|
|
32
|
+
- pandas==2.3.1
|
|
33
|
+
- numpy==2.3.2
|
|
34
|
+
- statsmodels==0.14.5
|
|
35
|
+
- scipy==1.16.1
|
|
36
|
+
- tqdm==4.67.1
|
|
37
|
+
- pyarrow==21.0.0
|
|
38
|
+
- dynm (via URL zip: https://github.com/EduardoGPinheiro/dynm/archive/refs/tags/v0.0.2.zip)
|
|
39
|
+
|
|
40
|
+
Observações:
|
|
41
|
+
- Garanta acesso à Internet para instalar a dependência dynm via URL.
|
|
42
|
+
- Versão mínima do Python: 3.12 (definida em requires-python no pyproject.toml).
|
|
43
|
+
|
|
44
|
+
## Exemplo rápido
|
|
45
|
+
Exemplo mínimo baseado em src/dlm/zzz__model_example.py com fit, predict, update e cross-validate:
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
import numpy as np
|
|
49
|
+
import pandas as pd
|
|
50
|
+
from dlm.model import DynamicLinearModel
|
|
51
|
+
|
|
52
|
+
# Dados sintéticos
|
|
53
|
+
np.random.seed(123)
|
|
54
|
+
n = 24
|
|
55
|
+
time = pd.date_range(start="2020-01-01", periods=n, freq="MS")
|
|
56
|
+
x1 = np.linspace(0, 1, n) + 0.1 * np.random.randn(n)
|
|
57
|
+
x2 = np.linspace(1, 0, n) + 0.1 * np.random.randn(n)
|
|
58
|
+
y = 0.7 * x1 - 0.4 * x2 + 0.1 * np.random.randn(n)
|
|
59
|
+
|
|
60
|
+
data = pd.DataFrame({"time": time, "y": y, "x1": x1, "x2": x2}).set_index("time")
|
|
61
|
+
y = data[["y"]]
|
|
62
|
+
X = data[["x1", "x2"]]
|
|
63
|
+
|
|
64
|
+
# Instancia e ajusta
|
|
65
|
+
dlm_model = DynamicLinearModel(
|
|
66
|
+
trend=False,
|
|
67
|
+
monitoring=True,
|
|
68
|
+
start_predictions_at=12,
|
|
69
|
+
)
|
|
70
|
+
dlm_model.fit(X=X, y=y)
|
|
71
|
+
|
|
72
|
+
# Predição
|
|
73
|
+
X_new = np.array([[1.15, 0.05], [1.20, 0.04], [1.25, 0.03]])
|
|
74
|
+
y_pred = dlm_model.predict(X=X_new)
|
|
75
|
+
|
|
76
|
+
# Atualização online
|
|
77
|
+
y_new = 0.72
|
|
78
|
+
X_new = np.array([[1.18, 0.06]])
|
|
79
|
+
dlm_model.update(y=y_new, X=X_new)
|
|
80
|
+
|
|
81
|
+
# Backtesting / Cross-validation
|
|
82
|
+
dlm_model.cross_validate(X=X, y=y, K=12)
|
|
83
|
+
|
|
84
|
+
# Acessos úteis (atributos/métodos)
|
|
85
|
+
_ = dlm_model.get_predictive
|
|
86
|
+
_ = dlm_model.get_params
|
|
87
|
+
_ = dlm_model.get_monitor
|
|
88
|
+
_ = dlm_model.summary
|
|
89
|
+
_ = dlm_model.llk
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Estrutura do projeto
|
|
93
|
+
- src/dlm/
|
|
94
|
+
- model.py: implementação principal do modelo DLM
|
|
95
|
+
- data/aux.py: utilitários de dados
|
|
96
|
+
- update/intervention.py: rotinas de atualização/intervenções
|
|
97
|
+
- utils/summary.py: sumários e utilidades
|
|
98
|
+
- zzz__model_example.py: exemplo completo de uso
|
|
99
|
+
|
|
100
|
+
- src/tests/
|
|
101
|
+
- test_template.py e data/00__data.parquet para reprodução local
|
|
102
|
+
|
|
103
|
+
- Empacotamento:
|
|
104
|
+
- Layout src/, MANIFEST.in e package_data incluindo arquivos .parquet (dlm.data.examples)
|
|
105
|
+
|
|
106
|
+
- Automação e scripts:
|
|
107
|
+
- bitbucket-pipelines.yml, build.sh, check_local_linter.bash
|
|
108
|
+
|
|
109
|
+
- Versionamento:
|
|
110
|
+
- VERSION (usado no setup.py)
|
|
111
|
+
- changelog.md
|
|
112
|
+
|
|
113
|
+
## Desenvolvimento
|
|
114
|
+
- Estilo e lint:
|
|
115
|
+
- Ruff habilitado com: Pyflakes (F), pycodestyle (E, W), pydocstyle (D, convenção Google), isort (I), pandas-vet (PD), pep8-naming (N), bandit (S)
|
|
116
|
+
- Largura de linha: 79
|
|
117
|
+
- ignore: E402, N806, I001
|
|
118
|
+
|
|
119
|
+
- Fluxo sugerido:
|
|
120
|
+
- Criar venv e instalar localmente: pip install -e .
|
|
121
|
+
- Lint: ./check_local_linter.bash
|
|
122
|
+
- Testes: pytest em src/tests
|
|
123
|
+
|
|
124
|
+
## Guia de uso
|
|
125
|
+
- Importação principal:
|
|
126
|
+
- from dlm.model import DynamicLinearModel
|
|
127
|
+
|
|
128
|
+
- Fluxo típico:
|
|
129
|
+
- Preparar X (regressoras) e y (alvo) como pandas/numpy
|
|
130
|
+
- Ajustar: model.fit(X=X, y=y)
|
|
131
|
+
- Prever: model.predict(X=X_new)
|
|
132
|
+
- Atualizar online: model.update(y=y_new, X=X_new)
|
|
133
|
+
- Backtesting: model.cross_validate(X=X, y=y, K=...)
|
|
134
|
+
|
|
135
|
+
- Atributos e diagnósticos:
|
|
136
|
+
- model.get_predictive, model.get_params, model.get_monitor, model.summary, model.llk
|
|
137
|
+
|
|
138
|
+
## Autores
|
|
139
|
+
Áurea Fonseca, Eduardo Pinheiro, Izabel Nolau e Lucas Ribeiro.
|
|
140
|
+
|
|
141
|
+
## Build local
|
|
142
|
+
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# murabei_models — Dynamic Linear Models (DLM)
|
|
2
|
+
Pacote Python para modelos DLM com regressoras exógenas, incluindo ajuste, previsão, atualização online e backtesting, voltado para séries temporais de negócios.
|
|
3
|
+
|
|
4
|
+
## Instalação
|
|
5
|
+
- Fonte local (desenvolvimento):
|
|
6
|
+
- git clone <URL_DO_REPO>
|
|
7
|
+
- cd <PASTA_DO_REPO>
|
|
8
|
+
- pip install -r src/requirements/python_requirements.txt
|
|
9
|
+
- bash build.bash
|
|
10
|
+
- pip install dist/dlm_murabei_package-<VERSION>.tar.gz
|
|
11
|
+
|
|
12
|
+
- Requisitos principais (definidos no pyproject.toml e requirements):
|
|
13
|
+
- pandas==2.3.1
|
|
14
|
+
- numpy==2.3.2
|
|
15
|
+
- statsmodels==0.14.5
|
|
16
|
+
- scipy==1.16.1
|
|
17
|
+
- tqdm==4.67.1
|
|
18
|
+
- pyarrow==21.0.0
|
|
19
|
+
- dynm (via URL zip: https://github.com/EduardoGPinheiro/dynm/archive/refs/tags/v0.0.2.zip)
|
|
20
|
+
|
|
21
|
+
Observações:
|
|
22
|
+
- Garanta acesso à Internet para instalar a dependência dynm via URL.
|
|
23
|
+
- Versão mínima do Python: 3.12 (definida em requires-python no pyproject.toml).
|
|
24
|
+
|
|
25
|
+
## Exemplo rápido
|
|
26
|
+
Exemplo mínimo baseado em src/dlm/zzz__model_example.py com fit, predict, update e cross-validate:
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
import numpy as np
|
|
30
|
+
import pandas as pd
|
|
31
|
+
from dlm.model import DynamicLinearModel
|
|
32
|
+
|
|
33
|
+
# Dados sintéticos
|
|
34
|
+
np.random.seed(123)
|
|
35
|
+
n = 24
|
|
36
|
+
time = pd.date_range(start="2020-01-01", periods=n, freq="MS")
|
|
37
|
+
x1 = np.linspace(0, 1, n) + 0.1 * np.random.randn(n)
|
|
38
|
+
x2 = np.linspace(1, 0, n) + 0.1 * np.random.randn(n)
|
|
39
|
+
y = 0.7 * x1 - 0.4 * x2 + 0.1 * np.random.randn(n)
|
|
40
|
+
|
|
41
|
+
data = pd.DataFrame({"time": time, "y": y, "x1": x1, "x2": x2}).set_index("time")
|
|
42
|
+
y = data[["y"]]
|
|
43
|
+
X = data[["x1", "x2"]]
|
|
44
|
+
|
|
45
|
+
# Instancia e ajusta
|
|
46
|
+
dlm_model = DynamicLinearModel(
|
|
47
|
+
trend=False,
|
|
48
|
+
monitoring=True,
|
|
49
|
+
start_predictions_at=12,
|
|
50
|
+
)
|
|
51
|
+
dlm_model.fit(X=X, y=y)
|
|
52
|
+
|
|
53
|
+
# Predição
|
|
54
|
+
X_new = np.array([[1.15, 0.05], [1.20, 0.04], [1.25, 0.03]])
|
|
55
|
+
y_pred = dlm_model.predict(X=X_new)
|
|
56
|
+
|
|
57
|
+
# Atualização online
|
|
58
|
+
y_new = 0.72
|
|
59
|
+
X_new = np.array([[1.18, 0.06]])
|
|
60
|
+
dlm_model.update(y=y_new, X=X_new)
|
|
61
|
+
|
|
62
|
+
# Backtesting / Cross-validation
|
|
63
|
+
dlm_model.cross_validate(X=X, y=y, K=12)
|
|
64
|
+
|
|
65
|
+
# Acessos úteis (atributos/métodos)
|
|
66
|
+
_ = dlm_model.get_predictive
|
|
67
|
+
_ = dlm_model.get_params
|
|
68
|
+
_ = dlm_model.get_monitor
|
|
69
|
+
_ = dlm_model.summary
|
|
70
|
+
_ = dlm_model.llk
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## Estrutura do projeto
|
|
74
|
+
- src/dlm/
|
|
75
|
+
- model.py: implementação principal do modelo DLM
|
|
76
|
+
- data/aux.py: utilitários de dados
|
|
77
|
+
- update/intervention.py: rotinas de atualização/intervenções
|
|
78
|
+
- utils/summary.py: sumários e utilidades
|
|
79
|
+
- zzz__model_example.py: exemplo completo de uso
|
|
80
|
+
|
|
81
|
+
- src/tests/
|
|
82
|
+
- test_template.py e data/00__data.parquet para reprodução local
|
|
83
|
+
|
|
84
|
+
- Empacotamento:
|
|
85
|
+
- Layout src/, MANIFEST.in e package_data incluindo arquivos .parquet (dlm.data.examples)
|
|
86
|
+
|
|
87
|
+
- Automação e scripts:
|
|
88
|
+
- bitbucket-pipelines.yml, build.sh, check_local_linter.bash
|
|
89
|
+
|
|
90
|
+
- Versionamento:
|
|
91
|
+
- VERSION (usado no setup.py)
|
|
92
|
+
- changelog.md
|
|
93
|
+
|
|
94
|
+
## Desenvolvimento
|
|
95
|
+
- Estilo e lint:
|
|
96
|
+
- Ruff habilitado com: Pyflakes (F), pycodestyle (E, W), pydocstyle (D, convenção Google), isort (I), pandas-vet (PD), pep8-naming (N), bandit (S)
|
|
97
|
+
- Largura de linha: 79
|
|
98
|
+
- ignore: E402, N806, I001
|
|
99
|
+
|
|
100
|
+
- Fluxo sugerido:
|
|
101
|
+
- Criar venv e instalar localmente: pip install -e .
|
|
102
|
+
- Lint: ./check_local_linter.bash
|
|
103
|
+
- Testes: pytest em src/tests
|
|
104
|
+
|
|
105
|
+
## Guia de uso
|
|
106
|
+
- Importação principal:
|
|
107
|
+
- from dlm.model import DynamicLinearModel
|
|
108
|
+
|
|
109
|
+
- Fluxo típico:
|
|
110
|
+
- Preparar X (regressoras) e y (alvo) como pandas/numpy
|
|
111
|
+
- Ajustar: model.fit(X=X, y=y)
|
|
112
|
+
- Prever: model.predict(X=X_new)
|
|
113
|
+
- Atualizar online: model.update(y=y_new, X=X_new)
|
|
114
|
+
- Backtesting: model.cross_validate(X=X, y=y, K=...)
|
|
115
|
+
|
|
116
|
+
- Atributos e diagnósticos:
|
|
117
|
+
- model.get_predictive, model.get_params, model.get_monitor, model.summary, model.llk
|
|
118
|
+
|
|
119
|
+
## Autores
|
|
120
|
+
Áurea Fonseca, Eduardo Pinheiro, Izabel Nolau e Lucas Ribeiro.
|
|
121
|
+
|
|
122
|
+
## Build local
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "dlm-murabei-package"
|
|
3
|
+
version = "1.0.0-b.0"
|
|
4
|
+
description = ""
|
|
5
|
+
authors = [
|
|
6
|
+
{name = "Áurea Fonseca",email = "aurea.fonseca@murabei.com"},
|
|
7
|
+
{name = "Eduardo Pinheiro",email = "eduardo.pinheiro@murabei.com"},
|
|
8
|
+
{name = "Izabel Nolau",email = "izabel.souza@murabei.com"},
|
|
9
|
+
{name = "Lucas Ribeiro Magalhaes",email = "lucas.magalhaes@murabei.com"}
|
|
10
|
+
]
|
|
11
|
+
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.12"
|
|
14
|
+
dependencies = [
|
|
15
|
+
"pandas==2.3.1", "numpy==2.3.2", "statsmodels==0.14.5",
|
|
16
|
+
"scipy==1.16.1", "tqdm==4.67.1", "pyarrow==21.0.0"]
|
|
17
|
+
|
|
18
|
+
# "dynm @ https://github.com/EduardoGPinheiro/dynm/archive/refs/tags/v0.0.2.zip"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
[tool.poetry]
|
|
22
|
+
packages = [{include = "dlm", from = "src"}]
|
|
23
|
+
|
|
24
|
+
[build-system]
|
|
25
|
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
|
26
|
+
build-backend = "poetry.core.masonry.api"
|
|
27
|
+
|
|
28
|
+
[tool.ruff]
|
|
29
|
+
line-length = 79 # Quantidade de caracteres que o linter irá considerar na linha
|
|
30
|
+
indent-width = 4
|
|
31
|
+
|
|
32
|
+
# Pastas onde o linter não atuará. Por padrão, ele só lê arquivos Python e ignora tudo o que está listado no .gitignore.
|
|
33
|
+
exclude = [
|
|
34
|
+
"secrets/",
|
|
35
|
+
"__init__.py",
|
|
36
|
+
"setup_template.py",
|
|
37
|
+
"setup.py"
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
# Lista de linters que serão aplicados. O Ruff dá suporte a vários, e eles podem ser adicionados aqui.
|
|
41
|
+
[tool.ruff.lint]
|
|
42
|
+
select = [
|
|
43
|
+
"F", # Pyflakes - Procura bugs básicos no codigo
|
|
44
|
+
"E", # pycodestyle - Pep8
|
|
45
|
+
"W", # pycodestyle - Pep8
|
|
46
|
+
"D", # pydocstyle - Docstring
|
|
47
|
+
"I", # Isort - Imports
|
|
48
|
+
"PD", # pandas-vet - linter de pandas (não deixa usar inplace nem df como nome de variavel)
|
|
49
|
+
"N", # pep8-naming - nomes seguindo pep8
|
|
50
|
+
"S", # flake8-bandit - linter de vulnerabilidade (nmao deixa usar assert)
|
|
51
|
+
]
|
|
52
|
+
|
|
53
|
+
# Os códigos de erros colocados aqui são ignorados pelo linter. O erro abaixo é em relação ao assert. Está comentado por padrão.
|
|
54
|
+
ignore = [
|
|
55
|
+
# "S101" # Use of `assert` detected
|
|
56
|
+
"E402", "N806", "I001"
|
|
57
|
+
]
|
|
58
|
+
|
|
59
|
+
# ativa um modo plus do ruff
|
|
60
|
+
preview = true
|
|
61
|
+
|
|
62
|
+
[tool.ruff.format]
|
|
63
|
+
indent-style = "space"
|
|
64
|
+
|
|
65
|
+
[tool.ruff.lint.pydocstyle]
|
|
66
|
+
convention = "google" # seleciona as docstrings do google como padrão
|
|
67
|
+
|
|
68
|
+
[tool.ruff.lint.isort]
|
|
69
|
+
no-sections = true
|
|
70
|
+
|
|
71
|
+
[dependency-groups]
|
|
72
|
+
docs = [
|
|
73
|
+
"mkdocs (>=1.6.1,<2.0.0)"
|
|
74
|
+
]
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
"""Auxiliary functions for data manipulation."""
|
|
2
|
+
import numpy as np
|
|
3
|
+
import pandas as pd
|
|
4
|
+
from pandas import DatetimeIndex
|
|
5
|
+
from typing import Union, Tuple, List
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def custom_mean(y: np.array) -> np.ndarray | float:
|
|
9
|
+
"""Compute the mean of an array with NaN handling.
|
|
10
|
+
|
|
11
|
+
This function computes the mean of a NumPy array while ignoring NaN values.
|
|
12
|
+
If all values are NaN, the result defaults to 0.
|
|
13
|
+
Works for both 1D and higher-dimensional arrays.
|
|
14
|
+
|
|
15
|
+
Args:
|
|
16
|
+
y (np.ndarray): Input array of numeric values
|
|
17
|
+
that may contain NaN values.
|
|
18
|
+
|
|
19
|
+
Returns:
|
|
20
|
+
float | np.ndarray:
|
|
21
|
+
- If `y` is 1D, returns a single float value representing the mean.
|
|
22
|
+
- If `y` has more than 1 dimension, returns an array with means
|
|
23
|
+
along axis=0, replacing columns that contain only NaN
|
|
24
|
+
values with 0.
|
|
25
|
+
|
|
26
|
+
Raises:
|
|
27
|
+
ValueError: If the input is not a NumPy array.
|
|
28
|
+
"""
|
|
29
|
+
if not isinstance(y, np.ndarray):
|
|
30
|
+
raise ValueError("Input must be a NumPy array.")
|
|
31
|
+
|
|
32
|
+
# Case: 1D array
|
|
33
|
+
if y.ndim == 1:
|
|
34
|
+
# If all values are NaN, return 0
|
|
35
|
+
if np.isnan(y).all():
|
|
36
|
+
mu0 = 0
|
|
37
|
+
else:
|
|
38
|
+
# Compute mean ignoring NaN values
|
|
39
|
+
mu0 = np.nanmean(y)
|
|
40
|
+
|
|
41
|
+
# Case: multi-dimensional array
|
|
42
|
+
elif y.ndim > 1:
|
|
43
|
+
# Compute mean along axis=0 while ignoring NaN
|
|
44
|
+
mu0 = np.nanmean(y, axis=0)
|
|
45
|
+
|
|
46
|
+
# Identify columns where all values are NaN
|
|
47
|
+
all_nan_mask = np.isnan(y).all(axis=0)
|
|
48
|
+
|
|
49
|
+
# Replace NaN means with 0 for those columns
|
|
50
|
+
mu0[all_nan_mask] = 0
|
|
51
|
+
|
|
52
|
+
return mu0
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def custom_std(y: np.ndarray) -> np.ndarray | float:
|
|
56
|
+
"""Compute the standard deviation of an array with NaN handling.
|
|
57
|
+
|
|
58
|
+
This function computes the standard deviation of a NumPy array while
|
|
59
|
+
ignoring NaN values. If all values are NaN, the result defaults to 1.
|
|
60
|
+
To avoid zero-variance issues, the function enforces a minimum
|
|
61
|
+
return value of 1e-6.
|
|
62
|
+
|
|
63
|
+
Works for both 1D and multi-dimensional arrays.
|
|
64
|
+
|
|
65
|
+
Args:
|
|
66
|
+
y (np.ndarray): Input array of numeric values that may
|
|
67
|
+
contain NaN values.
|
|
68
|
+
|
|
69
|
+
Returns:
|
|
70
|
+
float | np.ndarray:
|
|
71
|
+
- If `y` is 1D, returns a single float value representing
|
|
72
|
+
the standard deviation.
|
|
73
|
+
- If `y` has more than 1 dimension, returns an array of
|
|
74
|
+
standard deviations computed along axis=0, replacing
|
|
75
|
+
columns of all-NaN values with 1.
|
|
76
|
+
|
|
77
|
+
Raises:
|
|
78
|
+
ValueError: If the input is not a NumPy array.
|
|
79
|
+
"""
|
|
80
|
+
if not isinstance(y, np.ndarray):
|
|
81
|
+
raise ValueError("Input must be a NumPy array.")
|
|
82
|
+
|
|
83
|
+
# Case: 1D array
|
|
84
|
+
if y.ndim == 1:
|
|
85
|
+
# If all values are NaN, return 1.0
|
|
86
|
+
if np.isnan(y).all():
|
|
87
|
+
s0 = 1.0
|
|
88
|
+
else:
|
|
89
|
+
# Compute standard deviation ignoring NaN values
|
|
90
|
+
s0 = np.nanstd(y)
|
|
91
|
+
|
|
92
|
+
# Case: multi-dimensional array
|
|
93
|
+
elif y.ndim > 1:
|
|
94
|
+
# Compute std along axis=0 while ignoring NaN values
|
|
95
|
+
s0 = np.nanstd(y, axis=0)
|
|
96
|
+
|
|
97
|
+
# Identify columns where all values are NaN
|
|
98
|
+
all_nan_mask = np.isnan(y).all(axis=0)
|
|
99
|
+
|
|
100
|
+
# Replace std values of all-NaN columns with 1.0
|
|
101
|
+
s0[all_nan_mask] = 1.0
|
|
102
|
+
|
|
103
|
+
# Ensure minimum standard deviation is at least 1e-6
|
|
104
|
+
s0 = np.maximum(s0, 1e-6)
|
|
105
|
+
|
|
106
|
+
return s0
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def identify_dummies(X: np.ndarray) -> np.ndarray:  # noqa: N803
    """Flag the columns of a 2D array that hold only the values 0 and 1.

    Every element is tested for membership in {0, 1} and the result is
    reduced column-wise, so a column is flagged only when all of its
    entries pass the test.

    Args:
        X (np.ndarray): 2D array in which each column is a feature.

    Returns:
        np.ndarray: 1D boolean array with one entry per column; True for
            columns made up exclusively of 0s and 1s, False otherwise.

    Raises:
        ValueError: If X is not a NumPy array or is not 2-dimensional.
    """
    if not isinstance(X, np.ndarray):
        raise ValueError("X must be a NumPy ndarray.")
    if X.ndim != 2:
        raise ValueError("X must be 2-dimensional (n_samples, n_features).")

    # Element-wise membership mask first, column-wise reduction second.
    is_binary = np.isin(X, [0, 1])
    return is_binary.all(axis=0)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def to_time_df(
    time: Union[np.ndarray, DatetimeIndex, List, None],
    start: int = 1
) -> pd.DataFrame:
    """Convert time-like input into a DataFrame with 'time' and sequential 't'.

    The provided values are stored in column 'time' and a running counter
    beginning at `start` is stored in column 't'. If `time` is None, an
    empty DataFrame with only column 't' and zero rows is returned.

    Args:
        time (np.ndarray | DatetimeIndex | list | None): Time values to
            convert; accepted inputs are NumPy arrays, pandas
            DatetimeIndex, Python lists, or None.
        start (int): Starting value for the 't' sequence; defaults to 1.

    Returns:
        pd.DataFrame: A DataFrame with columns:
            - 'time': The input values as a pandas Series (if provided).
            - 't': A sequence starting at `start` with step 1.

    Raises:
        TypeError: If `time` is not None and not an accepted type
            (np.ndarray, DatetimeIndex, or list).
        ValueError: If `time` is an array with more than one dimension.

    Notes:
        - The 't' column is derived from the DataFrame's default index
          plus `start`; the index itself is left untouched.
    """
    # No time information: return a zero-row frame with only the 't' column.
    if time is None:
        return pd.DataFrame({"t": []})

    if not isinstance(time, (np.ndarray, list, DatetimeIndex)):
        raise TypeError(
            "`time` must be a np.ndarray, "
            "pandas.DatetimeIndex, list, or None.")

    # Validate dimensionality BEFORE wrapping in a Series: a Series is
    # always 1D, and pandas rejects multi-dimensional arrays with its own
    # error, so a check after the conversion could never trigger.
    if isinstance(time, np.ndarray) and time.ndim > 1:
        raise ValueError("`time` must be one-dimensional.")

    # Normalize the accepted inputs to a pandas Series and build the frame.
    time_df = pd.DataFrame({"time": pd.Series(time)})

    # Sequential counter starting at `start`, computed from the index
    # without modifying it.
    time_df["t"] = time_df.index + start

    return time_df
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def to_numpy(
    y: Union[np.ndarray, pd.Series, pd.DataFrame, float],
    X: Union[np.ndarray, pd.DataFrame, None]  # noqa: N803
) -> Tuple[
    np.ndarray, Union[np.ndarray, None], pd.DataFrame, Union[str, None]
]:
    """Convert targets and features to NumPy, returning time metadata.

    Converts `y` to a NumPy array and `X` (if provided) to a NumPy array.
    When `y` is a pandas object, its index is used as the time axis if it
    is datetime-like or can be fully converted to datetimes; a numeric
    index is ignored. The frequency of that time axis is inferred when
    possible.

    Args:
        y (np.ndarray | pd.Series | pd.DataFrame | float):
            Target variable; must be 1D or a single-column DataFrame.
        X (np.ndarray | pd.DataFrame | None):
            Feature matrix; when it has more than one row, its number of
            rows must match `y`.

    Returns:
        Tuple[np.ndarray, np.ndarray | None, pd.DataFrame, str | None]:
            - y: NumPy array of targets.
            - X: NumPy array of features or None.
            - time_df: DataFrame built by `to_time_df` from the time
              axis (empty when no time axis is available).
            - freq: Inferred frequency string, or None when there is no
              time axis or the frequency cannot be inferred.

    Raises:
        TypeError: If `y` or `X` are of unsupported types.
        ValueError: If `y` is not 1D/single-column or if `X` rows mismatch `y`.
    """
    # Default to "no time axis"; this also covers the failed-conversion
    # path below, which previously left `time` unassigned.
    time = None
    if isinstance(y, (pd.DataFrame, pd.Series)):
        index = y.index
        if pd.api.types.is_datetime64_any_dtype(index):
            time = pd.to_datetime(index, errors='coerce')
        elif not pd.api.types.is_numeric_dtype(index):
            # Non-numeric, non-datetime index: use it only if every entry
            # converts to a valid datetime. Best effort by design, hence
            # the broad except.
            try:
                time_converted = pd.to_datetime(index, errors='coerce')
            except Exception:
                print("y.index is not a valid datetime or could not be"
                      " converted. It will not be used as the time column.")
            else:
                if time_converted.notna().all():
                    time = time_converted
    time_df = to_time_df(time)

    freq = None
    if time is not None:
        try:
            freq = pd.infer_freq(time)
        except ValueError:
            # pandas needs at least three timestamps to infer a
            # frequency; shorter inputs simply have no inferred freq.
            freq = None

    # Convert y to an ndarray.
    if isinstance(y, pd.DataFrame):
        if y.shape[1] != 1:
            raise ValueError("`y` should be a 1D array, Series,"
                             " or single-column DataFrame.")
        y = y.iloc[:, 0]
    if isinstance(y, pd.Series):
        y = y.to_numpy()
    elif isinstance(y, np.ndarray):
        if y.ndim > 1:
            raise ValueError("`y` must be 1D.")
    elif isinstance(y, float):
        y = np.array([y])
    else:
        raise TypeError("`y` must be a np.array, Series,"
                        " or single-column DataFrame.")

    # Convert X to ndarray if provided and validate the row count.
    if X is not None:
        if isinstance(X, pd.DataFrame):
            X = X.to_numpy()
        elif not isinstance(X, np.ndarray):
            raise TypeError("`X` must be a np.array or pd.DataFrame.")

        # A single-row X is accepted regardless of the length of y.
        if X.shape[0] > 1 and X.shape[0] != y.shape[0]:
            raise ValueError("`X` and `y` must have the same number"
                             " of rows.")

    return y, X, time_df, freq
|