newsAR 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. newsar-0.3.0/LICENSE +21 -0
  2. newsar-0.3.0/PKG-INFO +32 -0
  3. newsar-0.3.0/newsAR/__init__.py +62 -0
  4. newsar-0.3.0/newsAR/benchmark.py +352 -0
  5. newsar-0.3.0/newsAR/diagnostics.py +508 -0
  6. newsar-0.3.0/newsAR/forecaster.py +270 -0
  7. newsar-0.3.0/newsAR/lags/__init__.py +3 -0
  8. newsar-0.3.0/newsAR/lags/selector.py +127 -0
  9. newsar-0.3.0/newsAR/model/__init__.py +4 -0
  10. newsar-0.3.0/newsAR/model/bayesian_ar.py +275 -0
  11. newsar-0.3.0/newsAR/model/gibbs.py +111 -0
  12. newsar-0.3.0/newsAR/news/__init__.py +4 -0
  13. newsar-0.3.0/newsAR/news/base.py +34 -0
  14. newsar-0.3.0/newsAR/news/fetcher.py +138 -0
  15. newsar-0.3.0/newsAR/quick.py +204 -0
  16. newsar-0.3.0/newsAR/sentiment/__init__.py +15 -0
  17. newsar-0.3.0/newsAR/sentiment/base.py +52 -0
  18. newsar-0.3.0/newsAR/sentiment/llm_scorer.py +234 -0
  19. newsar-0.3.0/newsAR.egg-info/PKG-INFO +32 -0
  20. newsar-0.3.0/newsAR.egg-info/SOURCES.txt +32 -0
  21. newsar-0.3.0/newsAR.egg-info/dependency_links.txt +1 -0
  22. newsar-0.3.0/newsAR.egg-info/requires.txt +28 -0
  23. newsar-0.3.0/newsAR.egg-info/top_level.txt +1 -0
  24. newsar-0.3.0/pyproject.toml +40 -0
  25. newsar-0.3.0/setup.cfg +4 -0
  26. newsar-0.3.0/tests/test_bayesian_ar.py +226 -0
  27. newsar-0.3.0/tests/test_benchmark.py +138 -0
  28. newsar-0.3.0/tests/test_diagnostics.py +115 -0
  29. newsar-0.3.0/tests/test_fetcher.py +174 -0
  30. newsar-0.3.0/tests/test_forecaster.py +189 -0
  31. newsar-0.3.0/tests/test_groq_scorer.py +99 -0
  32. newsar-0.3.0/tests/test_improvements.py +209 -0
  33. newsar-0.3.0/tests/test_lag_selector.py +84 -0
  34. newsar-0.3.0/tests/test_scorer.py +171 -0
newsar-0.3.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Diego Vieira de Souza
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
newsar-0.3.0/PKG-INFO ADDED
@@ -0,0 +1,32 @@
1
+ Metadata-Version: 2.4
2
+ Name: newsAR
3
+ Version: 0.3.0
4
+ Summary: Autorregressão Bayesiana com Sentimento de Notícias para previsão de commodities
5
+ License: MIT
6
+ Requires-Python: >=3.10
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Requires-Dist: numpy>=1.24
10
+ Requires-Dist: pandas>=2.0
11
+ Requires-Dist: matplotlib>=3.7
12
+ Requires-Dist: statsmodels>=0.14
13
+ Requires-Dist: requests>=2.28
14
+ Provides-Extra: anthropic
15
+ Requires-Dist: anthropic>=0.25; extra == "anthropic"
16
+ Provides-Extra: openailib
17
+ Requires-Dist: openai>=1.0; extra == "openailib"
18
+ Provides-Extra: groq
19
+ Requires-Dist: groq>=0.9; extra == "groq"
20
+ Provides-Extra: all
21
+ Requires-Dist: anthropic>=0.25; extra == "all"
22
+ Requires-Dist: openai>=1.0; extra == "all"
23
+ Requires-Dist: groq>=0.9; extra == "all"
24
+ Requires-Dist: joblib>=1.3; extra == "all"
25
+ Provides-Extra: dev
26
+ Requires-Dist: pytest>=8.0; extra == "dev"
27
+ Requires-Dist: pytest-cov>=5.0; extra == "dev"
28
+ Requires-Dist: anthropic>=0.25; extra == "dev"
29
+ Requires-Dist: openai>=1.0; extra == "dev"
30
+ Requires-Dist: groq>=0.9; extra == "dev"
31
+ Requires-Dist: joblib>=1.3; extra == "dev"
32
+ Dynamic: license-file
@@ -0,0 +1,62 @@
1
"""newsAR — Bayesian Autoregression with News Sentiment.

Integrated pipeline: news collection → LLM scoring →
lag selection → Bayesian AR model with Gibbs sampling.

Quick start (simplified API)::

    from newsAR import quick_fit

    # With a CSV of pre-collected news
    forecaster = quick_fit(
        y=serie_mensal,
        keyword="petróleo",
        csv_path="noticias.csv",
        groq_api_key="gsk_...",
    )
    forecast = forecaster.predict(steps=6, credible_interval=0.90)

Full API::

    from newsAR.news.fetcher import CSVNewsFetcher
    from newsAR.sentiment.llm_scorer import ConstantSentimentScorer
    from newsAR.forecaster import NewsARForecaster

    fetcher = CSVNewsFetcher("noticias.csv")
    forecaster = NewsARForecaster("petróleo", fetcher=fetcher)
    forecaster.fit(y_train)
    forecast = forecaster.predict(steps=6, credible_interval=0.90)

"""

from .forecaster import NewsARForecaster
from .quick import quick_fit
from .lags.selector import LagSelector
from .model.bayesian_ar import BayesianARModel
from .news.fetcher import CSVNewsFetcher, NewsAPIFetcher
from .sentiment.llm_scorer import (
    AnthropicSentimentScorer,
    ConstantSentimentScorer,
    GroqSentimentScorer,
    OpenAISentimentScorer,
)
from . import diagnostics, benchmark
from .benchmark import theil_u

# Package version; matches the Version field of the distribution metadata.
__version__ = "0.3.0"

# Names re-exported as the package's public API.
__all__ = [
    "NewsARForecaster",
    "quick_fit",
    "LagSelector",
    "BayesianARModel",
    "NewsAPIFetcher",
    "CSVNewsFetcher",
    "AnthropicSentimentScorer",
    "OpenAISentimentScorer",
    "GroqSentimentScorer",
    "ConstantSentimentScorer",
    "diagnostics",
    "benchmark",
    "theil_u",
]
@@ -0,0 +1,352 @@
1
+ """Benchmarking out-of-sample para o pipeline newsAR.
2
+
3
+ Compara o desempenho preditivo do ``newsAR`` contra modelos clássicos
4
+ usando validação walk-forward (rolling origin) 1-step-ahead.
5
+
6
+ Modelos avaliados
7
+ -----------------
8
+ - **newsAR** : AR Bayesiano + sentimento LLM
9
+ - **BayesAR** : AR Bayesiano puro (sem sentimento)
10
+ - **ARIMA** : ARIMA(p,0,0) com ``p`` selecionado por AIC
11
+ - **OLS-AR** : AR por OLS clássico
12
+ - **Naive** : y_{t+1} = y_t (Random Walk)
13
+ - **SeasonalNaive** : y_{t+1} = y_{t-11} (mesmo mês do ano anterior)
14
+
15
+ Métricas reportadas: RMSE, MAE, MAPE, Theil's U, melhoria_vs_Naive_%.
16
+ """
17
+ from __future__ import annotations
18
+
19
+ import warnings
20
+ from typing import Callable
21
+
22
+ import numpy as np
23
+ import pandas as pd
24
+
25
+ from .lags.selector import LagSelector
26
+ from .model.bayesian_ar import BayesianARModel
27
+ from .news.base import BaseNewsFetcher
28
+ from .sentiment.base import BaseSentimentScorer
29
+ from .sentiment.llm_scorer import ConstantSentimentScorer
30
+
31
+
32
+ def _rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
33
+ return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))
34
+
35
+
36
+ def _mae(y_true: np.ndarray, y_pred: np.ndarray) -> float:
37
+ return float(np.mean(np.abs(y_true - y_pred)))
38
+
39
+
40
+ def _mape(y_true: np.ndarray, y_pred: np.ndarray) -> float:
41
+ """MAPE (%), ignorando observações onde y_true == 0."""
42
+ mask = y_true != 0
43
+ if not mask.any():
44
+ return float("nan")
45
+ return float(100 * np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])))
46
+
47
+
48
def theil_u(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Theil's U coefficient — compares a model against the random walk.

    Formula
    -------
    ::

                   Σ (y_t - ŷ_t)²
        Theil U = ─────────────────
                   Σ (y_t - y_{t-1})²

    where ``y_{t-1}`` is the random-walk (naive) forecast. Both sums start
    at the second observation, because the first one has no predecessor
    inside ``y_true``.

    Interpretation
    --------------
    - U < 1 → model better than the random walk
    - U = 1 → same as the random walk
    - U > 1 → worse than the random walk

    Parameters
    ----------
    y_true : np.ndarray
        Observed values of the time series (length n).
    y_pred : np.ndarray
        Forecasts of the evaluated model (length n).

    Returns
    -------
    float
        Theil's U value; ``nan`` when the denominator is zero.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` differ in length.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    if len(actual) != len(forecast):
        raise ValueError("y_true e y_pred devem ter o mesmo comprimento.")

    model_sse = np.sum((actual[1:] - forecast[1:]) ** 2)
    # First differences of the actuals are exactly the random-walk errors.
    random_walk_sse = np.sum(np.diff(actual, axis=0) ** 2)

    if random_walk_sse == 0:
        return float("nan")
    return float(model_sse / random_walk_sse)
88
+
89
+
90
+ def _naive_forecast(y_train: np.ndarray) -> float:
91
+ """Naive: última observação."""
92
+ return float(y_train[-1])
93
+
94
+
95
+ def _seasonal_naive_forecast(y_train: np.ndarray, season: int = 12) -> float:
96
+ """Seasonal naive: mesmo período do ano/ciclo anterior."""
97
+ if len(y_train) >= season:
98
+ return float(y_train[-season])
99
+ return float(y_train[-1]) # fallback se série curta
100
+
101
+
102
+ def _ols_ar_forecast(y_train: np.ndarray, lags: list[int]) -> float:
103
+ """AR por OLS (mínimos quadrados ordinários) via numpy."""
104
+ max_lag = max(lags)
105
+ n = len(y_train)
106
+ if n <= max_lag:
107
+ return float(y_train[-1])
108
+
109
+ # Construir matriz de design
110
+ lag_cols = [y_train[max_lag - lag: n - lag] for lag in lags]
111
+ X = np.column_stack([np.ones(n - max_lag)] + lag_cols)
112
+ y_t = y_train[max_lag:]
113
+
114
+ try:
115
+ beta, _, _, _ = np.linalg.lstsq(X, y_t, rcond=None)
116
+ except np.linalg.LinAlgError:
117
+ return float(y_train[-1])
118
+
119
+ x_new = np.array([1.0] + [y_train[-(lag)] for lag in lags])
120
+ return float(x_new @ beta)
121
+
122
+
123
+ def _arima_forecast(y_train: np.ndarray) -> float:
124
+ """ARIMA(p,0,0) com p escolhido por AIC entre 1..min(12, n//4)."""
125
+ try:
126
+ from statsmodels.tsa.arima.model import ARIMA
127
+ except ImportError:
128
+ return float(y_train[-1])
129
+
130
+ n = len(y_train)
131
+ max_p = min(12, max(1, n // 4))
132
+ best_aic = np.inf
133
+ best_pred = float(y_train[-1])
134
+
135
+ for p in range(1, max_p + 1):
136
+ try:
137
+ with warnings.catch_warnings():
138
+ warnings.simplefilter("ignore")
139
+ model = ARIMA(y_train, order=(p, 0, 0)).fit()
140
+ if model.aic < best_aic:
141
+ best_aic = model.aic
142
+ best_pred = float(model.forecast(steps=1)[0])
143
+ except Exception:
144
+ continue
145
+
146
+ return best_pred
147
+
148
+
149
def _bayes_ar_forecast(
    y_train: np.ndarray,
    lags: list[int],
    n_iter: int,
    burnin: int,
    nu_t: float,
    random_seed: int | None,
    exog: np.ndarray | None = None,
    future_exog: float | None = None,
) -> float:
    """One-step-ahead forecast from a ``BayesianARModel``.

    A fresh model is built from ``lags``, ``n_iter``, ``burnin``, ``nu_t``
    and ``random_seed``, fitted on ``y_train`` (with ``exog`` when given),
    and asked for a single step. ``future_exog``, when not ``None``, is
    wrapped in a one-element array before being handed to ``predict``.

    Returns
    -------
    float
        First entry of the ``"mean"`` column of the model's forecast.
    """
    sampler = BayesianARModel(
        lags=lags,
        n_iter=n_iter,
        burnin=burnin,
        nu_t=nu_t,
        random_seed=random_seed,
    )
    sampler.fit(y_train, exog=exog)

    if future_exog is None:
        next_exog = None
    else:
        next_exog = np.array([future_exog])

    forecast = sampler.predict(steps=1, future_exog=next_exog)
    return float(forecast["mean"].iloc[0])
169
+
170
+
171
def _fetch_training_sentiment(
    fetcher: BaseNewsFetcher | None,
    scorer: BaseSentimentScorer,
    keyword: str,
    y_train_series: pd.Series,
) -> tuple[np.ndarray | None, float]:
    """Build the sentiment regressor for one training window.

    Returns ``(exog_train, future_sent)``. ``exog_train`` is ``None`` when no
    fetcher is given, when no news is found, or when fetching/scoring fails;
    ``future_sent`` is then the neutral score 3.0. Otherwise ``future_sent``
    is the mean of the last three aligned sentiment values.
    """
    exog_train = None
    future_sent = 3.0  # neutral fallback
    if fetcher is None:
        return exog_train, future_sent

    start = y_train_series.index[0].strftime("%Y-%m-%d")
    end = y_train_series.index[-1].strftime("%Y-%m-%d")
    try:
        news_df = fetcher.fetch(keyword, start, end)
        if not news_df.empty:
            raw_sent = scorer.score_series(news_df)
            # NOTE(review): assumes score_series returns a Series indexed by
            # "YYYY-MM" strings — confirm against the scorer implementation.
            periods = y_train_series.index.strftime("%Y-%m")
            sent_aligned = raw_sent.reindex(periods, fill_value=3.0)
            exog_train = sent_aligned.values.astype(float)
            future_sent = float(exog_train[-3:].mean())
    except Exception as exc:
        # stacklevel=3 attributes the warning to the caller of run_benchmark.
        warnings.warn(f"Erro ao buscar notícias: {exc}. Usando neutro.", stacklevel=3)
    return exog_train, future_sent


def run_benchmark(
    y: pd.Series,
    keyword: str,
    fetcher: BaseNewsFetcher | None = None,
    scorer: BaseSentimentScorer | None = None,
    test_size: int = 12,
    n_iter: int = 1000,
    burnin: int = 300,
    max_lag: int = 12,
    nu_t: float = 5.0,
    random_seed: int | None = 42,
    season: int = 12,
    verbose: bool = True,
) -> pd.DataFrame:
    """Out-of-sample, walk-forward, 1-step-ahead benchmark.

    For each of the last ``test_size`` periods ``t``, every model is trained
    on the data up to ``t-1`` and forecasts ``t``. RMSE, MAE, MAPE and
    Theil's U are then computed per model.

    Parameters
    ----------
    y : pd.Series
        Time series with a ``DatetimeIndex`` (monthly data is expected).
    keyword : str
        Keyword passed to the fetcher when collecting news.
    fetcher : BaseNewsFetcher, optional
        News fetcher. If ``None``, no sentiment regressor is built and the
        neutral score 3.0 is used as the future sentiment.
    scorer : BaseSentimentScorer, optional
        Sentiment scorer. If ``None``, ``ConstantSentimentScorer(3.0)`` is used.
    test_size : int
        Number of test periods (forecasts to make). Default: 12.
    n_iter : int
        Gibbs iterations. Default: 1000.
    burnin : int
        Gibbs burn-in. Default: 300.
    max_lag : int
        Maximum lag considered by the automatic lag selection. Default: 12.
    nu_t : float
        Degrees of freedom of the Student-t. Default: 5.0.
    random_seed : int, optional
        Seed for reproducibility.
    season : int
        Seasonality used by the seasonal naive model (default 12 = yearly).
    verbose : bool
        If ``True``, prints progress for every step.

    Returns
    -------
    pd.DataFrame
        Columns: ``modelo``, ``RMSE``, ``MAE``, ``MAPE_%``, ``Theil_U``,
        ``n_previsoes``, ``melhoria_vs_Naive_%``. Sorted by ascending RMSE.

    Raises
    ------
    TypeError
        If ``y`` does not have a ``DatetimeIndex``.
    ValueError
        If ``test_size`` is smaller than 1, or if fewer than ``max_lag + 4``
        observations remain for training.

    Warns
    -----
    UserWarning
        When fetching/scoring news fails, or when a Bayesian model fails and
        its forecast is replaced by the naive one.

    Examples
    --------
    >>> from newsAR.benchmark import run_benchmark
    >>> results = run_benchmark(y=serie, keyword="petróleo", test_size=12)
    >>> results[["modelo", "RMSE", "Theil_U"]]
    """
    if not isinstance(y.index, pd.DatetimeIndex):
        raise TypeError("y deve ter DatetimeIndex.")

    # Without at least one test period every metric would be computed on
    # empty arrays and come out as NaN.
    if test_size < 1:
        raise ValueError("test_size deve ser um inteiro positivo.")

    n_total = len(y)
    n_train_min = n_total - test_size

    if n_train_min < max_lag + 4:
        raise ValueError(
            f"Série muito curta para {test_size} passos de teste. "
            f"Reduza test_size ou forneça mais dados."
        )

    if scorer is None:
        scorer = ConstantSentimentScorer(3.0)

    # Forecasts accumulated per model, one entry per test step.
    models_preds: dict[str, list[float]] = {
        "newsAR": [],
        "BayesAR": [],
        "ARIMA": [],
        "OLS-AR": [],
        "Naive": [],
        "SeasonalNaive": [],
    }
    y_actual: list[float] = []

    lag_selector = LagSelector(max_lag=max_lag, use_aic=False)

    for step in range(test_size):
        train_end = n_train_min + step
        y_train_series = y.iloc[:train_end]
        y_train = y_train_series.values
        y_true_val = float(y.iloc[train_end])
        y_actual.append(y_true_val)

        if verbose:
            periodo = y.index[train_end].strftime("%Y-%m")
            print(f" [{step + 1:02d}/{test_size}] Prevendo {periodo} …", end=" ")

        # Lags are re-selected on every training window.
        lags = lag_selector.select(y_train)

        # ---- Sentiment for newsAR ----
        exog_train, future_sent = _fetch_training_sentiment(
            fetcher, scorer, keyword, y_train_series
        )

        # ---- newsAR ----
        # NOTE(review): future_sent is passed even when exog_train is None;
        # confirm BayesianARModel.predict accepts that combination.
        try:
            pred_news = _bayes_ar_forecast(
                y_train, lags, n_iter, burnin, nu_t, random_seed,
                exog=exog_train, future_exog=future_sent,
            )
        except Exception as exc:
            # Keep the best-effort fallback, but make it visible: otherwise a
            # broken model would silently score exactly like Naive.
            warnings.warn(
                f"Falha no modelo newsAR: {exc}. Usando previsão Naive.",
                stacklevel=2,
            )
            pred_news = float(y_train[-1])
        models_preds["newsAR"].append(pred_news)

        # ---- Plain BayesAR ----
        try:
            pred_bayes = _bayes_ar_forecast(
                y_train, lags, n_iter, burnin, nu_t, random_seed,
                exog=None, future_exog=None,
            )
        except Exception as exc:
            warnings.warn(
                f"Falha no modelo BayesAR: {exc}. Usando previsão Naive.",
                stacklevel=2,
            )
            pred_bayes = float(y_train[-1])
        models_preds["BayesAR"].append(pred_bayes)

        # ---- ARIMA ----
        pred_arima = _arima_forecast(y_train)
        models_preds["ARIMA"].append(pred_arima)

        # ---- OLS-AR ----
        pred_ols = _ols_ar_forecast(y_train, lags)
        models_preds["OLS-AR"].append(pred_ols)

        # ---- Naive ----
        models_preds["Naive"].append(_naive_forecast(y_train))

        # ---- Seasonal Naive ----
        models_preds["SeasonalNaive"].append(
            _seasonal_naive_forecast(y_train, season=season)
        )

        if verbose:
            print(f"real={y_true_val:.4f} newsAR={pred_news:.4f} "
                  f"Naive={models_preds['Naive'][-1]:.4f}")

    y_true_arr = np.array(y_actual)
    rows = []
    for name, preds in models_preds.items():
        y_pred_arr = np.array(preds)
        rows.append(
            {
                "modelo": name,
                "RMSE": round(_rmse(y_true_arr, y_pred_arr), 4),
                "MAE": round(_mae(y_true_arr, y_pred_arr), 4),
                "MAPE_%": round(_mape(y_true_arr, y_pred_arr), 2),
                "Theil_U": round(theil_u(y_true_arr, y_pred_arr), 4),
                "n_previsoes": test_size,
            }
        )

    result_df = pd.DataFrame(rows).sort_values("RMSE").reset_index(drop=True)

    # Relative RMSE improvement over Naive; the 1e-12 guards a zero RMSE.
    naive_rmse = float(result_df.loc[result_df["modelo"] == "Naive", "RMSE"].iloc[0])
    result_df["melhoria_vs_Naive_%"] = result_df["RMSE"].apply(
        lambda r: round(100 * (naive_rmse - r) / (naive_rmse + 1e-12), 1)
    )

    return result_df