agrobr 0.1.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
agrobr/__init__.py CHANGED
@@ -2,9 +2,10 @@
2
2
 
3
3
from __future__ import annotations

# Package version string.
__version__ = "0.2.0"
__author__ = "Bruno"

# Data-source submodules (CEPEA, CONAB, IBGE) and the shared metadata model.
from agrobr import cepea, conab, ibge
from agrobr.models import MetaInfo

# Explicit public API of the package.
__all__ = ["cepea", "conab", "ibge", "MetaInfo", "__version__"]
@@ -0,0 +1,343 @@
1
+ """Benchmark suite para testes de performance do agrobr."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import statistics
6
+ import time
7
+ from collections.abc import Callable, Coroutine
8
+ from dataclasses import dataclass, field
9
+ from datetime import datetime
10
+ from typing import Any
11
+
12
+ import structlog
13
+
14
+ logger = structlog.get_logger()
15
+
16
+
17
@dataclass
class BenchmarkResult:
    """Timing statistics collected for a single benchmark."""

    name: str
    iterations: int
    total_time_ms: float
    mean_time_ms: float
    median_time_ms: float
    min_time_ms: float
    max_time_ms: float
    std_dev_ms: float
    times_ms: list[float] = field(default_factory=list)
    timestamp: datetime = field(default_factory=datetime.now)
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict; timing fields rounded to 2 decimals.

        Note: ``times_ms`` (the raw samples) is intentionally not included.
        """
        rounded_fields = (
            "total_time_ms",
            "mean_time_ms",
            "median_time_ms",
            "min_time_ms",
            "max_time_ms",
            "std_dev_ms",
        )
        payload: dict[str, Any] = {"name": self.name, "iterations": self.iterations}
        for attr in rounded_fields:
            payload[attr] = round(getattr(self, attr), 2)
        payload["timestamp"] = self.timestamp.isoformat()
        payload["metadata"] = self.metadata
        return payload

    def summary(self) -> str:
        """Return a one-line human-readable summary of the run."""
        return (
            f"{self.name}: mean={self.mean_time_ms:.2f}ms, "
            f"median={self.median_time_ms:.2f}ms, min={self.min_time_ms:.2f}ms, "
            f"max={self.max_time_ms:.2f}ms ({self.iterations} iterations)"
        )
58
+
59
+
60
@dataclass
class BenchmarkSuite:
    """Named collection of benchmark results."""

    name: str
    results: list[BenchmarkResult] = field(default_factory=list)
    timestamp: datetime = field(default_factory=datetime.now)

    def add_result(self, result: BenchmarkResult) -> None:
        """Append one result to the suite."""
        self.results.append(result)

    def to_dict(self) -> dict[str, Any]:
        """Serialize the suite and every contained result to a plain dict."""
        return {
            "name": self.name,
            "timestamp": self.timestamp.isoformat(),
            "results": [item.to_dict() for item in self.results],
        }

    def summary(self) -> str:
        """Return a multi-line report: header, separator, one line per result."""
        header = [f"Benchmark Suite: {self.name}", "=" * 50]
        return "\n".join(header + [item.summary() for item in self.results])
86
+
87
+
88
async def benchmark_async(
    name: str,
    func: Callable[..., Coroutine[Any, Any, Any]],
    iterations: int = 10,
    warmup: int = 1,
    **kwargs: Any,
) -> BenchmarkResult:
    """
    Benchmark an async function.

    Runs *warmup* un-timed calls first, then times *iterations* calls with
    ``time.perf_counter`` and aggregates the per-call statistics.

    Args:
        name: Benchmark name.
        func: Async callable under test.
        iterations: Number of timed iterations.
        warmup: Number of un-timed warm-up iterations.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        BenchmarkResult with aggregate timing statistics in milliseconds.
    """
    for _ in range(warmup):
        await func(**kwargs)

    times: list[float] = []
    for _ in range(iterations):
        start = time.perf_counter()
        await func(**kwargs)
        times.append((time.perf_counter() - start) * 1000)

    return BenchmarkResult(
        name=name,
        iterations=iterations,
        total_time_ms=sum(times),
        mean_time_ms=statistics.mean(times),
        median_time_ms=statistics.median(times),
        min_time_ms=min(times),
        max_time_ms=max(times),
        # stdev is undefined for fewer than two samples; report 0.0
        # (float, matching the declared field type) instead of int 0.
        std_dev_ms=statistics.stdev(times) if len(times) > 1 else 0.0,
        times_ms=times,
        metadata={"warmup": warmup, "kwargs": str(kwargs)},
    )
130
+
131
+
132
def benchmark_sync(
    name: str,
    func: Callable[..., Any],
    iterations: int = 10,
    warmup: int = 1,
    **kwargs: Any,
) -> BenchmarkResult:
    """
    Benchmark a synchronous function.

    Runs *warmup* un-timed calls first, then times *iterations* calls with
    ``time.perf_counter`` and aggregates the per-call statistics.

    Args:
        name: Benchmark name.
        func: Callable under test.
        iterations: Number of timed iterations.
        warmup: Number of un-timed warm-up iterations.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        BenchmarkResult with aggregate timing statistics in milliseconds.
    """
    for _ in range(warmup):
        func(**kwargs)

    times: list[float] = []
    for _ in range(iterations):
        start = time.perf_counter()
        func(**kwargs)
        times.append((time.perf_counter() - start) * 1000)

    return BenchmarkResult(
        name=name,
        iterations=iterations,
        total_time_ms=sum(times),
        mean_time_ms=statistics.mean(times),
        median_time_ms=statistics.median(times),
        min_time_ms=min(times),
        max_time_ms=max(times),
        # stdev is undefined for fewer than two samples; report 0.0
        # (float, matching the declared field type) instead of int 0.
        std_dev_ms=statistics.stdev(times) if len(times) > 1 else 0.0,
        times_ms=times,
        metadata={"warmup": warmup, "kwargs": str(kwargs)},
    )
174
+
175
+
176
async def run_api_benchmarks(iterations: int = 5) -> BenchmarkSuite:
    """
    Run benchmarks against the main public APIs.

    Each target is benchmarked independently; a failing target is logged
    with ``benchmark_failed`` and skipped so the remaining targets still run.

    Args:
        iterations: Number of iterations per benchmark.

    Returns:
        BenchmarkSuite with the collected results.
    """
    from agrobr import cepea, conab, ibge

    suite = BenchmarkSuite(name="agrobr_api_benchmarks")

    # (benchmark label, short name used in the failure log, coroutine fn, kwargs)
    targets: list[tuple[str, str, Any, dict[str, Any]]] = [
        (
            "cepea.indicador(soja, offline=True)",
            "cepea.indicador",
            cepea.indicador,
            {"produto": "soja", "offline": True},
        ),
        ("cepea.produtos()", "cepea.produtos", cepea.produtos, {}),
        ("conab.produtos()", "conab.produtos", conab.produtos, {}),
        ("ibge.produtos_pam()", "ibge.produtos_pam", ibge.produtos_pam, {}),
    ]

    for label, short_name, func, kwargs in targets:
        try:
            result = await benchmark_async(label, func, iterations=iterations, **kwargs)
            suite.add_result(result)
        except Exception as e:
            logger.warning("benchmark_failed", name=short_name, error=str(e))

    return suite
233
+
234
+
235
def run_contract_benchmarks(iterations: int = 100) -> BenchmarkSuite:
    """
    Run contract-validation benchmarks.

    Validates synthetic CEPEA indicator frames of 10 and 1000 rows against
    ``CEPEA_INDICADOR_V1``.

    Args:
        iterations: Number of iterations per benchmark.

    Returns:
        BenchmarkSuite with the collected results.
    """
    from agrobr.contracts.cepea import CEPEA_INDICADOR_V1

    suite = BenchmarkSuite(name="contract_validation_benchmarks")

    # Same benchmark at two frame sizes to expose how validation scales.
    for rows, start in ((10, "2024-01-01"), (1000, "2020-01-01")):
        result = benchmark_sync(
            f"contract.validate({rows} rows)",
            CEPEA_INDICADOR_V1.validate,
            iterations=iterations,
            df=_make_cepea_frame(rows, start),
        )
        suite.add_result(result)

    return suite


def _make_cepea_frame(rows: int, start: str):
    """Build a synthetic CEPEA indicator DataFrame with *rows* daily rows."""
    import pandas as pd

    return pd.DataFrame(
        {
            "data": pd.date_range(start, periods=rows),
            "produto": ["soja"] * rows,
            "praca": ["paranagua"] * rows,
            "valor": [150.0] * rows,
            "unidade": ["BRL/sc60kg"] * rows,
            "fonte": ["cepea"] * rows,
            "metodologia": [None] * rows,
            "anomalies": [None] * rows,
        }
    )
294
+
295
+
296
def run_semantic_benchmarks(iterations: int = 50) -> BenchmarkSuite:
    """
    Run semantic-validation benchmarks.

    Benchmarks ``validate_semantic`` on a 100-row synthetic frame.

    Args:
        iterations: Number of iterations per benchmark.

    Returns:
        BenchmarkSuite with the collected results.
    """
    import pandas as pd

    from agrobr.validators.semantic import validate_semantic

    suite = BenchmarkSuite(name="semantic_validation_benchmarks")

    # Synthetic 100-row frame with price, yield and area columns;
    # "valor" ramps upward (150.0, 150.5, ...) rather than staying constant.
    df = pd.DataFrame(
        {
            "data": pd.date_range("2024-01-01", periods=100),
            "valor": [150.0 + i * 0.5 for i in range(100)],
            "produto": ["soja"] * 100,
            "produtividade": [3500.0] * 100,
            "area_plantada": [1000.0] * 100,
            "area_colhida": [950.0] * 100,
            "safra": ["2024/25"] * 100,
        }
    )

    result = benchmark_sync(
        "validate_semantic(100 rows)",
        validate_semantic,
        iterations=iterations,
        df=df,
    )
    suite.add_result(result)

    return suite
333
+
334
+
335
# Public surface of the benchmark module.
__all__ = [
    "BenchmarkResult",
    "BenchmarkSuite",
    "benchmark_async",
    "benchmark_sync",
    "run_api_benchmarks",
    "run_contract_benchmarks",
    "run_semantic_benchmarks",
]
agrobr/cache/policies.py CHANGED
@@ -1,10 +1,6 @@
1
- """
2
- Políticas de cache e TTL por fonte.
3
- """
4
-
5
1
  from __future__ import annotations
6
2
 
7
- from datetime import datetime, timedelta
3
+ from datetime import datetime, time, timedelta
8
4
  from enum import Enum
9
5
  from typing import NamedTuple
10
6
 
@@ -17,6 +13,7 @@ class CachePolicy(NamedTuple):
17
13
  ttl_seconds: int
18
14
  stale_max_seconds: int
19
15
  description: str
16
+ smart_expiry: bool = False
20
17
 
21
18
 
22
19
  class TTL(Enum):
@@ -33,41 +30,52 @@ class TTL(Enum):
33
30
  DAYS_90 = 90 * 24 * 60 * 60
34
31
 
35
32
 
33
# Local time at which CEPEA publishes its daily update (around 17-18h);
# smart-expiry cache policies are cut off at this moment.
CEPEA_UPDATE_HOUR = 18
CEPEA_UPDATE_MINUTE = 0
35
+
36
+
36
37
# Cache policy per source key.  Entries with ``smart_expiry=True`` expire at
# the fixed CEPEA update hour instead of after a fixed TTL; ``description``
# strings are user-facing and kept in pt-BR.
POLICIES: dict[str, CachePolicy] = {
    "cepea_diario": CachePolicy(
        ttl_seconds=TTL.HOURS_24.value,
        stale_max_seconds=TTL.HOURS_24.value * 2,
        description="CEPEA indicador diário (expira às 18h)",
        smart_expiry=True,
    ),
    "cepea_semanal": CachePolicy(
        ttl_seconds=TTL.HOURS_24.value,
        stale_max_seconds=TTL.DAYS_7.value,
        description="CEPEA indicador semanal (atualiza sexta)",
        smart_expiry=False,
    ),
    "conab_safras": CachePolicy(
        ttl_seconds=TTL.HOURS_24.value,
        stale_max_seconds=TTL.DAYS_30.value,
        description="CONAB safras (atualiza mensalmente)",
        smart_expiry=False,
    ),
    "conab_balanco": CachePolicy(
        ttl_seconds=TTL.HOURS_24.value,
        stale_max_seconds=TTL.DAYS_30.value,
        description="CONAB balanço (atualiza mensalmente)",
        smart_expiry=False,
    ),
    "ibge_pam": CachePolicy(
        ttl_seconds=TTL.DAYS_7.value,
        stale_max_seconds=TTL.DAYS_90.value,
        description="IBGE PAM (atualiza anualmente)",
        smart_expiry=False,
    ),
    "ibge_lspa": CachePolicy(
        ttl_seconds=TTL.HOURS_24.value,
        stale_max_seconds=TTL.DAYS_30.value,
        description="IBGE LSPA (atualiza mensalmente)",
        smart_expiry=False,
    ),
    "noticias_agricolas": CachePolicy(
        ttl_seconds=TTL.HOURS_24.value,
        stale_max_seconds=TTL.HOURS_24.value * 2,
        description="Notícias Agrícolas (expira às 18h, mirror CEPEA)",
        smart_expiry=True,
    ),
}
73
81
 
@@ -106,6 +114,35 @@ def get_policy(source: Fonte | str, endpoint: str | None = None) -> CachePolicy:
106
114
  return POLICIES[default_key]
107
115
 
108
116
 
117
def _get_smart_expiry_time() -> datetime:
    """
    Compute the next smart-expiry instant for CEPEA-backed caches.

    CEPEA updates its data around 17-18h, so the cache is set to expire at
    18:00 local time: today if that moment has not yet passed, otherwise
    tomorrow.

    Returns:
        Naive local datetime of the next expiry.
    """
    now = datetime.now()
    expiry = datetime.combine(now.date(), time(CEPEA_UPDATE_HOUR, CEPEA_UPDATE_MINUTE))
    if expiry <= now:
        # Today's boundary has passed; roll over to tomorrow's 18h.
        expiry += timedelta(days=1)
    return expiry
134
+
135
+
136
def _get_last_expiry_time() -> datetime:
    """
    Return the most recent past expiry instant (the previous 18h boundary).

    Returns:
        Naive local datetime of the last expiry.
    """
    # The next expiry is always less than 24h away, so stepping one day
    # back yields the most recent 18h boundary that has already passed.
    return _get_smart_expiry_time() - timedelta(days=1)
144
+
145
+
109
146
  def get_ttl(source: Fonte | str, endpoint: str | None = None) -> int:
110
147
  """
111
148
  Retorna TTL em segundos para uma fonte.
@@ -134,20 +171,29 @@ def get_stale_max(source: Fonte | str, endpoint: str | None = None) -> int:
134
171
  return get_policy(source, endpoint).stale_max_seconds
135
172
 
136
173
 
137
- def is_expired(created_at: datetime, source: Fonte | str) -> bool:
174
def is_expired(created_at: datetime, source: Fonte | str, endpoint: str | None = None) -> bool:
    """
    Check whether a cache entry is expired.

    Sources flagged with ``smart_expiry`` (CEPEA) expire at the fixed 18h
    boundary; every other source uses its policy's fixed TTL.

    Args:
        created_at: Creation time of the cache entry.
        source: Data source.
        endpoint: Specific endpoint.

    Returns:
        True if the entry is expired.
    """
    policy = get_policy(source, endpoint)

    if policy.smart_expiry:
        # Expired exactly when the entry predates the most recent 18h boundary.
        return created_at < _get_last_expiry_time()

    deadline = created_at + timedelta(seconds=policy.ttl_seconds)
    return datetime.now() > deadline
151
197
 
152
198
 
153
199
  def is_stale_acceptable(created_at: datetime, source: Fonte | str) -> bool:
@@ -163,13 +209,16 @@ def is_stale_acceptable(created_at: datetime, source: Fonte | str) -> bool:
163
209
  """
164
210
  stale_max = get_stale_max(source)
165
211
  max_acceptable = created_at + timedelta(seconds=stale_max)
166
- return datetime.utcnow() <= max_acceptable
212
+ return datetime.now() <= max_acceptable
167
213
 
168
214
 
169
215
def calculate_expiry(source: Fonte | str, endpoint: str | None = None) -> datetime:
    """
    Compute the expiry timestamp for a new cache entry.

    Sources flagged with ``smart_expiry`` (CEPEA) expire at the next 18h
    boundary; every other source expires after its policy's fixed TTL.

    Args:
        source: Data source.
        endpoint: Specific endpoint.

    Returns:
        Expiry datetime (naive local time).
    """
    policy = get_policy(source, endpoint)
    return (
        _get_smart_expiry_time()
        if policy.smart_expiry
        else datetime.now() + timedelta(seconds=policy.ttl_seconds)
    )
182
235
 
183
236
 
184
237
  class InvalidationReason(Enum):
@@ -196,6 +249,7 @@ def should_refresh(
196
249
  created_at: datetime,
197
250
  source: Fonte | str,
198
251
  force: bool = False,
252
+ endpoint: str | None = None,
199
253
  ) -> tuple[bool, str]:
200
254
  """
201
255
  Determina se cache deve ser atualizado.
@@ -204,6 +258,7 @@ def should_refresh(
204
258
  created_at: Data de criação do cache
205
259
  source: Fonte de dados
206
260
  force: Forçar atualização
261
+ endpoint: Endpoint específico
207
262
 
208
263
  Returns:
209
264
  Tupla (deve_atualizar, razão)
@@ -211,7 +266,7 @@ def should_refresh(
211
266
  if force:
212
267
  return True, "force_refresh"
213
268
 
214
- if is_expired(created_at, source):
269
+ if is_expired(created_at, source, endpoint):
215
270
  return True, "expired"
216
271
 
217
272
  return False, "fresh"
@@ -238,3 +293,30 @@ def format_ttl(seconds: int) -> str:
238
293
 
239
294
  days = seconds // 86400
240
295
  return f"{days} dia{'s' if days > 1 else ''}"
296
+
297
+
298
def get_next_update_info(source: Fonte | str) -> dict[str, str]:
    """
    Return information about when a source's cache next refreshes.

    Args:
        source: Data source.

    Returns:
        Dict describing the expiry: ``type`` is ``"smart"`` with an
        ``expires_at`` timestamp for smart-expiry sources, or ``"ttl"``
        with the formatted TTL for everything else.
    """
    policy = get_policy(source)

    if policy.smart_expiry:
        next_expiry = _get_smart_expiry_time()
        return {
            "type": "smart",
            "expires_at": next_expiry.strftime("%Y-%m-%d %H:%M"),
            "description": f"Expira às {CEPEA_UPDATE_HOUR}h (atualização CEPEA)",
        }

    return {
        "type": "ttl",
        "ttl": format_ttl(policy.ttl_seconds),
        "description": policy.description,
    }
+ }