agrobr 0.1.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in a supported public registry. It is provided for informational purposes only.
agrobr/ibge/client.py CHANGED
@@ -14,38 +14,30 @@ from agrobr.http.rate_limiter import RateLimiter
 logger = structlog.get_logger()


-# Códigos das tabelas SIDRA
 TABELAS = {
-    # PAM - Produção Agrícola Municipal
-    "pam_temporarias": "1612",  # Lavouras temporárias (1974-2018)
-    "pam_permanentes": "1613",  # Lavouras permanentes (1974-2018)
-    "pam_nova": "5457",  # Nova série PAM (2018+)
-    # LSPA - Levantamento Sistemático da Produção Agrícola
-    "lspa": "6588",  # Série mensal (2006+)
-    "lspa_safra": "1618",  # Por ano de safra
+    "pam_temporarias": "1612",
+    "pam_permanentes": "1613",
+    "pam_nova": "5457",
+    "lspa": "6588",
+    "lspa_safra": "1618",
 }

-# Variáveis disponíveis
 VARIAVEIS = {
-    # PAM 5457
     "area_plantada": "214",
     "area_colhida": "215",
     "producao": "216",
     "rendimento": "112",
     "valor_producao": "215",
-    # PAM 1612 (lavouras temporárias)
     "area_plantada_1612": "109",
     "area_colhida_1612": "1000109",
     "producao_1612": "214",
     "rendimento_1612": "112",
     "valor_1612": "215",
-    # LSPA 6588
     "area_lspa": "109",
     "producao_lspa": "216",
     "rendimento_lspa": "112",
 }

-# Níveis territoriais
 NIVEIS_TERRITORIAIS = {
     "brasil": "1",
     "regiao": "2",
@@ -55,7 +47,6 @@ NIVEIS_TERRITORIAIS = {
     "municipio": "6",
 }

-# Códigos de produtos agrícolas (classificação 782 para tabela 5457)
 PRODUTOS_PAM = {
     "soja": "40124",
     "milho": "40126",
@@ -69,7 +60,6 @@ PRODUTOS_PAM = {
     "laranja": "40125",
 }

-# Códigos para LSPA (classificação 48 para tabela 6588)
 PRODUTOS_LSPA = {
     "soja": "39443",
     "milho_1": "39441",
@@ -125,7 +115,6 @@ async def fetch_sidra(
     )

     async with RateLimiter.acquire(constants.Fonte.IBGE):
-        # sidrapy é síncrono, então apenas chamamos diretamente
         kwargs: dict[str, Any] = {
             "table_code": table_code,
             "territorial_level": territorial_level,
@@ -151,7 +140,6 @@ async def fetch_sidra(
        try:
            df = sidrapy.get_table(**kwargs)

-           # Remove primeira linha que é o header descritivo
            if header == "n" and len(df) > 1:
                df = df.iloc[1:].reset_index(drop=True)

@@ -186,7 +174,6 @@ def parse_sidra_response(
     Returns:
         DataFrame processado
     """
-    # Mapeamento padrão de colunas SIDRA
     default_rename = {
         "NC": "nivel_territorial_cod",
         "NN": "nivel_territorial",
@@ -206,11 +193,9 @@ def parse_sidra_response(
     if rename_columns:
         default_rename.update(rename_columns)

-    # Renomeia apenas colunas que existem
     rename_map = {k: v for k, v in default_rename.items() if k in df.columns}
     df = df.rename(columns=rename_map)

-    # Converte valor para numérico
     if "valor" in df.columns:
         df["valor"] = pd.to_numeric(df["valor"], errors="coerce")

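For orientation, the lookup tables above resolve human-readable names to SIDRA codes before the module hands them to sidrapy. Below is a minimal sketch of that resolution, calling sidrapy.get_table directly rather than the async fetch_sidra wrapper; the product/variable filters that fetch_sidra assembles are not fully visible in this diff, so they are noted in comments rather than guessed at.

import sidrapy

from agrobr.ibge.client import NIVEIS_TERRITORIAIS, PRODUTOS_PAM, TABELAS, VARIAVEIS

# Resolve human-readable names to SIDRA codes via the module constants.
# PRODUTOS_PAM["soja"] (classification 782) and VARIAVEIS["producao"] would be
# passed through additional keywords assembled by fetch_sidra; they are left
# out here because their exact form is not shown in this diff.
df = sidrapy.get_table(
    table_code=TABELAS["pam_nova"],                   # tabela 5457 (nova série PAM)
    territorial_level=NIVEIS_TERRITORIAIS["brasil"],  # "1" = Brasil
    ibge_territorial_code="all",
    period="2022",
)

# As in fetch_sidra above, drop the descriptive header row SIDRA returns first.
df = df.iloc[1:].reset_index(drop=True)
print(df.head())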
agrobr/models.py CHANGED
@@ -2,14 +2,22 @@

 from __future__ import annotations

+import hashlib
+import json
+import sys
+from dataclasses import dataclass
+from dataclasses import field as dataclass_field
 from datetime import date, datetime
 from decimal import Decimal
-from typing import Any
+from typing import TYPE_CHECKING, Any

 from pydantic import BaseModel, Field, field_validator

 from .constants import Fonte

+if TYPE_CHECKING:
+    import pandas as pd
+

 class Indicador(BaseModel):
     fonte: Fonte
@@ -83,3 +91,94 @@ class Fingerprint(BaseModel):
     structure_hash: str
     table_headers: list[list[str]]
     element_counts: dict[str, int]
+
+
+@dataclass
+class MetaInfo:
+    """Metadados de proveniencia e rastreabilidade para data lineage."""
+
+    source: str
+    source_url: str
+    source_method: str
+    fetched_at: datetime
+    timestamp: datetime = dataclass_field(default_factory=datetime.now)
+    fetch_duration_ms: int = 0
+    parse_duration_ms: int = 0
+    from_cache: bool = False
+    cache_key: str | None = None
+    cache_expires_at: datetime | None = None
+    raw_content_hash: str | None = None
+    raw_content_size: int = 0
+    records_count: int = 0
+    columns: list[str] = dataclass_field(default_factory=list)
+    agrobr_version: str = ""
+    schema_version: str = "1.0"
+    parser_version: int = 1
+    python_version: str = ""
+    validation_passed: bool = True
+    validation_warnings: list[str] = dataclass_field(default_factory=list)
+
+    def __post_init__(self) -> None:
+        """Preenche versoes automaticamente."""
+        if not self.agrobr_version:
+            from agrobr import __version__
+
+            self.agrobr_version = __version__
+
+        if not self.python_version:
+            self.python_version = sys.version.split()[0]
+
+    def to_dict(self) -> dict[str, Any]:
+        """Converte para dicionario serializavel."""
+        return {
+            "source": self.source,
+            "source_url": self.source_url,
+            "source_method": self.source_method,
+            "fetched_at": self.fetched_at.isoformat(),
+            "timestamp": self.timestamp.isoformat(),
+            "fetch_duration_ms": self.fetch_duration_ms,
+            "parse_duration_ms": self.parse_duration_ms,
+            "from_cache": self.from_cache,
+            "cache_key": self.cache_key,
+            "cache_expires_at": (
+                self.cache_expires_at.isoformat() if self.cache_expires_at else None
+            ),
+            "raw_content_hash": self.raw_content_hash,
+            "raw_content_size": self.raw_content_size,
+            "records_count": self.records_count,
+            "columns": self.columns,
+            "agrobr_version": self.agrobr_version,
+            "schema_version": self.schema_version,
+            "parser_version": self.parser_version,
+            "python_version": self.python_version,
+            "validation_passed": self.validation_passed,
+            "validation_warnings": self.validation_warnings,
+        }
+
+    def to_json(self, indent: int = 2) -> str:
+        """Serializa para JSON."""
+        return json.dumps(self.to_dict(), indent=indent, ensure_ascii=False)
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> MetaInfo:
+        """Reconstroi a partir de dicionario."""
+        data = data.copy()
+
+        for key in ["fetched_at", "timestamp", "cache_expires_at"]:
+            if data.get(key) and isinstance(data[key], str):
+                data[key] = datetime.fromisoformat(data[key])
+
+        return cls(**data)
+
+    def compute_dataframe_hash(self, df: pd.DataFrame) -> str:
+        """Computa hash do DataFrame para verificacao de integridade."""
+        csv_bytes = df.to_csv(index=False).encode("utf-8")
+        return f"sha256:{hashlib.sha256(csv_bytes).hexdigest()}"
+
+    def verify_hash(self, df: pd.DataFrame) -> bool:
+        """Verifica se DataFrame corresponde ao hash original."""
+        if not self.raw_content_hash:
+            return True
+
+        current_hash = self.compute_dataframe_hash(df)
+        return current_hash == self.raw_content_hash
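The new MetaInfo dataclass carries provenance metadata alongside a fetched DataFrame and supports a serialize/restore/verify round trip. A short usage sketch based only on the API added above (the sample DataFrame and field values are illustrative):

from datetime import datetime

import pandas as pd

from agrobr.models import MetaInfo

df = pd.DataFrame({"produto": ["soja"], "valor": [130.5]})

meta = MetaInfo(
    source="ibge",
    source_url="https://sidra.ibge.gov.br/",
    source_method="api",
    fetched_at=datetime.now(),
    records_count=len(df),
    columns=list(df.columns),
)
# Pin the integrity hash to the parsed DataFrame.
meta.raw_content_hash = meta.compute_dataframe_hash(df)

# Round-trip through the serializable form and re-verify integrity.
restored = MetaInfo.from_dict(meta.to_dict())
assert restored.verify_hash(df)
print(restored.to_json())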
@@ -17,7 +17,6 @@ from agrobr.normalize.encoding import decode_content

 logger = structlog.get_logger()

-# Por padrão usa browser pois a página carrega dados via AJAX
 _use_browser: bool = True


@@ -77,20 +76,17 @@ async def _fetch_with_browser(url: str, produto: str) -> str:
            last_error="No response received",
        )

-    # Aguarda tabela de cotações carregar
    try:
        await page.wait_for_selector(
            "table.cot-fisicas",
            timeout=15000,
        )
    except Exception:
-        # Tenta seletor alternativo
        await page.wait_for_selector(
            "table",
            timeout=10000,
        )

-    # Aguarda AJAX terminar
    await page.wait_for_timeout(2000)

    html: str = await page.content()
@@ -193,7 +189,6 @@ async def fetch_indicador_page(produto: str, force_httpx: bool = False) -> str:
        produto=produto,
    )

-    # Por padrão usa browser pois a página carrega dados via AJAX
    if not force_httpx and _use_browser:
        try:
            return await _fetch_with_browser(url, produto)
@@ -203,9 +198,7 @@ async def fetch_indicador_page(produto: str, force_httpx: bool = False) -> str:
                source="noticias_agricolas",
                url=url,
            )
-            # Fallback para httpx

-    # Tenta httpx (pode ter dados incompletos)
    try:
        return await _fetch_with_httpx(url)
    except httpx.HTTPError as e:
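The comments removed here documented why the client waits for the quotes table before scraping: the page populates it via AJAX, so a plain HTTP GET can return incomplete data. The standalone sketch below illustrates the same wait-with-fallback pattern using Playwright's async API; it is independent of agrobr's internal helpers, and the URL is a placeholder.

import asyncio

from playwright.async_api import async_playwright


async def fetch_quotes_html(url: str) -> str:
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto(url)
        try:
            # Prefer the specific quotes table...
            await page.wait_for_selector("table.cot-fisicas", timeout=15000)
        except Exception:
            # ...but fall back to any table if the class name changes.
            await page.wait_for_selector("table", timeout=10000)
        await page.wait_for_timeout(2000)  # give the AJAX request time to finish
        html = await page.content()
        await browser.close()
        return html


# html = asyncio.run(fetch_quotes_html("https://www.noticiasagricolas.com.br/..."))  # placeholder URL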
@@ -14,7 +14,6 @@ from agrobr.models import Indicador

 logger = structlog.get_logger()

-# Mapeamento de produtos para unidades
 UNIDADES = {
     "soja": "BRL/sc60kg",
     "soja_parana": "BRL/sc60kg",
@@ -27,7 +26,6 @@ UNIDADES = {
     "trigo": "BRL/ton",
 }

-# Mapeamento de produtos para praça
 PRACAS = {
     "soja": "Paranaguá/PR",
     "soja_parana": "Paraná",
@@ -45,7 +43,6 @@ def _parse_date(date_str: str) -> datetime | None:
     """Converte string de data para datetime."""
     date_str = date_str.strip()

-    # Formato: DD/MM/YYYY
     match = re.match(r"(\d{2})/(\d{2})/(\d{4})", date_str)
     if match:
         day, month, year = match.groups()
@@ -61,10 +58,8 @@ def _parse_valor(valor_str: str) -> Decimal | None:
     """Converte string de valor para Decimal."""
     valor_str = valor_str.strip()

-    # Remove "R$" e espaços
     valor_str = re.sub(r"R\$\s*", "", valor_str)

-    # Substitui vírgula por ponto
     valor_str = valor_str.replace(".", "").replace(",", ".")

     try:
@@ -77,10 +72,8 @@ def _parse_variacao(var_str: str) -> Decimal | None:
     """Converte string de variação para Decimal."""
     var_str = var_str.strip()

-    # Remove % e espaços
     var_str = re.sub(r"[%\s]", "", var_str)

-    # Substitui vírgula por ponto
     var_str = var_str.replace(",", ".")

     try:
@@ -107,26 +100,18 @@ def parse_indicador(html: str, produto: str) -> list[Indicador]:
     unidade = UNIDADES.get(produto_lower, "BRL/unidade")
     praca = PRACAS.get(produto_lower)

-    # Estrutura do Notícias Agrícolas:
-    # Tabela com classe "cot-fisicas" ou tabelas genéricas
-    # Headers: Data | Valor R$ | Variação (%)
-
-    # Primeiro tenta tabela específica de cotações
     tables = soup.find_all("table", class_="cot-fisicas")

-    # Se não encontrar, tenta todas as tabelas
     if not tables:
         tables = soup.find_all("table")

     for table in tables:
-        # Verifica se é tabela de cotação
         headers = table.find_all("th")
         header_text = " ".join(h.get_text(strip=True).lower() for h in headers)

         if "data" not in header_text or "valor" not in header_text:
             continue

-        # Extrai todas as linhas de dados (tbody > tr)
         tbody = table.find("tbody")
         rows = tbody.find_all("tr") if tbody else table.find_all("tr")[1:]

@@ -136,7 +121,6 @@ def parse_indicador(html: str, produto: str) -> list[Indicador]:
            if len(cells) < 2:
                continue

-            # Extrai data e valor
            data_str = cells[0].get_text(strip=True)
            valor_str = cells[1].get_text(strip=True)

@@ -152,7 +136,6 @@ def parse_indicador(html: str, produto: str) -> list[Indicador]:
                )
                continue

-            # Extrai variação se disponível
            meta: dict[str, str | float] = {}
            if len(cells) >= 3:
                var_str = cells[2].get_text(strip=True)
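The comments deleted in this parser documented the Brazilian price format ("R$ 1.234,56" with a dot as thousands separator and a comma as decimal separator). The self-contained mirror below reproduces the value-parsing steps shown above for reference; the except clause is an assumption, since the hunk cuts off right after try:.

import re
from decimal import Decimal, InvalidOperation


def parse_valor_brl(valor_str: str) -> Decimal | None:
    """Illustrative copy of the _parse_valor steps visible in the diff."""
    valor_str = valor_str.strip()
    valor_str = re.sub(r"R\$\s*", "", valor_str)               # drop the "R$" prefix
    valor_str = valor_str.replace(".", "").replace(",", ".")   # "1.234,56" -> "1234.56"
    try:
        return Decimal(valor_str)
    except (InvalidOperation, ValueError):  # assumed error handling
        return None


print(parse_valor_brl("R$ 1.234,56"))  # Decimal('1234.56')
print(parse_valor_brl("abc"))          # None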
@@ -0,0 +1,205 @@
+"""Sistema de plugins para extensibilidade do agrobr."""
+
+from __future__ import annotations
+
+import importlib
+import importlib.util
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, TypeVar
+
+import structlog
+
+logger = structlog.get_logger()
+
+T = TypeVar("T", bound="Plugin")
+
+_registry: dict[str, type[Plugin]] = {}
+_instances: dict[str, Plugin] = {}
+
+
+@dataclass
+class PluginMeta:
+    name: str
+    version: str
+    description: str
+    author: str = ""
+    requires: list[str] = field(default_factory=list)
+    provides: list[str] = field(default_factory=list)
+
+
+class Plugin(ABC):
+    meta: PluginMeta
+
+    @abstractmethod
+    def setup(self) -> None:
+        pass
+
+    @abstractmethod
+    def teardown(self) -> None:
+        pass
+
+    def is_enabled(self) -> bool:
+        return True
+
+
+class SourcePlugin(Plugin):
+    @abstractmethod
+    async def fetch(self, **kwargs: Any) -> Any:
+        pass
+
+    @abstractmethod
+    async def parse(self, content: Any, **kwargs: Any) -> Any:
+        pass
+
+    def get_source_name(self) -> str:
+        return self.meta.name
+
+
+class ParserPlugin(Plugin):
+    @abstractmethod
+    def can_parse(self, content: str) -> bool:
+        pass
+
+    @abstractmethod
+    def parse(self, content: str, **kwargs: Any) -> Any:
+        pass
+
+    @property
+    def priority(self) -> int:
+        return 0
+
+
+class ExporterPlugin(Plugin):
+    @abstractmethod
+    def export(self, data: Any, path: Path, **kwargs: Any) -> Path:
+        pass
+
+    @abstractmethod
+    def get_extension(self) -> str:
+        pass
+
+
+class ValidatorPlugin(Plugin):
+    @abstractmethod
+    def validate(self, data: Any, **kwargs: Any) -> tuple[bool, list[str]]:
+        pass
+
+
+def register(plugin_class: type[T]) -> type[T]:
+    if not hasattr(plugin_class, "meta"):
+        raise ValueError(f"Plugin {plugin_class.__name__} must have 'meta' attribute")
+
+    name = plugin_class.meta.name
+    if name in _registry:
+        logger.warning(
+            "plugin_override", name=name, old=_registry[name].__name__, new=plugin_class.__name__
+        )
+
+    _registry[name] = plugin_class
+    logger.info("plugin_registered", name=name, version=plugin_class.meta.version)
+    return plugin_class
+
+
+def get_plugin(name: str) -> Plugin | None:
+    if name in _instances:
+        return _instances[name]
+
+    if name not in _registry:
+        return None
+
+    plugin_class = _registry[name]
+    instance = plugin_class()
+    instance.setup()
+    _instances[name] = instance
+    return instance
+
+
+def list_plugins() -> list[PluginMeta]:
+    return [cls.meta for cls in _registry.values()]
+
+
+def list_plugins_by_type(plugin_type: type[Plugin]) -> list[PluginMeta]:
+    return [cls.meta for cls in _registry.values() if issubclass(cls, plugin_type)]
+
+
+def load_plugin_from_file(path: Path) -> type[Plugin] | None:
+    if not path.exists():
+        logger.error("plugin_file_not_found", path=str(path))
+        return None
+
+    spec = importlib.util.spec_from_file_location(path.stem, path)
+    if spec is None or spec.loader is None:
+        logger.error("plugin_spec_failed", path=str(path))
+        return None
+
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+
+    for attr_name in dir(module):
+        attr = getattr(module, attr_name)
+        if (
+            isinstance(attr, type)
+            and issubclass(attr, Plugin)
+            and attr is not Plugin
+            and hasattr(attr, "meta")
+        ):
+            register(attr)
+            return attr
+
+    logger.warning("no_plugin_found", path=str(path))
+    return None
+
+
+def load_plugins_from_dir(directory: Path) -> list[type[Plugin]]:
+    loaded: list[type[Plugin]] = []
+    if not directory.exists():
+        return loaded
+
+    for path in directory.glob("*.py"):
+        if path.name.startswith("_"):
+            continue
+        plugin_class = load_plugin_from_file(path)
+        if plugin_class:
+            loaded.append(plugin_class)
+
+    return loaded
+
+
+def unload_plugin(name: str) -> bool:
+    if name in _instances:
+        _instances[name].teardown()
+        del _instances[name]
+
+    if name in _registry:
+        del _registry[name]
+        logger.info("plugin_unloaded", name=name)
+        return True
+
+    return False
+
+
+def unload_all() -> None:
+    for name in list(_instances.keys()):
+        _instances[name].teardown()
+    _instances.clear()
+    _registry.clear()
+
+
+__all__ = [
+    "Plugin",
+    "PluginMeta",
+    "SourcePlugin",
+    "ParserPlugin",
+    "ExporterPlugin",
+    "ValidatorPlugin",
+    "register",
+    "get_plugin",
+    "list_plugins",
+    "list_plugins_by_type",
+    "load_plugin_from_file",
+    "load_plugins_from_dir",
+    "unload_plugin",
+    "unload_all",
+]
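This new module adds a plugin registry: subclass one of the base types, attach a PluginMeta, and decorate with register; get_plugin then instantiates lazily and calls setup. A minimal usage sketch based on the code above; the module's file path is not shown in this diff, so the import below assumes it ships as agrobr.plugins, and the JsonExporter class is a hypothetical example plugin.

from pathlib import Path
from typing import Any

from agrobr.plugins import ExporterPlugin, PluginMeta, get_plugin, list_plugins, register


@register
class JsonExporter(ExporterPlugin):
    meta = PluginMeta(
        name="json_exporter",
        version="0.1.0",
        description="Exports DataFrames to JSON (example plugin).",
    )

    def setup(self) -> None:      # called lazily by get_plugin()
        pass

    def teardown(self) -> None:   # called by unload_plugin()/unload_all()
        pass

    def export(self, data: Any, path: Path, **kwargs: Any) -> Path:
        # Assumes a pandas DataFrame-like object with to_json().
        path.write_text(data.to_json(orient="records"), encoding="utf-8")
        return path

    def get_extension(self) -> str:
        return ".json"


exporter = get_plugin("json_exporter")    # instantiates the class and runs setup()
print([m.name for m in list_plugins()])   # ['json_exporter']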