agrobr 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. agrobr/__init__.py +10 -0
  2. agrobr/alerts/__init__.py +7 -0
  3. agrobr/alerts/notifier.py +167 -0
  4. agrobr/cache/__init__.py +31 -0
  5. agrobr/cache/duckdb_store.py +433 -0
  6. agrobr/cache/history.py +317 -0
  7. agrobr/cache/migrations.py +82 -0
  8. agrobr/cache/policies.py +240 -0
  9. agrobr/cepea/__init__.py +7 -0
  10. agrobr/cepea/api.py +360 -0
  11. agrobr/cepea/client.py +273 -0
  12. agrobr/cepea/parsers/__init__.py +37 -0
  13. agrobr/cepea/parsers/base.py +35 -0
  14. agrobr/cepea/parsers/consensus.py +300 -0
  15. agrobr/cepea/parsers/detector.py +108 -0
  16. agrobr/cepea/parsers/fingerprint.py +226 -0
  17. agrobr/cepea/parsers/v1.py +305 -0
  18. agrobr/cli.py +323 -0
  19. agrobr/conab/__init__.py +21 -0
  20. agrobr/conab/api.py +239 -0
  21. agrobr/conab/client.py +219 -0
  22. agrobr/conab/parsers/__init__.py +7 -0
  23. agrobr/conab/parsers/v1.py +383 -0
  24. agrobr/constants.py +205 -0
  25. agrobr/exceptions.py +104 -0
  26. agrobr/health/__init__.py +23 -0
  27. agrobr/health/checker.py +202 -0
  28. agrobr/health/reporter.py +314 -0
  29. agrobr/http/__init__.py +9 -0
  30. agrobr/http/browser.py +214 -0
  31. agrobr/http/rate_limiter.py +69 -0
  32. agrobr/http/retry.py +93 -0
  33. agrobr/http/user_agents.py +67 -0
  34. agrobr/ibge/__init__.py +19 -0
  35. agrobr/ibge/api.py +273 -0
  36. agrobr/ibge/client.py +256 -0
  37. agrobr/models.py +85 -0
  38. agrobr/normalize/__init__.py +64 -0
  39. agrobr/normalize/dates.py +303 -0
  40. agrobr/normalize/encoding.py +102 -0
  41. agrobr/normalize/regions.py +308 -0
  42. agrobr/normalize/units.py +278 -0
  43. agrobr/noticias_agricolas/__init__.py +6 -0
  44. agrobr/noticias_agricolas/client.py +222 -0
  45. agrobr/noticias_agricolas/parser.py +187 -0
  46. agrobr/sync.py +147 -0
  47. agrobr/telemetry/__init__.py +17 -0
  48. agrobr/telemetry/collector.py +153 -0
  49. agrobr/utils/__init__.py +5 -0
  50. agrobr/utils/logging.py +59 -0
  51. agrobr/validators/__init__.py +35 -0
  52. agrobr/validators/sanity.py +286 -0
  53. agrobr/validators/structural.py +313 -0
  54. agrobr-0.1.0.dist-info/METADATA +243 -0
  55. agrobr-0.1.0.dist-info/RECORD +58 -0
  56. agrobr-0.1.0.dist-info/WHEEL +4 -0
  57. agrobr-0.1.0.dist-info/entry_points.txt +2 -0
  58. agrobr-0.1.0.dist-info/licenses/LICENSE +21 -0
agrobr/http/browser.py ADDED
@@ -0,0 +1,214 @@
1
+ """Browser automation com Playwright para sites com proteção anti-bot."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from collections.abc import AsyncGenerator
7
+ from contextlib import asynccontextmanager
8
+
9
+ import structlog
10
+ from playwright.async_api import Browser, Page, Playwright, async_playwright
11
+
12
+ from agrobr import constants
13
+ from agrobr.exceptions import SourceUnavailableError
14
+ from agrobr.http.user_agents import UserAgentRotator
15
+
16
+ logger = structlog.get_logger()
17
+
18
# Module-level singletons so repeated fetches reuse a single Chromium process.
_playwright: Playwright | None = None
_browser: Browser | None = None
# Serializes browser startup/shutdown across concurrent tasks.
_lock = asyncio.Lock()
22
+
23
+
24
async def _get_browser() -> Browser:
    """Get or lazily create the shared Chromium instance (singleton).

    Returns:
        A connected Playwright ``Browser``.
    """
    global _playwright, _browser

    async with _lock:
        if _browser is None or not _browser.is_connected():
            # If a previous browser crashed or disconnected, stop its
            # Playwright driver first — otherwise every restart leaks a
            # driver process (the old `_playwright` was never stopped).
            if _playwright is not None:
                await _playwright.stop()
                _playwright = None

            logger.info("browser_starting", browser="chromium")

            _playwright = await async_playwright().start()
            _browser = await _playwright.chromium.launch(
                headless=True,
                args=[
                    # Hide the automation flag probed by anti-bot scripts.
                    "--disable-blink-features=AutomationControlled",
                    # Avoid /dev/shm exhaustion in containers.
                    "--disable-dev-shm-usage",
                    "--no-sandbox",
                ],
            )

            logger.info("browser_started")

        return _browser
45
+
46
+
47
async def close_browser() -> None:
    """Shut down the shared browser and stop its Playwright driver."""
    global _playwright, _browser

    async with _lock:
        browser = _browser
        driver = _playwright

        if browser is not None:
            await browser.close()
            _browser = None
            logger.info("browser_closed")

        if driver is not None:
            await driver.stop()
            _playwright = None
60
+
61
+
62
@asynccontextmanager
async def get_page() -> AsyncGenerator[Page, None]:
    """Yield a fresh page in an isolated browser context.

    The context gets a realistic fingerprint (random UA, pt-BR locale and
    timezone) plus an init script that hides automation signals. The context
    is always closed on exit.
    """
    browser = await _get_browser()

    # Fresh context per page: isolated cookies + realistic fingerprint.
    ua = UserAgentRotator.get_random()
    context = await browser.new_context(
        user_agent=ua,
        viewport={"width": 1920, "height": 1080},
        locale="pt-BR",
        timezone_id="America/Sao_Paulo",
        extra_http_headers={
            "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
        },
    )

    # Everything after context creation runs inside try/finally so the
    # context cannot leak if new_page()/add_init_script() raise (the
    # original only protected the yield).
    try:
        page = await context.new_page()

        # Mask navigator.webdriver, commonly probed by anti-bot checks.
        await page.add_init_script(
            """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        });
        """
        )

        yield page
    finally:
        await context.close()
94
+
95
+
96
async def fetch_with_browser(
    url: str,
    source: str = "unknown",
    wait_selector: str | None = None,
    wait_timeout: float = 30000,
) -> str:
    """
    Fetch a page using a headless browser.

    Bypasses anti-bot protections such as Cloudflare.

    Args:
        url: URL to fetch
        source: Source name (for logging)
        wait_selector: CSS selector to wait for before returning
        wait_timeout: Timeout in ms for navigation / selector waits

    Returns:
        Page HTML

    Raises:
        SourceUnavailableError: If the page could not be loaded
    """
    logger.info(
        "browser_fetch_start",
        source=source,
        url=url,
    )

    try:
        async with get_page() as page:
            # Navigate to the URL
            response = await page.goto(
                url,
                wait_until="domcontentloaded",
                timeout=wait_timeout,
            )

            if response is None:
                raise SourceUnavailableError(
                    source=source,
                    url=url,
                    last_error="No response received",
                )

            # Wait for a specific selector if one was given; a timeout here
            # is best-effort — log and continue with whatever rendered.
            if wait_selector:
                try:
                    await page.wait_for_selector(
                        wait_selector,
                        timeout=wait_timeout,
                    )
                except Exception as e:
                    logger.warning(
                        "browser_wait_selector_timeout",
                        selector=wait_selector,
                        error=str(e),
                    )

            # Give Cloudflare's JS challenge and late scripts time to finish.
            await page.wait_for_timeout(5000)

            # Detect a Cloudflare block/challenge page
            if response.status in (403, 503):
                check_html: str = await page.content()
                if "cloudflare" in check_html.lower() or "challenge" in check_html.lower():
                    raise SourceUnavailableError(
                        source=source,
                        url=url,
                        last_error=f"Cloudflare block detected (status {response.status})",
                    )

            html: str = await page.content()

            logger.info(
                "browser_fetch_success",
                source=source,
                url=url,
                content_length=len(html),
                status=response.status,
            )

            return html

    except SourceUnavailableError as e:
        # Already a domain error with a precise message (no-response or
        # Cloudflare block) — log it but do NOT re-wrap it; the original
        # blanket `except Exception` double-wrapped these and mangled
        # `last_error`.
        logger.error(
            "browser_fetch_failed",
            source=source,
            url=url,
            error=str(e),
        )
        raise
    except Exception as e:
        logger.error(
            "browser_fetch_failed",
            source=source,
            url=url,
            error=str(e),
        )
        raise SourceUnavailableError(
            source=source,
            url=url,
            last_error=str(e),
        ) from e
194
+
195
+
196
async def fetch_cepea_indicador(produto: str) -> str:
    """
    Fetch a CEPEA indicator page using the headless browser.

    Args:
        produto: Product name (soja, milho, etc)

    Returns:
        Page HTML
    """
    # Map the product name to its CEPEA URL slug; unknown names pass through.
    name = produto.lower()
    produto_key = constants.CEPEA_PRODUTOS.get(name, name)
    base_url = constants.URLS[constants.Fonte.CEPEA]["indicadores"]

    return await fetch_with_browser(
        url=f"{base_url}/{produto_key}.aspx",
        source="cepea",
        wait_selector="table",
        wait_timeout=90000,
    )
@@ -0,0 +1,69 @@
1
+ """Rate limiter por fonte usando semáforos."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import time
7
+ from collections.abc import AsyncIterator
8
+ from contextlib import asynccontextmanager
9
+
10
+ import structlog
11
+
12
+ from agrobr import constants
13
+
14
+ logger = structlog.get_logger()
15
+
16
+
17
class RateLimiter:
    """Enforces a minimum interval between requests for each source."""

    # Per-source state shared process-wide (class-level on purpose).
    _semaphores: dict[str, asyncio.Semaphore] = {}
    _last_request: dict[str, float] = {}
    _lock = asyncio.Lock()

    @classmethod
    def _get_delay(cls, source: constants.Fonte) -> float:
        """Return the configured minimum delay (seconds) for *source*; 1.0 if unknown."""
        settings = constants.HTTPSettings()
        per_source = {
            constants.Fonte.CEPEA: settings.rate_limit_cepea,
            constants.Fonte.CONAB: settings.rate_limit_conab,
            constants.Fonte.IBGE: settings.rate_limit_ibge,
            constants.Fonte.NOTICIAS_AGRICOLAS: settings.rate_limit_noticias_agricolas,
        }
        return per_source.get(source, 1.0)

    @classmethod
    @asynccontextmanager
    async def acquire(cls, source: constants.Fonte) -> AsyncIterator[None]:
        """Async context manager that spaces out requests per source."""
        key = source.value

        # Lazily create the per-source semaphore under the class lock.
        async with cls._lock:
            sem = cls._semaphores.setdefault(key, asyncio.Semaphore(1))

        async with sem:
            elapsed = time.monotonic() - cls._last_request.get(key, 0)
            remaining = cls._get_delay(source) - elapsed

            if remaining > 0:
                logger.debug(
                    "rate_limit_wait",
                    source=key,
                    wait_seconds=remaining,
                )
                await asyncio.sleep(remaining)

            try:
                yield
            finally:
                # Stamp completion time so the next request measures from here.
                cls._last_request[key] = time.monotonic()

    @classmethod
    def reset(cls) -> None:
        """Clear all rate-limiter state (handy for tests)."""
        cls._semaphores.clear()
        cls._last_request.clear()
agrobr/http/retry.py ADDED
@@ -0,0 +1,93 @@
1
+ """Retry com exponential backoff."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from collections.abc import Awaitable, Callable, Sequence
7
+ from functools import wraps
8
+ from typing import Any, TypeVar
9
+
10
+ import httpx
11
+ import structlog
12
+
13
+ from agrobr import constants
14
+
15
+ logger = structlog.get_logger()
16
+ T = TypeVar("T")
17
+
18
# Transient transport-level httpx errors that are safe to retry blindly.
RETRIABLE_EXCEPTIONS: tuple[type[Exception], ...] = (
    httpx.TimeoutException,
    httpx.NetworkError,
    httpx.RemoteProtocolError,
)
23
+
24
+
25
async def retry_async(
    func: Callable[[], Awaitable[T]],
    max_attempts: int | None = None,
    base_delay: float | None = None,
    max_delay: float | None = None,
    retriable_exceptions: Sequence[type[Exception]] = RETRIABLE_EXCEPTIONS,
) -> T:
    """Run an async callable with exponential-backoff retry.

    Args:
        func: Zero-argument awaitable factory to invoke.
        max_attempts: Total attempts; defaults to ``HTTPSettings.max_retries``.
        base_delay: First backoff delay in seconds (settings default).
        max_delay: Ceiling for any single backoff delay (settings default).
        retriable_exceptions: Exception types that trigger a retry; anything
            else propagates immediately.

    Returns:
        The value returned by ``func``.

    Raises:
        The last retriable exception once all attempts are exhausted.
    """
    settings = constants.HTTPSettings()
    # Use `is None` rather than `or`: with `or`, an explicit 0 / 0.0 argument
    # was silently replaced by the settings default.
    if max_attempts is None:
        max_attempts = settings.max_retries
    if base_delay is None:
        base_delay = settings.retry_base_delay
    if max_delay is None:
        max_delay = settings.retry_max_delay

    last_exception: Exception | None = None

    for attempt in range(max_attempts):
        try:
            return await func()

        except tuple(retriable_exceptions) as e:
            last_exception = e
            if attempt < max_attempts - 1:
                # Exponential backoff, capped at max_delay.
                delay = min(
                    base_delay * (settings.retry_exponential_base**attempt),
                    max_delay,
                )
                logger.warning(
                    "retry_scheduled",
                    attempt=attempt + 1,
                    max_attempts=max_attempts,
                    delay_seconds=delay,
                    error=str(e),
                )
                await asyncio.sleep(delay)
            else:
                logger.error(
                    "retry_exhausted",
                    attempts=max_attempts,
                    last_error=str(e),
                )

    if last_exception:
        raise last_exception
    # Reached only when max_attempts <= 0 (loop body never ran).
    raise RuntimeError("Retry logic error: no exception captured")
69
+
70
+
71
def with_retry(
    max_attempts: int | None = None,
    base_delay: float | None = None,
) -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]:
    """Decorator that wraps an async function with ``retry_async``."""

    def decorator(func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]:
        @wraps(func)
        async def wrapper(*args: Any, **kwargs: Any) -> T:
            # Bind the call's arguments into a zero-arg factory for retry_async.
            async def attempt() -> T:
                return await func(*args, **kwargs)

            return await retry_async(
                attempt,
                max_attempts=max_attempts,
                base_delay=base_delay,
            )

        return wrapper

    return decorator
89
+
90
+
91
def should_retry_status(status_code: int) -> bool:
    """Return True if *status_code* is listed in ``constants.RETRIABLE_STATUS_CODES``."""
    return status_code in constants.RETRIABLE_STATUS_CODES
@@ -0,0 +1,67 @@
1
+ """Pool rotativo de User-Agents."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import random
6
+ from collections.abc import Sequence
7
+
8
+ USER_AGENT_POOL: Sequence[str] = (
9
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
10
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
11
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
12
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
13
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
14
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
15
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0",
16
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
17
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
18
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
19
+ )
20
+
21
+ DEFAULT_HEADERS: dict[str, str] = {
22
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
23
+ "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
24
+ "Accept-Encoding": "gzip, deflate, br",
25
+ "Connection": "keep-alive",
26
+ "Upgrade-Insecure-Requests": "1",
27
+ "Sec-Fetch-Dest": "document",
28
+ "Sec-Fetch-Mode": "navigate",
29
+ "Sec-Fetch-Site": "none",
30
+ "Sec-Fetch-User": "?1",
31
+ }
32
+
33
+
34
+ class UserAgentRotator:
35
+ """Rotaciona User-Agents de forma determinística por fonte."""
36
+
37
+ _counters: dict[str, int] = {}
38
+
39
+ @classmethod
40
+ def get(cls, source: str | None = None) -> str:
41
+ """Retorna próximo User-Agent do pool."""
42
+ key = source or "default"
43
+
44
+ if key not in cls._counters:
45
+ cls._counters[key] = random.randint(0, len(USER_AGENT_POOL) - 1)
46
+
47
+ ua = USER_AGENT_POOL[cls._counters[key] % len(USER_AGENT_POOL)]
48
+ cls._counters[key] += 1
49
+
50
+ return ua
51
+
52
+ @classmethod
53
+ def get_random(cls) -> str:
54
+ """Retorna User-Agent aleatório."""
55
+ return random.choice(USER_AGENT_POOL)
56
+
57
+ @classmethod
58
+ def get_headers(cls, source: str | None = None) -> dict[str, str]:
59
+ """Retorna headers completos incluindo User-Agent."""
60
+ headers = DEFAULT_HEADERS.copy()
61
+ headers["User-Agent"] = cls.get(source)
62
+ return headers
63
+
64
+ @classmethod
65
+ def reset(cls) -> None:
66
+ """Reseta contadores."""
67
+ cls._counters.clear()
@@ -0,0 +1,19 @@
1
+ """Modulo IBGE - Dados PAM e LSPA."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from agrobr.ibge.api import (
6
+ lspa,
7
+ pam,
8
+ produtos_lspa,
9
+ produtos_pam,
10
+ ufs,
11
+ )
12
+
13
+ __all__ = [
14
+ "pam",
15
+ "lspa",
16
+ "produtos_pam",
17
+ "produtos_lspa",
18
+ "ufs",
19
+ ]