agrobr-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agrobr/__init__.py +10 -0
- agrobr/alerts/__init__.py +7 -0
- agrobr/alerts/notifier.py +167 -0
- agrobr/cache/__init__.py +31 -0
- agrobr/cache/duckdb_store.py +433 -0
- agrobr/cache/history.py +317 -0
- agrobr/cache/migrations.py +82 -0
- agrobr/cache/policies.py +240 -0
- agrobr/cepea/__init__.py +7 -0
- agrobr/cepea/api.py +360 -0
- agrobr/cepea/client.py +273 -0
- agrobr/cepea/parsers/__init__.py +37 -0
- agrobr/cepea/parsers/base.py +35 -0
- agrobr/cepea/parsers/consensus.py +300 -0
- agrobr/cepea/parsers/detector.py +108 -0
- agrobr/cepea/parsers/fingerprint.py +226 -0
- agrobr/cepea/parsers/v1.py +305 -0
- agrobr/cli.py +323 -0
- agrobr/conab/__init__.py +21 -0
- agrobr/conab/api.py +239 -0
- agrobr/conab/client.py +219 -0
- agrobr/conab/parsers/__init__.py +7 -0
- agrobr/conab/parsers/v1.py +383 -0
- agrobr/constants.py +205 -0
- agrobr/exceptions.py +104 -0
- agrobr/health/__init__.py +23 -0
- agrobr/health/checker.py +202 -0
- agrobr/health/reporter.py +314 -0
- agrobr/http/__init__.py +9 -0
- agrobr/http/browser.py +214 -0
- agrobr/http/rate_limiter.py +69 -0
- agrobr/http/retry.py +93 -0
- agrobr/http/user_agents.py +67 -0
- agrobr/ibge/__init__.py +19 -0
- agrobr/ibge/api.py +273 -0
- agrobr/ibge/client.py +256 -0
- agrobr/models.py +85 -0
- agrobr/normalize/__init__.py +64 -0
- agrobr/normalize/dates.py +303 -0
- agrobr/normalize/encoding.py +102 -0
- agrobr/normalize/regions.py +308 -0
- agrobr/normalize/units.py +278 -0
- agrobr/noticias_agricolas/__init__.py +6 -0
- agrobr/noticias_agricolas/client.py +222 -0
- agrobr/noticias_agricolas/parser.py +187 -0
- agrobr/sync.py +147 -0
- agrobr/telemetry/__init__.py +17 -0
- agrobr/telemetry/collector.py +153 -0
- agrobr/utils/__init__.py +5 -0
- agrobr/utils/logging.py +59 -0
- agrobr/validators/__init__.py +35 -0
- agrobr/validators/sanity.py +286 -0
- agrobr/validators/structural.py +313 -0
- agrobr-0.1.0.dist-info/METADATA +243 -0
- agrobr-0.1.0.dist-info/RECORD +58 -0
- agrobr-0.1.0.dist-info/WHEEL +4 -0
- agrobr-0.1.0.dist-info/entry_points.txt +2 -0
- agrobr-0.1.0.dist-info/licenses/LICENSE +21 -0
agrobr/http/browser.py
ADDED
@@ -0,0 +1,214 @@

"""Browser automation with Playwright for sites with anti-bot protection."""

from __future__ import annotations

import asyncio
from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager

import structlog
from playwright.async_api import Browser, Page, Playwright, async_playwright

from agrobr import constants
from agrobr.exceptions import SourceUnavailableError
from agrobr.http.user_agents import UserAgentRotator

logger = structlog.get_logger()

# Singleton so the browser is reused across calls
_playwright: Playwright | None = None
_browser: Browser | None = None
_lock = asyncio.Lock()


async def _get_browser() -> Browser:
    """Gets or creates the browser instance (singleton)."""
    global _playwright, _browser

    async with _lock:
        if _browser is None or not _browser.is_connected():
            logger.info("browser_starting", browser="chromium")

            _playwright = await async_playwright().start()
            _browser = await _playwright.chromium.launch(
                headless=True,
                args=[
                    "--disable-blink-features=AutomationControlled",
                    "--disable-dev-shm-usage",
                    "--no-sandbox",
                ],
            )

            logger.info("browser_started")

    return _browser


async def close_browser() -> None:
    """Closes the browser and releases resources."""
    global _playwright, _browser

    async with _lock:
        if _browser is not None:
            await _browser.close()
            _browser = None
            logger.info("browser_closed")

        if _playwright is not None:
            await _playwright.stop()
            _playwright = None


@asynccontextmanager
async def get_page() -> AsyncGenerator[Page, None]:
    """Context manager that yields a page from the shared browser."""
    browser = await _get_browser()

    # Create a context with a realistic fingerprint
    ua = UserAgentRotator.get_random()
    context = await browser.new_context(
        user_agent=ua,
        viewport={"width": 1920, "height": 1080},
        locale="pt-BR",
        timezone_id="America/Sao_Paulo",
        extra_http_headers={
            "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
        },
    )

    page = await context.new_page()

    # Hide automation signals
    await page.add_init_script(
        """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        });
        """
    )

    try:
        yield page
    finally:
        await context.close()


async def fetch_with_browser(
    url: str,
    source: str = "unknown",
    wait_selector: str | None = None,
    wait_timeout: float = 30000,
) -> str:
    """
    Fetches a page using a headless browser.

    Bypasses anti-bot protections such as Cloudflare.

    Args:
        url: URL to fetch
        source: Source name (for logging)
        wait_selector: CSS selector to wait for before returning
        wait_timeout: Wait timeout in ms

    Returns:
        HTML of the page

    Raises:
        SourceUnavailableError: If the page cannot be loaded
    """
    logger.info(
        "browser_fetch_start",
        source=source,
        url=url,
    )

    try:
        async with get_page() as page:
            # Navigate to the URL
            response = await page.goto(
                url,
                wait_until="domcontentloaded",
                timeout=wait_timeout,
            )

            if response is None:
                raise SourceUnavailableError(
                    source=source,
                    url=url,
                    last_error="No response received",
                )

            # Wait for a specific selector if one was provided
            if wait_selector:
                try:
                    await page.wait_for_selector(
                        wait_selector,
                        timeout=wait_timeout,
                    )
                except Exception as e:
                    logger.warning(
                        "browser_wait_selector_timeout",
                        selector=wait_selector,
                        error=str(e),
                    )

            # Give Cloudflare time to resolve and JS time to finish
            await page.wait_for_timeout(5000)

            # Check whether Cloudflare blocked the request
            if response.status in (403, 503):
                check_html: str = await page.content()
                # Detect a Cloudflare challenge page
                if "cloudflare" in check_html.lower() or "challenge" in check_html.lower():
                    raise SourceUnavailableError(
                        source=source,
                        url=url,
                        last_error=f"Cloudflare block detected (status {response.status})",
                    )

            # Get the HTML
            html: str = await page.content()

            logger.info(
                "browser_fetch_success",
                source=source,
                url=url,
                content_length=len(html),
                status=response.status,
            )

            return html

    except Exception as e:
        logger.error(
            "browser_fetch_failed",
            source=source,
            url=url,
            error=str(e),
        )
        raise SourceUnavailableError(
            source=source,
            url=url,
            last_error=str(e),
        ) from e


async def fetch_cepea_indicador(produto: str) -> str:
    """
    Fetches a CEPEA indicator page using the browser.

    Args:
        produto: Product name (soja, milho, etc.)

    Returns:
        HTML of the page
    """
    produto_key = constants.CEPEA_PRODUTOS.get(produto.lower(), produto.lower())
    url = f"{constants.URLS[constants.Fonte.CEPEA]['indicadores']}/{produto_key}.aspx"

    return await fetch_with_browser(
        url=url,
        source="cepea",
        wait_selector="table",
        wait_timeout=90000,
    )
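For reference, a minimal usage sketch of this module (not part of the wheel): it assumes Playwright and its Chromium binary are installed, and the URL is a placeholder. fetch_with_browser handles context creation and the Cloudflare checks; close_browser releases the shared browser singleton when the caller is done.

import asyncio

from agrobr.http.browser import close_browser, fetch_with_browser


async def main() -> None:
    # Placeholder URL; the intended targets are pages behind anti-bot protection.
    html = await fetch_with_browser(
        "https://example.com",
        source="example",
        wait_selector="table",
    )
    print(len(html))
    await close_browser()  # release the singleton Playwright/Chromium instance


asyncio.run(main())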
agrobr/http/rate_limiter.py
ADDED

@@ -0,0 +1,69 @@

"""Per-source rate limiter using semaphores."""

from __future__ import annotations

import asyncio
import time
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager

import structlog

from agrobr import constants

logger = structlog.get_logger()


class RateLimiter:
    """Enforces a minimum interval between requests for each source."""

    _semaphores: dict[str, asyncio.Semaphore] = {}
    _last_request: dict[str, float] = {}
    _lock = asyncio.Lock()

    @classmethod
    def _get_delay(cls, source: constants.Fonte) -> float:
        settings = constants.HTTPSettings()
        delays = {
            constants.Fonte.CEPEA: settings.rate_limit_cepea,
            constants.Fonte.CONAB: settings.rate_limit_conab,
            constants.Fonte.IBGE: settings.rate_limit_ibge,
            constants.Fonte.NOTICIAS_AGRICOLAS: settings.rate_limit_noticias_agricolas,
        }
        return delays.get(source, 1.0)

    @classmethod
    @asynccontextmanager
    async def acquire(cls, source: constants.Fonte) -> AsyncIterator[None]:
        """Context manager that enforces rate limiting."""
        source_key = source.value

        async with cls._lock:
            if source_key not in cls._semaphores:
                cls._semaphores[source_key] = asyncio.Semaphore(1)

        async with cls._semaphores[source_key]:
            now = time.monotonic()
            last = cls._last_request.get(source_key, 0)
            delay = cls._get_delay(source)
            elapsed = now - last

            if elapsed < delay:
                wait_time = delay - elapsed
                logger.debug(
                    "rate_limit_wait",
                    source=source_key,
                    wait_seconds=wait_time,
                )
                await asyncio.sleep(wait_time)

            try:
                yield
            finally:
                cls._last_request[source_key] = time.monotonic()

    @classmethod
    def reset(cls) -> None:
        """Resets the rate limiter state."""
        cls._semaphores.clear()
        cls._last_request.clear()
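An illustrative sketch of how the limiter is meant to wrap outgoing requests (not part of the wheel; the URL and the use of httpx here are placeholders): consecutive acquisitions for the same Fonte are spaced by at least the configured per-source delay.

import asyncio

import httpx

from agrobr import constants
from agrobr.http.rate_limiter import RateLimiter


async def fetch_twice(url: str) -> None:
    async with httpx.AsyncClient() as client:
        for _ in range(2):
            # The second pass sleeps until rate_limit_cepea seconds have
            # elapsed since the previous request finished.
            async with RateLimiter.acquire(constants.Fonte.CEPEA):
                await client.get(url)


asyncio.run(fetch_twice("https://example.com"))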
agrobr/http/retry.py
ADDED
@@ -0,0 +1,93 @@

"""Retry with exponential backoff."""

from __future__ import annotations

import asyncio
from collections.abc import Awaitable, Callable, Sequence
from functools import wraps
from typing import Any, TypeVar

import httpx
import structlog

from agrobr import constants

logger = structlog.get_logger()
T = TypeVar("T")

RETRIABLE_EXCEPTIONS: tuple[type[Exception], ...] = (
    httpx.TimeoutException,
    httpx.NetworkError,
    httpx.RemoteProtocolError,
)


async def retry_async(
    func: Callable[[], Awaitable[T]],
    max_attempts: int | None = None,
    base_delay: float | None = None,
    max_delay: float | None = None,
    retriable_exceptions: Sequence[type[Exception]] = RETRIABLE_EXCEPTIONS,
) -> T:
    """Runs an async function with exponential-backoff retry."""
    settings = constants.HTTPSettings()
    max_attempts = max_attempts or settings.max_retries
    base_delay = base_delay or settings.retry_base_delay
    max_delay = max_delay or settings.retry_max_delay

    last_exception: Exception | None = None

    for attempt in range(max_attempts):
        try:
            return await func()

        except tuple(retriable_exceptions) as e:
            last_exception = e
            if attempt < max_attempts - 1:
                delay = min(
                    base_delay * (settings.retry_exponential_base**attempt),
                    max_delay,
                )
                logger.warning(
                    "retry_scheduled",
                    attempt=attempt + 1,
                    max_attempts=max_attempts,
                    delay_seconds=delay,
                    error=str(e),
                )
                await asyncio.sleep(delay)
            else:
                logger.error(
                    "retry_exhausted",
                    attempts=max_attempts,
                    last_error=str(e),
                )

    if last_exception:
        raise last_exception
    raise RuntimeError("Retry logic error: no exception captured")


def with_retry(
    max_attempts: int | None = None,
    base_delay: float | None = None,
) -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]:
    """Decorator for automatic retry."""

    def decorator(func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]:
        @wraps(func)
        async def wrapper(*args: Any, **kwargs: Any) -> T:
            return await retry_async(
                lambda: func(*args, **kwargs),
                max_attempts=max_attempts,
                base_delay=base_delay,
            )

        return wrapper

    return decorator


def should_retry_status(status_code: int) -> bool:
    """Checks whether the status code allows a retry."""
    return status_code in constants.RETRIABLE_STATUS_CODES
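A short sketch of the decorator form (illustrative only; the wrapped coroutine and URL are placeholders): only the exceptions listed in RETRIABLE_EXCEPTIONS trigger another attempt, with delays growing by retry_exponential_base and capped at retry_max_delay.

import asyncio

import httpx

from agrobr.http.retry import with_retry


@with_retry(max_attempts=3, base_delay=0.5)
async def fetch(url: str) -> str:
    async with httpx.AsyncClient() as client:
        # Timeouts, network errors and protocol errors are retried;
        # other exceptions (e.g. HTTP status errors) propagate immediately.
        resp = await client.get(url)
        resp.raise_for_status()
        return resp.text


asyncio.run(fetch("https://example.com"))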
agrobr/http/user_agents.py
ADDED

@@ -0,0 +1,67 @@

"""Rotating pool of User-Agents."""

from __future__ import annotations

import random
from collections.abc import Sequence

USER_AGENT_POOL: Sequence[str] = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:121.0) Gecko/20100101 Firefox/121.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.15",
)

DEFAULT_HEADERS: dict[str, str] = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
}


class UserAgentRotator:
    """Rotates User-Agents deterministically per source."""

    _counters: dict[str, int] = {}

    @classmethod
    def get(cls, source: str | None = None) -> str:
        """Returns the next User-Agent from the pool."""
        key = source or "default"

        if key not in cls._counters:
            cls._counters[key] = random.randint(0, len(USER_AGENT_POOL) - 1)

        ua = USER_AGENT_POOL[cls._counters[key] % len(USER_AGENT_POOL)]
        cls._counters[key] += 1

        return ua

    @classmethod
    def get_random(cls) -> str:
        """Returns a random User-Agent."""
        return random.choice(USER_AGENT_POOL)

    @classmethod
    def get_headers(cls, source: str | None = None) -> dict[str, str]:
        """Returns full headers including a User-Agent."""
        headers = DEFAULT_HEADERS.copy()
        headers["User-Agent"] = cls.get(source)
        return headers

    @classmethod
    def reset(cls) -> None:
        """Resets the counters."""
        cls._counters.clear()
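A small illustrative sketch (not shipped in the package): headers for a given source reuse the browser-like DEFAULT_HEADERS set, and successive calls with the same source key step through USER_AGENT_POOL in order, starting from a random offset.

from agrobr.http.user_agents import UserAgentRotator

headers = UserAgentRotator.get_headers(source="cepea")
print(headers["User-Agent"])  # first UA for this source, random starting offset

headers = UserAgentRotator.get_headers(source="cepea")
print(headers["User-Agent"])  # next UA in the pool for the same source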
agrobr/ibge/__init__.py
ADDED
@@ -0,0 +1,19 @@

"""IBGE module - PAM and LSPA data."""

from __future__ import annotations

from agrobr.ibge.api import (
    lspa,
    pam,
    produtos_lspa,
    produtos_pam,
    ufs,
)

__all__ = [
    "pam",
    "lspa",
    "produtos_pam",
    "produtos_lspa",
    "ufs",
]