ifdata_bcb-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ifdata_bcb/__init__.py +88 -0
- ifdata_bcb/core/__init__.py +9 -0
- ifdata_bcb/core/api.py +23 -0
- ifdata_bcb/core/base_explorer.py +468 -0
- ifdata_bcb/core/constants.py +38 -0
- ifdata_bcb/core/entity_lookup.py +624 -0
- ifdata_bcb/domain/__init__.py +47 -0
- ifdata_bcb/domain/exceptions.py +89 -0
- ifdata_bcb/domain/models.py +14 -0
- ifdata_bcb/domain/types.py +19 -0
- ifdata_bcb/domain/validation.py +80 -0
- ifdata_bcb/infra/__init__.py +46 -0
- ifdata_bcb/infra/cache.py +45 -0
- ifdata_bcb/infra/config.py +33 -0
- ifdata_bcb/infra/log.py +77 -0
- ifdata_bcb/infra/paths.py +25 -0
- ifdata_bcb/infra/query.py +75 -0
- ifdata_bcb/infra/resilience.py +114 -0
- ifdata_bcb/infra/storage.py +134 -0
- ifdata_bcb/providers/__init__.py +33 -0
- ifdata_bcb/providers/base_collector.py +315 -0
- ifdata_bcb/providers/collector_models.py +12 -0
- ifdata_bcb/providers/cosif/__init__.py +4 -0
- ifdata_bcb/providers/cosif/collector.py +195 -0
- ifdata_bcb/providers/cosif/explorer.py +388 -0
- ifdata_bcb/providers/ifdata/__init__.py +13 -0
- ifdata_bcb/providers/ifdata/cadastro_explorer.py +209 -0
- ifdata_bcb/providers/ifdata/collector.py +230 -0
- ifdata_bcb/providers/ifdata/explorer.py +573 -0
- ifdata_bcb/py.typed +0 -0
- ifdata_bcb/ui/__init__.py +6 -0
- ifdata_bcb/ui/display.py +272 -0
- ifdata_bcb/utils/__init__.py +28 -0
- ifdata_bcb/utils/cnpj.py +12 -0
- ifdata_bcb/utils/date.py +102 -0
- ifdata_bcb/utils/fuzzy.py +28 -0
- ifdata_bcb/utils/period.py +35 -0
- ifdata_bcb/utils/text.py +16 -0
- ifdata_bcb-0.1.0.dist-info/METADATA +249 -0
- ifdata_bcb-0.1.0.dist-info/RECORD +42 -0
- ifdata_bcb-0.1.0.dist-info/WHEEL +4 -0
- ifdata_bcb-0.1.0.dist-info/licenses/LICENSE +21 -0
ifdata_bcb/__init__.py
ADDED
@@ -0,0 +1,88 @@
+"""
+ifdata-bcb - Analysis of financial data from the Central Bank of Brazil.
+
+Library for collecting and exploring Brazilian banking data:
+- COSIF: the national chart of accounts (Plano Contabil das Instituicoes do Sistema Financeiro Nacional)
+- IFDATA: quarterly financial information (Informacoes Financeiras Trimestrais)
+
+Usage:
+    import ifdata_bcb as bcb
+
+    # Data collection
+    bcb.cosif.collect('2024-01', '2024-12')
+    bcb.ifdata.collect('2024-01', '2024-12')
+
+    # Search for an institution
+    bcb.search('Itau')  # Returns a DataFrame with CNPJ_8, INSTITUICAO, FONTES, SCORE
+
+    # Queries use the 8-digit CNPJ
+    # start alone = single date; start + end = date range
+    # instituicao and start are REQUIRED
+    df = bcb.ifdata.read(
+        instituicao='60872504',
+        start='2024-12',  # Single date
+        conta='Lucro Liquido',
+    )
+
+    df = bcb.cosif.read(
+        instituicao=['60872504', '60746948'],
+        start='2024-01',
+        end='2024-12',  # Date range
+        conta=['TOTAL GERAL DO ATIVO', 'PATRIMONIO LIQUIDO'],
+    )  # escopo=None searches all scopes
+"""
+
+# Exceptions (BacenAnalysisError = base class for catching all of them)
+from ifdata_bcb.domain.exceptions import (
+    BacenAnalysisError,
+    DataUnavailableError,
+)
+
+# High-level functions
+from ifdata_bcb.core.api import search
+
+# Lazy loading of the explorers
+_cosif = None
+_ifdata = None
+_cadastro = None
+
+
+def __getattr__(name: str):
+    """Lazy loading of the explorers."""
+    global _cosif, _ifdata, _cadastro
+
+    if name == "cosif":
+        if _cosif is None:
+            from ifdata_bcb.providers.cosif.explorer import COSIFExplorer
+
+            _cosif = COSIFExplorer()
+        return _cosif
+
+    if name == "ifdata":
+        if _ifdata is None:
+            from ifdata_bcb.providers.ifdata.explorer import IFDATAExplorer
+
+            _ifdata = IFDATAExplorer()
+        return _ifdata
+
+    if name == "cadastro":
+        if _cadastro is None:
+            from ifdata_bcb.providers.ifdata.cadastro_explorer import CadastroExplorer
+
+            _cadastro = CadastroExplorer()
+        return _cadastro
+
+    raise AttributeError(f"module 'ifdata_bcb' has no attribute '{name}'")
+
+
+__all__ = [
+    # Explorers (lazy)
+    "cosif",
+    "ifdata",
+    "cadastro",
+    # High-level functions
+    "search",
+    # Exceptions (BacenAnalysisError = base)
+    "BacenAnalysisError",
+    "DataUnavailableError",
+]
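As a usage sketch (illustrative, not part of the packaged files): the module-level __getattr__ above implements PEP 562 lazy loading, so the first attribute access imports and instantiates an explorer, and later accesses reuse the cached instance.

    import ifdata_bcb as bcb

    # First access runs __getattr__("cosif"), imports COSIFExplorer,
    # and caches the instance in the module-level _cosif.
    explorer = bcb.cosif
    assert explorer is bcb.cosif  # later accesses return the same object

    # Unknown names fall through to the final raise.
    try:
        bcb.nope
    except AttributeError as exc:
        print(exc)  # module 'ifdata_bcb' has no attribute 'nope'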
ifdata_bcb/core/api.py
ADDED
@@ -0,0 +1,23 @@
+"""High-level functions for the public API."""
+
+import pandas as pd
+
+# Singleton for lazy loading
+_lookup = None
+
+
+def search(termo: str, limit: int = 10) -> pd.DataFrame:
+    """
+    Search for institutions by name across all data sources.
+
+    Use this function to find an institution's 8-digit CNPJ before
+    querying with bcb.cosif.read(), bcb.ifdata.read(), etc.
+
+    Returns a DataFrame with: CNPJ_8, INSTITUICAO, FONTES, SCORE.
+    """
+    global _lookup
+    if _lookup is None:
+        from ifdata_bcb.core.entity_lookup import EntityLookup
+
+        _lookup = EntityLookup()
+    return _lookup.search(termo, limit=limit)
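A sketch of the intended flow (illustrative; assumes data has already been collected): search() resolves a name to its 8-digit CNPJ, which the read() methods then take.

    import ifdata_bcb as bcb

    hits = bcb.search('Itau')  # columns: CNPJ_8, INSTITUICAO, FONTES, SCORE
    cnpj8 = hits.iloc[0]['CNPJ_8']

    df = bcb.ifdata.read(instituicao=cnpj8, start='2024-12')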
ifdata_bcb/core/base_explorer.py
ADDED
@@ -0,0 +1,468 @@
+from abc import ABC, abstractmethod
+from typing import Optional
+
+import pandas as pd
+
+from ifdata_bcb.core.entity_lookup import EntityLookup
+from ifdata_bcb.domain.exceptions import (
+    InvalidDateRangeError,
+    InvalidScopeError,
+    MissingRequiredParameterError,
+)
+from ifdata_bcb.domain.types import DateInput, AccountInput, InstitutionInput
+from ifdata_bcb.domain.validation import (
+    AccountList,
+    InstitutionList,
+    NormalizedDates,
+    ValidatedCnpj8,
+)
+from ifdata_bcb.infra.log import get_logger
+from ifdata_bcb.infra.query import QueryEngine
+from ifdata_bcb.infra.storage import list_parquet_files
+from ifdata_bcb.utils.date import yyyymm_to_datetime
+from ifdata_bcb.utils.text import normalize_accents
+
+
+class BaseExplorer(ABC):
+    """
+    Abstract base class for BCB data Explorers.
+
+    An Explorer combines:
+    - Data collection (via a Collector)
+    - Data queries (via a QueryEngine backed by DuckDB)
+    - Entity resolution (via EntityLookup)
+
+    Subclasses must implement:
+    - _get_subdir(): data subdirectory (single source)
+    - _get_file_prefix(): Parquet file prefix (single source)
+
+    Multi-source (same schema, multiple sources):
+    - Override _get_sources() to return a dict of sources
+    - Example: COSIF with the 'individual' and 'prudencial' scopes
+    - list_periods(), has_data(), describe() accept a source parameter
+
+    read() and collect() have provider-specific signatures,
+    so they are not declared on the base class.
+
+    Column mapping (storage -> presentation):
+    - Subclasses may define _COLUMN_MAP to map storage names
+      to presentation names
+    - Example: _COLUMN_MAP = {"DATA_BASE": "DATA", "NOME_INSTITUICAO": "INSTITUICAO"}
+    """
+
+    # Column mapping: storage_name -> presentation_name
+    # Subclasses should override this with their specific mapping
+    _COLUMN_MAP: dict[str, str] = {}
+
+    # Valid registry columns for enrichment via the cadastro= parameter
+    # Derived from CadastroExplorer._COLUMN_MAP (excluding DATA, CNPJ_8, INSTITUICAO)
+    _VALID_CADASTRO_COLUMNS = {
+        "SEGMENTO",
+        "COD_CONGL_PRUD",
+        "COD_CONGL_FIN",
+        "SITUACAO",
+        "ATIVIDADE",
+        "TCB",
+        "TD",
+        "TC",
+        "UF",
+        "MUNICIPIO",
+        "SR",
+        "DATA_INICIO_ATIVIDADE",
+    }
+
+    def __init__(
+        self,
+        query_engine: Optional[QueryEngine] = None,
+        entity_lookup: Optional[EntityLookup] = None,
+    ):
+        self._qe = query_engine or QueryEngine()
+        self._resolver = entity_lookup or EntityLookup(query_engine=self._qe)
+        self._logger = get_logger(__name__)
+
+    @property
+    def query_engine(self) -> QueryEngine:
+        return self._qe
+
+    @property
+    def resolver(self) -> EntityLookup:
+        return self._resolver
+
+    @property
+    def _reverse_column_map(self) -> dict[str, str]:
+        return {v: k for k, v in self._COLUMN_MAP.items()}
+
+    def _storage_col(self, presentation_col: str) -> str:
+        """Translate a presentation name to a storage name. Returns the original if unmapped."""
+        return self._reverse_column_map.get(presentation_col, presentation_col)
+
+    def _apply_column_mapping(self, df: pd.DataFrame) -> pd.DataFrame:
+        if not self._COLUMN_MAP:
+            return df
+        rename_map = {k: v for k, v in self._COLUMN_MAP.items() if k in df.columns}
+        return df.rename(columns=rename_map) if rename_map else df
+
+    @abstractmethod
+    def _get_subdir(self) -> str: ...
+
+    @abstractmethod
+    def _get_file_prefix(self) -> str: ...
+
+    def _get_sources(self) -> dict[str, dict[str, str]]:
+        """
+        Return the explorer's data sources.
+
+        Override for multiple sources (same schema).
+        Default: single source derived from _get_subdir/_get_file_prefix.
+
+        Returns a dict of the form:
+            {"source_name": {"subdir": "...", "prefix": "..."}}
+        """
+        return {
+            "default": {
+                "subdir": self._get_subdir(),
+                "prefix": self._get_file_prefix(),
+            }
+        }
+
+    @staticmethod
+    def _align_to_quarter_end(yyyymm: int) -> int:
+        """Align YYYYMM to the end of its quarter (03, 06, 09, 12)."""
+        year, month = divmod(yyyymm, 100)
+        quarter_month = ((month - 1) // 3 + 1) * 3
+        return year * 100 + quarter_month
+
+    def _normalize_dates(self, datas: DateInput) -> list[int]:
+        """Accepts int, str, or list. Formats: 202412, '202412', '2024-12'."""
+        result = NormalizedDates(values=datas).values
+        self._logger.debug(f"Dates: {datas} -> {result}")
+        return result
+
+    def _normalize_accounts(
+        self, contas: Optional[AccountInput]
+    ) -> Optional[list[str]]:
+        if contas is None:
+            return None
+        return AccountList(values=contas).values
+
+    def _normalize_institutions(
+        self, instituicoes: Optional[InstitutionInput]
+    ) -> Optional[list[str]]:
+        if instituicoes is None:
+            return None
+        return InstitutionList(values=instituicoes).values
+
+    def _resolve_date_range(
+        self,
+        start: Optional[str],
+        end: Optional[str],
+        trimestral: bool = False,
+    ) -> Optional[list[int]]:
+        """
+        start alone: single date. start + end: range. None: all periods.
+
+        Raises:
+            InvalidDateRangeError: If start > end.
+        """
+        # No date filter
+        if start is None:
+            return None
+
+        # Normalize start to the YYYYMM format
+        start_normalized = self._normalize_dates(start)[0]
+
+        # Single date (start only)
+        if end is None:
+            if trimestral:
+                return [self._align_to_quarter_end(start_normalized)]
+            return [start_normalized]
+
+        # Normalize end and validate the range
+        end_normalized = self._normalize_dates(end)[0]
+        if start_normalized > end_normalized:
+            raise InvalidDateRangeError(start, end)
+
+        # Date range (start + end)
+        from ifdata_bcb.utils.date import (
+            generate_month_range,
+            generate_quarter_range,
+        )
+
+        if trimestral:
+            return generate_quarter_range(start, end)
+        return generate_month_range(start, end)
+
+    def _resolve_entity(self, identificador: str) -> str:
+        """
+        Validate an 8-digit CNPJ.
+
+        Raises:
+            InvalidIdentifierError: If not an 8-digit CNPJ.
+        """
+        validated = ValidatedCnpj8(value=identificador).value
+        self._logger.debug(f"Entity validated: {validated}")
+        return validated
+
+    def _validate_required_params(
+        self,
+        instituicao: Optional[InstitutionInput],
+        start: Optional[str],
+    ) -> None:
+        if instituicao is None:
+            raise MissingRequiredParameterError("instituicao")
+        if start is None:
+            raise MissingRequiredParameterError("start")
+
+    def _validate_cadastro_columns(self, cadastro: Optional[list[str]]) -> None:
+        """Validate registry column names before running any query."""
+        if cadastro is None:
+            return
+        invalid = set(cadastro) - self._VALID_CADASTRO_COLUMNS
+        if invalid:
+            raise InvalidScopeError(
+                "cadastro",
+                str(sorted(invalid)),
+                sorted(self._VALID_CADASTRO_COLUMNS),
+            )
+
+    def _build_string_condition(
+        self,
+        column: str,
+        values: list[str],
+        case_insensitive: bool = False,
+        accent_insensitive: bool = False,
+    ) -> str:
+        """Build a condition for string values, escaping quotes."""
+        escaped = [v.strip().replace("'", "''") for v in values]
+        col_expr = column
+
+        if accent_insensitive:
+            col_expr = f"strip_accents({col_expr})"
+            escaped = [normalize_accents(v) for v in escaped]
+
+        if case_insensitive:
+            col_expr = f"UPPER({col_expr})"
+            escaped = [v.upper() for v in escaped]
+
+        if len(escaped) == 1:
+            return f"{col_expr} = '{escaped[0]}'"
+        values_str = ", ".join(f"'{v}'" for v in escaped)
+        return f"{col_expr} IN ({values_str})"
+
+    def _translate_columns(self, columns: Optional[list[str]]) -> Optional[list[str]]:
+        """Translate presentation names to storage names. Accepts both."""
+        if columns is None:
+            return None
+        return [self._storage_col(c) for c in columns]
+
+    def _build_int_condition(self, column: str, values: list[int]) -> str:
+        """Build a condition for integer values (dates, types, etc.)."""
+        if len(values) == 1:
+            return f"{column} = {values[0]}"
+        values_str = ", ".join(str(v) for v in values)
+        return f"{column} IN ({values_str})"
+
+    def _build_date_condition(
+        self,
+        start: Optional[str],
+        end: Optional[str],
+        trimestral: bool = False,
+    ) -> Optional[str]:
+        """Build a WHERE condition for a date range. Uses the storage name."""
+        datas = self._resolve_date_range(start, end, trimestral=trimestral)
+        if not datas:
+            return None
+        data_col = self._storage_col("DATA")
+        return self._build_int_condition(data_col, datas)
+
+    def _build_cnpj_condition(
+        self,
+        instituicoes: Optional[InstitutionInput],
+        column: str = "CNPJ_8",
+    ) -> Optional[str]:
+        """Build a WHERE condition for CNPJs."""
+        cnpjs = self._normalize_institutions(instituicoes)
+        if not cnpjs:
+            return None
+        return self._build_string_condition(column, cnpjs)
+
+    def _join_conditions(self, conditions: list[Optional[str]]) -> Optional[str]:
+        """Join conditions with AND, skipping None."""
+        valid = [c for c in conditions if c]
+        return " AND ".join(valid) if valid else None
+
+    def _finalize_read(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Apply the column mapping, convert DATA to datetime, and sort."""
+        # Column mapping works even on empty DataFrames
+        df = self._apply_column_mapping(df)
+
+        if df.empty:
+            return df
+
+        df = df.copy()
+        df = df.drop_duplicates()
+
+        if "DATA" in df.columns:
+            df["DATA"] = df["DATA"].apply(yyyymm_to_datetime)
+            df = df.sort_values("DATA", ascending=True).reset_index(drop=True)
+
+        return df
+
+    def _get_latest_period(self, source: Optional[str] = None) -> Optional[int]:
+        """Return the most recent available period, or None."""
+        periods = self.list_periods(source)
+        return periods[-1] if periods else None
+
+    def _list_periods_for_source(self, subdir: str, prefix: str) -> list[int]:
+        """List the periods of a specific source."""
+        files = list_parquet_files(subdir, base_path=self._qe.cache_path)
+        periods = []
+        for f in files:
+            if f.startswith(prefix + "_"):
+                try:
+                    period_str = f.replace(prefix + "_", "")
+                    periods.append(int(period_str))
+                except ValueError:
+                    continue
+        return periods
+
+    def list_periods(self, source: Optional[str] = None) -> list[int]:
+        """
+        List the available periods.
+
+        Args:
+            source: Source name (for multi-source). If None, returns the union of all.
+        """
+        sources = self._get_sources()
+
+        if source:
+            cfg = sources[source]
+            return sorted(self._list_periods_for_source(cfg["subdir"], cfg["prefix"]))
+
+        all_periods: set[int] = set()
+        for cfg in sources.values():
+            all_periods.update(
+                self._list_periods_for_source(cfg["subdir"], cfg["prefix"])
+            )
+        return sorted(all_periods)
+
+    def has_data(self, source: Optional[str] = None) -> bool:
+        """Check whether any data is available."""
+        return len(self.list_periods(source)) > 0
+
+    def describe(self, source: Optional[str] = None) -> dict:
+        """
+        Return explorer info.
+
+        Args:
+            source: Source name (for multi-source). If None, describes all.
+        """
+        sources = self._get_sources()
+
+        if source:
+            cfg = sources[source]
+            periods = self.list_periods(source)
+            return {
+                "source": source,
+                "subdir": cfg["subdir"],
+                "prefix": cfg["prefix"],
+                "periods": periods,
+                "period_count": len(periods),
+                "has_data": len(periods) > 0,
+                "first_period": periods[0] if periods else None,
+                "last_period": periods[-1] if periods else None,
+            }
+
+        # Multi-source: return aggregated info + per-source details
+        all_periods = self.list_periods()
+        result = {
+            "sources": list(sources.keys()),
+            "periods": all_periods,
+            "period_count": len(all_periods),
+            "has_data": len(all_periods) > 0,
+            "first_period": all_periods[0] if all_periods else None,
+            "last_period": all_periods[-1] if all_periods else None,
+            "by_source": {},
+        }
+
+        for name, cfg in sources.items():
+            periods = self.list_periods(name)
+            result["by_source"][name] = {
+                "subdir": cfg["subdir"],
+                "prefix": cfg["prefix"],
+                "period_count": len(periods),
+                "has_data": len(periods) > 0,
+            }
+
+        return result
+
+    def _enrich_with_cadastro(
+        self,
+        df: pd.DataFrame,
+        cadastro_columns: list[str],
+    ) -> pd.DataFrame:
+        """Enrich a financial DataFrame with registry columns.
+
+        Uses a backward-looking temporal merge: each financial row receives
+        the registry attributes of the most recent quarter <= its date.
+        """
+        if df.empty:
+            return df
+
+        # Lazy-load CadastroExplorer (local import to avoid a circular dependency)
+        if not hasattr(self, "_cadastro_explorer"):
+            from ifdata_bcb.providers.ifdata.cadastro_explorer import CadastroExplorer
+
+            self._cadastro_explorer = CadastroExplorer(
+                query_engine=self._qe, entity_lookup=self._resolver
+            )
+
+        cnpjs = df["CNPJ_8"].unique().tolist()
+        min_date = df["DATA"].min()
+        max_date = df["DATA"].max()
+
+        # Fetch registry data with one extra quarter of margin before the start
+        start_str = (min_date - pd.DateOffset(months=3)).strftime("%Y-%m")
+        end_str = max_date.strftime("%Y-%m")
+
+        df_cad = self._cadastro_explorer.read(
+            instituicao=cnpjs,
+            start=start_str,
+            end=end_str,
+        )
+
+        if df_cad.empty:
+            for col in cadastro_columns:
+                df[col] = pd.NA
+            return df
+
+        cad_cols = ["CNPJ_8", "DATA"] + cadastro_columns
+        df_cad = df_cad[[c for c in cad_cols if c in df_cad.columns]]
+
+        # Single-date case: simple merge on CNPJ_8
+        if df["DATA"].nunique() == 1:
+            df_cad_latest = df_cad.sort_values("DATA").drop_duplicates(
+                subset=["CNPJ_8"], keep="last"
+            )
+            merge_cols = [c for c in cadastro_columns if c in df_cad_latest.columns]
+            return df.merge(
+                df_cad_latest[["CNPJ_8"] + merge_cols],
+                on="CNPJ_8",
+                how="left",
+            )
+
+        # Time series: merge_asof for temporal alignment
+        # merge_asof requires sorting by the on= column (DATA) as the primary key
+        df_sorted = df.sort_values("DATA")
+        df_cad_sorted = df_cad.sort_values("DATA")
+
+        merge_cols = [c for c in cadastro_columns if c in df_cad_sorted.columns]
+        result = pd.merge_asof(
+            df_sorted,
+            df_cad_sorted[["CNPJ_8", "DATA"] + merge_cols],
+            on="DATA",
+            by="CNPJ_8",
+            direction="backward",
+        )
+
+        return result.sort_values("DATA").reset_index(drop=True)
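Two sketches for the class above (illustrative, not part of the packaged files). First, the subclass contract from the docstring, with a hypothetical provider; ToyExplorer, "toy/valores", and "toy_val" are invented names:

    from ifdata_bcb.core.base_explorer import BaseExplorer

    class ToyExplorer(BaseExplorer):
        """Hypothetical single-source explorer."""

        _COLUMN_MAP = {"DATA_BASE": "DATA", "NOME_INSTITUICAO": "INSTITUICAO"}

        def _get_subdir(self) -> str:
            return "toy/valores"  # Parquet subdirectory

        def _get_file_prefix(self) -> str:
            return "toy_val"  # files named toy_val_<YYYYMM>.parquet

    # Quarter alignment is pure arithmetic: April (202404) maps to June (202406).
    print(BaseExplorer._align_to_quarter_end(202404))  # 202406

Second, the backward-looking temporal merge used by _enrich_with_cadastro, reduced to plain pandas with toy data:

    import pandas as pd

    cad = pd.DataFrame({  # quarterly registry snapshots
        "CNPJ_8": ["60872504", "60872504"],
        "DATA": pd.to_datetime(["2024-03-31", "2024-06-30"]),
        "SEGMENTO": ["S1", "S2"],
    })
    fin = pd.DataFrame({  # monthly financial rows to enrich
        "CNPJ_8": ["60872504"] * 3,
        "DATA": pd.to_datetime(["2024-04-30", "2024-05-31", "2024-07-31"]),
        "SALDO": [10.0, 11.0, 12.0],
    })

    # Each financial row takes the most recent snapshot with DATA <= its own date.
    out = pd.merge_asof(
        fin.sort_values("DATA"),
        cad.sort_values("DATA"),
        on="DATA",
        by="CNPJ_8",
        direction="backward",
    )
    print(out["SEGMENTO"].tolist())  # ['S1', 'S1', 'S2']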
ifdata_bcb/core/constants.py
ADDED
@@ -0,0 +1,38 @@
+"""Centralized constants for the data sources."""
+
+# Scope -> TipoInstituicao mapping (IFDATA)
+TIPO_INST_MAP: dict[str, int] = {
+    "individual": 3,
+    "prudencial": 1,
+    "financeiro": 2,
+}
+
+# Data source configuration
+DATA_SOURCES: dict[str, dict[str, str]] = {
+    "cadastro": {
+        "subdir": "ifdata/cadastro",
+        "prefix": "ifdata_cad",
+    },
+    "ifdata_valores": {
+        "subdir": "ifdata/valores",
+        "prefix": "ifdata_val",
+    },
+    "cosif_individual": {
+        "subdir": "cosif/individual",
+        "prefix": "cosif_ind",
+    },
+    "cosif_prudencial": {
+        "subdir": "cosif/prudencial",
+        "prefix": "cosif_prud",
+    },
+}
+
+
+def get_pattern(source: str) -> str:
+    """Return the glob pattern for a data source."""
+    return f"{DATA_SOURCES[source]['prefix']}_*.parquet"
+
+
+def get_subdir(source: str) -> str:
+    """Return the subdirectory for a data source."""
+    return DATA_SOURCES[source]["subdir"]