ifdata-bcb 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. ifdata_bcb/__init__.py +88 -0
  2. ifdata_bcb/core/__init__.py +9 -0
  3. ifdata_bcb/core/api.py +23 -0
  4. ifdata_bcb/core/base_explorer.py +468 -0
  5. ifdata_bcb/core/constants.py +38 -0
  6. ifdata_bcb/core/entity_lookup.py +624 -0
  7. ifdata_bcb/domain/__init__.py +47 -0
  8. ifdata_bcb/domain/exceptions.py +89 -0
  9. ifdata_bcb/domain/models.py +14 -0
  10. ifdata_bcb/domain/types.py +19 -0
  11. ifdata_bcb/domain/validation.py +80 -0
  12. ifdata_bcb/infra/__init__.py +46 -0
  13. ifdata_bcb/infra/cache.py +45 -0
  14. ifdata_bcb/infra/config.py +33 -0
  15. ifdata_bcb/infra/log.py +77 -0
  16. ifdata_bcb/infra/paths.py +25 -0
  17. ifdata_bcb/infra/query.py +75 -0
  18. ifdata_bcb/infra/resilience.py +114 -0
  19. ifdata_bcb/infra/storage.py +134 -0
  20. ifdata_bcb/providers/__init__.py +33 -0
  21. ifdata_bcb/providers/base_collector.py +315 -0
  22. ifdata_bcb/providers/collector_models.py +12 -0
  23. ifdata_bcb/providers/cosif/__init__.py +4 -0
  24. ifdata_bcb/providers/cosif/collector.py +195 -0
  25. ifdata_bcb/providers/cosif/explorer.py +388 -0
  26. ifdata_bcb/providers/ifdata/__init__.py +13 -0
  27. ifdata_bcb/providers/ifdata/cadastro_explorer.py +209 -0
  28. ifdata_bcb/providers/ifdata/collector.py +230 -0
  29. ifdata_bcb/providers/ifdata/explorer.py +573 -0
  30. ifdata_bcb/py.typed +0 -0
  31. ifdata_bcb/ui/__init__.py +6 -0
  32. ifdata_bcb/ui/display.py +272 -0
  33. ifdata_bcb/utils/__init__.py +28 -0
  34. ifdata_bcb/utils/cnpj.py +12 -0
  35. ifdata_bcb/utils/date.py +102 -0
  36. ifdata_bcb/utils/fuzzy.py +28 -0
  37. ifdata_bcb/utils/period.py +35 -0
  38. ifdata_bcb/utils/text.py +16 -0
  39. ifdata_bcb-0.1.0.dist-info/METADATA +249 -0
  40. ifdata_bcb-0.1.0.dist-info/RECORD +42 -0
  41. ifdata_bcb-0.1.0.dist-info/WHEEL +4 -0
  42. ifdata_bcb-0.1.0.dist-info/licenses/LICENSE +21 -0
"""
ifdata-bcb - Analysis of financial data from the Central Bank of Brazil.

Library for collecting and exploring Brazilian banking data:
- COSIF: Accounting Plan of the National Financial System Institutions
- IFDATA: Quarterly Financial Information

Usage:
    import ifdata_bcb as bcb

    # Data collection
    bcb.cosif.collect('2024-01', '2024-12')
    bcb.ifdata.collect('2024-01', '2024-12')

    # Look up an institution
    bcb.search('Itau')  # Returns DataFrame with CNPJ_8, INSTITUICAO, FONTES, SCORE

    # Queries use the 8-digit CNPJ
    # start alone = single date; start + end = date range
    # instituicao and start are REQUIRED
    df = bcb.ifdata.read(
        instituicao='60872504',
        start='2024-12',  # Single date
        conta='Lucro Liquido',
    )

    df = bcb.cosif.read(
        instituicao=['60872504', '60746948'],
        start='2024-01',
        end='2024-12',  # Date range
        conta=['TOTAL GERAL DO ATIVO', 'PATRIMONIO LIQUIDO'],
    )  # escopo=None searches every scope
"""

# Exceptions (BacenAnalysisError = base class for catching everything)
from ifdata_bcb.domain.exceptions import (
    BacenAnalysisError,
    DataUnavailableError,
)

# High-level functions
from ifdata_bcb.core.api import search

# Lazily-created explorer singletons (built on first attribute access)
_cosif = None
_ifdata = None
_cadastro = None


def __getattr__(name: str):
    """Module-level lazy loading of the explorer singletons (PEP 562).

    Each explorer is imported and instantiated only the first time the
    corresponding module attribute is accessed, then cached for reuse.

    Raises:
        AttributeError: if *name* is not one of the lazy attributes.
    """
    global _cosif, _ifdata, _cadastro

    if name == "cosif":
        if _cosif is None:
            from ifdata_bcb.providers.cosif.explorer import COSIFExplorer

            _cosif = COSIFExplorer()
        explorer = _cosif
    elif name == "ifdata":
        if _ifdata is None:
            from ifdata_bcb.providers.ifdata.explorer import IFDATAExplorer

            _ifdata = IFDATAExplorer()
        explorer = _ifdata
    elif name == "cadastro":
        if _cadastro is None:
            from ifdata_bcb.providers.ifdata.cadastro_explorer import CadastroExplorer

            _cadastro = CadastroExplorer()
        explorer = _cadastro
    else:
        raise AttributeError(f"module 'ifdata_bcb' has no attribute '{name}'")

    return explorer


__all__ = [
    # Explorers (lazy)
    "cosif",
    "ifdata",
    "cadastro",
    # High-level functions
    "search",
    # Exceptions (BacenAnalysisError = base)
    "BacenAnalysisError",
    "DataUnavailableError",
]
@@ -0,0 +1,9 @@
1
"""Core package: public search API, explorer base class, and entity lookup."""

from ifdata_bcb.core.api import search
from ifdata_bcb.core.base_explorer import BaseExplorer
from ifdata_bcb.core.entity_lookup import EntityLookup

__all__ = [
    "search",
    "BaseExplorer",
    "EntityLookup",
]
"""High-level functions of the public API."""

import pandas as pd

# Shared EntityLookup instance, created lazily on first search()
_lookup = None


def _get_lookup():
    """Return the shared EntityLookup, creating it on first use.

    The import is deferred so that merely importing this module stays cheap.
    """
    global _lookup
    if _lookup is None:
        from ifdata_bcb.core.entity_lookup import EntityLookup

        _lookup = EntityLookup()
    return _lookup


def search(termo: str, limit: int = 10) -> pd.DataFrame:
    """Search for institutions by name across all data sources.

    Use this function to find an institution's 8-digit CNPJ before
    querying with bcb.cosif.read(), bcb.ifdata.read(), etc.

    Returns a DataFrame with: CNPJ_8, INSTITUICAO, FONTES, SCORE.
    """
    return _get_lookup().search(termo, limit=limit)
@@ -0,0 +1,468 @@
1
from abc import ABC, abstractmethod
from typing import Optional

import pandas as pd

from ifdata_bcb.core.entity_lookup import EntityLookup
from ifdata_bcb.domain.exceptions import (
    InvalidDateRangeError,
    InvalidScopeError,
    MissingRequiredParameterError,
)
from ifdata_bcb.domain.types import DateInput, AccountInput, InstitutionInput
from ifdata_bcb.domain.validation import (
    AccountList,
    InstitutionList,
    NormalizedDates,
    ValidatedCnpj8,
)
from ifdata_bcb.infra.log import get_logger
from ifdata_bcb.infra.query import QueryEngine
from ifdata_bcb.infra.storage import list_parquet_files
from ifdata_bcb.utils.date import yyyymm_to_datetime
from ifdata_bcb.utils.text import normalize_accents


class BaseExplorer(ABC):
    """
    Abstract base class for BCB data Explorers.

    An Explorer combines:
    - Data collection (via a Collector)
    - Data queries (via QueryEngine with DuckDB)
    - Entity resolution (via EntityLookup)

    Subclasses must implement:
    - _get_subdir(): data subdirectory (single source)
    - _get_file_prefix(): Parquet file prefix (single source)

    Multi-source (same schema, multiple sources):
    - Override _get_sources() to return a dict of sources
    - Example: COSIF with the 'individual' and 'prudencial' scopes
    - list_periods(), has_data() and describe() accept a `source` parameter

    read() and collect() have provider-specific signatures, so they are
    not declared on this base class.

    Column mapping (storage -> presentation):
    - Subclasses may define _COLUMN_MAP to map storage column names to
      presentation names
    - Example: _COLUMN_MAP = {"DATA_BASE": "DATA", "NOME_INSTITUICAO": "INSTITUICAO"}
    """

    # Column mapping: storage_name -> presentation_name.
    # Subclasses should override with their specific mapping.
    _COLUMN_MAP: dict[str, str] = {}

    # Registry (cadastro) columns accepted by the cadastro= enrichment parameter.
    # Derived from CadastroExplorer._COLUMN_MAP (excluding DATA, CNPJ_8, INSTITUICAO).
    _VALID_CADASTRO_COLUMNS = {
        "SEGMENTO",
        "COD_CONGL_PRUD",
        "COD_CONGL_FIN",
        "SITUACAO",
        "ATIVIDADE",
        "TCB",
        "TD",
        "TC",
        "UF",
        "MUNICIPIO",
        "SR",
        "DATA_INICIO_ATIVIDADE",
    }

    def __init__(
        self,
        query_engine: Optional[QueryEngine] = None,
        entity_lookup: Optional[EntityLookup] = None,
    ):
        # Both collaborators are injectable for testing; defaults share the
        # same QueryEngine so lookups and reads hit the same cache.
        self._qe = query_engine or QueryEngine()
        self._resolver = entity_lookup or EntityLookup(query_engine=self._qe)
        self._logger = get_logger(__name__)

    @property
    def query_engine(self) -> QueryEngine:
        """The QueryEngine used for DuckDB queries."""
        return self._qe

    @property
    def resolver(self) -> EntityLookup:
        """The EntityLookup used for institution resolution."""
        return self._resolver

    @property
    def _reverse_column_map(self) -> dict[str, str]:
        # presentation_name -> storage_name (inverse of _COLUMN_MAP)
        return {v: k for k, v in self._COLUMN_MAP.items()}

    def _storage_col(self, presentation_col: str) -> str:
        """Translate a presentation name to a storage name. Returns the input unchanged if unmapped."""
        return self._reverse_column_map.get(presentation_col, presentation_col)

    def _apply_column_mapping(self, df: pd.DataFrame) -> pd.DataFrame:
        """Rename storage columns to their presentation names per _COLUMN_MAP."""
        if not self._COLUMN_MAP:
            return df
        # Only rename columns actually present in the frame.
        rename_map = {k: v for k, v in self._COLUMN_MAP.items() if k in df.columns}
        return df.rename(columns=rename_map) if rename_map else df

    @abstractmethod
    def _get_subdir(self) -> str: ...  # storage subdirectory for the single-source case

    @abstractmethod
    def _get_file_prefix(self) -> str: ...  # Parquet file prefix for the single-source case

    def _get_sources(self) -> dict[str, dict[str, str]]:
        """
        Return the explorer's data sources.

        Override for multiple sources (same schema).
        Default: a single source derived from _get_subdir/_get_file_prefix.

        Returns a dict of the form:
            {"source_name": {"subdir": "...", "prefix": "..."}}
        """
        return {
            "default": {
                "subdir": self._get_subdir(),
                "prefix": self._get_file_prefix(),
            }
        }

    @staticmethod
    def _align_to_quarter_end(yyyymm: int) -> int:
        """Align a YYYYMM integer to the end of its quarter (month 03, 06, 09 or 12)."""
        year, month = divmod(yyyymm, 100)
        # Map month 1-3 -> 3, 4-6 -> 6, 7-9 -> 9, 10-12 -> 12.
        quarter_month = ((month - 1) // 3 + 1) * 3
        return year * 100 + quarter_month

    def _normalize_dates(self, datas: DateInput) -> list[int]:
        """Accepts int, str, or a list. Formats: 202412, '202412', '2024-12'."""
        result = NormalizedDates(values=datas).values
        self._logger.debug(f"Dates: {datas} -> {result}")
        return result

    def _normalize_accounts(
        self, contas: Optional[AccountInput]
    ) -> Optional[list[str]]:
        """Normalize account input into a list of strings; None passes through."""
        if contas is None:
            return None
        return AccountList(values=contas).values

    def _normalize_institutions(
        self, instituicoes: Optional[InstitutionInput]
    ) -> Optional[list[str]]:
        """Normalize institution input into a list of strings; None passes through."""
        if instituicoes is None:
            return None
        return InstitutionList(values=instituicoes).values

    def _resolve_date_range(
        self,
        start: Optional[str],
        end: Optional[str],
        trimestral: bool = False,
    ) -> Optional[list[int]]:
        """
        start alone: single date. start + end: range. None: all periods.

        Raises:
            InvalidDateRangeError: if start > end.
        """
        # No date filter at all.
        if start is None:
            return None

        # Normalize start to YYYYMM form.
        start_normalized = self._normalize_dates(start)[0]

        # Single date (start only); quarterly data snaps to quarter end.
        if end is None:
            if trimestral:
                return [self._align_to_quarter_end(start_normalized)]
            return [start_normalized]

        # Normalize end and validate the range order.
        end_normalized = self._normalize_dates(end)[0]
        if start_normalized > end_normalized:
            raise InvalidDateRangeError(start, end)

        # Date range (start + end). Local import: these helpers are only
        # needed on this path.
        from ifdata_bcb.utils.date import (
            generate_month_range,
            generate_quarter_range,
        )

        if trimestral:
            return generate_quarter_range(start, end)
        return generate_month_range(start, end)

    def _resolve_entity(self, identificador: str) -> str:
        """
        Validate an 8-digit CNPJ.

        Raises:
            InvalidIdentifierError: if the value is not an 8-digit CNPJ.
        """
        validated = ValidatedCnpj8(value=identificador).value
        self._logger.debug(f"Entity validated: {validated}")
        return validated

    def _validate_required_params(
        self,
        instituicao: Optional[InstitutionInput],
        start: Optional[str],
    ) -> None:
        """Raise MissingRequiredParameterError when a mandatory read() argument is absent."""
        if instituicao is None:
            raise MissingRequiredParameterError("instituicao")
        if start is None:
            raise MissingRequiredParameterError("start")

    def _validate_cadastro_columns(self, cadastro: Optional[list[str]]) -> None:
        """Validate registry column names before running any query."""
        if cadastro is None:
            return
        invalid = set(cadastro) - self._VALID_CADASTRO_COLUMNS
        if invalid:
            raise InvalidScopeError(
                "cadastro",
                str(sorted(invalid)),
                sorted(self._VALID_CADASTRO_COLUMNS),
            )

    def _build_string_condition(
        self,
        column: str,
        values: list[str],
        case_insensitive: bool = False,
        accent_insensitive: bool = False,
    ) -> str:
        """Build a SQL condition for string values, escaping single quotes."""
        # Doubling single quotes is the standard SQL literal escape.
        escaped = [v.strip().replace("'", "''") for v in values]
        col_expr = column

        if accent_insensitive:
            # DuckDB strip_accents on the column; mirror it on the values.
            col_expr = f"strip_accents({col_expr})"
            escaped = [normalize_accents(v) for v in escaped]

        if case_insensitive:
            col_expr = f"UPPER({col_expr})"
            escaped = [v.upper() for v in escaped]

        if len(escaped) == 1:
            return f"{col_expr} = '{escaped[0]}'"
        values_str = ", ".join(f"'{v}'" for v in escaped)
        return f"{col_expr} IN ({values_str})"

    def _translate_columns(self, columns: Optional[list[str]]) -> Optional[list[str]]:
        """Translate presentation names to storage names. Accepts either form."""
        if columns is None:
            return None
        return [self._storage_col(c) for c in columns]

    def _build_int_condition(self, column: str, values: list[int]) -> str:
        """Build a SQL condition for integer values (dates, types, etc.)."""
        if len(values) == 1:
            return f"{column} = {values[0]}"
        values_str = ", ".join(str(v) for v in values)
        return f"{column} IN ({values_str})"

    def _build_date_condition(
        self,
        start: Optional[str],
        end: Optional[str],
        trimestral: bool = False,
    ) -> Optional[str]:
        """Build a WHERE condition for a date range. Uses the storage column name."""
        datas = self._resolve_date_range(start, end, trimestral=trimestral)
        if not datas:
            return None
        data_col = self._storage_col("DATA")
        return self._build_int_condition(data_col, datas)

    def _build_cnpj_condition(
        self,
        instituicoes: Optional[InstitutionInput],
        column: str = "CNPJ_8",
    ) -> Optional[str]:
        """Build a WHERE condition for CNPJs; None when no filter applies."""
        cnpjs = self._normalize_institutions(instituicoes)
        if not cnpjs:
            return None
        return self._build_string_condition(column, cnpjs)

    def _join_conditions(self, conditions: list[Optional[str]]) -> Optional[str]:
        """Join conditions with AND, ignoring None/empty entries."""
        valid = [c for c in conditions if c]
        return " AND ".join(valid) if valid else None

    def _finalize_read(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply the column mapping, convert DATA to datetime, and sort."""
        # Column mapping works even on empty DataFrames.
        df = self._apply_column_mapping(df)

        if df.empty:
            return df

        # Copy first so the caller's frame is never mutated below.
        df = df.copy()
        df = df.drop_duplicates()

        if "DATA" in df.columns:
            df["DATA"] = df["DATA"].apply(yyyymm_to_datetime)
            df = df.sort_values("DATA", ascending=True).reset_index(drop=True)

        return df

    def _get_latest_period(self, source: Optional[str] = None) -> Optional[int]:
        """Return the most recent available period, or None."""
        periods = self.list_periods(source)
        return periods[-1] if periods else None

    def _list_periods_for_source(self, subdir: str, prefix: str) -> list[int]:
        """List the periods of one specific source by scanning its Parquet files."""
        files = list_parquet_files(subdir, base_path=self._qe.cache_path)
        periods = []
        # NOTE(review): assumes list_parquet_files yields bare stems like
        # "<prefix>_<YYYYMM>" (no ".parquet" extension) — otherwise int()
        # raises ValueError and every file would be silently skipped. TODO confirm.
        for f in files:
            if f.startswith(prefix + "_"):
                try:
                    period_str = f.replace(prefix + "_", "")
                    periods.append(int(period_str))
                except ValueError:
                    # Non-period files sharing the prefix are ignored.
                    continue
        return periods

    def list_periods(self, source: Optional[str] = None) -> list[int]:
        """
        List the available periods.

        Args:
            source: Source name (for multi-source). If None, returns the union of all.
        """
        sources = self._get_sources()

        if source:
            cfg = sources[source]
            return sorted(self._list_periods_for_source(cfg["subdir"], cfg["prefix"]))

        all_periods: set[int] = set()
        for cfg in sources.values():
            all_periods.update(
                self._list_periods_for_source(cfg["subdir"], cfg["prefix"])
            )
        return sorted(all_periods)

    def has_data(self, source: Optional[str] = None) -> bool:
        """Return True when at least one period is available."""
        return len(self.list_periods(source)) > 0

    def describe(self, source: Optional[str] = None) -> dict:
        """
        Return information about the explorer's data.

        Args:
            source: Source name (for multi-source). If None, describes all sources.
        """
        sources = self._get_sources()

        if source:
            cfg = sources[source]
            periods = self.list_periods(source)
            return {
                "source": source,
                "subdir": cfg["subdir"],
                "prefix": cfg["prefix"],
                "periods": periods,
                "period_count": len(periods),
                "has_data": len(periods) > 0,
                "first_period": periods[0] if periods else None,
                "last_period": periods[-1] if periods else None,
            }

        # Multi-source: aggregated info plus per-source details.
        all_periods = self.list_periods()
        result = {
            "sources": list(sources.keys()),
            "periods": all_periods,
            "period_count": len(all_periods),
            "has_data": len(all_periods) > 0,
            "first_period": all_periods[0] if all_periods else None,
            "last_period": all_periods[-1] if all_periods else None,
            "by_source": {},
        }

        for name, cfg in sources.items():
            periods = self.list_periods(name)
            result["by_source"][name] = {
                "subdir": cfg["subdir"],
                "prefix": cfg["prefix"],
                "period_count": len(periods),
                "has_data": len(periods) > 0,
            }

        return result

    def _enrich_with_cadastro(
        self,
        df: pd.DataFrame,
        cadastro_columns: list[str],
    ) -> pd.DataFrame:
        """Enrich a financial DataFrame with registry (cadastro) columns.

        Uses a backward-looking temporal merge: each financial row receives
        the registry attributes of the most recent quarter <= its date.
        """
        if df.empty:
            return df

        # Lazy-load CadastroExplorer (local import avoids a circular import).
        if not hasattr(self, "_cadastro_explorer"):
            from ifdata_bcb.providers.ifdata.cadastro_explorer import CadastroExplorer

            self._cadastro_explorer = CadastroExplorer(
                query_engine=self._qe, entity_lookup=self._resolver
            )

        cnpjs = df["CNPJ_8"].unique().tolist()
        min_date = df["DATA"].min()
        max_date = df["DATA"].max()

        # Fetch registry data with one extra quarter of margin before min_date,
        # so the backward merge has a candidate for the earliest rows.
        start_str = (min_date - pd.DateOffset(months=3)).strftime("%Y-%m")
        end_str = max_date.strftime("%Y-%m")

        df_cad = self._cadastro_explorer.read(
            instituicao=cnpjs,
            start=start_str,
            end=end_str,
        )

        if df_cad.empty:
            # NOTE(review): this assigns columns on the caller's frame in
            # place (no .copy()) — confirm callers expect that mutation.
            for col in cadastro_columns:
                df[col] = pd.NA
            return df

        cad_cols = ["CNPJ_8", "DATA"] + cadastro_columns
        df_cad = df_cad[[c for c in cad_cols if c in df_cad.columns]]

        # Single-date case: a plain merge on CNPJ_8, keeping the latest
        # registry snapshot per institution.
        if df["DATA"].nunique() == 1:
            df_cad_latest = df_cad.sort_values("DATA").drop_duplicates(
                subset=["CNPJ_8"], keep="last"
            )
            merge_cols = [c for c in cadastro_columns if c in df_cad_latest.columns]
            return df.merge(
                df_cad_latest[["CNPJ_8"] + merge_cols],
                on="CNPJ_8",
                how="left",
            )

        # Time-series case: merge_asof for temporal alignment.
        # merge_asof requires both frames sorted by the on= column (DATA).
        df_sorted = df.sort_values("DATA")
        df_cad_sorted = df_cad.sort_values("DATA")

        merge_cols = [c for c in cadastro_columns if c in df_cad_sorted.columns]
        result = pd.merge_asof(
            df_sorted,
            df_cad_sorted[["CNPJ_8", "DATA"] + merge_cols],
            on="DATA",
            by="CNPJ_8",
            direction="backward",
        )

        return result.sort_values("DATA").reset_index(drop=True)
@@ -0,0 +1,38 @@
1
+ """Constantes centralizadas para fontes de dados."""
2
+
3
+ # Mapeamento escopo -> TipoInstituicao (IFDATA)
4
+ TIPO_INST_MAP: dict[str, int] = {
5
+ "individual": 3,
6
+ "prudencial": 1,
7
+ "financeiro": 2,
8
+ }
9
+
10
+ # Configuracao das fontes de dados
11
+ DATA_SOURCES: dict[str, dict[str, str]] = {
12
+ "cadastro": {
13
+ "subdir": "ifdata/cadastro",
14
+ "prefix": "ifdata_cad",
15
+ },
16
+ "ifdata_valores": {
17
+ "subdir": "ifdata/valores",
18
+ "prefix": "ifdata_val",
19
+ },
20
+ "cosif_individual": {
21
+ "subdir": "cosif/individual",
22
+ "prefix": "cosif_ind",
23
+ },
24
+ "cosif_prudencial": {
25
+ "subdir": "cosif/prudencial",
26
+ "prefix": "cosif_prud",
27
+ },
28
+ }
29
+
30
+
31
+ def get_pattern(source: str) -> str:
32
+ """Retorna pattern glob para fonte de dados."""
33
+ return f"{DATA_SOURCES[source]['prefix']}_*.parquet"
34
+
35
+
36
+ def get_subdir(source: str) -> str:
37
+ """Retorna subdiretorio para fonte de dados."""
38
+ return DATA_SOURCES[source]["subdir"]