luxorasap 0.1.38__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {luxorasap-0.1.38 → luxorasap-0.2.0}/PKG-INFO +1 -1
- {luxorasap-0.1.38 → luxorasap-0.2.0}/pyproject.toml +2 -2
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap/__init__.py +1 -1
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap/datareader/core.py +9 -10
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap/ingest/cloud/__init__.py +26 -5
- luxorasap-0.2.0/src/luxorasap/utils/storage/__init__.py +12 -0
- luxorasap-0.2.0/src/luxorasap/utils/storage/blob.py +263 -0
- luxorasap-0.2.0/src/luxorasap/utils/storage/change_tracker.py +294 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap.egg-info/PKG-INFO +1 -1
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap.egg-info/SOURCES.txt +4 -1
- luxorasap-0.2.0/tests/test_utils_change_tracker.py +180 -0
- luxorasap-0.2.0/tests/tests_utils_pickle_excel.py +92 -0
- luxorasap-0.1.38/src/luxorasap/utils/storage/__init__.py +0 -2
- luxorasap-0.1.38/src/luxorasap/utils/storage/blob.py +0 -124
- {luxorasap-0.1.38 → luxorasap-0.2.0}/README.md +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/setup.cfg +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap/btgapi/__init__.py +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap/btgapi/auth.py +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap/btgapi/reports.py +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap/btgapi/trades.py +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap/datareader/__init__.py +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap/ingest/__init__.py +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap/ingest/legacy_local/dataloader.py +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap/utils/__init__.py +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap/utils/dataframe/__init__.py +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap/utils/dataframe/reader.py +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap/utils/dataframe/transforms.py +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap/utils/tools/__init__.py +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap/utils/tools/excel.py +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap.egg-info/dependency_links.txt +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap.egg-info/entry_points.txt +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap.egg-info/requires.txt +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/src/luxorasap.egg-info/top_level.txt +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/tests/test_btgapi_auth.py +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/tests/test_btgapi_reports.py +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/tests/test_btgapi_trades.py +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/tests/test_datareader.py +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/tests/test_ingest_cloud.py +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/tests/test_ingest_legacy_local.py +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/tests/test_utils_dataframe.py +0 -0
- {luxorasap-0.1.38 → luxorasap-0.2.0}/tests/test_utils_storage.py +0 -0
|
@@ -10,7 +10,7 @@ build-backend = "setuptools.build_meta"
|
|
|
10
10
|
#############################
|
|
11
11
|
[project]
|
|
12
12
|
name = "luxorasap"
|
|
13
|
-
version = "0.
|
|
13
|
+
version = "0.2.0"
|
|
14
14
|
description = "Toolbox da Luxor para ingestão, análise e automação de dados financeiros."
|
|
15
15
|
readme = "README.md"
|
|
16
16
|
requires-python = ">=3.9"
|
|
@@ -78,7 +78,7 @@ exclude = ["tests*"]
|
|
|
78
78
|
# bumpver (sem-ver)
|
|
79
79
|
#############################
|
|
80
80
|
[tool.bumpver]
|
|
81
|
-
current_version = "0.
|
|
81
|
+
current_version = "0.2.0"
|
|
82
82
|
version_pattern = "MAJOR.MINOR.PATCH"
|
|
83
83
|
|
|
84
84
|
# regex explícito – obrigatório no bumpver 2024+
|
|
@@ -13,7 +13,7 @@ from types import ModuleType
|
|
|
13
13
|
try:
|
|
14
14
|
__version__: str = metadata.version(__name__)
|
|
15
15
|
except metadata.PackageNotFoundError: # editable install
|
|
16
|
-
__version__ = "0.
|
|
16
|
+
__version__ = "0.2.0"
|
|
17
17
|
|
|
18
18
|
# ─── Lazy loader ─────────────────────────────────────────────────
|
|
19
19
|
def __getattr__(name: str) -> ModuleType:
|
|
@@ -24,12 +24,15 @@ load_dotenv()
|
|
|
24
24
|
#@logger.catch
|
|
25
25
|
class LuxorQuery:
|
|
26
26
|
|
|
27
|
-
|
|
28
|
-
|
|
27
|
+
# Criando construtor com docstring detalhada
|
|
28
|
+
def __init__(self, blob_directory='enriched/parquet', adls_connection_string:str=None,
|
|
29
|
+
container_name="luxorasap"):
|
|
29
30
|
"""
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
31
|
+
Classe para consulta de dados da Luxor.
|
|
32
|
+
Args:
|
|
33
|
+
blob_directory (str, optional): Diretório no blob onde estão as tabelas. Defaults to 'enriched/parquet'.
|
|
34
|
+
adls_connection_string (str, optional): String de conexão com o ADLS. Se None, usa variável de ambiente. Defaults to None.
|
|
35
|
+
container_name (str, optional): Nome do container no blob. Defaults to "luxorasap".
|
|
33
36
|
"""
|
|
34
37
|
|
|
35
38
|
self.blob_client = BlobParquetClient(adls_connection_string=adls_connection_string,
|
|
@@ -38,10 +41,8 @@ class LuxorQuery:
|
|
|
38
41
|
|
|
39
42
|
|
|
40
43
|
self.modified_tables = []
|
|
41
|
-
self.is_develop_mode = is_develop_mode
|
|
42
44
|
|
|
43
|
-
|
|
44
|
-
self.tables_path = tables_path
|
|
45
|
+
|
|
45
46
|
#if tables_path is None:
|
|
46
47
|
# self.tables_path = self.__set_tables_path()
|
|
47
48
|
|
|
@@ -54,8 +55,6 @@ class LuxorQuery:
|
|
|
54
55
|
self.lipi_manga_incorp_date = dt.date(2022,12,9)
|
|
55
56
|
|
|
56
57
|
|
|
57
|
-
self.update_modes_name = {"standard" : 0, "optimized" : 1}
|
|
58
|
-
self.update_mode = self.update_modes_name[update_mode]
|
|
59
58
|
self.update() # Nessa 1° exec. vai inicializar os dicionarios acima
|
|
60
59
|
|
|
61
60
|
|
|
@@ -4,7 +4,7 @@ import pandas as pd
|
|
|
4
4
|
import datetime as dt
|
|
5
5
|
import numpy as np
|
|
6
6
|
|
|
7
|
-
from luxorasap.utils.storage import BlobParquetClient
|
|
7
|
+
from luxorasap.utils.storage import BlobParquetClient, BlobExcelClient, BlobPickleClient
|
|
8
8
|
from luxorasap.utils.dataframe import prep_for_save, astype_str_inplace
|
|
9
9
|
from luxorasap.datareader import LuxorQuery
|
|
10
10
|
|
|
@@ -12,6 +12,8 @@ from luxorasap.datareader import LuxorQuery
|
|
|
12
12
|
__all__ = ["save_table", "incremental_load"]
|
|
13
13
|
|
|
14
14
|
_client = BlobParquetClient() # instância única para o módulo
|
|
15
|
+
_client_excel = None
|
|
16
|
+
_client_pickle = None
|
|
15
17
|
|
|
16
18
|
|
|
17
19
|
# ────────────────────────────────────────────────────────────────
|
|
@@ -23,7 +25,8 @@ def save_table(
|
|
|
23
25
|
index_name: str = "index",
|
|
24
26
|
normalize_columns: bool = True,
|
|
25
27
|
directory: str = "enriched/parquet",
|
|
26
|
-
override=False
|
|
28
|
+
override=False,
|
|
29
|
+
format='parquet'
|
|
27
30
|
):
|
|
28
31
|
"""Salva DataFrame como Parquet em ADLS (sobrescrevendo)."""
|
|
29
32
|
|
|
@@ -43,9 +46,27 @@ def save_table(
|
|
|
43
46
|
|
|
44
47
|
df = prep_for_save(df, index=index, index_name=index_name, normalize=normalize_columns)
|
|
45
48
|
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
+
if format == 'parquet':
|
|
50
|
+
#_client.write_df(df.astype(str), f"{directory}/{table_name}.parquet")
|
|
51
|
+
astype_str_inplace(df)
|
|
52
|
+
_client.write_df(df, f"{directory}/{table_name}.parquet")
|
|
53
|
+
|
|
54
|
+
elif format == 'excel':
|
|
55
|
+
global _client_excel
|
|
56
|
+
if _client_excel is None:
|
|
57
|
+
_client_excel = BlobExcelClient()
|
|
58
|
+
if index:
|
|
59
|
+
df = df.reset_index().rename(columns={"index": index_name})
|
|
60
|
+
_client_excel.write_excel(df, f"{directory}/{table_name}.xlsx")
|
|
61
|
+
|
|
62
|
+
elif format == 'pickle':
|
|
63
|
+
global _client_pickle
|
|
64
|
+
if _client_pickle is None:
|
|
65
|
+
_client_pickle = BlobPickleClient()
|
|
66
|
+
_client_pickle.write_pickle(df, f"{directory}/{table_name}.pkl")
|
|
67
|
+
|
|
68
|
+
else:
|
|
69
|
+
raise ValueError(f"Formato '{format}' não suportado. Use 'parquet', 'excel' ou 'pickle'.")
|
|
49
70
|
|
|
50
71
|
|
|
51
72
|
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
from .blob import BlobParquetClient, BlobPickleClient, BlobExcelClient, delete_blob, list_blob_files
|
|
2
|
+
from .change_tracker import BlobChangeWatcher, BlobMetadata
|
|
3
|
+
|
|
4
|
+
__all__ = [
|
|
5
|
+
"BlobParquetClient",
|
|
6
|
+
"BlobPickleClient",
|
|
7
|
+
"BlobExcelClient",
|
|
8
|
+
"delete_blob",
|
|
9
|
+
"list_blob_files",
|
|
10
|
+
"BlobChangeWatcher",
|
|
11
|
+
"BlobMetadata",
|
|
12
|
+
]
|
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
import io, os
|
|
2
|
+
from pathlib import PurePosixPath
|
|
3
|
+
from datetime import timezone
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import pyarrow as pa, pyarrow.parquet as pq
|
|
6
|
+
import pickle
|
|
7
|
+
import re
|
|
8
|
+
|
|
9
|
+
from azure.storage.blob import BlobServiceClient
|
|
10
|
+
from azure.core.exceptions import ResourceNotFoundError
|
|
11
|
+
|
|
12
|
+
from ..dataframe import read_bytes
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class BlobParquetClient:
    """Read/write Parquet blobs on Azure Blob Storage — stateless & reusable.

    All read-side methods are best-effort: failures surface as boolean flags,
    sentinel values or empty results rather than exceptions, and callers are
    expected to check them.
    """

    def __init__(self, container: str = "luxorasap", adls_connection_string: str = None):
        """
        Args:
            container: Blob container name.
            adls_connection_string: ADLS connection string; when None, falls
                back to the AZURE_STORAGE_CONNECTION_STRING env variable.

        Raises:
            RuntimeError: If no connection string could be resolved.
        """
        if adls_connection_string is None:
            adls_connection_string = os.getenv('AZURE_STORAGE_CONNECTION_STRING')

        if adls_connection_string is None:
            raise RuntimeError("AZURE_STORAGE_CONNECTION_STRING not set")
        self._svc = BlobServiceClient.from_connection_string(adls_connection_string)
        self._container = container

    # ---------- public API ----------
    def read_df(self, blob_path: str) -> "tuple[pd.DataFrame, bool]":
        """Download *blob_path* and decode it into a DataFrame.

        Returns:
            (df, True) on success; (None, False) on any failure
            (missing blob, network error, decode error).
        """
        buf = io.BytesIO()
        try:
            self._blob(blob_path).download_blob().readinto(buf)
            return (
                read_bytes(buf.getvalue(), filename=PurePosixPath(blob_path).name),
                True,
            )
        except Exception:
            # Best-effort contract: callers inspect the boolean flag.
            return None, False

    def write_df(self, df, blob_path: str):
        """Serialize *df* as Parquet and upload it, overwriting any existing blob."""
        blob = self._blob(blob_path)
        table = pa.Table.from_pandas(df)
        buf = io.BytesIO()
        pq.write_table(table, buf)
        buf.seek(0)
        blob.upload_blob(buf, overwrite=True)

    def get_df_update_time(self, blob_path: str) -> float:
        """Return the blob's last-modified time as a POSIX timestamp (UTC).

        Returns 0.0 when the blob does not exist or its properties cannot be read.
        """
        try:
            properties = self._blob(blob_path).get_blob_properties()
            return properties['last_modified'].replace(tzinfo=timezone.utc).timestamp()
        except Exception:
            return 0.0

    def exists_df(self, blob_path: str) -> bool:
        """True when the blob exists (its property fetch succeeds)."""
        try:
            self._blob(blob_path).get_blob_properties()
            return True
        except Exception:
            return False

    def list_blob_files(self, blob_path: str, ends_with: str = None) -> list:
        """
        List the files under a blob-storage directory.

        Args:
            blob_path (str): Directory path (name prefix) in blob storage.
            ends_with (str, optional): Keep only names ending with this suffix
                (e.g. '.parquet').

        Returns:
            list: Blob names; empty list on any failure.
        """
        try:
            container_client = self._svc.get_container_client(self._container)
            blob_list = container_client.list_blobs(name_starts_with=blob_path)
            if ends_with:
                return [blob.name for blob in blob_list if blob.name.endswith(ends_with)]
            return [blob.name for blob in blob_list]
        except Exception:
            return []

    def table_exists(self, table_path: str) -> bool:
        """
        Check whether a table (blob) exists in blob storage.
        """
        return self.exists_df(table_path)

    # ---------- internal --------------
    def _blob(self, path: str):
        # Normalize the path (collapses redundant separators) before building the client.
        path = str(PurePosixPath(path))
        return self._svc.get_blob_client(self._container, path)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
class BlobPickleClient:
    """Store and retrieve arbitrary Python objects as pickle blobs in ADLS."""

    def __init__(self, *, adls_connection_string: str = None, container: str = "luxorasap"):
        conn = adls_connection_string
        if conn is None:
            conn = os.getenv("AZURE_STORAGE_CONNECTION_STRING")

        if conn is None:
            raise RuntimeError("AZURE_STORAGE_CONNECTION_STRING not set")

        self._svc = BlobServiceClient.from_connection_string(conn)
        self._container = self._svc.get_container_client(container)

    def write_pickle(self, obj, blob_name: str):
        """Serialize a Python object (e.g. a DataFrame) and upload it as *blob_name*."""
        payload = io.BytesIO(pickle.dumps(obj))
        self._container.upload_blob(name=blob_name, data=payload, overwrite=True)

    def read_pickle(self, blob_name: str):
        """Download *blob_name* and unpickle it back into a Python object."""
        raw = self._container.download_blob(blob_name).readall()
        return pickle.loads(raw)

    def exists(self, blob_name: str) -> bool:
        """True when the blob already exists in the container."""
        client = self._container.get_blob_client(blob_name)
        return client.exists()
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
class BlobExcelClient:
    """Read/write Excel (.xlsx) blobs in ADLS via pandas + openpyxl."""

    def __init__(self, *, adls_connection_string: str = None, container: str = "luxorasap"):
        conn = adls_connection_string
        if conn is None:
            conn = os.getenv("AZURE_STORAGE_CONNECTION_STRING")

        if conn is None:
            raise RuntimeError("AZURE_STORAGE_CONNECTION_STRING not set")

        self._svc = BlobServiceClient.from_connection_string(conn)
        self._container = self._svc.get_container_client(container)

    def write_excel(self, df: pd.DataFrame, blob_name: str, **kwargs):
        """
        Upload *df* as an Excel workbook, overwriting any existing blob.

        Args:
            df (pd.DataFrame): DataFrame to save.
            blob_name (str): blob path/name (e.g. "reports/test.xlsx").
            **kwargs: extra arguments forwarded to `DataFrame.to_excel`.
        """
        workbook = io.BytesIO()
        df.to_excel(workbook, index=False, engine="openpyxl", **kwargs)
        workbook.seek(0)
        self._container.upload_blob(name=blob_name, data=workbook, overwrite=True)

    def read_excel(self, blob_name: str, **kwargs) -> pd.DataFrame:
        """
        Download an Excel blob and parse it into a DataFrame.

        Args:
            blob_name (str): blob path/name (e.g. "reports/test.xlsx").
            **kwargs: extra arguments forwarded to `pd.read_excel`.

        Returns:
            pd.DataFrame
        """
        content = self._container.download_blob(blob_name).readall()
        return pd.read_excel(io.BytesIO(content), engine="openpyxl", **kwargs)

    def exists(self, blob_name: str) -> bool:
        """True when the blob already exists in the container."""
        return self._container.get_blob_client(blob_name).exists()
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def list_blob_files(blob_path: str, container="luxorasap", ends_with: str = None, adls_connection_string: str = None) -> list:
    """
    List the files under a directory (prefix) in blob storage.

    Args:
        blob_path (str): Directory path in blob storage.
        container: Container name.
        ends_with (str, optional): Keep only names ending with this suffix (e.g. '.parquet').
        adls_connection_string: When None, read from AZURE_STORAGE_CONNECTION_STRING.

    Returns:
        list: Blob names; empty list on any storage failure.

    Raises:
        RuntimeError: If no connection string is available.
    """

    conn = adls_connection_string
    if conn is None:
        conn = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
    if conn is None:
        raise RuntimeError("AZURE_STORAGE_CONNECTION_STRING not set")

    try:
        client = BlobServiceClient.from_connection_string(conn).get_container_client(container)
        names = [item.name for item in client.list_blobs(name_starts_with=blob_path)]
    except Exception:
        return []

    if ends_with:
        names = [name for name in names if name.endswith(ends_with)]
    return names
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def delete_blob(
|
|
210
|
+
blob_name: str,
|
|
211
|
+
*,
|
|
212
|
+
adls_connection_string: str | None = None,
|
|
213
|
+
container: str = "luxorasap",
|
|
214
|
+
include_snapshots: bool = False,
|
|
215
|
+
) -> None:
|
|
216
|
+
"""
|
|
217
|
+
Exclui com segurança APENAS um arquivo (blob) exato do Azure Blob Storage.
|
|
218
|
+
|
|
219
|
+
Regras de segurança:
|
|
220
|
+
- Recusa nomes que terminem com "/" (prefixos de diretório virtual).
|
|
221
|
+
- Recusa curingas/shell globs (*, ?, []), para evitar exclusões indevidas.
|
|
222
|
+
- Verifica a existência do blob exato antes de remover.
|
|
223
|
+
|
|
224
|
+
Args:
|
|
225
|
+
blob_name: Caminho EXATO do blob (ex.: "enriched/parquet/tabela.parquet").
|
|
226
|
+
adls_connection_string: Se None, lê de AZURE_STORAGE_CONNECTION_STRING.
|
|
227
|
+
container: Nome do container.
|
|
228
|
+
include_snapshots: Se True, apaga snapshots vinculados ao blob.
|
|
229
|
+
|
|
230
|
+
Raises:
|
|
231
|
+
ValueError: Se o nome parecer um diretório/prefixo ou contiver curingas.
|
|
232
|
+
FileNotFoundError: Se o blob exato não existir.
|
|
233
|
+
RuntimeError: Se a conexão com o Azure não estiver configurada.
|
|
234
|
+
"""
|
|
235
|
+
if adls_connection_string is None:
|
|
236
|
+
adls_connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
|
|
237
|
+
if adls_connection_string is None:
|
|
238
|
+
raise RuntimeError("AZURE_STORAGE_CONNECTION_STRING not set")
|
|
239
|
+
|
|
240
|
+
# 1) Bloqueios contra “diretórios” e curingas
|
|
241
|
+
if blob_name.endswith("/"):
|
|
242
|
+
raise ValueError("Nome termina com '/': recusa exclusão de diretórios/prefixos.")
|
|
243
|
+
if re.search(r"[\*\?\[\]]", blob_name):
|
|
244
|
+
raise ValueError("Curingas encontrados no nome do blob. Informe um arquivo exato.")
|
|
245
|
+
|
|
246
|
+
svc = BlobServiceClient.from_connection_string(adls_connection_string)
|
|
247
|
+
container_client = svc.get_container_client(container)
|
|
248
|
+
blob_client = container_client.get_blob_client(blob_name)
|
|
249
|
+
|
|
250
|
+
# 2) Checa existência do blob exato
|
|
251
|
+
try:
|
|
252
|
+
blob_client.get_blob_properties()
|
|
253
|
+
except ResourceNotFoundError:
|
|
254
|
+
raise FileNotFoundError(f"Blob não encontrado: {blob_name}")
|
|
255
|
+
|
|
256
|
+
# 3) Exclui apenas o alvo exato
|
|
257
|
+
delete_kwargs = {}
|
|
258
|
+
if include_snapshots:
|
|
259
|
+
delete_kwargs["delete_snapshots"] = "include"
|
|
260
|
+
|
|
261
|
+
blob_client.delete_blob(**delete_kwargs)
|
|
262
|
+
|
|
263
|
+
|
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, asdict
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
from typing import Dict, List, Optional, Sequence, Tuple
|
|
6
|
+
|
|
7
|
+
from azure.core.exceptions import ResourceNotFoundError
|
|
8
|
+
from azure.storage.blob import BlobServiceClient
|
|
9
|
+
|
|
10
|
+
# Reuso dos utilitários que você já tem no projeto
|
|
11
|
+
from luxorasap.utils.storage.blob import BlobPickleClient
|
|
12
|
+
import os
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
16
|
+
# Tipos de dados
|
|
17
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
18
|
+
|
|
19
|
+
@dataclass(frozen=True)
class BlobMetadata:
    """
    Minimal set of blob properties used to detect changes.
    """
    last_modified_utc: datetime  # always timezone-aware, in UTC
    etag: str
    size_bytes: int

    @staticmethod
    def _ensure_utc(moment: datetime) -> datetime:
        """Normalize a datetime to timezone-aware UTC (naive values are assumed UTC)."""
        if moment.tzinfo is None:
            return moment.replace(tzinfo=timezone.utc)
        return moment.astimezone(timezone.utc)

    @staticmethod
    def from_blob_properties(props) -> "BlobMetadata":
        """
        Build a BlobMetadata from BlobProperties (azure.storage.blob SDK),
        or any object exposing last_modified / etag / size attributes.
        Guarantees last_modified is timezone-aware UTC.
        """
        return BlobMetadata(
            last_modified_utc=BlobMetadata._ensure_utc(props.last_modified),
            etag=props.etag,
            size_bytes=int(props.size),
        )

    def to_dict(self) -> Dict:
        """Serialize to a plain dict (datetime rendered as an ISO-8601 string)."""
        d = asdict(self)
        d["last_modified_utc"] = self.last_modified_utc.isoformat()
        return d

    @staticmethod
    def from_dict(d: Dict) -> "BlobMetadata":
        """Inverse of to_dict; accepts datetime or ISO string, normalized to UTC."""
        lm = d["last_modified_utc"]
        if isinstance(lm, str):
            lm = datetime.fromisoformat(lm)
        return BlobMetadata(
            last_modified_utc=BlobMetadata._ensure_utc(lm),
            etag=d["etag"],
            size_bytes=int(d["size_bytes"]),
        )
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
64
|
+
# Watcher (com persistência em pickle no próprio ADLS)
|
|
65
|
+
# ──────────────────────────────────────────────────────────────────────────────
|
|
66
|
+
|
|
67
|
+
class BlobChangeWatcher:
    """
    Detects blob changes, persisting its snapshot as a pickle in ADLS.

    Snapshot layout (plain dict):
        {
            "<blob_path>": {"last_modified_utc": "...", "etag": "...", "size_bytes": int},
            ...
        }
    """

    def __init__(
        self,
        *,
        adls_connection_string: Optional[str] = None,
        container: str = "luxorasap",
        snapshot_blob_path: str = "system/state",
        watcher_id: str = "blob_change_watcher.pkl",
        treat_missing_as_changed: bool = True,
    ) -> None:
        """
        Args:
            adls_connection_string: When None, read AZURE_STORAGE_CONNECTION_STRING from the environment.
            container: Container holding the watched blobs (and the snapshot itself).
            snapshot_blob_path: Directory (same container) of the pickle file holding the snapshot.
            watcher_id: File name of the snapshot pickle; allows several independent watchers.
            treat_missing_as_changed: When True, a blob seen for the first time counts as "changed".

        Raises:
            RuntimeError: If no connection string is available.
        """

        if adls_connection_string is None:
            adls_connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")

        if adls_connection_string is None:
            raise RuntimeError("AZURE_STORAGE_CONNECTION_STRING not set")

        self._container_name = container
        self._snapshot_blob_path = f"{snapshot_blob_path}/{watcher_id}"
        self._treat_missing_as_changed = treat_missing_as_changed

        # Clients
        self._blob_service = BlobServiceClient.from_connection_string(adls_connection_string)
        self._container_client = self._blob_service.get_container_client(self._container_name)
        self._pickle_client = BlobPickleClient(
            adls_connection_string=adls_connection_string,
            container=self._container_name,
        )

        # In-memory state
        self._snapshot: Dict[str, Dict] = {}

        # Load the persisted snapshot on startup (starts empty when absent)
        self._load_snapshot()

    # ───────────────────────────── Snapshot persistence ─────────────────────────────

    def _load_snapshot(self) -> None:
        """
        Load the snapshot pickle from ADLS.
        Starts from an empty dict when it is missing or unreadable.
        """
        try:
            data = self._pickle_client.read_pickle(self._snapshot_blob_path)
            self._snapshot = data if isinstance(data, dict) else {}
        except FileNotFoundError:
            self._snapshot = {}
        except Exception:
            # Corrupted pickle / old format / transient error → start from scratch
            self._snapshot = {}

    def _save_snapshot(self) -> None:
        """
        Persist the current snapshot to ADLS as a pickle.
        """
        self._pickle_client.write_pickle(self._snapshot, self._snapshot_blob_path)

    # ───────────────────────────── Remote property access ─────────────────────────────

    def _fetch_remote_metadata(self, blob_path: str) -> BlobMetadata:
        """
        Fetch the blob's current metadata from ADLS.

        Raises:
            ResourceNotFoundError: If the blob does not exist.
        """
        props = self._container_client.get_blob_client(blob_path).get_blob_properties()
        return BlobMetadata.from_blob_properties(props)

    def _get_snapshot_metadata(self, blob_path: str) -> Optional[BlobMetadata]:
        """
        Return the metadata stored in the snapshot for *blob_path*, if any.
        """
        raw = self._snapshot.get(blob_path)
        return BlobMetadata.from_dict(raw) if raw else None

    @staticmethod
    def _differs(previous: BlobMetadata, current: BlobMetadata) -> bool:
        """
        True when any tracked property differs.
        Comparison strength: etag > last_modified > size.
        """
        return (
            current.etag != previous.etag
            or current.last_modified_utc != previous.last_modified_utc
            or current.size_bytes != previous.size_bytes
        )

    # ───────────────────────────── Public API ─────────────────────────────

    def has_changed(
        self,
        blob_path: str,
        *,
        update_snapshot: bool = False,
        treat_missing_as_changed: Optional[bool] = None,
    ) -> Tuple[bool, Optional[BlobMetadata], Optional[BlobMetadata]]:
        """
        Check whether the blob changed since the previous snapshot.

        Args:
            blob_path: Blob path (e.g. "raw/xlsx/trades.xlsx").
            update_snapshot: When True, persist the new state if a change is detected.
            treat_missing_as_changed: Local override of the "first sighting counts as change" rule.

        Returns:
            (changed?, previous_metadata, current_metadata)
        """
        if treat_missing_as_changed is None:
            treat_missing_as_changed = self._treat_missing_as_changed

        previous = self._get_snapshot_metadata(blob_path)

        # Blob no longer exists remotely?
        try:
            current = self._fetch_remote_metadata(blob_path)
        except ResourceNotFoundError:
            changed = previous is not None
            if update_snapshot and changed:
                # drop from the snapshot because the blob was deleted
                self._snapshot.pop(blob_path, None)
                self._save_snapshot()
            return changed, previous, None

        # First observation of this blob?
        if previous is None:
            changed = bool(treat_missing_as_changed)
        else:
            changed = self._differs(previous, current)

        if update_snapshot and changed:
            self._snapshot[blob_path] = current.to_dict()
            self._save_snapshot()

        return changed, previous, current

    def update_snapshot(self, blob_path: str) -> Optional[BlobMetadata]:
        """
        Force the snapshot to reflect the blob's current state.
        If the blob no longer exists, remove it from the snapshot and return None.
        """
        try:
            current = self._fetch_remote_metadata(blob_path)
        except ResourceNotFoundError:
            self._snapshot.pop(blob_path, None)
            self._save_snapshot()
            return None

        self._snapshot[blob_path] = current.to_dict()
        self._save_snapshot()
        return current

    def mark_as_synchronized(self, blob_path: str, metadata: Optional[BlobMetadata] = None) -> None:
        """
        Explicitly mark a blob as "synchronized" in the snapshot (e.g. after a pipeline run).
        When *metadata* is omitted, the current state is fetched from ADLS.
        """
        if metadata is None:
            metadata = self._fetch_remote_metadata(blob_path)
        self._snapshot[blob_path] = metadata.to_dict()
        self._save_snapshot()

    def list_changed_under_prefix(
        self,
        prefix: str,
        *,
        allowed_extensions: Optional[Sequence[str]] = None,
        update_snapshot: bool = False,
    ) -> List[str]:
        """
        Scan every blob under *prefix* and return those whose metadata changed
        according to the comparison rules.

        Args:
            prefix: e.g. "enriched/parquet/fundos" (trailing slash optional).
            allowed_extensions: e.g. [".parquet", ".xlsx"] to filter by suffix.
            update_snapshot: When True, persist the new state for the changed blobs.

        Returns:
            Paths of the blobs that changed.
        """
        if prefix and not prefix.endswith("/"):
            prefix += "/"

        extensions = tuple(e.lower() for e in (allowed_extensions or []))
        changed_paths: List[str] = []

        for blob_item in self._container_client.list_blobs(name_starts_with=prefix):
            name = blob_item.name
            if name.endswith("/"):
                continue  # virtual directory marker
            if extensions and not name.lower().endswith(extensions):
                continue

            previous = self._get_snapshot_metadata(name)
            current = BlobMetadata.from_blob_properties(blob_item)

            if previous is None:
                has_changed = self._treat_missing_as_changed
            else:
                has_changed = self._differs(previous, current)

            if has_changed:
                changed_paths.append(name)
                if update_snapshot:
                    self._snapshot[name] = current.to_dict()

        # Single write at the end: avoids one upload per changed blob
        if update_snapshot and changed_paths:
            self._save_snapshot()

        return changed_paths
|
|
@@ -22,6 +22,7 @@ src/luxorasap/utils/dataframe/reader.py
|
|
|
22
22
|
src/luxorasap/utils/dataframe/transforms.py
|
|
23
23
|
src/luxorasap/utils/storage/__init__.py
|
|
24
24
|
src/luxorasap/utils/storage/blob.py
|
|
25
|
+
src/luxorasap/utils/storage/change_tracker.py
|
|
25
26
|
src/luxorasap/utils/tools/__init__.py
|
|
26
27
|
src/luxorasap/utils/tools/excel.py
|
|
27
28
|
tests/test_btgapi_auth.py
|
|
@@ -30,5 +31,7 @@ tests/test_btgapi_trades.py
|
|
|
30
31
|
tests/test_datareader.py
|
|
31
32
|
tests/test_ingest_cloud.py
|
|
32
33
|
tests/test_ingest_legacy_local.py
|
|
34
|
+
tests/test_utils_change_tracker.py
|
|
33
35
|
tests/test_utils_dataframe.py
|
|
34
|
-
tests/test_utils_storage.py
|
|
36
|
+
tests/test_utils_storage.py
|
|
37
|
+
tests/tests_utils_pickle_excel.py
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
import datetime as dt
|
|
2
|
+
from types import SimpleNamespace
|
|
3
|
+
from datetime import timezone
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
from luxorasap.utils.storage import BlobChangeWatcher, BlobMetadata
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# ------------------------ Fakes do SDK Azure ------------------------
|
|
11
|
+
|
|
12
|
+
class FakeDownload:
|
|
13
|
+
def __init__(self, content: bytes):
|
|
14
|
+
self._content = content
|
|
15
|
+
def readall(self):
|
|
16
|
+
return self._content
|
|
17
|
+
|
|
18
|
+
class FakeBlobClient:
|
|
19
|
+
def __init__(self, name, props=None, store=None):
|
|
20
|
+
self._name = name
|
|
21
|
+
self._props = props # SimpleNamespace(last_modified, etag, size)
|
|
22
|
+
self._store = store # dict[name] -> bytes (pkl snapshot)
|
|
23
|
+
def get_blob_properties(self):
|
|
24
|
+
if self._props is None:
|
|
25
|
+
from azure.core.exceptions import ResourceNotFoundError
|
|
26
|
+
raise ResourceNotFoundError("not found")
|
|
27
|
+
return self._props
|
|
28
|
+
def download_blob(self, lease=None):
|
|
29
|
+
# apenas para o snapshot (state pickle)
|
|
30
|
+
data = self._store.get(self._name, b"")
|
|
31
|
+
return FakeDownload(data)
|
|
32
|
+
|
|
33
|
+
class FakeContainerClient:
|
|
34
|
+
def __init__(self, blobs, store):
|
|
35
|
+
# blobs: dict[name] -> SimpleNamespace(last_modified, etag, size)
|
|
36
|
+
self._blobs = blobs
|
|
37
|
+
self._store = store
|
|
38
|
+
def get_blob_client(self, name):
|
|
39
|
+
props = self._blobs.get(name)
|
|
40
|
+
return FakeBlobClient(name, props=props, store=self._store)
|
|
41
|
+
def list_blobs(self, name_starts_with=""):
|
|
42
|
+
for name, props in self._blobs.items():
|
|
43
|
+
if name.startswith(name_starts_with):
|
|
44
|
+
# Azure devolve itens com .name + props
|
|
45
|
+
item = SimpleNamespace(
|
|
46
|
+
name=name,
|
|
47
|
+
last_modified=props.last_modified,
|
|
48
|
+
etag=props.etag,
|
|
49
|
+
size=props.size,
|
|
50
|
+
)
|
|
51
|
+
yield item
|
|
52
|
+
def upload_blob(self, name, data, overwrite=False):
|
|
53
|
+
# usado para salvar o snapshot .pkl
|
|
54
|
+
content = data.read() if hasattr(data, "read") else data
|
|
55
|
+
self._store[name] = content
|
|
56
|
+
return SimpleNamespace() # dummy
|
|
57
|
+
|
|
58
|
+
class FakeBlobServiceClient:
|
|
59
|
+
def __init__(self, container_client):
|
|
60
|
+
self._cc = container_client
|
|
61
|
+
def get_container_client(self, container):
|
|
62
|
+
return self._cc
|
|
63
|
+
|
|
64
|
+
# ------------------------ Fixtures ------------------------
|
|
65
|
+
|
|
66
|
+
@pytest.fixture
|
|
67
|
+
def fake_now():
|
|
68
|
+
return dt.datetime(2025, 8, 25, 12, 0, 0, tzinfo=timezone.utc)
|
|
69
|
+
|
|
70
|
+
@pytest.fixture
|
|
71
|
+
def azure_mocks(monkeypatch, fake_now):
|
|
72
|
+
"""
|
|
73
|
+
Prepara um container fake com 2 blobs e storage em memória para o snapshot pkl.
|
|
74
|
+
"""
|
|
75
|
+
# blobs existentes no "ADLS"
|
|
76
|
+
blobs = {
|
|
77
|
+
"raw/x/a.xlsx": SimpleNamespace(
|
|
78
|
+
last_modified=fake_now, etag='"v1-a"', size=100
|
|
79
|
+
),
|
|
80
|
+
"raw/p/tb.parquet": SimpleNamespace(
|
|
81
|
+
last_modified=fake_now, etag='"v1-p"', size=500
|
|
82
|
+
),
|
|
83
|
+
}
|
|
84
|
+
# storage em memória para o snapshot .pkl
|
|
85
|
+
store = {}
|
|
86
|
+
|
|
87
|
+
cc = FakeContainerClient(blobs=blobs, store=store)
|
|
88
|
+
bsc = FakeBlobServiceClient(container_client=cc)
|
|
89
|
+
|
|
90
|
+
# patcha o construtor real para devolver o fake
|
|
91
|
+
import luxorasap.utils.storage.change_tracker as mod
|
|
92
|
+
monkeypatch.setattr(mod, "BlobServiceClient", SimpleNamespace(from_connection_string=lambda *_args, **_kw: bsc))
|
|
93
|
+
|
|
94
|
+
return SimpleNamespace(blobs=blobs, store=store, cc=cc, bsc=bsc)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# ------------------------ Testes ------------------------
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def test_no_change_is_false(azure_mocks):
|
|
101
|
+
watcher = BlobChangeWatcher(
|
|
102
|
+
container="luxorasap",
|
|
103
|
+
snapshot_blob_path="system/state/tests",
|
|
104
|
+
watcher_id='test_watcher.pkl',
|
|
105
|
+
treat_missing_as_changed=True,
|
|
106
|
+
)
|
|
107
|
+
# primeira vez -> muda
|
|
108
|
+
watcher.has_changed("raw/p/tb.parquet", update_snapshot=True)
|
|
109
|
+
|
|
110
|
+
# mesma versão -> não muda
|
|
111
|
+
changed, prev, curr = watcher.has_changed("raw/p/tb.parquet", update_snapshot=False)
|
|
112
|
+
assert changed is False
|
|
113
|
+
assert prev is not None and curr is not None
|
|
114
|
+
assert prev.etag == curr.etag
|
|
115
|
+
assert prev.size_bytes == curr.size_bytes
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def test_change_by_etag_or_size_is_true(monkeypatch, azure_mocks, fake_now):
|
|
119
|
+
watcher = BlobChangeWatcher(
|
|
120
|
+
container="luxorasap",
|
|
121
|
+
snapshot_blob_path="system/state/tests",
|
|
122
|
+
watcher_id='test_watcher.pkl'
|
|
123
|
+
)
|
|
124
|
+
# baseline
|
|
125
|
+
watcher.has_changed("raw/x/a.xlsx", update_snapshot=True)
|
|
126
|
+
|
|
127
|
+
# muda etag
|
|
128
|
+
azure_mocks.blobs["raw/x/a.xlsx"].etag = '"v2-a"'
|
|
129
|
+
changed, prev, curr = watcher.has_changed("raw/x/a.xlsx", update_snapshot=False)
|
|
130
|
+
assert changed is True
|
|
131
|
+
|
|
132
|
+
# aplica snapshot
|
|
133
|
+
watcher.has_changed("raw/x/a.xlsx", update_snapshot=True)
|
|
134
|
+
|
|
135
|
+
# muda apenas tamanho
|
|
136
|
+
azure_mocks.blobs["raw/x/a.xlsx"].size = 200
|
|
137
|
+
changed2, _, _ = watcher.has_changed("raw/x/a.xlsx", update_snapshot=False)
|
|
138
|
+
assert changed2 is True
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def test_deleted_blob_is_considered_changed_if_was_known(azure_mocks):
|
|
142
|
+
watcher = BlobChangeWatcher(
|
|
143
|
+
container="luxorasap",
|
|
144
|
+
snapshot_blob_path="system/state/tests",
|
|
145
|
+
watcher_id='test_watcher.pkl'
|
|
146
|
+
)
|
|
147
|
+
# primeiro registra
|
|
148
|
+
watcher.has_changed("raw/p/tb.parquet", update_snapshot=True)
|
|
149
|
+
|
|
150
|
+
# remove do conjunto remoto
|
|
151
|
+
azure_mocks.blobs.pop("raw/p/tb.parquet")
|
|
152
|
+
|
|
153
|
+
changed, prev, curr = watcher.has_changed("raw/p/tb.parquet", update_snapshot=True)
|
|
154
|
+
assert changed is True
|
|
155
|
+
assert prev is not None
|
|
156
|
+
assert curr is None # não existe mais
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def test_list_changed_under_prefix_filters_and_updates(azure_mocks, fake_now):
|
|
160
|
+
watcher = BlobChangeWatcher(
|
|
161
|
+
container="luxorasap",
|
|
162
|
+
snapshot_blob_path="system/state/tests",
|
|
163
|
+
watcher_id='test_watcher.pkl'
|
|
164
|
+
)
|
|
165
|
+
# primeira varredura (primeira vez conta como mudança)
|
|
166
|
+
changed = watcher.list_changed_under_prefix(
|
|
167
|
+
"raw/",
|
|
168
|
+
allowed_extensions=[".xlsx"],
|
|
169
|
+
update_snapshot=True,
|
|
170
|
+
)
|
|
171
|
+
assert changed == ["raw/x/a.xlsx"]
|
|
172
|
+
|
|
173
|
+
# altera parquet, mas filtro é xlsx, então não deve aparecer
|
|
174
|
+
azure_mocks.blobs["raw/p/tb.parquet"].etag = '"v2-p"'
|
|
175
|
+
changed2 = watcher.list_changed_under_prefix(
|
|
176
|
+
"raw/",
|
|
177
|
+
allowed_extensions=[".xlsx"],
|
|
178
|
+
update_snapshot=True,
|
|
179
|
+
)
|
|
180
|
+
assert changed2 == []
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import io
|
|
2
|
+
import pickle
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import pytest
|
|
5
|
+
from types import SimpleNamespace
|
|
6
|
+
|
|
7
|
+
# Supondo que as classes estejam em luxorasap.utils.storage.blob
|
|
8
|
+
from luxorasap.utils.storage import BlobPickleClient, BlobExcelClient
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# ------------------------ Fakes Azure ------------------------
|
|
12
|
+
|
|
13
|
+
class FakeDownload:
|
|
14
|
+
def __init__(self, content: bytes):
|
|
15
|
+
self._content = content
|
|
16
|
+
def readall(self):
|
|
17
|
+
return self._content
|
|
18
|
+
|
|
19
|
+
class FakeBlobClient:
|
|
20
|
+
def __init__(self, name, store):
|
|
21
|
+
self._name = name
|
|
22
|
+
self._store = store
|
|
23
|
+
def download_blob(self):
|
|
24
|
+
if self._name not in self._store:
|
|
25
|
+
from azure.core.exceptions import ResourceNotFoundError
|
|
26
|
+
raise ResourceNotFoundError("not found")
|
|
27
|
+
return FakeDownload(self._store[self._name])
|
|
28
|
+
|
|
29
|
+
class FakeContainerClient:
|
|
30
|
+
def __init__(self, store):
|
|
31
|
+
self._store = store
|
|
32
|
+
def get_blob_client(self, name):
|
|
33
|
+
return FakeBlobClient(name, self._store)
|
|
34
|
+
def upload_blob(self, name, data, overwrite=False):
|
|
35
|
+
content = data.read() if hasattr(data, "read") else data
|
|
36
|
+
self._store[name] = content
|
|
37
|
+
return SimpleNamespace()
|
|
38
|
+
|
|
39
|
+
class FakeBlobServiceClient:
|
|
40
|
+
def __init__(self, container_client):
|
|
41
|
+
self._cc = container_client
|
|
42
|
+
def get_container_client(self, container):
|
|
43
|
+
return self._cc
|
|
44
|
+
|
|
45
|
+
# ------------------------ Fixtures ------------------------
|
|
46
|
+
|
|
47
|
+
@pytest.fixture
|
|
48
|
+
def mem_store():
|
|
49
|
+
return {}
|
|
50
|
+
|
|
51
|
+
@pytest.fixture
|
|
52
|
+
def patch_blob_clients(monkeypatch, mem_store):
|
|
53
|
+
# Patch para BlobPickleClient / BlobExcelClient usarem o FakeBlobServiceClient
|
|
54
|
+
import luxorasap.utils.storage.blob as mod
|
|
55
|
+
fake_bsc = FakeBlobServiceClient(FakeContainerClient(mem_store))
|
|
56
|
+
monkeypatch.setattr(mod, "BlobServiceClient", SimpleNamespace(from_connection_string=lambda *_a, **_k: fake_bsc))
|
|
57
|
+
return mem_store
|
|
58
|
+
|
|
59
|
+
# ------------------------ Tests Pickle ------------------------
|
|
60
|
+
|
|
61
|
+
def test_pickle_roundtrip(patch_blob_clients):
|
|
62
|
+
client = BlobPickleClient()
|
|
63
|
+
obj = {"a": 1, "b": [1, 2, 3]}
|
|
64
|
+
path = "aux/test/state.pkl"
|
|
65
|
+
|
|
66
|
+
client.write_pickle(obj, path)
|
|
67
|
+
loaded = client.read_pickle(path)
|
|
68
|
+
assert loaded == obj
|
|
69
|
+
|
|
70
|
+
def test_pickle_read_missing_raises(monkeypatch, patch_blob_clients):
|
|
71
|
+
client = BlobPickleClient()
|
|
72
|
+
with pytest.raises(Exception):
|
|
73
|
+
client.read_pickle("aux/missing.pkl")
|
|
74
|
+
|
|
75
|
+
# ------------------------ Tests Excel ------------------------
|
|
76
|
+
|
|
77
|
+
@pytest.mark.skipif(
|
|
78
|
+
pytest.importorskip("openpyxl", reason="openpyxl é necessário para testar Excel") is None,
|
|
79
|
+
reason="openpyxl não disponível",
|
|
80
|
+
)
|
|
81
|
+
def test_excel_roundtrip(patch_blob_clients, tmp_path):
|
|
82
|
+
df = pd.DataFrame({"Nome": ["Ana", "Bruno"], "Idade": [28, 35]})
|
|
83
|
+
client = BlobExcelClient()
|
|
84
|
+
|
|
85
|
+
blob_path = "reports/teste.xlsx"
|
|
86
|
+
client.write_excel(df, blob_path, index=False)
|
|
87
|
+
|
|
88
|
+
df2 = client.read_excel(blob_path)
|
|
89
|
+
# Comparação tolerante a tipos (pandas pode alterar dtype ao ler)
|
|
90
|
+
assert df2.shape == df.shape
|
|
91
|
+
assert list(df2.columns) == list(df.columns)
|
|
92
|
+
assert df2.astype(str).equals(df.astype(str))
|
|
@@ -1,124 +0,0 @@
|
|
|
1
|
-
import io, os
|
|
2
|
-
from pathlib import PurePosixPath
|
|
3
|
-
from datetime import timezone
|
|
4
|
-
import pandas as pd
|
|
5
|
-
import pyarrow as pa, pyarrow.parquet as pq
|
|
6
|
-
from azure.storage.blob import BlobServiceClient
|
|
7
|
-
import pickle
|
|
8
|
-
|
|
9
|
-
from ..dataframe import read_bytes
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class BlobParquetClient:
|
|
13
|
-
"""Leitura/gravacao de Parquet em Azure Blob – stateless & reutilizável."""
|
|
14
|
-
|
|
15
|
-
def __init__(self, container: str = "luxorasap", adls_connection_string: str = None):
|
|
16
|
-
if adls_connection_string is None:
|
|
17
|
-
adls_connection_string = os.getenv('AZURE_STORAGE_CONNECTION_STRING')
|
|
18
|
-
|
|
19
|
-
if adls_connection_string is None:
|
|
20
|
-
raise RuntimeError("AZURE_STORAGE_CONNECTION_STRING not set")
|
|
21
|
-
self._svc = BlobServiceClient.from_connection_string(adls_connection_string)
|
|
22
|
-
self._container = container
|
|
23
|
-
|
|
24
|
-
# ---------- API pública ----------
|
|
25
|
-
def read_df(self, blob_path: str) -> (pd.DataFrame, bool):
|
|
26
|
-
buf = io.BytesIO()
|
|
27
|
-
try:
|
|
28
|
-
self._blob(blob_path).download_blob().readinto(buf)
|
|
29
|
-
return (
|
|
30
|
-
read_bytes(buf.getvalue(), filename=PurePosixPath(blob_path).name),
|
|
31
|
-
True,
|
|
32
|
-
)
|
|
33
|
-
except Exception:
|
|
34
|
-
return None, False
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
def write_df(self, df, blob_path: str):
|
|
38
|
-
|
|
39
|
-
blob = self._blob(blob_path)
|
|
40
|
-
table = pa.Table.from_pandas(df)
|
|
41
|
-
buf = io.BytesIO()
|
|
42
|
-
pq.write_table(table, buf)
|
|
43
|
-
buf.seek(0)
|
|
44
|
-
blob.upload_blob(buf, overwrite=True)
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
def get_df_update_time(self, blob_path: str) -> float:
|
|
48
|
-
try:
|
|
49
|
-
properties = self._blob(blob_path).get_blob_properties()
|
|
50
|
-
return properties['last_modified'].replace(tzinfo=timezone.utc).timestamp()
|
|
51
|
-
except Exception:
|
|
52
|
-
return 0.0
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
def exists_df(self, blob_path: str) -> bool:
|
|
56
|
-
try:
|
|
57
|
-
self._blob(blob_path).get_blob_properties()
|
|
58
|
-
return True
|
|
59
|
-
except Exception:
|
|
60
|
-
return False
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
def list_blob_files(self, blob_path: str, ends_with: str = None) -> list:
|
|
64
|
-
"""
|
|
65
|
-
Lista os arquivos em um diretório do blob storage.
|
|
66
|
-
|
|
67
|
-
Args:
|
|
68
|
-
blob_path (str): O caminho do diretório no blob storage.
|
|
69
|
-
ends_with (str, optional): Filtra os arquivos que terminam com esta string.(Ex.: '.parquet')
|
|
70
|
-
|
|
71
|
-
Returns:
|
|
72
|
-
list: Uma lista de nomes de blob.
|
|
73
|
-
|
|
74
|
-
"""
|
|
75
|
-
try:
|
|
76
|
-
container_client = self._svc.get_container_client(self._container)
|
|
77
|
-
blob_list = container_client.list_blobs(name_starts_with=blob_path)
|
|
78
|
-
if ends_with:
|
|
79
|
-
return [blob.name for blob in blob_list if blob.name.endswith(ends_with)]
|
|
80
|
-
return [blob.name for blob in blob_list]
|
|
81
|
-
except Exception:
|
|
82
|
-
return []
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
def table_exists(self, table_path: str) -> bool:
|
|
86
|
-
"""
|
|
87
|
-
Checa se uma tabela existe no blob storage.
|
|
88
|
-
"""
|
|
89
|
-
return self.exists_df(table_path)
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
# ---------- interno --------------
|
|
93
|
-
def _blob(self, path: str):
|
|
94
|
-
path = str(PurePosixPath(path))
|
|
95
|
-
return self._svc.get_blob_client(self._container, path)
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
class BlobPickleClient:
|
|
99
|
-
def __init__(self, *, adls_connection_string: str = None, container: str = "luxorasap"):
|
|
100
|
-
if adls_connection_string is None:
|
|
101
|
-
adls_connection_string = os.getenv("AZURE_STORAGE_CONNECTION_STRING")
|
|
102
|
-
|
|
103
|
-
if adls_connection_string is None:
|
|
104
|
-
raise RuntimeError("AZURE_STORAGE_CONNECTION_STRING not set")
|
|
105
|
-
|
|
106
|
-
self.blob_service_client = BlobServiceClient.from_connection_string(adls_connection_string)
|
|
107
|
-
self.container_client = self.blob_service_client.get_container_client(container)
|
|
108
|
-
|
|
109
|
-
def write_pickle(self, obj, blob_name: str):
|
|
110
|
-
"""Salva objeto Python (ex: DataFrame) como pickle no blob"""
|
|
111
|
-
buf = io.BytesIO()
|
|
112
|
-
pickle.dump(obj, buf)
|
|
113
|
-
buf.seek(0)
|
|
114
|
-
self.container_client.upload_blob(name=blob_name, data=buf, overwrite=True)
|
|
115
|
-
|
|
116
|
-
def read_pickle(self, blob_name: str):
|
|
117
|
-
"""Lê pickle do blob e retorna objeto Python"""
|
|
118
|
-
downloader = self.container_client.download_blob(blob_name)
|
|
119
|
-
buf = io.BytesIO(downloader.readall())
|
|
120
|
-
return pickle.load(buf)
|
|
121
|
-
|
|
122
|
-
def exists(self, blob_name: str) -> bool:
|
|
123
|
-
return self.container_client.get_blob_client(blob_name).exists()
|
|
124
|
-
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|