supertable 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- supertable/__init__.py +0 -0
- supertable/config/__init__.py +0 -0
- supertable/config/defaults.py +81 -0
- supertable/config/homedir.py +34 -0
- supertable/data_reader.py +213 -0
- supertable/data_writer.py +100 -0
- supertable/history_cleaner.py +99 -0
- supertable/locking/__init__.py +3 -0
- supertable/locking/file_lock.py +104 -0
- supertable/locking/locking.py +162 -0
- supertable/locking/locking_backend.py +6 -0
- supertable/locking/redis_lock.py +70 -0
- supertable/meta_reader.py +182 -0
- supertable/plan_extender.py +97 -0
- supertable/processing.py +334 -0
- supertable/query_plan_manager.py +29 -0
- supertable/rbac/__init__.py +0 -0
- supertable/rbac/access_control.py +229 -0
- supertable/rbac/filter_builder.py +67 -0
- supertable/rbac/permissions.py +33 -0
- supertable/rbac/role_manager.py +149 -0
- supertable/rbac/row_column_security.py +53 -0
- supertable/rbac/user_manager.py +236 -0
- supertable/simple_table.py +192 -0
- supertable/staging_area.py +65 -0
- supertable/storage/__init__.py +0 -0
- supertable/storage/azure_storage.py +199 -0
- supertable/storage/gcp_storage.py +0 -0
- supertable/storage/local_storage.py +128 -0
- supertable/storage/minio_storage.py +218 -0
- supertable/storage/s3_storage.py +233 -0
- supertable/storage/storage_factory.py +29 -0
- supertable/storage/storage_interface.py +105 -0
- supertable/super_table.py +275 -0
- supertable/utils/__init__.py +0 -0
- supertable/utils/helper.py +118 -0
- supertable/utils/sql_parser.py +131 -0
- supertable/utils/timer.py +125 -0
- supertable-0.1.0.dist-info/METADATA +93 -0
- supertable-0.1.0.dist-info/RECORD +43 -0
- supertable-0.1.0.dist-info/WHEEL +5 -0
- supertable-0.1.0.dist-info/licenses/LICENSE +84 -0
- supertable-0.1.0.dist-info/top_level.txt +1 -0
supertable/__init__.py
ADDED
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import logging
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
from dotenv import load_dotenv
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
|
|
8
|
+
import colorlog
|
|
9
|
+
|
|
10
|
+
# Root logging setup: a colourised console handler with one colour per level.
handler = colorlog.StreamHandler()
handler.setFormatter(
    colorlog.ColoredFormatter(
        "%(log_color)s%(asctime)s - %(levelname)-8s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        log_colors=dict(
            DEBUG="cyan",
            INFO="green",
            WARNING="yellow",
            ERROR="red",
            CRITICAL="red,bg_white",  # red text on a white background
        ),
        secondary_log_colors={},
        style="%",
    )
)

# Install the handler on the root logger (INFO until the environment says
# otherwise) and expose the module-level logger.
logging.basicConfig(handlers=[handler], level=logging.INFO)
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class Default:
    """Mutable bundle of package-wide configuration values.

    Field defaults apply when nothing overrides them; ``update_default``
    changes an instance in place.
    """

    # 16 MiB.
    MAX_MEMORY_CHUNK_SIZE: int = 16 * 1024 * 1024
    DEFAULT_TIMEOUT_SEC: int = 10
    DEFAULT_LOCK_DURATION_SEC: int = 60
    # Name of a standard ``logging`` level; replaces the former IS_DEBUG flag.
    LOG_LEVEL: str = "INFO"
    IS_SHOW_TIMING: bool = True
    STORAGE_TYPE: str = "LOCAL"

    def update_default(self, **kwargs):
        """Update configuration fields in place from ``kwargs``.

        Only declared dataclass fields are updated; any other key is ignored,
        so methods and dunder attributes can never be overwritten. Setting
        ``LOG_LEVEL`` also applies the level to the root logger.

        Raises:
            ValueError: if ``LOG_LEVEL`` is a string that is not a known
                logging level name. The instance is left unchanged for that
                key.
        """
        for key, value in kwargs.items():
            if key not in self.__dataclass_fields__:
                continue
            if key == "LOG_LEVEL":
                # Validate before mutating so a bad value cannot leave the
                # instance and the root logger out of sync.
                value = self._normalize_log_level(value)
            setattr(self, key, value)
            if key == "LOG_LEVEL":
                self._update_log_level()

    @staticmethod
    def _normalize_log_level(level):
        """Return ``level`` in the form ``logging`` accepts.

        Level names are stripped and upper-cased; non-string values (numeric
        levels) are passed through untouched.
        """
        if not isinstance(level, str):
            return level
        name = level.strip().upper()
        # getLevelName maps a known name to its number and anything else to a
        # "Level ..." string.
        if not isinstance(logging.getLevelName(name), int):
            raise ValueError(f"Unknown log level: {level!r}")
        return name

    def _update_log_level(self):
        """Apply the current ``LOG_LEVEL`` to the root logger and log the change."""
        logging.getLogger().setLevel(self.LOG_LEVEL)
        # Looked up by name so the method does not depend on the order in
        # which module globals are defined.
        logging.getLogger(__name__).info(f"Log level changed to {self.LOG_LEVEL}")
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def load_defaults_from_env(env_file: str = ".env") -> Default:
    """Build a ``Default`` from environment variables.

    ``env_file`` is loaded with ``load_dotenv`` first, then each setting is
    read from the process environment. A missing setting uses the ``Default``
    field default. An invalid ``LOG_LEVEL`` or a malformed integer setting is
    replaced by its default with a warning instead of aborting.

    As a side effect the root logger level is set to the resolved log level.

    Args:
        env_file: Path of the dotenv file to load.

    Returns:
        A ``Default`` populated from the environment.
    """
    load_dotenv(env_file)

    fallback = Default()

    def _int_setting(name: str, fallback_value: int) -> int:
        """Read an integer setting, falling back when it is unset or malformed."""
        raw = os.getenv(name)
        if raw is None:
            return fallback_value
        try:
            return int(raw)
        except ValueError:
            logger.warning(
                f"Invalid {name}={raw!r} in environment. Using default {fallback_value}."
            )
            return fallback_value

    # Get log level from env, default to INFO
    log_level = os.getenv("LOG_LEVEL", "INFO").upper()

    # Validate log level
    valid_levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
    if log_level not in valid_levels:
        log_level = "INFO"
        logger.warning(f"Invalid LOG_LEVEL in .env. Using default INFO. Valid levels are: {valid_levels}")

    # Set the log level immediately
    logging.getLogger().setLevel(log_level)

    return Default(
        MAX_MEMORY_CHUNK_SIZE=_int_setting("MAX_MEMORY_CHUNK_SIZE", fallback.MAX_MEMORY_CHUNK_SIZE),
        DEFAULT_TIMEOUT_SEC=_int_setting("DEFAULT_TIMEOUT_SEC", fallback.DEFAULT_TIMEOUT_SEC),
        DEFAULT_LOCK_DURATION_SEC=_int_setting(
            "DEFAULT_LOCK_DURATION_SEC", fallback.DEFAULT_LOCK_DURATION_SEC
        ),
        LOG_LEVEL=log_level,
        # Only the literal "true" (case-insensitive) enables timing output.
        IS_SHOW_TIMING=(os.getenv("IS_SHOW_TIMING", "True").lower() == "true"),
        STORAGE_TYPE=os.getenv("STORAGE_TYPE", "LOCAL").upper(),
    )
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
# Shared configuration instance, built once at import time from ".env" and the
# process environment.
default = load_defaults_from_env()
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
|
|
4
|
+
from supertable.config.defaults import default, logger
|
|
5
|
+
|
|
6
|
+
# Make the parent of this file's directory importable. Adjust the relative
# part if this module is ever moved to a different depth.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
)

# Application home: SUPERTABLE_HOME when set, otherwise "~/supertable".
app_home = os.environ.get("SUPERTABLE_HOME", "~/supertable")
|
|
11
|
+
|
|
12
|
+
def change_to_app_home(home_dir: str) -> None:
    """Make ``home_dir`` the current working directory.

    A leading ``~`` is expanded first. Success is logged at debug level; a
    failure is logged as an error and not re-raised.
    """
    target = os.path.expanduser(home_dir)
    try:
        os.chdir(target)
    except Exception as err:
        logger.error(f"Failed to change working directory to {target}: {err}")
    else:
        logger.debug(f"Changed working directory to {target}")
|
|
23
|
+
|
|
24
|
+
# Import-time side effect: switch into the application home directory.
# The error branch is taken only when the resolved value is empty.
if not app_home:
    logger.error("SUPERTABLE_HOME environment variable is not set")
else:
    change_to_app_home(app_home)

# Record the directory the process ended up in, whether or not the change
# succeeded.
logger.info(f"Current working directory: {os.getcwd()}")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def get_app_home():
    """Return the module-level ``app_home`` value (not ``~``-expanded)."""
    return app_home
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
from enum import Enum
|
|
2
|
+
|
|
3
|
+
import duckdb
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
from supertable.config.defaults import logger
|
|
7
|
+
from supertable.utils.timer import Timer
|
|
8
|
+
from supertable.super_table import SuperTable
|
|
9
|
+
from supertable.query_plan_manager import QueryPlanManager
|
|
10
|
+
from supertable.utils.sql_parser import SQLParser
|
|
11
|
+
from supertable.utils.helper import dict_keys_to_lowercase
|
|
12
|
+
from supertable.plan_extender import PlanStats, extend_execution_plan
|
|
13
|
+
from supertable.rbac.access_control import restrict_read_access
|
|
14
|
+
|
|
15
|
+
class Status(Enum):
    """Outcome of a query execution, as returned by ``DataReader.execute``."""

    # The query ran to completion.
    OK = "ok"
    # The query failed or could not be planned.
    ERROR = "error"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class DataReader:
    """Run a SQL query against the parquet files of a super table via DuckDB."""

    def __init__(self, super_name, organization, query):
        """Open the super table and parse ``query``.

        Args:
            super_name: Name of the super table to read.
            organization: Organization the super table belongs to.
            query: SQL text; it is parsed immediately.
        """
        self.super_table = SuperTable(super_name=super_name, organization=organization)
        self.parser = SQLParser(query)
        self.parser.parse_sql()
        # Populated by execute().
        self.timer = None
        self.plan_stats = None
        self.query_plan_manager = None

    @staticmethod
    def _quote_sql_literal(value):
        """Return ``value`` as a single-quoted SQL string literal.

        Embedded single quotes are doubled so a path containing ``'`` cannot
        terminate the literal early.
        """
        return "'" + str(value).replace("'", "''") + "'"

    def filter_snapshots(self, super_table_data, super_table_meta):
        """Select the snapshots the parsed query refers to.

        Table-level totals from ``super_table_meta`` are recorded in
        ``self.plan_stats``. When the query targets the super table itself,
        every snapshot is kept except those whose table name both starts and
        ends with ``__``; otherwise only snapshots whose table name matches
        the queried table (case-insensitively) are kept.

        Returns:
            A list of snapshot entries (possibly empty).
        """
        # A missing "snapshots" key is treated as "no snapshots".
        snapshots = super_table_data.get("snapshots") or []
        file_count = super_table_meta.get("file_count", 0)
        total_rows = super_table_meta.get("total_rows", 0)
        total_file_size = super_table_meta.get("total_file_size", 0)
        self.plan_stats.add_stat({"TABLE_FILES": file_count})
        self.plan_stats.add_stat({"TABLE_SIZE": total_file_size})
        self.plan_stats.add_stat({"TABLE_ROWS": total_rows})

        queried_table = self.parser.original_table.lower()
        if self.super_table.super_name.lower() == queried_table:
            return [
                s
                for s in snapshots
                if not (s["table_name"].startswith("__") and s["table_name"].endswith("__"))
            ]

        return [entry for entry in snapshots if entry["table_name"].lower() == queried_table]

    timer = Timer()

    @timer
    def execute(self, user_hash: str, with_scan: bool = False):
        """Execute the parsed query on behalf of ``user_hash``.

        Args:
            user_hash: Identifier of the user; used for the read-access check
                and passed to the execution-plan extension.
            with_scan: When true, every parquet file of the selected
                snapshots is read regardless of the requested columns.

        Returns:
            ``(result, status, message)``: a pandas DataFrame (empty on
            failure), a ``Status`` and an error message or ``None``.

        Exceptions raised while planning or running the query are caught,
        logged and reported through ``status``/``message``.
        """
        status = Status.ERROR
        message = None
        self.timer = Timer()
        self.plan_stats = PlanStats()

        try:
            super_table_data, super_table_path, super_table_meta = (
                self.super_table.get_super_table_and_path_with_shared_lock()
            )

            self.timer.capture_and_reset_timing(event="META")

            self.query_plan_manager = QueryPlanManager(
                super_name=self.super_table.super_name,
                organization=self.super_table.organization,
                current_meta_path=super_table_path,
                parser=self.parser,
            )

            snapshots = self.filter_snapshots(
                super_table_data=super_table_data,
                super_table_meta=super_table_meta,
            )
            logger.debug(f"Filtered snapshots: {snapshots}")

            parquet_files, schema = self.process_snapshots(
                snapshots=snapshots,
                with_scan=with_scan,
            )
            logger.debug(f"Parquet Files: {parquet_files}")

            # Requested columns absent from every selected snapshot; "*" is
            # not a real column name.
            missing_columns = (
                {column.lower() for column in self.parser.columns_list} - {"*"} - schema
            )
            logger.debug(f"Missing Columns: {missing_columns}")

            if len(snapshots) == 0 or missing_columns or not parquet_files:
                message = (
                    # Sorted so the message does not depend on set ordering.
                    f"Missing column(s): {', '.join(sorted(missing_columns))}"
                    if missing_columns
                    else "No parquet files found"
                )
                logger.warning(f"Filter Result: {message}")
                # This early exit skips the remaining timing captures and the
                # execution-plan extension below.
                return pd.DataFrame(), status, message

            restrict_read_access(
                super_name=self.super_table.super_name,
                organization=self.super_table.organization,
                user_hash=user_hash,
                table_name=self.parser.reflection_table,
                table_schema=schema,
                parsed_columns=self.parser.columns_list,
                parser=self.parser,
            )

            self.timer.capture_and_reset_timing(event="FILTERING")

            result = self.execute_with_duckdb(
                parquet_files=parquet_files,
                query_manager=self.query_plan_manager,
            )

            status = Status.OK
        except Exception as e:
            message = str(e)
            logger.error(f"Exception: {e}")
            result = pd.DataFrame()
        self.timer.capture_and_reset_timing(event="EXECUTING_QUERY")

        try:
            extend_execution_plan(
                super_table=self.super_table,
                user_hash=user_hash,
                query_plan_manager=self.query_plan_manager,
                timing=self.timer.timings,
                status=status.value,
                message=message,
                result_shape=result.shape,
                plan_stats=self.plan_stats,
            )
        except Exception as e:
            # Plan bookkeeping is best-effort; it must not hide the result.
            logger.error(f"Exception: {e}")

        self.timer.capture_and_reset_timing(event="EXTENDING_PLAN")
        self.timer.capture_duration(event="TOTAL_EXECUTE")
        return result, status, message

    def process_snapshots(self, snapshots, with_scan):
        """Collect the parquet files and the combined schema of ``snapshots``.

        A snapshot contributes its files when ``with_scan`` is true, when the
        query selects ``*``, or when at least one requested column appears in
        the snapshot's schema (case-insensitively). File counts, sizes and row
        counts of the contributing files are recorded in ``self.plan_stats``.

        Returns:
            ``(parquet_files, schema)``: the list of file paths and the set of
            lower-cased column names across all snapshots.
        """
        parquet_files = []
        reflection_file_size = 0
        reflection_rows = 0

        schema = set()
        for snapshot in snapshots:
            snapshot_data = self.super_table.read_simple_table_snapshot(snapshot["path"])

            resources = snapshot_data.get("resources", {})
            # Lower-cased once per snapshot instead of once per resource.
            snapshot_columns = set(
                dict_keys_to_lowercase(snapshot_data.get("schema", {})).keys()
            )
            schema.update(snapshot_columns)

            # The decision depends only on the snapshot, not on the resource.
            is_relevant = (
                with_scan
                or self.parser.columns_csv == "*"
                or any(
                    column.lower() in snapshot_columns
                    for column in self.parser.columns_list
                )
            )
            if not is_relevant:
                continue

            for resource in resources:
                parquet_files.append(resource["file"])
                reflection_file_size += resource.get("file_size", 0)
                reflection_rows += resource.get("rows", 0)

        logger.debug(f"snapshots: {len(snapshots)}")
        logger.debug(f"parquet_files: {len(parquet_files)}")
        logger.debug(f"schema: {schema}")

        self.plan_stats.add_stat({"REFLECTIONS": len(parquet_files)})
        self.plan_stats.add_stat({"REFLECTION_SIZE": reflection_file_size})
        self.plan_stats.add_stat({"REFLECTION_ROWS": reflection_rows})

        return parquet_files, schema

    def execute_with_duckdb(self, parquet_files, query_manager: QueryPlanManager):
        """Load ``parquet_files`` into an in-memory DuckDB table and run the query.

        The requested columns are materialised into the reflection table, the
        RBAC view is created on top of it, and the parser's executing query is
        run. Profiling output is written to ``query_manager.query_plan_path``.

        Returns:
            The query result as a pandas DataFrame.
        """
        # Use DuckDB to read and query the parquet files directly
        con = duckdb.connect()
        try:
            con.execute("PRAGMA memory_limit='2GB';")
            con.execute(
                f"PRAGMA temp_directory={self._quote_sql_literal(query_manager.temp_dir)};"
            )
            con.execute("PRAGMA enable_profiling='json';")
            con.execute(
                f"PRAGMA profile_output = {self._quote_sql_literal(query_manager.query_plan_path)};"
            )

            # Read and register parquet files directly with DuckDB
            parquet_files_str = ", ".join(
                self._quote_sql_literal(file) for file in parquet_files
            )
            logger.debug(f"parquet files: {len(parquet_files)}")

            self.timer.capture_and_reset_timing("CONNECTING")

            create_table = f"""
CREATE TABLE {self.parser.reflection_table}
AS
SELECT {self.parser.columns_csv}
FROM parquet_scan([{parquet_files_str}], union_by_name=True, HIVE_PARTITIONING=TRUE);
"""

            logger.debug(f"create_table: {create_table}")
            con.execute(create_table)

            create_view = f"""
CREATE VIEW {self.parser.rbac_view}
AS
{self.parser.view_definition}
"""
            logger.debug(f"create_view: {create_view}")
            con.execute(create_view)

            self.timer.capture_and_reset_timing("CREATING_REFLECTION")
            logger.debug(f"Executing Query: {self.parser.executing_query}")
            # fetchdf() materialises the rows before the connection is closed.
            result = con.execute(query=self.parser.executing_query).fetchdf()
            logger.debug(f"result.shape: {result.shape}")
            return result
        finally:
            # Release the in-memory database even when a statement fails.
            con.close()
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
import polars
|
|
2
|
+
import re
|
|
3
|
+
|
|
4
|
+
from polars import DataFrame
|
|
5
|
+
|
|
6
|
+
from supertable.config.defaults import logger
|
|
7
|
+
from supertable.super_table import SuperTable
|
|
8
|
+
from supertable.simple_table import SimpleTable
|
|
9
|
+
from supertable.utils.timer import Timer
|
|
10
|
+
from supertable.processing import (
|
|
11
|
+
process_overlapping_files,
|
|
12
|
+
find_and_lock_overlapping_files,
|
|
13
|
+
)
|
|
14
|
+
from supertable.rbac.access_control import check_write_access
|
|
15
|
+
|
|
16
|
+
class DataWriter:
    """Write datasets into a simple table that belongs to a super table."""

    def __init__(self, super_name: str, organization: str):
        """Open the super table identified by ``super_name`` and ``organization``."""
        self.super_table = SuperTable(super_name, organization)

    timer = Timer()

    @timer
    def write(self, user_hash, simple_name, data, overwrite_columns, compression_level=1):
        """Merge ``data`` into the simple table ``simple_name``.

        Args:
            user_hash: Identifier of the user; checked for write access.
            simple_name: Name of the target simple table.
            data: Input dataset, converted with ``polars.from_arrow``.
            overwrite_columns: List of column names passed to the overlap
                detection and merge steps; may be empty.
            compression_level: Passed through to the file-processing step.

        Returns:
            ``(total_columns, total_rows, inserted, deleted)``.

        Raises:
            ValueError: if ``simple_name`` or ``overwrite_columns`` is invalid
                (see ``validation``).
        """
        logger.debug("Checking for Write Access")
        check_write_access(
            super_name=self.super_table.super_name,
            organization=self.super_table.organization,
            user_hash=user_hash,
            table_name=simple_name,
        )
        logger.debug("Passed Write Access Check")

        # Convert the input dataset from Arrow format to a Polars DataFrame
        logger.debug("Converting data to DataFrame")
        dataframe: DataFrame = polars.from_arrow(data)
        logger.debug("Converted data to DataFrame")

        logger.debug("Validating the dataframe")
        self.validation(dataframe, simple_name, overwrite_columns)
        logger.debug("dataframe is valid")

        logger.debug(f"Reading Simple Table Metadata {simple_name}")
        simple_table = SimpleTable(self.super_table, simple_name)
        last_simple_table, _ = simple_table.get_simple_table_with_shared_lock()
        logger.debug(f"last_simple_table: {last_simple_table}")

        # Find files that have overlapping data and lock them to prevent concurrent modifications
        overlapping_files = find_and_lock_overlapping_files(
            last_simple_table, dataframe, overwrite_columns, simple_table.locking
        )
        try:
            logger.debug(f"overlapping_files: {overlapping_files}")

            # Process the overlapping files by filtering, merging, and updating resources
            inserted, deleted, total_rows, total_columns, new_resources, sunset_files = (
                process_overlapping_files(
                    dataframe,
                    overlapping_files,
                    overwrite_columns,
                    simple_table.data_dir,
                    compression_level,
                )
            )

            new_simple_table_snapshot, new_simple_table_path = simple_table.lock_and_update(
                new_resources, sunset_files, dataframe
            )

            self.super_table.update_with_lock(
                simple_name, new_simple_table_path, new_simple_table_snapshot
            )
        finally:
            # Release the file locks even when a later step fails, so other
            # writers do not have to wait for the lock to expire.
            simple_table.locking.release_lock()

        return total_columns, total_rows, inserted, deleted

    def validation(
        self, dataframe: DataFrame, simple_name: str, overwrite_columns: list
    ):
        """Validate the target table name and the overwrite columns.

        Raises:
            ValueError: if ``simple_name`` is empty, longer than 128
                characters, equal to the super table name, or not a valid
                identifier; if ``overwrite_columns`` is a string; or if any
                overwrite column is missing from ``dataframe``.
        """
        if len(simple_name) == 0 or len(simple_name) > 128:
            raise ValueError("SimpleTable name can't be empty or longer than 128")

        if simple_name == self.super_table.super_name:
            raise ValueError("SimpleTable name can't match with SuperTable name")

        # fullmatch rather than match with "$", which would accept a name
        # followed by a trailing newline.
        if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", simple_name):
            raise ValueError(
                f"Invalid table name: '{simple_name}'. Table names must start with a letter or underscore and contain only alphanumeric characters and underscores."
            )

        # Checked before the membership test, which would otherwise iterate a
        # string character by character.
        if isinstance(overwrite_columns, str):
            raise ValueError("overwrite columns must be list")

        # Validate the overwrite columns
        if overwrite_columns and not all(
            col in dataframe.columns for col in overwrite_columns
        ):
            raise ValueError("Some overwrite columns are not present in the dataset")
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from supertable.config.defaults import logger
|
|
3
|
+
from supertable.super_table import SuperTable
|
|
4
|
+
from supertable.rbac.access_control import check_write_access
|
|
5
|
+
|
|
6
|
+
class HistoryCleaner:
    """Delete superseded snapshot and data files of a super table."""

    def __init__(self, super_name: str, organization: str):
        """Open the super table and reuse its storage backend."""
        self.super_table = SuperTable(super_name=super_name, organization=organization)
        self.storage = self.super_table.storage

    def clean(self, user_hash):
        """Remove files that are no longer referenced by the current state.

        The current super-table pointer, each simple table's current snapshot
        and its active resource files are kept. Of the remaining files, those
        whose leading numeric timestamp is not newer than the super table's
        ``last_updated_ms`` are deleted. ``user_hash`` must have write access.
        """
        # Read the current super-table state first.
        super_table_data, super_table_path, super_table_meta = (
            self.super_table.get_super_table_and_path_with_shared_lock()
        )
        last_updated_ms = super_table_data["last_updated_ms"]

        check_write_access(
            super_name=self.super_table.super_name,
            organization=self.super_table.organization,
            user_hash=user_hash,
            table_name=self.super_table.super_name,
        )

        # Super-table level: everything except the live pointer is a candidate.
        super_files = self.collect_files(self.super_table.super_dir)
        try:
            super_files.remove(super_table_path)
        except ValueError:
            pass

        files_to_delete = self.get_files_to_delete(super_files, last_updated_ms)
        self.delete_files(files_to_delete)

        logger.debug(
            f"{len(files_to_delete)} files cleaned for table: {self.super_table.super_name}"
        )

        # Simple-table level: one pass per snapshot listed in the super table.
        for snapshot in super_table_data["snapshots"]:
            simple_table_name = snapshot["table_name"]
            simple_table_path = snapshot["path"]

            simple_table_data = self.storage.read_json(simple_table_path)
            location = simple_table_data["location"]
            active_files = [
                resource["file"] for resource in simple_table_data.get("resources", [])
            ]

            # Candidates are the files on storage that are neither active
            # resources nor the current snapshot file.
            candidates = set(self.collect_files(location)) - set(active_files)
            candidates.discard(simple_table_path)

            files_to_delete = self.get_files_to_delete(list(candidates), last_updated_ms)
            self.delete_files(files_to_delete)

            logger.debug(f"{len(files_to_delete)} files cleaned for table: {simple_table_name}")

    def collect_files(self, location):
        """List the parquet and JSON files that belong to ``location``.

        Returns the concatenation of ``<location>/data/*.parquet``,
        ``<location>/snapshots/*.json`` and ``<location>/*.json`` as reported
        by the storage backend's ``list_files``.
        """
        data_dir = os.path.join(location, "data")
        snapshot_dir = os.path.join(location, "snapshots")

        collected = self.storage.list_files(data_dir, "*.parquet")
        collected = collected + self.storage.list_files(snapshot_dir, "*.json")
        return collected + self.storage.list_files(location, "*.json")

    def get_files_to_delete(self, designated_files, last_updated_ms):
        """Return the files whose timestamp prefix is at most ``last_updated_ms``.

        The timestamp is the part of the base name before the first ``_``
        (e.g. ``1678900000_fileinfo.json`` -> ``1678900000``). Files whose
        prefix is not an integer are skipped.
        """
        expired = []
        for path in designated_files:
            prefix = os.path.basename(path).partition("_")[0]
            try:
                stamp = int(prefix)
            except ValueError:
                # Name does not follow the "<timestamp>_..." pattern.
                continue
            if stamp <= last_updated_ms:
                expired.append(path)
        return expired

    def delete_files(self, files_to_delete):
        """Delete each file through the storage backend, logging every deletion."""
        for path in files_to_delete:
            self.storage.delete(path)
            logger.debug(f"Deleted file: {path}")
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import secrets
|
|
4
|
+
import time
|
|
5
|
+
import fcntl
|
|
6
|
+
from supertable.config.defaults import default, logger
|
|
7
|
+
|
|
8
|
+
class FileLocking:
    """Cooperative resource locks recorded in a shared JSON lock file.

    The lock file holds a list of entries ``{"pid", "exp", "res"}``: the
    owner's random lock id (stored under ``"pid"``; it is not a process id),
    the expiry as a Unix timestamp in seconds, and the list of locked
    resources. Access to the file is serialised with ``fcntl.flock``.
    """

    def __init__(self, identity, working_dir, lock_file_name=".lock.json", check_interval=0.1):
        """Prepare the lock file, creating it if necessary.

        Args:
            identity: Name of the resource locked by ``self_lock``; also used
                as the lock directory when ``working_dir`` is ``None``.
            working_dir: Directory that holds the lock file.
            lock_file_name: File name of the lock file inside that directory.
            check_interval: Seconds to wait between acquisition attempts.
        """
        self.identity = identity
        self.lock_id = secrets.token_hex(8)
        self.lock_file_dir = working_dir if working_dir is not None else identity
        self.lock_file_path = os.path.join(self.lock_file_dir, lock_file_name)
        self.check_interval = check_interval
        logger.debug(f"lock_file_dir: {self.lock_file_dir}")
        logger.debug(f"lock_file_path: {self.lock_file_path}")
        self.init_lock_file()

    def init_lock_file(self):
        """Create the lock directory and an empty lock file if they are missing."""
        os.makedirs(self.lock_file_dir, exist_ok=True)
        try:
            # Exclusive create: an existing lock file is never truncated, even
            # if another process creates it between a check and the open.
            with open(self.lock_file_path, "x") as lock_file:
                json.dump([], lock_file)
        except FileExistsError:
            pass

    def read_lock_file(self, lock_file):
        """Return the lock entries in ``lock_file``; ``[]`` if it is not valid JSON."""
        lock_file.seek(0)
        try:
            return json.load(lock_file)
        except json.JSONDecodeError as e:
            logger.error(f"Error reading lock file: {e}")
            return []

    def write_lock_file(self, lock_data, lock_file):
        """Replace the content of ``lock_file`` with ``lock_data`` and sync it to disk."""
        lock_file.seek(0)
        lock_file.truncate()
        json.dump(lock_data, lock_file)
        lock_file.flush()
        os.fsync(lock_file.fileno())

    def remove_expired_locks(self, lock_data):
        """Return the entries whose expiry lies in the future."""
        current_time = int(time.time())
        return [lock for lock in lock_data if lock["exp"] > current_time]

    def remove_own_locks(self, lock_data):
        """Return the entries that were not created by this instance."""
        return [lock for lock in lock_data if lock["pid"] != self.lock_id]

    def self_lock(self, timeout_seconds=default.DEFAULT_TIMEOUT_SEC, lock_duration_seconds=default.DEFAULT_LOCK_DURATION_SEC):
        """Lock the resource named by ``self.identity``; see ``lock_resources``."""
        resources = [self.identity]
        return self.lock_resources(resources, timeout_seconds, lock_duration_seconds)

    def lock_resources(self, resources, timeout_seconds=default.DEFAULT_TIMEOUT_SEC, lock_duration_seconds=default.DEFAULT_LOCK_DURATION_SEC):
        """Try to lock ``resources`` until ``timeout_seconds`` have elapsed.

        Expired entries are dropped on every attempt. The lock is granted when
        no entry owned by someone else mentions any of ``resources``.

        Args:
            resources: List of resource names to lock together.
            timeout_seconds: How long to keep trying.
            lock_duration_seconds: Lifetime of the lock, counted from the
                moment it is granted.

        Returns:
            ``True`` if the lock was acquired, ``False`` on timeout.
        """
        # Monotonic clock for the timeout so wall-clock adjustments cannot
        # shorten or extend the wait.
        start_time = time.monotonic()
        sleep_time = 0

        while time.monotonic() - start_time < timeout_seconds:
            try:
                if sleep_time > 0:
                    logger.debug(f"Waiting {sleep_time} seconds to acquire the lock for {self.identity}")
                    time.sleep(sleep_time)

                with open(self.lock_file_path, "r+") as lock_file:
                    fcntl.flock(lock_file, fcntl.LOCK_EX)
                    try:
                        lock_data = self.read_lock_file(lock_file)
                        lock_data = self.remove_expired_locks(lock_data)
                        lock_check = self.remove_own_locks(lock_data)
                        logger.debug(f"Lock data: {lock_data}")
                        logger.debug(f"Lock check: {lock_check}")

                        if any(resource in lock["res"] for lock in lock_check for resource in resources):
                            logger.debug(f"{self.identity}: lock can't be acquired for resources {resources}")
                            sleep_time = self.check_interval
                            continue

                        # Expiry is wall-clock (shared between processes) and
                        # starts now, so time spent waiting does not eat into
                        # the lock duration.
                        expiration_time = int(time.time() + lock_duration_seconds)
                        lock_entry = {"pid": self.lock_id, "exp": expiration_time, "res": resources}
                        lock_data.append(lock_entry)
                        self.write_lock_file(lock_data, lock_file)
                        logger.debug(f"Identity: {self.identity}, lock acquired: {self.lock_id}")
                        return True
                    finally:
                        fcntl.flock(lock_file, fcntl.LOCK_UN)
            except Exception as e:
                logger.error(f"Error during lock acquisition: {e}")
                time.sleep(self.check_interval)
        return False

    def release_lock(self, resources=None):
        """Remove every entry owned by this instance from the lock file.

        ``resources`` is accepted for interface compatibility and is not
        used: all of this instance's locks are released.
        """
        with open(self.lock_file_path, "r+") as lock_file:
            fcntl.flock(lock_file, fcntl.LOCK_EX)
            try:
                lock_data = self.read_lock_file(lock_file)
                self.write_lock_file(self.remove_own_locks(lock_data), lock_file)
            finally:
                fcntl.flock(lock_file, fcntl.LOCK_UN)

    def __enter__(self):
        """Acquire the self lock; raise if it cannot be obtained in time."""
        if not self.self_lock():
            raise Exception(f"Unable to acquire file lock for {self.identity}")
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release this instance's locks; exceptions are not suppressed."""
        self.release_lock()
|