supertable 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. supertable/__init__.py +0 -0
  2. supertable/config/__init__.py +0 -0
  3. supertable/config/defaults.py +81 -0
  4. supertable/config/homedir.py +34 -0
  5. supertable/data_reader.py +213 -0
  6. supertable/data_writer.py +100 -0
  7. supertable/history_cleaner.py +99 -0
  8. supertable/locking/__init__.py +3 -0
  9. supertable/locking/file_lock.py +104 -0
  10. supertable/locking/locking.py +162 -0
  11. supertable/locking/locking_backend.py +6 -0
  12. supertable/locking/redis_lock.py +70 -0
  13. supertable/meta_reader.py +182 -0
  14. supertable/plan_extender.py +97 -0
  15. supertable/processing.py +334 -0
  16. supertable/query_plan_manager.py +29 -0
  17. supertable/rbac/__init__.py +0 -0
  18. supertable/rbac/access_control.py +229 -0
  19. supertable/rbac/filter_builder.py +67 -0
  20. supertable/rbac/permissions.py +33 -0
  21. supertable/rbac/role_manager.py +149 -0
  22. supertable/rbac/row_column_security.py +53 -0
  23. supertable/rbac/user_manager.py +236 -0
  24. supertable/simple_table.py +192 -0
  25. supertable/staging_area.py +65 -0
  26. supertable/storage/__init__.py +0 -0
  27. supertable/storage/azure_storage.py +199 -0
  28. supertable/storage/gcp_storage.py +0 -0
  29. supertable/storage/local_storage.py +128 -0
  30. supertable/storage/minio_storage.py +218 -0
  31. supertable/storage/s3_storage.py +233 -0
  32. supertable/storage/storage_factory.py +29 -0
  33. supertable/storage/storage_interface.py +105 -0
  34. supertable/super_table.py +275 -0
  35. supertable/utils/__init__.py +0 -0
  36. supertable/utils/helper.py +118 -0
  37. supertable/utils/sql_parser.py +131 -0
  38. supertable/utils/timer.py +125 -0
  39. supertable-0.1.0.dist-info/METADATA +93 -0
  40. supertable-0.1.0.dist-info/RECORD +43 -0
  41. supertable-0.1.0.dist-info/WHEEL +5 -0
  42. supertable-0.1.0.dist-info/licenses/LICENSE +84 -0
  43. supertable-0.1.0.dist-info/top_level.txt +1 -0
supertable/__init__.py ADDED
File without changes
File without changes
@@ -0,0 +1,81 @@
1
+ import os
2
+ import logging
3
+ import sys
4
+
5
+ from dotenv import load_dotenv
6
+ from dataclasses import dataclass
7
+
8
+ import colorlog
9
+
10
# Colour assigned to each log level by the console formatter.
_LEVEL_COLORS = {
    'DEBUG': 'cyan',
    'INFO': 'green',
    'WARNING': 'yellow',
    'ERROR': 'red',
    'CRITICAL': 'red,bg_white',  # White background with red text
}

# Console handler rendering "<timestamp> - <LEVEL> - <message>" in colour.
handler = colorlog.StreamHandler()
handler.setFormatter(
    colorlog.ColoredFormatter(
        '%(log_color)s%(asctime)s - %(levelname)-8s - %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        log_colors=_LEVEL_COLORS,
        secondary_log_colors={},
        style='%',
    )
)

# Install the handler on the root logger at INFO and create the module
# logger that other modules import from here.
logging.basicConfig(level=logging.INFO, handlers=[handler])
logger = logging.getLogger(__name__)
28
+
29
+
30
@dataclass
class Default:
    """Container for the package's runtime settings.

    Every setting is a dataclass field with a built-in default. Values can
    be changed after construction through ``update_default``.
    """

    # Chunk size limit in bytes (16 MiB by default).
    MAX_MEMORY_CHUNK_SIZE: int = 16 * 1024 * 1024
    # Default timeout, in seconds.
    DEFAULT_TIMEOUT_SEC: int = 10
    # Default lock lifetime, in seconds.
    DEFAULT_LOCK_DURATION_SEC: int = 60
    # Name of the logging level applied to the root logger.
    LOG_LEVEL: str = "INFO"  # Replaced IS_DEBUG with LOG_LEVEL
    # Flag for timing output; consumed outside this class.
    IS_SHOW_TIMING: bool = True
    # Storage backend identifier.
    STORAGE_TYPE: str = "LOCAL"

    def update_default(self, **kwargs):
        """Update settings in place from keyword arguments.

        Only names that are declared dataclass fields are applied; any
        other key is silently ignored. This prevents a key such as
        ``update_default`` from overwriting a method on the instance.
        Changing ``LOG_LEVEL`` also re-applies the level to the root logger.

        Raises:
            ValueError: propagated from ``logging`` when ``LOG_LEVEL`` is
                not a recognised level name.
        """
        known_fields = self.__dataclass_fields__
        for key, value in kwargs.items():
            if key not in known_fields:
                continue
            setattr(self, key, value)
            if key == "LOG_LEVEL":
                self._update_log_level()

    def _update_log_level(self):
        """Apply ``self.LOG_LEVEL`` to the root logger and log the change."""
        logging.getLogger().setLevel(self.LOG_LEVEL)
        logger.info(f"Log level changed to {self.LOG_LEVEL}")
54
+
55
+
56
def load_defaults_from_env(env_file: str = ".env") -> Default:
    """Build a ``Default`` from environment variables.

    ``env_file`` is loaded with ``load_dotenv`` first. ``LOG_LEVEL`` is
    upper-cased and checked against the five standard level names; an
    unknown value falls back to ``INFO`` with a warning. The resulting level
    is applied to the root logger before the settings object is created.

    Raises:
        ValueError: if one of the integer settings is set to a value that
            ``int()`` cannot parse.
    """
    load_dotenv(env_file)

    valid_levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
    requested_level = os.getenv("LOG_LEVEL", "INFO").upper()
    if requested_level in valid_levels:
        log_level = requested_level
    else:
        log_level = "INFO"
        logger.warning(f"Invalid LOG_LEVEL in .env. Using default INFO. Valid levels are: {valid_levels}")

    # Apply the level right away so later log calls honour it.
    logging.getLogger().setLevel(log_level)

    chunk_size = int(os.getenv("MAX_MEMORY_CHUNK_SIZE", 16 * 1024 * 1024))
    timeout_sec = int(os.getenv("DEFAULT_TIMEOUT_SEC", 10))
    lock_duration_sec = int(os.getenv("DEFAULT_LOCK_DURATION_SEC", 60))
    # Only the literal text "true" (any casing) enables timing output.
    show_timing = os.getenv("IS_SHOW_TIMING", "True").lower() == "true"
    storage_type = os.getenv("STORAGE_TYPE", "LOCAL").upper()

    return Default(
        MAX_MEMORY_CHUNK_SIZE=chunk_size,
        DEFAULT_TIMEOUT_SEC=timeout_sec,
        DEFAULT_LOCK_DURATION_SEC=lock_duration_sec,
        LOG_LEVEL=log_level,
        IS_SHOW_TIMING=show_timing,
        STORAGE_TYPE=storage_type,
    )


# Module-level settings instance, created once at import time.
default = load_defaults_from_env()
@@ -0,0 +1,34 @@
1
+ import os
2
+ import sys
3
+
4
+ from supertable.config.defaults import default, logger
5
+
6
# Make the directory one level above this file's directory importable.
# Adjust the relative path if this file is moved.
_parent_dir = os.path.join(os.path.dirname(__file__), "..")
sys.path.append(os.path.abspath(_parent_dir))

# Application home, taken from SUPERTABLE_HOME; "~" is expanded only when
# the directory is entered, not here.
app_home = os.getenv("SUPERTABLE_HOME", "~/supertable")
11
+
12
def change_to_app_home(home_dir: str) -> None:
    """Make ``home_dir`` the process working directory.

    A leading ``~`` is expanded first. Success is logged at DEBUG level.
    Any exception is logged at ERROR level and not re-raised, so the
    working directory stays unchanged on failure.
    """
    target_dir = os.path.expanduser(home_dir)
    try:
        os.chdir(target_dir)
        logger.debug(f"Changed working directory to {target_dir}")
    except Exception as exc:
        logger.error(f"Failed to change working directory to {target_dir}: {exc}")
23
+
24
# Enter the application home at import time. The error branch is reached
# only when SUPERTABLE_HOME is set to an empty string, because an unset
# variable falls back to the built-in default path.
if not app_home:
    logger.error("SUPERTABLE_HOME environment variable is not set")
else:
    change_to_app_home(app_home)


logger.info(f"Current working directory: {os.getcwd()}")
31
+
32
+
33
def get_app_home() -> str:
    """Return the module-level ``app_home`` value (``~`` is not expanded)."""
    return app_home
@@ -0,0 +1,213 @@
1
+ from enum import Enum
2
+
3
+ import duckdb
4
+ import pandas as pd
5
+
6
+ from supertable.config.defaults import logger
7
+ from supertable.utils.timer import Timer
8
+ from supertable.super_table import SuperTable
9
+ from supertable.query_plan_manager import QueryPlanManager
10
+ from supertable.utils.sql_parser import SQLParser
11
+ from supertable.utils.helper import dict_keys_to_lowercase
12
+ from supertable.plan_extender import PlanStats, extend_execution_plan
13
+ from supertable.rbac.access_control import restrict_read_access
14
+
15
class Status(Enum):
    """Outcome marker returned alongside a query result."""

    OK = "ok"
    ERROR = "error"
18
+
19
+
20
class DataReader:
    """Run a parsed SQL query over a SuperTable's parquet files with DuckDB."""

    def __init__(self, super_name, organization, query):
        """Open the SuperTable and parse ``query``.

        Args:
            super_name: name of the SuperTable to read.
            organization: organization the SuperTable belongs to.
            query: SQL text handed to ``SQLParser``.
        """
        self.super_table = SuperTable(super_name=super_name, organization=organization)
        self.parser = SQLParser(query)
        self.parser.parse_sql()
        # The three attributes below are (re)created on every execute() call.
        self.timer = None
        self.plan_stats = None
        self.query_plan_manager = None

    def filter_snapshots(self, super_table_data, super_table_meta):
        """Select the snapshot entries relevant to the queried table.

        Table-level totals from ``super_table_meta`` are recorded in
        ``self.plan_stats`` first. If the query targets the SuperTable
        itself, every snapshot is kept except those whose table name is
        wrapped in double underscores. Otherwise only snapshots whose table
        name equals the queried table (case-insensitively) are kept.

        Returns:
            list of snapshot dicts.
        """
        snapshots = super_table_data.get("snapshots")
        self.plan_stats.add_stat({"TABLE_FILES": super_table_meta.get("file_count", 0)})
        self.plan_stats.add_stat({"TABLE_SIZE": super_table_meta.get("total_file_size", 0)})
        self.plan_stats.add_stat({"TABLE_ROWS": super_table_meta.get("total_rows", 0)})

        requested_table = self.parser.original_table.lower()
        if self.super_table.super_name.lower() == requested_table:
            return [
                s for s in snapshots
                if not (s["table_name"].startswith("__") and s["table_name"].endswith("__"))
            ]

        return [
            entry
            for entry in snapshots
            if entry["table_name"].lower() == requested_table
        ]

    timer = Timer()

    @timer
    def execute(self, user_hash: str, with_scan: bool = False):
        """Execute the parsed query and return its result.

        Steps: read the SuperTable metadata, pick the relevant snapshots and
        parquet files, check that the requested columns exist, enforce read
        access, run the query in DuckDB, then extend the execution plan.

        Args:
            user_hash: identifier of the calling user, passed to the access
                check and to the plan extension.
            with_scan: when true, every parquet file of the selected
                snapshots is read regardless of the requested columns.

        Returns:
            ``(result, status, message)`` where ``result`` is a pandas
            DataFrame (empty on failure), ``status`` is a ``Status`` and
            ``message`` is ``None`` on success or a description of the
            failure.
        """
        status = Status.ERROR
        message = None
        self.timer = Timer()
        self.plan_stats = PlanStats()

        try:
            super_table_data, super_table_path, super_table_meta = (
                self.super_table.get_super_table_and_path_with_shared_lock()
            )

            self.timer.capture_and_reset_timing(event="META")

            self.query_plan_manager = QueryPlanManager(
                super_name=self.super_table.super_name,
                organization=self.super_table.organization,
                current_meta_path=super_table_path,
                parser=self.parser,
            )

            snapshots = self.filter_snapshots(
                super_table_data=super_table_data,
                super_table_meta=super_table_meta,
            )
            logger.debug(f"Filtered snapshots: {snapshots}")

            parquet_files, schema = self.process_snapshots(
                snapshots=snapshots,
                with_scan=with_scan,
            )
            logger.debug(f"Parquet Files: {parquet_files}")

            # Requested columns (ignoring the "*" wildcard) that no selected
            # snapshot schema provides.
            missing_columns = (
                {column.lower() for column in self.parser.columns_list}
                - {"*"}
                - schema
            )
            logger.debug(f"Mising Columns: {missing_columns}")

            if len(snapshots) == 0 or missing_columns or not parquet_files:
                # Sorted so the message is deterministic for a given query.
                message = (
                    f"Missing column(s): {', '.join(sorted(missing_columns))}"
                    if missing_columns
                    else "No parquet files found"
                )
                logger.warning(f"Filter Result: {message}")
                # Early exit: the execution plan is not extended on this path.
                return pd.DataFrame(), status, message

            restrict_read_access(
                super_name=self.super_table.super_name,
                organization=self.super_table.organization,
                user_hash=user_hash,
                table_name=self.parser.reflection_table,
                table_schema=schema,
                parsed_columns=self.parser.columns_list,
                parser=self.parser,
            )

            self.timer.capture_and_reset_timing(event="FILTERING")

            result = self.execute_with_duckdb(
                parquet_files=parquet_files,
                query_manager=self.query_plan_manager,
            )

            status = Status.OK
        except Exception as e:
            # Any failure becomes an ERROR status with an empty frame.
            message = str(e)
            logger.error(f"Exception: {e}")
            result = pd.DataFrame()
        self.timer.capture_and_reset_timing(event="EXECUTING_QUERY")

        try:
            extend_execution_plan(
                super_table=self.super_table,
                user_hash=user_hash,
                query_plan_manager=self.query_plan_manager,
                timing=self.timer.timings,
                status=status.value,
                message=message,
                result_shape=result.shape,
                plan_stats=self.plan_stats,
            )
        except Exception as e:
            # Plan bookkeeping is best-effort and must not fail the query.
            logger.error(f"Exception: {e}")

        self.timer.capture_and_reset_timing(event="EXTENDING_PLAN")
        self.timer.capture_duration(event="TOTAL_EXECUTE")
        return result, status, message

    def process_snapshots(self, snapshots, with_scan):
        """Collect the parquet files and combined schema of ``snapshots``.

        A snapshot's files are selected when ``with_scan`` is true, when the
        query selects ``*``, or when at least one requested column appears
        in that snapshot's schema. File count, size and row totals of the
        selected files are recorded in ``self.plan_stats``.

        Returns:
            ``(parquet_files, schema)``: the list of selected file paths and
            the set of lower-cased column names across all snapshots.
        """
        parquet_files = []
        reflection_file_size = 0
        reflection_rows = 0
        schema = set()

        # Both values are the same for every snapshot, so compute them once.
        requested_columns = [column.lower() for column in self.parser.columns_list]
        selects_everything = self.parser.columns_csv == "*"

        for snapshot in snapshots:
            snapshot_data = self.super_table.read_simple_table_snapshot(snapshot["path"])

            current_schema = snapshot_data.get("schema", {})
            resources = snapshot_data.get("resources", {})
            schema_columns = dict_keys_to_lowercase(current_schema).keys()
            schema.update(schema_columns)

            # The selection rule depends only on the snapshot, not on the
            # individual resource, so decide once per snapshot.
            selected = (
                with_scan
                or selects_everything
                or any(col in schema_columns for col in requested_columns)
            )
            if not selected:
                continue

            for resource in resources:
                parquet_files.append(resource["file"])
                reflection_file_size += resource.get("file_size", 0)
                reflection_rows += resource.get("rows", 0)

        logger.debug(f"snapshots: {len ( snapshots )}")
        logger.debug(f"parquet_files: {len ( parquet_files )}")
        logger.debug(f"schema: {schema}")

        self.plan_stats.add_stat({"REFLECTIONS": len(parquet_files)})
        self.plan_stats.add_stat({"REFLECTION_SIZE": reflection_file_size})
        self.plan_stats.add_stat({"REFLECTION_ROWS": reflection_rows})

        return parquet_files, schema

    def execute_with_duckdb(self, parquet_files, query_manager: QueryPlanManager):
        """Load ``parquet_files`` into DuckDB and run the parsed query.

        An in-memory connection is configured with a 2GB memory limit and
        JSON profiling written to ``query_manager.query_plan_path``. The
        files are loaded into a table, the RBAC view is created on top of
        it, and the query is executed. The connection is always closed
        before returning, including when a statement fails.

        Returns:
            pandas DataFrame with the query result.
        """
        # NOTE(review): the statements below are assembled by string
        # formatting from parser output and file paths; they rely on
        # SQLParser producing safe identifiers.
        con = duckdb.connect()
        try:
            con.execute("PRAGMA memory_limit='2GB';")
            con.execute(f"PRAGMA temp_directory='{query_manager.temp_dir}';")
            con.execute("PRAGMA enable_profiling='json';")
            con.execute(f"PRAGMA profile_output = '{query_manager.query_plan_path}';")

            parquet_files_str = ", ".join(f"'{file}'" for file in parquet_files)
            logger.debug(f"parquet files: {len(parquet_files)}")

            self.timer.capture_and_reset_timing("CONNECTING")

            create_table = f"""
CREATE TABLE {self.parser.reflection_table}
AS
SELECT {self.parser.columns_csv}
FROM parquet_scan([{parquet_files_str}], union_by_name=True, HIVE_PARTITIONING=TRUE);
"""

            logger.debug(f"create_table: {create_table}")
            con.execute(create_table)

            create_view = f"""
CREATE VIEW {self.parser.rbac_view}
AS
{self.parser.view_definition}
"""
            logger.debug(f"create_view: {create_view}")
            con.execute(create_view)

            self.timer.capture_and_reset_timing("CREATING_REFLECTION")
            logger.debug(f"Executing Query: {self.parser.executing_query}")
            # fetchdf() materialises the rows, so closing afterwards is safe.
            result = con.execute(query=self.parser.executing_query).fetchdf()
            logger.debug(f"result.shape: {result.shape}")
            return result
        finally:
            con.close()
@@ -0,0 +1,100 @@
1
+ import polars
2
+ import re
3
+
4
+ from polars import DataFrame
5
+
6
+ from supertable.config.defaults import logger
7
+ from supertable.super_table import SuperTable
8
+ from supertable.simple_table import SimpleTable
9
+ from supertable.utils.timer import Timer
10
+ from supertable.processing import (
11
+ process_overlapping_files,
12
+ find_and_lock_overlapping_files,
13
+ )
14
+ from supertable.rbac.access_control import check_write_access
15
+
16
class DataWriter:
    """Write a dataset into a SimpleTable that belongs to a SuperTable."""

    def __init__(self, super_name: str, organization: str):
        """Open the SuperTable identified by ``super_name`` and ``organization``."""
        self.super_table = SuperTable(super_name, organization)

    timer = Timer()

    @timer
    def write(self, user_hash, simple_name, data, overwrite_columns, compression_level=1):
        """Merge ``data`` into the SimpleTable ``simple_name``.

        Args:
            user_hash: identifier of the calling user, used for the write
                access check.
            simple_name: name of the target SimpleTable.
            data: input dataset, converted with ``polars.from_arrow``.
            overwrite_columns: list of column names passed to the overlap
                detection and merge steps.
            compression_level: passed through to the file-processing step.

        Returns:
            ``(total_columns, total_rows, inserted, deleted)``.

        Raises:
            ValueError: from ``validation`` for an invalid table name or
                invalid overwrite columns.
        """
        logger.debug("Checking for Write Access")
        check_write_access(
            super_name=self.super_table.super_name,
            organization=self.super_table.organization,
            user_hash=user_hash,
            table_name=simple_name,
        )
        logger.debug("Passed Write Access Check")

        # Convert the input dataset from Arrow format to a Polars DataFrame
        logger.debug("Converting data to DataFrame")
        dataframe: DataFrame = polars.from_arrow(data)
        logger.debug("Converted data to DataFrame")

        logger.debug("Validating the dataframe")
        self.validation(dataframe, simple_name, overwrite_columns)
        logger.debug("dataframe is valid")

        logger.debug(f"Reading Simple Table Metadata {simple_name}")
        simple_table = SimpleTable(self.super_table, simple_name)
        last_simple_table, _ = simple_table.get_simple_table_with_shared_lock()
        logger.debug(f"last_simple_table: {last_simple_table}")

        # Find files that have overlapping data and lock them to prevent concurrent modifications
        overlapping_files = find_and_lock_overlapping_files(
            last_simple_table, dataframe, overwrite_columns, simple_table.locking
        )
        try:
            logger.debug(f"overlapping_files: {overlapping_files}")

            # Process the overlapping files by filtering, merging, and updating resources
            inserted, deleted, total_rows, total_columns, new_resources, sunset_files = (
                process_overlapping_files(
                    dataframe,
                    overlapping_files,
                    overwrite_columns,
                    simple_table.data_dir,
                    compression_level,
                )
            )

            new_simple_table_snapshot, new_simple_table_path = simple_table.lock_and_update(
                new_resources, sunset_files, dataframe
            )

            self.super_table.update_with_lock(
                simple_name, new_simple_table_path, new_simple_table_snapshot
            )
        finally:
            # Release the file locks even when a step above fails, so a
            # failed write does not block other writers until lock expiry.
            simple_table.locking.release_lock()

        return total_columns, total_rows, inserted, deleted

    def validation(
        self, dataframe: DataFrame, simple_name: str, overwrite_columns: list
    ):
        """Validate the target table name and the overwrite columns.

        Raises:
            ValueError: if ``simple_name`` is empty, longer than 128
                characters, equal to the SuperTable name, or not a plain
                identifier; if ``overwrite_columns`` is a string; or if any
                overwrite column is missing from ``dataframe``.
        """
        if len(simple_name) == 0 or len(simple_name) > 128:
            raise ValueError("SimpleTable name can't be empty or longer than 128")

        if simple_name == self.super_table.super_name:
            raise ValueError("SimpleTable name can't match with SuperTable name")

        # fullmatch, because "$" with re.match would also accept a name
        # that ends in a newline.
        pattern = r"[A-Za-z_][A-Za-z0-9_]*"
        if not re.fullmatch(pattern, simple_name):
            raise ValueError(
                f"Invalid table name: '{simple_name}'. Table names must start with a letter or underscore and contain only alphanumeric characters and underscores."
            )

        # Check the type first: iterating a string would compare single
        # characters against column names and report a misleading error.
        if isinstance(overwrite_columns, str):
            raise ValueError("overwrite columns must be list")

        # Validate the overwrite columns
        if overwrite_columns and not all(
            col in dataframe.columns for col in overwrite_columns
        ):
            raise ValueError("Some overwrite columns are not present in the dataset")
@@ -0,0 +1,99 @@
1
+ import os
2
+ from supertable.config.defaults import logger
3
+ from supertable.super_table import SuperTable
4
+ from supertable.rbac.access_control import check_write_access
5
+
6
class HistoryCleaner:
    """Delete outdated snapshot and data files of a SuperTable."""

    def __init__(self, super_name: str, organization: str):
        """Open the SuperTable and reuse its storage object for file access."""
        self.super_table = SuperTable(super_name=super_name, organization=organization)
        self.storage = self.super_table.storage

    def clean(self, user_hash):
        """Remove files no longer referenced by the current metadata.

        The SuperTable metadata is read first and write access is checked
        for ``user_hash``. Files under the SuperTable directory other than
        the current metadata file are deleted when their timestamp prefix is
        not newer than ``last_updated_ms``. The same is then done for each
        snapshot's location, where files listed as active resources and the
        snapshot file itself are kept.
        """
        super_table_data, super_table_path, super_table_meta = (
            self.super_table.get_super_table_and_path_with_shared_lock()
        )
        cutoff_ms = super_table_data["last_updated_ms"]

        check_write_access(
            super_name=self.super_table.super_name,
            organization=self.super_table.organization,
            user_hash=user_hash,
            table_name=self.super_table.super_name,
        )

        # SuperTable level: everything except the current pointer file.
        super_files = self.collect_files(self.super_table.super_dir)
        if super_table_path in super_files:
            super_files.remove(super_table_path)

        stale_super_files = self.get_files_to_delete(super_files, cutoff_ms)
        self.delete_files(stale_super_files)

        logger.debug(
            f"{len(stale_super_files)} files cleaned for table: {self.super_table.super_name}"
        )

        # SimpleTable level: one pass per snapshot entry.
        for snapshot in super_table_data["snapshots"]:
            simple_table_name = snapshot["table_name"]
            simple_table_path = snapshot["path"]

            simple_table_data = self.storage.read_json(simple_table_path)
            location = simple_table_data["location"]
            active_files = {
                entry["file"] for entry in simple_table_data.get("resources", [])
            }

            # Candidates are all files found minus active resources and the
            # current snapshot file.
            candidates = set(self.collect_files(location)) - active_files
            candidates.discard(simple_table_path)

            stale_files = self.get_files_to_delete(list(candidates), cutoff_ms)
            self.delete_files(stale_files)

            logger.debug(f"{len(stale_files)} files cleaned for table: {simple_table_name}")

    def collect_files(self, location):
        """List parquet and JSON files below ``location`` via the storage object.

        Returns the concatenation of ``data/*.parquet``, ``snapshots/*.json``
        and ``*.json`` directly under ``location``, in that order.
        """
        listings = [
            self.storage.list_files(os.path.join(location, "data"), "*.parquet"),
            self.storage.list_files(os.path.join(location, "snapshots"), "*.json"),
            self.storage.list_files(location, "*.json"),
        ]
        return listings[0] + listings[1] + listings[2]

    @staticmethod
    def _timestamp_prefix(path):
        """Return the integer before the first ``_`` of the file name, or None."""
        prefix = os.path.basename(path).split("_")[0]
        try:
            return int(prefix)
        except ValueError:
            return None

    def get_files_to_delete(self, designated_files, last_updated_ms):
        """Pick the files whose timestamp prefix is at most ``last_updated_ms``.

        The prefix is the part of the file name before the first underscore
        (e.g. ``1678900000_fileinfo.json`` -> ``1678900000``). Files whose
        prefix is not an integer are skipped.
        """
        expired = []
        for path in designated_files:
            stamp = self._timestamp_prefix(path)
            if stamp is not None and stamp <= last_updated_ms:
                expired.append(path)
        return expired

    def delete_files(self, files_to_delete):
        """Delete each given file through the storage object's ``delete``."""
        for path in files_to_delete:
            self.storage.delete(path)
            logger.debug(f"Deleted file: {path}")
@@ -0,0 +1,3 @@
1
+ from supertable.locking.locking import Locking
2
+
3
+ __all__ = ["Locking"]
@@ -0,0 +1,104 @@
1
+ import json
2
+ import os
3
+ import secrets
4
+ import time
5
+ import fcntl
6
+ from supertable.config.defaults import default, logger
7
+
8
class FileLocking:
    """Resource locks kept as JSON entries in a shared lock file.

    The lock file holds a JSON list of entries of the form
    ``{"pid": <lock id>, "exp": <unix expiry>, "res": [<resource>, ...]}``.
    Every read-modify-write of the file is done under an exclusive
    ``fcntl.flock``. Instances can be used as context managers that lock
    their own ``identity``.
    """

    def __init__(self, identity, working_dir, lock_file_name=".lock.json", check_interval=0.1):
        """Set up the lock and make sure the lock file exists.

        Args:
            identity: name of the resource this instance locks by default;
                also used as the lock directory when ``working_dir`` is None.
            working_dir: directory holding the lock file, or None.
            lock_file_name: file name of the lock file.
            check_interval: seconds to wait between acquisition attempts.
        """
        self.identity = identity
        # Random per-instance id stored in the "pid" field of lock entries.
        self.lock_id = secrets.token_hex(8)
        self.lock_file_dir = working_dir if working_dir is not None else identity
        self.lock_file_path = os.path.join(self.lock_file_dir, lock_file_name)
        self.check_interval = check_interval
        logger.debug(f"lock_file_dir: {self.lock_file_dir}")
        logger.debug(f"lock_file_path: {self.lock_file_path}")
        self.init_lock_file()

    def init_lock_file(self):
        """Create the lock directory and an empty lock file if missing.

        The file is opened in exclusive-creation mode. An exists-then-write
        check could truncate a lock file that another process created in
        between.
        """
        os.makedirs(self.lock_file_dir, exist_ok=True)
        try:
            with open(self.lock_file_path, "x") as lock_file:
                json.dump([], lock_file)
        except FileExistsError:
            # Already initialised, possibly by another process.
            pass

    def read_lock_file(self, lock_file):
        """Return the lock list from the open file; ``[]`` if it is not valid JSON."""
        lock_file.seek(0)
        try:
            return json.load(lock_file)
        except json.JSONDecodeError as e:
            logger.error(f"Error reading lock file: {e}")
            return []

    def write_lock_file(self, lock_data, lock_file):
        """Replace the file content with ``lock_data`` and sync it to disk."""
        lock_file.seek(0)
        lock_file.truncate()
        json.dump(lock_data, lock_file)
        lock_file.flush()
        os.fsync(lock_file.fileno())

    def remove_expired_locks(self, lock_data):
        """Return the entries whose expiry lies strictly in the future."""
        current_time = int(time.time())
        return [lock for lock in lock_data if lock["exp"] > current_time]

    def remove_own_locks(self, lock_data):
        """Return the entries that were not written by this instance."""
        return [lock for lock in lock_data if lock["pid"] != self.lock_id]

    def self_lock(self, timeout_seconds=default.DEFAULT_TIMEOUT_SEC, lock_duration_seconds=default.DEFAULT_LOCK_DURATION_SEC):
        """Lock this instance's own ``identity``; see ``lock_resources``."""
        resources = [self.identity]
        return self.lock_resources(resources, timeout_seconds, lock_duration_seconds)

    def lock_resources(self, resources, timeout_seconds=default.DEFAULT_TIMEOUT_SEC, lock_duration_seconds=default.DEFAULT_LOCK_DURATION_SEC):
        """Try to lock ``resources`` until ``timeout_seconds`` have elapsed.

        An attempt succeeds when no unexpired entry of another instance
        names any of the requested resources. The new entry expires
        ``lock_duration_seconds`` after the moment it is written.

        Returns:
            True if the lock was acquired, False on timeout.
        """
        start_time = time.time()
        sleep_time = 0

        while time.time() - start_time < timeout_seconds:
            try:
                if sleep_time > 0:
                    logger.debug(f"Waiting {sleep_time} seconds to acquire the lock for {self.identity}")
                    time.sleep(sleep_time)

                with open(self.lock_file_path, "r+") as lock_file:
                    fcntl.flock(lock_file, fcntl.LOCK_EX)
                    try:
                        lock_data = self.read_lock_file(lock_file)
                        lock_data = self.remove_expired_locks(lock_data)
                        # Entries of this instance never block it.
                        lock_check = self.remove_own_locks(lock_data)
                        logger.debug(f"Lock data: {lock_data}")
                        logger.debug(f"Lock check: {lock_check}")

                        if any(resource in lock["res"] for lock in lock_check for resource in resources):
                            logger.debug(f"{self.identity}: lock can't be acquired for resources {resources}")
                            sleep_time = self.check_interval
                            continue

                        # Compute the expiry at acquisition time so that
                        # time spent waiting does not shorten the lock.
                        expiration_time = int(time.time() + lock_duration_seconds)
                        lock_entry = {"pid": self.lock_id, "exp": expiration_time, "res": resources}
                        lock_data.append(lock_entry)
                        self.write_lock_file(lock_data, lock_file)
                        logger.debug(f"Identity: {self.identity}, lock acquired: {self.lock_id}")
                        return True
                    finally:
                        fcntl.flock(lock_file, fcntl.LOCK_UN)
            except Exception as e:
                # Treat any I/O problem as a failed attempt and retry.
                logger.error(f"Error during lock acquisition: {e}")
                time.sleep(self.check_interval)
        return False

    def release_lock(self, resources=None):
        """Remove every lock entry written by this instance.

        ``resources`` is accepted for interface compatibility but is not
        used: all entries of this instance are removed regardless of it.
        """
        with open(self.lock_file_path, "r+") as lock_file:
            fcntl.flock(lock_file, fcntl.LOCK_EX)
            try:
                lock_data = self.read_lock_file(lock_file)
                lock_data = [lock for lock in lock_data if lock["pid"] != self.lock_id]
                self.write_lock_file(lock_data, lock_file)
            finally:
                fcntl.flock(lock_file, fcntl.LOCK_UN)

    def __enter__(self):
        """Acquire the lock on ``identity``; raise if it cannot be obtained."""
        if not self.self_lock():
            raise Exception(f"Unable to acquire file lock for {self.identity}")
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release this instance's locks when the ``with`` block ends."""
        self.release_lock()