supertable 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. supertable-0.1.0/LICENSE +84 -0
  2. supertable-0.1.0/MANIFEST.in +17 -0
  3. supertable-0.1.0/PKG-INFO +93 -0
  4. supertable-0.1.0/README.md +73 -0
  5. supertable-0.1.0/pyproject.toml +25 -0
  6. supertable-0.1.0/setup.cfg +4 -0
  7. supertable-0.1.0/setup.py +21 -0
  8. supertable-0.1.0/supertable/__init__.py +0 -0
  9. supertable-0.1.0/supertable/config/__init__.py +0 -0
  10. supertable-0.1.0/supertable/config/defaults.py +81 -0
  11. supertable-0.1.0/supertable/config/homedir.py +34 -0
  12. supertable-0.1.0/supertable/data_reader.py +213 -0
  13. supertable-0.1.0/supertable/data_writer.py +100 -0
  14. supertable-0.1.0/supertable/history_cleaner.py +99 -0
  15. supertable-0.1.0/supertable/locking/__init__.py +3 -0
  16. supertable-0.1.0/supertable/locking/file_lock.py +104 -0
  17. supertable-0.1.0/supertable/locking/locking.py +162 -0
  18. supertable-0.1.0/supertable/locking/locking_backend.py +6 -0
  19. supertable-0.1.0/supertable/locking/redis_lock.py +70 -0
  20. supertable-0.1.0/supertable/meta_reader.py +182 -0
  21. supertable-0.1.0/supertable/plan_extender.py +97 -0
  22. supertable-0.1.0/supertable/processing.py +334 -0
  23. supertable-0.1.0/supertable/query_plan_manager.py +29 -0
  24. supertable-0.1.0/supertable/rbac/__init__.py +0 -0
  25. supertable-0.1.0/supertable/rbac/access_control.py +229 -0
  26. supertable-0.1.0/supertable/rbac/filter_builder.py +67 -0
  27. supertable-0.1.0/supertable/rbac/permissions.py +33 -0
  28. supertable-0.1.0/supertable/rbac/role_manager.py +149 -0
  29. supertable-0.1.0/supertable/rbac/row_column_security.py +53 -0
  30. supertable-0.1.0/supertable/rbac/user_manager.py +236 -0
  31. supertable-0.1.0/supertable/simple_table.py +192 -0
  32. supertable-0.1.0/supertable/staging_area.py +65 -0
  33. supertable-0.1.0/supertable/storage/__init__.py +0 -0
  34. supertable-0.1.0/supertable/storage/azure_storage.py +199 -0
  35. supertable-0.1.0/supertable/storage/gcp_storage.py +0 -0
  36. supertable-0.1.0/supertable/storage/local_storage.py +128 -0
  37. supertable-0.1.0/supertable/storage/minio_storage.py +218 -0
  38. supertable-0.1.0/supertable/storage/s3_storage.py +233 -0
  39. supertable-0.1.0/supertable/storage/storage_factory.py +29 -0
  40. supertable-0.1.0/supertable/storage/storage_interface.py +105 -0
  41. supertable-0.1.0/supertable/super_table.py +275 -0
  42. supertable-0.1.0/supertable/utils/__init__.py +0 -0
  43. supertable-0.1.0/supertable/utils/helper.py +118 -0
  44. supertable-0.1.0/supertable/utils/sql_parser.py +131 -0
  45. supertable-0.1.0/supertable/utils/timer.py +125 -0
  46. supertable-0.1.0/supertable.egg-info/PKG-INFO +93 -0
  47. supertable-0.1.0/supertable.egg-info/SOURCES.txt +47 -0
  48. supertable-0.1.0/supertable.egg-info/dependency_links.txt +1 -0
  49. supertable-0.1.0/supertable.egg-info/top_level.txt +1 -0
@@ -0,0 +1,84 @@
1
+ SUPER TABLE PUBLIC USE LICENSE (STPUL) v1.0
2
+
3
+ 1. Definitions
4
+ 1.1. “Software” refers to the source code, binary code, documentation,
5
+ and related materials for the project known as “SuperTable.”
6
+ 1.2. “Licensor” refers to Kladna Soft Kft.
7
+ 1.3. “You” (or “Your”) means the individual or entity exercising the
8
+ rights granted under this License.
9
+
10
+ 2. Grant of License
11
+ 2.1. Personal, Non-Commercial, and Evaluation Use
12
+ - You are granted a worldwide, royalty-free, non-exclusive,
13
+ non-transferable license to download, install, and use the
14
+ Software solely for personal, non-commercial, or evaluation
15
+ purposes at no cost.
16
+ - Evaluation use is limited to internal testing and shall not
17
+ exceed 30 days unless otherwise authorized in writing by the
18
+ Licensor.
19
+
20
+ 2.2. Commercial or Enterprise Use
21
+ - If You wish to use the Software for commercial purposes,
22
+ within an enterprise environment, or as part of any product
23
+ or service provided to a third party, You must obtain a
24
+ separate commercial license from the Licensor. Fees may apply.
25
+
26
+ 3. No Modification
27
+ 3.1. You may not modify, reverse engineer, decompile, disassemble,
28
+ create derivative works from, or otherwise alter the Software
29
+ in whole or in part without the prior written consent of the
30
+ Licensor.
31
+
32
+ 4. No Redistribution
33
+ 4.1. You may not publish, distribute, sublicense, or otherwise make
34
+ the Software available to any third party, unless You have
35
+ obtained explicit written permission from the Licensor.
36
+
37
+ 5. Ownership and Reservation of Rights
38
+ 5.1. All rights, title, and interest in the Software remain with the
39
+ Licensor. This License does not transfer any ownership rights.
40
+ 5.2. Access to the source code is provided for transparency and
41
+ evaluation only. It does not constitute a grant of rights
42
+ beyond those explicitly stated in this License.
43
+ 5.3. The Licensor reserves all rights not expressly granted to You
44
+ under this License.
45
+
46
+ 6. Disclaimer of Warranties
47
+ 6.1. THE SOFTWARE IS PROVIDED “AS IS,” WITHOUT WARRANTY OF ANY KIND,
48
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
49
+ OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND
50
+ NON-INFRINGEMENT. THE ENTIRE RISK AS TO THE QUALITY AND
51
+ PERFORMANCE OF THE SOFTWARE IS WITH YOU.
52
+
53
+ 7. Limitation of Liability
54
+ 7.1. IN NO EVENT SHALL THE LICENSOR BE LIABLE FOR ANY INDIRECT,
55
+ INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
56
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
57
+ GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
58
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
59
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
60
+ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
61
+ THE SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62
+
63
+ 8. Termination
64
+ 8.1. This License and the rights granted hereunder will terminate
65
+ automatically if You fail to comply with any term or condition
66
+ of this License. Upon termination, You must cease all use of
67
+ the Software and destroy any copies of the Software in Your
68
+ possession or control.
69
+ 8.2. The Licensor reserves the right to revoke this License in the
70
+ event of a violation or at its sole discretion, subject to
71
+ applicable law.
72
+
73
+ 9. Governing Law
74
+ 9.1. This License shall be governed by and construed in accordance
75
+ with the laws of Hungary, without regard to its conflict of
76
+ law principles.
77
+
78
+ 10. Contact Information
79
+ 10.1. For inquiries about commercial licensing or any other questions
80
+ regarding this License, please contact:
81
+ lkupas@kladnasoft.com
82
+
83
+ BY USING THE SOFTWARE, YOU ACKNOWLEDGE THAT YOU HAVE READ THIS LICENSE,
84
+ UNDERSTAND IT, AND AGREE TO BE BOUND BY ITS TERMS AND CONDITIONS.
@@ -0,0 +1,17 @@
1
+ # MANIFEST.in
2
+
3
+ # Essential top‑level docs
4
+ include README.md
5
+ include LICENSE
6
+ include pyproject.toml
7
+ #include setup.py
8
+
9
+ # Remove bulky or dev‑only content
10
+ prune examples
11
+ prune tests
12
+ prune tmp
13
+ prune generator
14
+ prune */__pycache__
15
+
16
+ # Exclude compiled artifacts and large sample data
17
+ global-exclude *.py[cod] *.parquet *.csv *.json
@@ -0,0 +1,93 @@
1
+ Metadata-Version: 2.4
2
+ Name: supertable
3
+ Version: 0.1.0
4
+ Summary: SuperTable revolutionizes data management by integrating multiple basic tables into a single, cohesive framework.
5
+ Home-page: https://github.com/kladnasoft/supertable
6
+ Author: Levente Kupas
7
+ Author-email: Levente Kupas <lkupas@kladnasoft.com>
8
+ License: Super Table Public Use License (STPUL) v1.0
9
+ Project-URL: Homepage, https://github.com/kladnasoft/supertable
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: License :: Other/Proprietary License
12
+ Classifier: Operating System :: OS Independent
13
+ Requires-Python: >=3.10
14
+ Description-Content-Type: text/markdown
15
+ License-File: LICENSE
16
+ Dynamic: author
17
+ Dynamic: home-page
18
+ Dynamic: license-file
19
+ Dynamic: requires-python
20
+
21
+ # SuperTable
22
+
23
+ ![Python](https://img.shields.io/badge/python-3.10+-blue)
24
+ ![License: STPUL](https://img.shields.io/badge/license-STPUL-blue)
25
+
26
+ **SuperTable — The simplest data warehouse & cataloging system.**
27
+ A high‑performance, lightweight transaction catalog that integrates multiple
28
+ basic tables into a single, cohesive framework designed for ultimate
29
+ efficiency.
30
+
31
+ Whether you are processing massive datasets or continuously evolving data
32
+ structures, SuperTable lets you query your information effortlessly. It
33
+ automatically creates and manages tables so you can start running SQL queries
34
+ immediately—no complicated schemas or manual joins required.
35
+
36
+ By holding all your data in one place, SuperTable removes complexity, boosts
37
+ efficiency, and provides a unified view of your information. From startups to
38
+ enterprises, SuperTable helps you make better decisions, faster.
39
+
40
+ ---
41
+
42
+ ## Key Features
43
+
44
+ - **Automatic table creation**
45
+ Load your data and SuperTable instantly builds the required tables and
46
+ columns—no predefined schema or extra setup.
47
+
48
+ - **Self‑referencing architecture**
49
+ Combine and analyze data across tables without writing manual joins; tables
50
+ can dynamically reference each other for richer insights.
51
+
52
+ - **Staging module with history**
53
+ Upload files to a staging area and reload any version at any time, keeping a
54
+ complete audit trail for tracking and compliance.
55
+
56
+ - **Columnar storage for speed**
57
+ Fully denormalized, column‑partitioned tables deliver lightning‑fast queries,
58
+ even with thousands of columns.
59
+
60
+ - **Built‑in RBAC security**
61
+ Define users and roles to control row‑ and column‑level access—no external
62
+ security tools required.
63
+
64
+ - **Platform independent**
65
+ Deploy on any major cloud provider or on‑premise. SuperTable is a pure Python
66
+ library with no hidden costs.
67
+
68
+ ---
69
+
70
+ ## Benefits
71
+
72
+ - **Quick start**
73
+ Go from raw data to query‑ready in minutes—faster than spreadsheets or
74
+ traditional databases.
75
+
76
+ - **Higher efficiency**
77
+ Eliminate manual steps and rework so you spend more time analyzing and less
78
+ time wrangling.
79
+
80
+ - **Holistic insights**
81
+ Analyze datasets individually or together to uncover trends, outliers, and
82
+ cross‑dependencies.
83
+
84
+ - **Cost savings**
85
+ Consolidate licenses, simplify support, and reinvest the savings in deeper
86
+ analytics.
87
+
88
+ ---
89
+
90
+ SuperTable provides a flexible, high‑performance solution that grows with your
91
+ business. Cut complexity, save time, and gain deeper insights—all in a single,
92
+ streamlined platform.
93
+
@@ -0,0 +1,73 @@
1
+ # SuperTable
2
+
3
+ ![Python](https://img.shields.io/badge/python-3.10+-blue)
4
+ ![License: STPUL](https://img.shields.io/badge/license-STPUL-blue)
5
+
6
+ **SuperTable — The simplest data warehouse & cataloging system.**
7
+ A high‑performance, lightweight transaction catalog that integrates multiple
8
+ basic tables into a single, cohesive framework designed for ultimate
9
+ efficiency.
10
+
11
+ Whether you are processing massive datasets or continuously evolving data
12
+ structures, SuperTable lets you query your information effortlessly. It
13
+ automatically creates and manages tables so you can start running SQL queries
14
+ immediately—no complicated schemas or manual joins required.
15
+
16
+ By holding all your data in one place, SuperTable removes complexity, boosts
17
+ efficiency, and provides a unified view of your information. From startups to
18
+ enterprises, SuperTable helps you make better decisions, faster.
19
+
20
+ ---
21
+
22
+ ## Key Features
23
+
24
+ - **Automatic table creation**
25
+ Load your data and SuperTable instantly builds the required tables and
26
+ columns—no predefined schema or extra setup.
27
+
28
+ - **Self‑referencing architecture**
29
+ Combine and analyze data across tables without writing manual joins; tables
30
+ can dynamically reference each other for richer insights.
31
+
32
+ - **Staging module with history**
33
+ Upload files to a staging area and reload any version at any time, keeping a
34
+ complete audit trail for tracking and compliance.
35
+
36
+ - **Columnar storage for speed**
37
+ Fully denormalized, column‑partitioned tables deliver lightning‑fast queries,
38
+ even with thousands of columns.
39
+
40
+ - **Built‑in RBAC security**
41
+ Define users and roles to control row‑ and column‑level access—no external
42
+ security tools required.
43
+
44
+ - **Platform independent**
45
+ Deploy on any major cloud provider or on‑premise. SuperTable is a pure Python
46
+ library with no hidden costs.
47
+
48
+ ---
49
+
50
+ ## Benefits
51
+
52
+ - **Quick start**
53
+ Go from raw data to query‑ready in minutes—faster than spreadsheets or
54
+ traditional databases.
55
+
56
+ - **Higher efficiency**
57
+ Eliminate manual steps and rework so you spend more time analyzing and less
58
+ time wrangling.
59
+
60
+ - **Holistic insights**
61
+ Analyze datasets individually or together to uncover trends, outliers, and
62
+ cross‑dependencies.
63
+
64
+ - **Cost savings**
65
+ Consolidate licenses, simplify support, and reinvest the savings in deeper
66
+ analytics.
67
+
68
+ ---
69
+
70
+ SuperTable provides a flexible, high‑performance solution that grows with your
71
+ business. Cut complexity, save time, and gain deeper insights—all in a single,
72
+ streamlined platform.
73
+
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "supertable"
7
+ version = "0.1.0"
8
+ description = "SuperTable revolutionizes data management by integrating multiple basic tables into a single, cohesive framework."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ license = { text = "Super Table Public Use License (STPUL) v1.0" }
12
+ authors = [
13
+ { name = "Levente Kupas", email = "lkupas@kladnasoft.com" }
14
+ ]
15
+ classifiers = [
16
+ "Programming Language :: Python :: 3",
17
+ "License :: Other/Proprietary License",
18
+ "Operating System :: OS Independent"
19
+ ]
20
+
21
+ [tool.setuptools]
22
+ include-package-data = true
23
+
24
+ [project.urls]
25
+ Homepage = "https://github.com/kladnasoft/supertable"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,21 @@
1
from setuptools import setup, find_packages
from pathlib import Path

# Read the long description with an explicit encoding.  The original used a
# bare open("README.md").read(), which leaked the file handle and depended on
# the locale's default encoding (breaks on non-UTF-8 build hosts).
long_description = Path("README.md").read_text(encoding="utf-8")

setup(
    name="supertable",
    version="0.1.0",
    packages=find_packages(include=["supertable", "supertable.*"]),
    include_package_data=True,
    author="Levente Kupas",
    author_email="lkupas@kladnasoft.com",
    description="A high-performance, lightweight transaction cataloging system designed for ultimate efficiency.",
    license="Super Table Public Use License (STPUL) v1.0",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/kladnasoft/supertable",
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: Other/Proprietary License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.10",
)
File without changes
File without changes
@@ -0,0 +1,81 @@
1
+ import os
2
+ import logging
3
+ import sys
4
+
5
+ from dotenv import load_dotenv
6
+ from dataclasses import dataclass
7
+
8
+ import colorlog
9
+
10
# Configure a colorised stream handler so each log level is visually distinct
# on the console.  This runs at import time and installs the handler on the
# root logger via basicConfig below.
handler = colorlog.StreamHandler()
handler.setFormatter(colorlog.ColoredFormatter(
    '%(log_color)s%(asctime)s - %(levelname)-8s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    log_colors={
        'DEBUG': 'cyan',
        'INFO': 'green',
        'WARNING': 'yellow',
        'ERROR': 'red',
        'CRITICAL': 'red,bg_white',  # White background with red text
    },
    secondary_log_colors={},
    style='%'
))

# Root logger starts at INFO; load_defaults_from_env (later in this module)
# may override the level from the environment.
logging.basicConfig(level=logging.INFO, handlers=[handler])
logger = logging.getLogger(__name__)
28
+
29
+
30
@dataclass
class Default:
    """Runtime configuration defaults for supertable."""

    MAX_MEMORY_CHUNK_SIZE: int = 16 * 1024 * 1024  # 16 MiB
    DEFAULT_TIMEOUT_SEC: int = 10
    DEFAULT_LOCK_DURATION_SEC: int = 60
    LOG_LEVEL: str = "INFO"  # Replaced IS_DEBUG with LOG_LEVEL
    IS_SHOW_TIMING: bool = True
    STORAGE_TYPE: str = "LOCAL"

    def update_default(self, **kwargs):
        """Update matching fields of this instance in place.

        Keys that are not existing attributes are silently ignored.
        A change to LOG_LEVEL is pushed to the root logger immediately.
        """
        for name, new_value in kwargs.items():
            if not hasattr(self, name):
                continue
            setattr(self, name, new_value)
            if name == "LOG_LEVEL":
                self._update_log_level()

    def _update_log_level(self):
        """Propagate the current LOG_LEVEL to the root logger and log it."""
        logging.getLogger().setLevel(self.LOG_LEVEL)
        logger.info(f"Log level changed to {self.LOG_LEVEL}")
54
+
55
+
56
def load_defaults_from_env(env_file: str = ".env") -> Default:
    """Build a Default instance from environment variables.

    `env_file` is loaded through python-dotenv first, so a local .env can
    supply values; each field falls back to its dataclass default.
    """
    load_dotenv(env_file)

    # Normalise and validate the requested log level before applying it.
    valid_levels = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
    log_level = os.getenv("LOG_LEVEL", "INFO").upper()
    if log_level not in valid_levels:
        log_level = "INFO"
        logger.warning(f"Invalid LOG_LEVEL in .env. Using default INFO. Valid levels are: {valid_levels}")

    # Apply the level right away so subsequent imports log consistently.
    logging.getLogger().setLevel(log_level)

    return Default(
        MAX_MEMORY_CHUNK_SIZE=int(os.getenv("MAX_MEMORY_CHUNK_SIZE", 16 * 1024 * 1024)),
        DEFAULT_TIMEOUT_SEC=int(os.getenv("DEFAULT_TIMEOUT_SEC", 10)),
        DEFAULT_LOCK_DURATION_SEC=int(os.getenv("DEFAULT_LOCK_DURATION_SEC", 60)),
        LOG_LEVEL=log_level,
        IS_SHOW_TIMING=(os.getenv("IS_SHOW_TIMING", "True").lower() == "true"),
        STORAGE_TYPE=os.getenv("STORAGE_TYPE", "LOCAL").upper(),
    )


# Module-level configuration singleton consumed across the package.
default = load_defaults_from_env()
@@ -0,0 +1,34 @@
1
+ import os
2
+ import sys
3
+
4
+ from supertable.config.defaults import default, logger
5
+
6
# If this file is located in a subdirectory, adjust the path logic as needed.
# Currently appending ".." from __file__ to add the project root directory
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

# Application home directory; "~" is only expanded later, inside
# change_to_app_home, not here.
app_home = os.getenv("SUPERTABLE_HOME", "~/supertable")
11
+
12
def change_to_app_home(home_dir: str) -> None:
    """Switch the process working directory to `home_dir`.

    The path is user-expanded first ("~" -> home directory).  Success is
    logged at debug level; any failure is logged as an error instead of
    being raised to the caller.
    """
    target_dir = os.path.expanduser(home_dir)
    try:
        os.chdir(target_dir)
        logger.debug(f"Changed working directory to {target_dir}")
    except Exception as exc:
        logger.error(f"Failed to change working directory to {target_dir}: {exc}")
23
+
24
# Enter the application home so relative storage paths resolve under it.
# NOTE(review): app_home is practically always truthy because os.getenv above
# supplies a default; the else branch only fires when SUPERTABLE_HOME is set
# to an empty string — confirm this is the intended guard.
if app_home:
    change_to_app_home(app_home)
else:
    logger.error("SUPERTABLE_HOME environment variable is not set")


logger.info(f"Current working directory: {os.getcwd()}")
31
+
32
+
33
def get_app_home():
    """Return the configured application home path (as given, unexpanded)."""
    return app_home
@@ -0,0 +1,213 @@
1
+ from enum import Enum
2
+
3
+ import duckdb
4
+ import pandas as pd
5
+
6
+ from supertable.config.defaults import logger
7
+ from supertable.utils.timer import Timer
8
+ from supertable.super_table import SuperTable
9
+ from supertable.query_plan_manager import QueryPlanManager
10
+ from supertable.utils.sql_parser import SQLParser
11
+ from supertable.utils.helper import dict_keys_to_lowercase
12
+ from supertable.plan_extender import PlanStats, extend_execution_plan
13
+ from supertable.rbac.access_control import restrict_read_access
14
+
15
class Status(Enum):
    """Execution outcome reported by DataReader.execute."""
    OK = "ok"
    ERROR = "error"
18
+
19
+
20
class DataReader:
    """Executes a parsed SQL query against a super table via DuckDB."""

    def __init__(self, super_name, organization, query):
        """Bind a parsed query to the given super table.

        Args:
            super_name: Name of the super table to read from.
            organization: Owning organization (storage namespacing).
            query: SQL text; parsed immediately by SQLParser.
        """
        self.super_table = SuperTable(super_name=super_name, organization=organization)
        self.parser = SQLParser(query)
        self.parser.parse_sql()
        # Populated per execute() call.
        self.timer = None
        self.plan_stats = None
        self.query_plan_manager = None
28
+
29
+ def filter_snapshots(self, super_table_data, super_table_meta):
30
+ snapshots = super_table_data.get("snapshots")
31
+ file_count = super_table_meta.get("file_count", 0)
32
+ total_rows = super_table_meta.get("total_rows", 0)
33
+ total_file_size = super_table_meta.get("total_file_size", 0)
34
+ self.plan_stats.add_stat({"TABLE_FILES": file_count})
35
+ self.plan_stats.add_stat({"TABLE_SIZE": total_file_size})
36
+ self.plan_stats.add_stat({"TABLE_ROWS": total_rows})
37
+
38
+ if self.super_table.super_name.lower() == self.parser.original_table.lower():
39
+ filtered_snapshots = [
40
+ s for s in snapshots
41
+ if not (s["table_name"].startswith("__") and s["table_name"].endswith("__"))
42
+ ]
43
+ return filtered_snapshots
44
+ else:
45
+ filtered_snapshots = [
46
+ entry
47
+ for entry in snapshots
48
+ if entry["table_name"].lower() == self.parser.original_table.lower()
49
+ ]
50
+
51
+ return filtered_snapshots
52
+
53
+ timer = Timer()
54
+ @timer
55
+ def execute(self, user_hash: str, with_scan: bool=False):
56
+ status = Status.ERROR
57
+ message = None
58
+ self.timer = Timer()
59
+ self.plan_stats = PlanStats()
60
+
61
+ try:
62
+ super_table_data, super_table_path, super_table_meta = self.super_table.get_super_table_and_path_with_shared_lock()
63
+
64
+ self.timer.capture_and_reset_timing(event="META")
65
+
66
+ self.query_plan_manager = QueryPlanManager(super_name=self.super_table.super_name,
67
+ organization=self.super_table.organization,
68
+ current_meta_path=super_table_path,
69
+ parser=self.parser)
70
+
71
+ snapshots = self.filter_snapshots(super_table_data=super_table_data,
72
+ super_table_meta=super_table_meta)
73
+ logger.debug(f"Filtered snapshots: {snapshots}")
74
+
75
+ parquet_files, schema = self.process_snapshots(snapshots=snapshots,
76
+ with_scan=with_scan)
77
+ logger.debug(f"Parquet Files: {parquet_files}")
78
+
79
+ missing_columns = (
80
+ set([column.lower() for column in self.parser.columns_list])
81
+ - set("*")
82
+ - schema
83
+ )
84
+ logger.debug(f"Mising Columns: {missing_columns}")
85
+
86
+ if len(snapshots) == 0 or missing_columns or not parquet_files:
87
+ message = (
88
+ f"Missing column(s): {', '.join ( missing_columns )}"
89
+ if missing_columns
90
+ else "No parquet files found"
91
+ )
92
+ logger.warning(f"Filter Result: {message}")
93
+ return pd.DataFrame(), status, message
94
+
95
+ restrict_read_access(super_name=self.super_table.super_name,
96
+ organization=self.super_table.organization,
97
+ user_hash=user_hash,
98
+ table_name=self.parser.reflection_table,
99
+ table_schema=schema,
100
+ parsed_columns=self.parser.columns_list,
101
+ parser=self.parser)
102
+
103
+ self.timer.capture_and_reset_timing(event="FILTERING")
104
+
105
+ result = self.execute_with_duckdb(parquet_files=parquet_files,
106
+ query_manager=self.query_plan_manager)
107
+
108
+ status = Status.OK
109
+ except Exception as e:
110
+ message = str(e)
111
+ logger.error(f"Exception: {e}")
112
+ result = pd.DataFrame()
113
+ self.timer.capture_and_reset_timing(event="EXECUTING_QUERY")
114
+
115
+ try:
116
+ extend_execution_plan(super_table=self.super_table,
117
+ user_hash=user_hash,
118
+ query_plan_manager=self.query_plan_manager,
119
+ timing=self.timer.timings,
120
+ status=status.value,
121
+ message=message,
122
+ result_shape=result.shape,
123
+ plan_stats=self.plan_stats)
124
+ except Exception as e:
125
+ logger.error(f"Exception: {e}")
126
+
127
+ self.timer.capture_and_reset_timing(event="EXTENDING_PLAN")
128
+ self.timer.capture_duration(event="TOTAL_EXECUTE")
129
+ return result, status, message
130
+
131
+ def process_snapshots(self, snapshots, with_scan):
132
+ parquet_files = []
133
+ reflection_file_size = 0
134
+ reflection_rows = 0
135
+
136
+ schema = set()
137
+ for snapshot in snapshots:
138
+ current_snapshot_path = snapshot["path"]
139
+ current_snapshot_data = self.super_table.read_simple_table_snapshot(
140
+ current_snapshot_path
141
+ )
142
+
143
+ current_schema = current_snapshot_data.get("schema", {})
144
+ resources = current_snapshot_data.get("resources", {})
145
+ schema.update(dict_keys_to_lowercase(current_schema).keys())
146
+
147
+ for resource in resources:
148
+ file_size = resource.get("file_size", 0)
149
+ file_rows = resource.get("rows", 0)
150
+
151
+ if (
152
+ with_scan
153
+ or self.parser.columns_csv == "*"
154
+ or any(
155
+ col in dict_keys_to_lowercase(current_schema).keys()
156
+ for col in [
157
+ column.lower() for column in self.parser.columns_list
158
+ ]
159
+ )
160
+ ):
161
+ parquet_files.append(resource["file"])
162
+ reflection_file_size += file_size
163
+ reflection_rows += file_rows
164
+
165
+ logger.debug(f"snapshots: {len ( snapshots )}")
166
+ logger.debug(f"parquet_files: {len ( parquet_files )}")
167
+ logger.debug(f"schema: {schema}")
168
+
169
+ self.plan_stats.add_stat({"REFLECTIONS": len(parquet_files)})
170
+ self.plan_stats.add_stat({"REFLECTION_SIZE": reflection_file_size})
171
+ self.plan_stats.add_stat({"REFLECTION_ROWS": reflection_rows})
172
+
173
+ return parquet_files, schema
174
+
175
+ def execute_with_duckdb(self, parquet_files, query_manager: QueryPlanManager):
176
+ # Use DuckDB to read and query the parquet files directly
177
+ con = duckdb.connect()
178
+
179
+ con.execute("PRAGMA memory_limit='2GB';")
180
+ con.execute(f"PRAGMA temp_directory='{query_manager.temp_dir}';")
181
+ con.execute("PRAGMA enable_profiling='json';")
182
+ #con.execute("SET profiling_mode = 'standard';")
183
+ con.execute(f"PRAGMA profile_output = '{query_manager.query_plan_path}';")
184
+
185
+ # Read and register parquet files directly with DuckDB
186
+ parquet_files_str = ", ".join(f"'{file}'" for file in parquet_files)
187
+ logger.debug(f"parquet files: {len(parquet_files)}")
188
+
189
+ self.timer.capture_and_reset_timing("CONNECTING")
190
+
191
+ create_table = f"""
192
+ CREATE TABLE {self.parser.reflection_table}
193
+ AS
194
+ SELECT {self.parser.columns_csv}
195
+ FROM parquet_scan([{parquet_files_str}], union_by_name=True, HIVE_PARTITIONING=TRUE);
196
+ """
197
+
198
+ logger.debug(f"create_table: {create_table}")
199
+ con.execute(create_table)
200
+
201
+ create_view = f"""
202
+ CREATE VIEW {self.parser.rbac_view}
203
+ AS
204
+ {self.parser.view_definition}
205
+ """
206
+ logger.debug(f"create_view: {create_view}")
207
+ con.execute(create_view)
208
+
209
+ self.timer.capture_and_reset_timing("CREATING_REFLECTION")
210
+ logger.debug(f"Executing Query: {self.parser.executing_query}")
211
+ result = con.execute(query=self.parser.executing_query).fetchdf()
212
+ logger.debug(f"result.shape: {result.shape}")
213
+ return result