dataforge-core 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dataforge/__init__.py ADDED
File without changes
@@ -0,0 +1,128 @@
1
+ import json
2
+ import os
3
+ import sys
4
+
5
+ import yaml
6
+ from .mainConfig import MainConfig
7
+ from .miniSparky import MiniSparky
8
+
9
+
10
class ImportProject:
    """Imports a dataforge project folder into the metadata database.

    Orchestrates validation of the project layout, loading of the YAML
    definitions, server-side parsing and execution, expression testing via a
    local Spark session, and generation of the output SQL files.
    """

    def __init__(self, config: "MainConfig"):
        self.ms = None          # MiniSparky session; created lazily in test_expressions()
        self._config = config
        self.import_id = 0      # assigned by start(); 0 means "not started yet"

    def start(self):
        """Register a new import run in the database and remember its id."""
        _import_id = self._config.pg.sql("select meta.svc_import_start()")
        self.import_id = int(_import_id)
        print('Started import with id ', self.import_id)

    def validate(self):
        """Exit the process unless the project folder contains meta.yaml and a sources/ dir."""
        print(f"Validating project path {self._config.source_path}")
        meta_flag = False
        source_flag = False
        with os.scandir(self._config.source_path) as entries:
            for file in entries:
                if file.is_dir() and file.name == 'sources':
                    source_flag = True
                elif file.name == "meta.yaml":
                    meta_flag = True
        if not meta_flag:
            print(f"Missing meta.yaml in project path {self._config.source_path}")
            sys.exit(1)
        if not source_flag:
            print(f"Missing sources folder in project path {self._config.source_path}")
            sys.exit(1)

    def load(self):
        """Run the full import pipeline for the configured project path."""
        self.validate()
        self.start()
        with os.scandir(self._config.source_path) as entries:
            for file in entries:
                if file.is_dir() and file.name in ('sources', 'outputs'):
                    self.list_files(file.path, file.name)
                elif file.name.endswith(".yaml"):
                    self.load_file(file.path, file.name)
        self._config.pg.sql("SELECT meta.svc_import_complete(%s, 'I')", [self.import_id])
        self._config.pg.sql("SELECT meta.imp_parse_objects(%s)", [self.import_id])
        print("Files parsed")
        self._config.pg.sql("SELECT meta.svc_import_execute(%s)", [self.import_id])
        print("Objects loaded")
        self.test_expressions()
        print("Expressions validated")
        print("Import completed successfully")
        self.write_log()
        self.write_queries("source")
        self.write_queries("output")

    def list_files(self, path: str, folder_name: str):
        """Load every .yaml file directly inside *path*, prefixing object names with *folder_name*."""
        print("Importing files..")
        with os.scandir(path) as entries:
            for file in entries:
                # BUG FIX: was `file.is_file() & file.name.endswith(...)` — bitwise &
                # does not short-circuit and is not the boolean operator; use `and`.
                if file.is_file() and file.name.endswith(".yaml"):
                    self.load_file(file.path, folder_name + '/' + file.name)

    def load_file(self, full_path: str, path: str):
        """Parse one YAML file and store it as a JSON import object."""
        print(path)
        with open(full_path, 'r') as file:
            file_js = yaml.safe_load(file)
        self._config.pg.sql("SELECT meta.svc_import_load_object(%s, %s, %s)",
                            (self.import_id, path, json.dumps(file_js)))

    def test_expressions(self):
        """Validate all project expressions against a throwaway local Spark session."""
        exps = self._config.pg.sql("SELECT meta.impc_test_expressions(%s)", [self.import_id])
        self.ms = MiniSparky(self._config)
        self.test_expressions_recursive(exps)
        self.ms.stop()
        del self.ms

    def test_expressions_recursive(self, test_expressions, recursion_level=0):
        """Execute a batch of test expressions, post results, and recurse on dependents.

        Recursion depth is capped at 20 levels; any failure aborts the import.
        """
        try:
            test_results = []
            for exp in test_expressions:
                query_result = self.ms.execute_query(exp['expression'])
                del exp['expression']
                exp['result'] = query_result
                test_results.append(exp)
            # update test results
            test_results_str = json.dumps(test_results)
            res = self._config.pg.sql("SELECT meta.impc_update_test_results(%s, %s)",
                                      (self.import_id, test_results_str))
            if res.get('error'):
                self.fail_import('Invalid expression detected. See log file for details')
            if recursion_level > 20:
                self.fail_import('Maximum recursion exceeded while testing expressions. Check error logs and '
                                 'expression test tables for more details')
            if len(res['next']) > 0:
                self.test_expressions_recursive(res['next'], recursion_level + 1)
        except Exception as e:
            print(e)
            self.fail_import(str(e))

    def fail_import(self, message):
        """Mark the import as failed, dump the log file, and exit the process."""
        print(f"Import failed: {message}")
        self._config.pg.sql("SELECT meta.svc_import_complete(%s, 'F', %s)", (self.import_id, message))
        self.write_log()
        sys.exit(1)

    def write_log(self):
        """Fetch the server-side import log and write it to the configured log path."""
        log_file = self._config.pg.sql("SELECT meta.svc_import_get_log(%s)", [self.import_id])
        with open(self._config.log_path, "w") as file:
            # Write the string to the file
            file.write(log_file)

    def write_queries(self, out_type: str):
        """Generate SQL files for *out_type* ('source' or 'output') into the target folder.

        Raises:
            ValueError: if *out_type* is not 'source' or 'output'.
        """
        queries = self._config.pg.sql(f"select meta.svcc_get_{out_type}_queries(%s)", [self.import_id])
        if not queries:
            return
        if out_type == "source":
            out_path = self._config.output_source_path
        elif out_type == "output":
            out_path = self._config.output_output_path
        else:
            # BUG FIX: previously an unknown out_type fell through and raised
            # UnboundLocalError on out_path; fail loudly with a clear message.
            raise ValueError(f"Unknown query output type: {out_type}")
        for query in queries:
            file_name = os.path.join(out_path, query['file_name'])
            with open(file_name, "w") as file:
                # Write the string to the file
                file.write(query['query'])
        print(f"Generated {len(queries)} {out_type} queries")
dataforge/main.py ADDED
@@ -0,0 +1,12 @@
1
+ from .importProject import ImportProject
2
+ from .mainConfig import MainConfig
3
+
4
+
5
def main():
    """Entry point: build the configuration and run the project import."""
    configuration = MainConfig()
    project_import = ImportProject(configuration)
    project_import.load()


if __name__ == '__main__':
    main()
@@ -0,0 +1,72 @@
1
+ import argparse
2
+ import os
3
+ import shutil
4
+ import sys
5
+ import traceback
6
+ from types import ModuleType
7
+
8
+ from importlib_resources.abc import Traversable
9
+ from .pg import Pg
10
+ import importlib_resources
11
+
12
+
13
class MainConfig:
    """Parses command-line arguments and prepares the source/output folder layout.

    Side effects on construction: may seed the database (--seed / --connect),
    may initialize a fresh project folder (--init), and otherwise recreates
    the target/ output directory tree under the project path.
    """

    def __init__(self):
        # check if Java is installed (required by the local Spark session)
        if os.environ.get('JAVA_HOME') is None:
            print("Java is not installed or JAVA_HOME environment variable is not set")
            sys.exit(1)
        # parse command line args
        _parser = argparse.ArgumentParser(
            prog='dataforge core',
            description='Dataforge Core compiles project and generates source and output SQL queries',
            epilog='Try our cloud product')
        _parser.add_argument('source', type=str, help='Project folder', metavar='<Project Path>', nargs='?')
        _parser.add_argument('--init', '-i', action='store_true', help='Initialize project folder')
        _parser.add_argument('--seed', action='store_true', help='Deploy and seed postgres database')
        _parser.add_argument('--connect', '-c', type=str, help='Connect to postgres database',
                             metavar='<Postgres connection string>')

        args = _parser.parse_args()
        if args.connect:
            # --connect: store the connection, seed the database, then exit
            self.pg = Pg(args.connect, initialize=True)
            self.pg.seed()
            sys.exit(0)
        else:
            self.pg = Pg()
            if args.seed:
                self.pg.seed()
        self.source_path = os.getcwd() if args.source is None else args.source
        if args.init:
            # --init: wipe the project folder and copy the packaged template, then exit
            try:
                if self.pg.confirm_action(f"All files and subfolders in {self.source_path} will be deleted. Continue (y/n)? "):
                    shutil.rmtree(self.source_path, ignore_errors=True)
                    os.makedirs(self.source_path)
                    self.traverse_resource_dir(importlib_resources.files().joinpath('resources', 'project'))
                    print(f"Initialized project in {self.source_path}")
            except Exception as e:
                traceback.print_stack()
                print(f"Error initializing project in {self.source_path} : {e}")
            sys.exit(0)
        # (re)create the target/ output folder tree used by the compile run
        self.output_path = os.path.join(self.source_path, 'target')
        self.log_path = os.path.join(self.output_path, 'log.txt')
        shutil.rmtree(self.output_path, ignore_errors=True)
        os.makedirs(self.output_path)
        self.output_source_path = os.path.join(self.output_path, 'sources')
        os.makedirs(self.output_source_path)
        self.output_output_path = os.path.join(self.output_path, 'outputs')
        os.makedirs(self.output_output_path)

    def traverse_resource_dir(self, resource: "Traversable", folder=''):
        """Recursively copy packaged resource files into source_path/folder.

        BUG FIX: the recursive call previously passed only ``file.name`` as the
        folder, so resources nested more than one directory deep were written
        to a non-existent path; the parent folder is now preserved.
        """
        for file in resource.iterdir():
            if file.is_dir():
                os.makedirs(os.path.join(self.source_path, folder, file.name))
                self.traverse_resource_dir(resource.joinpath(file.name), os.path.join(folder, file.name))
            if file.is_file():
                self.copy_resource_file(folder, file)

    def copy_resource_file(self, folder: str, resource: "Traversable"):
        """Copy a single text resource into source_path/folder."""
        # NOTE(review): resources are copied via read_text(), so binary project
        # resources would break — confirm packaged template files are all text.
        file_name = os.path.join(self.source_path, folder, resource.name)
        with open(file_name, "w") as file:
            # Write the string to the file
            file.write(resource.read_text())
@@ -0,0 +1,83 @@
1
+ from pyspark.sql import SparkSession
2
+ from pyspark.sql.types import DataType, StructType, ArrayType, DecimalType
3
+ from .mainConfig import MainConfig
4
+
5
+
6
class MiniSparky:
    """Minimal local Spark session used to validate project expressions.

    Builds a single-core, low-memory local SparkSession, registers a two-row
    "datatypes" temp view covering the supported column types, and loads a
    Spark-type -> hive-type mapping from the metadata database.
    """

    def __init__(self, config: MainConfig):
        self._config = config
        # Deliberately tiny, quiet session: 1 local core, 512m driver/executor,
        # UI disabled, log level forced to FATAL on both driver and executor.
        self.spark = SparkSession.builder \
            .appName("dfCore") \
            .master("local[1]") \
            .config("spark.driver.extraJavaOptions", "-Dlog4j.logger.level=FATAL") \
            .config("spark.executor.extraJavaOptions", "-Dlog4j.logger.level=FATAL") \
            .config("spark.log.level", "FATAL") \
            .config("spark.driver.memory", "512m") \
            .config("spark.executor.memory", "512m") \
            .config("spark.executor.cores", "1") \
            .config("spark.ui.enabled", "false") \
            .getOrCreate()

        # Two rows of sample values, one column per supported type name, so
        # user expressions referencing the "datatypes" view can be executed.
        self.spark.sql(
            """SELECT CAST(-87.68 as DECIMAL(10,2)) `decimal` , CAST(13518864 as BIGINT) `bigint`, CAST('Western Ave & Walton St' as STRING) `string`, CAST(130 AS INT) `int`, CAST(130 AS INT) `integer`, CAST(41.90331 as FLOAT) `float` , CAST(87.67695 as DOUBLE) `double`, CAST('2017-03-31' as DATE) `date`, CAST('2017-03-31T23:19:17.000+0000' as TIMESTAMP) `timestamp` , true `boolean`, CAST(9999999999 as BIGINT) `long`
            UNION ALL
            SELECT CAST(-8127.68 as DECIMAL(10,2)) `decimal` , CAST(1518864 as BIGINT) `bigint`, CAST('Western Ave & Walton St' as STRING) `string`, CAST(130 AS INT) `int`, CAST(130 AS INT) `integer`, CAST(41.90331 as FLOAT) `float` , CAST(87.67695 as DOUBLE) `double`, CAST('2020-03-31' as DATE) `date`, CAST('2020-03-31T23:19:17' as TIMESTAMP) `timestamp` , true `boolean`, CAST(99999999991 as BIGINT) `long`
            """).createOrReplaceTempView("datatypes")

        # Build the spark_type -> hive_type lookup from the metadata database.
        _res = self._config.pg.sql("select meta.svc_select_attribute_types_spark_to_hive()")
        self.type_map = {}
        for x in _res:
            self.type_map[x['spark_type']] = x['hive_type']

    @staticmethod
    def get_spark_type(spark_type: DataType) -> str:
        """Return a canonical type-name string for a Spark DataType.

        Struct/array/decimal collapse to their bare class names; everything
        else uses str(), stripping any trailing "()" from the repr.
        """
        match spark_type:
            case StructType(_):
                return "StructType"
            case ArrayType(_, _):
                return "ArrayType"
            case DecimalType():
                return "DecimalType"
            case _:
                return str(spark_type).rstrip("()")

    def execute_query(self, query: str):
        """Run one expression query and classify the outcome.

        Returns a dict with "type" of "success", "warning" (first value NULL),
        or "error" (extra trailing input detected, or a Spark exception).
        Only the first column of the first row is inspected.
        """
        try:
            df = self.spark.sql(query)
            field = df.schema.fields[0]
            is_null = df.head()[0] is None
            col_name = field.name
            dt = field.dataType
            spark_type = self.get_spark_type(dt)
            att_schema = field.dataType.json()
            # May be None if the spark type has no hive mapping in type_map.
            data_type = self.type_map.get(spark_type)

            if is_null:
                return {
                    "type": "warning",
                    "data_type": data_type,
                    "att_schema": att_schema,
                    "message": "NULL values detected! This typically indicates improper type casting or that you're doing some very complex logic"
                }
            elif col_name != "col1":
                # A well-formed single expression yields Spark's default column
                # name "col1"; anything else means extra input after it.
                return {
                    "type": "error",
                    "message": "Extraneous input detected at end of expression"
                }
            else:
                return {
                    "type": "success",
                    "data_type": data_type,
                    "att_schema": att_schema
                }
        except Exception as e:
            print("spark exception, printing stack trace")
            return {
                "type": "error",
                "message": str(e)
            }

    def stop(self):
        """Shut down the Spark context and drop the session reference."""
        self.spark.sparkContext.stop()
        del self.spark
83
+
dataforge/pg.py ADDED
@@ -0,0 +1,70 @@
1
+ import os
2
+ import sys
3
+ import psycopg2
4
+ from importlib_resources import files
5
+
6
+
7
+ class Pg:
8
+ def __init__(self, connection_string: str = None, initialize=False):
9
+ try:
10
+ if initialize:
11
+ self.initialize(connection_string)
12
+ else:
13
+ conn_string = os.environ.get('DATAFORGE_PG_CONNECTION')
14
+ print(f"Connecting to Postgres..")
15
+ if conn_string is None:
16
+ print("Postgres connection is not initialized. Run with --mode init")
17
+ sys.exit(1)
18
+ self.conn = psycopg2.connect(conn_string)
19
+ self.conn.set_session(autocommit=True)
20
+
21
+ except Exception as e:
22
+ print(f"Error connecting to Postgres: {e}")
23
+ sys.exit(1)
24
+
25
+ def sql(self, query: str, params=None, fetch=True):
26
+ # Execute a query
27
+ cur = self.conn.cursor()
28
+ cur.execute(query, params)
29
+ # Retrieve query results
30
+ res = cur.fetchone() if fetch else [None]
31
+ cur.close()
32
+ return res[0]
33
+
34
+ def initialize(self, connection_string: str):
35
+ # Execute a query
36
+ try:
37
+ print("Platform :", sys.platform)
38
+ self.conn = psycopg2.connect(connection_string)
39
+ self.sql("select 1") # execute test query
40
+ match sys.platform:
41
+ case 'win32' | 'cygwin':
42
+ os.system(f"SETX DATAFORGE_PG_CONNECTION \"{connection_string}\"")
43
+ case _:
44
+ os.system(f"export DATAFORGE_PG_CONNECTION=\"{connection_string}\"")
45
+ # Change connection
46
+ print("Please restart your console")
47
+ except Exception as e:
48
+ print(f"Error initializing Postgres database or insufficient permissions. Details: {e}")
49
+ sys.exit(1)
50
+
51
+ def seed(self):
52
+ schemas = self.sql(
53
+ "select string_agg(schema_name,',') from information_schema.schemata where schema_name IN ('meta','log')")
54
+ if schemas:
55
+ if not self.confirm_action(
56
+ f"All objects in schema(s) {schemas} in postgres database will be deleted. Do you want to continue (y/n)?"):
57
+ sys.exit(1)
58
+ # Drop schemas
59
+ self.sql("DROP SCHEMA IF EXISTS meta CASCADE;"
60
+ "DROP SCHEMA IF EXISTS log CASCADE;", fetch=False)
61
+ # Deploy DB code
62
+ print("Initializing database..")
63
+ deploy_sql = files().joinpath('resources', 'pg_deploy.sql').read_text()
64
+ self.sql(deploy_sql, fetch=False)
65
+ print("Database initialized")
66
+
67
+ @staticmethod
68
+ def confirm_action(message: str):
69
+ confirmation = input(message).strip().lower()
70
+ return confirmation in ('yes', 'y')