dataforge-core 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dataforge/__init__.py +0 -0
- dataforge/importProject.py +128 -0
- dataforge/main.py +12 -0
- dataforge/mainConfig.py +72 -0
- dataforge/miniSparky.py +83 -0
- dataforge/pg.py +70 -0
- dataforge/resources/pg_deploy.sql +5778 -0
- dataforge/resources/project/meta.yaml +2 -0
- dataforge/resources/project/outputs/output1.yaml +14 -0
- dataforge/resources/project/relations.yaml +2 -0
- dataforge/resources/project/sources/source1.yaml +12 -0
- dataforge_core-0.1.1.dist-info/LICENSE +204 -0
- dataforge_core-0.1.1.dist-info/METADATA +20 -0
- dataforge_core-0.1.1.dist-info/RECORD +17 -0
- dataforge_core-0.1.1.dist-info/WHEEL +5 -0
- dataforge_core-0.1.1.dist-info/entry_points.txt +2 -0
- dataforge_core-0.1.1.dist-info/top_level.txt +1 -0
dataforge/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
|
|
5
|
+
import yaml
|
|
6
|
+
from .mainConfig import MainConfig
|
|
7
|
+
from .miniSparky import MiniSparky
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ImportProject:
    """Imports a dataforge project into the metadata database.

    Validates the project folder layout, loads every YAML file into the
    database, validates expressions using a local Spark session, and writes
    the generated source/output SQL queries and the import log to the
    project's target folder.
    """

    def __init__(self, config: MainConfig):
        # Local Spark helper for expression validation; created lazily in
        # test_expressions() and torn down when validation finishes.
        self.ms = None
        self._config = config
        # Id of the current import run in the metadata DB; set by start().
        self.import_id = 0

    def start(self):
        """Open a new import run in the metadata DB and remember its id."""
        _import_id = self._config.pg.sql("select meta.svc_import_start()")
        self.import_id = int(_import_id)
        print('Started import with id ', self.import_id)

    def validate(self):
        """Verify the project folder contains meta.yaml and a sources/ folder.

        Exits the process with status 1 if either is missing.
        """
        print(f"Validating project path {self._config.source_path}")
        meta_flag = False
        source_flag = False
        with os.scandir(self._config.source_path) as entries:
            for file in entries:
                if file.is_dir() and file.name == 'sources':
                    source_flag = True
                elif file.name == "meta.yaml":
                    meta_flag = True
        if not meta_flag:
            print(f"Missing meta.yaml in project path {self._config.source_path}")
            sys.exit(1)
        if not source_flag:
            print(f"Missing sources folder in project path {self._config.source_path}")
            sys.exit(1)

    def load(self):
        """Run the full import pipeline: validate, load, parse, execute, test, write."""
        self.validate()
        self.start()
        with os.scandir(self._config.source_path) as entries:
            for file in entries:
                if file.is_dir() and file.name in ('sources', 'outputs'):
                    self.list_files(file.path, file.name)
                elif file.name.endswith(".yaml"):
                    self.load_file(file.path, file.name)
        # 'I' marks the import files as loaded; parsing and execution follow.
        self._config.pg.sql("SELECT meta.svc_import_complete(%s, 'I')", [self.import_id])
        self._config.pg.sql("SELECT meta.imp_parse_objects(%s)", [self.import_id])
        print("Files parsed")
        self._config.pg.sql("SELECT meta.svc_import_execute(%s)", [self.import_id])
        print("Objects loaded")
        self.test_expressions()
        print("Expressions validated")
        print("Import completed successfully")
        self.write_log()
        self.write_queries("source")
        self.write_queries("output")

    def list_files(self, path: str, folder_name: str):
        """Load every YAML file directly inside `path` (a sources/ or outputs/ dir)."""
        print("Importing files..")
        with os.scandir(path) as entries:
            for file in entries:
                # BUG FIX: the original used bitwise '&' between booleans here;
                # 'and' gives the intended short-circuiting boolean semantics.
                if file.is_file() and file.name.endswith(".yaml"):
                    self.load_file(file.path, folder_name + '/' + file.name)

    def load_file(self, full_path: str, path: str):
        """Parse a single YAML file and store it as JSON in the import tables."""
        print(path)
        with open(full_path, 'r') as file:
            file_js = yaml.safe_load(file)
            self._config.pg.sql("SELECT meta.svc_import_load_object(%s, %s, %s)",
                                (self.import_id, path, json.dumps(file_js)))

    def test_expressions(self):
        """Validate all project expressions with a throwaway local Spark session."""
        exps = self._config.pg.sql("SELECT meta.impc_test_expressions(%s)", [self.import_id])
        self.ms = MiniSparky(self._config)
        self.test_expressions_recursive(exps)
        self.ms.stop()
        del self.ms

    def test_expressions_recursive(self, test_expressions, recursion_level=0):
        """Execute one batch of test expressions and recurse on dependents.

        Each round of uploaded results may unlock a 'next' batch from the DB.
        Fails the import on an expression error or after 20 recursion levels.
        """
        try:
            test_results = []
            for exp in test_expressions:
                query_result = self.ms.execute_query(exp['expression'])
                del exp['expression']
                exp['result'] = query_result
                test_results.append(exp)
            # update test results
            test_results_str = json.dumps(test_results)
            res = self._config.pg.sql("SELECT meta.impc_update_test_results(%s, %s)",
                                      (self.import_id, test_results_str))
            if res.get('error'):
                self.fail_import('Invalid expression detected. See log file for details')
            if recursion_level > 20:
                self.fail_import('Maximum recursion exceeded while testing expressions. Check error logs and '
                                 'expression test tables for more details')
            if len(res['next']) > 0:
                self.test_expressions_recursive(res['next'], recursion_level + 1)
        except Exception as e:
            # fail_import() raises SystemExit (a BaseException), so the
            # bail-outs above are not re-caught by this handler.
            print(e)
            self.fail_import(str(e))

    def fail_import(self, message):
        """Mark the import failed ('F'), write the log, and exit with status 1."""
        print(f"Import failed: {message}")
        self._config.pg.sql("SELECT meta.svc_import_complete(%s, 'F', %s)", (self.import_id, message))
        self.write_log()
        sys.exit(1)

    def write_log(self):
        """Dump the DB-side import log to the configured log file."""
        log_file = self._config.pg.sql("SELECT meta.svc_import_get_log(%s)", [self.import_id])
        with open(self._config.log_path, "w") as file:
            # Write the string to the file
            file.write(log_file)

    def write_queries(self, out_type: str):
        """Write generated SQL files for `out_type` ('source' or 'output').

        Raises:
            ValueError: for an unknown out_type. (BUG FIX: previously an
            unknown value fell through to an UnboundLocalError on out_path.)
        """
        queries = self._config.pg.sql(f"select meta.svcc_get_{out_type}_queries(%s)", [self.import_id])
        if not queries:
            return
        if out_type == "source":
            out_path = self._config.output_source_path
        elif out_type == "output":
            out_path = self._config.output_output_path
        else:
            raise ValueError(f"Unknown query type: {out_type}")
        for query in queries:
            file_name = os.path.join(out_path, query['file_name'])
            with open(file_name, "w") as file:
                # Write the string to the file
                file.write(query['query'])
        print(f"Generated {len(queries)} {out_type} queries")
|
dataforge/main.py
ADDED
dataforge/mainConfig.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import os
|
|
3
|
+
import shutil
|
|
4
|
+
import sys
|
|
5
|
+
import traceback
|
|
6
|
+
from types import ModuleType
|
|
7
|
+
|
|
8
|
+
from importlib_resources.abc import Traversable
|
|
9
|
+
from .pg import Pg
|
|
10
|
+
import importlib_resources
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class MainConfig:
    """Parses CLI arguments, sets up the Postgres connection, and prepares the
    project source and target folder structure."""

    def __init__(self):
        # A local Spark session is used later for expression validation,
        # which requires a working Java installation.
        if os.environ.get('JAVA_HOME') is None:
            print("Java is not installed or JAVA_HOME environment variable is not set")
            sys.exit(1)
        # parse command line args
        _parser = argparse.ArgumentParser(
            prog='dataforge core',
            description='Dataforge Core compiles project and generates source and output SQL queries',
            epilog='Try our cloud product')
        _parser.add_argument('source', type=str, help='Project folder', metavar='<Project Path>', nargs='?')
        _parser.add_argument('--init', '-i', action='store_true', help='Initialize project folder')
        _parser.add_argument('--seed', action='store_true', help='Deploy and seed postgres database')
        _parser.add_argument('--connect', '-c', type=str, help='Connect to postgres database',
                             metavar='<Postgres connection string>')

        args = _parser.parse_args()
        if args.connect:
            # --connect validates/stores the connection string and seeds the
            # database, then exits: no project compilation in this mode.
            self.pg = Pg(args.connect, initialize=True)
            self.pg.seed()
            sys.exit(0)
        else:
            self.pg = Pg()
        if args.seed:
            self.pg.seed()
        self.source_path = os.getcwd() if args.source is None else args.source
        if args.init:
            try:
                if self.pg.confirm_action(f"All files and subfolders in {self.source_path} will be deleted. Continue (y/n)? "):
                    shutil.rmtree(self.source_path, ignore_errors=True)
                    os.makedirs(self.source_path)
                    self.traverse_resource_dir(importlib_resources.files().joinpath('resources', 'project'))
                    print(f"Initialized project in {self.source_path}")
            except Exception as e:
                # BUG FIX: print_stack() prints the *current call stack*, not
                # the active exception; print_exc() shows the real traceback.
                traceback.print_exc()
                print(f"Error initializing project in {self.source_path} : {e}")
            # --init always exits after (attempted) initialization.
            sys.exit(0)
        # Recreate the target folder layout from scratch on every run.
        self.output_path = os.path.join(self.source_path, 'target')
        self.log_path = os.path.join(self.output_path, 'log.txt')
        shutil.rmtree(self.output_path, ignore_errors=True)
        os.makedirs(self.output_path)
        self.output_source_path = os.path.join(self.output_path, 'sources')
        os.makedirs(self.output_source_path)
        self.output_output_path = os.path.join(self.output_path, 'outputs')
        os.makedirs(self.output_output_path)

    def traverse_resource_dir(self, resource: Traversable, folder=''):
        """Recursively copy the packaged project template into source_path.

        NOTE(review): only the immediate directory name is passed down as
        `folder`, so templates nested more than one level deep would land in
        the wrong place — the bundled template is one level deep, so this
        holds today; confirm before deepening the template.
        """
        for file in resource.iterdir():
            if file.is_dir():
                os.makedirs(os.path.join(self.source_path, folder, file.name))
                self.traverse_resource_dir(resource.joinpath(file.name), file.name)
            if file.is_file():
                self.copy_resource_file(folder, file)

    def copy_resource_file(self, folder: str, resource: Traversable):
        """Copy a single packaged resource file into the project folder."""
        file_name = os.path.join(self.source_path, folder, resource.name)
        with open(file_name, "w") as file:
            # Write the string to the file
            file.write(resource.read_text())
|
dataforge/miniSparky.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
from pyspark.sql import SparkSession
|
|
2
|
+
from pyspark.sql.types import DataType, StructType, ArrayType, DecimalType
|
|
3
|
+
from .mainConfig import MainConfig
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class MiniSparky:
|
|
7
|
+
def __init__(self, config: MainConfig):
|
|
8
|
+
self._config = config
|
|
9
|
+
self.spark = SparkSession.builder \
|
|
10
|
+
.appName("dfCore") \
|
|
11
|
+
.master("local[1]") \
|
|
12
|
+
.config("spark.driver.extraJavaOptions", "-Dlog4j.logger.level=FATAL") \
|
|
13
|
+
.config("spark.executor.extraJavaOptions", "-Dlog4j.logger.level=FATAL") \
|
|
14
|
+
.config("spark.log.level", "FATAL") \
|
|
15
|
+
.config("spark.driver.memory", "512m") \
|
|
16
|
+
.config("spark.executor.memory", "512m") \
|
|
17
|
+
.config("spark.executor.cores", "1") \
|
|
18
|
+
.config("spark.ui.enabled", "false") \
|
|
19
|
+
.getOrCreate()
|
|
20
|
+
|
|
21
|
+
self.spark.sql(
|
|
22
|
+
"""SELECT CAST(-87.68 as DECIMAL(10,2)) `decimal` , CAST(13518864 as BIGINT) `bigint`, CAST('Western Ave & Walton St' as STRING) `string`, CAST(130 AS INT) `int`, CAST(130 AS INT) `integer`, CAST(41.90331 as FLOAT) `float` , CAST(87.67695 as DOUBLE) `double`, CAST('2017-03-31' as DATE) `date`, CAST('2017-03-31T23:19:17.000+0000' as TIMESTAMP) `timestamp` , true `boolean`, CAST(9999999999 as BIGINT) `long`
|
|
23
|
+
UNION ALL
|
|
24
|
+
SELECT CAST(-8127.68 as DECIMAL(10,2)) `decimal` , CAST(1518864 as BIGINT) `bigint`, CAST('Western Ave & Walton St' as STRING) `string`, CAST(130 AS INT) `int`, CAST(130 AS INT) `integer`, CAST(41.90331 as FLOAT) `float` , CAST(87.67695 as DOUBLE) `double`, CAST('2020-03-31' as DATE) `date`, CAST('2020-03-31T23:19:17' as TIMESTAMP) `timestamp` , true `boolean`, CAST(99999999991 as BIGINT) `long`
|
|
25
|
+
""").createOrReplaceTempView("datatypes")
|
|
26
|
+
|
|
27
|
+
_res = self._config.pg.sql("select meta.svc_select_attribute_types_spark_to_hive()")
|
|
28
|
+
self.type_map = {}
|
|
29
|
+
for x in _res:
|
|
30
|
+
self.type_map[x['spark_type']] = x['hive_type']
|
|
31
|
+
|
|
32
|
+
@staticmethod
|
|
33
|
+
def get_spark_type(spark_type: DataType) -> str:
|
|
34
|
+
match spark_type:
|
|
35
|
+
case StructType(_):
|
|
36
|
+
return "StructType"
|
|
37
|
+
case ArrayType(_, _):
|
|
38
|
+
return "ArrayType"
|
|
39
|
+
case DecimalType():
|
|
40
|
+
return "DecimalType"
|
|
41
|
+
case _:
|
|
42
|
+
return str(spark_type).rstrip("()")
|
|
43
|
+
|
|
44
|
+
def execute_query(self, query: str):
|
|
45
|
+
try:
|
|
46
|
+
df = self.spark.sql(query)
|
|
47
|
+
field = df.schema.fields[0]
|
|
48
|
+
is_null = df.head()[0] is None
|
|
49
|
+
col_name = field.name
|
|
50
|
+
dt = field.dataType
|
|
51
|
+
spark_type = self.get_spark_type(dt)
|
|
52
|
+
att_schema = field.dataType.json()
|
|
53
|
+
data_type = self.type_map.get(spark_type)
|
|
54
|
+
|
|
55
|
+
if is_null:
|
|
56
|
+
return {
|
|
57
|
+
"type": "warning",
|
|
58
|
+
"data_type": data_type,
|
|
59
|
+
"att_schema": att_schema,
|
|
60
|
+
"message": "NULL values detected! This typically indicates improper type casting or that you're doing some very complex logic"
|
|
61
|
+
}
|
|
62
|
+
elif col_name != "col1":
|
|
63
|
+
return {
|
|
64
|
+
"type": "error",
|
|
65
|
+
"message": "Extraneous input detected at end of expression"
|
|
66
|
+
}
|
|
67
|
+
else:
|
|
68
|
+
return {
|
|
69
|
+
"type": "success",
|
|
70
|
+
"data_type": data_type,
|
|
71
|
+
"att_schema": att_schema
|
|
72
|
+
}
|
|
73
|
+
except Exception as e:
|
|
74
|
+
print("spark exception, printing stack trace")
|
|
75
|
+
return {
|
|
76
|
+
"type": "error",
|
|
77
|
+
"message": str(e)
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
def stop(self):
|
|
81
|
+
self.spark.sparkContext.stop()
|
|
82
|
+
del self.spark
|
|
83
|
+
|
dataforge/pg.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
import psycopg2
|
|
4
|
+
from importlib_resources import files
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Pg:
|
|
8
|
+
def __init__(self, connection_string: str = None, initialize=False):
|
|
9
|
+
try:
|
|
10
|
+
if initialize:
|
|
11
|
+
self.initialize(connection_string)
|
|
12
|
+
else:
|
|
13
|
+
conn_string = os.environ.get('DATAFORGE_PG_CONNECTION')
|
|
14
|
+
print(f"Connecting to Postgres..")
|
|
15
|
+
if conn_string is None:
|
|
16
|
+
print("Postgres connection is not initialized. Run with --mode init")
|
|
17
|
+
sys.exit(1)
|
|
18
|
+
self.conn = psycopg2.connect(conn_string)
|
|
19
|
+
self.conn.set_session(autocommit=True)
|
|
20
|
+
|
|
21
|
+
except Exception as e:
|
|
22
|
+
print(f"Error connecting to Postgres: {e}")
|
|
23
|
+
sys.exit(1)
|
|
24
|
+
|
|
25
|
+
def sql(self, query: str, params=None, fetch=True):
|
|
26
|
+
# Execute a query
|
|
27
|
+
cur = self.conn.cursor()
|
|
28
|
+
cur.execute(query, params)
|
|
29
|
+
# Retrieve query results
|
|
30
|
+
res = cur.fetchone() if fetch else [None]
|
|
31
|
+
cur.close()
|
|
32
|
+
return res[0]
|
|
33
|
+
|
|
34
|
+
def initialize(self, connection_string: str):
|
|
35
|
+
# Execute a query
|
|
36
|
+
try:
|
|
37
|
+
print("Platform :", sys.platform)
|
|
38
|
+
self.conn = psycopg2.connect(connection_string)
|
|
39
|
+
self.sql("select 1") # execute test query
|
|
40
|
+
match sys.platform:
|
|
41
|
+
case 'win32' | 'cygwin':
|
|
42
|
+
os.system(f"SETX DATAFORGE_PG_CONNECTION \"{connection_string}\"")
|
|
43
|
+
case _:
|
|
44
|
+
os.system(f"export DATAFORGE_PG_CONNECTION=\"{connection_string}\"")
|
|
45
|
+
# Change connection
|
|
46
|
+
print("Please restart your console")
|
|
47
|
+
except Exception as e:
|
|
48
|
+
print(f"Error initializing Postgres database or insufficient permissions. Details: {e}")
|
|
49
|
+
sys.exit(1)
|
|
50
|
+
|
|
51
|
+
def seed(self):
|
|
52
|
+
schemas = self.sql(
|
|
53
|
+
"select string_agg(schema_name,',') from information_schema.schemata where schema_name IN ('meta','log')")
|
|
54
|
+
if schemas:
|
|
55
|
+
if not self.confirm_action(
|
|
56
|
+
f"All objects in schema(s) {schemas} in postgres database will be deleted. Do you want to continue (y/n)?"):
|
|
57
|
+
sys.exit(1)
|
|
58
|
+
# Drop schemas
|
|
59
|
+
self.sql("DROP SCHEMA IF EXISTS meta CASCADE;"
|
|
60
|
+
"DROP SCHEMA IF EXISTS log CASCADE;", fetch=False)
|
|
61
|
+
# Deploy DB code
|
|
62
|
+
print("Initializing database..")
|
|
63
|
+
deploy_sql = files().joinpath('resources', 'pg_deploy.sql').read_text()
|
|
64
|
+
self.sql(deploy_sql, fetch=False)
|
|
65
|
+
print("Database initialized")
|
|
66
|
+
|
|
67
|
+
@staticmethod
|
|
68
|
+
def confirm_action(message: str):
|
|
69
|
+
confirmation = input(message).strip().lower()
|
|
70
|
+
return confirmation in ('yes', 'y')
|