bribrain-1.0.0.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bribrain-1.0.0/PKG-INFO +15 -0
- bribrain-1.0.0/bribrain/__init__.py +4 -0
- bribrain-1.0.0/bribrain/config.py +60 -0
- bribrain-1.0.0/bribrain/function.py +222 -0
- bribrain-1.0.0/bribrain/ingestion.py +1223 -0
- bribrain-1.0.0/bribrain/load.py +523 -0
- bribrain-1.0.0/bribrain.egg-info/PKG-INFO +15 -0
- bribrain-1.0.0/bribrain.egg-info/SOURCES.txt +11 -0
- bribrain-1.0.0/bribrain.egg-info/dependency_links.txt +1 -0
- bribrain-1.0.0/bribrain.egg-info/requires.txt +3 -0
- bribrain-1.0.0/bribrain.egg-info/top_level.txt +1 -0
- bribrain-1.0.0/setup.cfg +4 -0
- bribrain-1.0.0/setup.py +22 -0
bribrain-1.0.0/PKG-INFO
ADDED
@@ -0,0 +1,15 @@
+Metadata-Version: 1.1
+Name: bribrain
+Version: 1.0.0
+Summary: Standard code for Dept BRIBrain
+Home-page: UNKNOWN
+Author: Andri Ariyanto
+Author-email: ariyant.andri@gmail.com
+License: UNKNOWN
+Description: UNKNOWN
+Keywords: python,ddb,bribrain
+Platform: UNKNOWN
+Classifier: Programming Language :: Python :: 3
+Classifier: Operating System :: Unix
+Classifier: Operating System :: MacOS :: MacOS X
+Classifier: Operating System :: Microsoft :: Windows
bribrain-1.0.0/bribrain/config.py
ADDED
@@ -0,0 +1,60 @@
+#=============================================================#
+### CREATED AT  : 29 FEBRUARY 2024                            ###
+### UPDATED AT  : 07 MARCH 2024                               ###
+### COPYRIGHT   : ANDRI ARIYANTO                              ###
+### DESCRIPTION : Module for creating a Spark session         ###
+#=============================================================#
+
+from pyspark.sql import SparkSession
+
+def sparkSession(appname="Bribrain Spark Session", executor="small", instances=2, cores=4, memory=4, overhead=2):
+    """Create a Spark session that can be used for data engineering work
+
+    Args:
+        appname (str): Name of the Spark session
+            (default is Bribrain Spark Session)
+        executor (str): Resource preset for the Spark session
+            (options are small, medium, high, custom)
+            (default is small)
+            (presets as (instances, cores, memory, overhead): small (2,4,4,2), medium (4,4,6,2), high (6,5,8,4))
+        instances (int): Number of executor instances, used when executor="custom"
+            (default is 2)
+        cores (int): Number of cores per executor, used when executor="custom"
+            (default is 4)
+        memory (int): Memory in GB per executor, used when executor="custom"
+            (default is 4)
+        overhead (int): Memory overhead in GB per executor, used when executor="custom"
+            (default is 2)
+
+    Returns:
+        pyspark.sql.session.SparkSession: Spark session for data engineering work
+    """
+
+    # select pyspark resources; "custom" (or any unrecognized value) falls back to the explicit arguments
+    if executor=="small":
+        config = [2, 4, 4, 2]
+    elif executor=="medium":
+        config = [4, 4, 6, 2]
+    elif executor=="high":
+        config = [6, 5, 8, 4]
+    else:
+        config = [instances, cores, memory, overhead]
+
+    # build the spark session
+    spark = SparkSession\
+        .builder\
+        .appName(appname)\
+        .config("spark.sql.crossJoin.enabled", "true")\
+        .config("spark.dynamicAllocation.enabled", "false")\
+        .config("spark.executor.instances", "{}".format(config[0]))\
+        .config("spark.executor.cores", "{}".format(config[1]))\
+        .config("spark.executor.memory", "{}g".format(config[2]))\
+        .config("spark.yarn.executor.memoryOverhead", "{}g".format(config[3]))\
+        .config("spark.sql.broadcastTimeout", "36000")\
+        .config("spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation", "true")\
+        .config("spark.ui.showConsoleProgress", "false")\
+        .config("spark.network.timeout", 60)\
+        .enableHiveSupport()\
+        .getOrCreate()
+
+    return spark
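A minimal usage sketch of the two paths (the app name and resource values below are illustrative, not part of the package):

    from bribrain.config import sparkSession

    # preset path: "medium" maps to 4 instances, 4 cores, 6g memory, 2g overhead
    spark = sparkSession(appname="my-etl", executor="medium")

    # custom path: the explicit resource arguments are only read when executor="custom"
    spark = sparkSession(executor="custom", instances=3, cores=2, memory=8, overhead=2)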
bribrain-1.0.0/bribrain/function.py
ADDED
@@ -0,0 +1,222 @@
+#=============================================================#
+### CREATED AT  : 12 MARCH 2024                               ###
+### UPDATED AT  : 14 MARCH 2024                               ###
+### COPYRIGHT   : BRIBRAIN DATA ENGINEER TEAM                 ###
+### DESCRIPTION : Module for a collection of functions        ###
+#=============================================================#
+
+from time import time
+from pytz import timezone
+from datetime import datetime, timedelta
+from pyspark.sql import functions as F
+from Crypto.Cipher import AES
+
+
+import base64
+import hashlib
+
+#=============================================================#
+def try_or(func, default=None, expected_exc=(Exception,)):
+    """Catch errors and return a fallback value instead
+
+    Args:
+        func (function): python function
+            (note: wrap the call in a lambda, e.g. try_or(lambda: func()))
+        default (object): value to return when an error occurs
+        expected_exc (Exception): exception types to catch
+
+    Returns:
+        object: the function's result, or the given default if an expected error occurs
+    """
+
+    try:
+        return func()
+    except expected_exc:
+        return default
+
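A quick sketch of try_or on a failing call (the int("x") example is illustrative only):

    value = try_or(lambda: int("x"), default=0, expected_exc=(ValueError,))
    # value == 0, because int("x") raises ValueError, which try_or catches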
+#=============================================================#
+def set_timer():
+    """Set the start time for measuring a process duration
+
+    Args:
+        -
+
+    Returns:
+        float: the start time as a float
+    """
+
+    global START_TIME
+    START_TIME = time()
+
+    return START_TIME
+
+#=============================================================#
+def get_timer(start_time=None):
+    """Get the elapsed duration: the start time subtracted from the current time
+
+    Args:
+        start_time (float): start time to measure from; defaults to the global set by set_timer()
+
+    Returns:
+        string: elapsed duration in HH:MM:SS format
+    """
+
+    if start_time:
+        return (datetime(1,1,1)+timedelta(seconds=int(time()-start_time))).strftime("%H:%M:%S")
+
+    return (datetime(1,1,1)+timedelta(seconds=int(time()-START_TIME))).strftime("%H:%M:%S")
+
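The two helpers pair up like this (a minimal sketch; sleep() just stands in for real work):

    from time import sleep

    set_timer()           # stores the start in the module-global START_TIME
    sleep(2)              # ... some long-running job ...
    print(get_timer())    # "00:00:02"

    # or measure from an explicit start time instead of the global
    t0 = set_timer()
    sleep(1)
    print(get_timer(t0))  # "00:00:01"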
+#=============================================================#
+def get_list_partition(spark, schema, table):
+    """Get the list of partitions of a hive table, ordered newest first
+
+    Args:
+        spark (pyspark.sql.session.SparkSession): spark session
+        schema (str): schema name of the table in hive
+        table (str): table name in hive
+
+    Returns:
+        list: list of partitions ordered newest first
+    """
+
+    try:
+        partitions = spark.sql("""
+            SHOW PARTITIONS {}.{}
+        """.format(schema, table)).sort("partition", ascending=False).collect() # fetch the partitions as hive reports them
+        if len(partitions) != 0: # the table has partitions
+            list_partition = []
+            for row in partitions:
+                if "__HIVE_DEFAULT_PARTITION__" not in row[0]:
+                    arrange = []
+                    dict_partition = {}
+                    for partition in row[0].split("/"):
+                        value = partition.split("=")
+                        arrange.append(value[1].zfill(2))
+                        dict_partition[value[0]] = value[1]
+                    dict_partition["__formatted_partition"] = "|".join(arrange)
+                    list_partition.append(dict_partition)
+            list_partition = sorted(list_partition, key=lambda row: row['__formatted_partition'], reverse=True)
+            return list_partition
+        else: # otherwise
+            return None # the table has no partitions
+    except Exception:
+        print("is not a partitioned table")
+        return None
+
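For a table partitioned by year and month (a hypothetical example), each returned dict carries one key per partition column plus the zero-padded sort key the function builds:

    parts = get_list_partition(spark, "datamart", "transactions")
    # e.g. parts[0] == {"year": "2024", "month": "3", "__formatted_partition": "2024|03"}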
+#=============================================================#
+def get_first_partition(spark, schema, table):
+    """Get the first (oldest) partition of a hive table
+
+    Args:
+        spark (pyspark.sql.session.SparkSession): spark session
+        schema (str): schema name of the table in hive
+        table (str): table name in hive
+
+    Returns:
+        dict: the first partition
+    """
+
+    partitions = get_list_partition(spark, schema, table)
+    if partitions is None:
+        return None
+
+    return partitions[-1]
+
+#=============================================================#
+def get_last_partition(spark, schema, table):
+    """Get the last (newest) partition of a hive table
+
+    Args:
+        spark (pyspark.sql.session.SparkSession): spark session
+        schema (str): schema name of the table in hive
+        table (str): table name in hive
+
+    Returns:
+        dict: the last partition
+    """
+
+    partitions = get_list_partition(spark, schema, table)
+    if partitions is None:
+        return None
+
+    return partitions[0]
+
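Because the underlying list is sorted newest first, the two helpers simply read opposite ends of it (schema and table names here are again hypothetical):

    newest = get_last_partition(spark, "datamart", "transactions")
    oldest = get_first_partition(spark, "datamart", "transactions")
    if newest is not None:
        print(newest["__formatted_partition"])  # e.g. "2024|03"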
+#=============================================================#
+def get_padding(raw):
+    """Pad the input string out to a multiple of the AES block size
+
+    Args:
+        raw (str): input string to pad
+
+    Returns:
+        str: the padded string
+    """
+
+    BLOCK_SIZE = AES.block_size
+
+    return raw + (BLOCK_SIZE - len(raw) % BLOCK_SIZE) * chr(BLOCK_SIZE - len(raw) % BLOCK_SIZE)
+
+#=============================================================#
+def get_unpadding(pad):
+    """Remove the padding from the input string
+
+    Args:
+        pad (str): padded input string to strip
+
+    Returns:
+        str: the string with padding removed
+    """
+
+    return pad[:-ord(pad[len(pad) - 1:])]
+
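This is PKCS#7-style padding: the pad length is repeated as the pad character itself, which is how get_unpadding knows how many characters to strip. A worked example with the 16-byte AES block size:

    padded = get_padding("hello")   # 5 chars -> append 11 copies of chr(11)
    assert len(padded) == 16
    assert get_unpadding(padded) == "hello"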
+#=============================================================#
+def get_encrypt_aes256(raw, password):
+    """Encrypt the input string with AES-256 via the pycrypto library
+
+    Args:
+        raw (str): input string to encrypt
+        password (str): key, also needed later for decryption
+
+    Returns:
+        str: the encrypted result
+    """
+
+    raw = get_padding(raw)
+
+    iv_original = b"SECRET"
+    iv = hashlib.sha256(iv_original).digest()[:16]
+
+    private_key = hashlib.sha256(password.encode("utf-8")).digest()
+
+    cipher = AES.new(private_key, AES.MODE_CBC, iv)
+    ciphertext = cipher.encrypt(raw)
+
+    encrypted_data = iv + ciphertext
+
+    return base64.b64encode(encrypted_data).decode('utf-8')
+
+udf_encrypt_aes256 = F.udf(lambda col1,col2: try_or(lambda: get_encrypt_aes256(col1,col2)))
+
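The UDF wrapper makes the function usable column-wise; because each call goes through try_or, a row that fails to encrypt yields null instead of failing the job. A hedged sketch (column and key names are made up):

    df = spark.createDataFrame([("alice",), ("bob",)], ["name"])
    df = df.withColumn("name_enc", udf_encrypt_aes256(F.col("name"), F.lit("my-secret-key")))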
+#=============================================================#
+def get_decrypt_aes256(enc, password):
+    """Decrypt a string that was encrypted with AES-256 via the pycrypto library
+
+    Args:
+        enc (str): encrypted input string to decrypt
+        password (str): key for decryption
+
+    Returns:
+        str: the decrypted result
+    """
+
+    private_key = hashlib.sha256(password.encode("utf-8")).digest()
+    enc = base64.b64decode(enc)
+    iv = enc[:16]
+    cipher = AES.new(private_key, AES.MODE_CBC, iv)
+    return bytes.decode(get_unpadding(cipher.decrypt(enc[16:])))
+
+udf_decrypt_aes256 = F.udf(lambda col1,col2: try_or(lambda: get_decrypt_aes256(col1,col2)))
+
+#=============================================================#
+
+
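A round-trip sanity check of the pair (plaintext and key are illustrative; note the IV is derived from the fixed b"SECRET", so identical inputs always encrypt to identical ciphertexts):

    token = get_encrypt_aes256("sensitive value", "my-secret-key")
    assert get_decrypt_aes256(token, "my-secret-key") == "sensitive value"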