bribrain 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
+ Metadata-Version: 1.1
+ Name: bribrain
+ Version: 1.0.0
+ Summary: Standard code for Dept BRIBrain
+ Home-page: UNKNOWN
+ Author: Andri Ariyanto
+ Author-email: ariyant.andri@gmail.com
+ License: UNKNOWN
+ Description: UNKNOWN
+ Keywords: python,ddb,bribrain
+ Platform: UNKNOWN
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Operating System :: Unix
+ Classifier: Operating System :: MacOS :: MacOS X
+ Classifier: Operating System :: Microsoft :: Windows
@@ -0,0 +1,4 @@
+ from .config import *
+ from .function import *
+ from .ingestion import *
+ from .load import *
@@ -0,0 +1,60 @@
+ #=============================================================#
+ ### CREATED AT  : 29 FEBRUARY 2024                          ###
+ ### UPDATED AT  : 07 MARCH 2024                             ###
+ ### COPYRIGHT   : ANDRI ARIYANTO                            ###
+ ### DESCRIPTION : Module for creating a Spark session       ###
+ #=============================================================#
+
+ from pyspark.sql import SparkSession
+
+ def sparkSession(appname="Bribrain Spark Session", executor="small", instances=2, cores=4, memory=4, overhead=2):
+     """Create a Spark session for data engineering workloads.
+
+     Args:
+         appname (str): Name of the Spark session
+             (default is "Bribrain Spark Session")
+         executor (str): Resource preset for the Spark session
+             (options are small, medium, high, custom)
+             (default is small)
+             (presets are small (2,4,4,2), medium (4,4,6,2), high (6,5,8,4))
+         instances (int): Number of executor instances
+             (default is 2)
+         cores (int): Number of cores per executor
+             (default is 4)
+         memory (int): Executor memory in GB
+             (default is 4)
+         overhead (int): Executor memory overhead in GB
+             (default is 2)
+
+     Returns:
+         pyspark.sql.session.SparkSession: Spark session for data engineering workloads
+     """
+
+     # resolve the pyspark resource preset
+     if executor=="small":
+         config = [2, 4, 4, 2]
+     elif executor=="medium":
+         config = [4, 4, 6, 2]
+     elif executor=="high":
+         config = [6, 5, 8, 4]
+     else:  # "custom" or any unrecognized value falls back to the explicit parameters
+         config = [instances, cores, memory, overhead]
+
+     # build the spark session
+     spark = SparkSession\
+         .builder\
+         .appName(appname)\
+         .config("spark.sql.crossJoin.enabled", "true")\
+         .config("spark.dynamicAllocation.enabled", "false")\
+         .config("spark.executor.instances", "{}".format(config[0]))\
+         .config("spark.executor.cores", "{}".format(config[1]))\
+         .config("spark.executor.memory", "{}g".format(config[2]))\
+         .config("spark.yarn.executor.memoryOverhead", "{}g".format(config[3]))\
+         .config("spark.sql.broadcastTimeout", "36000")\
+         .config("spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation", "true")\
+         .config("spark.ui.showConsoleProgress", "false")\
+         .config("spark.network.timeout", 60)\
+         .enableHiveSupport()\
+         .getOrCreate()
+
+     return spark
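
A minimal usage sketch for sparkSession (an editor's illustration, not part of the package; the app name is hypothetical, and a YARN/Hive environment is assumed):

    from bribrain.config import sparkSession

    # "medium" preset -> 4 instances, 4 cores, 6g memory, 2g overhead
    spark = sparkSession(appname="example-job", executor="medium")
    spark.sql("SELECT 1 AS smoke_test").show()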
@@ -0,0 +1,222 @@
+ #=============================================================#
+ ### CREATED AT  : 12 MARCH 2024                             ###
+ ### UPDATED AT  : 14 MARCH 2024                             ###
+ ### COPYRIGHT   : BRIBRAIN DATA ENGINEER TEAM               ###
+ ### DESCRIPTION : Module for a collection of helper functions ###
+ #=============================================================#
+
+ from time import time
+ from pytz import timezone
+ from datetime import datetime, timedelta
+ from pyspark.sql import functions as F
+ from Crypto.Cipher import AES
+
+
+ import base64
+ import hashlib
+
+ #=============================================================#
+ def try_or(func, default=None, expected_exc=(Exception,)):
+     """Catch errors and return a fallback value.
+
+     Args:
+         func (function): Python function
+             (note: wrap the call in a lambda, e.g. try_or(lambda: func()))
+         default (object): value to return when an error occurs
+         expected_exc (Exception): exception types that are expected and caught
+
+     Returns:
+         object: the function's output, or the given default if an error occurs
+     """
+
+     try:
+         return func()
+     except expected_exc:
+         return default
+
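An illustrative call of try_or (a sketch, not part of the package):

    # returns 0 instead of raising ZeroDivisionError
    safe = try_or(lambda: 1 / 0, default=0)

    # only KeyError is expected here, so other exceptions would still propagate
    label = try_or(lambda: {"a": 1}["b"], default="n/a", expected_exc=(KeyError,))
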
+ #=============================================================#
+ def set_timer():
+     """Set the start time for measuring process duration.
+
+     Args:
+         -
+
+     Returns:
+         float: current time as a float (Unix timestamp)
+     """
+
+     global START_TIME
+     START_TIME = time()
+
+     return START_TIME
+
+ #=============================================================#
+ def get_timer(start_time=None):
+     """Get the process duration: the start time subtracted from the current time.
+
+     Args:
+         start_time (float): start time from set_timer(); defaults to the
+             module-level START_TIME set by the last set_timer() call
+
+     Returns:
+         string: process duration formatted as HH:MM:SS
+     """
+
+     if start_time:
+         return (datetime(1,1,1)+timedelta(seconds=int(time()-start_time))).strftime("%H:%M:%S")
+
+     return (datetime(1,1,1)+timedelta(seconds=int(time()-START_TIME))).strftime("%H:%M:%S")
+
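A sketch of how the two timer helpers pair up (illustrative, not part of the package):

    from time import sleep

    start = set_timer()      # also stores the module-level START_TIME
    sleep(2)
    print(get_timer())       # "00:00:02", read from the global START_TIME
    print(get_timer(start))  # same result, using the explicit start time
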
+ #=============================================================#
+ def get_list_partition(spark, schema, table):
+     """Get the list of partitions of a Hive table, sorted from the newest partition.
+
+     Args:
+         spark (pyspark.sql.session.SparkSession): spark session
+         schema (str): Hive schema name of the table
+         table (str): Hive table name
+
+     Returns:
+         list: list of partitions sorted from the newest partition
+     """
+
+     try:
+         partitions = spark.sql("""
+             SHOW PARTITIONS {}.{}
+         """.format(schema, table)).sort("partition", ascending=False).collect() # fetch partitions in Hive's spec format
+         if len(partitions) != 0: # the table has partitions
+             list_partition = []
+             for row in partitions:
+                 if "__HIVE_DEFAULT_PARTITION__" not in row[0]:
+                     arrange = []
+                     dict_partition = {}
+                     for partition in row[0].split("/"):
+                         value = partition.split("=")
+                         arrange.append(value[1].zfill(2))
+                         dict_partition[value[0]] = value[1]
+                     dict_partition["__formatted_partition"] = "|".join(arrange)
+                     list_partition.append(dict_partition)
+             list_partition = sorted(list_partition, key=lambda row: row['__formatted_partition'], reverse=True)
+             return list_partition
+         else:
+             return None # no partitions
+     except Exception:
+         print("is not a partitioned table")
+         return None
+
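An illustrative call, assuming the `spark` session from config.py; the schema and table names are hypothetical:

    partitions = get_list_partition(spark, "datamart", "daily_sales")
    if partitions:
        # each entry maps partition keys to values, plus the internal sort key,
        # e.g. {"year": "2024", "month": "3", "__formatted_partition": "2024|03"}
        print(partitions[0])
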
+ #=============================================================#
+ def get_first_partition(spark, schema, table):
+     """Get the first (oldest) partition of a Hive table.
+
+     Args:
+         spark (pyspark.sql.session.SparkSession): spark session
+         schema (str): Hive schema name of the table
+         table (str): Hive table name
+
+     Returns:
+         dict: first partition
+     """
+
+     partitions = get_list_partition(spark, schema, table)
+     if partitions is None:
+         return None
+
+     return partitions[-1]  # list is sorted newest-first, so the oldest is last
+
+ #=============================================================#
+ def get_last_partition(spark, schema, table):
+     """Get the last (newest) partition of a Hive table.
+
+     Args:
+         spark (pyspark.sql.session.SparkSession): spark session
+         schema (str): Hive schema name of the table
+         table (str): Hive table name
+
+     Returns:
+         dict: last partition
+     """
+
+     partitions = get_list_partition(spark, schema, table)
+     if partitions is None:
+         return None
+
+     return partitions[0]  # newest partition
+
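A sketch of the two wrappers (hypothetical schema and table names again):

    oldest = get_first_partition(spark, "datamart", "daily_sales")
    newest = get_last_partition(spark, "datamart", "daily_sales")
    # both return None for unpartitioned tables, so guard before use
    if newest:
        print(oldest["month"], newest["month"])
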
+ #=============================================================#
+ def get_padding(raw):
+     """Pad the input string up to the AES block size.
+
+     Args:
+         raw (str): input string to pad
+
+     Returns:
+         str: padded string
+     """
+
+     BLOCK_SIZE = AES.block_size
+
+     return raw + (BLOCK_SIZE - len(raw) % BLOCK_SIZE) * chr(BLOCK_SIZE - len(raw) % BLOCK_SIZE)
+
+ #=============================================================#
+ def get_unpadding(pad):
+     """Remove the padding from the input string.
+
+     Args:
+         pad (str): padded string whose padding will be removed
+
+     Returns:
+         str: string with the padding removed
+     """
+
+     return pad[:-ord(pad[len(pad) - 1:])]
+
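A quick round-trip of the padding helpers (illustrative; AES.block_size is 16):

    padded = get_padding("hello")   # "hello" + 11 * chr(11), filling one 16-byte block
    assert len(padded) % 16 == 0
    assert get_unpadding(padded) == "hello"
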
+ #=============================================================#
+ def get_encrypt_aes256(raw, password):
+     """Encrypt the input string with AES-256 in CBC mode (pycrypto library).
+
+     Args:
+         raw (str): input string to encrypt
+         password (str): key that is later required for decryption
+
+     Returns:
+         str: base64-encoded encryption result
+     """
+
+     raw = get_padding(raw)
+
+     iv_original = b"SECRET"
+     iv = hashlib.sha256(iv_original).digest()[:16]
+
+     private_key = hashlib.sha256(password.encode("utf-8")).digest()
+
+     cipher = AES.new(private_key, AES.MODE_CBC, iv)
+     ciphertext = cipher.encrypt(raw.encode("utf-8"))  # encode: the Python 3 Crypto API requires bytes
+
+     encrypted_data = iv + ciphertext
+
+     return base64.b64encode(encrypted_data).decode('utf-8')
+
+ udf_encrypt_aes256 = F.udf(lambda col1,col2: try_or(lambda: get_encrypt_aes256(col1,col2)))
+
+ #=============================================================#
+ def get_decrypt_aes256(enc, password):
+     """Decrypt a string encrypted with AES-256 in CBC mode (pycrypto library).
+
+     Args:
+         enc (str): encrypted input string to decrypt
+         password (str): key for decryption
+
+     Returns:
+         str: decryption result
+     """
+
+     private_key = hashlib.sha256(password.encode("utf-8")).digest()
+     enc = base64.b64decode(enc)
+     iv = enc[:16]
+     cipher = AES.new(private_key, AES.MODE_CBC, iv)
+     return bytes.decode(get_unpadding(cipher.decrypt(enc[16:])))
+
+ udf_decrypt_aes256 = F.udf(lambda col1,col2: try_or(lambda: get_decrypt_aes256(col1,col2)))
+
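A round-trip sketch of the AES helpers, both directly and through the Spark UDFs (the key and column values are hypothetical; `spark` is assumed from config.py):

    token = get_encrypt_aes256("sensitive", "my-secret-key")
    assert get_decrypt_aes256(token, "my-secret-key") == "sensitive"

    df = spark.createDataFrame([("sensitive",)], ["raw"])
    df = df.withColumn("enc", udf_encrypt_aes256(F.col("raw"), F.lit("my-secret-key")))
    df = df.withColumn("dec", udf_decrypt_aes256(F.col("enc"), F.lit("my-secret-key")))
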
+ #=============================================================#