mkpipe-extractor-sqlserver 0.1.1__tar.gz → 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mkpipe_extractor_sqlserver-0.1.1/mkpipe_extractor_sqlserver.egg-info → mkpipe_extractor_sqlserver-0.1.3}/PKG-INFO +12 -2
- {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/mkpipe_extractor_sqlserver/__init__.py +73 -62
- {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3/mkpipe_extractor_sqlserver.egg-info}/PKG-INFO +12 -2
- {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/setup.py +1 -1
- {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/LICENSE +0 -0
- {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/MANIFEST.in +0 -0
- {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/README.md +0 -0
- {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/mkpipe_extractor_sqlserver/jars/com.microsoft.sqlserver_mssql-jdbc-12.8.1.jre11.jar +0 -0
- {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/mkpipe_extractor_sqlserver.egg-info/SOURCES.txt +0 -0
- {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/mkpipe_extractor_sqlserver.egg-info/dependency_links.txt +0 -0
- {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/mkpipe_extractor_sqlserver.egg-info/entry_points.txt +0 -0
- {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/mkpipe_extractor_sqlserver.egg-info/requires.txt +0 -0
- {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/mkpipe_extractor_sqlserver.egg-info/top_level.txt +0 -0
- {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/setup.cfg +0 -0
--- mkpipe_extractor_sqlserver-0.1.1/mkpipe_extractor_sqlserver.egg-info/PKG-INFO
+++ mkpipe_extractor_sqlserver-0.1.3/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: mkpipe-extractor-sqlserver
-Version: 0.1.1
+Version: 0.1.3
 Summary: SQLserver extractor for mkpipe.
 Author: Metin Karakus
 Author-email: metin_karakus@yahoo.com
@@ -11,6 +11,16 @@ Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: mkpipe
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: license
+Dynamic: license-file
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 # MkPipe
 
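Both PKG-INFO copies get the same refresh (the second appears further down): a newer Metadata-Version plus the `Dynamic:` markers that recent setuptools emits for fields it computes from setup.py at build time rather than from static configuration. To inspect these headers on an installed distribution, `importlib.metadata` exposes them directly; a minimal sketch, assuming 0.1.3 has been installed:

```python
# Quick check of the installed package's metadata headers.
# Assumes: pip install mkpipe-extractor-sqlserver==0.1.3
from importlib.metadata import metadata

md = metadata('mkpipe-extractor-sqlserver')
print(md['Metadata-Version'])   # 2.4 in the sdist diffed above
print(md['Version'])            # 0.1.3
print(md.get_all('Dynamic'))    # the Dynamic: fields listed above
```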
--- mkpipe_extractor_sqlserver-0.1.1/mkpipe_extractor_sqlserver/__init__.py
+++ mkpipe_extractor_sqlserver-0.1.3/mkpipe_extractor_sqlserver/__init__.py
@@ -1,6 +1,5 @@
 import os
 import datetime
-from pathlib import Path
 from urllib.parse import quote_plus
 from pyspark.sql import SparkSession
 from pyspark import SparkConf
@@ -20,6 +19,8 @@ class SqlserverExtractor:
         else:
             self.settings = settings
         self.connection_params = config['connection_params']
+        self.table = config['table']
+        self.pass_on_error = config.get('pass_on_error', None)
         self.host = self.connection_params['host']
         self.port = self.connection_params['port']
         self.username = self.connection_params['user']
@@ -32,9 +33,6 @@ class SqlserverExtractor:
         self.settings.driver_name = self.driver_name
         self.jdbc_url = f'jdbc:{self.driver_name}://{self.host}:{self.port};databaseName={self.database};user={self.username};password={self.password};encrypt=false;trustServerCertificate=false'
 
-        self.table = config['table']
-        self.pass_on_error = config.get('pass_on_error', None)
-
        config = load_config()
        connection_params = config['settings']['backend']
        db_type = connection_params['database_type']
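These first hunks drop an unused `pathlib` import and hoist the `self.table` and `self.pass_on_error` assignments up next to the other `config` reads in `__init__`. Pieced together from the keys the diff touches, the per-extractor config looks roughly like the sketch below; the overall shape is inferred, and every value is a made-up placeholder:

```python
# Hypothetical config dict, reconstructed from the keys read in the diff:
# config['connection_params'], config['table'], config.get('pass_on_error').
config = {
    'connection_params': {
        'host': 'localhost',      # placeholder
        'port': 1433,             # default SQL Server port
        'user': 'sa',             # placeholder
        'password': 'secret',     # placeholder
        'database': 'mydb',       # implied by databaseName= in the jdbc_url
    },
    'table': [                    # assumed: a list of table specs (later code reads t['name'], etc.)
        {
            'name': 'dbo.orders',             # placeholder source table
            'target_name': 'orders',
            'iterate_column_type': 'int',     # or 'datetime'
            'chunk_count_for_partition': 10,  # optional; falls back to the settings default
        }
    ],
    'pass_on_error': None,        # optional; defaults to None
}
```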
@@ -78,8 +76,9 @@
             name = t['name']
             target_name = t['target_name']
             iterate_column_type = t['iterate_column_type']
-
-            '
+            chunk_count_for_partition = t.get(
+                'chunk_count_for_partition',
+                self.settings.default_chunk_count_for_partition,
             )
             iterate_max_loop = t.get(
                 'iterate_max_loop', self.settings.default_iterate_max_loop
@@ -101,7 +100,6 @@
 
             partitions_column = partitions_column_.split(' as ')[0].strip()
             p_col_name = partitions_column_.split(' as ')[-1].strip()
-            p_col_select = f'{partitions_column} as {p_col_name}'
 
             message = dict(table_name=target_name, status='extracting')
             logger.info(message)
@@ -112,57 +110,65 @@
             last_point = self.backend.get_last_point(target_name)
             if last_point:
                 write_mode = 'append'
-
-                iterate_query = f"""(SELECT {p_col_select} from {name} where {partitions_column} > '{last_point}' ) q"""
-
-                df_itarate_list = (
-                    spark.read.format('jdbc')
-                    .option('url', self.jdbc_url)
-                    .option('dbtable', iterate_query)
-                    .option('driver', self.driver_jdbc)
-                    .option('fetchsize', fetchsize)
-                    .load()
-                )
-
-                min_val = last_point
-                max_val = df_itarate_list.agg(F.max(p_col_name).alias('max')).collect()[
-                    0
-                ][0]
-                df_itarate_list = df_itarate_list.where(F.col(p_col_name) > min_val)
+                iterate_query = f"""(SELECT min({partitions_column}) as min_val, max({partitions_column}) as max_val from {name} where {partitions_column} > '{last_point}' ) q"""
             else:
                 write_mode = 'overwrite'
-
-
-
-
-
-
-
-
-
-                )
-
-            min_max_vals = df_itarate_list.agg(
-                F.min(p_col_name).alias('min'), F.max(p_col_name).alias('max')
-            ).collect()[0]
-            min_val = min_max_vals[0]
-            max_val = min_max_vals[1]
-            df_itarate_list = df_itarate_list.where(F.col(p_col_name) >= min_val)
-
-            key_list = (
-                df_itarate_list.select(p_col_name)
-                .distinct()
-                .rdd.flatMap(lambda x: x)
-                .collect()
+                iterate_query = f"""(SELECT min({partitions_column}) as min_val, max({partitions_column}) as max_val from {name}) q"""
+
+            df_itarate_list = (
+                spark.read.format('jdbc')
+                .option('url', self.jdbc_url)
+                .option('dbtable', iterate_query)
+                .option('driver', self.driver_jdbc)
+                .option('fetchsize', fetchsize)
+                .load()
             )
-
-
-
-
-
-
-
-
+            min_val = df_itarate_list.first()['min_val']
+            max_val = df_itarate_list.first()['max_val']
+
+            if min_val is None or max_val is None:
+                min_max_tuple = None
+            elif iterate_column_type == 'int':
+                min_val = int(min_val)
+                max_val = int(max_val)
+
+                total_range = max_val - min_val + 1  # inclusive
+                step = total_range // chunk_count_for_partition
+                remainder = total_range % chunk_count_for_partition
+
+                min_max_tuple = []
+                start = min_val
+                for _ in range(chunk_count_for_partition):
+                    end = start + step - 1
+                    if remainder > 0:
+                        end += 1
+                        remainder -= 1
+                    if start <= max_val:
+                        min_max_tuple.append((start, min(end, max_val)))
+                    start = end + 1
+
+            elif iterate_column_type == 'datetime':
+                total_seconds = (
+                    int((max_val - min_val).total_seconds()) + 1
+                )  # include max_val
+                step = total_seconds // chunk_count_for_partition
+                remainder = total_seconds % chunk_count_for_partition
+
+                min_max_tuple = []
+                start = min_val
+                for _ in range(chunk_count_for_partition):
+                    step_with_remainder = step
+                    if remainder > 0:
+                        step_with_remainder += 1
+                        remainder -= 1
+                    end = start + datetime.timedelta(seconds=step_with_remainder - 1)
+                    if start <= max_val:
+                        min_max_tuple.append((start, min(end, max_val)))
+                    start = end + datetime.timedelta(seconds=1)
+            else:
+                raise ValueError(
+                    f'Unsupported iterate_column_type: {iterate_column_type}'
+                )
 
             if not min_max_tuple:
                 if not last_point:
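This hunk is the heart of the 0.1.3 rewrite. The old code pulled the whole iterate column over JDBC and collected its distinct values onto the driver (`key_list`); the new code pushes a single `min`/`max` aggregate down to SQL Server and derives contiguous chunks arithmetically, spreading the remainder one unit at a time across the leading chunks. A standalone sketch of the integer branch, lifted from the added lines for illustration (the function name `split_int_range` is mine, not part of the package):

```python
# Standalone rendering of the integer chunking added in 0.1.3; the name
# split_int_range is hypothetical, introduced here only for illustration.
def split_int_range(min_val, max_val, chunk_count):
    """Split the inclusive range [min_val, max_val] into at most chunk_count
    contiguous (start, end) pairs, inclusive on both ends."""
    total_range = max_val - min_val + 1      # both endpoints count
    step = total_range // chunk_count
    remainder = total_range % chunk_count

    pairs = []
    start = min_val
    for _ in range(chunk_count):
        end = start + step - 1
        if remainder > 0:                    # early chunks absorb the leftover
            end += 1
            remainder -= 1
        if start <= max_val:                 # drops empty chunks when chunk_count > range
            pairs.append((start, min(end, max_val)))
        start = end + 1
    return pairs


print(split_int_range(1, 10, 3))   # [(1, 4), (5, 7), (8, 10)]
print(split_int_range(1, 2, 4))    # [(1, 1), (2, 2)] -- no empty chunks
```

The datetime branch applies the same arithmetic in whole seconds, stepping with `datetime.timedelta`, and a driver-side `first()` on the one-row aggregate replaces the old full-column `collect()`.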
@@ -209,20 +215,24 @@
                 if custom_query:
                     updated_query = custom_query.replace(
                         '{query_filter}',
-                        f""" where {partitions_column}
+                        f""" where {partitions_column} >= {min_filter} and {partitions_column} < {max_filter} """,
                     )
                 else:
-                    updated_query = f'(SELECT * from {name} where
-
-                min_filter =
-                max_filter =
+                    updated_query = f'(SELECT * from {name} where {partitions_column} >= {min_filter} and {partitions_column} < {max_filter}) q'
+            elif iterate_column_type == 'datetime':
+                min_filter = chunk[0].strftime('%Y-%m-%d %H:%M:%S')
+                max_filter = chunk[-1].strftime('%Y-%m-%d %H:%M:%S')
                 if custom_query:
                     updated_query = custom_query.replace(
                         '{query_filter}',
-                        f""" where {partitions_column}
+                        f""" where {partitions_column} >= '{min_filter}' and {partitions_column} < '{max_filter}' """,
                     )
                 else:
-                    updated_query = f"""(SELECT * from {name} where {partitions_column}
+                    updated_query = f"""(SELECT * from {name} where {partitions_column} >= '{min_filter}' and {partitions_column} < '{max_filter}') q"""
+            else:
+                raise ValueError(
+                    f'Unsupported iterate_column_type: {iterate_column_type}'
+                )
 
             df = (
                 spark.read.format('jdbc')
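Downstream, each `(start, end)` pair becomes a half-open predicate, `>= start and < end`, with datetime bounds formatted and quoted while integer bounds are inlined bare (note that because the predicate is half-open but the chunk ends are inclusive, a row exactly at a chunk's end value falls outside that chunk's window). A hedged sketch of the construction; `build_chunk_query` and the table/column names are illustrative, not package API:

```python
import datetime

# Illustrative helper mirroring the added lines above; build_chunk_query,
# 'dbo.orders', and 'modified_at' are hypothetical names, not package API.
def build_chunk_query(name, partitions_column, chunk, iterate_column_type):
    if iterate_column_type == 'int':
        min_filter, max_filter = chunk[0], chunk[-1]
        return (
            f'(SELECT * from {name} where {partitions_column} >= {min_filter} '
            f'and {partitions_column} < {max_filter}) q'
        )
    elif iterate_column_type == 'datetime':
        # datetime bounds become quoted 'YYYY-MM-DD HH:MM:SS' literals
        min_filter = chunk[0].strftime('%Y-%m-%d %H:%M:%S')
        max_filter = chunk[-1].strftime('%Y-%m-%d %H:%M:%S')
        return (
            f"(SELECT * from {name} where {partitions_column} >= '{min_filter}' "
            f"and {partitions_column} < '{max_filter}') q"
        )
    raise ValueError(f'Unsupported iterate_column_type: {iterate_column_type}')


start = datetime.datetime(2024, 1, 1)
end = datetime.datetime(2024, 1, 2)
print(build_chunk_query('dbo.orders', 'modified_at', (start, end), 'datetime'))
# (SELECT * from dbo.orders where modified_at >= '2024-01-01 00:00:00'
#  and modified_at < '2024-01-02 00:00:00') q
```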
@@ -244,6 +254,7 @@
                 .mode(p_write_mode)
                 .parquet(parquet_path)
             )
+
             count_col = len(df.columns)
             count_row = df.count()
             last_point_value = max_filter
--- mkpipe_extractor_sqlserver-0.1.1/PKG-INFO
+++ mkpipe_extractor_sqlserver-0.1.3/mkpipe_extractor_sqlserver.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: mkpipe-extractor-sqlserver
-Version: 0.1.1
+Version: 0.1.3
 Summary: SQLserver extractor for mkpipe.
 Author: Metin Karakus
 Author-email: metin_karakus@yahoo.com
@@ -11,6 +11,16 @@ Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: mkpipe
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: license
+Dynamic: license-file
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 # MkPipe
 
--- mkpipe_extractor_sqlserver-0.1.1/setup.py
+++ mkpipe_extractor_sqlserver-0.1.3/setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 
 setup(
     name='mkpipe-extractor-sqlserver',
-    version='0.1.1',
+    version='0.1.3',
     license='Apache License 2.0',
     packages=find_packages(exclude=['tests', 'scripts', 'deploy', 'install_jars.py']),
     install_requires=['mkpipe'],