mkpipe-extractor-sqlserver 0.1.1__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (14)
  1. {mkpipe_extractor_sqlserver-0.1.1/mkpipe_extractor_sqlserver.egg-info → mkpipe_extractor_sqlserver-0.1.3}/PKG-INFO +12 -2
  2. {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/mkpipe_extractor_sqlserver/__init__.py +73 -62
  3. {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3/mkpipe_extractor_sqlserver.egg-info}/PKG-INFO +12 -2
  4. {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/setup.py +1 -1
  5. {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/LICENSE +0 -0
  6. {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/MANIFEST.in +0 -0
  7. {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/README.md +0 -0
  8. {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/mkpipe_extractor_sqlserver/jars/com.microsoft.sqlserver_mssql-jdbc-12.8.1.jre11.jar +0 -0
  9. {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/mkpipe_extractor_sqlserver.egg-info/SOURCES.txt +0 -0
  10. {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/mkpipe_extractor_sqlserver.egg-info/dependency_links.txt +0 -0
  11. {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/mkpipe_extractor_sqlserver.egg-info/entry_points.txt +0 -0
  12. {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/mkpipe_extractor_sqlserver.egg-info/requires.txt +0 -0
  13. {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/mkpipe_extractor_sqlserver.egg-info/top_level.txt +0 -0
  14. {mkpipe_extractor_sqlserver-0.1.1 → mkpipe_extractor_sqlserver-0.1.3}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: mkpipe-extractor-sqlserver
-Version: 0.1.1
+Version: 0.1.3
 Summary: SQLserver extractor for mkpipe.
 Author: Metin Karakus
 Author-email: metin_karakus@yahoo.com
@@ -11,6 +11,16 @@ Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: mkpipe
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: license
+Dynamic: license-file
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 # MkPipe
 
@@ -1,6 +1,5 @@
 import os
 import datetime
-from pathlib import Path
 from urllib.parse import quote_plus
 from pyspark.sql import SparkSession
 from pyspark import SparkConf
@@ -20,6 +19,8 @@ class SqlserverExtractor:
         else:
             self.settings = settings
         self.connection_params = config['connection_params']
+        self.table = config['table']
+        self.pass_on_error = config.get('pass_on_error', None)
         self.host = self.connection_params['host']
         self.port = self.connection_params['port']
         self.username = self.connection_params['user']
@@ -32,9 +33,6 @@ class SqlserverExtractor:
         self.settings.driver_name = self.driver_name
         self.jdbc_url = f'jdbc:{self.driver_name}://{self.host}:{self.port};databaseName={self.database};user={self.username};password={self.password};encrypt=false;trustServerCertificate=false'
 
-        self.table = config['table']
-        self.pass_on_error = config.get('pass_on_error', None)
-
         config = load_config()
         connection_params = config['settings']['backend']
         db_type = connection_params['database_type']
@@ -78,8 +76,9 @@ class SqlserverExtractor:
             name = t['name']
             target_name = t['target_name']
             iterate_column_type = t['iterate_column_type']
-            iterate_batch_size = t.get(
-                'iterate_batch_size', self.settings.default_iterate_batch_size
+            chunk_count_for_partition = t.get(
+                'chunk_count_for_partition',
+                self.settings.default_chunk_count_for_partition,
             )
             iterate_max_loop = t.get(
                 'iterate_max_loop', self.settings.default_iterate_max_loop
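
For context: `chunk_count_for_partition` replaces the old `iterate_batch_size` knob, and its meaning shifts from "rows per batch" to "how many min/max ranges to split each table into". A hypothetical table entry using it (only the keys visible in the diff are real; the values and surrounding structure are assumed):

```python
# Hypothetical mkpipe table entry; values are illustrative.
# 'chunk_count_for_partition' falls back to
# settings.default_chunk_count_for_partition when omitted.
table_conf = {
    'name': 'dbo.orders',            # assumed source table
    'target_name': 'orders',
    'iterate_column_type': 'int',    # or 'datetime'
    'chunk_count_for_partition': 8,  # number of (min, max) ranges to read
}
```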
@@ -101,7 +100,6 @@ class SqlserverExtractor:
 
             partitions_column = partitions_column_.split(' as ')[0].strip()
             p_col_name = partitions_column_.split(' as ')[-1].strip()
-            p_col_select = f'{partitions_column} as {p_col_name}'
 
             message = dict(table_name=target_name, status='extracting')
             logger.info(message)
@@ -112,57 +110,65 @@ class SqlserverExtractor:
             last_point = self.backend.get_last_point(target_name)
             if last_point:
                 write_mode = 'append'
-
-                iterate_query = f"""(SELECT {p_col_select} from {name} where {partitions_column} > '{last_point}' ) q"""
-
-                df_itarate_list = (
-                    spark.read.format('jdbc')
-                    .option('url', self.jdbc_url)
-                    .option('dbtable', iterate_query)
-                    .option('driver', self.driver_jdbc)
-                    .option('fetchsize', fetchsize)
-                    .load()
-                )
-
-                min_val = last_point
-                max_val = df_itarate_list.agg(F.max(p_col_name).alias('max')).collect()[
-                    0
-                ][0]
-                df_itarate_list = df_itarate_list.where(F.col(p_col_name) > min_val)
+                iterate_query = f"""(SELECT min({partitions_column}) as min_val, max({partitions_column}) as max_val from {name} where {partitions_column} > '{last_point}' ) q"""
             else:
                 write_mode = 'overwrite'
-
-                iterate_query = f'(SELECT {p_col_select} from {name}) q'
-                df_itarate_list = (
-                    spark.read.format('jdbc')
-                    .option('url', self.jdbc_url)
-                    .option('dbtable', iterate_query)
-                    .option('driver', self.driver_jdbc)
-                    .option('fetchsize', fetchsize)
-                    .load()
-                )
-
-                min_max_vals = df_itarate_list.agg(
-                    F.min(p_col_name).alias('min'), F.max(p_col_name).alias('max')
-                ).collect()[0]
-                min_val = min_max_vals[0]
-                max_val = min_max_vals[1]
-                df_itarate_list = df_itarate_list.where(F.col(p_col_name) >= min_val)
-
-            key_list = (
-                df_itarate_list.select(p_col_name)
-                .distinct()
-                .rdd.flatMap(lambda x: x)
-                .collect()
+                iterate_query = f"""(SELECT min({partitions_column}) as min_val, max({partitions_column}) as max_val from {name}) q"""
+
+            df_itarate_list = (
+                spark.read.format('jdbc')
+                .option('url', self.jdbc_url)
+                .option('dbtable', iterate_query)
+                .option('driver', self.driver_jdbc)
+                .option('fetchsize', fetchsize)
+                .load()
             )
-            key_list.sort()
-
-            chunks = [
-                key_list[x : x + iterate_batch_size]
-                for x in range(0, len(key_list), iterate_batch_size)
-            ]
-
-            min_max_tuple = [(min(x), max(x)) for x in chunks]
+            min_val = df_itarate_list.first()['min_val']
+            max_val = df_itarate_list.first()['max_val']
+
+            if min_val is None or max_val is None:
+                min_max_tuple = None
+            elif iterate_column_type == 'int':
+                min_val = int(min_val)
+                max_val = int(max_val)
+
+                total_range = max_val - min_val + 1  # inclusive
+                step = total_range // chunk_count_for_partition
+                remainder = total_range % chunk_count_for_partition
+
+                min_max_tuple = []
+                start = min_val
+                for _ in range(chunk_count_for_partition):
+                    end = start + step - 1
+                    if remainder > 0:
+                        end += 1
+                        remainder -= 1
+                    if start <= max_val:
+                        min_max_tuple.append((start, min(end, max_val)))
+                    start = end + 1
+
+            elif iterate_column_type == 'datetime':
+                total_seconds = (
+                    int((max_val - min_val).total_seconds()) + 1
+                )  # include max_val
+                step = total_seconds // chunk_count_for_partition
+                remainder = total_seconds % chunk_count_for_partition
+
+                min_max_tuple = []
+                start = min_val
+                for _ in range(chunk_count_for_partition):
+                    step_with_remainder = step
+                    if remainder > 0:
+                        step_with_remainder += 1
+                        remainder -= 1
+                    end = start + datetime.timedelta(seconds=step_with_remainder - 1)
+                    if start <= max_val:
+                        min_max_tuple.append((start, min(end, max_val)))
+                    start = end + datetime.timedelta(seconds=1)
+            else:
+                raise ValueError(
+                    f'Unsupported iterate_column_type: {iterate_column_type}'
+                )
 
             if not min_max_tuple:
                 if not last_point:
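
This hunk replaces the old strategy of collecting every distinct key value onto the driver with a single min/max aggregate, deriving the chunk boundaries locally instead. A minimal standalone sketch of the integer branch, extracted from the diff (the function name and sample values are illustrative; the datetime branch applies the same arithmetic in whole seconds):

```python
def int_chunks(min_val: int, max_val: int, chunk_count: int) -> list[tuple[int, int]]:
    """Split the inclusive range [min_val, max_val] into contiguous
    (start, end) tuples, spreading the remainder over the first chunks."""
    total_range = max_val - min_val + 1  # inclusive of both endpoints
    step = total_range // chunk_count
    remainder = total_range % chunk_count
    chunks = []
    start = min_val
    for _ in range(chunk_count):
        end = start + step - 1
        if remainder > 0:      # widen early chunks by one to absorb the remainder
            end += 1
            remainder -= 1
        if start <= max_val:   # drop empty trailing chunks when chunk_count > range
            chunks.append((start, min(end, max_val)))
        start = end + 1
    return chunks

print(int_chunks(1, 10, 3))  # [(1, 4), (5, 7), (8, 10)]
```

With this change only two scalar values cross the JDBC connection per table, instead of one row per distinct key.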
@@ -209,20 +215,24 @@ class SqlserverExtractor:
                 if custom_query:
                     updated_query = custom_query.replace(
                         '{query_filter}',
-                        f""" where {partitions_column} between {min_filter} and {max_filter} """,
+                        f""" where {partitions_column} >= {min_filter} and {partitions_column} < {max_filter} """,
                     )
                 else:
-                    updated_query = f'(SELECT * from {name} where {partitions_column} between {min_filter} and {max_filter}) q'
-            else:
-                min_filter = str(chunk[0])
-                max_filter = str(chunk[-1])
+                    updated_query = f'(SELECT * from {name} where {partitions_column} >= {min_filter} and {partitions_column} < {max_filter}) q'
+            elif iterate_column_type == 'datetime':
+                min_filter = chunk[0].strftime('%Y-%m-%d %H:%M:%S')
+                max_filter = chunk[-1].strftime('%Y-%m-%d %H:%M:%S')
                 if custom_query:
                     updated_query = custom_query.replace(
                         '{query_filter}',
-                        f""" where {partitions_column} between '{min_filter}' and '{max_filter}' """,
+                        f""" where {partitions_column} >= '{min_filter}' and {partitions_column} < '{max_filter}' """,
                     )
                 else:
-                    updated_query = f"""(SELECT * from {name} where {partitions_column} between '{min_filter}' and '{max_filter}') q"""
+                    updated_query = f"""(SELECT * from {name} where {partitions_column} >= '{min_filter}' and {partitions_column} < '{max_filter}') q"""
+            else:
+                raise ValueError(
+                    f'Unsupported iterate_column_type: {iterate_column_type}'
+                )
 
             df = (
                 spark.read.format('jdbc')
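
Combined with the min/max probe above, the SQL strings handed to Spark's JDBC reader now take roughly these shapes (table, column, and timestamp values here are made up for illustration):

```python
# Illustrative reconstruction of the query shapes in the diff;
# 'dbo.orders', 'updated_at', and the timestamps are hypothetical.
name = 'dbo.orders'
partitions_column = 'updated_at'
last_point = '2024-01-01 00:00:00'

# One cheap aggregate replaces pulling the whole key column to the driver:
iterate_query = (
    f"(SELECT min({partitions_column}) as min_val, "
    f"max({partitions_column}) as max_val from {name} "
    f"where {partitions_column} > '{last_point}' ) q"
)

# Per-chunk read, now a half-open interval instead of BETWEEN:
min_filter, max_filter = '2024-01-01 00:00:00', '2024-01-02 00:00:00'
updated_query = (
    f"(SELECT * from {name} where {partitions_column} >= '{min_filter}' "
    f"and {partitions_column} < '{max_filter}') q"
)
```

Each string is a parenthesized subquery aliased as `q`, the form Spark's JDBC `dbtable` option accepts.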
@@ -244,6 +254,7 @@ class SqlserverExtractor:
                 .mode(p_write_mode)
                 .parquet(parquet_path)
             )
+
             count_col = len(df.columns)
             count_row = df.count()
             last_point_value = max_filter
@@ -1,6 +1,6 @@
-Metadata-Version: 2.1
+Metadata-Version: 2.4
 Name: mkpipe-extractor-sqlserver
-Version: 0.1.1
+Version: 0.1.3
 Summary: SQLserver extractor for mkpipe.
 Author: Metin Karakus
 Author-email: metin_karakus@yahoo.com
@@ -11,6 +11,16 @@ Requires-Python: >=3.8
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: mkpipe
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: license
+Dynamic: license-file
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
 
 # MkPipe
 
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
 
 setup(
     name='mkpipe-extractor-sqlserver',
-    version='0.1.1',
+    version='0.1.3',
     license='Apache License 2.0',
     packages=find_packages(exclude=['tests', 'scripts', 'deploy', 'install_jars.py']),
     install_requires=['mkpipe'],