mkpipe-loader-sqlite 0.1.2__tar.gz → 0.3.0__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: mkpipe-loader-sqlite
3
- Version: 0.1.2
3
+ Version: 0.3.0
4
4
  Summary: SQLite loader for mkpipe.
5
5
  Author: Metin Karakus
6
6
  Author-email: metin_karakus@yahoo.com
@@ -11,6 +11,16 @@ Requires-Python: >=3.8
11
11
  Description-Content-Type: text/markdown
12
12
  License-File: LICENSE
13
13
  Requires-Dist: mkpipe
14
+ Dynamic: author
15
+ Dynamic: author-email
16
+ Dynamic: classifier
17
+ Dynamic: description
18
+ Dynamic: description-content-type
19
+ Dynamic: license
20
+ Dynamic: license-file
21
+ Dynamic: requires-dist
22
+ Dynamic: requires-python
23
+ Dynamic: summary
14
24
 
15
25
  # MkPipe
16
26
 
@@ -0,0 +1,13 @@
1
+ import os
2
+
3
+ from mkpipe.spark import JdbcLoader
4
+
5
+
6
+ class SqliteLoader(JdbcLoader, variant='sqlite'):
7
+ driver_name = 'sqlite'
8
+ driver_jdbc = 'org.sqlite.JDBC'
9
+
10
+ def build_jdbc_url(self):
11
+ db_path = self.connection.extra.get('db_path', self.database or 'data.db')
12
+ db_path = os.path.abspath(db_path)
13
+ return f'jdbc:sqlite:{db_path}'
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: mkpipe-loader-sqlite
3
- Version: 0.1.2
3
+ Version: 0.3.0
4
4
  Summary: SQLite loader for mkpipe.
5
5
  Author: Metin Karakus
6
6
  Author-email: metin_karakus@yahoo.com
@@ -11,6 +11,16 @@ Requires-Python: >=3.8
11
11
  Description-Content-Type: text/markdown
12
12
  License-File: LICENSE
13
13
  Requires-Dist: mkpipe
14
+ Dynamic: author
15
+ Dynamic: author-email
16
+ Dynamic: classifier
17
+ Dynamic: description
18
+ Dynamic: description-content-type
19
+ Dynamic: license
20
+ Dynamic: license-file
21
+ Dynamic: requires-dist
22
+ Dynamic: requires-python
23
+ Dynamic: summary
14
24
 
15
25
  # MkPipe
16
26
 
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name='mkpipe-loader-sqlite',
5
- version='0.1.2',
5
+ version='0.3.0',
6
6
  license='Apache License 2.0',
7
7
  packages=find_packages(exclude=['tests', 'scripts', 'deploy', 'install_jars.py']),
8
8
  install_requires=['mkpipe'],
@@ -1,142 +0,0 @@
1
- import os
2
- import time
3
- from pathlib import Path
4
- from pyspark.sql import functions as F
5
- from pyspark.sql.types import TimestampType
6
- from mkpipe.config import load_config
7
- from mkpipe.functions_db import get_db_connector
8
- from mkpipe.functions_spark import remove_partitioned_parquet, get_parser
9
- from mkpipe.utils import log_container, Logger
10
- from mkpipe.utils.base_class import PipeSettings
11
-
12
-
13
- class SqliteLoader:
14
- def __init__(self, config, settings):
15
- if isinstance(settings, dict):
16
- self.settings = PipeSettings(**settings)
17
- else:
18
- self.settings = settings
19
- self.connection_params = config['connection_params']
20
-
21
- self.db_path = os.path.abspath(self.connection_params['db_path'])
22
-
23
- self.driver_name = 'sqlite'
24
- self.driver_jdbc = 'org.sqlite.JDBC'
25
- self.settings.driver_name = self.driver_name
26
- self.jdbc_url = f'jdbc:sqlite:{self.db_path}'
27
-
28
- config = load_config()
29
- connection_params = config['settings']['backend']
30
- db_type = connection_params['database_type']
31
- self.backend = get_db_connector(db_type)(connection_params)
32
-
33
- def add_custom_columns(self, df, elt_start_time):
34
- if 'etl_time' in df.columns:
35
- df = df.drop('etl_time')
36
-
37
- df = df.withColumn('etl_time', F.lit(elt_start_time).cast(TimestampType()))
38
- return df
39
-
40
- @log_container(__file__)
41
- def load(self, data, elt_start_time):
42
- try:
43
- logger = Logger(__file__)
44
- start_time = time.time()
45
- name = data['table_name']
46
-
47
- write_mode = data.get('write_mode', None)
48
- file_type = data.get('file_type', None)
49
- last_point_value = data.get('last_point_value', None)
50
- iterate_column_type = data.get('iterate_column_type', None)
51
- replication_method = data.get('replication_method', 'full')
52
- batchsize = data.get('fetchsize', 100_000)
53
- pass_on_error = data.get('pass_on_error', None)
54
-
55
- if not file_type:
56
- 'means that the data fetched before no new data'
57
- self.backend.manifest_table_update(
58
- name=name,
59
- value=None, # Last point remains unchanged
60
- value_type=None, # Type remains unchanged
61
- status='completed', # ('completed', 'failed', 'extracting', 'loading')
62
- replication_method=replication_method, # ('incremental', 'full')
63
- error_message='',
64
- )
65
- return
66
-
67
- self.backend.manifest_table_update(
68
- name=name,
69
- value=None, # Last point remains unchanged
70
- value_type=None, # Type remains unchanged
71
- status='loading', # ('completed', 'failed', 'extracting', 'loading')
72
- replication_method=replication_method, # ('incremental', 'full')
73
- error_message='',
74
- )
75
-
76
- df = get_parser(file_type)(data, self.settings)
77
- df = self.add_custom_columns(df, elt_start_time)
78
- message = dict(
79
- table_name=name,
80
- status='loading',
81
- total_partition_count=df.rdd.getNumPartitions(),
82
- )
83
- logger.info(message)
84
-
85
- (
86
- df.write.format('jdbc')
87
- .mode(
88
- write_mode
89
- ) # Use write_mode for the first iteration, 'append' for others
90
- .option('url', self.jdbc_url)
91
- .option('dbtable', name)
92
- .option('driver', self.driver_jdbc)
93
- .option('batchsize', batchsize)
94
- .save()
95
- )
96
-
97
- # Update last point in the mkpipe_manifest table if applicable
98
- self.backend.manifest_table_update(
99
- name=name,
100
- value=last_point_value,
101
- value_type=iterate_column_type,
102
- status='completed',
103
- replication_method=replication_method,
104
- error_message='',
105
- )
106
-
107
- message = dict(table_name=name, status=write_mode)
108
- logger.info(message)
109
-
110
- # remove the parquet to reduce the storage
111
- remove_partitioned_parquet(data['path'])
112
-
113
- run_time = time.time() - start_time
114
- message = dict(table_name=name, status='success', run_time=run_time)
115
- logger.info(message)
116
-
117
- except Exception as e:
118
- # Log the error message and update the mkpipe_manifest with the error details
119
- message = dict(
120
- table_name=name,
121
- status='failed',
122
- type='loading',
123
- error_message=str(e),
124
- etl_start_time=str(elt_start_time),
125
- )
126
-
127
- self.backend.manifest_table_update(
128
- name=name,
129
- value=None, # Last point remains unchanged
130
- value_type=None, # Type remains unchanged
131
- status='failed',
132
- replication_method=replication_method,
133
- error_message=str(e),
134
- )
135
-
136
- if pass_on_error:
137
- logger.warning(message)
138
- return
139
- else:
140
- logger.error(message)
141
- raise Exception(message) from e
142
- return