move-data 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
move_data/__init__.py ADDED
@@ -0,0 +1,25 @@
1
"""
move-data: a Python package for moving data between various sources and destinations.

Utilities are provided for:
- Google Sheets data extraction
- SharePoint file operations
- Google Cloud Storage file operations
- Snowflake data loading and extraction
"""

# Re-export the public helpers so callers can write `from move_data import ...`.
from .move_data import get_googlesheets_data, sharepoint, snowflake, googlestorage

__version__ = "0.1.5"

# Names exposed by `from move_data import *`.
__all__ = ["get_googlesheets_data", "sharepoint", "snowflake", "googlestorage"]
25
+
move_data/move_data.py ADDED
@@ -0,0 +1,304 @@
1
+ import requests
2
+ from io import StringIO, BytesIO
3
+ import datetime, time
4
+ import warnings
5
+ warnings.filterwarnings('ignore')
6
+ import snowflake.connector as sf
7
+ from snowflake.connector.pandas_tools import write_pandas
8
+ import base64
9
+ import logging,json, pygsheets, pandas as pd
10
+ import chardet
11
+ from zipfile import BadZipFile
12
+ import re
13
+ import os.path
14
+ from google.cloud import storage
15
+
16
def _gsheet_column_name(item):
    """Return the normalized, lower-case column name for one sheet header.

    Datetime headers (including ``pd.Timestamp``, which subclasses
    ``datetime.datetime``) become ``mon_yyyy``.  Every other header is passed
    through the replacement chain below; non-string headers are converted with
    ``str()`` first so that each input column always yields exactly one name.
    """
    if isinstance(item, datetime.datetime):
        return item.strftime("%b_%Y").lower()

    text = item if isinstance(item, str) else str(item)
    # The order of these steps determines the resulting names, so it is kept
    # exactly as it was.  " \\+ " matches a literal backslash-plus, as the
    # original (invalid-escape) literal did; plain " + " is handled by the
    # later " " and "+" steps.
    cleaned = (
        text.replace(" ($)", "_")
        .replace(" \\+ ", "_")
        .replace(":", "_")
        .replace(" ", "_")
        .replace(".", "")
        .replace("(", "")
        .replace(")", "")
        .replace("/", "_")
        .replace(",", "_")
        .replace("-", "_")
        .replace("%", "per")
        .replace("unnamed__", "")
        .lstrip("0123456789")
        .replace("unique", "a_unique")
        .lower()
        .replace("#", "")
        .replace("+", "")
        .replace("&", "_")
        .replace("___", "_")
        .replace("__", "_")
    )
    # A column named exactly "order" is emitted double-quoted.
    return re.sub(r"^order$", '"order"', cleaned)


def get_googlesheets_data(name, sheet, service_account_path, skip_rows=0):
    """Read one worksheet into a string-typed DataFrame with normalized columns.

    Args:
        name: Title of the spreadsheet, passed to ``pygsheets`` ``open``.
        sheet: Title of the worksheet inside that spreadsheet.
        service_account_path: Path to the service-account JSON file used to
            authorize ``pygsheets``.
        skip_rows: Number of leading data rows to discard (default 0).

    Returns:
        A tuple ``(data, sf_query, sf_tr_query)``:
        ``data`` is the DataFrame with every value cast to ``str`` and columns
        renamed; ``sf_query`` is a ``"\\n,"``-joined list of
        ``"<col> string"`` column definitions; ``sf_tr_query`` is a
        ``"\\n,"``-joined list of ``"nullif(<col>,'') as <col>"`` expressions.

    Notes:
        Columns whose normalized name is ``''``, ``','`` or ``'_'`` are
        dropped, and only the first of any duplicated column name is kept.
        A header made up solely of digits normalizes to ``''`` and is
        therefore dropped as well.
    """
    gc = pygsheets.authorize(service_account_file=service_account_path)
    wks = gc.open(name).worksheet_by_title(sheet)
    data = wks.get_as_df()

    if skip_rows > 0:
        data = data.iloc[skip_rows:].reset_index(drop=True)

    # Drop empty-header columns BEFORE computing the new names; doing it
    # afterwards left the name list longer than the frame and made the
    # column assignment fail.
    data = data.drop(columns=[""], errors="ignore")
    data = data.astype(str)

    data.columns = [_gsheet_column_name(col) for col in data.columns]

    data = data.drop(columns=["", ",", "_"], errors="ignore")
    data = data.loc[:, ~data.columns.duplicated()]
    columns = data.columns.tolist()

    sf_query = "\n,".join(f"{col} string" for col in columns)
    sf_tr_query = "\n,".join(f"nullif({col},'') as {col}" for col in columns)

    return data, sf_query, sf_tr_query
75
+
76
class sharepoint:
    """Read files from, and upload files to, a SharePoint drive via Microsoft Graph.

    Constructing an instance performs an OAuth2 client-credentials request and
    stores the resulting bearer token in ``self.access_token`` and
    ``self.headers``.
    """

    def __init__(self, client_id, client_secret, tenant_id, site_id, library_name, drive_id):
        """Store the identifiers and fetch an access token.

        Raises:
            KeyError: If the token response carries no ``access_token``; the
                message includes the HTTP status and the error description
                returned by the service, when present.
        """
        self.client_id = client_id
        self.client_secret = client_secret
        self.tenant_id = tenant_id

        # SharePoint Online site and library identifiers
        self.site_id = site_id
        self.library_name = library_name
        self.drive_id = drive_id

        # Authenticate and get an access token
        auth_url = f'https://login.microsoftonline.com/{self.tenant_id}/oauth2/v2.0/token'
        data = {
            'grant_type': 'client_credentials',
            'client_id': self.client_id,
            'client_secret': self.client_secret,
            'scope': 'https://graph.microsoft.com/.default'
        }
        response = requests.post(auth_url, data=data)
        token_payload = response.json()
        if 'access_token' not in token_payload:
            # Same exception type as the bare lookup used to raise, but with
            # enough context to diagnose a rejected credential.
            detail = token_payload.get('error_description', token_payload.get('error', 'no detail'))
            raise KeyError(
                f"access_token missing from token response (HTTP {response.status_code}): {detail}"
            )
        self.access_token = token_payload['access_token']

        self.headers = {
            'Authorization': f'Bearer {self.access_token}',
            'Content-Type': 'application/octet-stream',
        }

    @staticmethod
    def _clean_column_name(item):
        """Return the normalized, lower-case column name for one file header.

        Datetime headers (``pd.Timestamp`` included) become ``mon_yyyy``;
        anything else goes through the replacement chain, with non-string
        headers converted via ``str()`` so no column is left without a name.
        """
        if isinstance(item, datetime.datetime):
            return item.strftime("%b_%Y").lower()

        text = item if isinstance(item, str) else str(item)
        # Step order is significant for the resulting names and is unchanged.
        # " \\+ ", '^_' and '_$' are literal substrings here, exactly as in
        # the original chain; the anchored trimming is done by re.sub below.
        cleaned = (
            text.replace(" ($)", "_")
            .replace('\n', '_')
            .replace(" \\+ ", "_")
            .replace(":", "_")
            .replace(" ", "_")
            .replace(".", "")
            .replace("(", "")
            .replace(")", "")
            .replace("/", "_")
            .replace(",", "_")
            .replace("-", "_")
            .replace('__', '_')
            .replace('___', '_')
            .replace("%", "per")
            .replace('unnamed__', "")
            .lstrip('0123456789')
            .replace('unique', 'a_unique')
            .lower()
            .replace('#', '')
            .replace('?', '')
            .replace("+", "")
            .replace('^_', '')
            .replace('_$', '')
        )
        # Strip one leading and/or one trailing underscore.
        return re.sub('^_|_$', '', cleaned)

    def get_data(self, search_query, relative_path, date_col, sheet_name, skip_rows):
        """Download the most recently modified matching file and tabulate it.

        Args:
            search_query: Pattern matched case-insensitively against file
                names with ``Series.str.contains`` (regex semantics).
            relative_path: Folder path below the drive root.
            date_col: Unused; kept for interface compatibility.
            sheet_name: Worksheet to read when the file is parsed as Excel.
            skip_rows: Rows to skip, forwarded to the pandas reader.

        Returns:
            ``(data, sf_query, sf_tr_query, file, api_url_content)`` where
            ``data`` is a string-typed DataFrame with normalized column names
            and an ``insertion_datetime`` column, ``sf_query`` /
            ``sf_tr_query`` are the ``"\\n,"``-joined column definitions and
            ``nullif(<col>,'nan')`` expressions, ``file`` is the download
            response and ``api_url_content`` the URL it came from.

        Raises:
            IndexError: If no file name in the folder matches ``search_query``.
            requests.HTTPError: If the file download returns an error status.
        """
        api_url = f'https://graph.microsoft.com/v1.0/sites/{self.site_id}/drives/{self.drive_id}/items/root:/{relative_path}:/children'
        # NOTE(review): a single GET is issued, so only the first page of the
        # folder listing is inspected - confirm folders stay below page size.
        response = requests.get(api_url, headers=self.headers)
        listing = response.json()

        df = pd.DataFrame(data=listing['value'].copy())
        name_matches = df['name'].str.lower().str.contains(search_query.lower())
        df_filtered = (
            df[name_matches]
            .sort_values('lastModifiedDateTime', ascending=False)
            .head(1)
            .reset_index()
        )
        if df_filtered.empty:
            raise IndexError(
                f"No file matching {search_query!r} found under {relative_path!r}"
            )

        file_name = df_filtered['name'].values[0]
        api_url_content = f'https://graph.microsoft.com/v1.0/sites/{self.site_id}/drives/{self.drive_id}/items/root:/{relative_path}/{file_name}:/content'

        file = requests.get(api_url_content, headers=self.headers)
        # Without this check an error body would be parsed as CSV and
        # returned as if it were the file's data.
        file.raise_for_status()

        try:
            data = pd.read_csv(StringIO(file.content.decode('utf-8')), skiprows=skip_rows)
        except (UnicodeDecodeError, BadZipFile) as err:
            print(err, '\nNow processing as excel file')
            workbook = pd.read_excel(
                BytesIO(file.content),
                sheet_name=[sheet_name],
                engine='openpyxl',
                skiprows=skip_rows,
            )
            data = workbook[sheet_name]

        # 24-hour clock: the previous "%I" (12-hour, no AM/PM) made morning
        # and afternoon loads indistinguishable.
        data['insertion_datetime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        # Drop empty-header columns before renaming so the list of new names
        # always has one entry per remaining column.
        data = data.drop(columns=[''], errors='ignore')
        data = data.astype(str)
        data.columns = [self._clean_column_name(col) for col in data.columns]

        data = data.drop(columns=['', ',', '_'], errors='ignore')
        data = data.loc[:, ~data.columns.duplicated()]
        new_columns = data.columns.tolist()

        sf_query = "\n,".join(f"{col} string" for col in new_columns)
        # astype(str) turns missing values into the text 'nan'; the transform
        # maps that text back to NULL.
        sf_tr_query = "\n,".join(f"nullif({col},'nan') as {col}" for col in new_columns)

        return data, sf_query, sf_tr_query, file, api_url_content

    def upload_file(self, upload_url, modified_data, content_type='application/octet-stream'):
        """PUT ``modified_data`` to ``upload_url`` and report the outcome.

        Args:
            upload_url: Full URL to send the content to.
            modified_data: Body to upload; ``len()`` of it is sent as
                ``Content-Length``.
            content_type: Value of the ``Content-Type`` header.

        Returns:
            The request headers that were sent.
        """
        headers = {
            'Authorization': f'Bearer {self.access_token}',
            'Content-Type': content_type,
            'Content-Length': str(len(modified_data)),
        }
        upload_response = requests.put(upload_url, data=modified_data, headers=headers)
        try:
            # 200 = existing item replaced, 201 = new item created; both are
            # successful uploads.
            if upload_response.status_code in (200, 201):
                print("File uploaded successfully.")
            else:
                print(f"Failed to upload file. Status code: {upload_response.status_code}")
        finally:
            upload_response.close()
        return headers
193
+
194
+
195
class snowflake:
    """Single-use Snowflake session for loading or extracting a DataFrame.

    The connection is opened in ``__init__`` and closed at the end of
    ``load_data`` or ``get_data`` (also when they fail), so create a new
    instance for every operation.
    """

    def __init__(self, user, pw, database, schema, role,
                 account='og64234.us-central1.gcp', warehouse=None):
        """Open the connection.

        Args:
            user: Snowflake user name.
            pw: Password for ``user``.
            database: Database to connect to.
            schema: Schema to connect to.
            role: Role to assume.
            account: Snowflake account identifier; defaults to the value that
                was previously hard-coded.
            warehouse: Warehouse to use.  When omitted, ``airbyte_warehouse``
                is chosen if ``role`` contains "airbyte" (case-insensitive),
                otherwise ``cart_dev_compute_wh``.
        """
        self.database = database
        self.schema = schema
        self.role = role

        if warehouse is None:
            if 'airbyte' in role.lower():
                warehouse = 'airbyte_warehouse'
            else:
                warehouse = 'cart_dev_compute_wh'

        self.cnn = sf.connect(
            user=user,
            password=pw,
            account=account,
            warehouse=warehouse,
            database=database,
            role=role,
            schema=schema)

    def load_data(self, sf_query, sf_tr_query, table_name, data, change_tracking=None):
        """Replace ``table_name`` with the contents of ``data``.

        The table is created from ``sf_query``, filled with ``write_pandas``,
        then re-created from a SELECT of ``sf_tr_query`` over itself.

        Args:
            sf_query: Column definitions for ``CREATE OR REPLACE TABLE``.
            sf_tr_query: Select list applied when the table is re-created.
            table_name: Target table (unqualified; the session schema applies).
            data: DataFrame to load; its index is reset in place.
            change_tracking: When truthy, ``CHANGE_TRACKING`` is enabled on
                the table after loading.

        Notes:
            SQL text is assembled by concatenation because identifiers cannot
            be bound as parameters; pass only trusted values.
        """
        print('Table Name: {}'.format(self.database + '.' + self.schema + '.' + table_name))
        print('Start: load to Snowflake...')
        data.reset_index(drop=True, inplace=True)
        print('opening snowflake...')

        try:
            cursor = self.cnn.cursor()

            cursor.execute(
                "CREATE SCHEMA IF NOT EXISTS " + self.database + "." + self.schema
            )

            cursor.execute(
                "CREATE OR REPLACE TABLE " +
                table_name + "(" + sf_query + ")"
            )

            success, nchunks, nrows, _ = write_pandas(
                self.cnn, data, table_name, on_error="CONTINUE", quote_identifiers=False)
            print(str(success) + ', ' + str(nchunks) + ', ' + str(nrows))

            cursor.execute(
                "CREATE OR REPLACE TABLE " + table_name + " as" + "\nselect\n" + sf_tr_query + '\nfrom\n' + table_name
            )

            if change_tracking:
                cursor.execute("ALTER TABLE " + table_name + " set CHANGE_TRACKING=TRUE")
                print("Change Tracking Enabled")

            # The follow-up task is triggered for every database except maas_db.
            if self.database.casefold() != "maas_db":
                print(self.database)
                cursor.execute("EXECUTE TASK ENRICHMENT_DB.TASKS.SPROC5_TRIGGER")
                print('Started: Executed SPROC5...\n\n')
        finally:
            # Release the session even when one of the statements fails.
            self.cnn.close()

        print('Done: Load to Snowflake\n\n')

    def get_data(self, sheet_name, search_query):
        """Run ``search_query`` and return the result as a DataFrame.

        Args:
            sheet_name: Label used only in the progress messages.
            search_query: SQL text to execute.

        Returns:
            A new DataFrame with the same columns and rows as the query
            result; the raw result is also kept on ``self.df``.
        """
        print('Start: download from Snowflake for sheet {}'.format(sheet_name))
        print('opening snowflake...')
        print(search_query)

        try:
            cur = self.cnn.cursor().execute(search_query)
            # Fetch the result set and deliver it as a pandas DataFrame
            self.df = cur.fetch_pandas_all()
        finally:
            self.cnn.close()

        columns = self.df.columns.tolist()
        cumm_df = pd.DataFrame(columns=columns, data=self.df)

        print('End: download from Snowflake for sheet {}'.format(sheet_name))
        return cumm_df
278
+
279
class googlestorage:
    """Fetch the newest matching object from a Google Cloud Storage bucket."""

    def __init__(self, service_account):
        """Create the storage client from a service-account JSON file path."""
        self.client = storage.Client.from_service_account_json(service_account)

    def get_data(self, bucket_name, path, search_query, sheet_name, skip_rows):
        """Load the most recently updated matching blob into a DataFrame.

        Args:
            bucket_name: Bucket to search.
            path: Object-name prefix used to list blobs.
            search_query: Substring matched case-insensitively against each
                blob name.
            sheet_name: Worksheet to read when the content is parsed as Excel.
            skip_rows: Rows to skip, forwarded to the pandas reader.

        Returns:
            The parsed DataFrame: CSV when the content decodes as UTF-8,
            otherwise whatever ``pd.read_excel`` returns for ``sheet_name``.

        Raises:
            FileNotFoundError: If no blob under ``path`` matches
                ``search_query``.
        """
        bucket = self.client.get_bucket(bucket_name)
        needle = search_query.lower()
        candidates = [
            blob for blob in bucket.list_blobs(prefix=f'{path}')
            if needle in blob.name.lower()
        ]
        if not candidates:
            raise FileNotFoundError(
                f"No object matching {search_query!r} under prefix {path!r} "
                f"in bucket {bucket_name!r}"
            )

        # max() keeps the first of equally recent blobs, like the strict ">"
        # comparison it replaces.
        fblob = max(candidates, key=lambda blob: blob.updated)
        print(f'Object Name: {fblob.name}, Modified: {fblob.updated}')

        # Download once and decide the format from the bytes in hand.
        payload = fblob.download_as_bytes()
        try:
            csv_string = payload.decode('utf-8')
        except UnicodeDecodeError:
            return pd.read_excel(BytesIO(payload), sheet_name=sheet_name, skiprows=skip_rows)

        # read_csv treats a plain str as a path, so the text must be wrapped
        # in a file-like buffer.
        return pd.read_csv(StringIO(csv_string), skiprows=skip_rows)
move_data/setup.py ADDED
@@ -0,0 +1,37 @@
1
from setuptools import setup, find_packages

# Read the long description with an explicit encoding and close the file
# deterministically; a bare open().read() depends on the locale encoding and
# leaves the handle to the garbage collector.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="move-data",
    version="0.1.5",
    description="A Python package for moving data between Google Sheets, SharePoint, Google Cloud Storage, and Snowflake",
    long_description=long_description,
    long_description_content_type="text/markdown",
    # NOTE(review): author, author_email and url still hold template
    # placeholders - replace with the real maintainer details.
    author="Your Name",
    author_email="your.email@example.com",
    url="https://github.com/yourusername/move-data",
    packages=["move_data"],
    # The package sources live in the same directory as this script.
    package_dir={"move_data": "."},
    include_package_data=True,
    python_requires=">=3.7",
    install_requires=[
        "requests>=2.25.0",
        "pandas>=1.3.0",
        "pygsheets>=2.0.0",
        "snowflake-connector-python>=2.7.0",
        "google-cloud-storage>=2.0.0",
        "chardet>=4.0.0",
        "openpyxl>=3.0.0",
    ],
    classifiers=[
        "Development Status :: 4 - Beta",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
    ],
)
37
+
@@ -0,0 +1,139 @@
1
+ Metadata-Version: 2.4
2
+ Name: move-data
3
+ Version: 0.1.5
4
+ Summary: A Python package for moving data between Google Sheets, SharePoint, Google Cloud Storage, and Snowflake
5
+ Home-page: https://github.com/yourusername/move-data
6
+ Author: Your Name
7
+ Author-email: Your Name <your.email@example.com>
8
+ License: MIT
9
+ Project-URL: Homepage, https://github.com/yourusername/move-data
10
+ Project-URL: Documentation, https://github.com/yourusername/move-data#readme
11
+ Project-URL: Repository, https://github.com/yourusername/move-data
12
+ Project-URL: Issues, https://github.com/yourusername/move-data/issues
13
+ Keywords: data,etl,snowflake,google-sheets,sharepoint,google-cloud-storage
14
+ Classifier: Development Status :: 4 - Beta
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.7
19
+ Classifier: Programming Language :: Python :: 3.8
20
+ Classifier: Programming Language :: Python :: 3.9
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Programming Language :: Python :: 3.11
23
+ Requires-Python: >=3.7
24
+ Description-Content-Type: text/markdown
25
+ Requires-Dist: requests>=2.25.0
26
+ Requires-Dist: pandas>=1.3.0
27
+ Requires-Dist: pygsheets>=2.0.0
28
+ Requires-Dist: snowflake-connector-python>=2.7.0
29
+ Requires-Dist: google-cloud-storage>=2.0.0
30
+ Requires-Dist: chardet>=4.0.0
31
+ Requires-Dist: openpyxl>=3.0.0
32
+ Dynamic: author
33
+ Dynamic: home-page
34
+ Dynamic: requires-python
35
+
36
+ # move-data
37
+
38
+ A Python package for moving data between Google Sheets, SharePoint, Google Cloud Storage, and Snowflake.
39
+
40
+ ## Features
41
+
42
+ - **Google Sheets Integration**: Extract data from Google Sheets with automatic column name normalization
43
+ - **SharePoint Integration**: Download files from, and upload files to, SharePoint Online
44
+ - **Google Cloud Storage**: Retrieve files from GCS buckets
45
+ - **Snowflake Integration**: Load data to and extract data from Snowflake databases
46
+
47
+ ## Installation
48
+
49
+ ```bash
50
+ pip install move-data
51
+ ```
52
+
53
+ ## Usage
54
+
55
+ ### Google Sheets
56
+
57
+ ```python
58
+ from move_data import get_googlesheets_data
59
+
60
+ data, sf_query, sf_tr_query = get_googlesheets_data(
61
+ name="My Spreadsheet",
62
+ sheet="Sheet1",
63
+ service_account_path="/path/to/service_account.json",
64
+ skip_rows=0 # Optional: skip first N rows
65
+ )
66
+ ```
67
+
68
+ ### SharePoint
69
+
70
+ ```python
71
+ from move_data import sharepoint
72
+
73
+ sp = sharepoint(
74
+ client_id="your_client_id",
75
+ client_secret="your_client_secret",
76
+ tenant_id="your_tenant_id",
77
+ site_id="your_site_id",
78
+ library_name="Documents",
79
+ drive_id="your_drive_id"
80
+ )
81
+
82
+ data, sf_query, sf_tr_query, file, api_url = sp.get_data(
83
+ search_query="filename",
84
+ relative_path="folder/path",
85
+ date_col="date_column",
86
+ sheet_name="Sheet1",
87
+ skip_rows=0
88
+ )
89
+ ```
90
+
91
+ ### Snowflake
92
+
93
+ ```python
94
+ from move_data import snowflake
95
+
96
+ sf = snowflake(
97
+ user="username",
98
+ pw="password",
99
+ database="database_name",
100
+ schema="schema_name",
101
+ role="role_name"
102
+ )
103
+
104
+ # Load data to Snowflake
105
+ sf.load_data(sf_query, sf_tr_query, "table_name", data, change_tracking=True)
106
+
107
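+ # Get data from Snowflake (load_data() closes the connection, so create a new snowflake(...) instance before calling get_data())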
108
+ df = sf.get_data(sheet_name="Sheet1", search_query="SELECT * FROM table")
109
+ ```
110
+
111
+ ### Google Cloud Storage
112
+
113
+ ```python
114
+ from move_data import googlestorage
115
+
116
+ gs = googlestorage(service_account="/path/to/service_account.json")
117
+
118
+ df = gs.get_data(
119
+ bucket_name="my-bucket",
120
+ path="folder/path",
121
+ search_query="filename",
122
+ sheet_name="Sheet1",
123
+ skip_rows=0
124
+ )
125
+ ```
126
+
127
+ ## Requirements
128
+
129
+ - Python 3.7+
130
+ - See `pyproject.toml` for full dependency list
131
+
132
+ ## License
133
+
134
+ MIT
135
+
136
+ ## Contributing
137
+
138
+ Contributions are welcome! Please open an issue or submit a pull request.
139
+
@@ -0,0 +1,7 @@
1
+ move_data/__init__.py,sha256=3w0EZT8rFCSk4pN5uw1pNmG4wtPHY77TtYH4ecaelXQ,498
2
+ move_data/move_data.py,sha256=VqqNJ8lCDUrO2hF4H-XclrOR5XuhvB8AGMeMl4JDeDw,10799
3
+ move_data/setup.py,sha256=s-pIY-SDMvjDziuYfcNFM6ekadchCjBdpdJd6IHN9u4,1265
4
+ move_data-0.1.5.dist-info/METADATA,sha256=VCXT8UkD3-vq3kG85iM_fV89o1eAK5s_SO1N3o9veX0,3620
5
+ move_data-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
6
+ move_data-0.1.5.dist-info/top_level.txt,sha256=x3YHlcqp8uYWyjg_u22_o646HG9BaUjGKVb9aWDS2FU,10
7
+ move_data-0.1.5.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ move_data