omnata-plugin-runtime 0.2.67__py3-none-any.whl → 0.2.69__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- omnata_plugin_runtime/monkey_patching.py +295 -0
- omnata_plugin_runtime/omnata_plugin.py +6 -0
- {omnata_plugin_runtime-0.2.67.dist-info → omnata_plugin_runtime-0.2.69.dist-info}/METADATA +1 -1
- {omnata_plugin_runtime-0.2.67.dist-info → omnata_plugin_runtime-0.2.69.dist-info}/RECORD +6 -5
- {omnata_plugin_runtime-0.2.67.dist-info → omnata_plugin_runtime-0.2.69.dist-info}/LICENSE +0 -0
- {omnata_plugin_runtime-0.2.67.dist-info → omnata_plugin_runtime-0.2.69.dist-info}/WHEEL +0 -0
omnata_plugin_runtime/monkey_patching.py (new file):

```diff
@@ -0,0 +1,295 @@
+#
+# Copyright (c) 2012-2023 Snowflake Computing Inc. All rights reserved.
+#
+
+from __future__ import annotations
+
+import collections.abc
+import os
+import warnings
+from functools import partial
+from logging import getLogger
+from tempfile import TemporaryDirectory
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Iterable,
+    Iterator,
+    Literal,
+    Sequence,
+    TypeVar,
+)
+
+from snowflake.connector import ProgrammingError
+from snowflake.connector.options import pandas
+from snowflake.connector.telemetry import TelemetryData, TelemetryField
+from snowflake.connector.util_text import random_string
+
+from snowflake.connector.connection import SnowflakeConnection
+from snowflake.connector.pandas_tools import (
+    _create_temp_file_format,
+    _create_temp_stage,
+    build_location_helper,
+    chunk_helper
+)
+
+logger = getLogger(__name__)
+
+
+def write_pandas(
+    conn: SnowflakeConnection,
+    df: pandas.DataFrame,
+    table_name: str,
+    database: str | None = None,
+    schema: str | None = None,
+    chunk_size: int | None = None,
+    compression: str = "gzip",
+    on_error: str = "abort_statement",
+    parallel: int = 4,
+    quote_identifiers: bool = True,
+    auto_create_table: bool = False,
+    create_temp_table: bool = False,
+    overwrite: bool = False,
+    table_type: Literal["", "temp", "temporary", "transient"] = "",
+    parquet_engine: Literal["auto", "pyarrow", "fastparquet"] = "auto",
+    **kwargs: Any,
+) -> tuple[
+    bool,
+    int,
+    int,
+    Sequence[
+        tuple[
+            str,
+            str,
+            int,
+            int,
+            int,
+            int,
+            str | None,
+            int | None,
+            int | None,
+            str | None,
+        ]
+    ],
+]:
+    """Allows users to most efficiently write back a pandas DataFrame to Snowflake.
+
+    It works by dumping the DataFrame into Parquet files, uploading them and finally copying their data into the table.
+
+    Returns whether all files were ingested correctly, number of chunks uploaded, and number of rows ingested
+    with all of the COPY INTO command's output for debugging purposes.
+
+        Example usage:
+            import pandas
+            from snowflake.connector.pandas_tools import write_pandas
+
+            df = pandas.DataFrame([('Mark', 10), ('Luke', 20)], columns=['name', 'balance'])
+            success, nchunks, nrows, _ = write_pandas(cnx, df, 'customers')
+
+    Args:
+        conn: Connection to be used to communicate with Snowflake.
+        df: Dataframe we'd like to write back.
+        table_name: Table name where we want to insert into.
+        database: Database schema and table is in, if not provided the default one will be used (Default value = None).
+        schema: Schema table is in, if not provided the default one will be used (Default value = None).
+        chunk_size: Number of elements to be inserted once, if not provided all elements will be dumped once
+            (Default value = None).
+        compression: The compression used on the Parquet files, can only be gzip, or snappy. Gzip gives supposedly a
+            better compression, while snappy is faster. Use whichever is more appropriate (Default value = 'gzip').
+        on_error: Action to take when COPY INTO statements fail, default follows documentation at:
+            https://docs.snowflake.com/en/sql-reference/sql/copy-into-table.html#copy-options-copyoptions
+            (Default value = 'abort_statement').
+        parallel: Number of threads to be used when uploading chunks, default follows documentation at:
+            https://docs.snowflake.com/en/sql-reference/sql/put.html#optional-parameters (Default value = 4).
+        quote_identifiers: By default, identifiers, specifically database, schema, table and column names
+            (from df.columns) will be quoted. If set to False, identifiers are passed on to Snowflake without quoting.
+            I.e. identifiers will be coerced to uppercase by Snowflake. (Default value = True)
+        auto_create_table: When true, will automatically create a table with corresponding columns for each column in
+            the passed in DataFrame. The table will not be created if it already exists
+        create_temp_table: (Deprecated) Will make the auto-created table as a temporary table
+        overwrite: When true, and if auto_create_table is true, then it drops the table. Otherwise, it
+            truncates the table. In both cases it will replace the existing contents of the table with that of the passed in
+            Pandas DataFrame.
+        table_type: The table type of to-be-created table. The supported table types include ``temp``/``temporary``
+            and ``transient``. Empty means permanent table as per SQL convention.
+
+    Returns:
+        Returns the COPY INTO command's results to verify ingestion in the form of a tuple of whether all chunks were
+        ingested correctly, # of chunks, # of ingested rows, and ingest's output.
+    """
+    if database is not None and schema is None:
+        raise ProgrammingError(
+            "Schema has to be provided to write_pandas when a database is provided"
+        )
+    # This dictionary maps the compression algorithm to Snowflake put copy into command type
+    # https://docs.snowflake.com/en/sql-reference/sql/copy-into-table.html#type-parquet
+    compression_map = {"gzip": "auto", "snappy": "snappy"}
+    if compression not in compression_map.keys():
+        raise ProgrammingError(
+            f"Invalid compression '{compression}', only acceptable values are: {compression_map.keys()}"
+        )
+
+    if create_temp_table:
+        warnings.warn(
+            "create_temp_table is deprecated, we still respect this parameter when it is True but "
+            'please consider using `table_type="temp"` instead',
+            DeprecationWarning,
+            # warnings.warn -> write_pandas
+            stacklevel=2,
+        )
+        table_type = "temp"
+
+    if table_type and table_type.lower() not in ["temp", "temporary", "transient"]:
+        raise ValueError(
+            "Unsupported table type. Expected table types: temp/temporary, transient"
+        )
+
+    if chunk_size is None:
+        chunk_size = len(df)
+
+    if not (
+        isinstance(df.index, pandas.RangeIndex)
+        and 1 == df.index.step
+        and 0 == df.index.start
+    ):
+        warnings.warn(
+            f"Pandas Dataframe has non-standard index of type {str(type(df.index))} which will not be written."
+            f" Consider changing the index to pd.RangeIndex(start=0,...,step=1) or "
+            f"call reset_index() to keep index as column(s)",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    cursor = conn.cursor()
+    stage_location = _create_temp_stage(
+        cursor,
+        database,
+        schema,
+        quote_identifiers,
+        compression,
+        auto_create_table,
+        overwrite,
+    )
+
+    with TemporaryDirectory() as tmp_folder:
+        for i, chunk in chunk_helper(df, chunk_size):
+            chunk_path = os.path.join(tmp_folder, f"file{i}.txt")
+            # Dump chunk into parquet file
+            chunk.to_parquet(chunk_path, compression=compression, engine=parquet_engine, **kwargs)
+            # Upload parquet file
+            upload_sql = (
+                "PUT /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
+                "'file://{path}' @{stage_location} PARALLEL={parallel}"
+            ).format(
+                path=chunk_path.replace("\\", "\\\\").replace("'", "\\'"),
+                stage_location=stage_location,
+                parallel=parallel,
+            )
+            logger.debug(f"uploading files with '{upload_sql}'")
+            cursor.execute(upload_sql, _is_internal=True)
+            # Remove chunk file
+            os.remove(chunk_path)
+
+    # in Snowflake, all parquet data is stored in a single column, $1, so we must select columns explicitly
+    # see (https://docs.snowflake.com/en/user-guide/script-data-load-transform-parquet.html)
+    if quote_identifiers:
+        quote = '"'
+        # if the column name contains a double quote, we need to escape it by replacing with two double quotes
+        # https://docs.snowflake.com/en/sql-reference/identifiers-syntax#double-quoted-identifiers
+        snowflake_column_names = [str(c).replace('"', '""') for c in df.columns]
+    else:
+        quote = ""
+        snowflake_column_names = list(df.columns)
+    columns = quote + f"{quote},{quote}".join(snowflake_column_names) + quote
+
+    def drop_object(name: str, object_type: str) -> None:
+        drop_sql = f"DROP {object_type.upper()} IF EXISTS {name} /* Python:snowflake.connector.pandas_tools.write_pandas() */"
+        logger.debug(f"dropping {object_type} with '{drop_sql}'")
+        cursor.execute(drop_sql, _is_internal=True)
+
+    if auto_create_table or overwrite:
+        file_format_location = _create_temp_file_format(
+            cursor, database, schema, quote_identifiers, compression_map[compression]
+        )
+        infer_schema_sql = f"SELECT COLUMN_NAME, TYPE FROM table(infer_schema(location=>'@{stage_location}', file_format=>'{file_format_location}'))"
+        logger.debug(f"inferring schema with '{infer_schema_sql}'")
+        column_type_mapping = dict(
+            cursor.execute(infer_schema_sql, _is_internal=True).fetchall()
+        )
+        # Infer schema can return the columns out of order depending on the chunking we do when uploading
+        # so we have to iterate through the dataframe columns to make sure we create the table with its
+        # columns in order
+        create_table_columns = ", ".join(
+            [
+                f"{quote}{snowflake_col}{quote} {column_type_mapping[col]}"
+                for snowflake_col, col in zip(snowflake_column_names, df.columns)
+            ]
+        )
+
+        target_table_location = build_location_helper(
+            database,
+            schema,
+            random_string() if overwrite else table_name,
+            quote_identifiers,
+        )
+
+        create_table_sql = (
+            f"CREATE {table_type.upper()} TABLE IF NOT EXISTS {target_table_location} "
+            f"({create_table_columns})"
+            f" /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
+        )
+        logger.debug(f"auto creating table with '{create_table_sql}'")
+        cursor.execute(create_table_sql, _is_internal=True)
+        # need explicit casting when the underlying table schema is inferred
+        parquet_columns = "$1:" + ",$1:".join(
+            f"{quote}{snowflake_col}{quote}::{column_type_mapping[col]}"
+            for snowflake_col, col in zip(snowflake_column_names, df.columns)
+        )
+    else:
+        target_table_location = build_location_helper(
+            database=database,
+            schema=schema,
+            name=table_name,
+            quote_identifiers=quote_identifiers,
+        )
+        parquet_columns = "$1:" + ",$1:".join(
+            f"{quote}{snowflake_col}{quote}" for snowflake_col in snowflake_column_names
+        )
+
+    try:
+        copy_into_sql = (
+            f"COPY INTO {target_table_location} /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
+            f"({columns}) "
+            f"FROM (SELECT {parquet_columns} FROM @{stage_location}) "
+            f"FILE_FORMAT=(TYPE=PARQUET COMPRESSION={compression_map[compression]}{' BINARY_AS_TEXT=FALSE' if auto_create_table or overwrite else ''}) "
+            f"PURGE=TRUE ON_ERROR={on_error}"
+        )
+        logger.debug(f"copying into with '{copy_into_sql}'")
+        copy_results = cursor.execute(copy_into_sql, _is_internal=True).fetchall()
+
+        if overwrite:
+            original_table_location = build_location_helper(
+                database=database,
+                schema=schema,
+                name=table_name,
+                quote_identifiers=quote_identifiers,
+            )
+            drop_object(original_table_location, "table")
+            rename_table_sql = f"ALTER TABLE {target_table_location} RENAME TO {original_table_location} /* Python:snowflake.connector.pandas_tools.write_pandas() */"
+            logger.debug(f"rename table with '{rename_table_sql}'")
+            cursor.execute(rename_table_sql, _is_internal=True)
+    except ProgrammingError:
+        if overwrite:
+            drop_object(target_table_location, "table")
+        raise
+    finally:
+        cursor._log_telemetry_job_data(TelemetryField.PANDAS_WRITE, TelemetryData.TRUE)
+        cursor.close()
+
+    return (
+        all(e[1] == "LOADED" for e in copy_results),
+        len(copy_results),
+        sum(int(e[3]) for e in copy_results),
+        copy_results,
+    )
```
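As a point of reference, a minimal usage sketch of the vendored function above might look like the following; the connection arguments and table name are placeholders rather than anything from this package, and it assumes fastparquet is installed. The visible difference from the upstream snowflake-connector-python signature is the added parquet_engine parameter, which is forwarded to DataFrame.to_parquet for each chunk.

```python
# Hypothetical usage sketch; connection details and table name are placeholders.
import pandas as pd
import snowflake.connector

from omnata_plugin_runtime.monkey_patching import write_pandas

conn = snowflake.connector.connect(account="...", user="...", password="...")
df = pd.DataFrame([("Mark", 10), ("Luke", 20)], columns=["NAME", "BALANCE"])

# parquet_engine is the parameter this vendored copy adds; it is passed straight
# through to DataFrame.to_parquet when each chunk is written to a local file.
success, nchunks, nrows, _ = write_pandas(
    conn,
    df,
    "CUSTOMERS",
    auto_create_table=True,
    parquet_engine="fastparquet",  # assumes fastparquet is available in the runtime
)
```

The rest of the call pattern matches the upstream function as vendored above.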
omnata_plugin_runtime/omnata_plugin.py:

```diff
@@ -67,6 +67,9 @@ from .rate_limiting import (
 logger = getLogger(__name__)
 SortDirectionType = Literal["asc", "desc"]
 
+import snowflake.connector.pandas_tools
+from .monkey_patching import write_pandas
+snowflake.connector.pandas_tools.write_pandas = write_pandas
 
 class PluginManifest(SubscriptableBaseModel):
     """
```
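The three added lines replace the write_pandas attribute on snowflake.connector.pandas_tools as a side effect of importing omnata_plugin. A short illustrative sketch (the caller below is hypothetical, not part of the package): any code that resolves the function through the module at call time picks up the patched version, whereas a from-import bound before the patch keeps the original function.

```python
# Illustrative only: how attribute-level monkey patching is seen by callers.
import snowflake.connector.pandas_tools as pandas_tools


def save_frame(conn, df):
    # Looked up through the module at call time, so this resolves to the
    # patched write_pandas once omnata_plugin has been imported.
    # "MY_TABLE" is a placeholder table name.
    return pandas_tools.write_pandas(conn, df, "MY_TABLE", quote_identifiers=False)
```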
omnata_plugin_runtime/omnata_plugin.py (continued):

```diff
@@ -650,6 +653,7 @@ class OutboundSyncRequest(SyncRequest):
             quote_identifiers=False,
             table_name=self._full_results_table_name,
             auto_create_table=False,
+            engine='fastparquet'
         )
         if not success:
             raise ValueError(
```
omnata_plugin_runtime/omnata_plugin.py (continued):

```diff
@@ -1081,6 +1085,8 @@ class InboundSyncRequest(SyncRequest):
                 "RETRIEVE_DATE",
             ],
         )
+        # provide record data as a string so that parquet writing doesn't get tangled up trying to guess types
+        results_df["RECORD_DATA"] = results_df["RECORD_DATA"].astype(str)
         # trim out the columns we don't need to return
         return results_df[
             results_df.columns.intersection(
```
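The cast added above stops the Parquet engine from having to infer a type for RECORD_DATA, which can hold dicts, lists, or nulls. An illustrative, self-contained version of the same idea, using made-up values and assuming fastparquet is installed:

```python
# Illustrative only: mixed/nested values in an object column force the parquet
# engine to guess a type; casting to str first yields a plain string column.
import pandas as pd

results_df = pd.DataFrame({"RECORD_DATA": [{"id": 1}, ["a", "b"], None]})
results_df["RECORD_DATA"] = results_df["RECORD_DATA"].astype(str)
results_df.to_parquet("records.parquet", engine="fastparquet")  # assumes fastparquet is available
```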
{omnata_plugin_runtime-0.2.67.dist-info → omnata_plugin_runtime-0.2.69.dist-info}/RECORD:

```diff
@@ -3,10 +3,11 @@ omnata_plugin_runtime/api.py,sha256=vKq7goVPX5cPQ9CVN9l8RmpJDcqDwS5y9v1IWhWjBbk,
 omnata_plugin_runtime/configuration.py,sha256=NICse7qMtReIYY4ZCu8ng4QDJ4rFP0g3mZwc8m1Xl54,32247
 omnata_plugin_runtime/forms.py,sha256=_KqSMQG749wImLKxPZh3B3doTZMbP5jDvF6BhQNkPCM,17375
 omnata_plugin_runtime/logging.py,sha256=Q6eSqrr3SzwfVAg4r4sV1dlxeNS_PzOtZfieoWUEOZQ,3232
-omnata_plugin_runtime/
+omnata_plugin_runtime/monkey_patching.py,sha256=ddYEdVBR_SlijNuKLGSRxxEboiOMcDjQ9lKv4KrahVc,12637
+omnata_plugin_runtime/omnata_plugin.py,sha256=VGTmRr4XYitkLMBPzHswKHUg_Ie0eFiEOUjwH1cXMRQ,85630
 omnata_plugin_runtime/plugin_entrypoints.py,sha256=dV_JOEWffdkAkofFYJMAsM_-gE9OpM2-paI2dsh5Rzo,24352
 omnata_plugin_runtime/rate_limiting.py,sha256=QO8VB1H8al6a8-ydohUmL0c5JynXG2bulmuPRs2-2-Y,14910
-omnata_plugin_runtime-0.2.
-omnata_plugin_runtime-0.2.
-omnata_plugin_runtime-0.2.
-omnata_plugin_runtime-0.2.
+omnata_plugin_runtime-0.2.69.dist-info/LICENSE,sha256=IMF9i4xIpgCADf0U-V1cuf9HBmqWQd3qtI3FSuyW4zE,26526
+omnata_plugin_runtime-0.2.69.dist-info/METADATA,sha256=-dAiaEMQ2YOx1TuL9BHNM8uiJm9JXOrNGti4FNtere4,1086
+omnata_plugin_runtime-0.2.69.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+omnata_plugin_runtime-0.2.69.dist-info/RECORD,,
```
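For readers comparing the hashes, each RECORD row is the path, then sha256= followed by the URL-safe base64 of the file's SHA-256 digest with padding stripped, then the file size in bytes. A small sketch of how such an entry could be produced (record_entry is a hypothetical helper, not shipped in the wheel):

```python
# Sketch of building a RECORD-style entry for a file:
# urlsafe base64 of the SHA-256 digest, trailing '=' stripped, then the byte size.
import base64
import hashlib
import os


def record_entry(path: str) -> str:
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    b64 = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
    return f"{path},sha256={b64},{os.path.getsize(path)}"
```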
{omnata_plugin_runtime-0.2.67.dist-info → omnata_plugin_runtime-0.2.69.dist-info}/LICENSE: file without changes
{omnata_plugin_runtime-0.2.67.dist-info → omnata_plugin_runtime-0.2.69.dist-info}/WHEEL: file without changes