omnata-plugin-runtime 0.2.67__py3-none-any.whl → 0.2.69__py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
@@ -0,0 +1,295 @@
+#
+# Copyright (c) 2012-2023 Snowflake Computing Inc. All rights reserved.
+#
+
+from __future__ import annotations
+
+import collections.abc
+import os
+import warnings
+from functools import partial
+from logging import getLogger
+from tempfile import TemporaryDirectory
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Iterable,
+    Iterator,
+    Literal,
+    Sequence,
+    TypeVar,
+)
+
+from snowflake.connector import ProgrammingError
+from snowflake.connector.options import pandas
+from snowflake.connector.telemetry import TelemetryData, TelemetryField
+from snowflake.connector.util_text import random_string
+
+from snowflake.connector.connection import SnowflakeConnection
+from snowflake.connector.pandas_tools import (
+    _create_temp_file_format,
+    _create_temp_stage,
+    build_location_helper,
+    chunk_helper
+)
+
+logger = getLogger(__name__)
+
+
+def write_pandas(
+    conn: SnowflakeConnection,
+    df: pandas.DataFrame,
+    table_name: str,
+    database: str | None = None,
+    schema: str | None = None,
+    chunk_size: int | None = None,
+    compression: str = "gzip",
+    on_error: str = "abort_statement",
+    parallel: int = 4,
+    quote_identifiers: bool = True,
+    auto_create_table: bool = False,
+    create_temp_table: bool = False,
+    overwrite: bool = False,
+    table_type: Literal["", "temp", "temporary", "transient"] = "",
+    parquet_engine: Literal["auto", "pyarrow", "fastparquet"] = "auto",
+    **kwargs: Any,
+) -> tuple[
+    bool,
+    int,
+    int,
+    Sequence[
+        tuple[
+            str,
+            str,
+            int,
+            int,
+            int,
+            int,
+            str | None,
+            int | None,
+            int | None,
+            str | None,
+        ]
+    ],
+]:
+    """Allows users to most efficiently write back a pandas DataFrame to Snowflake.
+
+    It works by dumping the DataFrame into Parquet files, uploading them and finally copying their data into the table.
+
+    Returns whether all files were ingested correctly, number of chunks uploaded, and number of rows ingested
+    with all of the COPY INTO command's output for debugging purposes.
+
+    Example usage:
+        import pandas
+        from snowflake.connector.pandas_tools import write_pandas
+
+        df = pandas.DataFrame([('Mark', 10), ('Luke', 20)], columns=['name', 'balance'])
+        success, nchunks, nrows, _ = write_pandas(cnx, df, 'customers')
+
+    Args:
+        conn: Connection to be used to communicate with Snowflake.
+        df: Dataframe we'd like to write back.
+        table_name: Table name where we want to insert into.
+        database: Database schema and table is in, if not provided the default one will be used (Default value = None).
+        schema: Schema table is in, if not provided the default one will be used (Default value = None).
+        chunk_size: Number of elements to be inserted once, if not provided all elements will be dumped once
+            (Default value = None).
+        compression: The compression used on the Parquet files, can only be gzip, or snappy. Gzip gives supposedly a
+            better compression, while snappy is faster. Use whichever is more appropriate (Default value = 'gzip').
+        on_error: Action to take when COPY INTO statements fail, default follows documentation at:
+            https://docs.snowflake.com/en/sql-reference/sql/copy-into-table.html#copy-options-copyoptions
+            (Default value = 'abort_statement').
+        parallel: Number of threads to be used when uploading chunks, default follows documentation at:
+            https://docs.snowflake.com/en/sql-reference/sql/put.html#optional-parameters (Default value = 4).
+        quote_identifiers: By default, identifiers, specifically database, schema, table and column names
+            (from df.columns) will be quoted. If set to False, identifiers are passed on to Snowflake without quoting.
+            I.e. identifiers will be coerced to uppercase by Snowflake. (Default value = True)
+        auto_create_table: When true, will automatically create a table with corresponding columns for each column in
+            the passed in DataFrame. The table will not be created if it already exists
+        create_temp_table: (Deprecated) Will make the auto-created table as a temporary table
+        overwrite: When true, and if auto_create_table is true, then it drops the table. Otherwise, it
+            truncates the table. In both cases it will replace the existing contents of the table with that of the passed in
+            Pandas DataFrame.
+        table_type: The table type of to-be-created table. The supported table types include ``temp``/``temporary``
+            and ``transient``. Empty means permanent table as per SQL convention.
+
+    Returns:
+        Returns the COPY INTO command's results to verify ingestion in the form of a tuple of whether all chunks were
+        ingested correctly, # of chunks, # of ingested rows, and ingest's output.
+    """
+    if database is not None and schema is None:
+        raise ProgrammingError(
+            "Schema has to be provided to write_pandas when a database is provided"
+        )
+    # This dictionary maps the compression algorithm to Snowflake put copy into command type
+    # https://docs.snowflake.com/en/sql-reference/sql/copy-into-table.html#type-parquet
+    compression_map = {"gzip": "auto", "snappy": "snappy"}
+    if compression not in compression_map.keys():
+        raise ProgrammingError(
+            f"Invalid compression '{compression}', only acceptable values are: {compression_map.keys()}"
+        )
+
+    if create_temp_table:
+        warnings.warn(
+            "create_temp_table is deprecated, we still respect this parameter when it is True but "
+            'please consider using `table_type="temp"` instead',
+            DeprecationWarning,
+            # warnings.warn -> write_pandas
+            stacklevel=2,
+        )
+        table_type = "temp"
+
+    if table_type and table_type.lower() not in ["temp", "temporary", "transient"]:
+        raise ValueError(
+            "Unsupported table type. Expected table types: temp/temporary, transient"
+        )
+
+    if chunk_size is None:
+        chunk_size = len(df)
+
+    if not (
+        isinstance(df.index, pandas.RangeIndex)
+        and 1 == df.index.step
+        and 0 == df.index.start
+    ):
+        warnings.warn(
+            f"Pandas Dataframe has non-standard index of type {str(type(df.index))} which will not be written."
+            f" Consider changing the index to pd.RangeIndex(start=0,...,step=1) or "
+            f"call reset_index() to keep index as column(s)",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    cursor = conn.cursor()
+    stage_location = _create_temp_stage(
+        cursor,
+        database,
+        schema,
+        quote_identifiers,
+        compression,
+        auto_create_table,
+        overwrite,
+    )
+
+    with TemporaryDirectory() as tmp_folder:
+        for i, chunk in chunk_helper(df, chunk_size):
+            chunk_path = os.path.join(tmp_folder, f"file{i}.txt")
+            # Dump chunk into parquet file
+            chunk.to_parquet(chunk_path, compression=compression, engine=parquet_engine, **kwargs)
+            # Upload parquet file
+            upload_sql = (
+                "PUT /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
+                "'file://{path}' @{stage_location} PARALLEL={parallel}"
+            ).format(
+                path=chunk_path.replace("\\", "\\\\").replace("'", "\\'"),
+                stage_location=stage_location,
+                parallel=parallel,
+            )
+            logger.debug(f"uploading files with '{upload_sql}'")
+            cursor.execute(upload_sql, _is_internal=True)
+            # Remove chunk file
+            os.remove(chunk_path)
+
+    # in Snowflake, all parquet data is stored in a single column, $1, so we must select columns explicitly
+    # see (https://docs.snowflake.com/en/user-guide/script-data-load-transform-parquet.html)
+    if quote_identifiers:
+        quote = '"'
+        # if the column name contains a double quote, we need to escape it by replacing with two double quotes
+        # https://docs.snowflake.com/en/sql-reference/identifiers-syntax#double-quoted-identifiers
+        snowflake_column_names = [str(c).replace('"', '""') for c in df.columns]
+    else:
+        quote = ""
+        snowflake_column_names = list(df.columns)
+    columns = quote + f"{quote},{quote}".join(snowflake_column_names) + quote
+
+    def drop_object(name: str, object_type: str) -> None:
+        drop_sql = f"DROP {object_type.upper()} IF EXISTS {name} /* Python:snowflake.connector.pandas_tools.write_pandas() */"
+        logger.debug(f"dropping {object_type} with '{drop_sql}'")
+        cursor.execute(drop_sql, _is_internal=True)
+
+    if auto_create_table or overwrite:
+        file_format_location = _create_temp_file_format(
+            cursor, database, schema, quote_identifiers, compression_map[compression]
+        )
+        infer_schema_sql = f"SELECT COLUMN_NAME, TYPE FROM table(infer_schema(location=>'@{stage_location}', file_format=>'{file_format_location}'))"
+        logger.debug(f"inferring schema with '{infer_schema_sql}'")
+        column_type_mapping = dict(
+            cursor.execute(infer_schema_sql, _is_internal=True).fetchall()
+        )
+        # Infer schema can return the columns out of order depending on the chunking we do when uploading
+        # so we have to iterate through the dataframe columns to make sure we create the table with its
+        # columns in order
+        create_table_columns = ", ".join(
+            [
+                f"{quote}{snowflake_col}{quote} {column_type_mapping[col]}"
+                for snowflake_col, col in zip(snowflake_column_names, df.columns)
+            ]
+        )
+
+        target_table_location = build_location_helper(
+            database,
+            schema,
+            random_string() if overwrite else table_name,
+            quote_identifiers,
+        )
+
+        create_table_sql = (
+            f"CREATE {table_type.upper()} TABLE IF NOT EXISTS {target_table_location} "
+            f"({create_table_columns})"
+            f" /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
+        )
+        logger.debug(f"auto creating table with '{create_table_sql}'")
+        cursor.execute(create_table_sql, _is_internal=True)
+        # need explicit casting when the underlying table schema is inferred
+        parquet_columns = "$1:" + ",$1:".join(
+            f"{quote}{snowflake_col}{quote}::{column_type_mapping[col]}"
+            for snowflake_col, col in zip(snowflake_column_names, df.columns)
+        )
+    else:
+        target_table_location = build_location_helper(
+            database=database,
+            schema=schema,
+            name=table_name,
+            quote_identifiers=quote_identifiers,
+        )
+        parquet_columns = "$1:" + ",$1:".join(
+            f"{quote}{snowflake_col}{quote}" for snowflake_col in snowflake_column_names
+        )
+
+    try:
+        copy_into_sql = (
+            f"COPY INTO {target_table_location} /* Python:snowflake.connector.pandas_tools.write_pandas() */ "
+            f"({columns}) "
+            f"FROM (SELECT {parquet_columns} FROM @{stage_location}) "
+            f"FILE_FORMAT=(TYPE=PARQUET COMPRESSION={compression_map[compression]}{' BINARY_AS_TEXT=FALSE' if auto_create_table or overwrite else ''}) "
+            f"PURGE=TRUE ON_ERROR={on_error}"
+        )
+        logger.debug(f"copying into with '{copy_into_sql}'")
+        copy_results = cursor.execute(copy_into_sql, _is_internal=True).fetchall()
+
+        if overwrite:
+            original_table_location = build_location_helper(
+                database=database,
+                schema=schema,
+                name=table_name,
+                quote_identifiers=quote_identifiers,
+            )
+            drop_object(original_table_location, "table")
+            rename_table_sql = f"ALTER TABLE {target_table_location} RENAME TO {original_table_location} /* Python:snowflake.connector.pandas_tools.write_pandas() */"
+            logger.debug(f"rename table with '{rename_table_sql}'")
+            cursor.execute(rename_table_sql, _is_internal=True)
+    except ProgrammingError:
+        if overwrite:
+            drop_object(target_table_location, "table")
+        raise
+    finally:
+        cursor._log_telemetry_job_data(TelemetryField.PANDAS_WRITE, TelemetryData.TRUE)
+        cursor.close()
+
+    return (
+        all(e[1] == "LOADED" for e in copy_results),
+        len(copy_results),
+        sum(int(e[3]) for e in copy_results),
+        copy_results,
+    )
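
The file added above (presumably omnata_plugin_runtime/monkey_patching.py, going by the RECORD entry further down) is essentially a vendored copy of snowflake-connector-python's write_pandas with one addition: a parquet_engine parameter that is forwarded to DataFrame.to_parquet, so the caller can pin pyarrow or fastparquet instead of letting pandas pick one. A minimal usage sketch, assuming an already-open SnowflakeConnection named conn; the DataFrame and table name are illustrative:

    import pandas

    # assumes `conn` is an existing snowflake.connector connection
    from omnata_plugin_runtime.monkey_patching import write_pandas

    df = pandas.DataFrame([("Mark", 10), ("Luke", 20)], columns=["name", "balance"])
    success, nchunks, nrows, _ = write_pandas(
        conn,
        df,
        "customers",
        parquet_engine="fastparquet",  # new parameter in this version; default is "auto"
    )
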
@@ -67,6 +67,9 @@ from .rate_limiting import (
 logger = getLogger(__name__)
 SortDirectionType = Literal["asc", "desc"]
 
+import snowflake.connector.pandas_tools
+from .monkey_patching import write_pandas
+snowflake.connector.pandas_tools.write_pandas = write_pandas
 
 class PluginManifest(SubscriptableBaseModel):
     """
@@ -650,6 +653,7 @@ class OutboundSyncRequest(SyncRequest):
                 quote_identifiers=False,
                 table_name=self._full_results_table_name,
                 auto_create_table=False,
+                engine='fastparquet'
             )
             if not success:
                 raise ValueError(
@@ -1081,6 +1085,8 @@ class InboundSyncRequest(SyncRequest):
                 "RETRIEVE_DATE",
             ],
         )
+        # provide record data as a string so that parquet writing doesn't get tangled up trying to guess types
+        results_df["RECORD_DATA"] = results_df["RECORD_DATA"].astype(str)
         # trim out the columns we don't need to return
         return results_df[
             results_df.columns.intersection(
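
The astype(str) added above serialises RECORD_DATA before the DataFrame reaches write_pandas, so the parquet engine sees a plain string column rather than nested objects whose schema it would have to infer. A small illustration of the failure mode being avoided; the data and file name are made up:

    import pandas as pd

    records = pd.DataFrame(
        {"RECORD_DATA": [{"id": 1}, {"id": 2, "tags": ["a", "b"]}]}
    )
    # Unevenly-shaped nested objects force the parquet engine to guess a schema,
    # which can fail or come out differently from one chunk to the next.
    # Converting to strings first keeps the column a plain string column:
    records["RECORD_DATA"] = records["RECORD_DATA"].astype(str)
    records.to_parquet("records.parquet")  # path is illustrative
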
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: omnata-plugin-runtime
-Version: 0.2.67
+Version: 0.2.69
 Summary: Classes and common runtime components for building and running Omnata Plugins
 Author: James Weakley
 Author-email: james.weakley@omnata.com
@@ -3,10 +3,11 @@ omnata_plugin_runtime/api.py,sha256=vKq7goVPX5cPQ9CVN9l8RmpJDcqDwS5y9v1IWhWjBbk,
 omnata_plugin_runtime/configuration.py,sha256=NICse7qMtReIYY4ZCu8ng4QDJ4rFP0g3mZwc8m1Xl54,32247
 omnata_plugin_runtime/forms.py,sha256=_KqSMQG749wImLKxPZh3B3doTZMbP5jDvF6BhQNkPCM,17375
 omnata_plugin_runtime/logging.py,sha256=Q6eSqrr3SzwfVAg4r4sV1dlxeNS_PzOtZfieoWUEOZQ,3232
-omnata_plugin_runtime/omnata_plugin.py,sha256=0oqCFj4IGt_q-_nJSjk5QtQuK1BNcec2J9PmyzneD24,85265
+omnata_plugin_runtime/monkey_patching.py,sha256=ddYEdVBR_SlijNuKLGSRxxEboiOMcDjQ9lKv4KrahVc,12637
+omnata_plugin_runtime/omnata_plugin.py,sha256=VGTmRr4XYitkLMBPzHswKHUg_Ie0eFiEOUjwH1cXMRQ,85630
 omnata_plugin_runtime/plugin_entrypoints.py,sha256=dV_JOEWffdkAkofFYJMAsM_-gE9OpM2-paI2dsh5Rzo,24352
 omnata_plugin_runtime/rate_limiting.py,sha256=QO8VB1H8al6a8-ydohUmL0c5JynXG2bulmuPRs2-2-Y,14910
-omnata_plugin_runtime-0.2.67.dist-info/LICENSE,sha256=IMF9i4xIpgCADf0U-V1cuf9HBmqWQd3qtI3FSuyW4zE,26526
-omnata_plugin_runtime-0.2.67.dist-info/METADATA,sha256=U0-X5qtmLZgJobz2xNO4dCRpPY0YGdja_sRNBeFXrPo,1086
-omnata_plugin_runtime-0.2.67.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
-omnata_plugin_runtime-0.2.67.dist-info/RECORD,,
+omnata_plugin_runtime-0.2.69.dist-info/LICENSE,sha256=IMF9i4xIpgCADf0U-V1cuf9HBmqWQd3qtI3FSuyW4zE,26526
+omnata_plugin_runtime-0.2.69.dist-info/METADATA,sha256=-dAiaEMQ2YOx1TuL9BHNM8uiJm9JXOrNGti4FNtere4,1086
+omnata_plugin_runtime-0.2.69.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+omnata_plugin_runtime-0.2.69.dist-info/RECORD,,