fakesnow 0.9.22__py3-none-any.whl → 0.9.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fakesnow/arrow.py +38 -3
- fakesnow/conn.py +175 -0
- fakesnow/cursor.py +463 -0
- fakesnow/fakes.py +3 -750
- fakesnow/pandas_tools.py +77 -0
- fakesnow/server.py +5 -11
- fakesnow/types.py +89 -0
- {fakesnow-0.9.22.dist-info → fakesnow-0.9.23.dist-info}/METADATA +2 -1
- {fakesnow-0.9.22.dist-info → fakesnow-0.9.23.dist-info}/RECORD +13 -9
- {fakesnow-0.9.22.dist-info → fakesnow-0.9.23.dist-info}/LICENSE +0 -0
- {fakesnow-0.9.22.dist-info → fakesnow-0.9.23.dist-info}/WHEEL +0 -0
- {fakesnow-0.9.22.dist-info → fakesnow-0.9.23.dist-info}/entry_points.txt +0 -0
- {fakesnow-0.9.22.dist-info → fakesnow-0.9.23.dist-info}/top_level.txt +0 -0
fakesnow/arrow.py
CHANGED
@@ -1,3 +1,5 @@
+from typing import Any
+
 import pyarrow as pa
 
 
@@ -8,10 +10,21 @@ def with_sf_metadata(schema: pa.Schema) -> pa.Schema:
     for i, t in enumerate(schema.types):
         f = schema.field(i)
 
-        if isinstance(t, pa.Decimal128Type):
+        # TODO: precision, scale, charLength etc. for all types
+
+        if t == pa.bool_():
+            fm = f.with_metadata({"logicalType": "BOOLEAN"})
+        elif t == pa.int64():
+            # scale and precision required, see here
+            # https://github.com/snowflakedb/snowflake-connector-python/blob/416ff57/src/snowflake/connector/nanoarrow_cpp/ArrowIterator/CArrowChunkIterator.cpp#L147
+            fm = f.with_metadata({"logicalType": "FIXED", "precision": "38", "scale": "0"})
+        elif t == pa.float64():
+            fm = f.with_metadata({"logicalType": "REAL"})
+        elif isinstance(t, pa.Decimal128Type):
             fm = f.with_metadata({"logicalType": "FIXED", "precision": str(t.precision), "scale": str(t.scale)})
         elif t == pa.string():
-            fm = f.with_metadata({"logicalType": "TEXT"})
+            # TODO: set charLength to size of column
+            fm = f.with_metadata({"logicalType": "TEXT", "charLength": "16777216"})
         else:
             raise NotImplementedError(f"Unsupported Arrow type: {t}")
         fms.append(fm)
@@ -26,7 +39,29 @@ def to_ipc(table: pa.Table) -> pa.Buffer:
 
     sink = pa.BufferOutputStream()
 
-    with pa.ipc.new_stream(sink, with_sf_metadata(batch.schema)) as writer:
+    with pa.ipc.new_stream(sink, with_sf_metadata(table.schema)) as writer:
         writer.write_batch(batch)
 
     return sink.getvalue()
+
+
+# TODO: should this be derived before with_schema?
+def to_rowtype(schema: pa.Schema) -> list[dict[str, Any]]:
+    return [
+        {
+            "name": f.name,
+            # TODO
+            # "database": "",
+            # "schema": "",
+            # "table": "",
+            "nullable": f.nullable,
+            "type": f.metadata.get(b"logicalType").decode("utf-8").lower(),  # type: ignore
+            # TODO
+            # "byteLength": 20,
+            "length": int(f.metadata.get(b"charLength")) if f.metadata.get(b"charLength") else None,  # type: ignore
+            "scale": int(f.metadata.get(b"scale")) if f.metadata.get(b"scale") else None,  # type: ignore
+            "precision": int(f.metadata.get(b"precision")) if f.metadata.get(b"precision") else None,  # type: ignore
+            "collation": None,
+        }
+        for f in schema
+    ]
fakesnow/conn.py
ADDED
@@ -0,0 +1,175 @@
+from __future__ import annotations
+
+import json
+import os
+from collections.abc import Iterable
+from pathlib import Path
+from types import TracebackType
+from typing import Any
+
+import pandas as pd
+import snowflake.connector.converter
+import snowflake.connector.errors
+import sqlglot
+from duckdb import DuckDBPyConnection
+from snowflake.connector.cursor import DictCursor, SnowflakeCursor
+from sqlglot import exp
+from typing_extensions import Self
+
+import fakesnow.info_schema as info_schema
+import fakesnow.macros as macros
+from fakesnow.cursor import FakeSnowflakeCursor
+from fakesnow.variables import Variables
+
+
+class FakeSnowflakeConnection:
+    def __init__(
+        self,
+        duck_conn: DuckDBPyConnection,
+        database: str | None = None,
+        schema: str | None = None,
+        create_database: bool = True,
+        create_schema: bool = True,
+        db_path: str | os.PathLike | None = None,
+        nop_regexes: list[str] | None = None,
+        *args: Any,
+        **kwargs: Any,
+    ):
+        self._duck_conn = duck_conn
+        self._is_closed = False
+        # upper case database and schema like snowflake unquoted identifiers
+        # so they appear as upper-cased in information_schema
+        # catalog and schema names are not actually case-sensitive in duckdb even though
+        # they are as cased in information_schema.schemata, so when selecting from
+        # information_schema.schemata below we use upper-case to match any existing duckdb
+        # catalog or schemas like "information_schema"
+        self.database = database and database.upper()
+        self.schema = schema and schema.upper()
+
+        self.database_set = False
+        self.schema_set = False
+        self.db_path = Path(db_path) if db_path else None
+        self.nop_regexes = nop_regexes
+        self._paramstyle = snowflake.connector.paramstyle
+        self.variables = Variables()
+
+        # create database if needed
+        if (
+            create_database
+            and self.database
+            and not duck_conn.execute(
+                f"""select * from information_schema.schemata
+                where upper(catalog_name) = '{self.database}'"""
+            ).fetchone()
+        ):
+            db_file = f"{self.db_path/self.database}.db" if self.db_path else ":memory:"
+            duck_conn.execute(f"ATTACH DATABASE '{db_file}' AS {self.database}")
+            duck_conn.execute(info_schema.creation_sql(self.database))
+            duck_conn.execute(macros.creation_sql(self.database))
+
+        # create schema if needed
+        if (
+            create_schema
+            and self.database
+            and self.schema
+            and not duck_conn.execute(
+                f"""select * from information_schema.schemata
+                where upper(catalog_name) = '{self.database}' and upper(schema_name) = '{self.schema}'"""
+            ).fetchone()
+        ):
+            duck_conn.execute(f"CREATE SCHEMA {self.database}.{self.schema}")
+
+        # set database and schema if both exist
+        if (
+            self.database
+            and self.schema
+            and duck_conn.execute(
+                f"""select * from information_schema.schemata
+                where upper(catalog_name) = '{self.database}' and upper(schema_name) = '{self.schema}'"""
+            ).fetchone()
+        ):
+            duck_conn.execute(f"SET schema='{self.database}.{self.schema}'")
+            self.database_set = True
+            self.schema_set = True
+        # set database if only that exists
+        elif (
+            self.database
+            and duck_conn.execute(
+                f"""select * from information_schema.schemata
+                where upper(catalog_name) = '{self.database}'"""
+            ).fetchone()
+        ):
+            duck_conn.execute(f"SET schema='{self.database}.main'")
+            self.database_set = True
+
+        # use UTC instead of local time zone for consistent testing
+        duck_conn.execute("SET GLOBAL TimeZone = 'UTC'")
+
+    def __enter__(self) -> Self:
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        traceback: TracebackType | None,
+    ) -> None:
+        pass
+
+    def close(self, retry: bool = True) -> None:
+        self._duck_conn.close()
+        self._is_closed = True
+
+    def commit(self) -> None:
+        self.cursor().execute("COMMIT")
+
+    def cursor(self, cursor_class: type[SnowflakeCursor] = SnowflakeCursor) -> FakeSnowflakeCursor:
+        # TODO: use duck_conn cursor for thread-safety
+        return FakeSnowflakeCursor(conn=self, duck_conn=self._duck_conn, use_dict_result=cursor_class == DictCursor)
+
+    def execute_string(
+        self,
+        sql_text: str,
+        remove_comments: bool = False,
+        return_cursors: bool = True,
+        cursor_class: type[SnowflakeCursor] = SnowflakeCursor,
+        **kwargs: dict[str, Any],
+    ) -> Iterable[FakeSnowflakeCursor]:
+        cursors = [
+            self.cursor(cursor_class).execute(e.sql(dialect="snowflake"))
+            for e in sqlglot.parse(sql_text, read="snowflake")
+            if e and not isinstance(e, exp.Semicolon)  # ignore comments
+        ]
+        return cursors if return_cursors else []
+
+    def is_closed(self) -> bool:
+        return self._is_closed
+
+    def rollback(self) -> None:
+        self.cursor().execute("ROLLBACK")
+
+    def _insert_df(self, df: pd.DataFrame, table_name: str) -> int:
+        # Objects in dataframes are written as parquet structs, and snowflake loads parquet structs as json strings.
+        # Whereas duckdb analyses a dataframe see https://duckdb.org/docs/api/python/data_ingestion.html#pandas-dataframes--object-columns
+        # and converts a object to the most specific type possible, eg: dict -> STRUCT, MAP or varchar, and list -> LIST
+        # For dicts see https://github.com/duckdb/duckdb/pull/3985 and https://github.com/duckdb/duckdb/issues/9510
+        #
+        # When the rows have dicts with different keys there isn't a single STRUCT that can cover them, so the type is
+        # varchar and value a string containing a struct representation. In order to support dicts with different keys
+        # we first convert the dicts to json strings. A pity we can't do something inside duckdb and avoid the dataframe
+        # copy and transform in python.
+
+        df = df.copy()
+
+        # Identify columns of type object
+        object_cols = df.select_dtypes(include=["object"]).columns
+
+        # Apply json.dumps to these columns
+        for col in object_cols:
+            # don't jsonify string
+            df[col] = df[col].apply(lambda x: json.dumps(x) if isinstance(x, (dict, list)) else x)
+
+        escaped_cols = ",".join(f'"{col}"' for col in df.columns.to_list())
+        self._duck_conn.execute(f"INSERT INTO {table_name}({escaped_cols}) SELECT * FROM df")
+
+        return self._duck_conn.fetchall()[0][0]
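
Usage sketch (not part of the diff): the class extracted here from fakes.py can be driven directly against an in-memory DuckDB connection. In normal use fakesnow.patch() wires this up for you; the database and schema names below are illustrative (they are upper-cased like Snowflake unquoted identifiers), and the sketch assumes the cursor mirrors the Snowflake connector's execute-returns-cursor behaviour.

import duckdb

from fakesnow.conn import FakeSnowflakeConnection

# creates and attaches DB1, creates SCHEMA1, then sets DB1.SCHEMA1 as the current schema
conn = FakeSnowflakeConnection(duckdb.connect(":memory:"), database="DB1", schema="SCHEMA1")

# execute_string splits statements via sqlglot and returns one cursor per statement
conn.execute_string("CREATE TABLE t (id INT); INSERT INTO t VALUES (1);")

print(conn.cursor().execute("SELECT * FROM t").fetchall())  # [(1,)]
conn.close()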