fakesnow 0.9.21__py3-none-any.whl → 0.9.23__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
fakesnow/arrow.py CHANGED
@@ -1,3 +1,5 @@
+from typing import Any
+
 import pyarrow as pa
 
 
@@ -8,10 +10,21 @@ def with_sf_metadata(schema: pa.Schema) -> pa.Schema:
     for i, t in enumerate(schema.types):
         f = schema.field(i)
 
-        if isinstance(t, pa.Decimal128Type):
+        # TODO: precision, scale, charLength etc. for all types
+
+        if t == pa.bool_():
+            fm = f.with_metadata({"logicalType": "BOOLEAN"})
+        elif t == pa.int64():
+            # scale and precision required, see here
+            # https://github.com/snowflakedb/snowflake-connector-python/blob/416ff57/src/snowflake/connector/nanoarrow_cpp/ArrowIterator/CArrowChunkIterator.cpp#L147
+            fm = f.with_metadata({"logicalType": "FIXED", "precision": "38", "scale": "0"})
+        elif t == pa.float64():
+            fm = f.with_metadata({"logicalType": "REAL"})
+        elif isinstance(t, pa.Decimal128Type):
             fm = f.with_metadata({"logicalType": "FIXED", "precision": str(t.precision), "scale": str(t.scale)})
         elif t == pa.string():
-            fm = f.with_metadata({"logicalType": "TEXT"})
+            # TODO: set charLength to size of column
+            fm = f.with_metadata({"logicalType": "TEXT", "charLength": "16777216"})
         else:
             raise NotImplementedError(f"Unsupported Arrow type: {t}")
         fms.append(fm)
@@ -26,7 +39,29 @@ def to_ipc(table: pa.Table) -> pa.Buffer:
 
     sink = pa.BufferOutputStream()
 
-    with pa.ipc.new_stream(sink, with_sf_metadata(batch.schema)) as writer:
+    with pa.ipc.new_stream(sink, with_sf_metadata(table.schema)) as writer:
         writer.write_batch(batch)
 
     return sink.getvalue()
+
+
+# TODO: should this be derived before with_schema?
+def to_rowtype(schema: pa.Schema) -> list[dict[str, Any]]:
+    return [
+        {
+            "name": f.name,
+            # TODO
+            # "database": "",
+            # "schema": "",
+            # "table": "",
+            "nullable": f.nullable,
+            "type": f.metadata.get(b"logicalType").decode("utf-8").lower(),  # type: ignore
+            # TODO
+            # "byteLength": 20,
+            "length": int(f.metadata.get(b"charLength")) if f.metadata.get(b"charLength") else None,  # type: ignore
+            "scale": int(f.metadata.get(b"scale")) if f.metadata.get(b"scale") else None,  # type: ignore
+            "precision": int(f.metadata.get(b"precision")) if f.metadata.get(b"precision") else None,  # type: ignore
+            "collation": None,
+        }
+        for f in schema
+    ]
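
For context (not part of the published diff), a minimal sketch of what the new type mapping produces, assuming with_sf_metadata and to_rowtype are imported from fakesnow.arrow as defined above:

import pyarrow as pa

from fakesnow.arrow import to_rowtype, with_sf_metadata

# int64 columns now map to FIXED with the default precision/scale, and
# string columns to TEXT with the maximum charLength.
schema = with_sf_metadata(pa.schema([("id", pa.int64()), ("name", pa.string())]))

# to_rowtype reads that field metadata back into Snowflake-style rowtype dicts.
for row in to_rowtype(schema):
    print(row["name"], row["type"], row["precision"], row["scale"], row["length"])
# id fixed 38 0 None
# name text None None 16777216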
fakesnow/conn.py ADDED
@@ -0,0 +1,175 @@
+from __future__ import annotations
+
+import json
+import os
+from collections.abc import Iterable
+from pathlib import Path
+from types import TracebackType
+from typing import Any
+
+import pandas as pd
+import snowflake.connector.converter
+import snowflake.connector.errors
+import sqlglot
+from duckdb import DuckDBPyConnection
+from snowflake.connector.cursor import DictCursor, SnowflakeCursor
+from sqlglot import exp
+from typing_extensions import Self
+
+import fakesnow.info_schema as info_schema
+import fakesnow.macros as macros
+from fakesnow.cursor import FakeSnowflakeCursor
+from fakesnow.variables import Variables
+
+
+class FakeSnowflakeConnection:
+    def __init__(
+        self,
+        duck_conn: DuckDBPyConnection,
+        database: str | None = None,
+        schema: str | None = None,
+        create_database: bool = True,
+        create_schema: bool = True,
+        db_path: str | os.PathLike | None = None,
+        nop_regexes: list[str] | None = None,
+        *args: Any,
+        **kwargs: Any,
+    ):
+        self._duck_conn = duck_conn
+        self._is_closed = False
+        # upper case database and schema like snowflake unquoted identifiers
+        # so they appear as upper-cased in information_schema
+        # catalog and schema names are not actually case-sensitive in duckdb even though
+        # they are as cased in information_schema.schemata, so when selecting from
+        # information_schema.schemata below we use upper-case to match any existing duckdb
+        # catalog or schemas like "information_schema"
+        self.database = database and database.upper()
+        self.schema = schema and schema.upper()
+
+        self.database_set = False
+        self.schema_set = False
+        self.db_path = Path(db_path) if db_path else None
+        self.nop_regexes = nop_regexes
+        self._paramstyle = snowflake.connector.paramstyle
+        self.variables = Variables()
+
+        # create database if needed
+        if (
+            create_database
+            and self.database
+            and not duck_conn.execute(
+                f"""select * from information_schema.schemata
+                where upper(catalog_name) = '{self.database}'"""
+            ).fetchone()
+        ):
+            db_file = f"{self.db_path/self.database}.db" if self.db_path else ":memory:"
+            duck_conn.execute(f"ATTACH DATABASE '{db_file}' AS {self.database}")
+            duck_conn.execute(info_schema.creation_sql(self.database))
+            duck_conn.execute(macros.creation_sql(self.database))
+
+        # create schema if needed
+        if (
+            create_schema
+            and self.database
+            and self.schema
+            and not duck_conn.execute(
+                f"""select * from information_schema.schemata
+                where upper(catalog_name) = '{self.database}' and upper(schema_name) = '{self.schema}'"""
+            ).fetchone()
+        ):
+            duck_conn.execute(f"CREATE SCHEMA {self.database}.{self.schema}")
+
+        # set database and schema if both exist
+        if (
+            self.database
+            and self.schema
+            and duck_conn.execute(
+                f"""select * from information_schema.schemata
+                where upper(catalog_name) = '{self.database}' and upper(schema_name) = '{self.schema}'"""
+            ).fetchone()
+        ):
+            duck_conn.execute(f"SET schema='{self.database}.{self.schema}'")
+            self.database_set = True
+            self.schema_set = True
+        # set database if only that exists
+        elif (
+            self.database
+            and duck_conn.execute(
+                f"""select * from information_schema.schemata
+                where upper(catalog_name) = '{self.database}'"""
+            ).fetchone()
+        ):
+            duck_conn.execute(f"SET schema='{self.database}.main'")
+            self.database_set = True
+
+        # use UTC instead of local time zone for consistent testing
+        duck_conn.execute("SET GLOBAL TimeZone = 'UTC'")
+
+    def __enter__(self) -> Self:
+        return self
+
+    def __exit__(
+        self,
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        traceback: TracebackType | None,
+    ) -> None:
+        pass
+
+    def close(self, retry: bool = True) -> None:
+        self._duck_conn.close()
+        self._is_closed = True
+
+    def commit(self) -> None:
+        self.cursor().execute("COMMIT")
+
+    def cursor(self, cursor_class: type[SnowflakeCursor] = SnowflakeCursor) -> FakeSnowflakeCursor:
+        # TODO: use duck_conn cursor for thread-safety
+        return FakeSnowflakeCursor(conn=self, duck_conn=self._duck_conn, use_dict_result=cursor_class == DictCursor)
+
+    def execute_string(
+        self,
+        sql_text: str,
+        remove_comments: bool = False,
+        return_cursors: bool = True,
+        cursor_class: type[SnowflakeCursor] = SnowflakeCursor,
+        **kwargs: dict[str, Any],
+    ) -> Iterable[FakeSnowflakeCursor]:
+        cursors = [
+            self.cursor(cursor_class).execute(e.sql(dialect="snowflake"))
+            for e in sqlglot.parse(sql_text, read="snowflake")
+            if e and not isinstance(e, exp.Semicolon)  # ignore comments
+        ]
+        return cursors if return_cursors else []
+
+    def is_closed(self) -> bool:
+        return self._is_closed
+
+    def rollback(self) -> None:
+        self.cursor().execute("ROLLBACK")
+
+    def _insert_df(self, df: pd.DataFrame, table_name: str) -> int:
+        # Objects in dataframes are written as parquet structs, and snowflake loads parquet structs as json strings.
+        # Whereas duckdb analyses a dataframe see https://duckdb.org/docs/api/python/data_ingestion.html#pandas-dataframes--object-columns
+        # and converts a object to the most specific type possible, eg: dict -> STRUCT, MAP or varchar, and list -> LIST
+        # For dicts see https://github.com/duckdb/duckdb/pull/3985 and https://github.com/duckdb/duckdb/issues/9510
+        #
+        # When the rows have dicts with different keys there isn't a single STRUCT that can cover them, so the type is
+        # varchar and value a string containing a struct representation. In order to support dicts with different keys
+        # we first convert the dicts to json strings. A pity we can't do something inside duckdb and avoid the dataframe
+        # copy and transform in python.
+
+        df = df.copy()
+
+        # Identify columns of type object
+        object_cols = df.select_dtypes(include=["object"]).columns
+
+        # Apply json.dumps to these columns
+        for col in object_cols:
+            # don't jsonify string
+            df[col] = df[col].apply(lambda x: json.dumps(x) if isinstance(x, (dict, list)) else x)
+
+        escaped_cols = ",".join(f'"{col}"' for col in df.columns.to_list())
+        self._duck_conn.execute(f"INSERT INTO {table_name}({escaped_cols}) SELECT * FROM df")
+
+        return self._duck_conn.fetchall()[0][0]
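
For orientation (not part of the published diff), a minimal sketch of driving the extracted class directly, assuming an in-memory DuckDB connection; most callers would instead go through fakesnow's patching entry point rather than constructing this class themselves:

import duckdb

from fakesnow.conn import FakeSnowflakeConnection

# duckdb.connect() with no arguments gives an in-memory database.
duck_conn = duckdb.connect()

# database/schema are upper-cased like Snowflake unquoted identifiers and,
# since create_database/create_schema default to True, created on demand.
with FakeSnowflakeConnection(duck_conn, database="db1", schema="schema1") as conn:
    cur = conn.cursor()
    cur.execute("CREATE TABLE t (id INT)")

    # execute_string parses multi-statement SQL with sqlglot and returns one cursor per statement
    *_, last = conn.execute_string("INSERT INTO t VALUES (1); SELECT COUNT(*) FROM t")
    print(last.fetchone())  # (1,)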