sqlframe 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sqlframe/_version.py CHANGED
@@ -12,5 +12,5 @@ __version__: str
12
12
  __version_tuple__: VERSION_TUPLE
13
13
  version_tuple: VERSION_TUPLE
14
14
 
15
- __version__ = version = '1.3.0'
16
- __version_tuple__ = version_tuple = (1, 3, 0)
15
+ __version__ = version = '1.4.0'
16
+ __version_tuple__ = version_tuple = (1, 4, 0)
@@ -2,13 +2,16 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import enum
5
6
  import functools
6
7
  import itertools
8
+ import json
7
9
  import logging
8
10
  import sys
9
11
  import typing as t
10
12
  import zlib
11
13
  from copy import copy
14
+ from dataclasses import dataclass
12
15
 
13
16
  import sqlglot
14
17
  from prettytable import PrettyTable
@@ -75,6 +78,46 @@ JOIN_HINTS = {
75
78
  DF = t.TypeVar("DF", bound="_BaseDataFrame")
76
79
 
77
80
 
81
+ class OpenAIMode(enum.Enum):
82
+ CTE_ONLY = "cte_only"
83
+ FULL = "full"
84
+
85
+ @property
86
+ def is_cte_only(self) -> bool:
87
+ return self == OpenAIMode.CTE_ONLY
88
+
89
+ @property
90
+ def is_full(self) -> bool:
91
+ return self == OpenAIMode.FULL
92
+
93
+
94
+ @dataclass
95
+ class OpenAIConfig:
96
+ mode: OpenAIMode = OpenAIMode.CTE_ONLY
97
+ model: str = "gpt-4o"
98
+ prompt_override: t.Optional[str] = None
99
+
100
+ @classmethod
101
+ def from_dict(cls, config: t.Dict[str, t.Any]) -> OpenAIConfig:
102
+ if "mode" in config:
103
+ config["mode"] = OpenAIMode(config["mode"].lower())
104
+ return cls(**config)
105
+
106
+ def get_prompt(self, dialect: Dialect) -> str:
107
+ if self.prompt_override:
108
+ return self.prompt_override
109
+ if self.mode.is_cte_only:
110
+ return f"You are a backend tool that creates unique CTE alias names match what a human would write and in snake case. You respond without code blocks and only a json payload with the key being the CTE name that is being replaced and the value being the new CTE human readable name."
111
+ return f"""
112
+ You are a backend tool that converts correct {dialect} SQL to simplified and more human readable version.
113
+ You respond without code block with rewritten {dialect} SQL.
114
+ You don't change any column names in the final select because the user expects those to remain the same.
115
+ You make unique CTE alias names match what a human would write and in snake case.
116
+ You improve formatting with spacing and line-breaks.
117
+ You remove redundant parenthesis and aliases.
118
+ When remove extra quotes, make sure to keep quotes around words that could be reserved words"""
119
+
120
+
78
121
  class _BaseDataFrameNaFunctions(t.Generic[DF]):
79
122
  def __init__(self, df: DF):
80
123
  self.df = df
@@ -476,8 +519,7 @@ class _BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
476
519
  dialect: DialectType = None,
477
520
  optimize: bool = True,
478
521
  pretty: bool = True,
479
- use_openai: bool = False,
480
- openai_model: str = "gpt-4o",
522
+ openai_config: t.Optional[t.Union[t.Dict[str, t.Any], OpenAIConfig]] = None,
481
523
  as_list: bool = False,
482
524
  **kwargs,
483
525
  ) -> t.Union[str, t.List[str]]:
@@ -487,6 +529,11 @@ class _BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
487
529
  select_expressions = df._get_select_expressions()
488
530
  output_expressions: t.List[t.Union[exp.Select, exp.Cache, exp.Drop]] = []
489
531
  replacement_mapping: t.Dict[exp.Identifier, exp.Identifier] = {}
532
+ openai_config = (
533
+ OpenAIConfig.from_dict(openai_config)
534
+ if openai_config is not None and isinstance(openai_config, dict)
535
+ else openai_config
536
+ )
490
537
 
491
538
  for expression_type, select_expression in select_expressions:
492
539
  select_expression = select_expression.transform(
@@ -497,7 +544,7 @@ class _BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
497
544
  select_expression = t.cast(
498
545
  exp.Select, self.session._optimize(select_expression, dialect=dialect)
499
546
  )
500
- elif use_openai:
547
+ elif openai_config:
501
548
  qualify(select_expression, dialect=dialect, schema=self.session.catalog._schema)
502
549
  pushdown_projections(select_expression, schema=self.session.catalog._schema)
503
550
 
@@ -556,35 +603,32 @@ class _BaseDataFrame(t.Generic[SESSION, WRITER, NA, STAT, GROUP_DATA]):
556
603
  results = []
557
604
  for expression in output_expressions:
558
605
  sql = expression.sql(dialect=dialect, pretty=pretty, **kwargs)
559
- if use_openai:
606
+ if openai_config:
607
+ assert isinstance(openai_config, OpenAIConfig)
560
608
  verify_openai_installed()
561
609
  from openai import OpenAI
562
610
 
563
611
  client = OpenAI()
564
- prompt = f"""
565
- You are a backend tool that converts correct {dialect} SQL to simplified and more human readable version.
566
- You respond without code block with rewritten {dialect} SQL.
567
- You don't change any column names in the final select because the user expects those to remain the same.
568
- You make unique CTE alias names match what a human would write and in snake case.
569
- You improve formatting with spacing and line-breaks.
570
- You remove redundant parenthesis and aliases.
571
- When remove extra quotes, make sure to keep quotes around words that could be reserved words
572
- """
573
612
  chat_completed = client.chat.completions.create(
574
613
  messages=[
575
- {
614
+ { # type: ignore
576
615
  "role": "system",
577
- "content": prompt,
616
+ "content": openai_config.get_prompt(dialect),
578
617
  },
579
618
  {
580
619
  "role": "user",
581
620
  "content": sql,
582
621
  },
583
622
  ],
584
- model=openai_model,
623
+ model=openai_config.model,
585
624
  )
586
625
  assert chat_completed.choices[0].message.content is not None
587
- sql = chat_completed.choices[0].message.content
626
+ if openai_config.mode.is_cte_only:
627
+ cte_replacement_mapping = json.loads(chat_completed.choices[0].message.content)
628
+ for old_name, new_name in cte_replacement_mapping.items():
629
+ sql = sql.replace(old_name, new_name)
630
+ else:
631
+ sql = chat_completed.choices[0].message.content
588
632
  results.append(sql)
589
633
 
590
634
  if as_list:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sqlframe
3
- Version: 1.3.0
3
+ Version: 1.4.0
4
4
  Summary: Taking the Spark out of PySpark by converting to SQL
5
5
  Home-page: https://github.com/eakmanrq/sqlframe
6
6
  Author: Ryan Eakman
@@ -62,19 +62,19 @@ Provides-Extra: spark
62
62
  Requires-Dist: pyspark (<3.6,>=2) ; extra == 'spark'
63
63
 
64
64
  <div align="center">
65
- <img src="https://sqlframe.readthedocs.io/en/latest/docs/images/sqlframe_logo.png" alt="SQLFrame Logo" width="400"/>
65
+ <img src="https://sqlframe.readthedocs.io/en/stable/docs/images/sqlframe_logo.png" alt="SQLFrame Logo" width="400"/>
66
66
  </div>
67
67
 
68
68
  SQLFrame implements the PySpark DataFrame API in order to enable running transformation pipelines directly on database engines - no Spark clusters or dependencies required.
69
69
 
70
70
  SQLFrame currently supports the following engines (many more in development):
71
71
 
72
- * [BigQuery](https://sqlframe.readthedocs.io/en/latest/bigquery/)
73
- * [DuckDB](https://sqlframe.readthedocs.io/en/latest/duckdb)
74
- * [Postgres](https://sqlframe.readthedocs.io/en/latest/postgres)
72
+ * [BigQuery](https://sqlframe.readthedocs.io/en/stable/bigquery/)
73
+ * [DuckDB](https://sqlframe.readthedocs.io/en/stable/duckdb)
74
+ * [Postgres](https://sqlframe.readthedocs.io/en/stable/postgres)
75
75
 
76
76
  SQLFrame also has a "Standalone" session that can be used to generate SQL without any connection to a database engine.
77
- * [Standalone](https://sqlframe.readthedocs.io/en/latest/standalone)
77
+ * [Standalone](https://sqlframe.readthedocs.io/en/stable/standalone)
78
78
 
79
79
  SQLFrame is great for:
80
80
 
@@ -97,6 +97,12 @@ pip install sqlframe
97
97
 
98
98
  See specific engine documentation for additional setup instructions.
99
99
 
100
+ ## Configuration
101
+
102
+ SQLFrame generates consistently accurate yet complex SQL for engine execution.
103
+ However, when using df.sql(), it produces more human-readable SQL.
104
+ For details on how to configure this output and leverage OpenAI to enhance the SQL, see [Generated SQL Configuration](https://sqlframe.readthedocs.io/en/stable/configuration/#generated-sql).
105
+
100
106
  ## Example Usage
101
107
 
102
108
  ```python
@@ -1,10 +1,10 @@
1
1
  sqlframe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- sqlframe/_version.py,sha256=HGwtpza1HCPtlyqElUvIyH97K44TO13CYiYVZNezQ1M,411
2
+ sqlframe/_version.py,sha256=R8-T9fmURjcuoxYpHTAjyNAhgJPDtI2jogCjqYYkfCU,411
3
3
  sqlframe/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
4
  sqlframe/base/_typing.py,sha256=DuTay8-o9W-pw3RPZCgLunKNJLS9PkaV11G_pxXp9NY,1256
5
5
  sqlframe/base/catalog.py,sha256=ATDGirouUjal05P4ymL-wIi8rgjg_8w4PoACamiO64A,37245
6
6
  sqlframe/base/column.py,sha256=p3VrtATBmjAYHollFcsdps2UJTNC-Pvyg4Zt7y4CK9w,15358
7
- sqlframe/base/dataframe.py,sha256=NzvzC7LE5al1uXBchmdV4Ko3ZBis6UoKt51WanYxo-8,61259
7
+ sqlframe/base/dataframe.py,sha256=9PuqC9dBficSE-Y1v_BHyk4gK-Hd43SaVBmxBeyNnD8,62939
8
8
  sqlframe/base/decorators.py,sha256=I5osMgx9BuCgbtp4jVM2DNwYJVLzCv-OtTedhQEik0g,1882
9
9
  sqlframe/base/exceptions.py,sha256=pCB9hXX4jxZWzNg3JN1i38cv3BmpUlee5NoLYx3YXIQ,208
10
10
  sqlframe/base/function_alternatives.py,sha256=to0kv3MTJmQFeVTMcitz0AxBIoUJC3cu5LkEY5aJpoo,31318
@@ -92,8 +92,8 @@ sqlframe/standalone/readwriter.py,sha256=EZNyDJ4ID6sGNog3uP4-e9RvchX4biJJDNtc5hk
92
92
  sqlframe/standalone/session.py,sha256=wQmdu2sv6KMTAv0LRFk7TY7yzlh3xvmsyqilEtRecbY,1191
93
93
  sqlframe/standalone/types.py,sha256=KwNyuXIo-2xVVd4bZED3YrQOobKCtemlxGrJL7DrTC8,34
94
94
  sqlframe/standalone/window.py,sha256=6GKPzuxeSapJakBaKBeT9VpED1ACdjggDv9JRILDyV0,35
95
- sqlframe-1.3.0.dist-info/LICENSE,sha256=VZu79YgW780qxaFJMr0t5ZgbOYEh04xWoxaWOaqIGWk,1068
96
- sqlframe-1.3.0.dist-info/METADATA,sha256=duHVFV6F1x3jqfpChK7g5u4KA5DtqZUIjYhv2fC6r3M,6861
97
- sqlframe-1.3.0.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
98
- sqlframe-1.3.0.dist-info/top_level.txt,sha256=T0_RpoygaZSF6heeWwIDQgaP0varUdSK1pzjeJZRjM8,9
99
- sqlframe-1.3.0.dist-info/RECORD,,
95
+ sqlframe-1.4.0.dist-info/LICENSE,sha256=VZu79YgW780qxaFJMr0t5ZgbOYEh04xWoxaWOaqIGWk,1068
96
+ sqlframe-1.4.0.dist-info/METADATA,sha256=nnz73ML6w8WyctFzwiaKVVNr9RQwmpmfckrcKqEX_PE,7219
97
+ sqlframe-1.4.0.dist-info/WHEEL,sha256=G16H4A3IeoQmnOrYV4ueZGKSjhipXx8zc8nu9FGlvMA,92
98
+ sqlframe-1.4.0.dist-info/top_level.txt,sha256=T0_RpoygaZSF6heeWwIDQgaP0varUdSK1pzjeJZRjM8,9
99
+ sqlframe-1.4.0.dist-info/RECORD,,