Flowfile 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of Flowfile might be problematic. Click here for more details.

Files changed (171) hide show
  1. build_backends/__init__.py +0 -0
  2. build_backends/main.py +313 -0
  3. build_backends/main_prd.py +202 -0
  4. flowfile/__init__.py +71 -0
  5. flowfile/__main__.py +24 -0
  6. flowfile-0.2.2.dist-info/LICENSE +21 -0
  7. flowfile-0.2.2.dist-info/METADATA +225 -0
  8. flowfile-0.2.2.dist-info/RECORD +171 -0
  9. flowfile-0.2.2.dist-info/WHEEL +4 -0
  10. flowfile-0.2.2.dist-info/entry_points.txt +9 -0
  11. flowfile_core/__init__.py +13 -0
  12. flowfile_core/auth/__init__.py +0 -0
  13. flowfile_core/auth/jwt.py +140 -0
  14. flowfile_core/auth/models.py +40 -0
  15. flowfile_core/auth/secrets.py +178 -0
  16. flowfile_core/configs/__init__.py +35 -0
  17. flowfile_core/configs/flow_logger.py +433 -0
  18. flowfile_core/configs/node_store/__init__.py +0 -0
  19. flowfile_core/configs/node_store/nodes.py +98 -0
  20. flowfile_core/configs/settings.py +120 -0
  21. flowfile_core/database/__init__.py +0 -0
  22. flowfile_core/database/connection.py +51 -0
  23. flowfile_core/database/init_db.py +45 -0
  24. flowfile_core/database/models.py +41 -0
  25. flowfile_core/fileExplorer/__init__.py +0 -0
  26. flowfile_core/fileExplorer/funcs.py +259 -0
  27. flowfile_core/fileExplorer/utils.py +53 -0
  28. flowfile_core/flowfile/FlowfileFlow.py +1403 -0
  29. flowfile_core/flowfile/__init__.py +0 -0
  30. flowfile_core/flowfile/_extensions/__init__.py +0 -0
  31. flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
  32. flowfile_core/flowfile/analytics/__init__.py +0 -0
  33. flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
  34. flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
  35. flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
  36. flowfile_core/flowfile/analytics/utils.py +9 -0
  37. flowfile_core/flowfile/connection_manager/__init__.py +3 -0
  38. flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
  39. flowfile_core/flowfile/connection_manager/models.py +10 -0
  40. flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
  41. flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
  42. flowfile_core/flowfile/database_connection_manager/models.py +15 -0
  43. flowfile_core/flowfile/extensions.py +36 -0
  44. flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
  45. flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
  46. flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
  47. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
  48. flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
  49. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
  50. flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
  51. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
  52. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
  53. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
  54. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
  55. flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
  56. flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
  57. flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
  58. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
  59. flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
  60. flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
  61. flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
  62. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
  63. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
  64. flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
  65. flowfile_core/flowfile/flow_data_engine/types.py +0 -0
  66. flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
  67. flowfile_core/flowfile/flow_node/__init__.py +0 -0
  68. flowfile_core/flowfile/flow_node/flow_node.py +771 -0
  69. flowfile_core/flowfile/flow_node/models.py +111 -0
  70. flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
  71. flowfile_core/flowfile/handler.py +123 -0
  72. flowfile_core/flowfile/manage/__init__.py +0 -0
  73. flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
  74. flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
  75. flowfile_core/flowfile/manage/open_flowfile.py +136 -0
  76. flowfile_core/flowfile/setting_generator/__init__.py +2 -0
  77. flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
  78. flowfile_core/flowfile/setting_generator/settings.py +176 -0
  79. flowfile_core/flowfile/sources/__init__.py +0 -0
  80. flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
  81. flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
  82. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
  83. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
  84. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
  85. flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
  86. flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
  87. flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
  88. flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
  89. flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
  90. flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
  91. flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
  92. flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
  93. flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
  94. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
  95. flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
  96. flowfile_core/flowfile/util/__init__.py +0 -0
  97. flowfile_core/flowfile/util/calculate_layout.py +137 -0
  98. flowfile_core/flowfile/util/execution_orderer.py +141 -0
  99. flowfile_core/flowfile/utils.py +106 -0
  100. flowfile_core/main.py +138 -0
  101. flowfile_core/routes/__init__.py +0 -0
  102. flowfile_core/routes/auth.py +34 -0
  103. flowfile_core/routes/logs.py +163 -0
  104. flowfile_core/routes/public.py +10 -0
  105. flowfile_core/routes/routes.py +601 -0
  106. flowfile_core/routes/secrets.py +85 -0
  107. flowfile_core/run_lock.py +11 -0
  108. flowfile_core/schemas/__init__.py +0 -0
  109. flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
  110. flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
  111. flowfile_core/schemas/defaults.py +9 -0
  112. flowfile_core/schemas/external_sources/__init__.py +0 -0
  113. flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
  114. flowfile_core/schemas/input_schema.py +477 -0
  115. flowfile_core/schemas/models.py +193 -0
  116. flowfile_core/schemas/output_model.py +115 -0
  117. flowfile_core/schemas/schemas.py +106 -0
  118. flowfile_core/schemas/transform_schema.py +569 -0
  119. flowfile_core/secrets/__init__.py +0 -0
  120. flowfile_core/secrets/secrets.py +64 -0
  121. flowfile_core/utils/__init__.py +0 -0
  122. flowfile_core/utils/arrow_reader.py +247 -0
  123. flowfile_core/utils/excel_file_manager.py +18 -0
  124. flowfile_core/utils/fileManager.py +45 -0
  125. flowfile_core/utils/fl_executor.py +38 -0
  126. flowfile_core/utils/utils.py +8 -0
  127. flowfile_frame/__init__.py +56 -0
  128. flowfile_frame/__main__.py +12 -0
  129. flowfile_frame/adapters.py +17 -0
  130. flowfile_frame/expr.py +1163 -0
  131. flowfile_frame/flow_frame.py +2093 -0
  132. flowfile_frame/group_frame.py +199 -0
  133. flowfile_frame/join.py +75 -0
  134. flowfile_frame/selectors.py +242 -0
  135. flowfile_frame/utils.py +184 -0
  136. flowfile_worker/__init__.py +55 -0
  137. flowfile_worker/configs.py +95 -0
  138. flowfile_worker/create/__init__.py +37 -0
  139. flowfile_worker/create/funcs.py +146 -0
  140. flowfile_worker/create/models.py +86 -0
  141. flowfile_worker/create/pl_types.py +35 -0
  142. flowfile_worker/create/read_excel_tables.py +110 -0
  143. flowfile_worker/create/utils.py +84 -0
  144. flowfile_worker/external_sources/__init__.py +0 -0
  145. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  146. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
  147. flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
  148. flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
  149. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  150. flowfile_worker/external_sources/sql_source/__init__.py +0 -0
  151. flowfile_worker/external_sources/sql_source/main.py +56 -0
  152. flowfile_worker/external_sources/sql_source/models.py +72 -0
  153. flowfile_worker/flow_logger.py +58 -0
  154. flowfile_worker/funcs.py +327 -0
  155. flowfile_worker/main.py +108 -0
  156. flowfile_worker/models.py +95 -0
  157. flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
  158. flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
  159. flowfile_worker/polars_fuzzy_match/models.py +36 -0
  160. flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
  161. flowfile_worker/polars_fuzzy_match/process.py +86 -0
  162. flowfile_worker/polars_fuzzy_match/utils.py +50 -0
  163. flowfile_worker/process_manager.py +36 -0
  164. flowfile_worker/routes.py +440 -0
  165. flowfile_worker/secrets.py +148 -0
  166. flowfile_worker/spawner.py +187 -0
  167. flowfile_worker/utils.py +25 -0
  168. test_utils/__init__.py +3 -0
  169. test_utils/postgres/__init__.py +1 -0
  170. test_utils/postgres/commands.py +109 -0
  171. test_utils/postgres/fixtures.py +417 -0
@@ -0,0 +1,328 @@
1
+ from typing import Any, Dict, Generator, List, Optional, Literal, Tuple
2
+ import polars as pl
3
+ from flowfile_core.configs import logger
4
+ from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn
5
+ from flowfile_core.schemas.input_schema import MinimalFieldInfo, DatabaseSettings
6
+ from sqlalchemy import Engine, inspect, create_engine, text
7
+ from flowfile_core.secrets.secrets import get_encrypted_secret, decrypt_secret
8
+
9
+ from flowfile_core.flowfile.sources.external_sources.base_class import ExternalDataSource
10
+ from flowfile_core.flowfile.sources.external_sources.sql_source.utils import get_polars_type, construct_sql_uri
11
+ from flowfile_core.flowfile.database_connection_manager.db_connections import get_local_database_connection
12
+
13
# How a SQL source is specified: by table name or by a free-form query.
QueryMode = Literal["table", "query"]
14
+
15
+
16
def get_query_columns(engine: Engine, query_text: str):
    """
    Execute a query and report the names of the columns it produces.

    Only the column names are read; no rows are fetched from the result.

    Args:
        engine: SQLAlchemy engine object
        query_text: SQL query as a string

    Returns:
        List with the column names of the query result
    """
    statement = text(query_text)
    with engine.connect() as connection:
        result = connection.execute(statement)
        names = result.keys()
        # Release the cursor right away; the rows themselves are not needed
        result.close()
        return list(names)
37
+
38
+
39
def get_table_column_types(engine: Engine, table_name: str, schema: str = None) -> List[Tuple[str, Any]]:
    """
    Look up the columns of a table through the SQLAlchemy inspector.

    Args:
        engine: SQLAlchemy engine object
        table_name: Name of the table to inspect
        schema: Optional schema name (e.g., 'public' for PostgreSQL)

    Returns:
        List of (column name, SQLAlchemy type) tuples, one per reflected column
    """
    reflected_columns = inspect(engine).get_columns(table_name, schema=schema)
    name_type_pairs = []
    for column in reflected_columns:
        name_type_pairs.append((column['name'], column['type']))
    return name_type_pairs
55
+
56
+
57
class BaseSqlSource:
    """
    Connection-agnostic description of a SQL source.

    Holds either a table reference or a free-form query and derives the SQL
    text from it; no database connection details are needed.
    """
    table_name: Optional[str] = None
    query: Optional[str] = None
    schema_name: Optional[str] = None
    query_mode: QueryMode = 'table'
    schema: Optional[List[FlowfileColumn]] = None

    def __init__(self,
                 query: str = None,
                 table_name: str = None,
                 schema_name: str = None,
                 fields: Optional[List[MinimalFieldInfo]] = None):
        """
        Initialize a BaseSqlSource object.

        Args:
            query: SQL query string (selects query mode)
            table_name: Name of the table to read (selects table mode)
            schema_name: Optional database schema name; '' is treated as None
            fields: Optional list of field information used to pre-set the schema

        Raises:
            ValueError: If both or neither of query and table_name are given
        """
        # An empty schema name means "no schema"
        schema_name = None if schema_name == '' else schema_name

        has_query = query is not None
        has_table = table_name is not None
        if has_query and has_table:
            raise ValueError("Only one of table_name or query can be provided")
        if not (has_query or has_table):
            raise ValueError("Either table_name or query must be provided")

        if has_query:
            self.query_mode = 'query'
            self.query = query
        else:
            self.query_mode = 'table'
            self.table_name = table_name
            self.schema_name = schema_name
            # NOTE(review): identifiers are interpolated verbatim into the SQL text
            if schema_name is None:
                self.query = f"SELECT * FROM {table_name}"
            else:
                self.query = f"SELECT * FROM {schema_name}.{table_name}"

        if fields:
            self.schema = [
                FlowfileColumn.from_input(column_name=field.name, data_type=field.data_type)
                for field in fields
            ]

    def get_sample_query(self) -> str:
        """
        Build a variant of the query that is limited to a single row.
        """
        if self.query_mode != 'query':
            return f"{self.query} LIMIT 1"
        # Wrap user queries in a sub-select so the LIMIT applies to the whole query
        return f"select * from ({self.query}) as main_query LIMIT 1"

    @staticmethod
    def _parse_table_name(table_name: str) -> tuple[Optional[str], str]:
        """
        Split a possibly schema-qualified table name on its last dot.

        Args:
            table_name: Table name possibly in the format 'schema.table'

        Returns:
            Tuple of (schema, table_name); schema is None when there is no dot
        """
        schema, separator, table = table_name.rpartition('.')
        if not separator:
            return None, table_name
        return schema, table
138
+
139
+
140
class SqlSource(BaseSqlSource, ExternalDataSource):
    """
    SQL source bound to a concrete connection string.

    Adds database access on top of BaseSqlSource: validation, schema discovery
    through SQLAlchemy and data loading through ``pl.read_database_uri``.
    """
    connection_string: Optional[str]
    # Cached result of the full query; filled lazily by get_pl_df()
    read_result: Optional[pl.DataFrame] = None

    def __init__(self,
                 connection_string: str,
                 query: str = None,
                 table_name: str = None,
                 schema_name: str = None,
                 fields: Optional[List[MinimalFieldInfo]] = None):
        """
        Initialize a SqlSource object.

        Args:
            connection_string: Database URI used for all connections
            query: SQL query string (selects query mode)
            table_name: Name of the table to read (selects table mode)
            schema_name: Optional database schema name
            fields: Optional list of field information used to pre-set the schema
        """
        # Initialize the base class first
        BaseSqlSource.__init__(self, query=query, table_name=table_name, schema_name=schema_name, fields=fields)

        # Set connection-specific attributes
        self.connection_string = connection_string
        self.read_result = None

    def get_initial_data(self) -> List[Dict[str, Any]]:
        """Return the initial data of the source, which is always empty."""
        return []

    def _introspect_table_columns(self, engine: Engine) -> Optional[List[FlowfileColumn]]:
        """
        Reflect the columns of the configured table.

        The schema-qualified lookup is used when a schema name is set, otherwise
        the lookup is done on the table name alone. Returns None when neither a
        schema name nor a table name is available.
        """
        if self.schema_name is not None:
            return self._get_columns_from_table_and_schema(engine, self.table_name, self.schema_name)
        if self.table_name is not None:
            return self._get_columns_from_table(engine, self.table_name)
        return None

    def validate(self) -> None:
        """
        Check that the source can be resolved against the database.

        In table mode the table is reflected; if that fails a warning is logged
        and the sample query is executed instead. In query mode the sample query
        is executed directly.

        Raises:
            ValueError: If the sample query yields no columns
            Exception: Any database error is logged and re-raised
        """
        engine = None
        try:
            engine = create_engine(self.connection_string)
            if self.query_mode == 'table':
                try:
                    # Exactly one lookup: schema-qualified when a schema is set.
                    # A second schema-less lookup could fail for tables outside
                    # the default schema even though the table exists.
                    self._introspect_table_columns(engine)
                    return
                except Exception as e:
                    logger.warning(f"Error getting column info for table {self.table_name}: {e}")
            c = self._get_columns_from_query(engine, self.get_sample_query())
            if len(c) == 0:
                raise ValueError("No columns found in the query")
        except Exception as e:
            logger.error(f"Error validating SQL source: {e}")
            raise
        finally:
            # Release the connection pool of the throw-away engine
            if engine is not None:
                engine.dispose()

    def get_iter(self) -> Generator[Dict[str, Any], None, None]:
        """Yield the rows of the full result one dictionary at a time."""
        logger.warning('Getting data in iteration, this is suboptimal')
        data = self.data_getter()
        for row in data:
            yield row

    def get_df(self):
        """Return the full result converted to a pandas DataFrame."""
        df = self.get_pl_df()
        return df.to_pandas()

    def get_sample(self, n: int = 10000) -> Generator[Dict[str, Any], None, None]:
        """
        Return a generator over at most ``n`` rows.

        In table mode a ``LIMIT n`` query is sent to the database. In query mode
        the full (cached) result is loaded and its first ``n`` rows are used.
        """
        if self.query_mode == 'table':
            query = f"{self.query} LIMIT {n}"
            try:
                df = pl.read_database_uri(query, self.connection_string)
                return (r for r in df.to_dicts())
            except Exception:
                logger.error(f"Error with query: {query}")
                raise
        else:
            df = self.get_pl_df()
            rows = df.head(n).to_dicts()
            return (r for r in rows)

    def data_getter(self) -> Generator[Dict[str, Any], None, None]:
        """Return a generator over all rows of the full result as dictionaries."""
        df = self.get_pl_df()
        rows = df.to_dicts()
        return (r for r in rows)

    def get_pl_df(self) -> pl.DataFrame:
        """Read the full query result once and return the cached DataFrame afterwards."""
        if self.read_result is None:
            self.read_result = pl.read_database_uri(self.query, self.connection_string)
        return self.read_result

    def get_flow_file_columns(self) -> List[FlowfileColumn]:
        """
        Get column information from the SQL source and convert to FlowfileColumn objects

        In table mode the table is reflected first; if that fails the error is
        logged and the columns are derived from the sample query instead.

        Returns:
            List of FlowfileColumn objects representing the columns in the SQL source
        """
        engine = create_engine(self.connection_string)
        try:
            if self.query_mode == 'table':
                try:
                    columns = self._introspect_table_columns(engine)
                    if columns is not None:
                        return columns
                except Exception as e:
                    logger.error(f"Error getting column info for table {self.table_name}: {e}")

            return self._get_columns_from_query(engine, self.get_sample_query())
        finally:
            # Release the connection pool of the throw-away engine
            engine.dispose()

    @staticmethod
    def _get_columns_from_table(engine: Engine, table_name: str) -> List[FlowfileColumn]:
        """
        Get FlowfileColumn objects from a database table

        Args:
            engine: SQLAlchemy engine
            table_name: Name of the table (possibly including schema)

        Returns:
            List of FlowfileColumn objects
        """
        schema_name, table = BaseSqlSource._parse_table_name(table_name)
        column_types = get_table_column_types(engine, table, schema=schema_name)
        columns = [FlowfileColumn.create_from_polars_dtype(column_name, get_polars_type(column_type))
                   for column_name, column_type in column_types]

        return columns

    @staticmethod
    def _get_columns_from_table_and_schema(engine: Engine, table_name: str,
                                           schema_name: str) -> List[FlowfileColumn]:
        """
        Get FlowfileColumn objects from a database table in an explicit schema

        Args:
            engine: SQLAlchemy engine
            table_name: Name of the table (passed to the inspector unchanged)
            schema_name: Name of the schema
        Returns:
            List of FlowfileColumn objects
        """
        column_types = get_table_column_types(engine, table_name, schema=schema_name)
        columns = [FlowfileColumn.create_from_polars_dtype(column_name, get_polars_type(column_type))
                   for column_name, column_type in column_types]
        return columns

    @staticmethod
    def _get_columns_from_query(engine: Engine, query: str) -> List[FlowfileColumn]:
        """
        Get FlowfileColumn objects from a SQL query

        Every column is reported with the Polars String type because only the
        column names are read from the query result.

        Args:
            engine: SQLAlchemy engine
            query: SQL query string

        Returns:
            List of FlowfileColumn objects
        """
        try:
            column_names = get_query_columns(engine, query)

            columns = [FlowfileColumn.create_from_polars_dtype(column_name, pl.String()) for column_name in
                       column_names]
            return columns
        except Exception as e:
            logger.error(f"Error getting column info for query: {e}")
            raise

    def parse_schema(self) -> List[FlowfileColumn]:
        """Return the schema; alias of get_schema()."""
        return self.get_schema()

    def get_schema(self) -> List[FlowfileColumn]:
        """Return the schema, discovering it from the database on first use."""
        if self.schema is None:
            self.schema = self.get_flow_file_columns()
        return self.schema
302
+
303
+
304
def create_sql_source_from_db_settings(database_settings: DatabaseSettings, user_id: int) -> SqlSource:
    """
    Build a SqlSource from database settings, resolving the stored password.

    In 'inline' connection mode the connection embedded in the settings is used
    and its password is looked up by ``password_ref``. In any other mode the
    connection is fetched by name for the user and its stored password is used.

    Args:
        database_settings: Settings describing the connection and what to read
        user_id: Id of the user owning the secret / stored connection

    Returns:
        SqlSource configured with the constructed connection URI

    Raises:
        ValueError: If the inline connection is missing or no secret is found
    """
    database_connection = database_settings.database_connection
    if database_settings.connection_mode == 'inline':
        if database_connection is None:
            raise ValueError("Database connection is required in inline mode")
        encrypted_secret = get_encrypted_secret(current_user_id=user_id,
                                                secret_name=database_connection.password_ref)
    else:
        database_connection = get_local_database_connection(database_settings.database_connection_name, user_id)
        encrypted_secret = database_connection.password.get_secret_value()
    if encrypted_secret is None:
        raise ValueError(f"Secret with name {database_connection.password_ref} not found for user {user_id}")

    connection_string = construct_sql_uri(database_type=database_connection.database_type,
                                          host=database_connection.host,
                                          port=database_connection.port,
                                          database=database_connection.database,
                                          username=database_connection.username,
                                          password=decrypt_secret(encrypted_secret))

    # SqlSource rejects a query and a table name given together, so pass only
    # the one that belongs to the selected mode; a leftover value for the other
    # mode must not make construction fail.
    table_mode = database_settings.query_mode == 'table'
    return SqlSource(connection_string=connection_string,
                     query=None if table_mode else database_settings.query,
                     table_name=database_settings.table_name if table_mode else None,
                     schema_name=database_settings.schema_name,
                     )
@@ -0,0 +1,379 @@
1
+ # Comprehensive mapping from SQLAlchemy types to Polars types
2
+ from typing import Dict, Type, Union, cast, TYPE_CHECKING, Any
3
+ from pydantic import SecretStr
4
+
5
+ import polars as pl
6
+ from polars import DataType as PolarsType
7
+ from sqlalchemy.sql.sqltypes import (
8
+ _Binary, ARRAY, BIGINT, BigInteger, BINARY, BLOB, BOOLEAN, Boolean,
9
+ CHAR, CLOB, Concatenable, DATE, Date, DATETIME, DateTime,
10
+ DECIMAL, DOUBLE, Double, DOUBLE_PRECISION, Enum, FLOAT, Float,
11
+ Indexable, INT, INTEGER, Integer, Interval, JSON, LargeBinary,
12
+ MatchType, NCHAR, NULLTYPE, NullType, NUMERIC, Numeric, NVARCHAR,
13
+ PickleType, REAL, SchemaType, SMALLINT, SmallInteger, String,
14
+ STRINGTYPE, TEXT, Text, TIME, Time, TIMESTAMP, TupleType,
15
+ Unicode, UnicodeText, UUID, Uuid, VARBINARY, VARCHAR
16
+ )
17
+ from sqlalchemy.sql.type_api import (
18
+ ExternalType, TypeDecorator,
19
+ TypeEngine, UserDefinedType, Variant
20
+ )
21
+
22
+
23
+ from typing import Optional
24
+ from urllib.parse import quote_plus
25
+
26
+
27
# SqlType is a precise union of SQLAlchemy type classes for static analysis
# only; at runtime it is simply Any.
if not TYPE_CHECKING:
    SqlType = Any
else:
    SqlType = Union[
        Type[_Binary], Type[ARRAY], Type[BIGINT], Type[BigInteger], Type[BINARY],
        Type[BLOB], Type[BOOLEAN], Type[Boolean], Type[CHAR], Type[CLOB],
        Type[Concatenable], Type[DATE], Type[Date], Type[DATETIME], Type[DateTime],
        Type[DECIMAL], Type[DOUBLE], Type[Double], Type[DOUBLE_PRECISION], Type[Enum],
        Type[FLOAT], Type[Float], Type[Indexable], Type[INT], Type[INTEGER],
        Type[Integer], Type[Interval], Type[JSON], Type[LargeBinary], Type[MatchType],
        Type[NCHAR], Type[NULLTYPE], Type[NullType], Type[NUMERIC], Type[Numeric],
        Type[NVARCHAR], Type[PickleType], Type[REAL], Type[SchemaType], Type[SMALLINT],
        Type[SmallInteger], Type[String], Type[STRINGTYPE], Type[TEXT], Type[Text],
        Type[TIME], Type[Time], Type[TIMESTAMP], Type[TupleType], Type[Unicode],
        Type[UnicodeText], Type[UUID], Type[Uuid], Type[VARBINARY], Type[VARCHAR],
        Type[TypeDecorator], Type[TypeEngine], Type[UserDefinedType], Type[Variant],
        Type[ExternalType]
    ]
45
+
46
+
47
# Lookup table: SQLAlchemy type -> Polars type.
# A value of None marks a type that has no Polars counterpart.
sqlalchemy_to_polars: Dict[SqlType, PolarsType] = {
    # --- integers, floats, decimals, booleans ---
    Integer: pl.Int64,
    INTEGER: pl.Int64,
    INT: pl.Int64,
    SmallInteger: pl.Int16,
    SMALLINT: pl.Int16,
    BigInteger: pl.Int64,
    BIGINT: pl.Int64,
    Float: pl.Float64,
    FLOAT: pl.Float64,
    REAL: pl.Float32,
    DOUBLE: pl.Float64,
    Double: pl.Float64,
    DOUBLE_PRECISION: pl.Float64,
    Numeric: pl.Decimal,
    NUMERIC: pl.Decimal,
    DECIMAL: pl.Decimal,
    Boolean: pl.Boolean,
    BOOLEAN: pl.Boolean,

    # --- character data ---
    String: pl.Utf8,
    VARCHAR: pl.Utf8,
    CHAR: pl.Utf8,
    NVARCHAR: pl.Utf8,
    NCHAR: pl.Utf8,
    Text: pl.Utf8,
    TEXT: pl.Utf8,
    CLOB: pl.Utf8,
    STRINGTYPE: pl.Utf8,
    Unicode: pl.Utf8,
    UnicodeText: pl.Utf8,

    # --- temporal data ---
    Date: pl.Date,
    DATE: pl.Date,
    DateTime: pl.Datetime,
    DATETIME: pl.Datetime,
    TIMESTAMP: pl.Datetime,
    Time: pl.Time,
    TIME: pl.Time,
    Interval: pl.Duration,

    # --- raw bytes ---
    _Binary: pl.Binary,
    LargeBinary: pl.Binary,
    BINARY: pl.Binary,
    VARBINARY: pl.Binary,
    BLOB: pl.Binary,

    # --- JSON is carried as text ---
    JSON: pl.Utf8,

    # --- UUIDs are carried as text ---
    UUID: pl.Utf8,
    Uuid: pl.Utf8,

    # --- containers and other approximations ---
    ARRAY: pl.List,
    Enum: pl.String,
    PickleType: pl.Object,
    TupleType: pl.Struct,

    # --- mixins, abstract bases and sentinels ---
    NULLTYPE: None,
    NullType: None,
    Concatenable: pl.Utf8,   # mixin, treated as text
    Indexable: pl.List,      # mixin, treated as list
    MatchType: pl.Utf8,
    SchemaType: None,        # base class without a concrete mapping
    TypeDecorator: None,     # base class without a concrete mapping
    TypeEngine: None,        # base class without a concrete mapping
    UserDefinedType: None,   # base class without a concrete mapping
    Variant: pl.Object,
    ExternalType: None,      # abstract base class
}
125
+
126
# Name-to-name view of sqlalchemy_to_polars. Entries mapped to None are left
# out, as are keys or values that do not expose a __name__.
sqlalchemy_to_polars_str: Dict[str, str] = {
    sql_type.__name__: polars_type.__name__
    for sql_type, polars_type in sqlalchemy_to_polars.items()
    if polars_type is not None
    and hasattr(sql_type, '__name__')
    and hasattr(polars_type, '__name__')
}
132
+
133
# Additional string mappings for common SQL type names (lower-case keys).
#
# Several dialects share type names. A dict literal keeps only the LAST value
# given for a repeated key, so each shared name is listed exactly once here,
# at the position of its first appearance and with the value that is effective
# for lookups. Sections are therefore "first dialect that introduces the name",
# not complete per-dialect tables.
sql_type_name_to_polars: Dict[str, PolarsType] = {
    # PostgreSQL types (plus names shared with later dialects)
    'integer': pl.Int64,
    'bigint': pl.Int64,
    'smallint': pl.Int16,
    'numeric': pl.Decimal,
    'real': pl.Float32,
    'double precision': pl.Float64,
    'boolean': pl.Boolean,
    'varchar': pl.Utf8,
    'character varying': pl.Utf8,
    'character': pl.Utf8,
    'text': pl.Utf8,
    'date': pl.Date,  # shared name; resolves to Date for every dialect
    'timestamp': pl.Datetime,
    'timestamp without time zone': pl.Datetime,
    'timestamp with time zone': pl.Datetime,
    'time': pl.Time,
    'time without time zone': pl.Time,
    'time with time zone': pl.Time,
    'interval': pl.Duration,
    'bytea': pl.Binary,
    'jsonb': pl.Utf8,
    'json': pl.Utf8,
    'uuid': pl.Utf8,
    'cidr': pl.Utf8,
    'inet': pl.Utf8,
    'macaddr': pl.Utf8,
    'bit': pl.Boolean,  # shared name; resolves to Boolean for every dialect
    'bit varying': pl.Utf8,
    'money': pl.Decimal,
    'xml': pl.Utf8,
    'tsquery': pl.Utf8,
    'tsvector': pl.Utf8,
    'hstore': pl.Utf8,

    # MySQL types
    'int': pl.Int32,
    'int unsigned': pl.UInt64,
    'bigint unsigned': pl.UInt64,
    'smallint unsigned': pl.UInt16,
    'tinyint': pl.Int8,
    'tinyint unsigned': pl.UInt8,
    'mediumint': pl.Int32,
    'mediumint unsigned': pl.UInt32,
    'decimal': pl.Decimal,
    'float': pl.Float64,  # shared name; resolves to Float64 for every dialect
    'double': pl.Float64,
    'char': pl.Utf8,
    'binary': pl.Binary,
    'varbinary': pl.Binary,
    'tinyblob': pl.Binary,
    'blob': pl.Binary,
    'mediumblob': pl.Binary,
    'longblob': pl.Binary,
    'tinytext': pl.Utf8,
    'mediumtext': pl.Utf8,
    'longtext': pl.Utf8,
    'datetime': pl.Datetime,
    'year': pl.Int16,
    'enum': pl.String,
    'set': pl.List,

    # SQLite types
    'null': None,

    # Oracle types
    'number': pl.Decimal,
    'binary_float': pl.Float32,
    'binary_double': pl.Float64,
    'varchar2': pl.Utf8,
    'nvarchar2': pl.Utf8,
    'nchar': pl.Utf8,
    'clob': pl.Utf8,
    'nclob': pl.Utf8,
    'long': pl.Utf8,
    'raw': pl.Binary,
    'long raw': pl.Binary,
    'rowid': pl.Utf8,
    'urowid': pl.Utf8,
    'timestamp with local time zone': pl.Datetime,
    'interval year to month': pl.Duration,
    'interval day to second': pl.Duration,
    'bfile': pl.Binary,
    'xmltype': pl.Utf8,

    # SQL Server types
    'smallmoney': pl.Decimal,
    'datetime2': pl.Datetime,
    'smalldatetime': pl.Datetime,
    'datetimeoffset': pl.Datetime,
    'nvarchar': pl.Utf8,
    'ntext': pl.Utf8,
    'image': pl.Binary,
    'uniqueidentifier': pl.Utf8,
    'sql_variant': pl.Object,
    'hierarchyid': pl.Utf8,
    'geometry': pl.Utf8,
    'geography': pl.Utf8,

    # Common abbreviations and aliases
    'int4': pl.Int32,
    'int8': pl.Int64,
    'float4': pl.Float32,
    'float8': pl.Float64,
    'bool': pl.Boolean,
    'serial': pl.Int32,  # PostgreSQL auto-incrementing integer
    'bigserial': pl.Int64,  # PostgreSQL auto-incrementing bigint
    'smallserial': pl.Int16,  # PostgreSQL auto-incrementing smallint
}
278
+
279
# Same mapping with the Polars type replaced by its class name; names mapped
# to None are skipped.
sql_type_name_to_polars_str: Dict[str, str] = {
    type_name: polars_type.__name__
    for type_name, polars_type in sql_type_name_to_polars.items()
    if polars_type is not None
}
283
+
284
+
285
def get_polars_type(sqlalchemy_type: Union[SqlType, str]):
    """
    Get the corresponding Polars type from a SQLAlchemy type or string type name.

    Strings are matched case-insensitively against ``sql_type_name_to_polars``.
    SQLAlchemy type classes and instances are matched against
    ``sqlalchemy_to_polars``: first by their exact class, then by walking the
    class hierarchy so that subclasses of a mapped type (for example
    dialect-specific types) resolve to the mapping of their closest mapped
    ancestor instead of falling back to Utf8.

    Parameters:
    -----------
    sqlalchemy_type : SQLAlchemy type object or string
        The SQLAlchemy type or SQL type name string

    Returns:
    --------
    polars_type : polars.DataType
        The corresponding Polars data type. ``pl.Utf8`` is returned when no
        mapping exists; None is returned only for names/types that are
        explicitly mapped to None.
    """
    if isinstance(sqlalchemy_type, str):
        # For string type names (lowercase for case-insensitive matching)
        return sql_type_name_to_polars.get(sqlalchemy_type.lower(), pl.Utf8)

    # Accept both type classes and type instances
    type_class = sqlalchemy_type if isinstance(sqlalchemy_type, type) else type(sqlalchemy_type)

    # An exact match wins, including entries explicitly mapped to None
    if type_class in sqlalchemy_to_polars:
        return sqlalchemy_to_polars[cast(SqlType, type_class)]

    # Otherwise use the closest ancestor with a concrete mapping. Ancestors
    # mapped to None are generic base classes and must not hide the default.
    for ancestor in type_class.__mro__[1:]:
        polars_type = sqlalchemy_to_polars.get(cast(SqlType, ancestor))
        if polars_type is not None:
            return polars_type

    return pl.Utf8
309
+
310
+
311
def construct_sql_uri(
        database_type: str = "postgresql",
        host: Optional[str] = None,
        port: Optional[int] = None,
        username: Optional[str] = None,
        password: Optional[Union["SecretStr", str]] = None,
        database: Optional[str] = None,
        url: Optional[str] = None,
        **kwargs
) -> str:
    """
    Constructs a SQL URI string from the provided parameters.

    Args:
        database_type: Database type (postgresql, mysql, sqlite, etc.)
        host: Database host address
        port: Database port number
        username: Database username (URL-encoded in the result)
        password: Database password as SecretStr (a plain string is accepted
            as well); URL-encoded in the result and only used together with
            a username
        database: Database name; for SQLite the path to the database file
        url: Complete database URL (overrides other parameters if provided)
        **kwargs: Additional connection parameters, appended as a query string

    Returns:
        str: Formatted database URI

    Raises:
        ValueError: If no host is given for a non-SQLite database
    """
    # If URL is explicitly provided, return it directly
    if url:
        return url

    # SQLite uses a file path instead of host/credentials
    if database_type.lower() == "sqlite":
        path = database or "./database.db"
        return f"sqlite:///{path}"

    # Validate that minimum required fields are present for other databases
    if not host:
        raise ValueError("Host is required to create a URI")

    # Create credential part if username is provided. Both parts are encoded
    # so that characters such as '@', ':' or '/' cannot break the URI.
    credentials = ""
    if username:
        credentials = quote_plus(username)
        if password:
            if hasattr(password, "get_secret_value"):
                password_value = password.get_secret_value()
            else:
                password_value = str(password)
            credentials += f":{quote_plus(password_value)}"
        credentials += "@"

    # Add port if specified
    port_section = f":{port}" if port else ""

    # Create base URI
    base_uri = f"{database_type}://{credentials}{host}{port_section}"
    if database:
        base_uri += f"/{database}"

    # Add any additional connection parameters
    if kwargs:
        params = "&".join(
            f"{quote_plus(str(key))}={quote_plus(str(value))}" for key, value in kwargs.items()
        )
        base_uri += f"?{params}"

    return base_uri
File without changes