Flowfile 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of Flowfile might be problematic. Click here for more details.

Files changed (171)
  1. build_backends/__init__.py +0 -0
  2. build_backends/main.py +313 -0
  3. build_backends/main_prd.py +202 -0
  4. flowfile/__init__.py +71 -0
  5. flowfile/__main__.py +24 -0
  6. flowfile-0.2.2.dist-info/LICENSE +21 -0
  7. flowfile-0.2.2.dist-info/METADATA +225 -0
  8. flowfile-0.2.2.dist-info/RECORD +171 -0
  9. flowfile-0.2.2.dist-info/WHEEL +4 -0
  10. flowfile-0.2.2.dist-info/entry_points.txt +9 -0
  11. flowfile_core/__init__.py +13 -0
  12. flowfile_core/auth/__init__.py +0 -0
  13. flowfile_core/auth/jwt.py +140 -0
  14. flowfile_core/auth/models.py +40 -0
  15. flowfile_core/auth/secrets.py +178 -0
  16. flowfile_core/configs/__init__.py +35 -0
  17. flowfile_core/configs/flow_logger.py +433 -0
  18. flowfile_core/configs/node_store/__init__.py +0 -0
  19. flowfile_core/configs/node_store/nodes.py +98 -0
  20. flowfile_core/configs/settings.py +120 -0
  21. flowfile_core/database/__init__.py +0 -0
  22. flowfile_core/database/connection.py +51 -0
  23. flowfile_core/database/init_db.py +45 -0
  24. flowfile_core/database/models.py +41 -0
  25. flowfile_core/fileExplorer/__init__.py +0 -0
  26. flowfile_core/fileExplorer/funcs.py +259 -0
  27. flowfile_core/fileExplorer/utils.py +53 -0
  28. flowfile_core/flowfile/FlowfileFlow.py +1403 -0
  29. flowfile_core/flowfile/__init__.py +0 -0
  30. flowfile_core/flowfile/_extensions/__init__.py +0 -0
  31. flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
  32. flowfile_core/flowfile/analytics/__init__.py +0 -0
  33. flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
  34. flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
  35. flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
  36. flowfile_core/flowfile/analytics/utils.py +9 -0
  37. flowfile_core/flowfile/connection_manager/__init__.py +3 -0
  38. flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
  39. flowfile_core/flowfile/connection_manager/models.py +10 -0
  40. flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
  41. flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
  42. flowfile_core/flowfile/database_connection_manager/models.py +15 -0
  43. flowfile_core/flowfile/extensions.py +36 -0
  44. flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
  45. flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
  46. flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
  47. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
  48. flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
  49. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
  50. flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
  51. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
  52. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
  53. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
  54. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
  55. flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
  56. flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
  57. flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
  58. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
  59. flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
  60. flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
  61. flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
  62. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
  63. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
  64. flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
  65. flowfile_core/flowfile/flow_data_engine/types.py +0 -0
  66. flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
  67. flowfile_core/flowfile/flow_node/__init__.py +0 -0
  68. flowfile_core/flowfile/flow_node/flow_node.py +771 -0
  69. flowfile_core/flowfile/flow_node/models.py +111 -0
  70. flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
  71. flowfile_core/flowfile/handler.py +123 -0
  72. flowfile_core/flowfile/manage/__init__.py +0 -0
  73. flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
  74. flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
  75. flowfile_core/flowfile/manage/open_flowfile.py +136 -0
  76. flowfile_core/flowfile/setting_generator/__init__.py +2 -0
  77. flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
  78. flowfile_core/flowfile/setting_generator/settings.py +176 -0
  79. flowfile_core/flowfile/sources/__init__.py +0 -0
  80. flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
  81. flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
  82. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
  83. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
  84. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
  85. flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
  86. flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
  87. flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
  88. flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
  89. flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
  90. flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
  91. flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
  92. flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
  93. flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
  94. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
  95. flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
  96. flowfile_core/flowfile/util/__init__.py +0 -0
  97. flowfile_core/flowfile/util/calculate_layout.py +137 -0
  98. flowfile_core/flowfile/util/execution_orderer.py +141 -0
  99. flowfile_core/flowfile/utils.py +106 -0
  100. flowfile_core/main.py +138 -0
  101. flowfile_core/routes/__init__.py +0 -0
  102. flowfile_core/routes/auth.py +34 -0
  103. flowfile_core/routes/logs.py +163 -0
  104. flowfile_core/routes/public.py +10 -0
  105. flowfile_core/routes/routes.py +601 -0
  106. flowfile_core/routes/secrets.py +85 -0
  107. flowfile_core/run_lock.py +11 -0
  108. flowfile_core/schemas/__init__.py +0 -0
  109. flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
  110. flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
  111. flowfile_core/schemas/defaults.py +9 -0
  112. flowfile_core/schemas/external_sources/__init__.py +0 -0
  113. flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
  114. flowfile_core/schemas/input_schema.py +477 -0
  115. flowfile_core/schemas/models.py +193 -0
  116. flowfile_core/schemas/output_model.py +115 -0
  117. flowfile_core/schemas/schemas.py +106 -0
  118. flowfile_core/schemas/transform_schema.py +569 -0
  119. flowfile_core/secrets/__init__.py +0 -0
  120. flowfile_core/secrets/secrets.py +64 -0
  121. flowfile_core/utils/__init__.py +0 -0
  122. flowfile_core/utils/arrow_reader.py +247 -0
  123. flowfile_core/utils/excel_file_manager.py +18 -0
  124. flowfile_core/utils/fileManager.py +45 -0
  125. flowfile_core/utils/fl_executor.py +38 -0
  126. flowfile_core/utils/utils.py +8 -0
  127. flowfile_frame/__init__.py +56 -0
  128. flowfile_frame/__main__.py +12 -0
  129. flowfile_frame/adapters.py +17 -0
  130. flowfile_frame/expr.py +1163 -0
  131. flowfile_frame/flow_frame.py +2093 -0
  132. flowfile_frame/group_frame.py +199 -0
  133. flowfile_frame/join.py +75 -0
  134. flowfile_frame/selectors.py +242 -0
  135. flowfile_frame/utils.py +184 -0
  136. flowfile_worker/__init__.py +55 -0
  137. flowfile_worker/configs.py +95 -0
  138. flowfile_worker/create/__init__.py +37 -0
  139. flowfile_worker/create/funcs.py +146 -0
  140. flowfile_worker/create/models.py +86 -0
  141. flowfile_worker/create/pl_types.py +35 -0
  142. flowfile_worker/create/read_excel_tables.py +110 -0
  143. flowfile_worker/create/utils.py +84 -0
  144. flowfile_worker/external_sources/__init__.py +0 -0
  145. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  146. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
  147. flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
  148. flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
  149. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  150. flowfile_worker/external_sources/sql_source/__init__.py +0 -0
  151. flowfile_worker/external_sources/sql_source/main.py +56 -0
  152. flowfile_worker/external_sources/sql_source/models.py +72 -0
  153. flowfile_worker/flow_logger.py +58 -0
  154. flowfile_worker/funcs.py +327 -0
  155. flowfile_worker/main.py +108 -0
  156. flowfile_worker/models.py +95 -0
  157. flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
  158. flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
  159. flowfile_worker/polars_fuzzy_match/models.py +36 -0
  160. flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
  161. flowfile_worker/polars_fuzzy_match/process.py +86 -0
  162. flowfile_worker/polars_fuzzy_match/utils.py +50 -0
  163. flowfile_worker/process_manager.py +36 -0
  164. flowfile_worker/routes.py +440 -0
  165. flowfile_worker/secrets.py +148 -0
  166. flowfile_worker/spawner.py +187 -0
  167. flowfile_worker/utils.py +25 -0
  168. test_utils/__init__.py +3 -0
  169. test_utils/postgres/__init__.py +1 -0
  170. test_utils/postgres/commands.py +109 -0
  171. test_utils/postgres/fixtures.py +417 -0
@@ -0,0 +1,176 @@
1
+
2
+ from flowfile_core.schemas import input_schema, transform_schema
3
+ from typing import Callable, Iterable
4
+ from functools import wraps
5
+ from flowfile_core.schemas.output_model import NodeData
6
+ from flowfile_core.flowfile.setting_generator.setting_generator import SettingGenerator, SettingUpdator
7
+
8
# Module-level registries. The decorators defined below add every wrapped
# function to one of these two objects at import time.
setting_generator = SettingGenerator()
setting_updator = SettingUpdator()
10
+
11
+
12
def setting_generator_method(f: Callable[[NodeData], NodeData]) -> Callable[[NodeData], NodeData]:
    """Register ``f`` as a setting generator and return the guarded wrapper.

    The wrapper calls ``f`` only while the node has no concrete settings, i.e.
    ``node_data.setting_input`` is ``None`` or still an
    ``input_schema.NodePromise``. It always returns the ``node_data`` it was
    given. The wrapper (not ``f`` itself) is what gets added to the
    module-level ``setting_generator`` registry.

    Args:
        f: Function that fills in ``node_data.setting_input`` in place.

    Returns:
        The wrapper, carrying ``f``'s metadata via ``functools.wraps``.
    """
    @wraps(f)
    def inner(node_data: NodeData) -> NodeData:
        current = node_data.setting_input
        if current is None or isinstance(current, input_schema.NodePromise):
            f(node_data)
        return node_data

    setting_generator.add_setting_generator_func(inner)
    return inner
20
+
21
+
22
def setting_updator_method(f: Callable[[NodeData], NodeData]) -> Callable[[NodeData], NodeData]:
    """Register ``f`` as a setting updator and return the guarded wrapper.

    The wrapper calls ``f`` only when the node already has concrete settings,
    i.e. ``node_data.setting_input`` is neither ``None`` nor an
    ``input_schema.NodePromise``. It always returns the ``node_data`` it was
    given. The wrapper (not ``f`` itself) is what gets added to the
    module-level ``setting_updator`` registry.

    Args:
        f: Function that adjusts ``node_data.setting_input`` in place.

    Returns:
        The wrapper, carrying ``f``'s metadata via ``functools.wraps``.
    """
    @wraps(f)
    def inner(node_data: NodeData) -> NodeData:
        current = node_data.setting_input
        if current is not None and not isinstance(current, input_schema.NodePromise):
            f(node_data)
        return node_data

    setting_updator.add_setting_updator_func(inner)
    return inner
31
+
32
+
33
@setting_generator_method
def join(node_data: "NodeData") -> NodeData:
    """Create default join settings once both inputs of the node are present.

    The default join key is the first column of the main (left) input that
    also exists in the right input, or ``''`` when the inputs share no column.
    Nothing is changed when either input is missing.

    Args:
        node_data: Node description; ``setting_input`` is replaced in place.

    Returns:
        The same ``node_data`` object.
    """
    if node_data.right_input and node_data.main_input:
        left_names = node_data.main_input.columns
        right_names = node_data.right_input.columns
        right_lookup = set(right_names)
        # Walk the left columns in order so the chosen key is deterministic;
        # taking element 0 of a set intersection can differ between runs.
        join_key = next((name for name in left_names if name in right_lookup), '')
        ji = transform_schema.JoinInput(join_mapping=join_key,
                                        left_select=left_names,
                                        right_select=right_names)
        ji.auto_rename()
        node_data.setting_input = input_schema.NodeJoin(flow_id=node_data.flow_id,
                                                        node_id=node_data.node_id,
                                                        join_input=ji)
    return node_data
50
+
51
+
52
@setting_generator_method
def cross_join(node_data: "NodeData") -> NodeData:
    """Create default cross-join settings once both inputs of the node are present.

    All columns of both inputs are selected and ``auto_rename()`` is applied to
    the resulting ``CrossJoinInput``. Nothing is changed when either input is
    missing.

    Args:
        node_data: Node description; ``setting_input`` is replaced in place.

    Returns:
        The same ``node_data`` object.
    """
    if node_data.right_input and node_data.main_input:
        ji = transform_schema.CrossJoinInput(left_select=node_data.main_input.columns,
                                             right_select=node_data.right_input.columns)
        ji.auto_rename()
        # The leftover debug print of the settings object was removed here.
        node_data.setting_input = input_schema.NodeCrossJoin(flow_id=node_data.flow_id,
                                                             node_id=node_data.node_id,
                                                             cross_join_input=ji)
    return node_data
63
+
64
+
65
@setting_generator_method
def filter(node_data: "NodeData") -> NodeData:
    """Attach default filter settings when the node has a main input.

    The default is a ``FilterInput`` with an empty ``BasicFilter`` and
    ``filter_type='advanced'``. Without a main input the node is returned
    untouched.
    """
    if not node_data.main_input:
        return node_data
    default_filter = transform_schema.FilterInput(
        basic_filter=transform_schema.BasicFilter(),
        filter_type='advanced',
    )
    node_data.setting_input = input_schema.NodeFilter(
        flow_id=node_data.flow_id,
        node_id=node_data.node_id,
        filter_input=default_filter,
    )
    return node_data
73
+
74
+
75
@setting_updator_method
def join(node_data: NodeData):
    """Bring an existing join node's column selections in line with its inputs.

    Select entries whose source column no longer exists on their own side are
    passed to ``remove_select_input``; incoming columns that have no available
    select entry yet are added through ``add_new_select_column``. Nothing is
    changed when either input is missing.

    Args:
        node_data: Node description whose ``setting_input`` is a ``NodeJoin``.

    Returns:
        The same ``node_data`` object.
    """
    if node_data.right_input and node_data.main_input:
        setting_input: input_schema.NodeJoin = node_data.setting_input
        join_input = setting_input.join_input
        # Ordered, de-duplicated column names keep the order of newly added
        # select inputs deterministic; the sets are for membership tests.
        left_order = list(dict.fromkeys(node_data.main_input.columns))
        right_order = list(dict.fromkeys(node_data.right_input.columns))
        left_columns = set(left_order)
        right_columns = set(right_order)
        left_select = join_input.left_select
        right_select = join_input.right_select
        # Iterate over snapshots because remove_select_input may mutate ``renames``.
        for ls in list(left_select.renames):
            # Left selections are validated against the LEFT input's columns
            # (the previous code compared them with the right input).
            if ls.old_name not in left_columns:
                left_select.remove_select_input(ls.old_name)
        for rs in list(right_select.renames):
            if rs.old_name not in right_columns:
                right_select.remove_select_input(rs.old_name)
        existing_columns_right = {r.old_name for r in right_select.renames if r.is_available}
        existing_columns_left = {r.old_name for r in left_select.renames if r.is_available}
        missing_incoming_left_columns = [c for c in left_order if c not in existing_columns_left]
        missing_incoming_right_columns = [c for c in right_order if c not in existing_columns_right]
        # Settings objects lacking these attributes get them defaulted to False.
        if not hasattr(setting_input, 'auto_keep_left'):
            setting_input.auto_keep_left = False
        if not hasattr(setting_input, 'auto_keep_right'):
            setting_input.auto_keep_right = False
        for milc in missing_incoming_left_columns:
            select_input = transform_schema.SelectInput(old_name=milc, keep=setting_input.auto_keep_left)
            join_input.add_new_select_column(select_input, 'left')
        for mirc in missing_incoming_right_columns:
            select_input = transform_schema.SelectInput(old_name=mirc, keep=setting_input.auto_keep_right)
            join_input.add_new_select_column(select_input, 'right')
    return node_data
104
+
105
+
106
@setting_updator_method
def cross_join(node_data: NodeData):
    """Bring an existing cross-join node's column selections in line with its inputs.

    Select entries whose source column no longer exists on their own side are
    passed to ``remove_select_input``; incoming columns that have no available
    select entry yet are added through ``add_new_select_column``. Nothing is
    changed when either input is missing.

    Args:
        node_data: Node description whose ``setting_input`` is a ``NodeCrossJoin``.

    Returns:
        The same ``node_data`` object.
    """
    if node_data.right_input and node_data.main_input:
        setting_input: input_schema.NodeCrossJoin = node_data.setting_input
        cross_join_input = setting_input.cross_join_input
        # Ordered, de-duplicated column names keep the order of newly added
        # select inputs deterministic; the sets are for membership tests.
        left_order = list(dict.fromkeys(node_data.main_input.columns))
        right_order = list(dict.fromkeys(node_data.right_input.columns))
        left_columns = set(left_order)
        right_columns = set(right_order)
        left_select = cross_join_input.left_select
        right_select = cross_join_input.right_select
        # Iterate over snapshots because remove_select_input may mutate ``renames``.
        for ls in list(left_select.renames):
            # Left selections are validated against the LEFT input's columns
            # (the previous code compared them with the right input).
            if ls.old_name not in left_columns:
                left_select.remove_select_input(ls.old_name)
        for rs in list(right_select.renames):
            if rs.old_name not in right_columns:
                right_select.remove_select_input(rs.old_name)
        existing_columns_right = {r.old_name for r in right_select.renames if r.is_available}
        existing_columns_left = {r.old_name for r in left_select.renames if r.is_available}
        missing_incoming_left_columns = [c for c in left_order if c not in existing_columns_left]
        missing_incoming_right_columns = [c for c in right_order if c not in existing_columns_right]
        # Settings objects lacking these attributes get them defaulted to False.
        if not hasattr(setting_input, 'auto_keep_left'):
            setting_input.auto_keep_left = False
        if not hasattr(setting_input, 'auto_keep_right'):
            setting_input.auto_keep_right = False
        for milc in missing_incoming_left_columns:
            select_input = transform_schema.SelectInput(old_name=milc, keep=setting_input.auto_keep_left)
            cross_join_input.add_new_select_column(select_input, 'left')
        for mirc in missing_incoming_right_columns:
            select_input = transform_schema.SelectInput(old_name=mirc, keep=setting_input.auto_keep_right)
            cross_join_input.add_new_select_column(select_input, 'right')
    return node_data
135
+
136
+
137
def check_if_fuzzy_match_is_valid(left_columns: Iterable[str], right_columns: Iterable[str],
                                  fuzzy_map: transform_schema.FuzzyMap) -> bool:
    """Return True when both columns named by ``fuzzy_map`` exist on their side.

    ``fuzzy_map.left_col`` is looked up in ``left_columns`` first; the right
    side is only inspected when the left lookup succeeds.
    """
    return fuzzy_map.left_col in left_columns and fuzzy_map.right_col in right_columns
144
+
145
+
146
@setting_updator_method
def fuzzy_match(node_data: NodeData):
    """Bring an existing fuzzy-match node's settings in line with its inputs.

    Every entry of ``join_mapping`` gets its ``valid`` flag recomputed with
    ``check_if_fuzzy_match_is_valid``. Select entries whose source column no
    longer exists on their own side are passed to ``remove_select_input``, and
    incoming columns that have no available select entry yet are added through
    ``add_new_select_column``. Nothing is changed when either input is missing.

    Args:
        node_data: Node description whose ``setting_input`` is a ``NodeFuzzyMatch``.

    Returns:
        The same ``node_data`` object.
    """
    if node_data.right_input and node_data.main_input:
        setting_input: input_schema.NodeFuzzyMatch = node_data.setting_input
        join_input = setting_input.join_input
        # Ordered, de-duplicated column names keep the order of newly added
        # select inputs deterministic; the sets are for membership tests.
        left_order = list(dict.fromkeys(node_data.main_input.columns))
        right_order = list(dict.fromkeys(node_data.right_input.columns))
        left_columns = set(left_order)
        right_columns = set(right_order)
        left_select = join_input.left_select
        right_select = join_input.right_select
        for fuzzy_map in join_input.join_mapping:
            fuzzy_map.valid = check_if_fuzzy_match_is_valid(left_columns, right_columns, fuzzy_map)
        # Iterate over snapshots because remove_select_input may mutate ``renames``.
        for ls in list(left_select.renames):
            # Left selections are validated against the LEFT input's columns
            # (the previous code compared them with the right input).
            if ls.old_name not in left_columns:
                left_select.remove_select_input(ls.old_name)
        for rs in list(right_select.renames):
            if rs.old_name not in right_columns:
                right_select.remove_select_input(rs.old_name)
        existing_columns_right = {r.old_name for r in right_select.renames if r.is_available}
        existing_columns_left = {r.old_name for r in left_select.renames if r.is_available}
        missing_incoming_left_columns = [c for c in left_order if c not in existing_columns_left]
        missing_incoming_right_columns = [c for c in right_order if c not in existing_columns_right]
        # Settings objects lacking these attributes get them defaulted to False.
        if not hasattr(setting_input, 'auto_keep_left'):
            setting_input.auto_keep_left = False
        if not hasattr(setting_input, 'auto_keep_right'):
            setting_input.auto_keep_right = False
        for milc in missing_incoming_left_columns:
            select_input = transform_schema.SelectInput(old_name=milc, keep=setting_input.auto_keep_left)
            join_input.add_new_select_column(select_input, 'left')
        for mirc in missing_incoming_right_columns:
            select_input = transform_schema.SelectInput(old_name=mirc, keep=setting_input.auto_keep_right)
            join_input.add_new_select_column(select_input, 'right')
    return node_data
File without changes
@@ -0,0 +1,3 @@
1
+ from flowfile_core.flowfile.sources.external_sources import custom_external_sources
2
+ # from flowfile.sources.external_sources.custom_external_sources.external_source import check_for_key_vault_existence
3
+ # from flowfile.sources.external_sources.airbyte_sources.airbyte import AirbyteSource
@@ -0,0 +1,159 @@
1
+ import os
2
+ from ast import literal_eval
3
+ import polars as pl
4
+ from typing import Any, Dict, Generator, List, Optional
5
+ from flowfile_core.configs import logger
6
+ from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn
7
+ from flowfile_core.flowfile.sources.external_sources.base_class import ExternalDataSource
8
+ from flowfile_core.flowfile.sources.external_sources.airbyte_sources.models import (
9
+ AirbyteProperty, JsonSchema, AirbyteResponse, AirbyteSettings
10
+ )
11
+
12
+
13
class LazyAirbyteImporter:
    """Import the ``airbyte`` package on first use and cache it on the class."""
    _airbyte = None

    @classmethod
    def get_airbyte(cls):
        """Return the cached ``airbyte`` module, importing it on the first call."""
        cached = cls._airbyte
        if cached is not None:
            return cached
        logger.info("Importing airbyte module")
        import airbyte
        cls._airbyte = airbyte
        return cls._airbyte
24
+
25
+
26
class AirbyteSource(ExternalDataSource):
    """External data source that reads one stream from an Airbyte connector.

    The connector configuration comes either from ``airbyte_settings.config``
    or, when only ``config_ref`` is given, from the environment variable of
    that name, parsed as a Python literal. The ``airbyte`` package is imported
    lazily, the first time ``airbyte_response`` is evaluated.
    """
    stream: str
    source_name: str
    cache_store: Optional['airbyte.results.ReadResult'] = None
    _type: str
    is_collected: bool
    _airbyte_response: Optional[AirbyteResponse] = None
    _airbyte_module = None
    _enforce_full_refresh: Optional[bool] = True
    version: Optional[str] = None

    def __init__(self, airbyte_settings: AirbyteSettings):
        """Store the settings, resolve the connector config and set the schema.

        Args:
            airbyte_settings: Source name, stream, config (or config reference)
                and optionally a predefined list of fields.

        Raises:
            ValueError: If no config can be resolved, including the case where
                ``config_ref`` names an environment variable that is not set.
        """
        self.is_collected = False
        self._airbyte_response = None
        self.stream = airbyte_settings.stream
        self.source_name = airbyte_settings.source_name
        self._enforce_full_refresh = airbyte_settings.enforce_full_refresh
        if hasattr(airbyte_settings, 'version'):
            self.version = airbyte_settings.version

        # Handle config
        if airbyte_settings.config_ref and not airbyte_settings.config:
            logger.info(f"Getting config from {airbyte_settings.config_ref}")
            raw_config = os.environ.get(airbyte_settings.config_ref)
            if raw_config is None:
                # literal_eval(None) would fail with an unhelpful
                # "malformed node or string" message; name the variable instead.
                raise ValueError(
                    f"Environment variable {airbyte_settings.config_ref!r} "
                    f"referenced by config_ref is not set"
                )
            # literal_eval only evaluates Python literals, never arbitrary code.
            config = literal_eval(raw_config)
        else:
            logger.info("Using provided config")
            config = airbyte_settings.config

        if config is None:
            raise ValueError("Config must be provided")

        self.config = config
        self._type = 'airbyte'
        self.read_result = None

        # Only load source if fields aren't provided
        if not airbyte_settings.fields:
            self.load_source(airbyte_settings)
        else:
            logger.info('Using provided schema')
            self.schema = [
                FlowfileColumn.from_input(column_name=col.name, data_type=col.data_type)
                for col in airbyte_settings.fields
            ]

    def load_source(self, airbyte_settings: AirbyteSettings):
        """Set ``self.schema`` from the provided fields or from the connector.

        When ``airbyte_settings.fields`` is non-empty the schema is built from
        it; otherwise ``airbyte_response`` is evaluated, which sets the schema
        as a side effect.
        """
        logger.info(f"Loading source {self.source_name}")
        if airbyte_settings.fields:
            logger.info('Using provided schema')
            self.schema = [
                FlowfileColumn.from_input(column_name=col.name, data_type=col.data_type)
                for col in airbyte_settings.fields
            ]
        else:
            logger.info('Using airbyte schema')
            # Evaluating the property creates the source and sets self.schema.
            _ = self.airbyte_response

    @property
    def airbyte_response(self) -> AirbyteResponse:
        """Create the Airbyte source on first access and cache the response.

        A failing ``source.check()`` is logged as a warning and does not abort
        loading. On first access ``self.schema`` is set from the stream's JSON
        schema.
        """
        if self._airbyte_response is None:
            # Lazy import airbyte
            ab = LazyAirbyteImporter.get_airbyte()

            source = ab.get_source(
                name=self.source_name,
                config=self.config,
                streams=self.stream,
                docker_image=True,
                version=self.version
            )

            try:
                source.check()
            except Exception:
                # Deliberate best effort: carry on even if the check fails.
                logger.warning('Source check failed, trying to continue')
            else:
                # Only report success when the check really succeeded.
                logger.info(f'Source check passed, starting to load data for {self.stream}')

            json_schema = source.get_stream_json_schema(self.stream)['properties']
            properties = [
                AirbyteProperty(name=name, json_schema=JsonSchema(**schema))
                for name, schema in json_schema.items()
            ]

            logger.info(f"Loaded source {self.source_name}")
            self._airbyte_response = AirbyteResponse(properties=properties, source=source)
            self.schema = self.parse_schema(self._airbyte_response)

        return self._airbyte_response

    def get_initial_data(self):
        """Return an empty list; this source has no initial data."""
        return []

    def get_iter(self) -> Generator[Dict[str, Any], None, None]:
        """Yield every row as a dict and mark the source collected once exhausted."""
        logger.warning('Getting data in iteration, this is suboptimal')
        yield from self.data_getter()
        self.is_collected = True

    def get_sample(self, n: int = 10000):
        """Yield at most ``n`` rows from ``get_iter``."""
        logger.warning('Getting sample in iteration, this is suboptimal')
        # zip stops as soon as range(n) is exhausted, so no extra row is pulled.
        for _, row in zip(range(n), self.get_iter()):
            yield row

    @staticmethod
    def parse_schema(airbyte_response: AirbyteResponse) -> List[FlowfileColumn]:
        """Return the Flowfile columns described by ``airbyte_response``."""
        return airbyte_response.get_flow_file_columns()

    def get_df(self):
        """Read the stream (once) and return it as a pandas DataFrame.

        Columns whose name starts with ``_airbyte`` are dropped.
        """
        if self.read_result is None:
            self.read_result = self.airbyte_response.source.read()

        df = self.read_result[self.stream].to_pandas()
        drop_cols = [c for c in df.columns if c.startswith('_airbyte')]
        return df.drop(columns=drop_cols)

    def get_pl_df(self) -> pl.DataFrame:
        """Return the stream as a polars DataFrame and mark the source collected."""
        frame = pl.from_pandas(self.get_df())
        # Flag as collected only after the read succeeded.
        self.is_collected = True
        return frame

    def data_getter(self) -> List[Dict]:
        """Return all rows of the stream as a list of dicts."""
        return self.get_df().to_dict(orient='records')

    @classmethod
    def create_from_frontend_input(cls, config: Any, stream_name: str, source_name: str):
        """Placeholder; not implemented yet and currently returns ``None``."""
        # Implementation details to be added
        pass
@@ -0,0 +1,172 @@
1
+ from typing import Any, Dict, List, Optional, TYPE_CHECKING, Union
2
+ from pydantic import BaseModel, field_validator, ConfigDict
3
+ import polars as pl
4
+ from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import type_to_polars_str
5
+ from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import FlowfileColumn
6
+ from flowfile_core.schemas.input_schema import MinimalFieldInfo
7
+ from flowfile_core.flowfile.flow_data_engine.flow_file_column.polars_type import PlType
8
+ from flowfile_core.configs import logger
9
+
10
+ # Use TYPE_CHECKING to avoid circular imports
11
+ if TYPE_CHECKING:
12
+ from airbyte import Source
13
+ else:
14
+ Source = Any
15
+
16
+
17
class LazyAirbyteSource:
    """Import airbyte's ``Source`` class on first use and cache it on the class."""
    _source_class = None

    @classmethod
    def get_source_class(cls):
        """Return the cached ``Source`` class, importing it on the first call."""
        cached = cls._source_class
        if cached is not None:
            return cached
        logger.info("Importing airbyte Source class")
        from airbyte import Source
        cls._source_class = Source
        return cls._source_class
28
+
29
+
30
class JsonSchema(BaseModel):
    # Subset of a JSON-schema property description used to choose a column type.
    type: Optional[Union[str, List[str]]]
    airbyte_type: Optional[Union[str, List[str]]] = None
    format: Optional[str] = None

    def get_pl_type(self) -> pl.DataType:
        """Translate this schema into a type via ``type_to_polars_str``.

        A truthy ``format`` takes precedence over ``type``. When ``type`` is a
        list, the first entry with a known mapping is used.
        """
        if self.format:
            by_format = {
                'date-time': 'datetime',
                'date': 'date',
                'time': 'time'
            }
            dtype = by_format.get(self.format, 'string')
            return type_to_polars_str(dtype)

        by_type = {
            'string': 'string',
            'boolean': 'bool',
            'integer': 'int',
            'number': 'float',
            'array': 'string',
            'object': 'string'
        }
        declared = self.type
        if not isinstance(declared, list):
            dtype = by_type.get(declared, 'string')
        elif len(declared) == 0:
            dtype = 'string'
        else:
            known = (by_type.get(entry) for entry in declared)
            # NOTE(review): when no entry is known the raw first entry is used,
            # not 'string' - confirm type_to_polars_str accepts it.
            dtype = next((candidate for candidate in known if candidate is not None), declared[0])
        return type_to_polars_str(dtype)
60
+
61
+
62
class AirbyteProperty(BaseModel):
    # A named stream property together with its JSON-schema description.
    name: str
    json_schema: JsonSchema

    def get_pl_type(self) -> PlType:
        """Return a ``PlType`` pairing this property's name with its resolved type."""
        resolved = self.json_schema.get_pl_type()
        return PlType(column_name=self.name, pl_datatype=resolved)
68
+
69
+
70
class AirbyteResponse(BaseModel):
    # Bundles an airbyte source object with the properties of one stream.
    source: Any  # Using Any to avoid direct Source import
    properties: list[AirbyteProperty]

    model_config = ConfigDict(arbitrary_types_allowed=True)

    @field_validator('source')
    @classmethod
    def validate_source(cls, v: Any) -> Any:
        """Reject any ``source`` that is not an instance of airbyte's ``Source``."""
        expected_class = LazyAirbyteSource.get_source_class()
        if isinstance(v, expected_class):
            return v
        raise ValueError(f"Source must be an instance of airbyte.Source, got {type(v)}")

    def get_flow_file_columns(self) -> List[FlowfileColumn]:
        """Return one ``FlowfileColumn`` per property, indexed by position."""
        columns = []
        for position, prop in enumerate(self.properties):
            columns.append(FlowfileColumn.create_from_polars_type(prop.get_pl_type(), col_index=position))
        return columns
89
+
90
+
91
class GenericProperties(BaseModel):
    # One property of a connector configuration schema. ``items`` and
    # ``properties`` may hold nested schemas, which ``validate_nested``
    # converts into GenericProperties objects.
    type: str
    title: Optional[str] = None
    description: Optional[str] = None
    order: Optional[int] = None
    required: Optional[List[str]] = None
    airbyte_secret: Optional[bool] = None
    pattern: Optional[str] = None
    pattern_descriptor: Optional[str] = None
    format: Optional[str] = None
    examples: Optional[List[Any]] = None
    enum: Optional[List[str]] = None
    minimum: Optional[float] = None
    maximum: Optional[float] = None
    items: Optional[Any] = None
    properties: Optional[Dict[str, Any]] = None

    @field_validator('items', 'properties')
    @classmethod
    def validate_nested(cls, value: Any) -> Any:
        """Convert nested schema dicts into ``GenericProperties`` objects.

        A dict whose ``'type'`` entry is a string is treated as a single
        schema. Any other dict is treated as a name-to-schema mapping and each
        dict value is converted. Non-dict values are returned unchanged.
        """
        if isinstance(value, dict):
            # Require a *string* 'type': a mapping that merely contains a
            # property named "type" (whose value is itself a schema dict) must
            # be handled as a mapping, not as one schema.
            if isinstance(value.get('type'), str):
                return GenericProperties(**value)
            return {k: GenericProperties(**v) if isinstance(v, dict) else v for k, v in value.items()}
        return value
116
+
117
+
118
class GenericSchema(BaseModel):
    # Top-level schema object: a title and type plus a mapping from property
    # name to its GenericProperties description.
    title: str
    type: str
    required: Optional[List[str]] = None
    additionalProperties: Optional[bool] = None
    properties: Dict[str, GenericProperties]
124
+
125
+
126
class FieldProperty(BaseModel):
    # A single configuration field: its key and type, optional display
    # metadata, an optional entered value and a default.
    title: Optional[str] = None
    type: str
    key: str
    description: Optional[str] = None
    airbyte_secret: Optional[bool] = None
    input_value: Optional[str] = None
    default: Any
134
+
135
+
136
class OverallFieldProperty(BaseModel):
    # A configuration field that can contain nested FieldProperty entries
    # (``properties`` and ``items``) in addition to its own key, type and value.
    title: Optional[str] = None
    type: str
    key: str
    required: bool
    properties: List[FieldProperty]
    items: Optional[List[FieldProperty]]
    isOpen: bool
    description: Optional[str] = None
    input_value: Optional[str] = None
    airbyte_secret: Optional[bool] = None
    default: Any
148
+
149
+
150
class AirbyteConfigTemplate(BaseModel):
    # Describes one source: its name, an optional documentation URL, the
    # configuration spec as a dict and, optionally, the stream names.
    source_name: str
    docs_url: Optional[str] = None
    config_spec: Dict
    available_streams: Optional[List[str]] = None
155
+
156
+
157
class AirbyteSettings(BaseModel):
    # Settings for reading one stream from one source. The config is given
    # either inline (``config``) or by reference (``config_ref``); ``fields``
    # optionally carries a predefined list of columns.
    source_name: str
    stream: str
    config_ref: Optional[str] = None
    config: Optional[Dict] = None
    fields: Optional[List[MinimalFieldInfo]] = None
    enforce_full_refresh: Optional[bool] = True
    flowfile_flow_id: int
    flowfile_node_id: int
    version: Optional[str] = None
167
+
168
+
169
def get_source_instance(*args, **kwargs) -> 'Source':
    """Instantiate the lazily imported airbyte ``Source`` class with the given arguments."""
    return LazyAirbyteSource.get_source_class()(*args, **kwargs)
@@ -0,0 +1,173 @@
1
+ from typing import List, Dict, Optional, Any, Type
2
+ from flowfile_core.configs import logger
3
+ from flowfile_core.flowfile.sources.external_sources.airbyte_sources.models import AirbyteConfigTemplate, \
4
+ AirbyteSettings
5
+ from flowfile_core.schemas.external_sources.airbyte_schemas import AirbyteConfig
6
+ from flowfile_core.flowfile.connection_manager import connection_manager
7
+ from flowfile_core.flowfile.connection_manager._connection_manager import Connection
8
+
9
+
10
class LazyAirbyteImporter:
    """Defer ``import airbyte`` until first use and cache the module."""
    _airbyte = None

    @classmethod
    def get_airbyte(cls):
        """Return the airbyte module, importing it on the first call only."""
        if cls._airbyte is not None:
            return cls._airbyte

        logger.info("Importing airbyte module")
        import airbyte as ab
        cls._airbyte = ab
        return cls._airbyte
21
+
22
+
23
class AirbyteConfigHandler:
    """Cache of Airbyte source config templates and connector names.

    The airbyte module is loaded lazily through ``LazyAirbyteImporter`` the
    first time it is needed. Config templates are stored in ``configs``,
    keyed by the name passed to ``get_config``.
    """
    _available_connectors: Optional[List[str]] = None
    configs: Dict[str, AirbyteConfigTemplate]
    _airbyte = None

    def __init__(self):
        self.configs = {}
        self._available_connectors = None

    @property
    def airbyte(self):
        """Lazy load airbyte module when needed."""
        if self._airbyte is None:
            self._airbyte = LazyAirbyteImporter.get_airbyte()
        return self._airbyte

    @property
    def available_connectors(self) -> List[str]:
        """Names of all 'source-' connectors with that prefix removed.

        The list is computed on first access and cached afterwards.
        """
        if self._available_connectors is None:
            self._available_connectors = [
                # Strip only the leading prefix; str.replace would also
                # remove any later 'source-' occurrence inside the name.
                c.removeprefix('source-')
                for c in self.airbyte.get_available_connectors()
                if c.startswith('source-')
            ]
        return self._available_connectors

    @property
    def available_configs(self) -> List[str]:
        """Names of the config templates currently cached."""
        return list(self.configs)

    def get_config(self, config_name: str) -> AirbyteConfigTemplate:
        """Get configuration for a specific source.

        On a cache miss the source is fetched via ``airbyte.get_source`` and
        a template built from its ``config_spec`` and ``docs_url`` is cached.

        Raises:
            Exception: any error raised while fetching the source or
                building the template is logged and re-raised unchanged.
        """
        logger.info(f"Getting config for {config_name}")

        if config_name not in self.configs:
            try:
                source = self.airbyte.get_source(
                    name=config_name,
                    install_if_missing=True,
                    docker_image=True
                )
                logger.info(f"Got source {config_name}")

                self.configs[config_name] = AirbyteConfigTemplate(
                    config_spec=source.config_spec,
                    docs_url=source.docs_url,
                    source_name=config_name
                )
            except Exception as e:
                logger.error(f"Error getting config for {config_name}: {str(e)}")
                raise

        return self.configs[config_name]

    def get_available_streams(self, config_name: str, config_settings: Any) -> List[str]:
        """Get available streams for a specific configuration.

        The result is also stored on the cached template's
        ``available_streams`` attribute.

        Args:
            config_name: Name of a config already present in ``configs``.
            config_settings: Passed as ``config`` to ``airbyte.get_source``.

        Raises:
            ValueError: if ``config_name`` is not cached, or if the source
                reports no streams (empty or None).
            Exception: any other error is logged and re-raised unchanged.
        """
        if config_name not in self.configs:
            raise ValueError(f"Config {config_name} not found")

        logger.info(f'Getting available streams for {config_name}')

        try:
            source = self.airbyte.get_source(
                name=config_name,
                install_if_missing=True,
                config=config_settings,
                docker_image=True
            )
            streams = source.get_available_streams()
            # Truthiness covers both None and empty; the previous
            # `len(streams) == 0 or streams is None` raised TypeError on None.
            if not streams:
                raise ValueError(f"No streams found for {config_name}")
            self.configs[config_name].available_streams = streams
            return self.configs[config_name].available_streams

        except Exception as e:
            logger.error(f"Error getting streams for {config_name}: {str(e)}")
            raise
100
+
101
+
102
# Module-level shared AirbyteConfigHandler instance, created at import time.
airbyte_config_handler = AirbyteConfigHandler()
104
+
105
+
106
class AirbyteHandler:
    """Operate on one AirbyteConfig: update it, list its streams, save it."""
    config: AirbyteConfig

    def __init__(self, airbyte_config: AirbyteConfig):
        self.config = airbyte_config

    def set_airbyte_config(self, airbyte_config_in: AirbyteConfig) -> AirbyteConfig:
        """Copy the mapped spec and parsed config from the given object.

        Only ``mapped_config_spec`` and ``parsed_config`` are copied onto the
        held config; the object passed in is returned.
        """
        held = self.config
        held.mapped_config_spec = airbyte_config_in.mapped_config_spec
        held.parsed_config = airbyte_config_in.parsed_config
        return airbyte_config_in

    def get_available_streams(self) -> List[str]:
        """Return the stream names for the held configuration.

        If no template is cached for the full source name, it is fetched
        first through the shared config handler; a failure there is logged
        and re-raised.
        """
        full_name = self.config.full_source_name

        if not airbyte_config_handler.configs.get(full_name):
            logger.warning(
                f"Config {self.config.source_name} not found, trying to recreate the config"
            )
            try:
                airbyte_config_handler.get_config(full_name)
                logger.info(f"Config {self.config.source_name} recreated")
            except Exception as e:
                logger.error(f"Error recreating config: {str(e)}")
                raise

        spec = self.config.mapped_config_spec
        return airbyte_config_handler.get_available_streams(full_name, spec)

    def save_connection(self, connection_name: str) -> None:
        """Register the held configuration with the connection manager.

        The connection is grouped under the config's ``source_name`` and
        stored with type 'airbyte'.
        """
        group_name = self.config.source_name
        connection_manager.add_connection(
            group_name,
            connection_name=connection_name,
            connection=Connection(
                group=group_name,
                name=connection_name,
                config_setting=self.config,
                type='airbyte'
            )
        )
153
+
154
+
155
def airbyte_settings_from_config(airbyte_config: AirbyteConfig, flow_id: int, node_id: int|str) -> AirbyteSettings:
    """Build an AirbyteSettings object from an AirbyteConfig.

    When ``config_mode`` is 'key_vault' the configuration is taken from the
    connection stored under the config's source name and connection name;
    otherwise the config's own ``mapped_config_spec`` is used.
    """
    if airbyte_config.config_mode != 'key_vault':
        resolved_config = airbyte_config.mapped_config_spec
    else:
        stored = connection_manager.get_connection(
            connection_group=airbyte_config.source_name,
            connection_name=airbyte_config.connection_name
        )
        resolved_config = stored.config_setting.mapped_config_spec

    settings = AirbyteSettings(
        source_name=airbyte_config.full_source_name,
        stream=airbyte_config.selected_stream,
        config=resolved_config,
        flowfile_flow_id=flow_id,
        flowfile_node_id=node_id,
        version=airbyte_config.version
    )
    return settings