Flowfile 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of Flowfile might be problematic. Click here for more details.

Files changed (171) hide show
  1. build_backends/__init__.py +0 -0
  2. build_backends/main.py +313 -0
  3. build_backends/main_prd.py +202 -0
  4. flowfile/__init__.py +71 -0
  5. flowfile/__main__.py +24 -0
  6. flowfile-0.2.2.dist-info/LICENSE +21 -0
  7. flowfile-0.2.2.dist-info/METADATA +225 -0
  8. flowfile-0.2.2.dist-info/RECORD +171 -0
  9. flowfile-0.2.2.dist-info/WHEEL +4 -0
  10. flowfile-0.2.2.dist-info/entry_points.txt +9 -0
  11. flowfile_core/__init__.py +13 -0
  12. flowfile_core/auth/__init__.py +0 -0
  13. flowfile_core/auth/jwt.py +140 -0
  14. flowfile_core/auth/models.py +40 -0
  15. flowfile_core/auth/secrets.py +178 -0
  16. flowfile_core/configs/__init__.py +35 -0
  17. flowfile_core/configs/flow_logger.py +433 -0
  18. flowfile_core/configs/node_store/__init__.py +0 -0
  19. flowfile_core/configs/node_store/nodes.py +98 -0
  20. flowfile_core/configs/settings.py +120 -0
  21. flowfile_core/database/__init__.py +0 -0
  22. flowfile_core/database/connection.py +51 -0
  23. flowfile_core/database/init_db.py +45 -0
  24. flowfile_core/database/models.py +41 -0
  25. flowfile_core/fileExplorer/__init__.py +0 -0
  26. flowfile_core/fileExplorer/funcs.py +259 -0
  27. flowfile_core/fileExplorer/utils.py +53 -0
  28. flowfile_core/flowfile/FlowfileFlow.py +1403 -0
  29. flowfile_core/flowfile/__init__.py +0 -0
  30. flowfile_core/flowfile/_extensions/__init__.py +0 -0
  31. flowfile_core/flowfile/_extensions/real_time_interface.py +51 -0
  32. flowfile_core/flowfile/analytics/__init__.py +0 -0
  33. flowfile_core/flowfile/analytics/analytics_processor.py +123 -0
  34. flowfile_core/flowfile/analytics/graphic_walker.py +60 -0
  35. flowfile_core/flowfile/analytics/schemas/__init__.py +0 -0
  36. flowfile_core/flowfile/analytics/utils.py +9 -0
  37. flowfile_core/flowfile/connection_manager/__init__.py +3 -0
  38. flowfile_core/flowfile/connection_manager/_connection_manager.py +48 -0
  39. flowfile_core/flowfile/connection_manager/models.py +10 -0
  40. flowfile_core/flowfile/database_connection_manager/__init__.py +0 -0
  41. flowfile_core/flowfile/database_connection_manager/db_connections.py +139 -0
  42. flowfile_core/flowfile/database_connection_manager/models.py +15 -0
  43. flowfile_core/flowfile/extensions.py +36 -0
  44. flowfile_core/flowfile/flow_data_engine/__init__.py +0 -0
  45. flowfile_core/flowfile/flow_data_engine/create/__init__.py +0 -0
  46. flowfile_core/flowfile/flow_data_engine/create/funcs.py +146 -0
  47. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +1521 -0
  48. flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py +0 -0
  49. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +144 -0
  50. flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py +24 -0
  51. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +36 -0
  52. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py +0 -0
  53. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +38 -0
  54. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py +90 -0
  55. flowfile_core/flowfile/flow_data_engine/join/__init__.py +1 -0
  56. flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +54 -0
  57. flowfile_core/flowfile/flow_data_engine/pivot_table.py +20 -0
  58. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +249 -0
  59. flowfile_core/flowfile/flow_data_engine/read_excel_tables.py +143 -0
  60. flowfile_core/flowfile/flow_data_engine/sample_data.py +120 -0
  61. flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py +1 -0
  62. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +36 -0
  63. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +503 -0
  64. flowfile_core/flowfile/flow_data_engine/threaded_processes.py +27 -0
  65. flowfile_core/flowfile/flow_data_engine/types.py +0 -0
  66. flowfile_core/flowfile/flow_data_engine/utils.py +212 -0
  67. flowfile_core/flowfile/flow_node/__init__.py +0 -0
  68. flowfile_core/flowfile/flow_node/flow_node.py +771 -0
  69. flowfile_core/flowfile/flow_node/models.py +111 -0
  70. flowfile_core/flowfile/flow_node/schema_callback.py +70 -0
  71. flowfile_core/flowfile/handler.py +123 -0
  72. flowfile_core/flowfile/manage/__init__.py +0 -0
  73. flowfile_core/flowfile/manage/compatibility_enhancements.py +70 -0
  74. flowfile_core/flowfile/manage/manage_flowfile.py +0 -0
  75. flowfile_core/flowfile/manage/open_flowfile.py +136 -0
  76. flowfile_core/flowfile/setting_generator/__init__.py +2 -0
  77. flowfile_core/flowfile/setting_generator/setting_generator.py +41 -0
  78. flowfile_core/flowfile/setting_generator/settings.py +176 -0
  79. flowfile_core/flowfile/sources/__init__.py +0 -0
  80. flowfile_core/flowfile/sources/external_sources/__init__.py +3 -0
  81. flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py +0 -0
  82. flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py +159 -0
  83. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +172 -0
  84. flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py +173 -0
  85. flowfile_core/flowfile/sources/external_sources/base_class.py +39 -0
  86. flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py +2 -0
  87. flowfile_core/flowfile/sources/external_sources/custom_external_sources/exchange_rate.py +0 -0
  88. flowfile_core/flowfile/sources/external_sources/custom_external_sources/external_source.py +100 -0
  89. flowfile_core/flowfile/sources/external_sources/custom_external_sources/google_sheet.py +74 -0
  90. flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_users.py +29 -0
  91. flowfile_core/flowfile/sources/external_sources/factory.py +22 -0
  92. flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py +0 -0
  93. flowfile_core/flowfile/sources/external_sources/sql_source/models.py +90 -0
  94. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +328 -0
  95. flowfile_core/flowfile/sources/external_sources/sql_source/utils.py +379 -0
  96. flowfile_core/flowfile/util/__init__.py +0 -0
  97. flowfile_core/flowfile/util/calculate_layout.py +137 -0
  98. flowfile_core/flowfile/util/execution_orderer.py +141 -0
  99. flowfile_core/flowfile/utils.py +106 -0
  100. flowfile_core/main.py +138 -0
  101. flowfile_core/routes/__init__.py +0 -0
  102. flowfile_core/routes/auth.py +34 -0
  103. flowfile_core/routes/logs.py +163 -0
  104. flowfile_core/routes/public.py +10 -0
  105. flowfile_core/routes/routes.py +601 -0
  106. flowfile_core/routes/secrets.py +85 -0
  107. flowfile_core/run_lock.py +11 -0
  108. flowfile_core/schemas/__init__.py +0 -0
  109. flowfile_core/schemas/analysis_schemas/__init__.py +0 -0
  110. flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py +118 -0
  111. flowfile_core/schemas/defaults.py +9 -0
  112. flowfile_core/schemas/external_sources/__init__.py +0 -0
  113. flowfile_core/schemas/external_sources/airbyte_schemas.py +20 -0
  114. flowfile_core/schemas/input_schema.py +477 -0
  115. flowfile_core/schemas/models.py +193 -0
  116. flowfile_core/schemas/output_model.py +115 -0
  117. flowfile_core/schemas/schemas.py +106 -0
  118. flowfile_core/schemas/transform_schema.py +569 -0
  119. flowfile_core/secrets/__init__.py +0 -0
  120. flowfile_core/secrets/secrets.py +64 -0
  121. flowfile_core/utils/__init__.py +0 -0
  122. flowfile_core/utils/arrow_reader.py +247 -0
  123. flowfile_core/utils/excel_file_manager.py +18 -0
  124. flowfile_core/utils/fileManager.py +45 -0
  125. flowfile_core/utils/fl_executor.py +38 -0
  126. flowfile_core/utils/utils.py +8 -0
  127. flowfile_frame/__init__.py +56 -0
  128. flowfile_frame/__main__.py +12 -0
  129. flowfile_frame/adapters.py +17 -0
  130. flowfile_frame/expr.py +1163 -0
  131. flowfile_frame/flow_frame.py +2093 -0
  132. flowfile_frame/group_frame.py +199 -0
  133. flowfile_frame/join.py +75 -0
  134. flowfile_frame/selectors.py +242 -0
  135. flowfile_frame/utils.py +184 -0
  136. flowfile_worker/__init__.py +55 -0
  137. flowfile_worker/configs.py +95 -0
  138. flowfile_worker/create/__init__.py +37 -0
  139. flowfile_worker/create/funcs.py +146 -0
  140. flowfile_worker/create/models.py +86 -0
  141. flowfile_worker/create/pl_types.py +35 -0
  142. flowfile_worker/create/read_excel_tables.py +110 -0
  143. flowfile_worker/create/utils.py +84 -0
  144. flowfile_worker/external_sources/__init__.py +0 -0
  145. flowfile_worker/external_sources/airbyte_sources/__init__.py +0 -0
  146. flowfile_worker/external_sources/airbyte_sources/cache_manager.py +161 -0
  147. flowfile_worker/external_sources/airbyte_sources/main.py +89 -0
  148. flowfile_worker/external_sources/airbyte_sources/models.py +133 -0
  149. flowfile_worker/external_sources/airbyte_sources/settings.py +0 -0
  150. flowfile_worker/external_sources/sql_source/__init__.py +0 -0
  151. flowfile_worker/external_sources/sql_source/main.py +56 -0
  152. flowfile_worker/external_sources/sql_source/models.py +72 -0
  153. flowfile_worker/flow_logger.py +58 -0
  154. flowfile_worker/funcs.py +327 -0
  155. flowfile_worker/main.py +108 -0
  156. flowfile_worker/models.py +95 -0
  157. flowfile_worker/polars_fuzzy_match/__init__.py +0 -0
  158. flowfile_worker/polars_fuzzy_match/matcher.py +435 -0
  159. flowfile_worker/polars_fuzzy_match/models.py +36 -0
  160. flowfile_worker/polars_fuzzy_match/pre_process.py +213 -0
  161. flowfile_worker/polars_fuzzy_match/process.py +86 -0
  162. flowfile_worker/polars_fuzzy_match/utils.py +50 -0
  163. flowfile_worker/process_manager.py +36 -0
  164. flowfile_worker/routes.py +440 -0
  165. flowfile_worker/secrets.py +148 -0
  166. flowfile_worker/spawner.py +187 -0
  167. flowfile_worker/utils.py +25 -0
  168. test_utils/__init__.py +3 -0
  169. test_utils/postgres/__init__.py +1 -0
  170. test_utils/postgres/commands.py +109 -0
  171. test_utils/postgres/fixtures.py +417 -0
@@ -0,0 +1,212 @@
1
+ import polars as pl
2
+ from flowfile_core.configs.settings import AVAILABLE_RAM, WORKER_URL
3
+ from flowfile_core.configs import logger
4
+ from flowfile_core.flowfile.flow_data_engine.subprocess_operations import ExternalDfFetcher
5
+ from flowfile_core.flowfile.flow_data_engine.subprocess_operations import Status
6
+ import os
7
+ from typing import List, Dict, Iterable, Callable, Any
8
+ from itertools import chain
9
+ import requests
10
+ from base64 import encodebytes
11
+
12
+
13
def convert_to_string(v):
    """
    Return the ``str()`` representation of a value, or None when it cannot be built.

    :param v: any value
    :return: ``str(v)``, or None if ``str(v)`` raises an ordinary exception
    """
    try:
        return str(v)
    except Exception:
        # Only ordinary errors are swallowed; KeyboardInterrupt / SystemExit still propagate.
        return None
18
+
19
+
20
def standardize_col_dtype(vals):
    """
    Make the values of one column type-consistent.

    ``None`` entries are treated as missing values and do not count as a type
    of their own. The values are returned untouched when the remaining entries
    share a single type, or when they are exactly a mix of ``int`` and
    ``float``. In every other case each non-missing value is converted with
    ``convert_to_string`` and missing values stay ``None``.

    :param vals: values of a single column (iterated more than once, so it must
        not be a one-shot iterator)
    :return: the original ``vals`` object, or a new list of strings / None
    """
    types = {type(val) for val in vals if val is not None}
    # Zero types means the column is empty or all-None: nothing to standardize.
    if len(types) <= 1 or types == {int, float}:
        return vals
    return [None if v is None else convert_to_string(v) for v in vals]
28
+
29
+
30
def get_data_type(vals: Iterable[Any]):
    """
    Name the Python type that describes all given values.

    :param vals: values to inspect
    :return: ``'float'`` when the values are exactly a mix of int and float,
        the type's ``__name__`` when all values share one type, otherwise ``'str'``
        (this includes an empty input)
    """
    seen_types = {type(item) for item in vals}
    if seen_types == {int, float}:
        return 'float'
    if len(seen_types) == 1:
        (only_type,) = seen_types
        return only_type.__name__
    return 'str'
38
+
39
+
40
def ensure_similarity_dicts(datas: List[Dict], respect_order: bool = True):
    """
    Give every record the same set of keys.

    The union of all keys over ``datas`` is collected, and each record is
    rebuilt with every one of those keys; keys a record does not have get the
    value ``None``.

    :param datas: list of records (dicts); it is iterated twice
    :param respect_order: when True the keys keep the order in which they are
        first encountered; when False the keys are gathered in a set, so their
        order is not defined
    :return: a new list with one new dict per input record
    """
    key_views = (record.keys() for record in datas)
    if respect_order:
        # dict.fromkeys keeps only the first occurrence of each key, in encounter order.
        columns = list(dict.fromkeys(chain.from_iterable(key_views)))
    else:
        columns = set(chain.from_iterable(key_views))
    return [{column: record.get(column) for column in columns} for record in datas]
60
+
61
+
62
def calculate_schema(lf: pl.LazyFrame) -> List[Dict]:
    """
    Request schema statistics for a lazy frame through ``ExternalDfFetcher``.

    The fetcher is created with ``operation_type='calculate_schema'`` and its
    result is read with ``get_result()``. In every returned entry the
    ``'pl_datatype'`` value (a name) is replaced by the attribute of that name
    on ``pl.datatypes``.

    :param lf: lazy frame to describe
    :return: the list of per-column dicts returned by the fetcher, updated in place
    """
    fetcher = ExternalDfFetcher(lf=lf, operation_type='calculate_schema', wait_on_completion=False,
                                flow_id=-1, node_id=-1)
    column_stats: List[Dict] = fetcher.get_result()
    for column_stat in column_stats:
        dtype_name = column_stat['pl_datatype']
        # Swap the datatype name for the actual polars datatype object.
        column_stat['pl_datatype'] = getattr(pl.datatypes, dtype_name)
    return column_stats
68
+
69
+
70
def write_polars_frame(_df: pl.LazyFrame | pl.DataFrame, path: str, data_type: str = 'parquet',
                       estimated_size: int = 0):
    """
    Write a polars frame to ``path`` using the ``sink_<data_type>`` or ``write_<data_type>`` method.

    A lazy frame is first tried with its streaming ``sink_*`` method. If that
    method does not exist or fails, the frame is collected and written with
    ``write_*``. A lazy frame whose ``estimated_size`` is judged to fit in
    memory is collected up front and written with ``write_*`` directly.

    :param _df: frame to write
    :param path: destination path
    :param data_type: suffix of the polars method to use, e.g. ``'parquet'``
    :param estimated_size: estimated size of the data; 0 means unknown
    :return: True when the data was written, False when the in-memory write failed
    :raises Exception: errors raised while collecting a lazy frame are not caught
    """
    logger.info('Caching data frame')
    is_lazy = isinstance(_df, pl.LazyFrame)
    if is_lazy and estimated_size > 0:
        # NOTE(review): the divisors mix 1024 and 1000; presumably this converts
        # bytes into the unit AVAILABLE_RAM is expressed in - confirm.
        fit_memory = estimated_size / 1024 / 1000 / 1000 < AVAILABLE_RAM
        if fit_memory:
            _df = _df.collect()
            is_lazy = False

    if is_lazy:
        logger.info("Writing in memory efficient mode")
        sink_method = getattr(_df, 'sink_' + data_type, None)
        if sink_method is not None:
            try:
                sink_method(path)
                return True
            except Exception as e:
                # Best effort: not every query can be streamed, so fall back below.
                logger.warning(f'Streaming write failed, falling back to in-memory write: {e}')
        _df = _df.collect()

    try:
        write_method = getattr(_df, 'write_' + data_type)
        write_method(path)
        return True
    except Exception as e:
        logger.error(f'Could not write data frame to {path}: {e}')
        return False
97
+
98
+
99
def collect(df: pl.LazyFrame, streamable: bool = True):
    """
    Collect a lazy frame, preferring the streaming engine when allowed.

    :param df: lazy frame to collect
    :param streamable: when True, try ``engine="streaming"`` first and fall
        back to ``engine="auto"`` if that raises; when False, use
        ``engine="auto"`` directly
    :return: the result of ``df.collect(...)``
    :raises Exception: whatever ``df.collect(engine="auto")`` raises
    """
    if not streamable:
        # Nothing to fall back to: retrying the identical call would only repeat the work.
        return df.collect(engine="auto")
    try:
        return df.collect(engine="streaming")
    except Exception:
        return df.collect(engine="auto")
104
+
105
+
106
def cache_polars_frame_to_temp(_df: pl.LazyFrame | pl.DataFrame, tempdir: str = None) -> pl.LazyFrame:
    """
    Write a frame to a file in ``tempdir`` and return a lazy frame read back from it.

    The file is named ``fl_file_<id(_df)>`` and is written through
    ``write_polars_frame`` with its default data type.

    :param _df: frame to cache
    :param tempdir: directory for the cache file; when None the system
        temporary directory is used
    :return: lazy frame over the data read back from the cache file
    :raises Exception: when the frame could not be written
    """
    import tempfile

    base_dir = tempdir if tempdir is not None else tempfile.gettempdir()
    # os.path.join uses the platform separator instead of a hard-coded backslash.
    path = os.path.join(base_dir, f'fl_file_{id(_df)}')
    if not write_polars_frame(_df, path):
        raise Exception('Could not cache the data')
    return pl.read_parquet(path).lazy()
114
+
115
+
116
def define_pl_col_transformation(col_name: str, col_type: pl.DataType) -> pl.Expr:
    """
    Build the expression that converts a column to the requested type.

    :param col_name: name of the column to convert
    :param col_type: target polars data type
    :return: a non-strict string-to-datetime parse for ``pl.Datetime``, a
        non-strict string-to-date parse for ``pl.Date``, and a non-strict cast
        for every other type
    """
    column = pl.col(col_name)
    if col_type == pl.Datetime:
        return column.str.to_datetime(strict=False)
    if col_type == pl.Date:
        return column.str.to_date(strict=False)
    return column.cast(col_type, strict=False)
123
+
124
+
125
def execute_write_method(write_method: Callable, path: str, data_type: str = None, sheet_name: str = None,
                         delimiter: str = None,
                         write_mode: str = 'create'):
    """
    Call a polars write/sink method with the arguments that fit the output format.

    :param write_method: bound ``write_*`` / ``sink_*`` method of a frame
    :param path: destination path
    :param data_type: ``'excel'``, ``'csv'`` or ``'parquet'``; any other value
        is logged and nothing is written
    :param sheet_name: worksheet name, used for excel only
    :param delimiter: column separator, used for csv only; when None the
        writer's own default separator is used
    :param write_mode: ``'append'`` appends to an existing csv file; ignored
        for the other formats
    """
    if data_type == 'excel':
        logger.info('Writing as excel file')
        write_method(path, worksheet=sheet_name)
    elif data_type == 'csv':
        logger.info('Writing as csv file')
        csv_options = {'quote_style': 'always'}
        if delimiter is not None:
            csv_options['separator'] = delimiter
        # The target is passed positionally because the first parameter is named
        # differently across writers (``file`` for write_csv, ``path`` for sink_csv).
        if write_mode == 'append':
            # NOTE(review): the header row is presumably written again on every
            # append - confirm whether that is intended.
            with open(path, 'ab') as f:
                write_method(f, **csv_options)
        else:
            write_method(path, **csv_options)
    elif data_type == 'parquet':
        logger.info('Writing as parquet file')
        write_method(path)
    else:
        logger.warning(f'Unsupported data type {data_type!r}, nothing was written')
141
+
142
+
143
def write_output(_df: pl.LazyFrame,
                 data_type: str, path: str, write_mode: str, sheet_name: str = None,
                 delimiter: str = None, flow_id: int = -1, node_id: int | str = -1) -> Status:
    """
    Send a lazy frame to the worker's ``/write_results/`` endpoint.

    The frame is serialized, base64-encoded and posted together with the
    output settings.

    :param _df: lazy frame to write
    :param data_type: output format forwarded to the worker
    :param path: destination path forwarded to the worker
    :param write_mode: write mode forwarded to the worker
    :param sheet_name: worksheet name forwarded to the worker
    :param delimiter: delimiter forwarded to the worker
    :param flow_id: sent as ``flowfile_flow_id``
    :param node_id: sent as ``flowfile_node_id``
    :return: ``Status`` built from the JSON body of the response
    :raises Exception: when the worker answers with a non-OK status
    """
    serializable_df = _df.serialize()
    payload = {'operation': encodebytes(serializable_df).decode(),
               'data_type': data_type,
               'path': path,
               'write_mode': write_mode,
               'sheet_name': sheet_name,
               'delimiter': delimiter,
               'flowfile_node_id': node_id,
               'flowfile_flow_id': flow_id}
    r = requests.post(f'{WORKER_URL}/write_results/', json=payload)
    if not r.ok:
        # This call writes results; the old message wrongly spoke of caching.
        raise Exception(f'Could not write the data, {r.text}')
    return Status(**r.json())
160
+
161
+
162
def local_write_output(_df: pl.LazyFrame | pl.DataFrame, data_type: str, path: str, write_mode: str,
                       sheet_name: str = None, delimiter: str = None, flow_id: int = -1, node_id: int | str = -1):
    """
    Write a frame to ``path`` in the current process.

    Nothing is written when ``path`` already exists and ``write_mode`` is
    ``'create'``. A lazy frame is written with its ``sink_<data_type>`` method
    when it has one; otherwise it is collected and written with
    ``write_<data_type>``. The actual call is delegated to
    ``execute_write_method``.

    :param _df: frame to write
    :param data_type: suffix of the polars write/sink method, e.g. ``'csv'``
    :param path: destination path
    :param write_mode: write mode; ``'create'`` never overwrites an existing file
    :param sheet_name: passed on to ``execute_write_method``
    :param delimiter: passed on to ``execute_write_method``
    :param flow_id: accepted but not used in this function
    :param node_id: accepted but not used in this function
    :return: None
    """
    sink_name = 'sink_' + data_type
    write_name = 'write_' + data_type
    lazy_input = isinstance(_df, pl.LazyFrame)
    can_sink = hasattr(_df, sink_name)

    if write_mode == 'create' and os.path.exists(path):
        return None

    if lazy_input and can_sink:
        writer = getattr(_df, sink_name)
    else:
        # No streaming writer available: materialize first when needed.
        if lazy_input:
            _df = _df.collect()
        writer = getattr(_df, write_name)

    execute_write_method(writer, path=path, data_type=data_type, sheet_name=sheet_name,
                         delimiter=delimiter, write_mode=write_mode)
180
+
181
+
182
def create_pl_df_type_save(raw_data: Iterable[Iterable], orient: str = 'row') -> pl.DataFrame:
    """
    Build a polars DataFrame after making every column type-consistent.

    With ``orient='row'`` the input is transposed so that it is handled column
    by column; any other value leaves the input as it is, i.e. it is treated as
    a sequence of columns. Each column is passed through
    ``standardize_col_dtype`` before the frame is created.

    :param raw_data: iterables with values
    :param orient: ``'row'`` when ``raw_data`` holds rows, otherwise columns
    :return: polars dataframe
    """
    columns = zip(*raw_data) if orient == 'row' else raw_data
    standardized = [standardize_col_dtype(column) for column in columns]
    return pl.DataFrame(standardized, orient='col')
196
+
197
+
198
def find_first_positions(lst: List[str]) -> Dict[str, int]:
    """
    Map every distinct value to the index of its first occurrence.

    :param lst: values to index
    :return: dict of value -> first index, in order of first appearance
    """
    positions: Dict[str, int] = {}
    for index, item in enumerate(lst):
        # setdefault only stores the index the first time an item is seen.
        positions.setdefault(item, index)
    return positions
204
+
205
+
206
def match_order(l: List[str], ref: List[str]) -> List[str]:
    """
    Sort the values of ``l`` by the position of their first occurrence in ``ref``.

    Values that do not occur in ``ref`` are placed at the end. Values with the
    same position are ordered by their own value.

    :param l: values to reorder
    :param ref: reference list that defines the order
    :return: new list with the values of ``l`` in reference order
    """
    rank: Dict[str, int] = {}
    for index, name in enumerate(ref):
        rank.setdefault(name, index)
    missing = float('inf')
    return sorted(l, key=lambda value: (rank.get(value, missing), value))
File without changes