Flowfile 0.3.2__py3-none-any.whl → 0.3.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. flowfile/__init__.py +3 -2
  2. flowfile/web/__init__.py +3 -0
  3. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/METADATA +4 -3
  4. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/RECORD +46 -35
  5. flowfile_core/configs/__init__.py +15 -4
  6. flowfile_core/configs/settings.py +5 -3
  7. flowfile_core/configs/utils.py +18 -0
  8. flowfile_core/flowfile/FlowfileFlow.py +13 -18
  9. flowfile_core/flowfile/database_connection_manager/db_connections.py +1 -1
  10. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +54 -17
  11. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +42 -9
  12. flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py +42 -3
  13. flowfile_core/flowfile/flow_data_engine/polars_code_parser.py +2 -1
  14. flowfile_core/flowfile/flow_data_engine/sample_data.py +25 -7
  15. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +4 -3
  16. flowfile_core/flowfile/flow_data_engine/utils.py +1 -0
  17. flowfile_core/flowfile/flow_node/flow_node.py +2 -1
  18. flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py +2 -2
  19. flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py +1 -1
  20. flowfile_core/flowfile/utils.py +34 -3
  21. flowfile_core/main.py +2 -3
  22. flowfile_core/routes/secrets.py +1 -1
  23. flowfile_core/schemas/input_schema.py +10 -4
  24. flowfile_core/schemas/transform_schema.py +25 -47
  25. flowfile_frame/__init__.py +11 -4
  26. flowfile_frame/adding_expr.py +280 -0
  27. flowfile_frame/config.py +9 -0
  28. flowfile_frame/expr.py +301 -83
  29. flowfile_frame/expr.pyi +2174 -0
  30. flowfile_frame/expr_name.py +258 -0
  31. flowfile_frame/flow_frame.py +584 -1002
  32. flowfile_frame/flow_frame.pyi +368 -0
  33. flowfile_frame/flow_frame_methods.py +617 -0
  34. flowfile_frame/group_frame.py +89 -42
  35. flowfile_frame/join.py +1 -2
  36. flowfile_frame/lazy.py +704 -0
  37. flowfile_frame/lazy_methods.py +201 -0
  38. flowfile_frame/list_name_space.py +324 -0
  39. flowfile_frame/selectors.py +3 -0
  40. flowfile_frame/series.py +70 -0
  41. flowfile_frame/utils.py +80 -4
  42. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/LICENSE +0 -0
  43. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/WHEEL +0 -0
  44. {flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/entry_points.txt +0 -0
  45. /flowfile_core/{secrets → secret_manager}/__init__.py +0 -0
  46. /flowfile_core/{secrets/secrets.py → secret_manager/secret_manager.py} +0 -0
flowfile/__init__.py CHANGED
@@ -7,7 +7,7 @@ This package ties together the FlowFile ecosystem components:
7
7
  - flowfile_worker: Computation engine
8
8
  """
9
9
 
10
- __version__ = "0.3.1"
10
+ __version__ = "0.3.3.1"
11
11
 
12
12
  import os
13
13
  import logging
@@ -18,8 +18,9 @@ os.environ['SINGLE_FILE_MODE'] = "1"
18
18
  from flowfile.web import start_server as start_web_ui
19
19
  from flowfile.api import open_graph_in_editor
20
20
  from flowfile_frame.flow_frame import (
21
- FlowFrame, read_csv, read_parquet, from_dict, concat
21
+ FlowFrame
22
22
  )
23
+ from flowfile_frame import read_csv, read_parquet, from_dict, concat
23
24
  from flowfile_frame.expr import (
24
25
  col, lit, column, cum_count, len,
25
26
  sum, min, max, mean, count, when
flowfile/web/__init__.py CHANGED
@@ -135,6 +135,9 @@ def start_server(host="127.0.0.1", port=63578, open_browser=True):
135
135
 
136
136
  # Import core app
137
137
  from flowfile_core.main import run, app as core_app
138
+ from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
139
+
140
+ OFFLOAD_TO_WORKER.value = True
138
141
 
139
142
  # Extend the core app with web UI routes and worker functionality
140
143
  extend_app(core_app)
{flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: Flowfile
3
- Version: 0.3.2
3
+ Version: 0.3.3.1
4
4
  Summary: Project combining flowfile core (backend) and flowfile_worker (compute offloader) and flowfile_frame (api)
5
5
  Author: Edward van Eechoud
6
6
  Author-email: evaneechoud@gmail.com
@@ -23,10 +23,11 @@ Requires-Dist: methodtools (>=0.4.7,<0.5.0)
23
23
  Requires-Dist: openpyxl (>=3.1.2,<3.2.0)
24
24
  Requires-Dist: passlib (>=1.7.4,<1.8.0)
25
25
  Requires-Dist: pendulum (==2.1.2) ; python_version < "3.12"
26
- Requires-Dist: polars (>1.8.2,<=1.25.2)
26
+ Requires-Dist: polars (>=1.8.2,<2.0.0) ; sys_platform != "win32"
27
+ Requires-Dist: polars (>=1.8.2,<=1.25.2) ; sys_platform == "win32"
27
28
  Requires-Dist: polars-distance (>=0.4.3,<0.5.0)
28
29
  Requires-Dist: polars-ds (>=0.6.0)
29
- Requires-Dist: polars-expr-transformer (>0.4.7.0)
30
+ Requires-Dist: polars-expr-transformer (>=0.4.9.0)
30
31
  Requires-Dist: polars-grouper (>=0.3.0,<0.4.0)
31
32
  Requires-Dist: polars_simed (>=0.3.4,<0.4.0)
32
33
  Requires-Dist: pyairbyte-flowfile (==0.20.2)
{flowfile-0.3.2.dist-info → flowfile-0.3.3.1.dist-info}/RECORD CHANGED
@@ -1,11 +1,11 @@
1
1
  build_backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  build_backends/main.py,sha256=hLmfqTeHLSTiwwZ5mUuoLQgtO40Igvl1_4NbnvzWSgI,9912
3
3
  build_backends/main_prd.py,sha256=JR2tYCMWM5ThooQjv5pw6nwVKMQjgsiHgKMhYn9NXWI,6927
4
- flowfile/__init__.py,sha256=B1vnUboOki3pP3BAmoQ0j62nEiB51X9kc9N8Qu7bgcg,2419
4
+ flowfile/__init__.py,sha256=e-OVTnJOMsnqBbMt5WTGQmAYjHBt2YA6ODlkoIpCj20,2447
5
5
  flowfile/__main__.py,sha256=hAMeyORHhLnw1lIXe6-EYDwgUi2odW0Rb4eDkcNtBiM,2612
6
6
  flowfile/api.py,sha256=grbo8pUF8NNEGoqDQX-BTjWXC61mSIgJoD-w-GVeD4Y,15298
7
7
  flowfile/readme.md,sha256=n93Cpp9DPIBfe2jxKcUVokl5PTobxnE1mWHfRD018xE,4137
8
- flowfile/web/__init__.py,sha256=bViGSO1AmLSjCM4nnQVFaYUTFHYKj0XK4VjscGwZGTs,5462
8
+ flowfile/web/__init__.py,sha256=w-nioI8PZzMIEU-wZ7fNDN8Q4sl8aNEMFRgDVU95p9o,5563
9
9
  flowfile/web/static/assets/AirbyteReader-1ac35765.css,sha256=GsNXZRBzBqcgSHWYHFfpQjYnQ1G90hCaWgThLCG80jI,6260
10
10
  flowfile/web/static/assets/AirbyteReader-2b1cf2d8.js,sha256=b9AhzC5wtvBqx0hRqHsZde55lh5sSsgT3vfZwDo84wY,38778
11
11
  flowfile/web/static/assets/CrossJoin-41efa4cb.css,sha256=Qe-ky2QI7rYfXMKV-bCB5HP0OJ6uBU74g9EEmcpXTlc,2838
@@ -140,11 +140,12 @@ flowfile_core/auth/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuF
140
140
  flowfile_core/auth/jwt.py,sha256=-j_eZd5z2ABP0KHo5PmmsuCyJUsvAFFlnANJiRCnNx4,4828
141
141
  flowfile_core/auth/models.py,sha256=ilQqy7ief1mwAP9xiNWbftCR9yyccECMa0Qsnnwax_g,648
142
142
  flowfile_core/auth/secrets.py,sha256=5TixLt9I64pR0OT4AoqgIzAmjCQNoGF4YPGuRhCWBH0,5840
143
- flowfile_core/configs/__init__.py,sha256=RSoO4aiWY-FzOUKLPY-mbwHS_YcbbNw5tNdVzrwi_ek,977
143
+ flowfile_core/configs/__init__.py,sha256=W72OpTyVRF1kJyLPQFXT9tB6FoX-Bzvi5rTUusERfKI,1318
144
144
  flowfile_core/configs/flow_logger.py,sha256=Pk1yhaC58jjISMrgwhBFIue9Qj5XfYo8NfOefqsR-gA,15822
145
145
  flowfile_core/configs/node_store/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
146
146
  flowfile_core/configs/node_store/nodes.py,sha256=mVjHyJzkxrmg0PSAMdJC8CGyqD57NyWVEC3rhhzXosU,5960
147
- flowfile_core/configs/settings.py,sha256=6QI6wOhAwvRYLd3MMP7w3g6wgyT4DeE-zq6-I39Touo,3073
147
+ flowfile_core/configs/settings.py,sha256=37SLuk2sL4E69SrYtN5ygJILIcc8CwAHLm5FiaTlC9Y,3065
148
+ flowfile_core/configs/utils.py,sha256=BjyJCfO4gR_n9Sbs6dPlFfSKwxfxd3aKZ9zUH0Xs5GY,474
148
149
  flowfile_core/database/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
149
150
  flowfile_core/database/connection.py,sha256=Ul1gYnYvwvlTSgxlJr5JexYCdgyZFduXioFjY_Lw2U0,1410
150
151
  flowfile_core/database/init_db.py,sha256=vKQi340caayTqc1tR0CJcd621i4afVc0BLpJO-t4CFk,1236
@@ -152,7 +153,7 @@ flowfile_core/database/models.py,sha256=SUrOgefmGY4lrN9Cv67TEjEOQtyKoYEpkjIVCWs7
152
153
  flowfile_core/fileExplorer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
153
154
  flowfile_core/fileExplorer/funcs.py,sha256=9o1Wko-wPpKNapRSvwsC-FmofJ6VFmBU5v2aQLRU3sA,9048
154
155
  flowfile_core/fileExplorer/utils.py,sha256=RrXj1NdkB2K3v5khvXpEp-_OnN68k9Ex_9s7CDa3L5w,1703
155
- flowfile_core/flowfile/FlowfileFlow.py,sha256=3HvkfmN_aZCZ4JP9b1HL60Qf4QPmDLYj8dvi8d_JZ2M,72693
156
+ flowfile_core/flowfile/FlowfileFlow.py,sha256=pJTbAxwFbZ9zbdEyAM6u_BpCe47r_IZpar5Gh0MqLJw,72714
156
157
  flowfile_core/flowfile/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
157
158
  flowfile_core/flowfile/_extensions/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
158
159
  flowfile_core/flowfile/_extensions/real_time_interface.py,sha256=F9wGAFmu4gmC-svfwasDEaVYZBinBqYUZmCrEsWos44,1906
@@ -165,35 +166,35 @@ flowfile_core/flowfile/connection_manager/__init__.py,sha256=wLAGuQBA0lgN1tZleYZ
165
166
  flowfile_core/flowfile/connection_manager/_connection_manager.py,sha256=W9FWRAFUT1c2eHa2QhFpKNWA-Kps63o2vcGAH1zXSeo,2612
166
167
  flowfile_core/flowfile/connection_manager/models.py,sha256=o_2FK7aNdjMHfiGX7hIUz5uslfLar8KIcI_760WprzM,227
167
168
  flowfile_core/flowfile/database_connection_manager/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
168
- flowfile_core/flowfile/database_connection_manager/db_connections.py,sha256=omQjmvwhL97d4WAgIEVBHhldOG58QySE1tq7f-timKc,5210
169
+ flowfile_core/flowfile/database_connection_manager/db_connections.py,sha256=e0DpRZ-7wTKbqAs9icDkNwYdMxhAska0WNw1YRRhrPM,5224
169
170
  flowfile_core/flowfile/database_connection_manager/models.py,sha256=lVJSifqznQ8fKGWBEBCy_8JeXmdKF4pnAE5P5oXRrLM,379
170
171
  flowfile_core/flowfile/extensions.py,sha256=vVyM4sdUN5Eyez6IPt9poNWwvcJMJKwP4dLCq1ErQ5k,1795
171
172
  flowfile_core/flowfile/flow_data_engine/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
172
173
  flowfile_core/flowfile/flow_data_engine/create/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
173
174
  flowfile_core/flowfile/flow_data_engine/create/funcs.py,sha256=AC87vQHgna-stFJZPLWF8ErxFioxdCJAhVaXZGVOsY0,7317
174
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py,sha256=TY2QrFb_fFHksReXlLp4Tt0iqUjsKS6IVdz5z08bvhU,61682
175
+ flowfile_core/flowfile/flow_data_engine/flow_data_engine.py,sha256=61js7mznpHROSxM-DFVXoC8JVXqHxhWFlAATwaNK3DM,63806
175
176
  flowfile_core/flowfile/flow_data_engine/flow_file_column/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
176
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py,sha256=e438RcbN9msjgsyoXyqsRXp7hiPO5xdw-F7cRfSOBCw,5533
177
+ flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py,sha256=U7tLBGFngO0QeIJ5YyBt9UrGS8IhlclMfbIuaI_BfMo,7175
177
178
  flowfile_core/flowfile/flow_data_engine/flow_file_column/polars_type.py,sha256=xusyOLwSxevBk8-Uy9ZKISB_KOi0JeYfZ0wihcG-Qjk,530
178
- flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py,sha256=3AI4ZR4OhNZPaCHq646rRbu-2yLJzj_4gEjGu_5mTzA,775
179
+ flowfile_core/flowfile/flow_data_engine/flow_file_column/utils.py,sha256=2KoODecEwNGSCLZLRLim3vAwVON0QZgv2m1gkGKHht0,1774
179
180
  flowfile_core/flowfile/flow_data_engine/fuzzy_matching/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
180
181
  flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py,sha256=67MxjjmOfNaDYYFELo-L-h7HNHWWiAd62VX-vDD7Sqs,1873
181
182
  flowfile_core/flowfile/flow_data_engine/fuzzy_matching/settings_validator.py,sha256=SjIw-dV8SQUClxyCCO-6i-HC3avPd3yzQgLFO0L04nI,4550
182
183
  flowfile_core/flowfile/flow_data_engine/join/__init__.py,sha256=q4K5ZjGmFmGcuK545PKD1y_8WhyPjmEIyfeIkZ3WboM,75
183
184
  flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py,sha256=TfjYEIu2G9I-N6NuXVNsQWbj57_n4WbZO2kphZcHOmc,2245
184
185
  flowfile_core/flowfile/flow_data_engine/pivot_table.py,sha256=seqJqbrhwK4gkO-EQ3MSQWseh5HR3D_slQtq6pjT_pw,366
185
- flowfile_core/flowfile/flow_data_engine/polars_code_parser.py,sha256=rA9X7WZYk1lRvT8YAtVfAIGdNqIsSj21RuD5rY1xqE8,9545
186
+ flowfile_core/flowfile/flow_data_engine/polars_code_parser.py,sha256=R3ZNe7UlkYxWUkT8Sppo8NUCobn0N2Iq610wdmrS2dA,9598
186
187
  flowfile_core/flowfile/flow_data_engine/read_excel_tables.py,sha256=q7TccqyToowJEOU4j2tY40HshXYkDACkYE95bNqoiw8,5937
187
- flowfile_core/flowfile/flow_data_engine/sample_data.py,sha256=Y_9PuM5gBsb3bIrN-Cz70BNQlXuj88A_AmejYj01wys,4451
188
+ flowfile_core/flowfile/flow_data_engine/sample_data.py,sha256=7r-SJBrgtatgy0r0vrd0HQAw-Su2GbIKP4TPzHRLDtk,5026
188
189
  flowfile_core/flowfile/flow_data_engine/subprocess_operations/__init__.py,sha256=04ZpOvZ6gF42vQ02a0Lim9w5EP7-xu2l_S2WrSoll-g,97
189
190
  flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py,sha256=0ZN8HfpTfXc5LxXjDc2CCl66xNpxf6n_6dwExIxWbjM,1079
190
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py,sha256=jPo9roN3_IikoRMSlDFK6KTshUwpRT6pqkQfFOXADvU,19962
191
+ flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py,sha256=-JvAXS6OwySY-Zke0mCIzFj0hNhaUJFr9I5qdwny7yM,20045
191
192
  flowfile_core/flowfile/flow_data_engine/threaded_processes.py,sha256=15IPoqIoCfkPMb648o9hnOzNnZINhYQxJi5hNfQuRwE,1311
192
193
  flowfile_core/flowfile/flow_data_engine/types.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
193
- flowfile_core/flowfile/flow_data_engine/utils.py,sha256=JKOVKU-zatSk2oWXU8foR30MXooDJg4cODrw8c1uJqQ,7596
194
+ flowfile_core/flowfile/flow_data_engine/utils.py,sha256=_LmTFZMwt03ZYwYgpPaRxe5pP-lg9k4UGBxLFYN2JPU,7597
194
195
  flowfile_core/flowfile/flow_graph_utils.py,sha256=7b8kHAdtIlohiaelXqicS4Bi4bp8GdIBH-1fSYWY5XI,10107
195
196
  flowfile_core/flowfile/flow_node/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
196
- flowfile_core/flowfile/flow_node/flow_node.py,sha256=sKFcSmxyBRUzCD_7ZJ9w1tEZ_odtS1WBI05frnpVl9o,35073
197
+ flowfile_core/flowfile/flow_node/flow_node.py,sha256=sC0zPXsy25J5ll3qEJz-wJOe03dsRcuY-MOXYsww-fI,35081
197
198
  flowfile_core/flowfile/flow_node/models.py,sha256=jD2yUbrZYXy48nsUppuHUm8xqnrozhT37_oq9TBSYTw,4268
198
199
  flowfile_core/flowfile/flow_node/schema_callback.py,sha256=R9tQh20zC3Z492ne8OtsSpp9_jmDIAfyoX4YqB1s_2M,2288
199
200
  flowfile_core/flowfile/handler.py,sha256=wBeZmREY57ahuITLFTZ7FATIGXHK-3bQHyyM3eA-_s8,4105
@@ -208,7 +209,7 @@ flowfile_core/flowfile/sources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5
208
209
  flowfile_core/flowfile/sources/external_sources/__init__.py,sha256=bb9QikXEhkP0rdH68qRUJaPGA2lGFIauIWNlOBL-9uE,288
209
210
  flowfile_core/flowfile/sources/external_sources/airbyte_sources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
210
211
  flowfile_core/flowfile/sources/external_sources/airbyte_sources/airbyte.py,sha256=OBiy2G2AtPBlVhirZrRqTta013nAtQobpSKWEX0oCQU,5718
211
- flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py,sha256=iM156vf0rjtIHMLFgCjBJozbnXYOi5ohtQ66zxbumQ8,5566
212
+ flowfile_core/flowfile/sources/external_sources/airbyte_sources/models.py,sha256=17lX1B8gyEt57S0Di75CSHZc7WEeyl-_b20LO_zeayU,5576
212
213
  flowfile_core/flowfile/sources/external_sources/airbyte_sources/settings.py,sha256=3ZdWIGLCsbWmJb33wmmkQRIa7iEHpPrE7iuFag_TxBg,6288
213
214
  flowfile_core/flowfile/sources/external_sources/base_class.py,sha256=mQhplw0O4heef2yAFuvKAyQaZJxd7lDyohgGv-4yxOI,981
214
215
  flowfile_core/flowfile/sources/external_sources/custom_external_sources/__init__.py,sha256=cr2lrDCARyiSlGNgl-CWv0w1H0ORhoGE8HwW7sL-Yr0,97
@@ -218,19 +219,19 @@ flowfile_core/flowfile/sources/external_sources/custom_external_sources/sample_u
218
219
  flowfile_core/flowfile/sources/external_sources/factory.py,sha256=8um2kFuICqrOAeA1scLPIVDabAsOHabN2PartvfCpPU,919
219
220
  flowfile_core/flowfile/sources/external_sources/sql_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
220
221
  flowfile_core/flowfile/sources/external_sources/sql_source/models.py,sha256=C6lhFKsF90ifoU9k421f2-K-FJwaLbKs9_yqYHWBzGE,4613
221
- flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py,sha256=jMHLUW2vU110z5_Pru3E2CSvov5PM4Yve2nflotawcs,12978
222
+ flowfile_core/flowfile/sources/external_sources/sql_source/sql_source.py,sha256=202iHv9FthKUisYNXfbhGuX0nsAwdeoX2FliXjkBcgg,12992
222
223
  flowfile_core/flowfile/sources/external_sources/sql_source/utils.py,sha256=Cj-uKhHT6s_S8vmO7mONf0zrKb987ZzxVLISaUdtAVw,11765
223
224
  flowfile_core/flowfile/util/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
224
225
  flowfile_core/flowfile/util/calculate_layout.py,sha256=iquA_-EGzX9mH1NasIZFNaYLfBcKqz5pHaTjHO1rYWE,5070
225
226
  flowfile_core/flowfile/util/execution_orderer.py,sha256=IJ-upXUruZfFj8z4J-2oDLz2hDSOnRIdgd_YLlRKU-c,5828
226
- flowfile_core/flowfile/utils.py,sha256=arSkifiXgbdWqrLcgQ6X3u-A9_6zDHocDuXEy6M3c18,3363
227
- flowfile_core/main.py,sha256=5rcWI1uFlYkUWd7T_ZHkolOmLlbIWjuDpzCyivw6og0,3879
227
+ flowfile_core/flowfile/utils.py,sha256=hR008OuvSZNS9BBUlF3sJDePYhbS7wf-6Um6YLlU0OY,5224
228
+ flowfile_core/main.py,sha256=b_0bbqMwTPpHkP5zooJVAvGVs4mLP5N1yb1oQSM1LDE,3914
228
229
  flowfile_core/routes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
229
230
  flowfile_core/routes/auth.py,sha256=-2SWMCDDKYSnriiYH3hOadnsHdmVckcIV_vA2mZfBu0,1251
230
231
  flowfile_core/routes/logs.py,sha256=LHGbCN10pjS8yAWHaG9eUHxMoGQlHjVWFSxtAnuQjrM,5868
231
232
  flowfile_core/routes/public.py,sha256=bRmRyphaCLTTdPjPn_M6NWPUEQKHB5eeKlKoOs7jWzA,217
232
233
  flowfile_core/routes/routes.py,sha256=-pJ8LQxX6_-EnkXGu3C-K3UcQI1ZUPNJ0X8REbpvcCI,23181
233
- flowfile_core/routes/secrets.py,sha256=QIrloBSclw9JGcpB-c4VHlNBVy2t-sAAB9RXK9ibNlk,3061
234
+ flowfile_core/routes/secrets.py,sha256=SG5zNGk-XRAVod7GBLKXhjEpTuTqdlCHOCoqsMGi4T8,3075
234
235
  flowfile_core/run_lock.py,sha256=1bINUPwZe7v9rHRePTk0CaExscqf2pvk4hWoYFZWQqE,296
235
236
  flowfile_core/schemas/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
236
237
  flowfile_core/schemas/analysis_schemas/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -238,27 +239,37 @@ flowfile_core/schemas/analysis_schemas/graphic_walker_schemas.py,sha256=1PQeiBFI
238
239
  flowfile_core/schemas/defaults.py,sha256=5FeYJIn1ZJ4Es8WQfSW3EkfoOnWsvifg9bvwZO7fklg,228
239
240
  flowfile_core/schemas/external_sources/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
240
241
  flowfile_core/schemas/external_sources/airbyte_schemas.py,sha256=xjMVevVronSvMJ1lQrBTv2ClkO2VSMSU8YCga_Mw6vk,594
241
- flowfile_core/schemas/input_schema.py,sha256=wUPFzq08Ig3SOsIZxDtlqOUXJSXk9Sj1YtRp0XA8_YU,13611
242
+ flowfile_core/schemas/input_schema.py,sha256=W0b7WG--CwqkiXfDokn-ZGRcWpjy6qT7BCWLRNjuz1A,13885
242
243
  flowfile_core/schemas/models.py,sha256=W22swh1TZ1aAZY8R-OL0AiTXvIEUKtN9B1ZFc0dxZQI,5253
243
244
  flowfile_core/schemas/output_model.py,sha256=yenexlnHI9ecQhTDBPNChO2p25YNZ9btR5_cXd-ehLg,2665
244
245
  flowfile_core/schemas/schemas.py,sha256=_CXS-YUMOwJTlWAX4mPnhqHD7ZiQy3DM7rHB2eSbHUY,2647
245
- flowfile_core/schemas/transform_schema.py,sha256=7MooeHGWdSZnLsX7_WSXesML6Rpbi7rK6vUEwtVjBaY,21259
246
- flowfile_core/secrets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
247
- flowfile_core/secrets/secrets.py,sha256=6G9hqo-GjTE7va6i6r3_ZmGo4Y26Aw_PY0W8L6pNo7E,2086
246
+ flowfile_core/schemas/transform_schema.py,sha256=HRJhRwTQIxqOJ_zRW_OmGn63dDdcbP_CCdmgjFstGAQ,20072
247
+ flowfile_core/secret_manager/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
248
+ flowfile_core/secret_manager/secret_manager.py,sha256=6G9hqo-GjTE7va6i6r3_ZmGo4Y26Aw_PY0W8L6pNo7E,2086
248
249
  flowfile_core/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
249
250
  flowfile_core/utils/arrow_reader.py,sha256=SbDDzOmtHcZ8rH88v1jN0EZthaDqBmvCrbruX07Qdks,9366
250
251
  flowfile_core/utils/excel_file_manager.py,sha256=EIad2LenHu-3Yw1FcLmE0KgmLflnvNKt07FY6s6mPlE,452
251
252
  flowfile_core/utils/fileManager.py,sha256=LnJhK_pwjb9MIApG2e4Hp3L5Z7Wny8YYHaL9SkW8WlE,1371
252
253
  flowfile_core/utils/fl_executor.py,sha256=eNnNZHZ9451brzZD00_X8aoCHFl1hR1gVOIGxtE0Db4,1301
253
254
  flowfile_core/utils/utils.py,sha256=NkEu21OF1l5weu01g-dAVdJ6BRHdpT2jBrWBSi-wp3c,270
254
- flowfile_frame/__init__.py,sha256=ZEhs_YFgoA2CBGuvHIEY559Dk6q09clFscT3r7tITLM,1326
255
+ flowfile_frame/__init__.py,sha256=L2-jTwdtPEtdIS4c1foVXwnh5T0jDzpkgn3CBJvOPHg,1544
255
256
  flowfile_frame/adapters.py,sha256=C6JZZKANoKbHHmwMaF9AqAcZvITvQeb2Dklcpg5JAdY,530
256
- flowfile_frame/expr.py,sha256=wzJLiHtAqBA4NUEyXW8UhNJJ2jiGDvWEX8RfWT5ICwQ,47370
257
- flowfile_frame/flow_frame.py,sha256=wXiXbQr4FUOvQl7iRQZkJ8Q5q1Vh7MLT7wHK84ngyAM,95588
258
- flowfile_frame/group_frame.py,sha256=MMGE2_DC8n1J2UxBBm_TyacGiRT_1V2CcWUvsIyhbIQ,9115
259
- flowfile_frame/join.py,sha256=pezHyNQMcaSPK9vhjaCaelMkgKdrvOQxwiROsa0fmN8,2480
260
- flowfile_frame/selectors.py,sha256=Ny5IpDP481ClNr5gI7_SjXzeqF16LsPcVQxiyUf5tUw,9130
261
- flowfile_frame/utils.py,sha256=LwsNm8tvqcyaAeGX1PNQbpt-NenDsM8jx9SbxhDslZI,1348
257
+ flowfile_frame/adding_expr.py,sha256=K9KdF3WCbPy2YO5radyEBXQ0yi6SOI4nviaPwXmKW0k,12428
258
+ flowfile_frame/config.py,sha256=Tl_4V8byUiO1y3ANesW2E9qeTtZvtKYJl8KU00hc1Cg,176
259
+ flowfile_frame/expr.py,sha256=5Qz_IKV-5FvnlMGv-10D82cBKMlEyUCq2CXnsmJUrfk,56695
260
+ flowfile_frame/expr.pyi,sha256=HR1qBfGIlei-6Lt3qztGUR_XfyiQJMUbUWCMY1pEBYo,87903
261
+ flowfile_frame/expr_name.py,sha256=0VZZwC3xc6tiwOJPu7emQ86Xp2mVu7U-j-jrsMjbc2Y,9852
262
+ flowfile_frame/flow_frame.py,sha256=My05BjxYIPiokRItV1asSSI3q5_QhKg0nrv0FaRIq68,88112
263
+ flowfile_frame/flow_frame.pyi,sha256=8-P0drJFbtYWr9o9YwJzWtiHrDbcVOMZzC-3Mz0X37A,29725
264
+ flowfile_frame/flow_frame_methods.py,sha256=yBDnDyRY7NAf1pYYD0wMIXZenNTi0u_WSHxfyVPp214,23298
265
+ flowfile_frame/group_frame.py,sha256=VfYxEcvU9KfzTsgy4GiN8PZF83bRr1nCQ9c2DKTS9VI,11558
266
+ flowfile_frame/join.py,sha256=YQCHmw8nYzvGzvrFc0jOmVMy-rahK3CPtuFdKB7loS8,2437
267
+ flowfile_frame/lazy.py,sha256=F0wLzoE5c5EF0D5msjqNi0nTydepx_YtwHT0hDq6Q-I,28619
268
+ flowfile_frame/lazy_methods.py,sha256=ysSmdDF6guDWN-ogBIpGoCHdjAUbnnl8CncjqDUdg_Y,7730
269
+ flowfile_frame/list_name_space.py,sha256=7m7qxZ9GANrVM8U-gOq3hZ7aPGwSGD1hbI6kRJLc0Tg,15778
270
+ flowfile_frame/selectors.py,sha256=Rnaa9AuMgzazt6ujB9yyvcbH9zA7JmYjs8rFqdZ2TYM,9186
271
+ flowfile_frame/series.py,sha256=VkhLr03iA8LF2EifGoKdNvg1ajm7Ln9OgzkMurg_nR0,2199
272
+ flowfile_frame/utils.py,sha256=t19WSnorytJHs8ut0a7le6SYkVjBv5NTFzU3Gv_kUQs,3780
262
273
  flowfile_worker/__init__.py,sha256=ZDdn3JCP7LWTiTsmntVIVduB4p2bUkJcZUKVEj7V9TU,1375
263
274
  flowfile_worker/configs.py,sha256=7fYtlj06vxDrMiRuMbwvSDOD1JRVMZqnPbcQFuikCJM,2714
264
275
  flowfile_worker/create/__init__.py,sha256=vkWy5uODffivUdxt3nNVALj6xgQK3HPBetqR-QqZ-uo,1643
@@ -295,8 +306,8 @@ test_utils/__init__.py,sha256=8WwOgIuKw6YtOc1GWR1DqIhQ8BhlLWqsMyQJSpxnzKk,66
295
306
  test_utils/postgres/__init__.py,sha256=y3V_6a9N1Pvm5NIBaA8CFf3i4mvPVY-H1teHA-rg0VU,33
296
307
  test_utils/postgres/commands.py,sha256=4oA8EHW3EqwGkG02HSqEGbXEBGM01sUW5FsyHm86W4k,4347
297
308
  test_utils/postgres/fixtures.py,sha256=kR8UBjQr3pgbe-xM-V8x8VseTHCPv0EmDEzPHl5Qc8Y,13507
298
- flowfile-0.3.2.dist-info/LICENSE,sha256=pCfLAA27jMHReYk_wGiirZxWRRXz_Bm7PVInRCa9P5g,1075
299
- flowfile-0.3.2.dist-info/METADATA,sha256=N49dUO71mSeEwObw0v4F6corTg9PHcfEn5mWjgQ8g10,7903
300
- flowfile-0.3.2.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
301
- flowfile-0.3.2.dist-info/entry_points.txt,sha256=Q3CEYNk33UaWlA9D-8yXYH0FwjKBsrtNuzzzHxhwnNI,333
302
- flowfile-0.3.2.dist-info/RECORD,,
309
+ flowfile-0.3.3.1.dist-info/LICENSE,sha256=pCfLAA27jMHReYk_wGiirZxWRRXz_Bm7PVInRCa9P5g,1075
310
+ flowfile-0.3.3.1.dist-info/METADATA,sha256=0FNb6Bf0qD5R4iD1gVM6ACB59eFkdl2-5R5l_uP_RIE,7998
311
+ flowfile-0.3.3.1.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
312
+ flowfile-0.3.3.1.dist-info/entry_points.txt,sha256=Q3CEYNk33UaWlA9D-8yXYH0FwjKBsrtNuzzzHxhwnNI,333
313
+ flowfile-0.3.3.1.dist-info/RECORD,,
flowfile_core/configs/__init__.py CHANGED
@@ -11,16 +11,27 @@ logger = logging.getLogger('PipelineHandler')
11
11
  logger.setLevel(logging.INFO)
12
12
  logger.propagate = False
13
13
 
14
- # Create console handler with a specific format
15
- console_handler = logging.StreamHandler(sys.stdout)
14
+ # Clear any existing handlers
15
+ if logger.hasHandlers():
16
+ logger.handlers.clear()
17
+
18
+ # Try to determine the best output stream
19
+ output_stream = None
20
+ if hasattr(sys.stdout, 'isatty') and sys.stdout.isatty():
21
+ output_stream = sys.stdout
22
+ elif hasattr(sys.stderr, 'isatty') and sys.stderr.isatty():
23
+ output_stream = sys.stderr
24
+ else:
25
+ # Use __stdout__ for debugger environments (PyDev, PyCharm, etc.)
26
+ output_stream = sys.__stdout__
27
+
28
+ console_handler = logging.StreamHandler(output_stream)
16
29
  console_handler.setLevel(logging.INFO)
17
30
 
18
31
  # Create formatter
19
32
  formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
20
33
  console_handler.setFormatter(formatter)
21
34
 
22
- if logger.hasHandlers():
23
- logger.handlers.clear()
24
35
  logger.addHandler(console_handler)
25
36
 
26
37
  # Create logs directory in temp at startup
flowfile_core/configs/settings.py CHANGED
@@ -5,10 +5,10 @@ import os
5
5
  import tempfile
6
6
  import argparse
7
7
 
8
- from databases import DatabaseURL
9
8
  from passlib.context import CryptContext
10
9
  from starlette.config import Config
11
- from starlette.datastructures import Secret
10
+
11
+ from flowfile_core.configs.utils import MutableBool
12
12
 
13
13
 
14
14
  # Constants for server and worker configuration
@@ -18,6 +18,9 @@ DEFAULT_WORKER_PORT = 63579
18
18
  SINGLE_FILE_MODE: bool = os.environ.get("SINGLE_FILE_MODE", "0") == "1"
19
19
 
20
20
 
21
+ OFFLOAD_TO_WORKER = MutableBool(True)
22
+
23
+
21
24
  def parse_args():
22
25
  """Parse command line arguments"""
23
26
  parser = argparse.ArgumentParser(description="Flowfile Backend Server")
@@ -79,7 +82,6 @@ args = parse_args()
79
82
  SERVER_HOST = args.host if args.host is not None else DEFAULT_SERVER_HOST
80
83
  SERVER_PORT = args.port if args.port is not None else DEFAULT_SERVER_PORT
81
84
  WORKER_PORT = args.worker_port if args.worker_port is not None else int(os.getenv("WORKER_PORT", DEFAULT_WORKER_PORT))
82
- # Worker configuration
83
85
  WORKER_HOST = os.getenv("WORKER_HOST", "0.0.0.0" if platform.system() != "Windows" else "127.0.0.1")
84
86
 
85
87
  config = Config(".env")
flowfile_core/configs/utils.py ADDED
@@ -0,0 +1,18 @@
1
+ from dataclasses import dataclass
2
+
3
+
4
+ @dataclass
5
+ class MutableBool:
6
+ value: bool
7
+
8
+ def __bool__(self) -> bool:
9
+ """Allow direct boolean evaluation"""
10
+ return self.value
11
+
12
+ def __eq__(self, other) -> bool:
13
+ """Allow equality comparison with booleans"""
14
+ if isinstance(other, bool):
15
+ return self.value == other
16
+ elif isinstance(other, MutableBool):
17
+ return self.value == other.value
18
+ return NotImplemented
flowfile_core/flowfile/FlowfileFlow.py CHANGED
@@ -15,7 +15,7 @@ from flowfile_core.configs import logger
15
15
  from flowfile_core.configs.flow_logger import FlowLogger
16
16
  from flowfile_core.flowfile.sources.external_sources.factory import data_source_factory
17
17
  from flowfile_core.flowfile.sources.external_sources.airbyte_sources.settings import airbyte_settings_from_config
18
- from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import type_to_polars_str, FlowfileColumn
18
+ from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import cast_str_to_polars_type, FlowfileColumn
19
19
  from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.settings_validator import (calculate_fuzzy_match_schema,
20
20
  pre_calculate_pivot_schema)
21
21
  from flowfile_core.utils.arrow_reader import get_read_top_n
@@ -25,7 +25,7 @@ from flowfile_core.flowfile.flow_data_engine.read_excel_tables import get_open_x
25
25
  from flowfile_core.flowfile.sources import external_sources
26
26
  from flowfile_core.schemas import input_schema, schemas, transform_schema
27
27
  from flowfile_core.schemas.output_model import TableExample, NodeData, NodeResult, RunInformation
28
- from flowfile_core.flowfile.utils import snake_case_to_camel_case
28
+ from flowfile_core.flowfile.utils import snake_case_to_camel_case, _handle_raw_data
29
29
  from flowfile_core.flowfile.analytics.utils import create_graphic_walker_node_from_node_promise
30
30
  from flowfile_core.flowfile.flow_node.flow_node import FlowNode
31
31
  from flowfile_core.flowfile.util.execution_orderer import determine_execution_order
@@ -34,7 +34,7 @@ from flowfile_core.flowfile.flow_data_engine.subprocess_operations.subprocess_op
34
34
  ExternalDatabaseFetcher,
35
35
  ExternalDatabaseWriter,
36
36
  ExternalDfFetcher)
37
- from flowfile_core.secrets.secrets import get_encrypted_secret, decrypt_secret
37
+ from flowfile_core.secret_manager.secret_manager import get_encrypted_secret, decrypt_secret
38
38
  from flowfile_core.flowfile.sources.external_sources.sql_source import utils as sql_utils, models as sql_models
39
39
  from flowfile_core.flowfile.sources.external_sources.sql_source.sql_source import SqlSource, BaseSqlSource
40
40
  from flowfile_core.flowfile.database_connection_manager.db_connections import get_local_database_connection
@@ -205,19 +205,12 @@ class FlowGraph:
205
205
  sample_size: int = 10000
206
206
 
207
207
  def analysis_preparation(flowfile_table: FlowDataEngine):
208
- if flowfile_table.number_of_records < 0:
209
-
210
- number_of_records = ExternalDfFetcher(
211
- lf=flowfile_table.data_frame,
212
- operation_type="calculate_number_of_records",
213
- flow_id=self.flow_id,
214
- node_id=node.node_id,
215
- ).result
208
+ if flowfile_table.number_of_records <= 0:
209
+ number_of_records = flowfile_table.get_number_of_records(calculate_in_worker_process=True)
216
210
  else:
217
211
  number_of_records = flowfile_table.number_of_records
218
212
  if number_of_records > sample_size:
219
213
  flowfile_table = flowfile_table.get_sample(sample_size, random=True)
220
-
221
214
  external_sampler = ExternalDfFetcher(
222
215
  lf=flowfile_table.data_frame,
223
216
  file_ref="__gf_walker"+node.hash,
@@ -225,7 +218,7 @@ class FlowGraph:
225
218
  node_id=node.node_id,
226
219
  flow_id=self.flow_id,
227
220
  )
228
- node.results.analysis_data_generator = get_read_top_n(external_sampler.status.file_ref, 10000)
221
+ node.results.analysis_data_generator = get_read_top_n(external_sampler.status.file_ref)
229
222
  return flowfile_table
230
223
 
231
224
  def schema_callback():
@@ -441,7 +434,7 @@ class FlowGraph:
441
434
  def add_formula(self, function_settings: input_schema.NodeFormula):
442
435
  error = ""
443
436
  if function_settings.function.field.data_type not in (None, "Auto"):
444
- output_type = type_to_polars_str(function_settings.function.field.data_type)
437
+ output_type = cast_str_to_polars_type(function_settings.function.field.data_type)
445
438
  else:
446
439
  output_type = None
447
440
  if output_type not in (None, "Auto"):
@@ -486,7 +479,8 @@ class FlowGraph:
486
479
  function=_func,
487
480
  input_columns=[],
488
481
  node_type='cross_join',
489
- setting_input=cross_join_settings)
482
+ setting_input=cross_join_settings,
483
+ input_node_ids=cross_join_settings.depending_on_ids)
490
484
  return self
491
485
 
492
486
  def add_join(self, join_settings: input_schema.NodeJoin) -> "FlowGraph":
@@ -1044,11 +1038,10 @@ class FlowGraph:
1044
1038
  return self
1045
1039
 
1046
1040
  def add_datasource(self, input_file: input_schema.NodeDatasource | input_schema.NodeManualInput):
1047
-
1048
1041
  if isinstance(input_file, input_schema.NodeManualInput):
1049
- input_data = FlowDataEngine(input_file.raw_data)
1042
+ _handle_raw_data(input_file)
1043
+ input_data = FlowDataEngine(input_file.raw_data_format)
1050
1044
  ref = 'manual_input'
1051
-
1052
1045
  else:
1053
1046
  input_data = FlowDataEngine(path_ref=input_file.file_ref)
1054
1047
  ref = 'datasource'
@@ -1061,7 +1054,9 @@ class FlowGraph:
1061
1054
 
1062
1055
  if not input_file.node_id in set(start_node.node_id for start_node in self._flow_starts):
1063
1056
  self._flow_starts.append(node)
1057
+
1064
1058
  else:
1059
+ input_data.collect()
1065
1060
  node = FlowNode(input_file.node_id, function=input_data,
1066
1061
  setting_input=input_file,
1067
1062
  name=ref, node_type=ref, parent_uuid=self.uuid)
@@ -1,7 +1,7 @@
1
1
  from flowfile_core.schemas.input_schema import FullDatabaseConnection, FullDatabaseConnectionInterface
2
2
  from sqlalchemy.orm import Session
3
3
  from flowfile_core.database.models import DatabaseConnection as DBConnectionModel, Secret
4
- from flowfile_core.secrets.secrets import store_secret, SecretInput, decrypt_secret
4
+ from flowfile_core.secret_manager.secret_manager import store_secret, SecretInput, decrypt_secret
5
5
  from flowfile_core.database.connection import get_db_context
6
6
 
7
7
 
@@ -17,6 +17,7 @@ from pyarrow.parquet import ParquetFile
17
17
  # Local imports - Core
18
18
  from flowfile_core.configs import logger
19
19
  from flowfile_core.configs.flow_logger import NodeLogger
20
+ from flowfile_core.configs.settings import OFFLOAD_TO_WORKER
20
21
  from flowfile_core.schemas import (
21
22
  input_schema,
22
23
  transform_schema as transform_schemas
@@ -29,7 +30,7 @@ from flowfile_core.flowfile.flow_data_engine.flow_file_column.main import (
29
30
  FlowfileColumn,
30
31
  convert_stats_to_column_info
31
32
  )
32
- from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import type_to_polars
33
+ from flowfile_core.flowfile.flow_data_engine.flow_file_column.utils import cast_str_to_polars_type
33
34
  from flowfile_core.flowfile.flow_data_engine.fuzzy_matching.prepare_for_fuzzy_match import prepare_for_fuzzy_match
34
35
  from flowfile_core.flowfile.flow_data_engine.join import (
35
36
  verify_join_select_integrity,
@@ -109,7 +110,7 @@ class FlowDataEngine:
109
110
  # flow_id: int = None # TODO: Implement flow_id
110
111
 
111
112
  def __init__(self,
112
- raw_data: Union[List[Dict], List[Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame] = None,
113
+ raw_data: Union[List[Dict], List[Any], 'ParquetFile', pl.DataFrame, pl.LazyFrame, input_schema.RawData] = None,
113
114
  path_ref: str = None,
114
115
  name: str = None,
115
116
  optimize_memory: bool = True,
@@ -147,7 +148,10 @@ class FlowDataEngine:
147
148
 
148
149
  def _handle_raw_data(self, raw_data, number_of_records, optimize_memory):
149
150
  """Process different types of input data."""
150
- if isinstance(raw_data, pl.DataFrame):
151
+
152
+ if isinstance(raw_data, input_schema.RawData):
153
+ self._handle_raw_data_format(raw_data)
154
+ elif isinstance(raw_data, pl.DataFrame):
151
155
  self._handle_polars_dataframe(raw_data, number_of_records)
152
156
  elif isinstance(raw_data, pl.LazyFrame):
153
157
  self._handle_polars_lazy_frame(raw_data, number_of_records, optimize_memory)
@@ -190,6 +194,20 @@ class FlowDataEngine:
190
194
  self.number_of_records = 1
191
195
  self.data_frame = pl.DataFrame([data])
192
196
 
197
+ def _handle_raw_data_format(self, raw_data: input_schema.RawData):
198
+ """Create a FlowDataEngine from a RawData object."""
199
+ flowfile_schema = list(FlowfileColumn.create_from_minimal_field_info(c) for c in raw_data.columns)
200
+ polars_schema = pl.Schema([(flowfile_column.column_name, flowfile_column.get_polars_type().pl_datatype)
201
+ for flowfile_column in flowfile_schema])
202
+ try:
203
+ df = pl.DataFrame(raw_data.data, polars_schema)
204
+ except TypeError as e:
205
+ logger.warning(f"Could not parse the data with the schema:\n{e}")
206
+ df = pl.DataFrame(raw_data.data)
207
+ self.number_of_records = len(df)
208
+ self.data_frame = df.lazy()
209
+ self.lazy = True
210
+
193
211
  def _handle_list_input(self, data: List):
194
212
  """Handle list input."""
195
213
  number_of_records = len(data)
@@ -462,6 +480,9 @@ class FlowDataEngine:
462
480
  return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dicts()
463
481
  return self.data_frame.to_dicts()
464
482
 
483
+ def to_dict(self) -> Dict[str, List]:
484
+ return self.data_frame.collect(engine="streaming" if self._streamable else "auto").to_dict(as_series=False)
485
+
465
486
  @classmethod
466
487
  def create_from_external_source(cls, external_source: ExternalDataSource) -> "FlowDataEngine":
467
488
  """Create a FlowDataEngine from an external data source."""
@@ -484,7 +505,7 @@ class FlowDataEngine:
484
505
  """Create a FlowDataEngine from a schema definition."""
485
506
  pl_schema = []
486
507
  for i, flow_file_column in enumerate(schema):
487
- pl_schema.append((flow_file_column.name, type_to_polars(flow_file_column.data_type)))
508
+ pl_schema.append((flow_file_column.name, cast_str_to_polars_type(flow_file_column.data_type)))
488
509
  schema[i].col_index = i
489
510
  df = pl.LazyFrame(schema=pl_schema)
490
511
  return cls(df, schema=schema, calculate_schema_stats=False, number_of_records=0)
@@ -824,7 +845,7 @@ class FlowDataEngine:
824
845
  Returns:
825
846
  FlowDataEngine: New instance with sampled data
826
847
  """
827
- n_records = min(n_rows, self.number_of_records)
848
+ n_records = min(n_rows, self.get_number_of_records(calculate_in_worker_process=True))
828
849
  logging.info(f'Getting sample of {n_rows} rows')
829
850
 
830
851
  if random:
@@ -1158,14 +1179,25 @@ class FlowDataEngine:
1158
1179
  self.number_of_records = 0
1159
1180
  self._lazy = True
1160
1181
 
1161
- def get_number_of_records(self, warn: bool = False, force_calculate: bool = False) -> int:
1182
+ def _calculate_number_of_records_in_worker(self) -> int:
1183
+ number_of_records = ExternalDfFetcher(
1184
+ lf=self.data_frame,
1185
+ operation_type="calculate_number_of_records",
1186
+ flow_id=-1,
1187
+ node_id=-1,
1188
+ wait_on_completion=True
1189
+ ).result
1190
+ return number_of_records
1191
+
1192
+ def get_number_of_records(self, warn: bool = False, force_calculate: bool = False,
1193
+ calculate_in_worker_process: bool = False) -> int:
1162
1194
  """
1163
1195
  Get the total number of records in the DataFrame.
1164
1196
 
1165
1197
  Args:
1166
1198
  warn: Whether to warn about expensive operations
1167
1199
  force_calculate: Whether to force recalculation
1168
-
1200
+ calculate_in_worker_process: Whether to offload compute to the worker process
1169
1201
  Returns:
1170
1202
  int: Number of records
1171
1203
 
@@ -1174,22 +1206,24 @@ class FlowDataEngine:
1174
1206
  """
1175
1207
  if self.is_future and not self.is_collected:
1176
1208
  return -1
1177
-
1209
+ calculate_in_worker_process = False if not OFFLOAD_TO_WORKER.value else calculate_in_worker_process
1178
1210
  if self.number_of_records is None or self.number_of_records < 0 or force_calculate:
1179
1211
  if self._number_of_records_callback is not None:
1180
1212
  self._number_of_records_callback(self)
1181
1213
 
1182
1214
  if self.lazy:
1183
- if warn:
1184
- logger.warning('Calculating the number of records this can be expensive on a lazy frame')
1185
- try:
1186
- self.number_of_records = self.data_frame.select(pl.len()).collect(
1187
- engine="streaming" if self._streamable else "auto")[0, 0]
1188
- except Exception:
1189
- raise Exception('Could not get number of records')
1215
+ if calculate_in_worker_process:
1216
+ self.number_of_records = self._calculate_number_of_records_in_worker()
1217
+ else:
1218
+ if warn:
1219
+ logger.warning('Calculating the number of records this can be expensive on a lazy frame')
1220
+ try:
1221
+ self.number_of_records = self.data_frame.select(pl.len()).collect(
1222
+ engine="streaming" if self._streamable else "auto")[0, 0]
1223
+ except Exception:
1224
+ raise ValueError('Could not get number of records')
1190
1225
  else:
1191
1226
  self.number_of_records = self.data_frame.__len__()
1192
-
1193
1227
  return self.number_of_records
1194
1228
 
1195
1229
  # Properties
@@ -1518,4 +1552,7 @@ def execute_polars_code(*flowfile_tables: "FlowDataEngine", code: str) -> "FlowD
1518
1552
  kwargs = {'input_df': flowfile_tables[0].data_frame}
1519
1553
  else:
1520
1554
  kwargs = {f'input_df_{i+1}': flowfile_table.data_frame for i, flowfile_table in enumerate(flowfile_tables)}
1521
- return FlowDataEngine(polars_executable(**kwargs))
1555
+ df = polars_executable(**kwargs)
1556
+ if isinstance(df, pl.DataFrame):
1557
+ logger.warning("Got a non lazy DataFrame, possibly harming performance, if possible, try to use a lazy method")
1558
+ return FlowDataEngine(df)