flowfile-0.3.9-py3-none-any.whl → flowfile-0.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201)
  1. flowfile/__init__.py +8 -1
  2. flowfile/api.py +1 -3
  3. flowfile/web/static/assets/{CloudConnectionManager-c97c25f8.js → CloudConnectionManager-0dfba9f2.js} +2 -2
  4. flowfile/web/static/assets/{CloudStorageReader-f1ff509e.js → CloudStorageReader-d5b1b6c9.js} +11 -78
  5. flowfile/web/static/assets/{CloudStorageWriter-034f8b78.js → CloudStorageWriter-00d87aad.js} +12 -79
  6. flowfile/web/static/assets/{CloudStorageWriter-49c9a4b2.css → CloudStorageWriter-b0ee067f.css} +24 -24
  7. flowfile/web/static/assets/ColumnSelector-4685e75d.js +83 -0
  8. flowfile/web/static/assets/ColumnSelector-47996a16.css +10 -0
  9. flowfile/web/static/assets/ContextMenu-23e909da.js +41 -0
  10. flowfile/web/static/assets/{SettingsSection-9c836ecc.css → ContextMenu-4c74eef1.css} +0 -21
  11. flowfile/web/static/assets/ContextMenu-63cfa99b.css +26 -0
  12. flowfile/web/static/assets/ContextMenu-70ae0c79.js +41 -0
  13. flowfile/web/static/assets/ContextMenu-c13f91d0.css +26 -0
  14. flowfile/web/static/assets/ContextMenu-f149cf7c.js +41 -0
  15. flowfile/web/static/assets/{CrossJoin-41efa4cb.css → CrossJoin-1119d18e.css} +18 -18
  16. flowfile/web/static/assets/{CrossJoin-9e156ebe.js → CrossJoin-702a3edd.js} +14 -84
  17. flowfile/web/static/assets/CustomNode-74a37f74.css +32 -0
  18. flowfile/web/static/assets/CustomNode-b1519993.js +211 -0
  19. flowfile/web/static/assets/{DatabaseConnectionSettings-d5c625b3.js → DatabaseConnectionSettings-6f3e4ea5.js} +3 -3
  20. flowfile/web/static/assets/{DatabaseManager-265adc5e.js → DatabaseManager-cf5ef661.js} +2 -2
  21. flowfile/web/static/assets/{DatabaseReader-f50c6558.css → DatabaseReader-ae61773c.css} +0 -27
  22. flowfile/web/static/assets/{DatabaseReader-0b10551e.js → DatabaseReader-d38c7295.js} +14 -114
  23. flowfile/web/static/assets/{DatabaseWriter-c17c6916.js → DatabaseWriter-b04ef46a.js} +13 -74
  24. flowfile/web/static/assets/{ExploreData-5bdae813.css → ExploreData-2d0cf4db.css} +8 -14
  25. flowfile/web/static/assets/ExploreData-5fa10ed8.js +192 -0
  26. flowfile/web/static/assets/{ExternalSource-3a66556c.js → ExternalSource-d39af878.js} +8 -79
  27. flowfile/web/static/assets/{Filter-91ad87e7.js → Filter-9b6d08db.js} +12 -85
  28. flowfile/web/static/assets/{Filter-a9d08ba1.css → Filter-f62091b3.css} +3 -3
  29. flowfile/web/static/assets/{Formula-3c395ab1.js → Formula-6b04fb1d.js} +20 -87
  30. flowfile/web/static/assets/{Formula-29f19d21.css → Formula-bb96803d.css} +4 -4
  31. flowfile/web/static/assets/{FuzzyMatch-6857de82.css → FuzzyMatch-1010f966.css} +42 -42
  32. flowfile/web/static/assets/{FuzzyMatch-2df0d230.js → FuzzyMatch-999521f4.js} +16 -87
  33. flowfile/web/static/assets/{GraphSolver-d285877f.js → GraphSolver-17dd2198.js} +13 -159
  34. flowfile/web/static/assets/GraphSolver-f0cb7bfb.css +22 -0
  35. flowfile/web/static/assets/{GroupBy-0bd1cc6b.js → GroupBy-6b039e18.js} +12 -75
  36. flowfile/web/static/assets/{Unique-b5615727.css → GroupBy-b9505323.css} +8 -8
  37. flowfile/web/static/assets/{Join-5a78a203.js → Join-24d0f113.js} +15 -85
  38. flowfile/web/static/assets/{Join-f45eff22.css → Join-fd79b451.css} +20 -20
  39. flowfile/web/static/assets/{ManualInput-a71b52c6.css → ManualInput-3246a08d.css} +20 -20
  40. flowfile/web/static/assets/{ManualInput-93aef9d6.js → ManualInput-34639209.js} +11 -82
  41. flowfile/web/static/assets/MultiSelect-0e8724a3.js +5 -0
  42. flowfile/web/static/assets/MultiSelect.vue_vue_type_script_setup_true_lang-b0e538c2.js +63 -0
  43. flowfile/web/static/assets/NumericInput-3d63a470.js +5 -0
  44. flowfile/web/static/assets/NumericInput.vue_vue_type_script_setup_true_lang-e0edeccc.js +35 -0
  45. flowfile/web/static/assets/Output-283fe388.css +37 -0
  46. flowfile/web/static/assets/{Output-411ecaee.js → Output-edea9802.js} +62 -273
  47. flowfile/web/static/assets/{Pivot-89db4b04.js → Pivot-61d19301.js} +14 -138
  48. flowfile/web/static/assets/Pivot-cf333e3d.css +22 -0
  49. flowfile/web/static/assets/PivotValidation-891ddfb0.css +13 -0
  50. flowfile/web/static/assets/PivotValidation-c46cd420.css +13 -0
  51. flowfile/web/static/assets/PivotValidation-de9f43fe.js +61 -0
  52. flowfile/web/static/assets/PivotValidation-f97fec5b.js +61 -0
  53. flowfile/web/static/assets/{PolarsCode-a9f974f8.js → PolarsCode-bc3c9984.js} +13 -80
  54. flowfile/web/static/assets/Read-64a3f259.js +218 -0
  55. flowfile/web/static/assets/Read-e808b239.css +62 -0
  56. flowfile/web/static/assets/RecordCount-3d5039be.js +53 -0
  57. flowfile/web/static/assets/{RecordId-55ae7d36.js → RecordId-597510e0.js} +8 -80
  58. flowfile/web/static/assets/SQLQueryComponent-36cef432.css +27 -0
  59. flowfile/web/static/assets/SQLQueryComponent-df51adbe.js +38 -0
  60. flowfile/web/static/assets/{Sample-b4a18476.js → Sample-4be0a507.js} +8 -77
  61. flowfile/web/static/assets/{SecretManager-b066d13a.js → SecretManager-4839be57.js} +2 -2
  62. flowfile/web/static/assets/{Select-727688dc.js → Select-9b72f201.js} +11 -85
  63. flowfile/web/static/assets/SettingsSection-2e4d03c4.css +21 -0
  64. flowfile/web/static/assets/SettingsSection-5c696bee.css +20 -0
  65. flowfile/web/static/assets/SettingsSection-71e6b7e3.css +21 -0
  66. flowfile/web/static/assets/SettingsSection-7ded385d.js +45 -0
  67. flowfile/web/static/assets/{SettingsSection-695ac487.js → SettingsSection-e1e9c953.js} +2 -40
  68. flowfile/web/static/assets/SettingsSection-f0f75a42.js +53 -0
  69. flowfile/web/static/assets/SingleSelect-6c777aac.js +5 -0
  70. flowfile/web/static/assets/SingleSelect.vue_vue_type_script_setup_true_lang-33e3ff9b.js +62 -0
  71. flowfile/web/static/assets/SliderInput-7cb93e62.js +40 -0
  72. flowfile/web/static/assets/SliderInput-b8fb6a8c.css +4 -0
  73. flowfile/web/static/assets/{GroupBy-ab1ea74b.css → Sort-3643d625.css} +8 -8
  74. flowfile/web/static/assets/{Sort-be3339a8.js → Sort-6cbde21a.js} +12 -97
  75. flowfile/web/static/assets/TextInput-d9a40c11.js +5 -0
  76. flowfile/web/static/assets/TextInput.vue_vue_type_script_setup_true_lang-5896c375.js +32 -0
  77. flowfile/web/static/assets/{TextToRows-c92d1ec2.css → TextToRows-5d2c1190.css} +9 -9
  78. flowfile/web/static/assets/{TextToRows-7b8998da.js → TextToRows-c4fcbf4d.js} +14 -83
  79. flowfile/web/static/assets/ToggleSwitch-4ef91d19.js +5 -0
  80. flowfile/web/static/assets/ToggleSwitch.vue_vue_type_script_setup_true_lang-38478c20.js +31 -0
  81. flowfile/web/static/assets/{UnavailableFields-8b0cb48e.js → UnavailableFields-a03f512c.js} +2 -2
  82. flowfile/web/static/assets/{Union-8d9ac7f9.css → Union-af6c3d9b.css} +6 -6
  83. flowfile/web/static/assets/Union-bfe9b996.js +77 -0
  84. flowfile/web/static/assets/{Unique-af5a80b4.js → Unique-5d023a27.js} +23 -104
  85. flowfile/web/static/assets/{Sort-7ccfa0fe.css → Unique-f9fb0809.css} +8 -8
  86. flowfile/web/static/assets/Unpivot-1e422df3.css +30 -0
  87. flowfile/web/static/assets/{Unpivot-5195d411.js → Unpivot-91cc5354.js} +12 -166
  88. flowfile/web/static/assets/UnpivotValidation-0d240eeb.css +13 -0
  89. flowfile/web/static/assets/UnpivotValidation-7ee2de44.js +51 -0
  90. flowfile/web/static/assets/{ExploreData-18a4fe52.js → VueGraphicWalker-e51b9924.js} +4 -264
  91. flowfile/web/static/assets/VueGraphicWalker-ed5ab88b.css +6 -0
  92. flowfile/web/static/assets/{api-cb00cce6.js → api-c1bad5ca.js} +1 -1
  93. flowfile/web/static/assets/{api-023d1733.js → api-cf1221f0.js} +1 -1
  94. flowfile/web/static/assets/{designer-2197d782.css → designer-8da3ba3a.css} +859 -201
  95. flowfile/web/static/assets/{designer-6c322d8e.js → designer-9633482a.js} +2297 -733
  96. flowfile/web/static/assets/{documentation-4d1fafe1.js → documentation-ca400224.js} +1 -1
  97. flowfile/web/static/assets/{dropDown-0b46dd77.js → dropDown-614b998d.js} +1 -1
  98. flowfile/web/static/assets/{fullEditor-ec4e4f95.js → fullEditor-f7971590.js} +2 -2
  99. flowfile/web/static/assets/{genericNodeSettings-def5879b.js → genericNodeSettings-4fe5f36b.js} +3 -3
  100. flowfile/web/static/assets/{index-681a3ed0.css → index-50508d4d.css} +8 -0
  101. flowfile/web/static/assets/{index-683fc198.js → index-5429bbf8.js} +208 -31
  102. flowfile/web/static/assets/nodeInput-5d0d6b79.js +41 -0
  103. flowfile/web/static/assets/outputCsv-076b85ab.js +86 -0
  104. flowfile/web/static/assets/{Output-48f81019.css → outputCsv-9cc59e0b.css} +0 -143
  105. flowfile/web/static/assets/outputExcel-0fd17dbe.js +56 -0
  106. flowfile/web/static/assets/outputExcel-b41305c0.css +102 -0
  107. flowfile/web/static/assets/outputParquet-b61e0847.js +31 -0
  108. flowfile/web/static/assets/outputParquet-cf8cf3f2.css +4 -0
  109. flowfile/web/static/assets/readCsv-a8bb8b61.js +179 -0
  110. flowfile/web/static/assets/readCsv-c767cb37.css +52 -0
  111. flowfile/web/static/assets/readExcel-67b4aee0.js +201 -0
  112. flowfile/web/static/assets/readExcel-806d2826.css +64 -0
  113. flowfile/web/static/assets/readParquet-48c81530.css +19 -0
  114. flowfile/web/static/assets/readParquet-92ce1dbc.js +23 -0
  115. flowfile/web/static/assets/{secretApi-baceb6f9.js → secretApi-68435402.js} +1 -1
  116. flowfile/web/static/assets/{selectDynamic-de91449a.js → selectDynamic-92e25ee3.js} +7 -7
  117. flowfile/web/static/assets/{selectDynamic-b062bc9b.css → selectDynamic-aa913ff4.css} +16 -16
  118. flowfile/web/static/assets/user-defined-icon-0ae16c90.png +0 -0
  119. flowfile/web/static/assets/{vue-codemirror.esm-dc5e3348.js → vue-codemirror.esm-41b0e0d7.js} +65 -36
  120. flowfile/web/static/assets/{vue-content-loader.es-ba94b82f.js → vue-content-loader.es-2c8e608f.js} +1 -1
  121. flowfile/web/static/index.html +2 -2
  122. {flowfile-0.3.9.dist-info → flowfile-0.5.1.dist-info}/METADATA +5 -3
  123. {flowfile-0.3.9.dist-info → flowfile-0.5.1.dist-info}/RECORD +191 -121
  124. {flowfile-0.3.9.dist-info → flowfile-0.5.1.dist-info}/WHEEL +1 -1
  125. {flowfile-0.3.9.dist-info → flowfile-0.5.1.dist-info}/entry_points.txt +1 -0
  126. flowfile_core/__init__.py +3 -0
  127. flowfile_core/configs/flow_logger.py +5 -13
  128. flowfile_core/configs/node_store/__init__.py +30 -0
  129. flowfile_core/configs/node_store/nodes.py +383 -99
  130. flowfile_core/configs/node_store/user_defined_node_registry.py +193 -0
  131. flowfile_core/configs/settings.py +2 -1
  132. flowfile_core/database/connection.py +5 -21
  133. flowfile_core/fileExplorer/funcs.py +239 -121
  134. flowfile_core/flowfile/analytics/analytics_processor.py +1 -0
  135. flowfile_core/flowfile/code_generator/code_generator.py +62 -64
  136. flowfile_core/flowfile/flow_data_engine/create/funcs.py +73 -56
  137. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +77 -86
  138. flowfile_core/flowfile/flow_data_engine/flow_file_column/interface.py +4 -0
  139. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +19 -34
  140. flowfile_core/flowfile/flow_data_engine/flow_file_column/type_registry.py +36 -0
  141. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +23 -23
  142. flowfile_core/flowfile/flow_data_engine/join/utils.py +1 -1
  143. flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +9 -4
  144. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +212 -86
  145. flowfile_core/flowfile/flow_data_engine/utils.py +2 -0
  146. flowfile_core/flowfile/flow_graph.py +240 -54
  147. flowfile_core/flowfile/flow_node/flow_node.py +48 -13
  148. flowfile_core/flowfile/flow_node/models.py +2 -1
  149. flowfile_core/flowfile/handler.py +24 -5
  150. flowfile_core/flowfile/manage/compatibility_enhancements.py +404 -41
  151. flowfile_core/flowfile/manage/io_flowfile.py +394 -0
  152. flowfile_core/flowfile/node_designer/__init__.py +47 -0
  153. flowfile_core/flowfile/node_designer/_type_registry.py +197 -0
  154. flowfile_core/flowfile/node_designer/custom_node.py +371 -0
  155. flowfile_core/flowfile/node_designer/ui_components.py +277 -0
  156. flowfile_core/flowfile/schema_callbacks.py +17 -10
  157. flowfile_core/flowfile/setting_generator/settings.py +15 -10
  158. flowfile_core/main.py +5 -1
  159. flowfile_core/routes/routes.py +73 -30
  160. flowfile_core/routes/user_defined_components.py +55 -0
  161. flowfile_core/schemas/cloud_storage_schemas.py +0 -2
  162. flowfile_core/schemas/input_schema.py +228 -65
  163. flowfile_core/schemas/output_model.py +5 -2
  164. flowfile_core/schemas/schemas.py +153 -35
  165. flowfile_core/schemas/transform_schema.py +1083 -412
  166. flowfile_core/schemas/yaml_types.py +103 -0
  167. flowfile_core/types.py +156 -0
  168. flowfile_core/utils/validate_setup.py +3 -1
  169. flowfile_frame/__init__.py +3 -1
  170. flowfile_frame/flow_frame.py +31 -24
  171. flowfile_frame/flow_frame_methods.py +12 -9
  172. flowfile_worker/__init__.py +9 -35
  173. flowfile_worker/create/__init__.py +3 -21
  174. flowfile_worker/create/funcs.py +68 -56
  175. flowfile_worker/create/models.py +130 -62
  176. flowfile_worker/main.py +5 -2
  177. flowfile_worker/routes.py +52 -13
  178. shared/__init__.py +15 -0
  179. shared/storage_config.py +258 -0
  180. tools/migrate/README.md +56 -0
  181. tools/migrate/__init__.py +12 -0
  182. tools/migrate/__main__.py +131 -0
  183. tools/migrate/legacy_schemas.py +621 -0
  184. tools/migrate/migrate.py +598 -0
  185. tools/migrate/tests/__init__.py +0 -0
  186. tools/migrate/tests/conftest.py +23 -0
  187. tools/migrate/tests/test_migrate.py +627 -0
  188. tools/migrate/tests/test_migration_e2e.py +1010 -0
  189. tools/migrate/tests/test_node_migrations.py +813 -0
  190. flowfile/web/static/assets/GraphSolver-17fd26db.css +0 -68
  191. flowfile/web/static/assets/Pivot-f415e85f.css +0 -35
  192. flowfile/web/static/assets/Read-80dc1675.css +0 -197
  193. flowfile/web/static/assets/Read-c3b1929c.js +0 -701
  194. flowfile/web/static/assets/RecordCount-4e95f98e.js +0 -122
  195. flowfile/web/static/assets/Union-89fd73dc.js +0 -146
  196. flowfile/web/static/assets/Unpivot-246e9bbd.css +0 -77
  197. flowfile/web/static/assets/nodeTitle-a16db7c3.js +0 -227
  198. flowfile/web/static/assets/nodeTitle-f4b12bcb.css +0 -134
  199. flowfile_core/flowfile/manage/open_flowfile.py +0 -135
  200. {flowfile-0.3.9.dist-info → flowfile-0.5.1.dist-info/licenses}/LICENSE +0 -0
  201. /flowfile_core/flowfile/manage/manage_flowfile.py → /tools/__init__.py +0 -0
flowfile_worker/create/funcs.py CHANGED
@@ -1,94 +1,100 @@
  import polars as pl
  import os

- from flowfile_worker.create.models import ReceivedCsvTable, ReceivedParquetTable, ReceivedExcelTable
+ from flowfile_worker.create.models import ReceivedTable, InputCsvTable, InputJsonTable, InputExcelTable, InputParquetTable
  from flowfile_worker.create.utils import create_fake_data
  from flowfile_worker.create.read_excel_tables import df_from_openpyxl, df_from_calamine_xlsx


- def create_from_path_json(received_table: ReceivedCsvTable):
+ def create_from_path_json(received_table: ReceivedTable):
+     if not isinstance(received_table.table_settings, InputJsonTable):
+         raise ValueError("Received table settings are not of type InputJsonTable")
+     input_table_settings: InputJsonTable = received_table.table_settings
      f = received_table.abs_file_path
      gbs_to_load = os.path.getsize(f) / 1024 / 1000 / 1000
      low_mem = gbs_to_load > 10
-     if received_table.encoding.upper() == 'UTF8' or received_table.encoding.upper() == 'UTF-8':
+     if input_table_settings.encoding.upper() == 'UTF8' or input_table_settings.encoding.upper() == 'UTF-8':
          try:
              df = pl.scan_csv(f,
                               low_memory=low_mem,
                               try_parse_dates=True,
-                              separator=received_table.delimiter,
-                              has_header=received_table.has_headers,
-                              skip_rows=received_table.starting_from_line,
+                              separator=input_table_settings.delimiter,
+                              has_header=input_table_settings.has_headers,
+                              skip_rows=input_table_settings.starting_from_line,
                               encoding='utf8',
-                              infer_schema_length=received_table.infer_schema_length)
+                              infer_schema_length=input_table_settings.infer_schema_length)
              df.head(1).collect()
              return df
          except:
              try:
                  df = pl.scan_csv(f, low_memory=low_mem,
-                                  separator=received_table.delimiter,
-                                  has_header=received_table.has_headers,
-                                  skip_rows=received_table.starting_from_line,
+                                  separator=input_table_settings.delimiter,
+                                  has_header=input_table_settings.has_headers,
+                                  skip_rows=input_table_settings.starting_from_line,
                                   encoding='utf8-lossy',
                                   ignore_errors=True)
                  return df
              except:
                  df = pl.scan_csv(f, low_memory=low_mem,
-                                  separator=received_table.delimiter,
-                                  has_header=received_table.has_headers,
-                                  skip_rows=received_table.starting_from_line,
+                                  separator=input_table_settings.delimiter,
+                                  has_header=input_table_settings.has_headers,
+                                  skip_rows=input_table_settings.starting_from_line,
                                   encoding='utf8',
                                   ignore_errors=True)
                  return df
      else:
          df = pl.read_csv(f, low_memory=low_mem,
-                          separator=received_table.delimiter,
-                          has_header=received_table.has_headers,
-                          skip_rows=received_table.starting_from_line,
-                          encoding=received_table.encoding,
+                          separator=input_table_settings.delimiter,
+                          has_header=input_table_settings.has_headers,
+                          skip_rows=input_table_settings.starting_from_line,
+                          encoding=input_table_settings.encoding,
                           ignore_errors=True)
          return df


- def create_from_path_csv(received_table: ReceivedCsvTable) -> pl.DataFrame:
+ def create_from_path_csv(received_table: ReceivedTable) -> pl.DataFrame:
      f = received_table.abs_file_path
+     if not isinstance(received_table.table_settings, InputCsvTable):
+         raise ValueError("Received table settings are not of type InputCsvTable")
+     input_table_settings: InputCsvTable = received_table.table_settings
      gbs_to_load = os.path.getsize(f) / 1024 / 1000 / 1000
      low_mem = gbs_to_load > 10
-     if received_table.encoding.upper() == 'UTF8' or received_table.encoding.upper() == 'UTF-8':
+     if input_table_settings.encoding.upper() == 'UTF8' or input_table_settings.encoding.upper() == 'UTF-8':
          try:
              df = pl.scan_csv(f,
                               low_memory=low_mem,
                               try_parse_dates=True,
-                              separator=received_table.delimiter,
-                              has_header=received_table.has_headers,
-                              skip_rows=received_table.starting_from_line,
+                              separator=input_table_settings.delimiter,
+                              has_header=input_table_settings.has_headers,
+                              skip_rows=input_table_settings.starting_from_line,
                               encoding='utf8',
-                              infer_schema_length=received_table.infer_schema_length)
+                              infer_schema_length=input_table_settings.infer_schema_length)
              df.head(1).collect()
              return df
          except:
              try:
                  df = pl.scan_csv(f, low_memory=low_mem,
-                                  separator=received_table.delimiter,
-                                  has_header=received_table.has_headers,
-                                  skip_rows=received_table.starting_from_line,
+                                  separator=input_table_settings.delimiter,
+                                  has_header=input_table_settings.has_headers,
+                                  skip_rows=input_table_settings.starting_from_line,
                                   encoding='utf8-lossy',
                                   ignore_errors=True)
                  return df
              except:
                  df = pl.scan_csv(f, low_memory=low_mem,
-                                  separator=received_table.delimiter,
-                                  has_header=received_table.has_headers,
-                                  skip_rows=received_table.starting_from_line,
+                                  separator=input_table_settings.delimiter,
+                                  has_header=input_table_settings.has_headers,
+                                  skip_rows=input_table_settings.starting_from_line,
                                   encoding='utf8',
                                   ignore_errors=True)
                  return df
      else:
          df = pl.read_csv(f,
                           low_memory=low_mem,
-                          separator=received_table.delimiter,
-                          has_header=received_table.has_headers,
-                          skip_rows=received_table.starting_from_line,
-                          encoding=received_table.encoding,
+                          separator=input_table_settings.delimiter,
+                          has_header=input_table_settings.has_headers,
+                          skip_rows=input_table_settings.starting_from_line,
+                          encoding=input_table_settings.encoding,
                           ignore_errors=True)
          return df

@@ -97,50 +103,56 @@ def create_random(number_of_records: int = 1000) -> pl.LazyFrame:
      return create_fake_data(number_of_records).lazy()


- def create_from_path_parquet(received_table: ReceivedParquetTable):
+ def create_from_path_parquet(received_table: ReceivedTable):
+     if not isinstance(received_table.table_settings, InputParquetTable):
+         raise ValueError("Received table settings are not of type InputParquetTable")
      low_mem = (os.path.getsize(received_table.abs_file_path) / 1024 / 1000 / 1000) > 2
      return pl.scan_parquet(source=received_table.abs_file_path, low_memory=low_mem)


- def create_from_path_excel(received_table: ReceivedExcelTable):
-     if received_table.type_inference:
+ def create_from_path_excel(received_table: ReceivedTable):
+     if not isinstance(received_table.table_settings, InputExcelTable):
+         raise ValueError("Received table settings are not of type InputExcelTable")
+     input_table_settings: InputExcelTable = received_table.table_settings
+
+     if input_table_settings.type_inference:
          engine = 'openpyxl'
-     elif received_table.start_row > 0 and received_table.start_column == 0:
-         engine = 'calamine' if received_table.has_headers else 'xlsx2csv'
-     elif received_table.start_column > 0 or received_table.start_row > 0:
+     elif input_table_settings.start_row > 0 and input_table_settings.start_column == 0:
+         engine = 'calamine' if input_table_settings.has_headers else 'xlsx2csv'
+     elif input_table_settings.start_column > 0 or input_table_settings.start_row > 0:
          engine = 'openpyxl'
      else:
          engine = 'calamine'

-     sheet_name = received_table.sheet_name
+     sheet_name = input_table_settings.sheet_name

      if engine == 'calamine':
          df = df_from_calamine_xlsx(file_path=received_table.abs_file_path, sheet_name=sheet_name,
-                                    start_row=received_table.start_row, end_row=received_table.end_row)
-         if received_table.end_column > 0:
-             end_col_index = received_table.end_column
-             cols_to_select = [df.columns[i] for i in range(received_table.start_column, end_col_index)]
+                                    start_row=input_table_settings.start_row, end_row=input_table_settings.end_row)
+         if input_table_settings.end_column > 0:
+             end_col_index = input_table_settings.end_column
+             cols_to_select = [df.columns[i] for i in range(input_table_settings.start_column, end_col_index)]
              df = df.select(cols_to_select)

      elif engine == 'xlsx2csv':
-         csv_options = {'has_header': received_table.has_headers, 'skip_rows': received_table.start_row}
+         csv_options = {'has_header': input_table_settings.has_headers, 'skip_rows': input_table_settings.start_row}
          df = pl.read_excel(source=received_table.abs_file_path,
                             read_options=csv_options,
                             engine='xlsx2csv',
-                            sheet_name=received_table.sheet_name)
-         end_col_index = received_table.end_column if received_table.end_column > 0 else len(df.columns)
-         cols_to_select = [df.columns[i] for i in range(received_table.start_column, end_col_index)]
+                            sheet_name=input_table_settings.sheet_name)
+         end_col_index = input_table_settings.end_column if input_table_settings.end_column > 0 else len(df.columns)
+         cols_to_select = [df.columns[i] for i in range(input_table_settings.start_column, end_col_index)]
          df = df.select(cols_to_select)
-         if 0 < received_table.end_row < len(df):
-             df = df.head(received_table.end_row)
+         if 0 < input_table_settings.end_row < len(df):
+             df = df.head(input_table_settings.end_row)

      else:
-         max_col = received_table.end_column if received_table.end_column > 0 else None
-         max_row = received_table.end_row + 1 if received_table.end_row > 0 else None
+         max_col = input_table_settings.end_column if input_table_settings.end_column > 0 else None
+         max_row = input_table_settings.end_row + 1 if input_table_settings.end_row > 0 else None
          df = df_from_openpyxl(file_path=received_table.abs_file_path,
-                               sheet_name=received_table.sheet_name,
-                               min_row=received_table.start_row + 1,
-                               min_col=received_table.start_column + 1,
+                               sheet_name=input_table_settings.sheet_name,
+                               min_row=input_table_settings.start_row + 1,
+                               min_col=input_table_settings.start_column + 1,
                                max_row=max_row,
-                               max_col=max_col, has_headers=received_table.has_headers)
+                               max_col=max_col, has_headers=input_table_settings.has_headers)
      return df
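The pattern above is consistent across all four readers: each function now accepts the generic `ReceivedTable`, narrows `table_settings` with an `isinstance` guard, and only then touches format-specific options. A minimal usage sketch (the file path is hypothetical and must exist on disk; names are taken from this diff):

```python
# Usage sketch for the refactored reader API (assumes flowfile 0.5.1 is installed).
from flowfile_worker.create.models import ReceivedTable, InputCsvTable
from flowfile_worker.create.funcs import create_from_path_csv

# table_settings now carries the format-specific options; the reader narrows
# the union with an isinstance check before touching CSV-only fields.
received = ReceivedTable(
    path="data/sales.csv",  # hypothetical file; the size check reads it from disk
    file_type="csv",
    table_settings=InputCsvTable(delimiter=";", has_headers=True),
)
df = create_from_path_csv(received)  # raises ValueError if settings are not InputCsvTable
```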
flowfile_worker/create/models.py CHANGED
@@ -1,5 +1,5 @@
- from pydantic import BaseModel, Field, model_validator
- from typing import List, Optional
+ from pydantic import BaseModel, Field, model_validator, field_validator
+ from typing import List, Optional, Literal, Annotated
  import os
  from pathlib import Path

@@ -9,78 +9,146 @@ class MinimalFieldInfo(BaseModel):
      data_type: str


- class ReceivedTableBase(BaseModel):
-     id: Optional[int] = None
-     name: str
-     path: str
-     directory: Optional[str] = None
-     analysis_file_available: Optional[bool] = False
-     status: Optional[str] = None
-     file_type: Optional[str] = None
-     fields: List[MinimalFieldInfo] = Field(default_factory=list)
-     abs_file_path: Optional[str] = None
+ class InputTableBase(BaseModel):
+     """Base settings for input file operations."""
+     file_type: str  # Will be overridden with Literal in subclasses

-     @classmethod
-     def create_from_path(cls, path: str):
-         filename = os.path.basename(path)
-         return cls(name=filename, path=path)

-     @property
-     def file_path(self) -> str:
-         if self.name not in self.path:
-             return os.path.join(self.path, self.name)
-         return self.path
-
-     @model_validator(mode="after")
-     def set_abs_file_path(cls, values):
-         abs_file_path = getattr(values, "abs_file_path", None)
-         if abs_file_path is None:
-             path = getattr(values, "path", None)
-             if not path:
-                 raise ValueError("Field 'path' is required to compute abs_file_path")
-             setattr(values, "abs_file_path", str(Path(path).absolute()))
-         return values
-
-
- class ReceivedCsvTable(ReceivedTableBase):
-     file_type: Optional[str] = 'csv'
-     reference: Optional[str] = ''
-     starting_from_line: Optional[int] = 0
-     delimiter: Optional[str] = ','
-     has_headers: Optional[bool] = True
-     encoding: Optional[str] = 'utf-8'
+ class InputCsvTable(InputTableBase):
+     """Defines settings for reading a CSV file."""
+     file_type: Literal['csv'] = 'csv'
+     reference: str = ''
+     starting_from_line: int = 0
+     delimiter: str = ','
+     has_headers: bool = True
+     encoding: str = 'utf-8'
      parquet_ref: Optional[str] = None
-     row_delimiter: Optional[str] = '\n'
-     quote_char: Optional[str] = '"'
-     infer_schema_length: Optional[int] = 10_000
-     truncate_ragged_lines: Optional[bool] = False
-     ignore_errors: Optional[bool] = False
+     row_delimiter: str = '\n'
+     quote_char: str = '"'
+     infer_schema_length: int = 10_000
+     truncate_ragged_lines: bool = False
+     ignore_errors: bool = False


- class ReceivedJsonTable(ReceivedCsvTable):
-     pass
+ class InputJsonTable(InputCsvTable):
+     """Defines settings for reading a JSON file."""
+     file_type: Literal['json'] = 'json'


- class ReceivedParquetTable(ReceivedTableBase):
-     file_type: Optional[str] = 'parquet'
+ class InputParquetTable(InputTableBase):
+     """Defines settings for reading a Parquet file."""
+     file_type: Literal['parquet'] = 'parquet'


- class ReceivedExcelTable(ReceivedTableBase):
+ class InputExcelTable(InputTableBase):
+     """Defines settings for reading an Excel file."""
+     file_type: Literal['excel'] = 'excel'
      sheet_name: Optional[str] = None
-     start_row: Optional[int] = 0  # optional
-     start_column: Optional[int] = 0  # optional
-     end_row: Optional[int] = 0  # optional
-     end_column: Optional[int] = 0  # optional
-     has_headers: Optional[bool] = True  # optional
-     type_inference: Optional[bool] = False  # optional
-
+     start_row: int = 0
+     start_column: int = 0
+     end_row: int = 0
+     end_column: int = 0
+     has_headers: bool = True
+     type_inference: bool = False
+
+     @model_validator(mode='after')
      def validate_range_values(self):
-         # Validate that start and end rows/columns are non-negative integers
+         """Validates that the Excel cell range is logical."""
          for attribute in [self.start_row, self.start_column, self.end_row, self.end_column]:
              if not isinstance(attribute, int) or attribute < 0:
                  raise ValueError("Row and column indices must be non-negative integers")
+         if (self.end_row > 0 and self.start_row > self.end_row) or \
+                 (self.end_column > 0 and self.start_column > self.end_column):
+             raise ValueError("Start row/column must not be greater than end row/column")
+         return self
+
+
+ # Create the discriminated union (similar to OutputTableSettings)
+ InputTableSettings = Annotated[
+     InputCsvTable | InputJsonTable | InputParquetTable | InputExcelTable,
+     Field(discriminator='file_type')
+ ]
+
+
+ # Now create the main ReceivedTable model
+ class ReceivedTable(BaseModel):
+     """Model for defining a table received from an external source."""
+     # Metadata fields
+     id: Optional[int] = None
+     name: Optional[str] = None
+     path: str  # This can be an absolute or relative path
+     directory: Optional[str] = None
+     analysis_file_available: bool = False
+     status: Optional[str] = None
+     fields: List[MinimalFieldInfo] = Field(default_factory=list)
+     abs_file_path: Optional[str] = None
+
+     file_type: Literal['csv', 'json', 'parquet', 'excel']
+
+     table_settings: InputTableSettings
+
+     @classmethod
+     def create_from_path(cls, path: str, file_type: Literal['csv', 'json', 'parquet', 'excel'] = 'csv'):
+         """Creates an instance from a file path string."""
+         filename = Path(path).name
+
+         # Create appropriate table_settings based on file_type
+         settings_map = {
+             'csv': InputCsvTable(),
+             'json': InputJsonTable(),
+             'parquet': InputParquetTable(),
+             'excel': InputExcelTable(),
+         }
+
+         return cls(
+             name=filename,
+             path=path,
+             file_type=file_type,
+             table_settings=settings_map.get(file_type, InputCsvTable())
+         )
+
+     @property
+     def file_path(self) -> str:
+         """Constructs the full file path from the directory and name."""
+         if self.name and self.name not in self.path:
+             return os.path.join(self.path, self.name)
+         else:
+             return self.path
+
+     def set_absolute_filepath(self):
+         """Resolves the path to an absolute file path."""
+         base_path = Path(self.path).expanduser()
+         if not base_path.is_absolute():
+             base_path = Path.cwd() / base_path
+         if self.name and self.name not in base_path.name:
+             base_path = base_path / self.name
+         self.abs_file_path = str(base_path.resolve())
+
+     @field_validator('table_settings', mode='before')
+     @classmethod
+     def validate_table_settings(cls, v, info):
+         """Ensures table_settings matches the file_type."""
+         if v is None:
+             file_type = info.data.get('file_type', 'csv')
+             # Create default based on file_type
+             settings_map = {
+                 'csv': InputCsvTable(),
+                 'json': InputJsonTable(),
+                 'parquet': InputParquetTable(),
+                 'excel': InputExcelTable(),
+             }
+             return settings_map.get(file_type, InputCsvTable())
+
+         # If it's a dict, add file_type if missing
+         if isinstance(v, dict) and 'file_type' not in v:
+             v['file_type'] = info.data.get('file_type', 'csv')
+
+         return v
+
+     @model_validator(mode='after')
+     def populate_abs_file_path(self):
+         """Ensures the absolute file path is populated after validation."""
+         if not self.abs_file_path:
+             self.set_absolute_filepath()
+         return self

-     # Validate that start is before end if end is specified (non-zero)
-     if (0 < self.end_row < self.start_row) or \
-        (0 < self.end_column < self.start_column):
-         raise ValueError("Start row/column must not be greater than end row/column if specified")
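Here `file_type` doubles as the discriminator of the `InputTableSettings` union, so pydantic v2 resolves the concrete settings class directly from a dict payload, and the before-validator injects the discriminator when the nested dict omits it. A small sketch of the expected behaviour, assuming only the models shown above:

```python
# Sketch of discriminated-union resolution from a plain dict payload
# (pydantic v2; the 'file_type' key selects the settings class).
from flowfile_worker.create.models import ReceivedTable, InputExcelTable

payload = {
    "path": "reports/q1.xlsx",  # hypothetical path; resolution does not require the file to exist
    "file_type": "excel",
    # 'file_type' is injected into table_settings by the before-validator,
    # so the nested dict does not need to repeat it:
    "table_settings": {"sheet_name": "Sheet1", "start_row": 2},
}
received = ReceivedTable.model_validate(payload)
assert isinstance(received.table_settings, InputExcelTable)
assert received.abs_file_path is not None  # populated by the after-validator
```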
flowfile_worker/main.py CHANGED
@@ -4,8 +4,11 @@ import signal

  from contextlib import asynccontextmanager
  from fastapi import FastAPI
+
+ from shared.storage_config import storage
+
  from flowfile_worker.routes import router
- from flowfile_worker import mp_context, CACHE_DIR
+ from flowfile_worker import mp_context
  from flowfile_worker.configs import logger, FLOWFILE_CORE_URI, SERVICE_HOST, SERVICE_PORT


@@ -30,7 +33,7 @@ async def shutdown_handler(app: FastAPI):
          logger.error(f"Error cleaning up process: {e}")

      try:
-         CACHE_DIR.cleanup()
+         storage.cleanup_directories()
      except Exception as e:
          print(f"Error cleaning up cache directory: {e}")

flowfile_worker/routes.py CHANGED
@@ -8,22 +8,30 @@ from base64 import encodebytes
  from flowfile_worker import status_dict, CACHE_DIR, PROCESS_MEMORY_USAGE, status_dict_lock
  from flowfile_worker import models
  from flowfile_worker.spawner import start_process, start_fuzzy_process, start_generic_process, process_manager
- from flowfile_worker.create import table_creator_factory_method, received_table_parser, FileType
+ from flowfile_worker.create import table_creator_factory_method, FileType
  from flowfile_worker.configs import logger
  from flowfile_worker.external_sources.sql_source.models import DatabaseReadSettings
- from flowfile_worker.external_sources.sql_source.main import read_sql_source, write_serialized_df_to_database
-
+ from flowfile_worker.external_sources.sql_source.main import read_sql_source
+ from flowfile_worker.create.models import ReceivedTable

  router = APIRouter()


+ def create_and_get_default_cache_dir(flowfile_flow_id: int) -> str:
+     default_cache_dir = CACHE_DIR / str(flowfile_flow_id)
+     default_cache_dir.mkdir(parents=True, exist_ok=True)
+     return str(default_cache_dir)
+
+
  @router.post("/submit_query/")
  def submit_query(polars_script: models.PolarsScript, background_tasks: BackgroundTasks) -> models.Status:
      logger.info(f"Processing query with operation: {polars_script.operation_type}")

      try:
          polars_script.task_id = str(uuid.uuid4()) if polars_script.task_id is None else polars_script.task_id
-         polars_script.cache_dir = polars_script.cache_dir if polars_script.cache_dir is not None else CACHE_DIR.name
+         default_cache_dir = create_and_get_default_cache_dir(polars_script.flowfile_flow_id)
+
+         polars_script.cache_dir = polars_script.cache_dir if polars_script.cache_dir is not None else default_cache_dir
          polars_serializable_object = polars_script.polars_serializable_object()
          file_path = os.path.join(polars_script.cache_dir, f"{polars_script.task_id}.arrow")
          result_type = "polars" if polars_script.operation_type == "store" else "other"
@@ -49,8 +57,9 @@ def store_sample(polars_script: models.PolarsScriptSample, background_tasks: Bac
      logger.info(f"Processing sample storage with size: {polars_script.sample_size}")

      try:
+         default_cache_dir = create_and_get_default_cache_dir(polars_script.flowfile_flow_id)
          polars_script.task_id = str(uuid.uuid4()) if polars_script.task_id is None else polars_script.task_id
-         polars_script.cache_dir = polars_script.cache_dir if polars_script.cache_dir is not None else CACHE_DIR.name
+         polars_script.cache_dir = polars_script.cache_dir if polars_script.cache_dir is not None else default_cache_dir
          polars_serializable_object = polars_script.polars_serializable_object()

          file_path = os.path.join(polars_script.cache_dir, f"{polars_script.task_id}.arrow")
@@ -210,7 +219,8 @@ def store_sql_db_result(database_read_settings: DatabaseReadSettings, background

      try:
          task_id = str(uuid.uuid4())
-         file_path = os.path.join(CACHE_DIR.name, f"{task_id}.arrow")
+         file_path = os.path.join(create_and_get_default_cache_dir(database_read_settings.flowfile_flow_id),
+                                  f"{task_id}.arrow")
          status = models.Status(background_task_id=task_id, status="Starting", file_ref=file_path,
                                 result_type="polars")
          status_dict[task_id] = status
@@ -227,7 +237,7 @@


  @router.post('/create_table/{file_type}')
- def create_table(file_type: FileType, received_table: Dict, background_tasks: BackgroundTasks,
+ def create_table(file_type: FileType, received_table: ReceivedTable, background_tasks: BackgroundTasks,
                   flowfile_flow_id: int = 1, flowfile_node_id: int | str = -1) -> models.Status:
      """
      Create a Polars table from received dictionary data based on specified file type.
@@ -243,18 +253,15 @@ def create_table(file_type: FileType, received_table: Dict, background_tasks: Ba
          models.Status: Status object tracking the table creation
      """
      logger.info(f"Creating table of type: {file_type}")
-
      try:
          task_id = str(uuid.uuid4())
-         file_ref = os.path.join(CACHE_DIR.name, f"{task_id}.arrow")
-
+         file_ref = os.path.join(create_and_get_default_cache_dir(flowfile_flow_id), f"{task_id}.arrow")
          status = models.Status(background_task_id=task_id, status="Starting", file_ref=file_ref,
                                 result_type="polars")
          status_dict[task_id] = status
          func_ref = table_creator_factory_method(file_type)
-         received_table_parsed = received_table_parser(received_table, file_type)
          background_tasks.add_task(start_generic_process, func_ref=func_ref, file_ref=file_ref,
-                                   task_id=task_id, kwargs={'received_table': received_table_parsed},
+                                   task_id=task_id, kwargs={'received_table': received_table},
                                    flowfile_flow_id=flowfile_flow_id,
                                    flowfile_node_id=flowfile_node_id)
          logger.info(f"Started table creation task: {task_id}")
@@ -382,8 +389,9 @@ async def add_fuzzy_join(polars_script: models.FuzzyJoinInput, background_tasks:
      """
      logger.info("Starting fuzzy join operation")
      try:
+         default_cache_dir = create_and_get_default_cache_dir(polars_script.flowfile_flow_id)
          polars_script.task_id = str(uuid.uuid4()) if polars_script.task_id is None else polars_script.task_id
-         polars_script.cache_dir = polars_script.cache_dir if polars_script.cache_dir is not None else CACHE_DIR.name
+         polars_script.cache_dir = polars_script.cache_dir if polars_script.cache_dir is not None else default_cache_dir
          left_serializable_object = polars_script.left_df_operation.polars_serializable_object()
          right_serializable_object = polars_script.right_df_operation.polars_serializable_object()

@@ -405,6 +413,37 @@
          raise HTTPException(status_code=500, detail=str(e))


+ @router.delete("/clear_task/{task_id}")
+ def clear_task(task_id: str):
+     """
+     Clear task data and status by ID.
+
+     Args:
+         task_id: Unique identifier of the task to clear
+     Returns:
+         dict: Success message
+     Raises:
+         HTTPException: If task not found
+     """
+
+     logger.info(f"Clearing task: {task_id}")
+     status = status_dict.get(task_id)
+     if not status:
+         logger.warning(f"Task not found for clearing: {task_id}")
+         raise HTTPException(status_code=404, detail="Task not found")
+     try:
+         if os.path.exists(status.file_ref):
+             os.remove(status.file_ref)
+             logger.debug(f"Removed file: {status.file_ref}")
+     except Exception as e:
+         logger.error(f"Error removing file {status.file_ref}: {str(e)}", exc_info=True)
+     with status_dict_lock:
+         status_dict.pop(task_id, None)
+         PROCESS_MEMORY_USAGE.pop(task_id, None)
+     logger.info(f"Successfully cleared task: {task_id}")
+     return {"message": f"Task {task_id} has been cleared."}
+
+
  @router.post("/cancel_task/{task_id}")
  def cancel_task(task_id: str):
      """Cancel a running task by ID.
shared/__init__.py ADDED
@@ -0,0 +1,15 @@
+ """
+ Shared utilities for Flowfile services.
+ This package contains common functionality that can be used across
+ flowfile_core, flowfile_worker, and other components without creating
+ circular dependencies.
+ """
+
+ from .storage_config import storage, get_cache_directory, get_temp_directory, get_flows_directory
+
+ __all__ = [
+     'storage',
+     'get_cache_directory',
+     'get_temp_directory',
+     'get_flows_directory'
+ ]
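`shared/storage_config.py` itself (258 new lines) is not shown in this excerpt. A minimal sketch of what the facade could look like, consistent only with the names visible in this diff (`storage.cleanup_directories()` in `flowfile_worker/main.py` plus the three getters re-exported above); the base directory layout and cleanup policy are assumptions, not the real implementation:

```python
# Hypothetical reconstruction of shared/storage_config.py; the actual module
# is not included in this diff and is certainly richer than this.
import shutil
import tempfile
from pathlib import Path


class _Storage:
    """Owns the on-disk directories shared by core and worker (assumed layout)."""

    def __init__(self) -> None:
        base = Path(tempfile.gettempdir()) / "flowfile"  # assumed location
        self.cache_dir = base / "cache"
        self.temp_dir = base / "tmp"
        self.flows_dir = base / "flows"
        for d in (self.cache_dir, self.temp_dir, self.flows_dir):
            d.mkdir(parents=True, exist_ok=True)

    def cleanup_directories(self) -> None:
        """Remove transient data on shutdown, keeping saved flows."""
        for d in (self.cache_dir, self.temp_dir):
            shutil.rmtree(d, ignore_errors=True)


storage = _Storage()


def get_cache_directory() -> Path:
    return storage.cache_dir


def get_temp_directory() -> Path:
    return storage.temp_dir


def get_flows_directory() -> Path:
    return storage.flows_dir
```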