Flowfile 0.3.9__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- flowfile/__init__.py +8 -1
- flowfile/api.py +1 -3
- flowfile/web/static/assets/{CloudConnectionManager-c97c25f8.js → CloudConnectionManager-0dfba9f2.js} +2 -2
- flowfile/web/static/assets/{CloudStorageReader-f1ff509e.js → CloudStorageReader-d5b1b6c9.js} +11 -78
- flowfile/web/static/assets/{CloudStorageWriter-034f8b78.js → CloudStorageWriter-00d87aad.js} +12 -79
- flowfile/web/static/assets/{CloudStorageWriter-49c9a4b2.css → CloudStorageWriter-b0ee067f.css} +24 -24
- flowfile/web/static/assets/ColumnSelector-4685e75d.js +83 -0
- flowfile/web/static/assets/ColumnSelector-47996a16.css +10 -0
- flowfile/web/static/assets/ContextMenu-23e909da.js +41 -0
- flowfile/web/static/assets/{SettingsSection-9c836ecc.css → ContextMenu-4c74eef1.css} +0 -21
- flowfile/web/static/assets/ContextMenu-63cfa99b.css +26 -0
- flowfile/web/static/assets/ContextMenu-70ae0c79.js +41 -0
- flowfile/web/static/assets/ContextMenu-c13f91d0.css +26 -0
- flowfile/web/static/assets/ContextMenu-f149cf7c.js +41 -0
- flowfile/web/static/assets/{CrossJoin-41efa4cb.css → CrossJoin-1119d18e.css} +18 -18
- flowfile/web/static/assets/{CrossJoin-9e156ebe.js → CrossJoin-702a3edd.js} +14 -84
- flowfile/web/static/assets/CustomNode-74a37f74.css +32 -0
- flowfile/web/static/assets/CustomNode-b1519993.js +211 -0
- flowfile/web/static/assets/{DatabaseConnectionSettings-d5c625b3.js → DatabaseConnectionSettings-6f3e4ea5.js} +3 -3
- flowfile/web/static/assets/{DatabaseManager-265adc5e.js → DatabaseManager-cf5ef661.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-f50c6558.css → DatabaseReader-ae61773c.css} +0 -27
- flowfile/web/static/assets/{DatabaseReader-0b10551e.js → DatabaseReader-d38c7295.js} +14 -114
- flowfile/web/static/assets/{DatabaseWriter-c17c6916.js → DatabaseWriter-b04ef46a.js} +13 -74
- flowfile/web/static/assets/{ExploreData-5bdae813.css → ExploreData-2d0cf4db.css} +8 -14
- flowfile/web/static/assets/ExploreData-5fa10ed8.js +192 -0
- flowfile/web/static/assets/{ExternalSource-3a66556c.js → ExternalSource-d39af878.js} +8 -79
- flowfile/web/static/assets/{Filter-91ad87e7.js → Filter-9b6d08db.js} +12 -85
- flowfile/web/static/assets/{Filter-a9d08ba1.css → Filter-f62091b3.css} +3 -3
- flowfile/web/static/assets/{Formula-3c395ab1.js → Formula-6b04fb1d.js} +20 -87
- flowfile/web/static/assets/{Formula-29f19d21.css → Formula-bb96803d.css} +4 -4
- flowfile/web/static/assets/{FuzzyMatch-6857de82.css → FuzzyMatch-1010f966.css} +42 -42
- flowfile/web/static/assets/{FuzzyMatch-2df0d230.js → FuzzyMatch-999521f4.js} +16 -87
- flowfile/web/static/assets/{GraphSolver-d285877f.js → GraphSolver-17dd2198.js} +13 -159
- flowfile/web/static/assets/GraphSolver-f0cb7bfb.css +22 -0
- flowfile/web/static/assets/{GroupBy-0bd1cc6b.js → GroupBy-6b039e18.js} +12 -75
- flowfile/web/static/assets/{Unique-b5615727.css → GroupBy-b9505323.css} +8 -8
- flowfile/web/static/assets/{Join-5a78a203.js → Join-24d0f113.js} +15 -85
- flowfile/web/static/assets/{Join-f45eff22.css → Join-fd79b451.css} +20 -20
- flowfile/web/static/assets/{ManualInput-a71b52c6.css → ManualInput-3246a08d.css} +20 -20
- flowfile/web/static/assets/{ManualInput-93aef9d6.js → ManualInput-34639209.js} +11 -82
- flowfile/web/static/assets/MultiSelect-0e8724a3.js +5 -0
- flowfile/web/static/assets/MultiSelect.vue_vue_type_script_setup_true_lang-b0e538c2.js +63 -0
- flowfile/web/static/assets/NumericInput-3d63a470.js +5 -0
- flowfile/web/static/assets/NumericInput.vue_vue_type_script_setup_true_lang-e0edeccc.js +35 -0
- flowfile/web/static/assets/Output-283fe388.css +37 -0
- flowfile/web/static/assets/{Output-411ecaee.js → Output-edea9802.js} +62 -273
- flowfile/web/static/assets/{Pivot-89db4b04.js → Pivot-61d19301.js} +14 -138
- flowfile/web/static/assets/Pivot-cf333e3d.css +22 -0
- flowfile/web/static/assets/PivotValidation-891ddfb0.css +13 -0
- flowfile/web/static/assets/PivotValidation-c46cd420.css +13 -0
- flowfile/web/static/assets/PivotValidation-de9f43fe.js +61 -0
- flowfile/web/static/assets/PivotValidation-f97fec5b.js +61 -0
- flowfile/web/static/assets/{PolarsCode-a9f974f8.js → PolarsCode-bc3c9984.js} +13 -80
- flowfile/web/static/assets/Read-64a3f259.js +218 -0
- flowfile/web/static/assets/Read-e808b239.css +62 -0
- flowfile/web/static/assets/RecordCount-3d5039be.js +53 -0
- flowfile/web/static/assets/{RecordId-55ae7d36.js → RecordId-597510e0.js} +8 -80
- flowfile/web/static/assets/SQLQueryComponent-36cef432.css +27 -0
- flowfile/web/static/assets/SQLQueryComponent-df51adbe.js +38 -0
- flowfile/web/static/assets/{Sample-b4a18476.js → Sample-4be0a507.js} +8 -77
- flowfile/web/static/assets/{SecretManager-b066d13a.js → SecretManager-4839be57.js} +2 -2
- flowfile/web/static/assets/{Select-727688dc.js → Select-9b72f201.js} +11 -85
- flowfile/web/static/assets/SettingsSection-2e4d03c4.css +21 -0
- flowfile/web/static/assets/SettingsSection-5c696bee.css +20 -0
- flowfile/web/static/assets/SettingsSection-71e6b7e3.css +21 -0
- flowfile/web/static/assets/SettingsSection-7ded385d.js +45 -0
- flowfile/web/static/assets/{SettingsSection-695ac487.js → SettingsSection-e1e9c953.js} +2 -40
- flowfile/web/static/assets/SettingsSection-f0f75a42.js +53 -0
- flowfile/web/static/assets/SingleSelect-6c777aac.js +5 -0
- flowfile/web/static/assets/SingleSelect.vue_vue_type_script_setup_true_lang-33e3ff9b.js +62 -0
- flowfile/web/static/assets/SliderInput-7cb93e62.js +40 -0
- flowfile/web/static/assets/SliderInput-b8fb6a8c.css +4 -0
- flowfile/web/static/assets/{GroupBy-ab1ea74b.css → Sort-3643d625.css} +8 -8
- flowfile/web/static/assets/{Sort-be3339a8.js → Sort-6cbde21a.js} +12 -97
- flowfile/web/static/assets/TextInput-d9a40c11.js +5 -0
- flowfile/web/static/assets/TextInput.vue_vue_type_script_setup_true_lang-5896c375.js +32 -0
- flowfile/web/static/assets/{TextToRows-c92d1ec2.css → TextToRows-5d2c1190.css} +9 -9
- flowfile/web/static/assets/{TextToRows-7b8998da.js → TextToRows-c4fcbf4d.js} +14 -83
- flowfile/web/static/assets/ToggleSwitch-4ef91d19.js +5 -0
- flowfile/web/static/assets/ToggleSwitch.vue_vue_type_script_setup_true_lang-38478c20.js +31 -0
- flowfile/web/static/assets/{UnavailableFields-8b0cb48e.js → UnavailableFields-a03f512c.js} +2 -2
- flowfile/web/static/assets/{Union-8d9ac7f9.css → Union-af6c3d9b.css} +6 -6
- flowfile/web/static/assets/Union-bfe9b996.js +77 -0
- flowfile/web/static/assets/{Unique-af5a80b4.js → Unique-5d023a27.js} +23 -104
- flowfile/web/static/assets/{Sort-7ccfa0fe.css → Unique-f9fb0809.css} +8 -8
- flowfile/web/static/assets/Unpivot-1e422df3.css +30 -0
- flowfile/web/static/assets/{Unpivot-5195d411.js → Unpivot-91cc5354.js} +12 -166
- flowfile/web/static/assets/UnpivotValidation-0d240eeb.css +13 -0
- flowfile/web/static/assets/UnpivotValidation-7ee2de44.js +51 -0
- flowfile/web/static/assets/{ExploreData-18a4fe52.js → VueGraphicWalker-e51b9924.js} +4 -264
- flowfile/web/static/assets/VueGraphicWalker-ed5ab88b.css +6 -0
- flowfile/web/static/assets/{api-cb00cce6.js → api-c1bad5ca.js} +1 -1
- flowfile/web/static/assets/{api-023d1733.js → api-cf1221f0.js} +1 -1
- flowfile/web/static/assets/{designer-2197d782.css → designer-8da3ba3a.css} +859 -201
- flowfile/web/static/assets/{designer-6c322d8e.js → designer-9633482a.js} +2297 -733
- flowfile/web/static/assets/{documentation-4d1fafe1.js → documentation-ca400224.js} +1 -1
- flowfile/web/static/assets/{dropDown-0b46dd77.js → dropDown-614b998d.js} +1 -1
- flowfile/web/static/assets/{fullEditor-ec4e4f95.js → fullEditor-f7971590.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-def5879b.js → genericNodeSettings-4fe5f36b.js} +3 -3
- flowfile/web/static/assets/{index-681a3ed0.css → index-50508d4d.css} +8 -0
- flowfile/web/static/assets/{index-683fc198.js → index-5429bbf8.js} +208 -31
- flowfile/web/static/assets/nodeInput-5d0d6b79.js +41 -0
- flowfile/web/static/assets/outputCsv-076b85ab.js +86 -0
- flowfile/web/static/assets/{Output-48f81019.css → outputCsv-9cc59e0b.css} +0 -143
- flowfile/web/static/assets/outputExcel-0fd17dbe.js +56 -0
- flowfile/web/static/assets/outputExcel-b41305c0.css +102 -0
- flowfile/web/static/assets/outputParquet-b61e0847.js +31 -0
- flowfile/web/static/assets/outputParquet-cf8cf3f2.css +4 -0
- flowfile/web/static/assets/readCsv-a8bb8b61.js +179 -0
- flowfile/web/static/assets/readCsv-c767cb37.css +52 -0
- flowfile/web/static/assets/readExcel-67b4aee0.js +201 -0
- flowfile/web/static/assets/readExcel-806d2826.css +64 -0
- flowfile/web/static/assets/readParquet-48c81530.css +19 -0
- flowfile/web/static/assets/readParquet-92ce1dbc.js +23 -0
- flowfile/web/static/assets/{secretApi-baceb6f9.js → secretApi-68435402.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-de91449a.js → selectDynamic-92e25ee3.js} +7 -7
- flowfile/web/static/assets/{selectDynamic-b062bc9b.css → selectDynamic-aa913ff4.css} +16 -16
- flowfile/web/static/assets/user-defined-icon-0ae16c90.png +0 -0
- flowfile/web/static/assets/{vue-codemirror.esm-dc5e3348.js → vue-codemirror.esm-41b0e0d7.js} +65 -36
- flowfile/web/static/assets/{vue-content-loader.es-ba94b82f.js → vue-content-loader.es-2c8e608f.js} +1 -1
- flowfile/web/static/index.html +2 -2
- {flowfile-0.3.9.dist-info → flowfile-0.5.1.dist-info}/METADATA +5 -3
- {flowfile-0.3.9.dist-info → flowfile-0.5.1.dist-info}/RECORD +191 -121
- {flowfile-0.3.9.dist-info → flowfile-0.5.1.dist-info}/WHEEL +1 -1
- {flowfile-0.3.9.dist-info → flowfile-0.5.1.dist-info}/entry_points.txt +1 -0
- flowfile_core/__init__.py +3 -0
- flowfile_core/configs/flow_logger.py +5 -13
- flowfile_core/configs/node_store/__init__.py +30 -0
- flowfile_core/configs/node_store/nodes.py +383 -99
- flowfile_core/configs/node_store/user_defined_node_registry.py +193 -0
- flowfile_core/configs/settings.py +2 -1
- flowfile_core/database/connection.py +5 -21
- flowfile_core/fileExplorer/funcs.py +239 -121
- flowfile_core/flowfile/analytics/analytics_processor.py +1 -0
- flowfile_core/flowfile/code_generator/code_generator.py +62 -64
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +73 -56
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +77 -86
- flowfile_core/flowfile/flow_data_engine/flow_file_column/interface.py +4 -0
- flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +19 -34
- flowfile_core/flowfile/flow_data_engine/flow_file_column/type_registry.py +36 -0
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +23 -23
- flowfile_core/flowfile/flow_data_engine/join/utils.py +1 -1
- flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +9 -4
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +212 -86
- flowfile_core/flowfile/flow_data_engine/utils.py +2 -0
- flowfile_core/flowfile/flow_graph.py +240 -54
- flowfile_core/flowfile/flow_node/flow_node.py +48 -13
- flowfile_core/flowfile/flow_node/models.py +2 -1
- flowfile_core/flowfile/handler.py +24 -5
- flowfile_core/flowfile/manage/compatibility_enhancements.py +404 -41
- flowfile_core/flowfile/manage/io_flowfile.py +394 -0
- flowfile_core/flowfile/node_designer/__init__.py +47 -0
- flowfile_core/flowfile/node_designer/_type_registry.py +197 -0
- flowfile_core/flowfile/node_designer/custom_node.py +371 -0
- flowfile_core/flowfile/node_designer/ui_components.py +277 -0
- flowfile_core/flowfile/schema_callbacks.py +17 -10
- flowfile_core/flowfile/setting_generator/settings.py +15 -10
- flowfile_core/main.py +5 -1
- flowfile_core/routes/routes.py +73 -30
- flowfile_core/routes/user_defined_components.py +55 -0
- flowfile_core/schemas/cloud_storage_schemas.py +0 -2
- flowfile_core/schemas/input_schema.py +228 -65
- flowfile_core/schemas/output_model.py +5 -2
- flowfile_core/schemas/schemas.py +153 -35
- flowfile_core/schemas/transform_schema.py +1083 -412
- flowfile_core/schemas/yaml_types.py +103 -0
- flowfile_core/types.py +156 -0
- flowfile_core/utils/validate_setup.py +3 -1
- flowfile_frame/__init__.py +3 -1
- flowfile_frame/flow_frame.py +31 -24
- flowfile_frame/flow_frame_methods.py +12 -9
- flowfile_worker/__init__.py +9 -35
- flowfile_worker/create/__init__.py +3 -21
- flowfile_worker/create/funcs.py +68 -56
- flowfile_worker/create/models.py +130 -62
- flowfile_worker/main.py +5 -2
- flowfile_worker/routes.py +52 -13
- shared/__init__.py +15 -0
- shared/storage_config.py +258 -0
- tools/migrate/README.md +56 -0
- tools/migrate/__init__.py +12 -0
- tools/migrate/__main__.py +131 -0
- tools/migrate/legacy_schemas.py +621 -0
- tools/migrate/migrate.py +598 -0
- tools/migrate/tests/__init__.py +0 -0
- tools/migrate/tests/conftest.py +23 -0
- tools/migrate/tests/test_migrate.py +627 -0
- tools/migrate/tests/test_migration_e2e.py +1010 -0
- tools/migrate/tests/test_node_migrations.py +813 -0
- flowfile/web/static/assets/GraphSolver-17fd26db.css +0 -68
- flowfile/web/static/assets/Pivot-f415e85f.css +0 -35
- flowfile/web/static/assets/Read-80dc1675.css +0 -197
- flowfile/web/static/assets/Read-c3b1929c.js +0 -701
- flowfile/web/static/assets/RecordCount-4e95f98e.js +0 -122
- flowfile/web/static/assets/Union-89fd73dc.js +0 -146
- flowfile/web/static/assets/Unpivot-246e9bbd.css +0 -77
- flowfile/web/static/assets/nodeTitle-a16db7c3.js +0 -227
- flowfile/web/static/assets/nodeTitle-f4b12bcb.css +0 -134
- flowfile_core/flowfile/manage/open_flowfile.py +0 -135
- {flowfile-0.3.9.dist-info → flowfile-0.5.1.dist-info/licenses}/LICENSE +0 -0
- /flowfile_core/flowfile/manage/manage_flowfile.py → /tools/__init__.py +0 -0
|
@@ -63,7 +63,6 @@ class FlowGraphToPolarsConverter:
|
|
|
63
63
|
"""Generate Polars code for a specific node."""
|
|
64
64
|
node_type = node.node_type
|
|
65
65
|
settings = node.setting_input
|
|
66
|
-
# Skip placeholder nodes
|
|
67
66
|
if isinstance(settings, input_schema.NodePromise):
|
|
68
67
|
self._add_comment(f"# Skipping uninitialized node: {node.node_id}")
|
|
69
68
|
return
|
|
@@ -71,7 +70,7 @@ class FlowGraphToPolarsConverter:
|
|
|
71
70
|
var_name = f"df_{node.node_id}"
|
|
72
71
|
self.node_var_mapping[node.node_id] = var_name
|
|
73
72
|
self.handle_output_node(node, var_name)
|
|
74
|
-
if node.node_template.output>0:
|
|
73
|
+
if node.node_template.output > 0:
|
|
75
74
|
self.last_node_var = var_name
|
|
76
75
|
# Get input variable names
|
|
77
76
|
input_vars = self._get_input_vars(node)
|
|
@@ -111,25 +110,25 @@ class FlowGraphToPolarsConverter:
|
|
|
111
110
|
return input_vars
|
|
112
111
|
|
|
113
112
|
def _handle_csv_read(self, file_settings: input_schema.ReceivedTable, var_name: str):
|
|
114
|
-
if file_settings.encoding.lower() in ('utf-8', 'utf8'):
|
|
113
|
+
if file_settings.table_settings.encoding.lower() in ('utf-8', 'utf8'):
|
|
115
114
|
encoding = "utf8-lossy"
|
|
116
115
|
self._add_code(f"{var_name} = pl.scan_csv(")
|
|
117
116
|
self._add_code(f' "{file_settings.abs_file_path}",')
|
|
118
|
-
self._add_code(f' separator="{file_settings.delimiter}",')
|
|
119
|
-
self._add_code(f' has_header={file_settings.has_headers},')
|
|
120
|
-
self._add_code(f' ignore_errors={file_settings.ignore_errors},')
|
|
117
|
+
self._add_code(f' separator="{file_settings.table_settings.delimiter}",')
|
|
118
|
+
self._add_code(f' has_header={file_settings.table_settings.has_headers},')
|
|
119
|
+
self._add_code(f' ignore_errors={file_settings.table_settings.ignore_errors},')
|
|
121
120
|
self._add_code(f' encoding="{encoding}",')
|
|
122
|
-
self._add_code(f' skip_rows={file_settings.starting_from_line},')
|
|
121
|
+
self._add_code(f' skip_rows={file_settings.table_settings.starting_from_line},')
|
|
123
122
|
self._add_code(")")
|
|
124
123
|
else:
|
|
125
124
|
self._add_code(f"{var_name} = pl.read_csv(")
|
|
126
125
|
self._add_code(f' "{file_settings.abs_file_path}",')
|
|
127
|
-
self._add_code(f' separator="{file_settings.delimiter}",')
|
|
128
|
-
self._add_code(f' has_header={file_settings.has_headers},')
|
|
129
|
-
self._add_code(f' ignore_errors={file_settings.ignore_errors},')
|
|
130
|
-
if file_settings.encoding:
|
|
131
|
-
self._add_code(f' encoding="{file_settings.encoding}",')
|
|
132
|
-
self._add_code(f' skip_rows={file_settings.starting_from_line},')
|
|
126
|
+
self._add_code(f' separator="{file_settings.table_settings.delimiter}",')
|
|
127
|
+
self._add_code(f' has_header={file_settings.table_settings.has_headers},')
|
|
128
|
+
self._add_code(f' ignore_errors={file_settings.table_settings.ignore_errors},')
|
|
129
|
+
if file_settings.table_settings.encoding:
|
|
130
|
+
self._add_code(f' encoding="{file_settings.table_settings.encoding}",')
|
|
131
|
+
self._add_code(f' skip_rows={file_settings.table_settings.starting_from_line},')
|
|
133
132
|
self._add_code(").lazy()")
|
|
134
133
|
|
|
135
134
|
def _handle_cloud_storage_reader(self, settings: input_schema.NodeCloudStorageReader, var_name: str, input_vars: Dict[str, str]):
|
|
@@ -181,8 +180,8 @@ class FlowGraphToPolarsConverter:
|
|
|
181
180
|
elif file_settings.file_type in ('xlsx', 'excel'):
|
|
182
181
|
self._add_code(f"{var_name} = pl.read_excel(")
|
|
183
182
|
self._add_code(f' "{file_settings.abs_file_path}",')
|
|
184
|
-
if file_settings.sheet_name:
|
|
185
|
-
self._add_code(f' sheet_name="{file_settings.sheet_name}",')
|
|
183
|
+
if file_settings.table_settings.sheet_name:
|
|
184
|
+
self._add_code(f' sheet_name="{file_settings.table_settings.sheet_name}",')
|
|
186
185
|
self._add_code(").lazy()")
|
|
187
186
|
|
|
188
187
|
self._add_code("")
|
|
@@ -296,7 +295,6 @@ class FlowGraphToPolarsConverter:
|
|
|
296
295
|
"""
|
|
297
296
|
left_df = input_vars.get('main', input_vars.get('main_0', 'df_left'))
|
|
298
297
|
right_df = input_vars.get('right', input_vars.get('main_1', 'df_right'))
|
|
299
|
-
|
|
300
298
|
# Ensure left and right DataFrames are distinct
|
|
301
299
|
if left_df == right_df:
|
|
302
300
|
right_df = "df_right"
|
|
@@ -359,26 +357,25 @@ class FlowGraphToPolarsConverter:
|
|
|
359
357
|
Returns:
|
|
360
358
|
None: Modifies internal state by adding generated code
|
|
361
359
|
"""
|
|
362
|
-
settings.join_input
|
|
363
|
-
|
|
360
|
+
join_input_manager = transform_schema.JoinInputManager(settings.join_input)
|
|
361
|
+
join_input_manager.auto_rename()
|
|
364
362
|
# Get join keys
|
|
365
|
-
left_on, right_on = self._get_join_keys(
|
|
363
|
+
left_on, right_on = self._get_join_keys(join_input_manager)
|
|
366
364
|
|
|
367
365
|
# Apply pre-join transformations
|
|
368
|
-
left_df, right_df = self._apply_pre_join_transformations(
|
|
369
|
-
|
|
366
|
+
left_df, right_df = self._apply_pre_join_transformations(join_input_manager, left_df, right_df)
|
|
370
367
|
# Handle join-specific key transformations
|
|
371
368
|
left_on, right_on, reverse_action, after_join_drop_cols = self._handle_join_key_transformations(
|
|
372
|
-
|
|
369
|
+
join_input_manager, left_df, right_df, left_on, right_on
|
|
373
370
|
)
|
|
374
|
-
|
|
375
371
|
# Execute the join
|
|
376
372
|
self._execute_join_with_post_processing(
|
|
377
373
|
settings, var_name, left_df, right_df, left_on, right_on,
|
|
378
374
|
after_join_drop_cols, reverse_action
|
|
379
375
|
)
|
|
380
376
|
|
|
381
|
-
|
|
377
|
+
@staticmethod
|
|
378
|
+
def _get_join_keys(settings: transform_schema.JoinInputManager) -> Tuple[List[str], List[str]]:
|
|
382
379
|
"""Extract join keys based on join type.
|
|
383
380
|
|
|
384
381
|
Different join types require different handling of join keys:
|
|
@@ -391,16 +388,16 @@ class FlowGraphToPolarsConverter:
|
|
|
391
388
|
Returns:
|
|
392
389
|
Tuple[List[str], List[str]]: Lists of (left_on, right_on) column names
|
|
393
390
|
"""
|
|
394
|
-
left_on = [jm.left_col for jm in settings.
|
|
391
|
+
left_on = [jm.left_col for jm in settings.get_names_for_table_rename()]
|
|
395
392
|
|
|
396
|
-
if settings.
|
|
397
|
-
right_on = [jm.right_col for jm in settings.
|
|
393
|
+
if settings.how in ("outer", "right"):
|
|
394
|
+
right_on = [jm.right_col for jm in settings.get_names_for_table_rename()]
|
|
398
395
|
else:
|
|
399
|
-
right_on = [jm.right_col for jm in settings.
|
|
396
|
+
right_on = [jm.right_col for jm in settings.join_mapping]
|
|
400
397
|
|
|
401
398
|
return left_on, right_on
|
|
402
399
|
|
|
403
|
-
def _apply_pre_join_transformations(self, settings:
|
|
400
|
+
def _apply_pre_join_transformations(self, settings: transform_schema.JoinInputManager, left_df: str, right_df: str) -> Tuple[
|
|
404
401
|
str, str]:
|
|
405
402
|
"""Apply column renames and drops before the join operation.
|
|
406
403
|
|
|
@@ -421,24 +418,24 @@ class FlowGraphToPolarsConverter:
|
|
|
421
418
|
# Calculate renames and drops
|
|
422
419
|
right_renames = {
|
|
423
420
|
column.old_name: column.new_name
|
|
424
|
-
for column in settings.
|
|
421
|
+
for column in settings.right_select.renames
|
|
425
422
|
if
|
|
426
|
-
column.old_name != column.new_name and not column.join_key or settings.
|
|
423
|
+
column.old_name != column.new_name and not column.join_key or settings.how in ("outer", "right")
|
|
427
424
|
}
|
|
428
425
|
|
|
429
426
|
left_renames = {
|
|
430
427
|
column.old_name: column.new_name
|
|
431
|
-
for column in settings.
|
|
428
|
+
for column in settings.left_select.renames
|
|
432
429
|
if column.old_name != column.new_name
|
|
433
430
|
}
|
|
434
431
|
|
|
435
432
|
left_drop_columns = [
|
|
436
|
-
column.old_name for column in settings.
|
|
433
|
+
column.old_name for column in settings.left_select.renames
|
|
437
434
|
if not column.keep and not column.join_key
|
|
438
435
|
]
|
|
439
436
|
|
|
440
437
|
right_drop_columns = [
|
|
441
|
-
column.old_name for column in settings.
|
|
438
|
+
column.old_name for column in settings.right_select.renames
|
|
442
439
|
if not column.keep and not column.join_key
|
|
443
440
|
]
|
|
444
441
|
|
|
@@ -454,7 +451,7 @@ class FlowGraphToPolarsConverter:
|
|
|
454
451
|
|
|
455
452
|
return left_df, right_df
|
|
456
453
|
|
|
457
|
-
def _handle_join_key_transformations(self, settings:
|
|
454
|
+
def _handle_join_key_transformations(self, settings: transform_schema.JoinInputManager, left_df: str, right_df: str,
|
|
458
455
|
left_on: List[str], right_on: List[str]) \
|
|
459
456
|
-> Tuple[List[str], List[str], Optional[Dict], List[str]]:
|
|
460
457
|
"""Route to appropriate join-specific key transformation handler.
|
|
@@ -476,7 +473,7 @@ class FlowGraphToPolarsConverter:
|
|
|
476
473
|
- reverse_action: Dictionary for renaming columns after join (or None)
|
|
477
474
|
- after_join_drop_cols: List of columns to drop after join
|
|
478
475
|
"""
|
|
479
|
-
join_type = settings.
|
|
476
|
+
join_type = settings.how
|
|
480
477
|
|
|
481
478
|
if join_type in ("left", "inner"):
|
|
482
479
|
return self._handle_left_inner_join_keys(settings, right_df, left_on, right_on)
|
|
@@ -487,7 +484,7 @@ class FlowGraphToPolarsConverter:
|
|
|
487
484
|
else:
|
|
488
485
|
return left_on, right_on, None, []
|
|
489
486
|
|
|
490
|
-
def _handle_left_inner_join_keys(self, settings:
|
|
487
|
+
def _handle_left_inner_join_keys(self, settings: transform_schema.JoinInputManager, right_df: str,
|
|
491
488
|
left_on: List[str], right_on: List[str]) -> Tuple[
|
|
492
489
|
List[str], List[str], Dict, List[str]]:
|
|
493
490
|
"""Handle key transformations for left and inner joins.
|
|
@@ -510,29 +507,28 @@ class FlowGraphToPolarsConverter:
|
|
|
510
507
|
- reverse_action: Mapping to rename __DROP__ columns after join
|
|
511
508
|
- after_join_drop_cols: Left join keys marked for dropping
|
|
512
509
|
"""
|
|
513
|
-
left_join_keys_to_keep = [jk.new_name for jk in settings.
|
|
514
|
-
|
|
510
|
+
left_join_keys_to_keep = [jk.new_name for jk in settings.left_select.join_key_selects if jk.keep]
|
|
515
511
|
join_key_duplication_command = [
|
|
516
512
|
f'pl.col("{rjk.old_name}").alias("__DROP__{rjk.new_name}__DROP__")'
|
|
517
|
-
for rjk in settings.
|
|
513
|
+
for rjk in settings.right_select.join_key_selects if rjk.keep
|
|
518
514
|
]
|
|
519
515
|
|
|
520
516
|
reverse_action = {
|
|
521
517
|
f"__DROP__{rjk.new_name}__DROP__": rjk.new_name
|
|
522
|
-
for rjk in settings.
|
|
518
|
+
for rjk in settings.right_select.join_key_selects if rjk.keep
|
|
523
519
|
}
|
|
524
520
|
|
|
525
521
|
if join_key_duplication_command:
|
|
526
522
|
self._add_code(f"{right_df} = {right_df}.with_columns([{', '.join(join_key_duplication_command)}])")
|
|
527
523
|
|
|
528
524
|
after_join_drop_cols = [
|
|
529
|
-
k.new_name for k in settings.
|
|
525
|
+
k.new_name for k in settings.left_select.join_key_selects
|
|
530
526
|
if not k.keep
|
|
531
527
|
]
|
|
532
528
|
|
|
533
529
|
return left_on, right_on, reverse_action, after_join_drop_cols
|
|
534
530
|
|
|
535
|
-
def _handle_right_join_keys(self, settings:
|
|
531
|
+
def _handle_right_join_keys(self, settings: transform_schema.JoinInputManager, left_df: str,
|
|
536
532
|
left_on: List[str], right_on: List[str]) -> Tuple[
|
|
537
533
|
List[str], List[str], None, List[str]]:
|
|
538
534
|
"""Handle key transformations for right joins.
|
|
@@ -557,12 +553,12 @@ class FlowGraphToPolarsConverter:
|
|
|
557
553
|
"""
|
|
558
554
|
join_key_duplication_command = [
|
|
559
555
|
f'pl.col("{ljk.new_name}").alias("__jk_{ljk.new_name}")'
|
|
560
|
-
for ljk in settings.
|
|
556
|
+
for ljk in settings.left_select.join_key_selects if ljk.keep
|
|
561
557
|
]
|
|
562
558
|
|
|
563
559
|
# Update left_on keys
|
|
564
560
|
for position, left_on_key in enumerate(left_on):
|
|
565
|
-
left_on_select = settings.
|
|
561
|
+
left_on_select = settings.left_select.get_select_input_on_new_name(left_on_key)
|
|
566
562
|
if left_on_select and left_on_select.keep:
|
|
567
563
|
left_on[position] = f"__jk_{left_on_select.new_name}"
|
|
568
564
|
|
|
@@ -570,18 +566,17 @@ class FlowGraphToPolarsConverter:
|
|
|
570
566
|
self._add_code(f"{left_df} = {left_df}.with_columns([{', '.join(join_key_duplication_command)}])")
|
|
571
567
|
|
|
572
568
|
# Calculate columns to drop after join
|
|
573
|
-
left_join_keys_keep = {jk.new_name for jk in settings.
|
|
569
|
+
left_join_keys_keep = {jk.new_name for jk in settings.left_select.join_key_selects if jk.keep}
|
|
574
570
|
after_join_drop_cols_right = [
|
|
575
571
|
jk.new_name if jk.new_name not in left_join_keys_keep else jk.new_name + "_right"
|
|
576
|
-
for jk in settings.
|
|
572
|
+
for jk in settings.right_select.join_key_selects if not jk.keep
|
|
577
573
|
]
|
|
578
574
|
after_join_drop_cols = list(set(after_join_drop_cols_right))
|
|
579
|
-
|
|
580
575
|
return left_on, right_on, None, after_join_drop_cols
|
|
581
576
|
|
|
582
|
-
def _handle_outer_join_keys(self, settings:
|
|
583
|
-
left_on: List[str],
|
|
584
|
-
|
|
577
|
+
def _handle_outer_join_keys(self, settings: transform_schema.JoinInputManager, right_df: str,
|
|
578
|
+
left_on: List[str],
|
|
579
|
+
right_on: List[str]) -> Tuple[List[str], List[str], Dict, List[str]]:
|
|
585
580
|
"""Handle key transformations for outer joins.
|
|
586
581
|
|
|
587
582
|
For outer joins:
|
|
@@ -602,10 +597,10 @@ class FlowGraphToPolarsConverter:
|
|
|
602
597
|
- reverse_action: Mapping to remove __jk_ prefix after join
|
|
603
598
|
- after_join_drop_cols: Combined list of columns to drop from both sides
|
|
604
599
|
"""
|
|
605
|
-
left_join_keys = {jk.new_name for jk in settings.
|
|
600
|
+
left_join_keys = {jk.new_name for jk in settings.left_select.join_key_selects}
|
|
606
601
|
|
|
607
602
|
join_keys_to_keep_and_rename = [
|
|
608
|
-
rjk for rjk in settings.
|
|
603
|
+
rjk for rjk in settings.right_select.join_key_selects
|
|
609
604
|
if rjk.keep and rjk.new_name in left_join_keys
|
|
610
605
|
]
|
|
611
606
|
|
|
@@ -616,7 +611,7 @@ class FlowGraphToPolarsConverter:
|
|
|
616
611
|
|
|
617
612
|
# Update right_on keys
|
|
618
613
|
for position, right_on_key in enumerate(right_on):
|
|
619
|
-
right_on_select = settings.
|
|
614
|
+
right_on_select = settings.right_select.get_select_input_on_new_name(right_on_key)
|
|
620
615
|
if right_on_select and right_on_select.keep and right_on_select.new_name in left_join_keys:
|
|
621
616
|
right_on[position] = f"__jk_{right_on_select.new_name}"
|
|
622
617
|
|
|
@@ -627,11 +622,11 @@ class FlowGraphToPolarsConverter:
|
|
|
627
622
|
|
|
628
623
|
# Calculate columns to drop after join
|
|
629
624
|
after_join_drop_cols_left = [
|
|
630
|
-
jk.new_name for jk in settings.
|
|
625
|
+
jk.new_name for jk in settings.left_select.join_key_selects if not jk.keep
|
|
631
626
|
]
|
|
632
627
|
after_join_drop_cols_right = [
|
|
633
628
|
jk.new_name if jk.new_name not in left_join_keys else jk.new_name + "_right"
|
|
634
|
-
for jk in settings.
|
|
629
|
+
for jk in settings.right_select.join_key_selects if not jk.keep
|
|
635
630
|
]
|
|
636
631
|
after_join_drop_cols = after_join_drop_cols_left + after_join_drop_cols_right
|
|
637
632
|
|
|
@@ -718,7 +713,7 @@ class FlowGraphToPolarsConverter:
|
|
|
718
713
|
col_name = settings.function.field.name
|
|
719
714
|
self._add_code(f"{var_name} = {input_df}.with_columns([")
|
|
720
715
|
self._add_code(f'simple_function_to_expr({repr(formula)}).alias("{col_name}")')
|
|
721
|
-
if settings.function.field.data_type not in (None,
|
|
716
|
+
if settings.function.field.data_type not in (None, transform_schema.AUTO_DATA_TYPE):
|
|
722
717
|
output_type = convert_pl_type_to_string(cast_str_to_polars_type(settings.function.field.data_type))
|
|
723
718
|
if output_type[:3] != "pl.":
|
|
724
719
|
output_type = "pl." + output_type
|
|
@@ -829,6 +824,7 @@ class FlowGraphToPolarsConverter:
|
|
|
829
824
|
|
|
830
825
|
@staticmethod
|
|
831
826
|
def _transform_fuzzy_mappings_to_string(fuzzy_mappings: List[FuzzyMapping]) -> str:
|
|
827
|
+
|
|
832
828
|
output_str = "["
|
|
833
829
|
for i, fuzzy_mapping in enumerate(fuzzy_mappings):
|
|
834
830
|
|
|
@@ -844,18 +840,20 @@ class FlowGraphToPolarsConverter:
|
|
|
844
840
|
def _handle_fuzzy_match(self, settings: input_schema.NodeFuzzyMatch, var_name: str, input_vars: Dict[str, str]) -> None:
|
|
845
841
|
"""Handle fuzzy match nodes."""
|
|
846
842
|
self.imports.add("from pl_fuzzy_frame_match import FuzzyMapping, fuzzy_match_dfs")
|
|
843
|
+
fuzzy_match_handler = transform_schema.FuzzyMatchInputManager(settings.join_input)
|
|
847
844
|
left_df = input_vars.get('main', input_vars.get('main_0', 'df_left'))
|
|
848
845
|
right_df = input_vars.get('right', input_vars.get('main_1', 'df_right'))
|
|
846
|
+
|
|
849
847
|
if left_df == right_df:
|
|
850
848
|
right_df = "df_right"
|
|
851
849
|
self._add_code(f"{right_df} = {left_df}")
|
|
852
850
|
|
|
853
|
-
if
|
|
854
|
-
self._add_code(f"{left_df} = {left_df}.drop({[c.old_name for c in
|
|
855
|
-
if
|
|
856
|
-
self._add_code(f"{right_df} = {right_df}.drop({[c.old_name for c in
|
|
851
|
+
if fuzzy_match_handler.left_select.has_drop_cols():
|
|
852
|
+
self._add_code(f"{left_df} = {left_df}.drop({[c.old_name for c in fuzzy_match_handler.left_select.non_jk_drop_columns]})")
|
|
853
|
+
if fuzzy_match_handler.right_select.has_drop_cols():
|
|
854
|
+
self._add_code(f"{right_df} = {right_df}.drop({[c.old_name for c in fuzzy_match_handler.right_select.non_jk_drop_columns]})")
|
|
857
855
|
|
|
858
|
-
fuzzy_join_mapping_settings = self._transform_fuzzy_mappings_to_string(
|
|
856
|
+
fuzzy_join_mapping_settings = self._transform_fuzzy_mappings_to_string(fuzzy_match_handler.join_mapping)
|
|
859
857
|
self._add_code(f"{var_name} = fuzzy_match_dfs(\n"
|
|
860
858
|
f" left_df={left_df}, right_df={right_df},\n"
|
|
861
859
|
f" fuzzy_maps={fuzzy_join_mapping_settings}\n"
|
|
@@ -961,7 +959,7 @@ class FlowGraphToPolarsConverter:
|
|
|
961
959
|
if output_settings.file_type == 'csv':
|
|
962
960
|
self._add_code(f'{input_df}.sink_csv(')
|
|
963
961
|
self._add_code(f' "{output_settings.abs_file_path}",')
|
|
964
|
-
self._add_code(f' separator="{output_settings.
|
|
962
|
+
self._add_code(f' separator="{output_settings.table_settings.delimiter}"')
|
|
965
963
|
self._add_code(')')
|
|
966
964
|
|
|
967
965
|
elif output_settings.file_type == 'parquet':
|
|
@@ -970,7 +968,7 @@ class FlowGraphToPolarsConverter:
|
|
|
970
968
|
elif output_settings.file_type == 'excel':
|
|
971
969
|
self._add_code(f'{input_df}.collect().write_excel(')
|
|
972
970
|
self._add_code(f' "{output_settings.abs_file_path}",')
|
|
973
|
-
self._add_code(f' worksheet="{output_settings.
|
|
971
|
+
self._add_code(f' worksheet="{output_settings.table_settings.sheet_name}"')
|
|
974
972
|
self._add_code(')')
|
|
975
973
|
|
|
976
974
|
self._add_code("")
|
|
@@ -6,45 +6,50 @@ from flowfile_core.flowfile.flow_data_engine.read_excel_tables import df_from_op
|
|
|
6
6
|
from polars._typing import CsvEncoding
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
def create_from_json(received_table: input_schema.
|
|
9
|
+
def create_from_json(received_table: input_schema.ReceivedTable):
    """Load the file referenced by *received_table* using polars' CSV readers.

    Despite the name, the file is parsed with ``pl.scan_csv`` / ``pl.read_csv``
    using the delimiter, header and start-line options of the table settings.

    For UTF-8 encodings three attempts are made, in order:

    1. a strict scan with date parsing and schema inference, validated by
       collecting the first row;
    2. a ``utf8-lossy`` scan with ``ignore_errors=True``;
    3. a ``utf8`` scan with ``ignore_errors=True``.

    Any other encoding is read with ``pl.read_csv`` and ``ignore_errors=True``.

    Args:
        received_table: Table description; ``table_settings`` must be an
            ``input_schema.InputJsonTable``.

    Returns:
        The result of ``pl.scan_csv`` for UTF-8 input, otherwise the result of
        ``pl.read_csv``.

    Raises:
        ValueError: If ``received_table.table_settings`` is not an
            ``InputJsonTable``.
    """
    f = received_table.abs_file_path
    # Approximate file size in GB; only used to toggle polars' low-memory mode.
    gbs_to_load = os.path.getsize(f) / 1024 / 1000 / 1000
    low_mem = gbs_to_load > 10

    if not isinstance(received_table.table_settings, input_schema.InputJsonTable):
        raise ValueError("Received table settings are not of type InputJsonTable")
    table_settings: input_schema.InputJsonTable = received_table.table_settings

    # Reader options shared by every attempt below.
    layout = {
        'separator': table_settings.delimiter,
        'has_header': table_settings.has_headers,
        'skip_rows': table_settings.starting_from_line,
    }

    if table_settings.encoding.upper() in ('UTF8', 'UTF-8'):
        try:
            data = pl.scan_csv(f,
                               low_memory=low_mem,
                               try_parse_dates=True,
                               encoding='utf8',
                               infer_schema_length=table_settings.infer_schema_length,
                               **layout)
            # The scan is lazy: collect one row so parse errors surface here.
            data.head(1).collect()
            return data
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C / SystemExit
            # are not swallowed by the fallback chain.
            try:
                return pl.scan_csv(f, low_memory=low_mem,
                                   encoding='utf8-lossy',
                                   ignore_errors=True,
                                   **layout)
            except Exception:
                return pl.scan_csv(f, low_memory=low_mem,
                                   encoding='utf8',
                                   ignore_errors=True,
                                   **layout)

    return pl.read_csv(f, low_memory=low_mem,
                       encoding=table_settings.encoding,
                       ignore_errors=True,
                       **layout)
|
|
50
55
|
|
|
@@ -58,48 +63,54 @@ def standardize_utf8_encoding(non_standardized_encoding: str) -> CsvEncoding:
|
|
|
58
63
|
raise ValueError(f"Encoding {non_standardized_encoding} is not supported.")
|
|
59
64
|
|
|
60
65
|
|
|
61
|
-
def create_from_path_csv(received_table: input_schema.
|
|
66
|
+
def create_from_path_csv(received_table: input_schema.ReceivedTable) -> pl.LazyFrame:
    """Build a lazy frame from the CSV file referenced by *received_table*.

    For UTF-8 style encodings (``UTF-8``, ``UTF8``, ``UTF8-LOSSY``,
    ``UTF-8-LOSSY``, case-insensitive) the encoding is normalised with
    ``standardize_utf8_encoding`` and three attempts are made, in order:

    1. a strict scan with date parsing and schema inference, validated by
       collecting the first row;
    2. a ``utf8-lossy`` scan with ``ignore_errors=True``;
    3. a scan with the normalised encoding, ``ignore_errors=True`` and
       ``low_memory=False``.

    Any other encoding goes through ``pl.read_csv_batched``.

    Args:
        received_table: Table description; ``table_settings`` must be an
            ``input_schema.InputCsvTable``.

    Returns:
        A lazy frame over the file contents.

    Raises:
        ValueError: If ``received_table.table_settings`` is not an
            ``InputCsvTable``.
    """
    if not isinstance(received_table.table_settings, input_schema.InputCsvTable):
        raise ValueError("Received table settings are not of type InputCsvTable")

    table_settings: input_schema.InputCsvTable = received_table.table_settings

    f = received_table.abs_file_path
    # Approximate file size in GB; only used to toggle polars' low-memory mode.
    gbs_to_load = os.path.getsize(f) / 1024 / 1000 / 1000
    low_mem = gbs_to_load > 10

    # Reader options shared by every attempt below.
    layout = {
        'separator': table_settings.delimiter,
        'has_header': table_settings.has_headers,
        'skip_rows': table_settings.starting_from_line,
    }

    if table_settings.encoding.upper() in ("UTF-8", "UTF8", 'UTF8-LOSSY', 'UTF-8-LOSSY'):
        encoding: CsvEncoding = standardize_utf8_encoding(table_settings.encoding)
        try:
            data = pl.scan_csv(f,
                               low_memory=low_mem,
                               try_parse_dates=True,
                               encoding=encoding,
                               infer_schema_length=table_settings.infer_schema_length,
                               **layout)
            # The scan is lazy: collect one row so parse errors surface here.
            data.head(1).collect()
            return data
        except Exception:
            # `Exception` rather than a bare except, so Ctrl-C / SystemExit
            # are not swallowed by the fallback chain.
            try:
                return pl.scan_csv(f, low_memory=low_mem,
                                   encoding='utf8-lossy',
                                   ignore_errors=True,
                                   **layout)
            except Exception:
                return pl.scan_csv(f, low_memory=False,
                                   encoding=encoding,
                                   ignore_errors=True,
                                   **layout)

    # NOTE(review): only the first batch returned by the batched reader is
    # kept, so this path may not return the whole file, and `next_batches`
    # can yield nothing for an empty file -- confirm this is intended.
    data = pl.read_csv_batched(f,
                               low_memory=low_mem,
                               encoding=table_settings.encoding,
                               ignore_errors=True, batch_size=2,
                               **layout).next_batches(1)
    return data[0].lazy()
|
|
105
116
|
|
|
@@ -108,50 +119,56 @@ def create_random(number_of_records: int = 1000) -> pl.LazyFrame:
|
|
|
108
119
|
return create_fake_data(number_of_records).lazy()
|
|
109
120
|
|
|
110
121
|
|
|
111
|
-
def create_from_path_parquet(received_table: input_schema.
|
|
122
|
+
def create_from_path_parquet(received_table: input_schema.ReceivedTable) -> pl.LazyFrame:
    """Lazily scan the parquet file referenced by *received_table*.

    Low-memory mode is enabled when the approximate file size exceeds 2 GB.

    Raises:
        ValueError: If ``received_table.table_settings`` is not an
            ``input_schema.InputParquetTable``.
    """
    settings_ok = isinstance(received_table.table_settings, input_schema.InputParquetTable)
    if not settings_ok:
        raise ValueError("Received table settings are not of type InputParquetTable")
    approx_gb = os.path.getsize(received_table.abs_file_path) / 1024 / 1000 / 1000
    use_low_memory = approx_gb > 2
    return pl.scan_parquet(source=received_table.abs_file_path, low_memory=use_low_memory)
|
|
114
127
|
|
|
115
128
|
|
|
116
|
-
def create_from_path_excel(received_table: input_schema.
|
|
117
|
-
if received_table.
|
|
129
|
+
def create_from_path_excel(received_table: input_schema.ReceivedTable):
    """Read a worksheet from the Excel file referenced by *received_table*.

    The reading engine is chosen from the table settings:

    * ``type_inference`` set                     -> ``openpyxl``
    * ``start_row > 0`` and ``start_column == 0`` -> ``calamine`` when the
      sheet has headers, otherwise ``xlsx2csv``
    * any other non-zero start row/column         -> ``openpyxl``
    * otherwise                                   -> ``calamine``

    An ``end_column`` / ``end_row`` of 0 (or less) means "no limit".

    Args:
        received_table: Table description; ``table_settings`` must be an
            ``input_schema.InputExcelTable``.

    Returns:
        The frame produced by the selected reader, restricted to the
        requested row/column window.

    Raises:
        ValueError: If ``received_table.table_settings`` is not an
            ``InputExcelTable``.
    """
    if not isinstance(received_table.table_settings, input_schema.InputExcelTable):
        raise ValueError("Received table settings are not of type InputExcelTable")

    table_settings: input_schema.InputExcelTable = received_table.table_settings
    if table_settings.type_inference:
        engine = 'openpyxl'
    elif table_settings.start_row > 0 and table_settings.start_column == 0:
        engine = 'calamine' if table_settings.has_headers else 'xlsx2csv'
    elif table_settings.start_column > 0 or table_settings.start_row > 0:
        engine = 'openpyxl'
    else:
        engine = 'calamine'

    sheet_name = table_settings.sheet_name

    if engine == 'calamine':
        df = df_from_calamine_xlsx(file_path=received_table.abs_file_path, sheet_name=sheet_name,
                                   start_row=table_settings.start_row, end_row=table_settings.end_row)
        if table_settings.end_column > 0:
            # Slice instead of indexing one by one: an end_column beyond the
            # sheet width is clamped rather than raising IndexError.
            cols_to_select = df.columns[table_settings.start_column:table_settings.end_column]
            df = df.select(cols_to_select)

    elif engine == 'xlsx2csv':
        csv_options = {'has_header': table_settings.has_headers, 'skip_rows': table_settings.start_row}
        df = pl.read_excel(source=received_table.abs_file_path,
                           read_options=csv_options,
                           engine='xlsx2csv',
                           sheet_name=table_settings.sheet_name)
        end_col_index = table_settings.end_column if table_settings.end_column > 0 else len(df.columns)
        # Slice clamps an out-of-range end_col_index to the available columns.
        cols_to_select = df.columns[table_settings.start_column:end_col_index]
        df = df.select(cols_to_select)
        if 0 < table_settings.end_row < len(df):
            df = df.head(table_settings.end_row)

    else:
        # The settings are shifted by one before being handed to the openpyxl
        # helper; None leaves the corresponding bound open.
        max_col = table_settings.end_column if table_settings.end_column > 0 else None
        max_row = table_settings.end_row + 1 if table_settings.end_row > 0 else None
        df = df_from_openpyxl(file_path=received_table.abs_file_path,
                              sheet_name=table_settings.sheet_name,
                              min_row=table_settings.start_row + 1,
                              min_col=table_settings.start_column + 1,
                              max_row=max_row,
                              max_col=max_col, has_headers=table_settings.has_headers)
    return df
|