flowfile-0.4.1-py3-none-any.whl → flowfile-0.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowfile/__init__.py +3 -1
- flowfile/api.py +1 -2
- flowfile/web/static/assets/{CloudConnectionManager-d3248f8d.js → CloudConnectionManager-0dfba9f2.js} +2 -2
- flowfile/web/static/assets/{CloudStorageReader-d65bf041.js → CloudStorageReader-d5b1b6c9.js} +6 -6
- flowfile/web/static/assets/{CloudStorageWriter-e83be3ed.js → CloudStorageWriter-00d87aad.js} +6 -6
- flowfile/web/static/assets/{ColumnSelector-cce661cf.js → ColumnSelector-4685e75d.js} +1 -1
- flowfile/web/static/assets/{ContextMenu-cf18d2cc.js → ContextMenu-23e909da.js} +1 -1
- flowfile/web/static/assets/{ContextMenu-160afb08.js → ContextMenu-70ae0c79.js} +1 -1
- flowfile/web/static/assets/{ContextMenu-11a4652a.js → ContextMenu-f149cf7c.js} +1 -1
- flowfile/web/static/assets/{CrossJoin-d395d38c.js → CrossJoin-702a3edd.js} +7 -7
- flowfile/web/static/assets/{CustomNode-b812dc0b.js → CustomNode-b1519993.js} +11 -11
- flowfile/web/static/assets/{DatabaseConnectionSettings-7000bf2c.js → DatabaseConnectionSettings-6f3e4ea5.js} +2 -2
- flowfile/web/static/assets/{DatabaseManager-9662ec5b.js → DatabaseManager-cf5ef661.js} +2 -2
- flowfile/web/static/assets/{DatabaseReader-4f035d0c.js → DatabaseReader-d38c7295.js} +9 -9
- flowfile/web/static/assets/{DatabaseWriter-f65dcd54.js → DatabaseWriter-b04ef46a.js} +8 -8
- flowfile/web/static/assets/{ExploreData-94c43dfc.js → ExploreData-5fa10ed8.js} +5 -5
- flowfile/web/static/assets/{ExternalSource-ac04b3cc.js → ExternalSource-d39af878.js} +5 -5
- flowfile/web/static/assets/{Filter-812dcbca.js → Filter-9b6d08db.js} +7 -7
- flowfile/web/static/assets/{Formula-71472193.js → Formula-6b04fb1d.js} +7 -7
- flowfile/web/static/assets/{FuzzyMatch-b317f631.js → FuzzyMatch-999521f4.js} +8 -8
- flowfile/web/static/assets/{GraphSolver-754a234f.js → GraphSolver-17dd2198.js} +6 -6
- flowfile/web/static/assets/{GroupBy-6c6f9802.js → GroupBy-6b039e18.js} +5 -5
- flowfile/web/static/assets/{Join-a1b800be.js → Join-24d0f113.js} +8 -8
- flowfile/web/static/assets/{ManualInput-a9640276.js → ManualInput-34639209.js} +4 -4
- flowfile/web/static/assets/{MultiSelect-97213888.js → MultiSelect-0e8724a3.js} +2 -2
- flowfile/web/static/assets/{MultiSelect.vue_vue_type_script_setup_true_lang-6ffe088a.js → MultiSelect.vue_vue_type_script_setup_true_lang-b0e538c2.js} +1 -1
- flowfile/web/static/assets/{NumericInput-e638088a.js → NumericInput-3d63a470.js} +2 -2
- flowfile/web/static/assets/{NumericInput.vue_vue_type_script_setup_true_lang-90eb2cba.js → NumericInput.vue_vue_type_script_setup_true_lang-e0edeccc.js} +1 -1
- flowfile/web/static/assets/{Output-ddc9079f.css → Output-283fe388.css} +5 -5
- flowfile/web/static/assets/{Output-76750610.js → Output-edea9802.js} +57 -38
- flowfile/web/static/assets/{Pivot-7814803f.js → Pivot-61d19301.js} +7 -7
- flowfile/web/static/assets/{PivotValidation-f92137d2.js → PivotValidation-de9f43fe.js} +1 -1
- flowfile/web/static/assets/{PivotValidation-76dd431a.js → PivotValidation-f97fec5b.js} +1 -1
- flowfile/web/static/assets/{PolarsCode-889c3008.js → PolarsCode-bc3c9984.js} +5 -5
- flowfile/web/static/assets/{Read-637b72a7.js → Read-64a3f259.js} +80 -105
- flowfile/web/static/assets/{Read-6b17491f.css → Read-e808b239.css} +10 -10
- flowfile/web/static/assets/{RecordCount-2b050c41.js → RecordCount-3d5039be.js} +4 -4
- flowfile/web/static/assets/{RecordId-81df7784.js → RecordId-597510e0.js} +6 -6
- flowfile/web/static/assets/{SQLQueryComponent-88dcfe53.js → SQLQueryComponent-df51adbe.js} +1 -1
- flowfile/web/static/assets/{Sample-258ad2a9.js → Sample-4be0a507.js} +4 -4
- flowfile/web/static/assets/{SecretManager-2a2cb7e2.js → SecretManager-4839be57.js} +2 -2
- flowfile/web/static/assets/{Select-850215fd.js → Select-9b72f201.js} +7 -7
- flowfile/web/static/assets/{SettingsSection-29b4fa6b.js → SettingsSection-7ded385d.js} +1 -1
- flowfile/web/static/assets/{SettingsSection-0e8d9123.js → SettingsSection-e1e9c953.js} +1 -1
- flowfile/web/static/assets/{SettingsSection-55bae608.js → SettingsSection-f0f75a42.js} +1 -1
- flowfile/web/static/assets/{SingleSelect-bebd408b.js → SingleSelect-6c777aac.js} +2 -2
- flowfile/web/static/assets/{SingleSelect.vue_vue_type_script_setup_true_lang-6093741c.js → SingleSelect.vue_vue_type_script_setup_true_lang-33e3ff9b.js} +1 -1
- flowfile/web/static/assets/{SliderInput-6a05ab61.js → SliderInput-7cb93e62.js} +1 -1
- flowfile/web/static/assets/{Sort-10ab48ed.js → Sort-6cbde21a.js} +5 -5
- flowfile/web/static/assets/{TextInput-df9d6259.js → TextInput-d9a40c11.js} +2 -2
- flowfile/web/static/assets/{TextInput.vue_vue_type_script_setup_true_lang-000e1178.js → TextInput.vue_vue_type_script_setup_true_lang-5896c375.js} +1 -1
- flowfile/web/static/assets/{TextToRows-6c2d93d8.js → TextToRows-c4fcbf4d.js} +7 -7
- flowfile/web/static/assets/{ToggleSwitch-0ff7ac52.js → ToggleSwitch-4ef91d19.js} +2 -2
- flowfile/web/static/assets/{ToggleSwitch.vue_vue_type_script_setup_true_lang-c6dc3029.js → ToggleSwitch.vue_vue_type_script_setup_true_lang-38478c20.js} +1 -1
- flowfile/web/static/assets/{UnavailableFields-1bab97cb.js → UnavailableFields-a03f512c.js} +2 -2
- flowfile/web/static/assets/{Union-b563478a.js → Union-bfe9b996.js} +4 -4
- flowfile/web/static/assets/{Unique-f90db5db.js → Unique-5d023a27.js} +8 -20
- flowfile/web/static/assets/{Unpivot-bcb0025f.js → Unpivot-91cc5354.js} +6 -6
- flowfile/web/static/assets/{UnpivotValidation-c4e73b04.js → UnpivotValidation-7ee2de44.js} +1 -1
- flowfile/web/static/assets/{VueGraphicWalker-bb8535e2.js → VueGraphicWalker-e51b9924.js} +1 -1
- flowfile/web/static/assets/{api-2d6adc4f.js → api-c1bad5ca.js} +1 -1
- flowfile/web/static/assets/{api-4c8e3822.js → api-cf1221f0.js} +1 -1
- flowfile/web/static/assets/{designer-e3c150ec.css → designer-8da3ba3a.css} +90 -67
- flowfile/web/static/assets/{designer-f3656d8c.js → designer-9633482a.js} +119 -51
- flowfile/web/static/assets/{documentation-52b241e7.js → documentation-ca400224.js} +1 -1
- flowfile/web/static/assets/{dropDown-1bca8a74.js → dropDown-614b998d.js} +1 -1
- flowfile/web/static/assets/{fullEditor-2985687e.js → fullEditor-f7971590.js} +2 -2
- flowfile/web/static/assets/{genericNodeSettings-0476ba4e.js → genericNodeSettings-4fe5f36b.js} +3 -3
- flowfile/web/static/assets/{index-246f201c.js → index-5429bbf8.js} +6 -8
- flowfile/web/static/assets/nodeInput-5d0d6b79.js +41 -0
- flowfile/web/static/assets/{outputCsv-d686eeaf.js → outputCsv-076b85ab.js} +1 -1
- flowfile/web/static/assets/{outputExcel-8809ea2f.js → outputExcel-0fd17dbe.js} +1 -1
- flowfile/web/static/assets/{outputParquet-53ba645a.js → outputParquet-b61e0847.js} +1 -1
- flowfile/web/static/assets/{readCsv-053bf97b.js → readCsv-a8bb8b61.js} +21 -20
- flowfile/web/static/assets/{readCsv-bca3ed53.css → readCsv-c767cb37.css} +13 -13
- flowfile/web/static/assets/{readExcel-ad531eab.js → readExcel-67b4aee0.js} +10 -12
- flowfile/web/static/assets/{readExcel-e1b381ea.css → readExcel-806d2826.css} +12 -12
- flowfile/web/static/assets/{readParquet-cee068e2.css → readParquet-48c81530.css} +3 -3
- flowfile/web/static/assets/{readParquet-58e899a1.js → readParquet-92ce1dbc.js} +4 -7
- flowfile/web/static/assets/{secretApi-538058f3.js → secretApi-68435402.js} +1 -1
- flowfile/web/static/assets/{selectDynamic-b38de2ba.js → selectDynamic-92e25ee3.js} +3 -3
- flowfile/web/static/assets/{vue-codemirror.esm-db9b8936.js → vue-codemirror.esm-41b0e0d7.js} +7 -4
- flowfile/web/static/assets/{vue-content-loader.es-b5f3ac30.js → vue-content-loader.es-2c8e608f.js} +1 -1
- flowfile/web/static/index.html +1 -1
- {flowfile-0.4.1.dist-info → flowfile-0.5.1.dist-info}/METADATA +3 -2
- {flowfile-0.4.1.dist-info → flowfile-0.5.1.dist-info}/RECORD +138 -126
- {flowfile-0.4.1.dist-info → flowfile-0.5.1.dist-info}/WHEEL +1 -1
- {flowfile-0.4.1.dist-info → flowfile-0.5.1.dist-info}/entry_points.txt +1 -0
- flowfile_core/__init__.py +3 -0
- flowfile_core/flowfile/analytics/analytics_processor.py +1 -0
- flowfile_core/flowfile/code_generator/code_generator.py +62 -64
- flowfile_core/flowfile/flow_data_engine/create/funcs.py +73 -56
- flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +77 -86
- flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +23 -23
- flowfile_core/flowfile/flow_data_engine/join/utils.py +1 -1
- flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +9 -4
- flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +184 -78
- flowfile_core/flowfile/flow_data_engine/utils.py +2 -0
- flowfile_core/flowfile/flow_graph.py +129 -26
- flowfile_core/flowfile/flow_node/flow_node.py +3 -0
- flowfile_core/flowfile/flow_node/models.py +2 -1
- flowfile_core/flowfile/handler.py +5 -5
- flowfile_core/flowfile/manage/compatibility_enhancements.py +404 -41
- flowfile_core/flowfile/manage/io_flowfile.py +394 -0
- flowfile_core/flowfile/node_designer/__init__.py +1 -1
- flowfile_core/flowfile/node_designer/_type_registry.py +2 -2
- flowfile_core/flowfile/node_designer/custom_node.py +1 -1
- flowfile_core/flowfile/node_designer/ui_components.py +1 -1
- flowfile_core/flowfile/schema_callbacks.py +8 -5
- flowfile_core/flowfile/setting_generator/settings.py +15 -9
- flowfile_core/routes/routes.py +8 -10
- flowfile_core/schemas/cloud_storage_schemas.py +0 -2
- flowfile_core/schemas/input_schema.py +222 -65
- flowfile_core/schemas/output_model.py +1 -1
- flowfile_core/schemas/schemas.py +145 -32
- flowfile_core/schemas/transform_schema.py +1083 -413
- flowfile_core/schemas/yaml_types.py +103 -0
- flowfile_core/{flowfile/node_designer/data_types.py → types.py} +11 -1
- flowfile_frame/__init__.py +3 -1
- flowfile_frame/flow_frame.py +15 -18
- flowfile_frame/flow_frame_methods.py +12 -9
- flowfile_worker/__init__.py +3 -0
- flowfile_worker/create/__init__.py +3 -21
- flowfile_worker/create/funcs.py +68 -56
- flowfile_worker/create/models.py +130 -62
- flowfile_worker/routes.py +5 -8
- tools/migrate/README.md +56 -0
- tools/migrate/__init__.py +12 -0
- tools/migrate/__main__.py +131 -0
- tools/migrate/legacy_schemas.py +621 -0
- tools/migrate/migrate.py +598 -0
- tools/migrate/tests/__init__.py +0 -0
- tools/migrate/tests/conftest.py +23 -0
- tools/migrate/tests/test_migrate.py +627 -0
- tools/migrate/tests/test_migration_e2e.py +1010 -0
- tools/migrate/tests/test_node_migrations.py +813 -0
- flowfile_core/flowfile/manage/open_flowfile.py +0 -143
- {flowfile-0.4.1.dist-info → flowfile-0.5.1.dist-info}/licenses/LICENSE +0 -0
- /flowfile_core/flowfile/manage/manage_flowfile.py → /tools/__init__.py +0 -0
flowfile_core/schemas/yaml_types.py ADDED
@@ -0,0 +1,103 @@
+from typing import TypedDict, List
+
+
+# === Transform Schema YAML Types ===
+
+class SelectInputYaml(TypedDict, total=False):
+    old_name: str
+    new_name: str
+    keep: bool
+    data_type: str
+
+
+class JoinInputsYaml(TypedDict):
+    select: List[SelectInputYaml]
+
+
+class JoinMapYaml(TypedDict):
+    left_col: str
+    right_col: str
+
+
+class JoinInputYaml(TypedDict):
+    join_mapping: List[JoinMapYaml]
+    left_select: JoinInputsYaml
+    right_select: JoinInputsYaml
+    how: str
+
+
+class CrossJoinInputYaml(TypedDict):
+    left_select: JoinInputsYaml
+    right_select: JoinInputsYaml
+
+
+class FuzzyMappingYaml(TypedDict, total=False):
+    left_col: str
+    right_col: str
+    threshold_score: float
+    fuzzy_type: str
+    perc_unique: float
+    output_column_name: str
+    valid: bool
+
+
+class FuzzyMatchInputYaml(TypedDict):
+    join_mapping: List[FuzzyMappingYaml]
+    left_select: JoinInputsYaml
+    right_select: JoinInputsYaml
+    how: str
+    aggregate_output: bool
+
+
+# === Input Schema YAML Types ===
+
+class OutputSettingsYaml(TypedDict, total=False):
+    name: str
+    directory: str
+    file_type: str
+    write_mode: str
+    abs_file_path: str
+    fields: List[str]
+    table_settings: dict
+
+
+class NodeSelectYaml(TypedDict):
+    cache_results: bool
+    keep_missing: bool
+    select_input: List[SelectInputYaml]
+    sorted_by: str
+
+
+class NodeJoinYaml(TypedDict):
+    cache_results: bool
+    auto_generate_selection: bool
+    verify_integrity: bool
+    join_input: JoinInputYaml
+    auto_keep_all: bool
+    auto_keep_right: bool
+    auto_keep_left: bool
+
+
+class NodeCrossJoinYaml(TypedDict):
+    cache_results: bool
+    auto_generate_selection: bool
+    verify_integrity: bool
+    cross_join_input: CrossJoinInputYaml
+    auto_keep_all: bool
+    auto_keep_right: bool
+    auto_keep_left: bool
+
+
+class NodeFuzzyMatchYaml(TypedDict):
+    cache_results: bool
+    auto_generate_selection: bool
+    verify_integrity: bool
+    join_input: FuzzyMatchInputYaml
+    auto_keep_all: bool
+    auto_keep_right: bool
+    auto_keep_left: bool
+
+
+class NodeOutputYaml(TypedDict):
+    cache_results: bool
+    output_settings: OutputSettingsYaml
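
Because these are plain TypedDicts, a parsed flow-file YAML payload can be annotated and checked with no extra machinery. A minimal sketch, assuming only the module path from the file list above; the field values are illustrative:

    from typing import List

    from flowfile_core.schemas.yaml_types import NodeSelectYaml, SelectInputYaml

    # total=False on SelectInputYaml means keys may be omitted per entry.
    select_input: List[SelectInputYaml] = [
        {"old_name": "id", "new_name": "user_id", "keep": True, "data_type": "Int64"},
        {"old_name": "debug_col", "keep": False},
    ]

    node: NodeSelectYaml = {
        "cache_results": False,
        "keep_missing": True,
        "select_input": select_input,
        "sorted_by": "",  # illustrative value; real flow files set this field
    }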
flowfile_core/{flowfile/node_designer/data_types.py → types.py} CHANGED
@@ -18,10 +18,20 @@ Usage:
 """
 
 from enum import Enum
-from typing import List, Union
+from typing import List, Literal, Union
 import polars as pl
 
 
+DataTypeStr = Literal[
+    "Int8", "Int16", "Int32", "Int64",
+    "UInt8", "UInt16", "UInt32", "UInt64",
+    "Float32", "Float64", "Decimal",
+    "String",
+    "Date", "Datetime", "Time", "Duration",
+    "Boolean", "Binary", "List", "Struct", "Array", "Integer", "Double", "Utf8"
+]
+
+
 class TypeGroup(str, Enum):
     """High-level type groups for column selection."""
     Numeric = "Numeric"
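
The new DataTypeStr literal gives static checkers a closed set of data-type names, including legacy aliases such as "Utf8", "Integer", and "Double". A hedged sketch of the kind of lookup it enables; the mapping below is illustrative, not the package's own resolver:

    import polars as pl
    from polars.datatypes import DataTypeClass

    from flowfile_core.types import DataTypeStr  # module path after the rename

    # Illustrative subset: resolve a few literal names to Polars dtype classes.
    _EXAMPLE_DTYPES: dict[str, DataTypeClass] = {
        "Int64": pl.Int64,
        "Float64": pl.Float64,
        "String": pl.String,
        "Boolean": pl.Boolean,
    }

    def to_polars_dtype(name: DataTypeStr) -> DataTypeClass:
        """Resolve a DataTypeStr name to a Polars dtype class (sketch)."""
        return _EXAMPLE_DTYPES[name]

    to_polars_dtype("Int64")     # fine
    # to_polars_dtype("int64")   # a type checker rejects this: not in the Literal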
flowfile_frame/__init__.py CHANGED
@@ -1,6 +1,8 @@
 # flowframe/__init__.py
 """A Polars-like API for building ETL graphs."""
 
+from importlib.metadata import version
+
 # Core classes
 from flowfile_frame.flow_frame import FlowFrame  # noqa: F401
 from pl_fuzzy_frame_match.models import FuzzyMapping  # noqa: F401
@@ -64,4 +66,4 @@ from polars.datatypes import (  # noqa: F401
     DataType, DataTypeClass, Field
 )
 
-__version__ = "
+__version__ = version("Flowfile")
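
The __version__ change replaces a hard-coded string (the old value is elided in the diff viewer output above) with a lookup of the installed distribution's metadata, so the version lives in one place: the wheel's METADATA. Roughly, with a fallback branch added here for illustration only:

    from importlib.metadata import PackageNotFoundError, version

    try:
        __version__ = version("Flowfile")  # reads dist-info metadata at import time
    except PackageNotFoundError:
        # Not part of the diff: a common guard for running from an uninstalled checkout.
        __version__ = "0.0.0.dev0"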
flowfile_frame/flow_frame.py CHANGED
@@ -10,7 +10,7 @@ from flowfile_frame.lazy_methods import add_lazyframe_methods
 from polars._typing import (CsvEncoding, FrameInitTypes, SchemaDefinition, SchemaDict, Orientation)
 from collections.abc import Iterator
 
-from pl_fuzzy_frame_match import FuzzyMapping
+from pl_fuzzy_frame_match import FuzzyMapping
 
 from flowfile_core.flowfile.flow_graph import FlowGraph, add_connection
 from flowfile_core.flowfile.flow_graph_utils import combine_flow_graphs_with_mapping
@@ -626,7 +626,6 @@
         left_columns, right_columns = self._parse_join_columns(
             on, left_on, right_on, how
         )
-
         # Step 5: Validate column lists have same length (except for cross join)
         if how != 'cross' and left_columns is not None and right_columns is not None:
             if len(left_columns) != len(right_columns):
@@ -798,33 +797,36 @@
     ) -> "FlowFrame":
         """Execute join using native FlowFile join nodes."""
         # Create select inputs for both frames
+
         left_select = transform_schema.SelectInputs.create_from_pl_df(self.data)
         right_select = transform_schema.SelectInputs.create_from_pl_df(other.data)
         # Create appropriate join input based on join type
         if how == 'cross':
             join_input = transform_schema.CrossJoinInput(
-                left_select=left_select.renames,
+                left_select=transform_schema.JoinInputs(renames=left_select.renames),
                 right_select=right_select.renames,
             )
+            join_input_manager = transform_schema.CrossJoinInputManager(join_input)
+
         else:
             join_input = transform_schema.JoinInput(
                 join_mapping=join_mappings,
-                left_select=left_select.renames,
+                left_select=transform_schema.JoinInputs(renames=left_select.renames),
                 right_select=right_select.renames,
                 how=how,
            )
+            join_input_manager = transform_schema.JoinInputManager(join_input)
 
         # Configure join input
-
-        for right_column in right_select.renames:
+        for right_column in join_input_manager.right_select.renames:
             if right_column.join_key:
                 right_column.keep = False
 
         # Create and add appropriate node
         if how == 'cross':
-            self._add_cross_join_node(new_node_id,
+            self._add_cross_join_node(new_node_id, join_input_manager.to_cross_join_input(), description, other)
         else:
-            self._add_regular_join_node(new_node_id,
+            self._add_regular_join_node(new_node_id, join_input_manager.to_join_input(), description, other)
 
         # Add connections
         self._add_connection(self.node_id, new_node_id, "main")
@@ -1140,16 +1142,11 @@
         file_name = file_str.split(os.sep)[-1]
         use_polars_code = bool(kwargs.items()) or not is_path_input
 
-        output_parquet_table = input_schema.OutputParquetTable(
-            file_type="parquet"
-        )
         output_settings = input_schema.OutputSettings(
             file_type='parquet',
             name=file_name,
             directory=file_str if is_path_input else str(file_str),
-
-            output_csv_table=input_schema.OutputCsvTable(),
-            output_excel_table=input_schema.OutputExcelTable()
+            table_settings=input_schema.OutputParquetTable()
        )
 
        if is_path_input:
@@ -1220,10 +1217,10 @@
             file_type='csv',
             name=file_name,
             directory=file_str if is_path_input else str(file_str),
-
-
-
-
+            table_settings=input_schema.OutputCsvTable(
+                delimiter=separator,
+                encoding=encoding
+            )
         )
         if is_path_input:
             try:
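
The write-path hunks show OutputSettings dropping the parallel output_csv_table/output_excel_table/output_parquet_table fields in favor of a single table_settings object matching file_type. A sketch of the new shape, assuming the import path from the file list and that the constructors accept exactly the keyword arguments visible above:

    from flowfile_core.schemas import input_schema

    # Parquet: the format object carries no extra options here.
    parquet_settings = input_schema.OutputSettings(
        file_type="parquet",
        name="result.parquet",
        directory="/tmp/out",
        table_settings=input_schema.OutputParquetTable(),
    )

    # CSV: delimiter and encoding now travel inside table_settings.
    csv_settings = input_schema.OutputSettings(
        file_type="csv",
        name="result.csv",
        directory="/tmp/out",
        table_settings=input_schema.OutputCsvTable(delimiter=";", encoding="utf8"),
    )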
flowfile_frame/flow_frame_methods.py CHANGED
@@ -186,15 +186,17 @@ def read_csv(
         file_type='csv',
         path=current_source_path_for_native,
         name=Path(current_source_path_for_native).name,
-
-
-
-
-
-
-
-
-
+        table_settings=input_schema.InputCsvTable(
+            delimiter=separator,
+            has_headers=has_header,
+            encoding=encoding,
+            starting_from_line=skip_rows,
+            quote_char=quote_char if quote_char is not None else '"',
+            infer_schema_length=actual_infer_schema_length if actual_infer_schema_length is not None else 10000,
+            truncate_ragged_lines=truncate_ragged_lines,
+            ignore_errors=ignore_errors,
+            row_delimiter=eol_char
+        )
     )
     if convert_to_absolute_path:
         try:
@@ -407,6 +409,7 @@ def read_parquet(source, *, flow_graph: FlowGraph = None, description: str = Non
         file_type='parquet',
         path=source,
         name=Path(source).name,
+        table_settings=input_schema.InputParquetTable()
     )
     if convert_to_absolute_path:
         received_table.path = received_table.abs_file_path
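
Read settings move the same way: read_csv now forwards its parser options into input_schema.InputCsvTable rather than flat fields. A sketch of the settings object on its own, with defaults mirroring the fallbacks in the hunk above:

    from flowfile_core.schemas import input_schema

    csv_settings = input_schema.InputCsvTable(
        delimiter=",",
        has_headers=True,
        encoding="utf8",
        starting_from_line=0,        # skip_rows in the read_csv signature
        quote_char='"',              # fallback used when quote_char is None
        infer_schema_length=10000,   # fallback used when not supplied
        truncate_ragged_lines=False,
        ignore_errors=False,
        row_delimiter="\n",          # eol_char in the read_csv signature
    )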
flowfile_worker/__init__.py CHANGED
@@ -1,29 +1,11 @@
-
-                          ReceivedJsonTable)
+
 from flowfile_worker.create.funcs import (create_from_path_csv, create_from_path_parquet, create_from_path_excel,
                                           create_from_path_json)
-from typing import
+from typing import Literal
 
-ReceivedTableCollection = ReceivedCsvTable | ReceivedParquetTable | ReceivedJsonTable | ReceivedExcelTable
 FileType = Literal['csv', 'parquet', 'json', 'excel']
 
-
-def received_table_parser(received_table_raw: Dict, file_type: FileType) -> ReceivedTableCollection:
-    match file_type:
-        case 'csv':
-            received_table = ReceivedCsvTable.model_validate(received_table_raw)
-        case 'parquet':
-            received_table = ReceivedParquetTable.model_validate(received_table_raw)
-        case 'excel':
-            received_table = ReceivedExcelTable.model_validate(received_table_raw)
-        case 'json':
-            return ReceivedJsonTable.model_validate(received_table_raw)
-        case _:
-            raise ValueError(f'Unsupported file type: {file_type}')
-    return received_table
-
-
-def table_creator_factory_method(file_type: Literal['csv', 'parquet', 'json', 'excel']) -> callable:
+def table_creator_factory_method(file_type: FileType) -> callable:
     match file_type:
         case 'csv':
             return create_from_path_csv
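
With the Pydantic parsing helper gone from flowfile_worker/__init__.py, the module now only maps a file type onto its loader. Usage stays a one-liner; the received_table argument is the ReceivedTable model shown in the next file:

    from flowfile_worker import table_creator_factory_method

    loader = table_creator_factory_method("csv")  # -> create_from_path_csv
    # df = loader(received_table)                 # returns a Polars (Lazy)Frame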
flowfile_worker/create/funcs.py CHANGED
@@ -1,94 +1,100 @@
 import polars as pl
 import os
 
-from flowfile_worker.create.models import
+from flowfile_worker.create.models import ReceivedTable, InputCsvTable, InputJsonTable, InputExcelTable, InputParquetTable
 from flowfile_worker.create.utils import create_fake_data
 from flowfile_worker.create.read_excel_tables import df_from_openpyxl, df_from_calamine_xlsx
 
 
-def create_from_path_json(received_table:
+def create_from_path_json(received_table: ReceivedTable):
+    if not isinstance(received_table.table_settings, InputJsonTable):
+        raise ValueError("Received table settings are not of type InputJsonTable")
+    input_table_settings: InputJsonTable = received_table.table_settings
     f = received_table.abs_file_path
     gbs_to_load = os.path.getsize(f) / 1024 / 1000 / 1000
     low_mem = gbs_to_load > 10
-    if
+    if input_table_settings.encoding.upper() == 'UTF8' or input_table_settings.encoding.upper() == 'UTF-8':
         try:
             df = pl.scan_csv(f,
                              low_memory=low_mem,
                              try_parse_dates=True,
-                             separator=
-                             has_header=
-                             skip_rows=
+                             separator=input_table_settings.delimiter,
+                             has_header=input_table_settings.has_headers,
+                             skip_rows=input_table_settings.starting_from_line,
                              encoding='utf8',
-                             infer_schema_length=
+                             infer_schema_length=input_table_settings.infer_schema_length)
             df.head(1).collect()
             return df
         except:
             try:
                 df = pl.scan_csv(f, low_memory=low_mem,
-                                 separator=
-                                 has_header=
-                                 skip_rows=
+                                 separator=input_table_settings.delimiter,
+                                 has_header=input_table_settings.has_headers,
+                                 skip_rows=input_table_settings.starting_from_line,
                                  encoding='utf8-lossy',
                                  ignore_errors=True)
                 return df
            except:
                 df = pl.scan_csv(f, low_memory=low_mem,
-                                 separator=
-                                 has_header=
-                                 skip_rows=
+                                 separator=input_table_settings.delimiter,
+                                 has_header=input_table_settings.has_headers,
+                                 skip_rows=input_table_settings.starting_from_line,
                                  encoding='utf8',
                                  ignore_errors=True)
                 return df
     else:
         df = pl.read_csv(f, low_memory=low_mem,
-                         separator=
-                         has_header=
-                         skip_rows=
-                         encoding=
+                         separator=input_table_settings.delimiter,
+                         has_header=input_table_settings.has_headers,
+                         skip_rows=input_table_settings.starting_from_line,
+                         encoding=input_table_settings.encoding,
                          ignore_errors=True)
         return df
 
 
-def create_from_path_csv(received_table:
+def create_from_path_csv(received_table: ReceivedTable) -> pl.DataFrame:
     f = received_table.abs_file_path
+    if not isinstance(received_table.table_settings, InputCsvTable):
+        raise ValueError("Received table settings are not of type InputCsvTable")
+    input_table_settings: InputCsvTable = received_table.table_settings
     gbs_to_load = os.path.getsize(f) / 1024 / 1000 / 1000
     low_mem = gbs_to_load > 10
-    if
+    if input_table_settings.encoding.upper() == 'UTF8' or input_table_settings.encoding.upper() == 'UTF-8':
         try:
             df = pl.scan_csv(f,
                              low_memory=low_mem,
                              try_parse_dates=True,
-                             separator=
-                             has_header=
-                             skip_rows=
+                             separator=input_table_settings.delimiter,
+                             has_header=input_table_settings.has_headers,
+                             skip_rows=input_table_settings.starting_from_line,
                              encoding='utf8',
-                             infer_schema_length=
+                             infer_schema_length=input_table_settings.infer_schema_length)
             df.head(1).collect()
             return df
         except:
             try:
                 df = pl.scan_csv(f, low_memory=low_mem,
-                                 separator=
-                                 has_header=
-                                 skip_rows=
+                                 separator=input_table_settings.delimiter,
+                                 has_header=input_table_settings.has_headers,
+                                 skip_rows=input_table_settings.starting_from_line,
                                  encoding='utf8-lossy',
                                  ignore_errors=True)
                 return df
            except:
                 df = pl.scan_csv(f, low_memory=low_mem,
-                                 separator=
-                                 has_header=
-                                 skip_rows=
+                                 separator=input_table_settings.delimiter,
+                                 has_header=input_table_settings.has_headers,
+                                 skip_rows=input_table_settings.starting_from_line,
                                  encoding='utf8',
                                  ignore_errors=True)
                 return df
     else:
         df = pl.read_csv(f,
                          low_memory=low_mem,
-                         separator=
-                         has_header=
-                         skip_rows=
-                         encoding=
+                         separator=input_table_settings.delimiter,
+                         has_header=input_table_settings.has_headers,
+                         skip_rows=input_table_settings.starting_from_line,
+                         encoding=input_table_settings.encoding,
                          ignore_errors=True)
         return df
 
@@ -97,50 +103,56 @@ def create_random(number_of_records: int = 1000) -> pl.LazyFrame:
     return create_fake_data(number_of_records).lazy()
 
 
-def create_from_path_parquet(received_table:
+def create_from_path_parquet(received_table: ReceivedTable):
+    if not isinstance(received_table.table_settings, InputParquetTable):
+        raise ValueError("Received table settings are not of type InputParquetTable")
     low_mem = (os.path.getsize(received_table.abs_file_path) / 1024 / 1000 / 1000) > 2
     return pl.scan_parquet(source=received_table.abs_file_path, low_memory=low_mem)
 
 
-def create_from_path_excel(received_table:
-    if received_table.
+def create_from_path_excel(received_table: ReceivedTable):
+    if not isinstance(received_table.table_settings, InputExcelTable):
+        raise ValueError("Received table settings are not of type InputExcelTable")
+    input_table_settings: InputExcelTable = received_table.table_settings
+
+    if input_table_settings.type_inference:
         engine = 'openpyxl'
-    elif
-        engine = 'calamine' if
-    elif
+    elif input_table_settings.start_row > 0 and input_table_settings.start_column == 0:
+        engine = 'calamine' if input_table_settings.has_headers else 'xlsx2csv'
+    elif input_table_settings.start_column > 0 or input_table_settings.start_row > 0:
         engine = 'openpyxl'
     else:
         engine = 'calamine'
 
-    sheet_name =
+    sheet_name = input_table_settings.sheet_name
 
     if engine == 'calamine':
         df = df_from_calamine_xlsx(file_path=received_table.abs_file_path, sheet_name=sheet_name,
-                                   start_row=
-        if
-            end_col_index =
-            cols_to_select = [df.columns[i] for i in range(
+                                   start_row=input_table_settings.start_row, end_row=input_table_settings.end_row)
+        if input_table_settings.end_column > 0:
+            end_col_index = input_table_settings.end_column
+            cols_to_select = [df.columns[i] for i in range(input_table_settings.start_column, end_col_index)]
            df = df.select(cols_to_select)
 
    elif engine == 'xlsx2csv':
-        csv_options = {'has_header':
+        csv_options = {'has_header': input_table_settings.has_headers, 'skip_rows': input_table_settings.start_row}
        df = pl.read_excel(source=received_table.abs_file_path,
                           read_options=csv_options,
                           engine='xlsx2csv',
-                          sheet_name=
-        end_col_index =
-        cols_to_select = [df.columns[i] for i in range(
+                          sheet_name=input_table_settings.sheet_name)
+        end_col_index = input_table_settings.end_column if input_table_settings.end_column > 0 else len(df.columns)
+        cols_to_select = [df.columns[i] for i in range(input_table_settings.start_column, end_col_index)]
        df = df.select(cols_to_select)
-        if 0 <
-            df = df.head(
+        if 0 < input_table_settings.end_row < len(df):
+            df = df.head(input_table_settings.end_row)
 
    else:
-        max_col =
-        max_row =
+        max_col = input_table_settings.end_column if input_table_settings.end_column > 0 else None
+        max_row = input_table_settings.end_row + 1 if input_table_settings.end_row > 0 else None
        df = df_from_openpyxl(file_path=received_table.abs_file_path,
-                              sheet_name=
-                              min_row=
-                              min_col=
+                              sheet_name=input_table_settings.sheet_name,
+                              min_row=input_table_settings.start_row + 1,
+                              min_col=input_table_settings.start_column + 1,
                               max_row=max_row,
-                              max_col=max_col, has_headers=
+                              max_col=max_col, has_headers=input_table_settings.has_headers)
    return df
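
Each loader now guards on the concrete Input*Table type inside received_table.table_settings before touching file options. A hedged end-to-end sketch; the ReceivedTable constructor fields other than table_settings are assumptions inferred from the attribute accesses above:

    import polars as pl

    from flowfile_worker.create.funcs import create_from_path_csv
    from flowfile_worker.create.models import InputCsvTable, ReceivedTable

    received = ReceivedTable(
        name="events.csv",            # assumed field
        path="data/events.csv",       # assumed field backing abs_file_path
        file_type="csv",              # assumed field
        table_settings=InputCsvTable(
            delimiter=",",
            has_headers=True,
            encoding="utf8",
            starting_from_line=0,
            infer_schema_length=10000,
        ),
    )

    frame = create_from_path_csv(received)  # LazyFrame on the utf8 path, DataFrame otherwise
    df = frame.collect() if isinstance(frame, pl.LazyFrame) else frame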