Flowfile 0.4.1-py3-none-any.whl → 0.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. flowfile/__init__.py +3 -1
  2. flowfile/api.py +1 -2
  3. flowfile/web/static/assets/{CloudConnectionManager-d3248f8d.js → CloudConnectionManager-0dfba9f2.js} +2 -2
  4. flowfile/web/static/assets/{CloudStorageReader-d65bf041.js → CloudStorageReader-d5b1b6c9.js} +6 -6
  5. flowfile/web/static/assets/{CloudStorageWriter-e83be3ed.js → CloudStorageWriter-00d87aad.js} +6 -6
  6. flowfile/web/static/assets/{ColumnSelector-cce661cf.js → ColumnSelector-4685e75d.js} +1 -1
  7. flowfile/web/static/assets/{ContextMenu-cf18d2cc.js → ContextMenu-23e909da.js} +1 -1
  8. flowfile/web/static/assets/{ContextMenu-160afb08.js → ContextMenu-70ae0c79.js} +1 -1
  9. flowfile/web/static/assets/{ContextMenu-11a4652a.js → ContextMenu-f149cf7c.js} +1 -1
  10. flowfile/web/static/assets/{CrossJoin-d395d38c.js → CrossJoin-702a3edd.js} +7 -7
  11. flowfile/web/static/assets/{CustomNode-b812dc0b.js → CustomNode-b1519993.js} +11 -11
  12. flowfile/web/static/assets/{DatabaseConnectionSettings-7000bf2c.js → DatabaseConnectionSettings-6f3e4ea5.js} +2 -2
  13. flowfile/web/static/assets/{DatabaseManager-9662ec5b.js → DatabaseManager-cf5ef661.js} +2 -2
  14. flowfile/web/static/assets/{DatabaseReader-4f035d0c.js → DatabaseReader-d38c7295.js} +9 -9
  15. flowfile/web/static/assets/{DatabaseWriter-f65dcd54.js → DatabaseWriter-b04ef46a.js} +8 -8
  16. flowfile/web/static/assets/{ExploreData-94c43dfc.js → ExploreData-5fa10ed8.js} +5 -5
  17. flowfile/web/static/assets/{ExternalSource-ac04b3cc.js → ExternalSource-d39af878.js} +5 -5
  18. flowfile/web/static/assets/{Filter-812dcbca.js → Filter-9b6d08db.js} +7 -7
  19. flowfile/web/static/assets/{Formula-71472193.js → Formula-6b04fb1d.js} +7 -7
  20. flowfile/web/static/assets/{FuzzyMatch-b317f631.js → FuzzyMatch-999521f4.js} +8 -8
  21. flowfile/web/static/assets/{GraphSolver-754a234f.js → GraphSolver-17dd2198.js} +6 -6
  22. flowfile/web/static/assets/{GroupBy-6c6f9802.js → GroupBy-6b039e18.js} +5 -5
  23. flowfile/web/static/assets/{Join-a1b800be.js → Join-24d0f113.js} +8 -8
  24. flowfile/web/static/assets/{ManualInput-a9640276.js → ManualInput-34639209.js} +4 -4
  25. flowfile/web/static/assets/{MultiSelect-97213888.js → MultiSelect-0e8724a3.js} +2 -2
  26. flowfile/web/static/assets/{MultiSelect.vue_vue_type_script_setup_true_lang-6ffe088a.js → MultiSelect.vue_vue_type_script_setup_true_lang-b0e538c2.js} +1 -1
  27. flowfile/web/static/assets/{NumericInput-e638088a.js → NumericInput-3d63a470.js} +2 -2
  28. flowfile/web/static/assets/{NumericInput.vue_vue_type_script_setup_true_lang-90eb2cba.js → NumericInput.vue_vue_type_script_setup_true_lang-e0edeccc.js} +1 -1
  29. flowfile/web/static/assets/{Output-ddc9079f.css → Output-283fe388.css} +5 -5
  30. flowfile/web/static/assets/{Output-76750610.js → Output-edea9802.js} +57 -38
  31. flowfile/web/static/assets/{Pivot-7814803f.js → Pivot-61d19301.js} +7 -7
  32. flowfile/web/static/assets/{PivotValidation-f92137d2.js → PivotValidation-de9f43fe.js} +1 -1
  33. flowfile/web/static/assets/{PivotValidation-76dd431a.js → PivotValidation-f97fec5b.js} +1 -1
  34. flowfile/web/static/assets/{PolarsCode-889c3008.js → PolarsCode-bc3c9984.js} +5 -5
  35. flowfile/web/static/assets/{Read-637b72a7.js → Read-64a3f259.js} +80 -105
  36. flowfile/web/static/assets/{Read-6b17491f.css → Read-e808b239.css} +10 -10
  37. flowfile/web/static/assets/{RecordCount-2b050c41.js → RecordCount-3d5039be.js} +4 -4
  38. flowfile/web/static/assets/{RecordId-81df7784.js → RecordId-597510e0.js} +6 -6
  39. flowfile/web/static/assets/{SQLQueryComponent-88dcfe53.js → SQLQueryComponent-df51adbe.js} +1 -1
  40. flowfile/web/static/assets/{Sample-258ad2a9.js → Sample-4be0a507.js} +4 -4
  41. flowfile/web/static/assets/{SecretManager-2a2cb7e2.js → SecretManager-4839be57.js} +2 -2
  42. flowfile/web/static/assets/{Select-850215fd.js → Select-9b72f201.js} +7 -7
  43. flowfile/web/static/assets/{SettingsSection-29b4fa6b.js → SettingsSection-7ded385d.js} +1 -1
  44. flowfile/web/static/assets/{SettingsSection-0e8d9123.js → SettingsSection-e1e9c953.js} +1 -1
  45. flowfile/web/static/assets/{SettingsSection-55bae608.js → SettingsSection-f0f75a42.js} +1 -1
  46. flowfile/web/static/assets/{SingleSelect-bebd408b.js → SingleSelect-6c777aac.js} +2 -2
  47. flowfile/web/static/assets/{SingleSelect.vue_vue_type_script_setup_true_lang-6093741c.js → SingleSelect.vue_vue_type_script_setup_true_lang-33e3ff9b.js} +1 -1
  48. flowfile/web/static/assets/{SliderInput-6a05ab61.js → SliderInput-7cb93e62.js} +1 -1
  49. flowfile/web/static/assets/{Sort-10ab48ed.js → Sort-6cbde21a.js} +5 -5
  50. flowfile/web/static/assets/{TextInput-df9d6259.js → TextInput-d9a40c11.js} +2 -2
  51. flowfile/web/static/assets/{TextInput.vue_vue_type_script_setup_true_lang-000e1178.js → TextInput.vue_vue_type_script_setup_true_lang-5896c375.js} +1 -1
  52. flowfile/web/static/assets/{TextToRows-6c2d93d8.js → TextToRows-c4fcbf4d.js} +7 -7
  53. flowfile/web/static/assets/{ToggleSwitch-0ff7ac52.js → ToggleSwitch-4ef91d19.js} +2 -2
  54. flowfile/web/static/assets/{ToggleSwitch.vue_vue_type_script_setup_true_lang-c6dc3029.js → ToggleSwitch.vue_vue_type_script_setup_true_lang-38478c20.js} +1 -1
  55. flowfile/web/static/assets/{UnavailableFields-1bab97cb.js → UnavailableFields-a03f512c.js} +2 -2
  56. flowfile/web/static/assets/{Union-b563478a.js → Union-bfe9b996.js} +4 -4
  57. flowfile/web/static/assets/{Unique-f90db5db.js → Unique-5d023a27.js} +8 -20
  58. flowfile/web/static/assets/{Unpivot-bcb0025f.js → Unpivot-91cc5354.js} +6 -6
  59. flowfile/web/static/assets/{UnpivotValidation-c4e73b04.js → UnpivotValidation-7ee2de44.js} +1 -1
  60. flowfile/web/static/assets/{VueGraphicWalker-bb8535e2.js → VueGraphicWalker-e51b9924.js} +1 -1
  61. flowfile/web/static/assets/{api-2d6adc4f.js → api-c1bad5ca.js} +1 -1
  62. flowfile/web/static/assets/{api-4c8e3822.js → api-cf1221f0.js} +1 -1
  63. flowfile/web/static/assets/{designer-e3c150ec.css → designer-8da3ba3a.css} +90 -67
  64. flowfile/web/static/assets/{designer-f3656d8c.js → designer-9633482a.js} +119 -51
  65. flowfile/web/static/assets/{documentation-52b241e7.js → documentation-ca400224.js} +1 -1
  66. flowfile/web/static/assets/{dropDown-1bca8a74.js → dropDown-614b998d.js} +1 -1
  67. flowfile/web/static/assets/{fullEditor-2985687e.js → fullEditor-f7971590.js} +2 -2
  68. flowfile/web/static/assets/{genericNodeSettings-0476ba4e.js → genericNodeSettings-4fe5f36b.js} +3 -3
  69. flowfile/web/static/assets/{index-246f201c.js → index-5429bbf8.js} +6 -8
  70. flowfile/web/static/assets/nodeInput-5d0d6b79.js +41 -0
  71. flowfile/web/static/assets/{outputCsv-d686eeaf.js → outputCsv-076b85ab.js} +1 -1
  72. flowfile/web/static/assets/{outputExcel-8809ea2f.js → outputExcel-0fd17dbe.js} +1 -1
  73. flowfile/web/static/assets/{outputParquet-53ba645a.js → outputParquet-b61e0847.js} +1 -1
  74. flowfile/web/static/assets/{readCsv-053bf97b.js → readCsv-a8bb8b61.js} +21 -20
  75. flowfile/web/static/assets/{readCsv-bca3ed53.css → readCsv-c767cb37.css} +13 -13
  76. flowfile/web/static/assets/{readExcel-ad531eab.js → readExcel-67b4aee0.js} +10 -12
  77. flowfile/web/static/assets/{readExcel-e1b381ea.css → readExcel-806d2826.css} +12 -12
  78. flowfile/web/static/assets/{readParquet-cee068e2.css → readParquet-48c81530.css} +3 -3
  79. flowfile/web/static/assets/{readParquet-58e899a1.js → readParquet-92ce1dbc.js} +4 -7
  80. flowfile/web/static/assets/{secretApi-538058f3.js → secretApi-68435402.js} +1 -1
  81. flowfile/web/static/assets/{selectDynamic-b38de2ba.js → selectDynamic-92e25ee3.js} +3 -3
  82. flowfile/web/static/assets/{vue-codemirror.esm-db9b8936.js → vue-codemirror.esm-41b0e0d7.js} +7 -4
  83. flowfile/web/static/assets/{vue-content-loader.es-b5f3ac30.js → vue-content-loader.es-2c8e608f.js} +1 -1
  84. flowfile/web/static/index.html +1 -1
  85. {flowfile-0.4.1.dist-info → flowfile-0.5.1.dist-info}/METADATA +3 -2
  86. {flowfile-0.4.1.dist-info → flowfile-0.5.1.dist-info}/RECORD +138 -126
  87. {flowfile-0.4.1.dist-info → flowfile-0.5.1.dist-info}/WHEEL +1 -1
  88. {flowfile-0.4.1.dist-info → flowfile-0.5.1.dist-info}/entry_points.txt +1 -0
  89. flowfile_core/__init__.py +3 -0
  90. flowfile_core/flowfile/analytics/analytics_processor.py +1 -0
  91. flowfile_core/flowfile/code_generator/code_generator.py +62 -64
  92. flowfile_core/flowfile/flow_data_engine/create/funcs.py +73 -56
  93. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +77 -86
  94. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +23 -23
  95. flowfile_core/flowfile/flow_data_engine/join/utils.py +1 -1
  96. flowfile_core/flowfile/flow_data_engine/join/verify_integrity.py +9 -4
  97. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +184 -78
  98. flowfile_core/flowfile/flow_data_engine/utils.py +2 -0
  99. flowfile_core/flowfile/flow_graph.py +129 -26
  100. flowfile_core/flowfile/flow_node/flow_node.py +3 -0
  101. flowfile_core/flowfile/flow_node/models.py +2 -1
  102. flowfile_core/flowfile/handler.py +5 -5
  103. flowfile_core/flowfile/manage/compatibility_enhancements.py +404 -41
  104. flowfile_core/flowfile/manage/io_flowfile.py +394 -0
  105. flowfile_core/flowfile/node_designer/__init__.py +1 -1
  106. flowfile_core/flowfile/node_designer/_type_registry.py +2 -2
  107. flowfile_core/flowfile/node_designer/custom_node.py +1 -1
  108. flowfile_core/flowfile/node_designer/ui_components.py +1 -1
  109. flowfile_core/flowfile/schema_callbacks.py +8 -5
  110. flowfile_core/flowfile/setting_generator/settings.py +15 -9
  111. flowfile_core/routes/routes.py +8 -10
  112. flowfile_core/schemas/cloud_storage_schemas.py +0 -2
  113. flowfile_core/schemas/input_schema.py +222 -65
  114. flowfile_core/schemas/output_model.py +1 -1
  115. flowfile_core/schemas/schemas.py +145 -32
  116. flowfile_core/schemas/transform_schema.py +1083 -413
  117. flowfile_core/schemas/yaml_types.py +103 -0
  118. flowfile_core/{flowfile/node_designer/data_types.py → types.py} +11 -1
  119. flowfile_frame/__init__.py +3 -1
  120. flowfile_frame/flow_frame.py +15 -18
  121. flowfile_frame/flow_frame_methods.py +12 -9
  122. flowfile_worker/__init__.py +3 -0
  123. flowfile_worker/create/__init__.py +3 -21
  124. flowfile_worker/create/funcs.py +68 -56
  125. flowfile_worker/create/models.py +130 -62
  126. flowfile_worker/routes.py +5 -8
  127. tools/migrate/README.md +56 -0
  128. tools/migrate/__init__.py +12 -0
  129. tools/migrate/__main__.py +131 -0
  130. tools/migrate/legacy_schemas.py +621 -0
  131. tools/migrate/migrate.py +598 -0
  132. tools/migrate/tests/__init__.py +0 -0
  133. tools/migrate/tests/conftest.py +23 -0
  134. tools/migrate/tests/test_migrate.py +627 -0
  135. tools/migrate/tests/test_migration_e2e.py +1010 -0
  136. tools/migrate/tests/test_node_migrations.py +813 -0
  137. flowfile_core/flowfile/manage/open_flowfile.py +0 -143
  138. {flowfile-0.4.1.dist-info → flowfile-0.5.1.dist-info}/licenses/LICENSE +0 -0
  139. /flowfile_core/flowfile/manage/manage_flowfile.py → /tools/__init__.py +0 -0
flowfile_core/schemas/yaml_types.py (new file)
@@ -0,0 +1,103 @@
+from typing import TypedDict, List
+
+
+# === Transform Schema YAML Types ===
+
+class SelectInputYaml(TypedDict, total=False):
+    old_name: str
+    new_name: str
+    keep: bool
+    data_type: str
+
+
+class JoinInputsYaml(TypedDict):
+    select: List[SelectInputYaml]
+
+
+class JoinMapYaml(TypedDict):
+    left_col: str
+    right_col: str
+
+
+class JoinInputYaml(TypedDict):
+    join_mapping: List[JoinMapYaml]
+    left_select: JoinInputsYaml
+    right_select: JoinInputsYaml
+    how: str
+
+
+class CrossJoinInputYaml(TypedDict):
+    left_select: JoinInputsYaml
+    right_select: JoinInputsYaml
+
+
+class FuzzyMappingYaml(TypedDict, total=False):
+    left_col: str
+    right_col: str
+    threshold_score: float
+    fuzzy_type: str
+    perc_unique: float
+    output_column_name: str
+    valid: bool
+
+
+class FuzzyMatchInputYaml(TypedDict):
+    join_mapping: List[FuzzyMappingYaml]
+    left_select: JoinInputsYaml
+    right_select: JoinInputsYaml
+    how: str
+    aggregate_output: bool
+
+
+# === Input Schema YAML Types ===
+
+class OutputSettingsYaml(TypedDict, total=False):
+    name: str
+    directory: str
+    file_type: str
+    write_mode: str
+    abs_file_path: str
+    fields: List[str]
+    table_settings: dict
+
+
+class NodeSelectYaml(TypedDict):
+    cache_results: bool
+    keep_missing: bool
+    select_input: List[SelectInputYaml]
+    sorted_by: str
+
+
+class NodeJoinYaml(TypedDict):
+    cache_results: bool
+    auto_generate_selection: bool
+    verify_integrity: bool
+    join_input: JoinInputYaml
+    auto_keep_all: bool
+    auto_keep_right: bool
+    auto_keep_left: bool
+
+
+class NodeCrossJoinYaml(TypedDict):
+    cache_results: bool
+    auto_generate_selection: bool
+    verify_integrity: bool
+    cross_join_input: CrossJoinInputYaml
+    auto_keep_all: bool
+    auto_keep_right: bool
+    auto_keep_left: bool
+
+
+class NodeFuzzyMatchYaml(TypedDict):
+    cache_results: bool
+    auto_generate_selection: bool
+    verify_integrity: bool
+    join_input: FuzzyMatchInputYaml
+    auto_keep_all: bool
+    auto_keep_right: bool
+    auto_keep_left: bool
+
+
+class NodeOutputYaml(TypedDict):
+    cache_results: bool
+    output_settings: OutputSettingsYaml
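
The new yaml_types module gives Flowfile's YAML serialization layer (apparently backing the new io_flowfile.py and the tools/migrate package) typed dictionary shapes to check against. A minimal sketch of how they might be used, assuming PyYAML is available; only the class and field names come from the hunk above, the values are illustrative:

    # Sketch: type-checking a node entry before dumping it to a .flowfile
    # YAML document. Field values are made up for illustration.
    import yaml  # PyYAML, assumed available

    from flowfile_core.schemas.yaml_types import NodeSelectYaml, SelectInputYaml

    select_node: NodeSelectYaml = {
        "cache_results": False,
        "keep_missing": True,
        "select_input": [
            SelectInputYaml(old_name="customer_id", new_name="id",
                            keep=True, data_type="Int64"),
        ],
        "sorted_by": "none",
    }

    # mypy/pyright flag missing or misspelled keys; at runtime this is a dict.
    print(yaml.safe_dump(select_node, sort_keys=False))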
flowfile_core/{flowfile/node_designer/data_types.py → types.py}
@@ -18,10 +18,20 @@ Usage:
 """
 
 from enum import Enum
-from typing import List, Union
+from typing import List, Literal, Union
 import polars as pl
 
 
+DataTypeStr = Literal[
+    "Int8", "Int16", "Int32", "Int64",
+    "UInt8", "UInt16", "UInt32", "UInt64",
+    "Float32", "Float64", "Decimal",
+    "String",
+    "Date", "Datetime", "Time", "Duration",
+    "Boolean", "Binary", "List", "Struct", "Array", "Integer", "Double", "Utf8"
+]
+
+
 class TypeGroup(str, Enum):
     """High-level type groups for column selection."""
     Numeric = "Numeric"
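
Because DataTypeStr is a Literal alias, type checkers catch misspelled data type names statically, and the same list can back a runtime guard through typing.get_args. A small sketch; the helper below is hypothetical, only DataTypeStr comes from the hunk:

    # Hypothetical runtime guard built on the DataTypeStr alias.
    from typing import get_args

    from flowfile_core.types import DataTypeStr

    VALID_DATA_TYPES = frozenset(get_args(DataTypeStr))

    def check_data_type(value: str) -> str:
        # Mirrors at runtime what mypy enforces statically via the Literal.
        if value not in VALID_DATA_TYPES:
            raise ValueError(f"Unknown data type: {value!r}")
        return value

    check_data_type("Int64")   # ok
    check_data_type("int64")   # ValueError: names are case-sensitive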
flowfile_frame/__init__.py
@@ -1,6 +1,8 @@
 # flowframe/__init__.py
 """A Polars-like API for building ETL graphs."""
 
+from importlib.metadata import version
+
 # Core classes
 from flowfile_frame.flow_frame import FlowFrame  # noqa: F401
 from pl_fuzzy_frame_match.models import FuzzyMapping  # noqa: F401
@@ -64,4 +66,4 @@ from polars.datatypes import (  # noqa: F401
     DataType, DataTypeClass, Field
 )
 
-__version__ = "0.1.0"
+__version__ = version("Flowfile")
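
Both flowfile_frame (here) and flowfile_worker (further below) now derive __version__ from the installed distribution metadata rather than a stale hard-coded "0.1.0", making the wheel itself the single source of truth:

    # Sketch: importlib.metadata reads the version from the installed
    # flowfile-0.5.1.dist-info directory, so __version__ cannot drift.
    from importlib.metadata import version

    print(version("Flowfile"))  # "0.5.1" once this wheel is installed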
flowfile_frame/flow_frame.py
@@ -10,7 +10,7 @@ from flowfile_frame.lazy_methods import add_lazyframe_methods
 from polars._typing import (CsvEncoding, FrameInitTypes, SchemaDefinition, SchemaDict, Orientation)
 from collections.abc import Iterator
 
-from pl_fuzzy_frame_match import FuzzyMapping, fuzzy_match_dfs
+from pl_fuzzy_frame_match import FuzzyMapping
 
 from flowfile_core.flowfile.flow_graph import FlowGraph, add_connection
 from flowfile_core.flowfile.flow_graph_utils import combine_flow_graphs_with_mapping
@@ -626,7 +626,6 @@ class FlowFrame:
         left_columns, right_columns = self._parse_join_columns(
             on, left_on, right_on, how
         )
-
        # Step 5: Validate column lists have same length (except for cross join)
         if how != 'cross' and left_columns is not None and right_columns is not None:
             if len(left_columns) != len(right_columns):
@@ -798,33 +797,36 @@ class FlowFrame:
     ) -> "FlowFrame":
         """Execute join using native FlowFile join nodes."""
         # Create select inputs for both frames
+
         left_select = transform_schema.SelectInputs.create_from_pl_df(self.data)
         right_select = transform_schema.SelectInputs.create_from_pl_df(other.data)
         # Create appropriate join input based on join type
         if how == 'cross':
             join_input = transform_schema.CrossJoinInput(
-                left_select=left_select.renames,
+                left_select=transform_schema.JoinInputs(renames=left_select.renames),
                 right_select=right_select.renames,
             )
+            join_input_manager = transform_schema.CrossJoinInputManager(join_input)
+
         else:
             join_input = transform_schema.JoinInput(
                 join_mapping=join_mappings,
-                left_select=left_select.renames,
+                left_select=transform_schema.JoinInputs(renames=left_select.renames),
                 right_select=right_select.renames,
                 how=how,
             )
+            join_input_manager = transform_schema.JoinInputManager(join_input)
 
         # Configure join input
-        join_input.auto_rename()
-        for right_column in right_select.renames:
+        for right_column in join_input_manager.right_select.renames:
             if right_column.join_key:
                 right_column.keep = False
 
         # Create and add appropriate node
         if how == 'cross':
-            self._add_cross_join_node(new_node_id, join_input, description, other)
+            self._add_cross_join_node(new_node_id, join_input_manager.to_cross_join_input(), description, other)
         else:
-            self._add_regular_join_node(new_node_id, join_input, description, other)
+            self._add_regular_join_node(new_node_id, join_input_manager.to_join_input(), description, other)
 
         # Add connections
         self._add_connection(self.node_id, new_node_id, "main")
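
The join refactor moves mutation helpers (renames, keep flags, conversion back to plain settings) off the raw JoinInput schema onto new manager objects. A sketch of the calling pattern; JoinInputManager, CrossJoinInputManager, JoinInputs, right_select and to_join_input() all appear in the hunk, while transform_schema.JoinMap is assumed from the JoinMapYaml shape and the dataframes are made up:

    import polars as pl
    from flowfile_core.schemas import transform_schema

    left = pl.DataFrame({"id": [1, 2], "name": ["a", "b"]})
    right = pl.DataFrame({"id": [1, 2], "score": [0.9, 0.4]})

    left_select = transform_schema.SelectInputs.create_from_pl_df(left)
    right_select = transform_schema.SelectInputs.create_from_pl_df(right)

    join_input = transform_schema.JoinInput(
        join_mapping=[transform_schema.JoinMap(left_col="id", right_col="id")],  # JoinMap assumed
        left_select=transform_schema.JoinInputs(renames=left_select.renames),
        right_select=right_select.renames,
        how="inner",
    )
    manager = transform_schema.JoinInputManager(join_input)

    # Keep/rename logic now lives on the manager, replacing auto_rename().
    for col in manager.right_select.renames:
        if col.join_key:
            col.keep = False

    node_settings = manager.to_join_input()  # back to the plain schema object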
@@ -1140,16 +1142,11 @@ class FlowFrame:
         file_name = file_str.split(os.sep)[-1]
         use_polars_code = bool(kwargs.items()) or not is_path_input
 
-        output_parquet_table = input_schema.OutputParquetTable(
-            file_type="parquet"
-        )
         output_settings = input_schema.OutputSettings(
             file_type='parquet',
             name=file_name,
             directory=file_str if is_path_input else str(file_str),
-            output_parquet_table=output_parquet_table,
-            output_csv_table=input_schema.OutputCsvTable(),
-            output_excel_table=input_schema.OutputExcelTable()
+            table_settings=input_schema.OutputParquetTable()
         )
 
         if is_path_input:
@@ -1220,10 +1217,10 @@ class FlowFrame:
             file_type='csv',
             name=file_name,
             directory=file_str if is_path_input else str(file_str),
-            output_csv_table=input_schema.OutputCsvTable(
-                file_type="csv", delimiter=separator, encoding=encoding),
-            output_excel_table=input_schema.OutputExcelTable(),
-            output_parquet_table=input_schema.OutputParquetTable()
+            table_settings=input_schema.OutputCsvTable(
+                delimiter=separator,
+                encoding=encoding
+            )
         )
         if is_path_input:
             try:
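
Both write paths show the same schema change: the three always-present output_*_table fields collapse into a single table_settings field carrying only the format actually being written. A sketch with illustrative names and paths; the constructor fields are the ones visible in the two hunks above:

    from flowfile_core.schemas import input_schema

    # 0.5.x: one nested settings object per output, matching file_type.
    csv_settings = input_schema.OutputSettings(
        file_type="csv",
        name="result.csv",
        directory="/tmp/out",  # illustrative
        table_settings=input_schema.OutputCsvTable(delimiter=";", encoding="utf8"),
    )

    parquet_settings = input_schema.OutputSettings(
        file_type="parquet",
        name="result.parquet",
        directory="/tmp/out",
        table_settings=input_schema.OutputParquetTable(),
    )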
flowfile_frame/flow_frame_methods.py
@@ -186,15 +186,17 @@ def read_csv(
         file_type='csv',
         path=current_source_path_for_native,
         name=Path(current_source_path_for_native).name,
-        delimiter=separator,
-        has_headers=has_header,
-        encoding=encoding,
-        starting_from_line=skip_rows,
-        quote_char=quote_char if quote_char is not None else '"',
-        infer_schema_length=actual_infer_schema_length if actual_infer_schema_length is not None else 10000,
-        truncate_ragged_lines=truncate_ragged_lines,
-        ignore_errors=ignore_errors,
-        row_delimiter=eol_char
+        table_settings=input_schema.InputCsvTable(
+            delimiter=separator,
+            has_headers=has_header,
+            encoding=encoding,
+            starting_from_line=skip_rows,
+            quote_char=quote_char if quote_char is not None else '"',
+            infer_schema_length=actual_infer_schema_length if actual_infer_schema_length is not None else 10000,
+            truncate_ragged_lines=truncate_ragged_lines,
+            ignore_errors=ignore_errors,
+            row_delimiter=eol_char
+        )
     )
     if convert_to_absolute_path:
         try:
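
The read side mirrors the output refactor: per-format CSV options move off the received-table object into a nested input_schema.InputCsvTable. A sketch of the new shape; InputCsvTable and its fields are confirmed by the hunk, but the enclosing ReceivedTable constructor name is an assumption (taken from the worker's models) and the values are illustrative:

    from flowfile_core.schemas import input_schema

    received = input_schema.ReceivedTable(  # enclosing model name assumed
        file_type="csv",
        path="data/sales.csv",  # illustrative
        name="sales.csv",
        table_settings=input_schema.InputCsvTable(
            delimiter=";",
            has_headers=True,
            encoding="utf8",
        ),
    )

    # 0.4.x consumers read received.delimiter; 0.5.x consumers read
    # received.table_settings.delimiter instead.
    print(received.table_settings.delimiter)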
@@ -407,6 +409,7 @@ def read_parquet(source, *, flow_graph: FlowGraph = None, description: str = Non
         file_type='parquet',
         path=source,
         name=Path(source).name,
+        table_settings=input_schema.InputParquetTable()
     )
     if convert_to_absolute_path:
         received_table.path = received_table.abs_file_path
flowfile_worker/__init__.py
@@ -2,6 +2,9 @@ from typing import Dict
 import threading
 import multiprocessing
 from shared.storage_config import storage
+from importlib.metadata import version
+
+__version__ = version("Flowfile")
 
 multiprocessing.set_start_method('spawn', force=True)
 
flowfile_worker/create/__init__.py
@@ -1,29 +1,11 @@
-from flowfile_worker.create.models import (ReceivedCsvTable, ReceivedParquetTable, ReceivedExcelTable,
-                                           ReceivedJsonTable)
+
 from flowfile_worker.create.funcs import (create_from_path_csv, create_from_path_parquet, create_from_path_excel,
                                           create_from_path_json)
-from typing import Dict, Literal
+from typing import Literal
 
-ReceivedTableCollection = ReceivedCsvTable | ReceivedParquetTable | ReceivedJsonTable | ReceivedExcelTable
 FileType = Literal['csv', 'parquet', 'json', 'excel']
 
-
-def received_table_parser(received_table_raw: Dict, file_type: FileType) -> ReceivedTableCollection:
-    match file_type:
-        case 'csv':
-            received_table = ReceivedCsvTable.model_validate(received_table_raw)
-        case 'parquet':
-            received_table = ReceivedParquetTable.model_validate(received_table_raw)
-        case 'excel':
-            received_table = ReceivedExcelTable.model_validate(received_table_raw)
-        case 'json':
-            return ReceivedJsonTable.model_validate(received_table_raw)
-        case _:
-            raise ValueError(f'Unsupported file type: {file_type}')
-    return received_table
-
-
-def table_creator_factory_method(file_type: Literal['csv', 'parquet', 'json', 'excel']) -> callable:
+def table_creator_factory_method(file_type: FileType) -> callable:
     match file_type:
         case 'csv':
             return create_from_path_csv
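
With received_table_parser gone (validation now happens against the unified ReceivedTable model instead of four per-format classes), the factory reduces to a pure dispatch table. Usage sketch:

    # The factory maps a FileType literal to its reader function.
    from flowfile_worker.create import table_creator_factory_method

    reader = table_creator_factory_method("csv")  # returns create_from_path_csv
    # reader(received_table) then yields a polars (Lazy)Frame for that file.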
flowfile_worker/create/funcs.py
@@ -1,94 +1,100 @@
 import polars as pl
 import os
 
-from flowfile_worker.create.models import ReceivedCsvTable, ReceivedParquetTable, ReceivedExcelTable
+from flowfile_worker.create.models import ReceivedTable, InputCsvTable, InputJsonTable, InputExcelTable, InputParquetTable
 from flowfile_worker.create.utils import create_fake_data
 from flowfile_worker.create.read_excel_tables import df_from_openpyxl, df_from_calamine_xlsx
 
 
-def create_from_path_json(received_table: ReceivedCsvTable):
+def create_from_path_json(received_table: ReceivedTable):
+    if not isinstance(received_table.table_settings, InputJsonTable):
+        raise ValueError("Received table settings are not of type InputJsonTable")
+    input_table_settings: InputJsonTable = received_table.table_settings
     f = received_table.abs_file_path
     gbs_to_load = os.path.getsize(f) / 1024 / 1000 / 1000
     low_mem = gbs_to_load > 10
-    if received_table.encoding.upper() == 'UTF8' or received_table.encoding.upper() == 'UTF-8':
+    if input_table_settings.encoding.upper() == 'UTF8' or input_table_settings.encoding.upper() == 'UTF-8':
         try:
             df = pl.scan_csv(f,
                              low_memory=low_mem,
                              try_parse_dates=True,
-                             separator=received_table.delimiter,
-                             has_header=received_table.has_headers,
-                             skip_rows=received_table.starting_from_line,
+                             separator=input_table_settings.delimiter,
+                             has_header=input_table_settings.has_headers,
+                             skip_rows=input_table_settings.starting_from_line,
                              encoding='utf8',
-                             infer_schema_length=received_table.infer_schema_length)
+                             infer_schema_length=input_table_settings.infer_schema_length)
             df.head(1).collect()
             return df
         except:
             try:
                 df = pl.scan_csv(f, low_memory=low_mem,
-                                 separator=received_table.delimiter,
-                                 has_header=received_table.has_headers,
-                                 skip_rows=received_table.starting_from_line,
+                                 separator=input_table_settings.delimiter,
+                                 has_header=input_table_settings.has_headers,
+                                 skip_rows=input_table_settings.starting_from_line,
                                  encoding='utf8-lossy',
                                  ignore_errors=True)
                 return df
            except:
                 df = pl.scan_csv(f, low_memory=low_mem,
-                                 separator=received_table.delimiter,
-                                 has_header=received_table.has_headers,
-                                 skip_rows=received_table.starting_from_line,
+                                 separator=input_table_settings.delimiter,
+                                 has_header=input_table_settings.has_headers,
+                                 skip_rows=input_table_settings.starting_from_line,
                                  encoding='utf8',
                                  ignore_errors=True)
                 return df
     else:
         df = pl.read_csv(f, low_memory=low_mem,
-                         separator=received_table.delimiter,
-                         has_header=received_table.has_headers,
-                         skip_rows=received_table.starting_from_line,
-                         encoding=received_table.encoding,
+                         separator=input_table_settings.delimiter,
+                         has_header=input_table_settings.has_headers,
+                         skip_rows=input_table_settings.starting_from_line,
+                         encoding=input_table_settings.encoding,
                          ignore_errors=True)
         return df
 
 
-def create_from_path_csv(received_table: ReceivedCsvTable) -> pl.DataFrame:
+def create_from_path_csv(received_table: ReceivedTable) -> pl.DataFrame:
     f = received_table.abs_file_path
+    if not isinstance(received_table.table_settings, InputCsvTable):
+        raise ValueError("Received table settings are not of type InputCsvTable")
+    input_table_settings: InputCsvTable = received_table.table_settings
     gbs_to_load = os.path.getsize(f) / 1024 / 1000 / 1000
     low_mem = gbs_to_load > 10
-    if received_table.encoding.upper() == 'UTF8' or received_table.encoding.upper() == 'UTF-8':
+    if input_table_settings.encoding.upper() == 'UTF8' or input_table_settings.encoding.upper() == 'UTF-8':
         try:
             df = pl.scan_csv(f,
                              low_memory=low_mem,
                              try_parse_dates=True,
-                             separator=received_table.delimiter,
-                             has_header=received_table.has_headers,
-                             skip_rows=received_table.starting_from_line,
+                             separator=input_table_settings.delimiter,
+                             has_header=input_table_settings.has_headers,
+                             skip_rows=input_table_settings.starting_from_line,
                              encoding='utf8',
-                             infer_schema_length=received_table.infer_schema_length)
+                             infer_schema_length=input_table_settings.infer_schema_length)
             df.head(1).collect()
             return df
         except:
             try:
                 df = pl.scan_csv(f, low_memory=low_mem,
-                                 separator=received_table.delimiter,
-                                 has_header=received_table.has_headers,
-                                 skip_rows=received_table.starting_from_line,
+                                 separator=input_table_settings.delimiter,
+                                 has_header=input_table_settings.has_headers,
+                                 skip_rows=input_table_settings.starting_from_line,
                                  encoding='utf8-lossy',
                                  ignore_errors=True)
                 return df
            except:
                 df = pl.scan_csv(f, low_memory=low_mem,
-                                 separator=received_table.delimiter,
-                                 has_header=received_table.has_headers,
-                                 skip_rows=received_table.starting_from_line,
+                                 separator=input_table_settings.delimiter,
+                                 has_header=input_table_settings.has_headers,
+                                 skip_rows=input_table_settings.starting_from_line,
                                  encoding='utf8',
                                  ignore_errors=True)
                 return df
     else:
         df = pl.read_csv(f,
                          low_memory=low_mem,
-                         separator=received_table.delimiter,
-                         has_header=received_table.has_headers,
-                         skip_rows=received_table.starting_from_line,
-                         encoding=received_table.encoding,
+                         separator=input_table_settings.delimiter,
+                         has_header=input_table_settings.has_headers,
+                         skip_rows=input_table_settings.starting_from_line,
+                         encoding=input_table_settings.encoding,
                          ignore_errors=True)
         return df
 
@@ -97,50 +103,56 @@ def create_random(number_of_records: int = 1000) -> pl.LazyFrame:
     return create_fake_data(number_of_records).lazy()
 
 
-def create_from_path_parquet(received_table: ReceivedParquetTable):
+def create_from_path_parquet(received_table: ReceivedTable):
+    if not isinstance(received_table.table_settings, InputParquetTable):
+        raise ValueError("Received table settings are not of type InputParquetTable")
     low_mem = (os.path.getsize(received_table.abs_file_path) / 1024 / 1000 / 1000) > 2
     return pl.scan_parquet(source=received_table.abs_file_path, low_memory=low_mem)
 
 
-def create_from_path_excel(received_table: ReceivedExcelTable):
-    if received_table.type_inference:
+def create_from_path_excel(received_table: ReceivedTable):
+    if not isinstance(received_table.table_settings, InputExcelTable):
+        raise ValueError("Received table settings are not of type InputExcelTable")
+    input_table_settings: InputExcelTable = received_table.table_settings
+
+    if input_table_settings.type_inference:
         engine = 'openpyxl'
-    elif received_table.start_row > 0 and received_table.start_column == 0:
-        engine = 'calamine' if received_table.has_headers else 'xlsx2csv'
-    elif received_table.start_column > 0 or received_table.start_row > 0:
+    elif input_table_settings.start_row > 0 and input_table_settings.start_column == 0:
+        engine = 'calamine' if input_table_settings.has_headers else 'xlsx2csv'
+    elif input_table_settings.start_column > 0 or input_table_settings.start_row > 0:
         engine = 'openpyxl'
     else:
         engine = 'calamine'
 
-    sheet_name = received_table.sheet_name
+    sheet_name = input_table_settings.sheet_name
 
     if engine == 'calamine':
         df = df_from_calamine_xlsx(file_path=received_table.abs_file_path, sheet_name=sheet_name,
-                                   start_row=received_table.start_row, end_row=received_table.end_row)
-        if received_table.end_column > 0:
-            end_col_index = received_table.end_column
-            cols_to_select = [df.columns[i] for i in range(received_table.start_column, end_col_index)]
+                                   start_row=input_table_settings.start_row, end_row=input_table_settings.end_row)
+        if input_table_settings.end_column > 0:
+            end_col_index = input_table_settings.end_column
+            cols_to_select = [df.columns[i] for i in range(input_table_settings.start_column, end_col_index)]
             df = df.select(cols_to_select)
 
     elif engine == 'xlsx2csv':
-        csv_options = {'has_header': received_table.has_headers, 'skip_rows': received_table.start_row}
+        csv_options = {'has_header': input_table_settings.has_headers, 'skip_rows': input_table_settings.start_row}
         df = pl.read_excel(source=received_table.abs_file_path,
                            read_options=csv_options,
                            engine='xlsx2csv',
-                           sheet_name=received_table.sheet_name)
-        end_col_index = received_table.end_column if received_table.end_column > 0 else len(df.columns)
-        cols_to_select = [df.columns[i] for i in range(received_table.start_column, end_col_index)]
+                           sheet_name=input_table_settings.sheet_name)
+        end_col_index = input_table_settings.end_column if input_table_settings.end_column > 0 else len(df.columns)
+        cols_to_select = [df.columns[i] for i in range(input_table_settings.start_column, end_col_index)]
         df = df.select(cols_to_select)
-        if 0 < received_table.end_row < len(df):
-            df = df.head(received_table.end_row)
+        if 0 < input_table_settings.end_row < len(df):
+            df = df.head(input_table_settings.end_row)
 
     else:
-        max_col = received_table.end_column if received_table.end_column > 0 else None
-        max_row = received_table.end_row + 1 if received_table.end_row > 0 else None
+        max_col = input_table_settings.end_column if input_table_settings.end_column > 0 else None
+        max_row = input_table_settings.end_row + 1 if input_table_settings.end_row > 0 else None
         df = df_from_openpyxl(file_path=received_table.abs_file_path,
-                              sheet_name=received_table.sheet_name,
-                              min_row=received_table.start_row + 1,
-                              min_col=received_table.start_column + 1,
+                              sheet_name=input_table_settings.sheet_name,
+                              min_row=input_table_settings.start_row + 1,
+                              min_col=input_table_settings.start_column + 1,
                               max_row=max_row,
-                              max_col=max_col, has_headers=received_table.has_headers)
+                              max_col=max_col, has_headers=input_table_settings.has_headers)
     return df
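
Every reader now narrows the unified ReceivedTable by checking the type of its table_settings before use, so wiring the wrong settings model to a reader fails fast with a clear error. A sketch of that behaviour; ReceivedTable and the Input*Table models are imported exactly as in the hunks above, but the constructor fields are assumed and the path is made up:

    from flowfile_worker.create.funcs import create_from_path_csv
    from flowfile_worker.create.models import (InputCsvTable, InputParquetTable,
                                               ReceivedTable)

    table = ReceivedTable(  # constructor fields assumed
        file_type="csv",
        path="/tmp/example.csv",
        name="example.csv",
        table_settings=InputCsvTable(delimiter=","),
    )
    df = create_from_path_csv(table)  # settings type matches: returns a frame

    table.table_settings = InputParquetTable()
    create_from_path_csv(table)       # raises ValueError: wrong settings type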