Flowfile 0.3.7__py3-none-any.whl → 0.3.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. flowfile/__init__.py +4 -3
  2. flowfile/api.py +5 -2
  3. flowfile/web/__init__.py +2 -0
  4. flowfile/web/static/assets/{CloudConnectionManager-c20a740f.js → CloudConnectionManager-c97c25f8.js} +2 -2
  5. flowfile/web/static/assets/{CloudStorageReader-960b400a.js → CloudStorageReader-f1ff509e.js} +7 -7
  6. flowfile/web/static/assets/{CloudStorageWriter-e3decbdd.js → CloudStorageWriter-034f8b78.js} +7 -7
  7. flowfile/web/static/assets/{CrossJoin-d67e2405.js → CrossJoin-9e156ebe.js} +8 -8
  8. flowfile/web/static/assets/{DatabaseConnectionSettings-a81e0f7e.js → DatabaseConnectionSettings-d5c625b3.js} +2 -2
  9. flowfile/web/static/assets/{DatabaseManager-9ea35e84.js → DatabaseManager-265adc5e.js} +2 -2
  10. flowfile/web/static/assets/{DatabaseReader-9578bfa5.js → DatabaseReader-0b10551e.js} +9 -9
  11. flowfile/web/static/assets/{DatabaseWriter-19531098.js → DatabaseWriter-c17c6916.js} +9 -9
  12. flowfile/web/static/assets/{ExploreData-40476474.js → ExploreData-18a4fe52.js} +5 -5
  13. flowfile/web/static/assets/{ExternalSource-2297ef96.js → ExternalSource-3a66556c.js} +6 -6
  14. flowfile/web/static/assets/{Filter-f211c03a.js → Filter-91ad87e7.js} +8 -8
  15. flowfile/web/static/assets/{Formula-4207ea31.js → Formula-3c395ab1.js} +8 -8
  16. flowfile/web/static/assets/{FuzzyMatch-bf120df0.js → FuzzyMatch-2df0d230.js} +9 -9
  17. flowfile/web/static/assets/{GraphSolver-5bb7497a.js → GraphSolver-d285877f.js} +5 -5
  18. flowfile/web/static/assets/{GroupBy-92c81b65.js → GroupBy-0bd1cc6b.js} +6 -6
  19. flowfile/web/static/assets/{Join-4e49a274.js → Join-5a78a203.js} +9 -9
  20. flowfile/web/static/assets/{ManualInput-90998ae8.js → ManualInput-93aef9d6.js} +5 -5
  21. flowfile/web/static/assets/{Output-81e3e917.js → Output-411ecaee.js} +4 -4
  22. flowfile/web/static/assets/{Pivot-a3419842.js → Pivot-89db4b04.js} +6 -6
  23. flowfile/web/static/assets/{PolarsCode-72710deb.js → PolarsCode-a9f974f8.js} +6 -6
  24. flowfile/web/static/assets/{Read-c4059daf.js → Read-c3b1929c.js} +6 -6
  25. flowfile/web/static/assets/{RecordCount-c2b5e095.js → RecordCount-4e95f98e.js} +5 -5
  26. flowfile/web/static/assets/{RecordId-10baf191.js → RecordId-55ae7d36.js} +6 -6
  27. flowfile/web/static/assets/{Sample-3ed9a0ae.js → Sample-b4a18476.js} +5 -5
  28. flowfile/web/static/assets/{SecretManager-0d49c0e8.js → SecretManager-b066d13a.js} +2 -2
  29. flowfile/web/static/assets/{Select-8a02a0b3.js → Select-727688dc.js} +8 -8
  30. flowfile/web/static/assets/{SettingsSection-4c0f45f5.js → SettingsSection-695ac487.js} +1 -1
  31. flowfile/web/static/assets/{Sort-f55c9f9d.js → Sort-be3339a8.js} +6 -6
  32. flowfile/web/static/assets/{TextToRows-5dbc2145.js → TextToRows-7b8998da.js} +8 -8
  33. flowfile/web/static/assets/{UnavailableFields-a1768e52.js → UnavailableFields-8b0cb48e.js} +2 -2
  34. flowfile/web/static/assets/{Union-f2aefdc9.js → Union-89fd73dc.js} +5 -5
  35. flowfile/web/static/assets/{Unique-46b250da.js → Unique-af5a80b4.js} +8 -8
  36. flowfile/web/static/assets/{Unpivot-25ac84cc.js → Unpivot-5195d411.js} +5 -5
  37. flowfile/web/static/assets/{api-a0abbdc7.js → api-023d1733.js} +1 -1
  38. flowfile/web/static/assets/{api-6ef0dcef.js → api-cb00cce6.js} +1 -1
  39. flowfile/web/static/assets/{designer-186f2e71.css → designer-2197d782.css} +17 -17
  40. flowfile/web/static/assets/{designer-13eabd83.js → designer-6c322d8e.js} +67 -21
  41. flowfile/web/static/assets/{documentation-b87e7f6f.js → documentation-4d1fafe1.js} +1 -1
  42. flowfile/web/static/assets/{dropDown-13564764.js → dropDown-0b46dd77.js} +1 -1
  43. flowfile/web/static/assets/{fullEditor-fd2cd6f9.js → fullEditor-ec4e4f95.js} +2 -2
  44. flowfile/web/static/assets/{genericNodeSettings-71e11604.js → genericNodeSettings-def5879b.js} +3 -3
  45. flowfile/web/static/assets/{index-f6c15e76.js → index-683fc198.js} +6 -6
  46. flowfile/web/static/assets/{nodeTitle-988d9efe.js → nodeTitle-a16db7c3.js} +3 -3
  47. flowfile/web/static/assets/{secretApi-dd636aa2.js → secretApi-baceb6f9.js} +1 -1
  48. flowfile/web/static/assets/{selectDynamic-af36165e.js → selectDynamic-de91449a.js} +3 -3
  49. flowfile/web/static/assets/{vue-codemirror.esm-2847001e.js → vue-codemirror.esm-dc5e3348.js} +1 -1
  50. flowfile/web/static/assets/{vue-content-loader.es-0371da73.js → vue-content-loader.es-ba94b82f.js} +1 -1
  51. flowfile/web/static/index.html +1 -1
  52. {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/METADATA +2 -1
  53. {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/RECORD +88 -90
  54. flowfile_core/configs/settings.py +4 -2
  55. flowfile_core/configs/utils.py +5 -0
  56. flowfile_core/database/connection.py +1 -3
  57. flowfile_core/flowfile/code_generator/code_generator.py +36 -0
  58. flowfile_core/flowfile/flow_data_engine/cloud_storage_reader.py +0 -1
  59. flowfile_core/flowfile/flow_data_engine/flow_data_engine.py +60 -80
  60. flowfile_core/flowfile/flow_data_engine/flow_file_column/main.py +61 -0
  61. flowfile_core/flowfile/flow_data_engine/fuzzy_matching/prepare_for_fuzzy_match.py +44 -3
  62. flowfile_core/flowfile/flow_data_engine/subprocess_operations/models.py +3 -3
  63. flowfile_core/flowfile/flow_data_engine/subprocess_operations/subprocess_operations.py +5 -2
  64. flowfile_core/flowfile/flow_graph.py +129 -88
  65. flowfile_core/flowfile/flow_node/flow_node.py +30 -15
  66. flowfile_core/flowfile/flow_node/models.py +0 -2
  67. flowfile_core/flowfile/flow_node/schema_callback.py +138 -43
  68. flowfile_core/flowfile/graph_tree/graph_tree.py +250 -0
  69. flowfile_core/flowfile/graph_tree/models.py +15 -0
  70. flowfile_core/flowfile/manage/compatibility_enhancements.py +1 -1
  71. flowfile_core/flowfile/{flow_data_engine/fuzzy_matching/settings_validator.py → schema_callbacks.py} +65 -13
  72. flowfile_core/flowfile/setting_generator/settings.py +2 -1
  73. flowfile_core/flowfile/util/execution_orderer.py +9 -0
  74. flowfile_core/flowfile/util/node_skipper.py +8 -0
  75. flowfile_core/schemas/schemas.py +46 -3
  76. flowfile_core/schemas/transform_schema.py +27 -38
  77. flowfile_core/utils/arrow_reader.py +8 -3
  78. flowfile_core/utils/validate_setup.py +0 -2
  79. flowfile_frame/__init__.py +1 -4
  80. flowfile_frame/expr.py +14 -0
  81. flowfile_frame/flow_frame.py +34 -5
  82. flowfile_frame/flow_frame.pyi +5 -6
  83. flowfile_worker/funcs.py +7 -3
  84. flowfile_worker/models.py +3 -1
  85. flowfile_worker/polars_fuzzy_match/matcher.py +0 -435
  86. flowfile_worker/polars_fuzzy_match/models.py +0 -36
  87. flowfile_worker/polars_fuzzy_match/pre_process.py +0 -213
  88. flowfile_worker/polars_fuzzy_match/process.py +0 -86
  89. flowfile_worker/polars_fuzzy_match/utils.py +0 -50
  90. {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/LICENSE +0 -0
  91. {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/WHEEL +0 -0
  92. {flowfile-0.3.7.dist-info → flowfile-0.3.9.dist-info}/entry_points.txt +0 -0
  93. {flowfile_worker/polars_fuzzy_match → flowfile_core/flowfile/graph_tree}/__init__.py +0 -0
@@ -1,213 +0,0 @@
1
- from logging import Logger
2
- from typing import List, Dict, Tuple
3
-
4
- import polars as pl
5
-
6
- from flowfile_worker.polars_fuzzy_match.models import FuzzyMapping
7
- from flowfile_worker.utils import collect_lazy_frame
8
-
9
-
10
def get_approx_uniqueness(lf: pl.LazyFrame) -> Dict[str, int]:
    """
    Estimate the number of distinct values held by every column of a LazyFrame.

    Args:
        lf (pl.LazyFrame): Frame whose columns are analysed.

    Returns:
        Dict[str, int]: Column name mapped to its approximate distinct-value count.

    Raises:
        Exception: If the collected result contains no rows.
    """
    approx_counts = lf.select(pl.all().approx_n_unique())
    rows = approx_counts.collect().to_dicts()
    if not rows:
        raise Exception('Approximate uniqueness calculation failed')
    # Only the first row of the collected result is handed back to the caller.
    return rows[0]
27
-
28
-
29
def calculate_uniqueness(a: float, b: float) -> float:
    """
    Combine two uniqueness ratios into a single score.

    The score is the mean of the two squared, 0.5-shifted ratios, minus 0.25,
    plus half of the absolute difference between the ratios.

    Args:
        a (float): First uniqueness ratio, typically from the left dataframe.
        b (float): Second uniqueness ratio, typically from the right dataframe.

    Returns:
        float: Combined uniqueness score.
    """
    mean_shifted_squares = ((a + 0.5) ** 2 + (b + 0.5) ** 2) / 2
    imbalance_term = 0.5 * abs(a - b)
    return (mean_shifted_squares - 0.5 ** 2) + imbalance_term
44
-
45
-
46
def calculate_df_len(df: pl.LazyFrame) -> int:
    """
    Count the rows of a LazyFrame.

    Args:
        df (pl.LazyFrame): Frame to measure.

    Returns:
        int: The value found in the first cell of the collected ``pl.len()`` selection.
    """
    row_count_frame = collect_lazy_frame(df.select(pl.len()))
    return row_count_frame[0, 0]
57
-
58
-
59
def fill_perc_unique_in_fuzzy_maps(left_df: pl.LazyFrame, right_df: pl.LazyFrame, fuzzy_maps: List[FuzzyMapping],
                                   flowfile_logger: Logger, left_len: int, right_len: int) -> List[FuzzyMapping]:
    """
    Calculate and set ``perc_unique`` on every fuzzy mapping.

    The approximate distinct-value count of each mapped column is divided by the
    row count of its frame, and the two resulting ratios are combined with
    ``calculate_uniqueness``. The mappings are updated in place and the same list
    is returned.

    Args:
        left_df (pl.LazyFrame): Left dataframe.
        right_df (pl.LazyFrame): Right dataframe.
        fuzzy_maps (List[FuzzyMapping]): Mappings between left and right columns.
        flowfile_logger (Logger): Logger that receives the distinct-value counts.
        left_len (int): Number of rows in the left dataframe.
        right_len (int): Number of rows in the right dataframe.

    Returns:
        List[FuzzyMapping]: The input list, with ``perc_unique`` filled in.
    """
    if not fuzzy_maps:
        # Nothing to score; selecting zero columns would only make the helper raise.
        return fuzzy_maps

    # dict.fromkeys drops repeated names while keeping first-seen order: two
    # mappings may share a column, and selecting the same column twice is rejected.
    left_cols = list(dict.fromkeys(fuzzy_map.left_col for fuzzy_map in fuzzy_maps))
    right_cols = list(dict.fromkeys(fuzzy_map.right_col for fuzzy_map in fuzzy_maps))

    left_unique_values = get_approx_uniqueness(left_df.select(left_cols))
    right_unique_values = get_approx_uniqueness(right_df.select(right_cols))
    flowfile_logger.info(f'Left unique values: {left_unique_values}')
    flowfile_logger.info(f'Right unique values: {right_unique_values}')

    for fuzzy_map in fuzzy_maps:
        # An empty frame has no distinct values; use 0.0 instead of dividing by zero.
        left_ratio = left_unique_values[fuzzy_map.left_col] / left_len if left_len else 0.0
        right_ratio = right_unique_values[fuzzy_map.right_col] / right_len if right_len else 0.0
        fuzzy_map.perc_unique = calculate_uniqueness(left_ratio, right_ratio)
    return fuzzy_maps
86
-
87
-
88
def determine_order_of_fuzzy_maps(fuzzy_maps: List[FuzzyMapping]) -> List[FuzzyMapping]:
    """
    Return the fuzzy mappings ordered from highest to lowest ``perc_unique``.

    The input is not modified; a new list is produced. Mappings with equal
    ``perc_unique`` keep their original relative order.

    Args:
        fuzzy_maps (List[FuzzyMapping]): Mappings with ``perc_unique`` already set.

    Returns:
        List[FuzzyMapping]: New list sorted by uniqueness, highest first.
    """
    ordered = list(fuzzy_maps)
    ordered.sort(key=lambda mapping: mapping.perc_unique, reverse=True)
    return ordered
102
-
103
-
104
def calculate_uniqueness_rate(fuzzy_maps: List[FuzzyMapping]) -> float:
    """
    Add up the ``perc_unique`` values of all fuzzy mappings.

    Args:
        fuzzy_maps (List[FuzzyMapping]): Mappings with ``perc_unique`` already set.

    Returns:
        float: Sum of the uniqueness values (``0`` for an empty list).
    """
    total = 0
    for mapping in fuzzy_maps:
        total = total + mapping.perc_unique
    return total
115
-
116
-
117
def determine_need_for_aggregation(uniqueness_rate: float, cartesian_join_number: int, *,
                                   max_uniqueness_rate: float = 1.2,
                                   min_cartesian_join_number: int = 1_000_000) -> bool:
    """
    Decide whether the inputs should be de-duplicated before fuzzy matching.

    Aggregation is requested only when both conditions hold: the total uniqueness
    rate is strictly below ``max_uniqueness_rate`` and the potential join size is
    strictly above ``min_cartesian_join_number``.

    Args:
        uniqueness_rate (float): Total uniqueness rate across fuzzy mappings.
        cartesian_join_number (int): Potential size of the cartesian join (left_len * right_len).
        max_uniqueness_rate (float): Exclusive upper bound on the uniqueness rate. Defaults to 1.2.
        min_cartesian_join_number (int): Exclusive lower bound on the join size. Defaults to 1_000_000.

    Returns:
        bool: True if aggregation is needed, False otherwise.
    """
    low_uniqueness = uniqueness_rate < max_uniqueness_rate
    large_join = cartesian_join_number > min_cartesian_join_number
    return low_uniqueness and large_join
132
-
133
-
134
def aggregate_output(left_df: pl.LazyFrame, right_df: pl.LazyFrame,
                     fuzzy_maps: List[FuzzyMapping]) -> Tuple[pl.LazyFrame, pl.LazyFrame]:
    """
    Drop duplicate rows from both frames, judged on the fuzzy mapping columns.

    The left frame is made unique on every mapping's ``left_col`` and the right
    frame on every mapping's ``right_col``.

    Args:
        left_df (pl.LazyFrame): Left dataframe.
        right_df (pl.LazyFrame): Right dataframe.
        fuzzy_maps (List[FuzzyMapping]): Mappings naming the columns to de-duplicate on.

    Returns:
        Tuple[pl.LazyFrame, pl.LazyFrame]: De-duplicated left and right dataframes.
    """
    left_keys = []
    right_keys = []
    for mapping in fuzzy_maps:
        left_keys.append(mapping.left_col)
        right_keys.append(mapping.right_col)
    return left_df.unique(left_keys), right_df.unique(right_keys)
153
-
154
-
155
def report_on_order_of_fuzzy_maps(fuzzy_maps: List[FuzzyMapping], flowfile_logger: Logger) -> None:
    """
    Log the fuzzy mappings in the order they are given.

    One header line is written, followed by one line per mapping with its
    zero-based position, both column names and its ``perc_unique`` value.

    Args:
        fuzzy_maps (List[FuzzyMapping]): Mappings to report, in the order to be shown.
        flowfile_logger (Logger): Logger that receives the info messages.
    """
    flowfile_logger.info('Fuzzy mappings sorted by uniqueness')
    for position, mapping in enumerate(fuzzy_maps):
        message = (f'{position}. Fuzzy mapping: {mapping.left_col} -> {mapping.right_col} '
                   f'Uniqueness: {mapping.perc_unique}')
        flowfile_logger.info(message)
169
-
170
-
171
def pre_process_for_fuzzy_matching(left_df: pl.LazyFrame, right_df: pl.LazyFrame,
                                   fuzzy_maps: List[FuzzyMapping],
                                   flowfile_logger: Logger) -> Tuple[pl.LazyFrame, pl.LazyFrame, List[FuzzyMapping]]:
    """
    Prepare two frames and their fuzzy mappings for fuzzy matching.

    Steps, in order:
        1. Count the rows of both frames; if either is empty, return the inputs unchanged.
        2. Fill ``perc_unique`` on every mapping and sort the mappings by it (highest first).
        3. Log the resulting order and the summed uniqueness rate.
        4. If ``determine_need_for_aggregation`` says so, de-duplicate both frames
           on the mapping columns.

    Args:
        left_df (pl.LazyFrame): Left dataframe.
        right_df (pl.LazyFrame): Right dataframe.
        fuzzy_maps (List[FuzzyMapping]): Mappings between left and right columns.
        flowfile_logger (Logger): Logger for progress and warning messages.

    Returns:
        Tuple[pl.LazyFrame, pl.LazyFrame, List[FuzzyMapping]]:
            The (possibly de-duplicated) left frame, the (possibly de-duplicated)
            right frame, and the mappings (sorted, unless an input frame was empty).
    """
    flowfile_logger.info('Optimizing data and settings for fuzzy matching')
    n_left = calculate_df_len(left_df)
    n_right = calculate_df_len(right_df)
    if 0 in (n_left, n_right):
        # Nothing can match against an empty side, so skip all optimisation work.
        return left_df, right_df, fuzzy_maps

    scored_maps = fill_perc_unique_in_fuzzy_maps(left_df, right_df, fuzzy_maps, flowfile_logger, n_left, n_right)
    ranked_maps = determine_order_of_fuzzy_maps(scored_maps)
    report_on_order_of_fuzzy_maps(ranked_maps, flowfile_logger)

    uniqueness_rate = calculate_uniqueness_rate(ranked_maps)
    flowfile_logger.info(f'Uniqueness rate: {uniqueness_rate}')

    cartesian_size = n_left * n_right
    if determine_need_for_aggregation(uniqueness_rate, cartesian_size):
        flowfile_logger.warning('The join fields are not unique enough, resulting in many duplicates, '
                                'therefore removing duplicates on the join field')
        left_df, right_df = aggregate_output(left_df, right_df, ranked_maps)

    flowfile_logger.info('Data and settings optimized for fuzzy matching')
    return left_df, right_df, ranked_maps
@@ -1,86 +0,0 @@
1
- import polars as pl
2
- import polars_distance as pld
3
- from flowfile_worker.polars_fuzzy_match.utils import cache_polars_frame_to_temp
4
- from flowfile_worker.utils import collect_lazy_frame
5
- from flowfile_worker.polars_fuzzy_match.models import FuzzyTypeLiteral
6
-
7
-
8
def calculate_fuzzy_score(mapping_table: pl.LazyFrame, left_col_name: str, right_col_name: str,
                          fuzzy_method: FuzzyTypeLiteral, th_score: float) -> pl.LazyFrame:
    """
    Calculate fuzzy matching scores between two columns of a LazyFrame.

    Both columns are lower-cased, the pairwise distance named by ``fuzzy_method``
    is computed, rows whose distance exceeds ``th_score`` are dropped, and the
    remaining distances are converted to ``1 - distance`` in column ``'s'``.

    Args:
        mapping_table: The LazyFrame containing the columns to compare.
        left_col_name: Name of the left column for comparison.
        right_col_name: Name of the right column for comparison.
        fuzzy_method: Name of the distance method looked up on the pairwise
            string-distance helper.
        th_score: Maximum distance a row may have to be kept (applied before
            the distance is turned into ``1 - distance``).

    Returns:
        The input LazyFrame, filtered, with an added score column ``'s'``.
    """
    mapping_table = mapping_table.with_columns(pl.col(left_col_name).str.to_lowercase().alias('left'),
                                               pl.col(right_col_name).str.to_lowercase().alias('right'))
    dist_col = pld.DistancePairWiseString(pl.col('left'))
    distance_fn = getattr(dist_col, fuzzy_method)
    # The previous check was `fuzzy_method in ("jaro_winkler")`, i.e. a substring
    # test against a plain string, which matched both 'jaro_winkler' and 'jaro'.
    # The two names are spelled out here so that behaviour is explicit and unchanged.
    # NOTE(review): confirm 'jaro' is meant to be called without `normalized`.
    if fuzzy_method in ('jaro', 'jaro_winkler'):
        fm_method = distance_fn(pl.col('right')).alias('s')
    else:
        fm_method = distance_fn(pl.col('right'), normalized=True).alias('s')
    return (mapping_table.with_columns(fm_method)
            .drop(['left', 'right'])
            .filter(pl.col('s') <= th_score)
            .with_columns((1 - pl.col('s')).alias('s')))
32
-
33
-
34
def process_fuzzy_frames(left_df: pl.LazyFrame, right_df: pl.LazyFrame, left_col_name: str, right_col_name: str,
                         temp_dir_ref: str):
    """
    Reduce both frames to one row per distinct join value, cache them, and order them by size.

    Each frame is grouped on its join column, collecting ``'__left_index'`` /
    ``'__right_index'`` per group; groups whose join value is null are removed and
    the result is cached through ``cache_polars_frame_to_temp``. If the grouped
    left frame has fewer rows than the grouped right frame, the two frames and the
    two column names are returned in swapped positions.

    Args:
        left_df (pl.LazyFrame): The left data frame.
        right_df (pl.LazyFrame): The right data frame.
        left_col_name (str): Join column of the left frame.
        right_col_name (str): Join column of the right frame.
        temp_dir_ref (str): Directory handed to the caching helper.

    Returns:
        A 6-tuple ``(first_frame, second_frame, first_col_name, second_col_name,
        len_left_df, len_right_df)``. The two lengths always describe the grouped
        left and right inputs in their original order, even when the frames and
        names were swapped.
    """

    def _group_and_cache(frame: pl.LazyFrame, key_col: str, index_col: str) -> pl.LazyFrame:
        # One row per distinct key, carrying the list of row indices for that key.
        grouped = frame.group_by(key_col).agg(index_col).filter(pl.col(key_col).is_not_null())
        return cache_polars_frame_to_temp(grouped, temp_dir_ref)

    left_fuzzy_frame = _group_and_cache(left_df, left_col_name, '__left_index')
    right_fuzzy_frame = _group_and_cache(right_df, right_col_name, '__right_index')

    len_left_df = collect_lazy_frame(left_fuzzy_frame.select(pl.len()))[0, 0]
    len_right_df = collect_lazy_frame(right_fuzzy_frame.select(pl.len()))[0, 0]

    if len_left_df < len_right_df:
        # The right side is larger: hand it back in the first position.
        return right_fuzzy_frame, left_fuzzy_frame, right_col_name, left_col_name, len_left_df, len_right_df
    return left_fuzzy_frame, right_fuzzy_frame, left_col_name, right_col_name, len_left_df, len_right_df
67
-
68
-
69
def calculate_and_parse_fuzzy(mapping_table: pl.LazyFrame, left_col_name: str, right_col_name: str,
                              fuzzy_method: FuzzyTypeLiteral, th_score: float) -> pl.LazyFrame:
    """
    Score the candidate pairs and expand the grouped indices to one row per index pair.

    Args:
        mapping_table: The LazyFrame containing the columns to compare.
        left_col_name: Name of the left column for comparison.
        right_col_name: Name of the right column for comparison.
        fuzzy_method: Fuzzy matching method, forwarded to ``calculate_fuzzy_score``.
        th_score: Threshold, forwarded unchanged to ``calculate_fuzzy_score``.

    Returns:
        A LazyFrame with columns ``'s'``, ``'__left_index'`` and ``'__right_index'``,
        where both index columns have been exploded.
    """
    scored = calculate_fuzzy_score(mapping_table, left_col_name, right_col_name, fuzzy_method, th_score)
    narrowed = scored.select(pl.col('s'), pl.col('__left_index'), pl.col('__right_index'))
    left_expanded = narrowed.explode(pl.col('__left_index'))
    return left_expanded.explode(pl.col('__right_index'))
@@ -1,50 +0,0 @@
1
- import polars as pl
2
- from flowfile_worker.configs import logger
3
- from flowfile_worker.utils import collect_lazy_frame
4
- import os
5
- import uuid
6
-
7
-
8
def write_polars_frame(_df: pl.LazyFrame | pl.DataFrame, path: str,
                       estimated_size: int = 0) -> bool:
    """
    Write a Polars frame to ``path`` in IPC format, on a best-effort basis.

    A LazyFrame whose ``estimated_size`` is positive and below the in-memory
    threshold is collected first and written eagerly. Any other LazyFrame is
    streamed with ``sink_ipc`` (tried twice); if streaming fails it is collected
    and written with ``write_ipc``. Write failures are logged, not raised.

    Args:
        _df: Frame to write.
        path: Destination file path.
        estimated_size: Estimated size of the data; ``0`` means unknown.

    Returns:
        bool: True when the file was written, False when the final write failed.
    """
    is_lazy = isinstance(_df, pl.LazyFrame)
    logger.info('Caching data frame')
    if is_lazy and estimated_size > 0:
        # Threshold kept exactly as it was: size / 1024 / 1000 / 1000 must be below 8.
        fit_memory = estimated_size / 1024 / 1000 / 1000 < 8
        if fit_memory:
            _df = _df.collect()
            is_lazy = False

    if is_lazy:
        logger.info("Writing in memory efficient mode")
        # The streaming write is tried twice, as before, but each failure is now logged.
        for attempt in (1, 2):
            try:
                _df.sink_ipc(path)
                return True
            except Exception as e:
                logger.warning('sink_ipc attempt %d failed for %s: %s', attempt, path, e)
        # Streaming did not work; fall back to materialising the frame.
        _df = collect_lazy_frame(_df)

    try:
        _df.write_ipc(path)
        return True
    except Exception as e:
        # Reported through the module logger rather than print().
        logger.error('Could not write data frame to %s: %s', path, e)
        return False
41
-
42
-
43
def cache_polars_frame_to_temp(_df: pl.LazyFrame | pl.DataFrame, tempdir: str | None = None) -> pl.LazyFrame:
    """
    Write a frame to a uniquely named IPC file and return it re-read as a LazyFrame.

    Args:
        _df: Frame to cache.
        tempdir: Directory for the cache file. When ``None``, the system temporary
            directory is used instead of building a path from the text ``'None'``.

    Returns:
        pl.LazyFrame: Lazy view over the data read back from the cache file.

    Raises:
        Exception: If the frame could not be written.
    """
    if tempdir is None:
        import tempfile  # local import: only needed for the fallback directory
        tempdir = tempfile.gettempdir()
    path = os.path.join(tempdir, str(uuid.uuid4()))
    if not write_polars_frame(_df, path):
        raise Exception('Could not cache the data')
    return pl.read_ipc(path).lazy()