imsciences 0.9.6.9__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
imsciences/unittesting.py CHANGED
@@ -1,25 +1,27 @@
1
+ import os
1
2
  import unittest
2
- import pandas as pd
3
+
3
4
  import numpy as np
4
- import os
5
+ import pandas as pd
5
6
  from mmm import dataprocessing
6
7
 
8
+
7
9
  class TestDataProcessor(unittest.TestCase):
8
-
9
10
  def setUp(self):
10
11
  self.dp = dataprocessing()
11
- self.df = pd.DataFrame({
12
- 'date': pd.date_range(start='2023-01-01', periods=10, freq='D'),
13
- 'value1': range(10),
14
- 'value2': range(10, 20)
15
- })
16
- self.mixed_date_df = pd.DataFrame({
17
- 'mixed_date': ['2023-01-01', '01/02/2023', '2023/03/01', '2023-04-01']
18
- })
19
- self.merged_df = pd.DataFrame({
20
- 'col1': ["A", "B", "C"],
21
- 'col2': ["X", "Y", "Z"]
22
- })
12
+ self.df = pd.DataFrame(
13
+ {
14
+ "date": pd.date_range(start="2023-01-01", periods=10, freq="D"),
15
+ "value1": range(10),
16
+ "value2": range(10, 20),
17
+ },
18
+ )
19
+ self.mixed_date_df = pd.DataFrame(
20
+ {"mixed_date": ["2023-01-01", "01/02/2023", "2023/03/01", "2023-04-01"]},
21
+ )
22
+ self.merged_df = pd.DataFrame(
23
+ {"col1": ["A", "B", "C"], "col2": ["X", "Y", "Z"]},
24
+ )
23
25
 
24
26
  def test_get_wd_levels(self):
25
27
  current_dir = os.getcwd()
@@ -29,258 +31,345 @@ class TestDataProcessor(unittest.TestCase):
29
31
  def test_aggregate_daily_to_wc_long(self):
30
32
  # Create a test DataFrame
31
33
  test_data = {
32
- 'date': ['2023-01-01', '2023-01-02', '2023-01-08', '2023-01-09', '2023-01-10'],
33
- 'group_col': ['A', 'A', 'B', 'B', 'B'],
34
- 'value1': [10, 20, 30, 40, np.nan],
35
- 'value2': [100, 200, 300, np.nan, 500]
34
+ "date": [
35
+ "2023-01-01",
36
+ "2023-01-02",
37
+ "2023-01-08",
38
+ "2023-01-09",
39
+ "2023-01-10",
40
+ ],
41
+ "group_col": ["A", "A", "B", "B", "B"],
42
+ "value1": [10, 20, 30, 40, np.nan],
43
+ "value2": [100, 200, 300, np.nan, 500],
36
44
  }
37
45
  df = pd.DataFrame(test_data)
38
46
 
39
47
  # Expected output for different test cases
40
- expected_sum_output = pd.DataFrame({
41
- 'OBS': ['2023-01-01', '2023-01-08'], # Week starting on Sunday
42
- 'group_col': ['A', 'B'],
43
- 'value1': [30.0, 70.0],
44
- 'value2': [300.0, 800.0]
45
- })
48
+ expected_sum_output = pd.DataFrame(
49
+ {
50
+ "OBS": ["2023-01-01", "2023-01-08"], # Week starting on Sunday
51
+ "group_col": ["A", "B"],
52
+ "value1": [30.0, 70.0],
53
+ "value2": [300.0, 800.0],
54
+ },
55
+ )
46
56
 
47
57
  # Convert OBS column to datetime for expected DataFrame
48
- expected_sum_output['OBS'] = pd.to_datetime(expected_sum_output['OBS'])
58
+ expected_sum_output["OBS"] = pd.to_datetime(expected_sum_output["OBS"])
49
59
 
50
60
  # Test sum aggregation
51
- result_sum = self.dp.aggregate_daily_to_wc_long(df, 'date', ['group_col'], ['value1', 'value2'], wc='sun', aggregation='sum')
61
+ result_sum = self.dp.aggregate_daily_to_wc_long(
62
+ df,
63
+ "date",
64
+ ["group_col"],
65
+ ["value1", "value2"],
66
+ wc="sun",
67
+ aggregation="sum",
68
+ )
52
69
 
53
70
  # Ensure both OBS columns are datetime for comparison
54
- result_sum['OBS'] = pd.to_datetime(result_sum['OBS'])
71
+ result_sum["OBS"] = pd.to_datetime(result_sum["OBS"])
55
72
 
56
73
  # Compare the resulting DataFrame with the expected DataFrame
57
74
  pd.testing.assert_frame_equal(result_sum, expected_sum_output)
58
-
75
+
59
76
  def test_convert_monthly_to_daily(self):
60
77
  # Create a test DataFrame with monthly data
61
78
  test_data = {
62
- 'date': ['2023-01-01', '2023-02-01', '2023-03-01'],
63
- 'value1': [31, 28, 31],
64
- 'value2': [310, 280, 310]
79
+ "date": ["2023-01-01", "2023-02-01", "2023-03-01"],
80
+ "value1": [31, 28, 31],
81
+ "value2": [310, 280, 310],
65
82
  }
66
83
  df = pd.DataFrame(test_data)
67
84
 
68
85
  # Expected output DataFrame when divide=True
69
86
  expected_daily_data_divide = {
70
- 'date': pd.date_range(start='2023-01-01', end='2023-01-31').tolist() +
71
- pd.date_range(start='2023-02-01', end='2023-02-28').tolist() +
72
- pd.date_range(start='2023-03-01', end='2023-03-31').tolist(),
73
- 'value1': [1.0] * 31 + [1.0] * 28 + [1.0] * 31,
74
- 'value2': [10.0] * 31 + [10.0] * 28 + [10.0] * 31
87
+ "date": pd.date_range(start="2023-01-01", end="2023-01-31").tolist()
88
+ + pd.date_range(start="2023-02-01", end="2023-02-28").tolist()
89
+ + pd.date_range(start="2023-03-01", end="2023-03-31").tolist(),
90
+ "value1": [1.0] * 31 + [1.0] * 28 + [1.0] * 31,
91
+ "value2": [10.0] * 31 + [10.0] * 28 + [10.0] * 31,
75
92
  }
76
93
  expected_daily_df_divide = pd.DataFrame(expected_daily_data_divide)
77
94
 
78
95
  # Call the function with divide=True
79
- result_divide = self.dp.convert_monthly_to_daily(df, 'date', divide=True)
96
+ result_divide = self.dp.convert_monthly_to_daily(df, "date", divide=True)
80
97
 
81
98
  # Compare the resulting DataFrame with the expected DataFrame
82
- pd.testing.assert_frame_equal(result_divide.reset_index(drop=True), expected_daily_df_divide)
99
+ pd.testing.assert_frame_equal(
100
+ result_divide.reset_index(drop=True),
101
+ expected_daily_df_divide,
102
+ )
83
103
 
84
104
  # Expected output DataFrame when divide=False
85
105
  expected_daily_data_no_divide = {
86
- 'date': pd.date_range(start='2023-01-01', end='2023-01-31').tolist() +
87
- pd.date_range(start='2023-02-01', end='2023-02-28').tolist() +
88
- pd.date_range(start='2023-03-01', end='2023-03-31').tolist(),
89
- 'value1': [31] * 31 + [28] * 28 + [31] * 31,
90
- 'value2': [310] * 31 + [280] * 28 + [310] * 31
106
+ "date": pd.date_range(start="2023-01-01", end="2023-01-31").tolist()
107
+ + pd.date_range(start="2023-02-01", end="2023-02-28").tolist()
108
+ + pd.date_range(start="2023-03-01", end="2023-03-31").tolist(),
109
+ "value1": [31] * 31 + [28] * 28 + [31] * 31,
110
+ "value2": [310] * 31 + [280] * 28 + [310] * 31,
91
111
  }
92
112
  expected_daily_df_no_divide = pd.DataFrame(expected_daily_data_no_divide)
93
113
 
94
114
  # Call the function with divide=False
95
- result_no_divide = self.dp.convert_monthly_to_daily(df, 'date', divide=False)
115
+ result_no_divide = self.dp.convert_monthly_to_daily(df, "date", divide=False)
96
116
 
97
117
  # Compare the resulting DataFrame with the expected DataFrame
98
- pd.testing.assert_frame_equal(result_no_divide.reset_index(drop=True), expected_daily_df_no_divide)
118
+ pd.testing.assert_frame_equal(
119
+ result_no_divide.reset_index(drop=True),
120
+ expected_daily_df_no_divide,
121
+ )
99
122
 
100
123
  def test_week_of_year_mapping(self):
101
124
  # Create a test DataFrame with ISO week format
102
- test_data = {
103
- 'week_col': ['2023-W01', '2023-W05', '2023-W10', '2023-W52']
104
- }
125
+ test_data = {"week_col": ["2023-W01", "2023-W05", "2023-W10", "2023-W52"]}
105
126
  df = pd.DataFrame(test_data)
106
127
 
107
128
  # Expected outputs for different start days
108
- expected_output_mon = pd.DataFrame({
109
- 'week_col': ['2023-W01', '2023-W05', '2023-W10', '2023-W52'],
110
- 'OBS': ['02/01/2023', '30/01/2023', '06/03/2023', '25/12/2023']
111
- })
129
+ expected_output_mon = pd.DataFrame(
130
+ {
131
+ "week_col": ["2023-W01", "2023-W05", "2023-W10", "2023-W52"],
132
+ "OBS": ["02/01/2023", "30/01/2023", "06/03/2023", "25/12/2023"],
133
+ },
134
+ )
112
135
 
113
- expected_output_sun = pd.DataFrame({
114
- 'week_col': ['2023-W01', '2023-W05', '2023-W10', '2023-W52'],
115
- 'OBS': ['01/01/2023', '29/01/2023', '05/03/2023', '24/12/2023']
116
- })
136
+ expected_output_sun = pd.DataFrame(
137
+ {
138
+ "week_col": ["2023-W01", "2023-W05", "2023-W10", "2023-W52"],
139
+ "OBS": ["01/01/2023", "29/01/2023", "05/03/2023", "24/12/2023"],
140
+ },
141
+ )
117
142
 
118
143
  # Test mapping with Monday as start day
119
- result_mon = self.dp.week_of_year_mapping(df.copy(), 'week_col', 'mon')
144
+ result_mon = self.dp.week_of_year_mapping(df.copy(), "week_col", "mon")
120
145
  pd.testing.assert_frame_equal(result_mon, expected_output_mon)
121
146
 
122
147
  # Test mapping with Sunday as start day
123
- result_sun = self.dp.week_of_year_mapping(df.copy(), 'week_col', 'sun')
148
+ result_sun = self.dp.week_of_year_mapping(df.copy(), "week_col", "sun")
124
149
  pd.testing.assert_frame_equal(result_sun, expected_output_sun)
125
150
 
126
151
  # Test with invalid start day input
127
152
  with self.assertRaises(ValueError) as context:
128
- self.dp.week_of_year_mapping(df.copy(), 'week_col', 'invalid_day')
153
+ self.dp.week_of_year_mapping(df.copy(), "week_col", "invalid_day")
129
154
  self.assertIn("Invalid day input", str(context.exception))
130
155
 
131
156
  def test_rename_cols(self):
132
157
  # Create a test DataFrame
133
158
  test_data = {
134
- 'OBS': [1, 2, 3],
135
- 'Column One': [10, 20, 30],
136
- 'Another Column': [100, 200, 300],
137
- 'Special Characters !@#': [5, 15, 25]
159
+ "OBS": [1, 2, 3],
160
+ "Column One": [10, 20, 30],
161
+ "Another Column": [100, 200, 300],
162
+ "Special Characters !@#": [5, 15, 25],
138
163
  }
139
164
  df = pd.DataFrame(test_data)
140
165
 
141
166
  # Expected output with default prefix
142
- expected_output_default = pd.DataFrame({
143
- 'OBS': [1, 2, 3],
144
- 'ame_column_one': [10, 20, 30],
145
- 'ame_another_column': [100, 200, 300],
146
- 'ame_special_characters_!@#': [5, 15, 25]
147
- })
167
+ expected_output_default = pd.DataFrame(
168
+ {
169
+ "OBS": [1, 2, 3],
170
+ "ame_column_one": [10, 20, 30],
171
+ "ame_another_column": [100, 200, 300],
172
+ "ame_special_characters_!@#": [5, 15, 25],
173
+ },
174
+ )
148
175
 
149
176
  # Expected output with custom prefix
150
- expected_output_custom = pd.DataFrame({
151
- 'OBS': [1, 2, 3],
152
- 'custom_column_one': [10, 20, 30],
153
- 'custom_another_column': [100, 200, 300],
154
- 'custom_special_characters_!@#': [5, 15, 25]
155
- })
177
+ expected_output_custom = pd.DataFrame(
178
+ {
179
+ "OBS": [1, 2, 3],
180
+ "custom_column_one": [10, 20, 30],
181
+ "custom_another_column": [100, 200, 300],
182
+ "custom_special_characters_!@#": [5, 15, 25],
183
+ },
184
+ )
156
185
 
157
186
  # Test renaming columns with default prefix
158
187
  result_default = self.dp.rename_cols(df)
159
188
  pd.testing.assert_frame_equal(result_default, expected_output_default)
160
189
 
161
190
  # Test renaming columns with custom prefix
162
- result_custom = self.dp.rename_cols(df, name='custom_')
191
+ result_custom = self.dp.rename_cols(df, name="custom_")
163
192
  pd.testing.assert_frame_equal(result_custom, expected_output_custom)
164
193
 
165
194
  # Test that 'OBS' column remains unchanged
166
- self.assertIn('OBS', result_default.columns)
167
- self.assertIn('OBS', result_custom.columns)
195
+ self.assertIn("OBS", result_default.columns)
196
+ self.assertIn("OBS", result_custom.columns)
168
197
 
169
198
  def test_merge_new_and_old(self):
170
199
  # Create test DataFrames for old and new data
171
200
  old_data = {
172
- 'OBS': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04'],
173
- 'old_values': [10, 20, 30, 40]
201
+ "OBS": ["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04"],
202
+ "old_values": [10, 20, 30, 40],
174
203
  }
175
204
  new_data = {
176
- 'OBS': ['2023-01-04', '2023-01-05', '2023-01-06'],
177
- 'new_values': [100, 200, 300]
205
+ "OBS": ["2023-01-04", "2023-01-05", "2023-01-06"],
206
+ "new_values": [100, 200, 300],
178
207
  }
179
208
  old_df = pd.DataFrame(old_data)
180
209
  new_df = pd.DataFrame(new_data)
181
210
 
182
211
  # Expected output
183
- expected_output = pd.DataFrame({
184
- 'OBS': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06']),
185
- 'new_values': [10, 20, 30, 40, 200, 300]
186
- })
212
+ expected_output = pd.DataFrame(
213
+ {
214
+ "OBS": pd.to_datetime(
215
+ [
216
+ "2023-01-01",
217
+ "2023-01-02",
218
+ "2023-01-03",
219
+ "2023-01-04",
220
+ "2023-01-05",
221
+ "2023-01-06",
222
+ ],
223
+ ),
224
+ "new_values": [10, 20, 30, 40, 200, 300],
225
+ },
226
+ )
187
227
 
188
228
  # Test merging with cutoff_date='2023-01-04'
189
- result = self.dp.merge_new_and_old(old_df, 'old_values', new_df, 'new_values', '2023-01-04')
229
+ result = self.dp.merge_new_and_old(
230
+ old_df,
231
+ "old_values",
232
+ new_df,
233
+ "new_values",
234
+ "2023-01-04",
235
+ )
190
236
 
191
237
  # Assertions
192
238
  pd.testing.assert_frame_equal(result, expected_output)
193
239
 
194
240
  # Test that columns are correctly renamed and sorted
195
- self.assertIn('OBS', result.columns)
196
- self.assertIn('new_values', result.columns)
241
+ self.assertIn("OBS", result.columns)
242
+ self.assertIn("new_values", result.columns)
197
243
  self.assertEqual(len(result), len(expected_output)) # Ensure row count matches
198
- self.assertTrue((result['OBS'].diff().dropna() >= pd.Timedelta(0)).all()) # Check that dates are in order
244
+ self.assertTrue(
245
+ (result["OBS"].diff().dropna() >= pd.Timedelta(0)).all(),
246
+ ) # Check that dates are in order
199
247
 
200
248
  def test_merge_dataframes_on_column(self):
201
249
  # Create test DataFrames
202
- df1 = pd.DataFrame({
203
- 'OBS': ['2023-01-01', '2023-01-02', '2023-01-03'],
204
- 'value1': [10, 20, 30]
205
- })
206
- df2 = pd.DataFrame({
207
- 'OBS': ['2023-01-02', '2023-01-03', '2023-01-04'],
208
- 'value2': [40, 50, 60]
209
- })
210
- df3 = pd.DataFrame({
211
- 'OBS': ['2023-01-03', '2023-01-04', '2023-01-05'],
212
- 'value3': [70, 80, 90]
213
- })
250
+ df1 = pd.DataFrame(
251
+ {"OBS": ["2023-01-01", "2023-01-02", "2023-01-03"], "value1": [10, 20, 30]},
252
+ )
253
+ df2 = pd.DataFrame(
254
+ {"OBS": ["2023-01-02", "2023-01-03", "2023-01-04"], "value2": [40, 50, 60]},
255
+ )
256
+ df3 = pd.DataFrame(
257
+ {"OBS": ["2023-01-03", "2023-01-04", "2023-01-05"], "value3": [70, 80, 90]},
258
+ )
214
259
 
215
260
  # Ensure test DataFrame columns are datetime
216
- df1['OBS'] = pd.to_datetime(df1['OBS'])
217
- df2['OBS'] = pd.to_datetime(df2['OBS'])
218
- df3['OBS'] = pd.to_datetime(df3['OBS'])
261
+ df1["OBS"] = pd.to_datetime(df1["OBS"])
262
+ df2["OBS"] = pd.to_datetime(df2["OBS"])
263
+ df3["OBS"] = pd.to_datetime(df3["OBS"])
219
264
 
220
265
  # Expected output for outer merge (cast to float64 to match the behavior of fillna)
221
- expected_output_outer = pd.DataFrame({
222
- 'OBS': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05']),
223
- 'value1': [10.0, 20.0, 30.0, 0.0, 0.0],
224
- 'value2': [0.0, 40.0, 50.0, 60.0, 0.0],
225
- 'value3': [0.0, 0.0, 70.0, 80.0, 90.0]
226
- })
266
+ expected_output_outer = pd.DataFrame(
267
+ {
268
+ "OBS": pd.to_datetime(
269
+ [
270
+ "2023-01-01",
271
+ "2023-01-02",
272
+ "2023-01-03",
273
+ "2023-01-04",
274
+ "2023-01-05",
275
+ ],
276
+ ),
277
+ "value1": [10.0, 20.0, 30.0, 0.0, 0.0],
278
+ "value2": [0.0, 40.0, 50.0, 60.0, 0.0],
279
+ "value3": [0.0, 0.0, 70.0, 80.0, 90.0],
280
+ },
281
+ )
227
282
 
228
283
  # Expected output for inner merge
229
- expected_output_inner = pd.DataFrame({
230
- 'OBS': pd.to_datetime(['2023-01-03']),
231
- 'value1': [30],
232
- 'value2': [50],
233
- 'value3': [70]
234
- })
284
+ expected_output_inner = pd.DataFrame(
285
+ {
286
+ "OBS": pd.to_datetime(["2023-01-03"]),
287
+ "value1": [30],
288
+ "value2": [50],
289
+ "value3": [70],
290
+ },
291
+ )
235
292
 
236
293
  # Test outer merge
237
- result_outer = self.dp.merge_dataframes_on_column([df1, df2, df3], common_column='OBS', merge_how='outer')
238
- pd.testing.assert_frame_equal(result_outer.reset_index(drop=True), expected_output_outer)
294
+ result_outer = self.dp.merge_dataframes_on_column(
295
+ [df1, df2, df3],
296
+ common_column="OBS",
297
+ merge_how="outer",
298
+ )
299
+ pd.testing.assert_frame_equal(
300
+ result_outer.reset_index(drop=True),
301
+ expected_output_outer,
302
+ )
239
303
 
240
304
  # Test inner merge
241
- result_inner = self.dp.merge_dataframes_on_column([df1, df2, df3], common_column='OBS', merge_how='inner')
242
- pd.testing.assert_frame_equal(result_inner.reset_index(drop=True), expected_output_inner)
305
+ result_inner = self.dp.merge_dataframes_on_column(
306
+ [df1, df2, df3],
307
+ common_column="OBS",
308
+ merge_how="inner",
309
+ )
310
+ pd.testing.assert_frame_equal(
311
+ result_inner.reset_index(drop=True),
312
+ expected_output_inner,
313
+ )
243
314
 
244
315
  # Test with empty DataFrame list
245
- result_empty = self.dp.merge_dataframes_on_column([], common_column='OBS', merge_how='outer')
316
+ result_empty = self.dp.merge_dataframes_on_column(
317
+ [],
318
+ common_column="OBS",
319
+ merge_how="outer",
320
+ )
246
321
  self.assertIsNone(result_empty)
247
322
 
248
323
  # Test with one DataFrame in the list
249
- result_single = self.dp.merge_dataframes_on_column([df1], common_column='OBS', merge_how='outer')
324
+ result_single = self.dp.merge_dataframes_on_column(
325
+ [df1],
326
+ common_column="OBS",
327
+ merge_how="outer",
328
+ )
250
329
  pd.testing.assert_frame_equal(result_single.reset_index(drop=True), df1)
251
330
 
252
331
  # Test that the common column is sorted and converted to datetime
253
- self.assertTrue(pd.api.types.is_datetime64_any_dtype(result_outer['OBS']))
254
- self.assertTrue((result_outer['OBS'].diff().dropna() >= pd.Timedelta(0)).all()) # Check sorted dates
332
+ self.assertTrue(pd.api.types.is_datetime64_any_dtype(result_outer["OBS"]))
333
+ self.assertTrue(
334
+ (result_outer["OBS"].diff().dropna() >= pd.Timedelta(0)).all(),
335
+ ) # Check sorted dates
255
336
 
256
337
  def test_merge_and_update_dfs(self):
257
338
  # Create test DataFrames
258
- df1 = pd.DataFrame({
259
- 'OBS': ['2023-01-01', '2023-01-02', '2023-01-03'],
260
- 'value1': [10, 20, 30],
261
- 'value2': [100, 200, 300]
262
- })
263
-
264
- df2 = pd.DataFrame({
265
- 'OBS': ['2023-01-02', '2023-01-03', '2023-01-04'],
266
- 'value1': [15, 25, 35], # Updates for value1
267
- 'value3': [400, 500, 600] # New column
268
- })
339
+ df1 = pd.DataFrame(
340
+ {
341
+ "OBS": ["2023-01-01", "2023-01-02", "2023-01-03"],
342
+ "value1": [10, 20, 30],
343
+ "value2": [100, 200, 300],
344
+ },
345
+ )
346
+
347
+ df2 = pd.DataFrame(
348
+ {
349
+ "OBS": ["2023-01-02", "2023-01-03", "2023-01-04"],
350
+ "value1": [15, 25, 35], # Updates for value1
351
+ "value3": [400, 500, 600], # New column
352
+ },
353
+ )
269
354
 
270
355
  # Ensure test DataFrame columns are datetime
271
- df1['OBS'] = pd.to_datetime(df1['OBS'])
272
- df2['OBS'] = pd.to_datetime(df2['OBS'])
356
+ df1["OBS"] = pd.to_datetime(df1["OBS"])
357
+ df2["OBS"] = pd.to_datetime(df2["OBS"])
273
358
 
274
359
  # Expected output with float64 for numeric columns
275
- expected_output = pd.DataFrame({
276
- 'OBS': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04']),
277
- 'value1': [10.0, 15.0, 25.0, 35.0], # Updated where applicable
278
- 'value2': [100.0, 200.0, 300.0, 0.0], # From df1, 0 where not available
279
- 'value3': [0.0, 400.0, 500.0, 600.0] # From df2, 0 where not available
280
- })
360
+ expected_output = pd.DataFrame(
361
+ {
362
+ "OBS": pd.to_datetime(
363
+ ["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04"],
364
+ ),
365
+ "value1": [10.0, 15.0, 25.0, 35.0], # Updated where applicable
366
+ "value2": [100.0, 200.0, 300.0, 0.0], # From df1, 0 where not available
367
+ "value3": [0.0, 400.0, 500.0, 600.0], # From df2, 0 where not available
368
+ },
369
+ )
281
370
 
282
371
  # Test the merge and update function
283
- result = self.dp.merge_and_update_dfs(df1, df2, key_column='OBS')
372
+ result = self.dp.merge_and_update_dfs(df1, df2, key_column="OBS")
284
373
 
285
374
  # Assertions
286
375
  pd.testing.assert_frame_equal(result.reset_index(drop=True), expected_output)
@@ -289,187 +378,256 @@ class TestDataProcessor(unittest.TestCase):
289
378
  self.assertListEqual(list(result.columns), list(expected_output.columns))
290
379
 
291
380
  # Test that the OBS column is sorted
292
- self.assertTrue((result['OBS'].diff().dropna() >= pd.Timedelta(0)).all())
381
+ self.assertTrue((result["OBS"].diff().dropna() >= pd.Timedelta(0)).all())
293
382
 
294
383
  def test_convert_us_to_uk_dates(self):
295
384
  # Create a test DataFrame
296
385
  test_data = {
297
- 'date_col': ['01-02-2023', '03/04/2023', '05-06-2023', '07/08/2023']
386
+ "date_col": ["01-02-2023", "03/04/2023", "05-06-2023", "07/08/2023"],
298
387
  }
299
388
  df = pd.DataFrame(test_data)
300
389
 
301
390
  # Expected output
302
- expected_output = pd.DataFrame({
303
- 'date_col': pd.to_datetime(['2023-01-02', '2023-03-04', '2023-05-06', '2023-07-08'])
304
- })
391
+ expected_output = pd.DataFrame(
392
+ {
393
+ "date_col": pd.to_datetime(
394
+ ["2023-01-02", "2023-03-04", "2023-05-06", "2023-07-08"],
395
+ ),
396
+ },
397
+ )
305
398
 
306
399
  # Test the conversion function
307
- result = self.dp.convert_us_to_uk_dates(df.copy(), 'date_col')
400
+ result = self.dp.convert_us_to_uk_dates(df.copy(), "date_col")
308
401
 
309
402
  # Assertions
310
403
  pd.testing.assert_frame_equal(result, expected_output)
311
404
 
312
405
  # Test invalid input formats
313
- invalid_data = pd.DataFrame({'date_col': ['invalid-date', '12345']})
406
+ invalid_data = pd.DataFrame({"date_col": ["invalid-date", "12345"]})
314
407
  with self.assertRaises(ValueError):
315
- self.dp.convert_us_to_uk_dates(invalid_data.copy(), 'date_col')
408
+ self.dp.convert_us_to_uk_dates(invalid_data.copy(), "date_col")
316
409
 
317
410
  # Test missing values
318
- missing_data = pd.DataFrame({'date_col': [None, '03/04/2023']})
319
- result_with_missing = self.dp.convert_us_to_uk_dates(missing_data.copy(), 'date_col')
320
- expected_with_missing = pd.DataFrame({
321
- 'date_col': [pd.NaT, pd.to_datetime('2023-03-04')]
322
- })
411
+ missing_data = pd.DataFrame({"date_col": [None, "03/04/2023"]})
412
+ result_with_missing = self.dp.convert_us_to_uk_dates(
413
+ missing_data.copy(),
414
+ "date_col",
415
+ )
416
+ expected_with_missing = pd.DataFrame(
417
+ {"date_col": [pd.NaT, pd.to_datetime("2023-03-04")]},
418
+ )
323
419
  pd.testing.assert_frame_equal(result_with_missing, expected_with_missing)
324
420
 
325
421
  def test_pivot_table(self):
326
422
  # Create a test DataFrame
327
423
  test_data = {
328
- 'date': ['2023-01-01', '2023-01-01', '2023-01-02', '2023-01-02', '2023-01-03'],
329
- 'category': ['A', 'B', 'A', 'B', 'A'],
330
- 'value': [10.0, 20.0, 30.0, 40.0, 50.0]
424
+ "date": [
425
+ "2023-01-01",
426
+ "2023-01-01",
427
+ "2023-01-02",
428
+ "2023-01-02",
429
+ "2023-01-03",
430
+ ],
431
+ "category": ["A", "B", "A", "B", "A"],
432
+ "value": [10.0, 20.0, 30.0, 40.0, 50.0],
331
433
  }
332
434
  df = pd.DataFrame(test_data)
333
435
 
334
436
  # Ensure the 'date' column is in datetime format
335
- df['date'] = pd.to_datetime(df['date'])
437
+ df["date"] = pd.to_datetime(df["date"])
336
438
 
337
439
  # Expected output for basic pivot table
338
- expected_output_basic = pd.DataFrame({
339
- 'date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03']),
340
- 'A': [10.0, 30.0, 50.0], # Cast to float64
341
- 'B': [20.0, 40.0, 0.0] # Cast to float64
342
- })
343
- expected_output_basic.columns.name = 'category'
440
+ expected_output_basic = pd.DataFrame(
441
+ {
442
+ "date": pd.to_datetime(["2023-01-01", "2023-01-02", "2023-01-03"]),
443
+ "A": [10.0, 30.0, 50.0], # Cast to float64
444
+ "B": [20.0, 40.0, 0.0], # Cast to float64
445
+ },
446
+ )
447
+ expected_output_basic.columns.name = "category"
344
448
 
345
449
  # Test basic pivot table
346
- result_basic = self.dp.pivot_table(df.copy(), index_col='date', columns='category', values_col='value', margins=False, fill_value=0)
450
+ result_basic = self.dp.pivot_table(
451
+ df.copy(),
452
+ index_col="date",
453
+ columns="category",
454
+ values_col="value",
455
+ margins=False,
456
+ fill_value=0,
457
+ )
347
458
 
348
459
  # Convert 'date' columns in both DataFrames to datetime for comparison
349
- result_basic['date'] = pd.to_datetime(result_basic['date'])
350
- expected_output_basic['date'] = pd.to_datetime(expected_output_basic['date'])
460
+ result_basic["date"] = pd.to_datetime(result_basic["date"])
461
+ expected_output_basic["date"] = pd.to_datetime(expected_output_basic["date"])
351
462
  pd.testing.assert_frame_equal(result_basic, expected_output_basic)
352
463
 
353
464
  # Expected output for pivot table with margins
354
- expected_output_with_margins = pd.DataFrame({
355
- 'date': ['2023-01-01', '2023-01-02', '2023-01-03', 'Total'],
356
- 'A': [10.0, 30.0, 50.0, 90.0],
357
- 'B': [20.0, 40.0, 0.0, 60.0],
358
- 'Total': [30.0, 70.0, 50.0, 150.0]
359
- })
360
- expected_output_with_margins['date'] = pd.to_datetime(
361
- expected_output_with_margins['date'], errors='coerce'
362
- ).fillna('Total')
363
- expected_output_with_margins.columns.name = 'category'
465
+ expected_output_with_margins = pd.DataFrame(
466
+ {
467
+ "date": ["2023-01-01", "2023-01-02", "2023-01-03", "Total"],
468
+ "A": [10.0, 30.0, 50.0, 90.0],
469
+ "B": [20.0, 40.0, 0.0, 60.0],
470
+ "Total": [30.0, 70.0, 50.0, 150.0],
471
+ },
472
+ )
473
+ expected_output_with_margins["date"] = pd.to_datetime(
474
+ expected_output_with_margins["date"],
475
+ errors="coerce",
476
+ ).fillna("Total")
477
+ expected_output_with_margins.columns.name = "category"
364
478
 
365
479
  # Test pivot table with margins
366
- result_with_margins = self.dp.pivot_table(df.copy(), index_col='date', columns='category', values_col='value', margins=True, fill_value=0)
367
- result_with_margins['date'] = pd.to_datetime(result_with_margins['date'], errors='coerce').fillna('Total')
480
+ result_with_margins = self.dp.pivot_table(
481
+ df.copy(),
482
+ index_col="date",
483
+ columns="category",
484
+ values_col="value",
485
+ margins=True,
486
+ fill_value=0,
487
+ )
488
+ result_with_margins["date"] = pd.to_datetime(
489
+ result_with_margins["date"],
490
+ errors="coerce",
491
+ ).fillna("Total")
368
492
  pd.testing.assert_frame_equal(result_with_margins, expected_output_with_margins)
369
493
 
370
494
  def test_apply_lookup_table_for_columns(self):
371
495
  # Create a test DataFrame
372
496
  test_data = {
373
- 'col1': ['apple', 'banana', 'carrot', 'date', 'eggplant'],
374
- 'col2': ['fruit', 'fruit', 'vegetable', 'fruit', 'vegetable']
497
+ "col1": ["apple", "banana", "carrot", "date", "eggplant"],
498
+ "col2": ["fruit", "fruit", "vegetable", "fruit", "vegetable"],
375
499
  }
376
500
  df = pd.DataFrame(test_data)
377
501
 
378
502
  # Lookup dictionary
379
503
  lookup_dict = {
380
- 'apple': 'Red Fruit',
381
- 'banana': 'Yellow Fruit',
382
- 'carrot': 'Orange Vegetable',
383
- 'date': 'Brown Fruit'
504
+ "apple": "Red Fruit",
505
+ "banana": "Yellow Fruit",
506
+ "carrot": "Orange Vegetable",
507
+ "date": "Brown Fruit",
384
508
  }
385
509
 
386
510
  # Expected output with single column lookup
387
511
  expected_output_single = df.copy()
388
- expected_output_single['Mapping'] = ['Red Fruit', 'Yellow Fruit', 'Orange Vegetable', 'Brown Fruit', 'Other']
512
+ expected_output_single["Mapping"] = [
513
+ "Red Fruit",
514
+ "Yellow Fruit",
515
+ "Orange Vegetable",
516
+ "Brown Fruit",
517
+ "Other",
518
+ ]
389
519
 
390
520
  # Test with a single column
391
- result_single = self.dp.apply_lookup_table_for_columns(df.copy(), col_names=['col1'], to_find_dict=lookup_dict)
521
+ result_single = self.dp.apply_lookup_table_for_columns(
522
+ df.copy(),
523
+ col_names=["col1"],
524
+ to_find_dict=lookup_dict,
525
+ )
392
526
  pd.testing.assert_frame_equal(result_single, expected_output_single)
393
527
 
394
528
  # Expected output with multiple column lookup
395
529
  expected_output_multiple = df.copy()
396
- expected_output_multiple['Mapping'] = ['Other', 'Other', 'Other', 'Brown Fruit', 'Other']
530
+ expected_output_multiple["Mapping"] = [
531
+ "Other",
532
+ "Other",
533
+ "Other",
534
+ "Brown Fruit",
535
+ "Other",
536
+ ]
397
537
 
398
538
  # Update lookup dictionary to match merged keys
399
- lookup_dict_merged = {
400
- 'date|fruit': 'Brown Fruit'
401
- }
539
+ lookup_dict_merged = {"date|fruit": "Brown Fruit"}
402
540
 
403
541
  # Test with multiple columns
404
- result_multiple = self.dp.apply_lookup_table_for_columns(df.copy(), col_names=['col1', 'col2'], to_find_dict=lookup_dict_merged)
542
+ result_multiple = self.dp.apply_lookup_table_for_columns(
543
+ df.copy(),
544
+ col_names=["col1", "col2"],
545
+ to_find_dict=lookup_dict_merged,
546
+ )
405
547
  pd.testing.assert_frame_equal(result_multiple, expected_output_multiple)
406
548
 
407
549
  # Test case where no match is found
408
- df_no_match = pd.DataFrame({'col1': ['unknown']})
550
+ df_no_match = pd.DataFrame({"col1": ["unknown"]})
409
551
  expected_no_match = df_no_match.copy()
410
- expected_no_match['Mapping'] = ['Other']
411
- result_no_match = self.dp.apply_lookup_table_for_columns(df_no_match, col_names=['col1'], to_find_dict=lookup_dict)
552
+ expected_no_match["Mapping"] = ["Other"]
553
+ result_no_match = self.dp.apply_lookup_table_for_columns(
554
+ df_no_match,
555
+ col_names=["col1"],
556
+ to_find_dict=lookup_dict,
557
+ )
412
558
  pd.testing.assert_frame_equal(result_no_match, expected_no_match)
413
559
 
414
560
  def test_aggregate_daily_to_wc_wide(self):
415
561
  # Create a test DataFrame
416
562
  test_data = {
417
- 'date': ['2023-01-01', '2023-01-02', '2023-01-08', '2023-01-09', '2023-01-10'],
418
- 'group': ['A', 'A', 'B', 'B', 'B'],
419
- 'value1': [10, 20, 30, 40, None],
420
- 'value2': [100, 200, 300, None, 500]
563
+ "date": [
564
+ "2023-01-01",
565
+ "2023-01-02",
566
+ "2023-01-08",
567
+ "2023-01-09",
568
+ "2023-01-10",
569
+ ],
570
+ "group": ["A", "A", "B", "B", "B"],
571
+ "value1": [10, 20, 30, 40, None],
572
+ "value2": [100, 200, 300, None, 500],
421
573
  }
422
574
  df = pd.DataFrame(test_data)
423
575
 
424
576
  # Expected output for weekly aggregation in wide format
425
- expected_output = pd.DataFrame({
426
- 'OBS': ['2023-01-01', '2023-01-08'], # Weeks starting on Sunday
427
- 'value1_A': [30.0, 0.0],
428
- 'value1_B': [0.0, 70.0],
429
- 'value2_A': [300.0, 0.0],
430
- 'value2_B': [0.0, 800.0],
431
- 'Total value1': [30.0, 70.0],
432
- 'Total value2': [300.0, 800.0]
433
- })
577
+ expected_output = pd.DataFrame(
578
+ {
579
+ "OBS": ["2023-01-01", "2023-01-08"], # Weeks starting on Sunday
580
+ "value1_A": [30.0, 0.0],
581
+ "value1_B": [0.0, 70.0],
582
+ "value2_A": [300.0, 0.0],
583
+ "value2_B": [0.0, 800.0],
584
+ "Total value1": [30.0, 70.0],
585
+ "Total value2": [300.0, 800.0],
586
+ },
587
+ )
434
588
 
435
589
  # Test aggregation with totals included
436
590
  result = self.dp.aggregate_daily_to_wc_wide(
437
591
  df=df.copy(),
438
- date_column='date',
439
- group_columns=['group'],
440
- sum_columns=['value1', 'value2'],
441
- wc='sun',
442
- aggregation='sum',
443
- include_totals=True
592
+ date_column="date",
593
+ group_columns=["group"],
594
+ sum_columns=["value1", "value2"],
595
+ wc="sun",
596
+ aggregation="sum",
597
+ include_totals=True,
444
598
  )
445
599
 
446
600
  # Ensure 'OBS' columns are datetime for comparison
447
- result['OBS'] = pd.to_datetime(result['OBS'])
448
- expected_output['OBS'] = pd.to_datetime(expected_output['OBS'])
601
+ result["OBS"] = pd.to_datetime(result["OBS"])
602
+ expected_output["OBS"] = pd.to_datetime(expected_output["OBS"])
449
603
 
450
604
  # Compare the resulting DataFrame with the expected DataFrame
451
605
  pd.testing.assert_frame_equal(result, expected_output)
452
606
 
453
607
  # Test without group columns (no totals, single wide column)
454
- expected_output_no_group = pd.DataFrame({
455
- 'OBS': ['2023-01-01', '2023-01-08'],
456
- 'value1': [30.0, 70.0],
457
- 'value2': [300.0, 800.0]
458
- })
608
+ expected_output_no_group = pd.DataFrame(
609
+ {
610
+ "OBS": ["2023-01-01", "2023-01-08"],
611
+ "value1": [30.0, 70.0],
612
+ "value2": [300.0, 800.0],
613
+ },
614
+ )
459
615
 
460
616
  result_no_group = self.dp.aggregate_daily_to_wc_wide(
461
617
  df=df.copy(),
462
- date_column='date',
618
+ date_column="date",
463
619
  group_columns=[],
464
- sum_columns=['value1', 'value2'],
465
- wc='sun',
466
- aggregation='sum',
467
- include_totals=False
620
+ sum_columns=["value1", "value2"],
621
+ wc="sun",
622
+ aggregation="sum",
623
+ include_totals=False,
468
624
  )
469
625
 
470
626
  # Ensure 'OBS' columns are datetime for comparison
471
- result_no_group['OBS'] = pd.to_datetime(result_no_group['OBS'])
472
- expected_output_no_group['OBS'] = pd.to_datetime(expected_output_no_group['OBS'])
627
+ result_no_group["OBS"] = pd.to_datetime(result_no_group["OBS"])
628
+ expected_output_no_group["OBS"] = pd.to_datetime(
629
+ expected_output_no_group["OBS"],
630
+ )
473
631
 
474
632
  # Compare the resulting DataFrame with the expected DataFrame
475
633
  pd.testing.assert_frame_equal(result_no_group, expected_output_no_group)
@@ -477,200 +635,233 @@ class TestDataProcessor(unittest.TestCase):
477
635
  def test_merge_cols_with_seperator(self):
478
636
  # Create a test DataFrame
479
637
  test_data = {
480
- 'col1': ['apple', 'banana', 'cherry'],
481
- 'col2': ['red', 'yellow', 'red'],
482
- 'col3': ['fruit', 'fruit', 'fruit']
638
+ "col1": ["apple", "banana", "cherry"],
639
+ "col2": ["red", "yellow", "red"],
640
+ "col3": ["fruit", "fruit", "fruit"],
483
641
  }
484
642
  df = pd.DataFrame(test_data)
485
643
 
486
644
  # Test merging two columns with default separator
487
645
  expected_output_default = df.copy()
488
- expected_output_default['Merged'] = ['apple_red', 'banana_yellow', 'cherry_red']
646
+ expected_output_default["Merged"] = ["apple_red", "banana_yellow", "cherry_red"]
489
647
 
490
- result_default = self.dp.merge_cols_with_seperator(df.copy(), col_names=['col1', 'col2'])
648
+ result_default = self.dp.merge_cols_with_seperator(
649
+ df.copy(),
650
+ col_names=["col1", "col2"],
651
+ )
491
652
  pd.testing.assert_frame_equal(result_default, expected_output_default)
492
653
 
493
654
  # Test merging three columns with custom separator
494
655
  expected_output_custom = df.copy()
495
- expected_output_custom['Merged'] = ['apple-red-fruit', 'banana-yellow-fruit', 'cherry-red-fruit']
656
+ expected_output_custom["Merged"] = [
657
+ "apple-red-fruit",
658
+ "banana-yellow-fruit",
659
+ "cherry-red-fruit",
660
+ ]
496
661
 
497
- result_custom = self.dp.merge_cols_with_seperator(df.copy(), col_names=['col1', 'col2', 'col3'], seperator='-')
662
+ result_custom = self.dp.merge_cols_with_seperator(
663
+ df.copy(),
664
+ col_names=["col1", "col2", "col3"],
665
+ seperator="-",
666
+ )
498
667
  pd.testing.assert_frame_equal(result_custom, expected_output_custom)
499
668
 
500
669
  # Test merging with starting and ending prefix
501
670
  expected_output_prefix = df.copy()
502
- expected_output_prefix['Merged'] = ['Start:apple_red:End', 'Start:banana_yellow:End', 'Start:cherry_red:End']
671
+ expected_output_prefix["Merged"] = [
672
+ "Start:apple_red:End",
673
+ "Start:banana_yellow:End",
674
+ "Start:cherry_red:End",
675
+ ]
503
676
 
504
677
  result_prefix = self.dp.merge_cols_with_seperator(
505
678
  df.copy(),
506
- col_names=['col1', 'col2'],
507
- seperator='_',
508
- starting_prefix_str='Start:',
509
- ending_prefix_str=':End'
679
+ col_names=["col1", "col2"],
680
+ seperator="_",
681
+ starting_prefix_str="Start:",
682
+ ending_prefix_str=":End",
510
683
  )
511
684
  pd.testing.assert_frame_equal(result_prefix, expected_output_prefix)
512
685
 
513
686
  # Test error for less than two columns
514
687
  with self.assertRaises(ValueError):
515
- self.dp.merge_cols_with_seperator(df.copy(), col_names=['col1'])
688
+ self.dp.merge_cols_with_seperator(df.copy(), col_names=["col1"])
516
689
 
517
690
  def test_check_sum_of_df_cols_are_equal(self):
518
691
  # Create test DataFrames
519
- df1 = pd.DataFrame({
520
- 'col1': [1, 2, 3],
521
- 'col2': [4, 5, 6]
522
- })
692
+ df1 = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6]})
523
693
 
524
- df2 = pd.DataFrame({
525
- 'colA': [1, 2, 3],
526
- 'colB': [4, 5, 6]
527
- })
694
+ df2 = pd.DataFrame({"colA": [1, 2, 3], "colB": [4, 5, 6]})
528
695
 
529
- df3 = pd.DataFrame({
530
- 'colX': [1, 2, 3],
531
- 'colY': [4, 5, 7]
532
- })
696
+ df3 = pd.DataFrame({"colX": [1, 2, 3], "colY": [4, 5, 7]})
533
697
 
534
698
  # Test case where sums are equal
535
- result_equal = self.dp.check_sum_of_df_cols_are_equal(df1, df2, cols_1=['col1', 'col2'], cols_2=['colA', 'colB'])
699
+ result_equal = self.dp.check_sum_of_df_cols_are_equal(
700
+ df1,
701
+ df2,
702
+ cols_1=["col1", "col2"],
703
+ cols_2=["colA", "colB"],
704
+ )
536
705
  self.assertEqual(result_equal[0], "They are equal")
537
706
  self.assertEqual(result_equal[1], 21) # Sum of df1's columns
538
707
  self.assertEqual(result_equal[2], 21) # Sum of df2's columns
539
708
 
540
709
  # Test case where sums are not equal
541
- result_not_equal = self.dp.check_sum_of_df_cols_are_equal(df1, df3, cols_1=['col1', 'col2'], cols_2=['colX', 'colY'])
710
+ result_not_equal = self.dp.check_sum_of_df_cols_are_equal(
711
+ df1,
712
+ df3,
713
+ cols_1=["col1", "col2"],
714
+ cols_2=["colX", "colY"],
715
+ )
542
716
  self.assertTrue(result_not_equal[0].startswith("They are different by "))
543
717
  self.assertEqual(result_not_equal[1], 21) # Sum of df1's columns
544
718
  self.assertEqual(result_not_equal[2], 22) # Sum of df3's columns
545
719
 
546
720
  # Test case with mismatched column names
547
721
  with self.assertRaises(KeyError):
548
- self.dp.check_sum_of_df_cols_are_equal(df1, df2, cols_1=['nonexistent_col'], cols_2=['colA', 'colB'])
722
+ self.dp.check_sum_of_df_cols_are_equal(
723
+ df1,
724
+ df2,
725
+ cols_1=["nonexistent_col"],
726
+ cols_2=["colA", "colB"],
727
+ )
549
728
 
550
729
  # Test case with empty columns
551
- result_empty_cols = self.dp.check_sum_of_df_cols_are_equal(df1, df2, cols_1=[], cols_2=[])
730
+ result_empty_cols = self.dp.check_sum_of_df_cols_are_equal(
731
+ df1,
732
+ df2,
733
+ cols_1=[],
734
+ cols_2=[],
735
+ )
552
736
  self.assertEqual(result_empty_cols[1], 0) # Sum of empty columns
553
737
  self.assertEqual(result_empty_cols[2], 0) # Sum of empty columns
554
738
  self.assertEqual(result_empty_cols[0], "They are equal")
555
739
 
556
740
  def test_convert_2_df_cols_to_dict(self):
557
741
  # Create a test DataFrame
558
- df = pd.DataFrame({
559
- 'key_col': ['key1', 'key2', 'key3'],
560
- 'value_col': [10, 20, 30]
561
- })
742
+ df = pd.DataFrame(
743
+ {"key_col": ["key1", "key2", "key3"], "value_col": [10, 20, 30]},
744
+ )
562
745
 
563
746
  # Expected dictionary
564
- expected_dict = {
565
- 'key1': 10,
566
- 'key2': 20,
567
- 'key3': 30
568
- }
747
+ expected_dict = {"key1": 10, "key2": 20, "key3": 30}
569
748
 
570
749
  # Test basic functionality
571
- result = self.dp.convert_2_df_cols_to_dict(df, 'key_col', 'value_col')
750
+ result = self.dp.convert_2_df_cols_to_dict(df, "key_col", "value_col")
572
751
  self.assertEqual(result, expected_dict)
573
752
 
574
753
  # Test with non-unique keys
575
- df_non_unique = pd.DataFrame({
576
- 'key_col': ['key1', 'key2', 'key1'],
577
- 'value_col': [10, 20, 30]
578
- })
754
+ df_non_unique = pd.DataFrame(
755
+ {"key_col": ["key1", "key2", "key1"], "value_col": [10, 20, 30]},
756
+ )
579
757
  expected_dict_non_unique = {
580
- 'key1': 30, # Last occurrence of 'key1' should overwrite the earlier one
581
- 'key2': 20
758
+ "key1": 30, # Last occurrence of 'key1' should overwrite the earlier one
759
+ "key2": 20,
582
760
  }
583
- result_non_unique = self.dp.convert_2_df_cols_to_dict(df_non_unique, 'key_col', 'value_col')
761
+ result_non_unique = self.dp.convert_2_df_cols_to_dict(
762
+ df_non_unique,
763
+ "key_col",
764
+ "value_col",
765
+ )
584
766
  self.assertEqual(result_non_unique, expected_dict_non_unique)
585
767
 
586
768
  # Test with missing key or value column
587
769
  with self.assertRaises(ValueError):
588
- self.dp.convert_2_df_cols_to_dict(df, 'missing_key_col', 'value_col')
770
+ self.dp.convert_2_df_cols_to_dict(df, "missing_key_col", "value_col")
589
771
 
590
772
  with self.assertRaises(ValueError):
591
- self.dp.convert_2_df_cols_to_dict(df, 'key_col', 'missing_value_col')
773
+ self.dp.convert_2_df_cols_to_dict(df, "key_col", "missing_value_col")
592
774
 
593
775
  # Test with empty DataFrame
594
- df_empty = pd.DataFrame(columns=['key_col', 'value_col'])
776
+ df_empty = pd.DataFrame(columns=["key_col", "value_col"])
595
777
  expected_empty_dict = {}
596
- result_empty = self.dp.convert_2_df_cols_to_dict(df_empty, 'key_col', 'value_col')
778
+ result_empty = self.dp.convert_2_df_cols_to_dict(
779
+ df_empty,
780
+ "key_col",
781
+ "value_col",
782
+ )
597
783
  self.assertEqual(result_empty, expected_empty_dict)
598
784
 
599
785
  def test_keyword_lookup_replacement(self):
600
786
  # Create a test DataFrame
601
787
  test_data = {
602
- 'col1': ['A', 'B', 'C', 'D'],
603
- 'col2': ['X', 'Y', 'Z', 'W'],
604
- 'value_col': ['old_value', 'old_value', 'unchanged', 'old_value']
788
+ "col1": ["A", "B", "C", "D"],
789
+ "col2": ["X", "Y", "Z", "W"],
790
+ "value_col": ["old_value", "old_value", "unchanged", "old_value"],
605
791
  }
606
792
  df = pd.DataFrame(test_data)
607
793
 
608
794
  # Lookup dictionary for replacements
609
- lookup_dict = {
610
- 'A|X': 'new_value_1',
611
- 'B|Y': 'new_value_2',
612
- 'D|W': 'new_value_3'
613
- }
795
+ lookup_dict = {"A|X": "new_value_1", "B|Y": "new_value_2", "D|W": "new_value_3"}
614
796
 
615
797
  # Expected output
616
798
  expected_output = df.copy()
617
- expected_output['Updated Column'] = ['new_value_1', 'new_value_2', 'unchanged', 'new_value_3']
799
+ expected_output["Updated Column"] = [
800
+ "new_value_1",
801
+ "new_value_2",
802
+ "unchanged",
803
+ "new_value_3",
804
+ ]
618
805
 
619
806
  # Apply the function
620
807
  result = self.dp.keyword_lookup_replacement(
621
808
  df.copy(),
622
- col='value_col',
623
- replacement_rows='old_value',
624
- cols_to_merge=['col1', 'col2'],
625
- replacement_lookup_dict=lookup_dict
809
+ col="value_col",
810
+ replacement_rows="old_value",
811
+ cols_to_merge=["col1", "col2"],
812
+ replacement_lookup_dict=lookup_dict,
626
813
  )
627
814
 
628
815
  # Compare the resulting DataFrame with the expected DataFrame
629
816
  pd.testing.assert_frame_equal(result, expected_output)
630
817
 
631
818
  # Test case where no replacement is needed
632
- df_no_replacement = pd.DataFrame({
633
- 'col1': ['E', 'F'],
634
- 'col2': ['G', 'H'],
635
- 'value_col': ['unchanged', 'unchanged']
636
- })
819
+ df_no_replacement = pd.DataFrame(
820
+ {
821
+ "col1": ["E", "F"],
822
+ "col2": ["G", "H"],
823
+ "value_col": ["unchanged", "unchanged"],
824
+ },
825
+ )
637
826
  expected_no_replacement = df_no_replacement.copy()
638
- expected_no_replacement['Updated Column'] = ['unchanged', 'unchanged']
827
+ expected_no_replacement["Updated Column"] = ["unchanged", "unchanged"]
639
828
 
640
829
  result_no_replacement = self.dp.keyword_lookup_replacement(
641
830
  df_no_replacement.copy(),
642
- col='value_col',
643
- replacement_rows='old_value',
644
- cols_to_merge=['col1', 'col2'],
645
- replacement_lookup_dict=lookup_dict
831
+ col="value_col",
832
+ replacement_rows="old_value",
833
+ cols_to_merge=["col1", "col2"],
834
+ replacement_lookup_dict=lookup_dict,
646
835
  )
647
836
 
648
837
  pd.testing.assert_frame_equal(result_no_replacement, expected_no_replacement)
649
-
838
+
650
839
  def test_convert_df_wide_2_long(self):
651
840
  # Create a test DataFrame
652
841
  test_data = {
653
- 'id': [1, 2, 3],
654
- 'name': ['Alice', 'Bob', 'Charlie'],
655
- 'score1': [85, 90, 78],
656
- 'score2': [88, 92, 81]
842
+ "id": [1, 2, 3],
843
+ "name": ["Alice", "Bob", "Charlie"],
844
+ "score1": [85, 90, 78],
845
+ "score2": [88, 92, 81],
657
846
  }
658
847
  df = pd.DataFrame(test_data)
659
848
 
660
849
  # Expected output for the transformation
661
- expected_output = pd.DataFrame({
662
- 'id': [1, 2, 3, 1, 2, 3],
663
- 'name': ['Alice', 'Bob', 'Charlie', 'Alice', 'Bob', 'Charlie'],
664
- 'Stacked': ['score1', 'score1', 'score1', 'score2', 'score2', 'score2'],
665
- 'Value': [85, 90, 78, 88, 92, 81]
666
- })
850
+ expected_output = pd.DataFrame(
851
+ {
852
+ "id": [1, 2, 3, 1, 2, 3],
853
+ "name": ["Alice", "Bob", "Charlie", "Alice", "Bob", "Charlie"],
854
+ "Stacked": ["score1", "score1", "score1", "score2", "score2", "score2"],
855
+ "Value": [85, 90, 78, 88, 92, 81],
856
+ },
857
+ )
667
858
 
668
859
  # Apply the function
669
860
  result = self.dp.convert_df_wide_2_long(
670
861
  df.copy(),
671
- value_cols=['score1', 'score2'],
672
- variable_col_name='Stacked',
673
- value_col_name='Value'
862
+ value_cols=["score1", "score2"],
863
+ variable_col_name="Stacked",
864
+ value_col_name="Value",
674
865
  )
675
866
 
676
867
  # Compare the resulting DataFrame with the expected DataFrame
@@ -680,9 +871,9 @@ class TestDataProcessor(unittest.TestCase):
680
871
  with self.assertRaises(ValueError):
681
872
  self.dp.convert_df_wide_2_long(
682
873
  df.copy(),
683
- value_cols=['score1'],
684
- variable_col_name='Stacked',
685
- value_col_name='Value'
874
+ value_cols=["score1"],
875
+ variable_col_name="Stacked",
876
+ value_col_name="Value",
686
877
  )
687
878
 
688
879
  # Test case with no value columns (should raise ValueError)
@@ -690,24 +881,24 @@ class TestDataProcessor(unittest.TestCase):
690
881
  self.dp.convert_df_wide_2_long(
691
882
  df.copy(),
692
883
  value_cols=[],
693
- variable_col_name='Stacked',
694
- value_col_name='Value'
884
+ variable_col_name="Stacked",
885
+ value_col_name="Value",
695
886
  )
696
887
 
697
888
  def test_format_numbers_with_commas(self):
698
889
  # Create a test DataFrame
699
890
  test_data = {
700
- 'col1': [1000, 2500000, 12345.678, None],
701
- 'col2': [2000.5, 350000.75, 0, -12345],
702
- 'col3': ['text', 'another text', 50000, 123.45]
891
+ "col1": [1000, 2500000, 12345.678, None],
892
+ "col2": [2000.5, 350000.75, 0, -12345],
893
+ "col3": ["text", "another text", 50000, 123.45],
703
894
  }
704
895
  df = pd.DataFrame(test_data).fillna(value=pd.NA) # Normalize None to pd.NA
705
896
 
706
897
  # Expected output with 2 decimal places
707
898
  expected_data = {
708
- 'col1': ['1,000.00', '2,500,000.00', '12,345.68', pd.NA],
709
- 'col2': ['2,000.50', '350,000.75', '0.00', '-12,345.00'],
710
- 'col3': ['text', 'another text', '50,000.00', '123.45']
899
+ "col1": ["1,000.00", "2,500,000.00", "12,345.68", pd.NA],
900
+ "col2": ["2,000.50", "350,000.75", "0.00", "-12,345.00"],
901
+ "col3": ["text", "another text", "50,000.00", "123.45"],
711
902
  }
712
903
  expected_output = pd.DataFrame(expected_data)
713
904
 
@@ -720,254 +911,314 @@ class TestDataProcessor(unittest.TestCase):
720
911
  def test_filter_df_on_multiple_conditions(self):
721
912
  # Create a test DataFrame
722
913
  test_data = {
723
- 'id': [1, 2, 3, 4, 5],
724
- 'value': [10, 20, 30, 40, 50],
725
- 'category': ['A', 'B', 'A', 'C', 'A'],
726
- 'date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05'])
914
+ "id": [1, 2, 3, 4, 5],
915
+ "value": [10, 20, 30, 40, 50],
916
+ "category": ["A", "B", "A", "C", "A"],
917
+ "date": pd.to_datetime(
918
+ ["2023-01-01", "2023-01-02", "2023-01-03", "2023-01-04", "2023-01-05"],
919
+ ),
727
920
  }
728
921
  df = pd.DataFrame(test_data)
729
922
 
730
923
  # Test Case 1: Single condition (Equality)
731
- filters_dict = {'category': "== 'A'"}
732
- expected_output = df[df['category'] == 'A']
924
+ filters_dict = {"category": "== 'A'"}
925
+ expected_output = df[df["category"] == "A"]
733
926
  result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
734
927
  pd.testing.assert_frame_equal(result, expected_output)
735
928
 
736
929
  # Test Case 2: Multiple conditions (Equality and Greater Than)
737
- filters_dict = {'category': "== 'A'", 'value': "> 20"}
738
- expected_output = df[(df['category'] == 'A') & (df['value'] > 20)]
930
+ filters_dict = {"category": "== 'A'", "value": "> 20"}
931
+ expected_output = df[(df["category"] == "A") & (df["value"] > 20)]
739
932
  result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
740
933
  pd.testing.assert_frame_equal(result, expected_output)
741
934
 
742
935
  # Test Case 3: Date comparison
743
- filters_dict = {'date': ">= '2023-01-03'"}
744
- expected_output = df[df['date'] >= pd.to_datetime('2023-01-03')]
936
+ filters_dict = {"date": ">= '2023-01-03'"}
937
+ expected_output = df[df["date"] >= pd.to_datetime("2023-01-03")]
745
938
  result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
746
939
  pd.testing.assert_frame_equal(result, expected_output)
747
940
 
748
941
  # Test Case 4: Inequality
749
- filters_dict = {'value': "!= 30"}
750
- expected_output = df[df['value'] != 30]
942
+ filters_dict = {"value": "!= 30"}
943
+ expected_output = df[df["value"] != 30]
751
944
  result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
752
945
  pd.testing.assert_frame_equal(result, expected_output)
753
946
 
754
947
  # Test Case 5: Mixed conditions
755
- filters_dict = {'category': "== 'A'", 'date': "<= '2023-01-03'"}
756
- expected_output = df[(df['category'] == 'A') & (df['date'] <= pd.to_datetime('2023-01-03'))]
948
+ filters_dict = {"category": "== 'A'", "date": "<= '2023-01-03'"}
949
+ expected_output = df[
950
+ (df["category"] == "A") & (df["date"] <= pd.to_datetime("2023-01-03"))
951
+ ]
757
952
  result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
758
953
  pd.testing.assert_frame_equal(result, expected_output)
759
954
 
760
955
  def test_fill_weekly_date_range(self):
761
956
  # Test input DataFrame
762
957
  test_data = {
763
- 'date': ['2023-01-02', '2023-01-16', '2023-01-30'], # Weekly data with gaps
764
- 'value': [10.0, 20.0, 30.0]
958
+ "date": ["2023-01-02", "2023-01-16", "2023-01-30"], # Weekly data with gaps
959
+ "value": [10.0, 20.0, 30.0],
765
960
  }
766
961
  df = pd.DataFrame(test_data)
767
- df['date'] = pd.to_datetime(df['date'])
962
+ df["date"] = pd.to_datetime(df["date"])
768
963
 
769
964
  # Expected output DataFrame
770
965
  expected_data = {
771
- 'date': ['2023-01-02', '2023-01-09', '2023-01-16', '2023-01-23', '2023-01-30'],
772
- 'value': [10.0, 0.0, 20.0, 0.0, 30.0]
966
+ "date": [
967
+ "2023-01-02",
968
+ "2023-01-09",
969
+ "2023-01-16",
970
+ "2023-01-23",
971
+ "2023-01-30",
972
+ ],
973
+ "value": [10.0, 0.0, 20.0, 0.0, 30.0],
773
974
  }
774
975
  expected_output = pd.DataFrame(expected_data)
775
- expected_output['date'] = pd.to_datetime(expected_output['date'])
976
+ expected_output["date"] = pd.to_datetime(expected_output["date"])
776
977
 
777
978
  # Call the function
778
979
  dp = dataprocessing() # Replace with the correct instantiation of your class
779
- result = dp.fill_weekly_date_range(df, date_column='date', freq='W-MON')
980
+ result = dp.fill_weekly_date_range(df, date_column="date", freq="W-MON")
780
981
 
781
982
  # Assert the result matches the expected output
782
- pd.testing.assert_frame_equal(result.reset_index(drop=True), expected_output.reset_index(drop=True))
983
+ pd.testing.assert_frame_equal(
984
+ result.reset_index(drop=True),
985
+ expected_output.reset_index(drop=True),
986
+ )
783
987
 
784
988
  def test_add_prefix_and_suffix(self):
785
989
  # Test DataFrame
786
990
  test_data = {
787
- 'date': ['2023-01-01', '2023-01-02', '2023-01-03'],
788
- 'value1': [10, 20, 30],
789
- 'value2': [40, 50, 60]
991
+ "date": ["2023-01-01", "2023-01-02", "2023-01-03"],
992
+ "value1": [10, 20, 30],
993
+ "value2": [40, 50, 60],
790
994
  }
791
995
  df = pd.DataFrame(test_data)
792
996
 
793
997
  # Expected output when no date column is excluded
794
998
  expected_data_no_date_col = {
795
- 'prefix_date_suffix': ['2023-01-01', '2023-01-02', '2023-01-03'],
796
- 'prefix_value1_suffix': [10, 20, 30],
797
- 'prefix_value2_suffix': [40, 50, 60]
999
+ "prefix_date_suffix": ["2023-01-01", "2023-01-02", "2023-01-03"],
1000
+ "prefix_value1_suffix": [10, 20, 30],
1001
+ "prefix_value2_suffix": [40, 50, 60],
798
1002
  }
799
1003
  expected_output_no_date_col = pd.DataFrame(expected_data_no_date_col)
800
1004
 
801
1005
  # Expected output when date column is excluded
802
1006
  expected_data_with_date_col = {
803
- 'date': ['2023-01-01', '2023-01-02', '2023-01-03'],
804
- 'prefix_value1_suffix': [10, 20, 30],
805
- 'prefix_value2_suffix': [40, 50, 60]
1007
+ "date": ["2023-01-01", "2023-01-02", "2023-01-03"],
1008
+ "prefix_value1_suffix": [10, 20, 30],
1009
+ "prefix_value2_suffix": [40, 50, 60],
806
1010
  }
807
1011
  expected_output_with_date_col = pd.DataFrame(expected_data_with_date_col)
808
1012
 
809
1013
  # Call the function without excluding a date column
810
1014
  dp = dataprocessing() # Replace with the correct instantiation of your class
811
- result_no_date_col = dp.add_prefix_and_suffix(df.copy(), prefix='prefix_', suffix='_suffix')
1015
+ result_no_date_col = dp.add_prefix_and_suffix(
1016
+ df.copy(),
1017
+ prefix="prefix_",
1018
+ suffix="_suffix",
1019
+ )
812
1020
 
813
1021
  # Assert result matches the expected output
814
1022
  pd.testing.assert_frame_equal(result_no_date_col, expected_output_no_date_col)
815
1023
 
816
1024
  # Call the function with a date column excluded
817
- result_with_date_col = dp.add_prefix_and_suffix(df.copy(), prefix='prefix_', suffix='_suffix', date_col='date')
1025
+ result_with_date_col = dp.add_prefix_and_suffix(
1026
+ df.copy(),
1027
+ prefix="prefix_",
1028
+ suffix="_suffix",
1029
+ date_col="date",
1030
+ )
818
1031
 
819
1032
  # Assert result matches the expected output
820
- pd.testing.assert_frame_equal(result_with_date_col, expected_output_with_date_col)
1033
+ pd.testing.assert_frame_equal(
1034
+ result_with_date_col,
1035
+ expected_output_with_date_col,
1036
+ )
821
1037
 
822
1038
  def test_create_dummies(self):
823
1039
  # Test Case 1: Basic functionality without date column
824
- df = pd.DataFrame({
825
- 'col1': [0, 1, 2],
826
- 'col2': [3, 4, 0],
827
- 'col3': [5, 0, 0]
828
- })
1040
+ df = pd.DataFrame({"col1": [0, 1, 2], "col2": [3, 4, 0], "col3": [5, 0, 0]})
829
1041
  dummy_threshold = 1
830
- expected_output = pd.DataFrame({
831
- 'col1': [0, 0, 1],
832
- 'col2': [1, 1, 0],
833
- 'col3': [1, 0, 0]
834
- })
1042
+ expected_output = pd.DataFrame(
1043
+ {"col1": [0, 0, 1], "col2": [1, 1, 0], "col3": [1, 0, 0]},
1044
+ )
835
1045
  result = self.dp.create_dummies(df.copy(), dummy_threshold=dummy_threshold)
836
1046
  pd.testing.assert_frame_equal(result, expected_output)
837
1047
 
838
1048
  # Test Case 2: With date column
839
- df_with_date = pd.DataFrame({
840
- 'date': ['2023-01-01', '2023-01-02', '2023-01-03'],
841
- 'col1': [0, 1, 2],
842
- 'col2': [3, 4, 0]
843
- })
844
- expected_output_with_date = pd.DataFrame({
845
- 'date': ['2023-01-01', '2023-01-02', '2023-01-03'],
846
- 'col1': [0, 0, 1],
847
- 'col2': [1, 1, 0]
848
- })
849
- result_with_date = self.dp.create_dummies(df_with_date.copy(), date_col='date', dummy_threshold=dummy_threshold)
1049
+ df_with_date = pd.DataFrame(
1050
+ {
1051
+ "date": ["2023-01-01", "2023-01-02", "2023-01-03"],
1052
+ "col1": [0, 1, 2],
1053
+ "col2": [3, 4, 0],
1054
+ },
1055
+ )
1056
+ expected_output_with_date = pd.DataFrame(
1057
+ {
1058
+ "date": ["2023-01-01", "2023-01-02", "2023-01-03"],
1059
+ "col1": [0, 0, 1],
1060
+ "col2": [1, 1, 0],
1061
+ },
1062
+ )
1063
+ result_with_date = self.dp.create_dummies(
1064
+ df_with_date.copy(),
1065
+ date_col="date",
1066
+ dummy_threshold=dummy_threshold,
1067
+ )
850
1068
  pd.testing.assert_frame_equal(result_with_date, expected_output_with_date)
851
1069
 
852
1070
  # Test Case 3: Adding total dummy column
853
1071
  expected_output_with_total = expected_output.copy()
854
- expected_output_with_total['total'] = [1, 1, 1]
855
- result_with_total = self.dp.create_dummies(df.copy(), dummy_threshold=dummy_threshold, add_total_dummy_col='Yes')
1072
+ expected_output_with_total["total"] = [1, 1, 1]
1073
+ result_with_total = self.dp.create_dummies(
1074
+ df.copy(),
1075
+ dummy_threshold=dummy_threshold,
1076
+ add_total_dummy_col="Yes",
1077
+ )
856
1078
  pd.testing.assert_frame_equal(result_with_total, expected_output_with_total)
857
1079
 
858
1080
  # Test Case 4: Adding total dummy column with date column
859
1081
  expected_output_with_date_and_total = expected_output_with_date.copy()
860
- expected_output_with_date_and_total['total'] = [1, 1, 1]
1082
+ expected_output_with_date_and_total["total"] = [1, 1, 1]
861
1083
  result_with_date_and_total = self.dp.create_dummies(
862
1084
  df_with_date.copy(),
863
- date_col='date',
1085
+ date_col="date",
864
1086
  dummy_threshold=dummy_threshold,
865
- add_total_dummy_col='Yes',
1087
+ add_total_dummy_col="Yes",
1088
+ )
1089
+ pd.testing.assert_frame_equal(
1090
+ result_with_date_and_total,
1091
+ expected_output_with_date_and_total,
866
1092
  )
867
- pd.testing.assert_frame_equal(result_with_date_and_total, expected_output_with_date_and_total)
868
1093
 
869
1094
  # Test Case 5: Threshold of 0 (all positive numbers become 1)
870
- df_threshold_0 = pd.DataFrame({
871
- 'col1': [-1, 0, 1],
872
- 'col2': [0, 2, -3]
873
- })
874
- expected_output_threshold_0 = pd.DataFrame({
875
- 'col1': [0, 0, 1],
876
- 'col2': [0, 1, 0]
877
- })
878
- result_threshold_0 = self.dp.create_dummies(df_threshold_0.copy(), dummy_threshold=0)
1095
+ df_threshold_0 = pd.DataFrame({"col1": [-1, 0, 1], "col2": [0, 2, -3]})
1096
+ expected_output_threshold_0 = pd.DataFrame(
1097
+ {"col1": [0, 0, 1], "col2": [0, 1, 0]},
1098
+ )
1099
+ result_threshold_0 = self.dp.create_dummies(
1100
+ df_threshold_0.copy(),
1101
+ dummy_threshold=0,
1102
+ )
879
1103
  pd.testing.assert_frame_equal(result_threshold_0, expected_output_threshold_0)
880
1104
 
881
1105
  def test_replace_substrings(self):
882
1106
  # Test Case 1: Basic replacement
883
- df = pd.DataFrame({
884
- 'text': ['hello world', 'python programming', 'hello python']
885
- })
886
- replacements = {'hello': 'hi', 'python': 'java'}
887
- expected_output = pd.DataFrame({
888
- 'text': ['hi world', 'java programming', 'hi java']
889
- })
890
- result = self.dp.replace_substrings(df.copy(), 'text', replacements)
1107
+ df = pd.DataFrame(
1108
+ {"text": ["hello world", "python programming", "hello python"]},
1109
+ )
1110
+ replacements = {"hello": "hi", "python": "java"}
1111
+ expected_output = pd.DataFrame(
1112
+ {"text": ["hi world", "java programming", "hi java"]},
1113
+ )
1114
+ result = self.dp.replace_substrings(df.copy(), "text", replacements)
891
1115
  pd.testing.assert_frame_equal(result, expected_output)
892
1116
 
893
1117
  # Test Case 2: Replacement with to_lower=True
894
- df_mixed_case = pd.DataFrame({
895
- 'text': ['Hello World', 'PYTHON Programming', 'hello PYTHON']
896
- })
897
- expected_output_lower = pd.DataFrame({
898
- 'text': ['hi world', 'java programming', 'hi java']
899
- })
900
- result_lower = self.dp.replace_substrings(df_mixed_case.copy(), 'text', replacements, to_lower=True)
1118
+ df_mixed_case = pd.DataFrame(
1119
+ {"text": ["Hello World", "PYTHON Programming", "hello PYTHON"]},
1120
+ )
1121
+ expected_output_lower = pd.DataFrame(
1122
+ {"text": ["hi world", "java programming", "hi java"]},
1123
+ )
1124
+ result_lower = self.dp.replace_substrings(
1125
+ df_mixed_case.copy(),
1126
+ "text",
1127
+ replacements,
1128
+ to_lower=True,
1129
+ )
901
1130
  pd.testing.assert_frame_equal(result_lower, expected_output_lower)
902
1131
 
903
1132
  # Test Case 3: Replacement with a new column
904
- df_new_col = pd.DataFrame({
905
- 'text': ['hello world', 'python programming', 'hello python']
906
- })
907
- expected_output_new_col = pd.DataFrame({
908
- 'text': ['hello world', 'python programming', 'hello python'],
909
- 'new_text': ['hi world', 'java programming', 'hi java']
910
- })
911
- result_new_col = self.dp.replace_substrings(df_new_col.copy(), 'text', replacements, new_column='new_text')
1133
+ df_new_col = pd.DataFrame(
1134
+ {"text": ["hello world", "python programming", "hello python"]},
1135
+ )
1136
+ expected_output_new_col = pd.DataFrame(
1137
+ {
1138
+ "text": ["hello world", "python programming", "hello python"],
1139
+ "new_text": ["hi world", "java programming", "hi java"],
1140
+ },
1141
+ )
1142
+ result_new_col = self.dp.replace_substrings(
1143
+ df_new_col.copy(),
1144
+ "text",
1145
+ replacements,
1146
+ new_column="new_text",
1147
+ )
912
1148
  pd.testing.assert_frame_equal(result_new_col, expected_output_new_col)
913
1149
 
914
1150
  def test_add_total_column(self):
915
1151
  # Test Case 1: Basic functionality without excluding any column
916
- df = pd.DataFrame({
917
- 'col1': [1, 2, 3],
918
- 'col2': [4, 5, 6],
919
- 'col3': [7, 8, 9]
920
- })
1152
+ df = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [7, 8, 9]})
921
1153
  expected_output = df.copy()
922
- expected_output['Total'] = [12, 15, 18]
1154
+ expected_output["Total"] = [12, 15, 18]
923
1155
  result = self.dp.add_total_column(df.copy())
924
1156
  pd.testing.assert_frame_equal(result, expected_output)
925
1157
 
926
1158
  # Test Case 2: Excluding a column from the total
927
- df = pd.DataFrame({
928
- 'col1': [1, 2, 3],
929
- 'col2': [4, 5, 6],
930
- 'col3': [7, 8, 9]
931
- })
932
- expected_output_exclude = pd.DataFrame({
933
- 'col1': [1, 2, 3],
934
- 'col2': [4, 5, 6],
935
- 'col3': [7, 8, 9],
936
- 'Total': [5, 7, 9] # Sum without 'col3'
937
- })
938
- result_exclude = self.dp.add_total_column(df.copy(), exclude_col='col3')
1159
+ df = pd.DataFrame({"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [7, 8, 9]})
1160
+ expected_output_exclude = pd.DataFrame(
1161
+ {
1162
+ "col1": [1, 2, 3],
1163
+ "col2": [4, 5, 6],
1164
+ "col3": [7, 8, 9],
1165
+ "Total": [5, 7, 9], # Sum without 'col3'
1166
+ },
1167
+ )
1168
+ result_exclude = self.dp.add_total_column(df.copy(), exclude_col="col3")
939
1169
  pd.testing.assert_frame_equal(result_exclude, expected_output_exclude)
940
1170
 
941
1171
  # Test Case 3: Custom total column name
942
- custom_total_col_name = 'Sum'
1172
+ custom_total_col_name = "Sum"
943
1173
  expected_output_custom = df.copy()
944
1174
  expected_output_custom[custom_total_col_name] = [12, 15, 18]
945
- result_custom = self.dp.add_total_column(df.copy(), total_col_name=custom_total_col_name)
1175
+ result_custom = self.dp.add_total_column(
1176
+ df.copy(),
1177
+ total_col_name=custom_total_col_name,
1178
+ )
946
1179
  pd.testing.assert_frame_equal(result_custom, expected_output_custom)
947
1180
 
948
1181
  # Test Case 4: DataFrame with a single column
949
- single_col_df = pd.DataFrame({'col1': [1, 2, 3]})
1182
+ single_col_df = pd.DataFrame({"col1": [1, 2, 3]})
950
1183
  expected_single_col = single_col_df.copy()
951
- expected_single_col['Total'] = [1, 2, 3]
1184
+ expected_single_col["Total"] = [1, 2, 3]
952
1185
  result_single_col = self.dp.add_total_column(single_col_df.copy())
953
1186
  pd.testing.assert_frame_equal(result_single_col, expected_single_col)
954
1187
 
955
1188
  def test_apply_lookup_table_based_on_substring(self):
956
1189
  # Test Case 1: Basic categorization
957
- df = pd.DataFrame({
958
- 'text': ['I love apples', 'Bananas are great', 'Something else', 'Grapes are sour']
959
- })
1190
+ df = pd.DataFrame(
1191
+ {
1192
+ "text": [
1193
+ "I love apples",
1194
+ "Bananas are great",
1195
+ "Something else",
1196
+ "Grapes are sour",
1197
+ ],
1198
+ },
1199
+ )
960
1200
  category_dict = {
961
- 'apple': 'Fruit',
962
- 'banana': 'Fruit',
963
- 'cherry': 'Fruit',
964
- 'grape': 'Fruit'
1201
+ "apple": "Fruit",
1202
+ "banana": "Fruit",
1203
+ "cherry": "Fruit",
1204
+ "grape": "Fruit",
965
1205
  }
966
- expected_output = pd.DataFrame({
967
- 'text': ['I love apples', 'Bananas are great', 'Something else', 'Grapes are sour'],
968
- 'Category': ['Fruit', 'Fruit', 'Other', 'Fruit']
969
- })
970
- result = self.dp.apply_lookup_table_based_on_substring(df.copy(), 'text', category_dict)
1206
+ expected_output = pd.DataFrame(
1207
+ {
1208
+ "text": [
1209
+ "I love apples",
1210
+ "Bananas are great",
1211
+ "Something else",
1212
+ "Grapes are sour",
1213
+ ],
1214
+ "Category": ["Fruit", "Fruit", "Other", "Fruit"],
1215
+ },
1216
+ )
1217
+ result = self.dp.apply_lookup_table_based_on_substring(
1218
+ df.copy(),
1219
+ "text",
1220
+ category_dict,
1221
+ )
971
1222
  pd.testing.assert_frame_equal(result, expected_output)
972
1223
 
973
1224
  def test_compare_overlap(self):
@@ -993,29 +1244,30 @@ class TestDataProcessor(unittest.TestCase):
993
1244
  df2 = pd.DataFrame(df2_data)
994
1245
 
995
1246
  # 3. Call compare_overlap from your dataprocessing class
996
- diff_df, total_diff_df = self.dp.compare_overlap(df1, df2, 'date')
997
- expected_diff_df = pd.DataFrame({
998
- 'date': pd.to_datetime(['2021-01-03', '2021-01-04']),
999
- 'diff_value': [-2, 5],
1000
- 'diff_count': [1, -1]
1001
- })
1002
-
1003
- expected_total_diff_df = pd.DataFrame({
1004
- 'Column': ['value', 'count'],
1005
- 'Total Difference': [3, 0]
1006
- })
1247
+ diff_df, total_diff_df = self.dp.compare_overlap(df1, df2, "date")
1248
+ expected_diff_df = pd.DataFrame(
1249
+ {
1250
+ "date": pd.to_datetime(["2021-01-03", "2021-01-04"]),
1251
+ "diff_value": [-2, 5],
1252
+ "diff_count": [1, -1],
1253
+ },
1254
+ )
1255
+
1256
+ expected_total_diff_df = pd.DataFrame(
1257
+ {"Column": ["value", "count"], "Total Difference": [3, 0]},
1258
+ )
1007
1259
 
1008
1260
  # 5. Use pd.testing.assert_frame_equal to check the outputs
1009
1261
  # Sort and reset index to ensure matching row order
1010
1262
  pd.testing.assert_frame_equal(
1011
- diff_df.sort_values('date').reset_index(drop=True),
1012
- expected_diff_df.sort_values('date').reset_index(drop=True)
1263
+ diff_df.sort_values("date").reset_index(drop=True),
1264
+ expected_diff_df.sort_values("date").reset_index(drop=True),
1013
1265
  )
1014
1266
 
1015
1267
  # Sort by 'Column' to ensure matching row order in summary
1016
1268
  pd.testing.assert_frame_equal(
1017
- total_diff_df.sort_values('Column').reset_index(drop=True),
1018
- expected_total_diff_df.sort_values('Column').reset_index(drop=True)
1269
+ total_diff_df.sort_values("Column").reset_index(drop=True),
1270
+ expected_total_diff_df.sort_values("Column").reset_index(drop=True),
1019
1271
  )
1020
1272
 
1021
1273
  def test_week_commencing_2_week_commencing_conversion_isoweekday(self):
@@ -1035,29 +1287,28 @@ class TestDataProcessor(unittest.TestCase):
1035
1287
  pd.Timestamp("2023-01-02"), # Friday
1036
1288
  pd.Timestamp("2023-01-02"), # Saturday
1037
1289
  ],
1038
- name="week_start_mon"
1290
+ name="week_start_mon",
1039
1291
  )
1040
1292
 
1041
1293
  # Use the new function from our data processing object
1042
1294
  result = self.dp.week_commencing_2_week_commencing_conversion_isoweekday(
1043
1295
  df.copy(),
1044
1296
  date_col="date",
1045
- week_commencing="mon"
1297
+ week_commencing="mon",
1046
1298
  )
1047
1299
 
1048
1300
  # Compare the 'week_start_mon' column with our expected results
1049
1301
  pd.testing.assert_series_equal(
1050
1302
  result["week_start_mon"], # actual
1051
- expected_mon # expected
1303
+ expected_mon, # expected
1052
1304
  )
1053
1305
 
1054
1306
 
1055
-
1056
1307
  ###################################################################################################################################################
1057
1308
  ###################################################################################################################################################
1058
1309
 
1059
1310
  # class TestDataPull(unittest.TestCase)
1060
1311
 
1061
1312
 
1062
- if __name__ == '__main__':
1313
+ if __name__ == "__main__":
1063
1314
  unittest.main()