imsciences 0.5.4.8__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
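The diff adds a single new file: a unit-test suite for the dataprocessing class imported from mmm. The hunk below does not include a file header naming the module, so the test_dataprocessing.py used here is an assumption; a minimal sketch for running the new tests locally:

    # Hypothetical runner: "test_dataprocessing.py" is an assumed file name,
    # since the diff does not show the header that names the new module.
    import unittest

    # Discover the test module in the current directory and run it verbosely.
    suite = unittest.defaultTestLoader.discover(".", pattern="test_dataprocessing.py")
    unittest.TextTestRunner(verbosity=2).run(suite)

Because the module ends with unittest.main(), running it directly (python test_dataprocessing.py) works as well.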
@@ -0,0 +1,1064 @@
+ import unittest
+ import pandas as pd
+ import numpy as np
+ import os
+ from mmm import dataprocessing
+ import plotly.graph_objects as go
+
+ class TestDataProcessor(unittest.TestCase):
+
+     def setUp(self):
+         self.dp = dataprocessing()
+         self.df = pd.DataFrame({
+             'date': pd.date_range(start='2023-01-01', periods=10, freq='D'),
+             'value1': range(10),
+             'value2': range(10, 20)
+         })
+         self.mixed_date_df = pd.DataFrame({
+             'mixed_date': ['2023-01-01', '01/02/2023', '2023/03/01', '2023-04-01']
+         })
+         self.merged_df = pd.DataFrame({
+             'col1': ["A", "B", "C"],
+             'col2': ["X", "Y", "Z"]
+         })
+
+     def test_get_wd_levels(self):
+         current_dir = os.getcwd()
+         parent_dir = self.dp.get_wd_levels(1)
+         self.assertEqual(parent_dir, os.path.dirname(current_dir))
+
+     def test_aggregate_daily_to_wc_long(self):
+         # Create a test DataFrame
+         test_data = {
+             'date': ['2023-01-01', '2023-01-02', '2023-01-08', '2023-01-09', '2023-01-10'],
+             'group_col': ['A', 'A', 'B', 'B', 'B'],
+             'value1': [10, 20, 30, 40, np.nan],
+             'value2': [100, 200, 300, np.nan, 500]
+         }
+         df = pd.DataFrame(test_data)
+
+         # Expected output for different test cases
+         expected_sum_output = pd.DataFrame({
+             'OBS': ['2023-01-01', '2023-01-08'],  # Week starting on Sunday
+             'group_col': ['A', 'B'],
+             'value1': [30.0, 70.0],
+             'value2': [300.0, 800.0]
+         })
+
+         # Convert OBS column to datetime for expected DataFrame
+         expected_sum_output['OBS'] = pd.to_datetime(expected_sum_output['OBS'])
+
+         # Test sum aggregation
+         result_sum = self.dp.aggregate_daily_to_wc_long(df, 'date', ['group_col'], ['value1', 'value2'], wc='sun', aggregation='sum')
+
+         # Ensure both OBS columns are datetime for comparison
+         result_sum['OBS'] = pd.to_datetime(result_sum['OBS'])
+
+         # Compare the resulting DataFrame with the expected DataFrame
+         pd.testing.assert_frame_equal(result_sum, expected_sum_output)
+
+     def test_convert_monthly_to_daily(self):
+         # Create a test DataFrame with monthly data
+         test_data = {
+             'date': ['2023-01-01', '2023-02-01', '2023-03-01'],
+             'value1': [31, 28, 31],
+             'value2': [310, 280, 310]
+         }
+         df = pd.DataFrame(test_data)
+
+         # Expected output DataFrame when divide=True
+         expected_daily_data_divide = {
+             'date': pd.date_range(start='2023-01-01', end='2023-01-31').tolist() +
+                     pd.date_range(start='2023-02-01', end='2023-02-28').tolist() +
+                     pd.date_range(start='2023-03-01', end='2023-03-31').tolist(),
+             'value1': [1.0] * 31 + [1.0] * 28 + [1.0] * 31,
+             'value2': [10.0] * 31 + [10.0] * 28 + [10.0] * 31
+         }
+         expected_daily_df_divide = pd.DataFrame(expected_daily_data_divide)
+
+         # Call the function with divide=True
+         result_divide = self.dp.convert_monthly_to_daily(df, 'date', divide=True)
+
+         # Compare the resulting DataFrame with the expected DataFrame
+         pd.testing.assert_frame_equal(result_divide.reset_index(drop=True), expected_daily_df_divide)
+
+         # Expected output DataFrame when divide=False
+         expected_daily_data_no_divide = {
+             'date': pd.date_range(start='2023-01-01', end='2023-01-31').tolist() +
+                     pd.date_range(start='2023-02-01', end='2023-02-28').tolist() +
+                     pd.date_range(start='2023-03-01', end='2023-03-31').tolist(),
+             'value1': [31] * 31 + [28] * 28 + [31] * 31,
+             'value2': [310] * 31 + [280] * 28 + [310] * 31
+         }
+         expected_daily_df_no_divide = pd.DataFrame(expected_daily_data_no_divide)
+
+         # Call the function with divide=False
+         result_no_divide = self.dp.convert_monthly_to_daily(df, 'date', divide=False)
+
+         # Compare the resulting DataFrame with the expected DataFrame
+         pd.testing.assert_frame_equal(result_no_divide.reset_index(drop=True), expected_daily_df_no_divide)
+
+     def test_week_of_year_mapping(self):
+         # Create a test DataFrame with ISO week format
+         test_data = {
+             'week_col': ['2023-W01', '2023-W05', '2023-W10', '2023-W52']
+         }
+         df = pd.DataFrame(test_data)
+
+         # Expected outputs for different start days
+         expected_output_mon = pd.DataFrame({
+             'week_col': ['2023-W01', '2023-W05', '2023-W10', '2023-W52'],
+             'OBS': ['02/01/2023', '30/01/2023', '06/03/2023', '25/12/2023']
+         })
+
+         expected_output_sun = pd.DataFrame({
+             'week_col': ['2023-W01', '2023-W05', '2023-W10', '2023-W52'],
+             'OBS': ['01/01/2023', '29/01/2023', '05/03/2023', '24/12/2023']
+         })
+
+         # Test mapping with Monday as start day
+         result_mon = self.dp.week_of_year_mapping(df.copy(), 'week_col', 'mon')
+         pd.testing.assert_frame_equal(result_mon, expected_output_mon)
+
+         # Test mapping with Sunday as start day
+         result_sun = self.dp.week_of_year_mapping(df.copy(), 'week_col', 'sun')
+         pd.testing.assert_frame_equal(result_sun, expected_output_sun)
+
+         # Test with invalid start day input
+         with self.assertRaises(ValueError) as context:
+             self.dp.week_of_year_mapping(df.copy(), 'week_col', 'invalid_day')
+         self.assertIn("Invalid day input", str(context.exception))
+
+     def test_rename_cols(self):
+         # Create a test DataFrame
+         test_data = {
+             'OBS': [1, 2, 3],
+             'Column One': [10, 20, 30],
+             'Another Column': [100, 200, 300],
+             'Special Characters !@#': [5, 15, 25]
+         }
+         df = pd.DataFrame(test_data)
+
+         # Expected output with default prefix
+         expected_output_default = pd.DataFrame({
+             'OBS': [1, 2, 3],
+             'ame_column_one': [10, 20, 30],
+             'ame_another_column': [100, 200, 300],
+             'ame_special_characters_!@#': [5, 15, 25]
+         })
+
+         # Expected output with custom prefix
+         expected_output_custom = pd.DataFrame({
+             'OBS': [1, 2, 3],
+             'custom_column_one': [10, 20, 30],
+             'custom_another_column': [100, 200, 300],
+             'custom_special_characters_!@#': [5, 15, 25]
+         })
+
+         # Test renaming columns with default prefix
+         result_default = self.dp.rename_cols(df)
+         pd.testing.assert_frame_equal(result_default, expected_output_default)
+
+         # Test renaming columns with custom prefix
+         result_custom = self.dp.rename_cols(df, name='custom_')
+         pd.testing.assert_frame_equal(result_custom, expected_output_custom)
+
+         # Test that 'OBS' column remains unchanged
+         self.assertIn('OBS', result_default.columns)
+         self.assertIn('OBS', result_custom.columns)
+
+     def test_merge_new_and_old(self):
+         # Create test DataFrames for old and new data
+         old_data = {
+             'OBS': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04'],
+             'old_values': [10, 20, 30, 40]
+         }
+         new_data = {
+             'OBS': ['2023-01-04', '2023-01-05', '2023-01-06'],
+             'new_values': [100, 200, 300]
+         }
+         old_df = pd.DataFrame(old_data)
+         new_df = pd.DataFrame(new_data)
+
+         # Expected output
+         expected_output = pd.DataFrame({
+             'OBS': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05', '2023-01-06']),
+             'new_values': [10, 20, 30, 40, 200, 300]
+         })
+
+         # Test merging with cutoff_date='2023-01-04'
+         result = self.dp.merge_new_and_old(old_df, 'old_values', new_df, 'new_values', '2023-01-04')
+
+         # Assertions
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test that columns are correctly renamed and sorted
+         self.assertIn('OBS', result.columns)
+         self.assertIn('new_values', result.columns)
+         self.assertEqual(len(result), len(expected_output))  # Ensure row count matches
+         self.assertTrue((result['OBS'].diff().dropna() >= pd.Timedelta(0)).all())  # Check that dates are in order
+
+     def test_merge_dataframes_on_column(self):
+         # Create test DataFrames
+         df1 = pd.DataFrame({
+             'OBS': ['2023-01-01', '2023-01-02', '2023-01-03'],
+             'value1': [10, 20, 30]
+         })
+         df2 = pd.DataFrame({
+             'OBS': ['2023-01-02', '2023-01-03', '2023-01-04'],
+             'value2': [40, 50, 60]
+         })
+         df3 = pd.DataFrame({
+             'OBS': ['2023-01-03', '2023-01-04', '2023-01-05'],
+             'value3': [70, 80, 90]
+         })
+
+         # Ensure test DataFrame columns are datetime
+         df1['OBS'] = pd.to_datetime(df1['OBS'])
+         df2['OBS'] = pd.to_datetime(df2['OBS'])
+         df3['OBS'] = pd.to_datetime(df3['OBS'])
+
+         # Expected output for outer merge (cast to float64 to match the behavior of fillna)
+         expected_output_outer = pd.DataFrame({
+             'OBS': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05']),
+             'value1': [10.0, 20.0, 30.0, 0.0, 0.0],
+             'value2': [0.0, 40.0, 50.0, 60.0, 0.0],
+             'value3': [0.0, 0.0, 70.0, 80.0, 90.0]
+         })
+
+         # Expected output for inner merge
+         expected_output_inner = pd.DataFrame({
+             'OBS': pd.to_datetime(['2023-01-03']),
+             'value1': [30],
+             'value2': [50],
+             'value3': [70]
+         })
+
+         # Test outer merge
+         result_outer = self.dp.merge_dataframes_on_column([df1, df2, df3], common_column='OBS', merge_how='outer')
+         pd.testing.assert_frame_equal(result_outer.reset_index(drop=True), expected_output_outer)
+
+         # Test inner merge
+         result_inner = self.dp.merge_dataframes_on_column([df1, df2, df3], common_column='OBS', merge_how='inner')
+         pd.testing.assert_frame_equal(result_inner.reset_index(drop=True), expected_output_inner)
+
+         # Test with empty DataFrame list
+         result_empty = self.dp.merge_dataframes_on_column([], common_column='OBS', merge_how='outer')
+         self.assertIsNone(result_empty)
+
+         # Test with one DataFrame in the list
+         result_single = self.dp.merge_dataframes_on_column([df1], common_column='OBS', merge_how='outer')
+         pd.testing.assert_frame_equal(result_single.reset_index(drop=True), df1)
+
+         # Test that the common column is sorted and converted to datetime
+         self.assertTrue(pd.api.types.is_datetime64_any_dtype(result_outer['OBS']))
+         self.assertTrue((result_outer['OBS'].diff().dropna() >= pd.Timedelta(0)).all())  # Check sorted dates
+
+     def test_merge_and_update_dfs(self):
+         # Create test DataFrames
+         df1 = pd.DataFrame({
+             'OBS': ['2023-01-01', '2023-01-02', '2023-01-03'],
+             'value1': [10, 20, 30],
+             'value2': [100, 200, 300]
+         })
+
+         df2 = pd.DataFrame({
+             'OBS': ['2023-01-02', '2023-01-03', '2023-01-04'],
+             'value1': [15, 25, 35],  # Updates for value1
+             'value3': [400, 500, 600]  # New column
+         })
+
+         # Ensure test DataFrame columns are datetime
+         df1['OBS'] = pd.to_datetime(df1['OBS'])
+         df2['OBS'] = pd.to_datetime(df2['OBS'])
+
+         # Expected output with float64 for numeric columns
+         expected_output = pd.DataFrame({
+             'OBS': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04']),
+             'value1': [10.0, 15.0, 25.0, 35.0],  # Updated where applicable
+             'value2': [100.0, 200.0, 300.0, 0.0],  # From df1, 0 where not available
+             'value3': [0.0, 400.0, 500.0, 600.0]  # From df2, 0 where not available
+         })
+
+         # Test the merge and update function
+         result = self.dp.merge_and_update_dfs(df1, df2, key_column='OBS')
+
+         # Assertions
+         pd.testing.assert_frame_equal(result.reset_index(drop=True), expected_output)
+
+         # Test column order is preserved in the result
+         self.assertListEqual(list(result.columns), list(expected_output.columns))
+
+         # Test that the OBS column is sorted
+         self.assertTrue((result['OBS'].diff().dropna() >= pd.Timedelta(0)).all())
+
+     def test_convert_us_to_uk_dates(self):
+         # Create a test DataFrame
+         test_data = {
+             'date_col': ['01-02-2023', '03/04/2023', '05-06-2023', '07/08/2023']
+         }
+         df = pd.DataFrame(test_data)
+
+         # Expected output
+         expected_output = pd.DataFrame({
+             'date_col': pd.to_datetime(['2023-01-02', '2023-03-04', '2023-05-06', '2023-07-08'])
+         })
+
+         # Test the conversion function
+         result = self.dp.convert_us_to_uk_dates(df.copy(), 'date_col')
+
+         # Assertions
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test invalid input formats
+         invalid_data = pd.DataFrame({'date_col': ['invalid-date', '12345']})
+         with self.assertRaises(ValueError):
+             self.dp.convert_us_to_uk_dates(invalid_data.copy(), 'date_col')
+
+         # Test missing values
+         missing_data = pd.DataFrame({'date_col': [None, '03/04/2023']})
+         result_with_missing = self.dp.convert_us_to_uk_dates(missing_data.copy(), 'date_col')
+         expected_with_missing = pd.DataFrame({
+             'date_col': [pd.NaT, pd.to_datetime('2023-03-04')]
+         })
+         pd.testing.assert_frame_equal(result_with_missing, expected_with_missing)
+
+     def test_pivot_table(self):
+         # Create a test DataFrame
+         test_data = {
+             'date': ['2023-01-01', '2023-01-01', '2023-01-02', '2023-01-02', '2023-01-03'],
+             'category': ['A', 'B', 'A', 'B', 'A'],
+             'value': [10.0, 20.0, 30.0, 40.0, 50.0]
+         }
+         df = pd.DataFrame(test_data)
+
+         # Ensure the 'date' column is in datetime format
+         df['date'] = pd.to_datetime(df['date'])
+
+         # Expected output for basic pivot table
+         expected_output_basic = pd.DataFrame({
+             'date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03']),
+             'A': [10.0, 30.0, 50.0],  # Cast to float64
+             'B': [20.0, 40.0, 0.0]  # Cast to float64
+         })
+         expected_output_basic.columns.name = 'category'
+
+         # Test basic pivot table
+         result_basic = self.dp.pivot_table(df.copy(), index_col='date', columns='category', values_col='value', margins=False, fill_value=0)
+
+         # Convert 'date' columns in both DataFrames to datetime for comparison
+         result_basic['date'] = pd.to_datetime(result_basic['date'])
+         expected_output_basic['date'] = pd.to_datetime(expected_output_basic['date'])
+         pd.testing.assert_frame_equal(result_basic, expected_output_basic)
+
+         # Expected output for pivot table with margins
+         expected_output_with_margins = pd.DataFrame({
+             'date': ['2023-01-01', '2023-01-02', '2023-01-03', 'Total'],
+             'A': [10.0, 30.0, 50.0, 90.0],
+             'B': [20.0, 40.0, 0.0, 60.0],
+             'Total': [30.0, 70.0, 50.0, 150.0]
+         })
+         expected_output_with_margins['date'] = pd.to_datetime(
+             expected_output_with_margins['date'], errors='coerce'
+         ).fillna('Total')
+         expected_output_with_margins.columns.name = 'category'
+
+         # Test pivot table with margins
+         result_with_margins = self.dp.pivot_table(df.copy(), index_col='date', columns='category', values_col='value', margins=True, fill_value=0)
+         result_with_margins['date'] = pd.to_datetime(result_with_margins['date'], errors='coerce').fillna('Total')
+         pd.testing.assert_frame_equal(result_with_margins, expected_output_with_margins)
+
+     def test_apply_lookup_table_for_columns(self):
+         # Create a test DataFrame
+         test_data = {
+             'col1': ['apple', 'banana', 'carrot', 'date', 'eggplant'],
+             'col2': ['fruit', 'fruit', 'vegetable', 'fruit', 'vegetable']
+         }
+         df = pd.DataFrame(test_data)
+
+         # Lookup dictionary
+         lookup_dict = {
+             'apple': 'Red Fruit',
+             'banana': 'Yellow Fruit',
+             'carrot': 'Orange Vegetable',
+             'date': 'Brown Fruit'
+         }
+
+         # Expected output with single column lookup
+         expected_output_single = df.copy()
+         expected_output_single['Mapping'] = ['Red Fruit', 'Yellow Fruit', 'Orange Vegetable', 'Brown Fruit', 'Other']
+
+         # Test with a single column
+         result_single = self.dp.apply_lookup_table_for_columns(df.copy(), col_names=['col1'], to_find_dict=lookup_dict)
+         pd.testing.assert_frame_equal(result_single, expected_output_single)
+
+         # Expected output with multiple column lookup
+         expected_output_multiple = df.copy()
+         expected_output_multiple['Mapping'] = ['Other', 'Other', 'Other', 'Brown Fruit', 'Other']
+
+         # Update lookup dictionary to match merged keys
+         lookup_dict_merged = {
+             'date|fruit': 'Brown Fruit'
+         }
+
+         # Test with multiple columns
+         result_multiple = self.dp.apply_lookup_table_for_columns(df.copy(), col_names=['col1', 'col2'], to_find_dict=lookup_dict_merged)
+         pd.testing.assert_frame_equal(result_multiple, expected_output_multiple)
+
+         # Test case where no match is found
+         df_no_match = pd.DataFrame({'col1': ['unknown']})
+         expected_no_match = df_no_match.copy()
+         expected_no_match['Mapping'] = ['Other']
+         result_no_match = self.dp.apply_lookup_table_for_columns(df_no_match, col_names=['col1'], to_find_dict=lookup_dict)
+         pd.testing.assert_frame_equal(result_no_match, expected_no_match)
+
+     def test_aggregate_daily_to_wc_wide(self):
+         # Create a test DataFrame
+         test_data = {
+             'date': ['2023-01-01', '2023-01-02', '2023-01-08', '2023-01-09', '2023-01-10'],
+             'group': ['A', 'A', 'B', 'B', 'B'],
+             'value1': [10, 20, 30, 40, None],
+             'value2': [100, 200, 300, None, 500]
+         }
+         df = pd.DataFrame(test_data)
+
+         # Expected output for weekly aggregation in wide format
+         expected_output = pd.DataFrame({
+             'OBS': ['2023-01-01', '2023-01-08'],  # Weeks starting on Sunday
+             'value1_A': [30.0, 0.0],
+             'value1_B': [0.0, 70.0],
+             'value2_A': [300.0, 0.0],
+             'value2_B': [0.0, 800.0],
+             'Total value1': [30.0, 70.0],
+             'Total value2': [300.0, 800.0]
+         })
+
+         # Test aggregation with totals included
+         result = self.dp.aggregate_daily_to_wc_wide(
+             df=df.copy(),
+             date_column='date',
+             group_columns=['group'],
+             sum_columns=['value1', 'value2'],
+             wc='sun',
+             aggregation='sum',
+             include_totals=True
+         )
+
+         # Ensure 'OBS' columns are datetime for comparison
+         result['OBS'] = pd.to_datetime(result['OBS'])
+         expected_output['OBS'] = pd.to_datetime(expected_output['OBS'])
+
+         # Compare the resulting DataFrame with the expected DataFrame
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test without group columns (no totals, single wide column)
+         expected_output_no_group = pd.DataFrame({
+             'OBS': ['2023-01-01', '2023-01-08'],
+             'value1': [30.0, 70.0],
+             'value2': [300.0, 800.0]
+         })
+
+         result_no_group = self.dp.aggregate_daily_to_wc_wide(
+             df=df.copy(),
+             date_column='date',
+             group_columns=[],
+             sum_columns=['value1', 'value2'],
+             wc='sun',
+             aggregation='sum',
+             include_totals=False
+         )
+
+         # Ensure 'OBS' columns are datetime for comparison
+         result_no_group['OBS'] = pd.to_datetime(result_no_group['OBS'])
+         expected_output_no_group['OBS'] = pd.to_datetime(expected_output_no_group['OBS'])
+
+         # Compare the resulting DataFrame with the expected DataFrame
+         pd.testing.assert_frame_equal(result_no_group, expected_output_no_group)
+
+     def test_merge_cols_with_seperator(self):
+         # Create a test DataFrame
+         test_data = {
+             'col1': ['apple', 'banana', 'cherry'],
+             'col2': ['red', 'yellow', 'red'],
+             'col3': ['fruit', 'fruit', 'fruit']
+         }
+         df = pd.DataFrame(test_data)
+
+         # Test merging two columns with default separator
+         expected_output_default = df.copy()
+         expected_output_default['Merged'] = ['apple_red', 'banana_yellow', 'cherry_red']
+
+         result_default = self.dp.merge_cols_with_seperator(df.copy(), col_names=['col1', 'col2'])
+         pd.testing.assert_frame_equal(result_default, expected_output_default)
+
+         # Test merging three columns with custom separator
+         expected_output_custom = df.copy()
+         expected_output_custom['Merged'] = ['apple-red-fruit', 'banana-yellow-fruit', 'cherry-red-fruit']
+
+         result_custom = self.dp.merge_cols_with_seperator(df.copy(), col_names=['col1', 'col2', 'col3'], seperator='-')
+         pd.testing.assert_frame_equal(result_custom, expected_output_custom)
+
+         # Test merging with starting and ending strings wrapped around the merged values
+         expected_output_prefix = df.copy()
+         expected_output_prefix['Merged'] = ['Start:apple_red:End', 'Start:banana_yellow:End', 'Start:cherry_red:End']
+
+         result_prefix = self.dp.merge_cols_with_seperator(
+             df.copy(),
+             col_names=['col1', 'col2'],
+             seperator='_',
+             starting_prefix_str='Start:',
+             ending_prefix_str=':End'
+         )
+         pd.testing.assert_frame_equal(result_prefix, expected_output_prefix)
+
+         # Test error for fewer than two columns
+         with self.assertRaises(ValueError):
+             self.dp.merge_cols_with_seperator(df.copy(), col_names=['col1'])
+
+     def test_check_sum_of_df_cols_are_equal(self):
+         # Create test DataFrames
+         df1 = pd.DataFrame({
+             'col1': [1, 2, 3],
+             'col2': [4, 5, 6]
+         })
+
+         df2 = pd.DataFrame({
+             'colA': [1, 2, 3],
+             'colB': [4, 5, 6]
+         })
+
+         df3 = pd.DataFrame({
+             'colX': [1, 2, 3],
+             'colY': [4, 5, 7]
+         })
+
+         # Test case where sums are equal
+         result_equal = self.dp.check_sum_of_df_cols_are_equal(df1, df2, cols_1=['col1', 'col2'], cols_2=['colA', 'colB'])
+         self.assertEqual(result_equal[0], "They are equal")
+         self.assertEqual(result_equal[1], 21)  # Sum of df1's columns
+         self.assertEqual(result_equal[2], 21)  # Sum of df2's columns
+
+         # Test case where sums are not equal
+         result_not_equal = self.dp.check_sum_of_df_cols_are_equal(df1, df3, cols_1=['col1', 'col2'], cols_2=['colX', 'colY'])
+         self.assertTrue(result_not_equal[0].startswith("They are different by "))
+         self.assertEqual(result_not_equal[1], 21)  # Sum of df1's columns
+         self.assertEqual(result_not_equal[2], 22)  # Sum of df3's columns
+
+         # Test case with mismatched column names
+         with self.assertRaises(KeyError):
+             self.dp.check_sum_of_df_cols_are_equal(df1, df2, cols_1=['nonexistent_col'], cols_2=['colA', 'colB'])
+
+         # Test case with empty columns
+         result_empty_cols = self.dp.check_sum_of_df_cols_are_equal(df1, df2, cols_1=[], cols_2=[])
+         self.assertEqual(result_empty_cols[1], 0)  # Sum of empty columns
+         self.assertEqual(result_empty_cols[2], 0)  # Sum of empty columns
+         self.assertEqual(result_empty_cols[0], "They are equal")
+
+     def test_convert_2_df_cols_to_dict(self):
+         # Create a test DataFrame
+         df = pd.DataFrame({
+             'key_col': ['key1', 'key2', 'key3'],
+             'value_col': [10, 20, 30]
+         })
+
+         # Expected dictionary
+         expected_dict = {
+             'key1': 10,
+             'key2': 20,
+             'key3': 30
+         }
+
+         # Test basic functionality
+         result = self.dp.convert_2_df_cols_to_dict(df, 'key_col', 'value_col')
+         self.assertEqual(result, expected_dict)
+
+         # Test with non-unique keys
+         df_non_unique = pd.DataFrame({
+             'key_col': ['key1', 'key2', 'key1'],
+             'value_col': [10, 20, 30]
+         })
+         expected_dict_non_unique = {
+             'key1': 30,  # Last occurrence of 'key1' should overwrite the earlier one
+             'key2': 20
+         }
+         result_non_unique = self.dp.convert_2_df_cols_to_dict(df_non_unique, 'key_col', 'value_col')
+         self.assertEqual(result_non_unique, expected_dict_non_unique)
+
+         # Test with missing key or value column
+         with self.assertRaises(ValueError):
+             self.dp.convert_2_df_cols_to_dict(df, 'missing_key_col', 'value_col')
+
+         with self.assertRaises(ValueError):
+             self.dp.convert_2_df_cols_to_dict(df, 'key_col', 'missing_value_col')
+
+         # Test with empty DataFrame
+         df_empty = pd.DataFrame(columns=['key_col', 'value_col'])
+         expected_empty_dict = {}
+         result_empty = self.dp.convert_2_df_cols_to_dict(df_empty, 'key_col', 'value_col')
+         self.assertEqual(result_empty, expected_empty_dict)
+
+     def test_keyword_lookup_replacement(self):
+         # Create a test DataFrame
+         test_data = {
+             'col1': ['A', 'B', 'C', 'D'],
+             'col2': ['X', 'Y', 'Z', 'W'],
+             'value_col': ['old_value', 'old_value', 'unchanged', 'old_value']
+         }
+         df = pd.DataFrame(test_data)
+
+         # Lookup dictionary for replacements
+         lookup_dict = {
+             'A|X': 'new_value_1',
+             'B|Y': 'new_value_2',
+             'D|W': 'new_value_3'
+         }
+
+         # Expected output
+         expected_output = df.copy()
+         expected_output['Updated Column'] = ['new_value_1', 'new_value_2', 'unchanged', 'new_value_3']
+
+         # Apply the function
+         result = self.dp.keyword_lookup_replacement(
+             df.copy(),
+             col='value_col',
+             replacement_rows='old_value',
+             cols_to_merge=['col1', 'col2'],
+             replacement_lookup_dict=lookup_dict
+         )
+
+         # Compare the resulting DataFrame with the expected DataFrame
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test case where no replacement is needed
+         df_no_replacement = pd.DataFrame({
+             'col1': ['E', 'F'],
+             'col2': ['G', 'H'],
+             'value_col': ['unchanged', 'unchanged']
+         })
+         expected_no_replacement = df_no_replacement.copy()
+         expected_no_replacement['Updated Column'] = ['unchanged', 'unchanged']
+
+         result_no_replacement = self.dp.keyword_lookup_replacement(
+             df_no_replacement.copy(),
+             col='value_col',
+             replacement_rows='old_value',
+             cols_to_merge=['col1', 'col2'],
+             replacement_lookup_dict=lookup_dict
+         )
+
+         pd.testing.assert_frame_equal(result_no_replacement, expected_no_replacement)
+
+     def test_convert_df_wide_2_long(self):
+         # Create a test DataFrame
+         test_data = {
+             'id': [1, 2, 3],
+             'name': ['Alice', 'Bob', 'Charlie'],
+             'score1': [85, 90, 78],
+             'score2': [88, 92, 81]
+         }
+         df = pd.DataFrame(test_data)
+
+         # Expected output for the transformation
+         expected_output = pd.DataFrame({
+             'id': [1, 2, 3, 1, 2, 3],
+             'name': ['Alice', 'Bob', 'Charlie', 'Alice', 'Bob', 'Charlie'],
+             'Stacked': ['score1', 'score1', 'score1', 'score2', 'score2', 'score2'],
+             'Value': [85, 90, 78, 88, 92, 81]
+         })
+
+         # Apply the function
+         result = self.dp.convert_df_wide_2_long(
+             df.copy(),
+             value_cols=['score1', 'score2'],
+             variable_col_name='Stacked',
+             value_col_name='Value'
+         )
+
+         # Compare the resulting DataFrame with the expected DataFrame
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test case with only one column (should raise ValueError)
+         with self.assertRaises(ValueError):
+             self.dp.convert_df_wide_2_long(
+                 df.copy(),
+                 value_cols=['score1'],
+                 variable_col_name='Stacked',
+                 value_col_name='Value'
+             )
+
+         # Test case with no value columns (should raise ValueError)
+         with self.assertRaises(ValueError):
+             self.dp.convert_df_wide_2_long(
+                 df.copy(),
+                 value_cols=[],
+                 variable_col_name='Stacked',
+                 value_col_name='Value'
+             )
+
+     def test_format_numbers_with_commas(self):
+         # Create a test DataFrame
+         test_data = {
+             'col1': [1000, 2500000, 12345.678, None],
+             'col2': [2000.5, 350000.75, 0, -12345],
+             'col3': ['text', 'another text', 50000, 123.45]
+         }
+         df = pd.DataFrame(test_data).fillna(value=pd.NA)  # Normalize None to pd.NA
+
+         # Expected output with 2 decimal places
+         expected_data = {
+             'col1': ['1,000.00', '2,500,000.00', '12,345.68', pd.NA],
+             'col2': ['2,000.50', '350,000.75', '0.00', '-12,345.00'],
+             'col3': ['text', 'another text', '50,000.00', '123.45']
+         }
+         expected_output = pd.DataFrame(expected_data)
+
+         # Apply the function
+         result = self.dp.format_numbers_with_commas(df, decimal_length_chosen=2)
+
+         # Compare the resulting DataFrame with the expected DataFrame
+         pd.testing.assert_frame_equal(result, expected_output, check_dtype=False)
+
+     def test_filter_df_on_multiple_conditions(self):
+         # Create a test DataFrame
+         test_data = {
+             'id': [1, 2, 3, 4, 5],
+             'value': [10, 20, 30, 40, 50],
+             'category': ['A', 'B', 'A', 'C', 'A'],
+             'date': pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05'])
+         }
+         df = pd.DataFrame(test_data)
+
+         # Test Case 1: Single condition (Equality)
+         filters_dict = {'category': "== 'A'"}
+         expected_output = df[df['category'] == 'A']
+         result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test Case 2: Multiple conditions (Equality and Greater Than)
+         filters_dict = {'category': "== 'A'", 'value': "> 20"}
+         expected_output = df[(df['category'] == 'A') & (df['value'] > 20)]
+         result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test Case 3: Date comparison
+         filters_dict = {'date': ">= '2023-01-03'"}
+         expected_output = df[df['date'] >= pd.to_datetime('2023-01-03')]
+         result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test Case 4: Inequality
+         filters_dict = {'value': "!= 30"}
+         expected_output = df[df['value'] != 30]
+         result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test Case 5: Mixed conditions
+         filters_dict = {'category': "== 'A'", 'date': "<= '2023-01-03'"}
+         expected_output = df[(df['category'] == 'A') & (df['date'] <= pd.to_datetime('2023-01-03'))]
+         result = self.dp.filter_df_on_multiple_conditions(df, filters_dict)
+         pd.testing.assert_frame_equal(result, expected_output)
+
+     def test_fill_weekly_date_range(self):
+         # Test input DataFrame
+         test_data = {
+             'date': ['2023-01-02', '2023-01-16', '2023-01-30'],  # Weekly data with gaps
+             'value': [10.0, 20.0, 30.0]
+         }
+         df = pd.DataFrame(test_data)
+         df['date'] = pd.to_datetime(df['date'])
+
+         # Expected output DataFrame
+         expected_data = {
+             'date': ['2023-01-02', '2023-01-09', '2023-01-16', '2023-01-23', '2023-01-30'],
+             'value': [10.0, 0.0, 20.0, 0.0, 30.0]
+         }
+         expected_output = pd.DataFrame(expected_data)
+         expected_output['date'] = pd.to_datetime(expected_output['date'])
+
+         # Call the function using the processor created in setUp
+         result = self.dp.fill_weekly_date_range(df, date_column='date', freq='W-MON')
+
+         # Assert the result matches the expected output
+         pd.testing.assert_frame_equal(result.reset_index(drop=True), expected_output.reset_index(drop=True))
+
+     def test_add_prefix_and_suffix(self):
+         # Test DataFrame
+         test_data = {
+             'date': ['2023-01-01', '2023-01-02', '2023-01-03'],
+             'value1': [10, 20, 30],
+             'value2': [40, 50, 60]
+         }
+         df = pd.DataFrame(test_data)
+
+         # Expected output when no date column is excluded
+         expected_data_no_date_col = {
+             'prefix_date_suffix': ['2023-01-01', '2023-01-02', '2023-01-03'],
+             'prefix_value1_suffix': [10, 20, 30],
+             'prefix_value2_suffix': [40, 50, 60]
+         }
+         expected_output_no_date_col = pd.DataFrame(expected_data_no_date_col)
+
+         # Expected output when date column is excluded
+         expected_data_with_date_col = {
+             'date': ['2023-01-01', '2023-01-02', '2023-01-03'],
+             'prefix_value1_suffix': [10, 20, 30],
+             'prefix_value2_suffix': [40, 50, 60]
+         }
+         expected_output_with_date_col = pd.DataFrame(expected_data_with_date_col)
+
+         # Call the function without excluding a date column, using the processor from setUp
+         result_no_date_col = self.dp.add_prefix_and_suffix(df.copy(), prefix='prefix_', suffix='_suffix')
+
+         # Assert result matches the expected output
+         pd.testing.assert_frame_equal(result_no_date_col, expected_output_no_date_col)
+
+         # Call the function with a date column excluded
+         result_with_date_col = self.dp.add_prefix_and_suffix(df.copy(), prefix='prefix_', suffix='_suffix', date_col='date')
+
+         # Assert result matches the expected output
+         pd.testing.assert_frame_equal(result_with_date_col, expected_output_with_date_col)
+
+     def test_create_dummies(self):
+         # Test Case 1: Basic functionality without date column
+         df = pd.DataFrame({
+             'col1': [0, 1, 2],
+             'col2': [3, 4, 0],
+             'col3': [5, 0, 0]
+         })
+         dummy_threshold = 1
+         expected_output = pd.DataFrame({
+             'col1': [0, 0, 1],
+             'col2': [1, 1, 0],
+             'col3': [1, 0, 0]
+         })
+         result = self.dp.create_dummies(df.copy(), dummy_threshold=dummy_threshold)
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test Case 2: With date column
+         df_with_date = pd.DataFrame({
+             'date': ['2023-01-01', '2023-01-02', '2023-01-03'],
+             'col1': [0, 1, 2],
+             'col2': [3, 4, 0]
+         })
+         expected_output_with_date = pd.DataFrame({
+             'date': ['2023-01-01', '2023-01-02', '2023-01-03'],
+             'col1': [0, 0, 1],
+             'col2': [1, 1, 0]
+         })
+         result_with_date = self.dp.create_dummies(df_with_date.copy(), date_col='date', dummy_threshold=dummy_threshold)
+         pd.testing.assert_frame_equal(result_with_date, expected_output_with_date)
+
+         # Test Case 3: Adding total dummy column
+         expected_output_with_total = expected_output.copy()
+         expected_output_with_total['total'] = [1, 1, 1]
+         result_with_total = self.dp.create_dummies(df.copy(), dummy_threshold=dummy_threshold, add_total_dummy_col='Yes')
+         pd.testing.assert_frame_equal(result_with_total, expected_output_with_total)
+
+         # Test Case 4: Adding total dummy column with date column
+         expected_output_with_date_and_total = expected_output_with_date.copy()
+         expected_output_with_date_and_total['total'] = [1, 1, 1]
+         result_with_date_and_total = self.dp.create_dummies(
+             df_with_date.copy(),
+             date_col='date',
+             dummy_threshold=dummy_threshold,
+             add_total_dummy_col='Yes',
+         )
+         pd.testing.assert_frame_equal(result_with_date_and_total, expected_output_with_date_and_total)
+
+         # Test Case 5: Threshold of 0 (all positive numbers become 1)
+         df_threshold_0 = pd.DataFrame({
+             'col1': [-1, 0, 1],
+             'col2': [0, 2, -3]
+         })
+         expected_output_threshold_0 = pd.DataFrame({
+             'col1': [0, 0, 1],
+             'col2': [0, 1, 0]
+         })
+         result_threshold_0 = self.dp.create_dummies(df_threshold_0.copy(), dummy_threshold=0)
+         pd.testing.assert_frame_equal(result_threshold_0, expected_output_threshold_0)
+
+     def test_replace_substrings(self):
+         # Test Case 1: Basic replacement
+         df = pd.DataFrame({
+             'text': ['hello world', 'python programming', 'hello python']
+         })
+         replacements = {'hello': 'hi', 'python': 'java'}
+         expected_output = pd.DataFrame({
+             'text': ['hi world', 'java programming', 'hi java']
+         })
+         result = self.dp.replace_substrings(df.copy(), 'text', replacements)
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test Case 2: Replacement with to_lower=True
+         df_mixed_case = pd.DataFrame({
+             'text': ['Hello World', 'PYTHON Programming', 'hello PYTHON']
+         })
+         expected_output_lower = pd.DataFrame({
+             'text': ['hi world', 'java programming', 'hi java']
+         })
+         result_lower = self.dp.replace_substrings(df_mixed_case.copy(), 'text', replacements, to_lower=True)
+         pd.testing.assert_frame_equal(result_lower, expected_output_lower)
+
+         # Test Case 3: Replacement with a new column
+         df_new_col = pd.DataFrame({
+             'text': ['hello world', 'python programming', 'hello python']
+         })
+         expected_output_new_col = pd.DataFrame({
+             'text': ['hello world', 'python programming', 'hello python'],
+             'new_text': ['hi world', 'java programming', 'hi java']
+         })
+         result_new_col = self.dp.replace_substrings(df_new_col.copy(), 'text', replacements, new_column='new_text')
+         pd.testing.assert_frame_equal(result_new_col, expected_output_new_col)
+
+     def test_add_total_column(self):
+         # Test Case 1: Basic functionality without excluding any column
+         df = pd.DataFrame({
+             'col1': [1, 2, 3],
+             'col2': [4, 5, 6],
+             'col3': [7, 8, 9]
+         })
+         expected_output = df.copy()
+         expected_output['Total'] = [12, 15, 18]
+         result = self.dp.add_total_column(df.copy())
+         pd.testing.assert_frame_equal(result, expected_output)
+
+         # Test Case 2: Excluding a column from the total
+         df = pd.DataFrame({
+             'col1': [1, 2, 3],
+             'col2': [4, 5, 6],
+             'col3': [7, 8, 9]
+         })
+         expected_output_exclude = pd.DataFrame({
+             'col1': [1, 2, 3],
+             'col2': [4, 5, 6],
+             'col3': [7, 8, 9],
+             'Total': [5, 7, 9]  # Sum without 'col3'
+         })
+         result_exclude = self.dp.add_total_column(df.copy(), exclude_col='col3')
+         pd.testing.assert_frame_equal(result_exclude, expected_output_exclude)
+
+         # Test Case 3: Custom total column name
+         custom_total_col_name = 'Sum'
+         expected_output_custom = df.copy()
+         expected_output_custom[custom_total_col_name] = [12, 15, 18]
+         result_custom = self.dp.add_total_column(df.copy(), total_col_name=custom_total_col_name)
+         pd.testing.assert_frame_equal(result_custom, expected_output_custom)
+
+         # Test Case 4: DataFrame with a single column
+         single_col_df = pd.DataFrame({'col1': [1, 2, 3]})
+         expected_single_col = single_col_df.copy()
+         expected_single_col['Total'] = [1, 2, 3]
+         result_single_col = self.dp.add_total_column(single_col_df.copy())
+         pd.testing.assert_frame_equal(result_single_col, expected_single_col)
+
+     def test_apply_lookup_table_based_on_substring(self):
+         # Test Case 1: Basic categorization
+         df = pd.DataFrame({
+             'text': ['I love apples', 'Bananas are great', 'Something else', 'Grapes are sour']
+         })
+         category_dict = {
+             'apple': 'Fruit',
+             'banana': 'Fruit',
+             'cherry': 'Fruit',
+             'grape': 'Fruit'
+         }
+         expected_output = pd.DataFrame({
+             'text': ['I love apples', 'Bananas are great', 'Something else', 'Grapes are sour'],
+             'Category': ['Fruit', 'Fruit', 'Other', 'Fruit']
+         })
+         result = self.dp.apply_lookup_table_based_on_substring(df.copy(), 'text', category_dict)
+         pd.testing.assert_frame_equal(result, expected_output)
+
+     def test_compare_overlap(self):
+         """
+         Test the compare_overlap function to ensure it calculates differences
+         and their totals correctly across overlapping date ranges.
+         """
+         # 1. Create sample data for df1 (covers 2021-01-01 to 2021-01-04)
+         df1_data = [
+             {"date": "2021-01-01", "value": 10, "count": 1},
+             {"date": "2021-01-02", "value": 15, "count": 2},
+             {"date": "2021-01-03", "value": 20, "count": 3},
+             {"date": "2021-01-04", "value": 25, "count": 4},
+         ]
+         df1 = pd.DataFrame(df1_data)
+
+         # 2. Create sample data for df2 (covers 2021-01-03 to 2021-01-05)
+         df2_data = [
+             {"date": "2021-01-03", "value": 22, "count": 2},
+             {"date": "2021-01-04", "value": 20, "count": 5},
+             {"date": "2021-01-05", "value": 30, "count": 6},
+         ]
+         df2 = pd.DataFrame(df2_data)
+
+         # 3. Call compare_overlap from your dataprocessing class
+         diff_df, total_diff_df = self.dp.compare_overlap(df1, df2, 'date')
+
+         # 4. Define the expected outputs over the overlapping dates
+         expected_diff_df = pd.DataFrame({
+             'date': pd.to_datetime(['2021-01-03', '2021-01-04']),
+             'diff_value': [-2, 5],
+             'diff_count': [1, -1]
+         })
+
+         expected_total_diff_df = pd.DataFrame({
+             'Column': ['value', 'count'],
+             'Total Difference': [3, 0]
+         })
+
+         # 5. Use pd.testing.assert_frame_equal to check the outputs
+         # Sort and reset index to ensure matching row order
+         pd.testing.assert_frame_equal(
+             diff_df.sort_values('date').reset_index(drop=True),
+             expected_diff_df.sort_values('date').reset_index(drop=True)
+         )
+
+         # Sort by 'Column' to ensure matching row order in summary
+         pd.testing.assert_frame_equal(
+             total_diff_df.sort_values('Column').reset_index(drop=True),
+             expected_total_diff_df.sort_values('Column').reset_index(drop=True)
+         )
+
+     def test_week_commencing_2_week_commencing_conversion_isoweekday(self):
+         """
+         Test the isoweekday-based function to confirm each date is mapped back
+         to the 'week_commencing' day of that ISO week.
+         """
+         # 2023-01-01 was a Sunday; we'll go through Saturday (7 days).
+         df = pd.DataFrame({"date": pd.date_range("2023-01-01", periods=7, freq="D")})
+         expected_mon = pd.Series(
+             [
+                 pd.Timestamp("2022-12-26"),  # Sunday -> previous Monday
+                 pd.Timestamp("2023-01-02"),  # Monday
+                 pd.Timestamp("2023-01-02"),  # Tuesday
+                 pd.Timestamp("2023-01-02"),  # Wednesday
+                 pd.Timestamp("2023-01-02"),  # Thursday
+                 pd.Timestamp("2023-01-02"),  # Friday
+                 pd.Timestamp("2023-01-02"),  # Saturday
+             ],
+             name="week_start_mon"
+         )
+
+         # Use the new function from our data processing object
+         result = self.dp.week_commencing_2_week_commencing_conversion_isoweekday(
+             df.copy(),
+             date_col="date",
+             week_commencing="mon"
+         )
+
+         # Compare the 'week_start_mon' column with our expected results
+         pd.testing.assert_series_equal(
+             result["week_start_mon"],  # actual
+             expected_mon  # expected
+         )
+
+
+
+ ###################################################################################################################################################
+ ###################################################################################################################################################
+
+ # class TestDataPull(unittest.TestCase):
+
+
+ if __name__ == '__main__':
+     unittest.main()
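
Note: the tests above pin down expected behaviour rather than implementation. As an illustration only, here is a minimal sketch consistent with test_aggregate_daily_to_wc_long; the actual method inside mmm is not shown in this diff and may differ:

    import pandas as pd

    def aggregate_daily_to_wc_long(df, date_column, group_columns, sum_columns,
                                   wc="sun", aggregation="sum"):
        # Illustrative sketch only: snap each date back to the chosen
        # week-commencing day, then aggregate within each (week, group) cell.
        out = df.copy()
        out[date_column] = pd.to_datetime(out[date_column])
        start = {"mon": 0, "sun": 6}[wc]  # pandas weekday number of the week start
        days_back = (out[date_column].dt.weekday - start) % 7
        out["OBS"] = out[date_column] - pd.to_timedelta(days_back, unit="D")
        return (out.groupby(["OBS"] + group_columns)[sum_columns]
                   .agg(aggregation)
                   .reset_index())

Snapping each date to its week-commencing day before grouping reproduces the totals asserted in the test (70.0 and 800.0 for group B), because a grouped sum in pandas skips NaN values by default.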