captest-0.13.3rc1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
captest/capdata.py ADDED
@@ -0,0 +1,3380 @@
1
+ """
2
+ Provides the CapData class and supporting functions.
3
+
4
+ The CapData class provides methods for loading, filtering, and regressing solar
5
+ data. A capacity test following the ASTM standard can be performed using a
6
+ CapData object for the measured data and a separate CapData object for the
7
+ modeled data. The get_summary and captest_results functions accept two CapData
8
+ objects as arguments and provide a summary of the data filtering steps and
9
+ the results of the capacity test, respectively.
10
+ """
11
+
12
+ # standard library imports
13
+ import re
14
+ import copy
15
+ from functools import wraps
16
+ from itertools import combinations
17
+ import warnings
18
+ import pytz
19
+ import importlib
20
+
21
+ # anaconda distribution defaults
22
+ import numpy as np
23
+ import pandas as pd
24
+
25
+ # anaconda distribution defaults
26
+ # statistics and machine learning imports
27
+ import statsmodels.formula.api as smf
28
+
29
+ # from sklearn.covariance import EllipticEnvelope
30
+ import sklearn.covariance as sk_cv
31
+
32
+ # anaconda distribution defaults
33
+ # visualization library imports
34
+ from bokeh.models import HoverTool, NumeralTickFormatter
35
+
36
+ import param
37
+
38
+ from captest import util
39
+ from captest import plotting
40
+
41
+ # visualization library imports
42
+ hv_spec = importlib.util.find_spec("holoviews")
43
+ if hv_spec is not None:
44
+ import holoviews as hv
45
+ from holoviews.plotting.links import DataLink
46
+ from holoviews import opts
47
+
48
+ hv.extension("bokeh")
49
+ else:
50
+ warnings.warn(
51
+ "Some plotting functions will not work without the holoviews package."
52
+ )
53
+
54
+ pn_spec = importlib.util.find_spec("panel")
55
+ if pn_spec is not None:
56
+ import panel as pn
57
+
58
+ pn.extension()
59
+ else:
60
+ warnings.warn(
61
+ "The ReportingIrradiance.dashboard method will not work without "
62
+ "the panel package."
63
+ )
64
+
65
+ xlsx_spec = importlib.util.find_spec("openpyxl")
66
+ if xlsx_spec is None:
67
+ warnings.warn(
68
+ "Specifying a column grouping in an excel file will not work without "
69
+ "the openpyxl package."
70
+ )
71
+
72
+ # pvlib imports
73
+ pvlib_spec = importlib.util.find_spec("pvlib")
74
+ if pvlib_spec is not None:
75
+ from pvlib.location import Location
76
+ from pvlib.pvsystem import PVSystem, Array, FixedMount, SingleAxisTrackerMount
77
+ from pvlib.pvsystem import retrieve_sam
78
+ from pvlib.modelchain import ModelChain
79
+ from pvlib.clearsky import detect_clearsky
80
+ else:
81
+ warnings.warn("Clear sky functions will not work without the pvlib package.")
82
+
83
+
84
+ plot_colors_brewer = {
85
+ "real_pwr": ["#2b8cbe", "#7bccc4", "#bae4bc", "#f0f9e8"],
86
+ "irr_poa": ["#e31a1c", "#fd8d3c", "#fecc5c", "#ffffb2"],
87
+ "irr_ghi": ["#91003f", "#e7298a", "#c994c7", "#e7e1ef"],
88
+ "temp_amb": ["#238443", "#78c679", "#c2e699", "#ffffcc"],
89
+ "temp_mod": ["#88419d", "#8c96c6", "#b3cde3", "#edf8fb"],
90
+ "wind": ["#238b45", "#66c2a4", "#b2e2e2", "#edf8fb"],
91
+ }
92
+
93
+ met_keys = ["poa", "t_amb", "w_vel", "power"]
94
+
95
+
96
+ columns = ["pts_after_filter", "pts_removed", "filter_arguments"]
97
+
98
+
99
+ def round_kwarg_floats(kwarg_dict, decimals=3):
100
+ """
101
+ Round float values in a dictionary.
102
+
103
+ Parameters
104
+ ----------
105
+ kwarg_dict : dict
106
+ decimals : int, default 3
107
+ Number of decimal places to round to.
108
+
109
+ Returns
110
+ -------
111
+ dict
112
+ Dictionary with rounded floats.
113
+ """
114
+ rounded_vals = []
115
+ for val in kwarg_dict.values():
116
+ if isinstance(val, float):
117
+ rounded_vals.append(round(val, decimals))
118
+ else:
119
+ rounded_vals.append(val)
120
+ return {key: val for key, val in zip(kwarg_dict.keys(), rounded_vals)}
121
+
122
+
123
+ def tstamp_kwarg_to_strings(kwarg_dict):
124
+ """
125
+ Convert timestamp values in dictionary to strings.
126
+
127
+ Parameters
128
+ ----------
129
+ kwarg_dict : dict
130
+
131
+ Returns
132
+ -------
133
+ dict
134
+ """
135
+ output_vals = []
136
+ for val in kwarg_dict.values():
137
+ if isinstance(val, pd.Timestamp):
138
+ output_vals.append(val.strftime("%Y-%m-%d %H:%M"))
139
+ else:
140
+ output_vals.append(val)
141
+ return {key: val for key, val in zip(kwarg_dict.keys(), output_vals)}
142
+
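These two helpers keep the filter-summary argument strings readable. A minimal doctest-style sketch (editorial example, not part of the packaged file):

>>> import pandas as pd
>>> from captest.capdata import round_kwarg_floats, tstamp_kwarg_to_strings
>>> round_kwarg_floats({"low": 0.43219, "ref_val": 512.3468})
{'low': 0.432, 'ref_val': 512.347}
>>> tstamp_kwarg_to_strings({"start": pd.Timestamp("2022-06-01 11:30")})
{'start': '2022-06-01 11:30'}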
143
+
144
+ def update_summary(func):
145
+ """
146
+ Decorates the CapData class filter methods.
147
+
148
+ Updates the CapData.summary and CapData.summary_ix attributes, which
149
+ are used to generate summary data by the CapData.get_summary method.
150
+
151
+ Todo
152
+ ----
153
+ not in place
154
+ Check if summary is updated when function is called with inplace=False.
155
+ It should not be.
156
+ """
157
+
158
+ @wraps(func)
159
+ def wrapper(self, *args, **kwargs):
160
+ pts_before = self.data_filtered.shape[0]
161
+ ix_before = self.data_filtered.index
162
+ if pts_before == 0:
163
+ pts_before = self.data.shape[0]
164
+ self.summary_ix.append((self.name, "count"))
165
+ self.summary.append(
166
+ {columns[0]: pts_before, columns[1]: 0, columns[2]: "no filters"}
167
+ )
168
+
169
+ ret_val = func(self, *args, **kwargs)
170
+
171
+ arg_str = args.__repr__()
172
+ lst = arg_str.split(",")
173
+ arg_lst = [item.strip("()") for item in lst]
174
+ arg_lst_one = arg_lst[0]
175
+ if arg_lst_one == "das" or arg_lst_one == "sim":
176
+ arg_lst = arg_lst[1:]
177
+ arg_str = ", ".join(arg_lst)
178
+
179
+ func_re = re.compile("<function (.*) at", re.IGNORECASE)
180
+ if func_re.search(arg_str) is not None:
181
+ custom_func_name = func_re.search(arg_str).group(1)
182
+ arg_str = re.sub("<function.*>", custom_func_name, arg_str)
183
+
184
+ kwargs = round_kwarg_floats(kwargs)
185
+ kwargs = tstamp_kwarg_to_strings(kwargs)
186
+ kwarg_str = kwargs.__repr__()
187
+ kwarg_str = kwarg_str.strip("{}")
188
+ kwarg_str = kwarg_str.replace("'", "")
189
+
190
+ if len(arg_str) == 0 and len(kwarg_str) == 0:
191
+ arg_str = "Default arguments"
192
+ elif len(arg_str) == 0:
193
+ arg_str = kwarg_str
194
+ else:
195
+ arg_str = arg_str + ", " + kwarg_str
196
+
197
+ filter_name = func.__name__
198
+ if filter_name in self.filter_counts.keys():
199
+ filter_name_enum = filter_name + "-" + str(self.filter_counts[filter_name])
200
+ self.filter_counts[filter_name] += 1
201
+ else:
202
+ self.filter_counts[filter_name] = 1
203
+ filter_name_enum = filter_name
204
+
205
+ pts_after = self.data_filtered.shape[0]
206
+ pts_removed = pts_before - pts_after
207
+ self.summary_ix.append((self.name, filter_name_enum))
208
+ self.summary.append(
209
+ {columns[0]: pts_after, columns[1]: pts_removed, columns[2]: arg_str}
210
+ )
211
+
212
+ ix_after = self.data_filtered.index
213
+ self.removed.append(
214
+ {"name": filter_name_enum, "index": ix_before.difference(ix_after)}
215
+ )
216
+ self.kept.append({"name": filter_name_enum, "index": ix_after})
217
+
218
+ if pts_after == 0:
219
+ warnings.warn(
220
+ "The last filter removed all data! "
221
+ "Calling additional filtering or visualization "
222
+ "methods that reference the data_filtered attribute "
223
+ "will raise an error."
224
+ )
225
+
226
+ return ret_val
227
+
228
+ return wrapper
229
+
230
+
231
+ def wrap_year_end(df, start, end):
232
+ """
233
+ Shifts data before or after the new year to form a contiguous time period.
234
+
235
+ This function shifts data from the end of the year a year back or data from
236
+ the beginning of the year a year forward, to create a contiguous time
237
+ period. Intended to be used on historical typical year data.
238
+
239
+ If start date is in dataframe, then data at the beginning of the year will
240
+ be moved ahead one year. If end date is in dataframe, then data at the end
241
+ of the year will be moved back one year.
242
+
243
+ cntg (contiguous); eoy (end of year)
244
+
245
+ Parameters
246
+ ----------
247
+ df: pandas DataFrame
248
+ Dataframe to be adjusted.
249
+ start: pandas Timestamp
250
+ Start date for time period.
251
+ end: pandas Timestamp
252
+ End date for time period.
253
+
254
+ Todo
255
+ ----
256
+ Need to test and debug this for years not matching.
257
+ """
258
+ if df.index[0].year == start.year:
259
+ df_start = df.loc[start:, :]
260
+
261
+ df_end = df.copy()
262
+ df_end.index = df_end.index + pd.DateOffset(days=365)
263
+ df_end = df_end.loc[:end, :]
264
+
265
+ elif df.index[0].year == end.year:
266
+ df_end = df.loc[:end, :]
267
+
268
+ df_start = df.copy()
269
+ df_start.index = df_start.index - pd.DateOffset(days=365)
270
+ df_start = df_start.loc[start:, :]
271
+
272
+ df_return = pd.concat([df_start, df_end], axis=0)
273
+ ix_series = df_return.index.to_series()
274
+ df_return["index"] = ix_series.apply(lambda x: x.strftime("%m/%d/%Y %H %M")) # noqa E501
275
+ return df_return
276
+
277
+
278
+ def spans_year(start_date, end_date):
279
+ """
280
+ Determine if the passed dates fall in different years.
281
+
282
+ Parameters
283
+ ----------
284
+ start_date: pandas Timestamp
285
+ end_date: pandas Timestamp
286
+
287
+ Returns
288
+ -------
289
+ bool
290
+ """
291
+ if start_date.year != end_date.year:
292
+ return True
293
+ else:
294
+ return False
295
+
296
+
297
+ def wrap_seasons(df, freq):
298
+ """
299
+ Rearrange an 8760 so a quarterly groupby will result in seasonal groups.
300
+
301
+ Parameters
302
+ ----------
303
+ df : DataFrame
304
+ Dataframe to be rearranged.
305
+ freq : str
306
+ String pandas offset alias to specify aggregation frequency
307
+ for reporting condition calculation.
308
+
309
+ Returns
310
+ -------
311
+ DataFrame
312
+
313
+ Todo
314
+ ----
315
+ Write unit test
316
+ BQ-NOV vs BQS vs QS
317
+ Need to review if BQ is the correct offset alias vs BQS or QS.
318
+ """
319
+ check_freqs = [
320
+ "BQ-JAN",
321
+ "BQ-FEB",
322
+ "BQ-APR",
323
+ "BQ-MAY",
324
+ "BQ-JUL",
325
+ "BQ-AUG",
326
+ "BQ-OCT",
327
+ "BQ-NOV",
328
+ ]
329
+ month_int = {
330
+ "JAN": 1,
331
+ "FEB": 2,
332
+ "APR": 4,
333
+ "MAY": 5,
334
+ "JUL": 7,
335
+ "AUG": 8,
336
+ "OCT": 10,
337
+ "NOV": 11,
338
+ }
339
+
340
+ if freq in check_freqs:
341
+ warnings.warn(
342
+ "DataFrame index adjusted to be continous through new"
343
+ "year, but not returned or set to attribute for user."
344
+ "This is not an issue if using RCs with"
345
+ "predict_capacities."
346
+ )
347
+ if isinstance(freq, str):
348
+ month = month_int[freq.split("-")[1]]
349
+ else:
350
+ month = freq.startingMonth
351
+ year = df.index[0].year
352
+ months_year_end = 12 - month
353
+ months_year_start = 3 - months_year_end
354
+ if int(month) >= 10:
355
+ str_date = str(months_year_start) + "/" + str(year)
356
+ else:
357
+ str_date = str(month) + "/" + str(year)
358
+ tdelta = df.index[1] - df.index[0]
359
+ date_to_offset = df.loc[str_date].index[-1].to_pydatetime()
360
+ start = date_to_offset + tdelta
361
+ end = date_to_offset + pd.DateOffset(years=1)
362
+ if month < 8 or month >= 10:
363
+ df = wrap_year_end(df, start, end)
364
+ else:
365
+ df = wrap_year_end(df, end, start)
366
+ return df
367
+ else:
368
+ return df
369
+
370
+
371
+ def perc_wrap(p):
372
+ """Wrap numpy percentile function for use in rep_cond method."""
373
+
374
+ def numpy_percentile(x):
375
+ return np.percentile(x.T, p, interpolation="nearest")
376
+
377
+ return numpy_percentile
378
+
379
+
380
+ def perc_bounds(percent_filter):
381
+ """
382
+ Convert +/- percentage to decimals to be used to determine bounds.
383
+
384
+ Parameters
385
+ ----------
386
+ percent_filter : float or tuple, default None
387
+ Percentage or tuple of percentages used to filter around reporting
388
+ irradiance in the irr_rc_balanced function. Required argument when
389
+ irr_bal is True.
390
+
391
+ Returns
392
+ -------
393
+ tuple
394
+ Decimal versions of the percent irradiance filter. 0.8 and 1.2 would be
395
+ returned when passing 20 to the input.
396
+ """
397
+ if isinstance(percent_filter, tuple):
398
+ perc_low = percent_filter[0] / 100
399
+ perc_high = percent_filter[1] / 100
400
+ else:
401
+ perc_low = percent_filter / 100
402
+ perc_high = percent_filter / 100
403
+ low = 1 - (perc_low)
404
+ high = 1 + (perc_high)
405
+ return (low, high)
406
+
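perc_bounds converts a percent band into the low/high multipliers used by the irradiance filters. A quick doctest-style sketch (editorial example, not part of the packaged file):

>>> from captest.capdata import perc_bounds
>>> perc_bounds(20)
(0.8, 1.2)
>>> perc_bounds((10, 30))
(0.9, 1.3)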
407
+
408
+ def perc_difference(x, y):
409
+ """Calculate percent difference of two values."""
410
+ if x == y == 0:
411
+ return 0
412
+ else:
413
+ if x + y == 0:
414
+ return 1
415
+ else:
416
+ return abs(x - y) / ((x + y) / 2)
417
+
418
+
419
+ def check_all_perc_diff_comb(series, perc_diff):
420
+ """
421
+ Check series for pairs of values with percent difference above perc_diff.
422
+
423
+ Calculates the percent difference between all combinations of two values in
424
+ the passed series and checks if all of them are below the passed perc_diff.
425
+
426
+ Parameters
427
+ ----------
428
+ series : pd.Series
429
+ Pandas series of values to check.
430
+ perc_diff : float
431
+ Percent difference threshold value as decimal i.e. 5% is 0.05.
432
+
433
+ Returns
434
+ -------
435
+ bool
436
+ """
437
+ c = combinations(series.__iter__(), 2)
438
+ return all([perc_difference(x, y) < perc_diff for x, y in c])
439
+
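A short sketch of how these two checks behave for a row of redundant sensor readings (editorial example, not part of the packaged file):

>>> import pandas as pd
>>> from captest.capdata import perc_difference, check_all_perc_diff_comb
>>> round(perc_difference(400, 402), 4)
0.005
>>> check_all_perc_diff_comb(pd.Series([400, 402, 405]), 0.05)
True
>>> check_all_perc_diff_comb(pd.Series([400, 402, 500]), 0.05)
False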
440
+
441
+ def abs_diff_from_average(series, threshold):
442
+ """Check each value in series <= average of other values.
443
+
444
+ Drops NaNs from series before calculating difference from average for each value.
445
+
446
+ Returns True if there is only one value in the series.
447
+
448
+ Parameters
449
+ ----------
450
+ series : pd.Series
451
+ Pandas series of values to check.
452
+ threshold : numeric
453
+ Threshold value for absolute difference from average.
454
+
455
+ Returns
456
+ -------
457
+ bool
458
+ """
459
+ series = series.dropna()
460
+ if len(series) == 1:
461
+ return True
462
+ abs_diffs = []
463
+ for i, val in enumerate(series):
464
+ abs_diffs.append(abs(val - series.drop(series.index[i]).mean()) <= threshold)
465
+ return all(abs_diffs)
466
+
467
+
468
+ def sensor_filter(df, threshold, row_filter=check_all_perc_diff_comb):
469
+ """
470
+ Check dataframe for rows with inconsistent values.
471
+
472
+ Applies the row_filter function along the rows of the passed dataframe.
473
+
474
+ Parameters
475
+ ----------
476
+ df : pandas DataFrame
477
+ threshold : numeric
478
+ Threshold value passed to the row_filter function.
479
+ """
480
+ if df.shape[1] >= 2:
481
+ bool_ser = df.apply(row_filter, args=(threshold,), axis=1)
482
+ return df[bool_ser].index
483
+ elif df.shape[1] == 1:
484
+ return df.index
485
+
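A sketch of sensor_filter on a two-sensor irradiance frame; the column names are made up for illustration (editorial example, not part of the packaged file):

>>> import pandas as pd
>>> from captest.capdata import sensor_filter, abs_diff_from_average
>>> poa = pd.DataFrame({"poa_1": [400.0, 400.0], "poa_2": [402.0, 900.0]})
>>> list(sensor_filter(poa, 0.05))  # rows where sensors agree within 5 percent
[0]
>>> list(sensor_filter(poa, 10, row_filter=abs_diff_from_average))  # +/- 10 W/m^2 band
[0]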
486
+
487
+ def filter_irr(df, irr_col, low, high, ref_val=None):
488
+ """
489
+ Top level filter on irradiance values.
490
+
491
+ Parameters
492
+ ----------
493
+ df : DataFrame
494
+ Dataframe to be filtered.
495
+ irr_col : str
496
+ String that is the name of the column with the irradiance data.
497
+ low : float or int
498
+ Minimum value as a fraction (e.g. 0.8) or an absolute value (e.g. 200 W/m^2).
499
+ high : float or int
500
+ Maximum value as a fraction (e.g. 1.2) or an absolute value (e.g. 800 W/m^2).
501
+ ref_val : float or int
502
+ Required when low and high are fractions.
503
+
504
+ Returns
505
+ -------
506
+ DataFrame
507
+ """
508
+ if ref_val is not None:
509
+ low *= ref_val
510
+ high *= ref_val
511
+
512
+ df_renamed = df.rename(columns={irr_col: "poa"})
513
+
514
+ flt_str = "@low <= " + "poa" + " <= @high"
515
+ indx = df_renamed.query(flt_str).index
516
+
517
+ return df.loc[indx, :]
518
+
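filter_irr is the building block behind the irradiance filtering methods; a minimal sketch with a made-up frame (editorial example, not part of the packaged file):

>>> import pandas as pd
>>> from captest.capdata import filter_irr
>>> df = pd.DataFrame({"poa": [250, 480, 520, 790]})
>>> filter_irr(df, "poa", 400, 600)["poa"].tolist()  # absolute bounds in W/m^2
[480, 520]
>>> filter_irr(df, "poa", 0.8, 1.2, ref_val=500)["poa"].tolist()  # +/- 20% of 500
[480, 520]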
519
+
520
+ def filter_grps(grps, rcs, irr_col, low, high, freq, **kwargs):
521
+ """
522
+ Apply irradiance filter around passed reporting irradiances to groupby.
523
+
524
+ For each group in the grps argument the irradiance is filtered by a
525
+ percentage around the reporting irradiance provided in rcs.
526
+
527
+ Parameters
528
+ ----------
529
+ grps : pandas groupby
530
+ Groupby object with time groups (months, seasons, etc.).
531
+ rcs : pandas DataFrame
532
+ Dataframe of reporting conditions. Use the rep_cond method to generate
533
+ a dataframe for this argument.
534
+ irr_col : str
535
+ String that is the name of the column with the irradiance data.
536
+ low : float
537
+ Minimum value as fraction e.g. 0.8.
538
+ high : float
539
+ Max value as fraction e.g. 1.2.
540
+ freq : str
541
+ Frequency to groupby e.g. 'MS' for month start.
542
+ **kwargs
543
+ Passed to pandas Grouper to control label and closed side of intervals.
544
+ See pandas Grouper documentation for details. Default is left labeled
545
+ and left closed.
546
+
547
+ Returns
548
+ -------
549
+ pandas groupby
550
+ """
551
+ flt_dfs = []
552
+ for grp_name, grp_df in grps:
553
+ ref_val = rcs.loc[grp_name, "poa"]
554
+ grp_df_flt = filter_irr(grp_df, irr_col, low, high, ref_val=ref_val)
555
+ flt_dfs.append(grp_df_flt)
556
+ df_flt = pd.concat(flt_dfs)
557
+ df_flt_grpby = df_flt.groupby(pd.Grouper(freq=freq, **kwargs))
558
+ return df_flt_grpby
559
+
560
+
561
+ class ReportingIrradiance(param.Parameterized):
562
+ df = param.DataFrame(
563
+ doc="Data to use to calculate reporting irradiance.", precedence=-1
564
+ )
565
+ irr_col = param.String(
566
+ default="GlobInc",
567
+ doc="Name of column in `df` containing irradiance data.",
568
+ precedence=-1,
569
+ )
570
+ irr_rc = param.Number(precedence=-1)
571
+ poa_flt = param.DataFrame(precedence=-1)
572
+ total_pts = param.Number(precedence=-1)
573
+ rc_irr_60th_perc = param.Number(precedence=-1)
574
+ percent_band = param.Integer(20, softbounds=(2, 50), step=1)
575
+ min_percent_below = param.Integer(
576
+ default=40,
577
+ doc="Minimum number of points as a percentage allowed below the \
578
+ reporting irradiance.",
579
+ )
580
+ max_percent_above = param.Integer(
581
+ default=60,
582
+ doc="Maximum number of points as a percentage allowed above the \
583
+ reporting irradiance.",
584
+ )
585
+ min_ref_irradiance = param.Integer(
586
+ default=None, doc="Minimum value allowed for the reference irradiance."
587
+ )
588
+ max_ref_irradiance = param.Integer(
589
+ None,
590
+ doc="Maximum value allowed for the reference irradiance. By default this\
591
+ maximum is calculated by dividing the highest irradiance value in `df`\
592
+ by `high`.",
593
+ )
594
+ points_required = param.Integer(
595
+ default=750,
596
+ doc="This is value is only used in the plot to overlay a horizontal \
597
+ line on the plot of the total points.",
598
+ )
599
+
600
+ def __init__(self, df, irr_col, **param):
601
+ super().__init__(**param)
602
+ self.df = df
603
+ self.irr_col = irr_col
604
+ self.rc_irr_60th_perc = np.percentile(self.df[self.irr_col], 60)
605
+
606
+ def get_rep_irr(self):
607
+ """
608
+ Calculates the reporting irradiance.
609
+
610
+ Returns
611
+ -------
612
+ Tuple
613
+ Float reporting irradiance and filtered dataframe.
614
+ """
615
+ low, high = perc_bounds(self.percent_band)
616
+ poa_flt = self.df.copy()
617
+
618
+ poa_flt.sort_values(self.irr_col, inplace=True)
619
+
620
+ poa_flt["plus_perc"] = poa_flt[self.irr_col] * high
621
+ poa_flt["minus_perc"] = poa_flt[self.irr_col] * low
622
+
623
+ poa_flt["below_count"] = [
624
+ poa_flt[self.irr_col].between(low, ref).sum()
625
+ for low, ref in zip(poa_flt["minus_perc"], poa_flt[self.irr_col])
626
+ ]
627
+ poa_flt["above_count"] = [
628
+ poa_flt[self.irr_col].between(ref, high).sum()
629
+ for ref, high in zip(poa_flt[self.irr_col], poa_flt["plus_perc"])
630
+ ]
631
+
632
+ poa_flt["total_pts"] = poa_flt["above_count"] + poa_flt["below_count"]
633
+ poa_flt["perc_above"] = (poa_flt["above_count"] / poa_flt["total_pts"]) * 100
634
+ poa_flt["perc_below"] = (poa_flt["below_count"] / poa_flt["total_pts"]) * 100
635
+
636
+ # set index to the poa irradiance
637
+ poa_flt.set_index(self.irr_col, inplace=True)
638
+
639
+ if self.max_ref_irradiance is None:
640
+ self.max_ref_irradiance = int(poa_flt.index[-1] / high)
641
+ if self.min_ref_irradiance is None:
642
+ self.min_ref_irradiance = int(poa_flt.index[0] / low)
643
+ if self.min_ref_irradiance > self.max_ref_irradiance:
644
+ warnings.warn(
645
+ "The minimum reference irradiance ({:.2f}) is greater than the maximum "
646
+ "reference irradiance ({:.2f}). Setting the minimum to 400 and the "
647
+ "maximum to 1000.".format(
648
+ self.min_ref_irradiance, self.max_ref_irradiance
649
+ )
650
+ )
651
+ self.min_ref_irradiance = 400
652
+ self.max_ref_irradiance = 1000
653
+
654
+ # determine ref irradiance by finding 50/50 irradiance in upper group of data
655
+ poa_flt["valid"] = poa_flt["perc_below"].between(
656
+ self.min_percent_below, self.max_percent_above
657
+ ) & poa_flt.index.to_series().between(
658
+ self.min_ref_irradiance, self.max_ref_irradiance
659
+ )
660
+ if poa_flt["valid"].sum() == 0:
661
+ self.poa_flt = poa_flt
662
+ self.irr_rc = np.nan
663
+ warnings.warn(
664
+ "No valid reference irradiance found. Try reviewing the min and max "
665
+ "reference irradiance values and the min and max percent below and "
666
+ "above values. The dashboard method will show these values with "
667
+ "related plots and allow you to adjust them."
668
+ )
669
+ return None
670
+ poa_flt["perc_below_minus_50_abs"] = (poa_flt["perc_below"] - 50).abs()
671
+ valid_df = poa_flt[poa_flt["valid"]].copy()
672
+ valid_df.sort_values("perc_below_minus_50_abs", inplace=True)
673
+ # if there is more than one point that is exactly 50 percent below and
674
+ # 50 percent above, then pick the one that results in the most points
675
+ self.valid_df = valid_df
676
+ fifty_fifty_points = valid_df["perc_below_minus_50_abs"] == 0
677
+ if (fifty_fifty_points).sum() > 1:
678
+ possible_points = poa_flt.loc[
679
+ fifty_fifty_points[fifty_fifty_points].index, "total_pts"
680
+ ]
681
+ possible_points.sort_values(ascending=False, inplace=True)
682
+ irr_RC = possible_points.index[0]
683
+ else:
684
+ irr_RC = valid_df.index[0]
685
+ flt_df = filter_irr(self.df, self.irr_col, low, high, ref_val=irr_RC)
686
+ self.irr_rc = irr_RC
687
+ self.poa_flt = poa_flt
688
+ self.total_pts = poa_flt.loc[self.irr_rc, "total_pts"]
689
+
690
+ return (irr_RC, flt_df)
691
+
692
+ def save_plot(self, output_plot_path=None):
693
+ """
694
+ Save a plot of the possible reporting irradiances and time intervals.
695
+
696
+ Saves plot as an html file at path given.
697
+
698
+ output_plot_path : str or Path
699
+ Path to save plot to.
700
+ """
701
+ hv.save(self.plot(), output_plot_path, fmt="html", toolbar=True)
702
+
703
+ def save_csv(self, output_csv_path):
704
+ """
705
+ Save possible reporting irradiance data to csv file at given path.
706
+ """
707
+ self.poa_flt.to_csv(output_csv_path)
708
+
709
+ @param.depends(
710
+ "percent_band",
711
+ "min_percent_below",
712
+ "max_percent_above",
713
+ "min_ref_irradiance",
714
+ "points_required",
715
+ "max_ref_irradiance",
716
+ )
717
+ def plot(self):
718
+ self.get_rep_irr()
719
+ below_count_scatter = hv.Scatter(
720
+ self.poa_flt["below_count"].reset_index(),
721
+ ["poa"],
722
+ ["below_count"],
723
+ label="Count pts below",
724
+ )
725
+ above_count_scatter = hv.Scatter(
726
+ self.poa_flt["above_count"].reset_index(),
727
+ ["poa"],
728
+ ["above_count"],
729
+ label="Count pts above",
730
+ )
731
+ if self.irr_rc is not np.nan:
732
+ count_ellipse = hv.Ellipse(
733
+ self.irr_rc, self.poa_flt.loc[self.irr_rc, "below_count"], (20, 50)
734
+ )
735
+ perc_below_scatter = (
736
+ hv.Scatter(
737
+ self.poa_flt["perc_below"].reset_index(), ["poa"], ["perc_below"]
738
+ )
739
+ * hv.HLine(self.min_percent_below)
740
+ * hv.HLine(self.max_percent_above)
741
+ * hv.VLine(self.min_ref_irradiance)
742
+ * hv.VLine(self.max_ref_irradiance)
743
+ )
744
+ if self.irr_rc is not np.nan:
745
+ perc_ellipse = hv.Ellipse(
746
+ self.irr_rc, self.poa_flt.loc[self.irr_rc, "perc_below"], (20, 10)
747
+ )
748
+ total_points_scatter = hv.Scatter(
749
+ self.poa_flt["total_pts"].reset_index(), ["poa"], ["total_pts"]
750
+ ) * hv.HLine(self.points_required)
751
+ if self.irr_rc is not np.nan:
752
+ total_points_ellipse = hv.Ellipse(
753
+ self.irr_rc, self.poa_flt.loc[self.irr_rc, "total_pts"], (20, 50)
754
+ )
755
+
756
+ ylim_bottom = self.poa_flt["total_pts"].min() - 20
757
+ if self.total_pts < self.points_required:
758
+ ylim_top = self.points_required + 20
759
+ else:
760
+ ylim_top = self.total_pts + 50
761
+ vl = hv.VLine(self.rc_irr_60th_perc).opts(line_color="gray")
762
+ if self.irr_rc is not np.nan:
763
+ rep_cond_plot = (
764
+ (
765
+ (
766
+ below_count_scatter * above_count_scatter * count_ellipse * vl
767
+ ).opts(ylabel="count points")
768
+ + (perc_below_scatter * perc_ellipse).opts(ylim=(0, 100))
769
+ + (total_points_scatter * total_points_ellipse).opts(
770
+ ylim=(ylim_bottom, ylim_top)
771
+ )
772
+ )
773
+ .opts(
774
+ opts.HLine(line_width=1),
775
+ opts.VLine(line_width=1),
776
+ opts.Scatter(
777
+ size=4,
778
+ show_legend=True,
779
+ legend_position="right",
780
+ tools=["hover"],
781
+ ),
782
+ opts.Overlay(width=700),
783
+ opts.Layout(
784
+ title="Reporting Irradiance: {:0.2f}, Total Points {}".format(
785
+ self.irr_rc, self.total_pts
786
+ )
787
+ ),
788
+ )
789
+ .cols(1)
790
+ )
791
+ else:
792
+ rep_cond_plot = (
793
+ (
794
+ (below_count_scatter * above_count_scatter * vl).opts(
795
+ ylabel="count points"
796
+ )
797
+ + perc_below_scatter.opts(ylim=(0, 100))
798
+ + total_points_scatter.opts(ylim=(ylim_bottom, ylim_top))
799
+ )
800
+ .opts(
801
+ opts.HLine(line_width=1),
802
+ opts.VLine(line_width=1),
803
+ opts.Scatter(
804
+ size=4,
805
+ show_legend=True,
806
+ legend_position="right",
807
+ tools=["hover"],
808
+ ),
809
+ opts.Overlay(width=700),
810
+ opts.Layout(
811
+ title=(
812
+ "Reporting Irradiance: None identified, "
813
+ f"Total Points {self.total_pts}"
814
+ )
815
+ ),
816
+ )
817
+ .cols(1)
818
+ )
819
+ return rep_cond_plot
820
+
821
+ def dashboard(self):
822
+ return pn.Row(self.param, self.plot)
823
+
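The class above is usually driven through get_rep_irr or the interactive dashboard. A hedged usage sketch; meas_df and the column name are assumptions for illustration, not part of the packaged file:

rep_irr = ReportingIrradiance(meas_df, "POA_pyranometer_avg", percent_band=20)
result = rep_irr.get_rep_irr()  # returns None when no valid reporting irradiance is found
if result is not None:
    irr_rc, poa_within_band = result
rep_irr.dashboard()  # interactive review of the settings; requires the panel package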
824
+
825
+ def fit_model(
826
+ df, fml="power ~ poa + I(poa * poa) + I(poa * t_amb) + I(poa * w_vel) - 1"
827
+ ): # noqa E501
828
+ """
829
+ Fits a linear regression using statsmodels to the passed dataframe.
830
+
831
+ Dataframe must be first argument for use with pandas groupby object
832
+ apply method.
833
+
834
+ Parameters
835
+ ----------
836
+ df : pandas dataframe
837
+ fml : str
838
+ Formula to fit; refer to statsmodels and patsy documentation for format.
839
+ Default is the formula in ASTM E2848.
840
+
841
+ Returns
842
+ -------
843
+ Statsmodels linear model regression results wrapper object.
844
+ """
845
+ mod = smf.ols(formula=fml, data=df)
846
+ reg = mod.fit()
847
+ return reg
848
+
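A sketch of fitting the default ASTM E2848 formula; reg_df is a hypothetical dataframe with power, poa, t_amb, and w_vel columns (editorial example, not part of the packaged file):

import pandas as pd
from captest.capdata import fit_model

reg = fit_model(reg_df)
print(reg.params)  # coefficients for poa, poa*poa, poa*t_amb, and poa*w_vel
reg.predict(pd.DataFrame({"poa": [500.0], "t_amb": [20.0], "w_vel": [2.0]}))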
849
+
850
+ def predict(regs, rcs):
851
+ """
852
+ Calculate predicted values for given linear models and predictor values.
853
+
854
+ Evaluates the first linear model in the iterable with the first row of the
855
+ predictor values in the dataframe. Passed arguments must be aligned.
856
+
857
+ Parameters
858
+ ----------
859
+ regs : iterable of statsmodels regression results wrappers
860
+ rcs : pandas dataframe
861
+ Dataframe of predictor values used to evaluate each linear model.
862
+ The column names must match the strings used in the regression
863
+ formula.
864
+
865
+ Returns
866
+ -------
867
+ Pandas series of predicted values.
868
+ """
869
+ pred_cap = list()
870
+ for i, mod in enumerate(regs):
871
+ RC_df = pd.DataFrame(rcs.iloc[i, :]).T
872
+ pred_cap.append(mod.predict(RC_df).values[0])
873
+ return pd.Series(pred_cap)
874
+
875
+
876
+ def pred_summary(grps, rcs, allowance, **kwargs):
877
+ """
878
+ Summarize reporting conditions, predicted cap, and guaranteed cap.
879
+
880
+ This method does not calculate reporting conditions.
881
+
882
+ Parameters
883
+ ----------
884
+ grps : pandas groupby object
885
+ Solar data grouped by season or month used to calculate reporting
886
+ conditions. This argument is used to fit models for each group.
887
+ rcs : pandas dataframe
888
+ Dataframe of reporting conditions used to predict capacities.
889
+ allowance : float
890
+ Percent allowance to calculate guaranteed capacity from predicted
891
+ capacity.
892
+
893
+ Returns
894
+ -------
895
+ Dataframe of reporting conditions, model coefficients, predicted capacities,
896
+ guaranteed capacities, and points in each grouping.
897
+ """
898
+ regs = grps.apply(fit_model, **kwargs)
899
+ predictions = predict(regs, rcs)
900
+ params = regs.apply(lambda x: x.params.transpose())
901
+ pt_qty = grps.agg("count").iloc[:, 0]
902
+ predictions.index = pt_qty.index
903
+
904
+ params.index = pt_qty.index
905
+ rcs.index = pt_qty.index
906
+ predictions.name = "PredCap"
907
+
908
+ for rc_col_name in rcs.columns:
909
+ for param_col_name in params.columns:
910
+ if rc_col_name == param_col_name:
911
+ new_col_name = param_col_name + "-param"
912
+ params.rename(columns={param_col_name: new_col_name}, inplace=True)
913
+
914
+ results = pd.concat([rcs, predictions, params], axis=1)
915
+
916
+ results["guaranteedCap"] = results["PredCap"] * (1 - allowance)
917
+ results["pt_qty"] = pt_qty.values
918
+
919
+ return results
920
+
921
+
922
+ def pvlib_location(loc):
923
+ """
924
+ Create a pvlib location object.
925
+
926
+ Parameters
927
+ ----------
928
+ loc : dict
929
+ Dictionary of values required to instantiate a pvlib Location object.
930
+
931
+ loc = {'latitude': float,
932
+ 'longitude': float,
933
+ 'altitude': float/int,
934
+ 'tz': str, int, float, or pytz.timezone, default 'UTC'}
935
+ See
936
+ http://en.wikipedia.org/wiki/List_of_tz_database_time_zones
937
+ for a list of valid time zones.
938
+ pytz.timezone objects will be converted to strings.
939
+ ints and floats must be in hours from UTC.
940
+
941
+ Returns
942
+ -------
943
+ pvlib location object.
944
+ """
945
+ return Location(**loc)
946
+
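A minimal loc dictionary and the resulting pvlib Location; the coordinates are arbitrary example values (editorial addition, not part of the packaged file):

from captest.capdata import pvlib_location

loc = {
    "latitude": 39.74,
    "longitude": -105.18,
    "altitude": 1829,
    "tz": "America/Denver",
}
location = pvlib_location(loc)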
947
+
948
+ def pvlib_system(sys):
949
+ """
950
+ Create a pvlib :py:class:`~pvlib.pvsystem.PVSystem` object.
951
+
952
+ The :py:class:`~pvlib.pvsystem.PVSystem` will have either a
953
+ :py:class:`~pvlib.pvsystem.FixedMount` or a
954
+ :py:class:`~pvlib.pvsystem.SingleAxisTrackerMount` depending on
955
+ the keys of the passed dictionary.
956
+
957
+ Parameters
958
+ ----------
959
+ sys : dict
960
+ Dictionary of keywords required to create a pvlib
961
+ ``SingleAxisTrackerMount`` or ``FixedMount``, plus ``albedo``.
962
+
963
+ Example dictionaries:
964
+
965
+ fixed_sys = {'surface_tilt': 20,
966
+ 'surface_azimuth': 180,
967
+ 'albedo': 0.2}
968
+
969
+ tracker_sys1 = {'axis_tilt': 0, 'axis_azimuth': 0,
970
+ 'max_angle': 90, 'backtrack': True,
971
+ 'gcr': 0.2, 'albedo': 0.2}
972
+
973
+ Refer to pvlib documentation for details.
974
+
975
+ Returns
976
+ -------
977
+ pvlib PVSystem object.
978
+ """
979
+ sandia_modules = retrieve_sam("SandiaMod")
980
+ cec_inverters = retrieve_sam("cecinverter")
981
+ sandia_module = sandia_modules.iloc[:, 0]
982
+ cec_inverter = cec_inverters.iloc[:, 0]
983
+
984
+ albedo = sys.pop("albedo", None)
985
+ trck_kwords = ["axis_tilt", "axis_azimuth", "max_angle", "backtrack", "gcr"] # noqa: E501
986
+ if any(kword in sys.keys() for kword in trck_kwords):
987
+ mount = SingleAxisTrackerMount(**sys)
988
+ else:
989
+ mount = FixedMount(**sys)
990
+ array = Array(
991
+ mount,
992
+ albedo=albedo,
993
+ module_parameters=sandia_module,
994
+ temperature_model_parameters={"u_c": 29.0, "u_v": 0.0},
995
+ )
996
+ system = PVSystem(arrays=[array], inverter_parameters=cec_inverter)
997
+
998
+ return system
999
+
1000
+
1001
+ def get_tz_index(time_source, loc):
1002
+ """
1003
+ Create DatetimeIndex with timezone aligned with location dictionary.
1004
+
1005
+ Handles generating a DatetimeIndex with a timezone for use as an argument
1006
+ to pvlib ModelChain prepare_inputs method or pvlib Location get_clearsky
1007
+ method.
1008
+
1009
+ Parameters
1010
+ ----------
1011
+ time_source : dataframe or DatetimeIndex
1012
+ If passing a dataframe the index of the dataframe will be used. If the
1013
+ index does not have a timezone the timezone will be set using the
1014
+ timezone in the passed loc dictionary. If passing a DatetimeIndex with
1015
+ a timezone it will be returned directly. If passing a DatetimeIndex
1016
+ without a timezone the timezone in the loc dictionary will be
1017
+ used.
1018
+
1019
+ Returns
1020
+ -------
1021
+ DatetimeIndex with timezone
1022
+ """
1023
+ if isinstance(time_source, pd.core.indexes.datetimes.DatetimeIndex):
1024
+ if time_source.tz is None:
1025
+ time_source = time_source.tz_localize(
1026
+ loc["tz"], ambiguous="infer", nonexistent="NaT"
1027
+ )
1028
+ return time_source
1029
+ else:
1030
+ if pytz.timezone(loc["tz"]) != time_source.tz:
1031
+ warnings.warn(
1032
+ "Passed a DatetimeIndex with a timezone that "
1033
+ "does not match the timezone in the loc dict. "
1034
+ "Using the timezone of the DatetimeIndex."
1035
+ )
1036
+ return time_source
1037
+ elif isinstance(time_source, pd.core.frame.DataFrame):
1038
+ if time_source.index.tz is None:
1039
+ return time_source.index.tz_localize(
1040
+ loc["tz"], ambiguous="infer", nonexistent="NaT"
1041
+ )
1042
+ else:
1043
+ if pytz.timezone(loc["tz"]) != time_source.index.tz:
1044
+ warnings.warn(
1045
+ "Passed a DataFrame with a timezone that "
1046
+ "does not match the timezone in the loc dict. "
1047
+ "Using the timezone of the DataFrame."
1048
+ )
1049
+ return time_source.index
1050
+
1051
+
1052
+ def csky(time_source, loc=None, sys=None, concat=True, output="both"):
1053
+ """
1054
+ Calculate clear sky poa and ghi.
1055
+
1056
+ Parameters
1057
+ ----------
1058
+ time_source : dataframe or DatetimeIndex
1059
+ If passing a dataframe the index of the dataframe will be used. If the
1060
+ index does not have a timezone the timezone will be set using the
1061
+ timezone in the passed loc dictionary. If passing a DatetimeIndex with
1062
+ a timezone it will be returned directly. If passing a DatetimeIndex
1063
+ without a timezone the timezone in the loc dictionary will
1064
+ be used.
1065
+ loc : dict
1066
+ Dictionary of values required to instantiate a pvlib Location object.
1067
+
1068
+ loc = {'latitude': float,
1069
+ 'longitude': float,
1070
+ 'altitude': float/int,
1071
+ 'tz': str, int, float, or pytz.timezone, default 'UTC'}
1072
+ See
1073
+ http://en.wikipedia.org/wiki/List_of_tz_database_time_zones
1074
+ for a list of valid time zones.
1075
+ pytz.timezone objects will be converted to strings.
1076
+ ints and floats must be in hours from UTC.
1077
+ sys : dict
1078
+ Dictionary of keywords required to create a pvlib
1079
+ :py:class:`~pvlib.pvsystem.SingleAxisTrackerMount` or
1080
+ :py:class:`~pvlib.pvsystem.FixedMount`.
1081
+
1082
+ Example dictionaries:
1083
+
1084
+ fixed_sys = {'surface_tilt': 20,
1085
+ 'surface_azimuth': 180,
1086
+ 'albedo': 0.2}
1087
+
1088
+ tracker_sys1 = {'axis_tilt': 0, 'axis_azimuth': 0,
1089
+ 'max_angle': 90, 'backtrack': True,
1090
+ 'gcr': 0.2, 'albedo': 0.2}
1091
+
1092
+ Refer to pvlib documentation for details.
1093
+ concat : bool, default True
1094
+ If concat is True then returns columns as defined by the output argument
1095
+ added to passed dataframe, otherwise returns just clear sky data.
1096
+ output : str, default 'both'
1097
+ both - returns only total poa and ghi
1098
+ poa_all - returns all components of poa
1099
+ ghi_all - returns all components of ghi
1100
+ all - returns all components of poa and ghi
1101
+ """
1102
+ location = pvlib_location(loc)
1103
+ system = pvlib_system(sys)
1104
+ mc = ModelChain(system, location)
1105
+ times = get_tz_index(time_source, loc)
1106
+ ghi = location.get_clearsky(times=times)
1107
+ # pvlib get_clearsky also returns 'wind_speed' and 'temp_air'
1108
+ mc.prepare_inputs(weather=ghi)
1109
+ cols = [
1110
+ "poa_global",
1111
+ "poa_direct",
1112
+ "poa_diffuse",
1113
+ "poa_sky_diffuse",
1114
+ "poa_ground_diffuse",
1115
+ ]
1116
+
1117
+ if output == "both":
1118
+ csky_df = pd.DataFrame(
1119
+ {
1120
+ "poa_mod_csky": mc.results.total_irrad["poa_global"],
1121
+ "ghi_mod_csky": ghi["ghi"],
1122
+ }
1123
+ )
1124
+ if output == "poa_all":
1125
+ csky_df = mc.results.total_irrad[cols]
1126
+ if output == "ghi_all":
1127
+ csky_df = ghi[["ghi", "dni", "dhi"]]
1128
+ if output == "all":
1129
+ csky_df = pd.concat(
1130
+ [mc.results.total_irrad[cols], ghi[["ghi", "dni", "dhi"]]], axis=1
1131
+ )
1132
+
1133
+ ix_no_tz = csky_df.index.tz_localize(None, ambiguous="infer", nonexistent="NaT")
1134
+ csky_df.index = ix_no_tz
1135
+
1136
+ if concat:
1137
+ if isinstance(time_source, pd.core.frame.DataFrame):
1138
+ try:
1139
+ df_with_csky = pd.concat([time_source, csky_df], axis=1)
1140
+ except pd.errors.InvalidIndexError:
1141
+ # Drop NaT that occur for March DST shift in US data
1142
+ df_with_csky = pd.concat(
1143
+ [time_source, csky_df.loc[csky_df.index.dropna(), :]], axis=1
1144
+ )
1145
+ return df_with_csky
1146
+ else:
1147
+ warnings.warn(
1148
+ "time_source is not a dataframe; only clear sky data\
1149
+ returned"
1150
+ )
1151
+ return csky_df
1152
+ else:
1153
+ return csky_df
1154
+
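A hedged sketch of adding modeled clear sky columns to a measured dataset; meas_df (a dataframe with a DatetimeIndex), loc, and fixed_sys are assumptions for illustration, not part of the packaged file:

from captest.capdata import csky

loc = {"latitude": 39.74, "longitude": -105.18, "altitude": 1829, "tz": "America/Denver"}
fixed_sys = {"surface_tilt": 20, "surface_azimuth": 180, "albedo": 0.2}
# with output="both" and concat=True this appends 'poa_mod_csky' and 'ghi_mod_csky'
meas_df_csky = csky(meas_df, loc=loc, sys=fixed_sys, concat=True, output="both")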
1155
+
1156
+ def get_summary(*args):
1157
+ """
1158
+ Return summary dataframe of filtering steps for multiple CapData objects.
1159
+
1160
+ See documentation for the CapData.get_summary method for additional
1161
+ details.
1162
+ """
1163
+ summaries = [cd.get_summary() for cd in args]
1164
+ return pd.concat(summaries)
1165
+
1166
+
1167
+ def pick_attr(sim, das, name):
1168
+ """Check for conflict between attributes of two CapData objects."""
1169
+ sim_attr = getattr(sim, name)
1170
+ das_attr = getattr(das, name)
1171
+ if sim_attr is None and das_attr is None:
1172
+ warn_str = "{} must be set for either sim or das".format(name)
1173
+ return warnings.warn(warn_str)
1174
+ elif sim_attr is None and das_attr is not None:
1175
+ return (das_attr, "das")
1176
+ elif sim_attr is not None and das_attr is None:
1177
+ return (sim_attr, "sim")
1178
+ elif sim_attr is not None and das_attr is not None:
1179
+ warn_str = "{} found for sim and das set {} to None for one of the two".format(
1180
+ name, name
1181
+ )
1182
+ return warnings.warn(warn_str)
1183
+
1184
+
1185
+ def determine_pass_or_fail(cap_ratio, tolerance, nameplate):
1186
+ """
1187
+ Determine a pass/fail result from a capacity ratio and test tolerance.
1188
+
1189
+ Parameters
1190
+ ----------
1191
+ cap_ratio : float
1192
+ Ratio of the measured data regression result to the simulated data
1193
+ regression result.
1194
+ tolerance : str
1195
+ String representing error band. Ex. '+/- 3' or '- 5'
1196
+ There must be space between the sign and number. Number is
1197
+ interpreted as a percent. For example, 5 percent is 5 not 0.05.
1198
+ nameplate : numeric
1199
+ Nameplate rating of the PV plant.
1200
+
1201
+ Returns
1202
+ -------
1203
+ tuple of boolean and string
1204
+ True for a passing test and false for a failing test.
1205
+ Limits for passing and failing test.
1206
+ """
1207
+ sign = tolerance.split(sep=" ")[0]
1208
+ error = float(tolerance.split(sep=" ")[1]) / 100
1209
+
1210
+ nameplate_plus_error = nameplate * (1 + error)
1211
+ nameplate_minus_error = nameplate * (1 - error)
1212
+
1213
+ if sign == "+/-" or sign == "-/+":
1214
+ return (
1215
+ round(np.abs(1 - cap_ratio), ndigits=6) <= error,
1216
+ str(nameplate_minus_error) + ", " + str(nameplate_plus_error),
1217
+ )
1218
+ elif sign == "-":
1219
+ return (cap_ratio >= 1 - error, str(nameplate_minus_error) + ", None")
1220
+ else:
1221
+ warnings.warn("Sign must be '-', '+/-', or '-/+'.")
1222
+
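A quick check of the pass/fail logic (editorial example, not part of the packaged file):

from captest.capdata import determine_pass_or_fail

passed, bounds = determine_pass_or_fail(0.98, "+/- 3", 20000)
# passed is True because |1 - 0.98| <= 0.03; bounds holds the nameplate +/- 3% limits
passed, bounds = determine_pass_or_fail(0.96, "- 3", 20000)
# passed is False because 0.96 < 1 - 0.03; bounds gives only the lower limit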
1223
+
1224
+ def captest_results(
1225
+ sim, das, nameplate, tolerance, check_pvalues=False, pval=0.05, print_res=True
1226
+ ):
1227
+ """
1228
+ Print a summary indicating if system passed or failed capacity test.
1229
+
1230
+ NOTE: Method will try to adjust for 1000x differences in units.
1231
+
1232
+ Parameters
1233
+ ----------
1234
+ sim : CapData
1235
+ CapData object for simulated data.
1236
+ das : CapData
1237
+ CapData object for measured data.
1238
+ nameplate : numeric
1239
+ Nameplate rating of the PV plant.
1240
+ tolerance : str
1241
+ String representing error band. Ex. '+/- 3', '- 5'
1242
+ There must be space between the sign and number. Number is
1243
+ interpreted as a percent. For example, 5 percent is 5 not 0.05.
1244
+ check_pvalues : boolean, default False
1245
+ Set to true to check p values for each coefficient. If a p value is
1246
+ greater than pval, then the coefficient is set to zero.
1247
+ pval : float, default 0.05
1248
+ p value to use as cutoff. Regression coefficients with a p value
1249
+ greater than pval will be set to zero.
1250
+ print_res : boolean, default True
1251
+ Set to False to prevent printing results.
1252
+
1253
+ Returns
1254
+ -------
1255
+ Capacity test ratio - the capacity calculated from the reporting conditions
1256
+ and the measured data divided by the capacity calculated from the reporting
1257
+ conditions and the simulated data.
1258
+ """
1259
+ sim_int = sim.copy()
1260
+ das_int = das.copy()
1261
+
1262
+ if sim_int.regression_formula != das_int.regression_formula:
1263
+ return warnings.warn("CapData objects do not have the sameregression formula.")
1264
+
1265
+ if check_pvalues:
1266
+ for cd in [sim_int, das_int]:
1267
+ for key, val in cd.regression_results.pvalues.items():
1268
+ if val > pval:
1269
+ cd.regression_results.params[key] = 0
1270
+
1271
+ rc = pick_attr(sim_int, das_int, "rc")
1272
+ if print_res:
1273
+ print("Using reporting conditions from {}. \n".format(rc[1]))
1274
+ rc = rc[0]
1275
+
1276
+ actual = das_int.regression_results.predict(rc)[0]
1277
+ expected = sim_int.regression_results.predict(rc)[0]
1278
+ cap_ratio = actual / expected
1279
+ if cap_ratio < 0.01:
1280
+ cap_ratio *= 1000
1281
+ actual *= 1000
1282
+ warnings.warn(
1283
+ "Capacity ratio and actual capacity multiplied by 1000"
1284
+ " because the capacity ratio was less than 0.01."
1285
+ )
1286
+ capacity = nameplate * cap_ratio
1287
+
1288
+ if print_res:
1289
+ test_passed = determine_pass_or_fail(cap_ratio, tolerance, nameplate)
1290
+ print_results(
1291
+ test_passed, expected, actual, cap_ratio, capacity, test_passed[1]
1292
+ )
1293
+
1294
+ return cap_ratio
1295
+
1296
+
1297
+ def print_results(test_passed, expected, actual, cap_ratio, capacity, bounds):
1298
+ """Print formatted results of capacity test."""
1299
+ if test_passed[0]:
1300
+ print("{:<30s}{}".format("Capacity Test Result:", "PASS"))
1301
+ else:
1302
+ print("{:<25s}{}".format("Capacity Test Result:", "FAIL"))
1303
+
1304
+ print(
1305
+ "{:<30s}{:0.3f}".format("Modeled test output:", expected)
1306
+ + "\n"
1307
+ + "{:<30s}{:0.3f}".format("Actual test output:", actual)
1308
+ + "\n"
1309
+ + "{:<30s}{:0.3f}".format("Tested output ratio:", cap_ratio)
1310
+ + "\n"
1311
+ + "{:<30s}{:0.3f}".format("Tested Capacity:", capacity)
1312
+ )
1313
+
1314
+ print("{:<30s}{}\n\n".format("Bounds:", test_passed[1]))
1315
+
1316
+
1317
+ def highlight_pvals(s):
1318
+ """Highlight vals greater than or equal to 0.05 in a Series yellow."""
1319
+ is_greaterthan = s >= 0.05
1320
+ return ["background-color: yellow" if v else "" for v in is_greaterthan]
1321
+
1322
+
1323
+ def captest_results_check_pvalues(
1324
+ sim, das, nameplate, tolerance, print_res=False, **kwargs
1325
+ ):
1326
+ """
1327
+ Print a summary of the capacity test results.
1328
+
1329
+ Capacity ratio is the capacity calculated from the reporting conditions
1330
+ and the measured data divided by the capacity calculated from the reporting
1331
+ conditions and the simulated data.
1332
+
1333
+ The tolerance is applied to the capacity test ratio to determine if the
1334
+ test passes or fails.
1335
+
1336
+ Parameters
1337
+ ----------
1338
+ sim : CapData
1339
+ CapData object for simulated data.
1340
+ das : CapData
1341
+ CapData object for measured data.
1342
+ nameplate : numeric
1343
+ Nameplate rating of the PV plant.
1344
+ tolerance : str
1345
+ String representing error band. Ex. '+ 3', '+/- 3', '- 5'
1346
+ There must be space between the sign and number. Number is
1347
+ interpreted as a percent. For example, 5 percent is 5 not 0.05.
1348
+ print_res : boolean, default False
1349
+ Set to False to prevent printing results.
1350
+ **kwargs
1351
+ kwargs are passed to captest_results. See documentation for
1352
+ captest_results for options. check_pvalues is set in this method,
1353
+ so do not pass again.
1354
+
1355
+ Prints:
1356
+ Capacity ratio without setting parameters with high p-values to zero.
1357
+ Capacity ratio after setting parameters with high p-values to zero.
1358
+ P-values for simulated and measured regression coefficients.
1359
+ Regression coefficients (parameters) for simulated and measured data.
1360
+ """
1361
+ das_pvals = das.regression_results.pvalues
1362
+ sim_pvals = sim.regression_results.pvalues
1363
+ das_params = das.regression_results.params
1364
+ sim_params = sim.regression_results.params
1365
+
1366
+ df_pvals = pd.DataFrame([das_pvals, sim_pvals, das_params, sim_params])
1367
+ df_pvals = df_pvals.transpose()
1368
+ df_pvals.rename(
1369
+ columns={0: "das_pvals", 1: "sim_pvals", 2: "das_params", 3: "sim_params"},
1370
+ inplace=True,
1371
+ )
1372
+
1373
+ cap_ratio = captest_results(
1374
+ sim,
1375
+ das,
1376
+ nameplate,
1377
+ tolerance,
1378
+ print_res=print_res,
1379
+ check_pvalues=False,
1380
+ **kwargs,
1381
+ )
1382
+ cap_ratio_check_pvalues = captest_results(
1383
+ sim,
1384
+ das,
1385
+ nameplate,
1386
+ tolerance,
1387
+ print_res=print_res,
1388
+ check_pvalues=True,
1389
+ **kwargs,
1390
+ )
1391
+
1392
+ cap_ratio_rounded = np.round(cap_ratio, decimals=4) * 100
1393
+ cap_ratio_check_pvalues_rounded = (
1394
+ np.round(cap_ratio_check_pvalues, decimals=4) * 100
1395
+ )
1396
+
1397
+ result_str = "{:.3f}% - Cap Ratio"
1398
+ print(result_str.format(cap_ratio_rounded))
1399
+
1400
+ result_str_pval_check = "{:.3f}% - Cap Ratio after pval check"
1401
+ print(result_str_pval_check.format(cap_ratio_check_pvalues_rounded))
1402
+
1403
+ return df_pvals.style.format("{:20,.5f}").apply(
1404
+ highlight_pvals, subset=["das_pvals", "sim_pvals"]
1405
+ )
1406
+
1407
+
1408
+ def run_test(cd, steps):
1409
+ """
1410
+ Apply a list of capacity test steps to a given CapData object.
1411
+
1412
+ A list of CapData methods is applied sequentially with the passed
1413
+ parameters. This method allows succinctly defining a capacity test,
1414
+ which facilitates parametric and automatic testing.
1415
+
1416
+ Parameters
1417
+ ----------
1418
+ cd : CapData
1419
+ The CapData methods will be applied to this instance of the pvcaptest
1420
+ CapData class.
1421
+ steps : list of tuples
1422
+ A list of the methods to be applied and the arguments to be used.
1423
+ Each item in the list should be a tuple of the CapData method followed
1424
+ by a tuple of arguments and a dictionary of keyword arguments. If
1425
+ there are no args or kwargs, an empty tuple or dict should be included.
1426
+ Example: [(CapData.filter_irr, (400, 1500), {})]
1427
+ """
1428
+ for step in steps:
1429
+ step[0](cd, *step[1], **step[2])
1430
+
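A hedged sketch of defining a short test as a list of steps; das_capdata is assumed to be a loaded CapData object and rep_cond is the reporting-conditions method referenced elsewhere in this module (editorial example, not part of the packaged file):

from captest.capdata import CapData, run_test

steps = [
    (CapData.filter_irr, (200, 2000), {}),
    (CapData.rep_cond, (), {}),
]
run_test(das_capdata, steps)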
1431
+
1432
+ def overlay_scatters(measured, expected, expected_label="PVsyst"):
1433
+ """
1434
+ Plot labeled overlay scatter of final filtered measured and simulated data.
1435
+
1436
+ Parameters
1437
+ ----------
1438
+ measured : Overlay
1439
+ Holoviews overlay scatter plot produced from CapData object used to
1440
+ calculate reporting conditions.
1441
+ expected : Overlay
1442
+ Holoviews overlay scatter plot produced from CapData object not used to
1443
+ calculate reporting conditions.
1444
+ expected_label : str, default 'PVsyst'
1445
+ Label applied to the scatter of the expected (simulated)
1446
+ data in the plot legend.
1447
+
1448
+ Returns
1449
+ -------
1450
+ Overlay scatter plot of remaining data after filtering from measured and
1451
+ simulated data.
1452
+ """
1453
+ meas_last_filter_scatter = getattr(
1454
+ measured.Scatter, measured.Scatter.children[-1]
1455
+ ).relabel("Measured")
1456
+ exp_last_filter_scatter = getattr(
1457
+ expected.Scatter, expected.Scatter.children[-1]
1458
+ ).relabel(expected_label)
1459
+ overlay = (meas_last_filter_scatter * exp_last_filter_scatter).opts(
1460
+ hv.opts.Overlay(legend_position="right")
1461
+ )
1462
+ return overlay
1463
+
1464
+
1465
+ def index_capdata(capdata, label, filtered=True):
1466
+ """
1467
+ Like DataFrame.loc but for CapData objects.
1468
+
1469
+ Pass a single label or list of labels to select the columns from the `data` or
1470
+ `data_filtered` DataFrames. The label can be a column name, a column group key, or
1471
+ a regression column key.
1472
+
1473
+ The special label `regcols` will return the columns identified in `regression_cols`.
1474
+
1475
+ Parameters
1476
+ ----------
1477
+ capdata : CapData
1478
+ The CapData object to select from.
1479
+ label : str or list
1480
+ The label or list of labels to select from the `data` or `data_filtered`
1481
+ DataFrames. The label can be a column name, a column group key, or a
1482
+ regression column key. The special label `regcols` will return the columns
1483
+ identified in `regression_cols`.
1484
+ filtered : bool, default True
1485
+ By default the method will return columns from the `data_filtered` DataFrame.
1486
+ Set to False to return columns from the `data` DataFrame.
1487
+
1488
+ Returns
1489
+ --------
1490
+ DataFrame
1491
+ """
1492
+ if filtered:
1493
+ data = capdata.data_filtered
1494
+ else:
1495
+ data = capdata.data
1496
+ if label == "regcols":
1497
+ label = list(capdata.regression_cols.values())
1498
+ if isinstance(label, str):
1499
+ if label in capdata.column_groups.keys():
1500
+ selected_data = data[capdata.column_groups[label]]
1501
+ elif label in capdata.regression_cols.keys():
1502
+ col_or_grp = capdata.regression_cols[label]
1503
+ if col_or_grp in capdata.column_groups.keys():
1504
+ selected_data = data[capdata.column_groups[col_or_grp]]
1505
+ elif col_or_grp in data.columns:
1506
+ selected_data = data[col_or_grp]
1507
+ else:
1508
+ warnings.warn(
1509
+ 'Group or column "{}" mapped to the "{}" key of regression_cols '
1510
+ "not found in column_groups keys or columns of CapData.data".format(
1511
+ col_or_grp, label
1512
+ )
1513
+ )
1514
+ elif label in data.columns:
1515
+ selected_data = data.loc[:, label]
1516
+ if isinstance(selected_data, pd.Series):
1517
+ return selected_data.to_frame()
1518
+ else:
1519
+ return selected_data
1520
+ elif isinstance(label, list):
1521
+ cols_to_return = []
1522
+ for label_item in label:
1523
+ if label_item in capdata.column_groups.keys():
1524
+ cols_to_return.extend(capdata.column_groups[label_item])
1525
+ elif label_item in capdata.regression_cols.keys():
1526
+ col_or_grp = capdata.regression_cols[label_item]
1527
+ if col_or_grp in capdata.column_groups.keys():
1528
+ cols_to_return.extend(capdata.column_groups[col_or_grp])
1529
+ elif col_or_grp in data.columns:
1530
+ cols_to_return.append(col_or_grp)
1531
+ elif label_item in data.columns:
1532
+ cols_to_return.append(label_item)
1533
+ return data[cols_to_return]
1534
+
1535
+
1536
+ class LocIndexer(object):
1537
+ """
1538
+ Class to implement __getitem__ for indexing the CapData.data dataframe.
1539
+
1540
+ Allows passing a column_groups key, a list of column_groups keys, or a column or
1541
+ list of columns of the CapData.data dataframe.
1542
+ """
1543
+
1544
+ def __init__(self, _capdata):
1545
+ self._capdata = _capdata
1546
+
1547
+ def __getitem__(self, label):
1548
+ return index_capdata(self._capdata, label, filtered=False)
1549
+
1550
+
1551
+ class FilteredLocIndexer(object):
1552
+ """
1553
+ Class to implement __getitem__ for indexing the CapData.data_filtered dataframe.
1554
+
1555
+ Allows passing a column_groups key, a list of column_groups keys, or a column or
1556
+ list of columns of the CapData.data_filtered dataframe.
1557
+ """
1558
+
1559
+ def __init__(self, _capdata):
1560
+ self._capdata = _capdata
1561
+
1562
+ def __getitem__(self, label):
1563
+ return index_capdata(self._capdata, label, filtered=True)
1564
+
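With the two indexers above, a CapData object can be sliced by column group key, regression variable key, or raw column name. The group key below is hypothetical (editorial example, not part of the packaged file):

cd.loc["irr_poa_pyran"]    # every column in that column_groups entry, from cd.data
cd.floc["poa"]             # columns mapped to the 'poa' regression variable, from cd.data_filtered
cd.floc["regcols"]         # all columns named in regression_cols
cd.floc[["poa", "power"]]  # multiple keys at once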
1565
+
1566
+ class CapData(object):
1567
+ """
1568
+ Class to store capacity test data and column grouping.
1569
+
1570
+ CapData objects store a pandas dataframe of measured or simulated data
1571
+ and a dictionary grouping columns by type of measurement.
1572
+
1573
+ The `column_groups` dictionary allows maintaining the original column names
1574
+ while also grouping measurements of the same type from different
1575
+ sensors. Many of the methods for plotting and filtering data rely on the
1576
+ column groupings.
1577
+
1578
+ Parameters
1579
+ ----------
1580
+ name : str
1581
+ Name for the CapData object.
1582
+ data : pandas dataframe
1583
+ Used to store measured or simulated data imported from csv.
1584
+ data_filtered : pandas dataframe
1585
+ Holds filtered data. Filtering methods act on and write to this
1586
+ attribute.
1587
+ column_groups : dictionary
1588
+ Assigned by the `group_columns` method, which attempts to infer the
1589
+ type of measurement recorded in each column of the dataframe stored in
1590
+ the `data` attribute. For each inferred measurement type,
1591
+ `group_columns` creates an abbreviated name and a list of columns that
1592
+ contain measurements of that type. The abbreviated names are the keys
1593
+ and the corresponding values are the lists of columns.
1594
+ regression_cols : dictionary
1595
+ Dictionary identifying which columns in `data` or groups of columns as
1596
+ identified by the keys of `column_groups` are the independent variables
1597
+ of the ASTM Capacity test regression equation. Set using
1598
+ `set_regression_cols` or by directly assigning a dictionary.
1599
+ summary_ix : list of tuples
1600
+ Holds the row index data modified by the update_summary decorator
1601
+ function.
1602
+ summary : list of dicts
1603
+ Holds the data modified by the update_summary decorator function.
1604
+ rc : DataFrame
1605
+ Dataframe for the reporting conditions (poa, t_amb, and w_vel).
1606
+ regression_results : statsmodels linear regression model
1607
+ Holds the linear regression model object.
1608
+ regression_formula : str
1609
+ Regression formula to be fit to measured and simulated data. Must
1610
+ follow the requirements of statsmodels use of patsy.
1611
+ tolerance : str
1612
+ String representing error band. Ex. '+ 3', '+/- 3', '- 5'
1613
+ There must be space between the sign and number. Number is
1614
+ interpreted as a percent. For example, 5 percent is 5 not 0.05.
1615
+ """
1616
+
1617
+ def __init__(self, name): # noqa: D107
1618
+ super(CapData, self).__init__()
1619
+ self.name = name
1620
+ self.data = pd.DataFrame()
1621
+ self.data_filtered = None
1622
+ self.column_groups = {}
1623
+ self.regression_cols = {}
1624
+ self.summary_ix = []
1625
+ self.summary = []
1626
+ self.removed = []
1627
+ self.kept = []
1628
+ self.filter_counts = {}
1629
+ self.rc = None
1630
+ self.regression_results = None
1631
+ self.regression_formula = (
1632
+ "power ~ poa + I(poa * poa) + I(poa * t_amb) + I(poa * w_vel) - 1"
1633
+ )
1634
+ self.tolerance = None
1635
+ self.pre_agg_cols = None
1636
+ self.pre_agg_trans = None
1637
+ self.pre_agg_reg_trans = None
1638
+ self.loc = LocIndexer(self)
1639
+ self.floc = FilteredLocIndexer(self)
1640
+
1641
+ def set_regression_cols(self, power="", poa="", t_amb="", w_vel=""):
1642
+ """
1643
+ Create a dictionary linking the regression variables to data.
1644
+
1645
+ Links the independent regression variables to the appropriate
1646
+ translation keys; a column name may also be used to specify a
1647
+ single column of data.
1648
+
1649
+ Sets attribute and returns nothing.
1650
+
1651
+ Parameters
1652
+ ----------
1653
+ power : str
1654
+ Translation key for the power variable.
1655
+ poa : str
1656
+ Translation key for the plane of array (poa) irradiance variable.
1657
+ t_amb : str
1658
+ Translation key for the ambient temperature variable.
1659
+ w_vel : str
1660
+ Translation key for the wind velocity key.
1661
+ """
1662
+ self.regression_cols = {
1663
+ "power": power,
1664
+ "poa": poa,
1665
+ "t_amb": t_amb,
1666
+ "w_vel": w_vel,
1667
+ }
1668
+
1669
+ def copy(self):
1670
+ """Create and returns a copy of self."""
1671
+ cd_c = CapData("")
1672
+ cd_c.name = copy.copy(self.name)
1673
+ cd_c.data = self.data.copy()
1674
+ cd_c.data_filtered = self.data_filtered.copy()
1675
+ cd_c.column_groups = copy.copy(self.column_groups)
1676
+ cd_c.regression_cols = copy.copy(self.regression_cols)
1677
+ cd_c.summary_ix = copy.copy(self.summary_ix)
1678
+ cd_c.summary = copy.copy(self.summary)
1679
+ cd_c.rc = copy.copy(self.rc)
1680
+ cd_c.regression_results = copy.deepcopy(self.regression_results)
1681
+ cd_c.regression_formula = copy.copy(self.regression_formula)
1682
+ cd_c.pre_agg_cols = copy.copy(self.pre_agg_cols)
1683
+ cd_c.pre_agg_trans = copy.deepcopy(self.pre_agg_trans)
1684
+ cd_c.pre_agg_reg_trans = copy.deepcopy(self.pre_agg_reg_trans)
1685
+ return cd_c
1686
+
1687
+ def empty(self):
1688
+ """Return a boolean indicating if the CapData object contains data."""
1689
+ tests_indicating_empty = [self.data.empty, len(self.column_groups) == 0]
1690
+ return all(tests_indicating_empty)
1691
+
1692
+ def drop_cols(self, columns):
1693
+ """
1694
+ Drop columns from CapData `data` and `column_groups`.
1695
+
1696
+ Parameters
1697
+ ----------
1698
+ columns : list
1699
+ List of columns to drop.
1700
+
1701
+ Todo
1702
+ ----
1703
+ Change to accept a string column name or list of strings
1704
+ """
1705
+ for key, value in self.column_groups.items():
1706
+ for col in columns:
1707
+ try:
1708
+ value.remove(col)
1709
+ self.column_groups[key] = value
1710
+ except ValueError:
1711
+ continue
1712
+ self.data.drop(columns, axis=1, inplace=True)
1713
+ self.data_filtered.drop(columns, axis=1, inplace=True)
1714
+
1715
+ def get_reg_cols(self, reg_vars=None, filtered_data=True):
1716
+ """
1717
+ Get regression columns renamed with keys from `regression_cols`.
1718
+
1719
+ Parameters
1720
+ ----------
1721
+ reg_vars : list or str, default None
1722
+ By default returns all columns identified in `regression_cols`.
1723
+ A list with any combination of the keys of `regression_cols` is valid
1724
+ or pass a single key as a string.
1725
+ filtered_data : bool, default true
1726
+ Return filtered or unfiltered data.
1727
+
1728
+ Returns
1729
+ -------
1730
+ DataFrame
1731
+ """
1732
+ if reg_vars is None:
1733
+ reg_vars = list(self.regression_cols.keys())
1734
+ if filtered_data:
1735
+ df = self.floc[reg_vars].copy()
1736
+ else:
1737
+ df = self.loc[reg_vars].copy()
1738
+ rename = {df.columns[0]: reg_vars}
1739
+
1740
+ if isinstance(reg_vars, list):
1741
+ for reg_var in reg_vars:
1742
+ if self.regression_cols[reg_var] in self.data_filtered.columns:
1743
+ continue
1744
+ else:
1745
+ columns = self.column_groups[self.regression_cols[reg_var]]
1746
+ if len(columns) != 1:
1747
+ return warnings.warn(
1748
+ "Multiple columns per translation "
1749
+ "dictionary group. Run agg_sensors "
1750
+ "before this method."
1751
+ )
1752
+ rename = {old: new for old, new in zip(df.columns, reg_vars)}
1753
+
1754
+ df.rename(columns=rename, inplace=True)
1755
+ return df
1756
+
1757
+ def review_column_groups(self):
1758
+ """Print `column_groups` with nice formatting."""
1759
+ if len(self.column_groups) == 0:
1760
+ return "column_groups attribute is empty."
1761
+ else:
1762
+ for trans_grp, col_list in self.column_groups.items():
1763
+ print(trans_grp)
1764
+ for col in col_list:
1765
+ print(" " + col)
1766
+
1767
+ # PLOTTING METHODS
1768
+ def reg_scatter_matrix(self):
1769
+ """Create pandas scatter matrix of regression variables."""
1770
+ df = self.get_reg_cols(reg_vars=["poa", "t_amb", "w_vel"])
1771
+ df["poa_poa"] = df["poa"] * df["poa"]
1772
+ df["poa_t_amb"] = df["poa"] * df["t_amb"]
1773
+ df["poa_w_vel"] = df["poa"] * df["w_vel"]
1774
+ df.drop(["t_amb", "w_vel"], axis=1, inplace=True)
1775
+ return pd.plotting.scatter_matrix(df)
1776
+
1777
+ def scatter(self, filtered=True):
1778
+ """
1779
+ Create scatter plot of irradiance vs power.
1780
+
1781
+ Parameters
1782
+ ----------
1783
+ filtered : bool, default true
1784
+ Plots filtered data when true and all data when false.
1785
+ """
1786
+ if filtered:
1787
+ df = self.floc[["power", "poa"]]
1788
+ else:
1789
+ df = self.loc[["power", "poa"]]
1790
+
1791
+ if df.shape[1] != 2:
1792
+ return warnings.warn("Aggregate sensors before using this method.")
1793
+
1794
+ df = df.rename(columns={df.columns[0]: "power", df.columns[1]: "poa"})
1795
+ plt = df.plot(kind="scatter", x="poa", y="power", title=self.name, alpha=0.2)
1796
+ return plt
1797
+
1798
+ def scatter_hv(self, timeseries=False, all_reg_columns=False):
1799
+ """
1800
+ Create holoviews scatter plot of irradiance vs power.
1801
+
1802
+ Use holoviews opts magics in notebook cell before calling method to
1803
+ adjust height and width of plots:
1804
+
1805
+ %%opts Scatter [height=200, width=400]
1806
+ %%opts Curve [height=200, width=400]
1807
+
1808
+ Parameters
1809
+ ----------
1810
+ timeseries : boolean, default False
1811
+ True adds timeseries plot of the data linked to the scatter plot.
1812
+ Points selected in the scatter plot will be highlighted in the
1813
+ timeseries plot.
1814
+ all_reg_columns : boolean, default False
1815
+ Set to True to include the data used in the regression in addition
1816
+ to poa irradiance and power in the hover tooltip.
1817
+ """
1818
+ df = self.get_reg_cols(filtered_data=True)
1819
+ df.index.name = "index"
1820
+ df.reset_index(inplace=True)
1821
+ vdims = ["power", "index"]
1822
+ if all_reg_columns:
1823
+ vdims.extend(list(df.columns.difference(vdims)))
1824
+ hover = HoverTool(
1825
+ tooltips=[
1826
+ ("datetime", "@index{%Y-%m-%d %H:%M}"),
1827
+ ("poa", "@poa{0,0.0}"),
1828
+ ("power", "@power{0,0.0}"),
1829
+ ],
1830
+ formatters={
1831
+ "@index": "datetime",
1832
+ },
1833
+ )
1834
+ poa_vs_kw = hv.Scatter(df, "poa", vdims).opts(
1835
+ size=5,
1836
+ tools=[hover, "lasso_select", "box_select"],
1837
+ legend_position="right",
1838
+ height=400,
1839
+ width=400,
1840
+ selection_fill_color="red",
1841
+ selection_line_color="red",
1842
+ yformatter=NumeralTickFormatter(format="0,0"),
1843
+ )
1844
+ # layout_scatter = (poa_vs_kw).opts(opt_dict)
1845
+ if timeseries:
1846
+ power_vs_time = hv.Scatter(df, "index", ["power", "poa"]).opts(
1847
+ tools=[hover, "lasso_select", "box_select"],
1848
+ height=400,
1849
+ width=800,
1850
+ selection_fill_color="red",
1851
+ selection_line_color="red",
1852
+ )
1853
+ power_col, poa_col = self.loc[["power", "poa"]].columns
1854
+ power_vs_time_underlay = hv.Curve(
1855
+ self.data.rename_axis("index", axis="index"),
1856
+ "index",
1857
+ [power_col, poa_col],
1858
+ ).opts(
1859
+ tools=["lasso_select", "box_select"],
1860
+ height=400,
1861
+ width=800,
1862
+ line_color="gray",
1863
+ line_width=1,
1864
+ line_alpha=0.4,
1865
+ yformatter=NumeralTickFormatter(format="0,0"),
1866
+ )
1867
+ layout_timeseries = poa_vs_kw + power_vs_time * power_vs_time_underlay
1868
+ DataLink(poa_vs_kw, power_vs_time)
1869
+ return layout_timeseries.cols(1)
1870
+ else:
1871
+ return poa_vs_kw
1872
+
1873
+ def plot(
1874
+ self,
1875
+ combine=plotting.COMBINE,
1876
+ default_groups=plotting.DEFAULT_GROUPS,
1877
+ width=1500,
1878
+ height=250,
1879
+ **kwargs,
1880
+ ):
1881
+ """
1882
+ Create a dashboard to explore timeseries plots of the data.
1883
+
1884
+ The dashboard contains three tabs: Groups, Layout, and Overlay. The first tab,
1885
+ Groups, presents a column of plots with a separate plot overlaying the
1886
+ measurements for each group of the `column_groups`. The groups plotted are
1887
+ defined by the `default_groups` argument.
1888
+
1889
+ The second tab, Layout, allows manually selecting groups to plot. The button
1890
+ on this tab can be used to replace the column of plots on the Groups tab with
1891
+ the current figure on the Layout tab. Rerun this method after clicking the
1892
+ button to see the new plots in the Groups tab.
1893
+
1894
+ The third tab, Overlay, allows picking a group or any combination of individual
1895
+ tags to overlay on a single plot. The list of groups and tags can be filtered
1896
+ using regular expressions. Adding a text id in the box and clicking Update will
1897
+ add the current overlay to the list of groups on the Layout tab.
1898
+
1899
+ Parameters
1900
+ ----------
1901
+ combine : dict, optional
1902
+ Dictionary of group names and regex strings to use to identify groups from
1903
+ column groups and individual tags (columns) to combine into new groups. See
1904
+ the `parse_combine` function for more details.
1905
+ default_groups : list of str, optional
1906
+ List of regex strings to use to identify default groups to plot. See the
1907
+ `plotting.find_default_groups` function for more details.
1908
+ width : int, optional
1909
+ The width of the plots on the Groups tab.
1910
+ height : int, optional
1911
+ The height of the plots on the Groups tab.
1912
+ **kwargs : optional
1913
+ Additional keyword arguments are passed to the options of the scatter plot.
1914
+
1915
+ Returns
1916
+ -------
1917
+ Panel tabbed layout
1918
+ """
1919
+ return plotting.plot(
1920
+ self,
1921
+ combine=combine,
1922
+ default_groups=default_groups,
1923
+ group_width=width,
1924
+ group_height=height,
1925
+ **kwargs,
1926
+ )
1927
+
1928
+ def scatter_filters(self):
1929
+ """
1930
+ Returns an overlay of scatter plots of intervals removed for each filter.
1931
+
1932
+ A scatter plot of power vs irradiance is generated for the time intervals
1933
+ removed for each filtering step. Each of these plots is labeled and
1934
+ overlayed.
1935
+ """
1936
+ scatters = []
1937
+
1938
+ data = self.get_reg_cols(reg_vars=["power", "poa"], filtered_data=False)
1939
+ data["index"] = self.data.index
1940
+ plt_no_filtering = hv.Scatter(data, "poa", ["power", "index"]).relabel("all")
1941
+ scatters.append(plt_no_filtering)
1942
+
1943
+ d1 = data.loc[self.removed[0]["index"], :]
1944
+ plt_first_filter = hv.Scatter(d1, "poa", ["power", "index"]).relabel(
1945
+ self.removed[0]["name"]
1946
+ )
1947
+ scatters.append(plt_first_filter)
1948
+
1949
+ for i, filtering_step in enumerate(self.kept):
1950
+ if i >= len(self.kept) - 1:
1951
+ break
1952
+ else:
1953
+ flt_legend = self.kept[i + 1]["name"]
1954
+ d_flt = data.loc[filtering_step["index"], :]
1955
+ plt = hv.Scatter(d_flt, "poa", ["power", "index"]).relabel(flt_legend)
1956
+ scatters.append(plt)
1957
+
1958
+ scatter_overlay = hv.Overlay(scatters)
1959
+ hover = HoverTool(
1960
+ tooltips=[
1961
+ ("datetime", "@index{%Y-%m-%d %H:%M}"),
1962
+ ("poa", "@poa{0,0.0}"),
1963
+ ("power", "@power{0,0.0}"),
1964
+ ],
1965
+ formatters={
1966
+ "@index": "datetime",
1967
+ },
1968
+ )
1969
+ scatter_overlay.opts(
1970
+ hv.opts.Scatter(
1971
+ size=5,
1972
+ width=650,
1973
+ height=500,
1974
+ muted_fill_alpha=0,
1975
+ fill_alpha=0.4,
1976
+ line_width=0,
1977
+ tools=[hover],
1978
+ yformatter=NumeralTickFormatter(format="0,0"),
1979
+ ),
1980
+ hv.opts.Overlay(legend_position="right", toolbar="above"),
1981
+ )
1982
+ return scatter_overlay
1983
+
1984
+ def timeseries_filters(self):
1985
+ """
1986
+ Returns an overlay of timeseries plots of intervals removed for each filter.
1987
+
1988
+ A scatter plot of power vs time is generated for the time intervals
1989
+ removed for each filtering step. Each of these plots is labeled and
1990
+ overlayed.
1991
+ """
1992
+ plots = []
1993
+
1994
+ data = self.get_reg_cols(reg_vars="power", filtered_data=False)
1995
+ data["Timestamp"] = data.index
1996
+ plt_no_filtering = hv.Curve(data, ["Timestamp"], ["power"], label="all")
1997
+ plt_no_filtering.opts(
1998
+ line_color="black",
1999
+ line_width=1,
2000
+ width=1500,
2001
+ height=450,
2002
+ )
2003
+ plots.append(plt_no_filtering)
2004
+
2005
+ d1 = data.loc[self.removed[0]["index"], ["power", "Timestamp"]]
2006
+ plt_first_filter = hv.Scatter(
2007
+ d1, ["Timestamp"], ["power"], label=self.removed[0]["name"]
2008
+ )
2009
+ plots.append(plt_first_filter)
2010
+
2011
+ for i, filtering_step in enumerate(self.kept):
2012
+ if i >= len(self.kept) - 1:
2013
+ break
2014
+ else:
2015
+ flt_legend = self.kept[i + 1]["name"]
2016
+ d_flt = data.loc[filtering_step["index"], :]
2017
+ plt = hv.Scatter(d_flt, ["Timestamp"], ["power"], label=flt_legend)
2018
+ plots.append(plt)
2019
+
2020
+ scatter_overlay = hv.Overlay(plots)
2021
+ hover = HoverTool(
2022
+ tooltips=[
2023
+ ("datetime", "@Timestamp{%Y-%m-%d %H:%M}"),
2024
+ ("power", "@power{0,0.0}"),
2025
+ ],
2026
+ formatters={
2027
+ "@Timestamp": "datetime",
2028
+ },
2029
+ )
2030
+ scatter_overlay.opts(
2031
+ hv.opts.Scatter(
2032
+ size=5,
2033
+ muted_fill_alpha=0,
2034
+ fill_alpha=1,
2035
+ line_width=0,
2036
+ tools=[hover],
2037
+ yformatter=NumeralTickFormatter(format="0,0"),
2038
+ ),
2039
+ hv.opts.Overlay(
2040
+ legend_position="bottom",
2041
+ toolbar="right",
2042
+ ),
2043
+ )
2044
+ return scatter_overlay
2045
+
2046
+ def reset_filter(self):
2047
+ """
2048
+ Set `data_filtered` to `data` and reset filtering summary.
2049
+
2050
+ """
2055
+ self.data_filtered = self.data.copy()
2056
+ self.summary_ix = []
2057
+ self.summary = []
2058
+ self.filter_counts = {}
2059
+ self.removed = []
2060
+ self.kept = []
2061
+
2062
+ def reset_agg(self):
2063
+ """
2064
+ Remove aggregation columns from data and data_filtered attributes.
2065
+
2066
+ Does not reset filtering of data or data_filtered.
2067
+ """
2068
+ if self.pre_agg_cols is None:
2069
+ return warnings.warn("Nothing to reset; agg_sensors has not beenused.")
2070
+ else:
2071
+ self.data = self.data[self.pre_agg_cols].copy()
2072
+ self.data_filtered = self.data_filtered[self.pre_agg_cols].copy()
2073
+
2074
+ self.column_groups = self.pre_agg_trans.copy()
2075
+ self.regression_cols = self.pre_agg_reg_trans.copy()
2076
+
2077
+ def __get_poa_col(self):
2078
+ """
2079
+ Return poa column name from `column_groups`.
2080
+
2081
+ Also, issues warning if there are more than one poa columns in
2082
+ `column_groups`.
2083
+ """
2084
+ poa_trans_key = self.regression_cols["poa"]
2085
+ if poa_trans_key in self.data.columns:
2086
+ return poa_trans_key
2087
+ else:
2088
+ poa_cols = self.column_groups[poa_trans_key]
2089
+ if len(poa_cols) > 1:
2090
+ return warnings.warn(
2091
+ "{} columns of irradiance data. "
2092
+ "Use col_name to specify a single "
2093
+ "column.".format(len(poa_cols))
2094
+ )
2095
+ else:
2096
+ return poa_cols[0]
2097
+
2098
+ def agg_sensors(self, agg_map=None):
2099
+ """
2100
+ Aggregate measurements of the same variable from different sensors.
2101
+
2102
+ Parameters
2103
+ ----------
2104
+ agg_map : dict, default None
2105
+ Dictionary specifying aggregations to be performed on
2106
+ the specified groups from the `column_groups` attribute. The dictionary
2107
+ keys should be keys from the `column_groups` attribute. The
2108
+ dictionary values should be aggregation functions. See pandas API
2109
+ documentation of Computations / descriptive statistics for a list of all
2110
+ options.
2111
+ By default the groups of columns assigned to the 'power', 'poa', 't_amb',
2112
+ and 'w_vel' keys in the `regression_cols` attribute are aggregated:
2113
+ - sum power
2114
+ - mean of poa, t_amb, w_vel
2115
+
2116
+ Returns
2117
+ -------
2118
+ None
2119
+ Acts in place on the data, data_filtered, and regression_cols attributes.
2120
+
2121
+ Notes
2122
+ -----
2123
+ This method is intended to be used before any filtering methods are applied.
2124
+ Filtering steps applied when this method is used will be lost.
2125
+ """
2126
+ if not len(self.summary) == 0:
2127
+ warnings.warn(
2128
+ "The data_filtered attribute has been overwritten "
2129
+ "and previously applied filtering steps have been "
2130
+ "lost. It is recommended to use agg_sensors "
2131
+ "before any filtering methods."
2132
+ )
2133
+ # reset summary data
2134
+ self.summary_ix = []
2135
+ self.summary = []
2136
+
2137
+ self.pre_agg_cols = self.data.columns.copy()
2138
+ self.pre_agg_trans = copy.deepcopy(self.column_groups)
2139
+ self.pre_agg_reg_trans = copy.deepcopy(self.regression_cols)
2140
+
2141
+ if agg_map is None:
2142
+ agg_map = {
2143
+ self.regression_cols["power"]: "sum",
2144
+ self.regression_cols["poa"]: "mean",
2145
+ self.regression_cols["t_amb"]: "mean",
2146
+ self.regression_cols["w_vel"]: "mean",
2147
+ }
2148
+
2149
+ dfs_to_concat = []
2150
+ agg_names = {}
2151
+ for group_id, agg_func in agg_map.items():
2152
+ columns_to_aggregate = self.loc[group_id]
2153
+ if columns_to_aggregate.shape[1] == 1:
2154
+ continue
2155
+ agg_result = columns_to_aggregate.agg(agg_func, axis=1).to_frame()
2156
+ if isinstance(agg_func, str):
2157
+ col_name = group_id + "_" + agg_func + "_agg"
2158
+ else:
2159
+ col_name = group_id + "_" + agg_func.__name__ + "_agg"
2160
+ agg_result.rename(columns={agg_result.columns[0]: col_name}, inplace=True)
2161
+ dfs_to_concat.append(agg_result)
2162
+ agg_names[group_id] = col_name
2163
+
2164
+ dfs_to_concat.append(self.data)
2165
+ # write over data and data_filtered attributes
2166
+ self.data = pd.concat(dfs_to_concat, axis=1)
2167
+ self.data_filtered = self.data.copy()
2168
+
2169
+ # update regression_cols attribute
2170
+ for reg_var, trans_group in self.regression_cols.items():
2171
+ if self.loc[reg_var].shape[1] == 1:
2172
+ continue
2173
+ if trans_group in agg_names.keys():
2174
+ print(
2175
+ "Regression variable '{}' has been remapped: '{}' to '{}'".format(
2176
+ reg_var, trans_group, agg_names[trans_group]
2177
+ )
2178
+ )
2179
+ self.regression_cols[reg_var] = agg_names[trans_group]
2180
+
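+ # --- Illustrative sketch (editorial addition, not package source). ---
+ # Passing a custom agg_map; the group keys below are assumed examples, and
+ # the values may be any pandas aggregation string or callable:
+ #
+ #     meas.agg_sensors(agg_map={
+ #         'irr-poa-': 'mean',   # average the POA irradiance sensors
+ #         '-mtr-': 'sum',       # sum the metered power columns
+ #     })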
2181
+ def data_columns_to_excel(self, sort_by_reversed_names=True):
2182
+ """
2183
+ Write the columns of data to an excel file as a template for a column grouping.
2184
+
2185
+ Parameters
2186
+ ----------
2187
+ sort_by_reversed_names : bool, default True
2188
+ If true sort column names after reversing them.
2189
+
2190
+ Returns
2191
+ -------
2192
+ None
2193
+ Writes to excel file at self.data_loader.path / 'column_groups.xlsx'.
2194
+ """
2195
+ df = self.data.columns.to_frame().reset_index(drop=True)
2196
+ df["a"] = ""
2197
+ df = df[["a", 0]]
2198
+ # print(df)
2199
+ df.sort_values(by=0, inplace=True, ascending=True)
2200
+ if sort_by_reversed_names:
2201
+ df["reversed"] = df[0].str[::-1]
2202
+ df.sort_values(by="reversed", inplace=True, ascending=True)
2203
+ df = df[["a", 0]]
2204
+ if self.data_loader.path.is_dir():
2205
+ df.to_excel(
2206
+ self.data_loader.path / "column_groups.xlsx", index=False, header=False
2207
+ )
2208
+ elif self.data_loader.path.is_file():
2209
+ print(self.data_loader.path.parent)
2210
+ df.to_excel(
2211
+ self.data_loader.path.parent / "column_groups.xlsx",
2212
+ index=False,
2213
+ header=False,
2214
+ )
2215
+
2216
+ @update_summary
2217
+ def filter_irr(self, low, high, ref_val=None, col_name=None, inplace=True):
2218
+ """
2219
+ Filter on irradiance values.
2220
+
2221
+ Parameters
2222
+ ----------
2223
+ low : float or int
2224
+ Minimum value as fraction (0.8) or absolute 200 (W/m^2).
2225
+ high : float or int
2226
+ Max value as fraction (1.2) or absolute 800 (W/m^2).
2227
+ ref_val : float or int or `self_val`
2228
+ Must provide arg when `low` and `high` are fractions.
2229
+ Pass the string 'self_val' to use the value in `self.rc`.
2230
+ col_name : str, default None
2231
+ Column name of irradiance data to filter. By default uses the POA
2232
+ irradiance set in regression_cols attribute or average of the POA
2233
+ columns.
2234
+ inplace : bool, default True
2235
+ Default true write back to data_filtered or return filtered
2236
+ dataframe.
2237
+
2238
+ Returns
2239
+ -------
2240
+ DataFrame
2241
+ Filtered dataframe if inplace is False.
2242
+ """
2243
+ if col_name is None:
2244
+ irr_col = self.__get_poa_col()
2245
+ else:
2246
+ irr_col = col_name
2247
+
2248
+ if ref_val == "self_val":
2249
+ ref_val = self.rc["poa"][0]
2250
+
2251
+ df_flt = filter_irr(self.data_filtered, irr_col, low, high, ref_val=ref_val)
2252
+ if inplace:
2253
+ self.data_filtered = df_flt
2254
+ else:
2255
+ return df_flt
2256
+
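+ # --- Illustrative sketch (editorial addition, not package source). ---
+ # filter_irr accepts absolute W/m^2 bounds, or fractional bounds combined
+ # with ref_val. The numeric values below are assumptions, not recommendations:
+ #
+ #     meas.filter_irr(200, 1200)                     # keep 200-1200 W/m^2
+ #     meas.filter_irr(0.8, 1.2, ref_val=600)         # keep 480-720 W/m^2
+ #     meas.filter_irr(0.8, 1.2, ref_val='self_val')  # band around self.rc['poa']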
2257
+ @update_summary
2258
+ def filter_pvsyst(self, inplace=True):
2259
+ """
2260
+ Filter pvsyst data for off max power point tracking operation.
2261
+
2262
+ This function is only applicable to simulated data generated by PVsyst.
2263
+ Filters the 'IL Pmin', 'IL Vmin', 'IL Pmax', 'IL Vmax' values if they
2264
+ are greater than 0.
2265
+
2266
+ Parameters
2267
+ ----------
2268
+ inplace: bool, default True
2269
+ If inplace is true, then function overwrites the filtered data. If
2270
+ false returns a CapData object.
2271
+
2272
+ Returns
2273
+ -------
2274
+ CapData object if inplace is set to False.
2275
+ """
2276
+ df = self.data_filtered
2277
+
2278
+ columns = ["IL Pmin", "IL Vmin", "IL Pmax", "IL Vmax"]
2279
+ index = df.index
2280
+
2281
+ for column in columns:
2282
+ if column not in df.columns:
2283
+ column = column.replace(" ", "_")
2284
+ if column in df.columns:
2285
+ indices_to_drop = df[df[column] > 0].index
2286
+ if not index.equals(indices_to_drop):
2287
+ index = index.difference(indices_to_drop)
2288
+ else:
2289
+ warnings.warn(
2290
+ "{} or {} is not a column in the data.".format(
2291
+ column, column.replace("_", " ")
2292
+ )
2293
+ )
2294
+
2295
+ if inplace:
2296
+ self.data_filtered = self.data_filtered.loc[index, :]
2297
+ else:
2298
+ return self.data_filtered.loc[index, :]
2299
+
2300
+ @update_summary
2301
+ def filter_shade(self, fshdbm=1.0, query_str=None, inplace=True):
2302
+ """
2303
+ Remove data during periods of array shading.
2304
+
2305
+ The default behavior assumes the filter is applied to data output from
2306
+ PVsyst and removes all periods where values in the column 'FShdBm' are
2307
+ less than 1.0.
2308
+
2309
+ Use the query_str parameter when shading losses (power) rather than a
2310
+ shading fraction are available.
2311
+
2312
+ Parameters
2313
+ ----------
2314
+ fshdbm : float, default 1.0
2315
+ The value for fractional shading of beam irradiance as given by the
2316
+ PVsyst output parameter FShdBm. Data is removed when the shading
2317
+ fraction is less than the value passed to fshdbm. By default all
2318
+ periods of shading are removed.
2319
+ query_str : str
2320
+ Query string to pass to pd.DataFrame.query method. The query string
2321
+ should be a boolean expression comparing a column name to a numeric
2322
+ filter value, like 'ShdLoss<=50'. The column name must not contain
2323
+ spaces.
2324
+ inplace: bool, default True
2325
+ If inplace is true, then function overwrites the filtered
2326
+ dataframe. If false returns a DataFrame.
2327
+
2328
+ Returns
2329
+ -------
2330
+ pd.DataFrame
2331
+ If inplace is false returns a dataframe.
2332
+ """
2333
+ df = self.data_filtered
2334
+
2335
+ if query_str is None:
2336
+ query_str = "FShdBm>=@fshdbm"
2337
+
2338
+ index_shd = df.query(query_str).index
2339
+
2340
+ if inplace:
2341
+ self.data_filtered = self.data_filtered.loc[index_shd, :]
2342
+ else:
2343
+ return self.data_filtered.loc[index_shd, :]
2344
+
2345
+ @update_summary
2346
+ def filter_time(
2347
+ self,
2348
+ start=None,
2349
+ end=None,
2350
+ drop=False,
2351
+ days=None,
2352
+ test_date=None,
2353
+ inplace=True,
2354
+ wrap_year=False,
2355
+ ):
2356
+ """
2357
+ Select data for a specified time period.
2358
+
2359
+ Parameters
2360
+ ----------
2361
+ start : str or pd.Timestamp or None, default None
2362
+ Start date for data to be returned. If a string is passed it must
2363
+ be in format that can be converted by pandas.to_datetime. Not
2364
+ required if test_date and days arguments are passed.
2365
+ end : str or pd.Timestamp or None, default None
2366
+ End date for data to be returned. If a string is passed it must
2367
+ be in format that can be converted by pandas.to_datetime. Not
2368
+ required if test_date and days arguments are passed.
2369
+ drop : bool, default False
2370
+ Set to true to drop time period between `start` and `end` rather
2371
+ than keep it. Must supply `start` and `end` and `wrap_year` must
2372
+ be false.
2373
+ days : int or None, default None
2374
+ Days in time period to be returned. Not required if `start` and
2375
+ `end` are specified.
2376
+ test_date : str or pd.Timestamp or None, default None
2377
+ Must be in a format that can be converted by pandas.to_datetime. Not
2378
+ required if `start` and `end` are specified. Requires `days`
2379
+ argument. Time period returned will be centered on this date.
2380
+ inplace : bool, default True
2381
+ If inplace is true, then function overwrites the filtered
2382
+ dataframe. If false returns a DataFrame.
2383
+ wrap_year : bool, default False
2384
+ If true calls the wrap_year_end function. See wrap_year_end
2385
+ docstring for details. wrap_year_end was cntg_eoy prior to v0.7.0.
2386
+
2387
+ Todo
2388
+ ----
2389
+ Add inverse options to remove time between start and end rather than return
2390
+ it.
2391
+ """
2392
+ if start is not None and end is not None:
2393
+ start = pd.to_datetime(start)
2394
+ end = pd.to_datetime(end)
2395
+ if wrap_year and spans_year(start, end):
2396
+ df_temp = wrap_year_end(self.data_filtered, start, end)
2397
+ else:
2398
+ df_temp = self.data_filtered.loc[start:end, :]
2399
+ if drop:
2400
+ keep_ix = self.data_filtered.index.difference(df_temp.index)
2401
+ df_temp = self.data_filtered.loc[keep_ix, :]
2402
+
2403
+ if start is not None and end is None:
2404
+ if days is None:
2405
+ return warnings.warn("Must specify end date or days.")
2406
+ else:
2407
+ start = pd.to_datetime(start)
2408
+ end = start + pd.DateOffset(days=days)
2409
+ if wrap_year and spans_year(start, end):
2410
+ df_temp = wrap_year_end(self.data_filtered, start, end)
2411
+ else:
2412
+ df_temp = self.data_filtered.loc[start:end, :]
2413
+
2414
+ if start is None and end is not None:
2415
+ if days is None:
2416
+ return warnings.warn("Must specify end date or days.")
2417
+ else:
2418
+ end = pd.to_datetime(end)
2419
+ start = end - pd.DateOffset(days=days)
2420
+ if wrap_year and spans_year(start, end):
2421
+ df_temp = wrap_year_end(self.data_filtered, start, end)
2422
+ else:
2423
+ df_temp = self.data_filtered.loc[start:end, :]
2424
+
2425
+ if test_date is not None:
2426
+ test_date = pd.to_datetime(test_date)
2427
+ if days is None:
2428
+ return warnings.warn("Must specify days")
2429
+ else:
2430
+ offset = pd.DateOffset(days=days // 2)
2431
+ start = test_date - offset
2432
+ end = test_date + offset
2433
+ if wrap_year and spans_year(start, end):
2434
+ df_temp = wrap_year_end(self.data_filtered, start, end)
2435
+ else:
2436
+ df_temp = self.data_filtered.loc[start:end, :]
2437
+
2438
+ if inplace:
2439
+ self.data_filtered = df_temp
2440
+ else:
2441
+ return df_temp
2442
+
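+ # --- Illustrative sketch (editorial addition, not package source). ---
+ # filter_time can be called with an explicit start/end, a start (or end) plus
+ # a number of days, or a test_date centered in a window of days. The dates
+ # below are assumed examples:
+ #
+ #     meas.filter_time(start='2023-06-01', end='2023-06-15')
+ #     meas.filter_time(start='2023-06-01', days=10)
+ #     meas.filter_time(test_date='2023-06-08', days=14)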
2443
+ @update_summary
2444
+ def filter_days(self, days, drop=False, inplace=True):
2445
+ """
2446
+ Select or drop timestamps for days passed.
2447
+
2448
+ Parameters
2449
+ ----------
2450
+ days : list
2451
+ List of days to select or drop.
2452
+ drop : bool, default False
2453
+ Set to true to drop the timestamps for the days passed instead of
2454
+ keeping only those days.
2455
+ inplace : bool, default True
2456
+ If inplace is true, then function overwrites the filtered
2457
+ dataframe. If false returns a DataFrame.
2458
+ """
2459
+ ix_all_days = None
2460
+ for day in days:
2461
+ ix_day = self.data_filtered.loc[day].index
2462
+ if ix_all_days is None:
2463
+ ix_all_days = ix_day
2464
+ else:
2465
+ ix_all_days = ix_all_days.union(ix_day)
2466
+
2467
+ if drop:
2468
+ ix_wo_days = self.data_filtered.index.difference(ix_all_days)
2469
+ filtered_data = self.data_filtered.loc[ix_wo_days, :]
2470
+ else:
2471
+ filtered_data = self.data_filtered.loc[ix_all_days, :]
2472
+
2473
+ if inplace:
2474
+ self.data_filtered = filtered_data
2475
+ else:
2476
+ return filtered_data
2477
+
2478
+ @update_summary
2479
+ def filter_outliers(self, inplace=True, **kwargs):
2480
+ """
2481
+ Apply elliptic envelope from scikit-learn to remove outliers.
2482
+
2483
+ Parameters
2484
+ ----------
2485
+ inplace : bool
2486
+ Default of true writes filtered dataframe back to data_filtered
2487
+ attribute.
2488
+ **kwargs
2489
+ Passed to sklearn EllipticEnvelope. Contamination keyword
2490
+ is useful to adjust proportion of outliers in dataset.
2491
+ Default is 0.04.
2492
+ Todo
2493
+ ----
2494
+ Add plot option
2495
+ Add option to return plot showing envelope with points not removed
2496
+ alpha decreased.
2497
+ """
2498
+ XandY = self.floc[["poa", "power"]]
2499
+ if XandY.shape[1] > 2:
2500
+ return warnings.warn(
2501
+ "Too many columns. Try running "
2502
+ "aggregate_sensors before using "
2503
+ "filter_outliers."
2504
+ )
2505
+ X1 = XandY.values
2506
+
2507
+ if "support_fraction" not in kwargs.keys():
2508
+ kwargs["support_fraction"] = 0.9
2509
+ if "contamination" not in kwargs.keys():
2510
+ kwargs["contamination"] = 0.04
2511
+
2512
+ clf_1 = sk_cv.EllipticEnvelope(**kwargs)
2513
+ clf_1.fit(X1)
2514
+
2515
+ if inplace:
2516
+ self.data_filtered = self.data_filtered[clf_1.predict(X1) == 1]
2517
+ else:
2518
+ return self.data_filtered[clf_1.predict(X1) == 1]
2519
+
2520
+ @update_summary
2521
+ def filter_pf(self, pf, inplace=True):
2522
+ """
2523
+ Filter data on the power factor.
2524
+
2525
+ Parameters
2526
+ ----------
2527
+ pf: float
2528
+ 0.999 or similar to remove timestamps with lower power factor
2529
+ values. Values greater than or equal to `pf` are kept.
2530
+ inplace : bool
2531
+ Default of true writes filtered dataframe back to data_filtered
2532
+ attribute.
2533
+
2534
+ Returns
2535
+ -------
2536
+ Dataframe when inplace is False.
2537
+
2538
+ Todo
2539
+ ----
2540
+ Spec pf column
2541
+ Increase options to specify which columns are used in the filter.
2542
+ """
2543
+ for key in self.column_groups.keys():
2544
+ if key.find("pf") == 0:
2545
+ selection = key
2546
+
2547
+ df = self.data_filtered[self.column_groups[selection]]
2548
+
2549
+ df_flt = self.data_filtered[(np.abs(df) >= pf).all(axis=1)]
2550
+
2551
+ if inplace:
2552
+ self.data_filtered = df_flt
2553
+ else:
2554
+ return df_flt
2555
+
2556
+ @update_summary
2557
+ def filter_power(self, power, percent=None, columns=None, inplace=True):
2558
+ """
2559
+ Remove data above the specified power threshold.
2560
+
2561
+ Parameters
2562
+ ----------
2563
+ power : numeric
2564
+ If `percent` is none, all data equal to or greater than `power`
2565
+ is removed.
2566
+ If `percent` is not None, then power should be the nameplate power.
2567
+ percent : None, or numeric, default None
2568
+ Data greater than or equal to `percent` of `power` is removed.
2569
+ Specify percentage as decimal i.e. 1% is passed as 0.01.
2570
+ columns : None or str, default None
2571
+ By default filter is applied to the power data identified in the
2572
+ `regression_cols` attribute.
2573
+ Pass a column name or column group to filter on. When passing a
2574
+ column group the power filter is applied to each column in the
2575
+ group.
2576
+ inplace : bool, default True
2577
+ Default of true writes filtered dataframe back to data_filtered
2578
+ attribute.
2579
+
2580
+ Returns
2581
+ -------
2582
+ Dataframe when inplace is false.
2583
+ """
2584
+ if percent is not None:
2585
+ power = power * (1 - percent)
2586
+
2587
+ multiple_columns = False
2588
+
2589
+ if columns is None:
2590
+ power_data = self.get_reg_cols("power")
2591
+ elif isinstance(columns, str):
2592
+ if columns in self.column_groups.keys():
2593
+ power_data = self.floc[columns]
2594
+ multiple_columns = True
2595
+ else:
2596
+ power_data = pd.DataFrame(self.data_filtered[columns])
2597
+ power_data.rename(
2598
+ columns={power_data.columns[0]: "power"}, inplace=True
2599
+ )
2600
+ else:
2601
+ return warnings.warn("columns must be None or a string.")
2602
+
2603
+ if multiple_columns:
2604
+ filtered_power_bool = power_data.apply(lambda x: all(x < power), axis=1)
2605
+ else:
2606
+ filtered_power_bool = power_data["power"] < power
2607
+
2608
+ df_flt = self.data_filtered[filtered_power_bool]
2609
+
2610
+ if inplace:
2611
+ self.data_filtered = df_flt
2612
+ else:
2613
+ return df_flt
2614
+
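+ # --- Illustrative sketch (editorial addition, not package source). ---
+ # filter_power removes data at or above a threshold, given either as an
+ # absolute value or as a percentage below a nameplate value (numbers assumed):
+ #
+ #     meas.filter_power(2000)                 # drop rows with power >= 2000
+ #     meas.filter_power(2500, percent=0.01)   # drop rows >= 99% of 2500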
2615
+ @update_summary
2616
+ def filter_custom(self, func, *args, **kwargs):
2617
+ """
2618
+ Apply `update_summary` decorator to passed function.
2619
+
2620
+ Parameters
2621
+ ----------
2622
+ func : function
2623
+ Any function that takes a dataframe as the first argument and
2624
+ returns a dataframe.
2625
+ Many pandas dataframe methods meet this requirement, like
2626
+ pd.DataFrame.between_time.
2627
+ *args
2628
+ Additional positional arguments passed to func.
2629
+ **kwargs
2630
+ Additional keyword arguments passed to func.
2631
+
2632
+ Examples
2633
+ --------
2634
+ Example use of the pandas dropna method to remove rows with missing
2635
+ data.
2636
+
2637
+ >>> das.filter_custom(pd.DataFrame.dropna, axis=0, how='any')
2638
+ >>> summary = das.get_summary()
2639
+ >>> summary['pts_before_filter'][0]
2640
+ 1424
2641
+ >>> summary['pts_removed'][0]
2642
+ 16
2643
+
2644
+ Example use of the pandas between_time method to remove time periods.
2645
+
2646
+ >>> das.reset_filter()
2647
+ >>> das.filter_custom(pd.DataFrame.between_time, '9:00', '13:00')
2648
+ >>> summary = das.get_summary()
2649
+ >>> summary['pts_before_filter'][0]
2650
+ 245
2651
+ >>> summary['pts_removed'][0]
2652
+ 1195
2653
+ >>> das.data_filtered.index[0].hour
2654
+ 9
2655
+ >>> das.data_filtered.index[-1].hour
2656
+ 13
2657
+ """
2658
+ self.data_filtered = func(self.data_filtered, *args, **kwargs)
2659
+
2660
+ @update_summary
2661
+ def filter_sensors(
2662
+ self, perc_diff=None, inplace=True, row_filter=check_all_perc_diff_comb
2663
+ ):
2664
+ """
2665
+ Drop suspicious measurements by comparing values from different sensors.
2666
+
2667
+ This method ignores columns generated by the agg_sensors method.
2668
+
2669
+ Parameters
2670
+ ----------
2671
+ perc_diff : dict
2672
+ Dictionary to specify a different threshold for
2673
+ each group of sensors. Dictionary keys should be translation
2674
+ dictionary keys and values are floats, like {'irr-poa-': 0.05}.
2675
+ By default the poa sensors as set by the regression_cols dictionary
2676
+ are filtered with a 5% percent difference threshold.
2677
+ inplace : bool, default True
2678
+ If True, writes over current filtered dataframe. If False, returns
2679
+ CapData object.
2680
+
2681
+ Returns
2682
+ -------
2683
+ DataFrame
2684
+ Returns filtered dataframe if inplace is False.
2685
+ """
2686
+ if self.pre_agg_cols is not None:
2687
+ df = self.data_filtered[self.pre_agg_cols]
2688
+ trans = self.pre_agg_trans
2689
+ regression_cols = self.pre_agg_reg_trans
2690
+ else:
2691
+ df = self.data_filtered
2692
+ trans = self.column_groups
2693
+ regression_cols = self.regression_cols
2694
+
2695
+ if perc_diff is None:
2696
+ poa_trans_key = regression_cols["poa"]
2697
+ perc_diff = {poa_trans_key: 0.05}
2698
+
2699
+ for key, threshold in perc_diff.items():
2700
+ if "index" in locals():
2701
+ # if index has been assigned then take intersection
2702
+ sensors_df = df[trans[key]]
2703
+ next_index = sensor_filter(sensors_df, threshold, row_filter=row_filter)
2704
+ index = index.intersection(next_index) # noqa: F821
2705
+ else:
2706
+ # if index has not been assigned then assign it
2707
+ sensors_df = df[trans[key]]
2708
+ index = sensor_filter(sensors_df, threshold, row_filter=row_filter)
2709
+
2710
+ df_out = self.data_filtered.loc[index, :]
2711
+
2712
+ if inplace:
2713
+ self.data_filtered = df_out
2714
+ else:
2715
+ return df_out
2716
+
2717
+ @update_summary
2718
+ def filter_clearsky(
2719
+ self, window_length=20, ghi_col=None, inplace=True, keep_clear=True, **kwargs
2720
+ ):
2721
+ """
2722
+ Use pvlib detect_clearsky to remove periods with unstable irradiance.
2723
+
2724
+ The pvlib detect_clearsky function compares modeled clear sky ghi
2725
+ against measured clear sky ghi to detect periods of clear sky. Refer
2726
+ to the pvlib documentation for additional information.
2727
+
2728
+ By default uses data identified by the `column_groups` dictionary
2729
+ as ghi and modeled ghi. Issues warning if there is no modeled ghi
2730
+ data, or the measured ghi data has not been aggregated.
2731
+
2732
+ Parameters
+ ----------
2733
+ window_length : int, default 20
2734
+ Length of sliding time window in minutes. Must be greater than 2
2735
+ periods. Default of 20 works well for 5 minute data intervals.
2736
+ pvlib default of 10 minutes works well for 1min data.
2737
+ ghi_col : str, default None
2738
+ The name of a column name of measured GHI data. Overrides default
2739
+ attempt to automatically identify a column of GHI data.
2740
+ inplace : bool, default True
2741
+ When true removes periods with unstable irradiance. When false
2742
+ returns pvlib detect_clearsky results, which by default is a series
2743
+ of booleans.
2744
+ keep_clear : bool, default True
2745
+ Set to False to keep cloudy periods.
2746
+ **kwargs
2747
+ kwargs are passed to pvlib detect_clearsky. See pvlib
2748
+ documentation for details.
2749
+ """
2750
+ if "ghi_mod_csky" not in self.data_filtered.columns:
2751
+ return warnings.warn(
2752
+ "Modeled clear sky data must be availabe to "
2753
+ "run this filter method. Use CapData "
2754
+ "load_data clear_sky option."
2755
+ )
2756
+ if ghi_col is None:
2757
+ ghi_keys = []
2758
+ for key in self.column_groups.keys():
2759
+ defs = key.split("-")
2760
+ if len(defs) == 1:
2761
+ continue
2762
+ if "ghi" == key.split("-")[1]:
2763
+ ghi_keys.append(key)
2764
+ ghi_keys.remove("irr-ghi-clear_sky")
2765
+
2766
+ if len(ghi_keys) > 1:
2767
+ return warnings.warn(
2768
+ "Too many ghi categories. Pass column "
2769
+ "name to ghi_col to use a specific "
2770
+ "column."
2771
+ )
2772
+ else:
2773
+ meas_ghi = ghi_keys[0]
2774
+
2775
+ meas_ghi = self.floc[meas_ghi]
2776
+ if meas_ghi.shape[1] > 1:
2777
+ warnings.warn(
2778
+ "Averaging measured GHI data. Pass column name "
2779
+ "to ghi_col to use a specific column."
2780
+ )
2781
+ meas_ghi = meas_ghi.mean(axis=1)
2782
+ else:
2783
+ meas_ghi = self.data_filtered[ghi_col]
2784
+
2785
+ clear_per = detect_clearsky(
2786
+ meas_ghi,
2787
+ self.data_filtered["ghi_mod_csky"],
2788
+ meas_ghi.index,
2789
+ window_length,
2790
+ **kwargs,
2791
+ )
2792
+ if not any(clear_per):
2793
+ return warnings.warn(
2794
+ "No clear periods detected. Try increasing the window length."
2795
+ )
2796
+
2797
+ if keep_clear:
2798
+ df_out = self.data_filtered[clear_per]
2799
+ else:
2800
+ df_out = self.data_filtered[~clear_per]
2801
+
2802
+ if inplace:
2803
+ self.data_filtered = df_out
2804
+ else:
2805
+ return df_out
2806
+
2807
+ @update_summary
2808
+ def filter_missing(self, columns=None):
2809
+ """
2810
+ Drops time intervals with missing data for specified columns.
2811
+
2812
+ By default drops intervals which have missing data in the columns defined
2813
+ by `regression_cols`.
2814
+
2815
+ Parameters
2816
+ ----------
2817
+ columns : list, default None
2818
+ Subset of columns to check for missing data.
2819
+ """
2820
+ if columns is None:
2821
+ columns = list(self.regression_cols.values())
2822
+ df_reg_vars = self.data_filtered[columns]
2823
+ ix = df_reg_vars.dropna().index
2824
+ self.data_filtered = self.data_filtered.loc[ix, :]
2825
+
2826
+ def filter_op_state(self, op_state, mult_inv=None, inplace=True):
2827
+ """
2828
+ NOT CURRENTLY IMPLEMENTED - Filter on inverter operation state.
2829
+
2830
+ This filter is rarely useful in practice, but will be re-implemented
2831
+ if requested.
2832
+
2833
+ Parameters
2834
+ ----------
2835
+ data : str
2836
+ 'sim' or 'das' determines if filter is on sim or das data
2837
+ op_state : int
2838
+ integer inverter operating state to keep
2839
+ mult_inv : list of tuples, [(start, stop, op_state), ...]
2840
+ List of tuples where start is the first column of an type of
2841
+ inverter, stop is the last column and op_state is the operating
2842
+ state for the inverter type.
2843
+ inplace : bool, default True
2844
+ When True writes over current filtered dataframe. When False
2845
+ returns CapData object.
2846
+
2847
+ Returns
2848
+ -------
2849
+ CapData
2850
+ Returns filtered CapData object when inplace is False.
2851
+
2852
+ Todo
2853
+ ----
2854
+ Complete move to capdata
2855
+ Needs to be updated to work as capdata rather than captest method.
2856
+ Remove call to __flt_setup and related subsequent use of flt_cd.
2857
+ """
2858
+ pass
2859
+ # if data == 'sim':
2860
+ # print('Method not implemented for pvsyst data.')
2861
+ # return None
2862
+ #
2863
+ # flt_cd = self.__flt_setup(data)
2864
+ #
2865
+ # for key in flt_cd.trans_keys:
2866
+ # if key.find('op') == 0:
2867
+ # selection = key
2868
+ #
2869
+ # df = flt_cd.df[flt_cd.trans[selection]]
2870
+ # # print('df shape: {}'.format(df.shape))
2871
+ #
2872
+ # if mult_inv is not None:
2873
+ # return_index = flt_cd.df.index
2874
+ # for pos_tup in mult_inv:
2875
+ # # print('pos_tup: {}'.format(pos_tup))
2876
+ # inverters = df.iloc[:, pos_tup[0]:pos_tup[1]]
2877
+ # # print('inv shape: {}'.format(inverters.shape))
2878
+ # df_temp = flt_cd.df[(inverters == pos_tup[2]).all(axis=1)]
2879
+ # # print('df_temp shape: {}'.format(df_temp.shape))
2880
+ # return_index = return_index.intersection(df_temp.index)
2881
+ # flt_cd.df = flt_cd.df.loc[return_index, :]
2882
+ # else:
2883
+ # flt_cd.df = flt_cd.df[(df == op_state).all(axis=1)]
2884
+ #
2885
+ # if inplace:
2886
+ # if data == 'das':
2887
+ # self.flt_das = flt_cd
2888
+ # if data == 'sim':
2889
+ # # should not run as 'sim' is not implemented
2890
+ # self.flt_sim = flt_cd
2891
+ # else:
2892
+ # return flt_cd
2893
+
2894
+ def get_summary(self):
2895
+ """
2896
+ Print a summary of filtering applied to the data_filtered attribute.
2897
+
2898
+ The summary dataframe shows the history of the filtering steps applied
2899
+ to the data including the timestamps remaining after each step, the
2900
+ timestamps removed by each step and the arguments used to call each
2901
+ filtering method.
2902
+
2903
+ If the filter arguments are cutoff, the max column width can be
2904
+ increased by setting pd.options.display.max_colwidth.
2905
+
2906
+ Parameters
2907
+ ----------
2908
+ None
2909
+
2910
+ Returns
2911
+ -------
2912
+ Pandas DataFrame
2913
+ """
2914
+ try:
2915
+ df = pd.DataFrame(
2916
+ data=self.summary,
2917
+ index=pd.MultiIndex.from_tuples(self.summary_ix),
2918
+ columns=columns,
2919
+ )
2920
+ return df
2921
+ except TypeError:
2922
+ print("No filters have been run.")
2923
+
2924
+ @update_summary
2925
+ def rep_cond(
2926
+ self,
2927
+ irr_bal=False,
2928
+ percent_filter=20,
2929
+ w_vel=None,
2930
+ inplace=True,
2931
+ func={"poa": perc_wrap(60), "t_amb": "mean", "w_vel": "mean"},
2932
+ freq=None,
2933
+ grouper_kwargs={},
2934
+ rc_kwargs={},
2935
+ ):
2936
+ """
2937
+ Calculate reporting conditions.
2938
+
2939
+ Parameters
2940
+ ----------
2941
+ irr_bal: boolean, default False
2942
+ If true, uses the ReportingIrradiance class to determine the
2943
+ reporting conditions. Replaces the calculations specified by func
2944
+ with or without freq.
2945
+ percent_filter : int, default 20
2946
+ Percentage as integer used to filter around reporting
2947
+ irradiance in the ReportingIrradiance calculation.
2948
+ func: callable, string, dictionary, or list of string/callables
2949
+ Determines how the reporting condition is calculated.
2950
+ Default is a dictionary poa - 60th numpy_percentile, t_amb - mean
2951
+ w_vel - mean
2952
+ Can pass a string function ('mean') to calculate each reporting
2953
+ condition the same way.
2954
+ freq: str
2955
+ String pandas offset alias to specify aggregation frequency
2956
+ for reporting condition calculation. Ex '60D' for 60 Days or
2957
+ 'MS' for months start.
2958
+ w_vel: int
2959
+ If w_vel is not none, then wind reporting condition will be set to
2960
+ the value specified.
2962
+ inplace: bool, True by default
2963
+ When true updates object rc parameter, when false returns
2964
+ dictionary of reporting conditions.
2965
+ grouper_kwargs : dict
2966
+ Passed to pandas Grouper to control label and closed side of
2967
+ intervals. See pandas Grouper documentation for details. Default is
2968
+ left labeled and left closed.
2969
+ rc_kwargs : dict
2970
+ Passed to the ReportingIrradiance class if `irr_bal` is set to True.
2971
+
2972
+ Returns
2973
+ -------
2974
+ pandas DataFrame
+ Returns a DataFrame of reporting conditions if inplace=False,
+ otherwise returns None.
2979
+ """
2980
+ df = self.floc[["poa", "t_amb", "w_vel"]]
2981
+ df = df.rename(
2982
+ columns={
2983
+ df.columns[0]: "poa",
2984
+ df.columns[1]: "t_amb",
2985
+ df.columns[2]: "w_vel",
2986
+ }
2987
+ )
2988
+
2989
+ RCs_df = pd.DataFrame(df.agg(func)).T
2990
+
2991
+ if irr_bal:
2992
+ self.rc_tool = ReportingIrradiance(
2993
+ df,
2994
+ "poa",
2995
+ percent_band=percent_filter,
2996
+ **rc_kwargs,
2997
+ )
2998
+ results = self.rc_tool.get_rep_irr()
2999
+ flt_df = results[1]
3000
+ temp_RC = flt_df["t_amb"].mean()
3001
+ wind_RC = flt_df["w_vel"].mean()
3002
+ RCs_df = pd.DataFrame(
3003
+ {"poa": results[0], "t_amb": temp_RC, "w_vel": wind_RC}, index=[0]
3004
+ )
3005
+
3006
+ if w_vel is not None:
3007
+ RCs_df["w_vel"][0] = w_vel
3008
+
3009
+ if freq is not None:
3010
+ # wrap_seasons passes df through unchanged unless freq is one of
3011
+ # 'BQ-JAN', 'BQ-FEB', 'BQ-APR', 'BQ-MAY', 'BQ-JUL',
3012
+ # 'BQ-AUG', 'BQ-OCT', 'BQ-NOV'
3013
+ df = wrap_seasons(df, freq)
3014
+ df_grpd = df.groupby(pd.Grouper(freq=freq, **grouper_kwargs))
3015
+
3016
+ if irr_bal:
3017
+ ix = pd.DatetimeIndex(list(df_grpd.groups.keys()), freq=freq)
3018
+ poa_RC = []
3019
+ temp_RC = []
3020
+ wind_RC = []
3021
+ for name, month in df_grpd:
3022
+ self.rc_tool = ReportingIrradiance(
3023
+ month,
3024
+ "poa",
3025
+ percent_band=percent_filter,
3026
+ **rc_kwargs,
3027
+ )
3028
+ results = self.rc_tool.get_rep_irr()
3029
+ poa_RC.append(results[0])
3030
+ flt_df = results[1]
3031
+ temp_RC.append(flt_df["t_amb"].mean())
3032
+ wind_RC.append(flt_df["w_vel"].mean())
3033
+ RCs_df = pd.DataFrame(
3034
+ {"poa": poa_RC, "t_amb": temp_RC, "w_vel": wind_RC}, index=ix
3035
+ )
3036
+ else:
3037
+ RCs_df = df_grpd.agg(func)
3038
+
3039
+ if w_vel is not None:
3040
+ RCs_df["w_vel"] = w_vel
3041
+
3042
+ if inplace:
3043
+ print("Reporting conditions saved to rc attribute.")
3044
+ print(RCs_df)
3045
+ self.rc = RCs_df
3046
+ else:
3047
+ return RCs_df
3048
+
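+ # --- Illustrative sketch (editorial addition, not package source). ---
+ # Typical reporting condition calls; the arguments shown are assumptions:
+ #
+ #     meas.rep_cond()                    # 60th percentile POA, mean t_amb/w_vel
+ #     meas.rep_cond(freq='MS')           # monthly reporting conditions
+ #     meas.rep_cond(irr_bal=True, percent_filter=20)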
3049
+ def predict_capacities(self, irr_filter=True, percent_filter=20, **kwargs):
3050
+ """
3051
+ Calculate expected capacities.
3052
+
3053
+ Parameters
3054
+ ----------
3055
+ irr_filter : bool, default True
3056
+ When true will filter each group of data by a percentage around the
3057
+ reporting irradiance for that group. The data groups are
3058
+ determined from the reporting irradiance attribute.
3059
+ percent_filter : float or int or tuple, default 20
3060
+ Percentage or tuple of percentages used to filter around reporting
3061
+ irradiance in the irr_rc_balanced function. Required argument when
3062
+ irr_bal is True.
3063
+ Tuple option allows specifying different percentage for above and
3064
+ below reporting irradiance. (below, above)
3065
+ **kwargs
3066
+ NOTE: Should match kwargs used to calculate reporting conditions.
3067
+ Passed to filter_grps which passes on to pandas Grouper to control
3068
+ label and closed side of intervals.
3069
+ See pandas Grouper documentation for details. Default is left
3070
+ labeled and left closed.
3071
+ """
3072
+ df = self.floc[["poa", "t_amb", "w_vel", "power"]]
3073
+ df = df.rename(
3074
+ columns={
3075
+ df.columns[0]: "poa",
3076
+ df.columns[1]: "t_amb",
3077
+ df.columns[2]: "w_vel",
3078
+ df.columns[3]: "power",
3079
+ }
3080
+ )
3081
+
3082
+ if self.rc is None:
3083
+ return warnings.warn(
3084
+ "Reporting condition attribute is None.\
3085
+ Use rep_cond to generate RCs."
3086
+ )
3087
+
3088
+ low, high = perc_bounds(percent_filter)
3089
+ freq = self.rc.index.freq
3090
+ df = wrap_seasons(df, freq)
3091
+ grps = df.groupby(by=pd.Grouper(freq=freq, **kwargs))
3092
+
3093
+ if irr_filter:
3094
+ grps = filter_grps(grps, self.rc, "poa", low, high, freq)
3095
+
3096
+ error = float(self.tolerance.split(sep=" ")[1]) / 100
3097
+ results = pred_summary(grps, self.rc, error, fml=self.regression_formula)
3098
+
3099
+ return results
3100
+
3101
+ @update_summary
3102
+ def fit_regression(self, filter=False, inplace=True, summary=True):
3103
+ """
3104
+ Perform a regression with statsmodels on filtered data.
3105
+
3106
+ Parameters
3107
+ ----------
3108
+ filter: bool, default False
3109
+ When true removes timestamps where the residuals are greater than
3110
+ two standard deviations. When false just calculates ordinary least
3111
+ squares regression.
3112
+ inplace: bool, default True
3113
+ If filter is true and inplace is true, then function overwrites the
3114
+ filtered data. If false returns a DataFrame.
3115
+ summary: bool, default True
3116
+ Set to false to not print regression summary.
3117
+
3118
+ Returns
3119
+ -------
3120
+ DataFrame
3121
+ Returns a filtered DataFrame if filter is True and inplace is
3122
+ False.
3123
+ """
3124
+ df = self.get_reg_cols()
3125
+
3126
+ reg = fit_model(df, fml=self.regression_formula)
3127
+
3128
+ if filter:
3129
+ print("NOTE: Regression used to filter outlying points.\n\n")
3130
+ if summary:
3131
+ print(reg.summary())
3132
+ df = df[np.abs(reg.resid) < 2 * np.sqrt(reg.scale)]
3133
+ dframe_flt = self.data_filtered.loc[df.index, :]
3134
+ if inplace:
3135
+ self.data_filtered = dframe_flt
3136
+ else:
3137
+ return dframe_flt
3138
+ else:
3139
+ if summary:
3140
+ print(reg.summary())
3141
+ self.regression_results = reg
3142
+
3143
+ def uncertainty(self):
3144
+ """Calculate random standard uncertainty of the regression.
3145
+
3146
+ (SEE times the square root of the leverage of the reporting
3147
+ conditions).
3148
+
3149
+ Not fully implemented yet. Need to review and determine what actual
3150
+ variable should be.
3151
+ """
3152
+ pass
3153
+ # SEE = np.sqrt(self.regression_results.mse_resid)
3154
+ #
3155
+ # df = self.get_reg_cols()
3156
+ #
3157
+ # rc_pt = {key: val[0] for key, val in self.rc.items()}
3158
+ # rc_pt['power'] = actual
3159
+ # df.append([rc_pt])
3160
+ #
3161
+ # reg = fit_model(df, fml=self.regression_formula)
3162
+ #
3163
+ # infl = reg.get_influence()
3164
+ # leverage = infl.hat_matrix_diag[-1]
3165
+ # sy = SEE * np.sqrt(leverage)
3166
+ #
3167
+ # return(sy)
3168
+
3169
+ def spatial_uncert(self, column_groups):
3170
+ """
3171
+ Spatial uncertainties of the independent regression variables.
3172
+
3173
+ Parameters
3174
+ ----------
3175
+ column_groups : list
3176
+ Measurement groups to calculate spatial uncertainty.
3177
+
3178
+ Returns
3179
+ -------
3180
+ None, stores dictionary of spatial uncertainties as an attribute.
3181
+ """
3182
+ spatial_uncerts = {}
3183
+ for group in column_groups:
3184
+ df = self.floc[group]
3185
+ # prevent aggregation from updating column groups?
3186
+ # would not need the below line then
3187
+ df = df[[col for col in df.columns if "agg" not in col]]
3188
+ qty_sensors = df.shape[1]
3189
+ s_spatial = df.std(axis=1)
3190
+ b_spatial_j = s_spatial / (qty_sensors ** (1 / 2))
3191
+ b_spatial = ((b_spatial_j**2).sum() / b_spatial_j.shape[0]) ** (1 / 2)
3192
+ spatial_uncerts[group] = b_spatial
3193
+ self.spatial_uncerts = spatial_uncerts
3194
+
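+ # --- Illustrative sketch (editorial addition, not package source). ---
+ # Equivalent pandas/numpy form of the spatial uncertainty computed above for
+ # a single group of sensors, where `df` holds one column per sensor:
+ #
+ #     s_spatial = df.std(axis=1)                        # std across sensors
+ #     b_j = s_spatial / np.sqrt(df.shape[1])            # per-interval term
+ #     b_spatial = np.sqrt((b_j ** 2).sum() / len(b_j))  # quadrature mean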
3195
+ def expanded_uncert(self, grp_to_term, k=1.96):
3196
+ """
3197
+ Calculate expanded uncertainty of the predicted power.
3198
+
3199
+ Adds instrument uncertainty and spatial uncertainty in quadrature and
3200
+ passes the result through the regression to calculate the
3201
+ Systematic Standard Uncertainty, which is then added in quadrature with
3202
+ the Random Standard Uncertainty of the regression and multiplied by the
3203
+ k factor, `k`.
3204
+
3205
+ 1. Combine by adding in quadrature the spatial and instrument uncertainties
3206
+ for each measurand.
3207
+ 2. Add the absolute uncertainties from step 1 to each of the respective
3208
+ reporting conditions to determine a value for the reporting condition
3209
+ plus the uncertainty.
3210
+ 3. Calculate the predicted power using the RCs plus uncertainty three
3211
+ times i.e. calculate for each RC plus uncertainty. For example, to
3212
+ estimate the impact of the uncertainty of the reporting irradiance one
3213
+ would calculate expected power using the irradiance RC plus irradiance
3214
+ uncertainty at the reporting irradiance and the original temperature and
3215
+ wind reporting conditions that have not had any uncertainty added to them.
3216
+ 4. Calculate the percent difference between the three new expected power
3217
+ values that include uncertainty of the RCs and the expected power with
3218
+ the unmodified RC.
3219
+ 5. Take the square root of the sum of the squares of those three percent
3220
+ differences to obtain the Systematic Standard Uncertainty (bY).
3221
+
3222
+ Expects CapData to have instrument_uncert and spatial_uncerts
3223
+ attributes with matching keys.
3224
+
3225
+ Parameters
3226
+ ----------
3227
+ grp_to_term : dict
3228
+ Map the groups of measurement types to the term in the
3229
+ regression formula that was regressed against an aggregated value
3230
+ (typically mean) from that group.
3231
+ k : numeric
3232
+ Coverage factor.
3233
+
3234
+ Returns
3235
+ -------
3236
+ Expanded uncertainty as a decimal value.
3237
+ """
3238
+ pred = self.regression_results.get_prediction(self.rc)
3239
+ pred_cap = pred.predicted_mean[0]
3240
+ perc_diffs = {}
3241
+ for group, inst_uncert in self.instrument_uncert.items():
3242
+ by_group = (inst_uncert**2 + self.spatial_uncerts[group] ** 2) ** (1 / 2)
3243
+ rcs = self.rc.copy()
3244
+ rcs.loc[0, grp_to_term[group]] = rcs.loc[0, grp_to_term[group]] + by_group
3245
+ pred_cap_uncert = self.regression_results.get_prediction(
3246
+ rcs
3247
+ ).predicted_mean[0]
3248
+ perc_diffs[group] = (pred_cap_uncert - pred_cap) / pred_cap
3249
+ df = pd.DataFrame(perc_diffs.values())
3250
+ by = (df**2).sum().values[0] ** (1 / 2)
3251
+ sy = pred.se_obs[0] / pred_cap
3252
+ return (by**2 + sy**2) ** (1 / 2) * k
3253
+
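+ # --- Illustrative sketch (editorial addition, not package source). ---
+ # The final combination above is U = k * sqrt(by**2 + sy**2), where by is the
+ # systematic standard uncertainty from perturbing the reporting conditions and
+ # sy is the random standard uncertainty of the regression. With assumed values
+ # by = 0.015, sy = 0.010, and k = 1.96, U is roughly 0.035 (about 3.5%).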
3254
+ def get_filtering_table(self):
3255
+ """
3256
+ Returns DataFrame showing which filter removed each filtered time interval.
3257
+
3258
+ Time intervals removed are marked with a "1".
3259
+ Time intervals kept are marked with a "0".
3260
+ Time intervals removed by a previous filter are np.nan/blank.
3261
+ Columns/filters are in order they are run from left to right.
3262
+ The last column, labeled "all_filters", is True for intervals that were
3263
+ not removed by any of the filters.
3264
+ """
3265
+ filtering_data = pd.DataFrame(index=self.data.index)
3266
+ for i, (flt_step_kept, flt_step_removed) in enumerate(
3267
+ zip(self.kept, self.removed)
3268
+ ):
3269
+ if i == 0:
3270
+ filtering_data.loc[:, flt_step_removed["name"]] = 0
3271
+ else:
3272
+ filtering_data.loc[self.kept[i - 1]["index"], flt_step_kept["name"]] = 0
3273
+ filtering_data.loc[flt_step_removed["index"], flt_step_removed["name"]] = 1
3274
+
3275
+ filtering_data["all_filters"] = filtering_data.apply(
3276
+ lambda x: all(x == 0), axis=1
3277
+ )
3278
+ return filtering_data
3279
+
3280
+ def print_points_summary(self, hrs_req=12.5):
3281
+ """
3282
+ Print summary data on the number of points collected.
3283
+ """
3284
+ self.get_length_test_period()
3285
+ self.get_pts_required(hrs_req=hrs_req)
3286
+ self.set_test_complete(self.pts_required)
3287
+ pts_collected = self.data_filtered.shape[0]
3288
+ avg_pts_per_day = pts_collected / self.length_test_period
3289
+ print("length of test period to date: {} days".format(self.length_test_period))
3290
+ if self.test_complete:
3291
+ print(
3292
+ "sufficient points have been collected. {} points required; "
3293
+ "{} points collected".format(self.pts_required, pts_collected)
3294
+ )
3295
+ else:
3296
+ print(
3297
+ "{} points of {} points needed, {} remaining to collect.".format(
3298
+ pts_collected, self.pts_required, self.pts_required - pts_collected
3299
+ )
3300
+ )
3301
+ print("{:0.2f} points / day on average.".format(avg_pts_per_day))
3302
+ print(
3303
+ "Approximate days remaining: {:0.0f}".format(
3304
+ round(((self.pts_required - pts_collected) / avg_pts_per_day), 0)
3305
+ + 1
3306
+ )
3307
+ )
3308
+
3309
+ def get_length_test_period(self):
3310
+ """
3311
+ Get length of test period.
3312
+
3313
+ Uses length of `data` unless `filter_time` has been run, then uses length
3314
+ of the kept data after `filter_time` was run the first time. Subsequent
3315
+ uses of `filter_time` are ignored.
3316
+
3317
+ Rounds up to a period of full days.
3318
+
3319
+ Returns
3320
+ -------
3321
+ int
3322
+ Days in test period.
3323
+ """
3324
+ test_period = self.data.index[-1] - self.data.index[0]
3325
+ for filter in self.kept:
3326
+ if "filter_time" == filter["name"]:
3327
+ test_period = filter["index"][-1] - filter["index"][0]
3328
+ self.length_test_period = test_period.ceil("D").days
3329
+
3330
+ def get_pts_required(self, hrs_req=12.5):
3331
+ """
3332
+ Set number of data points required for complete test attribute.
3333
+
3334
+ Parameters
3335
+ ----------
3336
+ hrs_req : numeric, default 12.5
3337
+ Number of hours to be represented by final filtered test data set.
3338
+ Default of 12.5 hours is dictated by ASTM E2848 and corresponds to
3339
+ 750 1-minute data points, 150 5-minute, or 50 15-minute points.
3340
+ """
3341
+ self.pts_required = (hrs_req * 60) / util.get_common_timestep(
3342
+ self.data, units="m", string_output=False
3343
+ )
3344
+
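+ # --- Illustrative sketch (editorial addition, not package source). ---
+ # Worked examples of pts_required = (hrs_req * 60) / timestep_minutes for the
+ # default 12.5 hours, matching the docstring above:
+ #
+ #     (12.5 * 60) / 1   # -> 750.0 points of 1-minute data
+ #     (12.5 * 60) / 5   # -> 150.0 points of 5-minute data
+ #     (12.5 * 60) / 15  # ->  50.0 points of 15-minute data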
3345
+ def set_test_complete(self, pts_required):
3346
+ """Sets `test_complete` attribute.
3347
+
3348
+ Parameters
3349
+ ----------
3350
+ pts_required : int
3351
+ Number of points required to remain after filtering for a complete test.
3352
+ """
3353
+ self.test_complete = self.data_filtered.shape[0] >= pts_required
3354
+
3355
+ def column_groups_to_excel(self, save_to="./column_groups.xlsx"):
3356
+ """Export the column groups attribute to an excel file.
3357
+
3358
+ Parameters
3359
+ ----------
3360
+ save_to : str
3361
+ File path to save column groups to. Should include .xlsx.
3362
+ """
3363
+ pd.DataFrame.from_dict(
3364
+ self.column_groups.data, orient="index"
3365
+ ).stack().to_frame().droplevel(1).to_excel(save_to, header=False)
3366
+
3367
+
3368
+ if __name__ == "__main__":
3369
+ import doctest
3370
+ import pandas as pd # noqa F811
3371
+
3372
+ das = CapData("das")
3373
+ das.load_data(
3374
+ path="../examples/data/", fname="example_meas_data.csv", source="AlsoEnergy"
3375
+ )
3376
+ das.set_regression_cols(
3377
+ power="-mtr-", poa="irr-poa-", t_amb="temp-amb-", w_vel="wind--"
3378
+ )
3379
+
3380
+ doctest.testmod()