captest 0.11.2__py2.py3-none-any.whl → 0.13.0__py2.py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- captest/__init__.py +1 -0
- captest/_version.py +3 -3
- captest/capdata.py +275 -339
- captest/io.py +150 -64
- captest/plotting.py +492 -0
- captest/prtest.py +1 -1
- captest/util.py +14 -6
- {captest-0.11.2.dist-info → captest-0.13.0.dist-info}/METADATA +30 -29
- captest-0.13.0.dist-info/RECORD +13 -0
- {captest-0.11.2.dist-info → captest-0.13.0.dist-info}/WHEEL +1 -1
- captest-0.11.2.dist-info/RECORD +0 -12
- {captest-0.11.2.dist-info → captest-0.13.0.dist-info}/LICENSE.txt +0 -0
- {captest-0.11.2.dist-info → captest-0.13.0.dist-info}/top_level.txt +0 -0
captest/io.py
CHANGED
@@ -1,4 +1,5 @@
 # this file is formatted with black
+import copy
 import dateutil
 import datetime
 from pathlib import Path
@@ -19,6 +20,7 @@ from captest import util
 def flatten_multi_index(columns):
     return ["_".join(col_name) for col_name in columns.to_list()]
 
+
 def load_excel_column_groups(path):
     """
     Load column groups from an excel file.
@@ -45,9 +47,10 @@ def load_excel_column_groups(path):
     dict
         Dictionary mapping column group names to lists of column names.
     """
-    df = pd.read_excel(path, header=None).
+    df = pd.read_excel(path, header=None).ffill(axis='index')
     return df.groupby(0)[1].apply(list).to_dict()
 
+
 def load_pvsyst(
     path,
     name="pvsyst",
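The `.ffill(axis='index')` change forward fills blank cells in the group-name column before grouping, so merged excel cells still map every data column to its group. A minimal sketch of the two-column layout this expects (group and column names invented):

```python
import pandas as pd

# Column 0 holds group names (blanks under a name read in as NaN),
# column 1 holds the member column names.
df = pd.DataFrame({0: ["irr-poa", None, "temp-amb"], 1: ["poa_1", "poa_2", "amb_1"]})
df = df.ffill(axis="index")  # NaN group cells inherit the name above them
print(df.groupby(0)[1].apply(list).to_dict())
# {'irr-poa': ['poa_1', 'poa_2'], 'temp-amb': ['amb_1']}
```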
@@ -58,6 +61,9 @@ def load_pvsyst(
     """
     Load data from a PVsyst energy production model.
 
+    Will load day first or month first dates. Expects files that use a comma as a
+    separator rather than a semicolon.
+
     Parameters
     ----------
     path : str
@@ -70,7 +76,8 @@ def load_pvsyst(
         By default sets power to E_Grid, poa to GlobInc, t_amb to T Amb, and w_vel to
         WindVel. Set to False to not set regression columns on load.
     **kwargs
-        Use to pass additional kwargs to pandas read_csv.
+        Use to pass additional kwargs to pandas read_csv. Pass sep=';' to load files
+        that use semicolons instead of commas as the separator.
 
     Returns
     -------
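The updated docstring points at the kwargs passthrough for semicolon-separated exports. A hedged usage sketch (the file path here is invented):

```python
from captest import io

# Extra keyword arguments flow through to pandas.read_csv, so a
# semicolon-separated PVsyst export loads with sep=';'.
pvsyst = io.load_pvsyst("./data/pvsyst_hourly_output.csv", sep=";")
```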
@@ -107,8 +114,33 @@ def load_pvsyst(
             break
 
     pvraw.columns = pvraw.columns.droplevel(1)
-    dates = pvraw.loc[:, "date"]
     try:
+        dates = pvraw.loc[:, "date"]
+    except KeyError:
+        warnings.warn(
+            "No 'date' column found in the PVsyst data. This may be due to "
+            "the separator being a semicolon ';' rather than a comma ','. "
+            "If this is the case, try passing sep=';' when calling load_pvsyst. "
+            "Otherwise the date column may actually be missing. Exception:"
+        )
+        raise
+    # PVsyst creates dates like '01/01/90 00:00' i.e. January 1st, 1990.
+    # Opening the PVsyst output in excel will likely result in the dates modified to
+    # 1/1/1990 0:00. The strftime format specified won't load the excel modified dates
+    # so these are caught by checking for consistent length and reformatted
+    if not all(dates.str.len() == 14):
+        date_parts = dates.str.split(' ').str[0].str.split('/')
+        time_parts = dates.str.split(' ').str[1].str.split(':')
+        dates = (
+            date_parts.str[0].str.zfill(2) + '/' +
+            date_parts.str[1].str.zfill(2) + '/' +
+            '90 ' +
+            time_parts.str[0].str.zfill(2) + ':' +
+            time_parts.str[1]
+        )
+    try:
+        # mm/dd/yy hh:mm, lower case y gives
+        # Year without century as a zero-padded decimal number. e.g. 00, 01, …, 99
         dt_index = pd.to_datetime(dates, format="%m/%d/%y %H:%M")
     except ValueError:
         warnings.warn(
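The new fallback rebuilds excel-mangled timestamps into the fixed-width form that `%m/%d/%y %H:%M` expects. The padding logic in isolation, on invented inputs:

```python
import pandas as pd

dates = pd.Series(["1/1/1990 0:00", "12/31/1990 23:00"])  # excel-modified style
date_parts = dates.str.split(" ").str[0].str.split("/")
time_parts = dates.str.split(" ").str[1].str.split(":")
padded = (
    date_parts.str[0].str.zfill(2) + "/"
    + date_parts.str[1].str.zfill(2) + "/"
    + "90 "
    + time_parts.str[0].str.zfill(2) + ":"
    + time_parts.str[1]
)
print(padded.tolist())  # ['01/01/90 00:00', '12/31/90 23:00']
pd.to_datetime(padded, format="%m/%d/%y %H:%M")  # now parses cleanly
```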
@@ -131,7 +163,6 @@ def load_pvsyst(
     cd.data["E_Grid"] = cd.data["E_Grid"] / egrid_unit_adj_factor
     cd.data_filtered = cd.data.copy()
     cd.column_groups = cg.group_columns(cd.data)
-    cd.trans_keys = list(cd.column_groups.keys())
     if set_regression_columns:
         cd.set_regression_cols(
             power="E_Grid", poa="GlobInc", t_amb="T_Amb", w_vel="WindVel"
@@ -139,15 +170,13 @@ def load_pvsyst(
     return cd
 
 
-
-
 def file_reader(path, **kwargs):
     """
     Read measured solar data from a csv file.
 
     Utilizes pandas read_csv to import measure solar data from a csv file.
-    Attempts a few
-    by looking for a date in the first column, and
+    Attempts a few different encodings, tries to determine the header end
+    by looking for a date in the first column, and concatenates column
     headings to a single string.
 
     Parameters
@@ -161,16 +190,20 @@ def file_reader(path, **kwargs):
     -------
     pandas DataFrame
     """
+    default_kwargs = {
+        'index_col': 0,
+        'parse_dates': True,
+        'skip_blank_lines': True,
+        'low_memory': False,
+    }
+    for key, value in default_kwargs.items():
+        kwargs.setdefault(key, value)
     encodings = ["utf-8", "latin1", "iso-8859-1", "cp1252"]
     for encoding in encodings:
+        kwargs['encoding'] = encoding
        try:
            data_file = pd.read_csv(
                path,
-                encoding=encoding,
-                index_col=0,
-                parse_dates=True,
-                skip_blank_lines=True,
-                low_memory=False,
                **kwargs,
            )
        except UnicodeDecodeError:
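Folding the hard-coded read_csv arguments into `default_kwargs` plus `setdefault` lets caller-supplied kwargs override the defaults instead of raising duplicate-argument errors. The pattern in miniature:

```python
kwargs = {"parse_dates": False}  # pretend the caller passed this
default_kwargs = {"index_col": 0, "parse_dates": True}
for key, value in default_kwargs.items():
    kwargs.setdefault(key, value)  # only fills keys the caller did not set
print(kwargs)  # {'parse_dates': False, 'index_col': 0} -- the caller wins
```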
@@ -178,6 +211,12 @@ def file_reader(path, **kwargs):
         else:
             break
     data_file.dropna(how="all", axis=0, inplace=True)
+    if data_file.index.equals(pd.Index(np.arange(len(data_file.index)))):
+        kwargs['index_col'] = 1
+        data_file = pd.read_csv(
+            path,
+            **kwargs,
+        )
     if not isinstance(data_file.index[0], pd.Timestamp):
         for i, _indice in enumerate(data_file.index):
             try:
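The added check detects a first read that produced the default integer index, meaning the timestamps were not in the first column, and retries with `index_col=1`. The detection in isolation:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"a": [10, 20, 30]})  # stand-in for a csv parsed with index_col=0
# True when the index is just 0..n-1, which triggers the re-read above.
print(df.index.equals(pd.Index(np.arange(len(df.index)))))  # True
```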
@@ -189,18 +228,11 @@ def file_reader(path, **kwargs):
             except ValueError:
                 continue
         header = list(np.arange(header_end))
+        kwargs.setdefault('header', header)
         data_file = pd.read_csv(
             path,
-            encoding=encoding,
-            header=header,
-            index_col=0,
-            parse_dates=True,
-            skip_blank_lines=True,
-            low_memory=False,
             **kwargs,
         )
-
-    data_file = data_file.apply(pd.to_numeric)
     if isinstance(data_file.columns, pd.MultiIndex):
         data_file.columns = flatten_multi_index(data_file.columns)
     data_file = data_file.rename(columns=(lambda x: x.strip()))
@@ -218,6 +250,7 @@ class DataLoader:
     sys: Optional[dict] = field(default=None)
     file_reader: object = file_reader
     files_to_load: Optional[list] = field(default=None)
+    failed_to_load: Optional[list] = field(default=None)
 
     def __setattr__(self, key, value):
         if key == "path":
@@ -256,7 +289,6 @@ class DataLoader:
             current_file, missing_intervals, freq_str = util.reindex_datetime(
                 file,
                 report=False,
-                add_index_col=True,
             )
             reindexed_dfs[name] = current_file
             file_frequencies.append(freq_str)
@@ -315,43 +347,86 @@ class DataLoader:
         data = data.apply(pd.to_numeric, errors="coerce")
         return data
 
-    def load(self, extension="csv", **kwargs):
+    def load(self, extension="csv", verbose=True, print_errors=False, **kwargs):
         """
         Load file(s) of timeseries data from SCADA / DAS systems.
 
-
-
+        Set `path` to the path to a file to load a single file. Set `path` to the path
+        to a directory of files to load all the files in the directory ending in "csv".
+        Or, set `files_to_load` to a list of specific files to load.
 
-
-
-
-
+        Multiple files will be joined together and may include files with different
+        column headings. When multiple files with matching column headings are loaded,
+        the individual files will be reindexed and then joined.
+
+        Missing time intervals within the individual files will be filled,
         but missing time intervals between the individual files will not be filled.
 
+        When loading multiple files they will be stored in `loaded_files`, a dictionary,
+        mapping the file names to a dataframe for each file.
+
         Parameters
         ----------
         extension : str, default "csv"
             Change the extension to allow loading different filetypes. Must also set
             the `file_reader` attribute to a function that will read that type of file.
+            Do not include a period ".".
+        verbose : bool, default True
+            By default prints path of each file attempted to load and then confirmation
+            it was loaded or states it failed to load. Is only relevant if `path` is
+            set to a directory not a file. Set to False to not print out any file
+            loading status.
+        print_errors : bool, default False
+            Set to true to print error if file fails to load.
+        **kwargs
+            Are passed through to the file_reader callable, which by default will pass
+            them on to pandas.read_csv.
+
+        Returns
+        -------
+        None
+            Resulting DataFrame of data is stored to the `data` attribute.
         """
         if self.path.is_file():
-            self.data = self.file_reader(self.path)
+            self.data = self.file_reader(self.path, **kwargs)
         elif self.path.is_dir():
-            if self.files_to_load is
-            self.loaded_files = {
-                file.stem: self.file_reader(file) for file in self.files_to_load
-            }
-            else:
+            if self.files_to_load is None:
                 self.set_files_to_load(extension=extension)
-
-
-
-
-
-
-
-
-
+            self.loaded_files = dict()
+            failed_to_load_count = 0
+            for file in self.files_to_load:
+                try:
+                    if verbose:
+                        print('trying to load {}'.format(file))
+                    self.loaded_files[file.stem] = self.file_reader(file, **kwargs)
+                    if verbose:
+                        print(' loaded {}'.format(file))
+                except Exception as err:
+                    if self.failed_to_load is None:
+                        self.failed_to_load = []
+                    self.failed_to_load.append(file)
+                    print(' **FAILED to load {}'.format(file))
+                    print(
+                        ' To review full stack traceback run \n'
+                        ' meas.data_loader.file_reader(meas.data_loader'
+                        '.failed_to_load[{}])'.format(failed_to_load_count)
+                    )
+                    if print_errors:
+                        print(err)
+                    failed_to_load_count += 1
+                    continue
+            if len(self.loaded_files) == 0:
+                warnings.warn(
+                    "No files were loaded. Check that file_reader is working")
+            elif len(self.loaded_files) > 1:
+                (
+                    self.loaded_files,
+                    self.common_freq,
+                    self.file_frequencies,
+                ) = self._reindex_loaded_files()
+                data = self._join_files()
+            elif len(self.loaded_files) == 1:
+                data = list(self.loaded_files.values())[0]
             data.index.name = "Timestamp"
             self.data = data
         else:
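A hedged sketch of the reworked loader from the caller's side (the directory name is invented); a file that raises inside `file_reader` is now recorded and skipped rather than aborting the whole load:

```python
from captest import io

dl = io.DataLoader(path="./data/das_files/")  # hypothetical directory of csv files
dl.load(verbose=True, print_errors=True)  # prints per-file status and any errors
dl.data  # joined DataFrame of the files that loaded
dl.failed_to_load  # list of paths that raised in file_reader, or None
```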
@@ -380,12 +455,13 @@ def load_data(
     reindex=True,
     site=None,
     column_groups_template=False,
+    verbose=False,
     **kwargs,
 ):
     """
     Load file(s) of timeseries data from SCADA / DAS systems.
 
-    This is a
+    This is a convenience function to generate an instance of DataLoader
     and call the `load` method.
 
     A single file or multiple files can be loaded. Multiple files will be joined together
@@ -397,7 +473,7 @@ def load_data(
         Path to either a single file to load or a directory of files to load.
     group_columns : function or str, default columngroups.group_columns
         Function to use to group the columns of the loaded data. Function should accept
-        a DataFrame and return a dictionary with keys that are ids and
+        a DataFrame and return a dictionary with keys that are ids and values that are
         lists of column names. Will be set to the `group_columns` attribute of the
         CapData.DataLoader object.
         Provide a string to load column grouping from a json, yaml, or excel file. The
@@ -414,32 +490,34 @@ def load_data(
     sort : bool, default True
         By default sorts the data by the datetime index from old to new.
     drop_duplicates : bool, default True
-        By default drops rows of the joined data where all the columns are
+        By default drops rows of the joined data where all the columns are duplicates
         of another row. Keeps the first instance of the duplicated values. This is
-        helpful if individual
+        helpful if individual data files have overlapping rows with the same data.
     reindex : bool, default True
         By default will create a new index for the data using the earliest datetime,
         latest datetime, and the most frequent time interval ensuring there are no
         missing intervals.
-    site : dict, default None
-        Pass a dictionary containing site data, which
-        modeled clear sky ghi and poa values. The clear sky
-        added to the data and the column_groups attribute is
-        two irradiance columns. The site data dictionary should
-        {sys: {system data}, loc: {location data}}. See the capdata.csky
-        for the format of the system data and location data.
+    site : dict or str, default None
+        Pass a dictionary or path to a json or yaml file containing site data, which
+        will be used to generate modeled clear sky ghi and poa values. The clear sky
+        irradiance values are added to the data and the column_groups attribute is
+        updated to include these two irradiance columns. The site data dictionary should
+        be {sys: {system data}, loc: {location data}}. See the capdata.csky
+        documentation for the format of the system data and location data.
     column_groups_template : bool, default False
         If True, will call `CapData.data_columns_to_excel` to save a file to use to
         manually create column groupings at `path`.
+    verbose : bool, default False
+        Set to True to print status of file loading.
     **kwargs
-        Passed to `DataLoader.load
-
+        Passed to `DataLoader.load`, which passes them to the `file_reader` function.
+        The default `file_reader` function passes them to pandas.read_csv.
     """
     dl = DataLoader(
         path=path,
         file_reader=file_reader,
     )
-    dl.load(**kwargs)
+    dl.load(verbose=verbose, **kwargs)
 
     if sort:
         dl.sort_data()
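A hedged sketch of the broadened `site` argument. The outer `{sys: ..., loc: ...}` shape comes from the docstring above, but the inner keys and values here are invented; see the capdata.csky documentation for the real format:

```python
from captest import io

site = {
    "loc": {"latitude": 30.3, "longitude": -97.7},  # illustrative keys only
    "sys": {"surface_tilt": 20, "surface_azimuth": 180},
}
cd = io.load_data("./data/das_files/", site=site, verbose=True)
# or point at a file; .json and .yaml/.yml are both recognized:
cd = io.load_data("./data/das_files/", site="./site.yaml")
```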
@@ -454,7 +532,7 @@ def load_data(
     cd.data_loader = dl
     # group columns
     if callable(group_columns):
-        cd.column_groups = group_columns(cd.data)
+        cd.column_groups = cg.ColumnGroups(group_columns(cd.data))
     elif isinstance(group_columns, str):
         p = Path(group_columns)
         if p.suffix == ".json":
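Wrapping the result of a callable `group_columns` in `cg.ColumnGroups` gives custom groupers the same object type as the built-in path. A hypothetical custom grouper:

```python
from captest import io

def my_group_columns(df):
    # invented rule: bucket columns by a substring match
    return {"irr-poa": [c for c in df.columns if "poa" in c.lower()]}

cd = io.load_data("./data/das_files/", group_columns=my_group_columns)
```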
@@ -464,12 +542,20 @@ def load_data(
     elif (p.suffix == '.xlsx') or (p.suffix == '.xls'):
         cd.column_groups = cg.ColumnGroups(load_excel_column_groups(group_columns))
     if site is not None:
-
-
-
-
+        if isinstance(site, str):
+            path_to_site = Path(site)
+            if path_to_site.is_file():
+                if path_to_site.suffix == ".json":
+                    site = util.read_json(site)
+                if (path_to_site.suffix == ".yaml") or (path_to_site.suffix == ".yml"):
+                    site = util.read_yaml(site)
+        cd.site = copy.deepcopy(site)
+        if isinstance(site, dict):
+            cd.data = csky(cd.data, loc=site['loc'], sys=site['sys'])
+            cd.data_filtered = cd.data.copy()
+            cd.column_groups['irr-poa-clear_sky'] = ['poa_mod_csky']
+            cd.column_groups['irr-ghi-clear_sky'] = ['ghi_mod_csky']
     cd.trans_keys = list(cd.column_groups.keys())
-    cd.set_plot_attributes()
     if column_groups_template:
         cd.data_columns_to_excel()
     return cd