captest 0.11.2__py2.py3-none-any.whl → 0.13.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
captest/io.py CHANGED
@@ -1,4 +1,5 @@
  # this file is formatted with black
+ import copy
  import dateutil
  import datetime
  from pathlib import Path
@@ -19,6 +20,7 @@ from captest import util
  def flatten_multi_index(columns):
      return ["_".join(col_name) for col_name in columns.to_list()]

+
  def load_excel_column_groups(path):
      """
      Load column groups from an excel file.
@@ -45,9 +47,10 @@ def load_excel_column_groups(path):
      dict
          Dictionary mapping column group names to lists of column names.
      """
-     df = pd.read_excel(path, header=None).fillna(method="ffill")
+     df = pd.read_excel(path, header=None).ffill(axis='index')
      return df.groupby(0)[1].apply(list).to_dict()

+
  def load_pvsyst(
      path,
      name="pvsyst",
@@ -58,6 +61,9 @@ def load_pvsyst(
      """
      Load data from a PVsyst energy production model.

+     Will load day first or month first dates. Expects files that use a comma as a
+     separator rather than a semicolon.
+
      Parameters
      ----------
      path : str
@@ -70,7 +76,8 @@ def load_pvsyst(
          By default sets power to E_Grid, poa to GlobInc, t_amb to T Amb, and w_vel to
          WindVel. Set to False to not set regression columns on load.
      **kwargs
-         Use to pass additional kwargs to pandas read_csv.
+         Use to pass additional kwargs to pandas read_csv. Pass sep=';' to load files
+         that use semicolons instead of commas as the separator.

      Returns
      -------
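
Note: per the updated docstring, the separator can now be forwarded to pandas read_csv. A hedged usage sketch (the file path is hypothetical):

    from captest.io import load_pvsyst

    pvsyst = load_pvsyst("./data/pvsyst_hourly.csv")           # comma-separated
    pvsyst = load_pvsyst("./data/pvsyst_hourly.csv", sep=";")  # semicolon-separated
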
@@ -107,8 +114,33 @@ def load_pvsyst(
              break

      pvraw.columns = pvraw.columns.droplevel(1)
-     dates = pvraw.loc[:, "date"]
      try:
+         dates = pvraw.loc[:, "date"]
+     except KeyError:
+         warnings.warn(
+             "No 'date' column found in the PVsyst data. This may be due to "
+             "the separator being a semicolon ';' rather than a comma ','. "
+             "If this is the case, try passing sep=';' when calling load_pvsyst. "
+             "Otherwise the date column may actually be missing. Exception:"
+         )
+         raise
+     # PVsyst creates dates like '01/01/90 00:00' i.e. January 1st, 1990.
+     # Opening the PVsyst output in excel will likely result in the dates modified to
+     # 1/1/1990 0:00. The strftime format specified won't load the excel modified dates
+     # so these are caught by checking for consistent length and reformatted
+     if not all(dates.str.len() == 14):
+         date_parts = dates.str.split(' ').str[0].str.split('/')
+         time_parts = dates.str.split(' ').str[1].str.split(':')
+         dates = (
+             date_parts.str[0].str.zfill(2) + '/' +
+             date_parts.str[1].str.zfill(2) + '/' +
+             '90 ' +
+             time_parts.str[0].str.zfill(2) + ':' +
+             time_parts.str[1]
+         )
+     try:
+         # mm/dd/yy hh:mm, lower case y gives
+         # Year without century as a zero-padded decimal number. e.g. 00, 01, …, 99
          dt_index = pd.to_datetime(dates, format="%m/%d/%y %H:%M")
      except ValueError:
          warnings.warn(
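
Note: the block added above pads Excel-shortened timestamps back to the fixed 14-character '01/01/90 00:00' form before parsing; the year is hard-coded to '90' because PVsyst output uses 1990. The same string handling in isolation (example timestamps are made up):

    import pandas as pd

    dates = pd.Series(["1/1/1990 0:00", "01/01/90 00:00"])  # Excel-modified and original
    if not all(dates.str.len() == 14):
        date_parts = dates.str.split(" ").str[0].str.split("/")
        time_parts = dates.str.split(" ").str[1].str.split(":")
        dates = (
            date_parts.str[0].str.zfill(2) + "/"
            + date_parts.str[1].str.zfill(2) + "/"
            + "90 "
            + time_parts.str[0].str.zfill(2) + ":"
            + time_parts.str[1]
        )
    print(pd.to_datetime(dates, format="%m/%d/%y %H:%M"))  # both parse to 1990-01-01
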
@@ -131,7 +163,6 @@ def load_pvsyst(
          cd.data["E_Grid"] = cd.data["E_Grid"] / egrid_unit_adj_factor
      cd.data_filtered = cd.data.copy()
      cd.column_groups = cg.group_columns(cd.data)
-     cd.trans_keys = list(cd.column_groups.keys())
      if set_regression_columns:
          cd.set_regression_cols(
              power="E_Grid", poa="GlobInc", t_amb="T_Amb", w_vel="WindVel"
@@ -139,15 +170,13 @@ def load_pvsyst(
      return cd


-
-
  def file_reader(path, **kwargs):
      """
      Read measured solar data from a csv file.

      Utilizes pandas read_csv to import measure solar data from a csv file.
-     Attempts a few diferent encodings, trys to determine the header end
-     by looking for a date in the first column, and concantenates column
+     Attempts a few different encodings, tries to determine the header end
+     by looking for a date in the first column, and concatenates column
      headings to a single string.

      Parameters
@@ -161,16 +190,20 @@ def file_reader(path, **kwargs):
      -------
      pandas DataFrame
      """
+     default_kwargs = {
+         'index_col': 0,
+         'parse_dates': True,
+         'skip_blank_lines': True,
+         'low_memory': False,
+     }
+     for key, value in default_kwargs.items():
+         kwargs.setdefault(key, value)
      encodings = ["utf-8", "latin1", "iso-8859-1", "cp1252"]
      for encoding in encodings:
+         kwargs['encoding'] = encoding
          try:
              data_file = pd.read_csv(
                  path,
-                 encoding=encoding,
-                 index_col=0,
-                 parse_dates=True,
-                 skip_blank_lines=True,
-                 low_memory=False,
                  **kwargs,
              )
          except UnicodeDecodeError:
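
Note: moving the hard-coded read_csv arguments into setdefault calls means any caller-supplied kwarg now wins over the defaults, while encoding is still managed by the retry loop (so a caller-supplied encoding is overwritten). A sketch, with a hypothetical file path:

    from captest.io import file_reader

    # index_col=1 overrides the default of 0; na_values passes straight
    # through to pandas read_csv.
    df = file_reader("./data/meas_data.csv", index_col=1, na_values=["#N/A"])
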
@@ -178,6 +211,12 @@ def file_reader(path, **kwargs):
          else:
              break
      data_file.dropna(how="all", axis=0, inplace=True)
+     if data_file.index.equals(pd.Index(np.arange(len(data_file.index)))):
+         kwargs['index_col'] = 1
+         data_file = pd.read_csv(
+             path,
+             **kwargs,
+         )
      if not isinstance(data_file.index[0], pd.Timestamp):
          for i, _indice in enumerate(data_file.index):
              try:
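
Note: the retry added above catches files whose first column is a plain row counter. After the first read, an index equal to 0..n-1 suggests the timestamps live in the second column, so the file is re-read with index_col=1. The detection on its own:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": [10, 20, 30]})  # default RangeIndex 0, 1, 2
    print(df.index.equals(pd.Index(np.arange(len(df.index)))))  # True -> re-read
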
@@ -189,18 +228,11 @@ def file_reader(path, **kwargs):
              except ValueError:
                  continue
          header = list(np.arange(header_end))
+         kwargs.setdefault('header', header)
          data_file = pd.read_csv(
              path,
-             encoding=encoding,
-             header=header,
-             index_col=0,
-             parse_dates=True,
-             skip_blank_lines=True,
-             low_memory=False,
              **kwargs,
          )
-
-     data_file = data_file.apply(pd.to_numeric)
      if isinstance(data_file.columns, pd.MultiIndex):
          data_file.columns = flatten_multi_index(data_file.columns)
      data_file = data_file.rename(columns=(lambda x: x.strip()))
@@ -218,6 +250,7 @@ class DataLoader:
      sys: Optional[dict] = field(default=None)
      file_reader: object = file_reader
      files_to_load: Optional[list] = field(default=None)
+     failed_to_load: Optional[list] = field(default=None)

      def __setattr__(self, key, value):
          if key == "path":
@@ -256,7 +289,6 @@ class DataLoader:
              current_file, missing_intervals, freq_str = util.reindex_datetime(
                  file,
                  report=False,
-                 add_index_col=True,
              )
              reindexed_dfs[name] = current_file
              file_frequencies.append(freq_str)
@@ -315,43 +347,86 @@ class DataLoader:
          data = data.apply(pd.to_numeric, errors="coerce")
          return data

-     def load(self, extension="csv", **kwargs):
+     def load(self, extension="csv", verbose=True, print_errors=False, **kwargs):
          """
          Load file(s) of timeseries data from SCADA / DAS systems.

-         This is a convience function to generate an instance of DataLoader
-         and call the `load` method.
+         Set `path` to the path to a file to load a single file. Set `path` to the path
+         to a directory of files to load all the files in the directory ending in "csv".
+         Or, set `files_to_load` to a list of specific files to load.

-         A single file or multiple files can be loaded. Multiple files will be joined together
-         and may include files with different column headings. When multiple files with
-         matching column headings are loaded, the individual files will be reindexed and
-         then joined. Missing time intervals within the individual files will be filled,
+         Multiple files will be joined together and may include files with different
+         column headings. When multiple files with matching column headings are loaded,
+         the individual files will be reindexed and then joined.
+
+         Missing time intervals within the individual files will be filled,
          but missing time intervals between the individual files will not be filled.

+         When loading multiple files they will be stored in `loaded_files`, a dictionary,
+         mapping the file names to a dataframe for each file.
+
          Parameters
          ----------
          extension : str, default "csv"
              Change the extension to allow loading different filetypes. Must also set
              the `file_reader` attribute to a function that will read that type of file.
+             Do not include a period ".".
+         verbose : bool, default True
+             By default prints path of each file attempted to load and then confirmation
+             it was loaded or states it failed to load. Is only relevant if `path` is
+             set to a directory not a file. Set to False to not print out any file
+             loading status.
+         print_errors : bool, default False
+             Set to true to print error if file fails to load.
+         **kwargs
+             Are passed through to the file_reader callable, which by default will pass
+             them on to pandas.read_csv.
+
+         Returns
+         -------
+         None
+             Resulting DataFrame of data is stored to the `data` attribute.
          """
          if self.path.is_file():
-             self.data = self.file_reader(self.path)
+             self.data = self.file_reader(self.path, **kwargs)
          elif self.path.is_dir():
-             if self.files_to_load is not None:
-                 self.loaded_files = {
-                     file.stem: self.file_reader(file) for file in self.files_to_load
-                 }
-             else:
+             if self.files_to_load is None:
                  self.set_files_to_load(extension=extension)
-             self.loaded_files = {
-                 file.stem: self.file_reader(file, **kwargs) for file in self.files_to_load
-             }
-             (
-                 self.loaded_files,
-                 self.common_freq,
-                 self.file_frequencies,
-             ) = self._reindex_loaded_files()
-             data = self._join_files()
+             self.loaded_files = dict()
+             failed_to_load_count = 0
+             for file in self.files_to_load:
+                 try:
+                     if verbose:
+                         print('trying to load {}'.format(file))
+                     self.loaded_files[file.stem] = self.file_reader(file, **kwargs)
+                     if verbose:
+                         print(' loaded {}'.format(file))
+                 except Exception as err:
+                     if self.failed_to_load is None:
+                         self.failed_to_load = []
+                     self.failed_to_load.append(file)
+                     print(' **FAILED to load {}'.format(file))
+                     print(
+                         ' To review full stack traceback run \n'
+                         ' meas.data_loader.file_reader(meas.data_loader'
+                         '.failed_to_load[{}])'.format(failed_to_load_count)
+                     )
+                     if print_errors:
+                         print(err)
+                     failed_to_load_count += 1
+                     continue
+             if len(self.loaded_files) == 0:
+                 warnings.warn(
+                     "No files were loaded. Check that file_reader is working")
+             elif len(self.loaded_files) > 1:
+                 (
+                     self.loaded_files,
+                     self.common_freq,
+                     self.file_frequencies,
+                 ) = self._reindex_loaded_files()
+                 data = self._join_files()
+             elif len(self.loaded_files) == 1:
+                 data = list(self.loaded_files.values())[0]
              data.index.name = "Timestamp"
              self.data = data
          else:
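
Note: with the rewritten loop, one unreadable file no longer aborts the whole load; failed paths are collected on failed_to_load. A hedged usage sketch (the directory path is hypothetical):

    from captest.io import DataLoader

    dl = DataLoader(path="./data/")
    dl.load(verbose=True, print_errors=True)

    # Re-run a single failed file to get the full traceback.
    if dl.failed_to_load:
        dl.file_reader(dl.failed_to_load[0])
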
@@ -380,12 +455,13 @@ def load_data(
      reindex=True,
      site=None,
      column_groups_template=False,
+     verbose=False,
      **kwargs,
  ):
      """
      Load file(s) of timeseries data from SCADA / DAS systems.

-     This is a convience function to generate an instance of DataLoader
+     This is a convenience function to generate an instance of DataLoader
      and call the `load` method.

      A single file or multiple files can be loaded. Multiple files will be joined together
@@ -397,7 +473,7 @@ def load_data(
          Path to either a single file to load or a directory of files to load.
      group_columns : function or str, default columngroups.group_columns
          Function to use to group the columns of the loaded data. Function should accept
-         a DataFrame and return a dictionary with keys that are ids and valeus that are
+         a DataFrame and return a dictionary with keys that are ids and values that are
          lists of column names. Will be set to the `group_columns` attribute of the
          CapData.DataLoader object.
          Provide a string to load column grouping from a json, yaml, or excel file. The
@@ -414,32 +490,34 @@ def load_data(
      sort : bool, default True
          By default sorts the data by the datetime index from old to new.
      drop_duplicates : bool, default True
-         By default drops rows of the joined data where all the columns are duplicats
+         By default drops rows of the joined data where all the columns are duplicates
          of another row. Keeps the first instance of the duplicated values. This is
-         helpful if individual datafiles have overlaping rows with the same data.
+         helpful if individual data files have overlapping rows with the same data.
      reindex : bool, default True
          By default will create a new index for the data using the earliest datetime,
          latest datetime, and the most frequent time interval ensuring there are no
          missing intervals.
-     site : dict, default None
-         Pass a dictionary containing site data, which will be used to generate
-         modeled clear sky ghi and poa values. The clear sky irradiance values are
-         added to the data and the column_groups attribute is updated to include these
-         two irradiance columns. The site data dictionary should be
-         {sys: {system data}, loc: {location data}}. See the capdata.csky documentation
-         for the format of the system data and location data.
+     site : dict or str, default None
+         Pass a dictionary or path to a json or yaml file containing site data, which
+         will be used to generate modeled clear sky ghi and poa values. The clear sky
+         irradiance values are added to the data and the column_groups attribute is
+         updated to include these two irradiance columns. The site data dictionary should
+         be {sys: {system data}, loc: {location data}}. See the capdata.csky
+         documentation for the format of the system data and location data.
      column_groups_template : bool, default False
          If True, will call `CapData.data_columns_to_excel` to save a file to use to
          manually create column groupings at `path`.
+     verbose : bool, default False
+         Set to True to print status of file loading.
      **kwargs
-         Passed to `DataLoader.load` Options include: sort, drop_duplicates, reindex,
-         extension. See `DataLoader` for complete documentation.
+         Passed to `DataLoader.load`, which passes them to the `file_reader` function.
+         The default `file_reader` function passes them to pandas.read_csv.
      """
      dl = DataLoader(
          path=path,
          file_reader=file_reader,
      )
-     dl.load(**kwargs)
+     dl.load(verbose=verbose, **kwargs)

      if sort:
          dl.sort_data()
@@ -454,7 +532,7 @@ def load_data(
      cd.data_loader = dl
      # group columns
      if callable(group_columns):
-         cd.column_groups = group_columns(cd.data)
+         cd.column_groups = cg.ColumnGroups(group_columns(cd.data))
      elif isinstance(group_columns, str):
          p = Path(group_columns)
          if p.suffix == ".json":
@@ -464,12 +542,20 @@ def load_data(
          elif (p.suffix == '.xlsx') or (p.suffix == '.xls'):
              cd.column_groups = cg.ColumnGroups(load_excel_column_groups(group_columns))
      if site is not None:
-         cd.data = csky(cd.data, loc=site['loc'], sys=site['sys'])
-         cd.data_filtered = cd.data.copy()
-         cd.column_groups['irr-poa-clear_sky'] = ['poa_mod_csky']
-         cd.column_groups['irr-ghi-clear_sky'] = ['ghi_mod_csky']
+         if isinstance(site, str):
+             path_to_site = Path(site)
+             if path_to_site.is_file():
+                 if path_to_site.suffix == ".json":
+                     site = util.read_json(site)
+                 if (path_to_site.suffix == ".yaml") or (path_to_site.suffix == ".yml"):
+                     site = util.read_yaml(site)
+         cd.site = copy.deepcopy(site)
+         if isinstance(site, dict):
+             cd.data = csky(cd.data, loc=site['loc'], sys=site['sys'])
+             cd.data_filtered = cd.data.copy()
+             cd.column_groups['irr-poa-clear_sky'] = ['poa_mod_csky']
+             cd.column_groups['irr-ghi-clear_sky'] = ['ghi_mod_csky']
      cd.trans_keys = list(cd.column_groups.keys())
-     cd.set_plot_attributes()
      if column_groups_template:
          cd.data_columns_to_excel()
      return cd
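
Note: taken together, load_data can now resolve site from a json or yaml file and stores a copy on the returned CapData. A sketch with hypothetical paths:

    from captest.io import load_data

    # site may be a dict or a path to a file shaped like
    # {"sys": {...}, "loc": {...}}.
    meas = load_data("./data/", site="./data/site.json", verbose=True)
    print(meas.data[["poa_mod_csky", "ghi_mod_csky"]].head())
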