sinapsis-data-readers 0.1.12__tar.gz → 0.1.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. {sinapsis_data_readers-0.1.12/src/sinapsis_data_readers.egg-info → sinapsis_data_readers-0.1.14}/PKG-INFO +1 -1
  2. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/pyproject.toml +1 -1
  3. sinapsis_data_readers-0.1.14/src/sinapsis_data_readers/helpers/csv_reader.py +22 -0
  4. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/helpers/sklearn_dataset_subset.py +13 -1
  5. sinapsis_data_readers-0.1.14/src/sinapsis_data_readers/helpers/sktime_datasets_subset.py +24 -0
  6. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/__init__.py +1 -1
  7. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/audio_readers/audio_reader_pydub.py +10 -2
  8. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/audio_readers/audio_reader_soundfile.py +10 -2
  9. sinapsis_data_readers-0.1.14/src/sinapsis_data_readers/templates/datasets_readers/csv_datasets.py +26 -0
  10. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/datasets_readers/dataset_splitter.py +11 -5
  11. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/datasets_readers/sklearn_datasets.py +66 -12
  12. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/datasets_readers/sktime_datasets.py +71 -12
  13. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/image_readers/coco_dataset_reader.py +2 -2
  14. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/image_readers/csv_dataset_reader.py +1 -21
  15. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/video_readers/video_reader_dali.py +7 -0
  16. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/video_readers/video_reader_ffmpeg.py +0 -1
  17. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/video_readers/video_reader_torchcodec.py +7 -1
  18. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14/src/sinapsis_data_readers.egg-info}/PKG-INFO +1 -1
  19. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers.egg-info/SOURCES.txt +3 -0
  20. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/LICENSE +0 -0
  21. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/README.md +0 -0
  22. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/setup.cfg +0 -0
  23. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/__init__.py +0 -0
  24. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/helpers/__init__.py +0 -0
  25. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/helpers/coco_dataclasses.py +0 -0
  26. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/helpers/file_path_helpers.py +0 -0
  27. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/helpers/image_color_space_converter.py +0 -0
  28. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/helpers/tags.py +0 -0
  29. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/helpers/text_input_helpers.py +0 -0
  30. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/audio_readers/__init__.py +0 -0
  31. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/audio_readers/audio_reader_to_bytes.py +0 -0
  32. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/audio_readers/base_audio_reader.py +0 -0
  33. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/base_file_data_loader.py +0 -0
  34. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/datasets_readers/__init__.py +0 -0
  35. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/image_readers/__init__.py +0 -0
  36. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/image_readers/base_image_folder_data_loader.py +0 -0
  37. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/image_readers/image_folder_reader_cv2.py +0 -0
  38. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/image_readers/image_folder_reader_kornia.py +0 -0
  39. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/text_readers/__init__.py +0 -0
  40. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/text_readers/text_input.py +0 -0
  41. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/video_readers/__init__.py +0 -0
  42. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/video_readers/base_video_reader.py +0 -0
  43. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/video_readers/video_reader_cv2.py +0 -0
  44. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers.egg-info/dependency_links.txt +0 -0
  45. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers.egg-info/requires.txt +0 -0
  46. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers.egg-info/top_level.txt +0 -0
  47. {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/test_gradio_client.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sinapsis-data-readers
3
- Version: 0.1.12
3
+ Version: 0.1.14
4
4
  Summary: Templates to read data in different formats
5
5
  Author-email: SinapsisAI <dev@sinapsis.tech>
6
6
  Project-URL: Homepage, https://sinapsis.tech
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "sinapsis-data-readers"
3
- version = "0.1.12"
3
+ version = "0.1.14"
4
4
  description = "Templates to read data in different formats"
5
5
  authors = [{ name = "SinapsisAI", email = "dev@sinapsis.tech" }]
6
6
 
@@ -0,0 +1,22 @@
1
+ # -*- coding: utf-8 -*-
2
+ import pandas as pd
3
+
4
+
5
+ def read_file(file: str) -> pd.DataFrame:
6
+ """
7
+ Reads a CSV file and returns its contents as a pandas DataFrame.
8
+
9
+ Args:
10
+ file (str): The path to the CSV file to be read.
11
+
12
+ Returns:
13
+ pd.DataFrame: The data from the CSV file as a pandas DataFrame.
14
+
15
+ Raises:
16
+ ValueError: If the file does not have a .csv extension.
17
+ """
18
+ if not file.endswith("csv"):
19
+ raise ValueError("The file must have a .csv extension.")
20
+
21
+ data = pd.read_csv(file, header=0)
22
+ return data
@@ -6,10 +6,22 @@ from sklearn import datasets
6
6
  _sklearn_supported_loaders = {
7
7
  name: getattr(datasets, name) for name in dir(datasets) if name.startswith(("load", "fetch"))
8
8
  }
9
+ excluded_loaders = [
10
+ "fetch_lfw_pairs",
11
+ "fetch_20newsgroups",
12
+ "fetch_20newgroups_vectorized",
13
+ "load_sample_images",
14
+ "load_sample_image",
15
+ "load_svmlight_file",
16
+ "load_svmlight_files",
17
+ "fetch_rcv1",
18
+ "fetch_species_distribution",
19
+ "fetch_file",
20
+ ]
9
21
 
10
22
 
11
23
  def __getattr__(name: str) -> Callable:
12
- if name in _sklearn_supported_loaders:
24
+ if name in _sklearn_supported_loaders and name not in excluded_loaders:
13
25
  return _sklearn_supported_loaders[name]
14
26
  raise AttributeError(f"Function `{name}` not found in sklearn.datasets.")
15
27
 
@@ -0,0 +1,24 @@
1
+ # -*- coding: utf-8 -*-
2
+ """ Excluded sktime loaders"""
3
+ from typing import Callable
4
+
5
+ from sktime import datasets
6
+
7
+ class_datasets = [
8
+ "Airline",
9
+ "Longley",
10
+ "Lynx",
11
+ "Macroeconomic",
12
+ "ShampooSales",
13
+ "Solar",
14
+ "USChange"
15
+ ]
16
+
17
+
18
+ def __getattr__(name: str) -> Callable:
19
+ if name in class_datasets:
20
+ return getattr(datasets, name)
21
+ raise AttributeError(f"Class `{name}` not found in sktime.datasets.")
22
+
23
+
24
+ __all__ = class_datasets
@@ -10,6 +10,7 @@ _template_lookup = {
10
10
  "AudioReaderPydub": f"{_root_lib_path}.audio_readers.audio_reader_pydub",
11
11
  "AudioReaderSoundfile": f"{_root_lib_path}.audio_readers.audio_reader_soundfile",
12
12
  "AudioReaderToBytes": f"{_root_lib_path}.audio_readers.audio_reader_to_bytes",
13
+ "CSVDatasetReader": f"{_root_lib_path}.datasets_readers.csv_datasets",
13
14
  "CSVImageDataset": f"{_root_lib_path}.image_readers.csv_dataset_reader",
14
15
  "CocoDetectionDatasetCV2": f"{_root_lib_path}.image_readers.coco_dataset_reader",
15
16
  "CocoKeypointsDatasetCV2": f"{_root_lib_path}.image_readers.coco_dataset_reader",
@@ -19,7 +20,6 @@ _template_lookup = {
19
20
  "ExecuteNTimesLazyAudioReaderPydub": f"{_root_lib_path}.audio_readers.audio_reader_pydub",
20
21
  "ExecuteNTimesLazyAudioReaderSoundfile": f"{_root_lib_path}.audio_readers.audio_reader_soundfile",
21
22
  "FolderImageDatasetCV2": f"{_root_lib_path}.image_readers.image_folder_reader_cv2",
22
- "FolderImageDatasetKornia": f"{_root_lib_path}.image_readers.image_folder_reader_kornia",
23
23
  "ImageDatasetSplitter": f"{_root_lib_path}.datasets_readers.dataset_splitter",
24
24
  "LazyAudioReaderPydub": f"{_root_lib_path}.audio_readers.audio_reader_pydub",
25
25
  "LazyAudioReaderSoundfile": f"{_root_lib_path}.audio_readers.audio_reader_soundfile",
@@ -134,6 +134,10 @@ class LazyAudioReaderPydub(AudioReaderPydub):
134
134
  from_bytes: False
135
135
  """
136
136
 
137
+ class AttributesBaseModel(AudioReaderPydub.AttributesBaseModel):
138
+ generic_key: str
139
+ audio_file_path: str | None = None # type:ignore[assignment]
140
+
137
141
  def get_file_path_from_generic_data(self, container: DataContainer) -> None:
138
142
  """Method to retrieve the file path from the genetic data field of DataContainer.
139
143
  The method extracts the file path from the generic field and sets as attribute
@@ -141,8 +145,12 @@ class LazyAudioReaderPydub(AudioReaderPydub):
141
145
  Args:
142
146
  container (DataContainer): The DataContainer to extract the file path from
143
147
  """
144
- file_path = container.generic_data.get("audio_path", "")
145
- self.attributes.audio_file_path = file_path
148
+ if self.attributes.generic_key:
149
+ file_path = self._get_generic_data(container, self.attributes.generic_key)
150
+ if file_path:
151
+ self.attributes.audio_file_path = file_path
152
+ else:
153
+ self.logger.warning("No audio path in the existing container")
146
154
 
147
155
  def execute(self, container: DataContainer) -> DataContainer:
148
156
  self.get_file_path_from_generic_data(container)
@@ -110,6 +110,10 @@ class LazyAudioReaderSoundfile(AudioReaderSoundfile):
110
110
  from_bytes: true
111
111
  """
112
112
 
113
+ class AttributesBaseModel(_AudioBaseReader.AttributesBaseModel):
114
+ generic_key: str
115
+ audio_file_path: str | None = None # type:ignore[assignment]
116
+
113
117
  def get_file_path_from_generic_data(self, container: DataContainer) -> None:
114
118
  """Method to retrieve the file path from the genetic data field of DataContainer.
115
119
  The method extracts the file path from the generic field and sets as attribute
@@ -117,8 +121,12 @@ class LazyAudioReaderSoundfile(AudioReaderSoundfile):
117
121
  Args:
118
122
  container (DataContainer): The DataContainer to extract the file path from
119
123
  """
120
- file_path = container.generic_data["audio_path"]
121
- self.attributes.audio_file_path = file_path
124
+ if self.attributes.generic_key:
125
+ file_path = self._get_generic_data(container, self.attributes.generic_key)
126
+ if file_path:
127
+ self.attributes.audio_file_path = file_path
128
+ else:
129
+ self.logger.warning("No audio path in the existing container")
122
130
 
123
131
  def execute(self, container: DataContainer) -> DataContainer:
124
132
  self.get_file_path_from_generic_data(container)
@@ -0,0 +1,26 @@
1
+ # -*- coding: utf-8 -*-
2
+ from sinapsis_core.data_containers.data_packet import DataContainer, TextPacket, TimeSeriesPacket
3
+ from sinapsis_core.template_base.base_models import TemplateAttributes, TemplateAttributeType
4
+ from sinapsis_core.template_base.template import Template
5
+
6
+ from sinapsis_data_readers.helpers.csv_reader import read_file
7
+
8
+
9
+ class CSVDatasetReader(Template):
10
+ class AttributesBaseModel(TemplateAttributes):
11
+ path_to_csv: str
12
+ store_as_time_series: bool = False
13
+ store_as_text_packet: bool = True
14
+
15
+ def __init__(self, attributes: TemplateAttributeType) -> None:
16
+ super().__init__(attributes)
17
+ self.csv_file = read_file(self.attributes.path_to_csv)
18
+
19
+ def execute(self, container: DataContainer) -> DataContainer:
20
+ if self.attributes.store_as_time_series:
21
+ packet = TimeSeriesPacket(content=self.csv_file)
22
+ container.time_series.append(packet)
23
+ if self.attributes.store_as_text_packet:
24
+ packet = TextPacket(content=self.csv_file)
25
+ container.texts.append(packet)
26
+ return container
@@ -11,7 +11,7 @@ from sinapsis_core.template_base.base_models import TemplateAttributes
11
11
  from sklearn.model_selection import train_test_split
12
12
 
13
13
  ArrayDataFrameType = Union[list[np.ndarray], pd.DataFrame]
14
- StringDataFrameType = Union[list[str], pd.DataFrame]
14
+ StringDataFrameType = Union[list[str | int], pd.DataFrame]
15
15
  OptionalArrayDataFrameType = Union[ArrayDataFrameType, None]
16
16
 
17
17
  OptionalStringDataFrameType = Union[StringDataFrameType, None]
@@ -30,9 +30,9 @@ class ImageDatasetSplit(BaseModel):
30
30
  """
31
31
 
32
32
  x_train: list[np.ndarray] = []
33
- y_train: list[str] = []
33
+ y_train: list[str | int] = []
34
34
  x_test: list[np.ndarray] | None = None
35
- y_test: list[str] | None = None
35
+ y_test: list[str | int] | None = None
36
36
 
37
37
  class Config:
38
38
  """allow arbitrary types"""
@@ -93,7 +93,11 @@ class DatasetSplitterBase(Template):
93
93
  x_train, x_test, y_train, y_test = x_data, None, y_data, None
94
94
  if self.attributes.train_size:
95
95
  x_train, x_test, y_train, y_test = train_test_split(
96
- x_data, y_data, train_size=self.attributes.train_size, random_state=0
96
+ x_data,
97
+ y_data,
98
+ train_size=self.attributes.train_size,
99
+ test_size=1 - self.attributes.train_size,
100
+ random_state=0,
97
101
  )
98
102
  split_dataset = self.return_data_splitter_object(x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)
99
103
  return split_dataset
@@ -126,7 +130,9 @@ class DatasetSplitterBase(Template):
126
130
  if not packet:
127
131
  self.logger.debug("No data to be processed by dataset splitter")
128
132
  return container
129
-
133
+ if len(packet) == 1:
134
+ self.logger.debug("Not enough entries to divide dataset, returning original container")
135
+ return container
130
136
  x_data, y_data = self.extract_x_y_from_packet(packet)
131
137
 
132
138
  custom_dataset = self.store_data_in_data_splitter(x_data, y_data)
@@ -1,5 +1,6 @@
1
1
  # -*- coding: utf-8 -*-
2
2
 
3
+ import numpy as np
3
4
  import pandas as pd
4
5
  from sinapsis_core.data_containers.data_packet import DataContainer, TimeSeriesPacket
5
6
  from sinapsis_core.template_base import Template
@@ -14,6 +15,7 @@ from sinapsis_core.template_base.multi_execute_template import (
14
15
  )
15
16
  from sinapsis_core.utils.env_var_keys import SINAPSIS_BUILD_DOCS
16
17
  from sklearn.model_selection import train_test_split
18
+ from sklearn.utils import Bunch
17
19
 
18
20
  from sinapsis_data_readers.helpers import sklearn_dataset_subset
19
21
  from sinapsis_data_readers.helpers.tags import Tags
@@ -65,7 +67,7 @@ class SKLearnDatasets(BaseDynamicWrapperTemplate):
65
67
  """
66
68
 
67
69
  split_dataset: bool = True
68
- train_size: float = 1
70
+ train_size: float = 0.9
69
71
  store_as_time_series: bool = False
70
72
 
71
73
  def __init__(self, attributes: TemplateAttributeType) -> None:
@@ -73,7 +75,23 @@ class SKLearnDatasets(BaseDynamicWrapperTemplate):
73
75
  self.dataset_attributes = getattr(self.attributes, self.wrapped_callable.__name__)
74
76
 
75
77
  @staticmethod
76
- def parse_results(results: pd.DataFrame) -> pd.DataFrame:
78
+ def process_bunch(bunch: Bunch) -> tuple:
79
+ data = bunch.get("data")
80
+ original_target = bunch.get("target")
81
+
82
+ target = np.asarray(original_target)
83
+ target = target.reshape(-1, 1) if target.ndim == 1 else target
84
+ feature_column = bunch.get("feature_names", None)
85
+ target_column = bunch.get("target_names", None)
86
+ if target.shape[1] == 1:
87
+ target_column = ["target"]
88
+ elif target_column is not None and len(target_column) == target.shape[1]:
89
+ target_column = list(target_column)
90
+ else:
91
+ target_column = [f"target_{i}" for i in range(target.shape[1])]
92
+ return data, target, feature_column, target_column
93
+
94
+ def parse_results(self, results: pd.DataFrame) -> tuple[pd.DataFrame, list, list, int]:
77
95
  """Parses the dataset as a pandas dataframe with the feature names as columns
78
96
 
79
97
  Args:
@@ -84,17 +102,51 @@ class SKLearnDatasets(BaseDynamicWrapperTemplate):
84
102
  the additional column for target values
85
103
 
86
104
  """
87
-
88
- data_frame = pd.DataFrame(data=results.data, columns=results.feature_names)
89
- data_frame[TARGET] = results.target
90
- return data_frame
105
+ if isinstance(results, tuple):
106
+ data = results[0]
107
+ target = results[1]
108
+ feature_column = None
109
+ target_column = None
110
+ elif isinstance(results, Bunch):
111
+ data, target, feature_column, target_column = self.process_bunch(results)
112
+ else:
113
+ try:
114
+ data = results.data
115
+
116
+ except (KeyError, AttributeError, ValueError):
117
+ data = None
118
+ try:
119
+ target = results.target
120
+ except (KeyError, AttributeError, ValueError):
121
+ target = None
122
+ try:
123
+ feature_column = results.feature_names
124
+ target_column = results.target_names
125
+ except AttributeError:
126
+ feature_column = None
127
+ target_column = None
128
+ _, n_features = data.shape
129
+
130
+ feature_data_frame = pd.DataFrame(data=data, columns=feature_column)
131
+ target_data_frame = pd.DataFrame(data=target, columns=target_column)
132
+ data_frame = pd.concat([feature_data_frame, target_data_frame], axis=1)
133
+ return data_frame, feature_column, target_column, n_features
91
134
 
92
135
  @staticmethod
93
- def split_dataset(results: pd.DataFrame, split_size: float) -> TabularDatasetSplit:
136
+ def split_dataset(
137
+ results: pd.DataFrame, feature_name_cols: list, target_name_cols: list, n_features: int, split_size: float
138
+ ) -> TabularDatasetSplit:
94
139
  """Method to split the dataset into training and testing samples"""
95
- x_vals = results.drop(columns=[TARGET], axis=1)
96
- y_vals = results[TARGET]
97
- x_train, x_test, y_train, y_test = train_test_split(x_vals, y_vals, train_size=split_size, random_state=0)
140
+ if feature_name_cols:
141
+ X = results[feature_name_cols]
142
+ y = results[target_name_cols]
143
+ else:
144
+ X = results.iloc[:, :n_features]
145
+ y = results.iloc[:, n_features:]
146
+
147
+ # x_vals = results.drop(columns=[TARGET], axis=1)
148
+ # y_vals = results[TARGET]
149
+ x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=split_size, random_state=0)
98
150
  split_data = TabularDatasetSplit(
99
151
  x_train=pd.DataFrame(x_train),
100
152
  x_test=pd.DataFrame(x_test),
@@ -106,13 +158,15 @@ class SKLearnDatasets(BaseDynamicWrapperTemplate):
106
158
 
107
159
  def execute(self, container: DataContainer) -> DataContainer:
108
160
  sklearn_dataset = self.wrapped_callable.__func__(**self.dataset_attributes.model_dump())
109
- dataset = self.parse_results(sklearn_dataset)
161
+ dataset, feature_columns, target_columns, n_features = self.parse_results(sklearn_dataset)
110
162
  if self.attributes.store_as_time_series:
111
163
  time_series_packet = TimeSeriesPacket(content=dataset)
112
164
  container.time_series.append(time_series_packet)
113
165
 
114
166
  if self.attributes.split_dataset:
115
- split_dataset = self.split_dataset(dataset, split_size=self.attributes.train_size)
167
+ split_dataset = self.split_dataset(
168
+ dataset, feature_columns, target_columns, n_features, split_size=self.attributes.train_size
169
+ )
116
170
  self._set_generic_data(container, split_dataset)
117
171
  if sklearn_dataset and not self.attributes.split_dataset:
118
172
  self._set_generic_data(container, dataset)
@@ -1,4 +1,5 @@
1
1
  # -*- coding: utf-8 -*-
2
+ from types import NoneType
2
3
  from typing import Any
3
4
 
4
5
  import pandas as pd
@@ -18,12 +19,24 @@ from sklearn.model_selection import train_test_split
18
19
  from sktime import datasets
19
20
  from sktime.split import temporal_train_test_split
20
21
 
22
+ from sinapsis_data_readers.helpers import sktime_datasets_subset
23
+ from sinapsis_data_readers.helpers.sktime_datasets_subset import class_datasets
21
24
  from sinapsis_data_readers.helpers.tags import Tags
22
25
  from sinapsis_data_readers.templates.datasets_readers.dataset_splitter import (
23
26
  TabularDatasetSplit,
24
27
  )
25
28
 
26
- EXCLUDE_MODULES = ["load_forecastingdata", "DATASET_NAMES_FPP3"]
29
+ EXCLUDE_MODULES = ["load_forecastingdata", "DATASET_NAMES_FPP3", "BaseDataset",
30
+ "load_gun_point_segmentation", "load_electric_devices_segments",
31
+ "write_dataframe_to_tsfile",
32
+ "write_ndarray_to_tsfile",
33
+ "write_results_to_uea_format",
34
+ "write_tabular_transformation_to_arff",
35
+ "write_panel_to_tsfileWrapper",
36
+ "_load_fpp3",
37
+ "load_hierarchical_sales_toydata",
38
+ "load_unitest_tsf"
39
+ ] + class_datasets
27
40
 
28
41
 
29
42
  class SKTimeDatasets(BaseDynamicWrapperTemplate):
@@ -77,8 +90,10 @@ class SKTimeDatasets(BaseDynamicWrapperTemplate):
77
90
 
78
91
  def __init__(self, attributes: TemplateAttributeType) -> None:
79
92
  super().__init__(attributes)
80
- self.dataset_attributes = getattr(self.attributes, self.wrapped_callable.__name__)
93
+ self.dataset_attributes = self.initialize_attributes()
81
94
 
95
+ def initialize_attributes(self):
96
+ return getattr(self.attributes, self.wrapped_callable.__name__)
82
97
  def split_time_series_dataset(self, dataset: Any) -> TabularDatasetSplit:
83
98
  """Split a time series dataset into training and testing sets
84
99
 
@@ -106,14 +121,22 @@ class SKTimeDatasets(BaseDynamicWrapperTemplate):
106
121
  Returns:
107
122
  TabularDatasetSplit: Object containing the split dataset.
108
123
  """
109
- X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=self.attributes.train_size, random_state=0)
110
- return TabularDatasetSplit(
111
- x_train=pd.DataFrame(X_train),
112
- x_test=pd.DataFrame(X_test),
113
- y_train=pd.DataFrame(y_train),
114
- y_test=pd.DataFrame(y_test),
115
- )
116
-
124
+ try:
125
+ X_train, X_test, y_train, y_test = train_test_split(
126
+ X, y, train_size=self.attributes.train_size, random_state=0
127
+ )
128
+ return TabularDatasetSplit(
129
+ x_train=pd.DataFrame(X_train),
130
+ x_test=pd.DataFrame(X_test),
131
+ y_train=pd.DataFrame(y_train),
132
+ y_test=pd.DataFrame(y_test),
133
+ )
134
+ except ValueError:
135
+ self.logger.debug("Wrong format for split. original values")
136
+ return TabularDatasetSplit(x_train=pd.DataFrame(X), y_train=pd.DataFrame(y))
137
+
138
+ def create_dataset(self):
139
+ return self.wrapped_callable.__func__(**self.dataset_attributes.model_dump())
117
140
  def execute(self, container: DataContainer) -> DataContainer:
118
141
  """Execute the SKTimeDatasets template to load and process a dataset.
119
142
 
@@ -126,7 +149,7 @@ class SKTimeDatasets(BaseDynamicWrapperTemplate):
126
149
  Returns:
127
150
  DataContainer: The container with the dataset added to it.
128
151
  """
129
- dataset = self.wrapped_callable.__func__(**self.dataset_attributes.model_dump())
152
+ dataset = self.create_dataset()
130
153
  split_dataset = dataset
131
154
  if isinstance(dataset, tuple):
132
155
  if self.attributes.split_dataset:
@@ -161,6 +184,36 @@ class ExecuteNTimesSKTimeDatasets(SKTimeDatasets):
161
184
  )
162
185
 
163
186
 
187
+ class SKTimeClassDatasets(SKTimeDatasets):
188
+ WrapperEntry = WrapperEntryConfig(
189
+ wrapped_object=sktime_datasets_subset,
190
+ signature_from_doc_string=True,
191
+ )
192
+ def initialize_attributes(self):
193
+ return None
194
+ def create_dataset(self):
195
+ dataset = self.wrapped_callable.load("X", "y")
196
+ if isinstance(dataset[0], NoneType):
197
+ return dataset[1]
198
+ elif isinstance(dataset[1], NoneType):
199
+ return dataset[0]
200
+ return dataset
201
+
202
+ @execute_template_n_times_wrapper
203
+ class ExecuteNTimesSKTimeClassDatasets(SKTimeDatasets):
204
+ """This template extends the functionality of the SKTimeDatasets template
205
+ by loading the sktime dataset n times.
206
+
207
+ This is useful for running the same dataset loading operation multiple
208
+ times with different parameters or for benchmark purposes.
209
+ """
210
+
211
+ WrapperEntry = WrapperEntryConfig(
212
+ wrapped_object=sktime_datasets_subset,
213
+ signature_from_doc_string=True,
214
+ template_name_suffix="ExecuteNTimes",
215
+ )
216
+
164
217
  def __getattr__(name: str) -> Template:
165
218
  """
166
219
  Only create a template if it's imported, this avoids creating all the base models for all templates
@@ -170,10 +223,16 @@ def __getattr__(name: str) -> Template:
170
223
  return make_dynamic_template(name, SKTimeDatasets)
171
224
  if name in ExecuteNTimesSKTimeDatasets.WrapperEntry.module_att_names:
172
225
  return make_dynamic_template(name, ExecuteNTimesSKTimeDatasets)
226
+ if name in SKTimeClassDatasets.WrapperEntry.module_att_names:
227
+ return make_dynamic_template(name, SKTimeClassDatasets)
228
+ if name in ExecuteNTimesSKTimeClassDatasets.WrapperEntry.module_att_names:
229
+ return make_dynamic_template(name, ExecuteNTimesSKTimeClassDatasets)
173
230
  raise AttributeError(f"template `{name}` not found in {__name__}")
174
231
 
175
232
 
176
- __all__ = SKTimeDatasets.WrapperEntry.module_att_names + ExecuteNTimesSKTimeDatasets.WrapperEntry.module_att_names
233
+ __all__ = (SKTimeDatasets.WrapperEntry.module_att_names + ExecuteNTimesSKTimeDatasets.WrapperEntry.module_att_names +
234
+ SKTimeClassDatasets.WrapperEntry.module_att_names +
235
+ ExecuteNTimesSKTimeClassDatasets.WrapperEntry.module_att_names)
177
236
 
178
237
 
179
238
  if SINAPSIS_BUILD_DOCS:
@@ -54,10 +54,10 @@ class CocoImageDatasetBaseCV2(FolderImageDatasetCV2):
54
54
  annotations_path: str
55
55
 
56
56
  def __init__(self, attributes: TemplateAttributeType) -> None:
57
- super().__init__(attributes)
58
- self.annotations_file = os.path.join(self.attributes.data_dir, self.attributes.annotations_path)
57
+ self.annotations_file = os.path.join(attributes.get("data_dir"), attributes.get("annotations_path"))
59
58
  self.raw_annotations_dict: list[dict[str, dict[str, Any]]] = self.read_annotations_file(self.annotations_file)
60
59
  self.annotations = self.images_annotations()
60
+ super().__init__(attributes)
61
61
 
62
62
  @staticmethod
63
63
  def read_annotations_file(file: str) -> list[dict[str, dict[str, Any]]]:
@@ -3,7 +3,6 @@
3
3
  from typing import cast
4
4
 
5
5
  import numpy as np
6
- import pandas as pd
7
6
  from sinapsis_core.data_containers.annotations import ImageAnnotations
8
7
  from sinapsis_core.data_containers.data_packet import ImagePacket
9
8
  from sinapsis_core.template_base.base_models import (
@@ -12,6 +11,7 @@ from sinapsis_core.template_base.base_models import (
12
11
  UIPropertiesMetadata,
13
12
  )
14
13
 
14
+ from sinapsis_data_readers.helpers.csv_reader import read_file
15
15
  from sinapsis_data_readers.helpers.tags import Tags
16
16
  from sinapsis_data_readers.templates.base_file_data_loader import (
17
17
  ContentNotSetException,
@@ -19,26 +19,6 @@ from sinapsis_data_readers.templates.base_file_data_loader import (
19
19
  )
20
20
 
21
21
 
22
- def read_file(file: str) -> pd.DataFrame:
23
- """
24
- Reads a CSV file and returns its contents as a pandas DataFrame.
25
-
26
- Args:
27
- file (str): The path to the CSV file to be read.
28
-
29
- Returns:
30
- pd.DataFrame: The data from the CSV file as a pandas DataFrame.
31
-
32
- Raises:
33
- ValueError: If the file does not have a .csv extension.
34
- """
35
- if not file.endswith("csv"):
36
- raise ValueError("The file must have a .csv extension.")
37
-
38
- data = pd.read_csv(file, header=0)
39
- return data
40
-
41
-
42
22
  class CSVImageDataset(_BaseDataReader):
43
23
  """
44
24
  A dataset reader for CSV-based image datasets, inheriting from _BaseDataReader.
@@ -4,6 +4,7 @@
4
4
  from typing import Literal, cast
5
5
 
6
6
  import nvidia.dali.fn as fn
7
+ import torch
7
8
  from nvidia.dali import pipeline_def
8
9
  from nvidia.dali.pipeline import DataNode, Pipeline
9
10
  from nvidia.dali.plugin.pytorch import DALIGenericIterator
@@ -149,6 +150,12 @@ class VideoReaderDali(BaseVideoReader):
149
150
  video_frames.append(self._make_image_packet(frame, frame_index=self.frame_count + idx))
150
151
  return video_frames
151
152
 
153
+ def reset_state(self, template_name: str | None = None) -> None:
154
+ _ = template_name
155
+ if self.attributes.device == "gpu":
156
+ torch.cuda.empty_cache()
157
+ super().reset_state(template_name)
158
+
152
159
 
153
160
  @multi_video_wrapper
154
161
  class MultiVideoReaderDali(VideoReaderDali):
@@ -37,7 +37,6 @@ class VideoReaderFFMPEG(BaseVideoReader):
37
37
  video_file_path: '/path/to/video/file'
38
38
  batch_size: 1
39
39
  video_source: 4d2a355f-cda4-4742-9042-8e6ee842d1cf
40
- device: cpu
41
40
  loop_forever: false
42
41
  """
43
42
 
@@ -1,4 +1,5 @@
1
1
  # -*- coding: utf-8 -*-
2
+ import torch
2
3
  from sinapsis_core.data_containers.data_packet import ImagePacket
3
4
  from torchcodec.decoders import SimpleVideoDecoder
4
5
 
@@ -84,6 +85,12 @@ class VideoReaderTorchCodec(BaseVideoReader):
84
85
  video_frames.append(self._make_image_packet(frame, frame_index=self.frame_count + idx))
85
86
  return video_frames
86
87
 
88
+ def reset_state(self, template_name: str | None = None) -> None:
89
+ _ = template_name
90
+ if self.attributes.device == "gpu":
91
+ torch.cuda.empty_cache()
92
+ super().reset_state(template_name)
93
+
87
94
 
88
95
  @multi_video_wrapper
89
96
  class MultiVideoReaderTorchCodec(VideoReaderTorchCodec):
@@ -92,5 +99,4 @@ class MultiVideoReaderTorchCodec(VideoReaderTorchCodec):
92
99
  by adding as many video_readers as needed depending on the lenght of
93
100
  video_file_path list. It appends the dataframes of each of the videos to the
94
101
  ImagePacket object in DataContainer
95
-
96
102
  """
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: sinapsis-data-readers
3
- Version: 0.1.12
3
+ Version: 0.1.14
4
4
  Summary: Templates to read data in different formats
5
5
  Author-email: SinapsisAI <dev@sinapsis.tech>
6
6
  Project-URL: Homepage, https://sinapsis.tech
@@ -10,9 +10,11 @@ src/sinapsis_data_readers.egg-info/requires.txt
10
10
  src/sinapsis_data_readers.egg-info/top_level.txt
11
11
  src/sinapsis_data_readers/helpers/__init__.py
12
12
  src/sinapsis_data_readers/helpers/coco_dataclasses.py
13
+ src/sinapsis_data_readers/helpers/csv_reader.py
13
14
  src/sinapsis_data_readers/helpers/file_path_helpers.py
14
15
  src/sinapsis_data_readers/helpers/image_color_space_converter.py
15
16
  src/sinapsis_data_readers/helpers/sklearn_dataset_subset.py
17
+ src/sinapsis_data_readers/helpers/sktime_datasets_subset.py
16
18
  src/sinapsis_data_readers/helpers/tags.py
17
19
  src/sinapsis_data_readers/helpers/text_input_helpers.py
18
20
  src/sinapsis_data_readers/templates/__init__.py
@@ -23,6 +25,7 @@ src/sinapsis_data_readers/templates/audio_readers/audio_reader_soundfile.py
23
25
  src/sinapsis_data_readers/templates/audio_readers/audio_reader_to_bytes.py
24
26
  src/sinapsis_data_readers/templates/audio_readers/base_audio_reader.py
25
27
  src/sinapsis_data_readers/templates/datasets_readers/__init__.py
28
+ src/sinapsis_data_readers/templates/datasets_readers/csv_datasets.py
26
29
  src/sinapsis_data_readers/templates/datasets_readers/dataset_splitter.py
27
30
  src/sinapsis_data_readers/templates/datasets_readers/sklearn_datasets.py
28
31
  src/sinapsis_data_readers/templates/datasets_readers/sktime_datasets.py