sinapsis-data-readers 0.1.12__tar.gz → 0.1.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sinapsis_data_readers-0.1.12/src/sinapsis_data_readers.egg-info → sinapsis_data_readers-0.1.14}/PKG-INFO +1 -1
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/pyproject.toml +1 -1
- sinapsis_data_readers-0.1.14/src/sinapsis_data_readers/helpers/csv_reader.py +22 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/helpers/sklearn_dataset_subset.py +13 -1
- sinapsis_data_readers-0.1.14/src/sinapsis_data_readers/helpers/sktime_datasets_subset.py +24 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/__init__.py +1 -1
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/audio_readers/audio_reader_pydub.py +10 -2
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/audio_readers/audio_reader_soundfile.py +10 -2
- sinapsis_data_readers-0.1.14/src/sinapsis_data_readers/templates/datasets_readers/csv_datasets.py +26 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/datasets_readers/dataset_splitter.py +11 -5
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/datasets_readers/sklearn_datasets.py +66 -12
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/datasets_readers/sktime_datasets.py +71 -12
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/image_readers/coco_dataset_reader.py +2 -2
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/image_readers/csv_dataset_reader.py +1 -21
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/video_readers/video_reader_dali.py +7 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/video_readers/video_reader_ffmpeg.py +0 -1
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/video_readers/video_reader_torchcodec.py +7 -1
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14/src/sinapsis_data_readers.egg-info}/PKG-INFO +1 -1
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers.egg-info/SOURCES.txt +3 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/LICENSE +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/README.md +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/setup.cfg +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/__init__.py +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/helpers/__init__.py +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/helpers/coco_dataclasses.py +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/helpers/file_path_helpers.py +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/helpers/image_color_space_converter.py +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/helpers/tags.py +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/helpers/text_input_helpers.py +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/audio_readers/__init__.py +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/audio_readers/audio_reader_to_bytes.py +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/audio_readers/base_audio_reader.py +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/base_file_data_loader.py +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/datasets_readers/__init__.py +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/image_readers/__init__.py +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/image_readers/base_image_folder_data_loader.py +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/image_readers/image_folder_reader_cv2.py +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/image_readers/image_folder_reader_kornia.py +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/text_readers/__init__.py +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/text_readers/text_input.py +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/video_readers/__init__.py +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/video_readers/base_video_reader.py +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/video_readers/video_reader_cv2.py +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers.egg-info/dependency_links.txt +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers.egg-info/requires.txt +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers.egg-info/top_level.txt +0 -0
- {sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/test_gradio_client.py +0 -0
sinapsis_data_readers-0.1.14/src/sinapsis_data_readers/helpers/csv_reader.py
ADDED
@@ -0,0 +1,22 @@
+# -*- coding: utf-8 -*-
+import pandas as pd
+
+
+def read_file(file: str) -> pd.DataFrame:
+    """
+    Reads a CSV file and returns its contents as a pandas DataFrame.
+
+    Args:
+        file (str): The path to the CSV file to be read.
+
+    Returns:
+        pd.DataFrame: The data from the CSV file as a pandas DataFrame.
+
+    Raises:
+        ValueError: If the file does not have a .csv extension.
+    """
+    if not file.endswith("csv"):
+        raise ValueError("The file must have a .csv extension.")
+
+    data = pd.read_csv(file, header=0)
+    return data
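A quick usage sketch of the new helper (the sample file below is illustrative, not part of the package):

    import pandas as pd
    from sinapsis_data_readers.helpers.csv_reader import read_file

    # Write a small CSV so the call is self-contained
    pd.DataFrame({"a": [1, 2], "b": [3, 4]}).to_csv("sample.csv", index=False)
    df = read_file("sample.csv")  # DataFrame with columns "a" and "b"
    # read_file("data.txt") raises ValueError: the helper only accepts paths ending in "csv"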
{sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/helpers/sklearn_dataset_subset.py
@@ -6,10 +6,22 @@ from sklearn import datasets
 _sklearn_supported_loaders = {
     name: getattr(datasets, name) for name in dir(datasets) if name.startswith(("load", "fetch"))
 }
+excluded_loaders = [
+    "fetch_lfw_pairs",
+    "fetch_20newsgroups",
+    "fetch_20newgroups_vectorized",
+    "load_sample_images",
+    "load_sample_image",
+    "load_svmlight_file",
+    "load_svmlight_files",
+    "fetch_rcv1",
+    "fetch_species_distribution",
+    "fetch_file",
+]
 
 
 def __getattr__(name: str) -> Callable:
-    if name in _sklearn_supported_loaders:
+    if name in _sklearn_supported_loaders and name not in excluded_loaders:
        return _sklearn_supported_loaders[name]
    raise AttributeError(f"Function `{name}` not found in sklearn.datasets.")
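Because the subset module resolves loaders through a module-level __getattr__ (PEP 562), excluded names now simply fail to resolve. A small sketch of the resulting behavior, assuming scikit-learn is installed:

    from sinapsis_data_readers.helpers import sklearn_dataset_subset

    load_iris = sklearn_dataset_subset.load_iris  # allowed: resolved via the module __getattr__
    bunch = load_iris()                           # a regular sklearn.utils.Bunch
    # sklearn_dataset_subset.fetch_rcv1           # excluded: raises AttributeError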
sinapsis_data_readers-0.1.14/src/sinapsis_data_readers/helpers/sktime_datasets_subset.py
ADDED
@@ -0,0 +1,24 @@
+# -*- coding: utf-8 -*-
+""" Excluded sktime loaders"""
+from typing import Callable
+
+from sktime import datasets
+
+class_datasets = [
+    "Airline",
+    "Longley",
+    "Lynx",
+    "Macroeconomic",
+    "ShampooSales",
+    "Solar",
+    "USChange"
+]
+
+
+def __getattr__(name: str) -> Callable:
+    if name in class_datasets:
+        return getattr(datasets, name)
+    raise AttributeError(f"Class `{name}` not found in sktime.datasets.")
+
+
+__all__ = class_datasets
{sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/__init__.py
@@ -10,6 +10,7 @@ _template_lookup = {
     "AudioReaderPydub": f"{_root_lib_path}.audio_readers.audio_reader_pydub",
     "AudioReaderSoundfile": f"{_root_lib_path}.audio_readers.audio_reader_soundfile",
     "AudioReaderToBytes": f"{_root_lib_path}.audio_readers.audio_reader_to_bytes",
+    "CSVDatasetReader": f"{_root_lib_path}.datasets_readers.csv_datasets",
     "CSVImageDataset": f"{_root_lib_path}.image_readers.csv_dataset_reader",
     "CocoDetectionDatasetCV2": f"{_root_lib_path}.image_readers.coco_dataset_reader",
     "CocoKeypointsDatasetCV2": f"{_root_lib_path}.image_readers.coco_dataset_reader",
@@ -19,7 +20,6 @@ _template_lookup = {
     "ExecuteNTimesLazyAudioReaderPydub": f"{_root_lib_path}.audio_readers.audio_reader_pydub",
     "ExecuteNTimesLazyAudioReaderSoundfile": f"{_root_lib_path}.audio_readers.audio_reader_soundfile",
     "FolderImageDatasetCV2": f"{_root_lib_path}.image_readers.image_folder_reader_cv2",
-    "FolderImageDatasetKornia": f"{_root_lib_path}.image_readers.image_folder_reader_kornia",
     "ImageDatasetSplitter": f"{_root_lib_path}.datasets_readers.dataset_splitter",
     "LazyAudioReaderPydub": f"{_root_lib_path}.audio_readers.audio_reader_pydub",
     "LazyAudioReaderSoundfile": f"{_root_lib_path}.audio_readers.audio_reader_soundfile",
{sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/audio_readers/audio_reader_pydub.py
@@ -134,6 +134,10 @@ class LazyAudioReaderPydub(AudioReaderPydub):
         from_bytes: False
     """
 
+    class AttributesBaseModel(AudioReaderPydub.AttributesBaseModel):
+        generic_key: str
+        audio_file_path: str | None = None  # type:ignore[assignment]
+
     def get_file_path_from_generic_data(self, container: DataContainer) -> None:
         """Method to retrieve the file path from the genetic data field of DataContainer.
         The method extracts the file path from the generic field and sets as attribute
@@ -141,8 +145,12 @@ class LazyAudioReaderPydub(AudioReaderPydub):
         Args:
             container (DataContainer): The DataContainer to extract the file path from
         """
-
-
+        if self.attributes.generic_key:
+            file_path = self._get_generic_data(container, self.attributes.generic_key)
+            if file_path:
+                self.attributes.audio_file_path = file_path
+            else:
+                self.logger.warning("No audio path in the existing container")
 
     def execute(self, container: DataContainer) -> DataContainer:
         self.get_file_path_from_generic_data(container)
{sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/audio_readers/audio_reader_soundfile.py
@@ -110,6 +110,10 @@ class LazyAudioReaderSoundfile(AudioReaderSoundfile):
         from_bytes: true
     """
 
+    class AttributesBaseModel(_AudioBaseReader.AttributesBaseModel):
+        generic_key: str
+        audio_file_path: str | None = None  # type:ignore[assignment]
+
     def get_file_path_from_generic_data(self, container: DataContainer) -> None:
         """Method to retrieve the file path from the genetic data field of DataContainer.
         The method extracts the file path from the generic field and sets as attribute
@@ -117,8 +121,12 @@ class LazyAudioReaderSoundfile(AudioReaderSoundfile):
         Args:
             container (DataContainer): The DataContainer to extract the file path from
         """
-
-
+        if self.attributes.generic_key:
+            file_path = self._get_generic_data(container, self.attributes.generic_key)
+            if file_path:
+                self.attributes.audio_file_path = file_path
+            else:
+                self.logger.warning("No audio path in the existing container")
 
     def execute(self, container: DataContainer) -> DataContainer:
         self.get_file_path_from_generic_data(container)
sinapsis_data_readers-0.1.14/src/sinapsis_data_readers/templates/datasets_readers/csv_datasets.py
ADDED
@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+from sinapsis_core.data_containers.data_packet import DataContainer, TextPacket, TimeSeriesPacket
+from sinapsis_core.template_base.base_models import TemplateAttributes, TemplateAttributeType
+from sinapsis_core.template_base.template import Template
+
+from sinapsis_data_readers.helpers.csv_reader import read_file
+
+
+class CSVDatasetReader(Template):
+    class AttributesBaseModel(TemplateAttributes):
+        path_to_csv: str
+        store_as_time_series: bool = False
+        store_as_text_packet: bool = True
+
+    def __init__(self, attributes: TemplateAttributeType) -> None:
+        super().__init__(attributes)
+        self.csv_file = read_file(self.attributes.path_to_csv)
+
+    def execute(self, container: DataContainer) -> DataContainer:
+        if self.attributes.store_as_time_series:
+            packet = TimeSeriesPacket(content=self.csv_file)
+            container.time_series.append(packet)
+        if self.attributes.store_as_text_packet:
+            packet = TextPacket(content=self.csv_file)
+            container.texts.append(packet)
+        return container
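A hypothetical wiring of the new template. The dict-style construction below is an assumption for illustration; in practice the attribute plumbing is handled by sinapsis-core and agent configs:

    from sinapsis_core.data_containers.data_packet import DataContainer
    from sinapsis_data_readers.templates.datasets_readers.csv_datasets import CSVDatasetReader

    # Assumed dict-style attributes; "sample.csv" is a stand-in path.
    reader = CSVDatasetReader({"path_to_csv": "sample.csv", "store_as_time_series": True})
    container = reader.execute(DataContainer())
    # store_as_text_packet defaults to True, so the DataFrame lands in container.texts
    # as a TextPacket and, with the flag above, in container.time_series as well.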
{sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/datasets_readers/dataset_splitter.py
@@ -11,7 +11,7 @@ from sinapsis_core.template_base.base_models import TemplateAttributes
 from sklearn.model_selection import train_test_split
 
 ArrayDataFrameType = Union[list[np.ndarray], pd.DataFrame]
-StringDataFrameType = Union[list[str], pd.DataFrame]
+StringDataFrameType = Union[list[str | int], pd.DataFrame]
 OptionalArrayDataFrameType = Union[ArrayDataFrameType, None]
 
 OptionalStringDataFrameType = Union[StringDataFrameType, None]
@@ -30,9 +30,9 @@ class ImageDatasetSplit(BaseModel):
     """
 
     x_train: list[np.ndarray] = []
-    y_train: list[str] = []
+    y_train: list[str | int] = []
     x_test: list[np.ndarray] | None = None
-    y_test: list[str] | None = None
+    y_test: list[str | int] | None = None
 
     class Config:
         """allow arbitrary types"""
@@ -93,7 +93,11 @@ class DatasetSplitterBase(Template):
         x_train, x_test, y_train, y_test = x_data, None, y_data, None
         if self.attributes.train_size:
             x_train, x_test, y_train, y_test = train_test_split(
-                x_data,
+                x_data,
+                y_data,
+                train_size=self.attributes.train_size,
+                test_size=1 - self.attributes.train_size,
+                random_state=0,
             )
         split_dataset = self.return_data_splitter_object(x_train=x_train, y_train=y_train, x_test=x_test, y_test=y_test)
         return split_dataset
@@ -126,7 +130,9 @@ class DatasetSplitterBase(Template):
         if not packet:
             self.logger.debug("No data to be processed by dataset splitter")
             return container
-
+        if len(packet) == 1:
+            self.logger.debug("Not enough entries to divide dataset, returning original container")
+            return container
         x_data, y_data = self.extract_x_y_from_packet(packet)
 
         custom_dataset = self.store_data_in_data_splitter(x_data, y_data)
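In isolation, the new scikit-learn call behaves as follows (toy data and a 0.8 split are illustrative; the template passes its own train_size attribute):

    import numpy as np
    from sklearn.model_selection import train_test_split

    x_data = [np.zeros((4, 4)) for _ in range(10)]
    y_data = [i % 2 for i in range(10)]  # labels may now be ints as well as strings
    x_train, x_test, y_train, y_test = train_test_split(
        x_data, y_data, train_size=0.8, test_size=1 - 0.8, random_state=0
    )
    # len(x_train) == 8, len(x_test) == 2, and each label stays paired with its array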
{sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/datasets_readers/sklearn_datasets.py
@@ -1,5 +1,6 @@
 # -*- coding: utf-8 -*-
 
+import numpy as np
 import pandas as pd
 from sinapsis_core.data_containers.data_packet import DataContainer, TimeSeriesPacket
 from sinapsis_core.template_base import Template
@@ -14,6 +15,7 @@ from sinapsis_core.template_base.multi_execute_template import (
 )
 from sinapsis_core.utils.env_var_keys import SINAPSIS_BUILD_DOCS
 from sklearn.model_selection import train_test_split
+from sklearn.utils import Bunch
 
 from sinapsis_data_readers.helpers import sklearn_dataset_subset
 from sinapsis_data_readers.helpers.tags import Tags
@@ -65,7 +67,7 @@ class SKLearnDatasets(BaseDynamicWrapperTemplate):
     """
 
     split_dataset: bool = True
-    train_size: float =
+    train_size: float = 0.9
     store_as_time_series: bool = False
 
     def __init__(self, attributes: TemplateAttributeType) -> None:
@@ -73,7 +75,23 @@ class SKLearnDatasets(BaseDynamicWrapperTemplate):
         self.dataset_attributes = getattr(self.attributes, self.wrapped_callable.__name__)
 
     @staticmethod
-    def
+    def process_bunch(bunch: Bunch) -> tuple:
+        data = bunch.get("data")
+        original_target = bunch.get("target")
+
+        target = np.asarray(original_target)
+        target = target.reshape(-1, 1) if target.ndim == 1 else target
+        feature_column = bunch.get("feature_names", None)
+        target_column = bunch.get("target_names", None)
+        if target.shape[1] == 1:
+            target_column = ["target"]
+        elif target_column is not None and len(target_column) == target.shape[1]:
+            target_column = list(target_column)
+        else:
+            target_column = [f"target_{i}" for i in range(target.shape[1])]
+        return data, target, feature_column, target_column
+
+    def parse_results(self, results: pd.DataFrame) -> tuple[pd.DataFrame, list, list, int]:
         """Parses the dataset as a pandas dataframe with the feature names as columns
 
         Args:
@@ -84,17 +102,51 @@ class SKLearnDatasets(BaseDynamicWrapperTemplate):
             the additional column for target values
 
         """
-
-
-
-
+        if isinstance(results, tuple):
+            data = results[0]
+            target = results[1]
+            feature_column = None
+            target_column = None
+        elif isinstance(results, Bunch):
+            data, target, feature_column, target_column = self.process_bunch(results)
+        else:
+            try:
+                data = results.data
+
+            except (KeyError, AttributeError, ValueError):
+                data = None
+            try:
+                target = results.target
+            except (KeyError, AttributeError, ValueError):
+                target = None
+            try:
+                feature_column = results.feature_names
+                target_column = results.target_names
+            except AttributeError:
+                feature_column = None
+                target_column = None
+        _, n_features = data.shape
+
+        feature_data_frame = pd.DataFrame(data=data, columns=feature_column)
+        target_data_frame = pd.DataFrame(data=target, columns=target_column)
+        data_frame = pd.concat([feature_data_frame, target_data_frame], axis=1)
+        return data_frame, feature_column, target_column, n_features
 
     @staticmethod
-    def split_dataset(
+    def split_dataset(
+        results: pd.DataFrame, feature_name_cols: list, target_name_cols: list, n_features: int, split_size: float
+    ) -> TabularDatasetSplit:
         """Method to split the dataset into training and testing samples"""
-
-
-
+        if feature_name_cols:
+            X = results[feature_name_cols]
+            y = results[target_name_cols]
+        else:
+            X = results.iloc[:, :n_features]
+            y = results.iloc[:, n_features:]
+
+        # x_vals = results.drop(columns=[TARGET], axis=1)
+        # y_vals = results[TARGET]
+        x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=split_size, random_state=0)
         split_data = TabularDatasetSplit(
             x_train=pd.DataFrame(x_train),
             x_test=pd.DataFrame(x_test),
@@ -106,13 +158,15 @@ class SKLearnDatasets(BaseDynamicWrapperTemplate):
 
     def execute(self, container: DataContainer) -> DataContainer:
         sklearn_dataset = self.wrapped_callable.__func__(**self.dataset_attributes.model_dump())
-        dataset = self.parse_results(sklearn_dataset)
+        dataset, feature_columns, target_columns, n_features = self.parse_results(sklearn_dataset)
        if self.attributes.store_as_time_series:
             time_series_packet = TimeSeriesPacket(content=dataset)
             container.time_series.append(time_series_packet)
 
         if self.attributes.split_dataset:
-            split_dataset = self.split_dataset(
+            split_dataset = self.split_dataset(
+                dataset, feature_columns, target_columns, n_features, split_size=self.attributes.train_size
+            )
             self._set_generic_data(container, split_dataset)
         if sklearn_dataset and not self.attributes.split_dataset:
             self._set_generic_data(container, dataset)
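For intuition, this is what the Bunch branch of parse_results computes on a real loader (load_iris is only an example):

    import numpy as np
    import pandas as pd
    from sklearn.datasets import load_iris

    bunch = load_iris()                                      # a sklearn.utils.Bunch
    data = bunch.get("data")                                 # (150, 4) feature matrix
    target = np.asarray(bunch.get("target")).reshape(-1, 1)  # 1-D target -> one column
    frame = pd.concat(
        [
            pd.DataFrame(data, columns=bunch.get("feature_names")),
            pd.DataFrame(target, columns=["target"]),        # single-column targets are named "target"
        ],
        axis=1,
    )  # five columns: four features plus "target", ready for split_dataset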
{sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/datasets_readers/sktime_datasets.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+from types import NoneType
 from typing import Any
 
 import pandas as pd
@@ -18,12 +19,24 @@ from sklearn.model_selection import train_test_split
 from sktime import datasets
 from sktime.split import temporal_train_test_split
 
+from sinapsis_data_readers.helpers import sktime_datasets_subset
+from sinapsis_data_readers.helpers.sktime_datasets_subset import class_datasets
 from sinapsis_data_readers.helpers.tags import Tags
 from sinapsis_data_readers.templates.datasets_readers.dataset_splitter import (
     TabularDatasetSplit,
 )
 
-EXCLUDE_MODULES = ["load_forecastingdata", "DATASET_NAMES_FPP3"
+EXCLUDE_MODULES = ["load_forecastingdata", "DATASET_NAMES_FPP3", "BaseDataset",
+                   "load_gun_point_segmentation", "load_electric_devices_segments",
+                   "write_dataframe_to_tsfile",
+                   "write_ndarray_to_tsfile",
+                   "write_results_to_uea_format",
+                   "write_tabular_transformation_to_arff",
+                   "write_panel_to_tsfileWrapper",
+                   "_load_fpp3",
+                   "load_hierarchical_sales_toydata",
+                   "load_unitest_tsf"
+                   ] + class_datasets
 
 
 class SKTimeDatasets(BaseDynamicWrapperTemplate):
@@ -77,8 +90,10 @@ class SKTimeDatasets(BaseDynamicWrapperTemplate):
 
     def __init__(self, attributes: TemplateAttributeType) -> None:
         super().__init__(attributes)
-        self.dataset_attributes =
+        self.dataset_attributes = self.initialize_attributes()
 
+    def initialize_attributes(self):
+        return getattr(self.attributes, self.wrapped_callable.__name__)
     def split_time_series_dataset(self, dataset: Any) -> TabularDatasetSplit:
         """Split a time series dataset into training and testing sets
 
@@ -106,14 +121,22 @@ class SKTimeDatasets(BaseDynamicWrapperTemplate):
         Returns:
             TabularDatasetSplit: Object containing the split dataset.
         """
-
-
-
-
-
-
-
+        try:
+            X_train, X_test, y_train, y_test = train_test_split(
+                X, y, train_size=self.attributes.train_size, random_state=0
+            )
+            return TabularDatasetSplit(
+                x_train=pd.DataFrame(X_train),
+                x_test=pd.DataFrame(X_test),
+                y_train=pd.DataFrame(y_train),
+                y_test=pd.DataFrame(y_test),
+            )
+        except ValueError:
+            self.logger.debug("Wrong format for split. original values")
+            return TabularDatasetSplit(x_train=pd.DataFrame(X), y_train=pd.DataFrame(y))
+
+    def create_dataset(self):
+        return self.wrapped_callable.__func__(**self.dataset_attributes.model_dump())
     def execute(self, container: DataContainer) -> DataContainer:
         """Execute the SKTimeDatasets template to load and process a dataset.
 
@@ -126,7 +149,7 @@ class SKTimeDatasets(BaseDynamicWrapperTemplate):
         Returns:
             DataContainer: The container with the dataset added to it.
         """
-        dataset = self.
+        dataset = self.create_dataset()
         split_dataset = dataset
         if isinstance(dataset, tuple):
             if self.attributes.split_dataset:
@@ -161,6 +184,36 @@ class ExecuteNTimesSKTimeDatasets(SKTimeDatasets):
     )
 
 
+class SKTimeClassDatasets(SKTimeDatasets):
+    WrapperEntry = WrapperEntryConfig(
+        wrapped_object=sktime_datasets_subset,
+        signature_from_doc_string=True,
+    )
+    def initialize_attributes(self):
+        return None
+    def create_dataset(self):
+        dataset = self.wrapped_callable.load("X", "y")
+        if isinstance(dataset[0], NoneType):
+            return dataset[1]
+        elif isinstance(dataset[1], NoneType):
+            return dataset[0]
+        return dataset
+
+@execute_template_n_times_wrapper
+class ExecuteNTimesSKTimeClassDatasets(SKTimeDatasets):
+    """This template extends the functionality of the SKTimeDatasets template
+    by loading the sktime dataset n times.
+
+    This is useful for running the same dataset loading operation multiple
+    times with different parameters or for benchmark purposes.
+    """
+
+    WrapperEntry = WrapperEntryConfig(
+        wrapped_object=sktime_datasets_subset,
+        signature_from_doc_string=True,
+        template_name_suffix="ExecuteNTimes",
+    )
+
 def __getattr__(name: str) -> Template:
     """
     Only create a template if it's imported, this avoids creating all the base models for all templates
@@ -170,10 +223,16 @@ def __getattr__(name: str) -> Template:
         return make_dynamic_template(name, SKTimeDatasets)
     if name in ExecuteNTimesSKTimeDatasets.WrapperEntry.module_att_names:
         return make_dynamic_template(name, ExecuteNTimesSKTimeDatasets)
+    if name in SKTimeClassDatasets.WrapperEntry.module_att_names:
+        return make_dynamic_template(name, SKTimeClassDatasets)
+    if name in ExecuteNTimesSKTimeClassDatasets.WrapperEntry.module_att_names:
+        return make_dynamic_template(name, ExecuteNTimesSKTimeClassDatasets)
     raise AttributeError(f"template `{name}` not found in {__name__}")
 
 
-__all__ = SKTimeDatasets.WrapperEntry.module_att_names + ExecuteNTimesSKTimeDatasets.WrapperEntry.module_att_names
+__all__ = (SKTimeDatasets.WrapperEntry.module_att_names + ExecuteNTimesSKTimeDatasets.WrapperEntry.module_att_names +
+           SKTimeClassDatasets.WrapperEntry.module_att_names +
+           ExecuteNTimesSKTimeClassDatasets.WrapperEntry.module_att_names)
 
 
 if SINAPSIS_BUILD_DOCS:
{sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/image_readers/coco_dataset_reader.py
@@ -54,10 +54,10 @@ class CocoImageDatasetBaseCV2(FolderImageDatasetCV2):
     annotations_path: str
 
     def __init__(self, attributes: TemplateAttributeType) -> None:
-
-        self.annotations_file = os.path.join(self.attributes.data_dir, self.attributes.annotations_path)
+        self.annotations_file = os.path.join(attributes.get("data_dir"), attributes.get("annotations_path"))
         self.raw_annotations_dict: list[dict[str, dict[str, Any]]] = self.read_annotations_file(self.annotations_file)
         self.annotations = self.images_annotations()
+        super().__init__(attributes)
 
     @staticmethod
     def read_annotations_file(file: str) -> list[dict[str, dict[str, Any]]]:
{sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/image_readers/csv_dataset_reader.py
@@ -3,7 +3,6 @@
 from typing import cast
 
 import numpy as np
-import pandas as pd
 from sinapsis_core.data_containers.annotations import ImageAnnotations
 from sinapsis_core.data_containers.data_packet import ImagePacket
 from sinapsis_core.template_base.base_models import (
@@ -12,6 +11,7 @@ from sinapsis_core.template_base.base_models import (
     UIPropertiesMetadata,
 )
 
+from sinapsis_data_readers.helpers.csv_reader import read_file
 from sinapsis_data_readers.helpers.tags import Tags
 from sinapsis_data_readers.templates.base_file_data_loader import (
     ContentNotSetException,
@@ -19,26 +19,6 @@ from sinapsis_data_readers.templates.base_file_data_loader import (
 )
 
 
-def read_file(file: str) -> pd.DataFrame:
-    """
-    Reads a CSV file and returns its contents as a pandas DataFrame.
-
-    Args:
-        file (str): The path to the CSV file to be read.
-
-    Returns:
-        pd.DataFrame: The data from the CSV file as a pandas DataFrame.
-
-    Raises:
-        ValueError: If the file does not have a .csv extension.
-    """
-    if not file.endswith("csv"):
-        raise ValueError("The file must have a .csv extension.")
-
-    data = pd.read_csv(file, header=0)
-    return data
-
-
 class CSVImageDataset(_BaseDataReader):
     """
     A dataset reader for CSV-based image datasets, inheriting from _BaseDataReader.
{sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/video_readers/video_reader_dali.py
@@ -4,6 +4,7 @@
 from typing import Literal, cast
 
 import nvidia.dali.fn as fn
+import torch
 from nvidia.dali import pipeline_def
 from nvidia.dali.pipeline import DataNode, Pipeline
 from nvidia.dali.plugin.pytorch import DALIGenericIterator
@@ -149,6 +150,12 @@ class VideoReaderDali(BaseVideoReader):
             video_frames.append(self._make_image_packet(frame, frame_index=self.frame_count + idx))
         return video_frames
 
+    def reset_state(self, template_name: str | None = None) -> None:
+        _ = template_name
+        if self.attributes.device == "gpu":
+            torch.cuda.empty_cache()
+        super().reset_state(template_name)
+
 
 @multi_video_wrapper
 class MultiVideoReaderDali(VideoReaderDali):
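The same reset hook is added to both GPU-capable video readers (VideoReaderDali above, VideoReaderTorchCodec below); the guarded call is standard PyTorch and can be tried in isolation:

    import torch

    if torch.cuda.is_available():  # mirrors the readers' device == "gpu" check
        torch.cuda.empty_cache()   # returns cached, unoccupied GPU memory to the driver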
{sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers/templates/video_readers/video_reader_torchcodec.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+import torch
 from sinapsis_core.data_containers.data_packet import ImagePacket
 from torchcodec.decoders import SimpleVideoDecoder
 
@@ -84,6 +85,12 @@ class VideoReaderTorchCodec(BaseVideoReader):
             video_frames.append(self._make_image_packet(frame, frame_index=self.frame_count + idx))
         return video_frames
 
+    def reset_state(self, template_name: str | None = None) -> None:
+        _ = template_name
+        if self.attributes.device == "gpu":
+            torch.cuda.empty_cache()
+        super().reset_state(template_name)
+
 
 @multi_video_wrapper
 class MultiVideoReaderTorchCodec(VideoReaderTorchCodec):
@@ -92,5 +99,4 @@ class MultiVideoReaderTorchCodec(VideoReaderTorchCodec):
     by adding as many video_readers as needed depending on the lenght of
     video_file_path list. It appends the dataframes of each of the videos to the
     ImagePacket object in DataContainer
-
     """
{sinapsis_data_readers-0.1.12 → sinapsis_data_readers-0.1.14}/src/sinapsis_data_readers.egg-info/SOURCES.txt
@@ -10,9 +10,11 @@ src/sinapsis_data_readers.egg-info/requires.txt
 src/sinapsis_data_readers.egg-info/top_level.txt
 src/sinapsis_data_readers/helpers/__init__.py
 src/sinapsis_data_readers/helpers/coco_dataclasses.py
+src/sinapsis_data_readers/helpers/csv_reader.py
 src/sinapsis_data_readers/helpers/file_path_helpers.py
 src/sinapsis_data_readers/helpers/image_color_space_converter.py
 src/sinapsis_data_readers/helpers/sklearn_dataset_subset.py
+src/sinapsis_data_readers/helpers/sktime_datasets_subset.py
 src/sinapsis_data_readers/helpers/tags.py
 src/sinapsis_data_readers/helpers/text_input_helpers.py
 src/sinapsis_data_readers/templates/__init__.py
@@ -23,6 +25,7 @@ src/sinapsis_data_readers/templates/audio_readers/audio_reader_soundfile.py
 src/sinapsis_data_readers/templates/audio_readers/audio_reader_to_bytes.py
 src/sinapsis_data_readers/templates/audio_readers/base_audio_reader.py
 src/sinapsis_data_readers/templates/datasets_readers/__init__.py
+src/sinapsis_data_readers/templates/datasets_readers/csv_datasets.py
 src/sinapsis_data_readers/templates/datasets_readers/dataset_splitter.py
 src/sinapsis_data_readers/templates/datasets_readers/sklearn_datasets.py
 src/sinapsis_data_readers/templates/datasets_readers/sktime_datasets.py
All remaining files listed above with +0 -0 are unchanged between 0.1.12 and 0.1.14; only the version component of their paths differs.