sibi-dst 0.3.14__py3-none-any.whl → 0.3.16__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
  import dask.dataframe as dd
+ import dask_expr
  import pandas as pd
 
  from sibi_dst.df_helper.core import ParamsConfig, QueryConfig
@@ -28,7 +29,6 @@ class SqlAlchemyLoadFromDb:
  self.query_config = plugin_query
  self.params_config = plugin_params
  self.debug = kwargs.pop("debug", False)
- self.verbose_debug = kwargs.pop("verbose_debug", False)
 
  def build_and_load(self) -> dd.DataFrame:
  """
@@ -40,7 +40,6 @@ class SqlAlchemyLoadFromDb:
  def _build_and_load(self) -> dd.DataFrame:
 
  try:
- # reader = SQLAlchemyDask(model=self.model, filters=self.params_config.filters,engine_url=self.engine.url, logger=self.logger, chunk_size=1000, debug=self.debug)
  self.df = SQLAlchemyDask(
  model=self.model,
  filters=self.params_config.filters,
@@ -49,10 +48,13 @@ class SqlAlchemyLoadFromDb:
  chunk_size=1000,
  debug=self.debug).read_frame()
  if self.df is None or len(self.df.head().index) == 0:
- self.logger.warning("Query returned no results.")
- return dd.from_pandas(pd.DataFrame(), npartitions=1)
+ self.logger.debug("Query returned no results.")
+ dask_df=dd.from_pandas(pd.DataFrame(), npartitions=1)
 
+ return dask_df
  return self.df
  except Exception as e:
- self.logger.error(f"Failed to load data into Dask DataFrame.{e}")
- return dd.from_pandas(pd.DataFrame(), npartitions=1)
+ self.logger.debug(f"Failed to load data into Dask DataFrame.{e}")
+ dask_df = dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+ return dask_df
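
The change above standardizes the empty/error path: the loader now builds an empty single-partition Dask DataFrame and returns it instead of returning inline. A minimal sketch of that fallback pattern; the `read_or_empty` helper and the `reader` object are illustrative, not part of the package:

```python
import dask.dataframe as dd
import pandas as pd

def read_or_empty(reader):
    """Return the reader's Dask DataFrame, or an empty single-partition frame."""
    try:
        df = reader.read_frame()
        # head() materializes a small sample; an empty sample means no rows came back
        if df is None or len(df.head().index) == 0:
            return dd.from_pandas(pd.DataFrame(), npartitions=1)
        return df
    except Exception:
        # on failure, callers still receive a well-formed (empty) Dask DataFrame
        return dd.from_pandas(pd.DataFrame(), npartitions=1)
```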
@@ -59,7 +59,7 @@ class SqlAlchemyModelBuilder:
  attrs = {
  "__tablename__": self.table_name,
  "__table__": self.table,
- #"__module__": f"{apps_label}.models",
+ "__module__": f"{apps_label}.models",
  "__mapper_args__": {"eager_defaults": True},
  }
 
@@ -82,9 +82,12 @@ class SqlAlchemyModelBuilder:
  dict: Dictionary of column attributes.
  """
  columns = {}
+ reserved_names = ["metadata", "class_", "table"]
+
  for column in table.columns:
  column_name = self.normalize_column_name(column.name)
- columns[column_name] = column
+ if column_name not in reserved_names:
+ columns[column_name] = column
  return columns
 
  def add_relationships(self, attrs, table: Table):
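
The new `reserved_names` check keeps dynamically generated model attributes from colliding with names SQLAlchemy's declarative machinery already uses (a column literally named `metadata`, for example, would clash with `Base.metadata`). A minimal sketch of that filter, assuming a reflected `sqlalchemy.Table` and a hypothetical `normalize` helper:

```python
from sqlalchemy import Table

RESERVED_NAMES = {"metadata", "class_", "table"}  # would shadow declarative attributes

def build_column_attrs(table: Table, normalize=lambda name: name.lower()) -> dict:
    """Collect column attributes for a generated model, skipping reserved names."""
    columns = {}
    for column in table.columns:
        column_name = normalize(column.name)
        if column_name in RESERVED_NAMES:
            continue  # e.g. a "metadata" column cannot become a mapped attribute
        columns[column_name] = column
    return columns
```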
@@ -26,7 +26,6 @@ class SQLModelLoadFromDb:
  self.query_config = db_query or {}
  self.params_config = db_params or {}
  self.debug = kwargs.pop("debug", False)
- self.verbose_debug = kwargs.pop("verbose_debug", False)
 
  def _default_logger(self):
  """Create a default logger."""
@@ -69,7 +68,7 @@ class SQLModelLoadFromDb:
  query = query.limit(n_records)
 
  # Debug: Log the SQL query
- self.logger.info(f"Executing query: {str(query)}")
+ self.logger.debug(f"Executing query: {str(query)}")
 
  # Execute the query
  results = session.exec(query).fetchall()
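
For context, the surrounding method builds a SQLModel select, optionally caps it with `limit(n_records)`, executes it, and wraps the rows in a single-partition Dask DataFrame. A small self-contained sketch of that flow; the `Item` model and the SQLite engine are illustrative only:

```python
from typing import Optional

import dask.dataframe as dd
import pandas as pd
from sqlmodel import Field, Session, SQLModel, create_engine, select

class Item(SQLModel, table=True):
    id: Optional[int] = Field(default=None, primary_key=True)
    name: str

engine = create_engine("sqlite://")          # in-memory database for the example
SQLModel.metadata.create_all(engine)

with Session(engine) as session:
    query = select(Item).limit(10)           # optional record cap, as in the hunk
    results = session.exec(query).fetchall() # same call pattern as the loader
    if results:
        df = dd.from_pandas(pd.DataFrame([r.dict() for r in results]), npartitions=1)
    else:
        df = dd.from_pandas(pd.DataFrame(), npartitions=1)
```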
@@ -79,7 +78,7 @@ class SQLModelLoadFromDb:
  if results:
  df = dd.from_pandas(pd.DataFrame([r.dict() for r in results]), npartitions=1)
  else:
- self.logger.warning("Query returned no results.")
+ self.logger.debug("Query returned no results.")
  df = dd.from_pandas(pd.DataFrame(), npartitions=1)
 
  except Exception as e:
@@ -34,7 +34,7 @@ class ClickHouseWriter:
  self.df = df.copy()
  self.order_by = kwargs.setdefault('order_by',self.order_by)
  if len(self.df.head().index) == 0:
- self.logger.info("Dataframe is empty")
+ self.logger.debug("Dataframe is empty")
  return
  self._handle_missing_values()
  self._connect()
@@ -51,7 +51,7 @@ class ClickHouseWriter:
  user=self.clickhouse_user,
  password=self.clickhouse_password
  )
- self.logger.info("Connected to ClickHouse")
+ self.logger.debug("Connected to ClickHouse")
  except Exception as e:
  self.logger.error(e)
  raise
@@ -80,7 +80,7 @@ class ClickHouseWriter:
  def _drop_table(self):
  if self.client:
  self.client.command('DROP TABLE IF EXISTS {}'.format(self.clickhouse_table))
- self.logger.info(f"Dropped table {self.clickhouse_table}")
+ self.logger.debug(f"Dropped table {self.clickhouse_table}")
 
  def _create_table_from_dask(self, engine=None):
  if engine is None:
@@ -88,18 +88,18 @@ class ClickHouseWriter:
  dtypes = self.df.dtypes
  clickhouse_schema = self._generate_clickhouse_schema(dtypes,self.dtype_to_clickhouse)
  create_table_sql= f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
- self.logger.info(f"Creating table SQL:{create_table_sql}")
+ self.logger.debug(f"Creating table SQL:{create_table_sql}")
  if self.client:
  self.client.command(create_table_sql)
- self.logger.info("Created table '{}'".format(self.clickhouse_table))
+ self.logger.debug("Created table '{}'".format(self.clickhouse_table))
 
  def _handle_missing_values(self):
  """
  Handle missing values in the Dask DataFrame before writing to ClickHouse.
  """
- self.logger.info("Checking for missing values...")
+ self.logger.debug("Checking for missing values...")
  missing_counts = self.df.isnull().sum().compute()
- self.logger.info(f"Missing values per column:\n{missing_counts}")
+ self.logger.debug(f"Missing values per column:\n{missing_counts}")
 
  # Replace missing values based on column types
  def replace_missing_values(df):
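
The `_handle_missing_values` path logs missing-value counts and then delegates to a dtype-aware `replace_missing_values` helper. The hunk does not show the helper's body, so the fill rules below (0 for numeric columns, empty string otherwise) are an assumption used purely for illustration:

```python
import dask.dataframe as dd
import pandas as pd

def replace_missing_values(ddf: dd.DataFrame) -> dd.DataFrame:
    """Fill NA values per column type before writing to ClickHouse (assumed rules)."""
    fill_values = {}
    for col, dtype in ddf.dtypes.items():
        if pd.api.types.is_numeric_dtype(dtype):
            fill_values[col] = 0   # numeric columns -> 0 (assumption)
        else:
            fill_values[col] = ""  # strings/objects -> empty string (assumption)
    return ddf.fillna(fill_values)
```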
@@ -116,14 +116,14 @@ class ClickHouseWriter:
 
  # Apply replacement
  self.df = replace_missing_values(self.df)
- self.logger.info("Missing values replaced.")
+ self.logger.debug("Missing values replaced.")
 
  def _write_data(self):
  """
  Writes the Dask DataFrame to a ClickHouse table partition by partition.
  """
  if len(self.df.head().index) == 0:
- self.logger.info("No data found. Nothing written.")
+ self.logger.debug("No data found. Nothing written.")
  return
 
  for i, partition in enumerate(self.df.to_delayed()):
@@ -132,10 +132,10 @@ class ClickHouseWriter:
  df = partition.compute()
 
  if df.empty:
- self.logger.info(f"Partition {i} is empty. Skipping...")
+ self.logger.debug(f"Partition {i} is empty. Skipping...")
  continue
 
- self.logger.info(f"Writing partition {i} with {len(df)} rows to ClickHouse.")
+ self.logger.debug(f"Writing partition {i} with {len(df)} rows to ClickHouse.")
 
  # Write the partition to the ClickHouse table
  self.client.insert_df(self.clickhouse_table, df)
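
`_write_data` iterates over `to_delayed()`, materializes each partition as pandas, skips empty ones, and bulk-inserts the rest through clickhouse-connect's `insert_df`. A stripped-down sketch of the same loop; the connection details and the `write_partitions` wrapper are illustrative:

```python
import clickhouse_connect
import dask.dataframe as dd
import pandas as pd

def write_partitions(ddf: dd.DataFrame, table: str, host: str = "localhost") -> None:
    client = clickhouse_connect.get_client(host=host)
    for partition in ddf.to_delayed():
        pdf = partition.compute()     # one Dask partition -> pandas DataFrame
        if pdf.empty:
            continue                  # nothing to insert for this partition
        client.insert_df(table, pdf)  # bulk insert of the partition

# write_partitions(dd.from_pandas(pd.DataFrame({"x": [1, 2]}), npartitions=2), "demo_table")
```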
@@ -148,7 +148,7 @@ class ClickHouseWriter:
  Ensures a separate client instance is used per thread to avoid session conflicts.
  """
  if len(self.df.index) == 0:
- self.logger.info("No data found. Nothing written.")
+ self.logger.debug("No data found. Nothing written.")
  return
 
  def create_client():
@@ -170,13 +170,13 @@ class ClickHouseWriter:
  Write a single partition to ClickHouse using a separate client instance.
  """
  try:
- self.logger.info(f"Starting to process partition {index}")
+ self.logger.debug(f"Starting to process partition {index}")
  client = create_client() # Create a new client for the thread
 
  # Compute the Dask partition into a Pandas DataFrame
  df = partition.compute()
  if df.empty:
- self.logger.info(f"Partition {index} is empty. Skipping...")
+ self.logger.debug(f"Partition {index} is empty. Skipping...")
  return
 
  # Convert DataFrame to list of tuples
@@ -184,7 +184,7 @@ class ClickHouseWriter:
  columns = df.columns.tolist()
 
  # Perform the insert
- self.logger.info(f"Writing partition {index} with {len(df)} rows to ClickHouse.")
+ self.logger.debug(f"Writing partition {index} with {len(df)} rows to ClickHouse.")
  client.execute(f"INSERT INTO {self.clickhouse_table} ({', '.join(columns)}) VALUES", data)
 
  except Exception as e:
@@ -192,7 +192,7 @@ class ClickHouseWriter:
  finally:
  if 'client' in locals() and hasattr(client, 'close'):
  client.close()
- self.logger.info(f"Closed client for partition {index}")
+ self.logger.debug(f"Closed client for partition {index}")
 
  try:
  # Get delayed partitions and enumerate them
@@ -1,77 +1,32 @@
- import pandas as pd
  import dask.dataframe as dd
+ import pandas as pd
+
  from sibi_dst.utils import Logger
 
+
  class DataUtils:
 
- def __init__(self, logger=None):
+ def __init__(self, logger=None, **kwargs):
  self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+ self.debug = kwargs.get('debug', False)
 
  def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
  if not columns:
  self.logger.warning('No columns specified')
-
+ self.logger.debug(f'Dataframe type:{type(df)}')
  columns = [column for column in columns if column in df.columns]
  for col in columns:
- if isinstance(df, dd.DataFrame):
- # Replace NaN with 0, then convert to boolean
- df[col] = df[col].map_partitions(
- lambda s: pd.to_numeric(s, errors='coerce') # Convert to numeric, invalid to NaN
- .fillna(fill_value) # Replace NaN with 0
- .astype(dtype),
- meta=(col, dtype)
- )
- else:
- # For Pandas DataFrame, handle mixed types and invalid values
- df[col] = pd.to_numeric(df[col], errors='coerce') # Convert to numeric, invalid to NaN
- df[col] = df[col].fillna(fill_value).astype(dtype)
+ # Replace NaN with 0, then convert to boolean
+ df[col] = df[col].map_partitions(
+ lambda s: pd.to_numeric(s, errors='coerce') # Convert to numeric, invalid to NaN
+ .fillna(fill_value) # Replace NaN with 0
+ .astype(dtype),
+ meta=(col, dtype)
+ )
 
  return df
 
- @staticmethod
- def transform_numeric_columns(df, columns=None, fill_value=0, transform_func=None):
- """
- Transform numeric columns in a DataFrame (Pandas or Dask), handling missing values and applying optional transformations.
-
- Parameters:
- - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
- - columns (list of str, optional): Specific columns to transform. If None, all numeric columns are transformed.
- - fill_value (int or float): The value to replace NA values with.
- - transform_func (callable, optional): The transformation function to apply.
- If None, no additional transformation is applied.
-
- Returns:
- - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed numeric columns.
- """
- if columns is None:
- # Detect numeric columns
- columns = df.select_dtypes(include=['number']).columns.tolist()
-
- if not columns:
- return df
-
- columns = [column for column in columns if column in df.columns]
- # Default transformation function (identity) if none is provided
- if transform_func is None:
- transform_func = lambda x: x
-
- # Batch processing for Dask
- if isinstance(df, dd.DataFrame):
- def transform_partition(partition):
- # Apply transformations for all numeric columns in a single pass
- partition[columns] = partition[columns].fillna(fill_value).map(transform_func)
- return partition
-
- # Apply the transformation function to all specified columns
- df = df.map_partitions(transform_partition, meta=df)
- else:
- # Pandas: Vectorized operations for all specified columns
- df[columns] = df[columns].fillna(fill_value).map(transform_func)
-
- return df
-
- @staticmethod
- def transform_boolean_columns(df, columns=None):
+ def transform_boolean_columns(self, df, columns=None):
  """
  Detect if the provided columns in a DataFrame (Pandas or Dask) contain only 0 and 1
  and convert them to boolean. Detection is performed using a sample.
@@ -84,23 +39,20 @@ class DataUtils:
  Returns:
  - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with transformed boolean columns.
  """
+
  # Apply transformation to each specified column
  for col in columns:
  if col in df.columns:
- if isinstance(df, dd.DataFrame):
- # Replace NaN with 0, then convert to boolean
- df[col] = df[col].map_partitions(
- lambda s: pd.to_numeric(s, errors='coerce') # Convert to numeric, invalid to NaN
- .fillna(0) # Replace NaN with 0
- .astype(int) # Ensure integer type
- .astype(bool), # Convert to boolean
- meta=(col, 'bool')
- )
- else:
- # For Pandas DataFrame, handle mixed types and invalid values
- df[col] = pd.to_numeric(df[col], errors='coerce') # Convert to numeric, invalid to NaN
- df[col] = df[col].fillna(0).astype(int).astype(bool)
-
+ # Replace NaN with 0, then convert to boolean
+ df[col] = df[col].map_partitions(
+ lambda s: pd.to_numeric(s, errors='coerce') # Convert to numeric, invalid to NaN
+ .fillna(0) # Replace NaN with 0
+ .astype(int) # Ensure integer type
+ .astype(bool), # Convert to boolean
+ meta=(col, 'bool')
+ )
+ if self.debug:
+ self.logger.debug(f'Dataframe type:{type(df)}, boolean applied to columns: {columns}')
  return df
 
  def merge_lookup_data(self, classname, df, **kwargs):
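
After this change `transform_boolean_columns` assumes a Dask DataFrame and always goes through `map_partitions`; the pandas branch is gone. A self-contained sketch of the conversion it keeps, with a made-up column name and sample data:

```python
import dask.dataframe as dd
import pandas as pd

pdf = pd.DataFrame({"is_active": ["1", 0, None, 1]})
ddf = dd.from_pandas(pdf, npartitions=2)

ddf["is_active"] = ddf["is_active"].map_partitions(
    lambda s: pd.to_numeric(s, errors="coerce")  # invalid values become NaN
    .fillna(0)                                   # NaN -> 0
    .astype(int)
    .astype(bool),
    meta=("is_active", "bool"),
)
print(ddf.compute())  # True, False, False, True
```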
@@ -116,6 +68,7 @@ class DataUtils:
  - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with merged lookup data.
  """
  # Return early if the DataFrame is empty
+ debug = kwargs.setdefault("debug", False)
  if self.is_dataframe_empty(df):
  return df
 
@@ -136,17 +89,24 @@ class DataUtils:
  column_names = kwargs.pop('column_names', ['temp_join_col', source_description_alias])
 
  if source_col not in df.columns:
- self.logger.info(f"{source_col} not in DataFrame columns")
+ self.logger.debug(f"{source_col} not in DataFrame columns")
  return df
 
  # Get unique IDs from source column
  ids = df[source_col].dropna().unique()
- if isinstance(ids, dd.Series):
+ # Compute if it's a Dask Series
+ if isinstance(ids, dd.core.Series):
  ids = ids.compute()
+
+ # Check if any IDs are found
  if not len(ids):
- self.logger.info(f"No IDs found in the source column: {source_col}")
+ self.logger.debug(f"No IDs found in the source column: {source_col}")
  return df
- ids = sorted(ids.tolist())
+
+ # Convert to a list only if necessary and sort
+ if not isinstance(ids, list):
+ ids = ids.tolist()
+ ids = sorted(ids)
  # Prepare kwargs for loading lookup data
  load_kwargs = kwargs.copy()
  load_kwargs.update({
@@ -155,10 +115,10 @@ class DataUtils:
  f'{lookup_col}__in': ids
  })
  # Load lookup data
- lookup_instance = classname(debug=True, verbose_debug=True)
+ lookup_instance = classname(debug=debug)
  result = lookup_instance.load(**load_kwargs)
  if len(result.index) == 0:
- self.logger.info(f"No IDs found in the source column: {source_col}")
+ self.logger.debug(f"No IDs found in the source column: {source_col}")
  return df
  # Determine the join column on the result DataFrame
  temp_join_col = 'temp_join_col' if 'temp_join_col' in column_names else lookup_col
@@ -167,14 +127,13 @@ class DataUtils:
  df = df.merge(result, how='left', left_on=source_col, right_on=temp_join_col)
 
  if fillna_source_description_alias and source_description_alias in df.columns:
- df[source_description_alias]=df[source_description_alias].fillna('')
+ df[source_description_alias] = df[source_description_alias].fillna('')
 
  # Drop temp_join_col if present
  df = df.drop(columns='temp_join_col', errors='ignore')
 
  return df
 
-
  def is_dataframe_empty(self, df):
  """
  Check if a DataFrame (Pandas or Dask) is empty.
@@ -1,8 +1,12 @@
  import datetime
  from typing import Type, Any, Dict, Optional
+
+ import dask_expr
  import fsspec
  import pandas as pd
  from IPython.display import display
+ from dask.dataframe import dd
+
  from sibi_dst.utils import Logger
  from tqdm import tqdm
  from sibi_dst.utils import ParquetSaver
@@ -112,7 +116,7 @@ class DataWrapper:
  file_age_minutes = (current_time - file_modification_datetime).total_seconds() / 60
 
  if self.verbose:
- self.logger.info(
+ self.logger.debug(
  f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
  f"(threshold: {self.max_age_minutes} minutes)"
  )
@@ -129,14 +133,14 @@ class DataWrapper:
  start_time = datetime.datetime.now()
 
  if self.verbose:
- self.logger.info(f"Processing {full_parquet_filename}...")
+ self.logger.debug(f"Processing {full_parquet_filename}...")
 
  data_object = self.dataclass(**self.class_params)
  df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
 
  if len(df.index)==0:
  if self.verbose:
- self.logger.info("No data found for the specified date.")
+ self.logger.debug("No data found for the specified date.")
  return
 
  parquet_saver = ParquetSaver(df, folder, self.logger)
@@ -146,7 +150,7 @@ class DataWrapper:
  duration_seconds = (end_time - start_time).total_seconds()
 
  if self.verbose:
- self.logger.info(
+ self.logger.debug(
  f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
  )
 
@@ -85,7 +85,7 @@ class DfUtils:
  # Ensure all specified columns exist in the DataFrame
  missing_columns = [col for col, _, _ in conditions if col not in df.columns]
  if missing_columns:
- self.logger.info(f"The following columns are missing in the DataFrame: {', '.join(missing_columns)}")
+ self.logger.debug(f"The following columns are missing in the DataFrame: {', '.join(missing_columns)}")
  return df
 
  # Build the combined filtering condition
@@ -117,7 +117,7 @@ class DfUtils:
  DataFrame: Grouped DataFrame with counts.
  """
  if debug:
- self.logger.info(f"Grouping by: {group_by_expr}")
+ self.logger.debug(f"Grouping by: {group_by_expr}")
 
  df_grouped = df.groupby(by=group_by_expr).size().reset_index(name=group_expr)
  return df_grouped
@@ -141,7 +141,7 @@ class DfUtils:
 
  if debug:
  df_duplicates = df[df.duplicated(subset=duplicate_expr)]
- self.logger.info(f"Duplicate Rows based on columns {duplicate_expr} are:\n{df_duplicates}")
+ self.logger.debug(f"Duplicate Rows based on columns {duplicate_expr} are:\n{df_duplicates}")
 
  if sort_field:
  if isinstance(df, dd.DataFrame):
@@ -224,9 +224,9 @@ class DfUtils:
  Returns:
  DataFrame: Resampled pivot table.
  """
- if isinstance(df, dd.DataFrame):
+ if isinstance(df, dd.core.DataFrame):
  # Implement Dask-compatible pivot and resample
- self.logger.info("Performing summarization with Dask DataFrame.")
+ self.logger.debug("Performing summarization with Dask DataFrame.")
  # Ensure the index is a datetime for resampling
  if not isinstance(df.index, (pd.DatetimeIndex, dd.core.DatetimeIndex)):
  self.logger.warning("Index is not a DatetimeIndex. Converting index to datetime.")
@@ -55,6 +55,9 @@ class Logger:
  log_file = log_file or logger_name
  return cls(log_dir=log_dir, logger_name=logger_name, log_file=log_file)
 
+ def setLevel(self, level):
+ self.logger.setLevel(level)
+
  def debug(self, msg):
  self.logger.debug(msg)
 
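
The new `setLevel` method simply forwards to the wrapped standard-library logger, so callers can tune verbosity on the wrapper itself. A usage sketch; constructor arguments beyond `logger_name` are assumptions:

```python
import logging

from sibi_dst.utils import Logger

log = Logger.default_logger(logger_name="demo")
log.setLevel(logging.DEBUG)  # forwarded to the underlying logging.Logger
log.debug("debug output is now emitted")
```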
@@ -1,18 +1,16 @@
- import datetime
  from pathlib import Path
  from typing import Optional
 
- import dask.dataframe as dd
+ import dask_expr
  import fsspec
- import pandas as pd
  import pyarrow as pa
+
  from sibi_dst.utils import Logger
 
+
  class ParquetSaver:
  def __init__(self, df_result, parquet_storage_path, logger=None):
  # Ensure df_result is a Dask DataFrame
- if not isinstance(df_result, dd.DataFrame):
- df_result = dd.from_pandas(df_result, npartitions=1)
  self.df_result = df_result
  self.parquet_storage_path = parquet_storage_path
  self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
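
With the pandas-to-Dask coercion removed from `__init__`, `ParquetSaver` now expects a Dask DataFrame up front, so a caller holding pandas data would convert explicitly. A hedged sketch; the `save_to_parquet` call mirrors the commented-out code removed in the next hunk and should be treated as an assumption:

```python
import dask.dataframe as dd
import pandas as pd

from sibi_dst.utils import ParquetSaver

pdf = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})
saver = ParquetSaver(dd.from_pandas(pdf, npartitions=1), "/tmp/parquet_out")
saver.save_to_parquet("example.parquet")  # assumed entry point, per the removed comments
```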
@@ -106,106 +104,3 @@ class ParquetSaver:
  str(full_path), engine="pyarrow", schema=schema, write_index=False
  )
 
- # import datetime
- # from pathlib import Path
- # from typing import Optional
- #
- # import dask.dataframe as dd
- # import fsspec
- # import pandas as pd
- # import pyarrow as pa
- # from sibi_dst.utils import Logger
- #
- # class ParquetSaver:
- # def __init__(self, df_result, parquet_storage_path, logger):
- # self.df_result = df_result
- # self.parquet_storage_path = parquet_storage_path
- # self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
- #
- #
- # def save_to_parquet(self, parquet_filename: Optional[str] = None, clear_existing=True):
- # full_path = self._construct_full_path(parquet_filename)
- #
- # if len(self.df_result) == 0:
- # self.logger.warning('No data to save')
- # return # Exit early if there's no data to save
- #
- # # Ensure directory exists and clear if necessary
- # self._ensure_directory_exists(full_path, clear_existing=True)
- #
- # # Define schema and save DataFrame to parquet
- # schema = self._define_schema()
- # self._convert_dtypes(schema)
- # self._save_dataframe_to_parquet(full_path, schema)
- #
- # def _define_schema(self) -> pa.Schema:
- # """Define a PyArrow schema dynamically based on df_result column types."""
- # pandas_dtype_to_pa = {
- # 'object': pa.string(),
- # 'string': pa.string(),
- # 'Int64': pa.int64(),
- # 'int64': pa.int64(),
- # 'float64': pa.float64(),
- # 'bool': pa.bool_(),
- # 'boolean': pa.bool_(), # pandas nullable boolean
- # 'datetime64[ns]': pa.timestamp('ns'),
- # 'timedelta[ns]': pa.duration('ns')
- # }
- #
- # fields = [
- # pa.field(col, pandas_dtype_to_pa.get(str(dtype), pa.string()))
- # for col, dtype in self.df_result.dtypes.items()
- # ]
- # return pa.schema(fields)
- #
- # def _convert_dtypes(self, schema: pa.Schema):
- # """Convert DataFrame columns to match the specified schema."""
- # dtype_mapping = {}
- # for field in schema:
- # col_name = field.name
- # if col_name in self.df_result.columns:
- # if pa.types.is_string(field.type):
- # dtype_mapping[col_name] = 'string'
- # elif pa.types.is_int64(field.type):
- # dtype_mapping[col_name] = 'Int64' # pandas nullable integer
- # elif pa.types.is_float64(field.type):
- # dtype_mapping[col_name] = 'float64'
- # elif pa.types.is_boolean(field.type):
- # dtype_mapping[col_name] = 'boolean' # pandas nullable boolean
- # elif pa.types.is_timestamp(field.type):
- # dtype_mapping[col_name] = 'datetime64[ns]'
- # else:
- # dtype_mapping[col_name] = 'object' # Fallback to object
- # self.df_result = self.df_result.astype(dtype_mapping)
- #
- # def _construct_full_path(self, parquet_filename: Optional[str]) -> Path:
- # """Construct and return the full path for the parquet file."""
- # fs, base_path = fsspec.core.url_to_fs(self.parquet_storage_path)
- # parquet_filename = parquet_filename or "default.parquet"
- # return Path(base_path) / parquet_filename
- #
- # @staticmethod
- # def _ensure_directory_exists(full_path: Path, clear_existing=False):
- # """Ensure that the directory for the path exists, clearing it if specified."""
- # fs, _ = fsspec.core.url_to_fs(str(full_path))
- # directory = str(full_path.parent)
- #
- # if fs.exists(directory):
- # if clear_existing:
- # fs.rm(directory, recursive=True)
- # else:
- # fs.mkdirs(directory, exist_ok=True)
- #
- # def _save_dataframe_to_parquet(self, full_path: Path, schema: pa.Schema):
- # """Save the DataFrame to parquet with fsspec using specified schema."""
- # fs, _ = fsspec.core.url_to_fs(str(full_path))
- # if fs.exists(full_path):
- # fs.rm(full_path, recursive=True)
- # if isinstance(self.df_result, dd.DataFrame):
- # self.df_result.to_parquet(
- # str(full_path), engine="pyarrow", schema=schema, write_index=False
- # )
- # elif isinstance(self.df_result, pd.DataFrame):
- # dd.from_pandas(self.df_result, npartitions=1).to_parquet(
- # str(full_path), engine="pyarrow", schema=schema, write_index=False
- # )
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sibi-dst
- Version: 0.3.14
+ Version: 0.3.16
  Summary: Data Science Toolkit
  Author: Luis Valverde
  Author-email: lvalverdeb@gmail.com
@@ -13,6 +13,7 @@ Requires-Dist: chardet (>=5.2.0,<6.0.0)
  Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
  Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
  Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
+ Requires-Dist: dask-expr (>=1.1.20,<2.0.0)
  Requires-Dist: dask[complete] (>=2024.11.1,<2025.0.0)
  Requires-Dist: django (>=5.1.4,<6.0.0)
  Requires-Dist: djangorestframework (>=3.15.2,<4.0.0)