sdk-seshat-python 0.4.3__tar.gz → 0.4.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/PKG-INFO +1 -1
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/pyproject.toml +1 -1
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/data_class/base.py +1 -1
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/general/config.py +1 -0
- sdk_seshat_python-0.4.4/seshat/transformer/aggregator/base.py +160 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/deriver/__init__.py +2 -1
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/deriver/base.py +405 -147
- sdk_seshat_python-0.4.4/seshat/transformer/imputer/base.py +57 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/merger/nested_key.py +32 -37
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/pipeline/branch.py +7 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/pseudo/action_gate.py +1 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/trimmer/base.py +14 -0
- sdk_seshat_python-0.4.3/seshat/transformer/aggregator/base.py +0 -107
- sdk_seshat_python-0.4.3/seshat/transformer/imputer/base.py +0 -6
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/LICENSE +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/README.md +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/__main__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/data_class/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/data_class/pandas.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/data_class/pyspark.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/evaluation/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/evaluation/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/evaluation/evaluator/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/evaluation/evaluator/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/evaluation/evaluator/general/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/evaluation/evaluator/general/classification.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/evaluation/evaluator/general/clustering.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/evaluation/evaluator/general/regression.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/evaluation/evaluator/recommendation/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/evaluation/evaluator/recommendation/diversity.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/evaluation/evaluator/recommendation/ranking.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/feature_view/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/feature_view/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/general/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/general/command/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/general/command/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/general/command/code_inspect.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/general/command/job_status.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/general/command/setup_project.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/general/command/submit_to_network.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/general/exceptions.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/general/lazy_config.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/general/models.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/general/template/README.md-tmpl +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/general/template/config.py-tmpl +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/general/template/env-templ +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/general/template/jobignore-tmpl +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/general/template/pyproject._toml-tmpl +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/general/template/recommender-jupyter.ipynb-tmpl +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/general/template/recommender.py-tmpl +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/profiler/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/profiler/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/profiler/decorator.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/profiler/format.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/source/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/source/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/source/database/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/source/database/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/source/exceptions.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/source/flip_side/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/source/flip_side/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/source/local/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/source/local/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/source/mixins.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/source/multisource/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/source/multisource/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/source/saver/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/source/saver/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/source/saver/database.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/source/saver/utils/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/source/saver/utils/postgres.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/aggregator/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/augmenter/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/augmenter/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/deriver/from_database.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/imputer/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/merger/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/merger/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/pipeline/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/pipeline/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/pipeline/recommendation/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/pipeline/recommendation/address_pipeline.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/pseudo/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/pseudo/table_existence.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/reducer/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/reducer/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/scaler/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/scaler/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/schema/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/schema/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/splitter/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/splitter/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/splitter/block/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/splitter/block/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/splitter/random/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/splitter/random/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/splitter/time_line/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/splitter/time_line/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/trimmer/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/vectorizer/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/vectorizer/base.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/vectorizer/cosine_similarity.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/vectorizer/pivot.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/transformer/vectorizer/utils.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/batcher.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/binary_utils.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/clean_json.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/col_to_list.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/contracts.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/date_utils.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/file.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/file_cryptography.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/filter_json.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/grouper.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/jobignore.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/join_columns_to_list.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/join_str.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/llm_client/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/llm_client/chatbot_factory.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/logging/__init__.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/logging/base_logger.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/logging/console_logger.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/logging/logstash_logger.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/logging/multi_logger.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/memory.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/mixin.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/obfuscate.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/package_utils.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/pandas_func.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/patching.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/pyspark_func.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/rest.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/singleton.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/validation.py +0 -0
- {sdk_seshat_python-0.4.3 → sdk_seshat_python-0.4.4}/seshat/utils/zip_utils.py +0 -0
|
@@ -61,7 +61,7 @@ class SFrame:
|
|
|
61
61
|
def iterrows(self, column_name: str, key: str = configs.DEFAULT_SF_KEY):
|
|
62
62
|
pass
|
|
63
63
|
|
|
64
|
-
def make_group(self, default_key=configs.DEFAULT_SF_KEY):
|
|
64
|
+
def make_group(self, default_key=configs.DEFAULT_SF_KEY) -> "GroupSFrame":
|
|
65
65
|
pass
|
|
66
66
|
|
|
67
67
|
def convert(
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
from typing import List, Dict, Tuple, Callable, Optional
|
|
2
|
+
|
|
3
|
+
from pandas import DataFrame
|
|
4
|
+
from pyspark.sql import DataFrame as PySparkDataFrame
|
|
5
|
+
from pyspark.sql import functions as F
|
|
6
|
+
|
|
7
|
+
from seshat.data_class import SFrame, DFrame
|
|
8
|
+
from seshat.general import configs
|
|
9
|
+
from seshat.transformer import Transformer
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Aggregator(Transformer):
    """
    Common base class for the aggregation transformers in this module.

    It only pins class-level configuration on top of ``Transformer``; the
    aggregation logic itself lives in subclasses.
    """

    # Class-level switch consumed by ``Transformer``; its exact effect is
    # defined there -- NOTE(review): presumably "accept plain frames as well
    # as grouped ones", confirm against Transformer.
    ONLY_GROUP = False

    # Subclasses in this module implement ``aggregate_df`` / ``aggregate_spf``,
    # i.e. methods prefixed with this name.
    HANDLER_NAME = "aggregate"

    # Frame type used by default for this transformer family.
    DEFAULT_FRAME = DFrame

    # Maps the handler argument name "default" to the project-wide default
    # SFrame key.
    DEFAULT_GROUP_KEYS = {"default": configs.DEFAULT_SF_KEY}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class FieldAggregation(Aggregator):
    """
    Aggregate columns of a dataframe, either per group or over the whole
    dataset.

    With ``group_by`` the result holds one row per distinct key combination:
    the key columns followed by one column per entry of ``agg``. Without
    ``group_by`` every aggregation is computed over all rows and added as a
    new column to every row.

    Parameters
    ----------
    agg : dict
        Dictionary mapping result column names to aggregation specifications.
        Each value can be either:
        - A tuple of (source_column, aggregation_function) for standard
          aggregations, where aggregation_function is the name of a pandas
          aggregation method (e.g. "sum", "mean", "nunique")
        - A callable function that takes a DataFrame and returns an
          aggregated value
    group_by : str or list of str, optional
        Column name(s) to group by. If None or empty, performs global
        aggregations across the entire dataset.
    group_keys : optional
        Forwarded unchanged to the parent initializer.
    """

    def __init__(
        self,
        agg: Dict[str, Tuple[str, str] | Callable],
        group_by: Optional[List[str]] = None,
        group_keys=None,
        *args,
        **kwargs,
    ):
        super().__init__(group_keys, *args, **kwargs)
        # Always store a list: a bare string would otherwise break the pandas
        # path (Series.sort_values), and pandas treats a tuple passed to
        # groupby as ONE key rather than several.
        if isinstance(group_by, str):
            self.group_on = [group_by]
        else:
            self.group_on = list(group_by or [])
        self.agg = agg

    def calculate_complexity(self):
        """Return the fixed complexity score of this transformer."""
        return 10

    def validate(self, sf: SFrame):
        """
        Run the parent validation, then check that every column this
        transformer reads exists in the default frame.

        The checked columns are the grouping columns and the source column of
        each tuple-style aggregation. Callable aggregations are opaque and
        cannot be checked.
        """
        super().validate(sf)

        columns = list(self.group_on)
        for agg_info in self.agg.values():
            if isinstance(agg_info, tuple):
                source_col, _ = agg_info
                columns.append(source_col)
        self._validate_columns(sf, self.default_sf_key, *columns)

    def aggregate_df(self, default: DataFrame, *args, **kwargs) -> Dict[str, DataFrame]:
        """
        Pandas implementation of the aggregation.

        Parameters
        ----------
        default : pandas.DataFrame
            Input frame. It is never modified.

        Returns
        -------
        dict
            ``{"default": result}``. When grouping, ``result`` has one row per
            key combination, sorted by the key columns. Otherwise it is a copy
            of the input with one extra column per aggregation.
        """
        grouped = bool(self.group_on)
        if grouped:
            # dropna=False (pandas >= 1.1) keeps NaN keys as their own group,
            # matching drop_duplicates() below which also keeps them. With the
            # default dropna=True the two lengths differ and the positional
            # assignment of the aggregated values fails.
            to_agg_df = default.groupby(self.group_on, dropna=False)
            result_df = (
                default[self.group_on]
                .drop_duplicates()
                .sort_values(self.group_on)
                .reset_index(drop=True)
            )
        else:
            # Work on a copy so the caller's frame is not mutated. Later
            # aggregations still see the columns added by earlier ones, as
            # they did when the input itself was extended in place.
            result_df = default.copy()
            to_agg_df = result_df

        for result_col, agg_info in self.agg.items():
            if isinstance(agg_info, tuple):
                source_col, agg_func = agg_info
                res = getattr(to_agg_df[source_col], agg_func)()
                # Grouped results are aligned by position with the sorted key
                # rows; a global result is a scalar broadcast to every row.
                result_df[result_col] = res.values if grouped else res
            elif grouped:
                result_df[result_col] = to_agg_df.apply(agg_info).values
            else:
                result_df[result_col] = agg_info(result_df)

        return {"default": result_df}

    def aggregate_spf(
        self, default: PySparkDataFrame, *args, **kwargs
    ) -> Dict[str, PySparkDataFrame]:
        """
        PySpark implementation of the aggregation.

        Runs natively in Spark when every aggregation is a
        ``(source_column, function_name)`` tuple with a supported function
        name. Otherwise the frame is collected and processed through the
        pandas implementation.

        Returns
        -------
        dict
            ``{"default": result}`` with the same column layout as the pandas
            implementation.
        """
        standard_aggs = self._spark_aggregations()
        if standard_aggs is None:
            return self._fallback_to_pandas(default, *args, **kwargs)

        if not self.group_on:
            result_df = default
            if standard_aggs:
                # One collected row holds every global aggregate; attach each
                # value to all rows as a literal column.
                agg_row = default.agg(*standard_aggs).collect()[0]
                for result_col in self.agg:
                    result_df = result_df.withColumn(
                        result_col, F.lit(agg_row[result_col])
                    )
            return {"default": result_df}

        if isinstance(self.group_on, str):
            # Only reachable if the attribute was reassigned after __init__.
            group_cols = [self.group_on]
        else:
            group_cols = list(self.group_on)

        if standard_aggs:
            result_df = default.groupBy(*group_cols).agg(*standard_aggs)
        else:
            # GroupedData.agg rejects an empty expression list; with nothing
            # to aggregate the result is just the distinct key rows.
            result_df = default.select(*group_cols).distinct()

        final_cols = group_cols + list(self.agg)
        return {"default": result_df.select(*final_cols)}

    def _spark_aggregations(self):
        """
        Translate ``self.agg`` into Spark column expressions.

        Returns the list of aliased expressions, or ``None`` when at least one
        entry is not a 2-tuple or names an unsupported function, in which case
        the caller must use the pandas fallback.
        """
        spark_agg_funcs = {
            "first": F.first,
            "last": F.last,
            "sum": F.sum,
            "mean": F.avg,
            "avg": F.avg,
            "min": F.min,
            "max": F.max,
            "count": F.count,
            "std": F.stddev,
            "var": F.variance,
            "nunique": F.countDistinct,
        }

        expressions = []
        for result_col, agg_info in self.agg.items():
            if not (isinstance(agg_info, tuple) and len(agg_info) == 2):
                return None
            source_col, agg_func = agg_info
            if agg_func not in spark_agg_funcs:
                return None
            expressions.append(spark_agg_funcs[agg_func](source_col).alias(result_col))
        return expressions

    def _fallback_to_pandas(
        self, default: PySparkDataFrame, *args, **kwargs
    ) -> Dict[str, PySparkDataFrame]:
        """
        Aggregate through pandas and convert the result back to Spark.

        The whole frame is collected to the driver with ``toPandas()``, so
        this path is only suitable for data that fits in driver memory.
        """
        pandas_df = default.toPandas()
        result_dict = self.aggregate_df(pandas_df, *args, **kwargs)
        spark = default.sparkSession
        # aggregate_df always returns its frame under the literal key
        # "default", so read it back with that key rather than with the
        # configurable default SFrame key.
        result_df = spark.createDataFrame(result_dict["default"])
        return {"default": result_df}
|
|
@@ -5,7 +5,6 @@ from .base import (
|
|
|
5
5
|
PercentileTransactionValueDeriver,
|
|
6
6
|
SFrameFromColsDeriver,
|
|
7
7
|
StaticValueColumnAdder,
|
|
8
|
-
TimeWindowTransformer,
|
|
9
8
|
FractionDeriver,
|
|
10
9
|
ProfitLossDeriver,
|
|
11
10
|
DateTimeTypeDeriver,
|
|
@@ -20,6 +19,8 @@ from .base import (
|
|
|
20
19
|
TokenPriceDeriver,
|
|
21
20
|
TokenSwapTradeDeriver,
|
|
22
21
|
TokenFeatureTransformationDeriver,
|
|
22
|
+
BranchClassifier,
|
|
23
|
+
Tagger,
|
|
23
24
|
)
|
|
24
25
|
|
|
25
26
|
from .from_database import FromSQLDBDeriver
|