sdk-seshat-python 0.3.16__tar.gz → 0.4.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/PKG-INFO +3 -2
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/pyproject.toml +5 -4
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/__init__.py +7 -1
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/data_class/pandas.py +2 -3
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/data_class/pyspark.py +6 -3
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/feature_view/base.py +7 -2
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/command/submit_to_network.py +2 -1
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/local/base.py +9 -4
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/saver/database.py +12 -1
- sdk_seshat_python-0.4.2/seshat/transformer/deriver/__init__.py +25 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/deriver/base.py +90 -210
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/pseudo/__init__.py +1 -0
- sdk_seshat_python-0.4.2/seshat/transformer/pseudo/action_gate.py +119 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/pseudo/table_existence.py +3 -11
- sdk_seshat_python-0.4.2/seshat/transformer/reducer/base.py +469 -0
- sdk_seshat_python-0.3.16/seshat/transformer/deriver/__init__.py +0 -9
- sdk_seshat_python-0.3.16/seshat/transformer/reducer/base.py +0 -562
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/LICENSE +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/README.md +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/__main__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/data_class/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/data_class/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/general/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/general/classification.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/general/clustering.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/general/regression.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/recommendation/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/recommendation/diversity.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/recommendation/ranking.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/feature_view/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/command/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/command/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/command/code_inspect.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/command/job_status.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/command/setup_project.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/config.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/exceptions.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/lazy_config.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/models.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/template/README.md-tmpl +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/template/config.py-tmpl +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/template/env-templ +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/template/jobignore-tmpl +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/template/pyproject._toml-tmpl +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/template/recommender-jupyter.ipynb-tmpl +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/template/recommender.py-tmpl +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/profiler/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/profiler/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/profiler/decorator.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/profiler/format.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/database/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/database/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/exceptions.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/flip_side/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/flip_side/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/local/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/mixins.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/multisource/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/multisource/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/saver/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/saver/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/saver/utils/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/saver/utils/postgres.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/aggregator/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/aggregator/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/augmenter/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/augmenter/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/deriver/from_database.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/imputer/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/imputer/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/merger/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/merger/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/merger/nested_key.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/pipeline/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/pipeline/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/pipeline/branch.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/pipeline/recommendation/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/pipeline/recommendation/address_pipeline.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/reducer/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/scaler/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/scaler/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/schema/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/schema/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/block/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/block/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/random/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/random/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/time_line/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/time_line/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/trimmer/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/trimmer/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/vectorizer/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/vectorizer/base.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/vectorizer/cosine_similarity.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/vectorizer/pivot.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/vectorizer/utils.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/batcher.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/binary_utils.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/clean_json.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/col_to_list.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/contracts.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/file.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/file_cryptography.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/filter_json.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/grouper.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/jobignore.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/join_columns_to_list.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/join_str.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/llm_client/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/llm_client/chatbot_factory.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/logging/__init__.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/logging/base_logger.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/logging/console_logger.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/logging/logstash_logger.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/logging/multi_logger.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/memory.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/mixin.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/obfuscate.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/package_utils.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/pandas_func.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/patching.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/pyspark_func.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/rest.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/singleton.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/validation.py +0 -0
- {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/zip_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: sdk-seshat-python
|
|
3
|
-
Version: 0.3.16
|
|
3
|
+
Version: 0.4.2
|
|
4
4
|
Summary: Seshat python SDK is a library to help create ML data pipelines.
|
|
5
5
|
License: Commercial - see LICENSE.txt
|
|
6
6
|
Author: SeshatLabs
|
|
@@ -25,7 +25,7 @@ Requires-Dist: loguru (>=0.7.3,<0.8.0)
|
|
|
25
25
|
Requires-Dist: memory-profiler (>=0.61.0,<0.62.0)
|
|
26
26
|
Requires-Dist: openai (>=1.73.0,<2.0.0)
|
|
27
27
|
Requires-Dist: pandas (>=2.2.1,<3.0.0)
|
|
28
|
-
Requires-Dist: psycopg2 (>=2.9,<3.0) ; extra == "postgres-support"
|
|
28
|
+
Requires-Dist: psycopg2-binary (>=2.9,<3.0) ; extra == "postgres-support"
|
|
29
29
|
Requires-Dist: pyarmor (>=8.5.1,<9.0.0)
|
|
30
30
|
Requires-Dist: pydantic (>=2.7.4,<3.0.0)
|
|
31
31
|
Requires-Dist: pyspark (>=3.5.1,<4.0.0)
|
|
@@ -33,6 +33,7 @@ Requires-Dist: python-logstash-async (>=4.0.2,<5.0.0)
|
|
|
33
33
|
Requires-Dist: requests (==2.32.0)
|
|
34
34
|
Requires-Dist: rich (>=13.9.4,<14.0.0)
|
|
35
35
|
Requires-Dist: scikit-learn (>=1.4.1.post1,<2.0.0)
|
|
36
|
+
Requires-Dist: setuptools (>=80.9.0,<81.0.0)
|
|
36
37
|
Requires-Dist: sqlalchemy (>=2.0.29,<3.0.0)
|
|
37
38
|
Requires-Dist: toml (>=0.10.2,<0.11.0)
|
|
38
39
|
Requires-Dist: typer (>=0.12.3,<0.13.0)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[tool.poetry]
|
|
2
2
|
name = "sdk-seshat-python"
|
|
3
|
-
version = "0.3.16"
|
|
3
|
+
version = "0.4.2"
|
|
4
4
|
description = "Seshat python SDK is a library to help create ML data pipelines."
|
|
5
5
|
authors = ["SeshatLabs <info@seshatlabs.xyz>"]
|
|
6
6
|
packages = [{ include = "seshat", from = "." }]
|
|
@@ -32,13 +32,14 @@ langchain = "^0.3.23"
|
|
|
32
32
|
langchain-community = "^0.3.21"
|
|
33
33
|
langchain-openai = "^0.3.12"
|
|
34
34
|
pyarmor = "^8.5.1"
|
|
35
|
-
croniter = "^6.0.0"
|
|
36
|
-
psycopg2 = { version = "^2.9", optional = true }
|
|
37
35
|
python-logstash-async = "^4.0.2"
|
|
36
|
+
croniter = "^6.0.0"
|
|
37
|
+
psycopg2-binary = { version = "^2.9", optional = true }
|
|
38
|
+
setuptools = "^80.9.0"
|
|
38
39
|
|
|
39
40
|
[tool.poetry.extras]
|
|
40
41
|
flipside_support = ["flipside"]
|
|
41
|
-
postgres_support = ["psycopg2"]
|
|
42
|
+
postgres_support = ["psycopg2-binary"]
|
|
42
43
|
|
|
43
44
|
[tool.poetry.group.dev.dependencies]
|
|
44
45
|
flake8 = "^7.0.0"
|
|
@@ -20,6 +20,7 @@ from seshat.general.exceptions import NoConfigSetError, RestClientException
|
|
|
20
20
|
app = typer.Typer()
|
|
21
21
|
console = Console()
|
|
22
22
|
DEFAULT_DATA_SIZE = 1_000_000 # 1 GB
|
|
23
|
+
DEFAULT_EXPIRATION = 86400
|
|
23
24
|
|
|
24
25
|
state = {"verbose": False}
|
|
25
26
|
|
|
@@ -141,7 +142,12 @@ def submit_job(
|
|
|
141
142
|
identifier = manager.store_code(package)
|
|
142
143
|
|
|
143
144
|
job_response = manager.submit_job(
|
|
144
|
-
identifier,
|
|
145
|
+
identifier,
|
|
146
|
+
name,
|
|
147
|
+
version,
|
|
148
|
+
executor_image_tag,
|
|
149
|
+
job_metadata,
|
|
150
|
+
expiration=config.get("aws", {}).get("expiration", DEFAULT_EXPIRATION),
|
|
145
151
|
)
|
|
146
152
|
job_response_data = job_response.get("data", {})
|
|
147
153
|
|
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from typing import Iterable, List
|
|
3
|
+
from typing import Dict, Iterable, List
|
|
4
4
|
|
|
5
5
|
import pandas as pd
|
|
6
6
|
from pandas import DataFrame
|
|
7
|
-
from pyspark.sql import SparkSession
|
|
8
7
|
|
|
9
8
|
from seshat.data_class import SFrame
|
|
10
9
|
from seshat.data_class.base import GroupSFrame
|
|
@@ -33,7 +32,7 @@ class DFrame(SFrame):
|
|
|
33
32
|
def to_spf(self) -> SFrame:
|
|
34
33
|
from seshat.data_class import SPFrame
|
|
35
34
|
|
|
36
|
-
spark =
|
|
35
|
+
spark = SPFrame.get_spark()
|
|
37
36
|
return SPFrame.from_raw(spark.createDataFrame(self.data))
|
|
38
37
|
|
|
39
38
|
def extend_vertically(self, other: DataFrame):
|
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
from typing import Iterable, List
|
|
1
|
+
from typing import Dict, Iterable, List
|
|
2
2
|
|
|
3
3
|
from pandas import DataFrame
|
|
4
|
-
from pyspark.sql import DataFrame as PySparkDataFrame
|
|
4
|
+
from pyspark.sql import DataFrame as PySparkDataFrame
|
|
5
|
+
from pyspark.sql import SparkSession
|
|
5
6
|
|
|
6
|
-
from seshat.data_class import
|
|
7
|
+
from seshat.data_class import DFrame, SFrame
|
|
7
8
|
from seshat.data_class.base import GroupSFrame
|
|
8
9
|
from seshat.general import configs
|
|
9
10
|
|
|
@@ -56,6 +57,8 @@ class SPFrame(SFrame):
|
|
|
56
57
|
def from_raw(cls, data, *args, **kwargs) -> "SPFrame":
|
|
57
58
|
if isinstance(data, DataFrame):
|
|
58
59
|
data = DFrame.from_raw(data).convert(cls).to_raw()
|
|
60
|
+
elif not isinstance(data, PySparkDataFrame) and data:
|
|
61
|
+
data = cls.get_spark().createDataFrame(data or [])
|
|
59
62
|
return cls(data)
|
|
60
63
|
|
|
61
64
|
@staticmethod
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Dict,
|
|
2
|
+
from typing import Callable, Dict, Optional
|
|
3
3
|
|
|
4
4
|
from seshat.data_class import SFrame
|
|
5
5
|
from seshat.evaluation.base import Evaluation
|
|
@@ -7,6 +7,7 @@ from seshat.profiler import ProfileConfig
|
|
|
7
7
|
from seshat.profiler.base import profiler
|
|
8
8
|
from seshat.source import Source
|
|
9
9
|
from seshat.source.saver import Saver
|
|
10
|
+
from seshat.transformer.base import Transformer
|
|
10
11
|
from seshat.transformer.pipeline import Pipeline
|
|
11
12
|
from seshat.transformer.splitter import Splitter
|
|
12
13
|
|
|
@@ -36,7 +37,8 @@ class FeatureView:
|
|
|
36
37
|
saver : Saver, optional
|
|
37
38
|
An optional component responsible for saving the processed data during training
|
|
38
39
|
Required only in offline mode.
|
|
39
|
-
|
|
40
|
+
on_save_finished : Transformer, Optional
|
|
41
|
+
A Transformer to be called after the save operation completes.
|
|
40
42
|
Examples
|
|
41
43
|
--------
|
|
42
44
|
Define feature view:
|
|
@@ -81,6 +83,7 @@ class FeatureView:
|
|
|
81
83
|
saver: Saver = None
|
|
82
84
|
profile_config = ProfileConfig(logging.INFO, default_tracking=True)
|
|
83
85
|
evaluation: Evaluation
|
|
86
|
+
on_save_finished: Optional[Transformer] = None
|
|
84
87
|
|
|
85
88
|
def __call__(self, *args, **kwargs):
|
|
86
89
|
source = self._get_source()
|
|
@@ -143,6 +146,8 @@ class FeatureView:
|
|
|
143
146
|
if hasattr(self, "splitter")
|
|
144
147
|
else self.saver(self.data)
|
|
145
148
|
)
|
|
149
|
+
if self.on_save_finished is not None:
|
|
150
|
+
self.on_save_finished(self.data)
|
|
146
151
|
|
|
147
152
|
def _split(self, *args, **kwargs):
|
|
148
153
|
self.split_data = self.splitter(self.data, *args, **kwargs)
|
{sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/command/submit_to_network.py
RENAMED
|
@@ -313,6 +313,7 @@ class SubmitCommand(BaseTyperCommand):
|
|
|
313
313
|
version: str,
|
|
314
314
|
executor_image_tag: str,
|
|
315
315
|
metadata: JobMetadata,
|
|
316
|
+
expiration=86400,
|
|
316
317
|
) -> dict:
|
|
317
318
|
"""Submit job to API after successful upload"""
|
|
318
319
|
if not self.job_config.base_url or not self.job_config.auth_token:
|
|
@@ -320,7 +321,7 @@ class SubmitCommand(BaseTyperCommand):
|
|
|
320
321
|
"API configuration missing. Please set base_url and auth_token"
|
|
321
322
|
)
|
|
322
323
|
|
|
323
|
-
presigned_url = self.backend.generate_presigned_url(s3_key)
|
|
324
|
+
presigned_url = self.backend.generate_presigned_url(s3_key, expiration)
|
|
324
325
|
executor_label = self.config.get("executor", {}).get("label")
|
|
325
326
|
|
|
326
327
|
payload = {
|
|
@@ -5,24 +5,29 @@ from seshat.source import Source
|
|
|
5
5
|
|
|
6
6
|
class LocalSource(Source):
|
|
7
7
|
"""
|
|
8
|
-
LocalSource is a source that can read
|
|
8
|
+
LocalSource is a data source that can read from a local file or an in-memory source.
|
|
9
9
|
"""
|
|
10
10
|
|
|
11
11
|
def __init__(
|
|
12
12
|
self,
|
|
13
|
-
|
|
13
|
+
data_source,
|
|
14
14
|
query=None,
|
|
15
15
|
schema=None,
|
|
16
16
|
mode=configs.DEFAULT_MODE,
|
|
17
17
|
):
|
|
18
18
|
super().__init__(query, schema, mode)
|
|
19
|
-
self.
|
|
19
|
+
self.data_source = data_source
|
|
20
20
|
|
|
21
21
|
def convert_data_type(self, data) -> SFrame:
|
|
22
22
|
return self.data_class.from_raw(data)
|
|
23
23
|
|
|
24
24
|
def fetch(self) -> SFrame:
|
|
25
|
-
d =
|
|
25
|
+
d = (
|
|
26
|
+
self.data_class.read_csv(path=self.data_source)
|
|
27
|
+
if isinstance(self.data_source, str)
|
|
28
|
+
else self.data_source
|
|
29
|
+
)
|
|
30
|
+
|
|
26
31
|
return self.convert_data_type(d)
|
|
27
32
|
|
|
28
33
|
def calculate_complexity(self):
|
|
@@ -35,7 +35,13 @@ class SQLDBSaver(SQLMixin, Saver):
|
|
|
35
35
|
self.create_index(config)
|
|
36
36
|
|
|
37
37
|
selected_sf = sf.get(config.sf_key)
|
|
38
|
-
|
|
38
|
+
has_id = False
|
|
39
|
+
for col in config.schema.cols:
|
|
40
|
+
if col.is_id:
|
|
41
|
+
has_id = True
|
|
42
|
+
break
|
|
43
|
+
if has_id:
|
|
44
|
+
selected_sf = self.drop_nan_ids(selected_sf, config.schema)
|
|
39
45
|
|
|
40
46
|
if config.clear_table:
|
|
41
47
|
self.delete(config.table)
|
|
@@ -65,6 +71,11 @@ class SQLDBSaver(SQLMixin, Saver):
|
|
|
65
71
|
table, _ = self.get_table(table_name, autoload=True)
|
|
66
72
|
self.write_on_db(table.delete())
|
|
67
73
|
|
|
74
|
+
def drop_table(self, table_name):
|
|
75
|
+
if table_name in inspect(self.get_engine()).get_table_names():
|
|
76
|
+
table, _ = self.get_table(table_name, autoload=True)
|
|
77
|
+
table.drop(self.get_engine())
|
|
78
|
+
|
|
68
79
|
def insert(self, selected_sf: SFrame, config: SaveConfig):
|
|
69
80
|
values = self.prepare_sf_to_insert(selected_sf, config).to_dict()
|
|
70
81
|
table, _ = self.get_table(config.table, autoload=True)
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
from .base import (
|
|
2
|
+
FeatureForAddressDeriver,
|
|
3
|
+
InteractedSymbolsToSentenceDeriver,
|
|
4
|
+
OperationOnColsDeriver,
|
|
5
|
+
PercentileTransactionValueDeriver,
|
|
6
|
+
SFrameFromColsDeriver,
|
|
7
|
+
StaticValueColumnAdder,
|
|
8
|
+
TimeWindowTransformer,
|
|
9
|
+
FractionDeriver,
|
|
10
|
+
ProfitLossDeriver,
|
|
11
|
+
DateTimeTypeDeriver,
|
|
12
|
+
ComprehensiveFeaturesDeriver,
|
|
13
|
+
GroupByDeriverCount,
|
|
14
|
+
GroupByDeriverMeanMax,
|
|
15
|
+
ChangingOverTimeDeriver,
|
|
16
|
+
TokenLastPriceDeriver,
|
|
17
|
+
GroupByTimeWindowDeriver,
|
|
18
|
+
OneColumnPercentileFilterDeriver,
|
|
19
|
+
SenderReceiverTokensDeriver,
|
|
20
|
+
TokenPriceDeriver,
|
|
21
|
+
TokenSwapTradeDeriver,
|
|
22
|
+
TokenFeatureTransformationDeriver,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
from .from_database import FromSQLDBDeriver
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
from datetime import timedelta
|
|
2
|
-
from typing import
|
|
2
|
+
from typing import Any, Callable, Dict, List, Literal, Tuple
|
|
3
3
|
|
|
4
4
|
import pandas as pd
|
|
5
5
|
from pandas import DataFrame
|
|
6
|
-
from pyspark.sql import DataFrame as PySparkDataFrame
|
|
6
|
+
from pyspark.sql import DataFrame as PySparkDataFrame
|
|
7
|
+
from pyspark.sql import Window
|
|
7
8
|
from pyspark.sql import functions as F
|
|
8
|
-
from pyspark.sql.functions import array_distinct, array_union, coalesce,
|
|
9
|
-
from pyspark.sql.types import IntegerType,
|
|
9
|
+
from pyspark.sql.functions import array, array_distinct, array_union, coalesce, lit
|
|
10
|
+
from pyspark.sql.types import IntegerType, StructField, StructType
|
|
10
11
|
|
|
11
12
|
from seshat.data_class import SFrame, SPFrame
|
|
12
13
|
from seshat.data_class.base import GroupSFrame
|
|
@@ -15,9 +16,9 @@ from seshat.general import configs
|
|
|
15
16
|
from seshat.general.exceptions import InvalidArgumentsError
|
|
16
17
|
from seshat.transformer import Transformer
|
|
17
18
|
from seshat.transformer.merger import Merger
|
|
18
|
-
from seshat.transformer.schema import
|
|
19
|
-
from seshat.
|
|
20
|
-
from seshat.utils import pyspark_func
|
|
19
|
+
from seshat.transformer.schema import Col, Schema
|
|
20
|
+
from seshat.transformer.trimmer.base import FilterTrimmer
|
|
21
|
+
from seshat.utils import pandas_func, pyspark_func
|
|
21
22
|
from seshat.utils.validation import NumericColumnValidator, TimeStampColumnValidator
|
|
22
23
|
|
|
23
24
|
SYMBOLS_RECEIVED_COL = "symbols_received"
|
|
@@ -1091,54 +1092,32 @@ class GroupByDeriverMeanMax(SFrameDeriver):
|
|
|
1091
1092
|
|
|
1092
1093
|
class TimeWindowTransformer(SFrameDeriver):
|
|
1093
1094
|
"""
|
|
1094
|
-
|
|
1095
|
+
A class that derives time series-based features from a default SFrame using a transformer and merger strategy.
|
|
1096
|
+
It supports applying transformations over multiple durations and combines the results using a merger.
|
|
1095
1097
|
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1098
|
+
Duration labeling can be handled in two ways:
|
|
1099
|
+
1. By adding a new column with the name specified in `label_col`, where each row is labeled with its corresponding duration.
|
|
1100
|
+
2. By appending the duration label as a suffix to the names of the new columns created by the transformer, excluding those required for merging.
|
|
1101
|
+
|
|
1102
|
+
This behavior is controlled by the `specify_duration_by` parameter.
|
|
1100
1103
|
|
|
1101
1104
|
Parameters
|
|
1102
1105
|
----------
|
|
1103
|
-
transformer : Transformer
|
|
1104
|
-
|
|
1106
|
+
transformer : Transformer or Callable
|
|
1107
|
+
An instance of a transformer or a callable function that performs the transformation on the input data.
|
|
1105
1108
|
merger : Merger
|
|
1106
|
-
|
|
1107
|
-
Will be set to work in-place.
|
|
1109
|
+
An instance of a merger used to combine features generated across different durations.
|
|
1108
1110
|
time_col : str, optional
|
|
1109
|
-
The
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
List of tuples where each tuple contains a label and a duration in minutes.
|
|
1113
|
-
The label identifies the time window and the duration specifies how far back
|
|
1114
|
-
from the latest timestamp to include data.
|
|
1111
|
+
The name of the timestamp column in the SFrame. Defaults to `configs.BLOCK_TIMESTAMP_COL`.
|
|
1112
|
+
durations : List[Tuple[str, int]]
|
|
1113
|
+
A list of (label, duration) pairs where each label identifies a time window (in minutes) for feature extraction.
|
|
1115
1114
|
label_col : str, optional
|
|
1116
|
-
The column
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
|
|
1122
|
-
----------
|
|
1123
|
-
time_col : str
|
|
1124
|
-
The column name containing timestamp data.
|
|
1125
|
-
transformer : Transformer
|
|
1126
|
-
The transformer applied to each time window's data.
|
|
1127
|
-
label_col : str
|
|
1128
|
-
The column name storing the time window label.
|
|
1129
|
-
merger : Merger
|
|
1130
|
-
The merger used to combine results from different time windows.
|
|
1131
|
-
periods : List[Tuple[str, int]]
|
|
1132
|
-
The list of time windows with their labels and durations.
|
|
1133
|
-
|
|
1134
|
-
Notes
|
|
1135
|
-
-----
|
|
1136
|
-
The transformer processes data by:
|
|
1137
|
-
1. Finding the latest timestamp in the data
|
|
1138
|
-
2. Creating time windows based on the specified durations
|
|
1139
|
-
3. Filtering data for each time window
|
|
1140
|
-
4. Applying the transformer to each filtered dataset
|
|
1141
|
-
5. Merging the results using the provided merger
|
|
1115
|
+
The name of the column that will store the duration label for each record. Defaults to `configs.DURATION_LABEL_COL`.
|
|
1116
|
+
specify_duration_by : {'column', 'suffix'}, default 'column'
|
|
1117
|
+
Specifies how duration information is represented in the output:
|
|
1118
|
+
* 'column' – Adds a new column (named by `label_col`) containing the duration label.
|
|
1119
|
+
* 'suffix' – Appends the duration label as a suffix to the names of columns created by the transformer,
|
|
1120
|
+
excluding those required by the merger.
|
|
1142
1121
|
"""
|
|
1143
1122
|
|
|
1144
1123
|
DEFAULT_GROUP_KEYS = {
|
|
@@ -1148,11 +1127,12 @@ class TimeWindowTransformer(SFrameDeriver):
|
|
|
1148
1127
|
|
|
1149
1128
|
def __init__(
|
|
1150
1129
|
self,
|
|
1151
|
-
transformer: Transformer,
|
|
1130
|
+
transformer: Transformer | Callable,
|
|
1152
1131
|
merger: Merger,
|
|
1132
|
+
durations: List[Tuple[str, int]],
|
|
1153
1133
|
time_col=configs.BLOCK_TIMESTAMP_COL,
|
|
1154
|
-
durations: List[Tuple[str, int]] = None,
|
|
1155
1134
|
label_col=configs.DURATION_LABEL_COL,
|
|
1135
|
+
specify_duration_by: Literal["column", "suffix"] = "column",
|
|
1156
1136
|
group_keys=None,
|
|
1157
1137
|
):
|
|
1158
1138
|
super().__init__(group_keys)
|
|
@@ -1160,191 +1140,89 @@ class TimeWindowTransformer(SFrameDeriver):
|
|
|
1160
1140
|
self.transformer = transformer
|
|
1161
1141
|
self.label_col = label_col
|
|
1162
1142
|
self.merger = merger
|
|
1163
|
-
self.
|
|
1143
|
+
self.specify_duration_by = specify_duration_by
|
|
1144
|
+
self.durations = durations
|
|
1164
1145
|
|
|
1165
|
-
# Ensure that
|
|
1146
|
+
# Ensure that the merge works in place and group keys are set to their default values.
|
|
1166
1147
|
self.merger.inplace = True
|
|
1148
|
+
self.merger.group_keys = {"default": "default", "other": "other"}
|
|
1149
|
+
if self.specify_duration_by == "column":
|
|
1150
|
+
self.merger.axis = 0
|
|
1167
1151
|
|
|
1168
1152
|
def validate(self, sf: SFrame):
|
|
1169
|
-
"""
|
|
1170
|
-
Validate that the input SFrame contains the required columns.
|
|
1171
|
-
|
|
1172
|
-
Parameters
|
|
1173
|
-
----------
|
|
1174
|
-
sf : SFrame
|
|
1175
|
-
The input SFrame to validate.
|
|
1176
|
-
|
|
1177
|
-
Raises
|
|
1178
|
-
------
|
|
1179
|
-
ColDoesNotExistError
|
|
1180
|
-
If the required time column is not present in the SFrame.
|
|
1181
|
-
"""
|
|
1182
1153
|
super().validate(sf)
|
|
1183
1154
|
self._validate_columns(sf, self.group_keys["default"], self.time_col)
|
|
1184
1155
|
|
|
1185
|
-
def
|
|
1186
|
-
|
|
1187
|
-
|
|
1156
|
+
def _is_empty(self, sf: SFrame) -> bool:
|
|
1157
|
+
raw = sf.to_raw()
|
|
1158
|
+
if raw is None:
|
|
1159
|
+
return True
|
|
1160
|
+
elif hasattr(raw, "rdd"):
|
|
1161
|
+
return raw.rdd.isEmpty()
|
|
1162
|
+
else:
|
|
1163
|
+
return len(raw) == 0
|
|
1188
1164
|
|
|
1189
|
-
|
|
1190
|
-
|
|
1191
|
-
|
|
1192
|
-
The input DataFrame containing the data.
|
|
1193
|
-
label : str
|
|
1194
|
-
The label for the time window.
|
|
1195
|
-
start : datetime
|
|
1196
|
-
The start timestamp for the time window.
|
|
1165
|
+
def calculate_metrics(self, sf: SFrame, label, start) -> SFrame:
|
|
1166
|
+
trimmer = FilterTrimmer(op=">=", col=self.time_col, value=start)
|
|
1167
|
+
filtered = trimmer(sf)
|
|
1197
1168
|
|
|
1198
|
-
|
|
1199
|
-
|
|
1200
|
-
SFrame
|
|
1201
|
-
An SFrame containing the calculated metrics for the time window.
|
|
1202
|
-
Returns an empty DFrame if no data is available for the time window.
|
|
1203
|
-
"""
|
|
1204
|
-
filtered = default[default[self.time_col] >= start]
|
|
1169
|
+
if self._is_empty(filtered):
|
|
1170
|
+
return filtered
|
|
1205
1171
|
|
|
1206
|
-
|
|
1207
|
-
|
|
1172
|
+
result = self.transformer(sf_input=filtered, label=label, start=start)
|
|
1173
|
+
if self.specify_duration_by == "column":
|
|
1174
|
+
deriver = StaticValueColumnAdder(col_name=self.label_col, value=label)
|
|
1175
|
+
result = deriver(result)
|
|
1208
1176
|
|
|
1209
|
-
|
|
1210
|
-
|
|
1177
|
+
else:
|
|
1178
|
+
# Find columns excluding those used by,
|
|
1179
|
+
# the merger for merging.
|
|
1180
|
+
cols = set(result[self.transformer].get_columns()) - set(self.merger.on)
|
|
1181
|
+
schema = Schema(
|
|
1182
|
+
cols=[Col(col, to=f"{col}_{label}") for col in cols],
|
|
1183
|
+
exclusive=False,
|
|
1184
|
+
)
|
|
1185
|
+
result = schema(result)
|
|
1186
|
+
return result
|
|
1211
1187
|
|
|
1212
1188
|
def derive_df(self, default: DataFrame, *args, **kwargs):
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
2. Finds the latest timestamp in the data
|
|
1219
|
-
3. Creates time windows based on the specified durations
|
|
1220
|
-
4. Calculates metrics for each time window
|
|
1221
|
-
5. Merges the results using the provided merger
|
|
1222
|
-
|
|
1223
|
-
Parameters
|
|
1224
|
-
----------
|
|
1225
|
-
default : DataFrame
|
|
1226
|
-
The input DataFrame containing the data.
|
|
1227
|
-
*args : tuple
|
|
1228
|
-
Variable length argument list.
|
|
1229
|
-
**kwargs : dict
|
|
1230
|
-
Arbitrary keyword arguments.
|
|
1189
|
+
default = TimeStampColumnValidator().validate(default, self.time_col)
|
|
1190
|
+
last_timestamp = default[self.time_col].max()
|
|
1191
|
+
sf_input = DFrame.from_raw(default)
|
|
1192
|
+
result = DFrame()
|
|
1193
|
+
return self.derive(sf_input, result, last_timestamp)
|
|
1231
1194
|
|
|
1232
|
-
|
|
1233
|
-
-------
|
|
1234
|
-
dict
|
|
1235
|
-
A dictionary with two keys:
|
|
1236
|
-
- 'default': The original input DataFrame
|
|
1237
|
-
- 'timeseries_feature': The derived time-based features
|
|
1238
|
-
"""
|
|
1195
|
+
def derive_spf(self, default: PySparkDataFrame, *args, **kwargs):
|
|
1239
1196
|
default = TimeStampColumnValidator().validate(default, self.time_col)
|
|
1197
|
+
last_timestamp = default.select(F.max(self.time_col)).collect()[0][0]
|
|
1198
|
+
sf_input = SPFrame.from_raw(default)
|
|
1199
|
+
result = SPFrame()
|
|
1200
|
+
return self.derive(sf_input, result, last_timestamp)
|
|
1240
1201
|
|
|
1241
|
-
|
|
1202
|
+
def derive(self, sf_input: SFrame, result: SFrame, last_timestamp):
|
|
1242
1203
|
intervals = {
|
|
1243
1204
|
label: last_timestamp - timedelta(minutes=minutes)
|
|
1244
|
-
for label, minutes in self.
|
|
1205
|
+
for label, minutes in self.durations
|
|
1245
1206
|
}
|
|
1246
1207
|
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
if metrics.data.empty:
|
|
1208
|
+
for idx, (label, start) in enumerate(intervals.items()):
|
|
1209
|
+
metrics = self.calculate_metrics(sf_input, label, start)
|
|
1210
|
+
if self._is_empty(metrics):
|
|
1251
1211
|
continue
|
|
1252
1212
|
metrics = metrics.make_group()
|
|
1253
|
-
metrics[
|
|
1213
|
+
metrics["other"] = metrics["default"]
|
|
1214
|
+
metrics["default"] = result
|
|
1254
1215
|
result = (
|
|
1255
|
-
metrics["
|
|
1256
|
-
if result
|
|
1216
|
+
metrics["other"]
|
|
1217
|
+
if self._is_empty(result)
|
|
1257
1218
|
else self.merger(metrics)["default"]
|
|
1258
1219
|
)
|
|
1259
1220
|
|
|
1260
|
-
return {
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
self, default: PySparkDataFrame, label, start
|
|
1264
|
-
) -> PySparkDataFrame:
|
|
1265
|
-
"""
|
|
1266
|
-
Calculate metrics for a specific time window using PySpark DataFrame.
|
|
1267
|
-
|
|
1268
|
-
Parameters
|
|
1269
|
-
----------
|
|
1270
|
-
default : PySparkDataFrame
|
|
1271
|
-
The input PySpark DataFrame containing the data.
|
|
1272
|
-
label : str
|
|
1273
|
-
The label for the time window.
|
|
1274
|
-
start : datetime
|
|
1275
|
-
The start timestamp for the time window.
|
|
1276
|
-
|
|
1277
|
-
Returns
|
|
1278
|
-
-------
|
|
1279
|
-
PySparkDataFrame
|
|
1280
|
-
A PySpark DataFrame containing the calculated metrics for the time window.
|
|
1281
|
-
Returns an empty DataFrame if no data is available for the time window.
|
|
1282
|
-
"""
|
|
1283
|
-
filtered = default.filter(F.col(self.time_col) >= start)
|
|
1284
|
-
|
|
1285
|
-
if filtered.count() == 0:
|
|
1286
|
-
return SPFrame.get_spark().createDataFrame([], default.schema)
|
|
1287
|
-
|
|
1288
|
-
filtered = filtered.withColumn(self.label_col, F.lit(label))
|
|
1289
|
-
return self.transformer(SPFrame(filtered), label=label, start=start).data
|
|
1290
|
-
|
|
1291
|
-
def derive_spf(self, default: PySparkDataFrame, *args, **kwargs):
|
|
1292
|
-
"""
|
|
1293
|
-
Derive time-based features from the input PySpark DataFrame.
|
|
1294
|
-
|
|
1295
|
-
This method:
|
|
1296
|
-
1. Validates that the time column contains timestamp data
|
|
1297
|
-
2. Finds the latest timestamp in the data
|
|
1298
|
-
3. Creates time windows based on the specified durations
|
|
1299
|
-
4. Calculates metrics for each time window
|
|
1300
|
-
5. Merges the results using the provided merger
|
|
1301
|
-
|
|
1302
|
-
Parameters
|
|
1303
|
-
----------
|
|
1304
|
-
default : PySparkDataFrame
|
|
1305
|
-
The input PySpark DataFrame containing the data.
|
|
1306
|
-
*args : tuple
|
|
1307
|
-
Variable length argument list.
|
|
1308
|
-
**kwargs : dict
|
|
1309
|
-
Arbitrary keyword arguments.
|
|
1310
|
-
|
|
1311
|
-
Returns
|
|
1312
|
-
-------
|
|
1313
|
-
dict
|
|
1314
|
-
A dictionary with two keys:
|
|
1315
|
-
- 'default': The original input DataFrame
|
|
1316
|
-
- 'timeseries_feature': The derived time-based features
|
|
1317
|
-
"""
|
|
1318
|
-
default = TimeStampColumnValidator().validate(default, self.time_col)
|
|
1319
|
-
|
|
1320
|
-
last_timestamp_row = default.agg(F.max(self.time_col)).collect()[0][0]
|
|
1321
|
-
|
|
1322
|
-
intervals = {
|
|
1323
|
-
label: last_timestamp_row - timedelta(minutes=minutes)
|
|
1324
|
-
for label, minutes in self.periods
|
|
1221
|
+
return {
|
|
1222
|
+
"default": sf_input[self.default_sf_key].to_raw(),
|
|
1223
|
+
"timeseries_feature": result.to_raw(),
|
|
1325
1224
|
}
|
|
1326
1225
|
|
|
1327
|
-
result = None
|
|
1328
|
-
|
|
1329
|
-
for label, start in intervals.items():
|
|
1330
|
-
metrics = self.calculate_metrics_spf(default, label, start)
|
|
1331
|
-
|
|
1332
|
-
if metrics.count() == 0:
|
|
1333
|
-
continue
|
|
1334
|
-
|
|
1335
|
-
metrics_group = {self.merger.group_keys["default"]: metrics}
|
|
1336
|
-
|
|
1337
|
-
if result is not None:
|
|
1338
|
-
metrics_group[self.merger.group_keys["other"]] = result
|
|
1339
|
-
result = self.merger(SPFrame(metrics_group))["default"]
|
|
1340
|
-
else:
|
|
1341
|
-
result = metrics
|
|
1342
|
-
|
|
1343
|
-
if result is not None:
|
|
1344
|
-
return {"default": default, "timeseries_feature": result}
|
|
1345
|
-
else:
|
|
1346
|
-
return {"default": default}
|
|
1347
|
-
|
|
1348
1226
|
|
|
1349
1227
|
class StaticValueColumnAdder(SFrameDeriver):
|
|
1350
1228
|
"""
|
|
@@ -1714,9 +1592,11 @@ class ComprehensiveFeaturesDeriver(SFrameDeriver):
|
|
|
1714
1592
|
last_price=("TOKEN_PRICE", "last"),
|
|
1715
1593
|
transaction_count_hourly=(
|
|
1716
1594
|
"BLOCK_TIMESTAMP",
|
|
1717
|
-
lambda x:
|
|
1718
|
-
|
|
1719
|
-
|
|
1595
|
+
lambda x: (
|
|
1596
|
+
len(x) / ((x.max() - x.min()).total_seconds() / 3600)
|
|
1597
|
+
if (x.max() > x.min())
|
|
1598
|
+
else len(x)
|
|
1599
|
+
),
|
|
1720
1600
|
),
|
|
1721
1601
|
)
|
|
1722
1602
|
.reset_index()
|