sdk-seshat-python 0.3.16__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/PKG-INFO +3 -2
  2. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/pyproject.toml +5 -4
  3. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/__init__.py +7 -1
  4. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/data_class/pandas.py +2 -3
  5. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/data_class/pyspark.py +6 -3
  6. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/feature_view/base.py +7 -2
  7. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/command/submit_to_network.py +2 -1
  8. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/local/base.py +9 -4
  9. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/saver/database.py +12 -1
  10. sdk_seshat_python-0.4.2/seshat/transformer/deriver/__init__.py +25 -0
  11. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/deriver/base.py +90 -210
  12. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/pseudo/__init__.py +1 -0
  13. sdk_seshat_python-0.4.2/seshat/transformer/pseudo/action_gate.py +119 -0
  14. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/pseudo/table_existence.py +3 -11
  15. sdk_seshat_python-0.4.2/seshat/transformer/reducer/base.py +469 -0
  16. sdk_seshat_python-0.3.16/seshat/transformer/deriver/__init__.py +0 -9
  17. sdk_seshat_python-0.3.16/seshat/transformer/reducer/base.py +0 -562
  18. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/LICENSE +0 -0
  19. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/README.md +0 -0
  20. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/__main__.py +0 -0
  21. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/data_class/__init__.py +0 -0
  22. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/data_class/base.py +0 -0
  23. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/__init__.py +0 -0
  24. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/base.py +0 -0
  25. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/__init__.py +0 -0
  26. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/base.py +0 -0
  27. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/general/__init__.py +0 -0
  28. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/general/classification.py +0 -0
  29. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/general/clustering.py +0 -0
  30. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/general/regression.py +0 -0
  31. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/recommendation/__init__.py +0 -0
  32. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/recommendation/diversity.py +0 -0
  33. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/recommendation/ranking.py +0 -0
  34. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/feature_view/__init__.py +0 -0
  35. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/__init__.py +0 -0
  36. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/command/__init__.py +0 -0
  37. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/command/base.py +0 -0
  38. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/command/code_inspect.py +0 -0
  39. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/command/job_status.py +0 -0
  40. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/command/setup_project.py +0 -0
  41. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/config.py +0 -0
  42. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/exceptions.py +0 -0
  43. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/lazy_config.py +0 -0
  44. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/models.py +0 -0
  45. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/template/README.md-tmpl +0 -0
  46. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/template/config.py-tmpl +0 -0
  47. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/template/env-templ +0 -0
  48. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/template/jobignore-tmpl +0 -0
  49. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/template/pyproject._toml-tmpl +0 -0
  50. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/template/recommender-jupyter.ipynb-tmpl +0 -0
  51. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/general/template/recommender.py-tmpl +0 -0
  52. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/profiler/__init__.py +0 -0
  53. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/profiler/base.py +0 -0
  54. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/profiler/decorator.py +0 -0
  55. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/profiler/format.py +0 -0
  56. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/__init__.py +0 -0
  57. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/base.py +0 -0
  58. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/database/__init__.py +0 -0
  59. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/database/base.py +0 -0
  60. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/exceptions.py +0 -0
  61. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/flip_side/__init__.py +0 -0
  62. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/flip_side/base.py +0 -0
  63. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/local/__init__.py +0 -0
  64. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/mixins.py +0 -0
  65. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/multisource/__init__.py +0 -0
  66. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/multisource/base.py +0 -0
  67. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/saver/__init__.py +0 -0
  68. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/saver/base.py +0 -0
  69. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/saver/utils/__init__.py +0 -0
  70. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/source/saver/utils/postgres.py +0 -0
  71. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/__init__.py +0 -0
  72. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/aggregator/__init__.py +0 -0
  73. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/aggregator/base.py +0 -0
  74. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/augmenter/__init__.py +0 -0
  75. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/augmenter/base.py +0 -0
  76. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/base.py +0 -0
  77. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/deriver/from_database.py +0 -0
  78. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/imputer/__init__.py +0 -0
  79. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/imputer/base.py +0 -0
  80. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/merger/__init__.py +0 -0
  81. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/merger/base.py +0 -0
  82. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/merger/nested_key.py +0 -0
  83. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/pipeline/__init__.py +0 -0
  84. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/pipeline/base.py +0 -0
  85. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/pipeline/branch.py +0 -0
  86. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/pipeline/recommendation/__init__.py +0 -0
  87. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/pipeline/recommendation/address_pipeline.py +0 -0
  88. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/reducer/__init__.py +0 -0
  89. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/scaler/__init__.py +0 -0
  90. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/scaler/base.py +0 -0
  91. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/schema/__init__.py +0 -0
  92. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/schema/base.py +0 -0
  93. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/__init__.py +0 -0
  94. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/base.py +0 -0
  95. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/block/__init__.py +0 -0
  96. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/block/base.py +0 -0
  97. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/random/__init__.py +0 -0
  98. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/random/base.py +0 -0
  99. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/time_line/__init__.py +0 -0
  100. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/time_line/base.py +0 -0
  101. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/trimmer/__init__.py +0 -0
  102. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/trimmer/base.py +0 -0
  103. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/vectorizer/__init__.py +0 -0
  104. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/vectorizer/base.py +0 -0
  105. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/vectorizer/cosine_similarity.py +0 -0
  106. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/vectorizer/pivot.py +0 -0
  107. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/transformer/vectorizer/utils.py +0 -0
  108. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/__init__.py +0 -0
  109. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/batcher.py +0 -0
  110. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/binary_utils.py +0 -0
  111. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/clean_json.py +0 -0
  112. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/col_to_list.py +0 -0
  113. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/contracts.py +0 -0
  114. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/file.py +0 -0
  115. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/file_cryptography.py +0 -0
  116. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/filter_json.py +0 -0
  117. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/grouper.py +0 -0
  118. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/jobignore.py +0 -0
  119. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/join_columns_to_list.py +0 -0
  120. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/join_str.py +0 -0
  121. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/llm_client/__init__.py +0 -0
  122. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/llm_client/chatbot_factory.py +0 -0
  123. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/logging/__init__.py +0 -0
  124. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/logging/base_logger.py +0 -0
  125. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/logging/console_logger.py +0 -0
  126. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/logging/logstash_logger.py +0 -0
  127. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/logging/multi_logger.py +0 -0
  128. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/memory.py +0 -0
  129. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/mixin.py +0 -0
  130. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/obfuscate.py +0 -0
  131. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/package_utils.py +0 -0
  132. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/pandas_func.py +0 -0
  133. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/patching.py +0 -0
  134. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/pyspark_func.py +0 -0
  135. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/rest.py +0 -0
  136. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/singleton.py +0 -0
  137. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/validation.py +0 -0
  138. {sdk_seshat_python-0.3.16 → sdk_seshat_python-0.4.2}/seshat/utils/zip_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sdk-seshat-python
3
- Version: 0.3.16
3
+ Version: 0.4.2
4
4
  Summary: Seshat python SDK is a library to help create ML data pipelines.
5
5
  License: Commercial - see LICENSE.txt
6
6
  Author: SeshatLabs
@@ -25,7 +25,7 @@ Requires-Dist: loguru (>=0.7.3,<0.8.0)
25
25
  Requires-Dist: memory-profiler (>=0.61.0,<0.62.0)
26
26
  Requires-Dist: openai (>=1.73.0,<2.0.0)
27
27
  Requires-Dist: pandas (>=2.2.1,<3.0.0)
28
- Requires-Dist: psycopg2 (>=2.9,<3.0) ; extra == "postgres-support"
28
+ Requires-Dist: psycopg2-binary (>=2.9,<3.0) ; extra == "postgres-support"
29
29
  Requires-Dist: pyarmor (>=8.5.1,<9.0.0)
30
30
  Requires-Dist: pydantic (>=2.7.4,<3.0.0)
31
31
  Requires-Dist: pyspark (>=3.5.1,<4.0.0)
@@ -33,6 +33,7 @@ Requires-Dist: python-logstash-async (>=4.0.2,<5.0.0)
33
33
  Requires-Dist: requests (==2.32.0)
34
34
  Requires-Dist: rich (>=13.9.4,<14.0.0)
35
35
  Requires-Dist: scikit-learn (>=1.4.1.post1,<2.0.0)
36
+ Requires-Dist: setuptools (>=80.9.0,<81.0.0)
36
37
  Requires-Dist: sqlalchemy (>=2.0.29,<3.0.0)
37
38
  Requires-Dist: toml (>=0.10.2,<0.11.0)
38
39
  Requires-Dist: typer (>=0.12.3,<0.13.0)
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "sdk-seshat-python"
3
- version = "0.3.16"
3
+ version = "0.4.2"
4
4
  description = "Seshat python SDK is a library to help create ML data pipelines."
5
5
  authors = ["SeshatLabs <info@seshatlabs.xyz>"]
6
6
  packages = [{ include = "seshat", from = "." }]
@@ -32,13 +32,14 @@ langchain = "^0.3.23"
32
32
  langchain-community = "^0.3.21"
33
33
  langchain-openai = "^0.3.12"
34
34
  pyarmor = "^8.5.1"
35
- croniter = "^6.0.0"
36
- psycopg2 = { version = "^2.9", optional = true }
37
35
  python-logstash-async = "^4.0.2"
36
+ croniter = "^6.0.0"
37
+ psycopg2-binary = { version = "^2.9", optional = true }
38
+ setuptools = "^80.9.0"
38
39
 
39
40
  [tool.poetry.extras]
40
41
  flipside_support = ["flipside"]
41
- postgres_support = ["psycopg2"]
42
+ postgres_support = ["psycopg2-binary"]
42
43
 
43
44
  [tool.poetry.group.dev.dependencies]
44
45
  flake8 = "^7.0.0"
@@ -20,6 +20,7 @@ from seshat.general.exceptions import NoConfigSetError, RestClientException
20
20
  app = typer.Typer()
21
21
  console = Console()
22
22
  DEFAULT_DATA_SIZE = 1_000_000 # 1 GB
23
+ DEFAULT_EXPIRATION = 86400
23
24
 
24
25
  state = {"verbose": False}
25
26
 
@@ -141,7 +142,12 @@ def submit_job(
141
142
  identifier = manager.store_code(package)
142
143
 
143
144
  job_response = manager.submit_job(
144
- identifier, name, version, executor_image_tag, job_metadata
145
+ identifier,
146
+ name,
147
+ version,
148
+ executor_image_tag,
149
+ job_metadata,
150
+ expiration=config.get("aws", {}).get("expiration", DEFAULT_EXPIRATION),
145
151
  )
146
152
  job_response_data = job_response.get("data", {})
147
153
 
@@ -1,10 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Iterable, List, Dict
3
+ from typing import Dict, Iterable, List
4
4
 
5
5
  import pandas as pd
6
6
  from pandas import DataFrame
7
- from pyspark.sql import SparkSession
8
7
 
9
8
  from seshat.data_class import SFrame
10
9
  from seshat.data_class.base import GroupSFrame
@@ -33,7 +32,7 @@ class DFrame(SFrame):
33
32
  def to_spf(self) -> SFrame:
34
33
  from seshat.data_class import SPFrame
35
34
 
36
- spark = SparkSession.builder.appName(configs.SPARK_APP_NAME).getOrCreate()
35
+ spark = SPFrame.get_spark()
37
36
  return SPFrame.from_raw(spark.createDataFrame(self.data))
38
37
 
39
38
  def extend_vertically(self, other: DataFrame):
@@ -1,9 +1,10 @@
1
- from typing import Iterable, List, Dict
1
+ from typing import Dict, Iterable, List
2
2
 
3
3
  from pandas import DataFrame
4
- from pyspark.sql import DataFrame as PySparkDataFrame, SparkSession
4
+ from pyspark.sql import DataFrame as PySparkDataFrame
5
+ from pyspark.sql import SparkSession
5
6
 
6
- from seshat.data_class import SFrame, DFrame
7
+ from seshat.data_class import DFrame, SFrame
7
8
  from seshat.data_class.base import GroupSFrame
8
9
  from seshat.general import configs
9
10
 
@@ -56,6 +57,8 @@ class SPFrame(SFrame):
56
57
  def from_raw(cls, data, *args, **kwargs) -> "SPFrame":
57
58
  if isinstance(data, DataFrame):
58
59
  data = DFrame.from_raw(data).convert(cls).to_raw()
60
+ elif not isinstance(data, PySparkDataFrame) and data:
61
+ data = cls.get_spark().createDataFrame(data or [])
59
62
  return cls(data)
60
63
 
61
64
  @staticmethod
@@ -1,5 +1,5 @@
1
1
  import logging
2
- from typing import Dict, Callable
2
+ from typing import Callable, Dict, Optional
3
3
 
4
4
  from seshat.data_class import SFrame
5
5
  from seshat.evaluation.base import Evaluation
@@ -7,6 +7,7 @@ from seshat.profiler import ProfileConfig
7
7
  from seshat.profiler.base import profiler
8
8
  from seshat.source import Source
9
9
  from seshat.source.saver import Saver
10
+ from seshat.transformer.base import Transformer
10
11
  from seshat.transformer.pipeline import Pipeline
11
12
  from seshat.transformer.splitter import Splitter
12
13
 
@@ -36,7 +37,8 @@ class FeatureView:
36
37
  saver : Saver, optional
37
38
  An optional component responsible for saving the processed data during training
38
39
  Required only in offline mode.
39
-
40
+ on_save_finished : Transformer, optional
41
+ A Transformer to be called after the save operation completes.
40
42
  Examples
41
43
  --------
42
44
  Define feature view:
@@ -81,6 +83,7 @@ class FeatureView:
81
83
  saver: Saver = None
82
84
  profile_config = ProfileConfig(logging.INFO, default_tracking=True)
83
85
  evaluation: Evaluation
86
+ on_save_finished: Optional[Transformer] = None
84
87
 
85
88
  def __call__(self, *args, **kwargs):
86
89
  source = self._get_source()
@@ -143,6 +146,8 @@ class FeatureView:
143
146
  if hasattr(self, "splitter")
144
147
  else self.saver(self.data)
145
148
  )
149
+ if self.on_save_finished is not None:
150
+ self.on_save_finished(self.data)
146
151
 
147
152
  def _split(self, *args, **kwargs):
148
153
  self.split_data = self.splitter(self.data, *args, **kwargs)
@@ -313,6 +313,7 @@ class SubmitCommand(BaseTyperCommand):
313
313
  version: str,
314
314
  executor_image_tag: str,
315
315
  metadata: JobMetadata,
316
+ expiration=86400,
316
317
  ) -> dict:
317
318
  """Submit job to API after successful upload"""
318
319
  if not self.job_config.base_url or not self.job_config.auth_token:
@@ -320,7 +321,7 @@ class SubmitCommand(BaseTyperCommand):
320
321
  "API configuration missing. Please set base_url and auth_token"
321
322
  )
322
323
 
323
- presigned_url = self.backend.generate_presigned_url(s3_key)
324
+ presigned_url = self.backend.generate_presigned_url(s3_key, expiration)
324
325
  executor_label = self.config.get("executor", {}).get("label")
325
326
 
326
327
  payload = {
@@ -5,24 +5,29 @@ from seshat.source import Source
5
5
 
6
6
  class LocalSource(Source):
7
7
  """
8
- LocalSource is a source that can read data from csv file.
8
+ LocalSource is a data source that can read from a local file or an in-memory source.
9
9
  """
10
10
 
11
11
  def __init__(
12
12
  self,
13
- path,
13
+ data_source,
14
14
  query=None,
15
15
  schema=None,
16
16
  mode=configs.DEFAULT_MODE,
17
17
  ):
18
18
  super().__init__(query, schema, mode)
19
- self.path = path
19
+ self.data_source = data_source
20
20
 
21
21
  def convert_data_type(self, data) -> SFrame:
22
22
  return self.data_class.from_raw(data)
23
23
 
24
24
  def fetch(self) -> SFrame:
25
- d = self.data_class.read_csv(path=self.path)
25
+ d = (
26
+ self.data_class.read_csv(path=self.data_source)
27
+ if isinstance(self.data_source, str)
28
+ else self.data_source
29
+ )
30
+
26
31
  return self.convert_data_type(d)
27
32
 
28
33
  def calculate_complexity(self):
@@ -35,7 +35,13 @@ class SQLDBSaver(SQLMixin, Saver):
35
35
  self.create_index(config)
36
36
 
37
37
  selected_sf = sf.get(config.sf_key)
38
- selected_sf = self.drop_nan_ids(selected_sf, config.schema)
38
+ has_id = False
39
+ for col in config.schema.cols:
40
+ if col.is_id:
41
+ has_id = True
42
+ break
43
+ if has_id:
44
+ selected_sf = self.drop_nan_ids(selected_sf, config.schema)
39
45
 
40
46
  if config.clear_table:
41
47
  self.delete(config.table)
@@ -65,6 +71,11 @@ class SQLDBSaver(SQLMixin, Saver):
65
71
  table, _ = self.get_table(table_name, autoload=True)
66
72
  self.write_on_db(table.delete())
67
73
 
74
+ def drop_table(self, table_name):
75
+ if table_name in inspect(self.get_engine()).get_table_names():
76
+ table, _ = self.get_table(table_name, autoload=True)
77
+ table.drop(self.get_engine())
78
+
68
79
  def insert(self, selected_sf: SFrame, config: SaveConfig):
69
80
  values = self.prepare_sf_to_insert(selected_sf, config).to_dict()
70
81
  table, _ = self.get_table(config.table, autoload=True)
@@ -0,0 +1,25 @@
1
+ from .base import (
2
+ FeatureForAddressDeriver,
3
+ InteractedSymbolsToSentenceDeriver,
4
+ OperationOnColsDeriver,
5
+ PercentileTransactionValueDeriver,
6
+ SFrameFromColsDeriver,
7
+ StaticValueColumnAdder,
8
+ TimeWindowTransformer,
9
+ FractionDeriver,
10
+ ProfitLossDeriver,
11
+ DateTimeTypeDeriver,
12
+ ComprehensiveFeaturesDeriver,
13
+ GroupByDeriverCount,
14
+ GroupByDeriverMeanMax,
15
+ ChangingOverTimeDeriver,
16
+ TokenLastPriceDeriver,
17
+ GroupByTimeWindowDeriver,
18
+ OneColumnPercentileFilterDeriver,
19
+ SenderReceiverTokensDeriver,
20
+ TokenPriceDeriver,
21
+ TokenSwapTradeDeriver,
22
+ TokenFeatureTransformationDeriver,
23
+ )
24
+
25
+ from .from_database import FromSQLDBDeriver
@@ -1,12 +1,13 @@
1
1
  from datetime import timedelta
2
- from typing import Dict, Callable, List, Tuple, Any
2
+ from typing import Any, Callable, Dict, List, Literal, Tuple
3
3
 
4
4
  import pandas as pd
5
5
  from pandas import DataFrame
6
- from pyspark.sql import DataFrame as PySparkDataFrame, Window
6
+ from pyspark.sql import DataFrame as PySparkDataFrame
7
+ from pyspark.sql import Window
7
8
  from pyspark.sql import functions as F
8
- from pyspark.sql.functions import array_distinct, array_union, coalesce, array, lit
9
- from pyspark.sql.types import IntegerType, StructType, StructField
9
+ from pyspark.sql.functions import array, array_distinct, array_union, coalesce, lit
10
+ from pyspark.sql.types import IntegerType, StructField, StructType
10
11
 
11
12
  from seshat.data_class import SFrame, SPFrame
12
13
  from seshat.data_class.base import GroupSFrame
@@ -15,9 +16,9 @@ from seshat.general import configs
15
16
  from seshat.general.exceptions import InvalidArgumentsError
16
17
  from seshat.transformer import Transformer
17
18
  from seshat.transformer.merger import Merger
18
- from seshat.transformer.schema import Schema, Col
19
- from seshat.utils import pandas_func
20
- from seshat.utils import pyspark_func
19
+ from seshat.transformer.schema import Col, Schema
20
+ from seshat.transformer.trimmer.base import FilterTrimmer
21
+ from seshat.utils import pandas_func, pyspark_func
21
22
  from seshat.utils.validation import NumericColumnValidator, TimeStampColumnValidator
22
23
 
23
24
  SYMBOLS_RECEIVED_COL = "symbols_received"
@@ -1091,54 +1092,32 @@ class GroupByDeriverMeanMax(SFrameDeriver):
1091
1092
 
1092
1093
  class TimeWindowTransformer(SFrameDeriver):
1093
1094
  """
1094
- Applies transformations to data within specified time windows and merges the results.
1095
+ A class that derives time series-based features from a default SFrame using a transformer and merger strategy.
1096
+ It supports applying transformations over multiple durations and combines the results using a merger.
1095
1097
 
1096
- This transformer filters data based on time windows defined by durations, applies a
1097
- specified transformer to each filtered dataset, and merges the results using a provided merger.
1098
- It's particularly useful for generating time-based features or analyzing data across
1099
- different time periods.
1098
+ Duration labeling can be handled in two ways:
1099
+ 1. By adding a new column with the name specified in `label_col`, where each row is labeled with its corresponding duration.
1100
+ 2. By appending the duration label as a suffix to the names of the new columns created by the transformer, excluding those required for merging.
1101
+
1102
+ This behavior is controlled by the `specify_duration_by` parameter.
1100
1103
 
1101
1104
  Parameters
1102
1105
  ----------
1103
- transformer : Transformer
1104
- The transformer to apply to each time window's data.
1106
+ transformer : Transformer or Callable
1107
+ An instance of a transformer or a callable function that performs the transformation on the input data.
1105
1108
  merger : Merger
1106
- The merger used to combine results from different time windows.
1107
- Will be set to work in-place.
1109
+ An instance of a merger used to combine features generated across different durations.
1108
1110
  time_col : str, optional
1109
- The column name containing timestamp data.
1110
- Default is configs.BLOCK_TIMESTAMP_COL.
1111
- durations : List[Tuple[str, int]], optional
1112
- List of tuples where each tuple contains a label and a duration in minutes.
1113
- The label identifies the time window and the duration specifies how far back
1114
- from the latest timestamp to include data.
1111
+ The name of the timestamp column in the SFrame. Defaults to `configs.BLOCK_TIMESTAMP_COL`.
1112
+ durations : List[Tuple[str, int]]
1113
+ A list of (label, duration) pairs where each label identifies a time window (in minutes) for feature extraction.
1115
1114
  label_col : str, optional
1116
- The column name to store the time window label.
1117
- Default is configs.DURATION_LABEL_COL.
1118
- group_keys : dict, optional
1119
- Keys used to identify and retrieve data from a grouped SFrame.
1120
-
1121
- Attributes
1122
- ----------
1123
- time_col : str
1124
- The column name containing timestamp data.
1125
- transformer : Transformer
1126
- The transformer applied to each time window's data.
1127
- label_col : str
1128
- The column name storing the time window label.
1129
- merger : Merger
1130
- The merger used to combine results from different time windows.
1131
- periods : List[Tuple[str, int]]
1132
- The list of time windows with their labels and durations.
1133
-
1134
- Notes
1135
- -----
1136
- The transformer processes data by:
1137
- 1. Finding the latest timestamp in the data
1138
- 2. Creating time windows based on the specified durations
1139
- 3. Filtering data for each time window
1140
- 4. Applying the transformer to each filtered dataset
1141
- 5. Merging the results using the provided merger
1115
+ The name of the column that will store the duration label for each record. Defaults to `configs.DURATION_LABEL_COL`.
1116
+ specify_duration_by : {'column', 'suffix'}, default 'column'
1117
+ Specifies how duration information is represented in the output:
1118
+ * 'column' – Adds a new column (named by `label_col`) containing the duration label.
1119
+ * 'suffix' – Appends the duration label as a suffix to the names of columns created by the transformer,
1120
+ excluding those required by the merger.
1142
1121
  """
1143
1122
 
1144
1123
  DEFAULT_GROUP_KEYS = {
@@ -1148,11 +1127,12 @@ class TimeWindowTransformer(SFrameDeriver):
1148
1127
 
1149
1128
  def __init__(
1150
1129
  self,
1151
- transformer: Transformer,
1130
+ transformer: Transformer | Callable,
1152
1131
  merger: Merger,
1132
+ durations: List[Tuple[str, int]],
1153
1133
  time_col=configs.BLOCK_TIMESTAMP_COL,
1154
- durations: List[Tuple[str, int]] = None,
1155
1134
  label_col=configs.DURATION_LABEL_COL,
1135
+ specify_duration_by: Literal["column", "suffix"] = "column",
1156
1136
  group_keys=None,
1157
1137
  ):
1158
1138
  super().__init__(group_keys)
@@ -1160,191 +1140,89 @@ class TimeWindowTransformer(SFrameDeriver):
1160
1140
  self.transformer = transformer
1161
1141
  self.label_col = label_col
1162
1142
  self.merger = merger
1163
- self.periods = durations
1143
+ self.specify_duration_by = specify_duration_by
1144
+ self.durations = durations
1164
1145
 
1165
- # Ensure that merger work inplace
1146
+ # Ensure that the merger works in place and that its group keys are set to their default values.
1166
1147
  self.merger.inplace = True
1148
+ self.merger.group_keys = {"default": "default", "other": "other"}
1149
+ if self.specify_duration_by == "column":
1150
+ self.merger.axis = 0
1167
1151
 
1168
1152
  def validate(self, sf: SFrame):
1169
- """
1170
- Validate that the input SFrame contains the required columns.
1171
-
1172
- Parameters
1173
- ----------
1174
- sf : SFrame
1175
- The input SFrame to validate.
1176
-
1177
- Raises
1178
- ------
1179
- ColDoesNotExistError
1180
- If the required time column is not present in the SFrame.
1181
- """
1182
1153
  super().validate(sf)
1183
1154
  self._validate_columns(sf, self.group_keys["default"], self.time_col)
1184
1155
 
1185
- def calculate_metrics_df(self, default: DataFrame, label, start) -> SFrame:
1186
- """
1187
- Calculate metrics for a specific time window.
1156
+ def _is_empty(self, sf: SFrame) -> bool:
1157
+ raw = sf.to_raw()
1158
+ if raw is None:
1159
+ return True
1160
+ elif hasattr(raw, "rdd"):
1161
+ return raw.rdd.isEmpty()
1162
+ else:
1163
+ return len(raw) == 0
1188
1164
 
1189
- Parameters
1190
- ----------
1191
- default : DataFrame
1192
- The input DataFrame containing the data.
1193
- label : str
1194
- The label for the time window.
1195
- start : datetime
1196
- The start timestamp for the time window.
1165
+ def calculate_metrics(self, sf: SFrame, label, start) -> SFrame:
1166
+ trimmer = FilterTrimmer(op=">=", col=self.time_col, value=start)
1167
+ filtered = trimmer(sf)
1197
1168
 
1198
- Returns
1199
- -------
1200
- SFrame
1201
- An SFrame containing the calculated metrics for the time window.
1202
- Returns an empty DFrame if no data is available for the time window.
1203
- """
1204
- filtered = default[default[self.time_col] >= start]
1169
+ if self._is_empty(filtered):
1170
+ return filtered
1205
1171
 
1206
- if len(filtered) == 0:
1207
- return DFrame(DataFrame())
1172
+ result = self.transformer(sf_input=filtered, label=label, start=start)
1173
+ if self.specify_duration_by == "column":
1174
+ deriver = StaticValueColumnAdder(col_name=self.label_col, value=label)
1175
+ result = deriver(result)
1208
1176
 
1209
- filtered.loc[:, self.label_col] = label
1210
- return self.transformer(DFrame(filtered), label=label, start=start)
1177
+ else:
1178
+ # Find columns excluding those used by
1179
+ # the merger for merging.
1180
+ cols = set(result[self.transformer].get_columns()) - set(self.merger.on)
1181
+ schema = Schema(
1182
+ cols=[Col(col, to=f"{col}_{label}") for col in cols],
1183
+ exclusive=False,
1184
+ )
1185
+ result = schema(result)
1186
+ return result
1211
1187
 
1212
1188
  def derive_df(self, default: DataFrame, *args, **kwargs):
1213
- """
1214
- Derive time-based features from the input DataFrame.
1215
-
1216
- This method:
1217
- 1. Validates that the time column contains timestamp data
1218
- 2. Finds the latest timestamp in the data
1219
- 3. Creates time windows based on the specified durations
1220
- 4. Calculates metrics for each time window
1221
- 5. Merges the results using the provided merger
1222
-
1223
- Parameters
1224
- ----------
1225
- default : DataFrame
1226
- The input DataFrame containing the data.
1227
- *args : tuple
1228
- Variable length argument list.
1229
- **kwargs : dict
1230
- Arbitrary keyword arguments.
1189
+ default = TimeStampColumnValidator().validate(default, self.time_col)
1190
+ last_timestamp = default[self.time_col].max()
1191
+ sf_input = DFrame.from_raw(default)
1192
+ result = DFrame()
1193
+ return self.derive(sf_input, result, last_timestamp)
1231
1194
 
1232
- Returns
1233
- -------
1234
- dict
1235
- A dictionary with two keys:
1236
- - 'default': The original input DataFrame
1237
- - 'timeseries_feature': The derived time-based features
1238
- """
1195
+ def derive_spf(self, default: PySparkDataFrame, *args, **kwargs):
1239
1196
  default = TimeStampColumnValidator().validate(default, self.time_col)
1197
+ last_timestamp = default.select(F.max(self.time_col)).collect()[0][0]
1198
+ sf_input = SPFrame.from_raw(default)
1199
+ result = SPFrame()
1200
+ return self.derive(sf_input, result, last_timestamp)
1240
1201
 
1241
- last_timestamp = default[self.time_col].max()
1202
+ def derive(self, sf_input: SFrame, result: SFrame, last_timestamp):
1242
1203
  intervals = {
1243
1204
  label: last_timestamp - timedelta(minutes=minutes)
1244
- for label, minutes in self.periods
1205
+ for label, minutes in self.durations
1245
1206
  }
1246
1207
 
1247
- result = DFrame(DataFrame())
1248
- for label, start in intervals.items():
1249
- metrics = self.calculate_metrics_df(default, label, start)
1250
- if metrics.data.empty:
1208
+ for idx, (label, start) in enumerate(intervals.items()):
1209
+ metrics = self.calculate_metrics(sf_input, label, start)
1210
+ if self._is_empty(metrics):
1251
1211
  continue
1252
1212
  metrics = metrics.make_group()
1253
- metrics[self.merger.group_keys["other"]] = result
1213
+ metrics["other"] = metrics["default"]
1214
+ metrics["default"] = result
1254
1215
  result = (
1255
- metrics["default"]
1256
- if result.data.empty
1216
+ metrics["other"]
1217
+ if self._is_empty(result)
1257
1218
  else self.merger(metrics)["default"]
1258
1219
  )
1259
1220
 
1260
- return {"default": default, "timeseries_feature": result.to_raw()}
1261
-
1262
- def calculate_metrics_spf(
1263
- self, default: PySparkDataFrame, label, start
1264
- ) -> PySparkDataFrame:
1265
- """
1266
- Calculate metrics for a specific time window using PySpark DataFrame.
1267
-
1268
- Parameters
1269
- ----------
1270
- default : PySparkDataFrame
1271
- The input PySpark DataFrame containing the data.
1272
- label : str
1273
- The label for the time window.
1274
- start : datetime
1275
- The start timestamp for the time window.
1276
-
1277
- Returns
1278
- -------
1279
- PySparkDataFrame
1280
- A PySpark DataFrame containing the calculated metrics for the time window.
1281
- Returns an empty DataFrame if no data is available for the time window.
1282
- """
1283
- filtered = default.filter(F.col(self.time_col) >= start)
1284
-
1285
- if filtered.count() == 0:
1286
- return SPFrame.get_spark().createDataFrame([], default.schema)
1287
-
1288
- filtered = filtered.withColumn(self.label_col, F.lit(label))
1289
- return self.transformer(SPFrame(filtered), label=label, start=start).data
1290
-
1291
- def derive_spf(self, default: PySparkDataFrame, *args, **kwargs):
1292
- """
1293
- Derive time-based features from the input PySpark DataFrame.
1294
-
1295
- This method:
1296
- 1. Validates that the time column contains timestamp data
1297
- 2. Finds the latest timestamp in the data
1298
- 3. Creates time windows based on the specified durations
1299
- 4. Calculates metrics for each time window
1300
- 5. Merges the results using the provided merger
1301
-
1302
- Parameters
1303
- ----------
1304
- default : PySparkDataFrame
1305
- The input PySpark DataFrame containing the data.
1306
- *args : tuple
1307
- Variable length argument list.
1308
- **kwargs : dict
1309
- Arbitrary keyword arguments.
1310
-
1311
- Returns
1312
- -------
1313
- dict
1314
- A dictionary with two keys:
1315
- - 'default': The original input DataFrame
1316
- - 'timeseries_feature': The derived time-based features
1317
- """
1318
- default = TimeStampColumnValidator().validate(default, self.time_col)
1319
-
1320
- last_timestamp_row = default.agg(F.max(self.time_col)).collect()[0][0]
1321
-
1322
- intervals = {
1323
- label: last_timestamp_row - timedelta(minutes=minutes)
1324
- for label, minutes in self.periods
1221
+ return {
1222
+ "default": sf_input[self.default_sf_key].to_raw(),
1223
+ "timeseries_feature": result.to_raw(),
1325
1224
  }
1326
1225
 
1327
- result = None
1328
-
1329
- for label, start in intervals.items():
1330
- metrics = self.calculate_metrics_spf(default, label, start)
1331
-
1332
- if metrics.count() == 0:
1333
- continue
1334
-
1335
- metrics_group = {self.merger.group_keys["default"]: metrics}
1336
-
1337
- if result is not None:
1338
- metrics_group[self.merger.group_keys["other"]] = result
1339
- result = self.merger(SPFrame(metrics_group))["default"]
1340
- else:
1341
- result = metrics
1342
-
1343
- if result is not None:
1344
- return {"default": default, "timeseries_feature": result}
1345
- else:
1346
- return {"default": default}
1347
-
1348
1226
 
1349
1227
  class StaticValueColumnAdder(SFrameDeriver):
1350
1228
  """
@@ -1714,9 +1592,11 @@ class ComprehensiveFeaturesDeriver(SFrameDeriver):
1714
1592
  last_price=("TOKEN_PRICE", "last"),
1715
1593
  transaction_count_hourly=(
1716
1594
  "BLOCK_TIMESTAMP",
1717
- lambda x: len(x) / ((x.max() - x.min()).total_seconds() / 3600)
1718
- if (x.max() > x.min())
1719
- else len(x),
1595
+ lambda x: (
1596
+ len(x) / ((x.max() - x.min()).total_seconds() / 3600)
1597
+ if (x.max() > x.min())
1598
+ else len(x)
1599
+ ),
1720
1600
  ),
1721
1601
  )
1722
1602
  .reset_index()
@@ -1 +1,2 @@
1
+ from .action_gate import ActionGate
1
2
  from .table_existence import SQLTableExistenceValidator