sdk-seshat-python 0.4.1__tar.gz → 0.4.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (136)
  1. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/PKG-INFO +8 -7
  2. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/pyproject.toml +9 -8
  3. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/reducer/base.py +68 -17
  4. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/LICENSE +0 -0
  5. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/README.md +0 -0
  6. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/__init__.py +0 -0
  7. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/__main__.py +0 -0
  8. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/data_class/__init__.py +0 -0
  9. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/data_class/base.py +0 -0
  10. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/data_class/pandas.py +0 -0
  11. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/data_class/pyspark.py +0 -0
  12. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/evaluation/__init__.py +0 -0
  13. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/evaluation/base.py +0 -0
  14. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/__init__.py +0 -0
  15. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/base.py +0 -0
  16. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/general/__init__.py +0 -0
  17. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/general/classification.py +0 -0
  18. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/general/clustering.py +0 -0
  19. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/general/regression.py +0 -0
  20. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/recommendation/__init__.py +0 -0
  21. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/recommendation/diversity.py +0 -0
  22. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/evaluation/evaluator/recommendation/ranking.py +0 -0
  23. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/feature_view/__init__.py +0 -0
  24. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/feature_view/base.py +0 -0
  25. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/general/__init__.py +0 -0
  26. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/general/command/__init__.py +0 -0
  27. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/general/command/base.py +0 -0
  28. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/general/command/code_inspect.py +0 -0
  29. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/general/command/job_status.py +0 -0
  30. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/general/command/setup_project.py +0 -0
  31. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/general/command/submit_to_network.py +0 -0
  32. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/general/config.py +0 -0
  33. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/general/exceptions.py +0 -0
  34. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/general/lazy_config.py +0 -0
  35. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/general/models.py +0 -0
  36. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/general/template/README.md-tmpl +0 -0
  37. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/general/template/config.py-tmpl +0 -0
  38. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/general/template/env-templ +0 -0
  39. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/general/template/jobignore-tmpl +0 -0
  40. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/general/template/pyproject._toml-tmpl +0 -0
  41. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/general/template/recommender-jupyter.ipynb-tmpl +0 -0
  42. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/general/template/recommender.py-tmpl +0 -0
  43. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/profiler/__init__.py +0 -0
  44. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/profiler/base.py +0 -0
  45. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/profiler/decorator.py +0 -0
  46. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/profiler/format.py +0 -0
  47. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/source/__init__.py +0 -0
  48. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/source/base.py +0 -0
  49. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/source/database/__init__.py +0 -0
  50. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/source/database/base.py +0 -0
  51. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/source/exceptions.py +0 -0
  52. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/source/flip_side/__init__.py +0 -0
  53. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/source/flip_side/base.py +0 -0
  54. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/source/local/__init__.py +0 -0
  55. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/source/local/base.py +0 -0
  56. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/source/mixins.py +0 -0
  57. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/source/multisource/__init__.py +0 -0
  58. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/source/multisource/base.py +0 -0
  59. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/source/saver/__init__.py +0 -0
  60. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/source/saver/base.py +0 -0
  61. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/source/saver/database.py +0 -0
  62. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/source/saver/utils/__init__.py +0 -0
  63. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/source/saver/utils/postgres.py +0 -0
  64. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/__init__.py +0 -0
  65. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/aggregator/__init__.py +0 -0
  66. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/aggregator/base.py +0 -0
  67. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/augmenter/__init__.py +0 -0
  68. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/augmenter/base.py +0 -0
  69. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/base.py +0 -0
  70. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/deriver/__init__.py +0 -0
  71. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/deriver/base.py +0 -0
  72. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/deriver/from_database.py +0 -0
  73. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/imputer/__init__.py +0 -0
  74. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/imputer/base.py +0 -0
  75. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/merger/__init__.py +0 -0
  76. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/merger/base.py +0 -0
  77. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/merger/nested_key.py +0 -0
  78. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/pipeline/__init__.py +0 -0
  79. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/pipeline/base.py +0 -0
  80. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/pipeline/branch.py +0 -0
  81. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/pipeline/recommendation/__init__.py +0 -0
  82. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/pipeline/recommendation/address_pipeline.py +0 -0
  83. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/pseudo/__init__.py +0 -0
  84. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/pseudo/action_gate.py +0 -0
  85. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/pseudo/table_existence.py +0 -0
  86. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/reducer/__init__.py +0 -0
  87. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/scaler/__init__.py +0 -0
  88. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/scaler/base.py +0 -0
  89. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/schema/__init__.py +0 -0
  90. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/schema/base.py +0 -0
  91. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/__init__.py +0 -0
  92. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/base.py +0 -0
  93. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/block/__init__.py +0 -0
  94. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/block/base.py +0 -0
  95. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/random/__init__.py +0 -0
  96. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/random/base.py +0 -0
  97. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/time_line/__init__.py +0 -0
  98. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/splitter/time_line/base.py +0 -0
  99. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/trimmer/__init__.py +0 -0
  100. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/trimmer/base.py +0 -0
  101. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/vectorizer/__init__.py +0 -0
  102. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/vectorizer/base.py +0 -0
  103. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/vectorizer/cosine_similarity.py +0 -0
  104. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/vectorizer/pivot.py +0 -0
  105. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/transformer/vectorizer/utils.py +0 -0
  106. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/__init__.py +0 -0
  107. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/batcher.py +0 -0
  108. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/binary_utils.py +0 -0
  109. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/clean_json.py +0 -0
  110. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/col_to_list.py +0 -0
  111. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/contracts.py +0 -0
  112. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/file.py +0 -0
  113. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/file_cryptography.py +0 -0
  114. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/filter_json.py +0 -0
  115. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/grouper.py +0 -0
  116. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/jobignore.py +0 -0
  117. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/join_columns_to_list.py +0 -0
  118. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/join_str.py +0 -0
  119. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/llm_client/__init__.py +0 -0
  120. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/llm_client/chatbot_factory.py +0 -0
  121. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/logging/__init__.py +0 -0
  122. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/logging/base_logger.py +0 -0
  123. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/logging/console_logger.py +0 -0
  124. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/logging/logstash_logger.py +0 -0
  125. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/logging/multi_logger.py +0 -0
  126. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/memory.py +0 -0
  127. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/mixin.py +0 -0
  128. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/obfuscate.py +0 -0
  129. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/package_utils.py +0 -0
  130. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/pandas_func.py +0 -0
  131. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/patching.py +0 -0
  132. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/pyspark_func.py +0 -0
  133. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/rest.py +0 -0
  134. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/singleton.py +0 -0
  135. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/validation.py +0 -0
  136. {sdk_seshat_python-0.4.1 → sdk_seshat_python-0.4.2}/seshat/utils/zip_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sdk-seshat-python
3
- Version: 0.4.1
3
+ Version: 0.4.2
4
4
  Summary: Seshat python SDK is a library to help create ML data pipelines.
5
5
  License: Commercial - see LICENSE.txt
6
6
  Author: SeshatLabs
@@ -13,29 +13,30 @@ Provides-Extra: flipside-support
13
13
  Provides-Extra: postgres-support
14
14
  Requires-Dist: backoff (>=2.2.1,<3.0.0)
15
15
  Requires-Dist: bokeh (>=3.6.0,<4.0.0)
16
- Requires-Dist: boto3 (>=1.39.4,<2.0.0)
16
+ Requires-Dist: boto3 (>=1.35.68,<2.0.0)
17
17
  Requires-Dist: croniter (>=6.0.0,<7.0.0)
18
18
  Requires-Dist: cryptography (>=44.0.0,<45.0.0)
19
19
  Requires-Dist: dask[array,complete,dataframe,distributed] (>=2024.10.0,<2025.0.0)
20
20
  Requires-Dist: flipside (>=2.1.0,<3.0.0) ; extra == "flipside-support"
21
21
  Requires-Dist: langchain (>=0.3.23,<0.4.0)
22
22
  Requires-Dist: langchain-community (>=0.3.21,<0.4.0)
23
- Requires-Dist: langchain-openai (==0.3.28)
23
+ Requires-Dist: langchain-openai (>=0.3.12,<0.4.0)
24
24
  Requires-Dist: loguru (>=0.7.3,<0.8.0)
25
25
  Requires-Dist: memory-profiler (>=0.61.0,<0.62.0)
26
26
  Requires-Dist: openai (>=1.73.0,<2.0.0)
27
- Requires-Dist: pandas (>=2.3.1,<3.0.0)
27
+ Requires-Dist: pandas (>=2.2.1,<3.0.0)
28
28
  Requires-Dist: psycopg2-binary (>=2.9,<3.0) ; extra == "postgres-support"
29
29
  Requires-Dist: pyarmor (>=8.5.1,<9.0.0)
30
30
  Requires-Dist: pydantic (>=2.7.4,<3.0.0)
31
31
  Requires-Dist: pyspark (>=3.5.1,<4.0.0)
32
32
  Requires-Dist: python-logstash-async (>=4.0.2,<5.0.0)
33
- Requires-Dist: requests (>=2.32.0,<3.0.0)
34
- Requires-Dist: rich (>=14.0.0,<15.0.0)
33
+ Requires-Dist: requests (==2.32.0)
34
+ Requires-Dist: rich (>=13.9.4,<14.0.0)
35
35
  Requires-Dist: scikit-learn (>=1.4.1.post1,<2.0.0)
36
+ Requires-Dist: setuptools (>=80.9.0,<81.0.0)
36
37
  Requires-Dist: sqlalchemy (>=2.0.29,<3.0.0)
37
38
  Requires-Dist: toml (>=0.10.2,<0.11.0)
38
- Requires-Dist: typer (>=0.16.0,<0.17.0)
39
+ Requires-Dist: typer (>=0.12.3,<0.13.0)
39
40
  Description-Content-Type: text/markdown
40
41
 
41
42
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "sdk-seshat-python"
3
- version = "0.4.1"
3
+ version = "0.4.2"
4
4
  description = "Seshat python SDK is a library to help create ML data pipelines."
5
5
  authors = ["SeshatLabs <info@seshatlabs.xyz>"]
6
6
  packages = [{ include = "seshat", from = "." }]
@@ -10,19 +10,19 @@ license = "Commercial - see LICENSE.txt"
10
10
 
11
11
  [tool.poetry.dependencies]
12
12
  python = "^3.11"
13
- pandas = "^2.3.1"
13
+ pandas = "^2.2.1"
14
14
  scikit-learn = "^1.4.1.post1"
15
15
  pyspark = "^3.5.1"
16
16
  flipside = "^2.1.0"
17
17
  sqlalchemy = "^2.0.29"
18
18
  memory-profiler = "^0.61.0"
19
- typer = "^0.16.0"
19
+ typer = "^0.12.3"
20
20
  dask = {extras = ["array", "complete", "dataframe", "distributed"], version = "^2024.10.0"}
21
21
  bokeh = "^3.6.0"
22
22
  toml = "^0.10.2"
23
- rich = "^14.0.0"
24
- boto3 = "^1.39.4"
25
- requests = "^2.32.0"
23
+ rich = "^13.9.4"
24
+ boto3 = "^1.35.68"
25
+ requests = "2.32.0"
26
26
  backoff = "^2.2.1"
27
27
  cryptography = "^44.0.0"
28
28
  loguru = "^0.7.3"
@@ -30,11 +30,12 @@ openai = "^1.73.0"
30
30
  pydantic = "^2.7.4"
31
31
  langchain = "^0.3.23"
32
32
  langchain-community = "^0.3.21"
33
- langchain-openai = "0.3.28"
33
+ langchain-openai = "^0.3.12"
34
34
  pyarmor = "^8.5.1"
35
+ python-logstash-async = "^4.0.2"
35
36
  croniter = "^6.0.0"
36
37
  psycopg2-binary = { version = "^2.9", optional = true }
37
- python-logstash-async = "^4.0.2"
38
+ setuptools = "^80.9.0"
38
39
 
39
40
  [tool.poetry.extras]
40
41
  flipside_support = ["flipside"]
@@ -1,4 +1,5 @@
1
1
  import logging
2
+ import math
2
3
  from typing import Any, Callable, Dict, List, TypeAlias
3
4
 
4
5
  import pandas as pd
@@ -18,7 +19,17 @@ from seshat.utils.clean_json import JSONCleaner
18
19
  InputType: TypeAlias = List[Dict[str, Any]]
19
20
  OutputType: TypeAlias = List[Dict[str, Any]]
20
21
  ProcessResponseFn: TypeAlias = Callable[[InputType, InputType], OutputType]
21
- ProcessBatchFn: TypeAlias = Callable[[list[dict[str, Any]]], dict]
22
+ GetExtraCtxFn: TypeAlias = Callable[[List[Dict[str, Any]]], Dict[str, Any]]
23
+
24
+
25
+ def math_nan_to_none(row):
26
+ d = row.asDict()
27
+ for key, value in d.items():
28
+ if value is None:
29
+ continue
30
+ if isinstance(value, float) and math.isnan(value):
31
+ d[key] = None
32
+ return Row(**d)
22
33
 
23
34
 
24
35
  class SFrameReducer(Transformer):
@@ -39,12 +50,14 @@ class LLMInsightExtractor(SFrameReducer):
39
50
 
40
51
  Parameters
41
52
  ----------
42
- get_llm_client : BaseChatModel
53
+ get_llm_client : () -> BaseChatModel
43
54
  The LLM client used to generate insights.
44
55
  template_prompt : str
45
56
  The template prompt to send to the LLM. Should include placeholders for data.
46
57
  id_column : str, optional
47
58
  The column name to use as an identifier when expanding results. Required if expand_on_id is True.
59
+ join_cols : list[str], optional
60
+ The columns to use for joining the extracted insights back to the original DataFrame. If not provided, defaults to [id_column].
48
61
  template_context : str, optional
49
62
  The system context to provide to the LLM. Defaults to a basic data scientist role.
50
63
  llm_input_columns : List[str], optional
@@ -52,7 +65,7 @@ class LLMInsightExtractor(SFrameReducer):
52
65
  process_llm_json_response_fn : ProcessResponseFn, optional
53
66
  Function to process the JSON response from the LLM.
54
67
  get_extra_context : ProcessBatchFn, optional
55
- Function to process batches of data before sending to the LLM. Receives the current format_args dict and the list of batch_responses, and should return a dict to update format_args.
68
+ Function to process data before sending to the LLM. Receives the current data and should return a dict to update format_args.
56
69
  process_llm_response : Callable, optional
57
70
  Function to process the raw LLM response before JSON parsing.
58
71
  retry : int, default=3
@@ -71,6 +84,8 @@ class LLMInsightExtractor(SFrameReducer):
71
84
  Whether to expand results based on ID column. Requires id_column.
72
85
  inject_keys : dict[str, str], optional
73
86
  Additional keys to inject into the template prompt.
87
+ merge_result : bool, default=True
88
+ Whether to merge the extracted insights back to the original DataFrame.
74
89
 
75
90
  Raises
76
91
  ------
@@ -137,10 +152,11 @@ class LLMInsightExtractor(SFrameReducer):
137
152
  get_llm_client: Callable[[], "BaseChatModel"],
138
153
  template_prompt: str,
139
154
  id_column: str = None,
155
+ join_cols: list[str] = None,
140
156
  template_context: str = None,
141
157
  llm_input_columns: List[str] = None,
142
158
  process_llm_json_response_fn: ProcessResponseFn = None,
143
- get_extra_context: ProcessBatchFn = None,
159
+ get_extra_context: GetExtraCtxFn = None,
144
160
  process_llm_response: Callable = None,
145
161
  retry: int = 3,
146
162
  llm_result_cleaner: Callable = JSONCleaner().clean,
@@ -151,6 +167,7 @@ class LLMInsightExtractor(SFrameReducer):
151
167
  groupby_inject_key: str = None,
152
168
  expand_on_id: bool = False,
153
169
  inject_keys: dict[str, str] = None,
170
+ merge_result: bool = True,
154
171
  ):
155
172
 
156
173
  super().__init__(group_keys)
@@ -172,9 +189,11 @@ class LLMInsightExtractor(SFrameReducer):
172
189
  Your task is to analyze and provide insights about the given dataset.
173
190
  """
174
191
  )
192
+ self.merge_result = merge_result
175
193
 
176
194
  self.group_by_columns = group_by_columns
177
195
  self.id_column = id_column
196
+ self.join_cols = join_cols or [self.id_column]
178
197
  self.expand_on_id = expand_on_id
179
198
  self.static_injected_data = inject_keys
180
199
  self.process_llm_response = process_llm_response
@@ -277,11 +296,19 @@ class LLMInsightExtractor(SFrameReducer):
277
296
  prompt_kwargs = (
278
297
  self.get_extra_context(batch_result) if self.get_extra_context else {}
279
298
  )
280
- batch_result += self.extract_insight(batch_data, prompt_kwargs, **kwargs)
299
+ res = self.perform_extract(batch_data, prompt_kwargs, **kwargs)
300
+ if res:
301
+ batch_result += res
281
302
  return batch_result
282
303
 
304
+ def extract_insight_one_shot(
305
+ self, data: List[Dict[str, Any]], **kwargs
306
+ ) -> List[Dict[str, Any]]:
307
+ prompt_kwargs = self.get_extra_context(data) if self.get_extra_context else {}
308
+ return self.perform_extract(data, prompt_kwargs, **kwargs)
309
+
283
310
  @track
284
- def extract_insight(
311
+ def perform_extract(
285
312
  self, data: List[Dict[str, Any]], prompt_kwargs=None, **kwargs
286
313
  ) -> List[Dict[str, Any]]:
287
314
  """
@@ -329,11 +356,17 @@ class LLMInsightExtractor(SFrameReducer):
329
356
  mask &= default[col] == val
330
357
  group_df = default[mask]
331
358
 
332
- selected = (
333
- group_df[[*self.llm_input_columns]]
334
- if self.llm_input_columns
335
- else group_df
336
- )
359
+ llm_input_columns = self.llm_input_columns or group_df.columns
360
+ if set(llm_input_columns) - set(group_df.columns):
361
+ continue
362
+
363
+ selected = group_df[[*llm_input_columns]]
364
+
365
+ if (
366
+ self.id_column in selected.columns
367
+ and selected[self.id_column].isnull().all()
368
+ ):
369
+ continue
337
370
 
338
371
  # If batch mode, create chunks otherwise use whole data
339
372
  if self.batch_mode:
@@ -343,11 +376,16 @@ class LLMInsightExtractor(SFrameReducer):
343
376
  ]
344
377
  else:
345
378
  data = selected.to_dict("records")
346
- group_name = "-".join([group.get(c) for c in self.group_by_columns])
379
+ group_name = (
380
+ "-".join([group.get(c) for c in self.group_by_columns]) if group else ""
381
+ )
347
382
  inputs.append({"group_name": group_name, "data": data})
348
383
 
349
384
  def reduce_df(self, default: DataFrame, **kwargs) -> Dict[str, DataFrame]:
350
385
  # Find the groups if group_by_columns set
386
+ if default.empty:
387
+ return {"default": default}
388
+
351
389
  groups = (
352
390
  default[[*self.group_by_columns]].drop_duplicates().to_dict("records")
353
391
  if self.group_by_columns
@@ -359,28 +397,38 @@ class LLMInsightExtractor(SFrameReducer):
359
397
  self._find_extract_inputs(default, groups, inputs)
360
398
 
361
399
  extract_func = (
362
- self.extract_insight_batch if self.batch_mode else self.extract_insight
400
+ self.extract_insight_batch
401
+ if self.batch_mode
402
+ else self.extract_insight_one_shot
363
403
  )
364
404
  results = []
365
405
  for d in inputs:
366
406
  results.extend(extract_func(**d))
367
407
  results = pd.DataFrame(results)
368
408
 
369
- if self.expand_on_id:
409
+ if self.expand_on_id and not results.empty:
370
410
  redundant_cols = [
371
411
  col
372
412
  for col in results.columns
373
- if col in default.columns and col != self.id_column
413
+ if col in default.columns
414
+ and col != self.id_column
415
+ and col not in set(self.join_cols)
374
416
  ]
375
417
  results = (
376
418
  results.explode(self.id_column)
377
419
  .set_index(self.id_column)
378
420
  .drop(columns=redundant_cols, axis=1)
379
- .join(default.set_index(self.id_column), how="right")
421
+ )
422
+ if not self.merge_result:
423
+ return {"default": results.drop_duplicates()}
424
+ if not results.empty:
425
+ default = (
426
+ pd.merge(default, results, on=self.join_cols, how="left")
380
427
  .reset_index()
428
+ .drop_duplicates()
381
429
  )
382
430
 
383
- return {"default": results}
431
+ return {"default": default}
384
432
 
385
433
  def reduce_spf(
386
434
  self, default: PySparkDataFrame, **kwargs
@@ -407,6 +455,9 @@ class LLMInsightExtractor(SFrameReducer):
407
455
  # To avoid calling process group again, cache rdd and
408
456
  # use count() to trigger running process_group and cache the result.
409
457
  rdd = default.rdd.mapPartitions(process_group)
458
+ # Because pandas result maybe contains math nan values and
459
+ # these values are not valid for spark
460
+ rdd = rdd.map(math_nan_to_none)
410
461
  rdd.cache()
411
462
  rdd.count()
412
463