pyspark_connectby-1.0.5.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyspark_connectby-1.0.5/PKG-INFO +92 -0
- pyspark_connectby-1.0.5/README.md +75 -0
- pyspark_connectby-1.0.5/pyproject.toml +21 -0
- pyspark_connectby-1.0.5/pyspark_connectby/__init__.py +3 -0
- pyspark_connectby-1.0.5/pyspark_connectby/connectby_query.py +107 -0
- pyspark_connectby-1.0.5/pyspark_connectby/query.py +15 -0

pyspark_connectby-1.0.5/PKG-INFO
@@ -0,0 +1,92 @@
Metadata-Version: 2.1
Name: pyspark-connectby
Version: 1.0.5
Summary: connectby hierarchy query in spark
Author: Chen, Yu
Author-email: cheny@fcc.ca
Requires-Python: >=3.7,<4.0
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Description-Content-Type: text/markdown
(The remainder of PKG-INFO is the package long description, identical to README.md below.)
pyspark_connectby-1.0.5/README.md
@@ -0,0 +1,75 @@
# pyspark-connectby
Spark does not support the hierarchical query `connect by` as of version 3.5.0.
There is an open [PR](https://github.com/apache/spark/pull/40744) to add recursive CTE queries, but it is not available yet.

This is an attempt to add a `connectBy` method to [DataFrame](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html).

# Concept
A hierarchical query is an important feature that many traditional relational databases, such as Oracle, DB2, MySQL,
Snowflake and [Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_CONNECT_BY_clause.html),
support either directly or through recursive CTEs.

Example in Redshift:
```sql
select emp_id, name, manager_id, level
from employee
start with emp_id = 1
connect by prior emp_id = manager_id;
```

With this library, we can use `connectBy()` on a `DataFrame`:

```python
from pyspark_connectby.query import connectBy
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

schema = 'emp_id string, manager_id string, name string'
data = [['1', None, 'Carlos'],
        ['11', '1', 'John'],
        ['111', '11', 'Jorge'],
        ['112', '11', 'Kwaku'],
        ['113', '11', 'Liu']
        ]
df = spark.createDataFrame(data, schema)
df2 = df.connectBy(prior='emp_id', to='manager_id', start_with='1')
df2.show()
```
With the result:
```
+------+-----+----------+------+
|emp_id|level|manager_id|  name|
+------+-----+----------+------+
|     1|    1|      null|Carlos|
|    11|    2|         1|  John|
|   111|    3|        11| Jorge|
|   112|    3|        11| Kwaku|
|   113|    3|        11|   Liu|
+------+-----+----------+------+
```
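
The `level` column starts at 1 for the start-with node and increases by one for each generation below it, so ordinary DataFrame operations can slice the hierarchy. A minimal sketch using the `df2` result above:

```python
# Direct reports of employee 1 sit at level 2 of the hierarchy
df2.filter(df2.level == 2).show()

# Size of each generation under the starting node
df2.groupBy('level').count().orderBy('level').show()
```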

# Installation
Requires Python >= 3.7.
```
$ pip install --upgrade pyspark-connectby
```

# Usage

```python
from pyspark_connectby.query import connectBy

df = ...

df.connectBy(prior='emp_id', to='manager_id', start_with='1')  # start with `emp_id` 1

df.transform(connectBy, prior='emp_id', to='manager_id', start_with='1')  # or via the df.transform() method

df.connectBy(prior='emp_id', to='manager_id')  # without start_with, every node is used as a starting point

df.connectBy(prior='emp_id', to='manager_id', level_col='the_level')  # use a level column name other than `level`

df.connectBy(prior='emp_id', to='manager_id', start_with=['1', '2'])  # start with a list of top node ids
```
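
`start_with` does not have to be the root of the tree: any node id can start the traversal, and that node becomes level 1 of the result. A sketch using the sample data from the quick start; the expected rows follow from the level rules above (row order may vary):

```python
# Only the subtree under John (emp_id 11)
df3 = df.connectBy(prior='emp_id', to='manager_id', start_with='11')
df3.show()
# +------+-----+----------+-----+
# |emp_id|level|manager_id| name|
# +------+-----+----------+-----+
# |    11|    1|         1| John|
# |   111|    2|        11|Jorge|
# |   112|    2|        11|Kwaku|
# |   113|    2|        11|  Liu|
# +------+-----+----------+-----+
```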
pyspark_connectby-1.0.5/pyproject.toml
@@ -0,0 +1,21 @@
[tool.poetry]
name = "pyspark-connectby"
version = "1.0.5"
description = "connectby hierarchy query in spark"
authors = ["Chen, Yu <cheny@fcc.ca>"]
readme = "README.md"
packages = [{include = "pyspark_connectby"}]

[tool.poetry.dependencies]
python = "^3.7"

[tool.poetry.group.dev.dependencies]
pyspark = ">3"

[tool.poetry.group.test.dependencies]
pytest = "^7"
pyspark = ">3"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
pyspark_connectby-1.0.5/pyspark_connectby/connectby_query.py
@@ -0,0 +1,107 @@
__author__ = 'Chen, Yu'
__date__ = '2024-02'
__email__ = 'cheny@fcc.ca'
__version__ = '0.4'

from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

from pyspark.sql import DataFrame

TOP_NODE_LEVEL = 1
LEVEL_COLUMN = 'LEVEL'


@dataclass
class Node:
    nid: str
    level: int

    @classmethod
    def for_top(cls, nid: str) -> 'Node':
        return cls(nid, level=TOP_NODE_LEVEL)


class ConnectByQuery:
    def __init__(self, df: DataFrame, child_column: str, parent_column: str,
                 start_with: Union[List[str], str, None] = None, level_column: str = LEVEL_COLUMN):
        self.df: DataFrame = df
        self.child_column = child_column
        self.parent_column = parent_column
        self.start_with = start_with
        self.level_column = level_column

        self._top_nodes: Optional[List[Node]] = None
        self._all_data: Optional[List[Tuple[str, str]]] = None

    @property
    def top_nodes(self) -> List[Node]:
        if self._top_nodes is None:
            if self.start_with is None:
                top_nodes = []
            elif isinstance(self.start_with, list):
                top_nodes = [Node.for_top(i) for i in self.start_with]
            else:
                top_nodes = [Node.for_top(self.start_with)]

            self._top_nodes = top_nodes or self._default_top_nodes()
        return self._top_nodes

    @property
    def all_data(self) -> List[Tuple[str, str]]:
        # Collect the (child, parent) edges once and cache them on the driver.
        if self._all_data is None:
            rows = self.df.select(self.child_column, self.parent_column).collect()
            self._all_data = [(r[self.child_column], r[self.parent_column]) for r in rows]
        return self._all_data

    def children_with_parent(self, parent_id: str) -> List[Tuple[str, str]]:
        return [d for d in self.all_data if d[1] == parent_id]

    def _default_top_nodes(self) -> List[Node]:
        # Without start_with, every node becomes a starting point.
        rows = (
            self.df
            # .filter(psf.col(self.parent_column).isNull())
            .collect()
        )
        result = [Node.for_top(r[self.child_column]) for r in rows]
        assert len(result) > 0
        return result

    def get_descendants_recursive(self, node: Node) -> list:
        # Returns a nested list of Nodes; _flatten() unnests it later.
        level = node.level + 1

        direct_list = [Node(nid=c[0], level=level) for c in self.children_with_parent(node.nid)]
        indirect_list = [self.get_descendants_recursive(e) for e in direct_list]
        descendant_list = direct_list + indirect_list

        return [descendant_list]

    @staticmethod
    def _flatten(nested_list):
        flat_list = []
        for item in nested_list:
            if isinstance(item, list):
                flat_list += ConnectByQuery._flatten(item)
            else:
                flat_list.append(item)
        return flat_list

    def run(self) -> List[Node]:
        descendants_list = [self.get_descendants_recursive(e) for e in self.top_nodes]
        descendants_list_flatten = ConnectByQuery._flatten(descendants_list)

        return self.top_nodes + descendants_list_flatten

    def get_result_df(self) -> DataFrame:
        result_list = self.run()

        schema = f'{self.child_column} string, {self.level_column} int'
        spark = self.df.sparkSession
        result_df = spark.createDataFrame([(r.nid, r.level) for r in result_list], schema=schema)

        result_df = result_df.join(self.df, on=self.child_column)
        return result_df
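
`connectBy` in query.py is a thin wrapper around this class, so `ConnectByQuery` can also be driven directly. A minimal sketch, assuming the `df` built in the README quick start; note that the implementation collects the (child, parent) edges to the driver, so it suits hierarchies that fit in driver memory:

```python
from pyspark_connectby.connectby_query import ConnectByQuery

# child_column/parent_column correspond to the README's prior/to parameters
query = ConnectByQuery(df, child_column='emp_id', parent_column='manager_id',
                       start_with='1', level_column='level')
nodes = query.run()                 # flat, driver-side list of Node(nid, level)
result_df = query.get_result_df()   # the same hierarchy joined back onto df
```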
pyspark_connectby-1.0.5/pyspark_connectby/query.py
@@ -0,0 +1,15 @@
from typing import List, Union

from pyspark.sql import DataFrame

from pyspark_connectby.connectby_query import ConnectByQuery


def connectBy(df: DataFrame, prior: str, to: str,
              start_with: Union[List[str], str, None] = None, level_col: str = 'level') -> DataFrame:
    query = ConnectByQuery(df=df, child_column=prior, parent_column=to, start_with=start_with, level_column=level_col)

    return query.get_result_df()


DataFrame.connectBy = connectBy
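
The last line patches `connectBy` onto `DataFrame` itself, so importing the module is all it takes to enable the method syntax. A short sketch of the two call styles (assuming a DataFrame `df` with `emp_id` and `manager_id` columns, as in the README):

```python
import pyspark_connectby.query                 # side effect: registers DataFrame.connectBy
from pyspark_connectby.query import connectBy  # the plain function, usable with df.transform()

df2 = df.connectBy(prior='emp_id', to='manager_id', start_with='1')
df3 = df.transform(connectBy, prior='emp_id', to='manager_id', start_with='1')
```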