pyspark_connectby-1.0.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of pyspark-connectby might be problematic.

pyspark_connectby/__init__.py
@@ -0,0 +1,3 @@
+ from pyspark_connectby.query import connectBy
+
+ __all__ = ['connectBy']
pyspark_connectby/connectby_query.py
@@ -0,0 +1,107 @@
+ __author__ = 'Chen, Yu'
+ __date__ = '2024-02'
+ __email__ = 'cheny@fcc.ca'
+ __version__ = '0.4'
+
+ from dataclasses import dataclass
+ from typing import List, Optional, Tuple, Union
+
+ from pyspark.sql import DataFrame
+
+ TOP_NODE_LEVEL = 1
+ LEVEL_COLUMN = 'LEVEL'
+
+
+ @dataclass
+ class Node:
+     # A single entry in the hierarchy: the node id and its depth (top nodes are level 1).
+     nid: str
+     level: int
+
+     @classmethod
+     def for_top(cls, nid: str) -> 'Node':
+         return cls(nid, level=TOP_NODE_LEVEL)
+
+
+ class ConnectByQuery:
+     def __init__(self, df: DataFrame, child_column: str, parent_column: str,
+                  start_with: Optional[Union[List[str], str]] = None, level_column: str = LEVEL_COLUMN):
+         self.df: DataFrame = df
+         self.child_column = child_column
+         self.parent_column = parent_column
+         self.start_with = start_with
+         self.level_column = level_column
+
+         # Lazily populated caches for the top nodes and the (child, parent) pairs.
+         self._top_nodes: Optional[List[Node]] = None
+         self._all_data: Optional[List[Tuple[str, str]]] = None
+
+     @property
+     def top_nodes(self) -> List[Node]:
+         if self._top_nodes is None:
+             if self.start_with is None:
+                 top_nodes = []
+             elif isinstance(self.start_with, list):
+                 top_nodes = [Node.for_top(i) for i in self.start_with]
+             else:
+                 top_nodes = [Node.for_top(self.start_with)]
+
+             # With no start_with given, fall back to treating every row as a top node.
+             self._top_nodes = top_nodes or self._default_top_nodes()
+         return self._top_nodes
+
+     @property
+     def all_data(self) -> List[Tuple[str, str]]:
+         if self._all_data is None:
+             rows = self.df.select(self.child_column, self.parent_column).collect()
+             self._all_data = [(r[self.child_column], r[self.parent_column]) for r in rows]
+         return self._all_data
+
+     def children_with_parent(self, parent_id: str) -> List[Tuple[str, str]]:
+         return [d for d in self.all_data if d[1] == parent_id]
+
+     def _default_top_nodes(self) -> List[Node]:
+         # Every row becomes a top node, so the whole table is traversed.
+         rows = self.df.collect()
+         result = [Node.for_top(r[self.child_column]) for r in rows]
+         assert len(result) > 0
+         return result
+
+     def get_descendants_recursive(self, node: Node) -> List:
+         # Direct children sit one level below the given node; recurse into each child.
+         # The return value is a nested list of Node objects that run() flattens.
+         level = node.level + 1
+         direct_list = [Node(nid=c[0], level=level) for c in self.children_with_parent(node.nid)]
+         indirect_list = [self.get_descendants_recursive(n) for n in direct_list]
+         return direct_list + indirect_list
+
+     @staticmethod
+     def _flatten(nested_list):
+         # Recursively flatten an arbitrarily nested list of Node objects.
+         flat_list = []
+         for item in nested_list:
+             if isinstance(item, list):
+                 flat_list += ConnectByQuery._flatten(item)
+             else:
+                 flat_list.append(item)
+         return flat_list
+
+     def run(self) -> List[Node]:
+         descendants_list = [self.get_descendants_recursive(n) for n in self.top_nodes]
+         descendants_list_flatten = ConnectByQuery._flatten(descendants_list)
+
+         return self.top_nodes + descendants_list_flatten
+
+     def get_result_df(self) -> DataFrame:
+         result_list = self.run()
+
+         schema = f'{self.child_column} string, {self.level_column} int'
+         spark = self.df.sparkSession
+         result_df = spark.createDataFrame([(r.nid, r.level) for r in result_list], schema=schema)
+
+         # Join back to the source rows so the result keeps the original columns.
+         result_df = result_df.join(self.df, on=self.child_column)
+         return result_df
pyspark_connectby/query.py
@@ -0,0 +1,15 @@
+ from typing import List, Optional, Union
+
+ from pyspark.sql import DataFrame
+
+ from pyspark_connectby.connectby_query import ConnectByQuery
+
+
+ def connectBy(df: DataFrame, prior: str, to: str,
+               start_with: Optional[Union[List[str], str]] = None, level_col: str = 'level') -> DataFrame:
+     query = ConnectByQuery(df=df, child_column=prior, parent_column=to,
+                            start_with=start_with, level_column=level_col)
+
+     return query.get_result_df()
+
+
+ # Monkey-patch so the method can be called directly on any DataFrame instance.
+ DataFrame.connectBy = connectBy
pyspark_connectby-1.0.5.dist-info/METADATA
@@ -0,0 +1,92 @@
+ Metadata-Version: 2.1
+ Name: pyspark-connectby
+ Version: 1.0.5
+ Summary: connectby hierarchy query in spark
+ Author: Chen, Yu
+ Author-email: cheny@fcc.ca
+ Requires-Python: >=3.7,<4.0
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.7
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Description-Content-Type: text/markdown
+
+ # pyspark-connectby
+ Spark does not support the hierarchical query clause `connect by` as of version 3.5.0,
+ and the [PR](https://github.com/apache/spark/pull/40744) opened to support recursive CTE queries has not landed yet.
+
+ This library is an attempt to add a `connectBy` method to [DataFrame](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html).
+
+ # Concept
+ Hierarchical queries are an important feature that many traditional relational databases, such as Oracle, DB2, MySQL,
+ Snowflake, and [Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_CONNECT_BY_clause.html),
+ support either directly or through recursive CTEs.
+
+ Example in Redshift:
+ ```sql
+ select emp_id, name, manager_id, level
+ from employee
+ start with emp_id = 1
+ connect by prior emp_id = manager_id;
+ ```
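+
+ For comparison, the same traversal can be written as a recursive CTE; a sketch in standard SQL against the same `employee` table (Spark cannot run this yet, per the PR above):
+ ```sql
+ with recursive reports (emp_id, name, manager_id, level) as (
+     select emp_id, name, manager_id, 1 as level
+     from employee
+     where emp_id = 1                             -- the START WITH condition
+   union all
+     select e.emp_id, e.name, e.manager_id, r.level + 1
+     from employee e
+     join reports r on e.manager_id = r.emp_id   -- the CONNECT BY PRIOR condition
+ )
+ select emp_id, name, manager_id, level from reports;
+ ```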
+
+ With this library, we can use `connectBy()` on a `DataFrame`:
+
+ ```python
+ from pyspark_connectby.query import connectBy
+ from pyspark.sql import SparkSession
+
+ spark = SparkSession.builder.getOrCreate()
+
+ schema = 'emp_id string, manager_id string, name string'
+ data = [['1', None, 'Carlos'],
+         ['11', '1', 'John'],
+         ['111', '11', 'Jorge'],
+         ['112', '11', 'Kwaku'],
+         ['113', '11', 'Liu']
+         ]
+ df = spark.createDataFrame(data, schema)
+ df2 = df.connectBy(prior='emp_id', to='manager_id', start_with='1')
+ df2.show()
+ ```
+ With the result:
+ ```
+ +------+-----+----------+------+
+ |emp_id|level|manager_id|  name|
+ +------+-----+----------+------+
+ |     1|    1|      null|Carlos|
+ |    11|    2|         1|  John|
+ |   111|    3|        11| Jorge|
+ |   112|    3|        11| Kwaku|
+ |   113|    3|        11|   Liu|
+ +------+-----+----------+------+
+ ```
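+
+ Starting lower in the tree restarts the numbering at level 1. A sketch with the same data, starting at `emp_id` 11 (the output shown in the comment is derived from the traversal rules above):
+ ```python
+ df3 = df.connectBy(prior='emp_id', to='manager_id', start_with='11')
+ df3.show()
+ # +------+-----+----------+-----+
+ # |emp_id|level|manager_id| name|
+ # +------+-----+----------+-----+
+ # |    11|    1|         1| John|
+ # |   111|    2|        11|Jorge|
+ # |   112|    2|        11|Kwaku|
+ # |   113|    2|        11|  Liu|
+ # +------+-----+----------+-----+
+ ```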
+
+ # Installation
+ Requires Python >= 3.7.
+ ```
+ $ pip install --upgrade pyspark-connectby
+ ```
+
+ # Usage
+
+ ```python
+ from pyspark_connectby.query import connectBy
+
+ df = ...
+
+ df.connectBy(prior='emp_id', to='manager_id', start_with='1')  # start with `emp_id` 1
+
+ df.transform(connectBy, prior='emp_id', to='manager_id', start_with='1')  # or via the df.transform() method
+
+ df.connectBy(prior='emp_id', to='manager_id')  # without start_with, every row is treated as a top node
+
+ df.connectBy(prior='emp_id', to='manager_id', level_col='the_level')  # a level column name other than `level`
+
+ df.connectBy(prior='emp_id', to='manager_id', start_with=['1', '2'])  # start with a list of top-node ids
+ ```
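+
+ Since `pyspark_connectby/__init__.py` re-exports `connectBy` via `__all__`, importing from the package root works as well:
+ ```python
+ from pyspark_connectby import connectBy
+ ```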
+
pyspark_connectby-1.0.5.dist-info/RECORD
@@ -0,0 +1,6 @@
+ pyspark_connectby/__init__.py,sha256=foBOOOeH6ZHnwuJdkcU0Dud0Sew_xYiXGD__oFRTSwY,70
+ pyspark_connectby/connectby_query.py,sha256=37-PVB_ozfKjuZk9LYpSrzl4-W538QuMegyhPrRjW9A,3454
+ pyspark_connectby/query.py,sha256=UdUfvJZwgDpifB1IEdZD8GWQ21_RQ6L44ruA2fcDua4,469
+ pyspark_connectby-1.0.5.dist-info/METADATA,sha256=ruK7E1ZNcx23d_bZVRBDiRXUBrso_R9d6q1T1LRw4kc,2909
+ pyspark_connectby-1.0.5.dist-info/WHEEL,sha256=FMvqSimYX_P7y0a7UY-_Mc83r5zkBZsCYPm7Lr0Bsq4,88
+ pyspark_connectby-1.0.5.dist-info/RECORD,,
pyspark_connectby-1.0.5.dist-info/WHEEL
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: poetry-core 1.8.1
+ Root-Is-Purelib: true
+ Tag: py3-none-any