pyspark_connectby-1.0.5.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyspark_connectby-1.0.5/PKG-INFO +92 -0
- pyspark_connectby-1.0.5/README.md +75 -0
- pyspark_connectby-1.0.5/pyproject.toml +21 -0
- pyspark_connectby-1.0.5/pyspark_connectby/__init__.py +3 -0
- pyspark_connectby-1.0.5/pyspark_connectby/connectby_query.py +107 -0
- pyspark_connectby-1.0.5/pyspark_connectby/query.py +15 -0

pyspark_connectby-1.0.5/PKG-INFO
@@ -0,0 +1,92 @@
Metadata-Version: 2.1
Name: pyspark-connectby
Version: 1.0.5
Summary: connectby hierarchy query in spark
Author: Chen, Yu
Author-email: cheny@fcc.ca
Requires-Python: >=3.7,<4.0
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Description-Content-Type: text/markdown
(The remainder of PKG-INFO is the package long description, identical to README.md below.)
pyspark_connectby-1.0.5/README.md
@@ -0,0 +1,75 @@
# pyspark-connectby
Spark does not support the hierarchical query `connect by` as of version 3.5.0.
There is an open [PR](https://github.com/apache/spark/pull/40744) to add recursive CTE queries, but it is not available yet.

This is an attempt to add a `connectBy` method to [DataFrame](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html).

# Concept
A hierarchical query is an important feature that many traditional relational databases, such as Oracle, DB2, MySQL,
Snowflake and [Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_CONNECT_BY_clause.html),
support either directly or through recursive CTEs.

Example in Redshift:
```sql
select emp_id, name, manager_id, level
from employee
start with emp_id = 1
connect by prior emp_id = manager_id;
```

With this library, we can use `connectBy()` on a `DataFrame`:

```python
from pyspark_connectby.query import connectBy
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

schema = 'emp_id string, manager_id string, name string'
data = [['1', None, 'Carlos'],
        ['11', '1', 'John'],
        ['111', '11', 'Jorge'],
        ['112', '11', 'Kwaku'],
        ['113', '11', 'Liu']
        ]
df = spark.createDataFrame(data, schema)
df2 = df.connectBy(prior='emp_id', to='manager_id', start_with='1')
df2.show()
```
With the result:
```
+------+-----+----------+------+
|emp_id|level|manager_id|  name|
+------+-----+----------+------+
|     1|    1|      null|Carlos|
|    11|    2|         1|  John|
|   111|    3|        11| Jorge|
|   112|    3|        11| Kwaku|
|   113|    3|        11|   Liu|
+------+-----+----------+------+
```
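
The `level` column starts at 1 for the start-with node and increases by one for each generation below it, so ordinary DataFrame operations can slice the hierarchy. A minimal sketch using the `df2` result above:

```python
# Direct reports of employee 1 sit at level 2 of the hierarchy
df2.filter(df2.level == 2).show()

# Size of each generation under the starting node
df2.groupBy('level').count().orderBy('level').show()
```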

# Installation
Requires Python >= 3.7.
```
$ pip install --upgrade pyspark-connectby
```

# Usage

```python
from pyspark_connectby.query import connectBy

df = ...

df.connectBy(prior='emp_id', to='manager_id', start_with='1')  # start with `emp_id` 1

df.transform(connectBy, prior='emp_id', to='manager_id', start_with='1')  # or via the df.transform() method

df.connectBy(prior='emp_id', to='manager_id')  # without start_with, every node is used as a starting point

df.connectBy(prior='emp_id', to='manager_id', level_col='the_level')  # use a level column name other than `level`

df.connectBy(prior='emp_id', to='manager_id', start_with=['1', '2'])  # start with a list of top node ids
```
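
`start_with` does not have to be the root of the tree: any node id can start the traversal, and that node becomes level 1 of the result. A sketch using the sample data from the quick start; the expected rows follow from the level rules above (row order may vary):

```python
# Only the subtree under John (emp_id 11)
df3 = df.connectBy(prior='emp_id', to='manager_id', start_with='11')
df3.show()
# +------+-----+----------+-----+
# |emp_id|level|manager_id| name|
# +------+-----+----------+-----+
# |    11|    1|         1| John|
# |   111|    2|        11|Jorge|
# |   112|    2|        11|Kwaku|
# |   113|    2|        11|  Liu|
# +------+-----+----------+-----+
```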
pyspark_connectby-1.0.5/pyproject.toml
@@ -0,0 +1,21 @@
[tool.poetry]
name = "pyspark-connectby"
version = "1.0.5"
description = "connectby hierarchy query in spark"
authors = ["Chen, Yu <cheny@fcc.ca>"]
readme = "README.md"
packages = [{include = "pyspark_connectby"}]

[tool.poetry.dependencies]
python = "^3.7"

[tool.poetry.group.dev.dependencies]
pyspark = ">3"

[tool.poetry.group.test.dependencies]
pytest = "^7"
pyspark = ">3"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
pyspark_connectby-1.0.5/pyspark_connectby/connectby_query.py
@@ -0,0 +1,107 @@
__author__ = 'Chen, Yu'
__date__ = '2024-02'
__email__ = 'cheny@fcc.ca'
__version__ = '0.4'

from dataclasses import dataclass
from typing import List, Optional, Tuple, Union

from pyspark.sql import DataFrame

TOP_NODE_LEVEL = 1
LEVEL_COLUMN = 'LEVEL'


@dataclass
class Node:
    nid: str
    level: int

    @classmethod
    def for_top(cls, nid: str) -> 'Node':
        return cls(nid, level=TOP_NODE_LEVEL)


class ConnectByQuery:
    def __init__(self, df: DataFrame, child_column: str, parent_column: str,
                 start_with: Union[List[str], str, None] = None, level_column: str = LEVEL_COLUMN):
        self.df: DataFrame = df
        self.child_column = child_column
        self.parent_column = parent_column
        self.start_with = start_with
        self.level_column = level_column

        self._top_nodes: Optional[List[Node]] = None
        self._all_data: Optional[List[Tuple[str, str]]] = None

    @property
    def top_nodes(self) -> List[Node]:
        if self._top_nodes is None:
            if self.start_with is None:
                top_nodes = []
            elif isinstance(self.start_with, list):
                top_nodes = [Node.for_top(i) for i in self.start_with]
            else:
                top_nodes = [Node.for_top(self.start_with)]

            self._top_nodes = top_nodes or self._default_top_nodes()
        return self._top_nodes

    @property
    def all_data(self) -> List[Tuple[str, str]]:
        # Collect the (child, parent) edges once and cache them on the driver.
        if self._all_data is None:
            rows = self.df.select(self.child_column, self.parent_column).collect()
            self._all_data = [(r[self.child_column], r[self.parent_column]) for r in rows]
        return self._all_data

    def children_with_parent(self, parent_id: str) -> List[Tuple[str, str]]:
        return [d for d in self.all_data if d[1] == parent_id]

    def _default_top_nodes(self) -> List[Node]:
        # Without start_with, every node becomes a starting point.
        rows = (
            self.df
            # .filter(psf.col(self.parent_column).isNull())
            .collect()
        )
        result = [Node.for_top(r[self.child_column]) for r in rows]
        assert len(result) > 0
        return result

    def get_descendants_recursive(self, node: Node) -> list:
        # Returns a nested list of Nodes; _flatten() unnests it later.
        level = node.level + 1

        direct_list = [Node(nid=c[0], level=level) for c in self.children_with_parent(node.nid)]
        indirect_list = [self.get_descendants_recursive(e) for e in direct_list]
        descendant_list = direct_list + indirect_list

        return [descendant_list]

    @staticmethod
    def _flatten(nested_list):
        flat_list = []
        for item in nested_list:
            if isinstance(item, list):
                flat_list += ConnectByQuery._flatten(item)
            else:
                flat_list.append(item)
        return flat_list

    def run(self) -> List[Node]:
        descendants_list = [self.get_descendants_recursive(e) for e in self.top_nodes]
        descendants_list_flatten = ConnectByQuery._flatten(descendants_list)

        return self.top_nodes + descendants_list_flatten

    def get_result_df(self) -> DataFrame:
        result_list = self.run()

        schema = f'{self.child_column} string, {self.level_column} int'
        spark = self.df.sparkSession
        result_df = spark.createDataFrame([(r.nid, r.level) for r in result_list], schema=schema)

        result_df = result_df.join(self.df, on=self.child_column)
        return result_df
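
`connectBy` in query.py is a thin wrapper around this class, so `ConnectByQuery` can also be driven directly. A minimal sketch, assuming the `df` built in the README quick start; note that the implementation collects the (child, parent) edges to the driver, so it suits hierarchies that fit in driver memory:

```python
from pyspark_connectby.connectby_query import ConnectByQuery

# child_column/parent_column correspond to the README's prior/to parameters
query = ConnectByQuery(df, child_column='emp_id', parent_column='manager_id',
                       start_with='1', level_column='level')
nodes = query.run()                 # flat, driver-side list of Node(nid, level)
result_df = query.get_result_df()   # the same hierarchy joined back onto df
```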
pyspark_connectby-1.0.5/pyspark_connectby/query.py
@@ -0,0 +1,15 @@
from typing import List, Union

from pyspark.sql import DataFrame

from pyspark_connectby.connectby_query import ConnectByQuery


def connectBy(df: DataFrame, prior: str, to: str,
              start_with: Union[List[str], str, None] = None, level_col: str = 'level') -> DataFrame:
    query = ConnectByQuery(df=df, child_column=prior, parent_column=to, start_with=start_with, level_column=level_col)

    return query.get_result_df()


DataFrame.connectBy = connectBy
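
The last line patches `connectBy` onto `DataFrame` itself, so importing the module is all it takes to enable the method syntax. A short sketch of the two call styles (assuming a DataFrame `df` with `emp_id` and `manager_id` columns, as in the README):

```python
import pyspark_connectby.query                 # side effect: registers DataFrame.connectBy
from pyspark_connectby.query import connectBy  # the plain function, usable with df.transform()

df2 = df.connectBy(prior='emp_id', to='manager_id', start_with='1')
df3 = df.transform(connectBy, prior='emp_id', to='manager_id', start_with='1')
```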