pyspark-connectby 1.1.0.tar.gz → 1.1.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pyspark-connectby might be problematic.
- {pyspark_connectby-1.1.0 → pyspark_connectby-1.1.1}/PKG-INFO +1 -1
- {pyspark_connectby-1.1.0 → pyspark_connectby-1.1.1}/pyproject.toml +1 -1
- {pyspark_connectby-1.1.0 → pyspark_connectby-1.1.1}/pyspark_connectby/connectby_query.py +9 -14
- {pyspark_connectby-1.1.0 → pyspark_connectby-1.1.1}/README.md +0 -0
- {pyspark_connectby-1.1.0 → pyspark_connectby-1.1.1}/pyspark_connectby/__init__.py +0 -0
- {pyspark_connectby-1.1.0 → pyspark_connectby-1.1.1}/pyspark_connectby/dataframe_connectby.py +0 -0
{pyspark_connectby-1.1.0 → pyspark_connectby-1.1.1}/pyspark_connectby/connectby_query.py

@@ -1,7 +1,3 @@
-__author__ = 'Chen, Yu'
-__date__ = '2024-02'
-__version__ = '0.8'
-
 from dataclasses import dataclass
 from typing import Union, List
 
@@ -30,7 +26,7 @@ class Path:
         return self.steps[-1]
 
     @property
-    def level(self) ->
+    def level(self) -> int:
         return len(self.steps)
 
 
@@ -79,7 +75,7 @@ class ConnectByQuery:
         rows = self.df.collect()
         return [Path.path_start_with(r[self.child_col]) for r in rows]
 
-    def
+    def __fetch_descendants(self, path: Path) -> []:
         children_nodes: [Node] = self.__children_with_parent(path.end_id)
         is_leaf = len(children_nodes) == 0
         if is_leaf:
@@ -87,30 +83,29 @@ class ConnectByQuery:
             return []
 
         children = [Path(steps=path.steps + [c.node_id]) for c in children_nodes]
-        grandchildren = list(map(lambda c: self.
+        grandchildren = list(map(lambda c: self.__fetch_descendants(c), children))
 
         descendants = children + grandchildren
         return descendants
 
     @staticmethod
-    def
+    def __flatten_list(nested_list: []) -> []:
         flat_list = []
         for item in nested_list:
             if isinstance(item, list):
-                flat_list += ConnectByQuery.
+                flat_list += ConnectByQuery.__flatten_list(item)
             else:
                 flat_list.append(item)
         return flat_list
 
     def __run(self) -> [Path]:
-        descendants = list(map(lambda e: self.
-        descendants_paths: [Path] = self.
+        descendants = list(map(lambda e: self.__fetch_descendants(e), self.start_paths))
+        descendants_paths: [Path] = self.__flatten_list(descendants)
 
         return self.start_paths + descendants_paths
 
     def get_result_df(self) -> DataFrame:
         result_paths: [Path] = self.__run()
-
         schema = f'''
             {COLUMN_START_WITH} string,
             {self.child_col} string,
@@ -118,7 +113,7 @@ class ConnectByQuery:
             {COLUMN_CONNECT_BY_ISLEAF} boolean
         '''
         spark = self.df._session
-        result_df = \
-            spark.createDataFrame([(p.start_id, p.end_id, p.level, p.is_leaf) for p in result_paths], schema=schema)
 
+        result_df = spark.createDataFrame([(p.start_id, p.end_id, p.level, p.is_leaf) for p in result_paths],
+                                          schema=schema)
         return result_df.join(self.df, on=self.child_col)
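The substantive change in this release is the pair of private helpers now named `__fetch_descendants` and `__flatten_list` (their previous names are truncated in this diff view). Because each recursive `__fetch_descendants` call returns a list, `children + grandchildren` mixes `Path` objects with nested lists, which is why `__run` passes the result through `__flatten_list` before prepending `self.start_paths`. A minimal standalone sketch of that pattern, with a hypothetical in-memory `CHILDREN` mapping standing in for the DataFrame-backed `__children_with_parent` lookup:

```python
# Standalone sketch of the recursion pattern in connectby_query.py.
# The toy CHILDREN mapping is hypothetical; the real code resolves
# children by querying the underlying DataFrame.
from dataclasses import dataclass
from typing import List


@dataclass
class Path:
    steps: List[str]

    @property
    def end_id(self) -> str:
        return self.steps[-1]

    @property
    def level(self) -> int:  # return annotation added in 1.1.1
        return len(self.steps)


CHILDREN = {'A': ['B', 'C'], 'B': ['D'], 'C': [], 'D': []}  # toy tree


def fetch_descendants(path: Path) -> list:
    children_ids = CHILDREN.get(path.end_id, [])
    if not children_ids:  # leaf node: recursion bottoms out
        return []
    children = [Path(steps=path.steps + [c]) for c in children_ids]
    # Each recursive call returns a list, so grandchildren is a list of lists.
    grandchildren = list(map(fetch_descendants, children))
    return children + grandchildren  # Path objects mixed with nested lists


def flatten_list(nested_list: list) -> list:
    flat_list = []
    for item in nested_list:
        if isinstance(item, list):
            flat_list += flatten_list(item)
        else:
            flat_list.append(item)
    return flat_list


paths = flatten_list(fetch_descendants(Path(steps=['A'])))
print([(p.end_id, p.level) for p in paths])
# [('B', 2), ('C', 2), ('D', 3)]
```

Flattening after the fact keeps the recursive step simple at the cost of building intermediate nested lists; `__run` then adds the start paths themselves to this list of descendants.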
{pyspark_connectby-1.1.0 → pyspark_connectby-1.1.1}/README.md
File without changes

{pyspark_connectby-1.1.0 → pyspark_connectby-1.1.1}/pyspark_connectby/__init__.py
File without changes

{pyspark_connectby-1.1.0 → pyspark_connectby-1.1.1}/pyspark_connectby/dataframe_connectby.py RENAMED
File without changes
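On the final hunk of connectby_query.py: `spark.createDataFrame` accepts a DDL-style schema string, which is what the f-string in `get_result_df` builds. A short sketch of that pattern under assumed names; `start_with`, `child`, `level`, `is_leaf`, and the sample rows are hypothetical stand-ins for `COLUMN_START_WITH`, `self.child_col`, the level column, `COLUMN_CONNECT_BY_ISLEAF`, and the collected paths:

```python
# Sketch of the createDataFrame(..., schema=<DDL string>) pattern kept by
# the 1.1.1 reformat; all column names and rows below are hypothetical.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Multi-line DDL schema strings are accepted, as in the package's f-string.
schema = '''
    start_with string,
    child string,
    level int,
    is_leaf boolean
'''
rows = [('A', 'B', 2, False), ('A', 'D', 3, True)]  # (start_id, end_id, level, is_leaf)
result_df = spark.createDataFrame(rows, schema=schema)

# get_result_df() then joins back to the source DataFrame on the child
# column; detail_df stands in for self.df here.
detail_df = spark.createDataFrame([('B', 'beta'), ('D', 'delta')],
                                  'child string, name string')
result_df.join(detail_df, on='child').show()
```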