pyspark-connectby 1.1.0__tar.gz → 1.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pyspark-connectby has been flagged as potentially problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pyspark-connectby
3
- Version: 1.1.0
3
+ Version: 1.1.1
4
4
  Summary: connectby hierarchy query in spark
5
5
  Author: Chen, Yu
6
6
  Requires-Python: >=3.7,<4.0
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "pyspark-connectby"
3
- version = "1.1.0"
3
+ version = "1.1.1"
4
4
  description = "connectby hierarchy query in spark"
5
5
  authors = ["Chen, Yu"]
6
6
  readme = "README.md"
@@ -1,7 +1,3 @@
1
- __author__ = 'Chen, Yu'
2
- __date__ = '2024-02'
3
- __version__ = '0.8'
4
-
5
1
  from dataclasses import dataclass
6
2
  from typing import Union, List
7
3
 
@@ -30,7 +26,7 @@ class Path:
30
26
  return self.steps[-1]
31
27
 
32
28
  @property
33
- def level(self) -> str:
29
+ def level(self) -> int:
34
30
  return len(self.steps)
35
31
 
36
32
 
@@ -79,7 +75,7 @@ class ConnectByQuery:
79
75
  rows = self.df.collect()
80
76
  return [Path.path_start_with(r[self.child_col]) for r in rows]
81
77
 
82
- def __descendants_recursive(self, path: Path) -> []:
78
+ def __fetch_descendants(self, path: Path) -> []:
83
79
  children_nodes: [Node] = self.__children_with_parent(path.end_id)
84
80
  is_leaf = len(children_nodes) == 0
85
81
  if is_leaf:
@@ -87,30 +83,29 @@ class ConnectByQuery:
87
83
  return []
88
84
 
89
85
  children = [Path(steps=path.steps + [c.node_id]) for c in children_nodes]
90
- grandchildren = list(map(lambda c: self.__descendants_recursive(c), children))
86
+ grandchildren = list(map(lambda c: self.__fetch_descendants(c), children))
91
87
 
92
88
  descendants = children + grandchildren
93
89
  return descendants
94
90
 
95
91
  @staticmethod
96
- def __flatten(nested_list):
92
+ def __flatten_list(nested_list: []) -> []:
97
93
  flat_list = []
98
94
  for item in nested_list:
99
95
  if isinstance(item, list):
100
- flat_list += ConnectByQuery.__flatten(item)
96
+ flat_list += ConnectByQuery.__flatten_list(item)
101
97
  else:
102
98
  flat_list.append(item)
103
99
  return flat_list
104
100
 
105
101
  def __run(self) -> [Path]:
106
- descendants = list(map(lambda e: self.__descendants_recursive(e), self.start_paths))
107
- descendants_paths: [Path] = self.__flatten(descendants)
102
+ descendants = list(map(lambda e: self.__fetch_descendants(e), self.start_paths))
103
+ descendants_paths: [Path] = self.__flatten_list(descendants)
108
104
 
109
105
  return self.start_paths + descendants_paths
110
106
 
111
107
  def get_result_df(self) -> DataFrame:
112
108
  result_paths: [Path] = self.__run()
113
-
114
109
  schema = f'''
115
110
  {COLUMN_START_WITH} string,
116
111
  {self.child_col} string,
@@ -118,7 +113,7 @@ class ConnectByQuery:
118
113
  {COLUMN_CONNECT_BY_ISLEAF} boolean
119
114
  '''
120
115
  spark = self.df._session
121
- result_df = \
122
- spark.createDataFrame([(p.start_id, p.end_id, p.level, p.is_leaf) for p in result_paths], schema=schema)
123
116
 
117
+ result_df = spark.createDataFrame([(p.start_id, p.end_id, p.level, p.is_leaf) for p in result_paths],
118
+ schema=schema)
124
119
  return result_df.join(self.df, on=self.child_col)