pyspark-explorer 0.0.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,163 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110
+ .pdm.toml
111
+ .pdm-python
112
+ .pdm-build/
113
+
114
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115
+ __pypackages__/
116
+
117
+ # Celery stuff
118
+ celerybeat-schedule
119
+ celerybeat.pid
120
+
121
+ # SageMath parsed files
122
+ *.sage.py
123
+
124
+ # Environments
125
+ .env
126
+ .venv
127
+ env/
128
+ venv/
129
+ ENV/
130
+ env.bak/
131
+ venv.bak/
132
+
133
+ # Spyder project settings
134
+ .spyderproject
135
+ .spyproject
136
+
137
+ # Rope project settings
138
+ .ropeproject
139
+
140
+ # mkdocs documentation
141
+ /site
142
+
143
+ # mypy
144
+ .mypy_cache/
145
+ .dmypy.json
146
+ dmypy.json
147
+
148
+ # Pyre type checker
149
+ .pyre/
150
+
151
+ # pytype static type analyzer
152
+ .pytype/
153
+
154
+ # Cython debug symbols
155
+ cython_debug/
156
+
157
+ # PyCharm
158
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
161
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162
+ .idea/
163
+ ignore/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Krzysztof Ruta
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,103 @@
1
+ Metadata-Version: 2.4
2
+ Name: pyspark-explorer
3
+ Version: 0.0.11
4
+ Summary: Explore data files with pyspark
5
+ Project-URL: Homepage, https://github.com/krzys9876/pyspark_explorer
6
+ Project-URL: Repository, https://github.com/krzys9876/pyspark_explorer
7
+ Author-email: Krzysztof Ruta <krzys9876@gmail.com>
8
+ Maintainer-email: Krzysztof Ruta <krzys9876@gmail.com>
9
+ License: MIT License
10
+
11
+ Copyright (c) 2024 Krzysztof Ruta
12
+
13
+ Permission is hereby granted, free of charge, to any person obtaining a copy
14
+ of this software and associated documentation files (the "Software"), to deal
15
+ in the Software without restriction, including without limitation the rights
16
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
+ copies of the Software, and to permit persons to whom the Software is
18
+ furnished to do so, subject to the following conditions:
19
+
20
+ The above copyright notice and this permission notice shall be included in all
21
+ copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
+ SOFTWARE.
30
+ License-File: LICENSE
31
+ Keywords: data,explorer,pyspark,spark
32
+ Classifier: Development Status :: 3 - Alpha
33
+ Classifier: Programming Language :: Python
34
+ Requires-Python: >=3.11
35
+ Requires-Dist: pyspark>=3.5.1
36
+ Requires-Dist: textual>=1.0.0
37
+ Description-Content-Type: text/markdown
38
+
39
+ # Spark File Explorer
40
+ When developing spark applications I came across the growing number of data files that I create.
41
+
42
+ ## CSVs are fine but what about JSON and complex PARQUET files?
43
+
44
+ To open and explore a file I used Excel to view CSV files, text editors with plugins to view JSON files,
45
+ but there was nothing handy to view PARQUETs. Even formatted JSONs were not always readable. What about viewing schemas?
46
+
47
+ Each time I had to use spark and write simple apps which was not a problem itself but was tedious and boring.
48
+
49
+ ## Why not a database?
50
+
51
+ Well, for tabular data the problem is already solved - just use your preferred database.
52
+ Quite often we can load text files or even parquets directly to the database.
53
+
54
+ So what's the big deal?
55
+
56
+ ## Hierarchical data sets
57
+
58
+ Unfortunately the files I often deal with have hierarchical structure. They cannot be simply visualized as tables
59
+ or rather some fields contain tables of other structures. Each of these structures is a table itself but how to load
60
+ and explore such embedded tables in a database?
61
+
62
+ ## For Spark files use... Spark!
63
+
64
+ Hold on - since I generate files using Apache Spark, why can't I use it to explore them?
65
+ I can easily handle complex structures and file types using built-in features. So all I need is to build a user interface
66
+ to display directories, files and their contents.
67
+
68
+ ## Why console?
69
+
70
+ I use Kubernetes in production environment, I develop Spark applications locally or in VM.
71
+ In all environments I would like to have _one tool to rule them all_.
72
+
73
+ I like console tools a lot, they require some sort of simplicity. They can run locally or over SSH connection on
74
+ the remote cluster. Sounds perfect. All I needed was a console UI library, so I wouldn't have to reinvent the wheel.
75
+
76
+ ## Textual
77
+
78
+ What a great project [_textual_](https://textual.textualize.io/) is!
79
+
80
+ Years ago I used [_curses_](https://docs.python.org/3/library/curses.html) but
81
+ [_textual_](https://textual.textualize.io/) is so superior to what I used back then. It has so many features packed in
82
+ a friendly form of simple to use components. Highly recommended.
83
+
84
+ # Usage
85
+
86
+ Install package with pip:
87
+
88
+ pip install pyspark-explorer
89
+
90
+ Run:
91
+
92
+ pyspark-explorer
93
+
94
+ I recommend that you provide a base path. For local files that could be for example:
95
+
96
+ # Linux
97
+ pyspark-explorer file:///home/myuser/datafiles/base_path
98
+ # Windows
99
+ pyspark-explorer file:///c:/datafiles/base_path
100
+ # Remote hdfs cluster
101
+ pyspark-explorer hdfs://somecluster/datafiles/base_path
102
+
103
+ Default path is set to /, which represents the local root filesystem and works fine even on Windows thanks to Spark's logic.
@@ -0,0 +1,65 @@
1
+ # Spark File Explorer
2
+ When developing spark applications I came across the growing number of data files that I create.
3
+
4
+ ## CSVs are fine but what about JSON and complex PARQUET files?
5
+
6
+ To open and explore a file I used Excel to view CSV files, text editors with plugins to view JSON files,
7
+ but there was nothing handy to view PARQUETs. Even formatted JSONs were not always readable. What about viewing schemas?
8
+
9
+ Each time I had to use spark and write simple apps which was not a problem itself but was tedious and boring.
10
+
11
+ ## Why not a database?
12
+
13
+ Well, for tabular data the problem is already solved - just use your preferred database.
14
+ Quite often we can load text files or even parquets directly to the database.
15
+
16
+ So what's the big deal?
17
+
18
+ ## Hierarchical data sets
19
+
20
+ Unfortunately the files I often deal with have hierarchical structure. They cannot be simply visualized as tables
21
+ or rather some fields contain tables of other structures. Each of these structures is a table itself but how to load
22
+ and explore such embedded tables in a database?
23
+
24
+ ## For Spark files use... Spark!
25
+
26
+ Hold on - since I generate files using Apache Spark, why can't I use it to explore them?
27
+ I can easily handle complex structures and file types using built-in features. So all I need is to build a user interface
28
+ to display directories, files and their contents.
29
+
30
+ ## Why console?
31
+
32
+ I use Kubernetes in production environment, I develop Spark applications locally or in VM.
33
+ In all environments I would like to have _one tool to rule them all_.
34
+
35
+ I like console tools a lot, they require some sort of simplicity. They can run locally or over SSH connection on
36
+ the remote cluster. Sounds perfect. All I needed was a console UI library, so I wouldn't have to reinvent the wheel.
37
+
38
+ ## Textual
39
+
40
+ What a great project [_textual_](https://textual.textualize.io/) is!
41
+
42
+ Years ago I used [_curses_](https://docs.python.org/3/library/curses.html) but
43
+ [_textual_](https://textual.textualize.io/) is so superior to what I used back then. It has so many features packed in
44
+ a friendly form of simple to use components. Highly recommended.
45
+
46
+ # Usage
47
+
48
+ Install package with pip:
49
+
50
+ pip install pyspark-explorer
51
+
52
+ Run:
53
+
54
+ pyspark-explorer
55
+
56
+ I recommend that you provide a base path. For local files that could be for example:
57
+
58
+ # Linux
59
+ pyspark-explorer file:///home/myuser/datafiles/base_path
60
+ # Windows
61
+ pyspark-explorer file:///c:/datafiles/base_path
62
+ # Remote hdfs cluster
63
+ pyspark-explorer hdfs://somecluster/datafiles/base_path
64
+
65
+ Default path is set to /, which represents the local root filesystem and works fine even on Windows thanks to Spark's logic.
@@ -0,0 +1,5 @@
1
+ from pyspark_explorer.run import run
2
+
3
+
4
+ if __name__ == "__main__":
5
+ run()
@@ -0,0 +1,33 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "pyspark-explorer"
7
+ version = "0.0.11"
8
+ dependencies = [
9
+ "pyspark>=3.5.1",
10
+ "textual>=1.0.0"
11
+ ]
12
+ requires-python = ">=3.11"
13
+ authors = [
14
+ {name = "Krzysztof Ruta", email = "krzys9876@gmail.com"}
15
+ ]
16
+ maintainers = [
17
+ {name = "Krzysztof Ruta", email = "krzys9876@gmail.com"}
18
+ ]
19
+ description = "Explore data files with pyspark"
20
+ readme = "README.md"
21
+ license = {file = "LICENSE"}
22
+ keywords = ["pyspark", "spark", "explorer", "data"]
23
+ classifiers = [
24
+ "Development Status :: 3 - Alpha",
25
+ "Programming Language :: Python"
26
+ ]
27
+
28
+ [project.urls]
29
+ Homepage = "https://github.com/krzys9876/pyspark_explorer"
30
+ Repository = "https://github.com/krzys9876/pyspark_explorer"
31
+
32
+ [project.scripts]
33
+ pyspark-explorer = "pyspark_explorer.run:run"
File without changes
@@ -0,0 +1,217 @@
1
+ from typing import Any
2
+
3
+ from pyspark.sql.types import StructField, Row, StructType, ArrayType, DataType
4
+ import copy
5
+
6
+
7
class DataFrameTable:
    """Display-ready view of Spark rows described by a list of StructFields.

    After construction the instance exposes:
      columns      - one dict per column with keys "col_index", "name",
                     "kind" ("simple" / "struct" / "array"), "type" (class name of the
                     Spark data type) and "field_type" (element type for arrays,
                     the data type itself otherwise)
      column_names - names of the columns, in column order
      schema_tree  - nested list of dicts ("name", "kind", "type", "subfields")
      rows         - list of {"row_index": int, "row": [cell, ...]} where every cell is
                     {"display_value": str, "value": ...}; the value of a struct cell is
                     such a row dict, the value of an array cell is a list of row dicts
      row_values   - display values of all cells, cut to TEXT_LEN characters
    """

    # maximum number of characters of a display value kept in row_values
    TEXT_LEN = 50

    # allow original rows (Row type) or previously transformed rows (when drilling to details)
    def __init__(self, schema: list[StructField], data=None, transformed_data=None, expand_structs: bool = False):
        """Build the table.

        :param schema: fields describing the columns
        :param data: original Spark rows; when non-empty they are transformed into ``rows``
        :param transformed_data: rows already in the ``rows`` format, used only when ``data`` is empty
        :param expand_structs: when True, sub-fields of struct columns are added as extra columns
        """
        if transformed_data is None:
            transformed_data = []
        if data is None:
            data = []
        self._schema: list[StructField] = schema
        self._data: list[Row] = data
        self._expand_structs: bool = expand_structs

        self.columns = []
        self.column_names = []
        self.schema_tree = []
        self.rows = []
        self.row_values = []
        self.__extract_columns__()

        if len(data):
            self.__extract_rows__()
        else:
            self.__set_rows__(transformed_data)

        if self._expand_structs:
            self.__expand_structs__()


    @staticmethod
    def __extract_kind__(field: StructField) -> str:
        """Classify a field as "struct", "array" or "simple" based on its data type."""
        if isinstance(field.dataType, StructType):
            return "struct"
        if isinstance(field.dataType, ArrayType):
            return "array"
        return "simple"


    def __expand_structs__(self):
        """Insert the direct sub-fields of every struct column right after that column.

        Added columns are named "*<field name>". Only one level is expanded: a sub-field
        that is a struct itself is added as a single column.
        """
        new_cols = []  # NOTE: we cannot modify self.columns on the fly in the loop below, we would modify the loop
        new_rows = copy.deepcopy(self.rows)
        col_index = 0
        for col in copy.deepcopy(self.columns):
            # Position of this column in the expanded rows. Cells inserted for earlier
            # structs shift later columns to the right, so the original index cannot be used.
            struct_index = col_index
            col["col_index"] = struct_index
            new_cols.append(col)
            col_index += 1
            if col["kind"] != "struct":
                continue

            for fi, field in enumerate(col["field_type"].fields):
                new_cols.append({
                    "col_index": col_index,
                    "name": f"*{field.name}",
                    "kind": self.__extract_kind__(field),
                    "type": type(field.dataType).__name__,
                    "field_type": self.__extract_type__(field),
                })

                for row in new_rows:
                    struct_value = row["row"][struct_index]["value"]
                    row["row"].insert(col_index, struct_value["row"][fi])

                col_index += 1

        self.columns = new_cols
        self.rows = new_rows
        self.__extract_column_names__()
        self.__extract_row_values__()


    @staticmethod
    def __extract_type__(field) -> DataType:
        """Return the element type for an array field, the field's own data type otherwise."""
        if isinstance(field.dataType, ArrayType):
            return field.dataType.elementType

        return field.dataType


    @staticmethod
    def __extract_embedded_schema_tree__(fields: list[StructField]) -> list:
        """Convert struct fields into schema tree nodes, recursing into structs and arrays."""
        subfields = []
        for f in fields:
            subfield = {"name": f.name, "kind": DataFrameTable.__extract_kind__(f), "type": type(f.dataType).__name__}
            if subfield["kind"] == "array":
                embedded_subfields = DataFrameTable.__extract_embedded_array_to_tree__(f.dataType)
            elif subfield["kind"] == "struct":
                embedded_subfields = DataFrameTable.__extract_embedded_schema_tree__(f.dataType.fields)
            else:
                embedded_subfields = []
            subfield["subfields"] = embedded_subfields
            subfields.append(subfield)
        return subfields


    @staticmethod
    def __extract_embedded_array_to_tree__(subfield: DataType) -> list:
        """Convert a data type found in an array into schema tree nodes.

        Structs yield the nodes of their fields, arrays are unwrapped recursively and any
        other type yields a single unnamed "simple" node.
        """
        if isinstance(subfield, StructType):
            return DataFrameTable.__extract_embedded_schema_tree__(subfield.fields)
        if isinstance(subfield, ArrayType):
            return DataFrameTable.__extract_embedded_array_to_tree__(subfield.elementType)
        return [{"name": "", "kind": "simple", "type": type(subfield).__name__, "subfields": []}]


    def __extract_schema_tree__(self) -> None:
        """Fill ``schema_tree`` with one node per column."""
        tree = []
        for col in self.columns:
            tree_field = {"name": col["name"], "kind": col["kind"], "type": col["type"]}
            if col["kind"] == "array":
                subfields = self.__extract_embedded_array_to_tree__(col["field_type"])
            elif col["kind"] == "struct":
                subfields = self.__extract_embedded_schema_tree__(col["field_type"].fields)
            else:
                subfields = []
            tree_field["subfields"] = subfields
            tree.append(tree_field)

        self.schema_tree = tree


    def __extract_columns__(self) -> None:
        """Fill ``columns``, ``column_names`` and ``schema_tree`` from the schema."""
        self.columns = [
            {
                "col_index": i,
                "name": field.name,
                "kind": self.__extract_kind__(field),
                "type": type(field.dataType).__name__,
                "field_type": self.__extract_type__(field),
            }
            for i, field in enumerate(self._schema)
        ]
        self.__extract_column_names__()
        self.__extract_schema_tree__()


    def __extract_column_names__(self) -> None:
        """Refresh ``column_names`` from ``columns``."""
        self.column_names = [c["name"] for c in self.columns]


    @staticmethod
    def __null_struct_row__(fields: list[StructField]) -> Row:
        """Return a Row with the given fields and every value set to None."""
        # Row(*names) creates a row "class"; calling it binds values to those names.
        return Row(*[f.name for f in fields])(*([None] * len(fields)))


    def __extract_rows__(self) -> None:
        """Transform the original Spark rows into ``rows`` (and ``row_values``)."""
        assert len(self.columns) > 0 # ensure columns are calculated BEFORE rows

        rows = []
        for ri, data_row in enumerate(self._data):
            row = []
            for fi, field in enumerate(data_row.__fields__):
                column = self.columns[fi]
                raw = data_row[field]
                if column["kind"] == "array":
                    # create internal schema as a single field
                    element_field = StructField(column["name"], column["field_type"])
                    # specify row schema in a form of name = value
                    values_as_row = [Row(**{column["name"]: r}) for r in raw] if raw is not None else None
                    value = DataFrameTable([element_field], values_as_row).rows
                    display_value = self.disp_value(raw) if raw is not None else "[]"
                elif column["kind"] == "struct":
                    # extract internal schema as an array of fields
                    inner_schema = column["field_type"].fields
                    # A null struct has no fields to read; substitute a row of nulls so the
                    # cell keeps the regular struct shape (needed for expanding and drill-down).
                    inner_row = raw if raw is not None else self.__null_struct_row__(inner_schema)
                    # a value is just a single Row, so we must pack it as an array and then unpack it
                    value = DataFrameTable(inner_schema, [inner_row]).rows[0]
                    display_value = self.disp_value(raw)
                else:
                    value = raw
                    display_value = self.disp_value(value)

                row.append({"display_value": display_value, "value": value})

            rows.append({"row_index": ri, "row": row})

        self.__set_rows__(rows)


    def __set_rows__(self, rows: list) -> None:
        """Store already transformed rows and refresh ``row_values``."""
        self.rows = rows
        self.__extract_row_values__()


    def __extract_row_values__(self) -> None:
        """Refresh ``row_values``: display values of every cell cut to TEXT_LEN characters."""
        self.row_values = [[c["display_value"][:DataFrameTable.TEXT_LEN] for c in r["row"]] for r in self.rows]


    def select(self, x: int, y: int) -> tuple[dict, dict]:
        """Return the column description at index ``x`` and the cell at column ``x`` of row ``y``."""
        return self.columns[x], self.rows[y]["row"][x]


    @staticmethod
    def disp_value(value: Any) -> str:
        """Return ``str(value)`` with the "Row" wrappers removed."""
        disp_value = str(value)
        # remove single "Row(...)"
        if disp_value.startswith("Row(") and disp_value.endswith(")"):
            disp_value = disp_value[4:-1]
        # remove "Row" from array "[Row(...), ... , Row(...)]"
        if disp_value.startswith("[Row(") and disp_value.endswith(")]"):
            disp_value = disp_value[4:-1].replace("), Row(", "), (")
        return disp_value
197
+
198
+
199
def extract_embedded_table(tab: DataFrameTable, x: int, y: int, expand_structs: bool = False) -> DataFrameTable | None:
    """Build a table from the struct or array held in the cell at column ``x``, row ``y`` of ``tab``.

    A struct cell becomes a one-row table with the struct's fields as columns.
    An array cell becomes a table with a single column named after the source column
    (without a leading "*") and the already transformed array items as rows.
    Returns None when the selected column is neither a struct nor an array.
    """
    column, cell = tab.select(x, y)
    kind = column["kind"]
    if kind not in ("struct", "array"):
        return None

    if kind == "struct":
        schema = copy.deepcopy(column["field_type"].fields)
        # the struct value is a single transformed row, so wrap it in a list
        transformed = copy.deepcopy([cell["value"]])
    else:
        name = column["name"]
        if name.startswith("*"):
            name = name[1:]
        schema = copy.deepcopy([StructField(name, column["field_type"])])
        transformed = copy.deepcopy(cell["value"])

    return DataFrameTable(schema, data=[], transformed_data=transformed, expand_structs=expand_structs)
@@ -0,0 +1,81 @@
1
+ import math
2
+
3
+ from pyspark.sql import SparkSession
4
+
5
+ from pyspark_explorer.data_table import DataFrameTable
6
+
7
+
8
+ def __ensure_path_separator__(path: str) -> str:
9
+ res = path.strip()
10
+ return res + ("" if res.endswith("/") else "/")
11
+
12
+
13
+ def __human_readable_size__(size: int) -> str:
14
+ formats = [".0f", ".1f", ".3f", ".3f", ".3f"]
15
+ units = ["B", "k", "M", "G", "T"]
16
+ exp = math.log(size,10) if size>0 else 0
17
+ ref_exp = math.log(10.24,10)
18
+ # -2 to scale properly and avoid too early rounding
19
+ scale = max(0, min(round((exp / ref_exp - 2) / 3), 4))
20
+ text = "{val:" + formats[scale]+"}" + units[scale]
21
+ return format(text.format(val = size / math.pow(1024, scale)))
22
+
23
+
24
+ class Explorer:
25
+ def __init__(self, spark: SparkSession) -> None:
26
+ self.spark = spark
27
+ self.fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
28
+ self.params = {
29
+ "auto_refresh": True,
30
+ "file_limit": 300,
31
+ "take_rows": 1000,
32
+ "sort_file_desc": False
33
+ }
34
+
35
+
36
+ def __file_info__(self, path) -> {}:
37
+ file_status = self.fs.getFileStatus(path)
38
+ file_name = path.getName()
39
+ is_file = file_status.isFile()
40
+ file = {"name": file_name, "full_path": path.toString(), "is_dir": not is_file,
41
+ "size": 0, "hr_size": "", "type": ""}
42
+ if is_file:
43
+ file_info = self.fs.getContentSummary(path)
44
+ file["size"] = file_info.getLength()
45
+ file["hr_size"] = __human_readable_size__(file_info.getLength())
46
+ file["type"] = "CSV" if file_name.lower().endswith(".csv") \
47
+ else "JSON" if file_name.lower().endswith(".json") \
48
+ else "PARQUET" if file_name.lower().endswith(".parquet") \
49
+ else "OTHER"
50
+
51
+ return file
52
+
53
+
54
+ def read_directory(self, path: str) -> []:
55
+ files: [dict] = []
56
+ st = self.fs.getFileStatus(self.spark._jvm.org.apache.hadoop.fs.Path(path))
57
+ if st.isFile():
58
+ return []
59
+
60
+ l = self.fs.listStatus(self.spark._jvm.org.apache.hadoop.fs.Path(path))
61
+ if self.params["sort_file_desc"]:
62
+ l = list(reversed(l))
63
+ for f in l[:self.params["file_limit"]]:
64
+ file = self.__file_info__(f.getPath())
65
+ files.append(file)
66
+
67
+ return files
68
+
69
+
70
+ def file_info(self, path: str) -> {}:
71
+ return self.__file_info__(self.spark._jvm.org.apache.hadoop.fs.Path(path))
72
+
73
+
74
+ def read_file(self, file_format: str, path: str) -> DataFrameTable | None:
75
+ try:
76
+ df = self.spark.read.format(file_format).load(path)
77
+ tab = DataFrameTable(df.schema.fields, df.take(self.params["take_rows"]), True)
78
+ except Exception as e:
79
+ tab = None
80
+
81
+ return tab