datadock 0.1.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datadock-0.1.2/LICENSE +21 -0
- datadock-0.1.2/PKG-INFO +108 -0
- datadock-0.1.2/README.md +91 -0
- datadock-0.1.2/pyproject.toml +24 -0
- datadock-0.1.2/src/datadock/__init__.py +3 -0
- datadock-0.1.2/src/datadock/_reader.py +69 -0
- datadock-0.1.2/src/datadock/_schema_manager.py +115 -0
- datadock-0.1.2/src/datadock/_utils.py +6 -0
- datadock-0.1.2/src/datadock/api.py +136 -0
datadock-0.1.2/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Otávio Oliveira
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
datadock-0.1.2/PKG-INFO
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: datadock
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: Datadock is a PySpark-based data interoperability library. It automatically detects schemas from heterogeneous files (CSV, JSON, Parquet), groups them by structural similarity, and performs standardized batch reads. Designed for pipelines handling non-uniform large-scale data, enabling robust integration and reuse in distributed environments.
|
|
5
|
+
Author: Otavio Oliveira
|
|
6
|
+
Author-email: datadock.sup@gmail.com
|
|
7
|
+
Requires-Python: >=3.10,<4.0
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Requires-Dist: loguru (>=0.7.3,<0.8.0)
|
|
14
|
+
Requires-Dist: pyarrow (>=20.0.0,<21.0.0)
|
|
15
|
+
Requires-Dist: pyspark (>=3.5.5,<4.0.0)
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
|
|
18
|
+
# Datadock
|
|
19
|
+
|
|
20
|
+
**Datadock** is a Python library built on top of PySpark, designed to simplify **data interoperability** between files of different formats and schemas in modern data engineering pipelines.
|
|
21
|
+
|
|
22
|
+
It automatically detects schemas from CSV, JSON and Parquet files, groups structurally similar files, and allows standardized reading of all grouped files into a single Spark DataFrame β even in highly heterogeneous datasets.
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
## β¨ Key Features
|
|
26
|
+
|
|
27
|
+
- π **Automatic parsing** of multiple file formats: `.csv`, `.json`, `.parquet`
|
|
28
|
+
- π§ **Schema-based file grouping** by structural similarity
|
|
29
|
+
- π **Auto-selection of dominant schemas**
|
|
30
|
+
- π οΈ **Unified read** across similar files into a single PySpark DataFrame
|
|
31
|
+
- π **Schema insight** for diagnostics and inspection
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
## π§ Installation
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install datadock
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
## ποΈ Expected Input Structure
|
|
42
|
+
|
|
43
|
+
Place your data files (CSV, JSON or Parquet) inside a single folder. The library will automatically detect supported files and organize them by schema similarity.
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
/data/input/
|
|
47
|
+
βββ sales_2020.csv
|
|
48
|
+
βββ sales_2021.csv
|
|
49
|
+
βββ products.json
|
|
50
|
+
βββ archive.parquet
|
|
51
|
+
βββ log.parquet
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
## π§ͺ Usage Example
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from datadock import scan_schema, get_schema_info, read_data
|
|
59
|
+
|
|
60
|
+
path = "/path/to/your/data"
|
|
61
|
+
|
|
62
|
+
# Logs schema groups detected
|
|
63
|
+
scan_schema(path)
|
|
64
|
+
|
|
65
|
+
# Retrieves schema metadata
|
|
66
|
+
info = get_schema_info(path)
|
|
67
|
+
print(info)
|
|
68
|
+
|
|
69
|
+
# Loads all files from schema group 1
|
|
70
|
+
df = read_data(path, schema_id=1, logs=True)
|
|
71
|
+
df.show()
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
## π Public API
|
|
76
|
+
|
|
77
|
+
### `scan_schema`
|
|
78
|
+
Logs the identified schema groups found in the specified folder.
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
### `get_schema_info`
|
|
82
|
+
Returns a list of dictionaries containing:
|
|
83
|
+
- `schema_id`: ID of the schema group
|
|
84
|
+
- `file_count`: number of files in the group
|
|
85
|
+
- `column_count`: number of columns in the schema
|
|
86
|
+
- `files`: list of file names in the group
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
### `read_data`
|
|
90
|
+
Reads and merges all files that share the same schema.
|
|
91
|
+
If `schema_id` is not specified, the group with the most columns will be selected.
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
## ✅ Requirements
|
|
95
|
+
|
|
96
|
+
- Python 3.10+
|
|
97
|
+
- PySpark
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
## π Motivation
|
|
101
|
+
|
|
102
|
+
In real-world data engineering workflows, it's common to deal with files that represent the same data domain but have slight structural variations β such as missing columns, different orders, or evolving schemas.
|
|
103
|
+
**Datadock** automates the process of grouping, inspecting, and reading these files reliably, allowing you to build pipelines that are schema-aware, scalable, and format-agnostic.
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
## π License
|
|
107
|
+
|
|
108
|
+
This project is licensed under the **MIT License**.
|
datadock-0.1.2/README.md
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
# Datadock
|
|
2
|
+
|
|
3
|
+
**Datadock** is a Python library built on top of PySpark, designed to simplify **data interoperability** between files of different formats and schemas in modern data engineering pipelines.
|
|
4
|
+
|
|
5
|
+
It automatically detects schemas from CSV, JSON and Parquet files, groups structurally similar files, and allows standardized reading of all grouped files into a single Spark DataFrame β even in highly heterogeneous datasets.
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
## β¨ Key Features
|
|
9
|
+
|
|
10
|
+
- π **Automatic parsing** of multiple file formats: `.csv`, `.json`, `.parquet`
|
|
11
|
+
- π§ **Schema-based file grouping** by structural similarity
|
|
12
|
+
- π **Auto-selection of dominant schemas**
|
|
13
|
+
- π οΈ **Unified read** across similar files into a single PySpark DataFrame
|
|
14
|
+
- π **Schema insight** for diagnostics and inspection
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
## π§ Installation
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install datadock
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
## ποΈ Expected Input Structure
|
|
25
|
+
|
|
26
|
+
Place your data files (CSV, JSON or Parquet) inside a single folder. The library will automatically detect supported files and organize them by schema similarity.
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
/data/input/
|
|
30
|
+
βββ sales_2020.csv
|
|
31
|
+
βββ sales_2021.csv
|
|
32
|
+
βββ products.json
|
|
33
|
+
βββ archive.parquet
|
|
34
|
+
βββ log.parquet
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
## π§ͺ Usage Example
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from datadock import scan_schema, get_schema_info, read_data
|
|
42
|
+
|
|
43
|
+
path = "/path/to/your/data"
|
|
44
|
+
|
|
45
|
+
# Logs schema groups detected
|
|
46
|
+
scan_schema(path)
|
|
47
|
+
|
|
48
|
+
# Retrieves schema metadata
|
|
49
|
+
info = get_schema_info(path)
|
|
50
|
+
print(info)
|
|
51
|
+
|
|
52
|
+
# Loads all files from schema group 1
|
|
53
|
+
df = read_data(path, schema_id=1, logs=True)
|
|
54
|
+
df.show()
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
## π Public API
|
|
59
|
+
|
|
60
|
+
### `scan_schema`
|
|
61
|
+
Logs the identified schema groups found in the specified folder.
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
### `get_schema_info`
|
|
65
|
+
Returns a list of dictionaries containing:
|
|
66
|
+
- `schema_id`: ID of the schema group
|
|
67
|
+
- `file_count`: number of files in the group
|
|
68
|
+
- `column_count`: number of columns in the schema
|
|
69
|
+
- `files`: list of file names in the group
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
### `read_data`
|
|
73
|
+
Reads and merges all files that share the same schema.
|
|
74
|
+
If `schema_id` is not specified, the group with the most columns will be selected.
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
## ✅ Requirements
|
|
78
|
+
|
|
79
|
+
- Python 3.10+
|
|
80
|
+
- PySpark
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
## π Motivation
|
|
84
|
+
|
|
85
|
+
In real-world data engineering workflows, it's common to deal with files that represent the same data domain but have slight structural variations β such as missing columns, different orders, or evolving schemas.
|
|
86
|
+
**Datadock** automates the process of grouping, inspecting, and reading these files reliably, allowing you to build pipelines that are schema-aware, scalable, and format-agnostic.
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
## π License
|
|
90
|
+
|
|
91
|
+
This project is licensed under the **MIT License**.
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
[tool.poetry]
|
|
2
|
+
name = "datadock"
|
|
3
|
+
version = "0.1.2"
|
|
4
|
+
description = "Datadock is a PySpark-based data interoperability library. It automatically detects schemas from heterogeneous files (CSV, JSON, Parquet), groups them by structural similarity, and performs standardized batch reads. Designed for pipelines handling non-uniform large-scale data, enabling robust integration and reuse in distributed environments."
|
|
5
|
+
authors = ["Otavio Oliveira <datadock.sup@gmail.com>"]
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
packages = [{include = "datadock", from = "src"}]
|
|
8
|
+
|
|
9
|
+
[tool.poetry.dependencies]
|
|
10
|
+
python = "^3.10"
|
|
11
|
+
pyspark = "^3.5.5"
|
|
12
|
+
loguru = "^0.7.3"
|
|
13
|
+
pyarrow = "^20.0.0"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
[tool.poetry.group.dev.dependencies]
|
|
17
|
+
pytest = "^8.3.5"
|
|
18
|
+
black = "^25.1.0"
|
|
19
|
+
mypy = "^1.15.0"
|
|
20
|
+
flake8 = "^7.2.0"
|
|
21
|
+
|
|
22
|
+
[build-system]
|
|
23
|
+
requires = ["poetry-core"]
|
|
24
|
+
build-backend = "poetry.core.masonry.api"
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
from pyspark.sql import SparkSession, DataFrame
|
|
2
|
+
import csv
|
|
3
|
+
import json
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional, List, Tuple
|
|
6
|
+
import pyarrow.parquet as pq
|
|
7
|
+
from datadock._utils import logger
|
|
8
|
+
|
|
9
|
+
def _read_schema_only(path: str) -> Optional[List[Tuple[str, str]]]:
|
|
10
|
+
"""
|
|
11
|
+
Reads only the schema (field names and types) from a file without loading data.
|
|
12
|
+
"""
|
|
13
|
+
ext = Path(path).suffix.lower()
|
|
14
|
+
|
|
15
|
+
try:
|
|
16
|
+
if ext == ".csv":
|
|
17
|
+
with open(path, newline='', encoding='utf-8') as f:
|
|
18
|
+
reader = csv.DictReader(f)
|
|
19
|
+
columns = reader.fieldnames
|
|
20
|
+
if columns:
|
|
21
|
+
return [(col, "string") for col in columns]
|
|
22
|
+
|
|
23
|
+
elif ext == ".json":
|
|
24
|
+
with open(path, encoding='utf-8') as f:
|
|
25
|
+
line = json.loads(f.readline())
|
|
26
|
+
if isinstance(line, dict):
|
|
27
|
+
return [(key, "string") for key in line.keys()]
|
|
28
|
+
|
|
29
|
+
elif ext == ".parquet":
|
|
30
|
+
schema = pq.read_schema(path)
|
|
31
|
+
return [(col.name, str(col.type)) for col in schema]
|
|
32
|
+
|
|
33
|
+
else:
|
|
34
|
+
logger.warning(f"[WARNING] Unsupported file extension: {ext}")
|
|
35
|
+
return None
|
|
36
|
+
|
|
37
|
+
except Exception as e:
|
|
38
|
+
logger.error(f"Failed to read schema for {path}: {e}")
|
|
39
|
+
return None
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _load_file(spark: SparkSession, file: str) -> Optional[DataFrame]:
    """
    Loads a file in the appropriate format (CSV, JSON, Parquet, TXT).

    CSV files are read with a header row and JSON files in multiline mode.
    Any read failure is logged and swallowed so that one bad file does not
    abort a batch load.

    Args:
        spark (SparkSession): The active Spark session.
        file (str): The path to the file.

    Returns:
        Optional[DataFrame]: A Spark DataFrame if successful, None otherwise.
    """
    # Dispatch on the (case-insensitive) file extension only; the content
    # itself is not sniffed.
    ext = Path(file).suffix.lower()

    try:
        if ext == ".csv":
            return spark.read.option("header", True).csv(file)
        elif ext == ".json":
            return spark.read.option("multiline", True).json(file)
        elif ext == ".parquet":
            return spark.read.parquet(file)
        elif ext == ".txt":
            # Plain text: one row per line, single "value" column.
            return spark.read.text(file)
        else:
            logger.warning(f"Unsupported file format: {ext}")
            return None
    except Exception as e:
        logger.error(f"Error loading file {file}: {e}")
        return None
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
from pyspark.sql import SparkSession, DataFrame
|
|
2
|
+
from pyspark.sql.types import StructType
|
|
3
|
+
from datadock._reader import _read_schema_only
|
|
4
|
+
from datadock._utils import logger
|
|
5
|
+
from typing import List, Dict, Optional, Tuple
|
|
6
|
+
from collections import defaultdict
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _extract_schema_signature(schema: List[Tuple[str, str]]) -> Tuple[str, ...]:
|
|
11
|
+
"""
|
|
12
|
+
Create a normalized schema signature: tuple of (column_name, type), sorted by column name.
|
|
13
|
+
"""
|
|
14
|
+
return tuple(sorted((col, dtype) for col, dtype in schema))
|
|
15
|
+
|
|
16
|
+
def _structtype_to_list(schema: StructType) -> List[Tuple[str, str]]:
    """
    Flatten a Spark StructType into (column_name, data_type) tuples, with
    each data type rendered via its ``simpleString()`` form.
    """
    pairs: List[Tuple[str, str]] = []
    for field in schema:
        pairs.append((field.name, field.dataType.simpleString()))
    return pairs
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _group_by_schema(paths: List[str], min_similarity: float = 0.8) -> Dict[int, List[Tuple[str, List[Tuple[str, str]]]]]:
    """
    Groups files based on schema similarity using only column names.
    Each file joins the first existing group whose reference schema is at
    least min_similarity similar; otherwise it seeds a new group.

    Args:
        paths (List[str]):
            A list of file paths to be analyzed.
        min_similarity (float, optional):
            Minimum similarity ratio (0-1) required to group files together.
            Defaults to 0.8.

    Returns:
        Dict[int, List[Tuple[str, List[Tuple[str, str]]]]]:
            A dictionary mapping each schema ID to a list of tuples,
            each containing a file path and its inferred schema.
    """
    groups: Dict[int, List[Tuple[str, List[Tuple[str, str]]]]] = {}
    reference_schemas: Dict[int, List[Tuple[str, str]]] = {}
    counter = 1

    for file_path in paths:
        file_schema = _read_schema_only(file_path)
        if file_schema is None:
            logger.warning(f"Could not read schema for file: {file_path}")
            continue

        # First group (in creation order) that clears the threshold wins.
        target = next(
            (gid for gid, ref in reference_schemas.items()
             if _schema_similarity(file_schema, ref) >= min_similarity),
            None,
        )

        if target is not None:
            groups[target].append((file_path, file_schema))
        else:
            # No close-enough group: this schema becomes a new reference.
            reference_schemas[counter] = file_schema
            groups[counter] = [(file_path, file_schema)]
            counter += 1

    return groups
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _read_schema_group(
    grouped_by_id: Dict[int, List[Tuple[str, List[Tuple[str, str]]]]],
    schema_id: Optional[int] = None
) -> Optional[List[str]]:
    """
    Returns the file paths that share the same schema.
    If schema_id is not provided, returns the group whose schema has the
    most columns, as documented in the public README.

    Args:
        grouped_by_id (Dict[int, List[Tuple[str, List[Tuple[str, str]]]]]):
            A dictionary where the key is the schema ID and the value is a list of tuples
            containing file paths and their associated schema.
        schema_id (Optional[int]):
            The schema ID to retrieve. If not specified, defaults to the group with the most columns.

    Returns:
        Optional[List[str]]:
            A list of file paths that share the specified schema.
            Returns None if the group is not found or if there are no datasets.
    """
    if not grouped_by_id:
        logger.error("No datasets available to read.")
        return None

    if schema_id is None:
        # Default to the widest schema (most columns), matching the docstring
        # and README; the previous behavior silently picked the lowest ID.
        # Explicit "is None" check also keeps a (hypothetical) ID of 0 usable.
        selected_id = max(
            grouped_by_id,
            key=lambda sid: len(grouped_by_id[sid][0][1]),
        )
    else:
        selected_id = schema_id

    if selected_id not in grouped_by_id:
        # Honor the documented contract (return None) instead of raising KeyError.
        logger.error(f"Schema group {selected_id} not found.")
        return None

    logger.info(f"Selected dataset from schema group {selected_id}.")

    return [path for path, _ in grouped_by_id[selected_id]]
|
|
95
|
+
|
|
96
|
+
def _schema_similarity(a: List[Tuple[str, str]], b: List[Tuple[str, str]]) -> float:
|
|
97
|
+
"""
|
|
98
|
+
Calculates the similarity between two schemas based on column names only,
|
|
99
|
+
ignoring data types. Uses Jaccard similarity.
|
|
100
|
+
|
|
101
|
+
Args:
|
|
102
|
+
a (List[Tuple[str, str]]):
|
|
103
|
+
Schema A as a list of (column name, data type) tuples.
|
|
104
|
+
b (List[Tuple[str, str]]):
|
|
105
|
+
Schema B as a list of (column name, data type) tuples.
|
|
106
|
+
|
|
107
|
+
Returns:
|
|
108
|
+
float:
|
|
109
|
+
A value between 0.0 and 1.0 representing the schema similarity.
|
|
110
|
+
"""
|
|
111
|
+
set_a = set(col_name for col_name, _ in a)
|
|
112
|
+
set_b = set(col_name for col_name, _ in b)
|
|
113
|
+
intersection = set_a & set_b
|
|
114
|
+
union = set_a | set_b
|
|
115
|
+
return len(intersection) / len(union) if union else 0
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
from pyspark.sql import SparkSession, DataFrame
|
|
2
|
+
from datadock._schema_manager import _group_by_schema, _read_schema_group
|
|
3
|
+
from datadock._reader import _load_file
|
|
4
|
+
from datadock._utils import logger
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Optional, List, Dict, Any
|
|
7
|
+
|
|
8
|
+
SUPPORTED_EXTENSIONS = {".csv", ".json", ".parquet"}
|
|
9
|
+
|
|
10
|
+
def scan_schema(path: str):
    """
    Scans and prints schema groupings for all supported files in the specified path.
    This is the public entry point for schema inspection.
    """
    candidates = Path(path).glob("*")
    paths = [str(p) for p in candidates if p.suffix.lower() in SUPPORTED_EXTENSIONS]

    if not paths:
        logger.warning("No supported data files found.")
        return

    logger.info(f"Found {len(paths)} files to process.")
    grouped = _group_by_schema(paths)

    # Report how many distinct schema groups were detected.
    schema_count = len(grouped)
    if schema_count > 1:
        logger.info(f"Found {schema_count} different schemas.")
    else:
        logger.info("Found 1 schema.")

    for sid, members in grouped.items():
        col_count = len(members[0][1])
        logger.info(f"Schema {sid}: {col_count} columns β {len(members)} file(s):")
        for file_path, _ in members:
            logger.info(f" β’ {file_path}")
|
|
35
|
+
|
|
36
|
+
def read_data(path: str, schema_id: Optional[int] = None, logs: bool = False) -> Optional[DataFrame]:
    """
    Reads and merges all files that belong to a schema group.

    :param path: Folder containing data files
    :param schema_id: ID of the schema group to read (defaults to schema group 1)
    :param logs: Whether to print detailed logs during loading
    :return: Spark DataFrame or None if load failed
    """
    # NOTE(review): app name "Databridge" looks like a leftover from a
    # pre-rename of the "datadock" package — confirm before changing.
    spark = SparkSession.builder.appName("Databridge").getOrCreate()

    data_dir = Path(path)
    paths = [str(p) for p in data_dir.glob("*") if p.suffix.lower() in SUPPORTED_EXTENSIONS]
    if not paths:
        if logs:
            logger.warning("No supported data files found.")
        return None

    if logs:
        logger.info(f"Found {len(paths)} files to process.")
    else:
        # `logs` is a bool, so a plain else replaces the old `elif logs == False`.
        logger.info("For more details about the reading process, set `logs=True`.")

    grouped = _group_by_schema(paths)

    if len(grouped) > 1 and logs:
        logger.warning(f"Multiple schemas found in path '{path}'. Total: {len(grouped)}")

    if schema_id is None:
        if logs:
            logger.warning("Schema ID not specified. Defaulting to schema 1.")
        schema_id = 1

    if schema_id not in grouped:
        logger.error(f"Schema ID {schema_id} not found among detected schema groups. Available IDs: {list(grouped.keys())}")
        return None

    selected_files = _read_schema_group(grouped, schema_id=schema_id)
    if not selected_files:
        if logs:
            logger.warning(f"No dataset found for schema {schema_id}.")
        return None

    if logs:
        logger.info(f"Reading data from schema group {schema_id}")

    dfs = []
    for file in selected_files:
        df = _load_file(spark, file)
        # Explicit None check: a Spark DataFrame must not be used in a
        # boolean context (it only falls back to default object truthiness).
        if df is not None:
            dfs.append(df)
            if logs:
                logger.info(f"Loaded file: {file}")

    if not dfs:
        if logs:
            logger.warning("No DataFrames were loaded.")
        return None

    final_df = dfs[0]
    for df in dfs[1:]:
        try:
            # Grouping tolerates ~20% column differences, so merge with
            # allowMissingColumns=True (Spark >= 3.1; pyproject pins ^3.5.5):
            # missing columns are null-filled instead of failing the union.
            final_df = final_df.unionByName(df, allowMissingColumns=True)
        except Exception as e:
            # A failed merge silently drops a whole file's data, so log it
            # even when logs=False.
            logger.error(f"Error merging DataFrames: {e}")

    if logs:
        logger.info("Dataset successfully loaded.")

    return final_df
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def get_schema_info(path: str) -> List[Dict[str, Any]]:
    """
    Returns detailed information about schema groups detected in the given directory.

    :param path: Path to the folder containing raw data files.
    :return: A list of dictionaries with schema_id, file count, column count, and list of files.
    """
    files = [str(p) for p in Path(path).glob("*") if p.suffix.lower() in SUPPORTED_EXTENSIONS]

    if not files:
        logger.warning("No supported data files found in the provided directory.")
        return []

    grouped = _group_by_schema(files)

    # One summary record per schema group; column count is taken from the
    # first member's schema.
    return [
        {
            "schema_id": sid,
            "file_count": len(members),
            "column_count": len(members[0][1]) if members else 0,
            "files": [Path(fp).name for fp, _ in members],
        }
        for sid, members in grouped.items()
    ]
|